xref: /titanic_52/usr/src/uts/sun4v/cpu/niagara_copy.s (revision 7978b887afc61cd14c12d646224b6f1cd8494ea0)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/asm_linkage.h>
31#include <sys/vtrace.h>
32#include <sys/machthread.h>
33#include <sys/clock.h>
34#include <sys/asi.h>
35#include <sys/fsr.h>
36#include <sys/privregs.h>
37#include <sys/machasi.h>
38#include <sys/niagaraasi.h>
39
40#if !defined(lint)
41#include "assym.h"
42#endif	/* lint */
43
44
45/*
46 * Pseudo-code to aid in understanding the control flow of the
47 * bcopy/kcopy routine.
48 *
49 *	! WARNING : <Register usage convention>
50 *	! In kcopy() the %o5, holds previous error handler and a flag
51 *	! LOFAULT_SET (low bits). The %o5 is null in bcopy().
52 *	! The %o5 is not available for any other use.
53 *
54 * kcopy():
55 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
56 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
57 *	curthread->t_lofault = .copyerr;
58 *	Call bcopy();
59 *
60 * bcopy():
61 * 	if (length < 128)
62 * 		goto regular_copy;
63 *
64 * 	if (!use_hw_bcopy)
65 * 		goto regular_copy;
66 *
67 * 	blockcopy;
68 *	restore t_lofault handler if came from kcopy();
69 *
70 *	regular_copy;
71 *	restore t_lofault handler if came from kcopy();
72 *
73 * In lofault handler:
74 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
75 *	return (errno)
76 *
77 */
78
79/*
80 * Less then or equal this number of bytes we will always copy byte-for-byte
81 */
82#define	SMALL_LIMIT	7
83
84/*
85 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
86 * handler was set
87 */
88#define	LOFAULT_SET 2
89
90/*
91 * This define is to align data for the unaligned source cases.
92 * The data1, data2 and data3 is merged into data1 and data2.
93 * The data3 is preserved for next merge.
94 */
95#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
96	sllx	data1, lshift, data1				;\
97	srlx	data2, rshift, tmp				;\
98	or	data1, tmp, data1				;\
99	sllx	data2, lshift, data2				;\
100	srlx	data3, rshift, tmp				;\
101	or	data2, tmp, data2
102/*
103 * This macro is to align the data. Basically it merges
104 * data1 and data2 to form double word.
105 */
106#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
107	sllx	data1, lshift, data1				;\
108	srlx	data2, rshift, tmp				;\
109	or	data1, tmp, data1
110
111#if !defined(NIAGARA_IMPL)
112/*
113 * Flags set in the lower bits of the t_lofault address:
114 * FPUSED_FLAG: The FP registers were in use and must be restored
115 * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
116 * COPY_FLAGS: Both of the above
117 *
118 * Other flags:
119 * KPREEMPT_FLAG: kpreempt needs to be called
120 */
121#define	FPUSED_FLAG	1
122#define	BCOPY_FLAG	2
123#define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
124#define	KPREEMPT_FLAG	4
125
126#define	ALIGN_OFF_1_7			\
127	faligndata %d0, %d2, %d48	;\
128	faligndata %d2, %d4, %d50	;\
129	faligndata %d4, %d6, %d52	;\
130	faligndata %d6, %d8, %d54	;\
131	faligndata %d8, %d10, %d56	;\
132	faligndata %d10, %d12, %d58	;\
133	faligndata %d12, %d14, %d60	;\
134	faligndata %d14, %d16, %d62
135
136#define	ALIGN_OFF_8_15			\
137	faligndata %d2, %d4, %d48	;\
138	faligndata %d4, %d6, %d50	;\
139	faligndata %d6, %d8, %d52	;\
140	faligndata %d8, %d10, %d54	;\
141	faligndata %d10, %d12, %d56	;\
142	faligndata %d12, %d14, %d58	;\
143	faligndata %d14, %d16, %d60	;\
144	faligndata %d16, %d18, %d62
145
146#define	ALIGN_OFF_16_23			\
147	faligndata %d4, %d6, %d48	;\
148	faligndata %d6, %d8, %d50	;\
149	faligndata %d8, %d10, %d52	;\
150	faligndata %d10, %d12, %d54	;\
151	faligndata %d12, %d14, %d56	;\
152	faligndata %d14, %d16, %d58	;\
153	faligndata %d16, %d18, %d60	;\
154	faligndata %d18, %d20, %d62
155
156#define	ALIGN_OFF_24_31			\
157	faligndata %d6, %d8, %d48	;\
158	faligndata %d8, %d10, %d50	;\
159	faligndata %d10, %d12, %d52	;\
160	faligndata %d12, %d14, %d54	;\
161	faligndata %d14, %d16, %d56	;\
162	faligndata %d16, %d18, %d58	;\
163	faligndata %d18, %d20, %d60	;\
164	faligndata %d20, %d22, %d62
165
166#define	ALIGN_OFF_32_39			\
167	faligndata %d8, %d10, %d48	;\
168	faligndata %d10, %d12, %d50	;\
169	faligndata %d12, %d14, %d52	;\
170	faligndata %d14, %d16, %d54	;\
171	faligndata %d16, %d18, %d56	;\
172	faligndata %d18, %d20, %d58	;\
173	faligndata %d20, %d22, %d60	;\
174	faligndata %d22, %d24, %d62
175
176#define	ALIGN_OFF_40_47			\
177	faligndata %d10, %d12, %d48	;\
178	faligndata %d12, %d14, %d50	;\
179	faligndata %d14, %d16, %d52	;\
180	faligndata %d16, %d18, %d54	;\
181	faligndata %d18, %d20, %d56	;\
182	faligndata %d20, %d22, %d58	;\
183	faligndata %d22, %d24, %d60	;\
184	faligndata %d24, %d26, %d62
185
186#define	ALIGN_OFF_48_55			\
187	faligndata %d12, %d14, %d48	;\
188	faligndata %d14, %d16, %d50	;\
189	faligndata %d16, %d18, %d52	;\
190	faligndata %d18, %d20, %d54	;\
191	faligndata %d20, %d22, %d56	;\
192	faligndata %d22, %d24, %d58	;\
193	faligndata %d24, %d26, %d60	;\
194	faligndata %d26, %d28, %d62
195
196#define	ALIGN_OFF_56_63			\
197	faligndata %d14, %d16, %d48	;\
198	faligndata %d16, %d18, %d50	;\
199	faligndata %d18, %d20, %d52	;\
200	faligndata %d20, %d22, %d54	;\
201	faligndata %d22, %d24, %d56	;\
202	faligndata %d24, %d26, %d58	;\
203	faligndata %d26, %d28, %d60	;\
204	faligndata %d28, %d30, %d62
205
206#define	VIS_BLOCKSIZE		64
207
208/*
209 * Size of stack frame in order to accomodate a 64-byte aligned
210 * floating-point register save area and 2 64-bit temp locations.
211 * All copy functions use three quadrants of fp registers; to assure a
212 * block-aligned three block buffer in which to save we must reserve
213 * four blocks on stack.
214 *
215 *    _______________________________________ <-- %fp + STACK_BIAS
216 *    | We may need to preserve 3 quadrants |
217 *    | of fp regs, but since we do so with |
218 *    | BST/BLD we need room in which to    |
219 *    | align to VIS_BLOCKSIZE bytes.  So   |
220 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
221 *    |-------------------------------------|
222 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
223 *    |-------------------------------------|
224 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
225 *    ---------------------------------------
226 */
227#define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
228#define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 4)
229#define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 3) + 1)
230#define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
231#define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
232
233/*
234 * In FP copies if we do not have preserved data to restore over
235 * the fp regs we used then we must zero those regs to avoid
236 * exposing portions of the data to later threads (data security).
237 */
238#define	FZERO				\
239	fzero	%f0			;\
240	fzero	%f2			;\
241	faddd	%f0, %f2, %f4		;\
242	fmuld	%f0, %f2, %f6		;\
243	faddd	%f0, %f2, %f8		;\
244	fmuld	%f0, %f2, %f10		;\
245	faddd	%f0, %f2, %f12		;\
246	fmuld	%f0, %f2, %f14		;\
247	faddd	%f0, %f2, %f16		;\
248	fmuld	%f0, %f2, %f18		;\
249	faddd	%f0, %f2, %f20		;\
250	fmuld	%f0, %f2, %f22		;\
251	faddd	%f0, %f2, %f24		;\
252	fmuld	%f0, %f2, %f26		;\
253	faddd	%f0, %f2, %f28		;\
254	fmuld	%f0, %f2, %f30		;\
255	faddd	%f0, %f2, %f48		;\
256	fmuld	%f0, %f2, %f50		;\
257	faddd	%f0, %f2, %f52		;\
258	fmuld	%f0, %f2, %f54		;\
259	faddd	%f0, %f2, %f56		;\
260	fmuld	%f0, %f2, %f58		;\
261	faddd	%f0, %f2, %f60		;\
262	fmuld	%f0, %f2, %f62
263
264/*
265 * Macros to save and restore fp registers to/from the stack.
266 * Used to save and restore in-use fp registers when we want to use FP.
267 */
268#define BST_FP_TOSTACK(tmp1)					\
269	/* membar #Sync	*/					;\
270	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
271	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
272	stda	%f0, [tmp1]ASI_BLK_P				;\
273	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
274	stda	%f16, [tmp1]ASI_BLK_P				;\
275	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
276	stda	%f48, [tmp1]ASI_BLK_P				;\
277	membar	#Sync
278
279#define	BLD_FP_FROMSTACK(tmp1)					\
280	/* membar #Sync - provided at copy completion */	;\
281	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
282	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
283	ldda	[tmp1]ASI_BLK_P, %f0				;\
284	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
285	ldda	[tmp1]ASI_BLK_P, %f16				;\
286	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
287	ldda	[tmp1]ASI_BLK_P, %f48				;\
288	membar	#Sync
289#endif	/* NIAGARA_IMPL */
290
291/*
292 * Copy a block of storage, returning an error code if `from' or
293 * `to' takes a kernel pagefault which cannot be resolved.
294 * Returns errno value on pagefault error, 0 if all ok
295 */
296
297#if defined(lint)
298
299/* ARGSUSED */
300int
301kcopy(const void *from, void *to, size_t count)
302{ return(0); }
303
304#else	/* lint */
305
306	.seg	".text"
307	.align	4
308
309	ENTRY(kcopy)
310
311#if !defined(NIAGARA_IMPL)
312	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
313	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
314	or	%l7, %lo(.copyerr), %l7
315	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
316	! Note that we carefully do *not* flag the setting of
317	! t_lofault.
318	membar	#Sync				! sync error barrier
319	b	.do_copy			! common code
320	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
321
322/*
323 * We got here because of a fault during kcopy or bcopy if a fault
324 * handler existed when bcopy was called.
325 * Errno value is in %g1.
326 */
327.copyerr:
328	sethi	%hi(.copyerr2), %l1
329	or	%l1, %lo(.copyerr2), %l1
330	membar	#Sync				! sync error barrier
331	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
332	btst	FPUSED_FLAG, %o5
333	bz,pt	%xcc, 1f
334	and	%o5, BCOPY_FLAG, %l1	! copy flag to %l1
335
336	membar	#Sync				! sync error barrier
337	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
338	wr	%o2, 0, %gsr
339
340	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
341	btst	FPRS_FEF, %o3
342	bz,pt	%icc, 4f
343	  nop
344
345	! restore fpregs from stack
346	BLD_FP_FROMSTACK(%o2)
347
348	ba,pt	%ncc, 2f
349	  wr	%o3, 0, %fprs		! restore fprs
350
3514:
352	FZERO
353	wr	%o3, 0, %fprs		! restore fprs
354
3552:
356	ldn	[THREAD_REG + T_LWP], %o2
357	brnz,pt	%o2, 1f
358	  nop
359
360	ldsb	[THREAD_REG + T_PREEMPT], %l0
361	deccc	%l0
362	bnz,pn	%ncc, 1f
363	  stb	%l0, [THREAD_REG + T_PREEMPT]
364
365	! Check for a kernel preemption request
366	ldn	[THREAD_REG + T_CPU], %l0
367	ldub	[%l0 + CPU_KPRUNRUN], %l0
368	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
369	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
370
371	! The kcopy will always set a t_lofault handler. If it fires,
372	! we're expected to just return the error code and not to
373	! invoke any existing error handler. As far as bcopy is concerned,
374	! we only set t_lofault if there was an existing lofault handler.
375	! In that case we're expected to invoke the previously existing
376	! handler after restting the t_lofault value.
3771:
378	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
379	membar	#Sync				! sync error barrier
380	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
381
382	! call kpreempt if necessary
383	btst	KPREEMPT_FLAG, %l1
384	bz,pt	%icc, 2f
385	  nop
386	call	kpreempt
387	  rdpr	%pil, %o0	! pass %pil
3882:
389	btst	BCOPY_FLAG, %l1
390	bnz,pn	%ncc, 3f
391	nop
392	ret
393	restore	%g1, 0, %o0
394
3953:
396	! We're here via bcopy. There must have been an error handler
397	! in place otherwise we would have died a nasty death already.
398	jmp	%o5				! goto real handler
399	restore	%g0, 0, %o0			! dispose of copy window
400
401/*
402 * We got here because of a fault in .copyerr.  We can't safely restore fp
403 * state, so we panic.
404 */
405fp_panic_msg:
406	.asciz	"Unable to restore fp state after copy operation"
407
408	.align	4
409.copyerr2:
410	set	fp_panic_msg, %o0
411	call	panic
412	  nop
413#else	/* NIAGARA_IMPL */
414	save	%sp, -SA(MINFRAME), %sp
415	set	.copyerr, %l7			! copyerr is lofault value
416	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
417	or	%o5, LOFAULT_SET, %o5
418	membar	#Sync				! sync error barrier
419	b	.do_copy			! common code
420	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
421
422/*
423 * We got here because of a fault during kcopy.
424 * Errno value is in %g1.
425 */
426.copyerr:
427	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
428	! into %o5 to indicate it has set t_lofault handler. Need to clear
429	! LOFAULT_SET flag before restoring the error handler.
430	andn	%o5, LOFAULT_SET, %o5
431	membar	#Sync				! sync error barrier
432	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
433	ret
434	restore	%g1, 0, %o0
435#endif	/* NIAGARA_IMPL */
436
437	SET_SIZE(kcopy)
438#endif	/* lint */
439
440
441/*
442 * Copy a block of storage - must not overlap (from + len <= to).
443 */
444#if defined(lint)
445
446/* ARGSUSED */
447void
448bcopy(const void *from, void *to, size_t count)
449{}
450
451#else	/* lint */
452
453	ENTRY(bcopy)
454
455#if !defined(NIAGARA_IMPL)
456	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
457	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
458	brz,pt	%o5, .do_copy
459	  nop
460	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
461	or	%l7, %lo(.copyerr), %l7
462	membar	#Sync				! sync error barrier
463	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
464	! We've already captured whether t_lofault was zero on entry.
465	! We need to mark ourselves as being from bcopy since both
466	! kcopy and bcopy use the same code path. If BCOPY_FLAG is
467	! set and the saved lofault was zero, we won't reset lofault on
468	! returning.
469	or	%o5, BCOPY_FLAG, %o5
470#else	/* NIAGARA_IMPL */
471	save	%sp, -SA(MINFRAME), %sp
472	clr	%o5			! flag LOFAULT_SET is not set for bcopy
473#endif	/* NIAGARA_IMPL */
474
475.do_copy:
476	cmp	%i2, 12			! for small counts
477	blu	%ncc, .bytecp		! just copy bytes
478	  .empty
479
480	cmp	%i2, 128		! for less than 128 bytes
481	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
482	  nop
483
484	set	use_hw_bcopy, %o2
485	ld	[%o2], %o2
486	brz,pn	%o2, .bcb_punt
487	  nop
488
489	subcc	%i1, %i0, %i3
490	bneg,a,pn %ncc, 1f
491	neg	%i3
4921:
493	/*
494	 * Compare against 256 since we should be checking block addresses
495	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
496	 * src = dest + (64 * 3) + 63.
497	 */
498	cmp	%i3, 256
499	blu,pn	%ncc, .bcb_punt
500	  nop
501
502	/*
503	 * Copy that reach here have at least 2 blocks of data to copy.
504	 */
505#if !defined(NIAGARA_IMPL)
506	ldn	[THREAD_REG + T_LWP], %o3
507	brnz,pt	%o3, 1f
508	  nop
509
510	! kpreempt_disable();
511	ldsb	[THREAD_REG + T_PREEMPT], %o2
512	inc	%o2
513	stb	%o2, [THREAD_REG + T_PREEMPT]
514
5151:
516	rd	%fprs, %o2              ! check for unused fp
517	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
518	btst	FPRS_FEF, %o2
519	bz,a,pt	%icc, .do_blockcopy
520	wr	%g0, FPRS_FEF, %fprs
521
522	! save in-use fpregs on stack
523	BST_FP_TOSTACK(%o2)
524#endif	/* NIAGARA_IMPL */
525
526.do_blockcopy:
527
528#if !defined(NIAGARA_IMPL)
529	rd	%gsr, %o2
530	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
531	or	%o5, FPUSED_FLAG, %o5		! fp regs are in use
532#endif	/* NIAGARA_IMPL */
533
534	! Swap src/dst since the code below is memcpy code
535	! and memcpy/bcopy have different calling sequences
536	mov	%i1, %i5
537	mov	%i0, %i1
538	mov	%i5, %i0
539
540	! Block (64 bytes) align the destination.
541	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 bytes
542	bz	%xcc, .chksrc		! dst is already double aligned
543	sub	%i3, 0x40, %i3
544	neg	%i3			! bytes till dst 64 bytes aligned
545	sub	%i2, %i3, %i2		! update i2 with new count
546
547	! Based on source and destination alignment do
548	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
549
550	! Is dst & src 8B aligned
551	or	%i0, %i1, %o2
552	andcc	%o2, 0x7, %g0
553	bz	%ncc, .alewdcp
554	nop
555
556	! Is dst & src 4B aligned
557	andcc	%o2, 0x3, %g0
558	bz	%ncc, .alwdcp
559	nop
560
561	! Is dst & src 2B aligned
562	andcc	%o2, 0x1, %g0
563	bz	%ncc, .alhlfwdcp
564	nop
565
566	! 1B aligned
5671:	ldub	[%i1], %o2
568	stb	%o2, [%i0]
569	inc	%i1
570	deccc	%i3
571	bgu,pt	%ncc, 1b
572	inc	%i0
573
574	ba	.chksrc
575	nop
576
577	! dst & src 4B aligned
578.alwdcp:
579	ld	[%i1], %o2
580	st	%o2, [%i0]
581	add	%i1, 0x4, %i1
582	subcc	%i3, 0x4, %i3
583	bgu,pt	%ncc, .alwdcp
584	add	%i0, 0x4, %i0
585
586	ba	.chksrc
587	nop
588
589	! dst & src 2B aligned
590.alhlfwdcp:
591	lduh	[%i1], %o2
592	stuh	%o2, [%i0]
593	add	%i1, 0x2, %i1
594	subcc	%i3, 0x2, %i3
595	bgu,pt	%ncc, .alhlfwdcp
596	add	%i0, 0x2, %i0
597
598	ba	.chksrc
599	nop
600
601	! dst & src 8B aligned
602.alewdcp:
603	ldx	[%i1], %o2
604	stx	%o2, [%i0]
605	add	%i1, 0x8, %i1
606	subcc	%i3, 0x8, %i3
607	bgu,pt	%ncc, .alewdcp
608	add	%i0, 0x8, %i0
609
610	! Now Destination is block (64 bytes) aligned
611.chksrc:
612	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
613	sub	%i2, %i3, %i2		! Residue bytes in %i2
614
615	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
616
617#if !defined(NIAGARA_IMPL)
618	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
619	prefetch [%l0+0x0], #one_read
620	andcc	%i1, 0x3f, %g0		! is src 64B aligned
621	bz,pn	%ncc, .blkcpy
622	nop
623
624	! handle misaligned source cases
625	alignaddr %i1, %g0, %g0		! generate %gsr
626
627	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
628					! significant in %l1
629	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
630	add	%i1, %i3, %i1
631
632	! switch statement to get to right 8 byte block within
633	! 64 byte block
634	cmp	 %l2, 0x4
635	bgeu,a	 hlf
636	cmp	 %l2, 0x6
637	cmp	 %l2, 0x2
638	bgeu,a	 sqtr
639	nop
640	cmp	 %l2, 0x1
641	be,a	 off15
642	nop
643	ba	 off7
644	nop
645sqtr:
646	be,a	 off23
647	nop
648	ba,a	 off31
649	nop
650
651hlf:
652	bgeu,a	 fqtr
653	nop
654	cmp	 %l2, 0x5
655	be,a	 off47
656	nop
657	ba	 off39
658	nop
659fqtr:
660	be,a	 off55
661	nop
662
663	! Falls through when the source offset is greater than 56
664	ldd	[%l0+0x38], %d14
665	prefetch [%l0+0x40], #one_read
666	prefetch [%l0+0x80], #one_read
6677:
668	add	%l0, 0x40, %l0
669	stxa	%g0, [%i0]%asi		! initialize the cache line
670
671	ldda	[%l0]ASI_BLK_P, %d16
672	ALIGN_OFF_56_63
673	fmovd	%d30, %d14
674
675	stda	%d48, [%i0]ASI_BLK_P
676	subcc	%i3, 0x40, %i3
677	add	%i0, 0x40, %i0
678	bgu,pt	%ncc, 7b
679	prefetch [%l0+0x80], #one_read
680	ba	.blkdone
681	membar	#Sync
682
683	! This copy case for source offset between 1 and 7
684off7:
685	ldda	[%l0]ASI_BLK_P, %d0
686	prefetch [%l0+0x40], #one_read
687	prefetch [%l0+0x80], #one_read
6880:
689	add	%l0, 0x40, %l0
690	stxa	%g0, [%i0]%asi		! initialize the cache line
691
692	ldda	[%l0]ASI_BLK_P, %d16
693	ALIGN_OFF_1_7
694	fmovd	%d16, %d0
695	fmovd	%d18, %d2
696	fmovd	%d20, %d4
697	fmovd	%d22, %d6
698	fmovd	%d24, %d8
699	fmovd	%d26, %d10
700	fmovd	%d28, %d12
701	fmovd	%d30, %d14
702
703	stda	%d48, [%i0]ASI_BLK_P
704	subcc	%i3, 0x40, %i3
705	add	%i0, 0x40, %i0
706	bgu,pt	%ncc, 0b
707	prefetch [%l0+0x80], #one_read
708	ba	.blkdone
709	membar	#Sync
710
711	! This copy case for source offset between 8 and 15
712off15:
713	ldd	[%l0+0x8], %d2
714	ldd	[%l0+0x10], %d4
715	ldd	[%l0+0x18], %d6
716	ldd	[%l0+0x20], %d8
717	ldd	[%l0+0x28], %d10
718	ldd	[%l0+0x30], %d12
719	ldd	[%l0+0x38], %d14
720	prefetch [%l0+0x40], #one_read
721	prefetch [%l0+0x80], #one_read
7221:
723	add	%l0, 0x40, %l0
724	stxa	%g0, [%i0]%asi		! initialize the cache line
725
726	ldda	[%l0]ASI_BLK_P, %d16
727	ALIGN_OFF_8_15
728	fmovd	%d18, %d2
729	fmovd	%d20, %d4
730	fmovd	%d22, %d6
731	fmovd	%d24, %d8
732	fmovd	%d26, %d10
733	fmovd	%d28, %d12
734	fmovd	%d30, %d14
735
736	stda	%d48, [%i0]ASI_BLK_P
737	subcc	%i3, 0x40, %i3
738	add	%i0, 0x40, %i0
739	bgu,pt	%ncc, 1b
740	prefetch [%l0+0x80], #one_read
741	ba	.blkdone
742	membar	#Sync
743
744	! This copy case for source offset between 16 and 23
745off23:
746	ldd	[%l0+0x10], %d4
747	ldd	[%l0+0x18], %d6
748	ldd	[%l0+0x20], %d8
749	ldd	[%l0+0x28], %d10
750	ldd	[%l0+0x30], %d12
751	ldd	[%l0+0x38], %d14
752	prefetch [%l0+0x40], #one_read
753	prefetch [%l0+0x80], #one_read
7542:
755	add	%l0, 0x40, %l0
756	stxa	%g0, [%i0]%asi		! initialize the cache line
757
758	ldda	[%l0]ASI_BLK_P, %d16
759	ALIGN_OFF_16_23
760	fmovd	%d20, %d4
761	fmovd	%d22, %d6
762	fmovd	%d24, %d8
763	fmovd	%d26, %d10
764	fmovd	%d28, %d12
765	fmovd	%d30, %d14
766
767	stda	%d48, [%i0]ASI_BLK_P
768	subcc	%i3, 0x40, %i3
769	add	%i0, 0x40, %i0
770	bgu,pt	%ncc, 2b
771	prefetch [%l0+0x80], #one_read
772	ba	.blkdone
773	membar	#Sync
774
775	! This copy case for source offset between 24 and 31
776off31:
777	ldd	[%l0+0x18], %d6
778	ldd	[%l0+0x20], %d8
779	ldd	[%l0+0x28], %d10
780	ldd	[%l0+0x30], %d12
781	ldd	[%l0+0x38], %d14
782	prefetch [%l0+0x40], #one_read
783	prefetch [%l0+0x80], #one_read
7843:
785	add	%l0, 0x40, %l0
786	stxa	%g0, [%i0]%asi		! initialize the cache line
787
788	ldda	[%l0]ASI_BLK_P, %d16
789	ALIGN_OFF_24_31
790	fmovd	%d22, %d6
791	fmovd	%d24, %d8
792	fmovd	%d26, %d10
793	fmovd	%d28, %d12
794	fmovd	%d30, %d14
795
796	stda	%d48, [%i0]ASI_BLK_P
797	subcc	%i3, 0x40, %i3
798	add	%i0, 0x40, %i0
799	bgu,pt	%ncc, 3b
800	prefetch [%l0+0x80], #one_read
801	ba	.blkdone
802	membar	#Sync
803
804	! This copy case for source offset between 32 and 39
805off39:
806	ldd	[%l0+0x20], %d8
807	ldd	[%l0+0x28], %d10
808	ldd	[%l0+0x30], %d12
809	ldd	[%l0+0x38], %d14
810	prefetch [%l0+0x40], #one_read
811	prefetch [%l0+0x80], #one_read
8124:
813	add	%l0, 0x40, %l0
814	stxa	%g0, [%i0]%asi		! initialize the cache line
815
816	ldda	[%l0]ASI_BLK_P, %d16
817	ALIGN_OFF_32_39
818	fmovd	%d24, %d8
819	fmovd	%d26, %d10
820	fmovd	%d28, %d12
821	fmovd	%d30, %d14
822
823	stda	%d48, [%i0]ASI_BLK_P
824	subcc	%i3, 0x40, %i3
825	add	%i0, 0x40, %i0
826	bgu,pt	%ncc, 4b
827	prefetch [%l0+0x80], #one_read
828	ba	.blkdone
829	membar	#Sync
830
831	! This copy case for source offset between 40 and 47
832off47:
833	ldd	[%l0+0x28], %d10
834	ldd	[%l0+0x30], %d12
835	ldd	[%l0+0x38], %d14
836	prefetch [%l0+0x40], #one_read
837	prefetch [%l0+0x80], #one_read
8385:
839	add	%l0, 0x40, %l0
840	stxa	%g0, [%i0]%asi		! initialize the cache line
841
842	ldda	[%l0]ASI_BLK_P, %d16
843	ALIGN_OFF_40_47
844	fmovd	%d26, %d10
845	fmovd	%d28, %d12
846	fmovd	%d30, %d14
847
848	stda	%d48, [%i0]ASI_BLK_P
849	subcc	%i3, 0x40, %i3
850	add	%i0, 0x40, %i0
851	bgu,pt	%ncc, 5b
852	prefetch [%l0+0x80], #one_read
853	ba	.blkdone
854	membar	#Sync
855
856	! This copy case for source offset between 48 and 55
857off55:
858	ldd	[%l0+0x30], %d12
859	ldd	[%l0+0x38], %d14
860	prefetch [%l0+0x40], #one_read
861	prefetch [%l0+0x80], #one_read
8626:
863	add	%l0, 0x40, %l0
864	stxa	%g0, [%i0]%asi		! initialize the cache line
865
866	ldda	[%l0]ASI_BLK_P, %d16
867	ALIGN_OFF_48_55
868	fmovd	%d28, %d12
869	fmovd	%d30, %d14
870
871	stda	%d48, [%i0]ASI_BLK_P
872	subcc	%i3, 0x40, %i3
873	add	%i0, 0x40, %i0
874	bgu,pt	%ncc, 6b
875	prefetch [%l0+0x80], #one_read
876	ba	.blkdone
877	membar	#Sync
878
879	! Both source and destination are block aligned.
880.blkcpy:
881	prefetch [%i1+0x40], #one_read
882	prefetch [%i1+0x80], #one_read
8838:
884	stxa	%g0, [%i0]%asi		! initialize the cache line
885	ldda	[%i1]ASI_BLK_P, %d0
886	stda	%d0, [%i0]ASI_BLK_P
887
888	add	%i1, 0x40, %i1
889	subcc	%i3, 0x40, %i3
890	add	%i0, 0x40, %i0
891	bgu,pt	%ncc, 8b
892	prefetch [%i1+0x80], #one_read
893	membar	#Sync
894
895.blkdone:
896#else	/* NIAGARA_IMPL */
897	andcc	%i1, 0xf, %o2		! is src quadword aligned
898	bz,pn	%xcc, .blkcpy		! src offset in %o2
899	nop
900	cmp	%o2, 0x8
901	bg	.cpy_upper_double
902	nop
903	bl	.cpy_lower_double
904	nop
905
906	! Falls through when source offset is equal to 8 i.e.
907	! source is double word aligned.
908	! In this case no shift/merge of data is required
909	sub	%i1, %o2, %i1		! align the src at 16 bytes.
910	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
911	prefetch [%l0+0x0], #one_read
912	ldda	[%i1+0x0]%asi, %l2
913loop0:
914	ldda	[%i1+0x10]%asi, %l4
915	prefetch [%l0+0x40], #one_read
916
917	stxa	%l3, [%i0+0x0]%asi
918	stxa	%l4, [%i0+0x8]%asi
919
920	ldda	[%i1+0x20]%asi, %l2
921	stxa	%l5, [%i0+0x10]%asi
922	stxa	%l2, [%i0+0x18]%asi
923
924	ldda	[%i1+0x30]%asi, %l4
925	stxa	%l3, [%i0+0x20]%asi
926	stxa	%l4, [%i0+0x28]%asi
927
928	ldda	[%i1+0x40]%asi, %l2
929	stxa	%l5, [%i0+0x30]%asi
930	stxa	%l2, [%i0+0x38]%asi
931
932	add	%l0, 0x40, %l0
933	add	%i1, 0x40, %i1
934	subcc	%i3, 0x40, %i3
935	bgu,pt	%xcc, loop0
936	add	%i0, 0x40, %i0
937	ba	.blkdone
938	add	%i1, %o2, %i1		! increment the source by src offset
939					! the src offset was stored in %o2
940
941.cpy_lower_double:
942	sub	%i1, %o2, %i1		! align the src at 16 bytes.
943	sll	%o2, 3, %o0		! %o0 left shift
944	mov	0x40, %o1
945	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
946	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
947	prefetch [%l0+0x0], #one_read
948	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
949					! complete data
950loop1:
951	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
952	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
953							! into %l2 and %l3
954	prefetch [%l0+0x40], #one_read
955	stxa	%l2, [%i0+0x0]%asi
956	stxa	%l3, [%i0+0x8]%asi
957
958	ldda	[%i1+0x20]%asi, %l2
959	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
960	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
961	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
962
963	! Repeat the same for next 32 bytes.
964
965	ldda	[%i1+0x30]%asi, %l4
966	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
967	stxa	%l2, [%i0+0x20]%asi
968	stxa	%l3, [%i0+0x28]%asi
969
970	ldda	[%i1+0x40]%asi, %l2
971	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
972	stxa	%l4, [%i0+0x30]%asi
973	stxa	%l5, [%i0+0x38]%asi
974
975	add	%l0, 0x40, %l0
976	add	%i1, 0x40, %i1
977	subcc	%i3, 0x40, %i3
978	bgu,pt	%xcc, loop1
979	add	%i0, 0x40, %i0
980	ba	.blkdone
981	add	%i1, %o2, %i1		! increment the source by src offset
982					! the src offset was stored in %o2
983
984.cpy_upper_double:
985	sub	%i1, %o2, %i1		! align the src at 16 bytes.
986	mov	0x8, %o0
987	sub	%o2, %o0, %o0
988	sll	%o0, 3, %o0		! %o0 left shift
989	mov	0x40, %o1
990	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
991	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
992	prefetch [%l0+0x0], #one_read
993	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
994					! no data in %l2
995loop2:
996	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
997					! partial
998	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
999							! into %l3 and %l4
1000	prefetch [%l0+0x40], #one_read
1001	stxa	%l3, [%i0+0x0]%asi
1002	stxa	%l4, [%i0+0x8]%asi
1003
1004	ldda	[%i1+0x20]%asi, %l2
1005	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
1006	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
1007	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
1008
1009	! Repeat the same for next 32 bytes.
1010
1011	ldda	[%i1+0x30]%asi, %l4
1012	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
1013	stxa	%l3, [%i0+0x20]%asi
1014	stxa	%l4, [%i0+0x28]%asi
1015
1016	ldda	[%i1+0x40]%asi, %l2
1017	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
1018	stxa	%l5, [%i0+0x30]%asi
1019	stxa	%l2, [%i0+0x38]%asi
1020
1021	add	%l0, 0x40, %l0
1022	add	%i1, 0x40, %i1
1023	subcc	%i3, 0x40, %i3
1024	bgu,pt	%xcc, loop2
1025	add	%i0, 0x40, %i0
1026	ba	.blkdone
1027	add	%i1, %o2, %i1		! increment the source by src offset
1028					! the src offset was stored in %o2
1029
1030
1031	! Both Source and Destination are block aligned.
1032	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
1033.blkcpy:
1034	prefetch [%i1+0x0], #one_read
10351:
1036	ldda	[%i1+0x0]%asi, %l0
1037	ldda	[%i1+0x10]%asi, %l2
1038	prefetch [%i1+0x40], #one_read
1039
1040	stxa	%l0, [%i0+0x0]%asi
1041	ldda	[%i1+0x20]%asi, %l4
1042	ldda	[%i1+0x30]%asi, %l6
1043
1044	stxa	%l1, [%i0+0x8]%asi
1045	stxa	%l2, [%i0+0x10]%asi
1046	stxa	%l3, [%i0+0x18]%asi
1047	stxa	%l4, [%i0+0x20]%asi
1048	stxa	%l5, [%i0+0x28]%asi
1049	stxa	%l6, [%i0+0x30]%asi
1050	stxa	%l7, [%i0+0x38]%asi
1051
1052	add	%i1, 0x40, %i1
1053	subcc	%i3, 0x40, %i3
1054	bgu,pt	%xcc, 1b
1055	add	%i0, 0x40, %i0
1056
1057.blkdone:
1058	membar	#Sync
1059#endif	/* NIAGARA_IMPL */
1060
1061	brz,pt	%i2, .blkexit
1062	nop
1063
1064	! Handle trailing bytes
1065	cmp	%i2, 0x8
1066	blu,pt	%ncc, .residue
1067	nop
1068
1069	! Can we do some 8B ops
1070	or	%i1, %i0, %o2
1071	andcc	%o2, 0x7, %g0
1072	bnz	%ncc, .last4
1073	nop
1074
1075	! Do 8byte ops as long as possible
1076.last8:
1077	ldx	[%i1], %o2
1078	stx	%o2, [%i0]
1079	add	%i1, 0x8, %i1
1080	sub	%i2, 0x8, %i2
1081	cmp	%i2, 0x8
1082	bgu,pt	%ncc, .last8
1083	add	%i0, 0x8, %i0
1084
1085	brz,pt	%i2, .blkexit
1086	nop
1087
1088	ba	.residue
1089	nop
1090
1091.last4:
1092	! Can we do 4B ops
1093	andcc	%o2, 0x3, %g0
1094	bnz	%ncc, .last2
1095	nop
10961:
1097	ld	[%i1], %o2
1098	st	%o2, [%i0]
1099	add	%i1, 0x4, %i1
1100	sub	%i2, 0x4, %i2
1101	cmp	%i2, 0x4
1102	bgu,pt	%ncc, 1b
1103	add	%i0, 0x4, %i0
1104
1105	brz,pt	%i2, .blkexit
1106	nop
1107
1108	ba	.residue
1109	nop
1110
1111.last2:
1112	! Can we do 2B ops
1113	andcc	%o2, 0x1, %g0
1114	bnz	%ncc, .residue
1115	nop
1116
11171:
1118	lduh	[%i1], %o2
1119	stuh	%o2, [%i0]
1120	add	%i1, 0x2, %i1
1121	sub	%i2, 0x2, %i2
1122	cmp	%i2, 0x2
1123	bgu,pt	%ncc, 1b
1124	add	%i0, 0x2, %i0
1125
1126	brz,pt	%i2, .blkexit
1127	nop
1128
1129.residue:
1130	ldub	[%i1], %o2
1131	stb	%o2, [%i0]
1132	inc	%i1
1133	deccc	%i2
1134	bgu,pt	%ncc, .residue
1135	inc	%i0
1136
1137.blkexit:
1138#if !defined(NIAGARA_IMPL)
1139	btst	FPUSED_FLAG, %o5
1140	bz	%icc, 1f
1141	  and	%o5,  COPY_FLAGS, %l1	! Store flags in %l1
1142					! We can't clear the flags from %o5 yet
1143					! If there's an error, .copyerr will
1144					! need them
1145
1146	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1147	wr	%o2, 0, %gsr
1148
1149	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1150	btst	FPRS_FEF, %o3
1151	bz,pt	%icc, 4f
1152	  nop
1153
1154	! restore fpregs from stack
1155	BLD_FP_FROMSTACK(%o2)
1156
1157	ba,pt	%ncc, 2f
1158	  wr	%o3, 0, %fprs		! restore fprs
1159
11604:
1161	FZERO
1162	wr	%o3, 0, %fprs		! restore fprs
1163
11642:
1165	ldn	[THREAD_REG + T_LWP], %o2
1166	brnz,pt	%o2, 1f
1167	  nop
1168
1169	ldsb	[THREAD_REG + T_PREEMPT], %l0
1170	deccc	%l0
1171	bnz,pn	%ncc, 1f
1172	  stb	%l0, [THREAD_REG + T_PREEMPT]
1173
1174	! Check for a kernel preemption request
1175	ldn	[THREAD_REG + T_CPU], %l0
1176	ldub	[%l0 + CPU_KPRUNRUN], %l0
1177	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
1178	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
1179
11801:
1181	btst	BCOPY_FLAG, %l1
1182	bz,pn	%icc, 3f
1183	andncc	%o5, COPY_FLAGS, %o5
1184
1185	! Here via bcopy. Check to see if the handler was NULL.
1186	! If so, just return quietly. Otherwise, reset the
1187	! handler and go home.
1188	bnz,pn	%ncc, 3f
1189	nop
1190
1191	! Null handler.
1192	btst	KPREEMPT_FLAG, %l1
1193	bz,pt	%icc, 2f
1194	  nop
1195	call	kpreempt
1196	  rdpr	%pil, %o0	! pass %pil
11972:
1198
1199	ret
1200	restore	%g0, 0, %o0
1201
1202	! Here via kcopy or bcopy with a handler.
1203	! Reset the fault handler.
12043:
1205	membar	#Sync
1206	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1207
1208	! call kpreempt if necessary
1209	btst	KPREEMPT_FLAG, %l1
1210	bz,pt	%icc, 4f
1211	  nop
1212	call	kpreempt
1213	  rdpr	%pil, %o0
12144:
1215#else	/* NIAGARA_IMPL */
1216	membar	#Sync				! sync error barrier
1217	! Restore t_lofault handler, if came here from kcopy().
1218	tst	%o5
1219	bz	%ncc, 1f
1220	andn	%o5, LOFAULT_SET, %o5
1221	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
12221:
1223#endif	/* NIAGARA_IMPL */
1224	ret
1225	restore	%g0, 0, %o0
1226
1227.bcb_punt:
1228	!
1229	! use aligned transfers where possible
1230	!
1231	xor	%i0, %i1, %o4		! xor from and to address
1232	btst	7, %o4			! if lower three bits zero
1233	bz	.aldoubcp		! can align on double boundary
1234	.empty	! assembler complaints about label
1235
1236	xor	%i0, %i1, %o4		! xor from and to address
1237	btst	3, %o4			! if lower two bits zero
1238	bz	.alwordcp		! can align on word boundary
1239	btst	3, %i0			! delay slot, from address unaligned?
1240	!
1241	! use aligned reads and writes where possible
1242	! this differs from wordcp in that it copes
1243	! with odd alignment between source and destnation
1244	! using word reads and writes with the proper shifts
1245	! in between to align transfers to and from memory
1246	! i0 - src address, i1 - dest address, i2 - count
1247	! i3, i4 - tmps for used generating complete word
1248	! i5 (word to write)
1249	! l0 size in bits of upper part of source word (US)
1250	! l1 size in bits of lower part of source word (LS = 32 - US)
1251	! l2 size in bits of upper part of destination word (UD)
1252	! l3 size in bits of lower part of destination word (LD = 32 - UD)
1253	! l4 number of bytes leftover after aligned transfers complete
1254	! l5 the number 32
1255	!
1256	mov	32, %l5			! load an oft-needed constant
1257	bz	.align_dst_only
1258	btst	3, %i1			! is destnation address aligned?
1259	clr	%i4			! clear registers used in either case
1260	bz	.align_src_only
1261	clr	%l0
1262	!
1263	! both source and destination addresses are unaligned
1264	!
12651:					! align source
1266	ldub	[%i0], %i3		! read a byte from source address
1267	add	%i0, 1, %i0		! increment source address
1268	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1269	btst	3, %i0			! is source aligned?
1270	add	%l0, 8, %l0		! increment size of upper source (US)
1271	bnz,a	1b
1272	sll	%i4, 8, %i4		! make room for next byte
1273
1274	sub	%l5, %l0, %l1		! generate shift left count (LS)
1275	sll	%i4, %l1, %i4		! prepare to get rest
1276	ld	[%i0], %i3		! read a word
1277	add	%i0, 4, %i0		! increment source address
1278	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
1279	or	%i4, %i5, %i5		! merge
1280	mov	24, %l3			! align destination
12811:
1282	srl	%i5, %l3, %i4		! prepare to write a single byte
1283	stb	%i4, [%i1]		! write a byte
1284	add	%i1, 1, %i1		! increment destination address
1285	sub	%i2, 1, %i2		! decrement count
1286	btst	3, %i1			! is destination aligned?
1287	bnz,a	1b
1288	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
1289	sub	%l5, %l3, %l2		! generate shift left count (UD)
1290	sll	%i5, %l2, %i5		! move leftover into upper bytes
1291	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
1292	bgu	%ncc, .more_needed	! need more to fill than we have
1293	nop
1294
1295	sll	%i3, %l1, %i3		! clear upper used byte(s)
1296	srl	%i3, %l1, %i3
1297	! get the odd bytes between alignments
1298	sub	%l0, %l2, %l0		! regenerate shift count
1299	sub	%l5, %l0, %l1		! generate new shift left count (LS)
1300	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1301	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
1302	srl	%i3, %l0, %i4
1303	or	%i5, %i4, %i5
1304	st	%i5, [%i1]		! write a word
1305	subcc	%i2, 4, %i2		! decrement count
1306	bz	%ncc, .unalign_out
1307	add	%i1, 4, %i1		! increment destination address
1308
1309	b	2f
1310	sll	%i3, %l1, %i5		! get leftover into upper bits
1311.more_needed:
1312	sll	%i3, %l0, %i3		! save remaining byte(s)
1313	srl	%i3, %l0, %i3
1314	sub	%l2, %l0, %l1		! regenerate shift count
1315	sub	%l5, %l1, %l0		! generate new shift left count
1316	sll	%i3, %l1, %i4		! move to fill empty space
1317	b	3f
1318	or	%i5, %i4, %i5		! merge to complete word
1319	!
1320	! the source address is aligned and destination is not
1321	!
1322.align_dst_only:
1323	ld	[%i0], %i4		! read a word
1324	add	%i0, 4, %i0		! increment source address
1325	mov	24, %l0			! initial shift alignment count
13261:
1327	srl	%i4, %l0, %i3		! prepare to write a single byte
1328	stb	%i3, [%i1]		! write a byte
1329	add	%i1, 1, %i1		! increment destination address
1330	sub	%i2, 1, %i2		! decrement count
1331	btst	3, %i1			! is destination aligned?
1332	bnz,a	1b
1333	sub	%l0, 8, %l0		! delay slot, decrement shift count
1334.xfer:
1335	sub	%l5, %l0, %l1		! generate shift left count
1336	sll	%i4, %l1, %i5		! get leftover
13373:
1338	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1339	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
13402:
1341	ld	[%i0], %i3		! read a source word
1342	add	%i0, 4, %i0		! increment source address
1343	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
1344	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
1345	st	%i5, [%i1]		! write a destination word
1346	subcc	%i2, 4, %i2		! decrement count
1347	bz	%ncc, .unalign_out	! check if done
1348	add	%i1, 4, %i1		! increment destination address
1349	b	2b			! loop
1350	sll	%i3, %l1, %i5		! get leftover
1351.unalign_out:
1352	tst	%l4			! any bytes leftover?
1353	bz	%ncc, .cpdone
1354	.empty				! allow next instruction in delay slot
13551:
1356	sub	%l0, 8, %l0		! decrement shift
1357	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
1358	stb	%i4, [%i1]		! write a byte
1359	subcc	%l4, 1, %l4		! decrement count
1360	bz	%ncc, .cpdone		! done?
1361	add	%i1, 1, %i1		! increment destination
1362	tst	%l0			! any more previously read bytes
1363	bnz	%ncc, 1b		! we have leftover bytes
1364	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
1365	b	.dbytecp		! let dbytecp do the rest
1366	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1367	!
1368	! the destination address is aligned and the source is not
1369	!
1370.align_src_only:
1371	ldub	[%i0], %i3		! read a byte from source address
1372	add	%i0, 1, %i0		! increment source address
1373	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1374	btst	3, %i0			! is source aligned?
1375	add	%l0, 8, %l0		! increment shift count (US)
1376	bnz,a	.align_src_only
1377	sll	%i4, 8, %i4		! make room for next byte
1378	b,a	.xfer
1379	!
1380	! if from address unaligned for double-word moves,
1381	! move bytes till it is, if count is < 56 it could take
1382	! longer to align the thing than to do the transfer
1383	! in word size chunks right away
1384	!
1385.aldoubcp:
1386	cmp	%i2, 56			! if count < 56, use wordcp, it takes
1387	blu,a	%ncc, .alwordcp		! longer to align doubles than words
1388	mov	3, %o0			! mask for word alignment
1389	call	.alignit		! copy bytes until aligned
1390	mov	7, %o0			! mask for double alignment
1391	!
1392	! source and destination are now double-word aligned
1393	! i3 has aligned count returned by alignit
1394	!
1395	and	%i2, 7, %i2		! unaligned leftover count
1396	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
13975:
1398	ldx	[%i0+%i1], %o4		! read from address
1399	stx	%o4, [%i1]		! write at destination address
1400	subcc	%i3, 8, %i3		! dec count
1401	bgu	%ncc, 5b
1402	add	%i1, 8, %i1		! delay slot, inc to address
1403	cmp	%i2, 4			! see if we can copy a word
1404	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
1405	.empty
1406	!
1407	! for leftover bytes we fall into wordcp, if needed
1408	!
1409.wordcp:
1410	and	%i2, 3, %i2		! unaligned leftover count
14115:
1412	ld	[%i0+%i1], %o4		! read from address
1413	st	%o4, [%i1]		! write at destination address
1414	subcc	%i3, 4, %i3		! dec count
1415	bgu	%ncc, 5b
1416	add	%i1, 4, %i1		! delay slot, inc to address
1417	b,a	.dbytecp
1418
1419	! we come here to align copies on word boundaries
1420.alwordcp:
1421	call	.alignit		! go word-align it
1422	mov	3, %o0			! bits that must be zero to be aligned
1423	b	.wordcp
1424	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1425
1426	!
1427	! byte copy, works with any alignment
1428	!
1429.bytecp:
1430	b	.dbytecp
1431	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
1432
1433	!
1434	! differenced byte copy, works with any alignment
1435	! assumes dest in %i1 and (source - dest) in %i0
1436	!
14371:
1438	stb	%o4, [%i1]		! write to address
1439	inc	%i1			! inc to address
1440.dbytecp:
1441	deccc	%i2			! dec count
1442	bgeu,a	%ncc, 1b		! loop till done
1443	ldub	[%i0+%i1], %o4		! read from address
1444.cpdone:
1445#if !defined(NIAGARA_IMPL)
1446	! FPUSED_FLAG will not have been set in any path leading to
1447	! this point. No need to deal with it.
1448	btst	BCOPY_FLAG, %o5
1449	bz,pn	%icc, 2f
1450	andcc	%o5, BCOPY_FLAG, %o5
1451	! Here via bcopy. Check to see if the handler was NULL.
1452	! If so, just return quietly. Otherwise, reset the
1453	! handler and go home.
1454	bnz,pn	%ncc, 2f
1455	nop
1456	!
1457	! Null handler.
1458	!
1459	ret
1460	restore %g0, 0, %o0
1461	! Here via kcopy or bcopy with a handler.
1462	! Reset the fault handler.
14632:
1464	membar	#Sync
1465	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1466#else	/* NIAGARA_IMPL */
1467	membar	#Sync				! sync error barrier
1468	! Restore t_lofault handler, if came here from kcopy().
1469	tst	%o5
1470	bz	%ncc, 1f
1471	andn	%o5, LOFAULT_SET, %o5
1472	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
14731:
1474#endif	/* NIAGARA_IMPL */
1475	ret
1476	restore %g0, 0, %o0		! return (0)
1477
1478/*
1479 * Common code used to align transfers on word and doubleword
1480 * boudaries.  Aligns source and destination and returns a count
1481 * of aligned bytes to transfer in %i3
1482 */
14831:
1484	inc	%i0			! inc from
1485	stb	%o4, [%i1]		! write a byte
1486	inc	%i1			! inc to
1487	dec	%i2			! dec count
1488.alignit:
1489	btst	%o0, %i0		! %o0 is bit mask to check for alignment
1490	bnz,a	1b
1491	ldub	[%i0], %o4		! read next byte
1492
1493	retl
1494	andn	%i2, %o0, %i3		! return size of aligned bytes
1495	SET_SIZE(bcopy)
1496
1497#endif	/* lint */
1498
1499/*
1500 * Block copy with possibly overlapped operands.
1501 */
1502
1503#if defined(lint)
1504
1505/*ARGSUSED*/
1506void
1507ovbcopy(const void *from, void *to, size_t count)
1508{}
1509
1510#else	/* lint */
1511
1512	ENTRY(ovbcopy)
1513	tst	%o2			! check count
1514	bgu,a	%ncc, 1f		! nothing to do or bad arguments
1515	subcc	%o0, %o1, %o3		! difference of from and to address
1516
1517	retl				! return
1518	nop
15191:
1520	bneg,a	%ncc, 2f
1521	neg	%o3			! if < 0, make it positive
15222:	cmp	%o2, %o3		! cmp size and abs(from - to)
1523	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1524	.empty				!   no overlap
1525	cmp	%o0, %o1		! compare from and to addresses
1526	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1527	nop
1528	!
1529	! Copy forwards.
1530	!
1531.ov_fwd:
1532	ldub	[%o0], %o3		! read from address
1533	inc	%o0			! inc from address
1534	stb	%o3, [%o1]		! write to address
1535	deccc	%o2			! dec count
1536	bgu	%ncc, .ov_fwd		! loop till done
1537	inc	%o1			! inc to address
1538
1539	retl				! return
1540	nop
1541	!
1542	! Copy backwards.
1543	!
1544.ov_bkwd:
1545	deccc	%o2			! dec count
1546	ldub	[%o0 + %o2], %o3	! get byte at end of src
1547	bgu	%ncc, .ov_bkwd		! loop till done
1548	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1549
1550	retl				! return
1551	nop
1552	SET_SIZE(ovbcopy)
1553
1554#endif	/* lint */
1555
1556/*
1557 * hwblkpagecopy()
1558 *
1559 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1560 * has already disabled kernel preemption and has checked
1561 * use_hw_bcopy.
1562 */
1563#ifdef lint
1564/*ARGSUSED*/
1565void
1566hwblkpagecopy(const void *src, void *dst)
1567{ }
1568#else /* lint */
1569	ENTRY(hwblkpagecopy)
1570	save	%sp, -SA(MINFRAME), %sp
1571
1572	! %i0 - source address (arg)
1573	! %i1 - destination address (arg)
1574	! %i2 - length of region (not arg)
1575
1576	set	PAGESIZE, %i2
1577
1578	/*
1579	 * Copying exactly one page and PAGESIZE is in mutliple of 0x80.
1580	 */
1581	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
1582	prefetch [%i0+0x0], #one_read
1583	prefetch [%i0+0x40], #one_read
15841:
1585	prefetch [%i0+0x80], #one_read
1586	prefetch [%i0+0xc0], #one_read
1587	ldda	[%i0+0x0]%asi, %l0
1588	ldda	[%i0+0x10]%asi, %l2
1589	ldda	[%i0+0x20]%asi, %l4
1590	ldda	[%i0+0x30]%asi, %l6
1591	stxa	%l0, [%i1+0x0]%asi
1592	stxa	%l1, [%i1+0x8]%asi
1593	stxa	%l2, [%i1+0x10]%asi
1594	stxa	%l3, [%i1+0x18]%asi
1595	stxa	%l4, [%i1+0x20]%asi
1596	stxa	%l5, [%i1+0x28]%asi
1597	stxa	%l6, [%i1+0x30]%asi
1598	stxa	%l7, [%i1+0x38]%asi
1599	ldda	[%i0+0x40]%asi, %l0
1600	ldda	[%i0+0x50]%asi, %l2
1601	ldda	[%i0+0x60]%asi, %l4
1602	ldda	[%i0+0x70]%asi, %l6
1603	stxa	%l0, [%i1+0x40]%asi
1604	stxa	%l1, [%i1+0x48]%asi
1605	stxa	%l2, [%i1+0x50]%asi
1606	stxa	%l3, [%i1+0x58]%asi
1607	stxa	%l4, [%i1+0x60]%asi
1608	stxa	%l5, [%i1+0x68]%asi
1609	stxa	%l6, [%i1+0x70]%asi
1610	stxa	%l7, [%i1+0x78]%asi
1611
1612	add	%i0, 0x80, %i0
1613	subcc	%i2, 0x80, %i2
1614	bgu,pt	%xcc, 1b
1615	add	%i1, 0x80, %i1
1616
1617	membar #Sync
1618	ret
1619	restore	%g0, 0, %o0
1620	SET_SIZE(hwblkpagecopy)
1621#endif	/* lint */
1622
1623
1624/*
1625 * Transfer data to and from user space -
1626 * Note that these routines can cause faults
1627 * It is assumed that the kernel has nothing at
1628 * less than KERNELBASE in the virtual address space.
1629 *
1630 * Note that copyin(9F) and copyout(9F) are part of the
1631 * DDI/DKI which specifies that they return '-1' on "errors."
1632 *
1633 * Sigh.
1634 *
1635 * So there's two extremely similar routines - xcopyin() and xcopyout()
1636 * which return the errno that we've faithfully computed.  This
1637 * allows other callers (e.g. uiomove(9F)) to work correctly.
1638 * Given that these are used pretty heavily, we expand the calling
1639 * sequences inline for all flavours (rather than making wrappers).
1640 *
1641 * There are also stub routines for xcopyout_little and xcopyin_little,
1642 * which currently are intended to handle requests of <= 16 bytes from
1643 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1644 * is left as an exercise...
1645 */
1646
1647/*
1648 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1649 *
1650 * General theory of operation:
1651 *
1652 * None of the copyops routines grab a window until it's decided that
1653 * we need to do a HW block copy operation. This saves a window
1654 * spill/fill when we're called during socket ops. The typical IO
1655 * path won't cause spill/fill traps.
1656 *
1657 * This code uses a set of 4 limits for the maximum size that will
1658 * be copied given a particular input/output address alignment.
1659 * the default limits are:
1660 *
1661 * single byte aligned - 256 (hw_copy_limit_1)
1662 * two byte aligned - 512 (hw_copy_limit_2)
1663 * four byte aligned - 1024 (hw_copy_limit_4)
1664 * eight byte aligned - 1024 (hw_copy_limit_8)
1665 *
1666 * If the value for a particular limit is zero, the copy will be done
1667 * via the copy loops rather than block store/quad load instructions.
1668 *
1669 * Flow:
1670 *
1671 * If count == zero return zero.
1672 *
1673 * Store the previous lo_fault handler into %g6.
1674 * Place our secondary lofault handler into %g5.
1675 * Place the address of our nowindow fault handler into %o3.
1676 * Place the address of the windowed fault handler into %o4.
1677 * --> We'll use this handler if we end up grabbing a window
1678 * --> before we use block initializing store and quad load ASIs
1679 *
1680 * If count is less than or equal to SMALL_LIMIT (7) we
1681 * always do a byte for byte copy.
1682 *
1683 * If count is > SMALL_LIMIT, we check the alignment of the input
1684 * and output pointers. Based on the alignment we check count
1685 * against a limit based on detected alignment.  If we exceed the
1686 * alignment value we copy via block initializing store and quad
1687 * load instructions.
1688 *
1689 * If we don't exceed one of the limits, we store -count in %o3,
1690 * we store the number of chunks (8, 4, 2 or 1 byte) operated
1691 * on in our basic copy loop in %o2. Following this we branch
1692 * to the appropriate copy loop and copy that many chunks.
1693 * Since we've been adding the chunk size to %o3 each time through
1694 * as well as decrementing %o2, we can tell if any data is
1695 * is left to be copied by examining %o3. If that is zero, we're
1696 * done and can go home. If not, we figure out what the largest
1697 * chunk size left to be copied is and branch to that copy loop
1698 * unless there's only one byte left. We load that as we're
1699 * branching to code that stores it just before we return.
1700 *
1701 * Fault handlers are invoked if we reference memory that has no
1702 * current mapping.  All forms share the same copyio_fault handler.
1703 * This routine handles fixing up the stack and general housecleaning.
1704 * Each copy operation has a simple fault handler that is then called
1705 * to do the work specific to the invidual operation.  The handler
1706 * for copyOP and xcopyOP are found at the end of individual function.
1707 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
1708 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
1709 */
1710
1711/*
1712 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1713 */
1714
1715#if defined(lint)
1716
1717/*ARGSUSED*/
1718int
1719copyout(const void *kaddr, void *uaddr, size_t count)
1720{ return (0); }
1721
1722#else	/* lint */
1723
1724/*
1725 * We save the arguments in the following registers in case of a fault:
1726 * 	kaddr - %g2
1727 * 	uaddr - %g3
1728 * 	count - %g4
1729 */
1730#define	SAVE_SRC	%g2
1731#define	SAVE_DST	%g3
1732#define	SAVE_COUNT	%g4
1733
1734#define	REAL_LOFAULT		%g5
1735#define	SAVED_LOFAULT		%g6
1736
1737/*
1738 * Generic copyio fault handler.  This is the first line of defense when a
1739 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1740 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1741 * This allows us to share common code for all the flavors of the copy
1742 * operations, including the _noerr versions.
1743 *
1744 * Note that this function will restore the original input parameters before
1745 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1746 * member of the t_copyop structure, if needed.
1747 */
1748	ENTRY(copyio_fault)
1749#if !defined(NIAGARA_IMPL)
1750	btst	FPUSED_FLAG, SAVED_LOFAULT
1751	bz	1f
1752	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
1753
1754	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1755	wr	%o2, 0, %gsr		! restore gsr
1756
1757	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1758	btst	FPRS_FEF, %o3
1759	bz	%icc, 4f
1760	  nop
1761
1762	! restore fpregs from stack
1763	BLD_FP_FROMSTACK(%o2)
1764
1765	ba,pt	%ncc, 1f
1766	  wr	%o3, 0, %fprs		! restore fprs
1767
17684:
1769	FZERO				! zero all of the fpregs
1770	wr	%o3, 0, %fprs		! restore fprs
1771
17721:
1773#else	/* NIAGARA_IMPL */
1774	membar	#Sync
1775	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1776#endif	/* NIAGARA_IMPL */
1777
1778	restore
1779
1780	mov	SAVE_SRC, %o0
1781	mov	SAVE_DST, %o1
1782	jmp	REAL_LOFAULT
1783	  mov	SAVE_COUNT, %o2
1784	SET_SIZE(copyio_fault)
1785
1786	ENTRY(copyio_fault_nowindow)
1787	membar	#Sync
1788	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1789
1790	mov	SAVE_SRC, %o0
1791	mov	SAVE_DST, %o1
1792	jmp	REAL_LOFAULT
1793	  mov	SAVE_COUNT, %o2
1794	SET_SIZE(copyio_fault_nowindow)
1795
1796	ENTRY(copyout)
1797	sethi	%hi(.copyout_err), REAL_LOFAULT
1798	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
1799
1800.do_copyout:
1801	!
1802	! Check the length and bail if zero.
1803	!
1804	tst	%o2
1805	bnz,pt	%ncc, 1f
1806	  nop
1807	retl
1808	  clr	%o0
18091:
1810	sethi	%hi(copyio_fault), %o4
1811	or	%o4, %lo(copyio_fault), %o4
1812	sethi	%hi(copyio_fault_nowindow), %o3
1813	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
1814	or	%o3, %lo(copyio_fault_nowindow), %o3
1815	membar	#Sync
1816	stn	%o3, [THREAD_REG + T_LOFAULT]
1817
1818	mov	%o0, SAVE_SRC
1819	mov	%o1, SAVE_DST
1820	mov	%o2, SAVE_COUNT
1821
1822	!
1823	! Check to see if we're more than SMALL_LIMIT (7 bytes).
1824	! Run in leaf mode, using the %o regs as our input regs.
1825	!
1826	subcc	%o2, SMALL_LIMIT, %o3
1827	bgu,a,pt %ncc, .dco_ns
1828	or	%o0, %o1, %o3
1829	!
1830	! What was previously ".small_copyout"
1831	! Do full differenced copy.
1832	!
1833.dcobcp:
1834	sub	%g0, %o2, %o3		! negate count
1835	add	%o0, %o2, %o0		! make %o0 point at the end
1836	add	%o1, %o2, %o1		! make %o1 point at the end
1837	ba,pt	%ncc, .dcocl
1838	ldub	[%o0 + %o3], %o4	! load first byte
1839	!
1840	! %o0 and %o2 point at the end and remain pointing at the end
1841	! of their buffers. We pull things out by adding %o3 (which is
1842	! the negation of the length) to the buffer end which gives us
1843	! the curent location in the buffers. By incrementing %o3 we walk
1844	! through both buffers without having to bump each buffer's
1845	! pointer. A very fast 4 instruction loop.
1846	!
1847	.align 16
1848.dcocl:
1849	stba	%o4, [%o1 + %o3]ASI_USER
1850	inccc	%o3
1851	bl,a,pt	%ncc, .dcocl
1852	ldub	[%o0 + %o3], %o4
1853	!
1854	! We're done. Go home.
1855	!
1856	membar	#Sync
1857	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
1858	retl
1859	clr	%o0
1860	!
1861	! Try aligned copies from here.
1862	!
1863.dco_ns:
1864	! %o0 = kernel addr (to be copied from)
1865	! %o1 = user addr (to be copied to)
1866	! %o2 = length
1867	! %o3 = %o1 | %o2 (used for alignment checking)
1868	! %o4 is alternate lo_fault
1869	! %o5 is original lo_fault
1870	!
1871	! See if we're single byte aligned. If we are, check the
1872	! limit for single byte copies. If we're smaller or equal,
1873	! bounce to the byte for byte copy loop. Otherwise do it in
1874	! HW (if enabled).
1875	!
1876	btst	1, %o3
1877	bz,pt	%icc, .dcoh8
1878	btst	7, %o3
1879	!
1880	! Single byte aligned. Do we do it via HW or via
1881	! byte for byte? Do a quick no memory reference
1882	! check to pick up small copies.
1883	!
1884	sethi	%hi(hw_copy_limit_1), %o3
1885	!
1886	! Big enough that we need to check the HW limit for
1887	! this size copy.
1888	!
1889	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1890	!
1891	! Is HW copy on? If not, do everything byte for byte.
1892	!
1893	tst	%o3
1894	bz,pn	%icc, .dcobcp
1895	subcc	%o3, %o2, %o3
1896	!
1897	! If we're less than or equal to the single byte copy limit,
1898	! bop to the copy loop.
1899	!
1900	bge,pt	%ncc, .dcobcp
1901	nop
1902	!
1903	! We're big enough and copy is on. Do it with HW.
1904	!
1905	ba,pt	%ncc, .big_copyout
1906	nop
1907.dcoh8:
1908	!
1909	! 8 byte aligned?
1910	!
1911	bnz,a	%ncc, .dcoh4
1912	btst	3, %o3
1913	!
1914	! See if we're in the "small range".
1915	! If so, go off and do the copy.
1916	! If not, load the hard limit. %o3 is
1917	! available for reuse.
1918	!
1919	sethi	%hi(hw_copy_limit_8), %o3
1920	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1921	!
1922	! If it's zero, there's no HW bcopy.
1923	! Bop off to the aligned copy.
1924	!
1925	tst	%o3
1926	bz,pn	%icc, .dcos8
1927	subcc	%o3, %o2, %o3
1928	!
1929	! We're negative if our size is larger than hw_copy_limit_8.
1930	!
1931	bge,pt	%ncc, .dcos8
1932	nop
1933	!
1934	! HW assist is on and we're large enough. Do it.
1935	!
1936	ba,pt	%ncc, .big_copyout
1937	nop
1938.dcos8:
1939	!
1940	! Housekeeping for copy loops. Uses same idea as in the byte for
1941	! byte copy loop above.
1942	!
1943	add	%o0, %o2, %o0
1944	add	%o1, %o2, %o1
1945	sub	%g0, %o2, %o3
1946	ba,pt	%ncc, .dodebc
1947	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
1948	!
1949	! 4 byte aligned?
1950	!
1951.dcoh4:
1952	bnz,pn	%ncc, .dcoh2
1953	!
1954	! See if we're in the "small range".
1955	! If so, go off an do the copy.
1956	! If not, load the hard limit. %o3 is
1957	! available for reuse.
1958	!
1959	sethi	%hi(hw_copy_limit_4), %o3
1960	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1961	!
1962	! If it's zero, there's no HW bcopy.
1963	! Bop off to the aligned copy.
1964	!
1965	tst	%o3
1966	bz,pn	%icc, .dcos4
1967	subcc	%o3, %o2, %o3
1968	!
1969	! We're negative if our size is larger than hw_copy_limit_4.
1970	!
1971	bge,pt	%ncc, .dcos4
1972	nop
1973	!
1974	! HW assist is on and we're large enough. Do it.
1975	!
1976	ba,pt	%ncc, .big_copyout
1977	nop
1978.dcos4:
1979	add	%o0, %o2, %o0
1980	add	%o1, %o2, %o1
1981	sub	%g0, %o2, %o3
1982	ba,pt	%ncc, .dodfbc
1983	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
1984	!
1985	! We must be 2 byte aligned. Off we go.
1986	! The check for small copies was done in the
1987	! delay at .dcoh4
1988	!
1989.dcoh2:
1990	ble	%ncc, .dcos2
1991	sethi	%hi(hw_copy_limit_2), %o3
1992	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1993	tst	%o3
1994	bz,pn	%icc, .dcos2
1995	subcc	%o3, %o2, %o3
1996	bge,pt	%ncc, .dcos2
1997	nop
1998	!
1999	! HW is on and we're big enough. Do it.
2000	!
2001	ba,pt	%ncc, .big_copyout
2002	nop
2003.dcos2:
2004	add	%o0, %o2, %o0
2005	add	%o1, %o2, %o1
2006	sub	%g0, %o2, %o3
2007	ba,pt	%ncc, .dodtbc
2008	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
2009.small_copyout:
2010	!
2011	! Why are we doing this AGAIN? There are certain conditions in
2012	! big_copyout that will cause us to forego the HW assisted copies
2013	! and bounce back to a non-HW assisted copy. This dispatches those
2014	! copies. Note that we branch around this in the main line code.
2015	!
2016	! We make no check for limits or HW enablement here. We've
2017	! already been told that we're a poster child so just go off
2018	! and do it.
2019	!
2020	or	%o0, %o1, %o3
2021	btst	1, %o3
2022	bnz	%icc, .dcobcp		! Most likely
2023	btst	7, %o3
2024	bz	%icc, .dcos8
2025	btst	3, %o3
2026	bz	%icc, .dcos4
2027	nop
2028	ba,pt	%ncc, .dcos2
2029	nop
2030	.align 32
2031.dodebc:
2032	ldx	[%o0 + %o3], %o4
2033	deccc	%o2
2034	stxa	%o4, [%o1 + %o3]ASI_USER
2035	bg,pt	%ncc, .dodebc
2036	addcc	%o3, 8, %o3
2037	!
2038	! End of copy loop. Check to see if we're done. Most
2039	! eight byte aligned copies end here.
2040	!
2041	bz,pt	%ncc, .dcofh
2042	nop
2043	!
2044	! Something is left - do it byte for byte.
2045	!
2046	ba,pt	%ncc, .dcocl
2047	ldub	[%o0 + %o3], %o4	! load next byte
2048	!
2049	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2050	!
2051	.align 32
2052.dodfbc:
2053	lduw	[%o0 + %o3], %o4
2054	deccc	%o2
2055	sta	%o4, [%o1 + %o3]ASI_USER
2056	bg,pt	%ncc, .dodfbc
2057	addcc	%o3, 4, %o3
2058	!
2059	! End of copy loop. Check to see if we're done. Most
2060	! four byte aligned copies end here.
2061	!
2062	bz,pt	%ncc, .dcofh
2063	nop
2064	!
2065	! Something is left. Do it byte for byte.
2066	!
2067	ba,pt	%ncc, .dcocl
2068	ldub	[%o0 + %o3], %o4	! load next byte
2069	!
2070	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2071	! copy.
2072	!
2073	.align 32
2074.dodtbc:
2075	lduh	[%o0 + %o3], %o4
2076	deccc	%o2
2077	stha	%o4, [%o1 + %o3]ASI_USER
2078	bg,pt	%ncc, .dodtbc
2079	addcc	%o3, 2, %o3
2080	!
2081	! End of copy loop. Anything left?
2082	!
2083	bz,pt	%ncc, .dcofh
2084	nop
2085	!
2086	! Deal with the last byte
2087	!
2088	ldub	[%o0 + %o3], %o4
2089	stba	%o4, [%o1 + %o3]ASI_USER
2090.dcofh:
2091	membar	#Sync
2092	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2093	retl
2094	clr	%o0
2095
2096.big_copyout:
2097	! We're going to go off and do a block copy.
2098	! Switch fault handlers and grab a window. We
2099	! don't do a membar #Sync since we've done only
2100	! kernel data to this point.
2101	stn	%o4, [THREAD_REG + T_LOFAULT]
2102
2103	! Copy out that reach here are larger than 256 bytes. The
2104	! hw_copy_limit_1 is set to 256. Never set this limit less
2105	! 128 bytes.
2106#if !defined(NIAGARA_IMPL)
2107	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2108
2109	rd	%fprs, %o2			! check for unused fp
2110	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
2111	btst	FPRS_FEF, %o2
2112	bz,a,pt	%icc, .do_block_copyout
2113	wr	%g0, FPRS_FEF, %fprs
2114
2115	! save in-use fpregs on stack
2116	BST_FP_TOSTACK(%o2)
2117#else	/* NIAGARA_IMPL */
2118	save	%sp, -SA(MINFRAME), %sp
2119#endif	/* NIAGARA_IMPL */
2120
2121.do_block_copyout:
2122
2123#if !defined(NIAGARA_IMPL)
2124	rd	%gsr, %o2
2125	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2126	! set the lower bit saved t_lofault to indicate that we need
2127	! clear %fprs register on the way out
2128	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2129#endif	/* NIAGARA_IMPL */
2130
2131	! Swap src/dst since the code below is memcpy code
2132	! and memcpy/bcopy have different calling sequences
2133	mov	%i1, %i5
2134	mov	%i0, %i1
2135	mov	%i5, %i0
2136
2137	! Block (64 bytes) align the destination.
2138	andcc	%i0, 0x3f, %i3		! is dst block aligned
2139	bz	%ncc, copyout_blalign	! dst already block aligned
2140	sub	%i3, 0x40, %i3
2141	neg	%i3			! bytes till dst 64 bytes aligned
2142	sub	%i2, %i3, %i2		! update i2 with new count
2143
2144	! Based on source and destination alignment do
2145	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2146
2147	! Is dst & src 8B aligned
2148	or	%i0, %i1, %o2
2149	andcc	%o2, 0x7, %g0
2150	bz	%ncc, .co_alewdcp
2151	nop
2152
2153	! Is dst & src 4B aligned
2154	andcc	%o2, 0x3, %g0
2155	bz	%ncc, .co_alwdcp
2156	nop
2157
2158	! Is dst & src 2B aligned
2159	andcc	%o2, 0x1, %g0
2160	bz	%ncc, .co_alhlfwdcp
2161	nop
2162
2163	! 1B aligned
21641:	ldub	[%i1], %o2
2165	stba	%o2, [%i0]ASI_USER
2166	inc	%i1
2167	deccc	%i3
2168	bgu,pt	%ncc, 1b
2169	inc	%i0
2170
2171	ba	copyout_blalign
2172	nop
2173
2174	! dst & src 4B aligned
2175.co_alwdcp:
2176	ld	[%i1], %o2
2177	sta	%o2, [%i0]ASI_USER
2178	add	%i1, 0x4, %i1
2179	subcc	%i3, 0x4, %i3
2180	bgu,pt	%ncc, .co_alwdcp
2181	add	%i0, 0x4, %i0
2182
2183	ba	copyout_blalign
2184	nop
2185
2186	! dst & src 2B aligned
2187.co_alhlfwdcp:
2188	lduh	[%i1], %o2
2189	stuha	%o2, [%i0]ASI_USER
2190	add	%i1, 0x2, %i1
2191	subcc	%i3, 0x2, %i3
2192	bgu,pt	%ncc, .co_alhlfwdcp
2193	add	%i0, 0x2, %i0
2194
2195	ba	copyout_blalign
2196	nop
2197
2198	! dst & src 8B aligned
2199.co_alewdcp:
2200	ldx	[%i1], %o2
2201	stxa	%o2, [%i0]ASI_USER
2202	add	%i1, 0x8, %i1
2203	subcc	%i3, 0x8, %i3
2204	bgu,pt	%ncc, .co_alewdcp
2205	add	%i0, 0x8, %i0
2206
2207	! Now Destination is block (64 bytes) aligned
2208copyout_blalign:
2209	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2210	sub	%i2, %i3, %i2		! Residue bytes in %i2
2211
2212	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
2213
2214#if !defined(NIAGARA_IMPL)
2215	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
2216	prefetch [%l0+0x0], #one_read
2217	andcc	%i1, 0x3f, %g0		! is src 64B aligned
2218	bz,pn	%ncc, .co_blkcpy
2219	nop
2220
2221	! handle misaligned source cases
2222	alignaddr %i1, %g0, %g0		! generate %gsr
2223
2224	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
2225					! significant in %l1
2226	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
2227	add	%i1, %i3, %i1
2228
2229	! switch statement to get to right 8 byte block within
2230	! 64 byte block
2231	cmp	 %l2, 0x4
2232	bgeu,a	 co_hlf
2233	cmp	 %l2, 0x6
2234	cmp	 %l2, 0x2
2235	bgeu,a	 co_sqtr
2236	nop
2237	cmp	 %l2, 0x1
2238	be,a	 co_off15
2239	nop
2240	ba	 co_off7
2241	nop
2242co_sqtr:
2243	be,a	 co_off23
2244	nop
2245	ba,a	 co_off31
2246	nop
2247
2248co_hlf:
2249	bgeu,a	 co_fqtr
2250	nop
2251	cmp	 %l2, 0x5
2252	be,a	 co_off47
2253	nop
2254	ba	 co_off39
2255	nop
2256co_fqtr:
2257	be,a	 co_off55
2258	nop
2259
2260	ldd	[%l0+0x38], %d14
2261	prefetch [%l0+0x40], #one_read
2262	prefetch [%l0+0x80], #one_read
22637:
2264	add	%l0, 0x40, %l0
2265	stxa	%g0, [%i0]%asi		! initialize the cache line
2266
2267	ldda	[%l0]ASI_BLK_P, %d16
2268	ALIGN_OFF_56_63
2269	fmovd	%d30, %d14
2270
2271	stda	%d48, [%i0]ASI_BLK_AIUS
2272	subcc	%i3, 0x40, %i3
2273	add	%i0, 0x40, %i0
2274	bgu,pt	%ncc, 7b
2275	prefetch [%l0+0x80], #one_read
2276	ba	.co_blkdone
2277	membar	#Sync
2278
2279co_off7:
2280	ldda	[%l0]ASI_BLK_P, %d0
2281	prefetch [%l0+0x40], #one_read
2282	prefetch [%l0+0x80], #one_read
22830:
2284	add	%l0, 0x40, %l0
2285	stxa	%g0, [%i0]%asi		! initialize the cache line
2286
2287	ldda	[%l0]ASI_BLK_P, %d16
2288	ALIGN_OFF_1_7
2289	fmovd	%d16, %d0
2290	fmovd	%d18, %d2
2291	fmovd	%d20, %d4
2292	fmovd	%d22, %d6
2293	fmovd	%d24, %d8
2294	fmovd	%d26, %d10
2295	fmovd	%d28, %d12
2296	fmovd	%d30, %d14
2297
2298	stda	%d48, [%i0]ASI_BLK_AIUS
2299	subcc	%i3, 0x40, %i3
2300	add	%i0, 0x40, %i0
2301	bgu,pt	%ncc, 0b
2302	prefetch [%l0+0x80], #one_read
2303	ba	.co_blkdone
2304	membar	#Sync
2305
2306co_off15:
2307	ldd	[%l0+0x8], %d2
2308	ldd	[%l0+0x10], %d4
2309	ldd	[%l0+0x18], %d6
2310	ldd	[%l0+0x20], %d8
2311	ldd	[%l0+0x28], %d10
2312	ldd	[%l0+0x30], %d12
2313	ldd	[%l0+0x38], %d14
2314	prefetch [%l0+0x40], #one_read
2315	prefetch [%l0+0x80], #one_read
23161:
2317	add	%l0, 0x40, %l0
2318	stxa	%g0, [%i0]%asi		! initialize the cache line
2319
2320	ldda	[%l0]ASI_BLK_P, %d16
2321	ALIGN_OFF_8_15
2322	fmovd	%d18, %d2
2323	fmovd	%d20, %d4
2324	fmovd	%d22, %d6
2325	fmovd	%d24, %d8
2326	fmovd	%d26, %d10
2327	fmovd	%d28, %d12
2328	fmovd	%d30, %d14
2329
2330	stda	%d48, [%i0]ASI_BLK_AIUS
2331	subcc	%i3, 0x40, %i3
2332	add	%i0, 0x40, %i0
2333	bgu,pt	%ncc, 1b
2334	prefetch [%l0+0x80], #one_read
2335	ba	.co_blkdone
2336	membar	#Sync
2337
2338co_off23:
2339	ldd	[%l0+0x10], %d4
2340	ldd	[%l0+0x18], %d6
2341	ldd	[%l0+0x20], %d8
2342	ldd	[%l0+0x28], %d10
2343	ldd	[%l0+0x30], %d12
2344	ldd	[%l0+0x38], %d14
2345	prefetch [%l0+0x40], #one_read
2346	prefetch [%l0+0x80], #one_read
23472:
2348	add	%l0, 0x40, %l0
2349	stxa	%g0, [%i0]%asi		! initialize the cache line
2350
2351	ldda	[%l0]ASI_BLK_P, %d16
2352	ALIGN_OFF_16_23
2353	fmovd	%d20, %d4
2354	fmovd	%d22, %d6
2355	fmovd	%d24, %d8
2356	fmovd	%d26, %d10
2357	fmovd	%d28, %d12
2358	fmovd	%d30, %d14
2359
2360	stda	%d48, [%i0]ASI_BLK_AIUS
2361	subcc	%i3, 0x40, %i3
2362	add	%i0, 0x40, %i0
2363	bgu,pt	%ncc, 2b
2364	prefetch [%l0+0x80], #one_read
2365	ba	.co_blkdone
2366	membar	#Sync
2367
2368co_off31:
2369	ldd	[%l0+0x18], %d6
2370	ldd	[%l0+0x20], %d8
2371	ldd	[%l0+0x28], %d10
2372	ldd	[%l0+0x30], %d12
2373	ldd	[%l0+0x38], %d14
2374	prefetch [%l0+0x40], #one_read
2375	prefetch [%l0+0x80], #one_read
23763:
2377	add	%l0, 0x40, %l0
2378	stxa	%g0, [%i0]%asi		! initialize the cache line
2379
2380	ldda	[%l0]ASI_BLK_P, %d16
2381	ALIGN_OFF_24_31
2382	fmovd	%d22, %d6
2383	fmovd	%d24, %d8
2384	fmovd	%d26, %d10
2385	fmovd	%d28, %d12
2386	fmovd	%d30, %d14
2387
2388	stda	%d48, [%i0]ASI_BLK_AIUS
2389	subcc	%i3, 0x40, %i3
2390	add	%i0, 0x40, %i0
2391	bgu,pt	%ncc, 3b
2392	prefetch [%l0+0x80], #one_read
2393	ba	.co_blkdone
2394	membar	#Sync
2395
2396co_off39:
2397	ldd	[%l0+0x20], %d8
2398	ldd	[%l0+0x28], %d10
2399	ldd	[%l0+0x30], %d12
2400	ldd	[%l0+0x38], %d14
2401	prefetch [%l0+0x40], #one_read
2402	prefetch [%l0+0x80], #one_read
24034:
2404	add	%l0, 0x40, %l0
2405	stxa	%g0, [%i0]%asi		! initialize the cache line
2406
2407	ldda	[%l0]ASI_BLK_P, %d16
2408	ALIGN_OFF_32_39
2409	fmovd	%d24, %d8
2410	fmovd	%d26, %d10
2411	fmovd	%d28, %d12
2412	fmovd	%d30, %d14
2413
2414	stda	%d48, [%i0]ASI_BLK_AIUS
2415	subcc	%i3, 0x40, %i3
2416	add	%i0, 0x40, %i0
2417	bgu,pt	%ncc, 4b
2418	prefetch [%l0+0x80], #one_read
2419	ba	.co_blkdone
2420	membar	#Sync
2421
2422co_off47:
2423	ldd	[%l0+0x28], %d10
2424	ldd	[%l0+0x30], %d12
2425	ldd	[%l0+0x38], %d14
2426	prefetch [%l0+0x40], #one_read
2427	prefetch [%l0+0x80], #one_read
24285:
2429	add	%l0, 0x40, %l0
2430	stxa	%g0, [%i0]%asi		! initialize the cache line
2431
2432	ldda	[%l0]ASI_BLK_P, %d16
2433	ALIGN_OFF_40_47
2434	fmovd	%d26, %d10
2435	fmovd	%d28, %d12
2436	fmovd	%d30, %d14
2437
2438	stda	%d48, [%i0]ASI_BLK_AIUS
2439	subcc	%i3, 0x40, %i3
2440	add	%i0, 0x40, %i0
2441	bgu,pt	%ncc, 5b
2442	prefetch [%l0+0x80], #one_read
2443	ba	.co_blkdone
2444	membar	#Sync
2445
2446co_off55:
2447	ldd	[%l0+0x30], %d12
2448	ldd	[%l0+0x38], %d14
2449	prefetch [%l0+0x40], #one_read
2450	prefetch [%l0+0x80], #one_read
24516:
2452	add	%l0, 0x40, %l0
2453	stxa	%g0, [%i0]%asi		! initialize the cache line
2454
2455	ldda	[%l0]ASI_BLK_P, %d16
2456	ALIGN_OFF_48_55
2457	fmovd	%d28, %d12
2458	fmovd	%d30, %d14
2459
2460	stda	%d48, [%i0]ASI_BLK_AIUS
2461	subcc	%i3, 0x40, %i3
2462	add	%i0, 0x40, %i0
2463	bgu,pt	%ncc, 6b
2464	prefetch [%l0+0x80], #one_read
2465	ba	.co_blkdone
2466	membar	#Sync
2467
2468.co_blkcpy:
2469	prefetch [%i1+0x40], #one_read
2470	prefetch [%i1+0x80], #one_read
24718:
2472	stxa	%g0, [%i0]%asi		! initialize the cache line
2473	ldda	[%i1]ASI_BLK_P, %d0
2474	stda	%d0, [%i0]ASI_BLK_AIUS
2475
2476	add	%i1, 0x40, %i1
2477	subcc	%i3, 0x40, %i3
2478	add	%i0, 0x40, %i0
2479	bgu,pt	%ncc, 8b
2480	prefetch [%i1+0x80], #one_read
2481	membar	#Sync
2482
2483.co_blkdone:
2484#else	/* NIAGARA_IMPL */
2485	andcc	%i1, 0xf, %o2		! is src quadword aligned
2486	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
2487	nop
2488	cmp	%o2, 0x8
2489	bg	.co_upper_double
2490	nop
2491	bl	.co_lower_double
2492	nop
2493
2494	! Falls through when source offset is equal to 8 i.e.
2495	! source is double word aligned.
2496	! In this case no shift/merge of data is required
2497
2498	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2499	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2500	prefetch [%l0+0x0], #one_read
2501	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2502.co_loop0:
2503	add	%i1, 0x10, %i1
2504	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2505	prefetch [%l0+0x40], #one_read
2506
2507	stxa	%l3, [%i0+0x0]%asi
2508	stxa	%l4, [%i0+0x8]%asi
2509
2510	add	%i1, 0x10, %i1
2511	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2512
2513	stxa	%l5, [%i0+0x10]%asi
2514	stxa	%l2, [%i0+0x18]%asi
2515
2516	add	%i1, 0x10, %i1
2517	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2518
2519	stxa	%l3, [%i0+0x20]%asi
2520	stxa	%l4, [%i0+0x28]%asi
2521
2522	add	%i1, 0x10, %i1
2523	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2524
2525	stxa	%l5, [%i0+0x30]%asi
2526	stxa	%l2, [%i0+0x38]%asi
2527
2528	add	%l0, 0x40, %l0
2529	subcc	%i3, 0x40, %i3
2530	bgu,pt	%xcc, .co_loop0
2531	add	%i0, 0x40, %i0
2532	ba	.co_blkdone
2533	add	%i1, %o2, %i1		! increment the source by src offset
2534					! the src offset was stored in %o2
2535
2536.co_lower_double:
2537
2538	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2539	sll	%o2, 3, %o0		! %o0 left shift
2540	mov	0x40, %o1
2541	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2542	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2543	prefetch [%l0+0x0], #one_read
2544	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2 and %l3 has
2545					! complete data
2546.co_loop1:
2547	add	%i1, 0x10, %i1
2548	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
2549							! for this read.
2550	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2551							! into %l2 and %l3
2552	prefetch [%l0+0x40], #one_read
2553
2554	stxa	%l2, [%i0+0x0]%asi
2555	stxa	%l3, [%i0+0x8]%asi
2556
2557	add	%i1, 0x10, %i1
2558	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2559	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2560							! %l4 from previous read
2561							! into %l4 and %l5
2562	stxa	%l4, [%i0+0x10]%asi
2563	stxa	%l5, [%i0+0x18]%asi
2564
2565	! Repeat the same for next 32 bytes.
2566
2567	add	%i1, 0x10, %i1
2568	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2569	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2570
2571	stxa	%l2, [%i0+0x20]%asi
2572	stxa	%l3, [%i0+0x28]%asi
2573
2574	add	%i1, 0x10, %i1
2575	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2576	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2577
2578	stxa	%l4, [%i0+0x30]%asi
2579	stxa	%l5, [%i0+0x38]%asi
2580
2581	add	%l0, 0x40, %l0
2582	subcc	%i3, 0x40, %i3
2583	bgu,pt	%xcc, .co_loop1
2584	add	%i0, 0x40, %i0
2585	ba	.co_blkdone
2586	add	%i1, %o2, %i1		! increment the source by src offset
2587					! the src offset was stored in %o2
2588
2589.co_upper_double:
2590
2591	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2592	sub	%o2, 0x8, %o0
2593	sll	%o0, 3, %o0		! %o0 left shift
2594	mov	0x40, %o1
2595	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2596	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2597	prefetch [%l0+0x0], #one_read
2598	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
2599							! for this read and
2600							! no data in %l2
2601.co_loop2:
2602	add	%i1, 0x10, %i1
2603	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
2604							! and %l5 has partial
2605	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2606							! into %l3 and %l4
2607	prefetch [%l0+0x40], #one_read
2608
2609	stxa	%l3, [%i0+0x0]%asi
2610	stxa	%l4, [%i0+0x8]%asi
2611
2612	add	%i1, 0x10, %i1
2613	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2614	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2615							! %l5 from previous read
2616							! into %l5 and %l2
2617
2618	stxa	%l5, [%i0+0x10]%asi
2619	stxa	%l2, [%i0+0x18]%asi
2620
2621	! Repeat the same for next 32 bytes.
2622
2623	add	%i1, 0x10, %i1
2624	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2625	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2626
2627	stxa	%l3, [%i0+0x20]%asi
2628	stxa	%l4, [%i0+0x28]%asi
2629
2630	add	%i1, 0x10, %i1
2631	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2632	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2633
2634	stxa	%l5, [%i0+0x30]%asi
2635	stxa	%l2, [%i0+0x38]%asi
2636
2637	add	%l0, 0x40, %l0
2638	subcc	%i3, 0x40, %i3
2639	bgu,pt	%xcc, .co_loop2
2640	add	%i0, 0x40, %i0
2641	ba	.co_blkdone
2642	add	%i1, %o2, %i1		! increment the source by src offset
2643					! the src offset was stored in %o2
2644
2645
2646	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2647.co_blkcpy:
2648
2649	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2650	prefetch [%o0+0x0], #one_read
26511:
2652	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
2653	add	%i1, 0x10, %i1
2654	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2655	add	%i1, 0x10, %i1
2656
2657	prefetch [%o0+0x40], #one_read
2658
2659	stxa	%l0, [%i0+0x0]%asi
2660
2661	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2662	add	%i1, 0x10, %i1
2663	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
2664	add	%i1, 0x10, %i1
2665
2666	stxa	%l1, [%i0+0x8]%asi
2667	stxa	%l2, [%i0+0x10]%asi
2668	stxa	%l3, [%i0+0x18]%asi
2669	stxa	%l4, [%i0+0x20]%asi
2670	stxa	%l5, [%i0+0x28]%asi
2671	stxa	%l6, [%i0+0x30]%asi
2672	stxa	%l7, [%i0+0x38]%asi
2673
2674	add	%o0, 0x40, %o0
2675	subcc	%i3, 0x40, %i3
2676	bgu,pt	%xcc, 1b
2677	add	%i0, 0x40, %i0
2678
2679.co_blkdone:
2680	membar	#Sync
2681#endif	/* NIAGARA_IMPL */
2682
2683	brz,pt	%i2, .copyout_exit
2684	nop
2685
2686	! Handle trailing bytes
2687	cmp	%i2, 0x8
2688	blu,pt	%ncc, .co_residue
2689	nop
2690
2691	! Can we do some 8B ops
2692	or	%i1, %i0, %o2
2693	andcc	%o2, 0x7, %g0
2694	bnz	%ncc, .co_last4
2695	nop
2696
2697	! Do 8byte ops as long as possible
2698.co_last8:
2699	ldx	[%i1], %o2
2700	stxa	%o2, [%i0]ASI_USER
2701	add	%i1, 0x8, %i1
2702	sub	%i2, 0x8, %i2
2703	cmp	%i2, 0x8
2704	bgu,pt	%ncc, .co_last8
2705	add	%i0, 0x8, %i0
2706
2707	brz,pt	%i2, .copyout_exit
2708	nop
2709
2710	ba	.co_residue
2711	nop
2712
2713.co_last4:
2714	! Can we do 4B ops
2715	andcc	%o2, 0x3, %g0
2716	bnz	%ncc, .co_last2
2717	nop
27181:
2719	ld	[%i1], %o2
2720	sta	%o2, [%i0]ASI_USER
2721	add	%i1, 0x4, %i1
2722	sub	%i2, 0x4, %i2
2723	cmp	%i2, 0x4
2724	bgu,pt	%ncc, 1b
2725	add	%i0, 0x4, %i0
2726
2727	brz,pt	%i2, .copyout_exit
2728	nop
2729
2730	ba	.co_residue
2731	nop
2732
2733.co_last2:
2734	! Can we do 2B ops
2735	andcc	%o2, 0x1, %g0
2736	bnz	%ncc, .co_residue
2737	nop
2738
27391:
2740	lduh	[%i1], %o2
2741	stuha	%o2, [%i0]ASI_USER
2742	add	%i1, 0x2, %i1
2743	sub	%i2, 0x2, %i2
2744	cmp	%i2, 0x2
2745	bgu,pt	%ncc, 1b
2746	add	%i0, 0x2, %i0
2747
2748	brz,pt	%i2, .copyout_exit
2749	nop
2750
2751	! Copy the residue as byte copy
2752.co_residue:
2753	ldub	[%i1], %i4
2754	stba	%i4, [%i0]ASI_USER
2755	inc	%i1
2756	deccc	%i2
2757	bgu,pt	%xcc, .co_residue
2758	inc	%i0
2759
2760.copyout_exit:
2761#if !defined(NIAGARA_IMPL)
2762	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2763	wr	%o2, 0, %gsr		! restore gsr
2764
2765	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2766	btst	FPRS_FEF, %o3
2767	bz	%icc, 4f
2768	  nop
2769
2770	! restore fpregs from stack
2771	BLD_FP_FROMSTACK(%o2)
2772
2773	ba,pt	%ncc, 2f
2774	  wr	%o3, 0, %fprs		! restore fprs
2775
27764:
2777	FZERO				! zero all of the fpregs
2778	wr	%o3, 0, %fprs		! restore fprs
2779
27802:
2781	membar	#Sync
2782	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2783#else	/* NIAGARA_IMPL */
2784	membar	#Sync
2785#endif	/* NIAGARA_IMPL */
2786	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2787	ret
2788	restore	%g0, 0, %o0
2789
2790.copyout_err:
2791	ldn	[THREAD_REG + T_COPYOPS], %o4
2792	brz	%o4, 2f
2793	nop
2794	ldn	[%o4 + CP_COPYOUT], %g2
2795	jmp	%g2
2796	nop
27972:
2798	retl
2799	mov	-1, %o0
2800	SET_SIZE(copyout)
2801
2802#endif	/* lint */
2803
2804
2805#ifdef	lint
2806
2807/*ARGSUSED*/
2808int
2809xcopyout(const void *kaddr, void *uaddr, size_t count)
2810{ return (0); }
2811
2812#else	/* lint */
2813
2814	ENTRY(xcopyout)
2815	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2816	b	.do_copyout
2817	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2818.xcopyout_err:
2819	ldn	[THREAD_REG + T_COPYOPS], %o4
2820	brz	%o4, 2f
2821	nop
2822	ldn	[%o4 + CP_XCOPYOUT], %g2
2823	jmp	%g2
2824	nop
28252:
2826	retl
2827	mov	%g1, %o0
2828	SET_SIZE(xcopyout)
2829
2830#endif	/* lint */
2831
2832#ifdef	lint
2833
2834/*ARGSUSED*/
2835int
2836xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2837{ return (0); }
2838
2839#else	/* lint */
2840
2841	ENTRY(xcopyout_little)
2842	sethi	%hi(.little_err), %o4
2843	ldn	[THREAD_REG + T_LOFAULT], %o5
2844	or	%o4, %lo(.little_err), %o4
2845	membar	#Sync			! sync error barrier
2846	stn	%o4, [THREAD_REG + T_LOFAULT]
2847
2848	subcc	%g0, %o2, %o3
2849	add	%o0, %o2, %o0
2850	bz,pn	%ncc, 2f		! check for zero bytes
2851	sub	%o2, 1, %o4
2852	add	%o0, %o4, %o0		! start w/last byte
2853	add	%o1, %o2, %o1
2854	ldub	[%o0+%o3], %o4
2855
28561:	stba	%o4, [%o1+%o3]ASI_AIUSL
2857	inccc	%o3
2858	sub	%o0, 2, %o0		! get next byte
2859	bcc,a,pt %ncc, 1b
2860	  ldub	[%o0+%o3], %o4
2861
28622:	membar	#Sync			! sync error barrier
2863	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2864	retl
2865	mov	%g0, %o0		! return (0)
2866	SET_SIZE(xcopyout_little)
2867
2868#endif	/* lint */
2869
2870/*
2871 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2872 */
2873
2874#if defined(lint)
2875
2876/*ARGSUSED*/
2877int
2878copyin(const void *uaddr, void *kaddr, size_t count)
2879{ return (0); }
2880
2881#else	/* lint */
2882
2883	ENTRY(copyin)
2884	sethi	%hi(.copyin_err), REAL_LOFAULT
2885	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
2886
2887.do_copyin:
2888	!
2889	! Check the length and bail if zero.
2890	!
2891	tst	%o2
2892	bnz,pt	%ncc, 1f
2893	  nop
2894	retl
2895	  clr	%o0
28961:
2897	sethi	%hi(copyio_fault), %o4
2898	or	%o4, %lo(copyio_fault), %o4
2899	sethi	%hi(copyio_fault_nowindow), %o3
2900	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2901	or	%o3, %lo(copyio_fault_nowindow), %o3
2902	membar	#Sync
2903	stn	%o3, [THREAD_REG + T_LOFAULT]
2904
2905	mov	%o0, SAVE_SRC
2906	mov	%o1, SAVE_DST
2907	mov	%o2, SAVE_COUNT
2908
2909	!
2910	! Check to see if we're more than SMALL_LIMIT.
2911	!
2912	subcc	%o2, SMALL_LIMIT, %o3
2913	bgu,a,pt %ncc, .dci_ns
2914	or	%o0, %o1, %o3
2915	!
2916	! What was previously ".small_copyin"
2917	!
2918.dcibcp:
2919	sub	%g0, %o2, %o3		! setup for copy loop
2920	add	%o0, %o2, %o0
2921	add	%o1, %o2, %o1
2922	ba,pt	%ncc, .dcicl
2923	lduba	[%o0 + %o3]ASI_USER, %o4
2924	!
2925	! %o0 and %o1 point at the end and remain pointing at the end
2926	! of their buffers. We pull things out by adding %o3 (which is
2927	! the negation of the length) to the buffer end which gives us
2928	! the curent location in the buffers. By incrementing %o3 we walk
2929	! through both buffers without having to bump each buffer's
2930	! pointer. A very fast 4 instruction loop.
2931	!
2932	.align 16
2933.dcicl:
2934	stb	%o4, [%o1 + %o3]
2935	inccc	%o3
2936	bl,a,pt %ncc, .dcicl
2937	lduba	[%o0 + %o3]ASI_USER, %o4
2938	!
2939	! We're done. Go home.
2940	!
2941	membar	#Sync
2942	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2943	retl
2944	clr	%o0
2945	!
2946	! Try aligned copies from here.
2947	!
2948.dci_ns:
2949	!
2950	! See if we're single byte aligned. If we are, check the
2951	! limit for single byte copies. If we're smaller, or equal,
2952	! bounce to the byte for byte copy loop. Otherwise do it in
2953	! HW (if enabled).
2954	!
2955	btst	1, %o3
2956	bz,a,pt	%icc, .dcih8
2957	btst	7, %o3
2958	!
2959	! We're single byte aligned.
2960	!
2961	sethi	%hi(hw_copy_limit_1), %o3
2962	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2963	!
2964	! Is HW copy on? If not do everything byte for byte.
2965	!
2966	tst	%o3
2967	bz,pn	%icc, .dcibcp
2968	subcc	%o3, %o2, %o3
2969	!
2970	! Are we bigger than the HW limit? If not
2971	! go to byte for byte.
2972	!
2973	bge,pt	%ncc, .dcibcp
2974	nop
2975	!
2976	! We're big enough and copy is on. Do it with HW.
2977	!
2978	ba,pt	%ncc, .big_copyin
2979	nop
2980.dcih8:
2981	!
2982	! 8 byte aligned?
2983	!
2984	bnz,a	%ncc, .dcih4
2985	btst	3, %o3
2986	!
2987	! We're eight byte aligned.
2988	!
2989	sethi	%hi(hw_copy_limit_8), %o3
2990	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2991	!
2992	! Is HW assist on? If not, do it with the aligned copy.
2993	!
2994	tst	%o3
2995	bz,pn	%icc, .dcis8
2996	subcc	%o3, %o2, %o3
2997	bge	%ncc, .dcis8
2998	nop
2999	ba,pt	%ncc, .big_copyin
3000	nop
3001.dcis8:
3002	!
3003	! Housekeeping for copy loops. Uses same idea as in the byte for
3004	! byte copy loop above.
3005	!
3006	add	%o0, %o2, %o0
3007	add	%o1, %o2, %o1
3008	sub	%g0, %o2, %o3
3009	ba,pt	%ncc, .didebc
3010	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
3011	!
3012	! 4 byte aligned?
3013	!
3014.dcih4:
3015	bnz	%ncc, .dcih2
3016	sethi	%hi(hw_copy_limit_4), %o3
3017	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3018	!
3019	! Is HW assist on? If not, do it with the aligned copy.
3020	!
3021	tst	%o3
3022	bz,pn	%icc, .dcis4
3023	subcc	%o3, %o2, %o3
3024	!
3025	! We're negative if our size is less than or equal to hw_copy_limit_4.
3026	!
3027	bge	%ncc, .dcis4
3028	nop
3029	ba,pt	%ncc, .big_copyin
3030	nop
3031.dcis4:
3032	!
3033	! Housekeeping for copy loops. Uses same idea as in the byte
3034	! for byte copy loop above.
3035	!
3036	add	%o0, %o2, %o0
3037	add	%o1, %o2, %o1
3038	sub	%g0, %o2, %o3
3039	ba,pt	%ncc, .didfbc
3040	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
3041.dcih2:
3042	!
3043	! We're two byte aligned. Check for "smallness"
3044	! done in delay at .dcih4
3045	!
3046	bleu,pt	%ncc, .dcis2
3047	sethi	%hi(hw_copy_limit_2), %o3
3048	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3049	!
3050	! Is HW assist on? If not, do it with the aligned copy.
3051	!
3052	tst	%o3
3053	bz,pn	%icc, .dcis2
3054	subcc	%o3, %o2, %o3
3055	!
3056	! Are we larger than the HW limit?
3057	!
3058	bge	%ncc, .dcis2
3059	nop
3060	!
3061	! HW assist is on and we're large enough to use it.
3062	!
3063	ba,pt	%ncc, .big_copyin
3064	nop
3065	!
3066	! Housekeeping for copy loops. Uses same idea as in the byte
3067	! for byte copy loop above.
3068	!
3069.dcis2:
3070	add	%o0, %o2, %o0
3071	add	%o1, %o2, %o1
3072	sub	%g0, %o2, %o3
3073	ba,pt	%ncc, .didtbc
3074	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
3075	!
3076.small_copyin:
3077	!
3078	! Why are we doing this AGAIN? There are certain conditions in
3079	! big copyin that will cause us to forgo the HW assisted copys
3080	! and bounce back to a non-hw assisted copy. This dispatches
3081	! those copies. Note that we branch around this in the main line
3082	! code.
3083	!
3084	! We make no check for limits or HW enablement here. We've
3085	! already been told that we're a poster child so just go off
3086	! and do it.
3087	!
3088	or	%o0, %o1, %o3
3089	btst	1, %o3
3090	bnz	%icc, .dcibcp		! Most likely
3091	btst	7, %o3
3092	bz	%icc, .dcis8
3093	btst	3, %o3
3094	bz	%icc, .dcis4
3095	nop
3096	ba,pt	%ncc, .dcis2
3097	nop
3098	!
3099	! Eight byte aligned copies. A steal from the original .small_copyin
3100	! with modifications. %o2 is number of 8 byte chunks to copy. When
3101	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
3102	! to copy.
3103	!
3104	.align 32
3105.didebc:
3106	ldxa	[%o0 + %o3]ASI_USER, %o4
3107	deccc	%o2
3108	stx	%o4, [%o1 + %o3]
3109	bg,pt	%ncc, .didebc
3110	addcc	%o3, 8, %o3
3111	!
3112	! End of copy loop. Most 8 byte aligned copies end here.
3113	!
3114	bz,pt	%ncc, .dcifh
3115	nop
3116	!
3117	! Something is left. Do it byte for byte.
3118	!
3119	ba,pt	%ncc, .dcicl
3120	lduba	[%o0 + %o3]ASI_USER, %o4
3121	!
3122	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
3123	!
3124	.align 32
3125.didfbc:
3126	lduwa	[%o0 + %o3]ASI_USER, %o4
3127	deccc	%o2
3128	st	%o4, [%o1 + %o3]
3129	bg,pt	%ncc, .didfbc
3130	addcc	%o3, 4, %o3
3131	!
3132	! End of copy loop. Most 4 byte aligned copies end here.
3133	!
3134	bz,pt	%ncc, .dcifh
3135	nop
3136	!
3137	! Something is left. Do it byte for byte.
3138	!
3139	ba,pt	%ncc, .dcicl
3140	lduba	[%o0 + %o3]ASI_USER, %o4
3141	!
3142	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
3143	! copy.
3144	!
3145	.align 32
3146.didtbc:
3147	lduha	[%o0 + %o3]ASI_USER, %o4
3148	deccc	%o2
3149	sth	%o4, [%o1 + %o3]
3150	bg,pt	%ncc, .didtbc
3151	addcc	%o3, 2, %o3
3152	!
3153	! End of copy loop. Most 2 byte aligned copies end here.
3154	!
3155	bz,pt	%ncc, .dcifh
3156	nop
3157	!
3158	! Deal with the last byte
3159	!
3160	lduba	[%o0 + %o3]ASI_USER, %o4
3161	stb	%o4, [%o1 + %o3]
3162.dcifh:
3163	membar	#Sync
3164	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3165	retl
3166	clr	%o0
3167
3168.big_copyin:
3169	! We're going off to do a block copy.
3170	! Switch fault hendlers and grab a window. We
3171	! don't do a membar #Sync since we've done only
3172	! kernel data to this point.
3173	stn	%o4, [THREAD_REG + T_LOFAULT]
3174
3175	! Copy in that reach here are larger than 256 bytes. The
3176	! hw_copy_limit_1 is set to 256. Never set this limit less
3177	! 128 bytes.
3178#if !defined(NIAGARA_IMPL)
3179	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3180
3181	rd	%fprs, %o2			! check for unused fp
3182	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
3183	btst	FPRS_FEF, %o2
3184	bz,a,pt	%icc, .do_blockcopyin
3185	wr	%g0, FPRS_FEF, %fprs
3186
3187	! save in-use fpregs on stack
3188	BST_FP_TOSTACK(%o2)
3189#else	/* NIAGARA_IMPL */
3190	save	%sp, -SA(MINFRAME), %sp
3191#endif	/* NIAGARA_IMPL */
3192
3193.do_blockcopyin:
3194
3195#if !defined(NIAGARA_IMPL)
3196	rd	%gsr, %o2
3197	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
3198	! set the lower bit saved t_lofault to indicate that we need
3199	! clear %fprs register on the way out
3200	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3201#endif	/* NIAGARA_IMPL */
3202
3203	! Swap src/dst since the code below is memcpy code
3204	! and memcpy/bcopy have different calling sequences
3205	mov	%i1, %i5
3206	mov	%i0, %i1
3207	mov	%i5, %i0
3208
3209	! Block (64 bytes) align the destination.
3210	andcc	%i0, 0x3f, %i3		! is dst block aligned
3211	bz	%ncc, copyin_blalign	! dst already block aligned
3212	sub	%i3, 0x40, %i3
3213	neg	%i3			! bytes till dst 64 bytes aligned
3214	sub	%i2, %i3, %i2		! update i2 with new count
3215
3216	! Based on source and destination alignment do
3217	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
3218
3219	! Is dst & src 8B aligned
3220	or	%i0, %i1, %o2
3221	andcc	%o2, 0x7, %g0
3222	bz	%ncc, .ci_alewdcp
3223	nop
3224
3225	! Is dst & src 4B aligned
3226	andcc	%o2, 0x3, %g0
3227	bz	%ncc, .ci_alwdcp
3228	nop
3229
3230	! Is dst & src 2B aligned
3231	andcc	%o2, 0x1, %g0
3232	bz	%ncc, .ci_alhlfwdcp
3233	nop
3234
3235	! 1B aligned
32361:	lduba	[%i1]ASI_USER, %o2
3237	stb	%o2, [%i0]
3238	inc	%i1
3239	deccc	%i3
3240	bgu,pt	%ncc, 1b
3241	inc	%i0
3242
3243	ba	copyin_blalign
3244	nop
3245
3246	! dst & src 4B aligned
3247.ci_alwdcp:
3248	lda	[%i1]ASI_USER, %o2
3249	st	%o2, [%i0]
3250	add	%i1, 0x4, %i1
3251	subcc	%i3, 0x4, %i3
3252	bgu,pt	%ncc, .ci_alwdcp
3253	add	%i0, 0x4, %i0
3254
3255	ba	copyin_blalign
3256	nop
3257
3258	! dst & src 2B aligned
3259.ci_alhlfwdcp:
3260	lduha	[%i1]ASI_USER, %o2
3261	stuh	%o2, [%i0]
3262	add	%i1, 0x2, %i1
3263	subcc	%i3, 0x2, %i3
3264	bgu,pt	%ncc, .ci_alhlfwdcp
3265	add	%i0, 0x2, %i0
3266
3267	ba	copyin_blalign
3268	nop
3269
3270	! dst & src 8B aligned
3271.ci_alewdcp:
3272	ldxa	[%i1]ASI_USER, %o2
3273	stx	%o2, [%i0]
3274	add	%i1, 0x8, %i1
3275	subcc	%i3, 0x8, %i3
3276	bgu,pt	%ncc, .ci_alewdcp
3277	add	%i0, 0x8, %i0
3278
3279copyin_blalign:
3280	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
3281	sub	%i2, %i3, %i2		! Residue bytes in %i2
3282
3283#if !defined(NIAGARA_IMPL)
3284	mov	ASI_USER, %asi
3285
3286	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
3287	prefetch [%l0+0x0], #one_read
3288	andcc	%i1, 0x3f, %g0		! is src 64B aligned
3289	bz,pn	%ncc, .ci_blkcpy
3290	nop
3291
3292	! handle misaligned source cases
3293	alignaddr %i1, %g0, %g0		! generate %gsr
3294
3295	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
3296					! significant in %l1
3297	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
3298	add	%i1, %i3, %i1
3299
3300	! switch statement to get to right 8 byte block within
3301	! 64 byte block
3302	cmp	 %l2, 0x4
3303	bgeu,a	 ci_hlf
3304	cmp	 %l2, 0x6
3305	cmp	 %l2, 0x2
3306	bgeu,a	 ci_sqtr
3307	nop
3308	cmp	 %l2, 0x1
3309	be,a	 ci_off15
3310	nop
3311	ba	 ci_off7
3312	nop
3313ci_sqtr:
3314	be,a	 ci_off23
3315	nop
3316	ba,a	 ci_off31
3317	nop
3318
3319ci_hlf:
3320	bgeu,a	 ci_fqtr
3321	nop
3322	cmp	 %l2, 0x5
3323	be,a	 ci_off47
3324	nop
3325	ba	 ci_off39
3326	nop
3327ci_fqtr:
3328	be,a	 ci_off55
3329	nop
3330
3331	ldda	[%l0+0x38]%asi, %d14
3332	prefetch [%l0+0x40], #one_read
3333	prefetch [%l0+0x80], #one_read
33347:
3335	add	%l0, 0x40, %l0
3336	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3337
3338	ldda	[%l0]ASI_BLK_AIUS, %d16
3339	ALIGN_OFF_56_63
3340	fmovd	%d30, %d14
3341
3342	stda	%d48, [%i0]ASI_BLK_P
3343	subcc	%i3, 0x40, %i3
3344	add	%i0, 0x40, %i0
3345	bgu,pt	%ncc, 7b
3346	prefetch [%l0+0x80], #one_read
3347	ba	.ci_blkdone
3348	membar	#Sync
3349
3350ci_off7:
3351	ldda	[%l0]ASI_BLK_AIUS, %d0
3352	prefetch [%l0+0x40], #one_read
3353	prefetch [%l0+0x80], #one_read
33540:
3355	add	%l0, 0x40, %l0
3356	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3357
3358	ldda	[%l0]ASI_BLK_AIUS, %d16
3359	ALIGN_OFF_1_7
3360	fmovd	%d16, %d0
3361	fmovd	%d18, %d2
3362	fmovd	%d20, %d4
3363	fmovd	%d22, %d6
3364	fmovd	%d24, %d8
3365	fmovd	%d26, %d10
3366	fmovd	%d28, %d12
3367	fmovd	%d30, %d14
3368
3369	stda	%d48, [%i0]ASI_BLK_P
3370	subcc	%i3, 0x40, %i3
3371	add	%i0, 0x40, %i0
3372	bgu,pt	%ncc, 0b
3373	prefetch [%l0+0x80], #one_read
3374	ba	.ci_blkdone
3375	membar	#Sync
3376
3377ci_off15:
3378	ldda	[%l0+0x8]%asi, %d2
3379	ldda	[%l0+0x10]%asi, %d4
3380	ldda	[%l0+0x18]%asi, %d6
3381	ldda	[%l0+0x20]%asi, %d8
3382	ldda	[%l0+0x28]%asi, %d10
3383	ldda	[%l0+0x30]%asi, %d12
3384	ldda	[%l0+0x38]%asi, %d14
3385	prefetch [%l0+0x40], #one_read
3386	prefetch [%l0+0x80], #one_read
33871:
3388	add	%l0, 0x40, %l0
3389	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3390
3391	ldda	[%l0]ASI_BLK_AIUS, %d16
3392	ALIGN_OFF_8_15
3393	fmovd	%d18, %d2
3394	fmovd	%d20, %d4
3395	fmovd	%d22, %d6
3396	fmovd	%d24, %d8
3397	fmovd	%d26, %d10
3398	fmovd	%d28, %d12
3399	fmovd	%d30, %d14
3400
3401	stda	%d48, [%i0]ASI_BLK_P
3402	subcc	%i3, 0x40, %i3
3403	add	%i0, 0x40, %i0
3404	bgu,pt	%ncc, 1b
3405	prefetch [%l0+0x80], #one_read
3406	ba	.ci_blkdone
3407	membar	#Sync
3408
3409ci_off23:
3410	ldda	[%l0+0x10]%asi, %d4
3411	ldda	[%l0+0x18]%asi, %d6
3412	ldda	[%l0+0x20]%asi, %d8
3413	ldda	[%l0+0x28]%asi, %d10
3414	ldda	[%l0+0x30]%asi, %d12
3415	ldda	[%l0+0x38]%asi, %d14
3416	prefetch [%l0+0x40], #one_read
3417	prefetch [%l0+0x80], #one_read
34182:
3419	add	%l0, 0x40, %l0
3420	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3421
3422	ldda	[%l0]ASI_BLK_AIUS, %d16
3423	ALIGN_OFF_16_23
3424	fmovd	%d20, %d4
3425	fmovd	%d22, %d6
3426	fmovd	%d24, %d8
3427	fmovd	%d26, %d10
3428	fmovd	%d28, %d12
3429	fmovd	%d30, %d14
3430
3431	stda	%d48, [%i0]ASI_BLK_P
3432	subcc	%i3, 0x40, %i3
3433	add	%i0, 0x40, %i0
3434	bgu,pt	%ncc, 2b
3435	prefetch [%l0+0x80], #one_read
3436	ba	.ci_blkdone
3437	membar	#Sync
3438
3439ci_off31:
3440	ldda	[%l0+0x18]%asi, %d6
3441	ldda	[%l0+0x20]%asi, %d8
3442	ldda	[%l0+0x28]%asi, %d10
3443	ldda	[%l0+0x30]%asi, %d12
3444	ldda	[%l0+0x38]%asi, %d14
3445	prefetch [%l0+0x40], #one_read
3446	prefetch [%l0+0x80], #one_read
34473:
3448	add	%l0, 0x40, %l0
3449	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3450
3451	ldda	[%l0]ASI_BLK_AIUS, %d16
3452	ALIGN_OFF_24_31
3453	fmovd	%d22, %d6
3454	fmovd	%d24, %d8
3455	fmovd	%d26, %d10
3456	fmovd	%d28, %d12
3457	fmovd	%d30, %d14
3458
3459	stda	%d48, [%i0]ASI_BLK_P
3460	subcc	%i3, 0x40, %i3
3461	add	%i0, 0x40, %i0
3462	bgu,pt	%ncc, 3b
3463	prefetch [%l0+0x80], #one_read
3464	ba	.ci_blkdone
3465	membar	#Sync
3466
3467ci_off39:
3468	ldda	[%l0+0x20]%asi, %d8
3469	ldda	[%l0+0x28]%asi, %d10
3470	ldda	[%l0+0x30]%asi, %d12
3471	ldda	[%l0+0x38]%asi, %d14
3472	prefetch [%l0+0x40], #one_read
3473	prefetch [%l0+0x80], #one_read
34744:
3475	add	%l0, 0x40, %l0
3476	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3477
3478	ldda	[%l0]ASI_BLK_AIUS, %d16
3479	ALIGN_OFF_32_39
3480	fmovd	%d24, %d8
3481	fmovd	%d26, %d10
3482	fmovd	%d28, %d12
3483	fmovd	%d30, %d14
3484
3485	stda	%d48, [%i0]ASI_BLK_P
3486	subcc	%i3, 0x40, %i3
3487	add	%i0, 0x40, %i0
3488	bgu,pt	%ncc, 4b
3489	prefetch [%l0+0x80], #one_read
3490	ba	.ci_blkdone
3491	membar	#Sync
3492
3493ci_off47:
3494	ldda	[%l0+0x28]%asi, %d10
3495	ldda	[%l0+0x30]%asi, %d12
3496	ldda	[%l0+0x38]%asi, %d14
3497	prefetch [%l0+0x40], #one_read
3498	prefetch [%l0+0x80], #one_read
34995:
3500	add	%l0, 0x40, %l0
3501	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3502
3503	ldda	[%l0]ASI_BLK_AIUS, %d16
3504	ALIGN_OFF_40_47
3505	fmovd	%d26, %d10
3506	fmovd	%d28, %d12
3507	fmovd	%d30, %d14
3508
3509	stda	%d48, [%i0]ASI_BLK_P
3510	subcc	%i3, 0x40, %i3
3511	add	%i0, 0x40, %i0
3512	bgu,pt	%ncc, 5b
3513	prefetch [%l0+0x80], #one_read
3514	ba	.ci_blkdone
3515	membar	#Sync
3516
3517ci_off55:
3518	ldda	[%l0+0x30]%asi, %d12
3519	ldda	[%l0+0x38]%asi, %d14
3520	prefetch [%l0+0x40], #one_read
3521	prefetch [%l0+0x80], #one_read
35226:
3523	add	%l0, 0x40, %l0
3524	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3525
3526	ldda	[%l0]ASI_BLK_AIUS, %d16
3527	ALIGN_OFF_48_55
3528	fmovd	%d28, %d12
3529	fmovd	%d30, %d14
3530
3531	stda	%d48, [%i0]ASI_BLK_P
3532	subcc	%i3, 0x40, %i3
3533	add	%i0, 0x40, %i0
3534	bgu,pt	%ncc, 6b
3535	prefetch [%l0+0x80], #one_read
3536	ba	.ci_blkdone
3537	membar	#Sync
3538
3539.ci_blkcpy:
3540	prefetch [%i1+0x40], #one_read
3541	prefetch [%i1+0x80], #one_read
35428:
3543	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3544	ldda	[%i1]ASI_BLK_AIUS, %d0
3545	stda	%d0, [%i0]ASI_BLK_P
3546
3547	add	%i1, 0x40, %i1
3548	subcc	%i3, 0x40, %i3
3549	add	%i0, 0x40, %i0
3550	bgu,pt	%ncc, 8b
3551	prefetch [%i1+0x80], #one_read
3552	membar	#Sync
3553
3554.ci_blkdone:
3555#else	/* NIAGARA_IMPL */
3556	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
3557
3558	andcc	%i1, 0xf, %o2		! is src quadword aligned
3559	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
3560	nop
3561	cmp	%o2, 0x8
3562	bg	.ci_upper_double
3563	nop
3564	bl	.ci_lower_double
3565	nop
3566
3567	! Falls through when source offset is equal to 8 i.e.
3568	! source is double word aligned.
3569	! In this case no shift/merge of data is required
3570
3571	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3572	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3573	prefetch [%l0+0x0], #one_read
3574	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3575.ci_loop0:
3576	add	%i1, 0x10, %i1
3577	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3578
3579	prefetch [%l0+0x40], #one_read
3580
3581	stxa	%l3, [%i0+0x0]%asi
3582	stxa	%l4, [%i0+0x8]%asi
3583
3584	add	%i1, 0x10, %i1
3585	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3586
3587	stxa	%l5, [%i0+0x10]%asi
3588	stxa	%l2, [%i0+0x18]%asi
3589
3590	add	%i1, 0x10, %i1
3591	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3592
3593	stxa	%l3, [%i0+0x20]%asi
3594	stxa	%l4, [%i0+0x28]%asi
3595
3596	add	%i1, 0x10, %i1
3597	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3598
3599	stxa	%l5, [%i0+0x30]%asi
3600	stxa	%l2, [%i0+0x38]%asi
3601
3602	add	%l0, 0x40, %l0
3603	subcc	%i3, 0x40, %i3
3604	bgu,pt	%xcc, .ci_loop0
3605	add	%i0, 0x40, %i0
3606	ba	.ci_blkdone
3607	add	%i1, %o2, %i1		! increment the source by src offset
3608					! the src offset was stored in %o2
3609
3610.ci_lower_double:
3611
3612	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3613	sll	%o2, 3, %o0		! %o0 left shift
3614	mov	0x40, %o1
3615	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
3616	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3617	prefetch [%l0+0x0], #one_read
3618	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
3619							! and %l3 has complete
3620							! data
3621.ci_loop1:
3622	add	%i1, 0x10, %i1
3623	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
3624							! for this read.
3625	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
3626							! into %l2 and %l3
3627
3628	prefetch [%l0+0x40], #one_read
3629
3630	stxa	%l2, [%i0+0x0]%asi
3631	stxa	%l3, [%i0+0x8]%asi
3632
3633	add	%i1, 0x10, %i1
3634	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3635	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
3636							! %l4 from previous read
3637							! into %l4 and %l5
3638	stxa	%l4, [%i0+0x10]%asi
3639	stxa	%l5, [%i0+0x18]%asi
3640
3641	! Repeat the same for next 32 bytes.
3642
3643	add	%i1, 0x10, %i1
3644	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3645	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
3646
3647	stxa	%l2, [%i0+0x20]%asi
3648	stxa	%l3, [%i0+0x28]%asi
3649
3650	add	%i1, 0x10, %i1
3651	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3652	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
3653
3654	stxa	%l4, [%i0+0x30]%asi
3655	stxa	%l5, [%i0+0x38]%asi
3656
3657	add	%l0, 0x40, %l0
3658	subcc	%i3, 0x40, %i3
3659	bgu,pt	%xcc, .ci_loop1
3660	add	%i0, 0x40, %i0
3661	ba	.ci_blkdone
3662	add	%i1, %o2, %i1		! increment the source by src offset
3663					! the src offset was stored in %o2
3664
3665.ci_upper_double:
3666
3667	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3668	sub	%o2, 0x8, %o0
3669	sll	%o0, 3, %o0		! %o0 left shift
3670	mov	0x40, %o1
3671	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
3672	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3673	prefetch [%l0+0x0], #one_read
3674	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
3675							! for this read and
3676							! no data in %l2
3677.ci_loop2:
3678	add	%i1, 0x10, %i1
3679	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
3680							! and %l5 has partial
3681	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
3682							! into %l3 and %l4
3683	prefetch [%l0+0x40], #one_read
3684
3685	stxa	%l3, [%i0+0x0]%asi
3686	stxa	%l4, [%i0+0x8]%asi
3687
3688	add	%i1, 0x10, %i1
3689	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3690	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
3691							! %l5 from previous read
3692							! into %l5 and %l2
3693
3694	stxa	%l5, [%i0+0x10]%asi
3695	stxa	%l2, [%i0+0x18]%asi
3696
3697	! Repeat the same for next 32 bytes.
3698
3699	add	%i1, 0x10, %i1
3700	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3701	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
3702
3703	stxa	%l3, [%i0+0x20]%asi
3704	stxa	%l4, [%i0+0x28]%asi
3705
3706	add	%i1, 0x10, %i1
3707	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3708	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
3709
3710	stxa	%l5, [%i0+0x30]%asi
3711	stxa	%l2, [%i0+0x38]%asi
3712
3713	add	%l0, 0x40, %l0
3714	subcc	%i3, 0x40, %i3
3715	bgu,pt	%xcc, .ci_loop2
3716	add	%i0, 0x40, %i0
3717	ba	.ci_blkdone
3718	add	%i1, %o2, %i1		! increment the source by src offset
3719					! the src offset was stored in %o2
3720
3721
3722	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
3723.ci_blkcpy:
3724
3725	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
3726	prefetch [%o0+0x0], #one_read
37271:
3728	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
3729	add	%i1, 0x10, %i1
3730	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3731	add	%i1, 0x10, %i1
3732
3733	prefetch [%o0+0x40], #one_read
3734
3735	stxa	%l0, [%i0+0x0]%asi
3736
3737	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3738	add	%i1, 0x10, %i1
3739	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
3740	add	%i1, 0x10, %i1
3741
3742	stxa	%l1, [%i0+0x8]%asi
3743	stxa	%l2, [%i0+0x10]%asi
3744	stxa	%l3, [%i0+0x18]%asi
3745	stxa	%l4, [%i0+0x20]%asi
3746	stxa	%l5, [%i0+0x28]%asi
3747	stxa	%l6, [%i0+0x30]%asi
3748	stxa	%l7, [%i0+0x38]%asi
3749
3750	add	%o0, 0x40, %o0
3751	subcc	%i3, 0x40, %i3
3752	bgu,pt	%xcc, 1b
3753	add	%i0, 0x40, %i0
3754
3755.ci_blkdone:
3756	membar	#Sync
3757#endif	/* NIAGARA_IMPL */
3758
3759	brz,pt	%i2, .copyin_exit
3760	nop
3761
3762	! Handle trailing bytes
3763	cmp	%i2, 0x8
3764	blu,pt	%ncc, .ci_residue
3765	nop
3766
3767	! Can we do some 8B ops
3768	or	%i1, %i0, %o2
3769	andcc	%o2, 0x7, %g0
3770	bnz	%ncc, .ci_last4
3771	nop
3772
3773	! Do 8byte ops as long as possible
3774.ci_last8:
3775	ldxa	[%i1]ASI_USER, %o2
3776	stx	%o2, [%i0]
3777	add	%i1, 0x8, %i1
3778	sub	%i2, 0x8, %i2
3779	cmp	%i2, 0x8
3780	bgu,pt	%ncc, .ci_last8
3781	add	%i0, 0x8, %i0
3782
3783	brz,pt	%i2, .copyin_exit
3784	nop
3785
3786	ba	.ci_residue
3787	nop
3788
3789.ci_last4:
3790	! Can we do 4B ops
3791	andcc	%o2, 0x3, %g0
3792	bnz	%ncc, .ci_last2
3793	nop
37941:
3795	lda	[%i1]ASI_USER, %o2
3796	st	%o2, [%i0]
3797	add	%i1, 0x4, %i1
3798	sub	%i2, 0x4, %i2
3799	cmp	%i2, 0x4
3800	bgu,pt	%ncc, 1b
3801	add	%i0, 0x4, %i0
3802
3803	brz,pt	%i2, .copyin_exit
3804	nop
3805
3806	ba	.ci_residue
3807	nop
3808
3809.ci_last2:
3810	! Can we do 2B ops
3811	andcc	%o2, 0x1, %g0
3812	bnz	%ncc, .ci_residue
3813	nop
3814
38151:
3816	lduha	[%i1]ASI_USER, %o2
3817	stuh	%o2, [%i0]
3818	add	%i1, 0x2, %i1
3819	sub	%i2, 0x2, %i2
3820	cmp	%i2, 0x2
3821	bgu,pt	%ncc, 1b
3822	add	%i0, 0x2, %i0
3823
3824	brz,pt	%i2, .copyin_exit
3825	nop
3826
3827	! Copy the residue as byte copy
3828.ci_residue:
3829	lduba	[%i1]ASI_USER, %i4
3830	stb	%i4, [%i0]
3831	inc	%i1
3832	deccc	%i2
3833	bgu,pt	%xcc, .ci_residue
3834	inc	%i0
3835
3836.copyin_exit:
3837#if !defined(NIAGARA_IMPL)
3838	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
3839	wr	%o2, 0, %gsr		! restore gsr
3840
3841	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3842	btst	FPRS_FEF, %o3
3843	bz	%icc, 4f
3844	  nop
3845
3846	! restore fpregs from stack
3847	BLD_FP_FROMSTACK(%o2)
3848
3849	ba,pt	%ncc, 2f
3850	  wr	%o3, 0, %fprs		! restore fprs
3851
38524:
3853	FZERO				! zero all of the fpregs
3854	wr	%o3, 0, %fprs		! restore fprs
3855
38562:
3857	membar	#Sync			! sync error barrier
3858	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3859#else	/* NIAGARA_IMPL */
3860	membar	#Sync
3861#endif	/* NIAGARA_IMPL */
3862	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3863	ret
3864	restore	%g0, 0, %o0
3865.copyin_err:
3866	ldn	[THREAD_REG + T_COPYOPS], %o4
3867	brz	%o4, 2f
3868	nop
3869	ldn	[%o4 + CP_COPYIN], %g2
3870	jmp	%g2
3871	nop
38722:
3873	retl
3874	mov	-1, %o0
3875	SET_SIZE(copyin)
3876
3877#endif	/* lint */
3878
3879#ifdef	lint
3880
3881/*ARGSUSED*/
3882int
3883xcopyin(const void *uaddr, void *kaddr, size_t count)
3884{ return (0); }
3885
3886#else	/* lint */
3887
3888	ENTRY(xcopyin)
3889	sethi	%hi(.xcopyin_err), REAL_LOFAULT
3890	b	.do_copyin
3891	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3892.xcopyin_err:
3893	ldn	[THREAD_REG + T_COPYOPS], %o4
3894	brz	%o4, 2f
3895	nop
3896	ldn	[%o4 + CP_XCOPYIN], %g2
3897	jmp	%g2
3898	nop
38992:
3900	retl
3901	mov	%g1, %o0
3902	SET_SIZE(xcopyin)
3903
3904#endif	/* lint */
3905
3906#ifdef	lint
3907
3908/*ARGSUSED*/
3909int
3910xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3911{ return (0); }
3912
3913#else	/* lint */
3914
3915	ENTRY(xcopyin_little)
3916	sethi	%hi(.little_err), %o4
3917	ldn	[THREAD_REG + T_LOFAULT], %o5
3918	or	%o4, %lo(.little_err), %o4
3919	membar	#Sync				! sync error barrier
3920	stn	%o4, [THREAD_REG + T_LOFAULT]
3921
3922	subcc	%g0, %o2, %o3
3923	add	%o0, %o2, %o0
3924	bz,pn	%ncc, 2f		! check for zero bytes
3925	sub	%o2, 1, %o4
3926	add	%o0, %o4, %o0		! start w/last byte
3927	add	%o1, %o2, %o1
3928	lduba	[%o0+%o3]ASI_AIUSL, %o4
3929
39301:	stb	%o4, [%o1+%o3]
3931	inccc	%o3
3932	sub	%o0, 2, %o0		! get next byte
3933	bcc,a,pt %ncc, 1b
3934	  lduba	[%o0+%o3]ASI_AIUSL, %o4
3935
39362:	membar	#Sync				! sync error barrier
3937	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3938	retl
3939	mov	%g0, %o0		! return (0)
3940
3941.little_err:
3942	membar	#Sync				! sync error barrier
3943	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3944	retl
3945	mov	%g1, %o0
3946	SET_SIZE(xcopyin_little)
3947
3948#endif	/* lint */
3949
3950
3951/*
3952 * Copy a block of storage - must not overlap (from + len <= to).
3953 * No fault handler installed (to be called under on_fault())
3954 */
3955#if defined(lint)
3956
3957/* ARGSUSED */
3958void
3959copyin_noerr(const void *ufrom, void *kto, size_t count)
3960{}
3961
3962#else	/* lint */
3963
3964	ENTRY(copyin_noerr)
3965	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3966	b	.do_copyin
3967	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3968.copyio_noerr:
3969	jmp	SAVED_LOFAULT
3970	  nop
3971	SET_SIZE(copyin_noerr)
3972
3973#endif /* lint */
3974
3975/*
3976 * Copy a block of storage - must not overlap (from + len <= to).
3977 * No fault handler installed (to be called under on_fault())
3978 */
3979
3980#if defined(lint)
3981
3982/* ARGSUSED */
3983void
3984copyout_noerr(const void *kfrom, void *uto, size_t count)
3985{}
3986
3987#else	/* lint */
3988
3989	ENTRY(copyout_noerr)
3990	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3991	b	.do_copyout
3992	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3993	SET_SIZE(copyout_noerr)
3994
3995#endif /* lint */
3996
3997#if defined(lint)
3998
3999int use_hw_bcopy = 1;
4000int use_hw_bzero = 1;
4001uint_t hw_copy_limit_1 = 0x100;
4002uint_t hw_copy_limit_2 = 0x200;
4003uint_t hw_copy_limit_4 = 0x400;
4004uint_t hw_copy_limit_8 = 0x400;
4005
4006#else /* !lint */
4007
4008	.align	4
4009	DGDEF(use_hw_bcopy)
4010	.word	1
4011	DGDEF(use_hw_bzero)
4012	.word	1
4013	DGDEF(hw_copy_limit_1)
4014	.word	0x100
4015	DGDEF(hw_copy_limit_2)
4016	.word	0x200
4017	DGDEF(hw_copy_limit_4)
4018	.word	0x400
4019	DGDEF(hw_copy_limit_8)
4020	.word	0x400
4021
4022	.align	64
4023	.section ".text"
4024#endif /* !lint */
4025
4026/*
4027 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
4028 * longer than 256 bytes in length using Niagara's block stores/quad store.
4029 * If the criteria for using this routine are not met then it calls bzero
4030 * and returns 1.  Otherwise 0 is returned indicating success.
4031 * Caller is responsible for ensuring use_hw_bzero is true and that
4032 * kpreempt_disable() has been called.
4033 */
4034#ifdef lint
4035/*ARGSUSED*/
4036int
4037hwblkclr(void *addr, size_t len)
4038{
4039	return(0);
4040}
4041#else /* lint */
4042	! %i0 - start address
4043	! %i1 - length of region (multiple of 64)
4044
4045	ENTRY(hwblkclr)
4046	save	%sp, -SA(MINFRAME), %sp
4047
4048	! Must be block-aligned
4049	andcc	%i0, 0x3f, %g0
4050	bnz,pn	%ncc, 1f
4051	  nop
4052
4053	! ... and must be 256 bytes or more
4054	cmp	%i1, 0x100
4055	blu,pn	%ncc, 1f
4056	  nop
4057
4058	! ... and length must be a multiple of 64
4059	andcc	%i1, 0x3f, %g0
4060	bz,pn	%ncc, .pz_doblock
4061	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
4062
40631:	! punt, call bzero but notify the caller that bzero was used
4064	mov	%i0, %o0
4065	call	bzero
4066	  mov	%i1, %o1
4067	ret
4068	restore	%g0, 1, %o0	! return (1) - did not use block operations
4069
4070	! Already verified that there are at least 256 bytes to set
4071.pz_doblock:
4072	stxa	%g0, [%i0+0x0]%asi
4073	stxa	%g0, [%i0+0x40]%asi
4074	stxa	%g0, [%i0+0x80]%asi
4075	stxa	%g0, [%i0+0xc0]%asi
4076
4077	stxa	%g0, [%i0+0x8]%asi
4078	stxa	%g0, [%i0+0x10]%asi
4079	stxa	%g0, [%i0+0x18]%asi
4080	stxa	%g0, [%i0+0x20]%asi
4081	stxa	%g0, [%i0+0x28]%asi
4082	stxa	%g0, [%i0+0x30]%asi
4083	stxa	%g0, [%i0+0x38]%asi
4084
4085	stxa	%g0, [%i0+0x48]%asi
4086	stxa	%g0, [%i0+0x50]%asi
4087	stxa	%g0, [%i0+0x58]%asi
4088	stxa	%g0, [%i0+0x60]%asi
4089	stxa	%g0, [%i0+0x68]%asi
4090	stxa	%g0, [%i0+0x70]%asi
4091	stxa	%g0, [%i0+0x78]%asi
4092
4093	stxa	%g0, [%i0+0x88]%asi
4094	stxa	%g0, [%i0+0x90]%asi
4095	stxa	%g0, [%i0+0x98]%asi
4096	stxa	%g0, [%i0+0xa0]%asi
4097	stxa	%g0, [%i0+0xa8]%asi
4098	stxa	%g0, [%i0+0xb0]%asi
4099	stxa	%g0, [%i0+0xb8]%asi
4100
4101	stxa	%g0, [%i0+0xc8]%asi
4102	stxa	%g0, [%i0+0xd0]%asi
4103	stxa	%g0, [%i0+0xd8]%asi
4104	stxa	%g0, [%i0+0xe0]%asi
4105	stxa	%g0, [%i0+0xe8]%asi
4106	stxa	%g0, [%i0+0xf0]%asi
4107	stxa	%g0, [%i0+0xf8]%asi
4108
4109	sub	%i1, 0x100, %i1
4110	cmp	%i1, 0x100
4111	bgu,pt	%ncc, .pz_doblock
4112	add	%i0, 0x100, %i0
4113
41142:
4115	! Check if more than 64 bytes to set
4116	cmp	%i1,0x40
4117	blu	%ncc, .pz_finish
4118	nop
4119
41203:
4121	stxa	%g0, [%i0+0x0]%asi
4122	stxa	%g0, [%i0+0x8]%asi
4123	stxa	%g0, [%i0+0x10]%asi
4124	stxa	%g0, [%i0+0x18]%asi
4125	stxa	%g0, [%i0+0x20]%asi
4126	stxa	%g0, [%i0+0x28]%asi
4127	stxa	%g0, [%i0+0x30]%asi
4128	stxa	%g0, [%i0+0x38]%asi
4129
4130	subcc	%i1, 0x40, %i1
4131	bgu,pt	%ncc, 3b
4132	add	%i0, 0x40, %i0
4133
4134.pz_finish:
4135	membar	#Sync
4136	ret
4137	restore	%g0, 0, %o0		! return (bzero or not)
4138	SET_SIZE(hwblkclr)
4139#endif	/* lint */
4140
4141#ifdef	lint
4142/* Copy 32 bytes of data from src to dst using physical addresses */
4143/*ARGSUSED*/
4144void
4145hw_pa_bcopy32(uint64_t src, uint64_t dst)
4146{}
4147#else	/*!lint */
4148
4149	/*
4150	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
4151	 * using physical addresses.
4152	 */
4153	ENTRY_NP(hw_pa_bcopy32)
4154	rdpr    %pstate, %g1
4155	andn    %g1, PSTATE_IE, %g2
4156	wrpr    %g0, %g2, %pstate
4157
4158	ldxa    [%o0]ASI_MEM, %o2
4159	add     %o0, 8, %o0
4160	ldxa    [%o0]ASI_MEM, %o3
4161	add     %o0, 8, %o0
4162	ldxa    [%o0]ASI_MEM, %o4
4163	add     %o0, 8, %o0
4164	ldxa    [%o0]ASI_MEM, %o5
4165	stxa    %o2, [%o1]ASI_MEM
4166	add     %o1, 8, %o1
4167	stxa    %o3, [%o1]ASI_MEM
4168	add     %o1, 8, %o1
4169	stxa    %o4, [%o1]ASI_MEM
4170	add     %o1, 8, %o1
4171	stxa    %o5, [%o1]ASI_MEM
4172
4173	membar	#Sync
4174	retl
4175	  wrpr    %g0, %g1, %pstate
4176	SET_SIZE(hw_pa_bcopy32)
4177#endif /* lint */
4178
4179/*
4180 * Zero a block of storage.
4181 *
4182 * uzero is used by the kernel to zero a block in user address space.
4183 */
4184
4185/*
4186 * Control flow of the bzero/kzero/uzero routine.
4187 *
4188 *	For fewer than 7 bytes stores, bytes will be zeroed.
4189 *
4190 *	For less than 15 bytes stores, align the address on 4 byte boundary.
4191 *	Then store as many 4-byte chunks, followed by trailing bytes.
4192 *
4193 *	For sizes greater than 15 bytes, align the address on 8 byte boundary.
4194 *	if (count > 128) {
4195 *		store as many 8-bytes chunks to block align the address
4196 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
4197 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
4198 *	}
4199 *	Store as many 8-byte chunks, followed by trailing bytes.
4200 */
4201
4202#if defined(lint)
4203
4204/* ARGSUSED */
4205int
4206kzero(void *addr, size_t count)
4207{ return(0); }
4208
4209/* ARGSUSED */
4210void
4211uzero(void *addr, size_t count)
4212{}
4213
4214#else	/* lint */
4215
4216	ENTRY(uzero)
4217	!
4218	! Set a new lo_fault handler only if we came in with one
4219	! already specified.
4220	!
4221	wr	%g0, ASI_USER, %asi
4222	ldn	[THREAD_REG + T_LOFAULT], %o5
4223	tst	%o5
4224	bz,pt	%ncc, .do_zero
4225	sethi	%hi(.zeroerr), %o2
4226	or	%o2, %lo(.zeroerr), %o2
4227	membar	#Sync
4228	ba,pt	%ncc, .do_zero
4229	stn	%o2, [THREAD_REG + T_LOFAULT]
4230
4231	ENTRY(kzero)
4232	!
4233	! Always set a lo_fault handler
4234	!
4235	wr	%g0, ASI_P, %asi
4236	ldn	[THREAD_REG + T_LOFAULT], %o5
4237	sethi	%hi(.zeroerr), %o2
4238	or	%o5, LOFAULT_SET, %o5
4239	or	%o2, %lo(.zeroerr), %o2
4240	membar	#Sync
4241	ba,pt	%ncc, .do_zero
4242	stn	%o2, [THREAD_REG + T_LOFAULT]
4243
4244/*
4245 * We got here because of a fault during kzero or if
4246 * uzero or bzero was called with t_lofault non-zero.
4247 * Otherwise we've already run screaming from the room.
4248 * Errno value is in %g1. Note that we're here iff
4249 * we did set t_lofault.
4250 */
4251.zeroerr:
4252	!
4253	! Undo asi register setting. Just set it to be the
4254        ! kernel default without checking.
4255	!
4256	wr	%g0, ASI_P, %asi
4257
4258	!
4259	! We did set t_lofault. It may well have been zero coming in.
4260	!
42611:
4262	tst	%o5
4263	membar #Sync
4264	bne,pn	%ncc, 3f
4265	andncc	%o5, LOFAULT_SET, %o5
42662:
4267	!
4268	! Old handler was zero. Just return the error.
4269	!
4270	retl				! return
4271	mov	%g1, %o0		! error code from %g1
42723:
4273	!
4274	! We're here because %o5 was non-zero. It was non-zero
4275	! because either LOFAULT_SET was present, a previous fault
4276	! handler was present or both. In all cases we need to reset
4277	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
4278	! before we either simply return the error or we invoke the
4279	! previously specified handler.
4280	!
4281	be	%ncc, 2b
4282	stn	%o5, [THREAD_REG + T_LOFAULT]
4283	jmp	%o5			! goto real handler
4284	  nop
4285	SET_SIZE(kzero)
4286	SET_SIZE(uzero)
4287
4288#endif	/* lint */
4289
4290/*
4291 * Zero a block of storage.
4292 */
4293
4294#if defined(lint)
4295
4296/* ARGSUSED */
4297void
4298bzero(void *addr, size_t count)
4299{}
4300
4301#else	/* lint */
4302
4303	ENTRY(bzero)
4304	wr	%g0, ASI_P, %asi
4305
4306	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
4307	tst	%o5
4308	bz,pt	%ncc, .do_zero
4309	sethi	%hi(.zeroerr), %o2
4310	or	%o2, %lo(.zeroerr), %o2
4311	membar	#Sync				! sync error barrier
4312	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
4313
4314.do_zero:
4315	cmp	%o1, 7
4316	blu,pn	%ncc, .byteclr
4317	nop
4318
4319	cmp	%o1, 15
4320	blu,pn	%ncc, .wdalign
4321	nop
4322
4323	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
4324	bz,pt	%ncc, .blkalign		! already double aligned
4325	sub	%o3, 8, %o3		! -(bytes till double aligned)
4326	add	%o1, %o3, %o1		! update o1 with new count
4327
43281:
4329	stba	%g0, [%o0]%asi
4330	inccc	%o3
4331	bl,pt	%ncc, 1b
4332	inc	%o0
4333
4334	! Now address is double aligned
4335.blkalign:
4336	cmp	%o1, 0x80		! check if there are 128 bytes to set
4337	blu,pn	%ncc, .bzero_small
4338	mov	%o1, %o3
4339
4340	sethi	%hi(use_hw_bzero), %o2
4341	ld	[%o2 + %lo(use_hw_bzero)], %o2
4342	tst	%o2
4343	bz	%ncc, .bzero_small
4344	mov	%o1, %o3
4345
4346	rd	%asi, %o3
4347	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
4348	cmp	%o3, ASI_P
4349	bne,a	%ncc, .algnblk
4350	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4351
4352.algnblk:
4353	andcc	%o0, 0x3f, %o3		! is block aligned?
4354	bz,pt	%ncc, .bzero_blk
4355	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
4356	add	%o1, %o3, %o1		! o1 is the remainder
4357
4358	! Clear -(%o3) bytes till block aligned
43591:
4360	stxa	%g0, [%o0]%asi
4361	addcc	%o3, 8, %o3
4362	bl,pt	%ncc, 1b
4363	add	%o0, 8, %o0
4364
4365.bzero_blk:
4366	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
4367	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
4368
4369	cmp	%o4, 0x100		! 256 bytes or more
4370	blu,pn	%ncc, 3f
4371	nop
4372
43732:
4374	stxa	%g0, [%o0+0x0]%asi
4375	stxa	%g0, [%o0+0x40]%asi
4376	stxa	%g0, [%o0+0x80]%asi
4377	stxa	%g0, [%o0+0xc0]%asi
4378
4379	stxa	%g0, [%o0+0x8]%asi
4380	stxa	%g0, [%o0+0x10]%asi
4381	stxa	%g0, [%o0+0x18]%asi
4382	stxa	%g0, [%o0+0x20]%asi
4383	stxa	%g0, [%o0+0x28]%asi
4384	stxa	%g0, [%o0+0x30]%asi
4385	stxa	%g0, [%o0+0x38]%asi
4386
4387	stxa	%g0, [%o0+0x48]%asi
4388	stxa	%g0, [%o0+0x50]%asi
4389	stxa	%g0, [%o0+0x58]%asi
4390	stxa	%g0, [%o0+0x60]%asi
4391	stxa	%g0, [%o0+0x68]%asi
4392	stxa	%g0, [%o0+0x70]%asi
4393	stxa	%g0, [%o0+0x78]%asi
4394
4395	stxa	%g0, [%o0+0x88]%asi
4396	stxa	%g0, [%o0+0x90]%asi
4397	stxa	%g0, [%o0+0x98]%asi
4398	stxa	%g0, [%o0+0xa0]%asi
4399	stxa	%g0, [%o0+0xa8]%asi
4400	stxa	%g0, [%o0+0xb0]%asi
4401	stxa	%g0, [%o0+0xb8]%asi
4402
4403	stxa	%g0, [%o0+0xc8]%asi
4404	stxa	%g0, [%o0+0xd0]%asi
4405	stxa	%g0, [%o0+0xd8]%asi
4406	stxa	%g0, [%o0+0xe0]%asi
4407	stxa	%g0, [%o0+0xe8]%asi
4408	stxa	%g0, [%o0+0xf0]%asi
4409	stxa	%g0, [%o0+0xf8]%asi
4410
4411	sub	%o4, 0x100, %o4
4412	cmp	%o4, 0x100
4413	bgu,pt	%ncc, 2b
4414	add	%o0, 0x100, %o0
4415
44163:
4417	! ... check if 64 bytes to set
4418	cmp	%o4, 0x40
4419	blu	%ncc, .bzero_blk_done
4420	nop
4421
44224:
4423	stxa	%g0, [%o0+0x0]%asi
4424	stxa	%g0, [%o0+0x8]%asi
4425	stxa	%g0, [%o0+0x10]%asi
4426	stxa	%g0, [%o0+0x18]%asi
4427	stxa	%g0, [%o0+0x20]%asi
4428	stxa	%g0, [%o0+0x28]%asi
4429	stxa	%g0, [%o0+0x30]%asi
4430	stxa	%g0, [%o0+0x38]%asi
4431
4432	subcc	%o4, 0x40, %o4
4433	bgu,pt	%ncc, 3b
4434	add	%o0, 0x40, %o0
4435
4436.bzero_blk_done:
4437	membar	#Sync
4438	!
4439	! Undo asi register setting.
4440	!
4441	rd	%asi, %o4
4442	wr	%g0, ASI_P, %asi
4443	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
4444	bne,a	%ncc, .bzero_small
4445	wr	%g0, ASI_USER, %asi
4446
4447.bzero_small:
4448	! Set the remaining doubles
4449	subcc	%o3, 8, %o3		! Can we store any doubles?
4450	blu,pn	%ncc, .byteclr
4451	and	%o1, 7, %o1		! calc bytes left after doubles
4452
4453.dbclr:
4454	stxa	%g0, [%o0]%asi		! Clear the doubles
4455	subcc	%o3, 8, %o3
4456	bgeu,pt	%ncc, .dbclr
4457	add	%o0, 8, %o0
4458
4459	ba	.byteclr
4460	nop
4461
4462.wdalign:
4463	andcc	%o0, 3, %o3		! is add aligned on a word boundary
4464	bz,pn	%ncc, .wdclr
4465	andn	%o1, 3, %o3		! create word sized count in %o3
4466
4467	dec	%o1			! decrement count
4468	stba	%g0, [%o0]%asi		! clear a byte
4469	ba	.wdalign
4470	inc	%o0			! next byte
4471
4472.wdclr:
4473	sta	%g0, [%o0]%asi		! 4-byte clearing loop
4474	subcc	%o3, 4, %o3
4475	bnz,pt	%ncc, .wdclr
4476	inc	4, %o0
4477
4478	and	%o1, 3, %o1		! leftover count, if any
4479
4480.byteclr:
4481	! Set the leftover bytes
4482	brz	%o1, .bzero_exit
4483	nop
4484
44857:
4486	deccc	%o1			! byte clearing loop
4487	stba	%g0, [%o0]%asi
4488	bgu,pt	%ncc, 7b
4489	inc	%o0
4490
4491.bzero_exit:
4492	!
4493	! We're just concerned with whether t_lofault was set
4494	! when we came in. We end up here from either kzero()
4495	! or bzero(). kzero() *always* sets a lofault handler.
4496	! It ors LOFAULT_SET into %o5 to indicate it has done
4497	! this even if the value of %o5 is otherwise zero.
4498	! bzero() sets a lofault handler *only* if one was
4499	! previously set. Accordingly we need to examine
4500	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
4501	! before resetting the error handler.
4502	!
4503	tst	%o5
4504	bz	%ncc, 1f
4505	andn	%o5, LOFAULT_SET, %o5
4506	membar	#Sync				! sync error barrier
4507	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
45081:
4509	retl
4510	clr	%o0			! return (0)
4511
4512	SET_SIZE(bzero)
4513#endif	/* lint */
4514