xref: /titanic_50/usr/src/uts/sun4v/cpu/niagara_copy.s (revision e764d406651fd9ae1eb625acae7fb932af107587)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/asm_linkage.h>
31#include <sys/vtrace.h>
32#include <sys/machthread.h>
33#include <sys/clock.h>
34#include <sys/asi.h>
35#include <sys/fsr.h>
36#include <sys/privregs.h>
37#include <sys/machasi.h>
38#include <sys/niagaraasi.h>
39
40#if !defined(lint)
41#include "assym.h"
42#endif	/* lint */
43
44
45/*
46 * Pseudo-code to aid in understanding the control flow of the
47 * bcopy/kcopy routine.
48 *
49 *	! WARNING : <Register usage convention>
50 *	! In kcopy(), %o5 holds the previous error handler and the
51 *	! LOFAULT_SET flag (low bits). %o5 is null in bcopy().
52 *	! %o5 is not available for any other use.
53 *
54 * kcopy():
55 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
56 *	%o5 |= LOFAULT_SET;			! ORed with LOFAULT_SET flag
57 *	curthread->t_lofault = .copyerr;
58 *	Call bcopy();
59 *
60 * bcopy():
61 * 	if (length < 128)
62 * 		goto regular_copy;
63 *
64 * 	if (!use_hw_bcopy)
65 * 		goto regular_copy;
66 *
67 * 	blockcopy;
68 *	restore t_lofault handler if came from kcopy();
69 *
70 *	regular_copy;
71 *	restore t_lofault handler if came from kcopy();
72 *
73 * In lofault handler:
74 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
75 *	return (errno)
76 *
77 */
78
79/*
80 * Less than or equal to this number of bytes we will always copy byte-for-byte
81 */
82#define	SMALL_LIMIT	7
83
84/*
85 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
86 * handler was set
87 */
88#define	LOFAULT_SET 2
89
90/*
91 * This define is to align data for the unaligned source cases.
92 * The data1, data2 and data3 are merged into data1 and data2.
93 * The data3 is preserved for next merge.
 *
 * Effect, in C terms:
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 *	data2 = (data2 << lshift) | (data3 >> rshift);
 * tmp is a scratch register and is overwritten.  The callers visible in
 * this file pass rshift = 64 - lshift.
94 */
95#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
96	sllx	data1, lshift, data1				;\
97	srlx	data2, rshift, tmp				;\
98	or	data1, tmp, data1				;\
99	sllx	data2, lshift, data2				;\
100	srlx	data3, rshift, tmp				;\
101	or	data2, tmp, data2
102/*
103 * This macro is to align the data. Basically it merges
104 * data1 and data2 to form double word.
 *
 * Effect, in C terms:
 *	data1 = (data1 << lshift) | (data2 >> rshift);
 * data2 is left unchanged; tmp is a scratch register and is overwritten.
105 */
106#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
107	sllx	data1, lshift, data1				;\
108	srlx	data2, rshift, tmp				;\
109	or	data1, tmp, data1
110
110
111#if !defined(NIAGARA_IMPL)
112/*
113 * Flags set in the lower bits of the t_lofault address:
114 * FPUSED_FLAG: The FP registers were in use and must be restored
115 * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
116 * COPY_FLAGS: Both of the above
117 *
118 * Other flags:
119 * KPREEMPT_FLAG: kpreempt needs to be called
 *
 * FPUSED_FLAG and BCOPY_FLAG are ORed into the saved handler address
 * held in %o5 and are stripped with COPY_FLAGS before that address is
 * written back to t_lofault.  KPREEMPT_FLAG is not stored in %o5; it is
 * only set in the %l1 copy of the flags on the exit paths.
120 */
121#define	FPUSED_FLAG	1
122#define	BCOPY_FLAG	2
123#define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
124#define	KPREEMPT_FLAG	4
125
/*
 * ALIGN_OFF_<lo>_<hi>: one macro per 8-byte-wide range of source offsets
 * within a 64-byte block.  Each macro issues eight faligndata instructions
 * over nine consecutive double registers and leaves the results in
 * %d48-%d62, which the block-copy loops then store with a single block
 * store.  The first source register moves up by one double per macro:
 * %d0 for ALIGN_OFF_1_7 through %d14 for ALIGN_OFF_56_63.
 */
126#define	ALIGN_OFF_1_7			\
127	faligndata %d0, %d2, %d48	;\
128	faligndata %d2, %d4, %d50	;\
129	faligndata %d4, %d6, %d52	;\
130	faligndata %d6, %d8, %d54	;\
131	faligndata %d8, %d10, %d56	;\
132	faligndata %d10, %d12, %d58	;\
133	faligndata %d12, %d14, %d60	;\
134	faligndata %d14, %d16, %d62
135
136#define	ALIGN_OFF_8_15			\
137	faligndata %d2, %d4, %d48	;\
138	faligndata %d4, %d6, %d50	;\
139	faligndata %d6, %d8, %d52	;\
140	faligndata %d8, %d10, %d54	;\
141	faligndata %d10, %d12, %d56	;\
142	faligndata %d12, %d14, %d58	;\
143	faligndata %d14, %d16, %d60	;\
144	faligndata %d16, %d18, %d62
145
146#define	ALIGN_OFF_16_23			\
147	faligndata %d4, %d6, %d48	;\
148	faligndata %d6, %d8, %d50	;\
149	faligndata %d8, %d10, %d52	;\
150	faligndata %d10, %d12, %d54	;\
151	faligndata %d12, %d14, %d56	;\
152	faligndata %d14, %d16, %d58	;\
153	faligndata %d16, %d18, %d60	;\
154	faligndata %d18, %d20, %d62
155
156#define	ALIGN_OFF_24_31			\
157	faligndata %d6, %d8, %d48	;\
158	faligndata %d8, %d10, %d50	;\
159	faligndata %d10, %d12, %d52	;\
160	faligndata %d12, %d14, %d54	;\
161	faligndata %d14, %d16, %d56	;\
162	faligndata %d16, %d18, %d58	;\
163	faligndata %d18, %d20, %d60	;\
164	faligndata %d20, %d22, %d62
165
166#define	ALIGN_OFF_32_39			\
167	faligndata %d8, %d10, %d48	;\
168	faligndata %d10, %d12, %d50	;\
169	faligndata %d12, %d14, %d52	;\
170	faligndata %d14, %d16, %d54	;\
171	faligndata %d16, %d18, %d56	;\
172	faligndata %d18, %d20, %d58	;\
173	faligndata %d20, %d22, %d60	;\
174	faligndata %d22, %d24, %d62
175
176#define	ALIGN_OFF_40_47			\
177	faligndata %d10, %d12, %d48	;\
178	faligndata %d12, %d14, %d50	;\
179	faligndata %d14, %d16, %d52	;\
180	faligndata %d16, %d18, %d54	;\
181	faligndata %d18, %d20, %d56	;\
182	faligndata %d20, %d22, %d58	;\
183	faligndata %d22, %d24, %d60	;\
184	faligndata %d24, %d26, %d62
185
186#define	ALIGN_OFF_48_55			\
187	faligndata %d12, %d14, %d48	;\
188	faligndata %d14, %d16, %d50	;\
189	faligndata %d16, %d18, %d52	;\
190	faligndata %d18, %d20, %d54	;\
191	faligndata %d20, %d22, %d56	;\
192	faligndata %d22, %d24, %d58	;\
193	faligndata %d24, %d26, %d60	;\
194	faligndata %d26, %d28, %d62
195
196#define	ALIGN_OFF_56_63			\
197	faligndata %d14, %d16, %d48	;\
198	faligndata %d16, %d18, %d50	;\
199	faligndata %d18, %d20, %d52	;\
200	faligndata %d20, %d22, %d54	;\
201	faligndata %d22, %d24, %d56	;\
202	faligndata %d24, %d26, %d58	;\
203	faligndata %d26, %d28, %d60	;\
204	faligndata %d28, %d30, %d62
205
206#define	VIS_BLOCKSIZE		64
207
208/*
209 * Size of stack frame in order to accommodate a 64-byte aligned
210 * floating-point register save area and 2 64-bit temp locations.
211 * All copy functions use three quadrants of fp registers; to assure a
212 * block-aligned three block buffer in which to save we must reserve
213 * four blocks on stack.
214 *
215 *    _______________________________________ <-- %fp + STACK_BIAS
216 *    | We may need to preserve 3 quadrants |
217 *    | of fp regs, but since we do so with |
218 *    | BST/BLD we need room in which to    |
219 *    | align to VIS_BLOCKSIZE bytes.  So   |
220 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
221 *    |-------------------------------------|
222 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
223 *    |-------------------------------------|
224 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
225 *    ---------------------------------------
 *
 * SAVED_FPREGS_ADJUST is the displacement that BST_FP_TOSTACK and
 * BLD_FP_FROMSTACK subtract from %fp + STACK_BIAS before rounding the
 * result down to a VIS_BLOCKSIZE boundary to find the start of the
 * three-block save buffer.
226 */
227#define HWCOPYFRAMESIZE         ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
228#define SAVED_FPREGS_OFFSET     (VIS_BLOCKSIZE * 4)
229#define SAVED_FPREGS_ADJUST     ((VIS_BLOCKSIZE * 3) + 1)
230#define SAVED_FPRS_OFFSET       (SAVED_FPREGS_OFFSET + 8)
231#define SAVED_GSR_OFFSET        (SAVED_FPRS_OFFSET + 8)
232
233/*
234 * In FP copies if we do not have preserved data to restore over
235 * the fp regs we used then we must zero those regs to avoid
236 * exposing portions of the data to later threads (data security).
 *
 * FZERO clears %f0-%f30 and %f48-%f62 as double registers, the same
 * three groups that BST_FP_TOSTACK saves.  %f0 and %f2 are zeroed
 * directly; every other register is written with the sum or product
 * of those two zeros.
237 */
238#define	FZERO				\
239	fzero	%f0			;\
240	fzero	%f2			;\
241	faddd	%f0, %f2, %f4		;\
242	fmuld	%f0, %f2, %f6		;\
243	faddd	%f0, %f2, %f8		;\
244	fmuld	%f0, %f2, %f10		;\
245	faddd	%f0, %f2, %f12		;\
246	fmuld	%f0, %f2, %f14		;\
247	faddd	%f0, %f2, %f16		;\
248	fmuld	%f0, %f2, %f18		;\
249	faddd	%f0, %f2, %f20		;\
250	fmuld	%f0, %f2, %f22		;\
251	faddd	%f0, %f2, %f24		;\
252	fmuld	%f0, %f2, %f26		;\
253	faddd	%f0, %f2, %f28		;\
254	fmuld	%f0, %f2, %f30		;\
255	faddd	%f0, %f2, %f48		;\
256	fmuld	%f0, %f2, %f50		;\
257	faddd	%f0, %f2, %f52		;\
258	fmuld	%f0, %f2, %f54		;\
259	faddd	%f0, %f2, %f56		;\
260	fmuld	%f0, %f2, %f58		;\
261	faddd	%f0, %f2, %f60		;\
262	fmuld	%f0, %f2, %f62
263
264/*
265 * Macros to save and restore fp registers to/from the stack.
266 * Used to save and restore in-use fp registers when we want to use FP.
 *
 * Both macros compute the same buffer address: %fp + STACK_BIAS -
 * SAVED_FPREGS_ADJUST rounded down to a VIS_BLOCKSIZE boundary.  Three
 * consecutive blocks are then transferred with ASI_BLK_P accesses, for
 * the register groups starting at %f0, %f16 and %f48.  tmp1 is a scratch
 * register and is overwritten.  Each macro ends with membar #Sync.
267 */
268#define BST_FP_TOSTACK(tmp1)					\
269	/* membar #Sync	*/					;\
270	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
271	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
272	stda	%f0, [tmp1]ASI_BLK_P				;\
273	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
274	stda	%f16, [tmp1]ASI_BLK_P				;\
275	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
276	stda	%f48, [tmp1]ASI_BLK_P				;\
277	membar	#Sync
278
279#define	BLD_FP_FROMSTACK(tmp1)					\
280	/* membar #Sync - provided at copy completion */	;\
281	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
282	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
283	ldda	[tmp1]ASI_BLK_P, %f0				;\
284	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
285	ldda	[tmp1]ASI_BLK_P, %f16				;\
286	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
287	ldda	[tmp1]ASI_BLK_P, %f48				;\
288	membar	#Sync
289#endif	/* NIAGARA_IMPL */
290
291/*
292 * Copy a block of storage, returning an error code if `from' or
293 * `to' takes a kernel pagefault which cannot be resolved.
294 * Returns errno value on pagefault error, 0 if all ok
 *
 * kcopy() itself only opens a register window, saves the current
 * t_lofault value in %o5, installs .copyerr as the new handler and
 * branches to .do_copy, the copy code it shares with bcopy().
 *
 * .copyerr is entered with the errno value in %g1.  In the
 * !NIAGARA_IMPL build it first undoes whatever the block-copy path set
 * up (FPUSED_FLAG in %o5): %gsr, %fprs and the fp registers are
 * restored (or the fp registers are zeroed), and preemption is
 * re-enabled for a thread without an lwp.  It then puts the previous
 * handler back in t_lofault and either returns %g1 (kcopy) or jumps to
 * the previous handler (bcopy, BCOPY_FLAG set).
 *
 * Register usage in .copyerr (!NIAGARA_IMPL):
 *	%o5	saved t_lofault value with FPUSED_FLAG/BCOPY_FLAG in low bits
 *	%l1	BCOPY_FLAG copied from %o5, plus KPREEMPT_FLAG if needed
 *	%o2, %o3, %l0	scratch
295 */
296
297#if defined(lint)
298
299/* ARGSUSED */
300int
301kcopy(const void *from, void *to, size_t count)
302{ return(0); }
303
304#else	/* lint */
305
306	.seg	".text"
307	.align	4
308
309	ENTRY(kcopy)
310
311#if !defined(NIAGARA_IMPL)
312	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
313	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
314	or	%l7, %lo(.copyerr), %l7
315	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
316	! Note that we carefully do *not* flag the setting of
317	! t_lofault.
318	membar	#Sync				! sync error barrier
319	b	.do_copy			! common code
320	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
321
322/*
323 * We got here because of a fault during kcopy or bcopy if a fault
324 * handler existed when bcopy was called.
325 * Errno value is in %g1.
326 */
327.copyerr:
	! A fault taken while the fp state is being restored below goes
	! to .copyerr2, which panics.
328	sethi	%hi(.copyerr2), %l1
329	or	%l1, %lo(.copyerr2), %l1
330	membar	#Sync				! sync error barrier
331	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
332	btst	FPUSED_FLAG, %o5
333	bz,pt	%xcc, 1f
334	and	%o5, BCOPY_FLAG, %l1	! copy flag to %l1
335
336	membar	#Sync				! sync error barrier
337	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
338	wr	%o2, 0, %gsr
339
340	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
341	btst	FPRS_FEF, %o3
342	bz,pt	%icc, 4f
343	  nop
344
345	! restore fpregs from stack
346	BLD_FP_FROMSTACK(%o2)
347
348	ba,pt	%ncc, 2f
349	  wr	%o3, 0, %fprs		! restore fprs
350
3514:
352	FZERO
353	wr	%o3, 0, %fprs		! restore fprs
354
3552:
	! Only a thread without an lwp had t_preempt raised on the way in.
356	ldn	[THREAD_REG + T_LWP], %o2
357	brnz,pt	%o2, 1f
358	  nop
359
	! kpreempt_enable(): the block-copy path incremented t_preempt
	! exactly once, so it is decremented exactly once here.  The
	! decremented value is stored in the delay slot on both paths; if
	! it is still non-zero the kpreempt check is skipped.
360	ldsb	[THREAD_REG + T_PREEMPT], %l0
361	deccc	%l0
362	bnz,pn	%ncc, 1f
363	  stb	%l0, [THREAD_REG + T_PREEMPT]
364
369	! Check for a kernel preemption request
370	ldn	[THREAD_REG + T_CPU], %l0
371	ldub	[%l0 + CPU_KPRUNRUN], %l0
372	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
373	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
374
375	! The kcopy will always set a t_lofault handler. If it fires,
376	! we're expected to just return the error code and not to
377	! invoke any existing error handler. As far as bcopy is concerned,
378	! we only set t_lofault if there was an existing lofault handler.
379	! In that case we're expected to invoke the previously existing
380	! handler after resetting the t_lofault value.
3811:
382	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
383	membar	#Sync				! sync error barrier
384	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
385
386	! call kpreempt if necessary
387	btst	KPREEMPT_FLAG, %l1
388	bz,pt	%icc, 2f
389	  nop
390	call	kpreempt
391	  rdpr	%pil, %o0	! pass %pil
3922:
393	btst	BCOPY_FLAG, %l1
394	bnz,pn	%ncc, 3f
395	nop
	! Here via kcopy: hand the errno value in %g1 back to the caller.
396	ret
397	restore	%g1, 0, %o0
398
3993:
400	! We're here via bcopy. There must have been an error handler
401	! in place otherwise we would have died a nasty death already.
402	jmp	%o5				! goto real handler
403	restore	%g0, 0, %o0			! dispose of copy window
404
405/*
406 * We got here because of a fault in .copyerr.  We can't safely restore fp
407 * state, so we panic.
408 */
409fp_panic_msg:
410	.asciz	"Unable to restore fp state after copy operation"
411
412	.align	4
413.copyerr2:
414	set	fp_panic_msg, %o0
415	call	panic
416	  nop
417#else	/* NIAGARA_IMPL */
418	save	%sp, -SA(MINFRAME), %sp
419	set	.copyerr, %l7			! copyerr is lofault value
420	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
421	or	%o5, LOFAULT_SET, %o5
422	membar	#Sync				! sync error barrier
423	b	.do_copy			! common code
424	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
425
426/*
427 * We got here because of a fault during kcopy.
428 * Errno value is in %g1.
429 */
430.copyerr:
431	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
432	! into %o5 to indicate it has set t_lofault handler. Need to clear
433	! LOFAULT_SET flag before restoring the error handler.
434	andn	%o5, LOFAULT_SET, %o5
435	membar	#Sync				! sync error barrier
436	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
437	ret
438	restore	%g1, 0, %o0
439#endif	/* NIAGARA_IMPL */
440
441	SET_SIZE(kcopy)
442#endif	/* lint */
443
444
445/*
446 * Copy a block of storage - must not overlap (from + len <= to).
447 */
448#if defined(lint)
449
450/* ARGSUSED */
451void
452bcopy(const void *from, void *to, size_t count)
453{}
454
455#else	/* lint */
456
457	ENTRY(bcopy)
458
459#if !defined(NIAGARA_IMPL)
460	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
461	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
462	brz,pt	%o5, .do_copy
463	  nop
464	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
465	or	%l7, %lo(.copyerr), %l7
466	membar	#Sync				! sync error barrier
467	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
468	! We've already captured whether t_lofault was zero on entry.
469	! We need to mark ourselves as being from bcopy since both
470	! kcopy and bcopy use the same code path. If BCOPY_FLAG is
471	! set and the saved lofault was zero, we won't reset lofault on
472	! returning.
473	or	%o5, BCOPY_FLAG, %o5
474#else	/* NIAGARA_IMPL */
475	save	%sp, -SA(MINFRAME), %sp
476	clr	%o5			! flag LOFAULT_SET is not set for bcopy
477#endif	/* NIAGARA_IMPL */
478
479.do_copy:
480	cmp	%i2, 12			! for small counts
481	blu	%ncc, .bytecp		! just copy bytes
482	  .empty
483
484	cmp	%i2, 128		! for less than 128 bytes
485	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
486	  nop
487
488	set	use_hw_bcopy, %o2
489	ld	[%o2], %o2
490	brz,pn	%o2, .bcb_punt
491	  nop
492
493	subcc	%i1, %i0, %i3
494	bneg,a,pn %ncc, 1f
495	neg	%i3
4961:
497	/*
498	 * Compare against 256 since we should be checking block addresses
499	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
500	 * src = dest + (64 * 3) + 63.
501	 */
502	cmp	%i3, 256
503	blu,pn	%ncc, .bcb_punt
504	  nop
505
506	/*
507	 * Copies that reach here have at least 2 blocks of data to copy.
508	 */
509#if !defined(NIAGARA_IMPL)
510	ldn	[THREAD_REG + T_LWP], %o3
511	brnz,pt	%o3, 1f
512	  nop
513
514	! kpreempt_disable();
515	ldsb	[THREAD_REG + T_PREEMPT], %o2
516	inc	%o2
517	stb	%o2, [THREAD_REG + T_PREEMPT]
518
5191:
520	rd	%fprs, %o2              ! check for unused fp
521	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
522	btst	FPRS_FEF, %o2
523	bz,a,pt	%icc, .do_blockcopy
524	wr	%g0, FPRS_FEF, %fprs
525
526	! save in-use fpregs on stack
527	BST_FP_TOSTACK(%o2)
528#endif	/* NIAGARA_IMPL */
529
530.do_blockcopy:
531
532#if !defined(NIAGARA_IMPL)
533	rd	%gsr, %o2
534	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]      ! save gsr
535	or	%o5, FPUSED_FLAG, %o5		! fp regs are in use
536#endif	/* NIAGARA_IMPL */
537
538	! Swap src/dst since the code below is memcpy code
539	! and memcpy/bcopy have different calling sequences
540	mov	%i1, %i5
541	mov	%i0, %i1
542	mov	%i5, %i0
543
544	! Block (64 bytes) align the destination.
545	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64 bytes
546	bz	%xcc, .chksrc		! dst is already double aligned
547	sub	%i3, 0x40, %i3
548	neg	%i3			! bytes till dst 64 bytes aligned
549	sub	%i2, %i3, %i2		! update i2 with new count
550
551	! Based on source and destination alignment do
552	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
553
554	! Is dst & src 8B aligned
555	or	%i0, %i1, %o2
556	andcc	%o2, 0x7, %g0
557	bz	%ncc, .alewdcp
558	nop
559
560	! Is dst & src 4B aligned
561	andcc	%o2, 0x3, %g0
562	bz	%ncc, .alwdcp
563	nop
564
565	! Is dst & src 2B aligned
566	andcc	%o2, 0x1, %g0
567	bz	%ncc, .alhlfwdcp
568	nop
569
570	! 1B aligned
5711:	ldub	[%i1], %o2
572	stb	%o2, [%i0]
573	inc	%i1
574	deccc	%i3
575	bgu,pt	%ncc, 1b
576	inc	%i0
577
578	ba	.chksrc
579	nop
580
581	! dst & src 4B aligned
582.alwdcp:
583	ld	[%i1], %o2
584	st	%o2, [%i0]
585	add	%i1, 0x4, %i1
586	subcc	%i3, 0x4, %i3
587	bgu,pt	%ncc, .alwdcp
588	add	%i0, 0x4, %i0
589
590	ba	.chksrc
591	nop
592
593	! dst & src 2B aligned
594.alhlfwdcp:
595	lduh	[%i1], %o2
596	stuh	%o2, [%i0]
597	add	%i1, 0x2, %i1
598	subcc	%i3, 0x2, %i3
599	bgu,pt	%ncc, .alhlfwdcp
600	add	%i0, 0x2, %i0
601
602	ba	.chksrc
603	nop
604
605	! dst & src 8B aligned
606.alewdcp:
607	ldx	[%i1], %o2
608	stx	%o2, [%i0]
609	add	%i1, 0x8, %i1
610	subcc	%i3, 0x8, %i3
611	bgu,pt	%ncc, .alewdcp
612	add	%i0, 0x8, %i0
613
614	! Now Destination is block (64 bytes) aligned
615.chksrc:
616	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
617	sub	%i2, %i3, %i2		! Residue bytes in %i2
618
619	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
620
621#if !defined(NIAGARA_IMPL)
622	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
623	prefetch [%l0+0x0], #one_read
624	andcc	%i1, 0x3f, %g0		! is src 64B aligned
625	bz,pn	%ncc, .blkcpy
626	nop
627
628	! handle misaligned source cases
629	alignaddr %i1, %g0, %g0		! generate %gsr
630
631	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
632					! significant in %l1
633	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
634	add	%i1, %i3, %i1
635
636	! switch statement to get to right 8 byte block within
637	! 64 byte block
638	cmp	 %l2, 0x4
639	bgeu,a	 hlf
640	cmp	 %l2, 0x6
641	cmp	 %l2, 0x2
642	bgeu,a	 sqtr
643	nop
644	cmp	 %l2, 0x1
645	be,a	 off15
646	nop
647	ba	 off7
648	nop
649sqtr:
650	be,a	 off23
651	nop
652	ba,a	 off31
653	nop
654
655hlf:
656	bgeu,a	 fqtr
657	nop
658	cmp	 %l2, 0x5
659	be,a	 off47
660	nop
661	ba	 off39
662	nop
663fqtr:
664	be,a	 off55
665	nop
666
667	! Falls through when the source offset is greater than 56
668	ldd	[%l0+0x38], %d14
669	prefetch [%l0+0x40], #one_read
670	prefetch [%l0+0x80], #one_read
6717:
672	add	%l0, 0x40, %l0
673	stxa	%g0, [%i0]%asi		! initialize the cache line
674
675	ldda	[%l0]ASI_BLK_P, %d16
676	ALIGN_OFF_56_63
677	fmovd	%d30, %d14
678
679	stda	%d48, [%i0]ASI_BLK_P
680	subcc	%i3, 0x40, %i3
681	add	%i0, 0x40, %i0
682	bgu,pt	%ncc, 7b
683	prefetch [%l0+0x80], #one_read
684	ba	.blkdone
685	membar	#Sync
686
687	! This copy case for source offset between 1 and 7
688off7:
689	ldda	[%l0]ASI_BLK_P, %d0
690	prefetch [%l0+0x40], #one_read
691	prefetch [%l0+0x80], #one_read
6920:
693	add	%l0, 0x40, %l0
694	stxa	%g0, [%i0]%asi		! initialize the cache line
695
696	ldda	[%l0]ASI_BLK_P, %d16
697	ALIGN_OFF_1_7
698	fmovd	%d16, %d0
699	fmovd	%d18, %d2
700	fmovd	%d20, %d4
701	fmovd	%d22, %d6
702	fmovd	%d24, %d8
703	fmovd	%d26, %d10
704	fmovd	%d28, %d12
705	fmovd	%d30, %d14
706
707	stda	%d48, [%i0]ASI_BLK_P
708	subcc	%i3, 0x40, %i3
709	add	%i0, 0x40, %i0
710	bgu,pt	%ncc, 0b
711	prefetch [%l0+0x80], #one_read
712	ba	.blkdone
713	membar	#Sync
714
715	! This copy case for source offset between 8 and 15
716off15:
717	ldd	[%l0+0x8], %d2
718	ldd	[%l0+0x10], %d4
719	ldd	[%l0+0x18], %d6
720	ldd	[%l0+0x20], %d8
721	ldd	[%l0+0x28], %d10
722	ldd	[%l0+0x30], %d12
723	ldd	[%l0+0x38], %d14
724	prefetch [%l0+0x40], #one_read
725	prefetch [%l0+0x80], #one_read
7261:
727	add	%l0, 0x40, %l0
728	stxa	%g0, [%i0]%asi		! initialize the cache line
729
730	ldda	[%l0]ASI_BLK_P, %d16
731	ALIGN_OFF_8_15
732	fmovd	%d18, %d2
733	fmovd	%d20, %d4
734	fmovd	%d22, %d6
735	fmovd	%d24, %d8
736	fmovd	%d26, %d10
737	fmovd	%d28, %d12
738	fmovd	%d30, %d14
739
740	stda	%d48, [%i0]ASI_BLK_P
741	subcc	%i3, 0x40, %i3
742	add	%i0, 0x40, %i0
743	bgu,pt	%ncc, 1b
744	prefetch [%l0+0x80], #one_read
745	ba	.blkdone
746	membar	#Sync
747
748	! This copy case for source offset between 16 and 23
749off23:
750	ldd	[%l0+0x10], %d4
751	ldd	[%l0+0x18], %d6
752	ldd	[%l0+0x20], %d8
753	ldd	[%l0+0x28], %d10
754	ldd	[%l0+0x30], %d12
755	ldd	[%l0+0x38], %d14
756	prefetch [%l0+0x40], #one_read
757	prefetch [%l0+0x80], #one_read
7582:
759	add	%l0, 0x40, %l0
760	stxa	%g0, [%i0]%asi		! initialize the cache line
761
762	ldda	[%l0]ASI_BLK_P, %d16
763	ALIGN_OFF_16_23
764	fmovd	%d20, %d4
765	fmovd	%d22, %d6
766	fmovd	%d24, %d8
767	fmovd	%d26, %d10
768	fmovd	%d28, %d12
769	fmovd	%d30, %d14
770
771	stda	%d48, [%i0]ASI_BLK_P
772	subcc	%i3, 0x40, %i3
773	add	%i0, 0x40, %i0
774	bgu,pt	%ncc, 2b
775	prefetch [%l0+0x80], #one_read
776	ba	.blkdone
777	membar	#Sync
778
779	! This copy case for source offset between 24 and 31
780off31:
781	ldd	[%l0+0x18], %d6
782	ldd	[%l0+0x20], %d8
783	ldd	[%l0+0x28], %d10
784	ldd	[%l0+0x30], %d12
785	ldd	[%l0+0x38], %d14
786	prefetch [%l0+0x40], #one_read
787	prefetch [%l0+0x80], #one_read
7883:
789	add	%l0, 0x40, %l0
790	stxa	%g0, [%i0]%asi		! initialize the cache line
791
792	ldda	[%l0]ASI_BLK_P, %d16
793	ALIGN_OFF_24_31
794	fmovd	%d22, %d6
795	fmovd	%d24, %d8
796	fmovd	%d26, %d10
797	fmovd	%d28, %d12
798	fmovd	%d30, %d14
799
800	stda	%d48, [%i0]ASI_BLK_P
801	subcc	%i3, 0x40, %i3
802	add	%i0, 0x40, %i0
803	bgu,pt	%ncc, 3b
804	prefetch [%l0+0x80], #one_read
805	ba	.blkdone
806	membar	#Sync
807
808	! This copy case for source offset between 32 and 39
809off39:
810	ldd	[%l0+0x20], %d8
811	ldd	[%l0+0x28], %d10
812	ldd	[%l0+0x30], %d12
813	ldd	[%l0+0x38], %d14
814	prefetch [%l0+0x40], #one_read
815	prefetch [%l0+0x80], #one_read
8164:
817	add	%l0, 0x40, %l0
818	stxa	%g0, [%i0]%asi		! initialize the cache line
819
820	ldda	[%l0]ASI_BLK_P, %d16
821	ALIGN_OFF_32_39
822	fmovd	%d24, %d8
823	fmovd	%d26, %d10
824	fmovd	%d28, %d12
825	fmovd	%d30, %d14
826
827	stda	%d48, [%i0]ASI_BLK_P
828	subcc	%i3, 0x40, %i3
829	add	%i0, 0x40, %i0
830	bgu,pt	%ncc, 4b
831	prefetch [%l0+0x80], #one_read
832	ba	.blkdone
833	membar	#Sync
834
835	! This copy case for source offset between 40 and 47
836off47:
837	ldd	[%l0+0x28], %d10
838	ldd	[%l0+0x30], %d12
839	ldd	[%l0+0x38], %d14
840	prefetch [%l0+0x40], #one_read
841	prefetch [%l0+0x80], #one_read
8425:
843	add	%l0, 0x40, %l0
844	stxa	%g0, [%i0]%asi		! initialize the cache line
845
846	ldda	[%l0]ASI_BLK_P, %d16
847	ALIGN_OFF_40_47
848	fmovd	%d26, %d10
849	fmovd	%d28, %d12
850	fmovd	%d30, %d14
851
852	stda	%d48, [%i0]ASI_BLK_P
853	subcc	%i3, 0x40, %i3
854	add	%i0, 0x40, %i0
855	bgu,pt	%ncc, 5b
856	prefetch [%l0+0x80], #one_read
857	ba	.blkdone
858	membar	#Sync
859
860	! This copy case for source offset between 48 and 55
861off55:
862	ldd	[%l0+0x30], %d12
863	ldd	[%l0+0x38], %d14
864	prefetch [%l0+0x40], #one_read
865	prefetch [%l0+0x80], #one_read
8666:
867	add	%l0, 0x40, %l0
868	stxa	%g0, [%i0]%asi		! initialize the cache line
869
870	ldda	[%l0]ASI_BLK_P, %d16
871	ALIGN_OFF_48_55
872	fmovd	%d28, %d12
873	fmovd	%d30, %d14
874
875	stda	%d48, [%i0]ASI_BLK_P
876	subcc	%i3, 0x40, %i3
877	add	%i0, 0x40, %i0
878	bgu,pt	%ncc, 6b
879	prefetch [%l0+0x80], #one_read
880	ba	.blkdone
881	membar	#Sync
882
883	! Both source and destination are block aligned.
884.blkcpy:
885	prefetch [%i1+0x40], #one_read
886	prefetch [%i1+0x80], #one_read
8878:
888	stxa	%g0, [%i0]%asi		! initialize the cache line
889	ldda	[%i1]ASI_BLK_P, %d0
890	stda	%d0, [%i0]ASI_BLK_P
891
892	add	%i1, 0x40, %i1
893	subcc	%i3, 0x40, %i3
894	add	%i0, 0x40, %i0
895	bgu,pt	%ncc, 8b
896	prefetch [%i1+0x80], #one_read
897	membar	#Sync
898
899.blkdone:
900#else	/* NIAGARA_IMPL */
901	andcc	%i1, 0xf, %o2		! is src quadword aligned
902	bz,pn	%xcc, .blkcpy		! src offset in %o2
903	nop
904	cmp	%o2, 0x8
905	bg	.cpy_upper_double
906	nop
907	bl	.cpy_lower_double
908	nop
909
910	! Falls through when source offset is equal to 8 i.e.
911	! source is double word aligned.
912	! In this case no shift/merge of data is required
913	sub	%i1, %o2, %i1		! align the src at 16 bytes.
914	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
915	prefetch [%l0+0x0], #one_read
916	ldda	[%i1+0x0]%asi, %l2
917loop0:
918	ldda	[%i1+0x10]%asi, %l4
919	prefetch [%l0+0x40], #one_read
920
921	stxa	%l3, [%i0+0x0]%asi
922	stxa	%l4, [%i0+0x8]%asi
923
924	ldda	[%i1+0x20]%asi, %l2
925	stxa	%l5, [%i0+0x10]%asi
926	stxa	%l2, [%i0+0x18]%asi
927
928	ldda	[%i1+0x30]%asi, %l4
929	stxa	%l3, [%i0+0x20]%asi
930	stxa	%l4, [%i0+0x28]%asi
931
932	ldda	[%i1+0x40]%asi, %l2
933	stxa	%l5, [%i0+0x30]%asi
934	stxa	%l2, [%i0+0x38]%asi
935
936	add	%l0, 0x40, %l0
937	add	%i1, 0x40, %i1
938	subcc	%i3, 0x40, %i3
939	bgu,pt	%xcc, loop0
940	add	%i0, 0x40, %i0
941	ba	.blkdone
942	add	%i1, %o2, %i1		! increment the source by src offset
943					! the src offset was stored in %o2
944
945.cpy_lower_double:
946	sub	%i1, %o2, %i1		! align the src at 16 bytes.
947	sll	%o2, 3, %o0		! %o0 left shift
948	mov	0x40, %o1
949	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
950	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
951	prefetch [%l0+0x0], #one_read
952	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
953					! complete data
954loop1:
955	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
956	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
957							! into %l2 and %l3
958	prefetch [%l0+0x40], #one_read
959	stxa	%l2, [%i0+0x0]%asi
960	stxa	%l3, [%i0+0x8]%asi
961
962	ldda	[%i1+0x20]%asi, %l2
963	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
964	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
965	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5
966
967	! Repeat the same for next 32 bytes.
968
969	ldda	[%i1+0x30]%asi, %l4
970	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
971	stxa	%l2, [%i0+0x20]%asi
972	stxa	%l3, [%i0+0x28]%asi
973
974	ldda	[%i1+0x40]%asi, %l2
975	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
976	stxa	%l4, [%i0+0x30]%asi
977	stxa	%l5, [%i0+0x38]%asi
978
979	add	%l0, 0x40, %l0
980	add	%i1, 0x40, %i1
981	subcc	%i3, 0x40, %i3
982	bgu,pt	%xcc, loop1
983	add	%i0, 0x40, %i0
984	ba	.blkdone
985	add	%i1, %o2, %i1		! increment the source by src offset
986					! the src offset was stored in %o2
987
988.cpy_upper_double:
989	sub	%i1, %o2, %i1		! align the src at 16 bytes.
990	mov	0x8, %o0
991	sub	%o2, %o0, %o0
992	sll	%o0, 3, %o0		! %o0 left shift
993	mov	0x40, %o1
994	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
995	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
996	prefetch [%l0+0x0], #one_read
997	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
998					! no data in %l2
999loop2:
1000	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
1001					! partial
1002	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
1003							! into %l3 and %l4
1004	prefetch [%l0+0x40], #one_read
1005	stxa	%l3, [%i0+0x0]%asi
1006	stxa	%l4, [%i0+0x8]%asi
1007
1008	ldda	[%i1+0x20]%asi, %l2
1009	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
1010	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
1011	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2
1012
1013	! Repeat the same for next 32 bytes.
1014
1015	ldda	[%i1+0x30]%asi, %l4
1016	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
1017	stxa	%l3, [%i0+0x20]%asi
1018	stxa	%l4, [%i0+0x28]%asi
1019
1020	ldda	[%i1+0x40]%asi, %l2
1021	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
1022	stxa	%l5, [%i0+0x30]%asi
1023	stxa	%l2, [%i0+0x38]%asi
1024
1025	add	%l0, 0x40, %l0
1026	add	%i1, 0x40, %i1
1027	subcc	%i3, 0x40, %i3
1028	bgu,pt	%xcc, loop2
1029	add	%i0, 0x40, %i0
1030	ba	.blkdone
1031	add	%i1, %o2, %i1		! increment the source by src offset
1032					! the src offset was stored in %o2
1033
1034
1035	! Both Source and Destination are block aligned.
1036	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
1037.blkcpy:
1038	prefetch [%i1+0x0], #one_read
10391:
1040	ldda	[%i1+0x0]%asi, %l0
1041	ldda	[%i1+0x10]%asi, %l2
1042	prefetch [%i1+0x40], #one_read
1043
1044	stxa	%l0, [%i0+0x0]%asi
1045	ldda	[%i1+0x20]%asi, %l4
1046	ldda	[%i1+0x30]%asi, %l6
1047
1048	stxa	%l1, [%i0+0x8]%asi
1049	stxa	%l2, [%i0+0x10]%asi
1050	stxa	%l3, [%i0+0x18]%asi
1051	stxa	%l4, [%i0+0x20]%asi
1052	stxa	%l5, [%i0+0x28]%asi
1053	stxa	%l6, [%i0+0x30]%asi
1054	stxa	%l7, [%i0+0x38]%asi
1055
1056	add	%i1, 0x40, %i1
1057	subcc	%i3, 0x40, %i3
1058	bgu,pt	%xcc, 1b
1059	add	%i0, 0x40, %i0
1060
1061.blkdone:
1062	membar	#Sync
1063#endif	/* NIAGARA_IMPL */
1064
1065	brz,pt	%i2, .blkexit
1066	nop
1067
1068	! Handle trailing bytes
1069	cmp	%i2, 0x8
1070	blu,pt	%ncc, .residue
1071	nop
1072
1073	! Can we do some 8B ops
1074	or	%i1, %i0, %o2
1075	andcc	%o2, 0x7, %g0
1076	bnz	%ncc, .last4
1077	nop
1078
1079	! Do 8byte ops as long as possible
1080.last8:
1081	ldx	[%i1], %o2
1082	stx	%o2, [%i0]
1083	add	%i1, 0x8, %i1
1084	sub	%i2, 0x8, %i2
1085	cmp	%i2, 0x8
1086	bgu,pt	%ncc, .last8
1087	add	%i0, 0x8, %i0
1088
1089	brz,pt	%i2, .blkexit
1090	nop
1091
1092	ba	.residue
1093	nop
1094
1095.last4:
1096	! Can we do 4B ops
1097	andcc	%o2, 0x3, %g0
1098	bnz	%ncc, .last2
1099	nop
11001:
1101	ld	[%i1], %o2
1102	st	%o2, [%i0]
1103	add	%i1, 0x4, %i1
1104	sub	%i2, 0x4, %i2
1105	cmp	%i2, 0x4
1106	bgu,pt	%ncc, 1b
1107	add	%i0, 0x4, %i0
1108
1109	brz,pt	%i2, .blkexit
1110	nop
1111
1112	ba	.residue
1113	nop
1114
1115.last2:
1116	! Can we do 2B ops
1117	andcc	%o2, 0x1, %g0
1118	bnz	%ncc, .residue
1119	nop
1120
11211:
1122	lduh	[%i1], %o2
1123	stuh	%o2, [%i0]
1124	add	%i1, 0x2, %i1
1125	sub	%i2, 0x2, %i2
1126	cmp	%i2, 0x2
1127	bgu,pt	%ncc, 1b
1128	add	%i0, 0x2, %i0
1129
1130	brz,pt	%i2, .blkexit
1131	nop
1132
1133.residue:
1134	ldub	[%i1], %o2
1135	stb	%o2, [%i0]
1136	inc	%i1
1137	deccc	%i2
1138	bgu,pt	%ncc, .residue
1139	inc	%i0
1140
1141.blkexit:
1142#if !defined(NIAGARA_IMPL)
1143	btst	FPUSED_FLAG, %o5
1144	bz	%icc, 1f
1145	  and	%o5,  COPY_FLAGS, %l1	! Store flags in %l1
1146					! We can't clear the flags from %o5 yet
1147					! If there's an error, .copyerr will
1148					! need them
1149
1150	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2      ! restore gsr
1151	wr	%o2, 0, %gsr
1152
1153	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1154	btst	FPRS_FEF, %o3
1155	bz,pt	%icc, 4f
1156	  nop
1157
1158	! restore fpregs from stack
1159	BLD_FP_FROMSTACK(%o2)
1160
1161	ba,pt	%ncc, 2f
1162	  wr	%o3, 0, %fprs		! restore fprs
1163
11644:
1165	FZERO
1166	wr	%o3, 0, %fprs		! restore fprs
1167
11682:
1169	ldn	[THREAD_REG + T_LWP], %o2
1170	brnz,pt	%o2, 1f
1171	  nop
1172
1173	ldsb	[THREAD_REG + T_PREEMPT], %l0
1174	deccc	%l0
1175	bnz,pn	%ncc, 1f
1176	  stb	%l0, [THREAD_REG + T_PREEMPT]
1177
1178	! Check for a kernel preemption request
1179	ldn	[THREAD_REG + T_CPU], %l0
1180	ldub	[%l0 + CPU_KPRUNRUN], %l0
1181	brnz,a,pt	%l0, 1f	! Need to call kpreempt?
1182	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
1183
1184	ldsb	[THREAD_REG + T_PREEMPT], %l0
1185	dec	%l0
1186	stb	%l0, [THREAD_REG + T_PREEMPT]
11871:
1188	btst	BCOPY_FLAG, %l1
1189	bz,pn	%icc, 3f
1190	andncc	%o5, COPY_FLAGS, %o5
1191
1192	! Here via bcopy. Check to see if the handler was NULL.
1193	! If so, just return quietly. Otherwise, reset the
1194	! handler and go home.
1195	bnz,pn	%ncc, 3f
1196	nop
1197
1198	! Null handler.
1199	btst	KPREEMPT_FLAG, %l1
1200	bz,pt	%icc, 2f
1201	  nop
1202	call	kpreempt
1203	  rdpr	%pil, %o0	! pass %pil
12042:
1205
1206	ret
1207	restore	%g0, 0, %o0
1208
1209	! Here via kcopy or bcopy with a handler.
1210	! Reset the fault handler.
12113:
1212	membar	#Sync
1213	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1214
1215	! call kpreempt if necessary
1216	btst	KPREEMPT_FLAG, %l1
1217	bz,pt	%icc, 4f
1218	  nop
1219	call	kpreempt
1220	  rdpr	%pil, %o0
12214:
1222#else	/* NIAGARA_IMPL */
1223	membar	#Sync				! sync error barrier
1224	! Restore t_lofault handler, if came here from kcopy().
1225	tst	%o5
1226	bz	%ncc, 1f
1227	andn	%o5, LOFAULT_SET, %o5
1228	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
12291:
1230#endif	/* NIAGARA_IMPL */
1231	ret
1232	restore	%g0, 0, %o0
1233
1234.bcb_punt:
1235	!
1236	! use aligned transfers where possible
1237	!
1238	xor	%i0, %i1, %o4		! xor from and to address
1239	btst	7, %o4			! if lower three bits zero
1240	bz	.aldoubcp		! can align on double boundary
1241	.empty	! assembler complaints about label
1242
1243	xor	%i0, %i1, %o4		! xor from and to address
1244	btst	3, %o4			! if lower two bits zero
1245	bz	.alwordcp		! can align on word boundary
1246	btst	3, %i0			! delay slot, from address unaligned?
1247	!
1248	! use aligned reads and writes where possible
1249	! this differs from wordcp in that it copes
1250	! with odd alignment between source and destination
1251	! using word reads and writes with the proper shifts
1252	! in between to align transfers to and from memory
1253	! i0 - src address, i1 - dest address, i2 - count
1254	! i3, i4 - tmps used for generating complete word
1255	! i5 (word to write)
1256	! l0 size in bits of upper part of source word (US)
1257	! l1 size in bits of lower part of source word (LS = 32 - US)
1258	! l2 size in bits of upper part of destination word (UD)
1259	! l3 size in bits of lower part of destination word (LD = 32 - UD)
1260	! l4 number of bytes leftover after aligned transfers complete
1261	! l5 the number 32
1262	!
1263	mov	32, %l5			! load an oft-needed constant
1264	bz	.align_dst_only
1265	btst	3, %i1			! is destnation address aligned?
1266	clr	%i4			! clear registers used in either case
1267	bz	.align_src_only
1268	clr	%l0
1269	!
1270	! both source and destination addresses are unaligned
1271	!
12721:					! align source
1273	ldub	[%i0], %i3		! read a byte from source address
1274	add	%i0, 1, %i0		! increment source address
1275	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1276	btst	3, %i0			! is source aligned?
1277	add	%l0, 8, %l0		! increment size of upper source (US)
1278	bnz,a	1b
1279	sll	%i4, 8, %i4		! make room for next byte
1280
1281	sub	%l5, %l0, %l1		! generate shift left count (LS)
1282	sll	%i4, %l1, %i4		! prepare to get rest
1283	ld	[%i0], %i3		! read a word
1284	add	%i0, 4, %i0		! increment source address
1285	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
1286	or	%i4, %i5, %i5		! merge
1287	mov	24, %l3			! align destination
12881:
1289	srl	%i5, %l3, %i4		! prepare to write a single byte
1290	stb	%i4, [%i1]		! write a byte
1291	add	%i1, 1, %i1		! increment destination address
1292	sub	%i2, 1, %i2		! decrement count
1293	btst	3, %i1			! is destination aligned?
1294	bnz,a	1b
1295	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
1296	sub	%l5, %l3, %l2		! generate shift left count (UD)
1297	sll	%i5, %l2, %i5		! move leftover into upper bytes
1298	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
1299	bgu	%ncc, .more_needed	! need more to fill than we have
1300	nop
1301
1302	sll	%i3, %l1, %i3		! clear upper used byte(s)
1303	srl	%i3, %l1, %i3
1304	! get the odd bytes between alignments
1305	sub	%l0, %l2, %l0		! regenerate shift count
1306	sub	%l5, %l0, %l1		! generate new shift left count (LS)
1307	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1308	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
1309	srl	%i3, %l0, %i4
1310	or	%i5, %i4, %i5
1311	st	%i5, [%i1]		! write a word
1312	subcc	%i2, 4, %i2		! decrement count
1313	bz	%ncc, .unalign_out
1314	add	%i1, 4, %i1		! increment destination address
1315
1316	b	2f
1317	sll	%i3, %l1, %i5		! get leftover into upper bits
1318.more_needed:
1319	sll	%i3, %l0, %i3		! save remaining byte(s)
1320	srl	%i3, %l0, %i3
1321	sub	%l2, %l0, %l1		! regenerate shift count
1322	sub	%l5, %l1, %l0		! generate new shift left count
1323	sll	%i3, %l1, %i4		! move to fill empty space
1324	b	3f
1325	or	%i5, %i4, %i5		! merge to complete word
1326	!
1327	! the source address is aligned and destination is not
1328	!
1329.align_dst_only:
1330	ld	[%i0], %i4		! read a word
1331	add	%i0, 4, %i0		! increment source address
1332	mov	24, %l0			! initial shift alignment count
13331:
1334	srl	%i4, %l0, %i3		! prepare to write a single byte
1335	stb	%i3, [%i1]		! write a byte
1336	add	%i1, 1, %i1		! increment destination address
1337	sub	%i2, 1, %i2		! decrement count
1338	btst	3, %i1			! is destination aligned?
1339	bnz,a	1b
1340	sub	%l0, 8, %l0		! delay slot, decrement shift count
1341.xfer:
1342	sub	%l5, %l0, %l1		! generate shift left count
1343	sll	%i4, %l1, %i5		! get leftover
13443:
1345	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1346	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
13472:
1348	ld	[%i0], %i3		! read a source word
1349	add	%i0, 4, %i0		! increment source address
1350	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
1351	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
1352	st	%i5, [%i1]		! write a destination word
1353	subcc	%i2, 4, %i2		! decrement count
1354	bz	%ncc, .unalign_out	! check if done
1355	add	%i1, 4, %i1		! increment destination address
1356	b	2b			! loop
1357	sll	%i3, %l1, %i5		! get leftover
1358.unalign_out:
1359	tst	%l4			! any bytes leftover?
1360	bz	%ncc, .cpdone
1361	.empty				! allow next instruction in delay slot
13621:
1363	sub	%l0, 8, %l0		! decrement shift
1364	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
1365	stb	%i4, [%i1]		! write a byte
1366	subcc	%l4, 1, %l4		! decrement count
1367	bz	%ncc, .cpdone		! done?
1368	add	%i1, 1, %i1		! increment destination
1369	tst	%l0			! any more previously read bytes
1370	bnz	%ncc, 1b		! we have leftover bytes
1371	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
1372	b	.dbytecp		! let dbytecp do the rest
1373	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1374	!
1375	! the destination address is aligned and the source is not
1376	!
1377.align_src_only:
1378	ldub	[%i0], %i3		! read a byte from source address
1379	add	%i0, 1, %i0		! increment source address
1380	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1381	btst	3, %i0			! is source aligned?
1382	add	%l0, 8, %l0		! increment shift count (US)
1383	bnz,a	.align_src_only
1384	sll	%i4, 8, %i4		! make room for next byte
1385	b,a	.xfer
1386	!
1387	! if from address unaligned for double-word moves,
1388	! move bytes till it is, if count is < 56 it could take
1389	! longer to align the thing than to do the transfer
1390	! in word size chunks right away
1391	!
1392.aldoubcp:
1393	cmp	%i2, 56			! if count < 56, use wordcp, it takes
1394	blu,a	%ncc, .alwordcp		! longer to align doubles than words
1395	mov	3, %o0			! mask for word alignment
1396	call	.alignit		! copy bytes until aligned
1397	mov	7, %o0			! mask for double alignment
1398	!
1399	! source and destination are now double-word aligned
1400	! i3 has aligned count returned by alignit
1401	!
1402	and	%i2, 7, %i2		! unaligned leftover count
1403	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
14045:
1405	ldx	[%i0+%i1], %o4		! read from address
1406	stx	%o4, [%i1]		! write at destination address
1407	subcc	%i3, 8, %i3		! dec count
1408	bgu	%ncc, 5b
1409	add	%i1, 8, %i1		! delay slot, inc to address
1410	cmp	%i2, 4			! see if we can copy a word
1411	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
1412	.empty
1413	!
1414	! for leftover bytes we fall into wordcp, if needed
1415	!
1416.wordcp:
1417	and	%i2, 3, %i2		! unaligned leftover count
14185:
1419	ld	[%i0+%i1], %o4		! read from address
1420	st	%o4, [%i1]		! write at destination address
1421	subcc	%i3, 4, %i3		! dec count
1422	bgu	%ncc, 5b
1423	add	%i1, 4, %i1		! delay slot, inc to address
1424	b,a	.dbytecp
1425
1426	! we come here to align copies on word boundaries
1427.alwordcp:
1428	call	.alignit		! go word-align it
1429	mov	3, %o0			! bits that must be zero to be aligned
1430	b	.wordcp
1431	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1432
1433	!
1434	! byte copy, works with any alignment
1435	!
1436.bytecp:
1437	b	.dbytecp
1438	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
1439
1440	!
1441	! differenced byte copy, works with any alignment
1442	! assumes dest in %i1 and (source - dest) in %i0
1443	!
14441:
1445	stb	%o4, [%i1]		! write to address
1446	inc	%i1			! inc to address
1447.dbytecp:
1448	deccc	%i2			! dec count
1449	bgeu,a	%ncc, 1b		! loop till done
1450	ldub	[%i0+%i1], %o4		! read from address
1451.cpdone:
1452#if !defined(NIAGARA_IMPL)
1453	! FPUSED_FLAG will not have been set in any path leading to
1454	! this point. No need to deal with it.
1455	btst	BCOPY_FLAG, %o5
1456	bz,pn	%icc, 2f
1457	andcc	%o5, BCOPY_FLAG, %o5
1458	! Here via bcopy. Check to see if the handler was NULL.
1459	! If so, just return quietly. Otherwise, reset the
1460	! handler and go home.
1461	bnz,pn	%ncc, 2f
1462	nop
1463	!
1464	! Null handler.
1465	!
1466	ret
1467	restore %g0, 0, %o0
1468	! Here via kcopy or bcopy with a handler.
1469	! Reset the fault handler.
14702:
1471	membar	#Sync
1472	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1473#else	/* NIAGARA_IMPL */
1474	membar	#Sync				! sync error barrier
1475	! Restore t_lofault handler, if came here from kcopy().
1476	tst	%o5
1477	bz	%ncc, 1f
1478	andn	%o5, LOFAULT_SET, %o5
1479	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
14801:
1481#endif	/* NIAGARA_IMPL */
1482	ret
1483	restore %g0, 0, %o0		! return (0)
1484
1485/*
1486 * Common code used to align transfers on word and doubleword
1487 * boundaries.  Aligns source and destination and returns a count
1488 * of aligned bytes to transfer in %i3
1489 */
14901:
1491	inc	%i0			! inc from
1492	stb	%o4, [%i1]		! write a byte
1493	inc	%i1			! inc to
1494	dec	%i2			! dec count
1495.alignit:
1496	btst	%o0, %i0		! %o0 is bit mask to check for alignment
1497	bnz,a	1b
1498	ldub	[%i0], %o4		! read next byte
1499
1500	retl
1501	andn	%i2, %o0, %i3		! return size of aligned bytes
1502	SET_SIZE(bcopy)
1503
1504#endif	/* lint */
1505
/*
 * ovbcopy(from, to, count)
 *
 * Copy count bytes from 'from' to 'to' when the two regions may overlap.
 * A zero count returns immediately.  When count is no larger than the
 * distance between the two addresses the regions cannot overlap, and the
 * request is passed straight on to bcopy.  Otherwise the bytes are moved
 * one at a time: ascending when from >= to, descending when from < to,
 * so that no source byte is overwritten before it has been read.
 */

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{
}

#else	/* lint */

	ENTRY(ovbcopy)
	! %o0 = from, %o1 = to, %o2 = count, %o3 = scratch
	tst	%o2			! anything to copy?
	bz	%ncc, .ovb_ret		! count == 0: leave immediately
	nop

	subcc	%o0, %o1, %o3		! %o3 = from - to, condition codes set
	bneg,a	%ncc, .ovb_cmp
	neg	%o3			! annulled unless the difference is < 0
.ovb_cmp:
	cmp	%o2, %o3		! count against abs(from - to)
	bleu	%ncc, bcopy		! count <= distance: no overlap, so
	.empty				!   bcopy can do the work
	cmp	%o0, %o1		! which way do the regions overlap?
	blu	%ncc, .ov_bkwd		! from < to: start from the far end
	nop
	!
	! Ascending copy (from >= to).
	!
.ov_fwd:
	ldub	[%o0], %o3		! fetch the next source byte
	stb	%o3, [%o1]		! and place it in the destination
	inc	%o1			! step the destination
	deccc	%o2			! one fewer byte outstanding
	bgu	%ncc, .ov_fwd		! keep going while bytes remain
	inc	%o0			! delay slot, step the source

	retl
	nop
	!
	! Descending copy (from < to).  After the decrement %o2 is the
	! offset of the byte to move, so the last pass handles offset 0.
	!
.ov_bkwd:
	deccc	%o2			! next lower offset
	ldub	[%o0 + %o2], %o3	! read source byte at that offset
	bgu	%ncc, .ov_bkwd		! offsets above zero still to do
	stb	%o3, [%o1 + %o2]	! delay slot, write destination byte
.ovb_ret:
	retl
	nop
	SET_SIZE(ovbcopy)

#endif	/* lint */
1562
/*
 * hwblkpagecopy(src, dst)
 *
 * Copy one page (PAGESIZE bytes) from src to dst, 0x80 bytes per loop
 * pass, with %asi set to ASI_BLK_INIT_ST_QUAD_LDD_P for every load and
 * store.  The caller (ppcopy) is assumed to have disabled kernel
 * preemption and to have checked use_hw_bcopy already.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{
}
#else /* lint */
	ENTRY(hwblkpagecopy)
	save	%sp, -SA(MINFRAME), %sp

	! Register use once the window has been taken:
	!	%i0	source cursor (first argument)
	!	%i1	destination cursor (second argument)
	!	%i2	bytes left to copy (local, not an argument)
	!	%l0-%l7	staging registers for one 64-byte group

	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

	/*
	 * Exactly one page is copied, and PAGESIZE is a multiple of 0x80,
	 * so the loop below needs no residue handling.
	 */
	set	PAGESIZE, %i2

	prefetch [%i0+0x0], #one_read
	prefetch [%i0+0x40], #one_read
.hwbpc_loop:
	prefetch [%i0+0x80], #one_read	! stay 0x80 bytes ahead of the loads
	prefetch [%i0+0xc0], #one_read

	! first 64 bytes: four 16-byte loads, then eight 8-byte stores
	ldda	[%i0+0x0]%asi, %l0
	ldda	[%i0+0x10]%asi, %l2
	ldda	[%i0+0x20]%asi, %l4
	ldda	[%i0+0x30]%asi, %l6
	stxa	%l0, [%i1+0x0]%asi
	stxa	%l1, [%i1+0x8]%asi
	stxa	%l2, [%i1+0x10]%asi
	stxa	%l3, [%i1+0x18]%asi
	stxa	%l4, [%i1+0x20]%asi
	stxa	%l5, [%i1+0x28]%asi
	stxa	%l6, [%i1+0x30]%asi
	stxa	%l7, [%i1+0x38]%asi

	! second 64 bytes, same pattern
	ldda	[%i0+0x40]%asi, %l0
	ldda	[%i0+0x50]%asi, %l2
	ldda	[%i0+0x60]%asi, %l4
	ldda	[%i0+0x70]%asi, %l6
	stxa	%l0, [%i1+0x40]%asi
	stxa	%l1, [%i1+0x48]%asi
	stxa	%l2, [%i1+0x50]%asi
	stxa	%l3, [%i1+0x58]%asi
	stxa	%l4, [%i1+0x60]%asi
	stxa	%l5, [%i1+0x68]%asi
	stxa	%l6, [%i1+0x70]%asi
	stxa	%l7, [%i1+0x78]%asi

	add	%i1, 0x80, %i1		! advance destination
	subcc	%i2, 0x80, %i2		! 0x80 fewer bytes to go
	bgu,pt	%xcc, .hwbpc_loop	! loop until the page is done
	add	%i0, 0x80, %i0		! delay slot, advance source

	membar #Sync
	ret
	restore	%g0, 0, %o0
	SET_SIZE(hwblkpagecopy)
#endif	/* lint */
1629
1630
1631/*
1632 * Transfer data to and from user space -
1633 * Note that these routines can cause faults
1634 * It is assumed that the kernel has nothing at
1635 * less than KERNELBASE in the virtual address space.
1636 *
1637 * Note that copyin(9F) and copyout(9F) are part of the
1638 * DDI/DKI which specifies that they return '-1' on "errors."
1639 *
1640 * Sigh.
1641 *
1642 * So there's two extremely similar routines - xcopyin() and xcopyout()
1643 * which return the errno that we've faithfully computed.  This
1644 * allows other callers (e.g. uiomove(9F)) to work correctly.
1645 * Given that these are used pretty heavily, we expand the calling
1646 * sequences inline for all flavours (rather than making wrappers).
1647 *
1648 * There are also stub routines for xcopyout_little and xcopyin_little,
1649 * which currently are intended to handle requests of <= 16 bytes from
1650 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1651 * is left as an exercise...
1652 */
1653
1654/*
1655 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1656 *
1657 * General theory of operation:
1658 *
1659 * None of the copyops routines grab a window until it's decided that
1660 * we need to do a HW block copy operation. This saves a window
1661 * spill/fill when we're called during socket ops. The typical IO
1662 * path won't cause spill/fill traps.
1663 *
1664 * This code uses a set of 4 limits for the maximum size that will
1665 * be copied given a particular input/output address alignment.
1666 * the default limits are:
1667 *
1668 * single byte aligned - 256 (hw_copy_limit_1)
1669 * two byte aligned - 512 (hw_copy_limit_2)
1670 * four byte aligned - 1024 (hw_copy_limit_4)
1671 * eight byte aligned - 1024 (hw_copy_limit_8)
1672 *
1673 * If the value for a particular limit is zero, the copy will be done
1674 * via the copy loops rather than block store/quad load instructions.
1675 *
1676 * Flow:
1677 *
1678 * If count == zero return zero.
1679 *
1680 * Store the previous lo_fault handler into %g6.
1681 * Place our secondary lofault handler into %g5.
1682 * Place the address of our nowindow fault handler into %o3.
1683 * Place the address of the windowed fault handler into %o4.
1684 * --> We'll use this handler if we end up grabbing a window
1685 * --> before we use block initializing store and quad load ASIs
1686 *
1687 * If count is less than or equal to SMALL_LIMIT (7) we
1688 * always do a byte for byte copy.
1689 *
1690 * If count is > SMALL_LIMIT, we check the alignment of the input
1691 * and output pointers. Based on the alignment we check count
1692 * against a limit based on detected alignment.  If we exceed the
1693 * alignment value we copy via block initializing store and quad
1694 * load instructions.
1695 *
1696 * If we don't exceed one of the limits, we store -count in %o3,
1697 * we store the number of chunks (8, 4, 2 or 1 byte) operated
1698 * on in our basic copy loop in %o2. Following this we branch
1699 * to the appropriate copy loop and copy that many chunks.
1700 * Since we've been adding the chunk size to %o3 each time through
1701 * as well as decrementing %o2, we can tell if any data is
1702 * is left to be copied by examining %o3. If that is zero, we're
1703 * done and can go home. If not, we figure out what the largest
1704 * chunk size left to be copied is and branch to that copy loop
1705 * unless there's only one byte left. We load that as we're
1706 * branching to code that stores it just before we return.
1707 *
1708 * Fault handlers are invoked if we reference memory that has no
1709 * current mapping.  All forms share the same copyio_fault handler.
1710 * This routine handles fixing up the stack and general housecleaning.
1711 * Each copy operation has a simple fault handler that is then called
1712 * to do the work specific to the individual operation.  The handlers
1713 * for copyOP and xcopyOP are found at the end of each individual function.
1714 * The handlers for xcopyOP_little are found at the end of xcopyin_little.
1715 * The handlers for copyOP_noerr are found at the end of copyin_noerr.
1716 */
1717
1718/*
1719 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1720 */
1721
1722#if defined(lint)
1723
/*
 * Lint-only stand-in for copyout(); the real routine is the assembly
 * version in the non-lint half of this conditional.
 */
/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{
	return (0);
}
1728
1729#else	/* lint */
1730
1731/*
1732 * We save the arguments in the following registers in case of a fault:
1733 * 	kaddr - %g2
1734 * 	uaddr - %g3
1735 * 	count - %g4
1736 */
1737#define	SAVE_SRC	%g2
1738#define	SAVE_DST	%g3
1739#define	SAVE_COUNT	%g4
1740
1741#define	REAL_LOFAULT		%g5
1742#define	SAVED_LOFAULT		%g6
1743
/*
 * Generic copyio fault handler.  This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout after a register window has been
 * taken.  In order for this to function properly, the address of the
 * 'real' lofault handler must be in REAL_LOFAULT.  This allows us to share
 * common code for all the flavors of the copy operations, including the
 * _noerr versions.
 *
 * On entry:
 *	SAVED_LOFAULT	previous t_lofault value; when NIAGARA_IMPL is not
 *			defined it may also carry FPUSED_FLAG
 *	REAL_LOFAULT	handler to jump to once the cleanup is done
 *	SAVE_SRC, SAVE_DST, SAVE_COUNT
 *			the original arguments of the copy routine
 *
 * What it does:
 *	- when NIAGARA_IMPL is not defined and FPUSED_FLAG is set, restores
 *	  %gsr, the floating point registers and %fprs from the frame
 *	- restores the previous t_lofault handler
 *	- releases the register window
 *	- reloads %o0-%o2 with the original arguments and jumps to
 *	  REAL_LOFAULT, so the real handler can vector to the appropriate
 *	  member of the t_copyop structure, if needed
 */
	ENTRY(copyio_fault)
#if !defined(NIAGARA_IMPL)
	btst	FPUSED_FLAG, SAVED_LOFAULT
	bz	1f
	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT ! always strip flag

	! The gsr image is written to the frame with stx and read back
	! with ldx on the normal exit path, so all 64 bits must be
	! loaded here as well.  A 32-bit ld would pick up only the upper
	! word of the slot.
	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3		! was FEF set in the saved %fprs?
	bz	%icc, 4f
	  nop

	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO				! zero all of the fpregs
	wr	%o3, 0, %fprs		! restore fprs

1:
#endif	/* NIAGARA_IMPL */

	! Put the previous handler back in both builds, exactly as
	! copyio_fault_nowindow does.  Otherwise t_lofault would still
	! point at this routine after the copy has been abandoned.
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	restore				! give back the register window

	mov	SAVE_SRC, %o0
	mov	SAVE_DST, %o1
	jmp	REAL_LOFAULT
	  mov	SAVE_COUNT, %o2
	SET_SIZE(copyio_fault)
1792
1793	ENTRY(copyio_fault_nowindow)
1794	membar	#Sync
1795	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1796
1797	mov	SAVE_SRC, %o0
1798	mov	SAVE_DST, %o1
1799	jmp	REAL_LOFAULT
1800	  mov	SAVE_COUNT, %o2
1801	SET_SIZE(copyio_fault_nowindow)
1802
1803	ENTRY(copyout)
1804	sethi	%hi(.copyout_err), REAL_LOFAULT
1805	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
1806
1807.do_copyout:
1808	!
1809	! Check the length and bail if zero.
1810	!
1811	tst	%o2
1812	bnz,pt	%ncc, 1f
1813	  nop
1814	retl
1815	  clr	%o0
18161:
1817	sethi	%hi(copyio_fault), %o4
1818	or	%o4, %lo(copyio_fault), %o4
1819	sethi	%hi(copyio_fault_nowindow), %o3
1820	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
1821	or	%o3, %lo(copyio_fault_nowindow), %o3
1822	membar	#Sync
1823	stn	%o3, [THREAD_REG + T_LOFAULT]
1824
1825	mov	%o0, SAVE_SRC
1826	mov	%o1, SAVE_DST
1827	mov	%o2, SAVE_COUNT
1828
1829	!
1830	! Check to see if we're more than SMALL_LIMIT (7 bytes).
1831	! Run in leaf mode, using the %o regs as our input regs.
1832	!
1833	subcc	%o2, SMALL_LIMIT, %o3
1834	bgu,a,pt %ncc, .dco_ns
1835	or	%o0, %o1, %o3
1836	!
1837	! What was previously ".small_copyout"
1838	! Do full differenced copy.
1839	!
1840.dcobcp:
1841	sub	%g0, %o2, %o3		! negate count
1842	add	%o0, %o2, %o0		! make %o0 point at the end
1843	add	%o1, %o2, %o1		! make %o1 point at the end
1844	ba,pt	%ncc, .dcocl
1845	ldub	[%o0 + %o3], %o4	! load first byte
1846	!
1847	! %o0 and %o1 point at the end and remain pointing at the end
1848	! of their buffers. We pull things out by adding %o3 (which is
1849	! the negation of the length) to the buffer end which gives us
1850	! the current location in the buffers. By incrementing %o3 we walk
1851	! through both buffers without having to bump each buffer's
1852	! pointer. A very fast 4 instruction loop.
1853	!
1854	.align 16
1855.dcocl:
1856	stba	%o4, [%o1 + %o3]ASI_USER
1857	inccc	%o3
1858	bl,a,pt	%ncc, .dcocl
1859	ldub	[%o0 + %o3], %o4
1860	!
1861	! We're done. Go home.
1862	!
1863	membar	#Sync
1864	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
1865	retl
1866	clr	%o0
1867	!
1868	! Try aligned copies from here.
1869	!
1870.dco_ns:
1871	! %o0 = kernel addr (to be copied from)
1872	! %o1 = user addr (to be copied to)
1873	! %o2 = length
1874	! %o3 = %o0 | %o1 (used for alignment checking)
1875	! %o4 is alternate lo_fault (copyio_fault)
1876	! SAVED_LOFAULT (%g6) is original lo_fault
1877	!
1878	! See if we're single byte aligned. If we are, check the
1879	! limit for single byte copies. If we're smaller or equal,
1880	! bounce to the byte for byte copy loop. Otherwise do it in
1881	! HW (if enabled).
1882	!
1883	btst	1, %o3
1884	bz,pt	%icc, .dcoh8
1885	btst	7, %o3
1886	!
1887	! Single byte aligned. Do we do it via HW or via
1888	! byte for byte? Do a quick no memory reference
1889	! check to pick up small copies.
1890	!
1891	sethi	%hi(hw_copy_limit_1), %o3
1892	!
1893	! Big enough that we need to check the HW limit for
1894	! this size copy.
1895	!
1896	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1897	!
1898	! Is HW copy on? If not, do everything byte for byte.
1899	!
1900	tst	%o3
1901	bz,pn	%icc, .dcobcp
1902	subcc	%o3, %o2, %o3
1903	!
1904	! If we're less than or equal to the single byte copy limit,
1905	! bop to the copy loop.
1906	!
1907	bge,pt	%ncc, .dcobcp
1908	nop
1909	!
1910	! We're big enough and copy is on. Do it with HW.
1911	!
1912	ba,pt	%ncc, .big_copyout
1913	nop
1914.dcoh8:
1915	!
1916	! 8 byte aligned?
1917	!
1918	bnz,a	%ncc, .dcoh4
1919	btst	3, %o3
1920	!
1921	! See if we're in the "small range".
1922	! If so, go off and do the copy.
1923	! If not, load the hard limit. %o3 is
1924	! available for reuse.
1925	!
1926	sethi	%hi(hw_copy_limit_8), %o3
1927	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1928	!
1929	! If it's zero, there's no HW bcopy.
1930	! Bop off to the aligned copy.
1931	!
1932	tst	%o3
1933	bz,pn	%icc, .dcos8
1934	subcc	%o3, %o2, %o3
1935	!
1936	! We're negative if our size is larger than hw_copy_limit_8.
1937	!
1938	bge,pt	%ncc, .dcos8
1939	nop
1940	!
1941	! HW assist is on and we're large enough. Do it.
1942	!
1943	ba,pt	%ncc, .big_copyout
1944	nop
1945.dcos8:
1946	!
1947	! Housekeeping for copy loops. Uses same idea as in the byte for
1948	! byte copy loop above.
1949	!
1950	add	%o0, %o2, %o0
1951	add	%o1, %o2, %o1
1952	sub	%g0, %o2, %o3
1953	ba,pt	%ncc, .dodebc
1954	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
1955	!
1956	! 4 byte aligned?
1957	!
1958.dcoh4:
1959	bnz,pn	%ncc, .dcoh2
1960	!
1961	! See if we're in the "small range".
1962	! If so, go off and do the copy.
1963	! If not, load the hard limit. %o3 is
1964	! available for reuse.
1965	!
1966	sethi	%hi(hw_copy_limit_4), %o3
1967	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1968	!
1969	! If it's zero, there's no HW bcopy.
1970	! Bop off to the aligned copy.
1971	!
1972	tst	%o3
1973	bz,pn	%icc, .dcos4
1974	subcc	%o3, %o2, %o3
1975	!
1976	! We're negative if our size is larger than hw_copy_limit_4.
1977	!
1978	bge,pt	%ncc, .dcos4
1979	nop
1980	!
1981	! HW assist is on and we're large enough. Do it.
1982	!
1983	ba,pt	%ncc, .big_copyout
1984	nop
1985.dcos4:
1986	add	%o0, %o2, %o0
1987	add	%o1, %o2, %o1
1988	sub	%g0, %o2, %o3
1989	ba,pt	%ncc, .dodfbc
1990	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
1991	!
1992	! We must be 2 byte aligned. Off we go.
1993	! The check for small copies was done in the
1994	! delay at .dcoh4
1995	!
1996.dcoh2:
1997	ble	%ncc, .dcos2
1998	sethi	%hi(hw_copy_limit_2), %o3
1999	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2000	tst	%o3
2001	bz,pn	%icc, .dcos2
2002	subcc	%o3, %o2, %o3
2003	bge,pt	%ncc, .dcos2
2004	nop
2005	!
2006	! HW is on and we're big enough. Do it.
2007	!
2008	ba,pt	%ncc, .big_copyout
2009	nop
2010.dcos2:
2011	add	%o0, %o2, %o0
2012	add	%o1, %o2, %o1
2013	sub	%g0, %o2, %o3
2014	ba,pt	%ncc, .dodtbc
2015	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
2016.small_copyout:
2017	!
2018	! Why are we doing this AGAIN? There are certain conditions in
2019	! big_copyout that will cause us to forego the HW assisted copies
2020	! and bounce back to a non-HW assisted copy. This dispatches those
2021	! copies. Note that we branch around this in the main line code.
2022	!
2023	! We make no check for limits or HW enablement here. We've
2024	! already been told that we're a poster child so just go off
2025	! and do it.
2026	!
2027	or	%o0, %o1, %o3
2028	btst	1, %o3
2029	bnz	%icc, .dcobcp		! Most likely
2030	btst	7, %o3
2031	bz	%icc, .dcos8
2032	btst	3, %o3
2033	bz	%icc, .dcos4
2034	nop
2035	ba,pt	%ncc, .dcos2
2036	nop
2037	.align 32
2038.dodebc:
2039	ldx	[%o0 + %o3], %o4
2040	deccc	%o2
2041	stxa	%o4, [%o1 + %o3]ASI_USER
2042	bg,pt	%ncc, .dodebc
2043	addcc	%o3, 8, %o3
2044	!
2045	! End of copy loop. Check to see if we're done. Most
2046	! eight byte aligned copies end here.
2047	!
2048	bz,pt	%ncc, .dcofh
2049	nop
2050	!
2051	! Something is left - do it byte for byte.
2052	!
2053	ba,pt	%ncc, .dcocl
2054	ldub	[%o0 + %o3], %o4	! load next byte
2055	!
2056	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2057	!
2058	.align 32
2059.dodfbc:
2060	lduw	[%o0 + %o3], %o4
2061	deccc	%o2
2062	sta	%o4, [%o1 + %o3]ASI_USER
2063	bg,pt	%ncc, .dodfbc
2064	addcc	%o3, 4, %o3
2065	!
2066	! End of copy loop. Check to see if we're done. Most
2067	! four byte aligned copies end here.
2068	!
2069	bz,pt	%ncc, .dcofh
2070	nop
2071	!
2072	! Something is left. Do it byte for byte.
2073	!
2074	ba,pt	%ncc, .dcocl
2075	ldub	[%o0 + %o3], %o4	! load next byte
2076	!
2077	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2078	! copy.
2079	!
2080	.align 32
2081.dodtbc:
2082	lduh	[%o0 + %o3], %o4
2083	deccc	%o2
2084	stha	%o4, [%o1 + %o3]ASI_USER
2085	bg,pt	%ncc, .dodtbc
2086	addcc	%o3, 2, %o3
2087	!
2088	! End of copy loop. Anything left?
2089	!
2090	bz,pt	%ncc, .dcofh
2091	nop
2092	!
2093	! Deal with the last byte
2094	!
2095	ldub	[%o0 + %o3], %o4
2096	stba	%o4, [%o1 + %o3]ASI_USER
2097.dcofh:
2098	membar	#Sync
2099	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2100	retl
2101	clr	%o0
2102
2103.big_copyout:
2104	! We're going to go off and do a block copy.
2105	! Switch fault handlers and grab a window. We
2106	! don't do a membar #Sync since we've done only
2107	! kernel data to this point.
2108	stn	%o4, [THREAD_REG + T_LOFAULT]
2109
2110	! Copyouts that reach here are larger than 256 bytes, because
2111	! hw_copy_limit_1 is set to 256. Never set this limit to less
2112	! than 128 bytes.
2113#if !defined(NIAGARA_IMPL)
2114	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2115
2116	rd	%fprs, %o2			! check for unused fp
2117	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
2118	btst	FPRS_FEF, %o2
2119	bz,a,pt	%icc, .do_block_copyout
2120	wr	%g0, FPRS_FEF, %fprs
2121
2122	! save in-use fpregs on stack
2123	BST_FP_TOSTACK(%o2)
2124#else	/* NIAGARA_IMPL */
2125	save	%sp, -SA(MINFRAME), %sp
2126#endif	/* NIAGARA_IMPL */
2127
2128.do_block_copyout:
2129
2130#if !defined(NIAGARA_IMPL)
2131	rd	%gsr, %o2
2132	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2133	! set the lower bit saved t_lofault to indicate that we need
2134	! clear %fprs register on the way out
2135	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2136#endif	/* NIAGARA_IMPL */
2137
2138	! Swap src/dst since the code below is memcpy code
2139	! and memcpy/bcopy have different calling sequences
2140	mov	%i1, %i5
2141	mov	%i0, %i1
2142	mov	%i5, %i0
2143
2144	! Block (64 bytes) align the destination.
2145	andcc	%i0, 0x3f, %i3		! is dst block aligned
2146	bz	%ncc, copyout_blalign	! dst already block aligned
2147	sub	%i3, 0x40, %i3
2148	neg	%i3			! bytes till dst 64 bytes aligned
2149	sub	%i2, %i3, %i2		! update i2 with new count
2150
2151	! Based on source and destination alignment do
2152	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
2153
2154	! Is dst & src 8B aligned
2155	or	%i0, %i1, %o2
2156	andcc	%o2, 0x7, %g0
2157	bz	%ncc, .co_alewdcp
2158	nop
2159
2160	! Is dst & src 4B aligned
2161	andcc	%o2, 0x3, %g0
2162	bz	%ncc, .co_alwdcp
2163	nop
2164
2165	! Is dst & src 2B aligned
2166	andcc	%o2, 0x1, %g0
2167	bz	%ncc, .co_alhlfwdcp
2168	nop
2169
2170	! 1B aligned
21711:	ldub	[%i1], %o2
2172	stba	%o2, [%i0]ASI_USER
2173	inc	%i1
2174	deccc	%i3
2175	bgu,pt	%ncc, 1b
2176	inc	%i0
2177
2178	ba	copyout_blalign
2179	nop
2180
2181	! dst & src 4B aligned
2182.co_alwdcp:
2183	ld	[%i1], %o2
2184	sta	%o2, [%i0]ASI_USER
2185	add	%i1, 0x4, %i1
2186	subcc	%i3, 0x4, %i3
2187	bgu,pt	%ncc, .co_alwdcp
2188	add	%i0, 0x4, %i0
2189
2190	ba	copyout_blalign
2191	nop
2192
2193	! dst & src 2B aligned
2194.co_alhlfwdcp:
2195	lduh	[%i1], %o2
2196	stuha	%o2, [%i0]ASI_USER
2197	add	%i1, 0x2, %i1
2198	subcc	%i3, 0x2, %i3
2199	bgu,pt	%ncc, .co_alhlfwdcp
2200	add	%i0, 0x2, %i0
2201
2202	ba	copyout_blalign
2203	nop
2204
2205	! dst & src 8B aligned
2206.co_alewdcp:
2207	ldx	[%i1], %o2
2208	stxa	%o2, [%i0]ASI_USER
2209	add	%i1, 0x8, %i1
2210	subcc	%i3, 0x8, %i3
2211	bgu,pt	%ncc, .co_alewdcp
2212	add	%i0, 0x8, %i0
2213
2214	! Now Destination is block (64 bytes) aligned
2215copyout_blalign:
2216	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
2217	sub	%i2, %i3, %i2		! Residue bytes in %i2
2218
2219	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
2220
2221#if !defined(NIAGARA_IMPL)
2222	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
2223	prefetch [%l0+0x0], #one_read
2224	andcc	%i1, 0x3f, %g0		! is src 64B aligned
2225	bz,pn	%ncc, .co_blkcpy
2226	nop
2227
2228	! handle misaligned source cases
2229	alignaddr %i1, %g0, %g0		! generate %gsr
2230
2231	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
2232					! significant in %l1
2233	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
2234	add	%i1, %i3, %i1
2235
2236	! switch statement to get to right 8 byte block within
2237	! 64 byte block
2238	cmp	 %l2, 0x4
2239	bgeu,a	 co_hlf
2240	cmp	 %l2, 0x6
2241	cmp	 %l2, 0x2
2242	bgeu,a	 co_sqtr
2243	nop
2244	cmp	 %l2, 0x1
2245	be,a	 co_off15
2246	nop
2247	ba	 co_off7
2248	nop
2249co_sqtr:
2250	be,a	 co_off23
2251	nop
2252	ba,a	 co_off31
2253	nop
2254
2255co_hlf:
2256	bgeu,a	 co_fqtr
2257	nop
2258	cmp	 %l2, 0x5
2259	be,a	 co_off47
2260	nop
2261	ba	 co_off39
2262	nop
2263co_fqtr:
2264	be,a	 co_off55
2265	nop
2266
2267	ldd	[%l0+0x38], %d14
2268	prefetch [%l0+0x40], #one_read
2269	prefetch [%l0+0x80], #one_read
22707:
2271	add	%l0, 0x40, %l0
2272	stxa	%g0, [%i0]%asi		! initialize the cache line
2273
2274	ldda	[%l0]ASI_BLK_P, %d16
2275	ALIGN_OFF_56_63
2276	fmovd	%d30, %d14
2277
2278	stda	%d48, [%i0]ASI_BLK_AIUS
2279	subcc	%i3, 0x40, %i3
2280	add	%i0, 0x40, %i0
2281	bgu,pt	%ncc, 7b
2282	prefetch [%l0+0x80], #one_read
2283	ba	.co_blkdone
2284	membar	#Sync
2285
2286co_off7:
2287	ldda	[%l0]ASI_BLK_P, %d0
2288	prefetch [%l0+0x40], #one_read
2289	prefetch [%l0+0x80], #one_read
22900:
2291	add	%l0, 0x40, %l0
2292	stxa	%g0, [%i0]%asi		! initialize the cache line
2293
2294	ldda	[%l0]ASI_BLK_P, %d16
2295	ALIGN_OFF_1_7
2296	fmovd	%d16, %d0
2297	fmovd	%d18, %d2
2298	fmovd	%d20, %d4
2299	fmovd	%d22, %d6
2300	fmovd	%d24, %d8
2301	fmovd	%d26, %d10
2302	fmovd	%d28, %d12
2303	fmovd	%d30, %d14
2304
2305	stda	%d48, [%i0]ASI_BLK_AIUS
2306	subcc	%i3, 0x40, %i3
2307	add	%i0, 0x40, %i0
2308	bgu,pt	%ncc, 0b
2309	prefetch [%l0+0x80], #one_read
2310	ba	.co_blkdone
2311	membar	#Sync
2312
2313co_off15:
2314	ldd	[%l0+0x8], %d2
2315	ldd	[%l0+0x10], %d4
2316	ldd	[%l0+0x18], %d6
2317	ldd	[%l0+0x20], %d8
2318	ldd	[%l0+0x28], %d10
2319	ldd	[%l0+0x30], %d12
2320	ldd	[%l0+0x38], %d14
2321	prefetch [%l0+0x40], #one_read
2322	prefetch [%l0+0x80], #one_read
23231:
2324	add	%l0, 0x40, %l0
2325	stxa	%g0, [%i0]%asi		! initialize the cache line
2326
2327	ldda	[%l0]ASI_BLK_P, %d16
2328	ALIGN_OFF_8_15
2329	fmovd	%d18, %d2
2330	fmovd	%d20, %d4
2331	fmovd	%d22, %d6
2332	fmovd	%d24, %d8
2333	fmovd	%d26, %d10
2334	fmovd	%d28, %d12
2335	fmovd	%d30, %d14
2336
2337	stda	%d48, [%i0]ASI_BLK_AIUS
2338	subcc	%i3, 0x40, %i3
2339	add	%i0, 0x40, %i0
2340	bgu,pt	%ncc, 1b
2341	prefetch [%l0+0x80], #one_read
2342	ba	.co_blkdone
2343	membar	#Sync
2344
2345co_off23:
2346	ldd	[%l0+0x10], %d4
2347	ldd	[%l0+0x18], %d6
2348	ldd	[%l0+0x20], %d8
2349	ldd	[%l0+0x28], %d10
2350	ldd	[%l0+0x30], %d12
2351	ldd	[%l0+0x38], %d14
2352	prefetch [%l0+0x40], #one_read
2353	prefetch [%l0+0x80], #one_read
23542:
2355	add	%l0, 0x40, %l0
2356	stxa	%g0, [%i0]%asi		! initialize the cache line
2357
2358	ldda	[%l0]ASI_BLK_P, %d16
2359	ALIGN_OFF_16_23
2360	fmovd	%d20, %d4
2361	fmovd	%d22, %d6
2362	fmovd	%d24, %d8
2363	fmovd	%d26, %d10
2364	fmovd	%d28, %d12
2365	fmovd	%d30, %d14
2366
2367	stda	%d48, [%i0]ASI_BLK_AIUS
2368	subcc	%i3, 0x40, %i3
2369	add	%i0, 0x40, %i0
2370	bgu,pt	%ncc, 2b
2371	prefetch [%l0+0x80], #one_read
2372	ba	.co_blkdone
2373	membar	#Sync
2374
2375co_off31:
2376	ldd	[%l0+0x18], %d6
2377	ldd	[%l0+0x20], %d8
2378	ldd	[%l0+0x28], %d10
2379	ldd	[%l0+0x30], %d12
2380	ldd	[%l0+0x38], %d14
2381	prefetch [%l0+0x40], #one_read
2382	prefetch [%l0+0x80], #one_read
23833:
2384	add	%l0, 0x40, %l0
2385	stxa	%g0, [%i0]%asi		! initialize the cache line
2386
2387	ldda	[%l0]ASI_BLK_P, %d16
2388	ALIGN_OFF_24_31
2389	fmovd	%d22, %d6
2390	fmovd	%d24, %d8
2391	fmovd	%d26, %d10
2392	fmovd	%d28, %d12
2393	fmovd	%d30, %d14
2394
2395	stda	%d48, [%i0]ASI_BLK_AIUS
2396	subcc	%i3, 0x40, %i3
2397	add	%i0, 0x40, %i0
2398	bgu,pt	%ncc, 3b
2399	prefetch [%l0+0x80], #one_read
2400	ba	.co_blkdone
2401	membar	#Sync
2402
2403co_off39:
2404	ldd	[%l0+0x20], %d8
2405	ldd	[%l0+0x28], %d10
2406	ldd	[%l0+0x30], %d12
2407	ldd	[%l0+0x38], %d14
2408	prefetch [%l0+0x40], #one_read
2409	prefetch [%l0+0x80], #one_read
24104:
2411	add	%l0, 0x40, %l0
2412	stxa	%g0, [%i0]%asi		! initialize the cache line
2413
2414	ldda	[%l0]ASI_BLK_P, %d16
2415	ALIGN_OFF_32_39
2416	fmovd	%d24, %d8
2417	fmovd	%d26, %d10
2418	fmovd	%d28, %d12
2419	fmovd	%d30, %d14
2420
2421	stda	%d48, [%i0]ASI_BLK_AIUS
2422	subcc	%i3, 0x40, %i3
2423	add	%i0, 0x40, %i0
2424	bgu,pt	%ncc, 4b
2425	prefetch [%l0+0x80], #one_read
2426	ba	.co_blkdone
2427	membar	#Sync
2428
2429co_off47:
2430	ldd	[%l0+0x28], %d10
2431	ldd	[%l0+0x30], %d12
2432	ldd	[%l0+0x38], %d14
2433	prefetch [%l0+0x40], #one_read
2434	prefetch [%l0+0x80], #one_read
24355:
2436	add	%l0, 0x40, %l0
2437	stxa	%g0, [%i0]%asi		! initialize the cache line
2438
2439	ldda	[%l0]ASI_BLK_P, %d16
2440	ALIGN_OFF_40_47
2441	fmovd	%d26, %d10
2442	fmovd	%d28, %d12
2443	fmovd	%d30, %d14
2444
2445	stda	%d48, [%i0]ASI_BLK_AIUS
2446	subcc	%i3, 0x40, %i3
2447	add	%i0, 0x40, %i0
2448	bgu,pt	%ncc, 5b
2449	prefetch [%l0+0x80], #one_read
2450	ba	.co_blkdone
2451	membar	#Sync
2452
2453co_off55:
2454	ldd	[%l0+0x30], %d12
2455	ldd	[%l0+0x38], %d14
2456	prefetch [%l0+0x40], #one_read
2457	prefetch [%l0+0x80], #one_read
24586:
2459	add	%l0, 0x40, %l0
2460	stxa	%g0, [%i0]%asi		! initialize the cache line
2461
2462	ldda	[%l0]ASI_BLK_P, %d16
2463	ALIGN_OFF_48_55
2464	fmovd	%d28, %d12
2465	fmovd	%d30, %d14
2466
2467	stda	%d48, [%i0]ASI_BLK_AIUS
2468	subcc	%i3, 0x40, %i3
2469	add	%i0, 0x40, %i0
2470	bgu,pt	%ncc, 6b
2471	prefetch [%l0+0x80], #one_read
2472	ba	.co_blkdone
2473	membar	#Sync
2474
2475.co_blkcpy:
2476	prefetch [%i1+0x40], #one_read
2477	prefetch [%i1+0x80], #one_read
24788:
2479	stxa	%g0, [%i0]%asi		! initialize the cache line
2480	ldda	[%i1]ASI_BLK_P, %d0
2481	stda	%d0, [%i0]ASI_BLK_AIUS
2482
2483	add	%i1, 0x40, %i1
2484	subcc	%i3, 0x40, %i3
2485	add	%i0, 0x40, %i0
2486	bgu,pt	%ncc, 8b
2487	prefetch [%i1+0x80], #one_read
2488	membar	#Sync
2489
2490.co_blkdone:
2491#else	/* NIAGARA_IMPL */
2492	andcc	%i1, 0xf, %o2		! is src quadword aligned
2493	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
2494	nop
2495	cmp	%o2, 0x8
2496	bg	.co_upper_double
2497	nop
2498	bl	.co_lower_double
2499	nop
2500
2501	! Falls through when source offset is equal to 8 i.e.
2502	! source is double word aligned.
2503	! In this case no shift/merge of data is required
2504
2505	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2506	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2507	prefetch [%l0+0x0], #one_read
2508	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2509.co_loop0:
2510	add	%i1, 0x10, %i1
2511	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2512	prefetch [%l0+0x40], #one_read
2513
2514	stxa	%l3, [%i0+0x0]%asi
2515	stxa	%l4, [%i0+0x8]%asi
2516
2517	add	%i1, 0x10, %i1
2518	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2519
2520	stxa	%l5, [%i0+0x10]%asi
2521	stxa	%l2, [%i0+0x18]%asi
2522
2523	add	%i1, 0x10, %i1
2524	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2525
2526	stxa	%l3, [%i0+0x20]%asi
2527	stxa	%l4, [%i0+0x28]%asi
2528
2529	add	%i1, 0x10, %i1
2530	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2531
2532	stxa	%l5, [%i0+0x30]%asi
2533	stxa	%l2, [%i0+0x38]%asi
2534
2535	add	%l0, 0x40, %l0
2536	subcc	%i3, 0x40, %i3
2537	bgu,pt	%xcc, .co_loop0
2538	add	%i0, 0x40, %i0
2539	ba	.co_blkdone
2540	add	%i1, %o2, %i1		! increment the source by src offset
2541					! the src offset was stored in %o2
2542
2543.co_lower_double:
2544
2545	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2546	sll	%o2, 3, %o0		! %o0 left shift
2547	mov	0x40, %o1
2548	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2549	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2550	prefetch [%l0+0x0], #one_read
2551	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l2 and %l3 has
2552					! complete data
2553.co_loop1:
2554	add	%i1, 0x10, %i1
2555	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has partial data
2556							! for this read.
2557	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
2558							! into %l2 and %l3
2559	prefetch [%l0+0x40], #one_read
2560
2561	stxa	%l2, [%i0+0x0]%asi
2562	stxa	%l3, [%i0+0x8]%asi
2563
2564	add	%i1, 0x10, %i1
2565	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2566	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
2567							! %l4 from previous read
2568							! into %l4 and %l5
2569	stxa	%l4, [%i0+0x10]%asi
2570	stxa	%l5, [%i0+0x18]%asi
2571
2572	! Repeat the same for next 32 bytes.
2573
2574	add	%i1, 0x10, %i1
2575	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2576	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
2577
2578	stxa	%l2, [%i0+0x20]%asi
2579	stxa	%l3, [%i0+0x28]%asi
2580
2581	add	%i1, 0x10, %i1
2582	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2583	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
2584
2585	stxa	%l4, [%i0+0x30]%asi
2586	stxa	%l5, [%i0+0x38]%asi
2587
2588	add	%l0, 0x40, %l0
2589	subcc	%i3, 0x40, %i3
2590	bgu,pt	%xcc, .co_loop1
2591	add	%i0, 0x40, %i0
2592	ba	.co_blkdone
2593	add	%i1, %o2, %i1		! increment the source by src offset
2594					! the src offset was stored in %o2
2595
2596.co_upper_double:
2597
2598	sub	%i1, %o2, %i1		! align the src at 16 bytes.
2599	sub	%o2, 0x8, %o0
2600	sll	%o0, 3, %o0		! %o0 left shift
2601	mov	0x40, %o1
2602	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
2603	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
2604	prefetch [%l0+0x0], #one_read
2605	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2	! partial data in %l3
2606							! for this read and
2607							! no data in %l2
2608.co_loop2:
2609	add	%i1, 0x10, %i1
2610	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4	! %l4 has complete data
2611							! and %l5 has partial
2612	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
2613							! into %l3 and %l4
2614	prefetch [%l0+0x40], #one_read
2615
2616	stxa	%l3, [%i0+0x0]%asi
2617	stxa	%l4, [%i0+0x8]%asi
2618
2619	add	%i1, 0x10, %i1
2620	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2621	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
2622							! %l5 from previous read
2623							! into %l5 and %l2
2624
2625	stxa	%l5, [%i0+0x10]%asi
2626	stxa	%l2, [%i0+0x18]%asi
2627
2628	! Repeat the same for next 32 bytes.
2629
2630	add	%i1, 0x10, %i1
2631	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2632	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
2633
2634	stxa	%l3, [%i0+0x20]%asi
2635	stxa	%l4, [%i0+0x28]%asi
2636
2637	add	%i1, 0x10, %i1
2638	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2639	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
2640
2641	stxa	%l5, [%i0+0x30]%asi
2642	stxa	%l2, [%i0+0x38]%asi
2643
2644	add	%l0, 0x40, %l0
2645	subcc	%i3, 0x40, %i3
2646	bgu,pt	%xcc, .co_loop2
2647	add	%i0, 0x40, %i0
2648	ba	.co_blkdone
2649	add	%i1, %o2, %i1		! increment the source by src offset
2650					! the src offset was stored in %o2
2651
2652
2653	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
2654.co_blkcpy:
2655
2656	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
2657	prefetch [%o0+0x0], #one_read
26581:
2659	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0
2660	add	%i1, 0x10, %i1
2661	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2
2662	add	%i1, 0x10, %i1
2663
2664	prefetch [%o0+0x40], #one_read
2665
2666	stxa	%l0, [%i0+0x0]%asi
2667
2668	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4
2669	add	%i1, 0x10, %i1
2670	ldda	[%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6
2671	add	%i1, 0x10, %i1
2672
2673	stxa	%l1, [%i0+0x8]%asi
2674	stxa	%l2, [%i0+0x10]%asi
2675	stxa	%l3, [%i0+0x18]%asi
2676	stxa	%l4, [%i0+0x20]%asi
2677	stxa	%l5, [%i0+0x28]%asi
2678	stxa	%l6, [%i0+0x30]%asi
2679	stxa	%l7, [%i0+0x38]%asi
2680
2681	add	%o0, 0x40, %o0
2682	subcc	%i3, 0x40, %i3
2683	bgu,pt	%xcc, 1b
2684	add	%i0, 0x40, %i0
2685
2686.co_blkdone:
2687	membar	#Sync
2688#endif	/* NIAGARA_IMPL */
2689
2690	brz,pt	%i2, .copyout_exit
2691	nop
2692
2693	! Handle trailing bytes
2694	cmp	%i2, 0x8
2695	blu,pt	%ncc, .co_residue
2696	nop
2697
2698	! Can we do some 8B ops
2699	or	%i1, %i0, %o2
2700	andcc	%o2, 0x7, %g0
2701	bnz	%ncc, .co_last4
2702	nop
2703
2704	! Do 8byte ops as long as possible
2705.co_last8:
2706	ldx	[%i1], %o2
2707	stxa	%o2, [%i0]ASI_USER
2708	add	%i1, 0x8, %i1
2709	sub	%i2, 0x8, %i2
2710	cmp	%i2, 0x8
2711	bgu,pt	%ncc, .co_last8
2712	add	%i0, 0x8, %i0
2713
2714	brz,pt	%i2, .copyout_exit
2715	nop
2716
2717	ba	.co_residue
2718	nop
2719
2720.co_last4:
2721	! Can we do 4B ops
2722	andcc	%o2, 0x3, %g0
2723	bnz	%ncc, .co_last2
2724	nop
27251:
2726	ld	[%i1], %o2
2727	sta	%o2, [%i0]ASI_USER
2728	add	%i1, 0x4, %i1
2729	sub	%i2, 0x4, %i2
2730	cmp	%i2, 0x4
2731	bgu,pt	%ncc, 1b
2732	add	%i0, 0x4, %i0
2733
2734	brz,pt	%i2, .copyout_exit
2735	nop
2736
2737	ba	.co_residue
2738	nop
2739
2740.co_last2:
2741	! Can we do 2B ops
2742	andcc	%o2, 0x1, %g0
2743	bnz	%ncc, .co_residue
2744	nop
2745
27461:
2747	lduh	[%i1], %o2
2748	stuha	%o2, [%i0]ASI_USER
2749	add	%i1, 0x2, %i1
2750	sub	%i2, 0x2, %i2
2751	cmp	%i2, 0x2
2752	bgu,pt	%ncc, 1b
2753	add	%i0, 0x2, %i0
2754
2755	brz,pt	%i2, .copyout_exit
2756	nop
2757
2758	! Copy the residue as byte copy
2759.co_residue:
2760	ldub	[%i1], %i4
2761	stba	%i4, [%i0]ASI_USER
2762	inc	%i1
2763	deccc	%i2
2764	bgu,pt	%xcc, .co_residue
2765	inc	%i0
2766
2767.copyout_exit:
2768#if !defined(NIAGARA_IMPL)
2769	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2770	wr	%o2, 0, %gsr		! restore gsr
2771
2772	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2773	btst	FPRS_FEF, %o3
2774	bz	%icc, 4f
2775	  nop
2776
2777	! restore fpregs from stack
2778	BLD_FP_FROMSTACK(%o2)
2779
2780	ba,pt	%ncc, 2f
2781	  wr	%o3, 0, %fprs		! restore fprs
2782
27834:
2784	FZERO				! zero all of the fpregs
2785	wr	%o3, 0, %fprs		! restore fprs
2786
27872:
2788	membar	#Sync
2789	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2790#else	/* NIAGARA_IMPL */
2791	membar	#Sync
2792#endif	/* NIAGARA_IMPL */
2793	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2794	ret
2795	restore	%g0, 0, %o0
2796
2797.copyout_err:
2798	ldn	[THREAD_REG + T_COPYOPS], %o4
2799	brz	%o4, 2f
2800	nop
2801	ldn	[%o4 + CP_COPYOUT], %g2
2802	jmp	%g2
2803	nop
28042:
2805	retl
2806	mov	-1, %o0
2807	SET_SIZE(copyout)
2808
2809#endif	/* lint */
2810
2811
/*
 * xcopyout(kaddr, uaddr, count)
 *
 * Under lint only a C stub is built, which supplies the prototype and
 * returns 0.
 *
 * The assembly entry point loads the address of .xcopyout_err into
 * REAL_LOFAULT and branches to .do_copyout, which is defined outside
 * this block (presumably the shared copyout body - confirm).
 *
 * .xcopyout_err: loads the T_COPYOPS field of the current thread.  If
 * it is non-zero, control jumps to the routine stored at offset
 * CP_XCOPYOUT within it; otherwise the value found in %g1 is returned
 * to the caller.
 * NOTE(review): %g1 is presumably the error code left by the fault
 * path that dispatches here - confirm against the fault handler.
 */
2812#ifdef	lint
2813
2814/*ARGSUSED*/
2815int
2816xcopyout(const void *kaddr, void *uaddr, size_t count)
2817{ return (0); }
2818
2819#else	/* lint */
2820
2821	ENTRY(xcopyout)
	! Build the address of our error handler in REAL_LOFAULT; the low
	! bits are filled in by the "or" in the branch delay slot below.
2822	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2823	b	.do_copyout
2824	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2825.xcopyout_err:
	! %o4 = T_COPYOPS field of the current thread
2826	ldn	[THREAD_REG + T_COPYOPS], %o4
	! nothing installed there: return the value in %g1 instead
2827	brz	%o4, 2f
2828	nop
	! otherwise jump to the CP_XCOPYOUT entry of that vector
2829	ldn	[%o4 + CP_XCOPYOUT], %g2
2830	jmp	%g2
2831	nop
28322:
	! leaf return; the delay slot copies %g1 into the return register
2833	retl
2834	mov	%g1, %o0
2835	SET_SIZE(xcopyout)
2836
2837#endif	/* lint */
2838
/*
 * xcopyout_little(kaddr, uaddr, count)
 *
 * Under lint only a C stub is built, which supplies the prototype and
 * returns 0.
 *
 * The assembly version copies count bytes, one byte at a time, reading
 * with ordinary loads through %o0 and writing through %o1 with
 * ASI_AIUSL.  The source is walked backwards while the destination is
 * walked forwards, so the byte order is reversed:
 *
 *	dst[i] = src[count - 1 - i]	for i = 0 .. count - 1
 *
 * Register use:
 *	%o0	source pointer, biased so that %o0 + %o3 is the current
 *		source byte
 *	%o1	end of destination (uaddr + count)
 *	%o2	count
 *	%o3	negative index running from -count up to 0
 *	%o4	handler address first, then the byte being moved
 *	%o5	previous t_lofault value, put back before returning
 *
 * While the copy runs, t_lofault is set to .little_err, which is
 * defined outside this block.
 * NOTE(review): presumably a faulting access resumes at .little_err -
 * confirm against the trap handling code.
 *
 * The path visible here always returns 0.
 */
2839#ifdef	lint
2840
2841/*ARGSUSED*/
2842int
2843xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2844{ return (0); }
2845
2846#else	/* lint */
2847
2848	ENTRY(xcopyout_little)
	! Install .little_err as t_lofault, keeping the old value in %o5.
2849	sethi	%hi(.little_err), %o4
2850	ldn	[THREAD_REG + T_LOFAULT], %o5
2851	or	%o4, %lo(.little_err), %o4
2852	membar	#Sync			! sync error barrier
2853	stn	%o4, [THREAD_REG + T_LOFAULT]
2854
	! %o3 = -count (sets the condition codes tested by bz below);
	! %o0 = kaddr + count.
2855	subcc	%g0, %o2, %o3
2856	add	%o0, %o2, %o0
2857	bz,pn	%ncc, 2f		! check for zero bytes
	! delay slot (not annulled, so it always executes): %o4 = count - 1
2858	sub	%o2, 1, %o4
	! %o0 = kaddr + 2*count - 1, hence %o0 + %o3 = kaddr + count - 1
2859	add	%o0, %o4, %o0		! start w/last byte
	! %o1 = uaddr + count, hence %o1 + %o3 = uaddr
2860	add	%o1, %o2, %o1
2861	ldub	[%o0+%o3], %o4
2862
	! Each pass stores one byte, adds 1 to %o3 and subtracts 2 from %o0,
	! so the store address rises by one while the load address drops by
	! one.  inccc produces a carry only when %o3 wraps to zero, which
	! ends the loop; the annulled delay-slot load is skipped on exit.
28631:	stba	%o4, [%o1+%o3]ASI_AIUSL
2864	inccc	%o3
2865	sub	%o0, 2, %o0		! get next byte
2866	bcc,a,pt %ncc, 1b
2867	  ldub	[%o0+%o3], %o4
2868
28692:	membar	#Sync			! sync error barrier
2870	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2871	retl
2872	mov	%g0, %o0		! return (0)
2873	SET_SIZE(xcopyout_little)
2874
2875#endif	/* lint */
2876
2877/*
2878 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2879 */
2880
2881#if defined(lint)
2882
2883/*ARGSUSED*/
2884int
2885copyin(const void *uaddr, void *kaddr, size_t count)
2886{ return (0); }
2887
2888#else	/* lint */
2889
2890	ENTRY(copyin)
2891	sethi	%hi(.copyin_err), REAL_LOFAULT
2892	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
2893
2894.do_copyin:
2895	!
2896	! Check the length and bail if zero.
2897	!
2898	tst	%o2
2899	bnz,pt	%ncc, 1f
2900	  nop
2901	retl
2902	  clr	%o0
29031:
2904	sethi	%hi(copyio_fault), %o4
2905	or	%o4, %lo(copyio_fault), %o4
2906	sethi	%hi(copyio_fault_nowindow), %o3
2907	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2908	or	%o3, %lo(copyio_fault_nowindow), %o3
2909	membar	#Sync
2910	stn	%o3, [THREAD_REG + T_LOFAULT]
2911
2912	mov	%o0, SAVE_SRC
2913	mov	%o1, SAVE_DST
2914	mov	%o2, SAVE_COUNT
2915
2916	!
2917	! Check to see if we're more than SMALL_LIMIT.
2918	!
2919	subcc	%o2, SMALL_LIMIT, %o3
2920	bgu,a,pt %ncc, .dci_ns
2921	or	%o0, %o1, %o3
2922	!
2923	! What was previously ".small_copyin"
2924	!
2925.dcibcp:
2926	sub	%g0, %o2, %o3		! setup for copy loop
2927	add	%o0, %o2, %o0
2928	add	%o1, %o2, %o1
2929	ba,pt	%ncc, .dcicl
2930	lduba	[%o0 + %o3]ASI_USER, %o4
2931	!
2932	! %o0 and %o1 point at the end and remain pointing at the end
2933	! of their buffers. We pull things out by adding %o3 (which is
2934	! the negation of the length) to the buffer end which gives us
2935	! the current location in the buffers. By incrementing %o3 we walk
2936	! through both buffers without having to bump each buffer's
2937	! pointer. A very fast 4 instruction loop.
2938	!
2939	.align 16
2940.dcicl:
2941	stb	%o4, [%o1 + %o3]
2942	inccc	%o3
2943	bl,a,pt %ncc, .dcicl
2944	lduba	[%o0 + %o3]ASI_USER, %o4
2945	!
2946	! We're done. Go home.
2947	!
2948	membar	#Sync
2949	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2950	retl
2951	clr	%o0
2952	!
2953	! Try aligned copies from here.
2954	!
2955.dci_ns:
2956	!
2957	! See if we're single byte aligned. If we are, check the
2958	! limit for single byte copies. If we're smaller, or equal,
2959	! bounce to the byte for byte copy loop. Otherwise do it in
2960	! HW (if enabled).
2961	!
2962	btst	1, %o3
2963	bz,a,pt	%icc, .dcih8
2964	btst	7, %o3
2965	!
2966	! We're single byte aligned.
2967	!
2968	sethi	%hi(hw_copy_limit_1), %o3
2969	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2970	!
2971	! Is HW copy on? If not do everything byte for byte.
2972	!
2973	tst	%o3
2974	bz,pn	%icc, .dcibcp
2975	subcc	%o3, %o2, %o3
2976	!
2977	! Are we bigger than the HW limit? If not
2978	! go to byte for byte.
2979	!
2980	bge,pt	%ncc, .dcibcp
2981	nop
2982	!
2983	! We're big enough and copy is on. Do it with HW.
2984	!
2985	ba,pt	%ncc, .big_copyin
2986	nop
2987.dcih8:
2988	!
2989	! 8 byte aligned?
2990	!
2991	bnz,a	%ncc, .dcih4
2992	btst	3, %o3
2993	!
2994	! We're eight byte aligned.
2995	!
2996	sethi	%hi(hw_copy_limit_8), %o3
2997	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2998	!
2999	! Is HW assist on? If not, do it with the aligned copy.
3000	!
3001	tst	%o3
3002	bz,pn	%icc, .dcis8
3003	subcc	%o3, %o2, %o3
3004	bge	%ncc, .dcis8
3005	nop
3006	ba,pt	%ncc, .big_copyin
3007	nop
3008.dcis8:
3009	!
3010	! Housekeeping for copy loops. Uses same idea as in the byte for
3011	! byte copy loop above.
3012	!
3013	add	%o0, %o2, %o0
3014	add	%o1, %o2, %o1
3015	sub	%g0, %o2, %o3
3016	ba,pt	%ncc, .didebc
3017	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
3018	!
3019	! 4 byte aligned?
3020	!
3021.dcih4:
3022	bnz	%ncc, .dcih2
3023	sethi	%hi(hw_copy_limit_4), %o3
3024	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3025	!
3026	! Is HW assist on? If not, do it with the aligned copy.
3027	!
3028	tst	%o3
3029	bz,pn	%icc, .dcis4
3030	subcc	%o3, %o2, %o3
3031	!
3032	! We're non-negative if our size is less than or equal to hw_copy_limit_4.
3033	!
3034	bge	%ncc, .dcis4
3035	nop
3036	ba,pt	%ncc, .big_copyin
3037	nop
3038.dcis4:
3039	!
3040	! Housekeeping for copy loops. Uses same idea as in the byte
3041	! for byte copy loop above.
3042	!
3043	add	%o0, %o2, %o0
3044	add	%o1, %o2, %o1
3045	sub	%g0, %o2, %o3
3046	ba,pt	%ncc, .didfbc
3047	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
3048.dcih2:
3049	!
3050	! We're two byte aligned. Check for "smallness"
3051	! done in delay at .dcih4
3052	!
3053	bleu,pt	%ncc, .dcis2
3054	sethi	%hi(hw_copy_limit_2), %o3
3055	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3056	!
3057	! Is HW assist on? If not, do it with the aligned copy.
3058	!
3059	tst	%o3
3060	bz,pn	%icc, .dcis2
3061	subcc	%o3, %o2, %o3
3062	!
3063	! Are we larger than the HW limit?
3064	!
3065	bge	%ncc, .dcis2
3066	nop
3067	!
3068	! HW assist is on and we're large enough to use it.
3069	!
3070	ba,pt	%ncc, .big_copyin
3071	nop
3072	!
3073	! Housekeeping for copy loops. Uses same idea as in the byte
3074	! for byte copy loop above.
3075	!
3076.dcis2:
3077	add	%o0, %o2, %o0
3078	add	%o1, %o2, %o1
3079	sub	%g0, %o2, %o3
3080	ba,pt	%ncc, .didtbc
3081	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
3082	!
3083.small_copyin:
3084	!
3085	! Why are we doing this AGAIN? There are certain conditions in
3086	! big copyin that will cause us to forgo the HW assisted copies
3087	! and bounce back to a non-hw assisted copy. This dispatches
3088	! those copies. Note that we branch around this in the main line
3089	! code.
3090	!
3091	! We make no check for limits or HW enablement here. We've
3092	! already been told that we're a poster child so just go off
3093	! and do it.
3094	!
3095	or	%o0, %o1, %o3
3096	btst	1, %o3
3097	bnz	%icc, .dcibcp		! Most likely
3098	btst	7, %o3
3099	bz	%icc, .dcis8
3100	btst	3, %o3
3101	bz	%icc, .dcis4
3102	nop
3103	ba,pt	%ncc, .dcis2
3104	nop
3105	!
3106	! Eight byte aligned copies. A steal from the original .small_copyin
3107	! with modifications. %o2 is number of 8 byte chunks to copy. When
3108	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
3109	! to copy.
3110	!
3111	.align 32
3112.didebc:
3113	ldxa	[%o0 + %o3]ASI_USER, %o4
3114	deccc	%o2
3115	stx	%o4, [%o1 + %o3]
3116	bg,pt	%ncc, .didebc
3117	addcc	%o3, 8, %o3
3118	!
3119	! End of copy loop. Most 8 byte aligned copies end here.
3120	!
3121	bz,pt	%ncc, .dcifh
3122	nop
3123	!
3124	! Something is left. Do it byte for byte.
3125	!
3126	ba,pt	%ncc, .dcicl
3127	lduba	[%o0 + %o3]ASI_USER, %o4
3128	!
3129	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
3130	!
3131	.align 32
3132.didfbc:
3133	lduwa	[%o0 + %o3]ASI_USER, %o4
3134	deccc	%o2
3135	st	%o4, [%o1 + %o3]
3136	bg,pt	%ncc, .didfbc
3137	addcc	%o3, 4, %o3
3138	!
3139	! End of copy loop. Most 4 byte aligned copies end here.
3140	!
3141	bz,pt	%ncc, .dcifh
3142	nop
3143	!
3144	! Something is left. Do it byte for byte.
3145	!
3146	ba,pt	%ncc, .dcicl
3147	lduba	[%o0 + %o3]ASI_USER, %o4
3148	!
3149	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
3150	! copy.
3151	!
3152	.align 32
3153.didtbc:
3154	lduha	[%o0 + %o3]ASI_USER, %o4
3155	deccc	%o2
3156	sth	%o4, [%o1 + %o3]
3157	bg,pt	%ncc, .didtbc
3158	addcc	%o3, 2, %o3
3159	!
3160	! End of copy loop. Most 2 byte aligned copies end here.
3161	!
3162	bz,pt	%ncc, .dcifh
3163	nop
3164	!
3165	! Deal with the last byte
3166	!
3167	lduba	[%o0 + %o3]ASI_USER, %o4
3168	stb	%o4, [%o1 + %o3]
3169.dcifh:
3170	membar	#Sync
3171	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3172	retl
3173	clr	%o0
3174
3175.big_copyin:
3176	! We're going off to do a block copy.
3177	! Switch fault handlers and grab a window. We
3178	! don't do a membar #Sync since we've done only
3179	! kernel data to this point.
3180	stn	%o4, [THREAD_REG + T_LOFAULT]
3181
3182	! Copyins that reach here are larger than 256 bytes. The
3183	! hw_copy_limit_1 is set to 256. Never set this limit to less
3184	! than 128 bytes.
3185#if !defined(NIAGARA_IMPL)
3186	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3187
3188	rd	%fprs, %o2			! check for unused fp
3189	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
3190	btst	FPRS_FEF, %o2
3191	bz,a,pt	%icc, .do_blockcopyin
3192	wr	%g0, FPRS_FEF, %fprs
3193
3194	! save in-use fpregs on stack
3195	BST_FP_TOSTACK(%o2)
3196#else	/* NIAGARA_IMPL */
3197	save	%sp, -SA(MINFRAME), %sp
3198#endif	/* NIAGARA_IMPL */
3199
3200.do_blockcopyin:
3201
3202#if !defined(NIAGARA_IMPL)
3203	rd	%gsr, %o2
3204	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
3205	! set the lower bit saved t_lofault to indicate that we need
3206	! clear %fprs register on the way out
3207	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3208#endif	/* NIAGARA_IMPL */
3209
3210	! Swap src/dst since the code below is memcpy code
3211	! and memcpy/bcopy have different calling sequences
3212	mov	%i1, %i5
3213	mov	%i0, %i1
3214	mov	%i5, %i0
3215
3216	! Block (64 bytes) align the destination.
3217	andcc	%i0, 0x3f, %i3		! is dst block aligned
3218	bz	%ncc, copyin_blalign	! dst already block aligned
3219	sub	%i3, 0x40, %i3
3220	neg	%i3			! bytes till dst 64 bytes aligned
3221	sub	%i2, %i3, %i2		! update i2 with new count
3222
3223	! Based on source and destination alignment do
3224	! either 8 bytes, 4 bytes, 2 bytes or byte copy.
3225
3226	! Is dst & src 8B aligned
3227	or	%i0, %i1, %o2
3228	andcc	%o2, 0x7, %g0
3229	bz	%ncc, .ci_alewdcp
3230	nop
3231
3232	! Is dst & src 4B aligned
3233	andcc	%o2, 0x3, %g0
3234	bz	%ncc, .ci_alwdcp
3235	nop
3236
3237	! Is dst & src 2B aligned
3238	andcc	%o2, 0x1, %g0
3239	bz	%ncc, .ci_alhlfwdcp
3240	nop
3241
3242	! 1B aligned
32431:	lduba	[%i1]ASI_USER, %o2
3244	stb	%o2, [%i0]
3245	inc	%i1
3246	deccc	%i3
3247	bgu,pt	%ncc, 1b
3248	inc	%i0
3249
3250	ba	copyin_blalign
3251	nop
3252
3253	! dst & src 4B aligned
3254.ci_alwdcp:
3255	lda	[%i1]ASI_USER, %o2
3256	st	%o2, [%i0]
3257	add	%i1, 0x4, %i1
3258	subcc	%i3, 0x4, %i3
3259	bgu,pt	%ncc, .ci_alwdcp
3260	add	%i0, 0x4, %i0
3261
3262	ba	copyin_blalign
3263	nop
3264
3265	! dst & src 2B aligned
3266.ci_alhlfwdcp:
3267	lduha	[%i1]ASI_USER, %o2
3268	stuh	%o2, [%i0]
3269	add	%i1, 0x2, %i1
3270	subcc	%i3, 0x2, %i3
3271	bgu,pt	%ncc, .ci_alhlfwdcp
3272	add	%i0, 0x2, %i0
3273
3274	ba	copyin_blalign
3275	nop
3276
3277	! dst & src 8B aligned
3278.ci_alewdcp:
3279	ldxa	[%i1]ASI_USER, %o2
3280	stx	%o2, [%i0]
3281	add	%i1, 0x8, %i1
3282	subcc	%i3, 0x8, %i3
3283	bgu,pt	%ncc, .ci_alewdcp
3284	add	%i0, 0x8, %i0
3285
3286copyin_blalign:
3287	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
3288	sub	%i2, %i3, %i2		! Residue bytes in %i2
3289
3290#if !defined(NIAGARA_IMPL)
3291	mov	ASI_USER, %asi
3292
3293	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
3294	prefetch [%l0+0x0], #one_read
3295	andcc	%i1, 0x3f, %g0		! is src 64B aligned
3296	bz,pn	%ncc, .ci_blkcpy
3297	nop
3298
3299	! handle misaligned source cases
3300	alignaddr %i1, %g0, %g0		! generate %gsr
3301
3302	srl	%i1, 0x3, %l1		! src add bits 3, 4, 5 are now least
3303					! significant in %l1
3304	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
3305	add	%i1, %i3, %i1
3306
3307	! switch statement to get to right 8 byte block within
3308	! 64 byte block
3309	cmp	 %l2, 0x4
3310	bgeu,a	 ci_hlf
3311	cmp	 %l2, 0x6
3312	cmp	 %l2, 0x2
3313	bgeu,a	 ci_sqtr
3314	nop
3315	cmp	 %l2, 0x1
3316	be,a	 ci_off15
3317	nop
3318	ba	 ci_off7
3319	nop
3320ci_sqtr:
3321	be,a	 ci_off23
3322	nop
3323	ba,a	 ci_off31
3324	nop
3325
3326ci_hlf:
3327	bgeu,a	 ci_fqtr
3328	nop
3329	cmp	 %l2, 0x5
3330	be,a	 ci_off47
3331	nop
3332	ba	 ci_off39
3333	nop
3334ci_fqtr:
3335	be,a	 ci_off55
3336	nop
3337
3338	ldda	[%l0+0x38]%asi, %d14
3339	prefetch [%l0+0x40], #one_read
3340	prefetch [%l0+0x80], #one_read
33417:
3342	add	%l0, 0x40, %l0
3343	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3344
3345	ldda	[%l0]ASI_BLK_AIUS, %d16
3346	ALIGN_OFF_56_63
3347	fmovd	%d30, %d14
3348
3349	stda	%d48, [%i0]ASI_BLK_P
3350	subcc	%i3, 0x40, %i3
3351	add	%i0, 0x40, %i0
3352	bgu,pt	%ncc, 7b
3353	prefetch [%l0+0x80], #one_read
3354	ba	.ci_blkdone
3355	membar	#Sync
3356
3357ci_off7:
3358	ldda	[%l0]ASI_BLK_AIUS, %d0
3359	prefetch [%l0+0x40], #one_read
3360	prefetch [%l0+0x80], #one_read
33610:
3362	add	%l0, 0x40, %l0
3363	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3364
3365	ldda	[%l0]ASI_BLK_AIUS, %d16
3366	ALIGN_OFF_1_7
3367	fmovd	%d16, %d0
3368	fmovd	%d18, %d2
3369	fmovd	%d20, %d4
3370	fmovd	%d22, %d6
3371	fmovd	%d24, %d8
3372	fmovd	%d26, %d10
3373	fmovd	%d28, %d12
3374	fmovd	%d30, %d14
3375
3376	stda	%d48, [%i0]ASI_BLK_P
3377	subcc	%i3, 0x40, %i3
3378	add	%i0, 0x40, %i0
3379	bgu,pt	%ncc, 0b
3380	prefetch [%l0+0x80], #one_read
3381	ba	.ci_blkdone
3382	membar	#Sync
3383
3384ci_off15:
3385	ldda	[%l0+0x8]%asi, %d2
3386	ldda	[%l0+0x10]%asi, %d4
3387	ldda	[%l0+0x18]%asi, %d6
3388	ldda	[%l0+0x20]%asi, %d8
3389	ldda	[%l0+0x28]%asi, %d10
3390	ldda	[%l0+0x30]%asi, %d12
3391	ldda	[%l0+0x38]%asi, %d14
3392	prefetch [%l0+0x40], #one_read
3393	prefetch [%l0+0x80], #one_read
33941:
3395	add	%l0, 0x40, %l0
3396	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3397
3398	ldda	[%l0]ASI_BLK_AIUS, %d16
3399	ALIGN_OFF_8_15
3400	fmovd	%d18, %d2
3401	fmovd	%d20, %d4
3402	fmovd	%d22, %d6
3403	fmovd	%d24, %d8
3404	fmovd	%d26, %d10
3405	fmovd	%d28, %d12
3406	fmovd	%d30, %d14
3407
3408	stda	%d48, [%i0]ASI_BLK_P
3409	subcc	%i3, 0x40, %i3
3410	add	%i0, 0x40, %i0
3411	bgu,pt	%ncc, 1b
3412	prefetch [%l0+0x80], #one_read
3413	ba	.ci_blkdone
3414	membar	#Sync
3415
3416ci_off23:
3417	ldda	[%l0+0x10]%asi, %d4
3418	ldda	[%l0+0x18]%asi, %d6
3419	ldda	[%l0+0x20]%asi, %d8
3420	ldda	[%l0+0x28]%asi, %d10
3421	ldda	[%l0+0x30]%asi, %d12
3422	ldda	[%l0+0x38]%asi, %d14
3423	prefetch [%l0+0x40], #one_read
3424	prefetch [%l0+0x80], #one_read
34252:
3426	add	%l0, 0x40, %l0
3427	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3428
3429	ldda	[%l0]ASI_BLK_AIUS, %d16
3430	ALIGN_OFF_16_23
3431	fmovd	%d20, %d4
3432	fmovd	%d22, %d6
3433	fmovd	%d24, %d8
3434	fmovd	%d26, %d10
3435	fmovd	%d28, %d12
3436	fmovd	%d30, %d14
3437
3438	stda	%d48, [%i0]ASI_BLK_P
3439	subcc	%i3, 0x40, %i3
3440	add	%i0, 0x40, %i0
3441	bgu,pt	%ncc, 2b
3442	prefetch [%l0+0x80], #one_read
3443	ba	.ci_blkdone
3444	membar	#Sync
3445
3446ci_off31:
3447	ldda	[%l0+0x18]%asi, %d6
3448	ldda	[%l0+0x20]%asi, %d8
3449	ldda	[%l0+0x28]%asi, %d10
3450	ldda	[%l0+0x30]%asi, %d12
3451	ldda	[%l0+0x38]%asi, %d14
3452	prefetch [%l0+0x40], #one_read
3453	prefetch [%l0+0x80], #one_read
34543:
3455	add	%l0, 0x40, %l0
3456	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3457
3458	ldda	[%l0]ASI_BLK_AIUS, %d16
3459	ALIGN_OFF_24_31
3460	fmovd	%d22, %d6
3461	fmovd	%d24, %d8
3462	fmovd	%d26, %d10
3463	fmovd	%d28, %d12
3464	fmovd	%d30, %d14
3465
3466	stda	%d48, [%i0]ASI_BLK_P
3467	subcc	%i3, 0x40, %i3
3468	add	%i0, 0x40, %i0
3469	bgu,pt	%ncc, 3b
3470	prefetch [%l0+0x80], #one_read
3471	ba	.ci_blkdone
3472	membar	#Sync
3473
3474ci_off39:
3475	ldda	[%l0+0x20]%asi, %d8
3476	ldda	[%l0+0x28]%asi, %d10
3477	ldda	[%l0+0x30]%asi, %d12
3478	ldda	[%l0+0x38]%asi, %d14
3479	prefetch [%l0+0x40], #one_read
3480	prefetch [%l0+0x80], #one_read
34814:
3482	add	%l0, 0x40, %l0
3483	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3484
3485	ldda	[%l0]ASI_BLK_AIUS, %d16
3486	ALIGN_OFF_32_39
3487	fmovd	%d24, %d8
3488	fmovd	%d26, %d10
3489	fmovd	%d28, %d12
3490	fmovd	%d30, %d14
3491
3492	stda	%d48, [%i0]ASI_BLK_P
3493	subcc	%i3, 0x40, %i3
3494	add	%i0, 0x40, %i0
3495	bgu,pt	%ncc, 4b
3496	prefetch [%l0+0x80], #one_read
3497	ba	.ci_blkdone
3498	membar	#Sync
3499
3500ci_off47:
3501	ldda	[%l0+0x28]%asi, %d10
3502	ldda	[%l0+0x30]%asi, %d12
3503	ldda	[%l0+0x38]%asi, %d14
3504	prefetch [%l0+0x40], #one_read
3505	prefetch [%l0+0x80], #one_read
35065:
3507	add	%l0, 0x40, %l0
3508	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3509
3510	ldda	[%l0]ASI_BLK_AIUS, %d16
3511	ALIGN_OFF_40_47
3512	fmovd	%d26, %d10
3513	fmovd	%d28, %d12
3514	fmovd	%d30, %d14
3515
3516	stda	%d48, [%i0]ASI_BLK_P
3517	subcc	%i3, 0x40, %i3
3518	add	%i0, 0x40, %i0
3519	bgu,pt	%ncc, 5b
3520	prefetch [%l0+0x80], #one_read
3521	ba	.ci_blkdone
3522	membar	#Sync
3523
3524ci_off55:
3525	ldda	[%l0+0x30]%asi, %d12
3526	ldda	[%l0+0x38]%asi, %d14
3527	prefetch [%l0+0x40], #one_read
3528	prefetch [%l0+0x80], #one_read
35296:
3530	add	%l0, 0x40, %l0
3531	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3532
3533	ldda	[%l0]ASI_BLK_AIUS, %d16
3534	ALIGN_OFF_48_55
3535	fmovd	%d28, %d12
3536	fmovd	%d30, %d14
3537
3538	stda	%d48, [%i0]ASI_BLK_P
3539	subcc	%i3, 0x40, %i3
3540	add	%i0, 0x40, %i0
3541	bgu,pt	%ncc, 6b
3542	prefetch [%l0+0x80], #one_read
3543	ba	.ci_blkdone
3544	membar	#Sync
3545
3546.ci_blkcpy:
3547	prefetch [%i1+0x40], #one_read
3548	prefetch [%i1+0x80], #one_read
35498:
3550	stxa	%g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line
3551	ldda	[%i1]ASI_BLK_AIUS, %d0
3552	stda	%d0, [%i0]ASI_BLK_P
3553
3554	add	%i1, 0x40, %i1
3555	subcc	%i3, 0x40, %i3
3556	add	%i0, 0x40, %i0
3557	bgu,pt	%ncc, 8b
3558	prefetch [%i1+0x80], #one_read
3559	membar	#Sync
3560
3561.ci_blkdone:
3562#else	/* NIAGARA_IMPL */
3563	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
3564
3565	andcc	%i1, 0xf, %o2		! is src quadword aligned
3566	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
3567	nop
3568	cmp	%o2, 0x8
3569	bg	.ci_upper_double
3570	nop
3571	bl	.ci_lower_double
3572	nop
3573
3574	! Falls through when source offset is equal to 8 i.e.
3575	! source is double word aligned.
3576	! In this case no shift/merge of data is required
3577
3578	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3579	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3580	prefetch [%l0+0x0], #one_read
3581	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3582.ci_loop0:
3583	add	%i1, 0x10, %i1
3584	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3585
3586	prefetch [%l0+0x40], #one_read
3587
3588	stxa	%l3, [%i0+0x0]%asi
3589	stxa	%l4, [%i0+0x8]%asi
3590
3591	add	%i1, 0x10, %i1
3592	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3593
3594	stxa	%l5, [%i0+0x10]%asi
3595	stxa	%l2, [%i0+0x18]%asi
3596
3597	add	%i1, 0x10, %i1
3598	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3599
3600	stxa	%l3, [%i0+0x20]%asi
3601	stxa	%l4, [%i0+0x28]%asi
3602
3603	add	%i1, 0x10, %i1
3604	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3605
3606	stxa	%l5, [%i0+0x30]%asi
3607	stxa	%l2, [%i0+0x38]%asi
3608
3609	add	%l0, 0x40, %l0
3610	subcc	%i3, 0x40, %i3
3611	bgu,pt	%xcc, .ci_loop0
3612	add	%i0, 0x40, %i0
3613	ba	.ci_blkdone
3614	add	%i1, %o2, %i1		! increment the source by src offset
3615					! the src offset was stored in %o2
3616
3617.ci_lower_double:
3618
3619	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3620	sll	%o2, 3, %o0		! %o0 left shift
3621	mov	0x40, %o1
3622	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
3623	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3624	prefetch [%l0+0x0], #one_read
3625	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
3626							! and %l3 has complete
3627							! data
3628.ci_loop1:
3629	add	%i1, 0x10, %i1
3630	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
3631							! for this read.
3632	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
3633							! into %l2 and %l3
3634
3635	prefetch [%l0+0x40], #one_read
3636
3637	stxa	%l2, [%i0+0x0]%asi
3638	stxa	%l3, [%i0+0x8]%asi
3639
3640	add	%i1, 0x10, %i1
3641	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3642	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
3643							! %l4 from previous read
3644							! into %l4 and %l5
3645	stxa	%l4, [%i0+0x10]%asi
3646	stxa	%l5, [%i0+0x18]%asi
3647
3648	! Repeat the same for next 32 bytes.
3649
3650	add	%i1, 0x10, %i1
3651	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3652	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
3653
3654	stxa	%l2, [%i0+0x20]%asi
3655	stxa	%l3, [%i0+0x28]%asi
3656
3657	add	%i1, 0x10, %i1
3658	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3659	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
3660
3661	stxa	%l4, [%i0+0x30]%asi
3662	stxa	%l5, [%i0+0x38]%asi
3663
3664	add	%l0, 0x40, %l0
3665	subcc	%i3, 0x40, %i3
3666	bgu,pt	%xcc, .ci_loop1
3667	add	%i0, 0x40, %i0
3668	ba	.ci_blkdone
3669	add	%i1, %o2, %i1		! increment the source by src offset
3670					! the src offset was stored in %o2
3671
3672.ci_upper_double:
3673
3674	sub	%i1, %o2, %i1		! align the src at 16 bytes.
3675	sub	%o2, 0x8, %o0
3676	sll	%o0, 3, %o0		! %o0 left shift
3677	mov	0x40, %o1
3678	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
3679	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
3680	prefetch [%l0+0x0], #one_read
3681	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
3682							! for this read and
3683							! no data in %l2
3684.ci_loop2:
3685	add	%i1, 0x10, %i1
3686	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
3687							! and %l5 has partial
3688	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
3689							! into %l3 and %l4
3690	prefetch [%l0+0x40], #one_read
3691
3692	stxa	%l3, [%i0+0x0]%asi
3693	stxa	%l4, [%i0+0x8]%asi
3694
3695	add	%i1, 0x10, %i1
3696	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3697	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
3698							! %l5 from previous read
3699							! into %l5 and %l2
3700
3701	stxa	%l5, [%i0+0x10]%asi
3702	stxa	%l2, [%i0+0x18]%asi
3703
3704	! Repeat the same for next 32 bytes.
3705
3706	add	%i1, 0x10, %i1
3707	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3708	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
3709
3710	stxa	%l3, [%i0+0x20]%asi
3711	stxa	%l4, [%i0+0x28]%asi
3712
3713	add	%i1, 0x10, %i1
3714	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3715	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
3716
3717	stxa	%l5, [%i0+0x30]%asi
3718	stxa	%l2, [%i0+0x38]%asi
3719
3720	add	%l0, 0x40, %l0
3721	subcc	%i3, 0x40, %i3
3722	bgu,pt	%xcc, .ci_loop2
3723	add	%i0, 0x40, %i0
3724	ba	.ci_blkdone
3725	add	%i1, %o2, %i1		! increment the source by src offset
3726					! the src offset was stored in %o2
3727
3728
3729	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
3730.ci_blkcpy:
3731
3732	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
3733	prefetch [%o0+0x0], #one_read
37341:
3735	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
3736	add	%i1, 0x10, %i1
3737	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
3738	add	%i1, 0x10, %i1
3739
3740	prefetch [%o0+0x40], #one_read
3741
3742	stxa	%l0, [%i0+0x0]%asi
3743
3744	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
3745	add	%i1, 0x10, %i1
3746	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
3747	add	%i1, 0x10, %i1
3748
3749	stxa	%l1, [%i0+0x8]%asi
3750	stxa	%l2, [%i0+0x10]%asi
3751	stxa	%l3, [%i0+0x18]%asi
3752	stxa	%l4, [%i0+0x20]%asi
3753	stxa	%l5, [%i0+0x28]%asi
3754	stxa	%l6, [%i0+0x30]%asi
3755	stxa	%l7, [%i0+0x38]%asi
3756
3757	add	%o0, 0x40, %o0
3758	subcc	%i3, 0x40, %i3
3759	bgu,pt	%xcc, 1b
3760	add	%i0, 0x40, %i0
3761
3762.ci_blkdone:
3763	membar	#Sync
3764#endif	/* NIAGARA_IMPL */
3765
3766	brz,pt	%i2, .copyin_exit
3767	nop
3768
3769	! Handle trailing bytes
3770	cmp	%i2, 0x8
3771	blu,pt	%ncc, .ci_residue
3772	nop
3773
3774	! Can we do some 8B ops
3775	or	%i1, %i0, %o2
3776	andcc	%o2, 0x7, %g0
3777	bnz	%ncc, .ci_last4
3778	nop
3779
3780	! Do 8byte ops as long as possible
3781.ci_last8:
3782	ldxa	[%i1]ASI_USER, %o2
3783	stx	%o2, [%i0]
3784	add	%i1, 0x8, %i1
3785	sub	%i2, 0x8, %i2
3786	cmp	%i2, 0x8
3787	bgu,pt	%ncc, .ci_last8
3788	add	%i0, 0x8, %i0
3789
3790	brz,pt	%i2, .copyin_exit
3791	nop
3792
3793	ba	.ci_residue
3794	nop
3795
3796.ci_last4:
3797	! Can we do 4B ops
3798	andcc	%o2, 0x3, %g0
3799	bnz	%ncc, .ci_last2
3800	nop
38011:
3802	lda	[%i1]ASI_USER, %o2
3803	st	%o2, [%i0]
3804	add	%i1, 0x4, %i1
3805	sub	%i2, 0x4, %i2
3806	cmp	%i2, 0x4
3807	bgu,pt	%ncc, 1b
3808	add	%i0, 0x4, %i0
3809
3810	brz,pt	%i2, .copyin_exit
3811	nop
3812
3813	ba	.ci_residue
3814	nop
3815
3816.ci_last2:
3817	! Can we do 2B ops
3818	andcc	%o2, 0x1, %g0
3819	bnz	%ncc, .ci_residue
3820	nop
3821
38221:
3823	lduha	[%i1]ASI_USER, %o2
3824	stuh	%o2, [%i0]
3825	add	%i1, 0x2, %i1
3826	sub	%i2, 0x2, %i2
3827	cmp	%i2, 0x2
3828	bgu,pt	%ncc, 1b
3829	add	%i0, 0x2, %i0
3830
3831	brz,pt	%i2, .copyin_exit
3832	nop
3833
3834	! Copy the residue as byte copy
3835.ci_residue:
3836	lduba	[%i1]ASI_USER, %i4
3837	stb	%i4, [%i0]
3838	inc	%i1
3839	deccc	%i2
3840	bgu,pt	%xcc, .ci_residue
3841	inc	%i0
3842
3843.copyin_exit:
3844#if !defined(NIAGARA_IMPL)
3845	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
3846	wr	%o2, 0, %gsr		! restore gsr
3847
3848	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3849	btst	FPRS_FEF, %o3
3850	bz	%icc, 4f
3851	  nop
3852
3853	! restore fpregs from stack
3854	BLD_FP_FROMSTACK(%o2)
3855
3856	ba,pt	%ncc, 2f
3857	  wr	%o3, 0, %fprs		! restore fprs
3858
38594:
3860	FZERO				! zero all of the fpregs
3861	wr	%o3, 0, %fprs		! restore fprs
3862
38632:
3864	membar	#Sync			! sync error barrier
3865	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3866#else	/* NIAGARA_IMPL */
3867	membar	#Sync
3868#endif	/* NIAGARA_IMPL */
3869	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3870	ret
3871	restore	%g0, 0, %o0
3872.copyin_err:
3873	ldn	[THREAD_REG + T_COPYOPS], %o4
3874	brz	%o4, 2f
3875	nop
3876	ldn	[%o4 + CP_COPYIN], %g2
3877	jmp	%g2
3878	nop
38792:
3880	retl
3881	mov	-1, %o0
3882	SET_SIZE(copyin)
3883
3884#endif	/* lint */
3885
3886#ifdef	lint
3887
3888/*ARGSUSED*/
3889int
3890xcopyin(const void *uaddr, void *kaddr, size_t count)
3891{ return (0); }
3892
3893#else	/* lint */
3894
/*
 * xcopyin(uaddr, kaddr, count)
 *
 * Thin entry point: builds the address of .xcopyin_err in REAL_LOFAULT
 * and continues in the shared .do_copyin code (defined earlier in this
 * file), which performs the actual copy.
 *
 * .xcopyin_err is the fault path for this entry point.  If the current
 * thread has a copyops vector (T_COPYOPS is non-NULL) control is
 * transferred to that vector's CP_XCOPYIN routine; otherwise the value
 * found in %g1 is returned in %o0.
 * NOTE(review): %g1 presumably carries the errno supplied by the trap
 * code (the .zeroerr comment in this file says so for kzero) - confirm.
 */
3895	ENTRY(xcopyin)
3896	sethi	%hi(.xcopyin_err), REAL_LOFAULT
3897	b	.do_copyin
3898	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT	/* delay slot: low bits of handler */
3899.xcopyin_err:
3900	ldn	[THREAD_REG + T_COPYOPS], %o4
3901	brz	%o4, 2f		/* no copyops vector: just return the error */
3902	nop
3903	ldn	[%o4 + CP_XCOPYIN], %g2
3904	jmp	%g2		/* hand off to the copyops routine; it returns to our caller */
3905	nop
39062:
3907	retl
3908	mov	%g1, %o0	/* delay slot: return value = %g1 */
3909	SET_SIZE(xcopyin)
3910
3911#endif	/* lint */
3912
3913#ifdef	lint
3914
3915/*ARGSUSED*/
3916int
3917xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3918{ return (0); }
3919
3920#else	/* lint */
3921
/*
 * xcopyin_little(uaddr, kaddr, count)
 *	%o0 = uaddr, %o1 = kaddr, %o2 = count
 *
 * Byte-at-a-time copy that reads the source through ASI_AIUSL starting
 * at its last byte and walking backwards, while the destination is
 * written forwards; the destination therefore receives the source bytes
 * in reverse order.
 *
 * t_lofault is pointed at .little_err for the duration of the copy and
 * the previous value (held in %o5) is restored on both exit paths.
 *
 * Returns 0 on success.  On a fault, .little_err returns the value found
 * in %g1.
 *
 * Register use inside the loop:
 *	%o3	index, runs from -count up to 0
 *	%o0	source base, biased so that [%o0+%o3] is the next source byte
 *	%o1	kaddr + count, so that [%o1+%o3] is the next destination byte
 *	%o4	byte in flight
 */
3922	ENTRY(xcopyin_little)
3923	sethi	%hi(.little_err), %o4
3924	ldn	[THREAD_REG + T_LOFAULT], %o5	/* %o5 = previous handler, restored on exit */
3925	or	%o4, %lo(.little_err), %o4
3926	membar	#Sync				! sync error barrier
3927	stn	%o4, [THREAD_REG + T_LOFAULT]
3928
3929	subcc	%g0, %o2, %o3		/* %o3 = -count; Z is set when count == 0 */
3930	add	%o0, %o2, %o0		/* %o0 = uaddr + count */
3931	bz,pn	%ncc, 2f		! check for zero bytes
3932	sub	%o2, 1, %o4		/* delay slot: %o4 = count - 1 */
3933	add	%o0, %o4, %o0		! start w/last byte
3934	add	%o1, %o2, %o1		/* %o1 = kaddr + count */
3935	lduba	[%o0+%o3]ASI_AIUSL, %o4	/* first load: uaddr[count - 1] */
3936
39371:	stb	%o4, [%o1+%o3]
3938	inccc	%o3			/* carry is set when %o3 wraps from -1 to 0 */
3939	sub	%o0, 2, %o0		! get next byte
3940	bcc,a,pt %ncc, 1b		/* loop while no carry; annulled when falling out */
3941	  lduba	[%o0+%o3]ASI_AIUSL, %o4	/* net source step is -1 (%o0 -2, %o3 +1) */
3942
39432:	membar	#Sync				! sync error barrier
3944	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3945	retl
3946	mov	%g0, %o0		! return (0)
3947
/* Fault path: put the caller's handler back and return the value in %g1. */
3948.little_err:
3949	membar	#Sync				! sync error barrier
3950	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3951	retl
3952	mov	%g1, %o0
3953	SET_SIZE(xcopyin_little)
3954
3955#endif	/* lint */
3956
3957
3958/*
3959 * Copy a block of storage - must not overlap (from + len <= to).
3960 * No fault handler installed (to be called under on_fault())
3961 */
3962#if defined(lint)
3963
3964/* ARGSUSED */
3965void
3966copyin_noerr(const void *ufrom, void *kto, size_t count)
3967{}
3968
3969#else	/* lint */
3970
/*
 * copyin_noerr(ufrom, kto, count)
 *
 * Thin entry point: builds the address of .copyio_noerr in REAL_LOFAULT
 * and continues in the shared .do_copyin code (defined earlier in this
 * file), which performs the copy.
 *
 * .copyio_noerr is the fault path; it is also used by copyout_noerr.
 * It does not return an error itself but jumps to the address held in
 * SAVED_LOFAULT.
 * NOTE(review): SAVED_LOFAULT is presumably the t_lofault value that was
 * in force when the routine was entered (i.e. the caller's on_fault()
 * handler), saved by .do_copyin - confirm against that code.
 */
3971	ENTRY(copyin_noerr)
3972	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3973	b	.do_copyin
3974	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	/* delay slot: low bits of handler */
3975.copyio_noerr:
3976	jmp	SAVED_LOFAULT
3977	  nop
3978	SET_SIZE(copyin_noerr)
3979
3980#endif /* lint */
3981
3982/*
3983 * Copy a block of storage - must not overlap (from + len <= to).
3984 * No fault handler installed (to be called under on_fault())
3985 */
3986
3987#if defined(lint)
3988
3989/* ARGSUSED */
3990void
3991copyout_noerr(const void *kfrom, void *uto, size_t count)
3992{}
3993
3994#else	/* lint */
3995
/*
 * copyout_noerr(kfrom, uto, count)
 *
 * Thin entry point: builds the address of .copyio_noerr (the fault path
 * defined under copyin_noerr) in REAL_LOFAULT and continues in the
 * shared .do_copyout code (defined earlier in this file), which performs
 * the copy.
 */
3996	ENTRY(copyout_noerr)
3997	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3998	b	.do_copyout
3999	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT	/* delay slot: low bits of handler */
4000	SET_SIZE(copyout_noerr)
4001
4002#endif /* lint */
4003
4004#if defined(lint)
4005
/* C-visible versions of the tunables defined in assembly below (same initial values). */
4006int use_hw_bcopy = 1;
4007int use_hw_bzero = 1;
4008uint_t hw_copy_limit_1 = 0x100;
4009uint_t hw_copy_limit_2 = 0x200;
4010uint_t hw_copy_limit_4 = 0x400;
4011uint_t hw_copy_limit_8 = 0x400;
4012
4013#else /* !lint */
4014
/*
 * Global 32-bit tunables.
 *
 * use_hw_bzero is read by bzero/kzero/uzero: when it is zero the
 * block-initializing store path is skipped.  hwblkclr expects its caller
 * to have checked it.
 *
 * NOTE(review): use_hw_bcopy and the hw_copy_limit_N words are not
 * referenced in this part of the file; the names suggest an enable flag
 * and per-alignment size thresholds for the copy routines - confirm
 * against their consumers.
 */
4015	.align	4
4016	DGDEF(use_hw_bcopy)
4017	.word	1
4018	DGDEF(use_hw_bzero)
4019	.word	1
4020	DGDEF(hw_copy_limit_1)
4021	.word	0x100
4022	DGDEF(hw_copy_limit_2)
4023	.word	0x200
4024	DGDEF(hw_copy_limit_4)
4025	.word	0x400
4026	DGDEF(hw_copy_limit_8)
4027	.word	0x400
4028
/* Return to the text section, 64-byte aligned, for the code that follows. */
4029	.align	64
4030	.section ".text"
4031#endif /* !lint */
4032
4033/*
4034 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
4035 * at least 256 bytes in length using Niagara's block stores/quad store.
4036 * If the criteria for using this routine are not met then it calls bzero
4037 * and returns 1.  Otherwise 0 is returned indicating success.
4038 * Caller is responsible for ensuring use_hw_bzero is true and that
4039 * kpreempt_disable() has been called.
4040 */
4041#ifdef lint
4042/*ARGSUSED*/
4043int
4044hwblkclr(void *addr, size_t len)
4045{
4046	return(0);
4047}
4048#else /* lint */
4049	! %i0 - start address
4050	! %i1 - length of region (multiple of 64)
4051
/*
 * hwblkclr(addr, len)
 *
 * Fast path is taken only when all three hold:
 *	- addr is 64-byte aligned
 *	- len is at least 0x100 (256)
 *	- len is a multiple of 64
 * In that case the region is zeroed with stxa through
 * ASI_BLK_INIT_ST_QUAD_LDD_P, 256 bytes per iteration and then 64 bytes
 * per iteration, and 0 is returned.
 *
 * Otherwise bzero(addr, len) is called and 1 is returned.
 *
 * The routine runs in its own register window (save/restore), so the
 * arguments are seen in %i0/%i1 and the result is produced by the
 * restore instruction writing the caller's %o0.
 */
4052	ENTRY(hwblkclr)
4053	save	%sp, -SA(MINFRAME), %sp
4054
4055	! Must be block-aligned
4056	andcc	%i0, 0x3f, %g0
4057	bnz,pn	%ncc, 1f
4058	  nop
4059
4060	! ... and must be 256 bytes or more
4061	cmp	%i1, 0x100
4062	blu,pn	%ncc, 1f
4063	  nop
4064
4065	! ... and length must be a multiple of 64
4066	andcc	%i1, 0x3f, %g0
4067	bz,pn	%ncc, .pz_doblock
4068	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi	/* delay slot: runs on both paths; bzero sets %asi itself */
4069
40701:	! punt, call bzero but notify the caller that bzero was used
4071	mov	%i0, %o0
4072	call	bzero
4073	  mov	%i1, %o1
4074	ret
4075	restore	%g0, 1, %o0	! return (1) - did not use block operations
4076
4077	! Already verified that there are at least 256 bytes to set
4078.pz_doblock:
	/*
	 * One iteration clears 256 bytes (four 64-byte lines).  The first
	 * doubleword of each of the four lines is stored before the rest
	 * of the lines are filled in.
	 * NOTE(review): presumably this touches all four lines early so the
	 * block-init ASI can start allocating them - confirm against the
	 * Niagara documentation.
	 */
4079	stxa	%g0, [%i0+0x0]%asi
4080	stxa	%g0, [%i0+0x40]%asi
4081	stxa	%g0, [%i0+0x80]%asi
4082	stxa	%g0, [%i0+0xc0]%asi
4083
4084	stxa	%g0, [%i0+0x8]%asi
4085	stxa	%g0, [%i0+0x10]%asi
4086	stxa	%g0, [%i0+0x18]%asi
4087	stxa	%g0, [%i0+0x20]%asi
4088	stxa	%g0, [%i0+0x28]%asi
4089	stxa	%g0, [%i0+0x30]%asi
4090	stxa	%g0, [%i0+0x38]%asi
4091
4092	stxa	%g0, [%i0+0x48]%asi
4093	stxa	%g0, [%i0+0x50]%asi
4094	stxa	%g0, [%i0+0x58]%asi
4095	stxa	%g0, [%i0+0x60]%asi
4096	stxa	%g0, [%i0+0x68]%asi
4097	stxa	%g0, [%i0+0x70]%asi
4098	stxa	%g0, [%i0+0x78]%asi
4099
4100	stxa	%g0, [%i0+0x88]%asi
4101	stxa	%g0, [%i0+0x90]%asi
4102	stxa	%g0, [%i0+0x98]%asi
4103	stxa	%g0, [%i0+0xa0]%asi
4104	stxa	%g0, [%i0+0xa8]%asi
4105	stxa	%g0, [%i0+0xb0]%asi
4106	stxa	%g0, [%i0+0xb8]%asi
4107
4108	stxa	%g0, [%i0+0xc8]%asi
4109	stxa	%g0, [%i0+0xd0]%asi
4110	stxa	%g0, [%i0+0xd8]%asi
4111	stxa	%g0, [%i0+0xe0]%asi
4112	stxa	%g0, [%i0+0xe8]%asi
4113	stxa	%g0, [%i0+0xf0]%asi
4114	stxa	%g0, [%i0+0xf8]%asi
4115
	/*
	 * Repeat only while MORE than 0x100 bytes remain; a remainder of
	 * exactly 0x100 (or less) is finished by the 64-byte loop below.
	 */
4116	sub	%i1, 0x100, %i1
4117	cmp	%i1, 0x100
4118	bgu,pt	%ncc, .pz_doblock
4119	add	%i0, 0x100, %i0		/* delay slot: advance the address */
4120
41212:
4122	! Check if more than 64 bytes to set
	/* %i1 is a multiple of 64 here, so the test below only skips the loop when it is 0. */
4123	cmp	%i1,0x40
4124	blu	%ncc, .pz_finish
4125	nop
4126
41273:
4128	stxa	%g0, [%i0+0x0]%asi
4129	stxa	%g0, [%i0+0x8]%asi
4130	stxa	%g0, [%i0+0x10]%asi
4131	stxa	%g0, [%i0+0x18]%asi
4132	stxa	%g0, [%i0+0x20]%asi
4133	stxa	%g0, [%i0+0x28]%asi
4134	stxa	%g0, [%i0+0x30]%asi
4135	stxa	%g0, [%i0+0x38]%asi
4136
4137	subcc	%i1, 0x40, %i1
4138	bgu,pt	%ncc, 3b		/* loop until the remaining length reaches 0 */
4139	add	%i0, 0x40, %i0
4140
4141.pz_finish:
4142	membar	#Sync
4143	ret
4144	restore	%g0, 0, %o0		! return (bzero or not)
4145	SET_SIZE(hwblkclr)
4146#endif	/* lint */
4147
4148#ifdef	lint
4149/* Copy 32 bytes of data from src to dst using physical addresses */
4150/*ARGSUSED*/
4151void
4152hw_pa_bcopy32(uint64_t src, uint64_t dst)
4153{}
4154#else	/*!lint */
4155
4156	/*
4157	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
4158	 * using physical addresses.
4159	 *
	 * Interrupts are disabled (PSTATE_IE cleared) for the duration of
	 * the copy and the original %pstate is written back on return.
	 * All four 8-byte loads through ASI_MEM are issued before the first
	 * store.  Uses %g1/%g2 and %o2-%o5 as scratch; %o0 and %o1 are
	 * advanced by 24.
	 */
4160	ENTRY_NP(hw_pa_bcopy32)
4161	rdpr    %pstate, %g1		/* %g1 = caller's %pstate, restored below */
4162	andn    %g1, PSTATE_IE, %g2
4163	wrpr    %g0, %g2, %pstate	/* run with interrupts disabled */
4164
4165	ldxa    [%o0]ASI_MEM, %o2
4166	add     %o0, 8, %o0
4167	ldxa    [%o0]ASI_MEM, %o3
4168	add     %o0, 8, %o0
4169	ldxa    [%o0]ASI_MEM, %o4
4170	add     %o0, 8, %o0
4171	ldxa    [%o0]ASI_MEM, %o5
4172	stxa    %o2, [%o1]ASI_MEM
4173	add     %o1, 8, %o1
4174	stxa    %o3, [%o1]ASI_MEM
4175	add     %o1, 8, %o1
4176	stxa    %o4, [%o1]ASI_MEM
4177	add     %o1, 8, %o1
4178	stxa    %o5, [%o1]ASI_MEM
4179
4180	membar	#Sync
4181	retl
4182	  wrpr    %g0, %g1, %pstate	/* delay slot: restore the saved %pstate */
4183	SET_SIZE(hw_pa_bcopy32)
4184#endif /* lint */
4185
4186/*
4187 * Zero a block of storage.
4188 *
4189 * uzero is used by the kernel to zero a block in user address space.
4190 */
4191
4192/*
4193 * Control flow of the bzero/kzero/uzero routine.
4194 *
4195 *	For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
4196 *
4197 *	For stores of fewer than 15 bytes, align the address on a 4-byte boundary.
4198 *	Then store as many 4-byte chunks as fit, followed by trailing bytes.
4199 *
4200 *	For sizes of 15 bytes or more, align the address on an 8-byte boundary.
4201 *	if (remaining count >= 128 && use_hw_bzero) {
4202 *		store as many 8-byte chunks as needed to block align the address
4203 *		store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
4204 *		store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
4205 *	}
4206 *	Store as many 8-byte chunks as fit, followed by trailing bytes.
4207 */
4208
4209#if defined(lint)
4210
4211/* ARGSUSED */
4212int
4213kzero(void *addr, size_t count)
4214{ return(0); }
4215
4216/* ARGSUSED */
4217void
4218uzero(void *addr, size_t count)
4219{}
4220
4221#else	/* lint */
4222
/*
 * uzero(addr, count)
 *
 * Selects ASI_USER in %asi so that the %asi-relative stores in the
 * shared .do_zero code (under bzero) go to the user address space.
 * %o5 receives the t_lofault value found on entry; .zeroerr is installed
 * as the handler only if that value was non-zero.
 */
4223	ENTRY(uzero)
4224	!
4225	! Set a new lo_fault handler only if we came in with one
4226	! already specified.
4227	!
4228	wr	%g0, ASI_USER, %asi
4229	ldn	[THREAD_REG + T_LOFAULT], %o5
4230	tst	%o5
4231	bz,pt	%ncc, .do_zero
4232	sethi	%hi(.zeroerr), %o2	/* delay slot: harmless when the branch is taken */
4233	or	%o2, %lo(.zeroerr), %o2
4234	membar	#Sync
4235	ba,pt	%ncc, .do_zero
4236	stn	%o2, [THREAD_REG + T_LOFAULT]	/* delay slot: install .zeroerr */
4237
/*
 * kzero(addr, count)
 *
 * Selects ASI_P in %asi and unconditionally installs .zeroerr as the
 * t_lofault handler.  The previous handler is kept in %o5 with
 * LOFAULT_SET ORed in, so %o5 is non-zero even when the previous handler
 * was NULL; the exit and error paths use that to know t_lofault must be
 * restored.  Continues in the shared .do_zero code (under bzero), whose
 * normal exit returns 0; on a fault .zeroerr below takes over.
 */
4238	ENTRY(kzero)
4239	!
4240	! Always set a lo_fault handler
4241	!
4242	wr	%g0, ASI_P, %asi
4243	ldn	[THREAD_REG + T_LOFAULT], %o5
4244	sethi	%hi(.zeroerr), %o2
4245	or	%o5, LOFAULT_SET, %o5
4246	or	%o2, %lo(.zeroerr), %o2
4247	membar	#Sync
4248	ba,pt	%ncc, .do_zero
4249	stn	%o2, [THREAD_REG + T_LOFAULT]	/* delay slot: install .zeroerr */
4250
4251/*
4252 * We got here because of a fault during kzero or if
4253 * uzero or bzero was called with t_lofault non-zero.
4254 * Otherwise we've already run screaming from the room.
4255 * Errno value is in %g1. Note that we're here iff
4256 * we did set t_lofault.
4257 */
4258.zeroerr:
4259	!
4260	! Undo asi register setting. Just set it to be the
4261        ! kernel default without checking.
4262	!
4263	wr	%g0, ASI_P, %asi
4264
4265	!
4266	! We did set t_lofault. It may well have been zero coming in.
4267	!
42681:
4269	tst	%o5
4270	membar #Sync
4271	bne,pn	%ncc, 3f
4272	andncc	%o5, LOFAULT_SET, %o5	/* delay slot: strip the flag; Z set if no old handler */
42732:
4274	!
4275	! Old handler was zero. Just return the error.
4276	!
4277	retl				! return
4278	mov	%g1, %o0		! error code from %g1
42793:
4280	!
4281	! We're here because %o5 was non-zero. It was non-zero
4282	! because either LOFAULT_SET was present, a previous fault
4283	! handler was present or both. In all cases we need to reset
4284	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
4285	! before we either simply return the error or we invoke the
4286	! previously specified handler.
4287	!
	/* The condition codes tested here are those left by the andncc above. */
4288	be	%ncc, 2b
4289	stn	%o5, [THREAD_REG + T_LOFAULT]	/* delay slot: executed on both paths */
4290	jmp	%o5			! goto real handler
4291	  nop
4292	SET_SIZE(kzero)
4293	SET_SIZE(uzero)
4294
4295#endif	/* lint */
4296
4297/*
4298 * Zero a block of storage.
4299 */
4300
4301#if defined(lint)
4302
4303/* ARGSUSED */
4304void
4305bzero(void *addr, size_t count)
4306{}
4307
4308#else	/* lint */
4309
/*
 * bzero(addr, count)
 *	%o0 = addr, %o1 = count
 *
 * Selects ASI_P in %asi and falls into .do_zero, which is shared with
 * kzero and uzero (they branch to it with %asi and %o5 already set up).
 * %o5 receives the t_lofault value found on entry; .zeroerr (defined
 * under kzero) is installed as the handler only if that value was
 * non-zero.
 *
 * On the normal exit path %o0 is cleared, which is kzero's return
 * value of 0.
 *
 * Register use in .do_zero:
 *	%o0	current address
 *	%o1	bytes remaining
 *	%o2	scratch
 *	%o3	scratch: alignment counter / bytes left for 8-byte stores
 *	%o4	bytes to clear with 64-byte blocks; scratch for %asi
 *	%o5	saved t_lofault (kzero also sets LOFAULT_SET in it)
 */
4310	ENTRY(bzero)
4311	wr	%g0, ASI_P, %asi
4312
4313	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
4314	tst	%o5
4315	bz,pt	%ncc, .do_zero
4316	sethi	%hi(.zeroerr), %o2	/* delay slot: harmless when the branch is taken */
4317	or	%o2, %lo(.zeroerr), %o2
4318	membar	#Sync				! sync error barrier
4319	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
4320
4321.do_zero:
	/* Fewer than 7 bytes: byte loop only. */
4322	cmp	%o1, 7
4323	blu,pn	%ncc, .byteclr
4324	nop
4325
	/* Fewer than 15 bytes: word-align, then 4-byte stores. */
4326	cmp	%o1, 15
4327	blu,pn	%ncc, .wdalign
4328	nop
4329
4330	andcc	%o0, 7, %o3		! is add aligned on a 8 byte bound
4331	bz,pt	%ncc, .blkalign		! already double aligned
4332	sub	%o3, 8, %o3		! -(bytes till double aligned)
4333	add	%o1, %o3, %o1		! update o1 with new count
4334
	/* Clear single bytes while %o3 counts up from its negative value to 0. */
43351:
4336	stba	%g0, [%o0]%asi
4337	inccc	%o3
4338	bl,pt	%ncc, 1b
4339	inc	%o0
4340
4341	! Now address is double aligned
4342.blkalign:
4343	cmp	%o1, 0x80		! check if there are 128 bytes to set
4344	blu,pn	%ncc, .bzero_small
4345	mov	%o1, %o3		/* delay slot: .bzero_small takes its count in %o3 */
4346
4347	sethi	%hi(use_hw_bzero), %o2
4348	ld	[%o2 + %lo(use_hw_bzero)], %o2
4349	tst	%o2
4350	bz	%ncc, .bzero_small	/* block stores disabled by the tunable */
4351	mov	%o1, %o3
4352
	/*
	 * Switch %asi to the block-initializing variant that matches the
	 * address space in use: the _P form if we came in with ASI_P,
	 * otherwise the _AIUS form (set by the annulled delay slot, which
	 * only executes when the branch is taken).
	 */
4353	rd	%asi, %o3
4354	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
4355	cmp	%o3, ASI_P
4356	bne,a	%ncc, .algnblk
4357	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi
4358
4359.algnblk:
4360	andcc	%o0, 0x3f, %o3		! is block aligned?
4361	bz,pt	%ncc, .bzero_blk
4362	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
4363	add	%o1, %o3, %o1		! o1 is the remainder
4364
4365	! Clear -(%o3) bytes till block aligned
43661:
4367	stxa	%g0, [%o0]%asi
4368	addcc	%o3, 8, %o3
4369	bl,pt	%ncc, 1b
4370	add	%o0, 8, %o0
4371
4372.bzero_blk:
4373	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
4374	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes
4375
4376	cmp	%o4, 0x100		! 256 bytes or more
4377	blu,pn	%ncc, 3f
4378	nop
4379
	/*
	 * One iteration clears 256 bytes (four 64-byte lines).  The first
	 * doubleword of each of the four lines is stored before the rest
	 * of the lines are filled in.
	 * NOTE(review): presumably this touches all four lines early so the
	 * block-init ASI can start allocating them - confirm against the
	 * Niagara documentation.
	 */
43802:
4381	stxa	%g0, [%o0+0x0]%asi
4382	stxa	%g0, [%o0+0x40]%asi
4383	stxa	%g0, [%o0+0x80]%asi
4384	stxa	%g0, [%o0+0xc0]%asi
4385
4386	stxa	%g0, [%o0+0x8]%asi
4387	stxa	%g0, [%o0+0x10]%asi
4388	stxa	%g0, [%o0+0x18]%asi
4389	stxa	%g0, [%o0+0x20]%asi
4390	stxa	%g0, [%o0+0x28]%asi
4391	stxa	%g0, [%o0+0x30]%asi
4392	stxa	%g0, [%o0+0x38]%asi
4393
4394	stxa	%g0, [%o0+0x48]%asi
4395	stxa	%g0, [%o0+0x50]%asi
4396	stxa	%g0, [%o0+0x58]%asi
4397	stxa	%g0, [%o0+0x60]%asi
4398	stxa	%g0, [%o0+0x68]%asi
4399	stxa	%g0, [%o0+0x70]%asi
4400	stxa	%g0, [%o0+0x78]%asi
4401
4402	stxa	%g0, [%o0+0x88]%asi
4403	stxa	%g0, [%o0+0x90]%asi
4404	stxa	%g0, [%o0+0x98]%asi
4405	stxa	%g0, [%o0+0xa0]%asi
4406	stxa	%g0, [%o0+0xa8]%asi
4407	stxa	%g0, [%o0+0xb0]%asi
4408	stxa	%g0, [%o0+0xb8]%asi
4409
4410	stxa	%g0, [%o0+0xc8]%asi
4411	stxa	%g0, [%o0+0xd0]%asi
4412	stxa	%g0, [%o0+0xd8]%asi
4413	stxa	%g0, [%o0+0xe0]%asi
4414	stxa	%g0, [%o0+0xe8]%asi
4415	stxa	%g0, [%o0+0xf0]%asi
4416	stxa	%g0, [%o0+0xf8]%asi
4417
	/*
	 * Repeat only while MORE than 0x100 bytes of blocks remain; a
	 * remainder of exactly 0x100 (or less) is finished 64 bytes at a
	 * time below.
	 */
4418	sub	%o4, 0x100, %o4
4419	cmp	%o4, 0x100
4420	bgu,pt	%ncc, 2b
4421	add	%o0, 0x100, %o0
4422
44233:
4424	! ... check if 64 bytes to set
4425	cmp	%o4, 0x40
4426	blu	%ncc, .bzero_blk_done
4427	nop
4428
	/*
	 * 64-byte clear loop.  %o4 is always a multiple of 0x40 (it was
	 * formed with andn 0x3f and is only ever reduced by 0x100 or 0x40),
	 * so "still greater than zero" after the subcc implies at least one
	 * more full block.  The back edge therefore goes straight to the
	 * loop head (4b) rather than back through the size check at 3:.
	 */
44294:
4430	stxa	%g0, [%o0+0x0]%asi
4431	stxa	%g0, [%o0+0x8]%asi
4432	stxa	%g0, [%o0+0x10]%asi
4433	stxa	%g0, [%o0+0x18]%asi
4434	stxa	%g0, [%o0+0x20]%asi
4435	stxa	%g0, [%o0+0x28]%asi
4436	stxa	%g0, [%o0+0x30]%asi
4437	stxa	%g0, [%o0+0x38]%asi
4438
4439	subcc	%o4, 0x40, %o4
4440	bgu,pt	%ncc, 4b
4441	add	%o0, 0x40, %o0
4442
4443.bzero_blk_done:
4444	membar	#Sync
4445	!
4446	! Undo asi register setting.
4447	!
	/*
	 * Back to ASI_P if the _P block ASI was in use, otherwise to
	 * ASI_USER (annulled delay slot, executed only when the branch is
	 * taken).  Either way execution continues at .bzero_small.
	 */
4448	rd	%asi, %o4
4449	wr	%g0, ASI_P, %asi
4450	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
4451	bne,a	%ncc, .bzero_small
4452	wr	%g0, ASI_USER, %asi
4453
4454.bzero_small:
4455	! Set the remaining doubles
4456	subcc	%o3, 8, %o3		! Can we store any doubles?
4457	blu,pn	%ncc, .byteclr
4458	and	%o1, 7, %o1		! calc bytes left after doubles
4459
4460.dbclr:
4461	stxa	%g0, [%o0]%asi		! Clear the doubles
4462	subcc	%o3, 8, %o3
4463	bgeu,pt	%ncc, .dbclr
4464	add	%o0, 8, %o0
4465
4466	ba	.byteclr
4467	nop
4468
	/*
	 * Short case (7..14 bytes): clear single bytes until the address is
	 * word aligned.  The delay slot recomputes the word-sized count in
	 * %o3 on every pass, so it is correct when .wdclr is entered.
	 */
4469.wdalign:
4470	andcc	%o0, 3, %o3		! is add aligned on a word boundary
4471	bz,pn	%ncc, .wdclr
4472	andn	%o1, 3, %o3		! create word sized count in %o3
4473
4474	dec	%o1			! decrement count
4475	stba	%g0, [%o0]%asi		! clear a byte
4476	ba	.wdalign
4477	inc	%o0			! next byte
4478
4479.wdclr:
4480	sta	%g0, [%o0]%asi		! 4-byte clearing loop
4481	subcc	%o3, 4, %o3
4482	bnz,pt	%ncc, .wdclr
4483	inc	4, %o0
4484
4485	and	%o1, 3, %o1		! leftover count, if any
4486
4487.byteclr:
4488	! Set the leftover bytes
4489	brz	%o1, .bzero_exit
4490	nop
4491
44927:
4493	deccc	%o1			! byte clearing loop
4494	stba	%g0, [%o0]%asi
4495	bgu,pt	%ncc, 7b
4496	inc	%o0
4497
4498.bzero_exit:
4499	!
4500	! We're just concerned with whether t_lofault was set
4501	! when we came in. We end up here from either kzero()
4502	! or bzero(). kzero() *always* sets a lofault handler.
4503	! It ors LOFAULT_SET into %o5 to indicate it has done
4504	! this even if the value of %o5 is otherwise zero.
4505	! bzero() sets a lofault handler *only* if one was
4506	! previously set. Accordingly we need to examine
4507	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
4508	! before resetting the error handler.
4509	!
4510	tst	%o5
4511	bz	%ncc, 1f
4512	andn	%o5, LOFAULT_SET, %o5	/* delay slot: andn leaves the tst result untouched */
4513	membar	#Sync				! sync error barrier
4514	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
45151:
4516	retl
4517	clr	%o0			! return (0)
4518
4519	SET_SIZE(bzero)
4520#endif	/* lint */
4521