xref: /titanic_51/usr/src/uts/sun4u/cpu/spitfire_copy.s (revision 381a2a9a387f449fab7d0c7e97c4184c26963abf)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/param.h>
30#include <sys/errno.h>
31#include <sys/asm_linkage.h>
32#include <sys/vtrace.h>
33#include <sys/machthread.h>
34#include <sys/clock.h>
35#include <sys/asi.h>
36#include <sys/fsr.h>
37#include <sys/privregs.h>
38
39#if !defined(lint)
40#include "assym.h"
41#endif	/* lint */
42
43
44/*
45 * Pseudo-code to aid in understanding the control flow of the
46 * bcopy routine.
47 *
48 * On entry to bcopy:
49 *
50 *	%l6 = curthread->t_lofault;
51 *	used_block_copy = FALSE;			! %l6 |= 1
52 *	if (%l6 != NULL) {
53 *		curthread->t_lofault = .copyerr;
54 *		caller_error_handler = TRUE		! %l6 |= 2
55 *	}
56 *
57 * 	if (length < VIS_COPY)
58 * 		goto regular_copy;
59 *
60 * 	if (!use_vis)
61 * 		goto regular_copy;
62 *
63 * 	if (curthread->t_lwp == NULL) {
64 *		! Kernel threads do not have pcb's in which to store
65 *		! the floating point state, disallow preemption during
66 *		! the copy.
67 * 		kpreempt_disable(curthread);
68 *	}
69 *
70 * 	old_fprs = %fprs;
71 * 	old_gsr = %gsr;
72 * 	if (%fprs.fef) {
73 *              ! If we need to save 4 blocks of fpregs then make sure
74 *		! the length is still appropriate for that extra overhead.
75 * 		if (length < (large_length + (64 * 4))) {
76 * 			if (curthread->t_lwp == NULL)
77 * 				kpreempt_enable(curthread);
78 * 			goto regular_copy;
79 * 		}
80 * 		%fprs.fef = 1;
81 * 		save current fpregs on stack using blockstore
82 * 	} else {
83 * 		%fprs.fef = 1;
84 * 	}
85 *
86 * 	used_block_copy = 1;				! %l6 |= 1
87 * 	do_blockcopy_here;
88 *
89 * In lofault handler:
90 *	curthread->t_lofault = .copyerr2;
91 *	Continue on with the normal exit handler
92 *
93 * On exit:
94 *	call_kpreempt = 0;
95 * 	if (used_block_copy) {				! %l6 & 1
96 * 		%gsr = old_gsr;
97 * 		if (old_fprs & FPRS_FEF)
98 * 			restore fpregs from stack using blockload
99 *		else
100 *			zero fpregs
101 * 		%fprs = old_fprs;
102 * 		if (curthread->t_lwp == NULL) {
103 *			kpreempt_enable(curthread);
104 *			call_kpreempt = 1;
105 *		}
106 * 	}
107 * 	curthread->t_lofault = (%l6 & ~3);
108 *	if (call_kpreempt)
109 *		kpreempt(%pil);
110 * 	return (0)
111 *
112 * In second lofault handler (.copyerr2):
113 *	We've tried to restore fp state from the stack and failed.  To
114 *	avoid returning with a corrupted fp state, we will panic.
115 */
116
117/*
118 * Notes on preserving existing fp state:
119 *
120 * When a copyOP decides to use fp we may have to preserve existing
121 * floating point state.  It is not the caller's state that we need to
122 * preserve - the rest of the kernel does not use fp and, anyway, fp
123 * registers are volatile across a call.  Some examples:
124 *
125 *	- userland has fp state and is interrupted (device interrupt
126 *	  or trap) and within the interrupt/trap handling we use
127 *	  bcopy()
128 *	- another (higher level) interrupt or trap handler uses bcopy
129 *	  while a bcopy from an earlier interrupt is still active
130 *	- an asynchronous error trap occurs while fp state exists (in
131 *	  userland or in kernel copy) and the tl0 component of the handling
132 *	  uses bcopy
133 *	- a user process with fp state incurs a copy-on-write fault and
134 *	  hwblkpagecopy always uses fp
135 *
136 * We therefore need a per-call place in which to preserve fp state -
137 * using our stack is ideal (and since fp copy cannot be leaf optimized
138 * because of calls it makes, this is no hardship).
139 *
140 * To make sure that floating point state is always saved and restored
141 * correctly, the following "big rules" must be followed when the floating
142 * point registers will be used:
143 *
144 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
145 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
146 *    use.  Bit 2 (BCOPY_FLAG) indicates that the call was to bcopy.
147 *
148 * 2. The FPUSED flag indicates that all FP state has been successfully stored
149 *    on the stack.  It should not be set until this save has been completed.
150 *
151 * 3. The FPUSED flag should not be cleared on exit until all FP state has
152 *    been restored from the stack.  If an error occurs while restoring
153 *    data from the stack, the error handler can check this flag to see if
154 *    a restore is necessary.
155 *
156 * 4. Code run under the new lofault handler must be kept to a minimum.  In
157 *    particular, any calls to kpreempt() should not be made until after the
158 *    lofault handler has been restored.
159 */
160
161/*
162 * This shadows sys/machsystm.h which can't be included due to the lack of
163 * _ASM guards in include files it references. Change it here, change it there.
164 */
165#define VIS_COPY_THRESHOLD 900
166
167/*
168 * At or below this number of bytes we will always copy byte-for-byte
169 */
170#define	SMALL_LIMIT	7
171
172/*
173 * Flags set in the lower bits of the t_lofault address:
174 * FPUSED_FLAG: The FP registers were in use and must be restored
175 * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
176 * COPY_FLAGS: Both of the above
177 *
178 * Other flags:
179 * KPREEMPT_FLAG: kpreempt needs to be called
180 */
/*
 * Where the flags are kept in the code that follows:
 *  - FPUSED_FLAG and BCOPY_FLAG are ORed into the saved t_lofault value
 *    held in %l6; COPY_FLAGS is the mask used to strip them again before
 *    that value is written back to t_lofault.
 *  - In .copyerr, KPREEMPT_FLAG is set and tested in %l1 (alongside a
 *    copy of the BCOPY_FLAG bit), not in %l6.
 */
181#define	FPUSED_FLAG	1
182#define BCOPY_FLAG	2
183#define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
184#define	KPREEMPT_FLAG	4
185
186/*
187 * Size of stack frame in order to accommodate a 64-byte aligned
188 * floating-point register save area and 2 32-bit temp locations.
189 */
190#define	HWCOPYFRAMESIZE	((64 * 5) + (2 * 4))
191
/*
 * How the frame is used by the code in this file:
 *  - The saved %fprs and %gsr values are 32-bit slots at
 *    %fp + STACK_BIAS - SAVED_FPRS_OFFSET and
 *    %fp + STACK_BIAS - SAVED_GSR_OFFSET respectively.
 *  - The fp register save area is four 64-byte blocks starting at
 *    (%fp + STACK_BIAS - 257) rounded down to a 64-byte boundary.  The
 *    rounding can move the start down by up to 63 bytes, so the area
 *    always lies within the (64 * 5) bytes just below %fp + STACK_BIAS.
 */
192#define SAVED_FPREGS_OFFSET	(64 * 5)
193#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 4)
194#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 4)
195
196/*
197 * Common macros used by the various versions of the block copy
198 * routines in this file.
199 */
200
/*
 * FZERO: clear all 32 double-precision fp registers (%f0 - %f62).
 *
 * %f0 and %f2 are cleared with fzero; every remaining register is then
 * written with the result of combining those two zeros, alternating
 * faddd (0 + 0) and fmuld (0 * 0).
 * NOTE(review): the add/multiply alternation is presumably there to keep
 * both fp pipelines busy - confirm against the processor manual.
 *
 * .copyerr uses this when the saved %fprs shows the fp unit was not in
 * use before the copy, so there is no saved register state to reload.
 */
201#define	FZERO				\
202	fzero	%f0			;\
203	fzero	%f2			;\
204	faddd	%f0, %f2, %f4		;\
205	fmuld	%f0, %f2, %f6		;\
206	faddd	%f0, %f2, %f8		;\
207	fmuld	%f0, %f2, %f10		;\
208	faddd	%f0, %f2, %f12		;\
209	fmuld	%f0, %f2, %f14		;\
210	faddd	%f0, %f2, %f16		;\
211	fmuld	%f0, %f2, %f18		;\
212	faddd	%f0, %f2, %f20		;\
213	fmuld	%f0, %f2, %f22		;\
214	faddd	%f0, %f2, %f24		;\
215	fmuld	%f0, %f2, %f26		;\
216	faddd	%f0, %f2, %f28		;\
217	fmuld	%f0, %f2, %f30		;\
218	faddd	%f0, %f2, %f32		;\
219	fmuld	%f0, %f2, %f34		;\
220	faddd	%f0, %f2, %f36		;\
221	fmuld	%f0, %f2, %f38		;\
222	faddd	%f0, %f2, %f40		;\
223	fmuld	%f0, %f2, %f42		;\
224	faddd	%f0, %f2, %f44		;\
225	fmuld	%f0, %f2, %f46		;\
226	faddd	%f0, %f2, %f48		;\
227	fmuld	%f0, %f2, %f50		;\
228	faddd	%f0, %f2, %f52		;\
229	fmuld	%f0, %f2, %f54		;\
230	faddd	%f0, %f2, %f56		;\
231	fmuld	%f0, %f2, %f58		;\
232	faddd	%f0, %f2, %f60		;\
233	fmuld	%f0, %f2, %f62
234
235
/*
 * FALIGN_D0 / FALIGN_D16 / FALIGN_D32: the three alignment passes used by
 * the seg0 loop of the block copy.
 *
 * Each macro issues eight faligndata instructions over consecutive pairs
 * of double registers, starting with the register named in the macro and
 * wrapping from %d46 back to %d0.  The eight results are left in
 * %d48 - %d62, which the callers then write out with a single block
 * store from %d48.
 */
236#define	FALIGN_D0			\
237	faligndata %d0, %d2, %d48	;\
238	faligndata %d2, %d4, %d50	;\
239	faligndata %d4, %d6, %d52	;\
240	faligndata %d6, %d8, %d54	;\
241	faligndata %d8, %d10, %d56	;\
242	faligndata %d10, %d12, %d58	;\
243	faligndata %d12, %d14, %d60	;\
244	faligndata %d14, %d16, %d62
245
246#define	FALIGN_D16			\
247	faligndata %d16, %d18, %d48	;\
248	faligndata %d18, %d20, %d50	;\
249	faligndata %d20, %d22, %d52	;\
250	faligndata %d22, %d24, %d54	;\
251	faligndata %d24, %d26, %d56	;\
252	faligndata %d26, %d28, %d58	;\
253	faligndata %d28, %d30, %d60	;\
254	faligndata %d30, %d32, %d62
255
256#define	FALIGN_D32			\
257	faligndata %d32, %d34, %d48	;\
258	faligndata %d34, %d36, %d50	;\
259	faligndata %d36, %d38, %d52	;\
260	faligndata %d38, %d40, %d54	;\
261	faligndata %d40, %d42, %d56	;\
262	faligndata %d42, %d44, %d58	;\
263	faligndata %d44, %d46, %d60	;\
264	faligndata %d46, %d0, %d62
265
/*
 * FALIGN_D2 / FALIGN_D18 / FALIGN_D34: the three alignment passes used by
 * the seg1 loop of the block copy.  Same shape as the other FALIGN_D*
 * macros: eight faligndata operations over consecutive double register
 * pairs beginning at the named register (wrapping from %d46 to %d0),
 * with the results placed in %d48 - %d62.
 */
266#define	FALIGN_D2			\
267	faligndata %d2, %d4, %d48	;\
268	faligndata %d4, %d6, %d50	;\
269	faligndata %d6, %d8, %d52	;\
270	faligndata %d8, %d10, %d54	;\
271	faligndata %d10, %d12, %d56	;\
272	faligndata %d12, %d14, %d58	;\
273	faligndata %d14, %d16, %d60	;\
274	faligndata %d16, %d18, %d62
275
276#define	FALIGN_D18			\
277	faligndata %d18, %d20, %d48	;\
278	faligndata %d20, %d22, %d50	;\
279	faligndata %d22, %d24, %d52	;\
280	faligndata %d24, %d26, %d54	;\
281	faligndata %d26, %d28, %d56	;\
282	faligndata %d28, %d30, %d58	;\
283	faligndata %d30, %d32, %d60	;\
284	faligndata %d32, %d34, %d62
285
286#define	FALIGN_D34			\
287	faligndata %d34, %d36, %d48	;\
288	faligndata %d36, %d38, %d50	;\
289	faligndata %d38, %d40, %d52	;\
290	faligndata %d40, %d42, %d54	;\
291	faligndata %d42, %d44, %d56	;\
292	faligndata %d44, %d46, %d58	;\
293	faligndata %d46, %d0, %d60	;\
294	faligndata %d0, %d2, %d62
295
/*
 * FALIGN_D4 / FALIGN_D20 / FALIGN_D36: the three alignment passes used by
 * the seg2 loop of the block copy.  Same shape as the other FALIGN_D*
 * macros: eight faligndata operations over consecutive double register
 * pairs beginning at the named register (wrapping from %d46 to %d0),
 * with the results placed in %d48 - %d62.
 */
296#define	FALIGN_D4			\
297	faligndata %d4, %d6, %d48	;\
298	faligndata %d6, %d8, %d50	;\
299	faligndata %d8, %d10, %d52	;\
300	faligndata %d10, %d12, %d54	;\
301	faligndata %d12, %d14, %d56	;\
302	faligndata %d14, %d16, %d58	;\
303	faligndata %d16, %d18, %d60	;\
304	faligndata %d18, %d20, %d62
305
306#define	FALIGN_D20			\
307	faligndata %d20, %d22, %d48	;\
308	faligndata %d22, %d24, %d50	;\
309	faligndata %d24, %d26, %d52	;\
310	faligndata %d26, %d28, %d54	;\
311	faligndata %d28, %d30, %d56	;\
312	faligndata %d30, %d32, %d58	;\
313	faligndata %d32, %d34, %d60	;\
314	faligndata %d34, %d36, %d62
315
316#define	FALIGN_D36			\
317	faligndata %d36, %d38, %d48	;\
318	faligndata %d38, %d40, %d50	;\
319	faligndata %d40, %d42, %d52	;\
320	faligndata %d42, %d44, %d54	;\
321	faligndata %d44, %d46, %d56	;\
322	faligndata %d46, %d0, %d58	;\
323	faligndata %d0, %d2, %d60	;\
324	faligndata %d2, %d4, %d62
325
/*
 * FALIGN_D6 / FALIGN_D22 / FALIGN_D38: the three alignment passes used by
 * the seg3 loop of the block copy.  Same shape as the other FALIGN_D*
 * macros: eight faligndata operations over consecutive double register
 * pairs beginning at the named register (wrapping from %d46 to %d0),
 * with the results placed in %d48 - %d62.
 */
326#define	FALIGN_D6			\
327	faligndata %d6, %d8, %d48	;\
328	faligndata %d8, %d10, %d50	;\
329	faligndata %d10, %d12, %d52	;\
330	faligndata %d12, %d14, %d54	;\
331	faligndata %d14, %d16, %d56	;\
332	faligndata %d16, %d18, %d58	;\
333	faligndata %d18, %d20, %d60	;\
334	faligndata %d20, %d22, %d62
335
336#define	FALIGN_D22			\
337	faligndata %d22, %d24, %d48	;\
338	faligndata %d24, %d26, %d50	;\
339	faligndata %d26, %d28, %d52	;\
340	faligndata %d28, %d30, %d54	;\
341	faligndata %d30, %d32, %d56	;\
342	faligndata %d32, %d34, %d58	;\
343	faligndata %d34, %d36, %d60	;\
344	faligndata %d36, %d38, %d62
345
346#define	FALIGN_D38			\
347	faligndata %d38, %d40, %d48	;\
348	faligndata %d40, %d42, %d50	;\
349	faligndata %d42, %d44, %d52	;\
350	faligndata %d44, %d46, %d54	;\
351	faligndata %d46, %d0, %d56	;\
352	faligndata %d0, %d2, %d58	;\
353	faligndata %d2, %d4, %d60	;\
354	faligndata %d4, %d6, %d62
355
/*
 * FALIGN_D8 / FALIGN_D24 / FALIGN_D40: the three alignment passes used by
 * the seg4 loop of the block copy.  Same shape as the other FALIGN_D*
 * macros: eight faligndata operations over consecutive double register
 * pairs beginning at the named register (wrapping from %d46 to %d0),
 * with the results placed in %d48 - %d62.
 */
356#define	FALIGN_D8			\
357	faligndata %d8, %d10, %d48	;\
358	faligndata %d10, %d12, %d50	;\
359	faligndata %d12, %d14, %d52	;\
360	faligndata %d14, %d16, %d54	;\
361	faligndata %d16, %d18, %d56	;\
362	faligndata %d18, %d20, %d58	;\
363	faligndata %d20, %d22, %d60	;\
364	faligndata %d22, %d24, %d62
365
366#define	FALIGN_D24			\
367	faligndata %d24, %d26, %d48	;\
368	faligndata %d26, %d28, %d50	;\
369	faligndata %d28, %d30, %d52	;\
370	faligndata %d30, %d32, %d54	;\
371	faligndata %d32, %d34, %d56	;\
372	faligndata %d34, %d36, %d58	;\
373	faligndata %d36, %d38, %d60	;\
374	faligndata %d38, %d40, %d62
375
376#define	FALIGN_D40			\
377	faligndata %d40, %d42, %d48	;\
378	faligndata %d42, %d44, %d50	;\
379	faligndata %d44, %d46, %d52	;\
380	faligndata %d46, %d0, %d54	;\
381	faligndata %d0, %d2, %d56	;\
382	faligndata %d2, %d4, %d58	;\
383	faligndata %d4, %d6, %d60	;\
384	faligndata %d6, %d8, %d62
385
/*
 * FALIGN_D10 / FALIGN_D26 / FALIGN_D42: the three alignment passes used
 * by the seg5 loop of the block copy.  Same shape as the other FALIGN_D*
 * macros: eight faligndata operations over consecutive double register
 * pairs beginning at the named register (wrapping from %d46 to %d0),
 * with the results placed in %d48 - %d62.
 */
386#define	FALIGN_D10			\
387	faligndata %d10, %d12, %d48	;\
388	faligndata %d12, %d14, %d50	;\
389	faligndata %d14, %d16, %d52	;\
390	faligndata %d16, %d18, %d54	;\
391	faligndata %d18, %d20, %d56	;\
392	faligndata %d20, %d22, %d58	;\
393	faligndata %d22, %d24, %d60	;\
394	faligndata %d24, %d26, %d62
395
396#define	FALIGN_D26			\
397	faligndata %d26, %d28, %d48	;\
398	faligndata %d28, %d30, %d50	;\
399	faligndata %d30, %d32, %d52	;\
400	faligndata %d32, %d34, %d54	;\
401	faligndata %d34, %d36, %d56	;\
402	faligndata %d36, %d38, %d58	;\
403	faligndata %d38, %d40, %d60	;\
404	faligndata %d40, %d42, %d62
405
406#define	FALIGN_D42			\
407	faligndata %d42, %d44, %d48	;\
408	faligndata %d44, %d46, %d50	;\
409	faligndata %d46, %d0, %d52	;\
410	faligndata %d0, %d2, %d54	;\
411	faligndata %d2, %d4, %d56	;\
412	faligndata %d4, %d6, %d58	;\
413	faligndata %d6, %d8, %d60	;\
414	faligndata %d8, %d10, %d62
415
/*
 * FALIGN_D12 / FALIGN_D28 / FALIGN_D44: the three alignment passes used
 * by the seg6 loop of the block copy.  Same shape as the other FALIGN_D*
 * macros: eight faligndata operations over consecutive double register
 * pairs beginning at the named register (wrapping from %d46 to %d0),
 * with the results placed in %d48 - %d62.
 */
416#define	FALIGN_D12			\
417	faligndata %d12, %d14, %d48	;\
418	faligndata %d14, %d16, %d50	;\
419	faligndata %d16, %d18, %d52	;\
420	faligndata %d18, %d20, %d54	;\
421	faligndata %d20, %d22, %d56	;\
422	faligndata %d22, %d24, %d58	;\
423	faligndata %d24, %d26, %d60	;\
424	faligndata %d26, %d28, %d62
425
426#define	FALIGN_D28			\
427	faligndata %d28, %d30, %d48	;\
428	faligndata %d30, %d32, %d50	;\
429	faligndata %d32, %d34, %d52	;\
430	faligndata %d34, %d36, %d54	;\
431	faligndata %d36, %d38, %d56	;\
432	faligndata %d38, %d40, %d58	;\
433	faligndata %d40, %d42, %d60	;\
434	faligndata %d42, %d44, %d62
435
436#define	FALIGN_D44			\
437	faligndata %d44, %d46, %d48	;\
438	faligndata %d46, %d0, %d50	;\
439	faligndata %d0, %d2, %d52	;\
440	faligndata %d2, %d4, %d54	;\
441	faligndata %d4, %d6, %d56	;\
442	faligndata %d6, %d8, %d58	;\
443	faligndata %d8, %d10, %d60	;\
444	faligndata %d10, %d12, %d62
445
/*
 * FALIGN_D14 / FALIGN_D30 / FALIGN_D46: alignment passes for the seg7
 * loop of the block copy (FALIGN_D14 and FALIGN_D30 are used there;
 * FALIGN_D46 is presumably the third pass of the same loop - TODO
 * confirm).  Same shape as the other FALIGN_D* macros: eight faligndata
 * operations over consecutive double register pairs beginning at the
 * named register (wrapping from %d46 to %d0), with the results placed in
 * %d48 - %d62.
 */
446#define	FALIGN_D14			\
447	faligndata %d14, %d16, %d48	;\
448	faligndata %d16, %d18, %d50	;\
449	faligndata %d18, %d20, %d52	;\
450	faligndata %d20, %d22, %d54	;\
451	faligndata %d22, %d24, %d56	;\
452	faligndata %d24, %d26, %d58	;\
453	faligndata %d26, %d28, %d60	;\
454	faligndata %d28, %d30, %d62
455
456#define	FALIGN_D30			\
457	faligndata %d30, %d32, %d48	;\
458	faligndata %d32, %d34, %d50	;\
459	faligndata %d34, %d36, %d52	;\
460	faligndata %d36, %d38, %d54	;\
461	faligndata %d38, %d40, %d56	;\
462	faligndata %d40, %d42, %d58	;\
463	faligndata %d42, %d44, %d60	;\
464	faligndata %d44, %d46, %d62
465
466#define	FALIGN_D46			\
467	faligndata %d46, %d0, %d48	;\
468	faligndata %d0, %d2, %d50	;\
469	faligndata %d2, %d4, %d52	;\
470	faligndata %d4, %d6, %d54	;\
471	faligndata %d6, %d8, %d56	;\
472	faligndata %d8, %d10, %d58	;\
473	faligndata %d10, %d12, %d60	;\
474	faligndata %d12, %d14, %d62
475
476
477/*
478 * Copy a block of storage, returning an error code if `from' or
479 * `to' takes a kernel pagefault which cannot be resolved.
480 * Returns errno value on pagefault error, 0 if all ok
481 */
482
483
484
485#if defined(lint)
486
487/* ARGSUSED */
488int
489kcopy(const void *from, void *to, size_t count)
490{ return(0); }
491
492#else	/* lint */
493
494	.seg	".text"
495	.align	4
496
	/*
	 * kcopy entry: unconditionally install .copyerr as the thread's
	 * t_lofault handler, remember the previous handler, and join the
	 * copy code shared with bcopy at .do_copy.  The previous handler
	 * is moved into %l6 without any flag bits being ORed in on this
	 * path; .copyerr relies on BCOPY_FLAG being clear to tell a kcopy
	 * fault from a bcopy fault.
	 */
497	ENTRY(kcopy)
498
499	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
500	set	.copyerr, %l6		! copyerr is lofault value
501	ldn	[THREAD_REG + T_LOFAULT], %l7	! save existing handler
502	membar	#Sync			! sync error barrier (see copy.s)
503	stn	%l6, [THREAD_REG + T_LOFAULT]	! set t_lofault
504	!
505	! Note that we carefully do *not* flag the setting of
506	! t_lofault.
507	!
508	ba,pt	%ncc, .do_copy		! common code
509	  mov	%l7, %l6
510
511/*
512 * We got here because of a fault during kcopy or bcopy if a fault
513 * handler existed when bcopy was called.
514 * Errno value is in %g1.
515 */
516.copyerr:
	! From here on a further fault is routed to .copyerr2, which panics.
517	set	.copyerr2, %l1
518	membar	#Sync			! sync error barrier
519	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	! If the fp registers were not used there is nothing to restore.
	! The delay slot is not annulled, so %l1 receives the BCOPY_FLAG
	! bit of %l6 on both the taken and the fall-through path.
520	btst	FPUSED_FLAG, %l6
521	bz	%icc, 1f
522	  and	%l6, BCOPY_FLAG, %l1	! copy flag to %l1
523
524	membar	#Sync
525
526	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
527	wr	%o2, 0, %gsr
528
	! %o3 = the %fprs value saved before the block copy started.  If
	! FEF was clear then no fp registers were saved, so they are zeroed
	! at 4: instead of being reloaded.
529	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
530	btst	FPRS_FEF, %o3
531	bz	%icc, 4f
532	  nop
533
534	! restore fpregs from stack
	! (the save area address is recomputed with the same arithmetic
	! that the save path in bcopy uses: round %fp + STACK_BIAS - 257
	! down to a 64-byte boundary, then four consecutive 64-byte blocks)
535	membar	#Sync
536	add	%fp, STACK_BIAS - 257, %o2
537	and	%o2, -64, %o2
538	ldda	[%o2]ASI_BLK_P, %d0
539	add	%o2, 64, %o2
540	ldda	[%o2]ASI_BLK_P, %d16
541	add	%o2, 64, %o2
542	ldda	[%o2]ASI_BLK_P, %d32
543	add	%o2, 64, %o2
544	ldda	[%o2]ASI_BLK_P, %d48
545	membar	#Sync
546
547	ba,pt	%ncc, 2f
548	  wr	%o3, 0, %fprs		! restore fprs
549
5504:
551	FZERO				! zero all of the fpregs
552	wr	%o3, 0, %fprs		! restore fprs
553
	! Threads with no lwp had t_preempt incremented before the block
	! copy began (see bcopy); undo that here.  Threads that do have an
	! lwp skip straight to 1:.
5542:	ldn	[THREAD_REG + T_LWP], %o2
555	tst	%o2
556	bnz,pt	%ncc, 1f
557	  nop
558
	! t_preempt--.  The stb sits in the delay slot of a branch that is
	! not annulled, so the decremented value is stored whether or not
	! the branch is taken.
559	ldsb	[THREAD_REG + T_PREEMPT], %l0
560	deccc	%l0
561	bnz,pn	%ncc, 1f
562	  stb	%l0, [THREAD_REG + T_PREEMPT]
563
564	! Check for a kernel preemption request
	! (the annulled delay slot sets KPREEMPT_FLAG in %l1 only when
	! cpu_kprunrun is non-zero; control reaches 1: either way)
565	ldn	[THREAD_REG + T_CPU], %l0
566	ldub	[%l0 + CPU_KPRUNRUN], %l0
567	tst	%l0
568	bnz,a,pt	%ncc, 1f	! Need to call kpreempt?
569	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
570
571	!
572	! Need to cater for the different expectations of kcopy
573	! and bcopy. kcopy will *always* set a t_lofault handler
574	! If it fires, we're expected to just return the error code
575	! and *not* to invoke any existing error handler. As far as
576	! bcopy is concerned, we only set t_lofault if there was an
577	! existing lofault handler. In that case we're expected to
578	! invoke the previously existing handler after resetting the
579	! t_lofault value.
580	!
5811:
582	andn	%l6, COPY_FLAGS, %l6	! remove flags from lofault address
583	membar	#Sync			! sync error barrier
584	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
585
586	! call kpreempt if necessary
587	btst	KPREEMPT_FLAG, %l1
588	bz,pt	%icc, 2f
589	  nop
590	call	kpreempt
591	  rdpr	%pil, %o0	! pass %pil
5922:
	! BCOPY_FLAG clear: this was a kcopy call, so return the errno
	! from %g1 to the caller (the restore in the delay slot of ret
	! places it in the caller's %o0).
593	btst	BCOPY_FLAG, %l1
594	bnz,pn	%ncc, 3f
595	  nop
596	ret
597	restore	%g1, 0, %o0
598
5993:
600	!
601	! We're here via bcopy. There *must* have been an error handler
602	! in place otherwise we would have died a nasty death already.
603	!
604	jmp	%l6				! goto real handler
605	restore	%g0, 0, %o0			! dispose of copy window
606
607/*
608 * We got here because of a fault in .copyerr.  We can't safely restore fp
609 * state, so we panic.
610 */
	! The message text is placed in line here, hence the .align 4
	! before code resumes at .copyerr2.
611fp_panic_msg:
612	.asciz	"Unable to restore fp state after copy operation"
613
614	.align	4
615.copyerr2:
616	set	fp_panic_msg, %o0
617	call	panic
618	  nop
619	SET_SIZE(kcopy)
620#endif	/* lint */
621
622
623/*
624 * Copy a block of storage - must not overlap (from + len <= to).
625 * Registers: l6 - saved t_lofault
626 *
627 * Copy a page of memory.
628 * Assumes double word alignment and a count >= 256.
629 */
630#if defined(lint)
631
632/* ARGSUSED */
633void
634bcopy(const void *from, void *to, size_t count)
635{}
636
637#else	/* lint */
638
639	ENTRY(bcopy)
640
641	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
642	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
643	tst	%l6
644        !
645        ! We've already captured whether t_lofault was zero on entry.
646        ! We need to mark ourselves as being from bcopy since both
647        ! kcopy and bcopy use the same code path. If BCOPY_FLAG is set
648        ! and the saved lofault was zero, we won't reset lofault on
649        ! returning.
650        !
651	or	%l6, BCOPY_FLAG, %l6
652	bz,pt	%ncc, .do_copy
653	sethi	%hi(.copyerr), %o2
654	or	%o2, %lo(.copyerr), %o2
655	membar	#Sync			! sync error barrier
656	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
657
658.do_copy:
659	cmp	%i2, 12			! for small counts
660	blu	%ncc, .bytecp		! just copy bytes
661	  .empty
662
663	cmp	%i2, VIS_COPY_THRESHOLD	! for large counts
664	blu,pt	%ncc, .bcb_punt
665	  .empty
666
667	!
668	! Check to see if VIS acceleration is enabled
669	!
670	sethi	%hi(use_hw_bcopy), %o2
671	ld	[%o2 + %lo(use_hw_bcopy)], %o2
672	tst	%o2
673	bz,pn	%icc, .bcb_punt
674	  nop
675
676	subcc	%i1, %i0, %i3
677	bneg,a,pn %ncc, 1f
678	neg	%i3
6791:
680	/*
681	 * Compare against 256 since we should be checking block addresses
682	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
683	 * src = dest + (64 * 3) + 63.
684	 */
685	cmp	%i3, 256
686	blu,pn	%ncc, .bcb_punt
687	  nop
688
689	ldn	[THREAD_REG + T_LWP], %o3
690	tst	%o3
691	bnz,pt	%ncc, 1f
692	  nop
693
694	! kpreempt_disable();
695	ldsb	[THREAD_REG + T_PREEMPT], %o2
696	inc	%o2
697	stb	%o2, [THREAD_REG + T_PREEMPT]
698
6991:
700	rd	%fprs, %o2		! check for unused fp
701	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
702	btst	FPRS_FEF, %o2
703	bz,a	%icc, .do_blockcopy
704	  wr	%g0, FPRS_FEF, %fprs
705
706.bcb_fpregs_inuse:
707	cmp	%i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
708	bgeu	%ncc, 1f		!  if we have to save the fpregs)
709	  nop
710
711	tst	%o3
712	bnz,pt	%ncc, .bcb_punt
713	  nop
714
715	ldsb	[THREAD_REG + T_PREEMPT], %l0
716	deccc	%l0
717	bnz,pn	%icc, .bcb_punt
718	  stb	%l0, [THREAD_REG + T_PREEMPT]
719
720	! Check for a kernel preemption request
721	ldn	[THREAD_REG + T_CPU], %l0
722	ldub	[%l0 + CPU_KPRUNRUN], %l0
723	tst	%l0
724	bz,pt	%icc, .bcb_punt
725	  nop
726
727	! Attempt to preempt
728	call	kpreempt
729	  rdpr	  %pil, %o0		  ! pass %pil
730
731	ba,pt	%ncc, .bcb_punt
732	  nop
733
7341:
735	wr	%g0, FPRS_FEF, %fprs
736
737	! save in-use fpregs on stack
738	membar	#Sync
739	add	%fp, STACK_BIAS - 257, %o2
740	and	%o2, -64, %o2
741	stda	%d0, [%o2]ASI_BLK_P
742	add	%o2, 64, %o2
743	stda	%d16, [%o2]ASI_BLK_P
744	add	%o2, 64, %o2
745	stda	%d32, [%o2]ASI_BLK_P
746	add	%o2, 64, %o2
747	stda	%d48, [%o2]ASI_BLK_P
748	membar	#Sync
749
750.do_blockcopy:
751	membar	#StoreStore|#StoreLoad|#LoadStore
752
753	rd	%gsr, %o2
754	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
755
756	! Set the lower bit in the saved t_lofault to indicate
757	! that we need to clear the %fprs register on the way
758	! out
759	or	%l6, FPUSED_FLAG, %l6
760
761	! Swap src/dst since the code below is memcpy code
762	! and memcpy/bcopy have different calling sequences
763	mov	%i1, %i5
764	mov	%i0, %i1
765	mov	%i5, %i0
766
767!!! This code is nearly identical to the version in the sun4u
768!!! libc_psr.  Most bugfixes made to that file should be
769!!! merged into this routine.
770
771	andcc	%i0, 7, %o3
772	bz,pt	%ncc, blkcpy
773	sub	%o3, 8, %o3
774	neg	%o3
775	sub	%i2, %o3, %i2
776
777	! Align Destination on double-word boundary
778
7792:	ldub	[%i1], %o4
780	inc	%i1
781	inc	%i0
782	deccc	%o3
783	bgu	%ncc, 2b
784	stb	%o4, [%i0 - 1]
785blkcpy:
786	andcc	%i0, 63, %i3
787	bz,pn	%ncc, blalign		! now block aligned
788	sub	%i3, 64, %i3
789	neg	%i3			! bytes till block aligned
790	sub	%i2, %i3, %i2		! update %i2 with new count
791
792	! Copy %i3 bytes till dst is block (64 byte) aligned. use
793	! double word copies.
794
795	alignaddr %i1, %g0, %g1
796	ldd	[%g1], %d0
797	add	%g1, 8, %g1
7986:
799	ldd	[%g1], %d2
800	add	%g1, 8, %g1
801	subcc	%i3, 8, %i3
802	faligndata %d0, %d2, %d8
803	std	%d8, [%i0]
804	add	%i1, 8, %i1
805	bz,pn	%ncc, blalign
806	add	%i0, 8, %i0
807	ldd	[%g1], %d0
808	add	%g1, 8, %g1
809	subcc	%i3, 8, %i3
810	faligndata %d2, %d0, %d8
811	std	%d8, [%i0]
812	add	%i1, 8, %i1
813	bgu,pn	%ncc, 6b
814	add	%i0, 8, %i0
815
816blalign:
817	membar	#StoreLoad
818	! %i2 = total length
819	! %i3 = blocks	(length - 64) / 64
820	! %i4 = doubles remaining  (length - blocks)
821	sub	%i2, 64, %i3
822	andn	%i3, 63, %i3
823	sub	%i2, %i3, %i4
824	andn	%i4, 7, %i4
825	sub	%i4, 16, %i4
826	sub	%i2, %i4, %i2
827	sub	%i2, %i3, %i2
828
829	andn	%i1, 0x3f, %l7		! blk aligned address
830	alignaddr %i1, %g0, %g0		! gen %gsr
831
832	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
833	andcc	%l5, 7, %i5		! mask everything except bits 1,2 3
834	add	%i1, %i4, %i1
835	add	%i1, %i3, %i1
836
837	ldda	[%l7]ASI_BLK_P, %d0
838	add	%l7, 64, %l7
839	ldda	[%l7]ASI_BLK_P, %d16
840	add	%l7, 64, %l7
841	ldda	[%l7]ASI_BLK_P, %d32
842	add	%l7, 64, %l7
843	sub	%i3, 128, %i3
844
845	! switch statement to get us to the right 8 byte blk within a
846	! 64 byte block
847	cmp	 %i5, 4
848	bgeu,a	 hlf
849	cmp	 %i5, 6
850	cmp	 %i5, 2
851	bgeu,a	 sqtr
852	nop
853	cmp	 %i5, 1
854	be,a	 seg1
855	nop
856	ba,pt	 %ncc, seg0
857	nop
858sqtr:
859	be,a	 seg2
860	nop
861	ba,pt	 %ncc, seg3
862	nop
863
864hlf:
865	bgeu,a	 fqtr
866	nop
867	cmp	 %i5, 5
868	be,a	 seg5
869	nop
870	ba,pt	 %ncc, seg4
871	nop
872fqtr:
873	be,a	 seg6
874	nop
875	ba,pt	 %ncc, seg7
876	nop
877
878
879seg0:
880	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
881	FALIGN_D0
882	ldda	[%l7]ASI_BLK_P, %d0
883	stda	%d48, [%i0]ASI_BLK_P
884	add	%l7, 64, %l7
885	subcc	%i3, 64, %i3
886	bz,pn	%ncc, 0f
887	add	%i0, 64, %i0
888	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
889	FALIGN_D16
890	ldda	[%l7]ASI_BLK_P, %d16
891	stda	%d48, [%i0]ASI_BLK_P
892	add	%l7, 64, %l7
893	subcc	%i3, 64, %i3
894	bz,pn	%ncc, 1f
895	add	%i0, 64, %i0
896	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
897	FALIGN_D32
898	ldda	[%l7]ASI_BLK_P, %d32
899	stda	%d48, [%i0]ASI_BLK_P
900	add	%l7, 64, %l7
901	subcc	%i3, 64, %i3
902	bz,pn	%ncc, 2f
903	add	%i0, 64, %i0
904	ba,a,pt	%ncc, seg0
905
9060:
907	FALIGN_D16
908	stda	%d48, [%i0]ASI_BLK_P
909	add	%i0, 64, %i0
910	membar	#Sync
911	FALIGN_D32
912	stda	%d48, [%i0]ASI_BLK_P
913	ba,pt	%ncc, blkd0
914	add	%i0, 64, %i0
915
9161:
917	FALIGN_D32
918	stda	%d48, [%i0]ASI_BLK_P
919	add	%i0, 64, %i0
920	membar	#Sync
921	FALIGN_D0
922	stda	%d48, [%i0]ASI_BLK_P
923	ba,pt	%ncc, blkd16
924	add	%i0, 64, %i0
925
9262:
927	FALIGN_D0
928	stda	%d48, [%i0]ASI_BLK_P
929	add	%i0, 64, %i0
930	membar	#Sync
931	FALIGN_D16
932	stda	%d48, [%i0]ASI_BLK_P
933	ba,pt	%ncc, blkd32
934	add	%i0, 64, %i0
935
936seg1:
937	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
938	FALIGN_D2
939	ldda	[%l7]ASI_BLK_P, %d0
940	stda	%d48, [%i0]ASI_BLK_P
941	add	%l7, 64, %l7
942	subcc	%i3, 64, %i3
943	bz,pn	%ncc, 0f
944	add	%i0, 64, %i0
945	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
946	FALIGN_D18
947	ldda	[%l7]ASI_BLK_P, %d16
948	stda	%d48, [%i0]ASI_BLK_P
949	add	%l7, 64, %l7
950	subcc	%i3, 64, %i3
951	bz,pn	%ncc, 1f
952	add	%i0, 64, %i0
953	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
954	FALIGN_D34
955	ldda	[%l7]ASI_BLK_P, %d32
956	stda	%d48, [%i0]ASI_BLK_P
957	add	%l7, 64, %l7
958	subcc	%i3, 64, %i3
959	bz,pn	%ncc, 2f
960	add	%i0, 64, %i0
961	ba,a,pt	%ncc, seg1
9620:
963	FALIGN_D18
964	stda	%d48, [%i0]ASI_BLK_P
965	add	%i0, 64, %i0
966	membar	#Sync
967	FALIGN_D34
968	stda	%d48, [%i0]ASI_BLK_P
969	ba,pt	%ncc, blkd2
970	add	%i0, 64, %i0
971
9721:
973	FALIGN_D34
974	stda	%d48, [%i0]ASI_BLK_P
975	add	%i0, 64, %i0
976	membar	#Sync
977	FALIGN_D2
978	stda	%d48, [%i0]ASI_BLK_P
979	ba,pt	%ncc, blkd18
980	add	%i0, 64, %i0
981
9822:
983	FALIGN_D2
984	stda	%d48, [%i0]ASI_BLK_P
985	add	%i0, 64, %i0
986	membar	#Sync
987	FALIGN_D18
988	stda	%d48, [%i0]ASI_BLK_P
989	ba,pt	%ncc, blkd34
990	add	%i0, 64, %i0
991
992seg2:
993	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
994	FALIGN_D4
995	ldda	[%l7]ASI_BLK_P, %d0
996	stda	%d48, [%i0]ASI_BLK_P
997	add	%l7, 64, %l7
998	subcc	%i3, 64, %i3
999	bz,pn	%ncc, 0f
1000	add	%i0, 64, %i0
1001	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1002	FALIGN_D20
1003	ldda	[%l7]ASI_BLK_P, %d16
1004	stda	%d48, [%i0]ASI_BLK_P
1005	add	%l7, 64, %l7
1006	subcc	%i3, 64, %i3
1007	bz,pn	%ncc, 1f
1008	add	%i0, 64, %i0
1009	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1010	FALIGN_D36
1011	ldda	[%l7]ASI_BLK_P, %d32
1012	stda	%d48, [%i0]ASI_BLK_P
1013	add	%l7, 64, %l7
1014	subcc	%i3, 64, %i3
1015	bz,pn	%ncc, 2f
1016	add	%i0, 64, %i0
1017	ba,a,pt	%ncc, seg2
1018
10190:
1020	FALIGN_D20
1021	stda	%d48, [%i0]ASI_BLK_P
1022	add	%i0, 64, %i0
1023	membar	#Sync
1024	FALIGN_D36
1025	stda	%d48, [%i0]ASI_BLK_P
1026	ba,pt	%ncc, blkd4
1027	add	%i0, 64, %i0
1028
10291:
1030	FALIGN_D36
1031	stda	%d48, [%i0]ASI_BLK_P
1032	add	%i0, 64, %i0
1033	membar	#Sync
1034	FALIGN_D4
1035	stda	%d48, [%i0]ASI_BLK_P
1036	ba,pt	%ncc, blkd20
1037	add	%i0, 64, %i0
1038
10392:
1040	FALIGN_D4
1041	stda	%d48, [%i0]ASI_BLK_P
1042	add	%i0, 64, %i0
1043	membar	#Sync
1044	FALIGN_D20
1045	stda	%d48, [%i0]ASI_BLK_P
1046	ba,pt	%ncc, blkd36
1047	add	%i0, 64, %i0
1048
1049seg3:
1050	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1051	FALIGN_D6
1052	ldda	[%l7]ASI_BLK_P, %d0
1053	stda	%d48, [%i0]ASI_BLK_P
1054	add	%l7, 64, %l7
1055	subcc	%i3, 64, %i3
1056	bz,pn	%ncc, 0f
1057	add	%i0, 64, %i0
1058	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1059	FALIGN_D22
1060	ldda	[%l7]ASI_BLK_P, %d16
1061	stda	%d48, [%i0]ASI_BLK_P
1062	add	%l7, 64, %l7
1063	subcc	%i3, 64, %i3
1064	bz,pn	%ncc, 1f
1065	add	%i0, 64, %i0
1066	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1067	FALIGN_D38
1068	ldda	[%l7]ASI_BLK_P, %d32
1069	stda	%d48, [%i0]ASI_BLK_P
1070	add	%l7, 64, %l7
1071	subcc	%i3, 64, %i3
1072	bz,pn	%ncc, 2f
1073	add	%i0, 64, %i0
1074	ba,a,pt	%ncc, seg3
1075
10760:
1077	FALIGN_D22
1078	stda	%d48, [%i0]ASI_BLK_P
1079	add	%i0, 64, %i0
1080	membar	#Sync
1081	FALIGN_D38
1082	stda	%d48, [%i0]ASI_BLK_P
1083	ba,pt	%ncc, blkd6
1084	add	%i0, 64, %i0
1085
10861:
1087	FALIGN_D38
1088	stda	%d48, [%i0]ASI_BLK_P
1089	add	%i0, 64, %i0
1090	membar	#Sync
1091	FALIGN_D6
1092	stda	%d48, [%i0]ASI_BLK_P
1093	ba,pt	%ncc, blkd22
1094	add	%i0, 64, %i0
1095
10962:
1097	FALIGN_D6
1098	stda	%d48, [%i0]ASI_BLK_P
1099	add	%i0, 64, %i0
1100	membar	#Sync
1101	FALIGN_D22
1102	stda	%d48, [%i0]ASI_BLK_P
1103	ba,pt	%ncc, blkd38
1104	add	%i0, 64, %i0
1105
1106seg4:
1107	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1108	FALIGN_D8
1109	ldda	[%l7]ASI_BLK_P, %d0
1110	stda	%d48, [%i0]ASI_BLK_P
1111	add	%l7, 64, %l7
1112	subcc	%i3, 64, %i3
1113	bz,pn	%ncc, 0f
1114	add	%i0, 64, %i0
1115	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1116	FALIGN_D24
1117	ldda	[%l7]ASI_BLK_P, %d16
1118	stda	%d48, [%i0]ASI_BLK_P
1119	add	%l7, 64, %l7
1120	subcc	%i3, 64, %i3
1121	bz,pn	%ncc, 1f
1122	add	%i0, 64, %i0
1123	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1124	FALIGN_D40
1125	ldda	[%l7]ASI_BLK_P, %d32
1126	stda	%d48, [%i0]ASI_BLK_P
1127	add	%l7, 64, %l7
1128	subcc	%i3, 64, %i3
1129	bz,pn	%ncc, 2f
1130	add	%i0, 64, %i0
1131	ba,a,pt	%ncc, seg4
1132
11330:
1134	FALIGN_D24
1135	stda	%d48, [%i0]ASI_BLK_P
1136	add	%i0, 64, %i0
1137	membar	#Sync
1138	FALIGN_D40
1139	stda	%d48, [%i0]ASI_BLK_P
1140	ba,pt	%ncc, blkd8
1141	add	%i0, 64, %i0
1142
11431:
1144	FALIGN_D40
1145	stda	%d48, [%i0]ASI_BLK_P
1146	add	%i0, 64, %i0
1147	membar	#Sync
1148	FALIGN_D8
1149	stda	%d48, [%i0]ASI_BLK_P
1150	ba,pt	%ncc, blkd24
1151	add	%i0, 64, %i0
1152
11532:
1154	FALIGN_D8
1155	stda	%d48, [%i0]ASI_BLK_P
1156	add	%i0, 64, %i0
1157	membar	#Sync
1158	FALIGN_D24
1159	stda	%d48, [%i0]ASI_BLK_P
1160	ba,pt	%ncc, blkd40
1161	add	%i0, 64, %i0
1162
1163seg5:
1164	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1165	FALIGN_D10
1166	ldda	[%l7]ASI_BLK_P, %d0
1167	stda	%d48, [%i0]ASI_BLK_P
1168	add	%l7, 64, %l7
1169	subcc	%i3, 64, %i3
1170	bz,pn	%ncc, 0f
1171	add	%i0, 64, %i0
1172	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1173	FALIGN_D26
1174	ldda	[%l7]ASI_BLK_P, %d16
1175	stda	%d48, [%i0]ASI_BLK_P
1176	add	%l7, 64, %l7
1177	subcc	%i3, 64, %i3
1178	bz,pn	%ncc, 1f
1179	add	%i0, 64, %i0
1180	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1181	FALIGN_D42
1182	ldda	[%l7]ASI_BLK_P, %d32
1183	stda	%d48, [%i0]ASI_BLK_P
1184	add	%l7, 64, %l7
1185	subcc	%i3, 64, %i3
1186	bz,pn	%ncc, 2f
1187	add	%i0, 64, %i0
1188	ba,a,pt	%ncc, seg5
1189
11900:
1191	FALIGN_D26
1192	stda	%d48, [%i0]ASI_BLK_P
1193	add	%i0, 64, %i0
1194	membar	#Sync
1195	FALIGN_D42
1196	stda	%d48, [%i0]ASI_BLK_P
1197	ba,pt	%ncc, blkd10
1198	add	%i0, 64, %i0
1199
12001:
1201	FALIGN_D42
1202	stda	%d48, [%i0]ASI_BLK_P
1203	add	%i0, 64, %i0
1204	membar	#Sync
1205	FALIGN_D10
1206	stda	%d48, [%i0]ASI_BLK_P
1207	ba,pt	%ncc, blkd26
1208	add	%i0, 64, %i0
1209
12102:
1211	FALIGN_D10
1212	stda	%d48, [%i0]ASI_BLK_P
1213	add	%i0, 64, %i0
1214	membar	#Sync
1215	FALIGN_D26
1216	stda	%d48, [%i0]ASI_BLK_P
1217	ba,pt	%ncc, blkd42
1218	add	%i0, 64, %i0
1219
1220seg6:
1221	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1222	FALIGN_D12
1223	ldda	[%l7]ASI_BLK_P, %d0
1224	stda	%d48, [%i0]ASI_BLK_P
1225	add	%l7, 64, %l7
1226	subcc	%i3, 64, %i3
1227	bz,pn	%ncc, 0f
1228	add	%i0, 64, %i0
1229	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1230	FALIGN_D28
1231	ldda	[%l7]ASI_BLK_P, %d16
1232	stda	%d48, [%i0]ASI_BLK_P
1233	add	%l7, 64, %l7
1234	subcc	%i3, 64, %i3
1235	bz,pn	%ncc, 1f
1236	add	%i0, 64, %i0
1237	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1238	FALIGN_D44
1239	ldda	[%l7]ASI_BLK_P, %d32
1240	stda	%d48, [%i0]ASI_BLK_P
1241	add	%l7, 64, %l7
1242	subcc	%i3, 64, %i3
1243	bz,pn	%ncc, 2f
1244	add	%i0, 64, %i0
1245	ba,a,pt	%ncc, seg6
1246
12470:
1248	FALIGN_D28
1249	stda	%d48, [%i0]ASI_BLK_P
1250	add	%i0, 64, %i0
1251	membar	#Sync
1252	FALIGN_D44
1253	stda	%d48, [%i0]ASI_BLK_P
1254	ba,pt	%ncc, blkd12
1255	add	%i0, 64, %i0
1256
12571:
1258	FALIGN_D44
1259	stda	%d48, [%i0]ASI_BLK_P
1260	add	%i0, 64, %i0
1261	membar	#Sync
1262	FALIGN_D12
1263	stda	%d48, [%i0]ASI_BLK_P
1264	ba,pt	%ncc, blkd28
1265	add	%i0, 64, %i0
1266
12672:
1268	FALIGN_D12
1269	stda	%d48, [%i0]ASI_BLK_P
1270	add	%i0, 64, %i0
1271	membar	#Sync
1272	FALIGN_D28
1273	stda	%d48, [%i0]ASI_BLK_P
1274	ba,pt	%ncc, blkd44
1275	add	%i0, 64, %i0
1276
1277seg7:
1278	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1279	FALIGN_D14
1280	ldda	[%l7]ASI_BLK_P, %d0
1281	stda	%d48, [%i0]ASI_BLK_P
1282	add	%l7, 64, %l7
1283	subcc	%i3, 64, %i3
1284	bz,pn	%ncc, 0f
1285	add	%i0, 64, %i0
1286	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1287	FALIGN_D30
1288	ldda	[%l7]ASI_BLK_P, %d16
1289	stda	%d48, [%i0]ASI_BLK_P
1290	add	%l7, 64, %l7
1291	subcc	%i3, 64, %i3
1292	bz,pn	%ncc, 1f
1293	add	%i0, 64, %i0
1294	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1295	FALIGN_D46
1296	ldda	[%l7]ASI_BLK_P, %d32
1297	stda	%d48, [%i0]ASI_BLK_P
1298	add	%l7, 64, %l7
1299	subcc	%i3, 64, %i3
1300	bz,pn	%ncc, 2f
1301	add	%i0, 64, %i0
1302	ba,a,pt	%ncc, seg7
1303
13040:
1305	FALIGN_D30
1306	stda	%d48, [%i0]ASI_BLK_P
1307	add	%i0, 64, %i0
1308	membar	#Sync
1309	FALIGN_D46
1310	stda	%d48, [%i0]ASI_BLK_P
1311	ba,pt	%ncc, blkd14
1312	add	%i0, 64, %i0
1313
13141:
1315	FALIGN_D46
1316	stda	%d48, [%i0]ASI_BLK_P
1317	add	%i0, 64, %i0
1318	membar	#Sync
1319	FALIGN_D14
1320	stda	%d48, [%i0]ASI_BLK_P
1321	ba,pt	%ncc, blkd30
1322	add	%i0, 64, %i0
1323
13242:
1325	FALIGN_D14
1326	stda	%d48, [%i0]ASI_BLK_P
1327	add	%i0, 64, %i0
1328	membar	#Sync
1329	FALIGN_D30
1330	stda	%d48, [%i0]ASI_BLK_P
1331	ba,pt	%ncc, blkd46
1332	add	%i0, 64, %i0
1333
1334
1335	!
1336	! dribble out the last partial block
1337	!
1338blkd0:
1339	subcc	%i4, 8, %i4
1340	blu,pn	%ncc, blkdone
1341	faligndata %d0, %d2, %d48
1342	std	%d48, [%i0]
1343	add	%i0, 8, %i0
1344blkd2:
1345	subcc	%i4, 8, %i4
1346	blu,pn	%ncc, blkdone
1347	faligndata %d2, %d4, %d48
1348	std	%d48, [%i0]
1349	add	%i0, 8, %i0
1350blkd4:
1351	subcc	%i4, 8, %i4
1352	blu,pn	%ncc, blkdone
1353	faligndata %d4, %d6, %d48
1354	std	%d48, [%i0]
1355	add	%i0, 8, %i0
1356blkd6:
1357	subcc	%i4, 8, %i4
1358	blu,pn	%ncc, blkdone
1359	faligndata %d6, %d8, %d48
1360	std	%d48, [%i0]
1361	add	%i0, 8, %i0
1362blkd8:
1363	subcc	%i4, 8, %i4
1364	blu,pn	%ncc, blkdone
1365	faligndata %d8, %d10, %d48
1366	std	%d48, [%i0]
1367	add	%i0, 8, %i0
1368blkd10:
1369	subcc	%i4, 8, %i4
1370	blu,pn	%ncc, blkdone
1371	faligndata %d10, %d12, %d48
1372	std	%d48, [%i0]
1373	add	%i0, 8, %i0
1374blkd12:
1375	subcc	%i4, 8, %i4
1376	blu,pn	%ncc, blkdone
1377	faligndata %d12, %d14, %d48
1378	std	%d48, [%i0]
1379	add	%i0, 8, %i0
1380blkd14:
1381	subcc	%i4, 8, %i4
1382	blu,pn	%ncc, blkdone
1383	fsrc1	%d14, %d0
1384	ba,a,pt	%ncc, blkleft
1385
1386blkd16:
1387	subcc	%i4, 8, %i4
1388	blu,pn	%ncc, blkdone
1389	faligndata %d16, %d18, %d48
1390	std	%d48, [%i0]
1391	add	%i0, 8, %i0
1392blkd18:
1393	subcc	%i4, 8, %i4
1394	blu,pn	%ncc, blkdone
1395	faligndata %d18, %d20, %d48
1396	std	%d48, [%i0]
1397	add	%i0, 8, %i0
1398blkd20:
1399	subcc	%i4, 8, %i4
1400	blu,pn	%ncc, blkdone
1401	faligndata %d20, %d22, %d48
1402	std	%d48, [%i0]
1403	add	%i0, 8, %i0
1404blkd22:
1405	subcc	%i4, 8, %i4
1406	blu,pn	%ncc, blkdone
1407	faligndata %d22, %d24, %d48
1408	std	%d48, [%i0]
1409	add	%i0, 8, %i0
1410blkd24:
1411	subcc	%i4, 8, %i4
1412	blu,pn	%ncc, blkdone
1413	faligndata %d24, %d26, %d48
1414	std	%d48, [%i0]
1415	add	%i0, 8, %i0
1416blkd26:
1417	subcc	%i4, 8, %i4
1418	blu,pn	%ncc, blkdone
1419	faligndata %d26, %d28, %d48
1420	std	%d48, [%i0]
1421	add	%i0, 8, %i0
1422blkd28:
1423	subcc	%i4, 8, %i4
1424	blu,pn	%ncc, blkdone
1425	faligndata %d28, %d30, %d48
1426	std	%d48, [%i0]
1427	add	%i0, 8, %i0
1428blkd30:
1429	subcc	%i4, 8, %i4
1430	blu,pn	%ncc, blkdone
1431	fsrc1	%d30, %d0
1432	ba,a,pt	%ncc, blkleft
1433blkd32:
1434	subcc	%i4, 8, %i4
1435	blu,pn	%ncc, blkdone
1436	faligndata %d32, %d34, %d48
1437	std	%d48, [%i0]
1438	add	%i0, 8, %i0
1439blkd34:
1440	subcc	%i4, 8, %i4
1441	blu,pn	%ncc, blkdone
1442	faligndata %d34, %d36, %d48
1443	std	%d48, [%i0]
1444	add	%i0, 8, %i0
1445blkd36:
1446	subcc	%i4, 8, %i4
1447	blu,pn	%ncc, blkdone
1448	faligndata %d36, %d38, %d48
1449	std	%d48, [%i0]
1450	add	%i0, 8, %i0
1451blkd38:
1452	subcc	%i4, 8, %i4
1453	blu,pn	%ncc, blkdone
1454	faligndata %d38, %d40, %d48
1455	std	%d48, [%i0]
1456	add	%i0, 8, %i0
1457blkd40:
1458	subcc	%i4, 8, %i4
1459	blu,pn	%ncc, blkdone
1460	faligndata %d40, %d42, %d48
1461	std	%d48, [%i0]
1462	add	%i0, 8, %i0
1463blkd42:
1464	subcc	%i4, 8, %i4
1465	blu,pn	%ncc, blkdone
1466	faligndata %d42, %d44, %d48
1467	std	%d48, [%i0]
1468	add	%i0, 8, %i0
1469blkd44:
1470	subcc	%i4, 8, %i4
1471	blu,pn	%ncc, blkdone
1472	faligndata %d44, %d46, %d48
1473	std	%d48, [%i0]
1474	add	%i0, 8, %i0
1475blkd46:
1476	subcc	%i4, 8, %i4
1477	blu,pn	%ncc, blkdone
1478	fsrc1	%d46, %d0
1479
1480blkleft:
14811:
1482	ldd	[%l7], %d2
1483	add	%l7, 8, %l7
1484	subcc	%i4, 8, %i4
1485	faligndata %d0, %d2, %d8
1486	std	%d8, [%i0]
1487	blu,pn	%ncc, blkdone
1488	add	%i0, 8, %i0
1489	ldd	[%l7], %d0
1490	add	%l7, 8, %l7
1491	subcc	%i4, 8, %i4
1492	faligndata %d2, %d0, %d8
1493	std	%d8, [%i0]
1494	bgeu,pt	%ncc, 1b
1495	add	%i0, 8, %i0
1496
1497blkdone:
1498	tst	%i2
1499	bz,pt	%ncc, .bcb_exit
1500	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0
1501
15027:	ldub	[%i1], %i4
1503	inc	%i1
1504	inc	%i0
1505	deccc	%i2
1506	bgu,pt	%ncc, 7b
1507	  stb	  %i4, [%i0 - 1]
1508
1509.bcb_exit:
1510	membar	#StoreLoad|#StoreStore
1511	btst	FPUSED_FLAG, %l6
1512	bz	%icc, 1f
1513	  and	%l6, COPY_FLAGS, %l1	! Store flags in %l1
1514					! We can't clear the flags from %l6 yet.
1515					! If there's an error, .copyerr will
1516					! need them
1517
1518	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
1519	wr	%o2, 0, %gsr
1520
1521	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1522	btst	FPRS_FEF, %o3
1523	bz	%icc, 4f
1524	  nop
1525
1526	! restore fpregs from stack
1527	membar	#Sync
1528	add	%fp, STACK_BIAS - 257, %o2
1529	and	%o2, -64, %o2
1530	ldda	[%o2]ASI_BLK_P, %d0
1531	add	%o2, 64, %o2
1532	ldda	[%o2]ASI_BLK_P, %d16
1533	add	%o2, 64, %o2
1534	ldda	[%o2]ASI_BLK_P, %d32
1535	add	%o2, 64, %o2
1536	ldda	[%o2]ASI_BLK_P, %d48
1537	membar	#Sync
1538
1539	ba,pt	%ncc, 2f
1540	  wr	%o3, 0, %fprs		! restore fprs
1541
15424:
1543	FZERO				! zero all of the fpregs
1544	wr	%o3, 0, %fprs		! restore fprs
1545
15462:	ldn	[THREAD_REG + T_LWP], %o2
1547	tst	%o2
1548	bnz,pt	%ncc, 1f
1549	  nop
1550
1551	ldsb	[THREAD_REG + T_PREEMPT], %l0
1552	deccc	%l0
1553	bnz,pn	%ncc, 1f
1554	  stb	%l0, [THREAD_REG + T_PREEMPT]
1555
1556	! Check for a kernel preemption request
1557	ldn	[THREAD_REG + T_CPU], %l0
1558	ldub	[%l0 + CPU_KPRUNRUN], %l0
1559	tst	%l0
1560	bnz,a,pt	%ncc, 1f	! Need to call kpreempt?
1561	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
1562
15631:
1564	btst	BCOPY_FLAG, %l1
1565	bz,pn	%icc, 3f
1566	  andncc	%l6, COPY_FLAGS, %l6
1567
1568	!
1569	! Here via bcopy. Check to see if the handler was NULL.
1570	! If so, just return quietly. Otherwise, reset the
1571	! handler and go home.
1572	!
1573	bnz,pn	%ncc, 3f
1574	  nop
1575
1576	!
1577	! Null handler.  Check for kpreempt flag, call if necessary,
1578	! then return.
1579	!
1580	btst	KPREEMPT_FLAG, %l1
1581	bz,pt	%icc, 2f
1582	  nop
1583	call	kpreempt
1584	  rdpr	%pil, %o0	! pass %pil
15852:
1586	ret
1587	  restore	%g0, 0, %o0
1588
1589	!
1590	! Here via kcopy or bcopy with a handler.Reset the
1591	! fault handler.
1592	!
15933:
1594	membar	#Sync
1595	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1596
1597	! call kpreempt if necessary
1598	btst	KPREEMPT_FLAG, %l1
1599	bz,pt	%icc, 4f
1600	  nop
1601	call	kpreempt
1602	  rdpr	%pil, %o0
16034:
1604	ret
1605	  restore	%g0, 0, %o0
1606
1607.bcb_punt:
1608	!
1609	! use aligned transfers where possible
1610	!
1611	xor	%i0, %i1, %o4		! xor from and to address
1612	btst	7, %o4			! if lower three bits zero
1613	bz	%icc, .aldoubcp		! can align on double boundary
1614	.empty	! assembler complaints about label
1615
1616	xor	%i0, %i1, %o4		! xor from and to address
1617	btst	3, %o4			! if lower two bits zero
1618	bz	%icc, .alwordcp		! can align on word boundary
1619	btst	3, %i0			! delay slot, from address unaligned?
1620	!
1621	! use aligned reads and writes where possible
1622	! this differs from wordcp in that it copes
1623	! with odd alignment between source and destination
1624	! using word reads and writes with the proper shifts
1625	! in between to align transfers to and from memory
1626	! i0 - src address, i1 - dest address, i2 - count
1627	! i3, i4 - tmps for used generating complete word
1628	! i5 (word to write)
1629	! l0 size in bits of upper part of source word (US)
1630	! l1 size in bits of lower part of source word (LS = 32 - US)
1631	! l2 size in bits of upper part of destination word (UD)
1632	! l3 size in bits of lower part of destination word (LD = 32 - UD)
1633	! l4 number of bytes leftover after aligned transfers complete
1634	! l5 the number 32
1635	!
1636	mov	32, %l5			! load an oft-needed constant
1637	bz	.align_dst_only
1638	btst	3, %i1			! is destnation address aligned?
1639	clr	%i4			! clear registers used in either case
1640	bz	%icc, .align_src_only
1641	clr	%l0
1642	!
1643	! both source and destination addresses are unaligned
1644	!
16451:					! align source
1646	ldub	[%i0], %i3		! read a byte from source address
1647	add	%i0, 1, %i0		! increment source address
1648	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1649	btst	3, %i0			! is source aligned?
1650	add	%l0, 8, %l0		! increment size of upper source (US)
1651	bnz,a	1b
1652	sll	%i4, 8, %i4		! make room for next byte
1653
1654	sub	%l5, %l0, %l1		! generate shift left count (LS)
1655	sll	%i4, %l1, %i4		! prepare to get rest
1656	ld	[%i0], %i3		! read a word
1657	add	%i0, 4, %i0		! increment source address
1658	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
1659	or	%i4, %i5, %i5		! merge
1660	mov	24, %l3			! align destination
16611:
1662	srl	%i5, %l3, %i4		! prepare to write a single byte
1663	stb	%i4, [%i1]		! write a byte
1664	add	%i1, 1, %i1		! increment destination address
1665	sub	%i2, 1, %i2		! decrement count
1666	btst	3, %i1			! is destination aligned?
1667	bnz,a	1b
1668	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
1669	sub	%l5, %l3, %l2		! generate shift left count (UD)
1670	sll	%i5, %l2, %i5		! move leftover into upper bytes
1671	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
1672	bgu	%ncc, .more_needed	! need more to fill than we have
1673	nop
1674
1675	sll	%i3, %l1, %i3		! clear upper used byte(s)
1676	srl	%i3, %l1, %i3
1677	! get the odd bytes between alignments
1678	sub	%l0, %l2, %l0		! regenerate shift count
1679	sub	%l5, %l0, %l1		! generate new shift left count (LS)
1680	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1681	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
1682	srl	%i3, %l0, %i4
1683	or	%i5, %i4, %i5
1684	st	%i5, [%i1]		! write a word
1685	subcc	%i2, 4, %i2		! decrement count
1686	bz	%ncc, .unalign_out
1687	add	%i1, 4, %i1		! increment destination address
1688
1689	b	2f
1690	sll	%i3, %l1, %i5		! get leftover into upper bits
1691.more_needed:
1692	sll	%i3, %l0, %i3		! save remaining byte(s)
1693	srl	%i3, %l0, %i3
1694	sub	%l2, %l0, %l1		! regenerate shift count
1695	sub	%l5, %l1, %l0		! generate new shift left count
1696	sll	%i3, %l1, %i4		! move to fill empty space
1697	b	3f
1698	or	%i5, %i4, %i5		! merge to complete word
1699	!
1700	! the source address is aligned and destination is not
1701	!
1702.align_dst_only:
1703	ld	[%i0], %i4		! read a word
1704	add	%i0, 4, %i0		! increment source address
1705	mov	24, %l0			! initial shift alignment count
17061:
1707	srl	%i4, %l0, %i3		! prepare to write a single byte
1708	stb	%i3, [%i1]		! write a byte
1709	add	%i1, 1, %i1		! increment destination address
1710	sub	%i2, 1, %i2		! decrement count
1711	btst	3, %i1			! is destination aligned?
1712	bnz,a	1b
1713	sub	%l0, 8, %l0		! delay slot, decrement shift count
1714.xfer:
1715	sub	%l5, %l0, %l1		! generate shift left count
1716	sll	%i4, %l1, %i5		! get leftover
17173:
1718	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1719	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
17202:
1721	ld	[%i0], %i3		! read a source word
1722	add	%i0, 4, %i0		! increment source address
1723	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
1724	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
1725	st	%i5, [%i1]		! write a destination word
1726	subcc	%i2, 4, %i2		! decrement count
1727	bz	%ncc, .unalign_out	! check if done
1728	add	%i1, 4, %i1		! increment destination address
1729	b	2b			! loop
1730	sll	%i3, %l1, %i5		! get leftover
1731.unalign_out:
1732	tst	%l4			! any bytes leftover?
1733	bz	%ncc, .cpdone
1734	.empty				! allow next instruction in delay slot
17351:
1736	sub	%l0, 8, %l0		! decrement shift
1737	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
1738	stb	%i4, [%i1]		! write a byte
1739	subcc	%l4, 1, %l4		! decrement count
1740	bz	%ncc, .cpdone		! done?
1741	add	%i1, 1, %i1		! increment destination
1742	tst	%l0			! any more previously read bytes
1743	bnz	%ncc, 1b		! we have leftover bytes
1744	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
1745	b	.dbytecp		! let dbytecp do the rest
1746	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1747	!
1748	! the destination address is aligned and the source is not
1749	!
1750.align_src_only:
1751	ldub	[%i0], %i3		! read a byte from source address
1752	add	%i0, 1, %i0		! increment source address
1753	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1754	btst	3, %i0			! is source aligned?
1755	add	%l0, 8, %l0		! increment shift count (US)
1756	bnz,a	.align_src_only
1757	sll	%i4, 8, %i4		! make room for next byte
1758	b,a	.xfer
1759	!
1760	! if from address unaligned for double-word moves,
1761	! move bytes till it is, if count is < 56 it could take
1762	! longer to align the thing than to do the transfer
1763	! in word size chunks right away
1764	!
1765.aldoubcp:
1766	cmp	%i2, 56			! if count < 56, use wordcp, it takes
1767	blu,a	%ncc, .alwordcp		! longer to align doubles than words
1768	mov	3, %o0			! mask for word alignment
1769	call	.alignit		! copy bytes until aligned
1770	mov	7, %o0			! mask for double alignment
1771	!
1772	! source and destination are now double-word aligned
1773	! i3 has aligned count returned by alignit
1774	!
1775	and	%i2, 7, %i2		! unaligned leftover count
1776	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
17775:
1778	ldx	[%i0+%i1], %o4		! read from address
1779	stx	%o4, [%i1]		! write at destination address
1780	subcc	%i3, 8, %i3		! dec count
1781	bgu	%ncc, 5b
1782	add	%i1, 8, %i1		! delay slot, inc to address
1783	cmp	%i2, 4			! see if we can copy a word
1784	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
1785	.empty
1786	!
1787	! for leftover bytes we fall into wordcp, if needed
1788	!
1789.wordcp:
1790	and	%i2, 3, %i2		! unaligned leftover count
17915:
1792	ld	[%i0+%i1], %o4		! read from address
1793	st	%o4, [%i1]		! write at destination address
1794	subcc	%i3, 4, %i3		! dec count
1795	bgu	%ncc, 5b
1796	add	%i1, 4, %i1		! delay slot, inc to address
1797	b,a	.dbytecp
1798
1799	! we come here to align copies on word boundaries
1800.alwordcp:
1801	call	.alignit		! go word-align it
1802	mov	3, %o0			! bits that must be zero to be aligned
1803	b	.wordcp
1804	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1805
1806	!
1807	! byte copy, works with any alignment
1808	!
1809.bytecp:
1810	b	.dbytecp
1811	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
1812
1813	!
1814	! differenced byte copy, works with any alignment
1815	! assumes dest in %i1 and (source - dest) in %i0
1816	!
18171:
1818	stb	%o4, [%i1]		! write to address
1819	inc	%i1			! inc to address
1820.dbytecp:
1821	deccc	%i2			! dec count
1822	bgeu,a	%ncc, 1b		! loop till done
1823	ldub	[%i0+%i1], %o4		! read from address
1824	!
1825	! FPUSED_FLAG will not have been set in any path leading to
1826	! this point. No need to deal with it.
1827	!
1828.cpdone:
1829	btst	BCOPY_FLAG, %l6
1830	bz,pn	%icc, 2f
1831	andncc	%l6, BCOPY_FLAG, %l6
1832	!
1833	! Here via bcopy. Check to see if the handler was NULL.
1834	! If so, just return quietly. Otherwise, reset the
1835	! handler and go home.
1836	!
1837	bnz,pn	%ncc, 2f
1838	nop
1839	!
1840	! Null handler.
1841	!
1842	ret
1843	restore %g0, 0, %o0
1844	!
1845	! Here via kcopy or bcopy with a handler.Reset the
1846	! fault handler.
1847	!
18482:
1849  	membar	#Sync
1850	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1851	ret
1852	restore	%g0, 0, %o0		! return (0)
1853
1854/*
1855 * Common code used to align transfers on word and doubleword
1856 * boundaries.  Aligns source and destination and returns a count
1857 * of aligned bytes to transfer in %i3
1858 */
18591:
1860	inc	%i0			! inc from
1861	stb	%o4, [%i1]		! write a byte
1862	inc	%i1			! inc to
1863	dec	%i2			! dec count
1864.alignit:
1865	btst	%o0, %i0		! %o0 is bit mask to check for alignment
1866	bnz,a	1b
1867	ldub	[%i0], %o4		! read next byte
1868
1869	retl
1870	andn	%i2, %o0, %i3		! return size of aligned bytes
1871	SET_SIZE(bcopy)
1872
1873#endif	/* lint */
1874
1875/*
1876 * Block copy with possibly overlapped operands.
1877 */
1878
1879#if defined(lint)
1880
1881/*ARGSUSED*/
1882void
1883ovbcopy(const void *from, void *to, size_t count)
1884{}
1885
1886#else	/* lint */
1887
1888	ENTRY(ovbcopy)
	!
	! Leaf routine (no save/restore): %o0 = from, %o1 = to, %o2 = count.
	! %o3 is the only scratch register used here.
	!
	! If count is zero, return at once.  If count <= abs(from - to) the
	! code branches to bcopy with %o0-%o2 untouched.  Otherwise the copy
	! is done one byte at a time: forwards when from >= to, backwards
	! when from < to, so no source byte is overwritten before it is read.
	!
1889	tst	%o2			! check count
1890	bgu,a	%ncc, 1f		! count != 0: go on (annulled delay slot runs only if taken)
1891	subcc	%o0, %o1, %o3		! difference of from and to address
1892
1893	retl				! count == 0: nothing to do, return
1894	nop
18951:
1896	bneg,a	%ncc, 2f		! tests the sign of the subcc result above
1897	neg	%o3			! if < 0, make it positive
18982:	cmp	%o2, %o3		! cmp size and abs(from - to)
1899	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1900	.empty				!   no overlap
1901	cmp	%o0, %o1		! compare from and to addresses (also the bleu delay slot; only sets cc)
1902	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1903	nop
1904	!
1905	! Copy forwards, advancing both pointers one byte per iteration.
1906	!
1907.ov_fwd:
1908	ldub	[%o0], %o3		! read from address
1909	inc	%o0			! inc from address
1910	stb	%o3, [%o1]		! write to address
1911	deccc	%o2			! dec count
1912	bgu	%ncc, .ov_fwd		! loop till done
1913	inc	%o1			! inc to address (delay slot, runs every pass)
1914
1915	retl				! return
1916	nop
1917	!
1918	! Copy backwards.  The pointers stay fixed; the decremented count
1919	! in %o2 is used as the byte index, so the last byte moves first.
1920.ov_bkwd:
1921	deccc	%o2			! dec count
1922	ldub	[%o0 + %o2], %o3	! get byte at end of src
1923	bgu	%ncc, .ov_bkwd		! loop till done
1924	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1925
1926	retl				! return
1927	nop
1928	SET_SIZE(ovbcopy)
1929
1930#endif	/* lint */
1931
1932/*
1933 * hwblkpagecopy()
1934 *
1935 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1936 * has already disabled kernel preemption and has checked
1937 * use_hw_bcopy.
1938 */
1939#ifdef lint
1940/*ARGSUSED*/
1941void
1942hwblkpagecopy(const void *src, void *dst)
1943{ }
1944#else /* lint */
1945	ENTRY(hwblkpagecopy)
	!
	! Copies PAGESIZE bytes from %i0 to %i1 using 64-byte block loads
	! and stores (ASI_BLK_P) through %d0-%d46.  If the FPU was enabled
	! on entry (FPRS_FEF set in %fprs), %d0-%d46 are first saved in this
	! frame and reloaded before returning; %fprs is restored either way.
	!
	! The copy loop is software pipelined: while one block is staged in
	! %d32-%d46 and stored, the next block is already being loaded into
	! %d0-%d14 or %d16-%d30.  The exit test is made only after the
	! loads into %d16, so the loop ends only when PAGESIZE/64 is even.
	! NOTE(review): PAGESIZE is defined outside this file - confirm.
	!
1946	! get another window w/space for three aligned blocks of saved fpregs
1947	save	%sp, -SA(MINFRAME + 4*64), %sp
1948
1949	! %i0 - source address (arg)
1950	! %i1 - destination address (arg)
1951	! %i2 - length of region (not arg)
1952	! %l0 - saved fprs
1953	! %l1 - pointer to saved fpregs
	! %l3 - scratch pointer stepping through the save area
1954
1955	rd	%fprs, %l0		! check for unused fp
1956	btst	FPRS_FEF, %l0
1957	bz	1f			! FPU not enabled: no registers to save
1958	membar	#Sync			! delay slot, runs on both paths
1959
1960	! save in-use fpregs on stack
1961	add	%fp, STACK_BIAS - 193, %l1
1962	and	%l1, -64, %l1		! round down to a 64-byte boundary
1963	stda	%d0, [%l1]ASI_BLK_P
1964	add	%l1, 64, %l3
1965	stda	%d16, [%l3]ASI_BLK_P
1966	add	%l3, 64, %l3
1967	stda	%d32, [%l3]ASI_BLK_P
1968	membar	#Sync
1969
19701:	wr	%g0, FPRS_FEF, %fprs	! %fprs = FPRS_FEF only
1971	ldda	[%i0]ASI_BLK_P, %d0	! first source block
1972	add	%i0, 64, %i0
1973	set	PAGESIZE - 64, %i2	! bytes still to be loaded
1974
19752:	ldda	[%i0]ASI_BLK_P, %d16	! start loading the next block
1976	fsrc1	%d0, %d32		! stage %d0-%d14 in %d32-%d46
1977	fsrc1	%d2, %d34
1978	fsrc1	%d4, %d36
1979	fsrc1	%d6, %d38
1980	fsrc1	%d8, %d40
1981	fsrc1	%d10, %d42
1982	fsrc1	%d12, %d44
1983	fsrc1	%d14, %d46
1984	stda	%d32, [%i1]ASI_BLK_P	! store the staged block
1985	add	%i0, 64, %i0
1986	subcc	%i2, 64, %i2
1987	bz,pn	%ncc, 3f		! that was the last load: finish at 3
1988	add	%i1, 64, %i1		! delay slot, runs on both paths
1989	ldda	[%i0]ASI_BLK_P, %d0	! start loading the next block
1990	fsrc1	%d16, %d32		! stage %d16-%d30 in %d32-%d46
1991	fsrc1	%d18, %d34
1992	fsrc1	%d20, %d36
1993	fsrc1	%d22, %d38
1994	fsrc1	%d24, %d40
1995	fsrc1	%d26, %d42
1996	fsrc1	%d28, %d44
1997	fsrc1	%d30, %d46
1998	stda	%d32, [%i1]ASI_BLK_P	! store the staged block
1999	add	%i0, 64, %i0
2000	sub	%i2, 64, %i2		! no exit test in this half of the loop
2001	ba,pt	%ncc, 2b
2002	add	%i1, 64, %i1		! delay slot
2003
20043:	membar	#Sync
2005	btst	FPRS_FEF, %l0		! was the FPU enabled on entry?
2006	bz	4f
2007	stda	%d16, [%i1]ASI_BLK_P	! delay slot (always runs): store the final block
2008
2009	! restore fpregs from stack
2010	membar	#Sync
2011	ldda	[%l1]ASI_BLK_P, %d0
2012	add	%l1, 64, %l3
2013	ldda	[%l3]ASI_BLK_P, %d16
2014	add	%l3, 64, %l3
2015	ldda	[%l3]ASI_BLK_P, %d32
2016
20174:	wr	%l0, 0, %fprs		! restore fprs
2018	membar #Sync
2019	ret
2020	restore	%g0, 0, %o0		! delay slot: pop window, 0 in caller %o0
2021	SET_SIZE(hwblkpagecopy)
2022#endif	/* lint */
2023
2024
2025/*
2026 * Transfer data to and from user space -
2027 * Note that these routines can cause faults
2028 * It is assumed that the kernel has nothing at
2029 * less than KERNELBASE in the virtual address space.
2030 *
2031 * Note that copyin(9F) and copyout(9F) are part of the
2032 * DDI/DKI which specifies that they return '-1' on "errors."
2033 *
2034 * Sigh.
2035 *
2036 * So there's two extremely similar routines - xcopyin() and xcopyout()
2037 * which return the errno that we've faithfully computed.  This
2038 * allows other callers (e.g. uiomove(9F)) to work correctly.
2039 * Given that these are used pretty heavily, we expand the calling
2040 * sequences inline for all flavours (rather than making wrappers).
2041 *
2042 * There are also stub routines for xcopyout_little and xcopyin_little,
2043 * which currently are intended to handle requests of <= 16 bytes from
2044 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2045 * is left as an exercise...
2046 */
2047
2048/*
2049 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2050 *
2051 * General theory of operation:
2052 *
2053 * The only difference between default_copy{in,out} and
2054 * default_xcopy{in,out} is in the error handling routine they invoke
2055 * when a memory access error is seen. default_xcopyOP returns the errno
2056 * while default_copyOP returns -1 (see above). copy{in,out}_noerr set
2057 * a special flag (by oring the value 2 into the fault handler address)
2058 * if they are called with a fault handler already in place. That flag
2059 * causes the default handlers to trampoline to the previous handler
2060 * upon an error.
2061 *
2062 * None of the copyops routines grab a window until it's decided that
2063 * we need to do a HW block copy operation. This saves a window
2064 * spill/fill when we're called during socket ops. The typical IO
2065 * path won't cause spill/fill traps.
2066 *
2067 * This code uses a set of 4 limits for the maximum size that will
2068 * be copied given a particular input/output address alignment.
2069 * the default limits are:
2070 *
2071 * single byte aligned - 900 (hw_copy_limit_1)
2072 * two byte aligned - 1800 (hw_copy_limit_2)
2073 * four byte aligned - 3600 (hw_copy_limit_4)
2074 * eight byte aligned - 7200 (hw_copy_limit_8)
2075 *
2076 * If the value for a particular limit is zero, the copy will be done
2077 * via the copy loops rather than VIS.
2078 *
2079 * Flow:
2080 *
2081 * If count == zero return zero.
2082 *
2083 * Store the previous lo_fault handler into %g6.
2084 * Place our secondary lofault handler into %g5.
2085 * Place the address of our nowindow fault handler into %o3.
2086 * Place the address of the windowed fault handler into %o4.
2087 * --> We'll use this handler if we end up grabbing a window
2088 * --> before we use VIS instructions.
2089 *
2090 * If count is less than or equal to SMALL_LIMIT (7) we
2091 * always do a byte for byte copy.
2092 *
2093 * If count is > SMALL_LIMIT, we check the alignment of the input
2094 * and output pointers. Based on the alignment we check count
2095 * against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If
2096 * we're larger than VIS_COPY_THRESHOLD, we check against a limit based
2097 * on detected alignment. If we exceed the alignment value we copy
2098 * via VIS instructions.
2099 *
2100 * If we don't exceed one of the limits, we store -count in %o3,
2101 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2102 * on in our basic copy loop in %o2. Following this we branch
2103 * to the appropriate copy loop and copy that many chunks.
2104 * Since we've been adding the chunk size to %o3 each time through
2105 * as well as decrementing %o2, we can tell if any data is
2106 * left to be copied by examining %o3. If that is zero, we're
2107 * done and can go home. If not, we figure out what the largest
2108 * chunk size left to be copied is and branch to that copy loop
2109 * unless there's only one byte left. We load that as we're
2110 * branching to code that stores it just before we return.
2111 *
2112 * There is one potential situation in which we start to do a VIS
2113 * copy but decide to punt and return to the copy loops. There is
2114 * (in the default configuration) a window of 256 bytes between
2115 * the single byte aligned copy limit and what VIS treats as its
2116 * minimum if floating point is in use in the calling app. We need
2117 * to be prepared to handle this. See the .small_copyOP label for
2118 * details.
2119 *
2120 * Fault handlers are invoked if we reference memory that has no
2121 * current mapping.  All forms share the same copyio_fault handler.
2122 * This routine handles fixing up the stack and general housecleaning.
2123 * Each copy operation has a simple fault handler that is then called
2124 * to do the work specific to the individual operation.  The handlers
2125 * for default_copyOP and copyOP_noerr are found at the end of
2126 * default_copyout. The handlers for default_xcopyOP are found at the
2127 * end of xdefault_copyin.
2128 */
2129
2130/*
2131 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2132 */
2133
2134#if defined(lint)
2135
2136/*ARGSUSED*/
2137int
2138copyout(const void *kaddr, void *uaddr, size_t count)
2139{ return (0); }
2140
2141#else	/* lint */
2142
2143/*
2144 * We save the arguments in the following registers in case of a fault:
2145 * 	kaddr - %g2
2146 * 	uaddr - %g3
2147 * 	count - %g4
2148 */
2149#define	SAVE_SRC	%g2
2150#define	SAVE_DST	%g3
2151#define	SAVE_COUNT	%g4
2152
2153#define	REAL_LOFAULT		%g5
2154#define	SAVED_LOFAULT		%g6
2155
2156/*
2157 * Generic copyio fault handler.  This is the first line of defense when a
2158 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2159 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2160 * This allows us to share common code for all the flavors of the copy
2161 * operations, including the _noerr versions.
2162 *
2163 * Note that this function will restore the original input parameters before
2164 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2165 * member of the t_copyop structure, if needed.
2166 */
2167	ENTRY(copyio_fault)
	!
	! Register inputs (see the #defines above):
	!   SAVED_LOFAULT         - saved handler value; may carry FPUSED_FLAG
	!   REAL_LOFAULT          - handler that is jumped to at the end
	!   SAVE_SRC/DST/COUNT    - original arguments, copied to %o0-%o2
	!
	! If FPUSED_FLAG is set, %gsr, %fprs and (when the saved %fprs had
	! FPRS_FEF set) the fp registers are reloaded from the current frame
	! first.  The register window is then popped with restore.
	! NOTE(review): unlike copyio_fault_nowindow, nothing here writes
	! SAVED_LOFAULT back to T_LOFAULT; presumably the REAL_LOFAULT
	! handlers do that - confirm.
	!
2168	btst	FPUSED_FLAG, SAVED_LOFAULT
2169	bz	1f			! flag clear: no fp state to put back
2170	  andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT	! delay slot (both paths): strip the flag
2171
2172	membar	#Sync
2173
2174	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2175	wr	%o2, 0, %gsr		! restore gsr
2176
2177	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2178	btst	FPRS_FEF, %o3		! was FPRS_FEF set in the saved fprs?
2179	bz	4f
2180	  nop
2181
2182	! restore fpregs from stack
2183	membar	#Sync
2184	add	%fp, STACK_BIAS - 257, %o2
2185	and	%o2, -64, %o2		! round down to a 64-byte boundary
2186	ldda	[%o2]ASI_BLK_P, %d0
2187	add	%o2, 64, %o2
2188	ldda	[%o2]ASI_BLK_P, %d16
2189	add	%o2, 64, %o2
2190	ldda	[%o2]ASI_BLK_P, %d32
2191	add	%o2, 64, %o2
2192	ldda	[%o2]ASI_BLK_P, %d48
2193	membar	#Sync
2194
2195	ba,pt	%ncc, 1f
2196	  wr	%o3, 0, %fprs		! restore fprs
2197
21984:
2199	FZERO				! zero all of the fpregs
2200	wr	%o3, 0, %fprs		! restore fprs
2201
22021:
2203
2204	restore				! pop the register window
2205
2206	mov	SAVE_SRC, %o0		! hand the original arguments on
2207	mov	SAVE_DST, %o1
2208	jmp	REAL_LOFAULT
2209	  mov	SAVE_COUNT, %o2		! delay slot
2210	SET_SIZE(copyio_fault)
2211
2212	ENTRY(copyio_fault_nowindow)
2213	membar	#Sync
2214	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2215
2216	mov	SAVE_SRC, %o0
2217	mov	SAVE_DST, %o1
2218	jmp	REAL_LOFAULT
2219	  mov	SAVE_COUNT, %o2
2220	SET_SIZE(copyio_fault_nowindow)
2221
2222	ENTRY(copyout)
2223	sethi	%hi(.copyout_err), REAL_LOFAULT
2224	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2225
2226.do_copyout:
2227	!
2228	! Check the length and bail if zero.
2229	!
2230	tst	%o2
2231	bnz,pt	%ncc, 1f
2232	  nop
2233	retl
2234	  clr	%o0
22351:
2236	sethi	%hi(copyio_fault), %o4
2237	or	%o4, %lo(copyio_fault), %o4
2238	sethi	%hi(copyio_fault_nowindow), %o3
2239	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2240	or	%o3, %lo(copyio_fault_nowindow), %o3
2241	membar	#Sync
2242	stn	%o3, [THREAD_REG + T_LOFAULT]
2243
2244	mov	%o0, SAVE_SRC
2245	mov	%o1, SAVE_DST
2246	mov	%o2, SAVE_COUNT
2247
2248	!
2249	! Check to see if we're more than SMALL_LIMIT (7 bytes).
2250	! Run in leaf mode, using the %o regs as our input regs.
2251	!
2252	subcc	%o2, SMALL_LIMIT, %o3
2253	bgu,a,pt %ncc, .dco_ns
2254	or	%o0, %o1, %o3
2255	!
2256	! What was previously ".small_copyout"
2257	! Do full differenced copy.
2258	!
2259.dcobcp:
2260	sub	%g0, %o2, %o3		! negate count
2261	add	%o0, %o2, %o0		! make %o0 point at the end
2262	add	%o1, %o2, %o1		! make %o1 point at the end
2263	ba,pt	%ncc, .dcocl
2264	ldub	[%o0 + %o3], %o4	! load first byte
2265	!
2266	! %o0 and %o1 point at the end and remain pointing at the end
2267	! of their buffers. We pull things out by adding %o3 (which is
2268	! the negation of the length) to the buffer end which gives us
2269	! the current location in the buffers. By incrementing %o3 we walk
2270	! through both buffers without having to bump each buffer's
2271	! pointer. A very fast 4 instruction loop.
2272	!
2273	.align 16
2274.dcocl:
2275	stba	%o4, [%o1 + %o3]ASI_USER
2276	inccc	%o3
2277	bl,a,pt	%ncc, .dcocl
2278	ldub	[%o0 + %o3], %o4
2279	!
2280	! We're done. Go home.
2281	!
2282	membar	#Sync
2283	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2284	retl
2285	clr	%o0
2286	!
2287	! Try aligned copies from here.
2288	!
2289.dco_ns:
2290	! %o0 = kernel addr (to be copied from)
2291	! %o1 = user addr (to be copied to)
2292	! %o2 = length
2293	! %o3 = %o0 | %o1 (used for alignment checking)
2294	! %o4 is alternate lo_fault
2295	! %o5 is original lo_fault
2296	!
2297	! See if we're single byte aligned. If we are, check the
2298	! limit for single byte copies. If we're smaller or equal,
2299	! bounce to the byte for byte copy loop. Otherwise do it in
2300	! HW (if enabled).
2301	!
2302	btst	1, %o3
2303	bz,pt	%icc, .dcoh8
2304	btst	7, %o3
2305	!
2306	! Single byte aligned. Do we do it via HW or via
2307	! byte for byte? Do a quick no memory reference
2308	! check to pick up small copies.
2309	!
2310	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2311	bleu,pt	%ncc, .dcobcp
2312	sethi	%hi(hw_copy_limit_1), %o3
2313	!
2314	! Big enough that we need to check the HW limit for
2315	! this size copy.
2316	!
2317	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2318	!
2319	! Is HW copy on? If not, do everything byte for byte.
2320	!
2321	tst	%o3
2322	bz,pn	%icc, .dcobcp
2323	subcc	%o3, %o2, %o3
2324	!
2325	! If we're less than or equal to the single byte copy limit,
2326	! bop to the copy loop.
2327	!
2328	bge,pt	%ncc, .dcobcp
2329	nop
2330	!
2331	! We're big enough and copy is on. Do it with HW.
2332	!
2333	ba,pt	%ncc, .big_copyout
2334	nop
2335.dcoh8:
2336	!
2337	! 8 byte aligned?
2338	!
2339	bnz,a	%ncc, .dcoh4
2340	btst	3, %o3
2341	!
2342	! See if we're in the "small range".
2343	! If so, go off and do the copy.
2344	! If not, load the hard limit. %o3 is
2345	! available for reuse.
2346	!
2347	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2348	bleu,pt	%ncc, .dcos8
2349	sethi	%hi(hw_copy_limit_8), %o3
2350	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2351	!
2352	! If it's zero, there's no HW bcopy.
2353	! Bop off to the aligned copy.
2354	!
2355	tst	%o3
2356	bz,pn	%icc, .dcos8
2357	subcc	%o3, %o2, %o3
2358	!
2359	! We're negative if our size is larger than hw_copy_limit_8.
2360	!
2361	bge,pt	%ncc, .dcos8
2362	nop
2363	!
2364	! HW assist is on and we're large enough. Do it.
2365	!
2366	ba,pt	%ncc, .big_copyout
2367	nop
2368.dcos8:
2369	!
2370	! Housekeeping for copy loops. Uses same idea as in the byte for
2371	! byte copy loop above.
2372	!
2373	add	%o0, %o2, %o0
2374	add	%o1, %o2, %o1
2375	sub	%g0, %o2, %o3
2376	ba,pt	%ncc, .dodebc
2377	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
2378	!
2379	! 4 byte aligned?
2380	!
2381.dcoh4:
2382	bnz,pn	%ncc, .dcoh2
2383	!
2384	! See if we're in the "small range".
2385	! If so, go off and do the copy.
2386	! If not, load the hard limit. %o3 is
2387	! available for reuse.
2388	!
2389	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2390	bleu,pt	%ncc, .dcos4
2391	sethi	%hi(hw_copy_limit_4), %o3
2392	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2393	!
2394	! If it's zero, there's no HW bcopy.
2395	! Bop off to the aligned copy.
2396	!
2397	tst	%o3
2398	bz,pn	%icc, .dcos4
2399	subcc	%o3, %o2, %o3
2400	!
2401	! We're negative if our size is larger than hw_copy_limit_4.
2402	!
2403	bge,pt	%ncc, .dcos4
2404	nop
2405	!
2406	! HW assist is on and we're large enough. Do it.
2407	!
2408	ba,pt	%ncc, .big_copyout
2409	nop
2410.dcos4:
2411	add	%o0, %o2, %o0
2412	add	%o1, %o2, %o1
2413	sub	%g0, %o2, %o3
2414	ba,pt	%ncc, .dodfbc
2415	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
2416	!
2417	! We must be 2 byte aligned. Off we go.
2418	! The check for small copies was done in the
2419	! delay at .dcoh4
2420	!
2421.dcoh2:
2422	ble	%ncc, .dcos2
2423	sethi	%hi(hw_copy_limit_2), %o3
2424	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2425	tst	%o3
2426	bz,pn	%icc, .dcos2
2427	subcc	%o3, %o2, %o3
2428	bge,pt	%ncc, .dcos2
2429	nop
2430	!
2431	! HW is on and we're big enough. Do it.
2432	!
2433	ba,pt	%ncc, .big_copyout
2434	nop
2435.dcos2:
2436	add	%o0, %o2, %o0
2437	add	%o1, %o2, %o1
2438	sub	%g0, %o2, %o3
2439	ba,pt	%ncc, .dodtbc
2440	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
2441.small_copyout:
2442	!
2443	! Why are we doing this AGAIN? There are certain conditions in
2444	! big_copyout that will cause us to forego the HW assisted copies
2445	! and bounce back to a non-HW assisted copy. This dispatches those
2446	! copies. Note that we branch around this in the main line code.
2447	!
2448	! We make no check for limits or HW enablement here. We've
2449	! already been told that we're a poster child so just go off
2450	! and do it.
2451	!
2452	or	%o0, %o1, %o3
2453	btst	1, %o3
2454	bnz	%icc, .dcobcp		! Most likely
2455	btst	7, %o3
2456	bz	%icc, .dcos8
2457	btst	3, %o3
2458	bz	%icc, .dcos4
2459	nop
2460	ba,pt	%ncc, .dcos2
2461	nop
2462	.align 32
2463.dodebc:
2464	ldx	[%o0 + %o3], %o4
2465	deccc	%o2
2466	stxa	%o4, [%o1 + %o3]ASI_USER
2467	bg,pt	%ncc, .dodebc
2468	addcc	%o3, 8, %o3
2469	!
2470	! End of copy loop. Check to see if we're done. Most
2471	! eight byte aligned copies end here.
2472	!
2473	bz,pt	%ncc, .dcofh
2474	nop
2475	!
2476	! Something is left - do it byte for byte.
2477	!
2478	ba,pt	%ncc, .dcocl
2479	ldub	[%o0 + %o3], %o4	! load next byte
2480	!
2481	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2482	!
2483	.align 32
2484.dodfbc:
2485	lduw	[%o0 + %o3], %o4
2486	deccc	%o2
2487	sta	%o4, [%o1 + %o3]ASI_USER
2488	bg,pt	%ncc, .dodfbc
2489	addcc	%o3, 4, %o3
2490	!
2491	! End of copy loop. Check to see if we're done. Most
2492	! four byte aligned copies end here.
2493	!
2494	bz,pt	%ncc, .dcofh
2495	nop
2496	!
2497	! Something is left. Do it byte for byte.
2498	!
2499	ba,pt	%ncc, .dcocl
2500	ldub	[%o0 + %o3], %o4	! load next byte
2501	!
2502	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2503	! copy.
2504	!
2505	.align 32
2506.dodtbc:
2507	lduh	[%o0 + %o3], %o4
2508	deccc	%o2
2509	stha	%o4, [%o1 + %o3]ASI_USER
2510	bg,pt	%ncc, .dodtbc
2511	addcc	%o3, 2, %o3
2512	!
2513	! End of copy loop. Anything left?
2514	!
2515	bz,pt	%ncc, .dcofh
2516	nop
2517	!
2518	! Deal with the last byte
2519	!
2520	ldub	[%o0 + %o3], %o4
2521	stba	%o4, [%o1 + %o3]ASI_USER
2522.dcofh:
2523	membar	#Sync
2524	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2525	retl
2526	clr	%o0
2527
2528.big_copyout:
2529	!
2530	! Are we using the FP registers?
2531	!
2532	rd	%fprs, %o3			! check for unused fp
2533	btst	FPRS_FEF, %o3
2534	bnz	%icc, .copyout_fpregs_inuse
2535	nop
2536	!
2537	! We're going to go off and do a block copy.
2538	! Switch fault handlers and grab a window. We
2539	! don't do a membar #Sync since we've done only
2540	! kernel data to this point.
2541	!
2542	stn	%o4, [THREAD_REG + T_LOFAULT]
2543	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2544	!
2545	! %o3 is now %i3. Save original %fprs.
2546	!
2547	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
2548	ba,pt	%ncc, .do_block_copyout		! Not in use. Go off and do it.
2549	wr	%g0, FPRS_FEF, %fprs		! clear %fprs
2550	!
2551.copyout_fpregs_inuse:
2552	!
2553	! We're here if the FP regs are in use. Need to see if the request
2554	! exceeds our suddenly larger minimum.
2555	!
2556	cmp	%i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
2557	bl	%ncc, .small_copyout
2558	  nop
2559	!
2560	! We're going to go off and do a block copy.
2561	! Change to the heavy duty fault handler and grab a window first.
2562	!
2563	stn	%o4, [THREAD_REG + T_LOFAULT]
2564	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2565	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
2566	!
2567	! save in-use fpregs on stack
2568	!
2569	wr	%g0, FPRS_FEF, %fprs
2570	membar	#Sync
2571	add	%fp, STACK_BIAS - 257, %o2
2572	and	%o2, -64, %o2
2573	stda	%d0, [%o2]ASI_BLK_P
2574	add	%o2, 64, %o2
2575	stda	%d16, [%o2]ASI_BLK_P
2576	add	%o2, 64, %o2
2577	stda	%d32, [%o2]ASI_BLK_P
2578	add	%o2, 64, %o2
2579	stda	%d48, [%o2]ASI_BLK_P
2580	membar	#Sync
2581
2582.do_block_copyout:
2583	membar	#StoreStore|#StoreLoad|#LoadStore
2584
2585	rd	%gsr, %o2
2586	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2587
2588	! Set the lower bit in the saved t_lofault to indicate
2589	! that we need to clear the %fprs register on the way
2590	! out
2591	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2592
2593	! Swap src/dst since the code below is memcpy code
2594	! and memcpy/bcopy have different calling sequences
2595	mov	%i1, %i5
2596	mov	%i0, %i1
2597	mov	%i5, %i0
2598
2599!!! This code is nearly identical to the version in the sun4u
2600!!! libc_psr.  Most bugfixes made to that file should be
2601!!! merged into this routine.
2602
2603	andcc	%i0, 7, %o3
2604	bz	%ncc, copyout_blkcpy
2605	sub	%o3, 8, %o3
2606	neg	%o3
2607	sub	%i2, %o3, %i2
2608
2609	! Align Destination on double-word boundary
2610
26112:	ldub	[%i1], %o4
2612	inc	%i1
2613	stba	%o4, [%i0]ASI_USER
2614	deccc	%o3
2615	bgu	%ncc, 2b
2616	  inc	%i0
2617copyout_blkcpy:
2618	andcc	%i0, 63, %i3
2619	bz,pn	%ncc, copyout_blalign	! now block aligned
2620	sub	%i3, 64, %i3
2621	neg	%i3			! bytes till block aligned
2622	sub	%i2, %i3, %i2		! update %i2 with new count
2623
2624	! Copy %i3 bytes till dst is block (64 byte) aligned. use
2625	! double word copies.
2626
2627	alignaddr %i1, %g0, %g1
2628	ldd	[%g1], %d0
2629	add	%g1, 8, %g1
26306:
2631	ldd	[%g1], %d2
2632	add	%g1, 8, %g1
2633	subcc	%i3, 8, %i3
2634	faligndata %d0, %d2, %d8
2635	stda	 %d8, [%i0]ASI_USER
2636	add	%i1, 8, %i1
2637	bz,pn	%ncc, copyout_blalign
2638	add	%i0, 8, %i0
2639	ldd	[%g1], %d0
2640	add	%g1, 8, %g1
2641	subcc	%i3, 8, %i3
2642	faligndata %d2, %d0, %d8
2643	stda	 %d8, [%i0]ASI_USER
2644	add	%i1, 8, %i1
2645	bgu,pn	%ncc, 6b
2646	add	%i0, 8, %i0
2647
2648copyout_blalign:
2649	membar	#StoreLoad
2650	! %i2 = total length
2651	! %i3 = blocks	(length - 64) / 64
2652	! %i4 = doubles remaining  (length - blocks)
2653	sub	%i2, 64, %i3
2654	andn	%i3, 63, %i3
2655	sub	%i2, %i3, %i4
2656	andn	%i4, 7, %i4
2657	sub	%i4, 16, %i4
2658	sub	%i2, %i4, %i2
2659	sub	%i2, %i3, %i2
2660
2661	andn	%i1, 0x3f, %l7		! blk aligned address
2662	alignaddr %i1, %g0, %g0		! gen %gsr
2663
2664	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
2665	andcc	%l5, 7, %i5		! mask everything except bits 1,2 3
2666	add	%i1, %i4, %i1
2667	add	%i1, %i3, %i1
2668
2669	ldda	[%l7]ASI_BLK_P, %d0
2670	add	%l7, 64, %l7
2671	ldda	[%l7]ASI_BLK_P, %d16
2672	add	%l7, 64, %l7
2673	ldda	[%l7]ASI_BLK_P, %d32
2674	add	%l7, 64, %l7
2675	sub	%i3, 128, %i3
2676
2677	! switch statement to get us to the right 8 byte blk within a
2678	! 64 byte block
2679
2680	cmp	 %i5, 4
2681	bgeu,a	 copyout_hlf
2682	cmp	 %i5, 6
2683	cmp	 %i5, 2
2684	bgeu,a	 copyout_sqtr
2685	nop
2686	cmp	 %i5, 1
2687	be,a	 copyout_seg1
2688	nop
2689	ba,pt	 %ncc, copyout_seg0
2690	nop
2691copyout_sqtr:
2692	be,a	 copyout_seg2
2693	nop
2694	ba,pt	 %ncc, copyout_seg3
2695	nop
2696
2697copyout_hlf:
2698	bgeu,a	 copyout_fqtr
2699	nop
2700	cmp	 %i5, 5
2701	be,a	 copyout_seg5
2702	nop
2703	ba,pt	 %ncc, copyout_seg4
2704	nop
2705copyout_fqtr:
2706	be,a	 copyout_seg6
2707	nop
2708	ba,pt	 %ncc, copyout_seg7
2709	nop
2710
2711copyout_seg0:
2712	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2713	FALIGN_D0
2714	ldda	[%l7]ASI_BLK_P, %d0
2715	stda	%d48, [%i0]ASI_BLK_AIUS
2716	add	%l7, 64, %l7
2717	subcc	%i3, 64, %i3
2718	bz,pn	%ncc, 0f
2719	add	%i0, 64, %i0
2720	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2721	FALIGN_D16
2722	ldda	[%l7]ASI_BLK_P, %d16
2723	stda	%d48, [%i0]ASI_BLK_AIUS
2724	add	%l7, 64, %l7
2725	subcc	%i3, 64, %i3
2726	bz,pn	%ncc, 1f
2727	add	%i0, 64, %i0
2728	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2729	FALIGN_D32
2730	ldda	[%l7]ASI_BLK_P, %d32
2731	stda	%d48, [%i0]ASI_BLK_AIUS
2732	add	%l7, 64, %l7
2733	subcc	%i3, 64, %i3
2734	bz,pn	%ncc, 2f
2735	add	%i0, 64, %i0
2736	ba,a,pt	%ncc, copyout_seg0
2737
27380:
2739	FALIGN_D16
2740	stda	%d48, [%i0]ASI_BLK_AIUS
2741	add	%i0, 64, %i0
2742	membar	#Sync
2743	FALIGN_D32
2744	stda	%d48, [%i0]ASI_BLK_AIUS
2745	ba,pt	%ncc, copyout_blkd0
2746	add	%i0, 64, %i0
2747
27481:
2749	FALIGN_D32
2750	stda	%d48, [%i0]ASI_BLK_AIUS
2751	add	%i0, 64, %i0
2752	membar	#Sync
2753	FALIGN_D0
2754	stda	%d48, [%i0]ASI_BLK_AIUS
2755	ba,pt	%ncc, copyout_blkd16
2756	add	%i0, 64, %i0
2757
27582:
2759	FALIGN_D0
2760	stda	%d48, [%i0]ASI_BLK_AIUS
2761	add	%i0, 64, %i0
2762	membar	#Sync
2763	FALIGN_D16
2764	stda	%d48, [%i0]ASI_BLK_AIUS
2765	ba,pt	%ncc, copyout_blkd32
2766	add	%i0, 64, %i0
2767
2768copyout_seg1:
2769	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2770	FALIGN_D2
2771	ldda	[%l7]ASI_BLK_P, %d0
2772	stda	%d48, [%i0]ASI_BLK_AIUS
2773	add	%l7, 64, %l7
2774	subcc	%i3, 64, %i3
2775	bz,pn	%ncc, 0f
2776	add	%i0, 64, %i0
2777	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2778	FALIGN_D18
2779	ldda	[%l7]ASI_BLK_P, %d16
2780	stda	%d48, [%i0]ASI_BLK_AIUS
2781	add	%l7, 64, %l7
2782	subcc	%i3, 64, %i3
2783	bz,pn	%ncc, 1f
2784	add	%i0, 64, %i0
2785	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2786	FALIGN_D34
2787	ldda	[%l7]ASI_BLK_P, %d32
2788	stda	%d48, [%i0]ASI_BLK_AIUS
2789	add	%l7, 64, %l7
2790	subcc	%i3, 64, %i3
2791	bz,pn	%ncc, 2f
2792	add	%i0, 64, %i0
2793	ba,a,pt	%ncc, copyout_seg1
27940:
2795	FALIGN_D18
2796	stda	%d48, [%i0]ASI_BLK_AIUS
2797	add	%i0, 64, %i0
2798	membar	#Sync
2799	FALIGN_D34
2800	stda	%d48, [%i0]ASI_BLK_AIUS
2801	ba,pt	%ncc, copyout_blkd2
2802	add	%i0, 64, %i0
2803
28041:
2805	FALIGN_D34
2806	stda	%d48, [%i0]ASI_BLK_AIUS
2807	add	%i0, 64, %i0
2808	membar	#Sync
2809	FALIGN_D2
2810	stda	%d48, [%i0]ASI_BLK_AIUS
2811	ba,pt	%ncc, copyout_blkd18
2812	add	%i0, 64, %i0
2813
28142:
2815	FALIGN_D2
2816	stda	%d48, [%i0]ASI_BLK_AIUS
2817	add	%i0, 64, %i0
2818	membar	#Sync
2819	FALIGN_D18
2820	stda	%d48, [%i0]ASI_BLK_AIUS
2821	ba,pt	%ncc, copyout_blkd34
2822	add	%i0, 64, %i0
2823
2824copyout_seg2:
2825	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2826	FALIGN_D4
2827	ldda	[%l7]ASI_BLK_P, %d0
2828	stda	%d48, [%i0]ASI_BLK_AIUS
2829	add	%l7, 64, %l7
2830	subcc	%i3, 64, %i3
2831	bz,pn	%ncc, 0f
2832	add	%i0, 64, %i0
2833	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2834	FALIGN_D20
2835	ldda	[%l7]ASI_BLK_P, %d16
2836	stda	%d48, [%i0]ASI_BLK_AIUS
2837	add	%l7, 64, %l7
2838	subcc	%i3, 64, %i3
2839	bz,pn	%ncc, 1f
2840	add	%i0, 64, %i0
2841	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2842	FALIGN_D36
2843	ldda	[%l7]ASI_BLK_P, %d32
2844	stda	%d48, [%i0]ASI_BLK_AIUS
2845	add	%l7, 64, %l7
2846	subcc	%i3, 64, %i3
2847	bz,pn	%ncc, 2f
2848	add	%i0, 64, %i0
2849	ba,a,pt	%ncc, copyout_seg2
2850
28510:
2852	FALIGN_D20
2853	stda	%d48, [%i0]ASI_BLK_AIUS
2854	add	%i0, 64, %i0
2855	membar	#Sync
2856	FALIGN_D36
2857	stda	%d48, [%i0]ASI_BLK_AIUS
2858	ba,pt	%ncc, copyout_blkd4
2859	add	%i0, 64, %i0
2860
28611:
2862	FALIGN_D36
2863	stda	%d48, [%i0]ASI_BLK_AIUS
2864	add	%i0, 64, %i0
2865	membar	#Sync
2866	FALIGN_D4
2867	stda	%d48, [%i0]ASI_BLK_AIUS
2868	ba,pt	%ncc, copyout_blkd20
2869	add	%i0, 64, %i0
2870
28712:
2872	FALIGN_D4
2873	stda	%d48, [%i0]ASI_BLK_AIUS
2874	add	%i0, 64, %i0
2875	membar	#Sync
2876	FALIGN_D20
2877	stda	%d48, [%i0]ASI_BLK_AIUS
2878	ba,pt	%ncc, copyout_blkd36
2879	add	%i0, 64, %i0
2880
2881copyout_seg3:
2882	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2883	FALIGN_D6
2884	ldda	[%l7]ASI_BLK_P, %d0
2885	stda	%d48, [%i0]ASI_BLK_AIUS
2886	add	%l7, 64, %l7
2887	subcc	%i3, 64, %i3
2888	bz,pn	%ncc, 0f
2889	add	%i0, 64, %i0
2890	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2891	FALIGN_D22
2892	ldda	[%l7]ASI_BLK_P, %d16
2893	stda	%d48, [%i0]ASI_BLK_AIUS
2894	add	%l7, 64, %l7
2895	subcc	%i3, 64, %i3
2896	bz,pn	%ncc, 1f
2897	add	%i0, 64, %i0
2898	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2899	FALIGN_D38
2900	ldda	[%l7]ASI_BLK_P, %d32
2901	stda	%d48, [%i0]ASI_BLK_AIUS
2902	add	%l7, 64, %l7
2903	subcc	%i3, 64, %i3
2904	bz,pn	%ncc, 2f
2905	add	%i0, 64, %i0
2906	ba,a,pt	%ncc, copyout_seg3
2907
29080:
2909	FALIGN_D22
2910	stda	%d48, [%i0]ASI_BLK_AIUS
2911	add	%i0, 64, %i0
2912	membar	#Sync
2913	FALIGN_D38
2914	stda	%d48, [%i0]ASI_BLK_AIUS
2915	ba,pt	%ncc, copyout_blkd6
2916	add	%i0, 64, %i0
2917
29181:
2919	FALIGN_D38
2920	stda	%d48, [%i0]ASI_BLK_AIUS
2921	add	%i0, 64, %i0
2922	membar	#Sync
2923	FALIGN_D6
2924	stda	%d48, [%i0]ASI_BLK_AIUS
2925	ba,pt	%ncc, copyout_blkd22
2926	add	%i0, 64, %i0
2927
29282:
2929	FALIGN_D6
2930	stda	%d48, [%i0]ASI_BLK_AIUS
2931	add	%i0, 64, %i0
2932	membar	#Sync
2933	FALIGN_D22
2934	stda	%d48, [%i0]ASI_BLK_AIUS
2935	ba,pt	%ncc, copyout_blkd38
2936	add	%i0, 64, %i0
2937
2938copyout_seg4:
2939	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2940	FALIGN_D8
2941	ldda	[%l7]ASI_BLK_P, %d0
2942	stda	%d48, [%i0]ASI_BLK_AIUS
2943	add	%l7, 64, %l7
2944	subcc	%i3, 64, %i3
2945	bz,pn	%ncc, 0f
2946	add	%i0, 64, %i0
2947	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2948	FALIGN_D24
2949	ldda	[%l7]ASI_BLK_P, %d16
2950	stda	%d48, [%i0]ASI_BLK_AIUS
2951	add	%l7, 64, %l7
2952	subcc	%i3, 64, %i3
2953	bz,pn	%ncc, 1f
2954	add	%i0, 64, %i0
2955	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2956	FALIGN_D40
2957	ldda	[%l7]ASI_BLK_P, %d32
2958	stda	%d48, [%i0]ASI_BLK_AIUS
2959	add	%l7, 64, %l7
2960	subcc	%i3, 64, %i3
2961	bz,pn	%ncc, 2f
2962	add	%i0, 64, %i0
2963	ba,a,pt	%ncc, copyout_seg4
2964
29650:
2966	FALIGN_D24
2967	stda	%d48, [%i0]ASI_BLK_AIUS
2968	add	%i0, 64, %i0
2969	membar	#Sync
2970	FALIGN_D40
2971	stda	%d48, [%i0]ASI_BLK_AIUS
2972	ba,pt	%ncc, copyout_blkd8
2973	add	%i0, 64, %i0
2974
29751:
2976	FALIGN_D40
2977	stda	%d48, [%i0]ASI_BLK_AIUS
2978	add	%i0, 64, %i0
2979	membar	#Sync
2980	FALIGN_D8
2981	stda	%d48, [%i0]ASI_BLK_AIUS
2982	ba,pt	%ncc, copyout_blkd24
2983	add	%i0, 64, %i0
2984
29852:
2986	FALIGN_D8
2987	stda	%d48, [%i0]ASI_BLK_AIUS
2988	add	%i0, 64, %i0
2989	membar	#Sync
2990	FALIGN_D24
2991	stda	%d48, [%i0]ASI_BLK_AIUS
2992	ba,pt	%ncc, copyout_blkd40
2993	add	%i0, 64, %i0
2994
2995copyout_seg5:
2996	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2997	FALIGN_D10
2998	ldda	[%l7]ASI_BLK_P, %d0
2999	stda	%d48, [%i0]ASI_BLK_AIUS
3000	add	%l7, 64, %l7
3001	subcc	%i3, 64, %i3
3002	bz,pn	%ncc, 0f
3003	add	%i0, 64, %i0
3004	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3005	FALIGN_D26
3006	ldda	[%l7]ASI_BLK_P, %d16
3007	stda	%d48, [%i0]ASI_BLK_AIUS
3008	add	%l7, 64, %l7
3009	subcc	%i3, 64, %i3
3010	bz,pn	%ncc, 1f
3011	add	%i0, 64, %i0
3012	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3013	FALIGN_D42
3014	ldda	[%l7]ASI_BLK_P, %d32
3015	stda	%d48, [%i0]ASI_BLK_AIUS
3016	add	%l7, 64, %l7
3017	subcc	%i3, 64, %i3
3018	bz,pn	%ncc, 2f
3019	add	%i0, 64, %i0
3020	ba,a,pt	%ncc, copyout_seg5
3021
30220:
3023	FALIGN_D26
3024	stda	%d48, [%i0]ASI_BLK_AIUS
3025	add	%i0, 64, %i0
3026	membar	#Sync
3027	FALIGN_D42
3028	stda	%d48, [%i0]ASI_BLK_AIUS
3029	ba,pt	%ncc, copyout_blkd10
3030	add	%i0, 64, %i0
3031
30321:
3033	FALIGN_D42
3034	stda	%d48, [%i0]ASI_BLK_AIUS
3035	add	%i0, 64, %i0
3036	membar	#Sync
3037	FALIGN_D10
3038	stda	%d48, [%i0]ASI_BLK_AIUS
3039	ba,pt	%ncc, copyout_blkd26
3040	add	%i0, 64, %i0
3041
30422:
3043	FALIGN_D10
3044	stda	%d48, [%i0]ASI_BLK_AIUS
3045	add	%i0, 64, %i0
3046	membar	#Sync
3047	FALIGN_D26
3048	stda	%d48, [%i0]ASI_BLK_AIUS
3049	ba,pt	%ncc, copyout_blkd42
3050	add	%i0, 64, %i0
3051
3052copyout_seg6:
3053	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3054	FALIGN_D12
3055	ldda	[%l7]ASI_BLK_P, %d0
3056	stda	%d48, [%i0]ASI_BLK_AIUS
3057	add	%l7, 64, %l7
3058	subcc	%i3, 64, %i3
3059	bz,pn	%ncc, 0f
3060	add	%i0, 64, %i0
3061	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3062	FALIGN_D28
3063	ldda	[%l7]ASI_BLK_P, %d16
3064	stda	%d48, [%i0]ASI_BLK_AIUS
3065	add	%l7, 64, %l7
3066	subcc	%i3, 64, %i3
3067	bz,pn	%ncc, 1f
3068	add	%i0, 64, %i0
3069	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3070	FALIGN_D44
3071	ldda	[%l7]ASI_BLK_P, %d32
3072	stda	%d48, [%i0]ASI_BLK_AIUS
3073	add	%l7, 64, %l7
3074	subcc	%i3, 64, %i3
3075	bz,pn	%ncc, 2f
3076	add	%i0, 64, %i0
3077	ba,a,pt	%ncc, copyout_seg6
3078
30790:
3080	FALIGN_D28
3081	stda	%d48, [%i0]ASI_BLK_AIUS
3082	add	%i0, 64, %i0
3083	membar	#Sync
3084	FALIGN_D44
3085	stda	%d48, [%i0]ASI_BLK_AIUS
3086	ba,pt	%ncc, copyout_blkd12
3087	add	%i0, 64, %i0
3088
30891:
3090	FALIGN_D44
3091	stda	%d48, [%i0]ASI_BLK_AIUS
3092	add	%i0, 64, %i0
3093	membar	#Sync
3094	FALIGN_D12
3095	stda	%d48, [%i0]ASI_BLK_AIUS
3096	ba,pt	%ncc, copyout_blkd28
3097	add	%i0, 64, %i0
3098
30992:
3100	FALIGN_D12
3101	stda	%d48, [%i0]ASI_BLK_AIUS
3102	add	%i0, 64, %i0
3103	membar	#Sync
3104	FALIGN_D28
3105	stda	%d48, [%i0]ASI_BLK_AIUS
3106	ba,pt	%ncc, copyout_blkd44
3107	add	%i0, 64, %i0
3108
3109copyout_seg7:
3110	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3111	FALIGN_D14
3112	ldda	[%l7]ASI_BLK_P, %d0
3113	stda	%d48, [%i0]ASI_BLK_AIUS
3114	add	%l7, 64, %l7
3115	subcc	%i3, 64, %i3
3116	bz,pn	%ncc, 0f
3117	add	%i0, 64, %i0
3118	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3119	FALIGN_D30
3120	ldda	[%l7]ASI_BLK_P, %d16
3121	stda	%d48, [%i0]ASI_BLK_AIUS
3122	add	%l7, 64, %l7
3123	subcc	%i3, 64, %i3
3124	bz,pn	%ncc, 1f
3125	add	%i0, 64, %i0
3126	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3127	FALIGN_D46
3128	ldda	[%l7]ASI_BLK_P, %d32
3129	stda	%d48, [%i0]ASI_BLK_AIUS
3130	add	%l7, 64, %l7
3131	subcc	%i3, 64, %i3
3132	bz,pn	%ncc, 2f
3133	add	%i0, 64, %i0
3134	ba,a,pt	%ncc, copyout_seg7
3135
31360:
3137	FALIGN_D30
3138	stda	%d48, [%i0]ASI_BLK_AIUS
3139	add	%i0, 64, %i0
3140	membar	#Sync
3141	FALIGN_D46
3142	stda	%d48, [%i0]ASI_BLK_AIUS
3143	ba,pt	%ncc, copyout_blkd14
3144	add	%i0, 64, %i0
3145
31461:
3147	FALIGN_D46
3148	stda	%d48, [%i0]ASI_BLK_AIUS
3149	add	%i0, 64, %i0
3150	membar	#Sync
3151	FALIGN_D14
3152	stda	%d48, [%i0]ASI_BLK_AIUS
3153	ba,pt	%ncc, copyout_blkd30
3154	add	%i0, 64, %i0
3155
31562:
3157	FALIGN_D14
3158	stda	%d48, [%i0]ASI_BLK_AIUS
3159	add	%i0, 64, %i0
3160	membar	#Sync
3161	FALIGN_D30
3162	stda	%d48, [%i0]ASI_BLK_AIUS
3163	ba,pt	%ncc, copyout_blkd46
3164	add	%i0, 64, %i0
3165
3166
3167	!
3168	! dribble out the last partial block
3169	!
3170copyout_blkd0:
3171	subcc	%i4, 8, %i4
3172	blu,pn	%ncc, copyout_blkdone
3173	faligndata %d0, %d2, %d48
3174	stda	%d48, [%i0]ASI_USER
3175	add	%i0, 8, %i0
3176copyout_blkd2:
3177	subcc	%i4, 8, %i4
3178	blu,pn	%ncc, copyout_blkdone
3179	faligndata %d2, %d4, %d48
3180	stda	%d48, [%i0]ASI_USER
3181	add	%i0, 8, %i0
3182copyout_blkd4:
3183	subcc	%i4, 8, %i4
3184	blu,pn	%ncc, copyout_blkdone
3185	faligndata %d4, %d6, %d48
3186	stda	%d48, [%i0]ASI_USER
3187	add	%i0, 8, %i0
3188copyout_blkd6:
3189	subcc	%i4, 8, %i4
3190	blu,pn	%ncc, copyout_blkdone
3191	faligndata %d6, %d8, %d48
3192	stda	%d48, [%i0]ASI_USER
3193	add	%i0, 8, %i0
3194copyout_blkd8:
3195	subcc	%i4, 8, %i4
3196	blu,pn	%ncc, copyout_blkdone
3197	faligndata %d8, %d10, %d48
3198	stda	%d48, [%i0]ASI_USER
3199	add	%i0, 8, %i0
3200copyout_blkd10:
3201	subcc	%i4, 8, %i4
3202	blu,pn	%ncc, copyout_blkdone
3203	faligndata %d10, %d12, %d48
3204	stda	%d48, [%i0]ASI_USER
3205	add	%i0, 8, %i0
3206copyout_blkd12:
3207	subcc	%i4, 8, %i4
3208	blu,pn	%ncc, copyout_blkdone
3209	faligndata %d12, %d14, %d48
3210	stda	%d48, [%i0]ASI_USER
3211	add	%i0, 8, %i0
3212copyout_blkd14:
3213	subcc	%i4, 8, %i4
3214	blu,pn	%ncc, copyout_blkdone
3215	fsrc1	%d14, %d0
3216	ba,a,pt	%ncc, copyout_blkleft
3217
3218copyout_blkd16:
3219	subcc	%i4, 8, %i4
3220	blu,pn	%ncc, copyout_blkdone
3221	faligndata %d16, %d18, %d48
3222	stda	%d48, [%i0]ASI_USER
3223	add	%i0, 8, %i0
3224copyout_blkd18:
3225	subcc	%i4, 8, %i4
3226	blu,pn	%ncc, copyout_blkdone
3227	faligndata %d18, %d20, %d48
3228	stda	%d48, [%i0]ASI_USER
3229	add	%i0, 8, %i0
3230copyout_blkd20:
3231	subcc	%i4, 8, %i4
3232	blu,pn	%ncc, copyout_blkdone
3233	faligndata %d20, %d22, %d48
3234	stda	%d48, [%i0]ASI_USER
3235	add	%i0, 8, %i0
3236copyout_blkd22:
3237	subcc	%i4, 8, %i4
3238	blu,pn	%ncc, copyout_blkdone
3239	faligndata %d22, %d24, %d48
3240	stda	%d48, [%i0]ASI_USER
3241	add	%i0, 8, %i0
3242copyout_blkd24:
3243	subcc	%i4, 8, %i4
3244	blu,pn	%ncc, copyout_blkdone
3245	faligndata %d24, %d26, %d48
3246	stda	%d48, [%i0]ASI_USER
3247	add	%i0, 8, %i0
3248copyout_blkd26:
3249	subcc	%i4, 8, %i4
3250	blu,pn	%ncc, copyout_blkdone
3251	faligndata %d26, %d28, %d48
3252	stda	%d48, [%i0]ASI_USER
3253	add	%i0, 8, %i0
3254copyout_blkd28:
3255	subcc	%i4, 8, %i4
3256	blu,pn	%ncc, copyout_blkdone
3257	faligndata %d28, %d30, %d48
3258	stda	%d48, [%i0]ASI_USER
3259	add	%i0, 8, %i0
3260copyout_blkd30:
3261	subcc	%i4, 8, %i4
3262	blu,pn	%ncc, copyout_blkdone
3263	fsrc1	%d30, %d0
3264	ba,a,pt	%ncc, copyout_blkleft
3265copyout_blkd32:
3266	subcc	%i4, 8, %i4
3267	blu,pn	%ncc, copyout_blkdone
3268	faligndata %d32, %d34, %d48
3269	stda	%d48, [%i0]ASI_USER
3270	add	%i0, 8, %i0
3271copyout_blkd34:
3272	subcc	%i4, 8, %i4
3273	blu,pn	%ncc, copyout_blkdone
3274	faligndata %d34, %d36, %d48
3275	stda	%d48, [%i0]ASI_USER
3276	add	%i0, 8, %i0
3277copyout_blkd36:
3278	subcc	%i4, 8, %i4
3279	blu,pn	%ncc, copyout_blkdone
3280	faligndata %d36, %d38, %d48
3281	stda	%d48, [%i0]ASI_USER
3282	add	%i0, 8, %i0
3283copyout_blkd38:
3284	subcc	%i4, 8, %i4
3285	blu,pn	%ncc, copyout_blkdone
3286	faligndata %d38, %d40, %d48
3287	stda	%d48, [%i0]ASI_USER
3288	add	%i0, 8, %i0
3289copyout_blkd40:
3290	subcc	%i4, 8, %i4
3291	blu,pn	%ncc, copyout_blkdone
3292	faligndata %d40, %d42, %d48
3293	stda	%d48, [%i0]ASI_USER
3294	add	%i0, 8, %i0
3295copyout_blkd42:
3296	subcc	%i4, 8, %i4
3297	blu,pn	%ncc, copyout_blkdone
3298	faligndata %d42, %d44, %d48
3299	stda	%d48, [%i0]ASI_USER
3300	add	%i0, 8, %i0
3301copyout_blkd44:
3302	subcc	%i4, 8, %i4
3303	blu,pn	%ncc, copyout_blkdone
3304	faligndata %d44, %d46, %d48
3305	stda	%d48, [%i0]ASI_USER
3306	add	%i0, 8, %i0
3307copyout_blkd46:
3308	subcc	%i4, 8, %i4
3309	blu,pn	%ncc, copyout_blkdone
3310	fsrc1	%d46, %d0
3311
3312copyout_blkleft:
33131:
3314	ldd	[%l7], %d2
3315	add	%l7, 8, %l7
3316	subcc	%i4, 8, %i4
3317	faligndata %d0, %d2, %d8
3318	stda	%d8, [%i0]ASI_USER
3319	blu,pn	%ncc, copyout_blkdone
3320	add	%i0, 8, %i0
3321	ldd	[%l7], %d0
3322	add	%l7, 8, %l7
3323	subcc	%i4, 8, %i4
3324	faligndata %d2, %d0, %d8
3325	stda	%d8, [%i0]ASI_USER
3326	bgeu,pt	%ncc, 1b
3327	add	%i0, 8, %i0
3328
3329copyout_blkdone:
3330	tst	%i2
3331	bz,pt	%ncc, .copyout_exit
3332	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0
3333
33347:	ldub	[%i1], %i4
3335	inc	%i1
3336	stba	%i4, [%i0]ASI_USER
3337	inc	%i0
3338	deccc	%i2
3339	bgu	%ncc, 7b
3340	  nop
3341
3342.copyout_exit:
3343	membar	#StoreLoad|#StoreStore
3344	btst	FPUSED_FLAG, SAVED_LOFAULT
3345	bz	1f
3346	  nop
3347
3348	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
3349	wr	%o2, 0, %gsr		! restore gsr
3350
3351	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3352	btst	FPRS_FEF, %o3
3353	bz	4f
3354	  nop
3355
3356	! restore fpregs from stack
3357	membar	#Sync
3358	add	%fp, STACK_BIAS - 257, %o2
3359	and	%o2, -64, %o2
3360	ldda	[%o2]ASI_BLK_P, %d0
3361	add	%o2, 64, %o2
3362	ldda	[%o2]ASI_BLK_P, %d16
3363	add	%o2, 64, %o2
3364	ldda	[%o2]ASI_BLK_P, %d32
3365	add	%o2, 64, %o2
3366	ldda	[%o2]ASI_BLK_P, %d48
3367	membar	#Sync
3368
3369	ba,pt	%ncc, 1f
3370	  wr	%o3, 0, %fprs		! restore fprs
3371
33724:
3373	FZERO				! zero all of the fpregs
3374	wr	%o3, 0, %fprs		! restore fprs
3375
33761:
3377	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3378	membar	#Sync			! sync error barrier
3379	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3380	ret
3381	restore	%g0, 0, %o0
3382
3383.copyout_err:
3384	ldn	[THREAD_REG + T_COPYOPS], %o4
3385	brz	%o4, 2f
3386	nop
3387	ldn	[%o4 + CP_COPYOUT], %g2
3388	jmp	%g2
3389	nop
33902:
3391	retl
3392	mov	-1, %o0
3393	SET_SIZE(copyout)
3394
3395#endif	/* lint */
3396
3397
/*
 * xcopyout(kaddr, uaddr, count)
 *
 * Entry point that differs from the copy code it branches to only in
 * which error routine is registered.  It loads the address of
 * .xcopyout_err into REAL_LOFAULT (the "or" half of the sethi/or pair
 * sits in the delay slot of the branch) and branches to .do_copyout,
 * a label defined outside this routine, which does the actual work.
 *
 * .xcopyout_err is the error path registered above:
 *   - If curthread's t_copyops pointer is non-NULL, control is handed
 *     to that vector's CP_XCOPYOUT entry with a jmp (a tail jump, so
 *     that routine returns directly to our caller).
 *   - Otherwise the value held in %g1 is returned in %o0.  Contrast
 *     .copyout_err, which returns -1 at the equivalent point.
 *
 * NOTE(review): %g1 is presumably the error code left behind by the
 * common fault handler, and the original arguments are presumably
 * restored to %o0-%o2 before .xcopyout_err is reached so that the
 * t_copyops routine sees them; neither is visible from here -- confirm
 * against copyio_fault.
 */
3398#ifdef	lint
3399
3400/*ARGSUSED*/
3401int
3402xcopyout(const void *kaddr, void *uaddr, size_t count)
3403{ return (0); }
3404
3405#else	/* lint */
3406
3407	ENTRY(xcopyout)
3408	sethi	%hi(.xcopyout_err), REAL_LOFAULT
3409	b	.do_copyout
3410	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
3411.xcopyout_err:
3412	ldn	[THREAD_REG + T_COPYOPS], %o4
3413	brz	%o4, 2f
3414	nop
3415	ldn	[%o4 + CP_XCOPYOUT], %g2
3416	jmp	%g2
3417	nop
34182:
3419	retl
3420	mov	%g1, %o0
3421	SET_SIZE(xcopyout)
3422
3423#endif	/* lint */
3424
/*
 * xcopyout_little(kaddr, uaddr, count)
 *
 * Byte-reversing copy.  Destination byte i (stored with stba through
 * ASI_AIUSL) receives source byte (count - 1 - i) (loaded with a plain
 * ldub).  The normal path returns 0.  While the copy runs, t_lofault
 * points at .little_err, which is defined outside this routine; the
 * previous t_lofault value is written back before returning.
 *
 * Register use:
 *	%o5	t_lofault value found on entry
 *	%o3	index, counts up from -count to 0
 *	%o1	uaddr + count, so [%o1 + %o3] walks forward from uaddr
 *	%o0	starts at kaddr + 2*count - 1 and drops by 2 per pass;
 *		with %o3 rising by 1 per pass, [%o0 + %o3] walks backward
 *		from kaddr + count - 1
 *	%o4	scratch: handler address, then count - 1, then data byte
 *
 * Loop control:
 *   - A zero count is caught by bz on the result of the initial subcc
 *     and skips straight to the exit sequence.
 *   - inccc sets the carry flag only on the step that takes %o3 from
 *     -1 to 0, so bcc keeps looping until the last byte is stored.
 *   - The loop branch is annulled (",a"), so the ldub in its delay slot
 *     executes only when the branch is taken.  On the final pass that
 *     load would address kaddr - 1; annulment prevents the access.
 */
3425#ifdef	lint
3426
3427/*ARGSUSED*/
3428int
3429xcopyout_little(const void *kaddr, void *uaddr, size_t count)
3430{ return (0); }
3431
3432#else	/* lint */
3433
3434	ENTRY(xcopyout_little)
3435	sethi	%hi(.little_err), %o4
3436	ldn	[THREAD_REG + T_LOFAULT], %o5
3437	or	%o4, %lo(.little_err), %o4
3438	membar	#Sync			! sync error barrier
3439	stn	%o4, [THREAD_REG + T_LOFAULT]
3440
3441	subcc	%g0, %o2, %o3
3442	add	%o0, %o2, %o0
3443	bz,pn	%ncc, 2f		! check for zero bytes
3444	sub	%o2, 1, %o4
3445	add	%o0, %o4, %o0		! start w/last byte
3446	add	%o1, %o2, %o1
3447	ldub	[%o0+%o3], %o4
3448
34491:	stba	%o4, [%o1+%o3]ASI_AIUSL
3450	inccc	%o3
3451	sub	%o0, 2, %o0		! get next byte
3452	bcc,a,pt %ncc, 1b
3453	  ldub	[%o0+%o3], %o4
3454
34552:	membar	#Sync			! sync error barrier
3456	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3457	retl
3458	mov	%g0, %o0		! return (0)
3459	SET_SIZE(xcopyout_little)
3460
3461#endif	/* lint */
3462
3463/*
3464 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
3465 */
3466
3467#if defined(lint)
3468
3469/*ARGSUSED*/
3470int
3471copyin(const void *uaddr, void *kaddr, size_t count)
3472{ return (0); }
3473
3474#else	/* lint */
3475
3476	ENTRY(copyin)
3477	sethi	%hi(.copyin_err), REAL_LOFAULT
3478	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
3479
3480.do_copyin:
3481	!
3482	! Check the length and bail if zero.
3483	!
3484	tst	%o2
3485	bnz,pt	%ncc, 1f
3486	  nop
3487	retl
3488	  clr	%o0
34891:
3490	sethi	%hi(copyio_fault), %o4
3491	or	%o4, %lo(copyio_fault), %o4
3492	sethi	%hi(copyio_fault_nowindow), %o3
3493	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3494	or	%o3, %lo(copyio_fault_nowindow), %o3
3495	membar	#Sync
3496	stn	%o3, [THREAD_REG + T_LOFAULT]
3497
3498	mov	%o0, SAVE_SRC
3499	mov	%o1, SAVE_DST
3500	mov	%o2, SAVE_COUNT
3501
3502	!
3503	! Check to see if we're more than SMALL_LIMIT.
3504	!
3505	subcc	%o2, SMALL_LIMIT, %o3
3506	bgu,a,pt %ncc, .dci_ns
3507	or	%o0, %o1, %o3
3508	!
3509	! What was previously ".small_copyin"
3510	!
3511.dcibcp:
3512	sub	%g0, %o2, %o3		! setup for copy loop
3513	add	%o0, %o2, %o0
3514	add	%o1, %o2, %o1
3515	ba,pt	%ncc, .dcicl
3516	lduba	[%o0 + %o3]ASI_USER, %o4
3517	!
3518	! %o0 and %o1 point at the end and remain pointing at the end
3519	! of their buffers. We pull things out by adding %o3 (which is
3520	! the negation of the length) to the buffer end which gives us
3521	! the current location in the buffers. By incrementing %o3 we walk
3522	! through both buffers without having to bump each buffer's
3523	! pointer. A very fast 4 instruction loop.
3524	!
3525	.align 16
3526.dcicl:
3527	stb	%o4, [%o1 + %o3]
3528	inccc	%o3
3529	bl,a,pt %ncc, .dcicl
3530	lduba	[%o0 + %o3]ASI_USER, %o4
3531	!
3532	! We're done. Go home.
3533	!
3534	membar	#Sync
3535	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3536	retl
3537	clr	%o0
3538	!
3539	! Try aligned copies from here.
3540	!
3541.dci_ns:
3542	!
3543	! See if we're single byte aligned. If we are, check the
3544	! limit for single byte copies. If we're smaller, or equal,
3545	! bounce to the byte for byte copy loop. Otherwise do it in
3546	! HW (if enabled).
3547	!
3548	btst	1, %o3
3549	bz,a,pt	%icc, .dcih8
3550	btst	7, %o3
3551	!
3552	! We're single byte aligned.
3553	!
3554	subcc	%o2, VIS_COPY_THRESHOLD, %o3
3555	bleu,pt	%ncc, .dcibcp
3556	sethi	%hi(hw_copy_limit_1), %o3
3557	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3558	!
3559	! Is HW copy on? If not do everything byte for byte.
3560	!
3561	tst	%o3
3562	bz,pn	%icc, .dcibcp
3563	subcc	%o3, %o2, %o3
3564	!
3565	! Are we bigger than the HW limit? If not
3566	! go to byte for byte.
3567	!
3568	bge,pt	%ncc, .dcibcp
3569	nop
3570	!
3571	! We're big enough and copy is on. Do it with HW.
3572	!
3573	ba,pt	%ncc, .big_copyin
3574	nop
3575.dcih8:
3576	!
3577	! 8 byte aligned?
3578	!
3579	bnz,a	%ncc, .dcih4
3580	btst	3, %o3
3581	!
3582	! We're eight byte aligned.
3583	!
3584	subcc	%o2, VIS_COPY_THRESHOLD, %o3
3585	bleu,pt	%ncc, .dcis8
3586	sethi	%hi(hw_copy_limit_8), %o3
3587	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3588	!
3589	! Is HW assist on? If not, do it with the aligned copy.
3590	!
3591	tst	%o3
3592	bz,pn	%icc, .dcis8
3593	subcc	%o3, %o2, %o3
3594	bge	%ncc, .dcis8
3595	nop
3596	ba,pt	%ncc, .big_copyin
3597	nop
3598.dcis8:
3599	!
3600	! Housekeeping for copy loops. Uses same idea as in the byte for
3601	! byte copy loop above.
3602	!
3603	add	%o0, %o2, %o0
3604	add	%o1, %o2, %o1
3605	sub	%g0, %o2, %o3
3606	ba,pt	%ncc, .didebc
3607	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
3608	!
3609	! 4 byte aligned?
3610	!
3611.dcih4:
3612	bnz	%ncc, .dcih2
3613	subcc	%o2, VIS_COPY_THRESHOLD, %o3
3614	bleu,pt	%ncc, .dcis4
3615	sethi	%hi(hw_copy_limit_4), %o3
3616	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3617	!
3618	! Is HW assist on? If not, do it with the aligned copy.
3619	!
3620	tst	%o3
3621	bz,pn	%icc, .dcis4
3622	subcc	%o3, %o2, %o3
3623	!
3624	! We're negative if our size is larger than hw_copy_limit_4.
3625	!
3626	bge	%ncc, .dcis4
3627	nop
3628	ba,pt	%ncc, .big_copyin
3629	nop
3630.dcis4:
3631	!
3632	! Housekeeping for copy loops. Uses same idea as in the byte
3633	! for byte copy loop above.
3634	!
3635	add	%o0, %o2, %o0
3636	add	%o1, %o2, %o1
3637	sub	%g0, %o2, %o3
3638	ba,pt	%ncc, .didfbc
3639	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
3640.dcih2:
3641	!
3642	! We're two byte aligned. Check for "smallness"
3643	! done in delay at .dcih4
3644	!
3645	bleu,pt	%ncc, .dcis2
3646	sethi	%hi(hw_copy_limit_2), %o3
3647	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3648	!
3649	! Is HW assist on? If not, do it with the aligned copy.
3650	!
3651	tst	%o3
3652	bz,pn	%icc, .dcis2
3653	subcc	%o3, %o2, %o3
3654	!
3655	! Are we larger than the HW limit?
3656	!
3657	bge	%ncc, .dcis2
3658	nop
3659	!
3660	! HW assist is on and we're large enough to use it.
3661	!
3662	ba,pt	%ncc, .big_copyin
3663	nop
3664	!
3665	! Housekeeping for copy loops. Uses same idea as in the byte
3666	! for byte copy loop above.
3667	!
3668.dcis2:
3669	add	%o0, %o2, %o0
3670	add	%o1, %o2, %o1
3671	sub	%g0, %o2, %o3
3672	ba,pt	%ncc, .didtbc
3673	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
3674	!
3675.small_copyin:
3676	!
3677	! Why are we doing this AGAIN? There are certain conditions in
3678	! big copyin that will cause us to forgo the HW assisted copies
3679	! and bounce back to a non-hw assisted copy. This dispatches
3680	! those copies. Note that we branch around this in the main line
3681	! code.
3682	!
3683	! We make no check for limits or HW enablement here. We've
3684	! already been told that we're a poster child so just go off
3685	! and do it.
3686	!
3687	or	%o0, %o1, %o3
3688	btst	1, %o3
3689	bnz	%icc, .dcibcp		! Most likely
3690	btst	7, %o3
3691	bz	%icc, .dcis8
3692	btst	3, %o3
3693	bz	%icc, .dcis4
3694	nop
3695	ba,pt	%ncc, .dcis2
3696	nop
3697	!
3698	! Eight byte aligned copies. A steal from the original .small_copyin
3699	! with modifications. %o2 is number of 8 byte chunks to copy. When
3700	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
3701	! to copy.
3702	!
3703	.align 32
3704.didebc:
3705	ldxa	[%o0 + %o3]ASI_USER, %o4
3706	deccc	%o2
3707	stx	%o4, [%o1 + %o3]
3708	bg,pt	%ncc, .didebc
3709	addcc	%o3, 8, %o3
3710	!
3711	! End of copy loop. Most 8 byte aligned copies end here.
3712	!
3713	bz,pt	%ncc, .dcifh
3714	nop
3715	!
3716	! Something is left. Do it byte for byte.
3717	!
3718	ba,pt	%ncc, .dcicl
3719	lduba	[%o0 + %o3]ASI_USER, %o4
3720	!
3721	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
3722	!
3723	.align 32
3724.didfbc:
3725	lduwa	[%o0 + %o3]ASI_USER, %o4
3726	deccc	%o2
3727	st	%o4, [%o1 + %o3]
3728	bg,pt	%ncc, .didfbc
3729	addcc	%o3, 4, %o3
3730	!
3731	! End of copy loop. Most 4 byte aligned copies end here.
3732	!
3733	bz,pt	%ncc, .dcifh
3734	nop
3735	!
3736	! Something is left. Do it byte for byte.
3737	!
3738	ba,pt	%ncc, .dcicl
3739	lduba	[%o0 + %o3]ASI_USER, %o4
3740	!
3741	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
3742	! copy.
3743	!
3744	.align 32
3745.didtbc:
3746	lduha	[%o0 + %o3]ASI_USER, %o4
3747	deccc	%o2
3748	sth	%o4, [%o1 + %o3]
3749	bg,pt	%ncc, .didtbc
3750	addcc	%o3, 2, %o3
3751	!
3752	! End of copy loop. Most 2 byte aligned copies end here.
3753	!
3754	bz,pt	%ncc, .dcifh
3755	nop
3756	!
3757	! Deal with the last byte
3758	!
3759	lduba	[%o0 + %o3]ASI_USER, %o4
3760	stb	%o4, [%o1 + %o3]
3761.dcifh:
3762	membar	#Sync
3763	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3764	retl
3765	clr	%o0
3766
3767.big_copyin:
3768	!
3769	! Are we using the FP registers?
3770	!
3771	rd	%fprs, %o3		! check for unused fp
3772	btst	FPRS_FEF, %o3
3773	bnz	%ncc, .copyin_fpregs_inuse
3774	nop
3775	!
3776	! We're going off to do a block copy.
3777	! Switch fault handlers and grab a window. We
3778	! don't do a membar #Sync since we've done only
3779	! kernel data to this point.
3780	!
3781	stn	%o4, [THREAD_REG + T_LOFAULT]
3782	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3783	!
3784	! %o3 is %i3 after the save...
3785	!
3786	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
3787	ba,pt	%ncc, .do_blockcopyin
3788	wr	%g0, FPRS_FEF, %fprs
3789.copyin_fpregs_inuse:
3790	!
3791	! We're here if the FP regs are in use. Need to see if the request
3792	! exceeds our suddenly larger minimum.
3793	!
3794	cmp	%i2, VIS_COPY_THRESHOLD+(64*4)
3795	bl	%ncc, .small_copyin
3796	nop
3797	!
3798	! We're going off and do a block copy.
3799	! Change to the heavy duty fault handler and grab a window first.
3800	! New handler is passed in
3801	!
3802	stn	%o4, [THREAD_REG + T_LOFAULT]
3803	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3804	!
3805	! %o3 is now %i3
3806	!
3807	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
3808
3809	! save in-use fpregs on stack
3810	wr	%g0, FPRS_FEF, %fprs
3811	membar	#Sync
3812	add	%fp, STACK_BIAS - 257, %o2
3813	and	%o2, -64, %o2
3814	stda	%d0, [%o2]ASI_BLK_P
3815	add	%o2, 64, %o2
3816	stda	%d16, [%o2]ASI_BLK_P
3817	add	%o2, 64, %o2
3818	stda	%d32, [%o2]ASI_BLK_P
3819	add	%o2, 64, %o2
3820	stda	%d48, [%o2]ASI_BLK_P
3821	membar	#Sync
3822
3823.do_blockcopyin:
3824	membar	#StoreStore|#StoreLoad|#LoadStore
3825
3826	rd	%gsr, %o2
3827	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
3828
3829	! Set the lower bit in the saved t_lofault to indicate
3830	! that we need to clear the %fprs register on the way
3831	! out
3832	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3833
3834	! Swap src/dst since the code below is memcpy code
3835	! and memcpy/bcopy have different calling sequences
3836	mov	%i1, %i5
3837	mov	%i0, %i1
3838	mov	%i5, %i0
3839
3840!!! This code is nearly identical to the version in the sun4u
3841!!! libc_psr.  Most bugfixes made to that file should be
3842!!! merged into this routine.
3843
3844	andcc	%i0, 7, %o3
3845	bz	copyin_blkcpy
3846	sub	%o3, 8, %o3
3847	neg	%o3
3848	sub	%i2, %o3, %i2
3849
3850	! Align Destination on double-word boundary
3851
38522:	lduba	[%i1]ASI_USER, %o4
3853	inc	%i1
3854	inc	%i0
3855	deccc	%o3
3856	bgu	%ncc, 2b
3857	stb	%o4, [%i0-1]
3858copyin_blkcpy:
3859	andcc	%i0, 63, %i3
3860	bz,pn	%ncc, copyin_blalign	! now block aligned
3861	sub	%i3, 64, %i3
3862	neg	%i3			! bytes till block aligned
3863	sub	%i2, %i3, %i2		! update %i2 with new count
3864
3865	! Copy %i3 bytes till dst is block (64 byte) aligned. use
3866	! double word copies.
3867
3868	alignaddr %i1, %g0, %g1
3869	ldda	[%g1]ASI_USER, %d0
3870	add	%g1, 8, %g1
38716:
3872	ldda	[%g1]ASI_USER, %d2
3873	add	%g1, 8, %g1
3874	subcc	%i3, 8, %i3
3875	faligndata %d0, %d2, %d8
3876	std	%d8, [%i0]
3877	add	%i1, 8, %i1
3878	bz,pn	%ncc, copyin_blalign
3879	add	%i0, 8, %i0
3880	ldda	[%g1]ASI_USER, %d0
3881	add	%g1, 8, %g1
3882	subcc	%i3, 8, %i3
3883	faligndata %d2, %d0, %d8
3884	std	%d8, [%i0]
3885	add	%i1, 8, %i1
3886	bgu,pn	%ncc, 6b
3887	add	%i0, 8, %i0
3888
3889copyin_blalign:
3890	membar	#StoreLoad
3891	! %i2 = total length
3892	! %i3 = blocks	(length - 64) / 64
3893	! %i4 = doubles remaining  (length - blocks)
3894	sub	%i2, 64, %i3
3895	andn	%i3, 63, %i3
3896	sub	%i2, %i3, %i4
3897	andn	%i4, 7, %i4
3898	sub	%i4, 16, %i4
3899	sub	%i2, %i4, %i2
3900	sub	%i2, %i3, %i2
3901
3902	andn	%i1, 0x3f, %l7		! blk aligned address
3903	alignaddr %i1, %g0, %g0		! gen %gsr
3904
3905	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
3906	andcc	%l5, 7, %i5		! mask everything except bits 1,2 3
3907	add	%i1, %i4, %i1
3908	add	%i1, %i3, %i1
3909
3910	ldda	[%l7]ASI_BLK_AIUS, %d0
3911	add	%l7, 64, %l7
3912	ldda	[%l7]ASI_BLK_AIUS, %d16
3913	add	%l7, 64, %l7
3914	ldda	[%l7]ASI_BLK_AIUS, %d32
3915	add	%l7, 64, %l7
3916	sub	%i3, 128, %i3
3917
3918	! switch statement to get us to the right 8 byte blk within a
3919	! 64 byte block
3920
3921	cmp	 %i5, 4
3922	bgeu,a	 copyin_hlf
3923	cmp	 %i5, 6
3924	cmp	 %i5, 2
3925	bgeu,a	 copyin_sqtr
3926	nop
3927	cmp	 %i5, 1
3928	be,a	 copyin_seg1
3929	nop
3930	ba,pt	 %ncc, copyin_seg0
3931	nop
3932copyin_sqtr:
3933	be,a	 copyin_seg2
3934	nop
3935	ba,pt	 %ncc, copyin_seg3
3936	nop
3937
3938copyin_hlf:
3939	bgeu,a	 copyin_fqtr
3940	nop
3941	cmp	 %i5, 5
3942	be,a	 copyin_seg5
3943	nop
3944	ba,pt	 %ncc, copyin_seg4
3945	nop
3946copyin_fqtr:
3947	be,a	 copyin_seg6
3948	nop
3949	ba,pt	 %ncc, copyin_seg7
3950	nop
3951
3952copyin_seg0:
3953	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3954	FALIGN_D0
3955	ldda	[%l7]ASI_BLK_AIUS, %d0
3956	stda	%d48, [%i0]ASI_BLK_P
3957	add	%l7, 64, %l7
3958	subcc	%i3, 64, %i3
3959	bz,pn	%ncc, 0f
3960	add	%i0, 64, %i0
3961	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3962	FALIGN_D16
3963	ldda	[%l7]ASI_BLK_AIUS, %d16
3964	stda	%d48, [%i0]ASI_BLK_P
3965	add	%l7, 64, %l7
3966	subcc	%i3, 64, %i3
3967	bz,pn	%ncc, 1f
3968	add	%i0, 64, %i0
3969	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3970	FALIGN_D32
3971	ldda	[%l7]ASI_BLK_AIUS, %d32
3972	stda	%d48, [%i0]ASI_BLK_P
3973	add	%l7, 64, %l7
3974	subcc	%i3, 64, %i3
3975	bz,pn	%ncc, 2f
3976	add	%i0, 64, %i0
3977	ba,a,pt	%ncc, copyin_seg0
3978
39790:
3980	FALIGN_D16
3981	stda	%d48, [%i0]ASI_BLK_P
3982	add	%i0, 64, %i0
3983	membar	#Sync
3984	FALIGN_D32
3985	stda	%d48, [%i0]ASI_BLK_P
3986	ba,pt	%ncc, copyin_blkd0
3987	add	%i0, 64, %i0
3988
39891:
3990	FALIGN_D32
3991	stda	%d48, [%i0]ASI_BLK_P
3992	add	%i0, 64, %i0
3993	membar	#Sync
3994	FALIGN_D0
3995	stda	%d48, [%i0]ASI_BLK_P
3996	ba,pt	%ncc, copyin_blkd16
3997	add	%i0, 64, %i0
3998
39992:
4000	FALIGN_D0
4001	stda	%d48, [%i0]ASI_BLK_P
4002	add	%i0, 64, %i0
4003	membar	#Sync
4004	FALIGN_D16
4005	stda	%d48, [%i0]ASI_BLK_P
4006	ba,pt	%ncc, copyin_blkd32
4007	add	%i0, 64, %i0
4008
4009copyin_seg1:
4010	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4011	FALIGN_D2
4012	ldda	[%l7]ASI_BLK_AIUS, %d0
4013	stda	%d48, [%i0]ASI_BLK_P
4014	add	%l7, 64, %l7
4015	subcc	%i3, 64, %i3
4016	bz,pn	%ncc, 0f
4017	add	%i0, 64, %i0
4018	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4019	FALIGN_D18
4020	ldda	[%l7]ASI_BLK_AIUS, %d16
4021	stda	%d48, [%i0]ASI_BLK_P
4022	add	%l7, 64, %l7
4023	subcc	%i3, 64, %i3
4024	bz,pn	%ncc, 1f
4025	add	%i0, 64, %i0
4026	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4027	FALIGN_D34
4028	ldda	[%l7]ASI_BLK_AIUS, %d32
4029	stda	%d48, [%i0]ASI_BLK_P
4030	add	%l7, 64, %l7
4031	subcc	%i3, 64, %i3
4032	bz,pn	%ncc, 2f
4033	add	%i0, 64, %i0
4034	ba,a,pt	%ncc, copyin_seg1
40350:
4036	FALIGN_D18
4037	stda	%d48, [%i0]ASI_BLK_P
4038	add	%i0, 64, %i0
4039	membar	#Sync
4040	FALIGN_D34
4041	stda	%d48, [%i0]ASI_BLK_P
4042	ba,pt	%ncc, copyin_blkd2
4043	add	%i0, 64, %i0
4044
40451:
4046	FALIGN_D34
4047	stda	%d48, [%i0]ASI_BLK_P
4048	add	%i0, 64, %i0
4049	membar	#Sync
4050	FALIGN_D2
4051	stda	%d48, [%i0]ASI_BLK_P
4052	ba,pt	%ncc, copyin_blkd18
4053	add	%i0, 64, %i0
4054
40552:
4056	FALIGN_D2
4057	stda	%d48, [%i0]ASI_BLK_P
4058	add	%i0, 64, %i0
4059	membar	#Sync
4060	FALIGN_D18
4061	stda	%d48, [%i0]ASI_BLK_P
4062	ba,pt	%ncc, copyin_blkd34
4063	add	%i0, 64, %i0
4064copyin_seg2:
4065	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4066	FALIGN_D4
4067	ldda	[%l7]ASI_BLK_AIUS, %d0
4068	stda	%d48, [%i0]ASI_BLK_P
4069	add	%l7, 64, %l7
4070	subcc	%i3, 64, %i3
4071	bz,pn	%ncc, 0f
4072	add	%i0, 64, %i0
4073	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4074	FALIGN_D20
4075	ldda	[%l7]ASI_BLK_AIUS, %d16
4076	stda	%d48, [%i0]ASI_BLK_P
4077	add	%l7, 64, %l7
4078	subcc	%i3, 64, %i3
4079	bz,pn	%ncc, 1f
4080	add	%i0, 64, %i0
4081	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4082	FALIGN_D36
4083	ldda	[%l7]ASI_BLK_AIUS, %d32
4084	stda	%d48, [%i0]ASI_BLK_P
4085	add	%l7, 64, %l7
4086	subcc	%i3, 64, %i3
4087	bz,pn	%ncc, 2f
4088	add	%i0, 64, %i0
4089	ba,a,pt	%ncc, copyin_seg2
4090
40910:
4092	FALIGN_D20
4093	stda	%d48, [%i0]ASI_BLK_P
4094	add	%i0, 64, %i0
4095	membar	#Sync
4096	FALIGN_D36
4097	stda	%d48, [%i0]ASI_BLK_P
4098	ba,pt	%ncc, copyin_blkd4
4099	add	%i0, 64, %i0
4100
41011:
4102	FALIGN_D36
4103	stda	%d48, [%i0]ASI_BLK_P
4104	add	%i0, 64, %i0
4105	membar	#Sync
4106	FALIGN_D4
4107	stda	%d48, [%i0]ASI_BLK_P
4108	ba,pt	%ncc, copyin_blkd20
4109	add	%i0, 64, %i0
4110
41112:
4112	FALIGN_D4
4113	stda	%d48, [%i0]ASI_BLK_P
4114	add	%i0, 64, %i0
4115	membar	#Sync
4116	FALIGN_D20
4117	stda	%d48, [%i0]ASI_BLK_P
4118	ba,pt	%ncc, copyin_blkd36
4119	add	%i0, 64, %i0
4120
4121copyin_seg3:
4122	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4123	FALIGN_D6
4124	ldda	[%l7]ASI_BLK_AIUS, %d0
4125	stda	%d48, [%i0]ASI_BLK_P
4126	add	%l7, 64, %l7
4127	subcc	%i3, 64, %i3
4128	bz,pn	%ncc, 0f
4129	add	%i0, 64, %i0
4130	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4131	FALIGN_D22
4132	ldda	[%l7]ASI_BLK_AIUS, %d16
4133	stda	%d48, [%i0]ASI_BLK_P
4134	add	%l7, 64, %l7
4135	subcc	%i3, 64, %i3
4136	bz,pn	%ncc, 1f
4137	add	%i0, 64, %i0
4138	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4139	FALIGN_D38
4140	ldda	[%l7]ASI_BLK_AIUS, %d32
4141	stda	%d48, [%i0]ASI_BLK_P
4142	add	%l7, 64, %l7
4143	subcc	%i3, 64, %i3
4144	bz,pn	%ncc, 2f
4145	add	%i0, 64, %i0
4146	ba,a,pt	%ncc, copyin_seg3
4147
41480:
4149	FALIGN_D22
4150	stda	%d48, [%i0]ASI_BLK_P
4151	add	%i0, 64, %i0
4152	membar	#Sync
4153	FALIGN_D38
4154	stda	%d48, [%i0]ASI_BLK_P
4155	ba,pt	%ncc, copyin_blkd6
4156	add	%i0, 64, %i0
4157
41581:
4159	FALIGN_D38
4160	stda	%d48, [%i0]ASI_BLK_P
4161	add	%i0, 64, %i0
4162	membar	#Sync
4163	FALIGN_D6
4164	stda	%d48, [%i0]ASI_BLK_P
4165	ba,pt	%ncc, copyin_blkd22
4166	add	%i0, 64, %i0
4167
41682:
4169	FALIGN_D6
4170	stda	%d48, [%i0]ASI_BLK_P
4171	add	%i0, 64, %i0
4172	membar	#Sync
4173	FALIGN_D22
4174	stda	%d48, [%i0]ASI_BLK_P
4175	ba,pt	%ncc, copyin_blkd38
4176	add	%i0, 64, %i0
4177
4178copyin_seg4:
4179	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4180	FALIGN_D8
4181	ldda	[%l7]ASI_BLK_AIUS, %d0
4182	stda	%d48, [%i0]ASI_BLK_P
4183	add	%l7, 64, %l7
4184	subcc	%i3, 64, %i3
4185	bz,pn	%ncc, 0f
4186	add	%i0, 64, %i0
4187	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4188	FALIGN_D24
4189	ldda	[%l7]ASI_BLK_AIUS, %d16
4190	stda	%d48, [%i0]ASI_BLK_P
4191	add	%l7, 64, %l7
4192	subcc	%i3, 64, %i3
4193	bz,pn	%ncc, 1f
4194	add	%i0, 64, %i0
4195	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4196	FALIGN_D40
4197	ldda	[%l7]ASI_BLK_AIUS, %d32
4198	stda	%d48, [%i0]ASI_BLK_P
4199	add	%l7, 64, %l7
4200	subcc	%i3, 64, %i3
4201	bz,pn	%ncc, 2f
4202	add	%i0, 64, %i0
4203	ba,a,pt	%ncc, copyin_seg4
4204
42050:
4206	FALIGN_D24
4207	stda	%d48, [%i0]ASI_BLK_P
4208	add	%i0, 64, %i0
4209	membar	#Sync
4210	FALIGN_D40
4211	stda	%d48, [%i0]ASI_BLK_P
4212	ba,pt	%ncc, copyin_blkd8
4213	add	%i0, 64, %i0
4214
42151:
4216	FALIGN_D40
4217	stda	%d48, [%i0]ASI_BLK_P
4218	add	%i0, 64, %i0
4219	membar	#Sync
4220	FALIGN_D8
4221	stda	%d48, [%i0]ASI_BLK_P
4222	ba,pt	%ncc, copyin_blkd24
4223	add	%i0, 64, %i0
4224
42252:
4226	FALIGN_D8
4227	stda	%d48, [%i0]ASI_BLK_P
4228	add	%i0, 64, %i0
4229	membar	#Sync
4230	FALIGN_D24
4231	stda	%d48, [%i0]ASI_BLK_P
4232	ba,pt	%ncc, copyin_blkd40
4233	add	%i0, 64, %i0
4234
4235copyin_seg5:
4236	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4237	FALIGN_D10
4238	ldda	[%l7]ASI_BLK_AIUS, %d0
4239	stda	%d48, [%i0]ASI_BLK_P
4240	add	%l7, 64, %l7
4241	subcc	%i3, 64, %i3
4242	bz,pn	%ncc, 0f
4243	add	%i0, 64, %i0
4244	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4245	FALIGN_D26
4246	ldda	[%l7]ASI_BLK_AIUS, %d16
4247	stda	%d48, [%i0]ASI_BLK_P
4248	add	%l7, 64, %l7
4249	subcc	%i3, 64, %i3
4250	bz,pn	%ncc, 1f
4251	add	%i0, 64, %i0
4252	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4253	FALIGN_D42
4254	ldda	[%l7]ASI_BLK_AIUS, %d32
4255	stda	%d48, [%i0]ASI_BLK_P
4256	add	%l7, 64, %l7
4257	subcc	%i3, 64, %i3
4258	bz,pn	%ncc, 2f
4259	add	%i0, 64, %i0
4260	ba,a,pt	%ncc, copyin_seg5
4261
42620:
4263	FALIGN_D26
4264	stda	%d48, [%i0]ASI_BLK_P
4265	add	%i0, 64, %i0
4266	membar	#Sync
4267	FALIGN_D42
4268	stda	%d48, [%i0]ASI_BLK_P
4269	ba,pt	%ncc, copyin_blkd10
4270	add	%i0, 64, %i0
4271
42721:
4273	FALIGN_D42
4274	stda	%d48, [%i0]ASI_BLK_P
4275	add	%i0, 64, %i0
4276	membar	#Sync
4277	FALIGN_D10
4278	stda	%d48, [%i0]ASI_BLK_P
4279	ba,pt	%ncc, copyin_blkd26
4280	add	%i0, 64, %i0
4281
42822:
4283	FALIGN_D10
4284	stda	%d48, [%i0]ASI_BLK_P
4285	add	%i0, 64, %i0
4286	membar	#Sync
4287	FALIGN_D26
4288	stda	%d48, [%i0]ASI_BLK_P
4289	ba,pt	%ncc, copyin_blkd42
4290	add	%i0, 64, %i0
4291
4292copyin_seg6:
4293	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4294	FALIGN_D12
4295	ldda	[%l7]ASI_BLK_AIUS, %d0
4296	stda	%d48, [%i0]ASI_BLK_P
4297	add	%l7, 64, %l7
4298	subcc	%i3, 64, %i3
4299	bz,pn	%ncc, 0f
4300	add	%i0, 64, %i0
4301	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4302	FALIGN_D28
4303	ldda	[%l7]ASI_BLK_AIUS, %d16
4304	stda	%d48, [%i0]ASI_BLK_P
4305	add	%l7, 64, %l7
4306	subcc	%i3, 64, %i3
4307	bz,pn	%ncc, 1f
4308	add	%i0, 64, %i0
4309	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4310	FALIGN_D44
4311	ldda	[%l7]ASI_BLK_AIUS, %d32
4312	stda	%d48, [%i0]ASI_BLK_P
4313	add	%l7, 64, %l7
4314	subcc	%i3, 64, %i3
4315	bz,pn	%ncc, 2f
4316	add	%i0, 64, %i0
4317	ba,a,pt	%ncc, copyin_seg6
4318
43190:
4320	FALIGN_D28
4321	stda	%d48, [%i0]ASI_BLK_P
4322	add	%i0, 64, %i0
4323	membar	#Sync
4324	FALIGN_D44
4325	stda	%d48, [%i0]ASI_BLK_P
4326	ba,pt	%ncc, copyin_blkd12
4327	add	%i0, 64, %i0
4328
43291:
4330	FALIGN_D44
4331	stda	%d48, [%i0]ASI_BLK_P
4332	add	%i0, 64, %i0
4333	membar	#Sync
4334	FALIGN_D12
4335	stda	%d48, [%i0]ASI_BLK_P
4336	ba,pt	%ncc, copyin_blkd28
4337	add	%i0, 64, %i0
4338
43392:
4340	FALIGN_D12
4341	stda	%d48, [%i0]ASI_BLK_P
4342	add	%i0, 64, %i0
4343	membar	#Sync
4344	FALIGN_D28
4345	stda	%d48, [%i0]ASI_BLK_P
4346	ba,pt	%ncc, copyin_blkd44
4347	add	%i0, 64, %i0
4348
4349copyin_seg7:
4350	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4351	FALIGN_D14
4352	ldda	[%l7]ASI_BLK_AIUS, %d0
4353	stda	%d48, [%i0]ASI_BLK_P
4354	add	%l7, 64, %l7
4355	subcc	%i3, 64, %i3
4356	bz,pn	%ncc, 0f
4357	add	%i0, 64, %i0
4358	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4359	FALIGN_D30
4360	ldda	[%l7]ASI_BLK_AIUS, %d16
4361	stda	%d48, [%i0]ASI_BLK_P
4362	add	%l7, 64, %l7
4363	subcc	%i3, 64, %i3
4364	bz,pn	%ncc, 1f
4365	add	%i0, 64, %i0
4366	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4367	FALIGN_D46
4368	ldda	[%l7]ASI_BLK_AIUS, %d32
4369	stda	%d48, [%i0]ASI_BLK_P
4370	add	%l7, 64, %l7
4371	subcc	%i3, 64, %i3
4372	bz,pn	%ncc, 2f
4373	add	%i0, 64, %i0
4374	ba,a,pt	%ncc, copyin_seg7
4375
43760:
4377	FALIGN_D30
4378	stda	%d48, [%i0]ASI_BLK_P
4379	add	%i0, 64, %i0
4380	membar	#Sync
4381	FALIGN_D46
4382	stda	%d48, [%i0]ASI_BLK_P
4383	ba,pt	%ncc, copyin_blkd14
4384	add	%i0, 64, %i0
4385
43861:
4387	FALIGN_D46
4388	stda	%d48, [%i0]ASI_BLK_P
4389	add	%i0, 64, %i0
4390	membar	#Sync
4391	FALIGN_D14
4392	stda	%d48, [%i0]ASI_BLK_P
4393	ba,pt	%ncc, copyin_blkd30
4394	add	%i0, 64, %i0
4395
43962:
4397	FALIGN_D14
4398	stda	%d48, [%i0]ASI_BLK_P
4399	add	%i0, 64, %i0
4400	membar	#Sync
4401	FALIGN_D30
4402	stda	%d48, [%i0]ASI_BLK_P
4403	ba,pt	%ncc, copyin_blkd46
4404	add	%i0, 64, %i0
4405
4406
4407	!
4408	! dribble out the last partial block
4409	!
4410copyin_blkd0:
4411	subcc	%i4, 8, %i4
4412	blu,pn	%ncc, copyin_blkdone
4413	faligndata %d0, %d2, %d48
4414	std	%d48, [%i0]
4415	add	%i0, 8, %i0
4416copyin_blkd2:
4417	subcc	%i4, 8, %i4
4418	blu,pn	%ncc, copyin_blkdone
4419	faligndata %d2, %d4, %d48
4420	std	%d48, [%i0]
4421	add	%i0, 8, %i0
4422copyin_blkd4:
4423	subcc	%i4, 8, %i4
4424	blu,pn	%ncc, copyin_blkdone
4425	faligndata %d4, %d6, %d48
4426	std	%d48, [%i0]
4427	add	%i0, 8, %i0
4428copyin_blkd6:
4429	subcc	%i4, 8, %i4
4430	blu,pn	%ncc, copyin_blkdone
4431	faligndata %d6, %d8, %d48
4432	std	%d48, [%i0]
4433	add	%i0, 8, %i0
4434copyin_blkd8:
4435	subcc	%i4, 8, %i4
4436	blu,pn	%ncc, copyin_blkdone
4437	faligndata %d8, %d10, %d48
4438	std	%d48, [%i0]
4439	add	%i0, 8, %i0
4440copyin_blkd10:
4441	subcc	%i4, 8, %i4
4442	blu,pn	%ncc, copyin_blkdone
4443	faligndata %d10, %d12, %d48
4444	std	%d48, [%i0]
4445	add	%i0, 8, %i0
4446copyin_blkd12:
4447	subcc	%i4, 8, %i4
4448	blu,pn	%ncc, copyin_blkdone
4449	faligndata %d12, %d14, %d48
4450	std	%d48, [%i0]
4451	add	%i0, 8, %i0
4452copyin_blkd14:
4453	subcc	%i4, 8, %i4
4454	blu,pn	%ncc, copyin_blkdone
4455	fsrc1	%d14, %d0
4456	ba,a,pt	%ncc, copyin_blkleft
4457
4458copyin_blkd16:
4459	subcc	%i4, 8, %i4
4460	blu,pn	%ncc, copyin_blkdone
4461	faligndata %d16, %d18, %d48
4462	std	%d48, [%i0]
4463	add	%i0, 8, %i0
4464copyin_blkd18:
4465	subcc	%i4, 8, %i4
4466	blu,pn	%ncc, copyin_blkdone
4467	faligndata %d18, %d20, %d48
4468	std	%d48, [%i0]
4469	add	%i0, 8, %i0
4470copyin_blkd20:
4471	subcc	%i4, 8, %i4
4472	blu,pn	%ncc, copyin_blkdone
4473	faligndata %d20, %d22, %d48
4474	std	%d48, [%i0]
4475	add	%i0, 8, %i0
4476copyin_blkd22:
4477	subcc	%i4, 8, %i4
4478	blu,pn	%ncc, copyin_blkdone
4479	faligndata %d22, %d24, %d48
4480	std	%d48, [%i0]
4481	add	%i0, 8, %i0
4482copyin_blkd24:
4483	subcc	%i4, 8, %i4
4484	blu,pn	%ncc, copyin_blkdone
4485	faligndata %d24, %d26, %d48
4486	std	%d48, [%i0]
4487	add	%i0, 8, %i0
4488copyin_blkd26:
4489	subcc	%i4, 8, %i4
4490	blu,pn	%ncc, copyin_blkdone
4491	faligndata %d26, %d28, %d48
4492	std	%d48, [%i0]
4493	add	%i0, 8, %i0
4494copyin_blkd28:
4495	subcc	%i4, 8, %i4
4496	blu,pn	%ncc, copyin_blkdone
4497	faligndata %d28, %d30, %d48
4498	std	%d48, [%i0]
4499	add	%i0, 8, %i0
4500copyin_blkd30:
4501	subcc	%i4, 8, %i4
4502	blu,pn	%ncc, copyin_blkdone
4503	fsrc1	%d30, %d0
4504	ba,a,pt	%ncc, copyin_blkleft
4505copyin_blkd32:
4506	subcc	%i4, 8, %i4
4507	blu,pn	%ncc, copyin_blkdone
4508	faligndata %d32, %d34, %d48
4509	std	%d48, [%i0]
4510	add	%i0, 8, %i0
4511copyin_blkd34:
4512	subcc	%i4, 8, %i4
4513	blu,pn	%ncc, copyin_blkdone
4514	faligndata %d34, %d36, %d48
4515	std	%d48, [%i0]
4516	add	%i0, 8, %i0
4517copyin_blkd36:
4518	subcc	%i4, 8, %i4
4519	blu,pn	%ncc, copyin_blkdone
4520	faligndata %d36, %d38, %d48
4521	std	%d48, [%i0]
4522	add	%i0, 8, %i0
4523copyin_blkd38:
4524	subcc	%i4, 8, %i4
4525	blu,pn	%ncc, copyin_blkdone
4526	faligndata %d38, %d40, %d48
4527	std	%d48, [%i0]
4528	add	%i0, 8, %i0
4529copyin_blkd40:
4530	subcc	%i4, 8, %i4
4531	blu,pn	%ncc, copyin_blkdone
4532	faligndata %d40, %d42, %d48
4533	std	%d48, [%i0]
4534	add	%i0, 8, %i0
4535copyin_blkd42:
4536	subcc	%i4, 8, %i4
4537	blu,pn	%ncc, copyin_blkdone
4538	faligndata %d42, %d44, %d48
4539	std	%d48, [%i0]
4540	add	%i0, 8, %i0
4541copyin_blkd44:
4542	subcc	%i4, 8, %i4
4543	blu,pn	%ncc, copyin_blkdone
4544	faligndata %d44, %d46, %d48
4545	std	%d48, [%i0]
4546	add	%i0, 8, %i0
4547copyin_blkd46:
4548	subcc	%i4, 8, %i4
4549	blu,pn	%ncc, copyin_blkdone
4550	fsrc1	%d46, %d0
4551
4552copyin_blkleft:
45531:
4554	ldda	[%l7]ASI_USER, %d2
4555	add	%l7, 8, %l7
4556	subcc	%i4, 8, %i4
4557	faligndata %d0, %d2, %d8
4558	std	%d8, [%i0]
4559	blu,pn	%ncc, copyin_blkdone
4560	add	%i0, 8, %i0
4561	ldda	[%l7]ASI_USER, %d0
4562	add	%l7, 8, %l7
4563	subcc	%i4, 8, %i4
4564	faligndata %d2, %d0, %d8
4565	std	%d8, [%i0]
4566	bgeu,pt	%ncc, 1b
4567	add	%i0, 8, %i0
4568
4569copyin_blkdone:
4570	tst	%i2
4571	bz,pt	%ncc, .copyin_exit
4572	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0
4573
45747:	lduba	[%i1]ASI_USER, %i4
4575	inc	%i1
4576	inc	%i0
4577	deccc	%i2
4578	bgu	%ncc, 7b
4579	  stb	  %i4, [%i0 - 1]
4580
4581.copyin_exit:
4582	membar	#StoreLoad|#StoreStore
4583	btst	FPUSED_FLAG, SAVED_LOFAULT
4584	bz	%icc, 1f
4585	  nop
4586
4587	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
4588	wr	%o2, 0, %gsr
4589
4590	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
4591	btst	FPRS_FEF, %o3
4592	bz	%icc, 4f
4593	  nop
4594
4595	! restore fpregs from stack
4596	membar	#Sync
4597	add	%fp, STACK_BIAS - 257, %o2
4598	and	%o2, -64, %o2
4599	ldda	[%o2]ASI_BLK_P, %d0
4600	add	%o2, 64, %o2
4601	ldda	[%o2]ASI_BLK_P, %d16
4602	add	%o2, 64, %o2
4603	ldda	[%o2]ASI_BLK_P, %d32
4604	add	%o2, 64, %o2
4605	ldda	[%o2]ASI_BLK_P, %d48
4606	membar	#Sync
4607
4608	ba,pt	%ncc, 1f
4609	  wr	%o3, 0, %fprs		! restore fprs
4610
46114:
4612	FZERO				! zero all of the fpregs
4613	wr	%o3, 0, %fprs		! restore fprs
4614
46151:
4616	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4617	membar	#Sync				! sync error barrier
4618	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4619	ret
4620	restore	%g0, 0, %o0
4621.copyin_err:
4622	ldn	[THREAD_REG + T_COPYOPS], %o4
4623	brz	%o4, 2f
4624	nop
4625	ldn	[%o4 + CP_COPYIN], %g2
4626	jmp	%g2
4627	nop
46282:
4629	retl
4630	mov	-1, %o0
4631	SET_SIZE(copyin)
4632
4633#endif	/* lint */
4634
4635#ifdef	lint
4636
4637/*ARGSUSED*/
4638int
4639xcopyin(const void *uaddr, void *kaddr, size_t count)
4640{ return (0); }
4641
4642#else	/* lint */
4643
/*
 * xcopyin(uaddr, kaddr, count)
 *
 * Thin entry point onto the common copyin path: it builds the address
 * of .xcopyin_err in REAL_LOFAULT and branches to .do_copyin, which is
 * defined with copyin earlier in this file and performs the copy.
 *
 * .xcopyin_err is the error exit.  It loads the T_COPYOPS pointer of the
 * current thread; if that pointer is non-NULL it jumps to the routine
 * stored at CP_XCOPYIN in that vector (control does not come back
 * here), otherwise it returns the contents of %g1 in %o0.
 *
 * NOTE(review): .xcopyin_err is presumably entered through REAL_LOFAULT
 * when the copy faults, with an errno value already in %g1.  Neither is
 * visible in this block -- confirm against .do_copyin and the trap code.
 */
4644	ENTRY(xcopyin)
4645	sethi	%hi(.xcopyin_err), REAL_LOFAULT
4646	b	.do_copyin
	! delay slot: finish forming the handler address before .do_copyin runs
4647	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
4648.xcopyin_err:
4649	ldn	[THREAD_REG + T_COPYOPS], %o4	! %o4 = thread copyops vector
4650	brz	%o4, 2f				! none installed: return %g1
4651	nop
4652	ldn	[%o4 + CP_XCOPYIN], %g2		! %g2 = copyops xcopyin routine
4653	jmp	%g2				! hand the request over to it
4654	nop
46552:
4656	retl
4657	mov	%g1, %o0			! delay slot: return value = %g1
4658	SET_SIZE(xcopyin)
4659
4660#endif	/* lint */
4661
4662#ifdef	lint
4663
4664/*ARGSUSED*/
4665int
4666xcopyin_little(const void *uaddr, void *kaddr, size_t count)
4667{ return (0); }
4668
4669#else	/* lint */
4670
/*
 * xcopyin_little(uaddr, kaddr, count)
 *
 * Byte-at-a-time copy of count bytes from uaddr (read through
 * ASI_AIUSL) to kaddr that reverses the byte order: the source is read
 * from its last byte down to its first while the destination is filled
 * from its first byte up, i.e. kaddr[i] = uaddr[count - 1 - i].
 *
 * .little_err is installed in t_lofault of the current thread for the
 * duration of the copy.  The previous t_lofault is held in %o5 and is
 * written back on both exits.
 *
 * Returns 0 on the normal exit (also taken when count is 0).  The
 * .little_err exit returns whatever is in %g1.
 * NOTE(review): %g1 presumably holds the errno supplied by the fault
 * path -- that code is not visible here, confirm.
 *
 * Register use:
 *	%o0 - biased source pointer (uaddr + 2*count - 1, then -2 per byte)
 *	%o1 - kaddr + count
 *	%o3 - index running from -count up to 0
 *	%o4 - handler address, then the byte being moved
 *	%o5 - saved t_lofault
 */
4671	ENTRY(xcopyin_little)
4672	sethi	%hi(.little_err), %o4
4673	ldn	[THREAD_REG + T_LOFAULT], %o5
4674	or	%o4, %lo(.little_err), %o4
4675	membar	#Sync				! sync error barrier
4676	stn	%o4, [THREAD_REG + T_LOFAULT]
4677
	! %o3 = -count; the condition codes from this subcc drive the bz below.
4678	subcc	%g0, %o2, %o3
4679	add	%o0, %o2, %o0
4680	bz,pn	%ncc, 2f		! check for zero bytes
4681	sub	%o2, 1, %o4
	! %o0 + %o3 now addresses uaddr[count - 1]; %o1 + %o3 addresses kaddr[0].
4682	add	%o0, %o4, %o0		! start w/last byte
4683	add	%o1, %o2, %o1
4684	lduba	[%o0+%o3]ASI_AIUSL, %o4
4685
	! Each pass stores one byte and bumps %o3 by one.  Dropping %o0 by two
	! makes %o0 + %o3 step back one source byte while %o1 + %o3 steps
	! forward one destination byte.  inccc sets carry when %o3 wraps to
	! zero, which ends the loop; the load in the annulled delay slot runs
	! only when the branch is taken.
46861:	stb	%o4, [%o1+%o3]
4687	inccc	%o3
4688	sub	%o0, 2, %o0		! get next byte
4689	bcc,a,pt %ncc, 1b
4690	  lduba	[%o0+%o3]ASI_AIUSL, %o4
4691
46922:	membar	#Sync				! sync error barrier
4693	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4694	retl
4695	mov	%g0, %o0		! return (0)
4696
	! Error exit: put the old handler back and return the value in %g1.
4697.little_err:
4698	membar	#Sync				! sync error barrier
4699	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4700	retl
4701	mov	%g1, %o0
4702	SET_SIZE(xcopyin_little)
4703
4704#endif	/* lint */
4705
4706
4707/*
4708 * Copy a block of storage - must not overlap (from + len <= to).
4709 * No fault handler installed (to be called under on_fault())
4710 */
4711#if defined(lint)
4712
4713/* ARGSUSED */
4714void
4715copyin_noerr(const void *ufrom, void *kto, size_t count)
4716{}
4717
4718#else	/* lint */
4719
/*
 * copyin_noerr(ufrom, kto, count)
 *
 * Entry point onto the common copyin path: builds the address of
 * .copyio_noerr in REAL_LOFAULT and branches to .do_copyin (defined
 * with copyin earlier in this file), which performs the copy.
 *
 * .copyio_noerr does no cleanup and returns no value of its own; it
 * jumps straight to the address held in SAVED_LOFAULT.  The label is
 * shared: copyout_noerr loads the same address into REAL_LOFAULT.
 *
 * NOTE(review): SAVED_LOFAULT is the register the copyin exits write
 * back to t_lofault, so this presumably forwards a fault to the handler
 * the caller had installed (e.g. via on_fault()).  How .copyio_noerr is
 * reached, and whether flag bits have been stripped from SAVED_LOFAULT
 * by then, is not visible in this block -- confirm against the common
 * fault handler.
 */
4720	ENTRY(copyin_noerr)
4721	sethi	%hi(.copyio_noerr), REAL_LOFAULT
4722	b	.do_copyin
	! delay slot: finish forming the handler address before .do_copyin runs
4723	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
4724.copyio_noerr:
4725	jmp	SAVED_LOFAULT		! continue at the saved handler address
4726	  nop
4727	SET_SIZE(copyin_noerr)
4728
4729#endif /* lint */
4730
4731/*
4732 * Copy a block of storage - must not overlap (from + len <= to).
4733 * No fault handler installed (to be called under on_fault())
4734 */
4735
4736#if defined(lint)
4737
4738/* ARGSUSED */
4739void
4740copyout_noerr(const void *kfrom, void *uto, size_t count)
4741{}
4742
4743#else	/* lint */
4744
/*
 * copyout_noerr(kfrom, uto, count)
 *
 * Entry point onto the common copyout path: builds the address of
 * .copyio_noerr in REAL_LOFAULT and branches to .do_copyout, which is
 * defined elsewhere in this file and performs the copy.
 *
 * .copyio_noerr itself lives inside copyin_noerr; it is a single jump
 * to the address held in SAVED_LOFAULT.
 *
 * NOTE(review): presumably .do_copyout uses REAL_LOFAULT as the address
 * to continue at when the copy faults -- not visible here, confirm.
 */
4745	ENTRY(copyout_noerr)
4746	sethi	%hi(.copyio_noerr), REAL_LOFAULT
4747	b	.do_copyout
	! delay slot: finish forming the handler address before .do_copyout runs
4748	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
4749	SET_SIZE(copyout_noerr)
4750
4751#endif /* lint */
4752
4753#if defined(lint)
4754
/*
 * Lint-visible declarations of the tunables below; the initial values
 * match the .word values in the assembler version.
 */
4755int use_hw_bcopy = 1;
4756int use_hw_copyio = 1;
4757int use_hw_bzero = 1;
4758uint_t hw_copy_limit_1 = 0;
4759uint_t hw_copy_limit_2 = 0;
4760uint_t hw_copy_limit_4 = 0;
4761uint_t hw_copy_limit_8 = 0;
4762
4763#else /* !lint */
4764
/*
 * Tunables for the hardware-assisted (block load/store) routines in this
 * file.  Each is a single 32-bit word.
 *
 * use_hw_bzero
 *	hwblkclr does not test this itself; its header makes the caller
 *	responsible for checking it.
 * use_hw_bcopy, use_hw_copyio
 *	NOTE(review): presumably gate the HW path for bcopy and for
 *	copyin/copyout; the code that reads them is outside this part of
 *	the file -- confirm.
 * hw_copy_limit_1, _2, _4, _8
 *	Read by the copyin dispatch code for 1-, 2-, 4- and 8-byte aligned
 *	requests that exceed VIS_COPY_THRESHOLD.  A value of zero sends
 *	the request to the non-HW copy loops; otherwise only requests
 *	larger than the limit go to .big_copyin.
 *	NOTE(review): all four are zero here, so they are presumably
 *	given real values at startup by code elsewhere -- confirm.
 *
 * The .align 64 is issued before the .section directive that switches
 * back to ".text".
 * NOTE(review): this implies DGDEF leaves the assembler in a data
 * section; the macro is not defined in this file -- confirm.
 */
4765	.align	4
4766	DGDEF(use_hw_bcopy)
4767	.word	1
4768	DGDEF(use_hw_copyio)
4769	.word	1
4770	DGDEF(use_hw_bzero)
4771	.word	1
4772	DGDEF(hw_copy_limit_1)
4773	.word	0
4774	DGDEF(hw_copy_limit_2)
4775	.word	0
4776	DGDEF(hw_copy_limit_4)
4777	.word	0
4778	DGDEF(hw_copy_limit_8)
4779	.word	0
4780
4781	.align	64
4782	.section ".text"
4783#endif /* !lint */
4784
4785
4786/*
4787 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
4788 * at least 256 bytes in length using spitfire's block stores.  If
4789 * the criteria for using this routine are not met then it calls bzero
4790 * and returns 1.  Otherwise 0 is returned indicating success.
4791 * Caller is responsible for ensuring use_hw_bzero is true and that
4792 * kpreempt_disable() has been called.
4793 */
4794#ifdef lint
4795/*ARGSUSED*/
4796int
4797hwblkclr(void *addr, size_t len)
4798{
4799	return(0);
4800}
4801#else /* lint */
	!
	! int hwblkclr(void *addr, size_t len)
	!
	! Zeroes len bytes at addr with 64-byte block stores of %d0-%d14.
	! Returns 0 when the block-store path was used.  When addr is not
	! 64-byte aligned, len is below 256, or len is not a multiple of 64,
	! it calls bzero(addr, len) instead and returns 1.
	!
	! If FPRS_FEF is set on entry, the one block of fp registers that is
	! overwritten (%d0-%d14) is saved to the stack frame first and
	! reloaded at the end; %fprs is restored in either case.
	!
	! Register use after the save:
	!
4802	! %i0 - start address
4803	! %i1 - length of region (multiple of 64)
	! %i2 - byte offset back from .pz_zinst for the computed jump
	! %i3 - bytes cleared by the current pass (256, or 64/128/192)
	! %i4 - computed jump target
4804	! %l0 - saved fprs
4805	! %l1 - pointer to saved %d0 block
4806	! %l2 - not referenced anywhere in this routine
4807
4808	ENTRY(hwblkclr)
4809	! get another window w/space for one aligned block of saved fpregs
4810	save	%sp, -SA(MINFRAME + 2*64), %sp
4811
4812	! Must be block-aligned
4813	andcc	%i0, (64-1), %g0
4814	bnz,pn	%ncc, 1f
4815	  nop
4816
4817	! ... and must be 256 bytes or more
4818	cmp	%i1, 256
4819	blu,pn	%ncc, 1f
4820	  nop
4821
4822	! ... and length must be a multiple of 64
4823	andcc	%i1, (64-1), %g0
4824	bz,pn	%ncc, 2f
4825	  nop
4826
48271:	! punt, call bzero but notify the caller that bzero was used
4828	mov	%i0, %o0
4829	call	bzero
4830	  mov	%i1, %o1
4831	ret
4832	restore	%g0, 1, %o0	! return (1) - did not use block operations
4833
	! All three criteria met.  The "1f" below refers to the second local
	! label 1, just past the fp save, not to the bzero punt above.
48342:	rd	%fprs, %l0		! check for unused fp
4835	btst	FPRS_FEF, %l0
4836	bz	1f
4837	  nop
4838
4839	! save in-use fpregs on stack
	! %l1 = a 64-byte aligned address below %fp + STACK_BIAS; the frame was
	! sized with 2*64 spare bytes so an aligned block fits.
4840	membar	#Sync
4841	add	%fp, STACK_BIAS - 65, %l1
4842	and	%l1, -64, %l1
4843	stda	%d0, [%l1]ASI_BLK_P
4844
48451:	membar	#StoreStore|#StoreLoad|#LoadStore
4846	wr	%g0, FPRS_FEF, %fprs
4847	wr	%g0, ASI_BLK_P, %asi
4848
4849	! Clear block
4850	fzero	%d0
4851	fzero	%d2
4852	fzero	%d4
4853	fzero	%d6
4854	fzero	%d8
4855	fzero	%d10
4856	fzero	%d12
4857	fzero	%d14
4858
	! %i3 = 256 is the amount .pz_zinst steps by after each full pass.
4859	mov	256, %i3
4860	ba	.pz_doblock
4861	  nop
4862
	! Full pass: four block stores clear 256 bytes.  The store to offset
	! 192 has already been done in the annulled delay slot of the branch
	! at .pz_doblock.
4863.pz_blkstart:
4864      ! stda	%d0, [%i0+192]%asi  ! in dly slot of branch that got us here
4865	stda	%d0, [%i0+128]%asi
4866	stda	%d0, [%i0+64]%asi
4867	stda	%d0, [%i0]%asi
	! Advance past what was just cleared (%i3 bytes).
4868.pz_zinst:
4869	add	%i0, %i3, %i0
4870	sub	%i1, %i3, %i1
4871.pz_doblock:
4872	cmp	%i1, 256
4873	bgeu,a	%ncc, .pz_blkstart
4874	  stda	%d0, [%i0+192]%asi
4875
	! Fewer than 256 bytes remain.  The andn below sits in the delay slot
	! of the blu and therefore also executes when the branch is taken;
	! that is harmless because .pz_finish does not read %i3.
4876	cmp	%i1, 64
4877	blu	%ncc, .pz_finish
4878
	! 64, 128 or 192 bytes remain: %i3 = that amount and %i2 = %i3 / 16,
	! i.e. 4 bytes (one instruction) per 64-byte block.  Jumping to
	! .pz_zinst - %i2 enters the stda sequence above so that exactly
	! %i3 / 64 block stores run before .pz_zinst advances by %i3.
4879	andn	%i1, (64-1), %i3
4880	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
4881	set	.pz_zinst, %i4
4882	sub	%i4, %i2, %i4
4883	jmp	%i4
4884	  nop
4885
	! Done.  If fp was not in use on entry, just put %fprs back (annulled
	! delay slot, runs only when the branch is taken); otherwise reload
	! the saved block first.
4886.pz_finish:
4887	membar	#Sync
4888	btst	FPRS_FEF, %l0
4889	bz,a	.pz_finished
4890	  wr	%l0, 0, %fprs		! restore fprs
4891
4892	! restore fpregs from stack
4893	ldda	[%l1]ASI_BLK_P, %d0
4894	membar	#Sync
4895	wr	%l0, 0, %fprs		! restore fprs
4896
4897.pz_finished:
4898	ret
4899	restore	%g0, 0, %o0		! return (bzero or not)
4900	SET_SIZE(hwblkclr)
4901#endif	/* lint */
4902
4903#ifdef	lint
4904/* Copy 32 bytes of data from src to dst using physical addresses */
4905/*ARGSUSED*/
4906void
4907hw_pa_bcopy32(uint64_t src, uint64_t dst)
4908{}
4909#else	/*!lint */
4910
4911	/*
4912	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
4913	 * using physical addresses.
4914	 */
	/*
	 * The copy is four 8-byte ldxa loads followed by four 8-byte stxa
	 * stores, all through ASI_MEM; every load is issued before the
	 * first store.  PSTATE_IE is cleared in %pstate for the duration
	 * and the entry value of %pstate is written back in the delay slot
	 * of the return.
	 *
	 * Leaf routine (no save/restore).  Registers written: %o0 and %o1
	 * (each left advanced by 24), %o2-%o5 (the data), %g1 (entry
	 * %pstate) and %g2 (%pstate with PSTATE_IE cleared).
	 */
4915	ENTRY_NP(hw_pa_bcopy32)
4916	rdpr    %pstate, %g1
4917	andn    %g1, PSTATE_IE, %g2
4918	wrpr    %g0, %g2, %pstate
4919
	! load src[0..31] into %o2-%o5
4920	ldxa    [%o0]ASI_MEM, %o2
4921	add     %o0, 8, %o0
4922	ldxa    [%o0]ASI_MEM, %o3
4923	add     %o0, 8, %o0
4924	ldxa    [%o0]ASI_MEM, %o4
4925	add     %o0, 8, %o0
4926	ldxa    [%o0]ASI_MEM, %o5
	! store %o2-%o5 to dst[0..31]
4927	stxa    %o2, [%o1]ASI_MEM
4928	add     %o1, 8, %o1
4929	stxa    %o3, [%o1]ASI_MEM
4930	add     %o1, 8, %o1
4931	stxa    %o4, [%o1]ASI_MEM
4932	add     %o1, 8, %o1
4933	stxa    %o5, [%o1]ASI_MEM
4934
4935	membar	#Sync
4936	retl
	! delay slot: put back the %pstate value read on entry
4937	  wrpr    %g0, %g1, %pstate
4938	SET_SIZE(hw_pa_bcopy32)
4939#endif /* lint */
4940