xref: /illumos-gate/usr/src/uts/sun4u/cpu/spitfire_copy.S (revision 784279176e68a516c9e391eb98dda7bd543fa6dd)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#include <sys/param.h>
28#include <sys/errno.h>
29#include <sys/asm_linkage.h>
30#include <sys/vtrace.h>
31#include <sys/machthread.h>
32#include <sys/clock.h>
33#include <sys/asi.h>
34#include <sys/fsr.h>
35#include <sys/privregs.h>
36
37#include "assym.h"
38
39
40/*
41 * Pseudo-code to aid in understanding the control flow of the
42 * bcopy routine.
43 *
44 * On entry to bcopy:
45 *
46 *	%l6 = curthread->t_lofault;
47 *	used_block_copy = FALSE;			! %l6 |= 1
48 *	if (%l6 != NULL) {
49 *		curthread->t_lofault = .copyerr;
50 *		caller_error_handler = TRUE		! %l6 |= 2
51 *	}
52 *
53 * 	if (length < VIS_COPY)
54 * 		goto regular_copy;
55 *
56 * 	if (!use_vis)
57 * 		goto regular_copy;
58 *
59 * 	if (curthread->t_lwp == NULL) {
60 *		! Kernel threads do not have pcb's in which to store
61 *		! the floating point state, disallow preemption during
62 *		! the copy.
63 * 		kpreempt_disable(curthread);
64 *	}
65 *
66 * 	old_fprs = %fprs;
67 * 	old_gsr = %gsr;
68 * 	if (%fprs.fef) {
69 *              ! If we need to save 4 blocks of fpregs then make sure
70 *		! the length is still appropriate for that extra overhead.
71 * 		if (length < (large_length + (64 * 4))) {
72 * 			if (curthread->t_lwp == NULL)
73 * 				kpreempt_enable(curthread);
74 * 			goto regular_copy;
75 * 		}
76 * 		%fprs.fef = 1;
77 * 		save current fpregs on stack using blockstore
78 * 	} else {
79 * 		%fprs.fef = 1;
80 * 	}
81 *
82 * 	used_block_copy = 1;				! %l6 |= 1
83 * 	do_blockcopy_here;
84 *
85 * In lofault handler:
86 *	curthread->t_lofault = .copyerr2;
87 *	Continue on with the normal exit handler
88 *
89 * On exit:
90 *	call_kpreempt = 0;
91 * 	if (used_block_copy) {				! %l6 & 1
92 * 		%gsr = old_gsr;
93 * 		if (old_fprs & FPRS_FEF)
94 * 			restore fpregs from stack using blockload
95 *		else
96 *			zero fpregs
97 * 		%fprs = old_fprs;
98 * 		if (curthread->t_lwp == NULL) {
99 *			kpreempt_enable(curthread);
100 *			call_kpreempt = 1;
101 *		}
102 * 	}
103 * 	curthread->t_lofault = (%l6 & ~3);
104 *	if (call_kpreempt)
105 *		kpreempt(%pil);
106 * 	return (0)
107 *
108 * In second lofault handler (.copyerr2):
109 *	We've tried to restore fp state from the stack and failed.  To
110 *	prevent from returning with a corrupted fp state, we will panic.
111 */
112
113/*
114 * Notes on preserving existing fp state:
115 *
116 * When a copyOP decides to use fp we may have to preserve existing
117 * floating point state.  It is not the caller's state that we need to
118 * preserve - the rest of the kernel does not use fp and, anyway, fp
119 * registers are volatile across a call.  Some examples:
120 *
121 *	- userland has fp state and is interrupted (device interrupt
122 *	  or trap) and within the interrupt/trap handling we use
123 *	  bcopy()
124 *	- another (higher level) interrupt or trap handler uses bcopy
125 *	  while a bcopy from an earlier interrupt is still active
126 *	- an asynchronous error trap occurs while fp state exists (in
127 *	  userland or in kernel copy) and the tl0 component of the handling
128 *	  uses bcopy
129 *	- a user process with fp state incurs a copy-on-write fault and
130 *	  hwblkpagecopy always uses fp
131 *
132 * We therefore need a per-call place in which to preserve fp state -
133 * using our stack is ideal (and since fp copy cannot be leaf optimized
134 * because of calls it makes, this is no hardship).
135 *
136 * To make sure that floating point state is always saved and restored
137 * correctly, the following "big rules" must be followed when the floating
138 * point registers will be used:
139 *
140 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
141 *    FPUSED_FLAG (value 1) indicates that the floating point registers are in
142 *    use.  BCOPY_FLAG (value 2) indicates that the call was to bcopy.
143 *
144 * 2. The FPUSED flag indicates that all FP state has been successfully stored
145 *    on the stack.  It should not be set until this save has been completed.
146 *
147 * 3. The FPUSED flag should not be cleared on exit until all FP state has
148 *    been restored from the stack.  If an error occurs while restoring
149 *    data from the stack, the error handler can check this flag to see if
150 *    a restore is necessary.
151 *
152 * 4. Code run under the new lofault handler must be kept to a minimum.  In
153 *    particular, any calls to kpreempt() should not be made until after the
154 *    lofault handler has been restored.
155 */
156
157/*
158 * This shadows sys/machsystm.h which can't be included due to the lack of
159 * _ASM guards in include files it references. Change it here, change it there.
 *
 * .do_copy compares the byte count against this value and branches to
 * .bcb_punt (bypassing the VIS block-copy path) when the count is smaller.
 * When the fp registers are already in use, the count is compared against
 * VIS_COPY_THRESHOLD + (64 * 4) instead.
160 */
161#define VIS_COPY_THRESHOLD 900
162
163/*
164 * Lengths less than or equal to this number of bytes are always copied
 * byte-for-byte.
 *
 * NOTE(review): the small-count test visible in .do_copy compares against
 * a literal 12 rather than this symbol; confirm where SMALL_LIMIT is used.
165 */
166#define	SMALL_LIMIT	7
167
168/*
169 * Flags set in the lower bits of the t_lofault address:
170 * FPUSED_FLAG: The FP registers were in use and must be restored
171 * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
172 * COPY_FLAGS: Both of the above
173 *
174 * Other flags:
175 * KPREEMPT_FLAG: kpreempt needs to be called
 *
 * FPUSED_FLAG and BCOPY_FLAG travel in %l6 together with the saved handler
 * address; the error path strips them with "andn %l6, COPY_FLAGS, %l6"
 * before writing the address back to t_lofault.  KPREEMPT_FLAG is not kept
 * in %l6: .copyerr accumulates it in %l1 next to its copy of BCOPY_FLAG.
176 */
177#define	FPUSED_FLAG	1
178#define BCOPY_FLAG	2
179#define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
180#define	KPREEMPT_FLAG	4
181
182/*
183 * Size of stack frame in order to accommodate a 64-byte aligned
184 * floating-point register save area and 2 32-bit temp locations.
 *
 * The register save area itself is four 64-byte blocks (256 bytes).  Five
 * blocks are reserved because the code locates the area by rounding
 * (%fp + STACK_BIAS - 257) down to a 64-byte boundary, which can place its
 * start anywhere from 257 to 320 bytes below %fp + STACK_BIAS.
 *
 * The SAVED_*_OFFSET values are subtracted from %fp + STACK_BIAS.  The
 * saved %fprs and %gsr are each stored as one 32-bit word, immediately
 * below the 64 * 5 bytes set aside for the register blocks.
185 */
186#define	HWCOPYFRAMESIZE	((64 * 5) + (2 * 4))
187
188#define SAVED_FPREGS_OFFSET	(64 * 5)
189#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 4)
190#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 4)
191
192/*
193 * Common macros used by the various versions of the block copy
194 * routines in this file.
195 */
196
/*
 * FZERO: clear every double-precision fp register, %f0 through %f62.
 * %f0 and %f2 are zeroed directly with fzero; each remaining register is
 * then written with %f0 + %f2 or %f0 * %f2, both of which are zero.
 * The faddd/fmuld alternation is presumably an instruction scheduling
 * choice (TODO confirm).
 *
 * .copyerr expands this when the %fprs value saved on entry did not have
 * FPRS_FEF set, i.e. there is no saved register image to reload.
 */
197#define	FZERO				\
198	fzero	%f0			;\
199	fzero	%f2			;\
200	faddd	%f0, %f2, %f4		;\
201	fmuld	%f0, %f2, %f6		;\
202	faddd	%f0, %f2, %f8		;\
203	fmuld	%f0, %f2, %f10		;\
204	faddd	%f0, %f2, %f12		;\
205	fmuld	%f0, %f2, %f14		;\
206	faddd	%f0, %f2, %f16		;\
207	fmuld	%f0, %f2, %f18		;\
208	faddd	%f0, %f2, %f20		;\
209	fmuld	%f0, %f2, %f22		;\
210	faddd	%f0, %f2, %f24		;\
211	fmuld	%f0, %f2, %f26		;\
212	faddd	%f0, %f2, %f28		;\
213	fmuld	%f0, %f2, %f30		;\
214	faddd	%f0, %f2, %f32		;\
215	fmuld	%f0, %f2, %f34		;\
216	faddd	%f0, %f2, %f36		;\
217	fmuld	%f0, %f2, %f38		;\
218	faddd	%f0, %f2, %f40		;\
219	fmuld	%f0, %f2, %f42		;\
220	faddd	%f0, %f2, %f44		;\
221	fmuld	%f0, %f2, %f46		;\
222	faddd	%f0, %f2, %f48		;\
223	fmuld	%f0, %f2, %f50		;\
224	faddd	%f0, %f2, %f52		;\
225	fmuld	%f0, %f2, %f54		;\
226	faddd	%f0, %f2, %f56		;\
227	fmuld	%f0, %f2, %f58		;\
228	faddd	%f0, %f2, %f60		;\
229	fmuld	%f0, %f2, %f62
230
231
/*
 * FALIGN_D0, FALIGN_D16, FALIGN_D32: each fills the eight output
 * doublewords %d48 - %d62 by applying faligndata to the eight adjacent
 * pairs formed from nine consecutive source doublewords, beginning with
 * the register named in the macro.  Source registers are %d0 - %d46;
 * a macro that runs past %d46 continues at %d0.  The byte alignment used
 * by faligndata comes from %gsr, which the caller sets up with alignaddr.
 * This trio is the one expanded at label seg0.
 */
232#define	FALIGN_D0			\
233	faligndata %d0, %d2, %d48	;\
234	faligndata %d2, %d4, %d50	;\
235	faligndata %d4, %d6, %d52	;\
236	faligndata %d6, %d8, %d54	;\
237	faligndata %d8, %d10, %d56	;\
238	faligndata %d10, %d12, %d58	;\
239	faligndata %d12, %d14, %d60	;\
240	faligndata %d14, %d16, %d62
241
242#define	FALIGN_D16			\
243	faligndata %d16, %d18, %d48	;\
244	faligndata %d18, %d20, %d50	;\
245	faligndata %d20, %d22, %d52	;\
246	faligndata %d22, %d24, %d54	;\
247	faligndata %d24, %d26, %d56	;\
248	faligndata %d26, %d28, %d58	;\
249	faligndata %d28, %d30, %d60	;\
250	faligndata %d30, %d32, %d62
251
252#define	FALIGN_D32			\
253	faligndata %d32, %d34, %d48	;\
254	faligndata %d34, %d36, %d50	;\
255	faligndata %d36, %d38, %d52	;\
256	faligndata %d38, %d40, %d54	;\
257	faligndata %d40, %d42, %d56	;\
258	faligndata %d42, %d44, %d58	;\
259	faligndata %d44, %d46, %d60	;\
260	faligndata %d46, %d0, %d62
261
/*
 * FALIGN_D2, FALIGN_D18, FALIGN_D34: each fills %d48 - %d62 by applying
 * faligndata to the eight adjacent pairs formed from nine consecutive
 * source doublewords, beginning with the register named in the macro.
 * Source registers are %d0 - %d46; a macro that runs past %d46 continues
 * at %d0.  This trio is the one expanded at label seg1.
 */
262#define	FALIGN_D2			\
263	faligndata %d2, %d4, %d48	;\
264	faligndata %d4, %d6, %d50	;\
265	faligndata %d6, %d8, %d52	;\
266	faligndata %d8, %d10, %d54	;\
267	faligndata %d10, %d12, %d56	;\
268	faligndata %d12, %d14, %d58	;\
269	faligndata %d14, %d16, %d60	;\
270	faligndata %d16, %d18, %d62
271
272#define	FALIGN_D18			\
273	faligndata %d18, %d20, %d48	;\
274	faligndata %d20, %d22, %d50	;\
275	faligndata %d22, %d24, %d52	;\
276	faligndata %d24, %d26, %d54	;\
277	faligndata %d26, %d28, %d56	;\
278	faligndata %d28, %d30, %d58	;\
279	faligndata %d30, %d32, %d60	;\
280	faligndata %d32, %d34, %d62
281
282#define	FALIGN_D34			\
283	faligndata %d34, %d36, %d48	;\
284	faligndata %d36, %d38, %d50	;\
285	faligndata %d38, %d40, %d52	;\
286	faligndata %d40, %d42, %d54	;\
287	faligndata %d42, %d44, %d56	;\
288	faligndata %d44, %d46, %d58	;\
289	faligndata %d46, %d0, %d60	;\
290	faligndata %d0, %d2, %d62
291
/*
 * FALIGN_D4, FALIGN_D20, FALIGN_D36: each fills %d48 - %d62 by applying
 * faligndata to the eight adjacent pairs formed from nine consecutive
 * source doublewords, beginning with the register named in the macro.
 * Source registers are %d0 - %d46; a macro that runs past %d46 continues
 * at %d0.  This trio is the one expanded at label seg2.
 */
292#define	FALIGN_D4			\
293	faligndata %d4, %d6, %d48	;\
294	faligndata %d6, %d8, %d50	;\
295	faligndata %d8, %d10, %d52	;\
296	faligndata %d10, %d12, %d54	;\
297	faligndata %d12, %d14, %d56	;\
298	faligndata %d14, %d16, %d58	;\
299	faligndata %d16, %d18, %d60	;\
300	faligndata %d18, %d20, %d62
301
302#define	FALIGN_D20			\
303	faligndata %d20, %d22, %d48	;\
304	faligndata %d22, %d24, %d50	;\
305	faligndata %d24, %d26, %d52	;\
306	faligndata %d26, %d28, %d54	;\
307	faligndata %d28, %d30, %d56	;\
308	faligndata %d30, %d32, %d58	;\
309	faligndata %d32, %d34, %d60	;\
310	faligndata %d34, %d36, %d62
311
312#define	FALIGN_D36			\
313	faligndata %d36, %d38, %d48	;\
314	faligndata %d38, %d40, %d50	;\
315	faligndata %d40, %d42, %d52	;\
316	faligndata %d42, %d44, %d54	;\
317	faligndata %d44, %d46, %d56	;\
318	faligndata %d46, %d0, %d58	;\
319	faligndata %d0, %d2, %d60	;\
320	faligndata %d2, %d4, %d62
321
/*
 * FALIGN_D6, FALIGN_D22, FALIGN_D38: each fills %d48 - %d62 by applying
 * faligndata to the eight adjacent pairs formed from nine consecutive
 * source doublewords, beginning with the register named in the macro.
 * Source registers are %d0 - %d46; a macro that runs past %d46 continues
 * at %d0.  This trio is the one expanded at label seg3.
 */
322#define	FALIGN_D6			\
323	faligndata %d6, %d8, %d48	;\
324	faligndata %d8, %d10, %d50	;\
325	faligndata %d10, %d12, %d52	;\
326	faligndata %d12, %d14, %d54	;\
327	faligndata %d14, %d16, %d56	;\
328	faligndata %d16, %d18, %d58	;\
329	faligndata %d18, %d20, %d60	;\
330	faligndata %d20, %d22, %d62
331
332#define	FALIGN_D22			\
333	faligndata %d22, %d24, %d48	;\
334	faligndata %d24, %d26, %d50	;\
335	faligndata %d26, %d28, %d52	;\
336	faligndata %d28, %d30, %d54	;\
337	faligndata %d30, %d32, %d56	;\
338	faligndata %d32, %d34, %d58	;\
339	faligndata %d34, %d36, %d60	;\
340	faligndata %d36, %d38, %d62
341
342#define	FALIGN_D38			\
343	faligndata %d38, %d40, %d48	;\
344	faligndata %d40, %d42, %d50	;\
345	faligndata %d42, %d44, %d52	;\
346	faligndata %d44, %d46, %d54	;\
347	faligndata %d46, %d0, %d56	;\
348	faligndata %d0, %d2, %d58	;\
349	faligndata %d2, %d4, %d60	;\
350	faligndata %d4, %d6, %d62
351
/*
 * FALIGN_D8, FALIGN_D24, FALIGN_D40: each fills %d48 - %d62 by applying
 * faligndata to the eight adjacent pairs formed from nine consecutive
 * source doublewords, beginning with the register named in the macro.
 * Source registers are %d0 - %d46; a macro that runs past %d46 continues
 * at %d0.  This trio is the one expanded at label seg4.
 */
352#define	FALIGN_D8			\
353	faligndata %d8, %d10, %d48	;\
354	faligndata %d10, %d12, %d50	;\
355	faligndata %d12, %d14, %d52	;\
356	faligndata %d14, %d16, %d54	;\
357	faligndata %d16, %d18, %d56	;\
358	faligndata %d18, %d20, %d58	;\
359	faligndata %d20, %d22, %d60	;\
360	faligndata %d22, %d24, %d62
361
362#define	FALIGN_D24			\
363	faligndata %d24, %d26, %d48	;\
364	faligndata %d26, %d28, %d50	;\
365	faligndata %d28, %d30, %d52	;\
366	faligndata %d30, %d32, %d54	;\
367	faligndata %d32, %d34, %d56	;\
368	faligndata %d34, %d36, %d58	;\
369	faligndata %d36, %d38, %d60	;\
370	faligndata %d38, %d40, %d62
371
372#define	FALIGN_D40			\
373	faligndata %d40, %d42, %d48	;\
374	faligndata %d42, %d44, %d50	;\
375	faligndata %d44, %d46, %d52	;\
376	faligndata %d46, %d0, %d54	;\
377	faligndata %d0, %d2, %d56	;\
378	faligndata %d2, %d4, %d58	;\
379	faligndata %d4, %d6, %d60	;\
380	faligndata %d6, %d8, %d62
381
/*
 * FALIGN_D10, FALIGN_D26, FALIGN_D42: each fills %d48 - %d62 by applying
 * faligndata to the eight adjacent pairs formed from nine consecutive
 * source doublewords, beginning with the register named in the macro.
 * Source registers are %d0 - %d46; a macro that runs past %d46 continues
 * at %d0.  This trio is the one expanded at label seg5.
 */
382#define	FALIGN_D10			\
383	faligndata %d10, %d12, %d48	;\
384	faligndata %d12, %d14, %d50	;\
385	faligndata %d14, %d16, %d52	;\
386	faligndata %d16, %d18, %d54	;\
387	faligndata %d18, %d20, %d56	;\
388	faligndata %d20, %d22, %d58	;\
389	faligndata %d22, %d24, %d60	;\
390	faligndata %d24, %d26, %d62
391
392#define	FALIGN_D26			\
393	faligndata %d26, %d28, %d48	;\
394	faligndata %d28, %d30, %d50	;\
395	faligndata %d30, %d32, %d52	;\
396	faligndata %d32, %d34, %d54	;\
397	faligndata %d34, %d36, %d56	;\
398	faligndata %d36, %d38, %d58	;\
399	faligndata %d38, %d40, %d60	;\
400	faligndata %d40, %d42, %d62
401
402#define	FALIGN_D42			\
403	faligndata %d42, %d44, %d48	;\
404	faligndata %d44, %d46, %d50	;\
405	faligndata %d46, %d0, %d52	;\
406	faligndata %d0, %d2, %d54	;\
407	faligndata %d2, %d4, %d56	;\
408	faligndata %d4, %d6, %d58	;\
409	faligndata %d6, %d8, %d60	;\
410	faligndata %d8, %d10, %d62
411
/*
 * FALIGN_D12, FALIGN_D28, FALIGN_D44: each fills %d48 - %d62 by applying
 * faligndata to the eight adjacent pairs formed from nine consecutive
 * source doublewords, beginning with the register named in the macro.
 * Source registers are %d0 - %d46; a macro that runs past %d46 continues
 * at %d0.  This trio is the one expanded at label seg6.
 */
412#define	FALIGN_D12			\
413	faligndata %d12, %d14, %d48	;\
414	faligndata %d14, %d16, %d50	;\
415	faligndata %d16, %d18, %d52	;\
416	faligndata %d18, %d20, %d54	;\
417	faligndata %d20, %d22, %d56	;\
418	faligndata %d22, %d24, %d58	;\
419	faligndata %d24, %d26, %d60	;\
420	faligndata %d26, %d28, %d62
421
422#define	FALIGN_D28			\
423	faligndata %d28, %d30, %d48	;\
424	faligndata %d30, %d32, %d50	;\
425	faligndata %d32, %d34, %d52	;\
426	faligndata %d34, %d36, %d54	;\
427	faligndata %d36, %d38, %d56	;\
428	faligndata %d38, %d40, %d58	;\
429	faligndata %d40, %d42, %d60	;\
430	faligndata %d42, %d44, %d62
431
432#define	FALIGN_D44			\
433	faligndata %d44, %d46, %d48	;\
434	faligndata %d46, %d0, %d50	;\
435	faligndata %d0, %d2, %d52	;\
436	faligndata %d2, %d4, %d54	;\
437	faligndata %d4, %d6, %d56	;\
438	faligndata %d6, %d8, %d58	;\
439	faligndata %d8, %d10, %d60	;\
440	faligndata %d10, %d12, %d62
441
/*
 * FALIGN_D14, FALIGN_D30, FALIGN_D46: each fills %d48 - %d62 by applying
 * faligndata to the eight adjacent pairs formed from nine consecutive
 * source doublewords, beginning with the register named in the macro.
 * Source registers are %d0 - %d46; a macro that runs past %d46 continues
 * at %d0.  This trio is the one expanded at label seg7.
 */
442#define	FALIGN_D14			\
443	faligndata %d14, %d16, %d48	;\
444	faligndata %d16, %d18, %d50	;\
445	faligndata %d18, %d20, %d52	;\
446	faligndata %d20, %d22, %d54	;\
447	faligndata %d22, %d24, %d56	;\
448	faligndata %d24, %d26, %d58	;\
449	faligndata %d26, %d28, %d60	;\
450	faligndata %d28, %d30, %d62
451
452#define	FALIGN_D30			\
453	faligndata %d30, %d32, %d48	;\
454	faligndata %d32, %d34, %d50	;\
455	faligndata %d34, %d36, %d52	;\
456	faligndata %d36, %d38, %d54	;\
457	faligndata %d38, %d40, %d56	;\
458	faligndata %d40, %d42, %d58	;\
459	faligndata %d42, %d44, %d60	;\
460	faligndata %d44, %d46, %d62
461
462#define	FALIGN_D46			\
463	faligndata %d46, %d0, %d48	;\
464	faligndata %d0, %d2, %d50	;\
465	faligndata %d2, %d4, %d52	;\
466	faligndata %d4, %d6, %d54	;\
467	faligndata %d6, %d8, %d56	;\
468	faligndata %d8, %d10, %d58	;\
469	faligndata %d10, %d12, %d60	;\
470	faligndata %d12, %d14, %d62
471
472
473/*
474 * Copy a block of storage, returning an error code if `from' or
475 * `to' takes a kernel pagefault which cannot be resolved.
476 * Returns errno value on pagefault error, 0 if all ok
477 */
478
479
480
481	.seg	".text"
482	.align	4
483
/*
 * kcopy entry point, plus the lofault error path it shares with bcopy.
 *
 * kcopy:     opens a register window with room for the fp save area,
 *            installs .copyerr as curthread's t_lofault, keeps the
 *            previous t_lofault value in %l6 (no flag bits set) and
 *            branches to the common copy code at .do_copy.
 * .copyerr:  fault handler; the errno value arrives in %g1.  If
 *            FPUSED_FLAG is set in %l6 it restores %gsr, the fp registers
 *            (or zeroes them) and %fprs, and undoes the preemption
 *            disable for threads without an lwp.  It then restores the
 *            saved t_lofault and either returns %g1 to the caller (kcopy)
 *            or jumps to the previously installed handler (bcopy).
 * .copyerr2: t_lofault value while .copyerr itself is running; panics.
 */
484	ENTRY(kcopy)
485
486	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
487	set	.copyerr, %l6		! copyerr is lofault value
488	ldn	[THREAD_REG + T_LOFAULT], %l7	! save existing handler
489	membar	#Sync			! sync error barrier (see copy.s)
490	stn	%l6, [THREAD_REG + T_LOFAULT]	! set t_lofault
491	!
492	! Note that we carefully do *not* flag the setting of
493	! t_lofault.
494	!
495	ba,pt	%ncc, .do_copy		! common code
496	  mov	%l7, %l6		! (delay slot) %l6 = previous t_lofault
497
498/*
499 * We got here because of a fault during kcopy or bcopy if a fault
500 * handler existed when bcopy was called.
501 * Errno value is in %g1.
502 */
503.copyerr:
504	set	.copyerr2, %l1
505	membar	#Sync			! sync error barrier
506	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	! If no fp state was saved, skip ahead to restoring t_lofault.
	! The delay slot is not annulled, so %l1 ends up holding only the
	! BCOPY_FLAG bit of %l6 on both paths.
507	btst	FPUSED_FLAG, %l6
508	bz	%icc, 1f
509	  and	%l6, BCOPY_FLAG, %l1	! copy flag to %l1
510
511	membar	#Sync
512
513	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
514	wr	%o2, 0, %gsr
515
	! %o3 = %fprs as saved on entry.  FPRS_FEF clear means no register
	! image was saved, so the registers are zeroed at 4: instead.
516	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
517	btst	FPRS_FEF, %o3
518	bz	%icc, 4f
519	  nop
520
521	! restore fpregs from stack
	! %o2 = 64-byte aligned start of the save area; four block loads
	! reload %d0 - %d62.
522	membar	#Sync
523	add	%fp, STACK_BIAS - 257, %o2
524	and	%o2, -64, %o2
525	ldda	[%o2]ASI_BLK_P, %d0
526	add	%o2, 64, %o2
527	ldda	[%o2]ASI_BLK_P, %d16
528	add	%o2, 64, %o2
529	ldda	[%o2]ASI_BLK_P, %d32
530	add	%o2, 64, %o2
531	ldda	[%o2]ASI_BLK_P, %d48
532	membar	#Sync
533
534	ba,pt	%ncc, 2f
535	  wr	%o3, 0, %fprs		! restore fprs
536
5374:
538	FZERO				! zero all of the fpregs
539	wr	%o3, 0, %fprs		! restore fprs
540
	! Threads with an lwp (t_lwp != NULL) go straight to 1:.  Otherwise
	! decrement t_preempt; the store in the delay slot always executes.
5412:	ldn	[THREAD_REG + T_LWP], %o2
542	tst	%o2
543	bnz,pt	%ncc, 1f
544	  nop
545
546	ldsb	[THREAD_REG + T_PREEMPT], %l0
547	deccc	%l0
548	bnz,pn	%ncc, 1f
549	  stb	%l0, [THREAD_REG + T_PREEMPT]
550
551	! Check for a kernel preemption request
	! The branch is annulled, so KPREEMPT_FLAG is set only when
	! cpu_kprunrun is non-zero; otherwise execution falls through to 1:.
552	ldn	[THREAD_REG + T_CPU], %l0
553	ldub	[%l0 + CPU_KPRUNRUN], %l0
554	tst	%l0
555	bnz,a,pt	%ncc, 1f	! Need to call kpreempt?
556	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
557
558	!
559	! Need to cater for the different expectations of kcopy
560	! and bcopy. kcopy will *always* set a t_lofault handler
561	! If it fires, we're expected to just return the error code
562	! and *not* to invoke any existing error handler. As far as
563	! bcopy is concerned, we only set t_lofault if there was an
564	! existing lofault handler. In that case we're expected to
565	! invoke the previously existing handler after resetting the
566	! t_lofault value.
567	!
5681:
569	andn	%l6, COPY_FLAGS, %l6	! remove flags from lofault address
570	membar	#Sync			! sync error barrier
571	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
572
573	! call kpreempt if necessary
574	btst	KPREEMPT_FLAG, %l1
575	bz,pt	%icc, 2f
576	  nop
577	call	kpreempt
578	  rdpr	%pil, %o0	! pass %pil
5792:
	! BCOPY_FLAG set: hand off to the previous handler at 3:.
	! Otherwise (kcopy) return the errno value from %g1 in the
	! caller's %o0.
580	btst	BCOPY_FLAG, %l1
581	bnz,pn	%ncc, 3f
582	  nop
583	ret
584	restore	%g1, 0, %o0
585
5863:
587	!
588	! We're here via bcopy. There *must* have been an error handler
589	! in place otherwise we would have died a nasty death already.
590	!
591	jmp	%l6				! goto real handler
592	restore	%g0, 0, %o0			! dispose of copy window
593
594/*
595 * We got here because of a fault in .copyerr.  We can't safely restore fp
596 * state, so we panic.
597 */
598fp_panic_msg:
599	.asciz	"Unable to restore fp state after copy operation"
600
601	.align	4
602.copyerr2:
603	set	fp_panic_msg, %o0
604	call	panic
605	  nop
606	SET_SIZE(kcopy)
607
608
609/*
610 * Copy a block of storage - must not overlap (from + len <= to).
611 * Registers: l6 - saved t_lofault
612 *
613 * Copy a page of memory.
614 * Assumes double word alignment and a count >= 256.
615 */
616
617	ENTRY(bcopy)
618
619	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
620	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
621	tst	%l6
622        !
623        ! We've already captured whether t_lofault was zero on entry.
624        ! We need to mark ourselves as being from bcopy since both
625        ! kcopy and bcopy use the same code path. If BCOPY_FLAG is set
626        ! and the saved lofault was zero, we won't reset lofault on
627        ! returning.
628        !
629	or	%l6, BCOPY_FLAG, %l6
630	bz,pt	%ncc, .do_copy
631	sethi	%hi(.copyerr), %o2
632	or	%o2, %lo(.copyerr), %o2
633	membar	#Sync			! sync error barrier
634	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
635
636.do_copy:
637	cmp	%i2, 12			! for small counts
638	blu	%ncc, .bytecp		! just copy bytes
639	  .empty
640
641	cmp	%i2, VIS_COPY_THRESHOLD	! for large counts
642	blu,pt	%ncc, .bcb_punt
643	  .empty
644
645	!
646	! Check to see if VIS acceleration is enabled
647	!
648	sethi	%hi(use_hw_bcopy), %o2
649	ld	[%o2 + %lo(use_hw_bcopy)], %o2
650	tst	%o2
651	bz,pn	%icc, .bcb_punt
652	  nop
653
654	subcc	%i1, %i0, %i3
655	bneg,a,pn %ncc, 1f
656	neg	%i3
6571:
658	/*
659	 * Compare against 256 since we should be checking block addresses
660	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
661	 * src = dest + (64 * 3) + 63.
662	 */
663	cmp	%i3, 256
664	blu,pn	%ncc, .bcb_punt
665	  nop
666
667	ldn	[THREAD_REG + T_LWP], %o3
668	tst	%o3
669	bnz,pt	%ncc, 1f
670	  nop
671
672	! kpreempt_disable();
673	ldsb	[THREAD_REG + T_PREEMPT], %o2
674	inc	%o2
675	stb	%o2, [THREAD_REG + T_PREEMPT]
676
6771:
678	rd	%fprs, %o2		! check for unused fp
679	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
680	btst	FPRS_FEF, %o2
681	bz,a	%icc, .do_blockcopy
682	  wr	%g0, FPRS_FEF, %fprs
683
684.bcb_fpregs_inuse:
685	cmp	%i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
686	bgeu	%ncc, 1f		!  if we have to save the fpregs)
687	  nop
688
689	tst	%o3
690	bnz,pt	%ncc, .bcb_punt
691	  nop
692
693	ldsb	[THREAD_REG + T_PREEMPT], %l0
694	deccc	%l0
695	bnz,pn	%icc, .bcb_punt
696	  stb	%l0, [THREAD_REG + T_PREEMPT]
697
698	! Check for a kernel preemption request
699	ldn	[THREAD_REG + T_CPU], %l0
700	ldub	[%l0 + CPU_KPRUNRUN], %l0
701	tst	%l0
702	bz,pt	%icc, .bcb_punt
703	  nop
704
705	! Attempt to preempt
706	call	kpreempt
707	  rdpr	  %pil, %o0		  ! pass %pil
708
709	ba,pt	%ncc, .bcb_punt
710	  nop
711
7121:
713	wr	%g0, FPRS_FEF, %fprs
714
715	! save in-use fpregs on stack
716	membar	#Sync
717	add	%fp, STACK_BIAS - 257, %o2
718	and	%o2, -64, %o2
719	stda	%d0, [%o2]ASI_BLK_P
720	add	%o2, 64, %o2
721	stda	%d16, [%o2]ASI_BLK_P
722	add	%o2, 64, %o2
723	stda	%d32, [%o2]ASI_BLK_P
724	add	%o2, 64, %o2
725	stda	%d48, [%o2]ASI_BLK_P
726	membar	#Sync
727
728.do_blockcopy:
729	membar	#StoreStore|#StoreLoad|#LoadStore
730
731	rd	%gsr, %o2
732	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
733
734	! Set the lower bit in the saved t_lofault to indicate
735	! that we need to clear the %fprs register on the way
736	! out
737	or	%l6, FPUSED_FLAG, %l6
738
739	! Swap src/dst since the code below is memcpy code
740	! and memcpy/bcopy have different calling sequences
741	mov	%i1, %i5
742	mov	%i0, %i1
743	mov	%i5, %i0
744
745!!! This code is nearly identical to the version in the sun4u
746!!! libc_psr.  Most bugfixes made to that file should be
747!!! merged into this routine.
748
749	andcc	%i0, 7, %o3
750	bz,pt	%ncc, blkcpy
751	sub	%o3, 8, %o3
752	neg	%o3
753	sub	%i2, %o3, %i2
754
755	! Align Destination on double-word boundary
756
7572:	ldub	[%i1], %o4
758	inc	%i1
759	inc	%i0
760	deccc	%o3
761	bgu	%ncc, 2b
762	stb	%o4, [%i0 - 1]
763blkcpy:
764	andcc	%i0, 63, %i3
765	bz,pn	%ncc, blalign		! now block aligned
766	sub	%i3, 64, %i3
767	neg	%i3			! bytes till block aligned
768	sub	%i2, %i3, %i2		! update %i2 with new count
769
770	! Copy %i3 bytes till dst is block (64 byte) aligned. use
771	! double word copies.
772
773	alignaddr %i1, %g0, %g1
774	ldd	[%g1], %d0
775	add	%g1, 8, %g1
7766:
777	ldd	[%g1], %d2
778	add	%g1, 8, %g1
779	subcc	%i3, 8, %i3
780	faligndata %d0, %d2, %d8
781	std	%d8, [%i0]
782	add	%i1, 8, %i1
783	bz,pn	%ncc, blalign
784	add	%i0, 8, %i0
785	ldd	[%g1], %d0
786	add	%g1, 8, %g1
787	subcc	%i3, 8, %i3
788	faligndata %d2, %d0, %d8
789	std	%d8, [%i0]
790	add	%i1, 8, %i1
791	bgu,pn	%ncc, 6b
792	add	%i0, 8, %i0
793
794blalign:
795	membar	#StoreLoad
796	! %i2 = total length
797	! %i3 = blocks	(length - 64) / 64
798	! %i4 = doubles remaining  (length - blocks)
799	sub	%i2, 64, %i3
800	andn	%i3, 63, %i3
801	sub	%i2, %i3, %i4
802	andn	%i4, 7, %i4
803	sub	%i4, 16, %i4
804	sub	%i2, %i4, %i2
805	sub	%i2, %i3, %i2
806
807	andn	%i1, 0x3f, %l7		! blk aligned address
808	alignaddr %i1, %g0, %g0		! gen %gsr
809
810	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
811	andcc	%l5, 7, %i5		! mask everything except bits 1,2 3
812	add	%i1, %i4, %i1
813	add	%i1, %i3, %i1
814
815	ldda	[%l7]ASI_BLK_P, %d0
816	add	%l7, 64, %l7
817	ldda	[%l7]ASI_BLK_P, %d16
818	add	%l7, 64, %l7
819	ldda	[%l7]ASI_BLK_P, %d32
820	add	%l7, 64, %l7
821	sub	%i3, 128, %i3
822
823	! switch statement to get us to the right 8 byte blk within a
824	! 64 byte block
825	cmp	 %i5, 4
826	bgeu,a	 hlf
827	cmp	 %i5, 6
828	cmp	 %i5, 2
829	bgeu,a	 sqtr
830	nop
831	cmp	 %i5, 1
832	be,a	 seg1
833	nop
834	ba,pt	 %ncc, seg0
835	nop
836sqtr:
837	be,a	 seg2
838	nop
839	ba,pt	 %ncc, seg3
840	nop
841
842hlf:
843	bgeu,a	 fqtr
844	nop
845	cmp	 %i5, 5
846	be,a	 seg5
847	nop
848	ba,pt	 %ncc, seg4
849	nop
850fqtr:
851	be,a	 seg6
852	nop
853	ba,pt	 %ncc, seg7
854	nop
855
856
857seg0:
858	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
859	FALIGN_D0
860	ldda	[%l7]ASI_BLK_P, %d0
861	stda	%d48, [%i0]ASI_BLK_P
862	add	%l7, 64, %l7
863	subcc	%i3, 64, %i3
864	bz,pn	%ncc, 0f
865	add	%i0, 64, %i0
866	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
867	FALIGN_D16
868	ldda	[%l7]ASI_BLK_P, %d16
869	stda	%d48, [%i0]ASI_BLK_P
870	add	%l7, 64, %l7
871	subcc	%i3, 64, %i3
872	bz,pn	%ncc, 1f
873	add	%i0, 64, %i0
874	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
875	FALIGN_D32
876	ldda	[%l7]ASI_BLK_P, %d32
877	stda	%d48, [%i0]ASI_BLK_P
878	add	%l7, 64, %l7
879	subcc	%i3, 64, %i3
880	bz,pn	%ncc, 2f
881	add	%i0, 64, %i0
882	ba,a,pt	%ncc, seg0
883
8840:
885	FALIGN_D16
886	stda	%d48, [%i0]ASI_BLK_P
887	add	%i0, 64, %i0
888	membar	#Sync
889	FALIGN_D32
890	stda	%d48, [%i0]ASI_BLK_P
891	ba,pt	%ncc, blkd0
892	add	%i0, 64, %i0
893
8941:
895	FALIGN_D32
896	stda	%d48, [%i0]ASI_BLK_P
897	add	%i0, 64, %i0
898	membar	#Sync
899	FALIGN_D0
900	stda	%d48, [%i0]ASI_BLK_P
901	ba,pt	%ncc, blkd16
902	add	%i0, 64, %i0
903
9042:
905	FALIGN_D0
906	stda	%d48, [%i0]ASI_BLK_P
907	add	%i0, 64, %i0
908	membar	#Sync
909	FALIGN_D16
910	stda	%d48, [%i0]ASI_BLK_P
911	ba,pt	%ncc, blkd32
912	add	%i0, 64, %i0
913
914seg1:
915	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
916	FALIGN_D2
917	ldda	[%l7]ASI_BLK_P, %d0
918	stda	%d48, [%i0]ASI_BLK_P
919	add	%l7, 64, %l7
920	subcc	%i3, 64, %i3
921	bz,pn	%ncc, 0f
922	add	%i0, 64, %i0
923	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
924	FALIGN_D18
925	ldda	[%l7]ASI_BLK_P, %d16
926	stda	%d48, [%i0]ASI_BLK_P
927	add	%l7, 64, %l7
928	subcc	%i3, 64, %i3
929	bz,pn	%ncc, 1f
930	add	%i0, 64, %i0
931	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
932	FALIGN_D34
933	ldda	[%l7]ASI_BLK_P, %d32
934	stda	%d48, [%i0]ASI_BLK_P
935	add	%l7, 64, %l7
936	subcc	%i3, 64, %i3
937	bz,pn	%ncc, 2f
938	add	%i0, 64, %i0
939	ba,a,pt	%ncc, seg1
9400:
941	FALIGN_D18
942	stda	%d48, [%i0]ASI_BLK_P
943	add	%i0, 64, %i0
944	membar	#Sync
945	FALIGN_D34
946	stda	%d48, [%i0]ASI_BLK_P
947	ba,pt	%ncc, blkd2
948	add	%i0, 64, %i0
949
9501:
951	FALIGN_D34
952	stda	%d48, [%i0]ASI_BLK_P
953	add	%i0, 64, %i0
954	membar	#Sync
955	FALIGN_D2
956	stda	%d48, [%i0]ASI_BLK_P
957	ba,pt	%ncc, blkd18
958	add	%i0, 64, %i0
959
9602:
961	FALIGN_D2
962	stda	%d48, [%i0]ASI_BLK_P
963	add	%i0, 64, %i0
964	membar	#Sync
965	FALIGN_D18
966	stda	%d48, [%i0]ASI_BLK_P
967	ba,pt	%ncc, blkd34
968	add	%i0, 64, %i0
969
970seg2:
971	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
972	FALIGN_D4
973	ldda	[%l7]ASI_BLK_P, %d0
974	stda	%d48, [%i0]ASI_BLK_P
975	add	%l7, 64, %l7
976	subcc	%i3, 64, %i3
977	bz,pn	%ncc, 0f
978	add	%i0, 64, %i0
979	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
980	FALIGN_D20
981	ldda	[%l7]ASI_BLK_P, %d16
982	stda	%d48, [%i0]ASI_BLK_P
983	add	%l7, 64, %l7
984	subcc	%i3, 64, %i3
985	bz,pn	%ncc, 1f
986	add	%i0, 64, %i0
987	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
988	FALIGN_D36
989	ldda	[%l7]ASI_BLK_P, %d32
990	stda	%d48, [%i0]ASI_BLK_P
991	add	%l7, 64, %l7
992	subcc	%i3, 64, %i3
993	bz,pn	%ncc, 2f
994	add	%i0, 64, %i0
995	ba,a,pt	%ncc, seg2
996
9970:
998	FALIGN_D20
999	stda	%d48, [%i0]ASI_BLK_P
1000	add	%i0, 64, %i0
1001	membar	#Sync
1002	FALIGN_D36
1003	stda	%d48, [%i0]ASI_BLK_P
1004	ba,pt	%ncc, blkd4
1005	add	%i0, 64, %i0
1006
10071:
1008	FALIGN_D36
1009	stda	%d48, [%i0]ASI_BLK_P
1010	add	%i0, 64, %i0
1011	membar	#Sync
1012	FALIGN_D4
1013	stda	%d48, [%i0]ASI_BLK_P
1014	ba,pt	%ncc, blkd20
1015	add	%i0, 64, %i0
1016
10172:
1018	FALIGN_D4
1019	stda	%d48, [%i0]ASI_BLK_P
1020	add	%i0, 64, %i0
1021	membar	#Sync
1022	FALIGN_D20
1023	stda	%d48, [%i0]ASI_BLK_P
1024	ba,pt	%ncc, blkd36
1025	add	%i0, 64, %i0
1026
1027seg3:
1028	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1029	FALIGN_D6
1030	ldda	[%l7]ASI_BLK_P, %d0
1031	stda	%d48, [%i0]ASI_BLK_P
1032	add	%l7, 64, %l7
1033	subcc	%i3, 64, %i3
1034	bz,pn	%ncc, 0f
1035	add	%i0, 64, %i0
1036	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1037	FALIGN_D22
1038	ldda	[%l7]ASI_BLK_P, %d16
1039	stda	%d48, [%i0]ASI_BLK_P
1040	add	%l7, 64, %l7
1041	subcc	%i3, 64, %i3
1042	bz,pn	%ncc, 1f
1043	add	%i0, 64, %i0
1044	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1045	FALIGN_D38
1046	ldda	[%l7]ASI_BLK_P, %d32
1047	stda	%d48, [%i0]ASI_BLK_P
1048	add	%l7, 64, %l7
1049	subcc	%i3, 64, %i3
1050	bz,pn	%ncc, 2f
1051	add	%i0, 64, %i0
1052	ba,a,pt	%ncc, seg3
1053
10540:
1055	FALIGN_D22
1056	stda	%d48, [%i0]ASI_BLK_P
1057	add	%i0, 64, %i0
1058	membar	#Sync
1059	FALIGN_D38
1060	stda	%d48, [%i0]ASI_BLK_P
1061	ba,pt	%ncc, blkd6
1062	add	%i0, 64, %i0
1063
10641:
1065	FALIGN_D38
1066	stda	%d48, [%i0]ASI_BLK_P
1067	add	%i0, 64, %i0
1068	membar	#Sync
1069	FALIGN_D6
1070	stda	%d48, [%i0]ASI_BLK_P
1071	ba,pt	%ncc, blkd22
1072	add	%i0, 64, %i0
1073
10742:
1075	FALIGN_D6
1076	stda	%d48, [%i0]ASI_BLK_P
1077	add	%i0, 64, %i0
1078	membar	#Sync
1079	FALIGN_D22
1080	stda	%d48, [%i0]ASI_BLK_P
1081	ba,pt	%ncc, blkd38
1082	add	%i0, 64, %i0
1083
1084seg4:
1085	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1086	FALIGN_D8
1087	ldda	[%l7]ASI_BLK_P, %d0
1088	stda	%d48, [%i0]ASI_BLK_P
1089	add	%l7, 64, %l7
1090	subcc	%i3, 64, %i3
1091	bz,pn	%ncc, 0f
1092	add	%i0, 64, %i0
1093	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1094	FALIGN_D24
1095	ldda	[%l7]ASI_BLK_P, %d16
1096	stda	%d48, [%i0]ASI_BLK_P
1097	add	%l7, 64, %l7
1098	subcc	%i3, 64, %i3
1099	bz,pn	%ncc, 1f
1100	add	%i0, 64, %i0
1101	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1102	FALIGN_D40
1103	ldda	[%l7]ASI_BLK_P, %d32
1104	stda	%d48, [%i0]ASI_BLK_P
1105	add	%l7, 64, %l7
1106	subcc	%i3, 64, %i3
1107	bz,pn	%ncc, 2f
1108	add	%i0, 64, %i0
1109	ba,a,pt	%ncc, seg4
1110
11110:
1112	FALIGN_D24
1113	stda	%d48, [%i0]ASI_BLK_P
1114	add	%i0, 64, %i0
1115	membar	#Sync
1116	FALIGN_D40
1117	stda	%d48, [%i0]ASI_BLK_P
1118	ba,pt	%ncc, blkd8
1119	add	%i0, 64, %i0
1120
11211:
1122	FALIGN_D40
1123	stda	%d48, [%i0]ASI_BLK_P
1124	add	%i0, 64, %i0
1125	membar	#Sync
1126	FALIGN_D8
1127	stda	%d48, [%i0]ASI_BLK_P
1128	ba,pt	%ncc, blkd24
1129	add	%i0, 64, %i0
1130
11312:
1132	FALIGN_D8
1133	stda	%d48, [%i0]ASI_BLK_P
1134	add	%i0, 64, %i0
1135	membar	#Sync
1136	FALIGN_D24
1137	stda	%d48, [%i0]ASI_BLK_P
1138	ba,pt	%ncc, blkd40
1139	add	%i0, 64, %i0
1140
1141seg5:
1142	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1143	FALIGN_D10
1144	ldda	[%l7]ASI_BLK_P, %d0
1145	stda	%d48, [%i0]ASI_BLK_P
1146	add	%l7, 64, %l7
1147	subcc	%i3, 64, %i3
1148	bz,pn	%ncc, 0f
1149	add	%i0, 64, %i0
1150	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1151	FALIGN_D26
1152	ldda	[%l7]ASI_BLK_P, %d16
1153	stda	%d48, [%i0]ASI_BLK_P
1154	add	%l7, 64, %l7
1155	subcc	%i3, 64, %i3
1156	bz,pn	%ncc, 1f
1157	add	%i0, 64, %i0
1158	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1159	FALIGN_D42
1160	ldda	[%l7]ASI_BLK_P, %d32
1161	stda	%d48, [%i0]ASI_BLK_P
1162	add	%l7, 64, %l7
1163	subcc	%i3, 64, %i3
1164	bz,pn	%ncc, 2f
1165	add	%i0, 64, %i0
1166	ba,a,pt	%ncc, seg5
1167
11680:
1169	FALIGN_D26
1170	stda	%d48, [%i0]ASI_BLK_P
1171	add	%i0, 64, %i0
1172	membar	#Sync
1173	FALIGN_D42
1174	stda	%d48, [%i0]ASI_BLK_P
1175	ba,pt	%ncc, blkd10
1176	add	%i0, 64, %i0
1177
11781:
1179	FALIGN_D42
1180	stda	%d48, [%i0]ASI_BLK_P
1181	add	%i0, 64, %i0
1182	membar	#Sync
1183	FALIGN_D10
1184	stda	%d48, [%i0]ASI_BLK_P
1185	ba,pt	%ncc, blkd26
1186	add	%i0, 64, %i0
1187
11882:
1189	FALIGN_D10
1190	stda	%d48, [%i0]ASI_BLK_P
1191	add	%i0, 64, %i0
1192	membar	#Sync
1193	FALIGN_D26
1194	stda	%d48, [%i0]ASI_BLK_P
1195	ba,pt	%ncc, blkd42
1196	add	%i0, 64, %i0
1197
1198seg6:
1199	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1200	FALIGN_D12
1201	ldda	[%l7]ASI_BLK_P, %d0
1202	stda	%d48, [%i0]ASI_BLK_P
1203	add	%l7, 64, %l7
1204	subcc	%i3, 64, %i3
1205	bz,pn	%ncc, 0f
1206	add	%i0, 64, %i0
1207	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1208	FALIGN_D28
1209	ldda	[%l7]ASI_BLK_P, %d16
1210	stda	%d48, [%i0]ASI_BLK_P
1211	add	%l7, 64, %l7
1212	subcc	%i3, 64, %i3
1213	bz,pn	%ncc, 1f
1214	add	%i0, 64, %i0
1215	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1216	FALIGN_D44
1217	ldda	[%l7]ASI_BLK_P, %d32
1218	stda	%d48, [%i0]ASI_BLK_P
1219	add	%l7, 64, %l7
1220	subcc	%i3, 64, %i3
1221	bz,pn	%ncc, 2f
1222	add	%i0, 64, %i0
1223	ba,a,pt	%ncc, seg6
1224
12250:
1226	FALIGN_D28
1227	stda	%d48, [%i0]ASI_BLK_P
1228	add	%i0, 64, %i0
1229	membar	#Sync
1230	FALIGN_D44
1231	stda	%d48, [%i0]ASI_BLK_P
1232	ba,pt	%ncc, blkd12
1233	add	%i0, 64, %i0
1234
12351:
1236	FALIGN_D44
1237	stda	%d48, [%i0]ASI_BLK_P
1238	add	%i0, 64, %i0
1239	membar	#Sync
1240	FALIGN_D12
1241	stda	%d48, [%i0]ASI_BLK_P
1242	ba,pt	%ncc, blkd28
1243	add	%i0, 64, %i0
1244
12452:
1246	FALIGN_D12
1247	stda	%d48, [%i0]ASI_BLK_P
1248	add	%i0, 64, %i0
1249	membar	#Sync
1250	FALIGN_D28
1251	stda	%d48, [%i0]ASI_BLK_P
1252	ba,pt	%ncc, blkd44
1253	add	%i0, 64, %i0
1254
1255seg7:
1256	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
1257	FALIGN_D14
1258	ldda	[%l7]ASI_BLK_P, %d0
1259	stda	%d48, [%i0]ASI_BLK_P
1260	add	%l7, 64, %l7
1261	subcc	%i3, 64, %i3
1262	bz,pn	%ncc, 0f
1263	add	%i0, 64, %i0
1264	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
1265	FALIGN_D30
1266	ldda	[%l7]ASI_BLK_P, %d16
1267	stda	%d48, [%i0]ASI_BLK_P
1268	add	%l7, 64, %l7
1269	subcc	%i3, 64, %i3
1270	bz,pn	%ncc, 1f
1271	add	%i0, 64, %i0
1272	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
1273	FALIGN_D46
1274	ldda	[%l7]ASI_BLK_P, %d32
1275	stda	%d48, [%i0]ASI_BLK_P
1276	add	%l7, 64, %l7
1277	subcc	%i3, 64, %i3
1278	bz,pn	%ncc, 2f
1279	add	%i0, 64, %i0
1280	ba,a,pt	%ncc, seg7
1281
12820:
1283	FALIGN_D30
1284	stda	%d48, [%i0]ASI_BLK_P
1285	add	%i0, 64, %i0
1286	membar	#Sync
1287	FALIGN_D46
1288	stda	%d48, [%i0]ASI_BLK_P
1289	ba,pt	%ncc, blkd14
1290	add	%i0, 64, %i0
1291
12921:
1293	FALIGN_D46
1294	stda	%d48, [%i0]ASI_BLK_P
1295	add	%i0, 64, %i0
1296	membar	#Sync
1297	FALIGN_D14
1298	stda	%d48, [%i0]ASI_BLK_P
1299	ba,pt	%ncc, blkd30
1300	add	%i0, 64, %i0
1301
13022:
1303	FALIGN_D14
1304	stda	%d48, [%i0]ASI_BLK_P
1305	add	%i0, 64, %i0
1306	membar	#Sync
1307	FALIGN_D30
1308	stda	%d48, [%i0]ASI_BLK_P
1309	ba,pt	%ncc, blkd46
1310	add	%i0, 64, %i0
1311
1312
1313	!
1314	! dribble out the last partial block
1315	!
1316blkd0:
1317	subcc	%i4, 8, %i4
1318	blu,pn	%ncc, blkdone
1319	faligndata %d0, %d2, %d48
1320	std	%d48, [%i0]
1321	add	%i0, 8, %i0
1322blkd2:
1323	subcc	%i4, 8, %i4
1324	blu,pn	%ncc, blkdone
1325	faligndata %d2, %d4, %d48
1326	std	%d48, [%i0]
1327	add	%i0, 8, %i0
1328blkd4:
1329	subcc	%i4, 8, %i4
1330	blu,pn	%ncc, blkdone
1331	faligndata %d4, %d6, %d48
1332	std	%d48, [%i0]
1333	add	%i0, 8, %i0
1334blkd6:
1335	subcc	%i4, 8, %i4
1336	blu,pn	%ncc, blkdone
1337	faligndata %d6, %d8, %d48
1338	std	%d48, [%i0]
1339	add	%i0, 8, %i0
1340blkd8:
1341	subcc	%i4, 8, %i4
1342	blu,pn	%ncc, blkdone
1343	faligndata %d8, %d10, %d48
1344	std	%d48, [%i0]
1345	add	%i0, 8, %i0
1346blkd10:
1347	subcc	%i4, 8, %i4
1348	blu,pn	%ncc, blkdone
1349	faligndata %d10, %d12, %d48
1350	std	%d48, [%i0]
1351	add	%i0, 8, %i0
1352blkd12:
1353	subcc	%i4, 8, %i4
1354	blu,pn	%ncc, blkdone
1355	faligndata %d12, %d14, %d48
1356	std	%d48, [%i0]
1357	add	%i0, 8, %i0
1358blkd14:
1359	subcc	%i4, 8, %i4
1360	blu,pn	%ncc, blkdone
1361	fsrc1	%d14, %d0
1362	ba,a,pt	%ncc, blkleft
1363
1364blkd16:
1365	subcc	%i4, 8, %i4
1366	blu,pn	%ncc, blkdone
1367	faligndata %d16, %d18, %d48
1368	std	%d48, [%i0]
1369	add	%i0, 8, %i0
1370blkd18:
1371	subcc	%i4, 8, %i4
1372	blu,pn	%ncc, blkdone
1373	faligndata %d18, %d20, %d48
1374	std	%d48, [%i0]
1375	add	%i0, 8, %i0
1376blkd20:
1377	subcc	%i4, 8, %i4
1378	blu,pn	%ncc, blkdone
1379	faligndata %d20, %d22, %d48
1380	std	%d48, [%i0]
1381	add	%i0, 8, %i0
1382blkd22:
1383	subcc	%i4, 8, %i4
1384	blu,pn	%ncc, blkdone
1385	faligndata %d22, %d24, %d48
1386	std	%d48, [%i0]
1387	add	%i0, 8, %i0
1388blkd24:
1389	subcc	%i4, 8, %i4
1390	blu,pn	%ncc, blkdone
1391	faligndata %d24, %d26, %d48
1392	std	%d48, [%i0]
1393	add	%i0, 8, %i0
1394blkd26:
1395	subcc	%i4, 8, %i4
1396	blu,pn	%ncc, blkdone
1397	faligndata %d26, %d28, %d48
1398	std	%d48, [%i0]
1399	add	%i0, 8, %i0
1400blkd28:
1401	subcc	%i4, 8, %i4
1402	blu,pn	%ncc, blkdone
1403	faligndata %d28, %d30, %d48
1404	std	%d48, [%i0]
1405	add	%i0, 8, %i0
1406blkd30:
1407	subcc	%i4, 8, %i4
1408	blu,pn	%ncc, blkdone
1409	fsrc1	%d30, %d0
1410	ba,a,pt	%ncc, blkleft
1411blkd32:
1412	subcc	%i4, 8, %i4
1413	blu,pn	%ncc, blkdone
1414	faligndata %d32, %d34, %d48
1415	std	%d48, [%i0]
1416	add	%i0, 8, %i0
1417blkd34:
1418	subcc	%i4, 8, %i4
1419	blu,pn	%ncc, blkdone
1420	faligndata %d34, %d36, %d48
1421	std	%d48, [%i0]
1422	add	%i0, 8, %i0
1423blkd36:
1424	subcc	%i4, 8, %i4
1425	blu,pn	%ncc, blkdone
1426	faligndata %d36, %d38, %d48
1427	std	%d48, [%i0]
1428	add	%i0, 8, %i0
1429blkd38:
1430	subcc	%i4, 8, %i4
1431	blu,pn	%ncc, blkdone
1432	faligndata %d38, %d40, %d48
1433	std	%d48, [%i0]
1434	add	%i0, 8, %i0
1435blkd40:
1436	subcc	%i4, 8, %i4
1437	blu,pn	%ncc, blkdone
1438	faligndata %d40, %d42, %d48
1439	std	%d48, [%i0]
1440	add	%i0, 8, %i0
1441blkd42:
1442	subcc	%i4, 8, %i4
1443	blu,pn	%ncc, blkdone
1444	faligndata %d42, %d44, %d48
1445	std	%d48, [%i0]
1446	add	%i0, 8, %i0
1447blkd44:
1448	subcc	%i4, 8, %i4
1449	blu,pn	%ncc, blkdone
1450	faligndata %d44, %d46, %d48
1451	std	%d48, [%i0]
1452	add	%i0, 8, %i0
1453blkd46:
1454	subcc	%i4, 8, %i4
1455	blu,pn	%ncc, blkdone
1456	fsrc1	%d46, %d0
1457
1458blkleft:
14591:
1460	ldd	[%l7], %d2
1461	add	%l7, 8, %l7
1462	subcc	%i4, 8, %i4
1463	faligndata %d0, %d2, %d8
1464	std	%d8, [%i0]
1465	blu,pn	%ncc, blkdone
1466	add	%i0, 8, %i0
1467	ldd	[%l7], %d0
1468	add	%l7, 8, %l7
1469	subcc	%i4, 8, %i4
1470	faligndata %d2, %d0, %d8
1471	std	%d8, [%i0]
1472	bgeu,pt	%ncc, 1b
1473	add	%i0, 8, %i0
1474
1475blkdone:
1476	tst	%i2
1477	bz,pt	%ncc, .bcb_exit
1478	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0
1479
14807:	ldub	[%i1], %i4
1481	inc	%i1
1482	inc	%i0
1483	deccc	%i2
1484	bgu,pt	%ncc, 7b
1485	  stb	  %i4, [%i0 - 1]
1486
1487.bcb_exit:
1488	membar	#StoreLoad|#StoreStore
1489	btst	FPUSED_FLAG, %l6
1490	bz	%icc, 1f
1491	  and	%l6, COPY_FLAGS, %l1	! Store flags in %l1
1492					! We can't clear the flags from %l6 yet.
1493					! If there's an error, .copyerr will
1494					! need them
1495
1496	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
1497	wr	%o2, 0, %gsr
1498
1499	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1500	btst	FPRS_FEF, %o3
1501	bz	%icc, 4f
1502	  nop
1503
1504	! restore fpregs from stack
1505	membar	#Sync
1506	add	%fp, STACK_BIAS - 257, %o2
1507	and	%o2, -64, %o2
1508	ldda	[%o2]ASI_BLK_P, %d0
1509	add	%o2, 64, %o2
1510	ldda	[%o2]ASI_BLK_P, %d16
1511	add	%o2, 64, %o2
1512	ldda	[%o2]ASI_BLK_P, %d32
1513	add	%o2, 64, %o2
1514	ldda	[%o2]ASI_BLK_P, %d48
1515	membar	#Sync
1516
1517	ba,pt	%ncc, 2f
1518	  wr	%o3, 0, %fprs		! restore fprs
1519
15204:
1521	FZERO				! zero all of the fpregs
1522	wr	%o3, 0, %fprs		! restore fprs
1523
15242:	ldn	[THREAD_REG + T_LWP], %o2
1525	tst	%o2
1526	bnz,pt	%ncc, 1f
1527	  nop
1528
1529	ldsb	[THREAD_REG + T_PREEMPT], %l0
1530	deccc	%l0
1531	bnz,pn	%ncc, 1f
1532	  stb	%l0, [THREAD_REG + T_PREEMPT]
1533
1534	! Check for a kernel preemption request
1535	ldn	[THREAD_REG + T_CPU], %l0
1536	ldub	[%l0 + CPU_KPRUNRUN], %l0
1537	tst	%l0
1538	bnz,a,pt	%ncc, 1f	! Need to call kpreempt?
1539	  or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag
1540
15411:
1542	btst	BCOPY_FLAG, %l1
1543	bz,pn	%icc, 3f
1544	  andncc	%l6, COPY_FLAGS, %l6
1545
1546	!
1547	! Here via bcopy. Check to see if the handler was NULL.
1548	! If so, just return quietly. Otherwise, reset the
1549	! handler and go home.
1550	!
1551	bnz,pn	%ncc, 3f
1552	  nop
1553
1554	!
1555	! Null handler.  Check for kpreempt flag, call if necessary,
1556	! then return.
1557	!
1558	btst	KPREEMPT_FLAG, %l1
1559	bz,pt	%icc, 2f
1560	  nop
1561	call	kpreempt
1562	  rdpr	%pil, %o0	! pass %pil
15632:
1564	ret
1565	  restore	%g0, 0, %o0
1566
1567	!
1568	! Here via kcopy or bcopy with a handler.  Reset the
1569	! fault handler.
1570	!
15713:
1572	membar	#Sync
1573	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1574
1575	! call kpreempt if necessary
1576	btst	KPREEMPT_FLAG, %l1
1577	bz,pt	%icc, 4f
1578	  nop
1579	call	kpreempt
1580	  rdpr	%pil, %o0
15814:
1582	ret
1583	  restore	%g0, 0, %o0
1584
1585.bcb_punt:
1586	!
1587	! use aligned transfers where possible
1588	!
1589	xor	%i0, %i1, %o4		! xor from and to address
1590	btst	7, %o4			! if lower three bits zero
1591	bz	%icc, .aldoubcp		! can align on double boundary
1592	.empty	! assembler complaints about label
1593
1594	xor	%i0, %i1, %o4		! xor from and to address
1595	btst	3, %o4			! if lower two bits zero
1596	bz	%icc, .alwordcp		! can align on word boundary
1597	btst	3, %i0			! delay slot, from address unaligned?
1598	!
1599	! use aligned reads and writes where possible
1600	! this differs from wordcp in that it copes
1601	! with odd alignment between source and destination
1602	! using word reads and writes with the proper shifts
1603	! in between to align transfers to and from memory
1604	! i0 - src address, i1 - dest address, i2 - count
1605	! i3, i4 - tmps for used generating complete word
1606	! i5 (word to write)
1607	! l0 size in bits of upper part of source word (US)
1608	! l1 size in bits of lower part of source word (LS = 32 - US)
1609	! l2 size in bits of upper part of destination word (UD)
1610	! l3 size in bits of lower part of destination word (LD = 32 - UD)
1611	! l4 number of bytes leftover after aligned transfers complete
1612	! l5 the number 32
1613	!
1614	mov	32, %l5			! load an oft-needed constant
1615	bz	.align_dst_only
1616	btst	3, %i1			! is destnation address aligned?
1617	clr	%i4			! clear registers used in either case
1618	bz	%icc, .align_src_only
1619	clr	%l0
1620	!
1621	! both source and destination addresses are unaligned
1622	!
16231:					! align source
1624	ldub	[%i0], %i3		! read a byte from source address
1625	add	%i0, 1, %i0		! increment source address
1626	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1627	btst	3, %i0			! is source aligned?
1628	add	%l0, 8, %l0		! increment size of upper source (US)
1629	bnz,a	1b
1630	sll	%i4, 8, %i4		! make room for next byte
1631
1632	sub	%l5, %l0, %l1		! generate shift left count (LS)
1633	sll	%i4, %l1, %i4		! prepare to get rest
1634	ld	[%i0], %i3		! read a word
1635	add	%i0, 4, %i0		! increment source address
1636	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
1637	or	%i4, %i5, %i5		! merge
1638	mov	24, %l3			! align destination
16391:
1640	srl	%i5, %l3, %i4		! prepare to write a single byte
1641	stb	%i4, [%i1]		! write a byte
1642	add	%i1, 1, %i1		! increment destination address
1643	sub	%i2, 1, %i2		! decrement count
1644	btst	3, %i1			! is destination aligned?
1645	bnz,a	1b
1646	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
1647	sub	%l5, %l3, %l2		! generate shift left count (UD)
1648	sll	%i5, %l2, %i5		! move leftover into upper bytes
1649	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
1650	bgu	%ncc, .more_needed	! need more to fill than we have
1651	nop
1652
1653	sll	%i3, %l1, %i3		! clear upper used byte(s)
1654	srl	%i3, %l1, %i3
1655	! get the odd bytes between alignments
1656	sub	%l0, %l2, %l0		! regenerate shift count
1657	sub	%l5, %l0, %l1		! generate new shift left count (LS)
1658	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1659	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
1660	srl	%i3, %l0, %i4
1661	or	%i5, %i4, %i5
1662	st	%i5, [%i1]		! write a word
1663	subcc	%i2, 4, %i2		! decrement count
1664	bz	%ncc, .unalign_out
1665	add	%i1, 4, %i1		! increment destination address
1666
1667	b	2f
1668	sll	%i3, %l1, %i5		! get leftover into upper bits
1669.more_needed:
1670	sll	%i3, %l0, %i3		! save remaining byte(s)
1671	srl	%i3, %l0, %i3
1672	sub	%l2, %l0, %l1		! regenerate shift count
1673	sub	%l5, %l1, %l0		! generate new shift left count
1674	sll	%i3, %l1, %i4		! move to fill empty space
1675	b	3f
1676	or	%i5, %i4, %i5		! merge to complete word
1677	!
1678	! the source address is aligned and destination is not
1679	!
1680.align_dst_only:
1681	ld	[%i0], %i4		! read a word
1682	add	%i0, 4, %i0		! increment source address
1683	mov	24, %l0			! initial shift alignment count
16841:
1685	srl	%i4, %l0, %i3		! prepare to write a single byte
1686	stb	%i3, [%i1]		! write a byte
1687	add	%i1, 1, %i1		! increment destination address
1688	sub	%i2, 1, %i2		! decrement count
1689	btst	3, %i1			! is destination aligned?
1690	bnz,a	1b
1691	sub	%l0, 8, %l0		! delay slot, decrement shift count
1692.xfer:
1693	sub	%l5, %l0, %l1		! generate shift left count
1694	sll	%i4, %l1, %i5		! get leftover
16953:
1696	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
1697	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
16982:
1699	ld	[%i0], %i3		! read a source word
1700	add	%i0, 4, %i0		! increment source address
1701	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
1702	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
1703	st	%i5, [%i1]		! write a destination word
1704	subcc	%i2, 4, %i2		! decrement count
1705	bz	%ncc, .unalign_out	! check if done
1706	add	%i1, 4, %i1		! increment destination address
1707	b	2b			! loop
1708	sll	%i3, %l1, %i5		! get leftover
1709.unalign_out:
1710	tst	%l4			! any bytes leftover?
1711	bz	%ncc, .cpdone
1712	.empty				! allow next instruction in delay slot
17131:
1714	sub	%l0, 8, %l0		! decrement shift
1715	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
1716	stb	%i4, [%i1]		! write a byte
1717	subcc	%l4, 1, %l4		! decrement count
1718	bz	%ncc, .cpdone		! done?
1719	add	%i1, 1, %i1		! increment destination
1720	tst	%l0			! any more previously read bytes
1721	bnz	%ncc, 1b		! we have leftover bytes
1722	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
1723	b	.dbytecp		! let dbytecp do the rest
1724	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1725	!
1726	! the destination address is aligned and the source is not
1727	!
1728.align_src_only:
1729	ldub	[%i0], %i3		! read a byte from source address
1730	add	%i0, 1, %i0		! increment source address
1731	or	%i4, %i3, %i4		! or in with previous bytes (if any)
1732	btst	3, %i0			! is source aligned?
1733	add	%l0, 8, %l0		! increment shift count (US)
1734	bnz,a	.align_src_only
1735	sll	%i4, 8, %i4		! make room for next byte
1736	b,a	.xfer
1737	!
1738	! if from address unaligned for double-word moves,
1739	! move bytes till it is, if count is < 56 it could take
1740	! longer to align the thing than to do the transfer
1741	! in word size chunks right away
1742	!
1743.aldoubcp:
1744	cmp	%i2, 56			! if count < 56, use wordcp, it takes
1745	blu,a	%ncc, .alwordcp		! longer to align doubles than words
1746	mov	3, %o0			! mask for word alignment
1747	call	.alignit		! copy bytes until aligned
1748	mov	7, %o0			! mask for double alignment
1749	!
1750	! source and destination are now double-word aligned
1751	! i3 has aligned count returned by alignit
1752	!
1753	and	%i2, 7, %i2		! unaligned leftover count
1754	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
17555:
1756	ldx	[%i0+%i1], %o4		! read from address
1757	stx	%o4, [%i1]		! write at destination address
1758	subcc	%i3, 8, %i3		! dec count
1759	bgu	%ncc, 5b
1760	add	%i1, 8, %i1		! delay slot, inc to address
1761	cmp	%i2, 4			! see if we can copy a word
1762	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
1763	.empty
1764	!
1765	! for leftover bytes we fall into wordcp, if needed
1766	!
1767.wordcp:
1768	and	%i2, 3, %i2		! unaligned leftover count
17695:
1770	ld	[%i0+%i1], %o4		! read from address
1771	st	%o4, [%i1]		! write at destination address
1772	subcc	%i3, 4, %i3		! dec count
1773	bgu	%ncc, 5b
1774	add	%i1, 4, %i1		! delay slot, inc to address
1775	b,a	.dbytecp
1776
1777	! we come here to align copies on word boundaries
1778.alwordcp:
1779	call	.alignit		! go word-align it
1780	mov	3, %o0			! bits that must be zero to be aligned
1781	b	.wordcp
1782	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
1783
1784	!
1785	! byte copy, works with any alignment
1786	!
1787.bytecp:
1788	b	.dbytecp
1789	sub	%i0, %i1, %i0		! i0 gets difference of src and dst
1790
1791	!
1792	! differenced byte copy, works with any alignment
1793	! assumes dest in %i1 and (source - dest) in %i0
1794	!
17951:
1796	stb	%o4, [%i1]		! write to address
1797	inc	%i1			! inc to address
1798.dbytecp:
1799	deccc	%i2			! dec count
1800	bgeu,a	%ncc, 1b		! loop till done
1801	ldub	[%i0+%i1], %o4		! read from address
1802	!
1803	! FPUSED_FLAG will not have been set in any path leading to
1804	! this point. No need to deal with it.
1805	!
1806.cpdone:
1807	btst	BCOPY_FLAG, %l6
1808	bz,pn	%icc, 2f
1809	andncc	%l6, BCOPY_FLAG, %l6
1810	!
1811	! Here via bcopy. Check to see if the handler was NULL.
1812	! If so, just return quietly. Otherwise, reset the
1813	! handler and go home.
1814	!
1815	bnz,pn	%ncc, 2f
1816	nop
1817	!
1818	! Null handler.
1819	!
1820	ret
1821	restore %g0, 0, %o0
1822	!
1823	! Here via kcopy or bcopy with a handler.  Reset the
1824	! fault handler.
1825	!
18262:
1827  	membar	#Sync
1828	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1829	ret
1830	restore	%g0, 0, %o0		! return (0)
1831
1832/*
1833 * Common code used to align transfers on word and doubleword
1834 * boundaries.  Aligns source and destination and returns a count
1835 * of aligned bytes to transfer in %i3
1836 */
18371:
	!
	! Loop body (entered only from the branch below): store the byte
	! that the delay slot just loaded into %o4 and step both pointers.
	!
1838	inc	%i0			! inc from
1839	stb	%o4, [%i1]		! write a byte
1840	inc	%i1			! inc to
1841	dec	%i2			! dec count
	!
	! Entry point.  Reached with "call .alignit", mask in %o0 (3 or 7).
	!	%i0 - source address, advanced until (%i0 & %o0) == 0
	!	%i1 - destination address, advanced in step with %i0
	!	%i2 - byte count, reduced by the number of bytes moved here
	!	%o4 - scratch (byte in flight)
	!	%i3 - result: %i2 with the mask bits cleared
	! Only the source address is tested; the callers in this file get
	! here after checking that (src ^ dst) has the mask bits clear.
	!
1842.alignit:
1843	btst	%o0, %i0		! %o0 is bit mask to check for alignment
1844	bnz,a	1b
1845	ldub	[%i0], %o4		! read next byte (annulled once aligned)
1846
1847	retl
1848	andn	%i2, %o0, %i3		! return size of aligned bytes
1849	SET_SIZE(bcopy)
1850
1851/*
1852 * Block copy with possibly overlapped operands.
1853 */
1854
1855	ENTRY(ovbcopy)
	!
	! Leaf routine: no register window is taken and it returns with retl.
	!	%o0 - source ("from") address
	!	%o1 - destination ("to") address
	!	%o2 - byte count
	!	%o3 - scratch: abs(from - to), then the byte being moved
	!
	! A zero count returns at once.  If count <= abs(from - to) the two
	! regions cannot overlap and control goes to bcopy with %o0-%o2
	! untouched.  Otherwise the bytes are moved one at a time, backwards
	! when from < to and forwards in every other case.
	!
1856	tst	%o2			! check count
1857	bgu,a	%ncc, 1f		! count != 0: go compare the addresses
1858	subcc	%o0, %o1, %o3		! difference of from and to address
1859
1860	retl				! return
1861	nop
18621:
	! The neg in the annulled delay slot only runs when the difference
	! computed above is negative, leaving abs(from - to) in %o3.
1863	bneg,a	%ncc, 2f
1864	neg	%o3			! if < 0, make it positive
18652:	cmp	%o2, %o3		! cmp size and abs(from - to)
1866	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1867	.empty				!   no overlap
	! The cmp below sits in the delay slot of the branch to bcopy, so it
	! is executed on both paths; it only changes the condition codes.
1868	cmp	%o0, %o1		! compare from and to addresses
1869	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1870	nop
1871	!
1872	! Copy forwards.
1873	!
1874.ov_fwd:
1875	ldub	[%o0], %o3		! read from address
1876	inc	%o0			! inc from address
1877	stb	%o3, [%o1]		! write to address
1878	deccc	%o2			! dec count
1879	bgu	%ncc, .ov_fwd		! loop till done
1880	inc	%o1			! inc to address
1881
1882	retl				! return
1883	nop
1884	!
1885	! Copy backwards.
1886	!
	! %o2 doubles as the index: it is decremented before use, so bytes
	! count-1 down to 0 are moved, highest address first.  The store is
	! in the delay slot and therefore also runs on the final iteration.
	!
1887.ov_bkwd:
1888	deccc	%o2			! dec count
1889	ldub	[%o0 + %o2], %o3	! get byte at end of src
1890	bgu	%ncc, .ov_bkwd		! loop till done
1891	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1892
1893	retl				! return
1894	nop
1895	SET_SIZE(ovbcopy)
1896
1897/*
1898 * hwblkpagecopy()
1899 *
1900 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1901 * has already disabled kernel preemption and has checked
1902 * use_hw_bcopy.
1903 */
1904	ENTRY(hwblkpagecopy)
1905	! get another window w/space for three aligned blocks of saved fpregs
	! (4*64 bytes are reserved so the three 64-byte save slots can be
	! rounded down to a 64-byte boundary; see the -193 / -64 below)
1906	save	%sp, -SA(MINFRAME + 4*64), %sp
1907
1908	! %i0 - source address (arg)
1909	! %i1 - destination address (arg)
1910	! %i2 - length of region (not arg)
1911	! %l0 - saved fprs
1912	! %l1 - pointer to saved fpregs
	! %l3 - scratch pointer while saving/restoring fpregs
	!
	! Only %d0-%d46 are written by this routine, which is why three
	! blocks (%d0, %d16, %d32) are saved and restored.  Returns 0.
1913
1914	rd	%fprs, %l0		! check for unused fp
1915	btst	FPRS_FEF, %l0
1916	bz	1f
1917	membar	#Sync			! delay slot: runs on both paths
1918
1919	! save in-use fpregs on stack
1920	add	%fp, STACK_BIAS - 193, %l1
1921	and	%l1, -64, %l1
1922	stda	%d0, [%l1]ASI_BLK_P
1923	add	%l1, 64, %l3
1924	stda	%d16, [%l3]ASI_BLK_P
1925	add	%l3, 64, %l3
1926	stda	%d32, [%l3]ASI_BLK_P
1927	membar	#Sync
1928
	! Enable the FPU and prime the pipeline with the first source block.
	! %i2 = bytes that will be stored from inside the loop (the final
	! block is stored after the loop, at 3: below).
19291:	wr	%g0, FPRS_FEF, %fprs
1930	ldda	[%i0]ASI_BLK_P, %d0
1931	add	%i0, 64, %i0
1932	set	PAGESIZE - 64, %i2
1933
	!
	! Main loop, two blocks per pass.  Each half loads the next source
	! block into %d16 (first half) or %d0 (second half) while the block
	! loaded previously is copied into %d32-%d46 and block-stored to
	! the destination.  %i2 is only tested for zero after the first
	! store of a pass, so the exit relies on PAGESIZE/64 being even.
	!
19342:	ldda	[%i0]ASI_BLK_P, %d16
1935	fsrc1	%d0, %d32
1936	fsrc1	%d2, %d34
1937	fsrc1	%d4, %d36
1938	fsrc1	%d6, %d38
1939	fsrc1	%d8, %d40
1940	fsrc1	%d10, %d42
1941	fsrc1	%d12, %d44
1942	fsrc1	%d14, %d46
1943	stda	%d32, [%i1]ASI_BLK_P
1944	add	%i0, 64, %i0
1945	subcc	%i2, 64, %i2
1946	bz,pn	%ncc, 3f
1947	add	%i1, 64, %i1
1948	ldda	[%i0]ASI_BLK_P, %d0
1949	fsrc1	%d16, %d32
1950	fsrc1	%d18, %d34
1951	fsrc1	%d20, %d36
1952	fsrc1	%d22, %d38
1953	fsrc1	%d24, %d40
1954	fsrc1	%d26, %d42
1955	fsrc1	%d28, %d44
1956	fsrc1	%d30, %d46
1957	stda	%d32, [%i1]ASI_BLK_P
1958	add	%i0, 64, %i0
1959	sub	%i2, 64, %i2
1960	ba,pt	%ncc, 2b
1961	add	%i1, 64, %i1
1962
	! The stda in the delay slot below is not annulled: it always runs
	! and writes the last block, which is still sitting in %d16.
19633:	membar	#Sync
1964	btst	FPRS_FEF, %l0
1965	bz	4f
1966	stda	%d16, [%i1]ASI_BLK_P
1967
1968	! restore fpregs from stack
1969	membar	#Sync
1970	ldda	[%l1]ASI_BLK_P, %d0
1971	add	%l1, 64, %l3
1972	ldda	[%l3]ASI_BLK_P, %d16
1973	add	%l3, 64, %l3
1974	ldda	[%l3]ASI_BLK_P, %d32
1975
19764:	wr	%l0, 0, %fprs		! restore fprs
1977	membar #Sync
1978	ret
1979	restore	%g0, 0, %o0		! return (0)
1980	SET_SIZE(hwblkpagecopy)
1981
1982
1983/*
1984 * Transfer data to and from user space -
1985 * Note that these routines can cause faults
1986 * It is assumed that the kernel has nothing at
1987 * less than KERNELBASE in the virtual address space.
1988 *
1989 * Note that copyin(9F) and copyout(9F) are part of the
1990 * DDI/DKI which specifies that they return '-1' on "errors."
1991 *
1992 * Sigh.
1993 *
1994 * So there's two extremely similar routines - xcopyin() and xcopyout()
1995 * which return the errno that we've faithfully computed.  This
1996 * allows other callers (e.g. uiomove(9F)) to work correctly.
1997 * Given that these are used pretty heavily, we expand the calling
1998 * sequences inline for all flavours (rather than making wrappers).
1999 *
2000 * There are also stub routines for xcopyout_little and xcopyin_little,
2001 * which currently are intended to handle requests of <= 16 bytes from
2002 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
2003 * is left as an exercise...
2004 */
2005
2006/*
2007 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
2008 *
2009 * General theory of operation:
2010 *
2011 * The only difference between default_copy{in,out} and
2012 * default_xcopy{in,out} is in the error handling routine they invoke
2013 * when a memory access error is seen. default_xcopyOP returns the errno
2014 * while default_copyOP returns -1 (see above). copy{in,out}_noerr set
2015 * a special flag (by oring the value 2 into the fault handler address)
2016 * if they are called with a fault handler already in place. That flag
2017 * causes the default handlers to trampoline to the previous handler
2018 * upon an error.
2019 *
2020 * None of the copyops routines grab a window until it's decided that
2021 * we need to do a HW block copy operation. This saves a window
2022 * spill/fill when we're called during socket ops. The typical IO
2023 * path won't cause spill/fill traps.
2024 *
2025 * This code uses a set of 4 limits for the maximum size that will
2026 * be copied given a particular input/output address alignment.
2027 * the default limits are:
2028 *
2029 * single byte aligned - 900 (hw_copy_limit_1)
2030 * two byte aligned - 1800 (hw_copy_limit_2)
2031 * four byte aligned - 3600 (hw_copy_limit_4)
2032 * eight byte aligned - 7200 (hw_copy_limit_8)
2033 *
2034 * If the value for a particular limit is zero, the copy will be done
2035 * via the copy loops rather than VIS.
2036 *
2037 * Flow:
2038 *
2039 * If count == zero return zero.
2040 *
2041 * Store the previous lo_fault handler into %g6.
2042 * Place our secondary lofault handler into %g5.
2043 * Place the address of our nowindow fault handler into %o3.
2044 * Place the address of the windowed fault handler into %o4.
2045 * --> We'll use this handler if we end up grabbing a window
2046 * --> before we use VIS instructions.
2047 *
2048 * If count is less than or equal to SMALL_LIMIT (7) we
2049 * always do a byte for byte copy.
2050 *
2051 * If count is > SMALL_LIMIT, we check the alignment of the input
2052 * and output pointers. Based on the alignment we check count
2053 * against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If
2054 * we're larger than VIS_COPY_THRESHOLD, we check against a limit based
2055 * on detected alignment. If we exceed the alignment value we copy
2056 * via VIS instructions.
2057 *
2058 * If we don't exceed one of the limits, we store -count in %o3,
2059 * we store the number of chunks (8, 4, 2 or 1 byte) operated
2060 * on in our basic copy loop in %o2. Following this we branch
2061 * to the appropriate copy loop and copy that many chunks.
2062 * Since we've been adding the chunk size to %o3 each time through
2063 * as well as decrementing %o2, we can tell if any data is
2064 * left to be copied by examining %o3. If that is zero, we're
2065 * done and can go home. If not, we figure out what the largest
2066 * chunk size left to be copied is and branch to that copy loop
2067 * unless there's only one byte left. We load that as we're
2068 * branching to code that stores it just before we return.
2069 *
2070 * There is one potential situation in which we start to do a VIS
2071 * copy but decide to punt and return to the copy loops. There is
2072 * (in the default configuration) a window of 256 bytes between
2073 * the single byte aligned copy limit and what VIS treats as its
2074 * minimum if floating point is in use in the calling app. We need
2075 * to be prepared to handle this. See the .small_copyOP label for
2076 * details.
2077 *
2078 * Fault handlers are invoked if we reference memory that has no
2079 * current mapping.  All forms share the same copyio_fault handler.
2080 * This routine handles fixing up the stack and general housecleaning.
2081 * Each copy operation has a simple fault handler that is then called
2082 * to do the work specific to the individual operation.  The handlers
2083 * for default_copyOP and copyOP_noerr are found at the end of
2084 * default_copyout. The handlers for default_xcopyOP are found at the
2085 * end of xdefault_copyin.
2086 */
2087
2088/*
2089 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
2090 */
2091
2092/*
2093 * We save the arguments in the following registers in case of a fault:
2094 * 	kaddr - %g2
2095 * 	uaddr - %g3
2096 * 	count - %g4
2097 */
2098#define	SAVE_SRC	%g2
2099#define	SAVE_DST	%g3
2100#define	SAVE_COUNT	%g4
2101
/*
 * REAL_LOFAULT  - address that copyio_fault and copyio_fault_nowindow
 *		   jump to once their own cleanup is done.
 * SAVED_LOFAULT - the t_lofault value read on entry to the copy routine;
 *		   it is written back to curthread->t_lofault on the way
 *		   out.  copyio_fault also tests FPUSED_FLAG in it.
 */
2102#define	REAL_LOFAULT		%g5
2103#define	SAVED_LOFAULT		%g6
2104
2105/*
2106 * Generic copyio fault handler.  This is the first line of defense when a
2107 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
2108 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
2109 * This allows us to share common code for all the flavors of the copy
2110 * operations, including the _noerr versions.
2111 *
2112 * Note that this function will restore the original input parameters before
2113 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
2114 * member of the t_copyop structure, if needed.
2115 */
2116	ENTRY(copyio_fault)
	!
	! If FPUSED_FLAG is set in SAVED_LOFAULT, put back the %gsr, %fprs
	! and floating point registers from the current frame.  The flag
	! is stripped in the delay slot, which runs on both paths because
	! the branch is not annulled.
	!
2117	btst	FPUSED_FLAG, SAVED_LOFAULT
2118	bz	1f
2119	  andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2120
2121	membar	#Sync
2122
2123	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2124	wr	%o2, 0, %gsr		! restore gsr
2125
	! FPRS_FEF clear in the saved %fprs: skip the reload and zero the
	! fp registers instead (4: below).
2126	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2127	btst	FPRS_FEF, %o3
2128	bz	4f
2129	  nop
2130
2131	! restore fpregs from stack
	! (four 64-byte blocks, %d0-%d62, from a 64-byte aligned save area)
2132	membar	#Sync
2133	add	%fp, STACK_BIAS - 257, %o2
2134	and	%o2, -64, %o2
2135	ldda	[%o2]ASI_BLK_P, %d0
2136	add	%o2, 64, %o2
2137	ldda	[%o2]ASI_BLK_P, %d16
2138	add	%o2, 64, %o2
2139	ldda	[%o2]ASI_BLK_P, %d32
2140	add	%o2, 64, %o2
2141	ldda	[%o2]ASI_BLK_P, %d48
2142	membar	#Sync
2143
2144	ba,pt	%ncc, 1f
2145	  wr	%o3, 0, %fprs		! restore fprs
2146
21474:
2148	FZERO				! zero all of the fpregs
2149	wr	%o3, 0, %fprs		! restore fprs
2150
21511:
2152
	! Pop the register window (done on every path through this
	! handler), then hand the original arguments, kept in the SAVE_*
	! globals, to the operation-specific handler in REAL_LOFAULT.
	! This routine does not itself write t_lofault.
2153	restore
2154
2155	mov	SAVE_SRC, %o0
2156	mov	SAVE_DST, %o1
2157	jmp	REAL_LOFAULT
2158	  mov	SAVE_COUNT, %o2		! delay slot: third argument
2159	SET_SIZE(copyio_fault)
2160
2161	ENTRY(copyio_fault_nowindow)
	!
	! Fault handler variant that neither pops a register window nor
	! touches floating point state.  It puts the saved handler back in
	! curthread->t_lofault, reloads the original arguments from the
	! SAVE_* globals and jumps to the handler held in REAL_LOFAULT.
	!
2162	membar	#Sync
2163	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2164
2165	mov	SAVE_SRC, %o0
2166	mov	SAVE_DST, %o1
2167	jmp	REAL_LOFAULT
2168	  mov	SAVE_COUNT, %o2		! delay slot: third argument
2169	SET_SIZE(copyio_fault_nowindow)
2170
2171	ENTRY(copyout)
2172	sethi	%hi(.copyout_err), REAL_LOFAULT
2173	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT
2174
2175.do_copyout:
2176	!
2177	! Check the length and bail if zero.
2178	!
2179	tst	%o2
2180	bnz,pt	%ncc, 1f
2181	  nop
2182	retl
2183	  clr	%o0
21841:
2185	sethi	%hi(copyio_fault), %o4
2186	or	%o4, %lo(copyio_fault), %o4
2187	sethi	%hi(copyio_fault_nowindow), %o3
2188	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
2189	or	%o3, %lo(copyio_fault_nowindow), %o3
2190	membar	#Sync
2191	stn	%o3, [THREAD_REG + T_LOFAULT]
2192
2193	mov	%o0, SAVE_SRC
2194	mov	%o1, SAVE_DST
2195	mov	%o2, SAVE_COUNT
2196
2197	!
2198	! Check to see if we're more than SMALL_LIMIT (7 bytes).
2199	! Run in leaf mode, using the %o regs as our input regs.
2200	!
2201	subcc	%o2, SMALL_LIMIT, %o3
2202	bgu,a,pt %ncc, .dco_ns
2203	or	%o0, %o1, %o3
2204	!
2205	! What was previously ".small_copyout"
2206	! Do full differenced copy.
2207	!
2208.dcobcp:
2209	sub	%g0, %o2, %o3		! negate count
2210	add	%o0, %o2, %o0		! make %o0 point at the end
2211	add	%o1, %o2, %o1		! make %o1 point at the end
2212	ba,pt	%ncc, .dcocl
2213	ldub	[%o0 + %o3], %o4	! load first byte
2214	!
2215	! %o0 and %o1 point at the end and remain pointing at the end
2216	! of their buffers. We pull things out by adding %o3 (which is
2217	! the negation of the length) to the buffer end which gives us
2218	! the current location in the buffers. By incrementing %o3 we walk
2219	! through both buffers without having to bump each buffer's
2220	! pointer. A very fast 4 instruction loop.
2221	!
2222	.align 16
2223.dcocl:
2224	stba	%o4, [%o1 + %o3]ASI_USER
2225	inccc	%o3
2226	bl,a,pt	%ncc, .dcocl
2227	ldub	[%o0 + %o3], %o4
2228	!
2229	! We're done. Go home.
2230	!
2231	membar	#Sync
2232	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
2233	retl
2234	clr	%o0
2235	!
2236	! Try aligned copies from here.
2237	!
2238.dco_ns:
2239	! %o0 = kernel addr (to be copied from)
2240	! %o1 = user addr (to be copied to)
2241	! %o2 = length
2242	! %o3 = %o0 | %o1 (used for alignment checking)
2243	! %o4 is alternate lo_fault
2244	! %o5 is original lo_fault
2245	!
2246	! See if we're single byte aligned. If we are, check the
2247	! limit for single byte copies. If we're smaller or equal,
2248	! bounce to the byte for byte copy loop. Otherwise do it in
2249	! HW (if enabled).
2250	!
2251	btst	1, %o3
2252	bz,pt	%icc, .dcoh8
2253	btst	7, %o3
2254	!
2255	! Single byte aligned. Do we do it via HW or via
2256	! byte for byte? Do a quick no memory reference
2257	! check to pick up small copies.
2258	!
2259	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2260	bleu,pt	%ncc, .dcobcp
2261	sethi	%hi(hw_copy_limit_1), %o3
2262	!
2263	! Big enough that we need to check the HW limit for
2264	! this size copy.
2265	!
2266	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2267	!
2268	! Is HW copy on? If not, do everything byte for byte.
2269	!
2270	tst	%o3
2271	bz,pn	%icc, .dcobcp
2272	subcc	%o3, %o2, %o3
2273	!
2274	! If we're less than or equal to the single byte copy limit,
2275	! bop to the copy loop.
2276	!
2277	bge,pt	%ncc, .dcobcp
2278	nop
2279	!
2280	! We're big enough and copy is on. Do it with HW.
2281	!
2282	ba,pt	%ncc, .big_copyout
2283	nop
2284.dcoh8:
2285	!
2286	! 8 byte aligned?
2287	!
2288	bnz,a	%ncc, .dcoh4
2289	btst	3, %o3
2290	!
2291	! See if we're in the "small range".
2292	! If so, go off and do the copy.
2293	! If not, load the hard limit. %o3 is
2294	! available for reuse.
2295	!
2296	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2297	bleu,pt	%ncc, .dcos8
2298	sethi	%hi(hw_copy_limit_8), %o3
2299	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2300	!
2301	! If it's zero, there's no HW bcopy.
2302	! Bop off to the aligned copy.
2303	!
2304	tst	%o3
2305	bz,pn	%icc, .dcos8
2306	subcc	%o3, %o2, %o3
2307	!
2308	! We're negative if our size is larger than hw_copy_limit_8.
2309	!
2310	bge,pt	%ncc, .dcos8
2311	nop
2312	!
2313	! HW assist is on and we're large enough. Do it.
2314	!
2315	ba,pt	%ncc, .big_copyout
2316	nop
2317.dcos8:
2318	!
2319	! Housekeeping for copy loops. Uses same idea as in the byte for
2320	! byte copy loop above.
2321	!
2322	add	%o0, %o2, %o0
2323	add	%o1, %o2, %o1
2324	sub	%g0, %o2, %o3
2325	ba,pt	%ncc, .dodebc
2326	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
2327	!
2328	! 4 byte aligned?
2329	!
2330.dcoh4:
2331	bnz,pn	%ncc, .dcoh2
2332	!
2333	! See if we're in the "small range".
2334	! If so, go off and do the copy.
2335	! If not, load the hard limit. %o3 is
2336	! available for reuse.
2337	!
2338	subcc	%o2, VIS_COPY_THRESHOLD, %o3
2339	bleu,pt	%ncc, .dcos4
2340	sethi	%hi(hw_copy_limit_4), %o3
2341	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2342	!
2343	! If it's zero, there's no HW bcopy.
2344	! Bop off to the aligned copy.
2345	!
2346	tst	%o3
2347	bz,pn	%icc, .dcos4
2348	subcc	%o3, %o2, %o3
2349	!
2350	! We're negative if our size is larger than hw_copy_limit_4.
2351	!
2352	bge,pt	%ncc, .dcos4
2353	nop
2354	!
2355	! HW assist is on and we're large enough. Do it.
2356	!
2357	ba,pt	%ncc, .big_copyout
2358	nop
2359.dcos4:
2360	add	%o0, %o2, %o0
2361	add	%o1, %o2, %o1
2362	sub	%g0, %o2, %o3
2363	ba,pt	%ncc, .dodfbc
2364	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
2365	!
2366	! We must be 2 byte aligned. Off we go.
2367	! The check for small copies was done in the
2368	! delay at .dcoh4
2369	!
2370.dcoh2:
2371	ble	%ncc, .dcos2
2372	sethi	%hi(hw_copy_limit_2), %o3
2373	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2374	tst	%o3
2375	bz,pn	%icc, .dcos2
2376	subcc	%o3, %o2, %o3
2377	bge,pt	%ncc, .dcos2
2378	nop
2379	!
2380	! HW is on and we're big enough. Do it.
2381	!
2382	ba,pt	%ncc, .big_copyout
2383	nop
2384.dcos2:
2385	add	%o0, %o2, %o0
2386	add	%o1, %o2, %o1
2387	sub	%g0, %o2, %o3
2388	ba,pt	%ncc, .dodtbc
2389	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
2390.small_copyout:
2391	!
2392	! Why are we doing this AGAIN? There are certain conditions in
2393	! big_copyout that will cause us to forego the HW assisted copies
2394	! and bounce back to a non-HW assisted copy. This dispatches those
2395	! copies. Note that we branch around this in the main line code.
2396	!
2397	! We make no check for limits or HW enablement here. We've
2398	! already been told that we're a poster child so just go off
2399	! and do it.
2400	!
2401	or	%o0, %o1, %o3
2402	btst	1, %o3
2403	bnz	%icc, .dcobcp		! Most likely
2404	btst	7, %o3
2405	bz	%icc, .dcos8
2406	btst	3, %o3
2407	bz	%icc, .dcos4
2408	nop
2409	ba,pt	%ncc, .dcos2
2410	nop
2411	.align 32
2412.dodebc:
2413	ldx	[%o0 + %o3], %o4
2414	deccc	%o2
2415	stxa	%o4, [%o1 + %o3]ASI_USER
2416	bg,pt	%ncc, .dodebc
2417	addcc	%o3, 8, %o3
2418	!
2419	! End of copy loop. Check to see if we're done. Most
2420	! eight byte aligned copies end here.
2421	!
2422	bz,pt	%ncc, .dcofh
2423	nop
2424	!
2425	! Something is left - do it byte for byte.
2426	!
2427	ba,pt	%ncc, .dcocl
2428	ldub	[%o0 + %o3], %o4	! load next byte
2429	!
2430	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
2431	!
2432	.align 32
2433.dodfbc:
2434	lduw	[%o0 + %o3], %o4
2435	deccc	%o2
2436	sta	%o4, [%o1 + %o3]ASI_USER
2437	bg,pt	%ncc, .dodfbc
2438	addcc	%o3, 4, %o3
2439	!
2440	! End of copy loop. Check to see if we're done. Most
2441	! four byte aligned copies end here.
2442	!
2443	bz,pt	%ncc, .dcofh
2444	nop
2445	!
2446	! Something is left. Do it byte for byte.
2447	!
2448	ba,pt	%ncc, .dcocl
2449	ldub	[%o0 + %o3], %o4	! load next byte
2450	!
2451	! two byte aligned copy loop. %o2 is the number of 2 byte chunks to
2452	! copy.
2453	!
2454	.align 32
2455.dodtbc:
2456	lduh	[%o0 + %o3], %o4
2457	deccc	%o2
2458	stha	%o4, [%o1 + %o3]ASI_USER
2459	bg,pt	%ncc, .dodtbc
2460	addcc	%o3, 2, %o3
2461	!
2462	! End of copy loop. Anything left?
2463	!
2464	bz,pt	%ncc, .dcofh
2465	nop
2466	!
2467	! Deal with the last byte
2468	!
2469	ldub	[%o0 + %o3], %o4
2470	stba	%o4, [%o1 + %o3]ASI_USER
2471.dcofh:
2472	membar	#Sync
2473	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2474	retl
2475	clr	%o0
2476
2477.big_copyout:
2478	!
2479	! Are we using the FP registers?
2480	!
2481	rd	%fprs, %o3			! check for unused fp
2482	btst	FPRS_FEF, %o3
2483	bnz	%icc, .copyout_fpregs_inuse
2484	nop
2485	!
2486	! We're going to go off and do a block copy.
2487	! Switch fault handlers and grab a window. We
2488	! don't do a membar #Sync since we've done only
2489	! kernel data to this point.
2490	!
2491	stn	%o4, [THREAD_REG + T_LOFAULT]
2492	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2493	!
2494	! %o3 is now %i3. Save original %fprs.
2495	!
2496	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
2497	ba,pt	%ncc, .do_block_copyout		! Not in use. Go off and do it.
2498	wr	%g0, FPRS_FEF, %fprs		! clear %fprs
2499	!
2500.copyout_fpregs_inuse:
2501	!
2502	! We're here if the FP regs are in use. Need to see if the request
2503	! exceeds our suddenly larger minimum.
2504	!
2505	cmp	%i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger
2506	bl	%ncc, .small_copyout
2507	  nop
2508	!
2509	! We're going to go off and do a block copy.
2510	! Change to the heavy duty fault handler and grab a window first.
2511	!
2512	stn	%o4, [THREAD_REG + T_LOFAULT]
2513	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2514	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
2515	!
2516	! save in-use fpregs on stack
2517	!
2518	wr	%g0, FPRS_FEF, %fprs
2519	membar	#Sync
2520	add	%fp, STACK_BIAS - 257, %o2
2521	and	%o2, -64, %o2
2522	stda	%d0, [%o2]ASI_BLK_P
2523	add	%o2, 64, %o2
2524	stda	%d16, [%o2]ASI_BLK_P
2525	add	%o2, 64, %o2
2526	stda	%d32, [%o2]ASI_BLK_P
2527	add	%o2, 64, %o2
2528	stda	%d48, [%o2]ASI_BLK_P
2529	membar	#Sync
2530
2531.do_block_copyout:
2532	membar	#StoreStore|#StoreLoad|#LoadStore
2533
2534	rd	%gsr, %o2
2535	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2536
2537	! Set the lower bit in the saved t_lofault to indicate
2538	! that we need to clear the %fprs register on the way
2539	! out
2540	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
2541
2542	! Swap src/dst since the code below is memcpy code
2543	! and memcpy/bcopy have different calling sequences
2544	mov	%i1, %i5
2545	mov	%i0, %i1
2546	mov	%i5, %i0
2547
2548!!! This code is nearly identical to the version in the sun4u
2549!!! libc_psr.  Most bugfixes made to that file should be
2550!!! merged into this routine.
2551
2552	andcc	%i0, 7, %o3
2553	bz	%ncc, copyout_blkcpy
2554	sub	%o3, 8, %o3
2555	neg	%o3
2556	sub	%i2, %o3, %i2
2557
2558	! Align Destination on double-word boundary
2559
25602:	ldub	[%i1], %o4
2561	inc	%i1
2562	stba	%o4, [%i0]ASI_USER
2563	deccc	%o3
2564	bgu	%ncc, 2b
2565	  inc	%i0
2566copyout_blkcpy:
2567	andcc	%i0, 63, %i3
2568	bz,pn	%ncc, copyout_blalign	! now block aligned
2569	sub	%i3, 64, %i3
2570	neg	%i3			! bytes till block aligned
2571	sub	%i2, %i3, %i2		! update %i2 with new count
2572
2573	! Copy %i3 bytes till dst is block (64 byte) aligned. use
2574	! double word copies.
2575
2576	alignaddr %i1, %g0, %g1
2577	ldd	[%g1], %d0
2578	add	%g1, 8, %g1
25796:
2580	ldd	[%g1], %d2
2581	add	%g1, 8, %g1
2582	subcc	%i3, 8, %i3
2583	faligndata %d0, %d2, %d8
2584	stda	 %d8, [%i0]ASI_USER
2585	add	%i1, 8, %i1
2586	bz,pn	%ncc, copyout_blalign
2587	add	%i0, 8, %i0
2588	ldd	[%g1], %d0
2589	add	%g1, 8, %g1
2590	subcc	%i3, 8, %i3
2591	faligndata %d2, %d0, %d8
2592	stda	 %d8, [%i0]ASI_USER
2593	add	%i1, 8, %i1
2594	bgu,pn	%ncc, 6b
2595	add	%i0, 8, %i0
2596
2597copyout_blalign:
2598	membar	#StoreLoad
2599	! %i2 = total length
2600	! %i3 = blocks	(length - 64) / 64
2601	! %i4 = doubles remaining  (length - blocks)
2602	sub	%i2, 64, %i3
2603	andn	%i3, 63, %i3
2604	sub	%i2, %i3, %i4
2605	andn	%i4, 7, %i4
2606	sub	%i4, 16, %i4
2607	sub	%i2, %i4, %i2
2608	sub	%i2, %i3, %i2
2609
2610	andn	%i1, 0x3f, %l7		! blk aligned address
2611	alignaddr %i1, %g0, %g0		! gen %gsr
2612
2613	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
2614	andcc	%l5, 7, %i5		! mask everything except bits 1,2 3
2615	add	%i1, %i4, %i1
2616	add	%i1, %i3, %i1
2617
2618	ldda	[%l7]ASI_BLK_P, %d0
2619	add	%l7, 64, %l7
2620	ldda	[%l7]ASI_BLK_P, %d16
2621	add	%l7, 64, %l7
2622	ldda	[%l7]ASI_BLK_P, %d32
2623	add	%l7, 64, %l7
2624	sub	%i3, 128, %i3
2625
2626	! switch statement to get us to the right 8 byte blk within a
2627	! 64 byte block
2628
2629	cmp	 %i5, 4
2630	bgeu,a	 copyout_hlf
2631	cmp	 %i5, 6
2632	cmp	 %i5, 2
2633	bgeu,a	 copyout_sqtr
2634	nop
2635	cmp	 %i5, 1
2636	be,a	 copyout_seg1
2637	nop
2638	ba,pt	 %ncc, copyout_seg0
2639	nop
2640copyout_sqtr:
2641	be,a	 copyout_seg2
2642	nop
2643	ba,pt	 %ncc, copyout_seg3
2644	nop
2645
2646copyout_hlf:
2647	bgeu,a	 copyout_fqtr
2648	nop
2649	cmp	 %i5, 5
2650	be,a	 copyout_seg5
2651	nop
2652	ba,pt	 %ncc, copyout_seg4
2653	nop
2654copyout_fqtr:
2655	be,a	 copyout_seg6
2656	nop
2657	ba,pt	 %ncc, copyout_seg7
2658	nop
2659
2660copyout_seg0:
2661	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2662	FALIGN_D0
2663	ldda	[%l7]ASI_BLK_P, %d0
2664	stda	%d48, [%i0]ASI_BLK_AIUS
2665	add	%l7, 64, %l7
2666	subcc	%i3, 64, %i3
2667	bz,pn	%ncc, 0f
2668	add	%i0, 64, %i0
2669	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2670	FALIGN_D16
2671	ldda	[%l7]ASI_BLK_P, %d16
2672	stda	%d48, [%i0]ASI_BLK_AIUS
2673	add	%l7, 64, %l7
2674	subcc	%i3, 64, %i3
2675	bz,pn	%ncc, 1f
2676	add	%i0, 64, %i0
2677	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2678	FALIGN_D32
2679	ldda	[%l7]ASI_BLK_P, %d32
2680	stda	%d48, [%i0]ASI_BLK_AIUS
2681	add	%l7, 64, %l7
2682	subcc	%i3, 64, %i3
2683	bz,pn	%ncc, 2f
2684	add	%i0, 64, %i0
2685	ba,a,pt	%ncc, copyout_seg0
2686
26870:
2688	FALIGN_D16
2689	stda	%d48, [%i0]ASI_BLK_AIUS
2690	add	%i0, 64, %i0
2691	membar	#Sync
2692	FALIGN_D32
2693	stda	%d48, [%i0]ASI_BLK_AIUS
2694	ba,pt	%ncc, copyout_blkd0
2695	add	%i0, 64, %i0
2696
26971:
2698	FALIGN_D32
2699	stda	%d48, [%i0]ASI_BLK_AIUS
2700	add	%i0, 64, %i0
2701	membar	#Sync
2702	FALIGN_D0
2703	stda	%d48, [%i0]ASI_BLK_AIUS
2704	ba,pt	%ncc, copyout_blkd16
2705	add	%i0, 64, %i0
2706
27072:
2708	FALIGN_D0
2709	stda	%d48, [%i0]ASI_BLK_AIUS
2710	add	%i0, 64, %i0
2711	membar	#Sync
2712	FALIGN_D16
2713	stda	%d48, [%i0]ASI_BLK_AIUS
2714	ba,pt	%ncc, copyout_blkd32
2715	add	%i0, 64, %i0
2716
2717copyout_seg1:
2718	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2719	FALIGN_D2
2720	ldda	[%l7]ASI_BLK_P, %d0
2721	stda	%d48, [%i0]ASI_BLK_AIUS
2722	add	%l7, 64, %l7
2723	subcc	%i3, 64, %i3
2724	bz,pn	%ncc, 0f
2725	add	%i0, 64, %i0
2726	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2727	FALIGN_D18
2728	ldda	[%l7]ASI_BLK_P, %d16
2729	stda	%d48, [%i0]ASI_BLK_AIUS
2730	add	%l7, 64, %l7
2731	subcc	%i3, 64, %i3
2732	bz,pn	%ncc, 1f
2733	add	%i0, 64, %i0
2734	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2735	FALIGN_D34
2736	ldda	[%l7]ASI_BLK_P, %d32
2737	stda	%d48, [%i0]ASI_BLK_AIUS
2738	add	%l7, 64, %l7
2739	subcc	%i3, 64, %i3
2740	bz,pn	%ncc, 2f
2741	add	%i0, 64, %i0
2742	ba,a,pt	%ncc, copyout_seg1
27430:
2744	FALIGN_D18
2745	stda	%d48, [%i0]ASI_BLK_AIUS
2746	add	%i0, 64, %i0
2747	membar	#Sync
2748	FALIGN_D34
2749	stda	%d48, [%i0]ASI_BLK_AIUS
2750	ba,pt	%ncc, copyout_blkd2
2751	add	%i0, 64, %i0
2752
27531:
2754	FALIGN_D34
2755	stda	%d48, [%i0]ASI_BLK_AIUS
2756	add	%i0, 64, %i0
2757	membar	#Sync
2758	FALIGN_D2
2759	stda	%d48, [%i0]ASI_BLK_AIUS
2760	ba,pt	%ncc, copyout_blkd18
2761	add	%i0, 64, %i0
2762
27632:
2764	FALIGN_D2
2765	stda	%d48, [%i0]ASI_BLK_AIUS
2766	add	%i0, 64, %i0
2767	membar	#Sync
2768	FALIGN_D18
2769	stda	%d48, [%i0]ASI_BLK_AIUS
2770	ba,pt	%ncc, copyout_blkd34
2771	add	%i0, 64, %i0
2772
2773copyout_seg2:
2774	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2775	FALIGN_D4
2776	ldda	[%l7]ASI_BLK_P, %d0
2777	stda	%d48, [%i0]ASI_BLK_AIUS
2778	add	%l7, 64, %l7
2779	subcc	%i3, 64, %i3
2780	bz,pn	%ncc, 0f
2781	add	%i0, 64, %i0
2782	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2783	FALIGN_D20
2784	ldda	[%l7]ASI_BLK_P, %d16
2785	stda	%d48, [%i0]ASI_BLK_AIUS
2786	add	%l7, 64, %l7
2787	subcc	%i3, 64, %i3
2788	bz,pn	%ncc, 1f
2789	add	%i0, 64, %i0
2790	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2791	FALIGN_D36
2792	ldda	[%l7]ASI_BLK_P, %d32
2793	stda	%d48, [%i0]ASI_BLK_AIUS
2794	add	%l7, 64, %l7
2795	subcc	%i3, 64, %i3
2796	bz,pn	%ncc, 2f
2797	add	%i0, 64, %i0
2798	ba,a,pt	%ncc, copyout_seg2
2799
28000:
2801	FALIGN_D20
2802	stda	%d48, [%i0]ASI_BLK_AIUS
2803	add	%i0, 64, %i0
2804	membar	#Sync
2805	FALIGN_D36
2806	stda	%d48, [%i0]ASI_BLK_AIUS
2807	ba,pt	%ncc, copyout_blkd4
2808	add	%i0, 64, %i0
2809
28101:
2811	FALIGN_D36
2812	stda	%d48, [%i0]ASI_BLK_AIUS
2813	add	%i0, 64, %i0
2814	membar	#Sync
2815	FALIGN_D4
2816	stda	%d48, [%i0]ASI_BLK_AIUS
2817	ba,pt	%ncc, copyout_blkd20
2818	add	%i0, 64, %i0
2819
28202:
2821	FALIGN_D4
2822	stda	%d48, [%i0]ASI_BLK_AIUS
2823	add	%i0, 64, %i0
2824	membar	#Sync
2825	FALIGN_D20
2826	stda	%d48, [%i0]ASI_BLK_AIUS
2827	ba,pt	%ncc, copyout_blkd36
2828	add	%i0, 64, %i0
2829
2830copyout_seg3:
2831	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2832	FALIGN_D6
2833	ldda	[%l7]ASI_BLK_P, %d0
2834	stda	%d48, [%i0]ASI_BLK_AIUS
2835	add	%l7, 64, %l7
2836	subcc	%i3, 64, %i3
2837	bz,pn	%ncc, 0f
2838	add	%i0, 64, %i0
2839	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2840	FALIGN_D22
2841	ldda	[%l7]ASI_BLK_P, %d16
2842	stda	%d48, [%i0]ASI_BLK_AIUS
2843	add	%l7, 64, %l7
2844	subcc	%i3, 64, %i3
2845	bz,pn	%ncc, 1f
2846	add	%i0, 64, %i0
2847	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2848	FALIGN_D38
2849	ldda	[%l7]ASI_BLK_P, %d32
2850	stda	%d48, [%i0]ASI_BLK_AIUS
2851	add	%l7, 64, %l7
2852	subcc	%i3, 64, %i3
2853	bz,pn	%ncc, 2f
2854	add	%i0, 64, %i0
2855	ba,a,pt	%ncc, copyout_seg3
2856
28570:
2858	FALIGN_D22
2859	stda	%d48, [%i0]ASI_BLK_AIUS
2860	add	%i0, 64, %i0
2861	membar	#Sync
2862	FALIGN_D38
2863	stda	%d48, [%i0]ASI_BLK_AIUS
2864	ba,pt	%ncc, copyout_blkd6
2865	add	%i0, 64, %i0
2866
28671:
2868	FALIGN_D38
2869	stda	%d48, [%i0]ASI_BLK_AIUS
2870	add	%i0, 64, %i0
2871	membar	#Sync
2872	FALIGN_D6
2873	stda	%d48, [%i0]ASI_BLK_AIUS
2874	ba,pt	%ncc, copyout_blkd22
2875	add	%i0, 64, %i0
2876
28772:
2878	FALIGN_D6
2879	stda	%d48, [%i0]ASI_BLK_AIUS
2880	add	%i0, 64, %i0
2881	membar	#Sync
2882	FALIGN_D22
2883	stda	%d48, [%i0]ASI_BLK_AIUS
2884	ba,pt	%ncc, copyout_blkd38
2885	add	%i0, 64, %i0
2886
2887copyout_seg4:
2888	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2889	FALIGN_D8
2890	ldda	[%l7]ASI_BLK_P, %d0
2891	stda	%d48, [%i0]ASI_BLK_AIUS
2892	add	%l7, 64, %l7
2893	subcc	%i3, 64, %i3
2894	bz,pn	%ncc, 0f
2895	add	%i0, 64, %i0
2896	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2897	FALIGN_D24
2898	ldda	[%l7]ASI_BLK_P, %d16
2899	stda	%d48, [%i0]ASI_BLK_AIUS
2900	add	%l7, 64, %l7
2901	subcc	%i3, 64, %i3
2902	bz,pn	%ncc, 1f
2903	add	%i0, 64, %i0
2904	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2905	FALIGN_D40
2906	ldda	[%l7]ASI_BLK_P, %d32
2907	stda	%d48, [%i0]ASI_BLK_AIUS
2908	add	%l7, 64, %l7
2909	subcc	%i3, 64, %i3
2910	bz,pn	%ncc, 2f
2911	add	%i0, 64, %i0
2912	ba,a,pt	%ncc, copyout_seg4
2913
29140:
2915	FALIGN_D24
2916	stda	%d48, [%i0]ASI_BLK_AIUS
2917	add	%i0, 64, %i0
2918	membar	#Sync
2919	FALIGN_D40
2920	stda	%d48, [%i0]ASI_BLK_AIUS
2921	ba,pt	%ncc, copyout_blkd8
2922	add	%i0, 64, %i0
2923
29241:
2925	FALIGN_D40
2926	stda	%d48, [%i0]ASI_BLK_AIUS
2927	add	%i0, 64, %i0
2928	membar	#Sync
2929	FALIGN_D8
2930	stda	%d48, [%i0]ASI_BLK_AIUS
2931	ba,pt	%ncc, copyout_blkd24
2932	add	%i0, 64, %i0
2933
29342:
2935	FALIGN_D8
2936	stda	%d48, [%i0]ASI_BLK_AIUS
2937	add	%i0, 64, %i0
2938	membar	#Sync
2939	FALIGN_D24
2940	stda	%d48, [%i0]ASI_BLK_AIUS
2941	ba,pt	%ncc, copyout_blkd40
2942	add	%i0, 64, %i0
2943
2944copyout_seg5:
2945	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
2946	FALIGN_D10
2947	ldda	[%l7]ASI_BLK_P, %d0
2948	stda	%d48, [%i0]ASI_BLK_AIUS
2949	add	%l7, 64, %l7
2950	subcc	%i3, 64, %i3
2951	bz,pn	%ncc, 0f
2952	add	%i0, 64, %i0
2953	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
2954	FALIGN_D26
2955	ldda	[%l7]ASI_BLK_P, %d16
2956	stda	%d48, [%i0]ASI_BLK_AIUS
2957	add	%l7, 64, %l7
2958	subcc	%i3, 64, %i3
2959	bz,pn	%ncc, 1f
2960	add	%i0, 64, %i0
2961	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
2962	FALIGN_D42
2963	ldda	[%l7]ASI_BLK_P, %d32
2964	stda	%d48, [%i0]ASI_BLK_AIUS
2965	add	%l7, 64, %l7
2966	subcc	%i3, 64, %i3
2967	bz,pn	%ncc, 2f
2968	add	%i0, 64, %i0
2969	ba,a,pt	%ncc, copyout_seg5
2970
29710:
2972	FALIGN_D26
2973	stda	%d48, [%i0]ASI_BLK_AIUS
2974	add	%i0, 64, %i0
2975	membar	#Sync
2976	FALIGN_D42
2977	stda	%d48, [%i0]ASI_BLK_AIUS
2978	ba,pt	%ncc, copyout_blkd10
2979	add	%i0, 64, %i0
2980
29811:
2982	FALIGN_D42
2983	stda	%d48, [%i0]ASI_BLK_AIUS
2984	add	%i0, 64, %i0
2985	membar	#Sync
2986	FALIGN_D10
2987	stda	%d48, [%i0]ASI_BLK_AIUS
2988	ba,pt	%ncc, copyout_blkd26
2989	add	%i0, 64, %i0
2990
29912:
2992	FALIGN_D10
2993	stda	%d48, [%i0]ASI_BLK_AIUS
2994	add	%i0, 64, %i0
2995	membar	#Sync
2996	FALIGN_D26
2997	stda	%d48, [%i0]ASI_BLK_AIUS
2998	ba,pt	%ncc, copyout_blkd42
2999	add	%i0, 64, %i0
3000
3001copyout_seg6:
3002	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3003	FALIGN_D12
3004	ldda	[%l7]ASI_BLK_P, %d0
3005	stda	%d48, [%i0]ASI_BLK_AIUS
3006	add	%l7, 64, %l7
3007	subcc	%i3, 64, %i3
3008	bz,pn	%ncc, 0f
3009	add	%i0, 64, %i0
3010	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3011	FALIGN_D28
3012	ldda	[%l7]ASI_BLK_P, %d16
3013	stda	%d48, [%i0]ASI_BLK_AIUS
3014	add	%l7, 64, %l7
3015	subcc	%i3, 64, %i3
3016	bz,pn	%ncc, 1f
3017	add	%i0, 64, %i0
3018	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3019	FALIGN_D44
3020	ldda	[%l7]ASI_BLK_P, %d32
3021	stda	%d48, [%i0]ASI_BLK_AIUS
3022	add	%l7, 64, %l7
3023	subcc	%i3, 64, %i3
3024	bz,pn	%ncc, 2f
3025	add	%i0, 64, %i0
3026	ba,a,pt	%ncc, copyout_seg6
3027
30280:
3029	FALIGN_D28
3030	stda	%d48, [%i0]ASI_BLK_AIUS
3031	add	%i0, 64, %i0
3032	membar	#Sync
3033	FALIGN_D44
3034	stda	%d48, [%i0]ASI_BLK_AIUS
3035	ba,pt	%ncc, copyout_blkd12
3036	add	%i0, 64, %i0
3037
30381:
3039	FALIGN_D44
3040	stda	%d48, [%i0]ASI_BLK_AIUS
3041	add	%i0, 64, %i0
3042	membar	#Sync
3043	FALIGN_D12
3044	stda	%d48, [%i0]ASI_BLK_AIUS
3045	ba,pt	%ncc, copyout_blkd28
3046	add	%i0, 64, %i0
3047
30482:
3049	FALIGN_D12
3050	stda	%d48, [%i0]ASI_BLK_AIUS
3051	add	%i0, 64, %i0
3052	membar	#Sync
3053	FALIGN_D28
3054	stda	%d48, [%i0]ASI_BLK_AIUS
3055	ba,pt	%ncc, copyout_blkd44
3056	add	%i0, 64, %i0
3057
3058copyout_seg7:
3059	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3060	FALIGN_D14
3061	ldda	[%l7]ASI_BLK_P, %d0
3062	stda	%d48, [%i0]ASI_BLK_AIUS
3063	add	%l7, 64, %l7
3064	subcc	%i3, 64, %i3
3065	bz,pn	%ncc, 0f
3066	add	%i0, 64, %i0
3067	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3068	FALIGN_D30
3069	ldda	[%l7]ASI_BLK_P, %d16
3070	stda	%d48, [%i0]ASI_BLK_AIUS
3071	add	%l7, 64, %l7
3072	subcc	%i3, 64, %i3
3073	bz,pn	%ncc, 1f
3074	add	%i0, 64, %i0
3075	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3076	FALIGN_D46
3077	ldda	[%l7]ASI_BLK_P, %d32
3078	stda	%d48, [%i0]ASI_BLK_AIUS
3079	add	%l7, 64, %l7
3080	subcc	%i3, 64, %i3
3081	bz,pn	%ncc, 2f
3082	add	%i0, 64, %i0
3083	ba,a,pt	%ncc, copyout_seg7
3084
30850:
3086	FALIGN_D30
3087	stda	%d48, [%i0]ASI_BLK_AIUS
3088	add	%i0, 64, %i0
3089	membar	#Sync
3090	FALIGN_D46
3091	stda	%d48, [%i0]ASI_BLK_AIUS
3092	ba,pt	%ncc, copyout_blkd14
3093	add	%i0, 64, %i0
3094
30951:
3096	FALIGN_D46
3097	stda	%d48, [%i0]ASI_BLK_AIUS
3098	add	%i0, 64, %i0
3099	membar	#Sync
3100	FALIGN_D14
3101	stda	%d48, [%i0]ASI_BLK_AIUS
3102	ba,pt	%ncc, copyout_blkd30
3103	add	%i0, 64, %i0
3104
31052:
3106	FALIGN_D14
3107	stda	%d48, [%i0]ASI_BLK_AIUS
3108	add	%i0, 64, %i0
3109	membar	#Sync
3110	FALIGN_D30
3111	stda	%d48, [%i0]ASI_BLK_AIUS
3112	ba,pt	%ncc, copyout_blkd46
3113	add	%i0, 64, %i0
3114
3115
3116	!
3117	! dribble out the last partial block
3118	!
3119copyout_blkd0:
3120	subcc	%i4, 8, %i4
3121	blu,pn	%ncc, copyout_blkdone
3122	faligndata %d0, %d2, %d48
3123	stda	%d48, [%i0]ASI_USER
3124	add	%i0, 8, %i0
3125copyout_blkd2:
3126	subcc	%i4, 8, %i4
3127	blu,pn	%ncc, copyout_blkdone
3128	faligndata %d2, %d4, %d48
3129	stda	%d48, [%i0]ASI_USER
3130	add	%i0, 8, %i0
3131copyout_blkd4:
3132	subcc	%i4, 8, %i4
3133	blu,pn	%ncc, copyout_blkdone
3134	faligndata %d4, %d6, %d48
3135	stda	%d48, [%i0]ASI_USER
3136	add	%i0, 8, %i0
3137copyout_blkd6:
3138	subcc	%i4, 8, %i4
3139	blu,pn	%ncc, copyout_blkdone
3140	faligndata %d6, %d8, %d48
3141	stda	%d48, [%i0]ASI_USER
3142	add	%i0, 8, %i0
3143copyout_blkd8:
3144	subcc	%i4, 8, %i4
3145	blu,pn	%ncc, copyout_blkdone
3146	faligndata %d8, %d10, %d48
3147	stda	%d48, [%i0]ASI_USER
3148	add	%i0, 8, %i0
3149copyout_blkd10:
3150	subcc	%i4, 8, %i4
3151	blu,pn	%ncc, copyout_blkdone
3152	faligndata %d10, %d12, %d48
3153	stda	%d48, [%i0]ASI_USER
3154	add	%i0, 8, %i0
3155copyout_blkd12:
3156	subcc	%i4, 8, %i4
3157	blu,pn	%ncc, copyout_blkdone
3158	faligndata %d12, %d14, %d48
3159	stda	%d48, [%i0]ASI_USER
3160	add	%i0, 8, %i0
3161copyout_blkd14:
3162	subcc	%i4, 8, %i4
3163	blu,pn	%ncc, copyout_blkdone
3164	fsrc1	%d14, %d0
3165	ba,a,pt	%ncc, copyout_blkleft
3166
3167copyout_blkd16:
3168	subcc	%i4, 8, %i4
3169	blu,pn	%ncc, copyout_blkdone
3170	faligndata %d16, %d18, %d48
3171	stda	%d48, [%i0]ASI_USER
3172	add	%i0, 8, %i0
3173copyout_blkd18:
3174	subcc	%i4, 8, %i4
3175	blu,pn	%ncc, copyout_blkdone
3176	faligndata %d18, %d20, %d48
3177	stda	%d48, [%i0]ASI_USER
3178	add	%i0, 8, %i0
3179copyout_blkd20:
3180	subcc	%i4, 8, %i4
3181	blu,pn	%ncc, copyout_blkdone
3182	faligndata %d20, %d22, %d48
3183	stda	%d48, [%i0]ASI_USER
3184	add	%i0, 8, %i0
3185copyout_blkd22:
3186	subcc	%i4, 8, %i4
3187	blu,pn	%ncc, copyout_blkdone
3188	faligndata %d22, %d24, %d48
3189	stda	%d48, [%i0]ASI_USER
3190	add	%i0, 8, %i0
3191copyout_blkd24:
3192	subcc	%i4, 8, %i4
3193	blu,pn	%ncc, copyout_blkdone
3194	faligndata %d24, %d26, %d48
3195	stda	%d48, [%i0]ASI_USER
3196	add	%i0, 8, %i0
3197copyout_blkd26:
3198	subcc	%i4, 8, %i4
3199	blu,pn	%ncc, copyout_blkdone
3200	faligndata %d26, %d28, %d48
3201	stda	%d48, [%i0]ASI_USER
3202	add	%i0, 8, %i0
3203copyout_blkd28:
3204	subcc	%i4, 8, %i4
3205	blu,pn	%ncc, copyout_blkdone
3206	faligndata %d28, %d30, %d48
3207	stda	%d48, [%i0]ASI_USER
3208	add	%i0, 8, %i0
3209copyout_blkd30:
3210	subcc	%i4, 8, %i4
3211	blu,pn	%ncc, copyout_blkdone
3212	fsrc1	%d30, %d0
3213	ba,a,pt	%ncc, copyout_blkleft
3214copyout_blkd32:
3215	subcc	%i4, 8, %i4
3216	blu,pn	%ncc, copyout_blkdone
3217	faligndata %d32, %d34, %d48
3218	stda	%d48, [%i0]ASI_USER
3219	add	%i0, 8, %i0
3220copyout_blkd34:
3221	subcc	%i4, 8, %i4
3222	blu,pn	%ncc, copyout_blkdone
3223	faligndata %d34, %d36, %d48
3224	stda	%d48, [%i0]ASI_USER
3225	add	%i0, 8, %i0
3226copyout_blkd36:
3227	subcc	%i4, 8, %i4
3228	blu,pn	%ncc, copyout_blkdone
3229	faligndata %d36, %d38, %d48
3230	stda	%d48, [%i0]ASI_USER
3231	add	%i0, 8, %i0
3232copyout_blkd38:
3233	subcc	%i4, 8, %i4
3234	blu,pn	%ncc, copyout_blkdone
3235	faligndata %d38, %d40, %d48
3236	stda	%d48, [%i0]ASI_USER
3237	add	%i0, 8, %i0
3238copyout_blkd40:
3239	subcc	%i4, 8, %i4
3240	blu,pn	%ncc, copyout_blkdone
3241	faligndata %d40, %d42, %d48
3242	stda	%d48, [%i0]ASI_USER
3243	add	%i0, 8, %i0
3244copyout_blkd42:
3245	subcc	%i4, 8, %i4
3246	blu,pn	%ncc, copyout_blkdone
3247	faligndata %d42, %d44, %d48
3248	stda	%d48, [%i0]ASI_USER
3249	add	%i0, 8, %i0
3250copyout_blkd44:
3251	subcc	%i4, 8, %i4
3252	blu,pn	%ncc, copyout_blkdone
3253	faligndata %d44, %d46, %d48
3254	stda	%d48, [%i0]ASI_USER
3255	add	%i0, 8, %i0
3256copyout_blkd46:
3257	subcc	%i4, 8, %i4
3258	blu,pn	%ncc, copyout_blkdone
3259	fsrc1	%d46, %d0
3260
3261copyout_blkleft:
32621:
3263	ldd	[%l7], %d2
3264	add	%l7, 8, %l7
3265	subcc	%i4, 8, %i4
3266	faligndata %d0, %d2, %d8
3267	stda	%d8, [%i0]ASI_USER
3268	blu,pn	%ncc, copyout_blkdone
3269	add	%i0, 8, %i0
3270	ldd	[%l7], %d0
3271	add	%l7, 8, %l7
3272	subcc	%i4, 8, %i4
3273	faligndata %d2, %d0, %d8
3274	stda	%d8, [%i0]ASI_USER
3275	bgeu,pt	%ncc, 1b
3276	add	%i0, 8, %i0
3277
3278copyout_blkdone:
3279	tst	%i2
3280	bz,pt	%ncc, .copyout_exit
3281	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0
3282
32837:	ldub	[%i1], %i4
3284	inc	%i1
3285	stba	%i4, [%i0]ASI_USER
3286	inc	%i0
3287	deccc	%i2
3288	bgu	%ncc, 7b
3289	  nop
3290
3291.copyout_exit:
3292	membar	#StoreLoad|#StoreStore
3293	btst	FPUSED_FLAG, SAVED_LOFAULT
3294	bz	1f
3295	  nop
3296
3297	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
3298	wr	%o2, 0, %gsr		! restore gsr
3299
3300	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3301	btst	FPRS_FEF, %o3
3302	bz	4f
3303	  nop
3304
3305	! restore fpregs from stack
3306	membar	#Sync
3307	add	%fp, STACK_BIAS - 257, %o2
3308	and	%o2, -64, %o2
3309	ldda	[%o2]ASI_BLK_P, %d0
3310	add	%o2, 64, %o2
3311	ldda	[%o2]ASI_BLK_P, %d16
3312	add	%o2, 64, %o2
3313	ldda	[%o2]ASI_BLK_P, %d32
3314	add	%o2, 64, %o2
3315	ldda	[%o2]ASI_BLK_P, %d48
3316	membar	#Sync
3317
3318	ba,pt	%ncc, 1f
3319	  wr	%o3, 0, %fprs		! restore fprs
3320
33214:
3322	FZERO				! zero all of the fpregs
3323	wr	%o3, 0, %fprs		! restore fprs
3324
33251:
3326	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3327	membar	#Sync			! sync error barrier
3328	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3329	ret
3330	restore	%g0, 0, %o0
3331
3332.copyout_err:
3333	ldn	[THREAD_REG + T_COPYOPS], %o4
3334	brz	%o4, 2f
3335	nop
3336	ldn	[%o4 + CP_COPYOUT], %g2
3337	jmp	%g2
3338	nop
33392:
3340	retl
3341	mov	-1, %o0
3342	SET_SIZE(copyout)
3343
3344
/*
 * xcopyout: alternate entry to the copyout code. The only work done
 * here is to place the address of .xcopyout_err in REAL_LOFAULT and
 * branch to .do_copyout, which is defined outside this section.
 *
 * .xcopyout_err: if curthread's T_COPYOPS pointer is non-NULL, control
 * is transferred to the routine stored at offset CP_XCOPYOUT in that
 * vector; otherwise the value in %g1 is returned in %o0.
 * NOTE(review): %g1 is presumably the error code left by the fault
 * handler (contrast .copyout_err, which returns -1) -- confirm against
 * the copyio_fault code.
 */
3345	ENTRY(xcopyout)
3346	sethi	%hi(.xcopyout_err), REAL_LOFAULT
3347	b	.do_copyout
3348	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT	! delay slot: finish the address
3349.xcopyout_err:
	!
	! Use the thread's copyops vector if it has one.
	!
3350	ldn	[THREAD_REG + T_COPYOPS], %o4
3351	brz	%o4, 2f			! no vector installed
3352	nop
3353	ldn	[%o4 + CP_XCOPYOUT], %g2
	!
	! jmp does not write %o7, so the routine we jump to returns
	! straight to our caller.
	!
3354	jmp	%g2
3355	nop
33562:
3357	retl
3358	mov	%g1, %o0		! delay slot: return value is %g1
3359	SET_SIZE(xcopyout)
3360
/*
 * xcopyout_little: byte-at-a-time copy that stores through ASI_AIUSL.
 *
 *	%o0 = source address (read with ordinary ldub)
 *	%o1 = destination address (written with stba ... ASI_AIUSL)
 *	%o2 = byte count
 *
 * The source is read from its last byte back to its first while the
 * destination is written from its first byte forward, so the copy
 * reverses the byte order of the buffer. .little_err (defined outside
 * this section) is installed in t_lofault for the duration; the
 * previous t_lofault value is kept in %o5 and put back before
 * returning. Returns 0 in %o0 on the non-faulting path.
 */
3361	ENTRY(xcopyout_little)
3362	sethi	%hi(.little_err), %o4
3363	ldn	[THREAD_REG + T_LOFAULT], %o5
3364	or	%o4, %lo(.little_err), %o4
3365	membar	#Sync			! sync error barrier
3366	stn	%o4, [THREAD_REG + T_LOFAULT]
3367
	!
	! %o3 = -count is the running index. %o0 and %o1 are biased so
	! that [%o0 + %o3] starts at src + count - 1 (the last source
	! byte) and [%o1 + %o3] starts at dst. The sub in the bz delay
	! slot is not annulled, so it runs whether or not the branch
	! is taken.
	!
3368	subcc	%g0, %o2, %o3
3369	add	%o0, %o2, %o0
3370	bz,pn	%ncc, 2f		! check for zero bytes
3371	sub	%o2, 1, %o4
3372	add	%o0, %o4, %o0		! start w/last byte
3373	add	%o1, %o2, %o1
3374	ldub	[%o0+%o3], %o4
3375
	!
	! Each pass adds one to %o3 and subtracts two from %o0, so the
	! source address [%o0 + %o3] moves back one byte while the
	! destination address [%o1 + %o3] moves forward one byte.
	! inccc produces a carry when %o3 goes from -1 to 0, which ends
	! the loop; the annulled delay-slot load is performed only when
	! the branch is taken.
	!
33761:	stba	%o4, [%o1+%o3]ASI_AIUSL
3377	inccc	%o3
3378	sub	%o0, 2, %o0		! get next byte
3379	bcc,a,pt %ncc, 1b
3380	  ldub	[%o0+%o3], %o4
3381
33822:	membar	#Sync			! sync error barrier
3383	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3384	retl
3385	mov	%g0, %o0		! return (0)
3386	SET_SIZE(xcopyout_little)
3387
3388/*
3389 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
3390 */
3391
3392	ENTRY(copyin)
3393	sethi	%hi(.copyin_err), REAL_LOFAULT
3394	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT
3395
3396.do_copyin:
3397	!
3398	! Check the length and bail if zero.
3399	!
3400	tst	%o2
3401	bnz,pt	%ncc, 1f
3402	  nop
3403	retl
3404	  clr	%o0
34051:
3406	sethi	%hi(copyio_fault), %o4
3407	or	%o4, %lo(copyio_fault), %o4
3408	sethi	%hi(copyio_fault_nowindow), %o3
3409	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
3410	or	%o3, %lo(copyio_fault_nowindow), %o3
3411	membar	#Sync
3412	stn	%o3, [THREAD_REG + T_LOFAULT]
3413
3414	mov	%o0, SAVE_SRC
3415	mov	%o1, SAVE_DST
3416	mov	%o2, SAVE_COUNT
3417
3418	!
3419	! Check to see if we're more than SMALL_LIMIT.
3420	!
3421	subcc	%o2, SMALL_LIMIT, %o3
3422	bgu,a,pt %ncc, .dci_ns
3423	or	%o0, %o1, %o3
3424	!
3425	! What was previously ".small_copyin"
3426	!
3427.dcibcp:
3428	sub	%g0, %o2, %o3		! setup for copy loop
3429	add	%o0, %o2, %o0
3430	add	%o1, %o2, %o1
3431	ba,pt	%ncc, .dcicl
3432	lduba	[%o0 + %o3]ASI_USER, %o4
3433	!
3434	! %o0 and %o1 point at the end and remain pointing at the end
3435	! of their buffers. We pull things out by adding %o3 (which is
3436	! the negation of the length) to the buffer end which gives us
3437	! the current location in the buffers. By incrementing %o3 we walk
3438	! through both buffers without having to bump each buffer's
3439	! pointer. A very fast 4 instruction loop.
3440	!
3441	.align 16
3442.dcicl:
3443	stb	%o4, [%o1 + %o3]
3444	inccc	%o3
3445	bl,a,pt %ncc, .dcicl
3446	lduba	[%o0 + %o3]ASI_USER, %o4
3447	!
3448	! We're done. Go home.
3449	!
3450	membar	#Sync
3451	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
3452	retl
3453	clr	%o0
3454	!
3455	! Try aligned copies from here.
3456	!
3457.dci_ns:
3458	!
3459	! See if we're single byte aligned. If we are, check the
3460	! limit for single byte copies. If we're smaller, or equal,
3461	! bounce to the byte for byte copy loop. Otherwise do it in
3462	! HW (if enabled).
3463	!
3464	btst	1, %o3
3465	bz,a,pt	%icc, .dcih8
3466	btst	7, %o3
3467	!
3468	! We're single byte aligned.
3469	!
3470	subcc	%o2, VIS_COPY_THRESHOLD, %o3
3471	bleu,pt	%ncc, .dcibcp
3472	sethi	%hi(hw_copy_limit_1), %o3
3473	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3474	!
3475	! Is HW copy on? If not do everything byte for byte.
3476	!
3477	tst	%o3
3478	bz,pn	%icc, .dcibcp
3479	subcc	%o3, %o2, %o3
3480	!
3481	! Are we bigger than the HW limit? If not
3482	! go to byte for byte.
3483	!
3484	bge,pt	%ncc, .dcibcp
3485	nop
3486	!
3487	! We're big enough and copy is on. Do it with HW.
3488	!
3489	ba,pt	%ncc, .big_copyin
3490	nop
3491.dcih8:
3492	!
3493	! 8 byte aligned?
3494	!
3495	bnz,a	%ncc, .dcih4
3496	btst	3, %o3
3497	!
3498	! We're eight byte aligned.
3499	!
3500	subcc	%o2, VIS_COPY_THRESHOLD, %o3
3501	bleu,pt	%ncc, .dcis8
3502	sethi	%hi(hw_copy_limit_8), %o3
3503	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3504	!
3505	! Is HW assist on? If not, do it with the aligned copy.
3506	!
3507	tst	%o3
3508	bz,pn	%icc, .dcis8
3509	subcc	%o3, %o2, %o3
3510	bge	%ncc, .dcis8
3511	nop
3512	ba,pt	%ncc, .big_copyin
3513	nop
3514.dcis8:
3515	!
3516	! Housekeeping for copy loops. Uses same idea as in the byte for
3517	! byte copy loop above.
3518	!
3519	add	%o0, %o2, %o0
3520	add	%o1, %o2, %o1
3521	sub	%g0, %o2, %o3
3522	ba,pt	%ncc, .didebc
3523	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
3524	!
3525	! 4 byte aligned?
3526	!
3527.dcih4:
3528	bnz	%ncc, .dcih2
3529	subcc	%o2, VIS_COPY_THRESHOLD, %o3
3530	bleu,pt	%ncc, .dcis4
3531	sethi	%hi(hw_copy_limit_4), %o3
3532	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3533	!
3534	! Is HW assist on? If not, do it with the aligned copy.
3535	!
3536	tst	%o3
3537	bz,pn	%icc, .dcis4
3538	subcc	%o3, %o2, %o3
3539	!
3540	! We're negative if our size is less than or equal to hw_copy_limit_4.
3541	! We're negative if our size is larger than hw_copy_limit_4.
3542	bge	%ncc, .dcis4
3543	nop
3544	ba,pt	%ncc, .big_copyin
3545	nop
3546.dcis4:
3547	!
3548	! Housekeeping for copy loops. Uses same idea as in the byte
3549	! for byte copy loop above.
3550	!
3551	add	%o0, %o2, %o0
3552	add	%o1, %o2, %o1
3553	sub	%g0, %o2, %o3
3554	ba,pt	%ncc, .didfbc
3555	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
3556.dcih2:
3557	!
3558	! We're two byte aligned. Check for "smallness"
3559	! done in delay at .dcih4
3560	!
3561	bleu,pt	%ncc, .dcis2
3562	sethi	%hi(hw_copy_limit_2), %o3
3563	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3564	!
3565	! Is HW assist on? If not, do it with the aligned copy.
3566	!
3567	tst	%o3
3568	bz,pn	%icc, .dcis2
3569	subcc	%o3, %o2, %o3
3570	!
3571	! Are we larger than the HW limit?
3572	!
3573	bge	%ncc, .dcis2
3574	nop
3575	!
3576	! HW assist is on and we're large enough to use it.
3577	!
3578	ba,pt	%ncc, .big_copyin
3579	nop
3580	!
3581	! Housekeeping for copy loops. Uses same idea as in the byte
3582	! for byte copy loop above.
3583	!
3584.dcis2:
3585	add	%o0, %o2, %o0
3586	add	%o1, %o2, %o1
3587	sub	%g0, %o2, %o3
3588	ba,pt	%ncc, .didtbc
3589	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
3590	!
3591.small_copyin:
3592	!
3593	! Why are we doing this AGAIN? There are certain conditions in
3594	! big copyin that will cause us to forgo the HW assisted copies
3595	! and bounce back to a non-hw assisted copy. This dispatches
3596	! those copies. Note that we branch around this in the main line
3597	! code.
3598	!
3599	! We make no check for limits or HW enablement here. We've
3600	! already been told that we're a poster child so just go off
3601	! and do it.
3602	!
3603	or	%o0, %o1, %o3
3604	btst	1, %o3
3605	bnz	%icc, .dcibcp		! Most likely
3606	btst	7, %o3
3607	bz	%icc, .dcis8
3608	btst	3, %o3
3609	bz	%icc, .dcis4
3610	nop
3611	ba,pt	%ncc, .dcis2
3612	nop
3613	!
3614	! Eight byte aligned copies. A steal from the original .small_copyin
3615	! with modifications. %o2 is number of 8 byte chunks to copy. When
3616	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
3617	! to copy.
3618	!
3619	.align 32
3620.didebc:
3621	ldxa	[%o0 + %o3]ASI_USER, %o4
3622	deccc	%o2
3623	stx	%o4, [%o1 + %o3]
3624	bg,pt	%ncc, .didebc
3625	addcc	%o3, 8, %o3
3626	!
3627	! End of copy loop. Most 8 byte aligned copies end here.
3628	!
3629	bz,pt	%ncc, .dcifh
3630	nop
3631	!
3632	! Something is left. Do it byte for byte.
3633	!
3634	ba,pt	%ncc, .dcicl
3635	lduba	[%o0 + %o3]ASI_USER, %o4
3636	!
3637	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
3638	!
3639	.align 32
3640.didfbc:
3641	lduwa	[%o0 + %o3]ASI_USER, %o4
3642	deccc	%o2
3643	st	%o4, [%o1 + %o3]
3644	bg,pt	%ncc, .didfbc
3645	addcc	%o3, 4, %o3
3646	!
3647	! End of copy loop. Most 4 byte aligned copies end here.
3648	!
3649	bz,pt	%ncc, .dcifh
3650	nop
3651	!
3652	! Something is left. Do it byte for byte.
3653	!
3654	ba,pt	%ncc, .dcicl
3655	lduba	[%o0 + %o3]ASI_USER, %o4
3656	!
3657	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
3658	! copy.
3659	!
3660	.align 32
3661.didtbc:
3662	lduha	[%o0 + %o3]ASI_USER, %o4
3663	deccc	%o2
3664	sth	%o4, [%o1 + %o3]
3665	bg,pt	%ncc, .didtbc
3666	addcc	%o3, 2, %o3
3667	!
3668	! End of copy loop. Most 2 byte aligned copies end here.
3669	!
3670	bz,pt	%ncc, .dcifh
3671	nop
3672	!
3673	! Deal with the last byte
3674	!
3675	lduba	[%o0 + %o3]ASI_USER, %o4
3676	stb	%o4, [%o1 + %o3]
3677.dcifh:
3678	membar	#Sync
3679	stn     SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3680	retl
3681	clr	%o0
3682
3683.big_copyin:
3684	!
3685	! Are we using the FP registers?
3686	!
3687	rd	%fprs, %o3		! check for unused fp
3688	btst	FPRS_FEF, %o3
3689	bnz	%ncc, .copyin_fpregs_inuse
3690	nop
3691	!
3692	! We're going off to do a block copy.
3693	! Switch fault handlers and grab a window. We
3694	! don't do a membar #Sync since we've done only
3695	! kernel data to this point.
3696	!
3697	stn	%o4, [THREAD_REG + T_LOFAULT]
3698	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3699	!
3700	! %o3 is %i3 after the save...
3701	!
3702	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
3703	ba,pt	%ncc, .do_blockcopyin
3704	wr	%g0, FPRS_FEF, %fprs
3705.copyin_fpregs_inuse:
3706	!
3707	! We're here if the FP regs are in use. Need to see if the request
3708	! exceeds our suddenly larger minimum.
3709	!
3710	cmp	%i2, VIS_COPY_THRESHOLD+(64*4)
3711	bl	%ncc, .small_copyin
3712	nop
3713	!
3714	! We're going off and do a block copy.
3715	! Change to the heavy duty fault handler and grab a window first.
3716	! New handler is passed in
3717	!
3718	stn	%o4, [THREAD_REG + T_LOFAULT]
3719	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3720	!
3721	! %o3 is now %i3
3722	!
3723	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
3724
3725	! save in-use fpregs on stack
3726	wr	%g0, FPRS_FEF, %fprs
3727	membar	#Sync
3728	add	%fp, STACK_BIAS - 257, %o2
3729	and	%o2, -64, %o2
3730	stda	%d0, [%o2]ASI_BLK_P
3731	add	%o2, 64, %o2
3732	stda	%d16, [%o2]ASI_BLK_P
3733	add	%o2, 64, %o2
3734	stda	%d32, [%o2]ASI_BLK_P
3735	add	%o2, 64, %o2
3736	stda	%d48, [%o2]ASI_BLK_P
3737	membar	#Sync
3738
3739.do_blockcopyin:
3740	membar	#StoreStore|#StoreLoad|#LoadStore
3741
3742	rd	%gsr, %o2
3743	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
3744
3745	! Set the lower bit in the saved t_lofault to indicate
3746	! that we need to clear the %fprs register on the way
3747	! out
3748	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
3749
3750	! Swap src/dst since the code below is memcpy code
3751	! and memcpy/bcopy have different calling sequences
3752	mov	%i1, %i5
3753	mov	%i0, %i1
3754	mov	%i5, %i0
3755
3756!!! This code is nearly identical to the version in the sun4u
3757!!! libc_psr.  Most bugfixes made to that file should be
3758!!! merged into this routine.
3759
3760	andcc	%i0, 7, %o3
3761	bz	copyin_blkcpy
3762	sub	%o3, 8, %o3
3763	neg	%o3
3764	sub	%i2, %o3, %i2
3765
3766	! Align Destination on double-word boundary
3767
37682:	lduba	[%i1]ASI_USER, %o4
3769	inc	%i1
3770	inc	%i0
3771	deccc	%o3
3772	bgu	%ncc, 2b
3773	stb	%o4, [%i0-1]
3774copyin_blkcpy:
3775	andcc	%i0, 63, %i3
3776	bz,pn	%ncc, copyin_blalign	! now block aligned
3777	sub	%i3, 64, %i3
3778	neg	%i3			! bytes till block aligned
3779	sub	%i2, %i3, %i2		! update %i2 with new count
3780
3781	! Copy %i3 bytes till dst is block (64 byte) aligned. use
3782	! double word copies.
3783
3784	alignaddr %i1, %g0, %g1
3785	ldda	[%g1]ASI_USER, %d0
3786	add	%g1, 8, %g1
37876:
3788	ldda	[%g1]ASI_USER, %d2
3789	add	%g1, 8, %g1
3790	subcc	%i3, 8, %i3
3791	faligndata %d0, %d2, %d8
3792	std	%d8, [%i0]
3793	add	%i1, 8, %i1
3794	bz,pn	%ncc, copyin_blalign
3795	add	%i0, 8, %i0
3796	ldda	[%g1]ASI_USER, %d0
3797	add	%g1, 8, %g1
3798	subcc	%i3, 8, %i3
3799	faligndata %d2, %d0, %d8
3800	std	%d8, [%i0]
3801	add	%i1, 8, %i1
3802	bgu,pn	%ncc, 6b
3803	add	%i0, 8, %i0
3804
3805copyin_blalign:
3806	membar	#StoreLoad
3807	! %i2 = total length
3808	! %i3 = blocks	(length - 64) / 64
3809	! %i4 = doubles remaining  (length - blocks)
3810	sub	%i2, 64, %i3
3811	andn	%i3, 63, %i3
3812	sub	%i2, %i3, %i4
3813	andn	%i4, 7, %i4
3814	sub	%i4, 16, %i4
3815	sub	%i2, %i4, %i2
3816	sub	%i2, %i3, %i2
3817
3818	andn	%i1, 0x3f, %l7		! blk aligned address
3819	alignaddr %i1, %g0, %g0		! gen %gsr
3820
3821	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in  %l5
3822	andcc	%l5, 7, %i5		! mask everything except bits 1,2 3
3823	add	%i1, %i4, %i1
3824	add	%i1, %i3, %i1
3825
3826	ldda	[%l7]ASI_BLK_AIUS, %d0
3827	add	%l7, 64, %l7
3828	ldda	[%l7]ASI_BLK_AIUS, %d16
3829	add	%l7, 64, %l7
3830	ldda	[%l7]ASI_BLK_AIUS, %d32
3831	add	%l7, 64, %l7
3832	sub	%i3, 128, %i3
3833
3834	! switch statement to get us to the right 8 byte blk within a
3835	! 64 byte block
3836
3837	cmp	 %i5, 4
3838	bgeu,a	 copyin_hlf
3839	cmp	 %i5, 6
3840	cmp	 %i5, 2
3841	bgeu,a	 copyin_sqtr
3842	nop
3843	cmp	 %i5, 1
3844	be,a	 copyin_seg1
3845	nop
3846	ba,pt	 %ncc, copyin_seg0
3847	nop
3848copyin_sqtr:
3849	be,a	 copyin_seg2
3850	nop
3851	ba,pt	 %ncc, copyin_seg3
3852	nop
3853
3854copyin_hlf:
3855	bgeu,a	 copyin_fqtr
3856	nop
3857	cmp	 %i5, 5
3858	be,a	 copyin_seg5
3859	nop
3860	ba,pt	 %ncc, copyin_seg4
3861	nop
3862copyin_fqtr:
3863	be,a	 copyin_seg6
3864	nop
3865	ba,pt	 %ncc, copyin_seg7
3866	nop
3867
3868copyin_seg0:
3869	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3870	FALIGN_D0
3871	ldda	[%l7]ASI_BLK_AIUS, %d0
3872	stda	%d48, [%i0]ASI_BLK_P
3873	add	%l7, 64, %l7
3874	subcc	%i3, 64, %i3
3875	bz,pn	%ncc, 0f
3876	add	%i0, 64, %i0
3877	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3878	FALIGN_D16
3879	ldda	[%l7]ASI_BLK_AIUS, %d16
3880	stda	%d48, [%i0]ASI_BLK_P
3881	add	%l7, 64, %l7
3882	subcc	%i3, 64, %i3
3883	bz,pn	%ncc, 1f
3884	add	%i0, 64, %i0
3885	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3886	FALIGN_D32
3887	ldda	[%l7]ASI_BLK_AIUS, %d32
3888	stda	%d48, [%i0]ASI_BLK_P
3889	add	%l7, 64, %l7
3890	subcc	%i3, 64, %i3
3891	bz,pn	%ncc, 2f
3892	add	%i0, 64, %i0
3893	ba,a,pt	%ncc, copyin_seg0
3894
38950:
3896	FALIGN_D16
3897	stda	%d48, [%i0]ASI_BLK_P
3898	add	%i0, 64, %i0
3899	membar	#Sync
3900	FALIGN_D32
3901	stda	%d48, [%i0]ASI_BLK_P
3902	ba,pt	%ncc, copyin_blkd0
3903	add	%i0, 64, %i0
3904
39051:
3906	FALIGN_D32
3907	stda	%d48, [%i0]ASI_BLK_P
3908	add	%i0, 64, %i0
3909	membar	#Sync
3910	FALIGN_D0
3911	stda	%d48, [%i0]ASI_BLK_P
3912	ba,pt	%ncc, copyin_blkd16
3913	add	%i0, 64, %i0
3914
39152:
3916	FALIGN_D0
3917	stda	%d48, [%i0]ASI_BLK_P
3918	add	%i0, 64, %i0
3919	membar	#Sync
3920	FALIGN_D16
3921	stda	%d48, [%i0]ASI_BLK_P
3922	ba,pt	%ncc, copyin_blkd32
3923	add	%i0, 64, %i0
3924
3925copyin_seg1:
3926	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3927	FALIGN_D2
3928	ldda	[%l7]ASI_BLK_AIUS, %d0
3929	stda	%d48, [%i0]ASI_BLK_P
3930	add	%l7, 64, %l7
3931	subcc	%i3, 64, %i3
3932	bz,pn	%ncc, 0f
3933	add	%i0, 64, %i0
3934	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3935	FALIGN_D18
3936	ldda	[%l7]ASI_BLK_AIUS, %d16
3937	stda	%d48, [%i0]ASI_BLK_P
3938	add	%l7, 64, %l7
3939	subcc	%i3, 64, %i3
3940	bz,pn	%ncc, 1f
3941	add	%i0, 64, %i0
3942	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3943	FALIGN_D34
3944	ldda	[%l7]ASI_BLK_AIUS, %d32
3945	stda	%d48, [%i0]ASI_BLK_P
3946	add	%l7, 64, %l7
3947	subcc	%i3, 64, %i3
3948	bz,pn	%ncc, 2f
3949	add	%i0, 64, %i0
3950	ba,a,pt	%ncc, copyin_seg1
39510:
3952	FALIGN_D18
3953	stda	%d48, [%i0]ASI_BLK_P
3954	add	%i0, 64, %i0
3955	membar	#Sync
3956	FALIGN_D34
3957	stda	%d48, [%i0]ASI_BLK_P
3958	ba,pt	%ncc, copyin_blkd2
3959	add	%i0, 64, %i0
3960
39611:
3962	FALIGN_D34
3963	stda	%d48, [%i0]ASI_BLK_P
3964	add	%i0, 64, %i0
3965	membar	#Sync
3966	FALIGN_D2
3967	stda	%d48, [%i0]ASI_BLK_P
3968	ba,pt	%ncc, copyin_blkd18
3969	add	%i0, 64, %i0
3970
39712:
3972	FALIGN_D2
3973	stda	%d48, [%i0]ASI_BLK_P
3974	add	%i0, 64, %i0
3975	membar	#Sync
3976	FALIGN_D18
3977	stda	%d48, [%i0]ASI_BLK_P
3978	ba,pt	%ncc, copyin_blkd34
3979	add	%i0, 64, %i0
3980copyin_seg2:
3981	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
3982	FALIGN_D4
3983	ldda	[%l7]ASI_BLK_AIUS, %d0
3984	stda	%d48, [%i0]ASI_BLK_P
3985	add	%l7, 64, %l7
3986	subcc	%i3, 64, %i3
3987	bz,pn	%ncc, 0f
3988	add	%i0, 64, %i0
3989	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
3990	FALIGN_D20
3991	ldda	[%l7]ASI_BLK_AIUS, %d16
3992	stda	%d48, [%i0]ASI_BLK_P
3993	add	%l7, 64, %l7
3994	subcc	%i3, 64, %i3
3995	bz,pn	%ncc, 1f
3996	add	%i0, 64, %i0
3997	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
3998	FALIGN_D36
3999	ldda	[%l7]ASI_BLK_AIUS, %d32
4000	stda	%d48, [%i0]ASI_BLK_P
4001	add	%l7, 64, %l7
4002	subcc	%i3, 64, %i3
4003	bz,pn	%ncc, 2f
4004	add	%i0, 64, %i0
4005	ba,a,pt	%ncc, copyin_seg2
4006
40070:
4008	FALIGN_D20
4009	stda	%d48, [%i0]ASI_BLK_P
4010	add	%i0, 64, %i0
4011	membar	#Sync
4012	FALIGN_D36
4013	stda	%d48, [%i0]ASI_BLK_P
4014	ba,pt	%ncc, copyin_blkd4
4015	add	%i0, 64, %i0
4016
40171:
4018	FALIGN_D36
4019	stda	%d48, [%i0]ASI_BLK_P
4020	add	%i0, 64, %i0
4021	membar	#Sync
4022	FALIGN_D4
4023	stda	%d48, [%i0]ASI_BLK_P
4024	ba,pt	%ncc, copyin_blkd20
4025	add	%i0, 64, %i0
4026
40272:
4028	FALIGN_D4
4029	stda	%d48, [%i0]ASI_BLK_P
4030	add	%i0, 64, %i0
4031	membar	#Sync
4032	FALIGN_D20
4033	stda	%d48, [%i0]ASI_BLK_P
4034	ba,pt	%ncc, copyin_blkd36
4035	add	%i0, 64, %i0
4036
4037copyin_seg3:
4038	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4039	FALIGN_D6
4040	ldda	[%l7]ASI_BLK_AIUS, %d0
4041	stda	%d48, [%i0]ASI_BLK_P
4042	add	%l7, 64, %l7
4043	subcc	%i3, 64, %i3
4044	bz,pn	%ncc, 0f
4045	add	%i0, 64, %i0
4046	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4047	FALIGN_D22
4048	ldda	[%l7]ASI_BLK_AIUS, %d16
4049	stda	%d48, [%i0]ASI_BLK_P
4050	add	%l7, 64, %l7
4051	subcc	%i3, 64, %i3
4052	bz,pn	%ncc, 1f
4053	add	%i0, 64, %i0
4054	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4055	FALIGN_D38
4056	ldda	[%l7]ASI_BLK_AIUS, %d32
4057	stda	%d48, [%i0]ASI_BLK_P
4058	add	%l7, 64, %l7
4059	subcc	%i3, 64, %i3
4060	bz,pn	%ncc, 2f
4061	add	%i0, 64, %i0
4062	ba,a,pt	%ncc, copyin_seg3
4063
40640:
4065	FALIGN_D22
4066	stda	%d48, [%i0]ASI_BLK_P
4067	add	%i0, 64, %i0
4068	membar	#Sync
4069	FALIGN_D38
4070	stda	%d48, [%i0]ASI_BLK_P
4071	ba,pt	%ncc, copyin_blkd6
4072	add	%i0, 64, %i0
4073
40741:
4075	FALIGN_D38
4076	stda	%d48, [%i0]ASI_BLK_P
4077	add	%i0, 64, %i0
4078	membar	#Sync
4079	FALIGN_D6
4080	stda	%d48, [%i0]ASI_BLK_P
4081	ba,pt	%ncc, copyin_blkd22
4082	add	%i0, 64, %i0
4083
40842:
4085	FALIGN_D6
4086	stda	%d48, [%i0]ASI_BLK_P
4087	add	%i0, 64, %i0
4088	membar	#Sync
4089	FALIGN_D22
4090	stda	%d48, [%i0]ASI_BLK_P
4091	ba,pt	%ncc, copyin_blkd38
4092	add	%i0, 64, %i0
4093
4094copyin_seg4:
4095	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4096	FALIGN_D8
4097	ldda	[%l7]ASI_BLK_AIUS, %d0
4098	stda	%d48, [%i0]ASI_BLK_P
4099	add	%l7, 64, %l7
4100	subcc	%i3, 64, %i3
4101	bz,pn	%ncc, 0f
4102	add	%i0, 64, %i0
4103	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4104	FALIGN_D24
4105	ldda	[%l7]ASI_BLK_AIUS, %d16
4106	stda	%d48, [%i0]ASI_BLK_P
4107	add	%l7, 64, %l7
4108	subcc	%i3, 64, %i3
4109	bz,pn	%ncc, 1f
4110	add	%i0, 64, %i0
4111	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4112	FALIGN_D40
4113	ldda	[%l7]ASI_BLK_AIUS, %d32
4114	stda	%d48, [%i0]ASI_BLK_P
4115	add	%l7, 64, %l7
4116	subcc	%i3, 64, %i3
4117	bz,pn	%ncc, 2f
4118	add	%i0, 64, %i0
4119	ba,a,pt	%ncc, copyin_seg4
4120
41210:
4122	FALIGN_D24
4123	stda	%d48, [%i0]ASI_BLK_P
4124	add	%i0, 64, %i0
4125	membar	#Sync
4126	FALIGN_D40
4127	stda	%d48, [%i0]ASI_BLK_P
4128	ba,pt	%ncc, copyin_blkd8
4129	add	%i0, 64, %i0
4130
41311:
4132	FALIGN_D40
4133	stda	%d48, [%i0]ASI_BLK_P
4134	add	%i0, 64, %i0
4135	membar	#Sync
4136	FALIGN_D8
4137	stda	%d48, [%i0]ASI_BLK_P
4138	ba,pt	%ncc, copyin_blkd24
4139	add	%i0, 64, %i0
4140
41412:
4142	FALIGN_D8
4143	stda	%d48, [%i0]ASI_BLK_P
4144	add	%i0, 64, %i0
4145	membar	#Sync
4146	FALIGN_D24
4147	stda	%d48, [%i0]ASI_BLK_P
4148	ba,pt	%ncc, copyin_blkd40
4149	add	%i0, 64, %i0
4150
4151copyin_seg5:
4152	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4153	FALIGN_D10
4154	ldda	[%l7]ASI_BLK_AIUS, %d0
4155	stda	%d48, [%i0]ASI_BLK_P
4156	add	%l7, 64, %l7
4157	subcc	%i3, 64, %i3
4158	bz,pn	%ncc, 0f
4159	add	%i0, 64, %i0
4160	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4161	FALIGN_D26
4162	ldda	[%l7]ASI_BLK_AIUS, %d16
4163	stda	%d48, [%i0]ASI_BLK_P
4164	add	%l7, 64, %l7
4165	subcc	%i3, 64, %i3
4166	bz,pn	%ncc, 1f
4167	add	%i0, 64, %i0
4168	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4169	FALIGN_D42
4170	ldda	[%l7]ASI_BLK_AIUS, %d32
4171	stda	%d48, [%i0]ASI_BLK_P
4172	add	%l7, 64, %l7
4173	subcc	%i3, 64, %i3
4174	bz,pn	%ncc, 2f
4175	add	%i0, 64, %i0
4176	ba,a,pt	%ncc, copyin_seg5
4177
41780:
4179	FALIGN_D26
4180	stda	%d48, [%i0]ASI_BLK_P
4181	add	%i0, 64, %i0
4182	membar	#Sync
4183	FALIGN_D42
4184	stda	%d48, [%i0]ASI_BLK_P
4185	ba,pt	%ncc, copyin_blkd10
4186	add	%i0, 64, %i0
4187
41881:
4189	FALIGN_D42
4190	stda	%d48, [%i0]ASI_BLK_P
4191	add	%i0, 64, %i0
4192	membar	#Sync
4193	FALIGN_D10
4194	stda	%d48, [%i0]ASI_BLK_P
4195	ba,pt	%ncc, copyin_blkd26
4196	add	%i0, 64, %i0
4197
41982:
4199	FALIGN_D10
4200	stda	%d48, [%i0]ASI_BLK_P
4201	add	%i0, 64, %i0
4202	membar	#Sync
4203	FALIGN_D26
4204	stda	%d48, [%i0]ASI_BLK_P
4205	ba,pt	%ncc, copyin_blkd42
4206	add	%i0, 64, %i0
4207
4208copyin_seg6:
4209	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4210	FALIGN_D12
4211	ldda	[%l7]ASI_BLK_AIUS, %d0
4212	stda	%d48, [%i0]ASI_BLK_P
4213	add	%l7, 64, %l7
4214	subcc	%i3, 64, %i3
4215	bz,pn	%ncc, 0f
4216	add	%i0, 64, %i0
4217	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4218	FALIGN_D28
4219	ldda	[%l7]ASI_BLK_AIUS, %d16
4220	stda	%d48, [%i0]ASI_BLK_P
4221	add	%l7, 64, %l7
4222	subcc	%i3, 64, %i3
4223	bz,pn	%ncc, 1f
4224	add	%i0, 64, %i0
4225	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4226	FALIGN_D44
4227	ldda	[%l7]ASI_BLK_AIUS, %d32
4228	stda	%d48, [%i0]ASI_BLK_P
4229	add	%l7, 64, %l7
4230	subcc	%i3, 64, %i3
4231	bz,pn	%ncc, 2f
4232	add	%i0, 64, %i0
4233	ba,a,pt	%ncc, copyin_seg6
4234
42350:
4236	FALIGN_D28
4237	stda	%d48, [%i0]ASI_BLK_P
4238	add	%i0, 64, %i0
4239	membar	#Sync
4240	FALIGN_D44
4241	stda	%d48, [%i0]ASI_BLK_P
4242	ba,pt	%ncc, copyin_blkd12
4243	add	%i0, 64, %i0
4244
42451:
4246	FALIGN_D44
4247	stda	%d48, [%i0]ASI_BLK_P
4248	add	%i0, 64, %i0
4249	membar	#Sync
4250	FALIGN_D12
4251	stda	%d48, [%i0]ASI_BLK_P
4252	ba,pt	%ncc, copyin_blkd28
4253	add	%i0, 64, %i0
4254
42552:
4256	FALIGN_D12
4257	stda	%d48, [%i0]ASI_BLK_P
4258	add	%i0, 64, %i0
4259	membar	#Sync
4260	FALIGN_D28
4261	stda	%d48, [%i0]ASI_BLK_P
4262	ba,pt	%ncc, copyin_blkd44
4263	add	%i0, 64, %i0
4264
4265copyin_seg7:
4266	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
4267	FALIGN_D14
4268	ldda	[%l7]ASI_BLK_AIUS, %d0
4269	stda	%d48, [%i0]ASI_BLK_P
4270	add	%l7, 64, %l7
4271	subcc	%i3, 64, %i3
4272	bz,pn	%ncc, 0f
4273	add	%i0, 64, %i0
4274	! 2nd chunk -  %d0 pre, %d16 low, %d32 high, %d48 dst
4275	FALIGN_D30
4276	ldda	[%l7]ASI_BLK_AIUS, %d16
4277	stda	%d48, [%i0]ASI_BLK_P
4278	add	%l7, 64, %l7
4279	subcc	%i3, 64, %i3
4280	bz,pn	%ncc, 1f
4281	add	%i0, 64, %i0
4282	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
4283	FALIGN_D46
4284	ldda	[%l7]ASI_BLK_AIUS, %d32
4285	stda	%d48, [%i0]ASI_BLK_P
4286	add	%l7, 64, %l7
4287	subcc	%i3, 64, %i3
4288	bz,pn	%ncc, 2f
4289	add	%i0, 64, %i0
4290	ba,a,pt	%ncc, copyin_seg7
4291
42920:
4293	FALIGN_D30
4294	stda	%d48, [%i0]ASI_BLK_P
4295	add	%i0, 64, %i0
4296	membar	#Sync
4297	FALIGN_D46
4298	stda	%d48, [%i0]ASI_BLK_P
4299	ba,pt	%ncc, copyin_blkd14
4300	add	%i0, 64, %i0
4301
43021:
4303	FALIGN_D46
4304	stda	%d48, [%i0]ASI_BLK_P
4305	add	%i0, 64, %i0
4306	membar	#Sync
4307	FALIGN_D14
4308	stda	%d48, [%i0]ASI_BLK_P
4309	ba,pt	%ncc, copyin_blkd30
4310	add	%i0, 64, %i0
4311
43122:
4313	FALIGN_D14
4314	stda	%d48, [%i0]ASI_BLK_P
4315	add	%i0, 64, %i0
4316	membar	#Sync
4317	FALIGN_D30
4318	stda	%d48, [%i0]ASI_BLK_P
4319	ba,pt	%ncc, copyin_blkd46
4320	add	%i0, 64, %i0
4321
4322
4323	!
4324	! dribble out the last partial block
4325	!
4326copyin_blkd0:
4327	subcc	%i4, 8, %i4
4328	blu,pn	%ncc, copyin_blkdone
4329	faligndata %d0, %d2, %d48
4330	std	%d48, [%i0]
4331	add	%i0, 8, %i0
4332copyin_blkd2:
4333	subcc	%i4, 8, %i4
4334	blu,pn	%ncc, copyin_blkdone
4335	faligndata %d2, %d4, %d48
4336	std	%d48, [%i0]
4337	add	%i0, 8, %i0
4338copyin_blkd4:
4339	subcc	%i4, 8, %i4
4340	blu,pn	%ncc, copyin_blkdone
4341	faligndata %d4, %d6, %d48
4342	std	%d48, [%i0]
4343	add	%i0, 8, %i0
4344copyin_blkd6:
4345	subcc	%i4, 8, %i4
4346	blu,pn	%ncc, copyin_blkdone
4347	faligndata %d6, %d8, %d48
4348	std	%d48, [%i0]
4349	add	%i0, 8, %i0
4350copyin_blkd8:
4351	subcc	%i4, 8, %i4
4352	blu,pn	%ncc, copyin_blkdone
4353	faligndata %d8, %d10, %d48
4354	std	%d48, [%i0]
4355	add	%i0, 8, %i0
4356copyin_blkd10:
4357	subcc	%i4, 8, %i4
4358	blu,pn	%ncc, copyin_blkdone
4359	faligndata %d10, %d12, %d48
4360	std	%d48, [%i0]
4361	add	%i0, 8, %i0
4362copyin_blkd12:
4363	subcc	%i4, 8, %i4
4364	blu,pn	%ncc, copyin_blkdone
4365	faligndata %d12, %d14, %d48
4366	std	%d48, [%i0]
4367	add	%i0, 8, %i0
4368copyin_blkd14:
4369	subcc	%i4, 8, %i4
4370	blu,pn	%ncc, copyin_blkdone
4371	fsrc1	%d14, %d0
4372	ba,a,pt	%ncc, copyin_blkleft
4373
4374copyin_blkd16:
4375	subcc	%i4, 8, %i4
4376	blu,pn	%ncc, copyin_blkdone
4377	faligndata %d16, %d18, %d48
4378	std	%d48, [%i0]
4379	add	%i0, 8, %i0
4380copyin_blkd18:
4381	subcc	%i4, 8, %i4
4382	blu,pn	%ncc, copyin_blkdone
4383	faligndata %d18, %d20, %d48
4384	std	%d48, [%i0]
4385	add	%i0, 8, %i0
4386copyin_blkd20:
4387	subcc	%i4, 8, %i4
4388	blu,pn	%ncc, copyin_blkdone
4389	faligndata %d20, %d22, %d48
4390	std	%d48, [%i0]
4391	add	%i0, 8, %i0
4392copyin_blkd22:
4393	subcc	%i4, 8, %i4
4394	blu,pn	%ncc, copyin_blkdone
4395	faligndata %d22, %d24, %d48
4396	std	%d48, [%i0]
4397	add	%i0, 8, %i0
4398copyin_blkd24:
4399	subcc	%i4, 8, %i4
4400	blu,pn	%ncc, copyin_blkdone
4401	faligndata %d24, %d26, %d48
4402	std	%d48, [%i0]
4403	add	%i0, 8, %i0
4404copyin_blkd26:
4405	subcc	%i4, 8, %i4
4406	blu,pn	%ncc, copyin_blkdone
4407	faligndata %d26, %d28, %d48
4408	std	%d48, [%i0]
4409	add	%i0, 8, %i0
4410copyin_blkd28:
4411	subcc	%i4, 8, %i4
4412	blu,pn	%ncc, copyin_blkdone
4413	faligndata %d28, %d30, %d48
4414	std	%d48, [%i0]
4415	add	%i0, 8, %i0
4416copyin_blkd30:
4417	subcc	%i4, 8, %i4
4418	blu,pn	%ncc, copyin_blkdone
4419	fsrc1	%d30, %d0
4420	ba,a,pt	%ncc, copyin_blkleft
4421copyin_blkd32:
4422	subcc	%i4, 8, %i4
4423	blu,pn	%ncc, copyin_blkdone
4424	faligndata %d32, %d34, %d48
4425	std	%d48, [%i0]
4426	add	%i0, 8, %i0
4427copyin_blkd34:
4428	subcc	%i4, 8, %i4
4429	blu,pn	%ncc, copyin_blkdone
4430	faligndata %d34, %d36, %d48
4431	std	%d48, [%i0]
4432	add	%i0, 8, %i0
4433copyin_blkd36:
4434	subcc	%i4, 8, %i4
4435	blu,pn	%ncc, copyin_blkdone
4436	faligndata %d36, %d38, %d48
4437	std	%d48, [%i0]
4438	add	%i0, 8, %i0
4439copyin_blkd38:
4440	subcc	%i4, 8, %i4
4441	blu,pn	%ncc, copyin_blkdone
4442	faligndata %d38, %d40, %d48
4443	std	%d48, [%i0]
4444	add	%i0, 8, %i0
4445copyin_blkd40:
4446	subcc	%i4, 8, %i4
4447	blu,pn	%ncc, copyin_blkdone
4448	faligndata %d40, %d42, %d48
4449	std	%d48, [%i0]
4450	add	%i0, 8, %i0
4451copyin_blkd42:
4452	subcc	%i4, 8, %i4
4453	blu,pn	%ncc, copyin_blkdone
4454	faligndata %d42, %d44, %d48
4455	std	%d48, [%i0]
4456	add	%i0, 8, %i0
4457copyin_blkd44:
4458	subcc	%i4, 8, %i4
4459	blu,pn	%ncc, copyin_blkdone
4460	faligndata %d44, %d46, %d48
4461	std	%d48, [%i0]
4462	add	%i0, 8, %i0
4463copyin_blkd46:
4464	subcc	%i4, 8, %i4
4465	blu,pn	%ncc, copyin_blkdone
4466	fsrc1	%d46, %d0
4467
4468copyin_blkleft:
44691:
4470	ldda	[%l7]ASI_USER, %d2
4471	add	%l7, 8, %l7
4472	subcc	%i4, 8, %i4
4473	faligndata %d0, %d2, %d8
4474	std	%d8, [%i0]
4475	blu,pn	%ncc, copyin_blkdone
4476	add	%i0, 8, %i0
4477	ldda	[%l7]ASI_USER, %d0
4478	add	%l7, 8, %l7
4479	subcc	%i4, 8, %i4
4480	faligndata %d2, %d0, %d8
4481	std	%d8, [%i0]
4482	bgeu,pt	%ncc, 1b
4483	add	%i0, 8, %i0
4484
4485copyin_blkdone:
4486	tst	%i2
4487	bz,pt	%ncc, .copyin_exit
4488	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0
4489
44907:	lduba	[%i1]ASI_USER, %i4
4491	inc	%i1
4492	inc	%i0
4493	deccc	%i2
4494	bgu	%ncc, 7b
4495	  stb	  %i4, [%i0 - 1]
4496
4497.copyin_exit:
4498	membar	#StoreLoad|#StoreStore
4499	btst	FPUSED_FLAG, SAVED_LOFAULT
4500	bz	%icc, 1f
4501	  nop
4502
4503	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
4504	wr	%o2, 0, %gsr
4505
4506	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
4507	btst	FPRS_FEF, %o3
4508	bz	%icc, 4f
4509	  nop
4510
4511	! restore fpregs from stack
4512	membar	#Sync
4513	add	%fp, STACK_BIAS - 257, %o2
4514	and	%o2, -64, %o2
4515	ldda	[%o2]ASI_BLK_P, %d0
4516	add	%o2, 64, %o2
4517	ldda	[%o2]ASI_BLK_P, %d16
4518	add	%o2, 64, %o2
4519	ldda	[%o2]ASI_BLK_P, %d32
4520	add	%o2, 64, %o2
4521	ldda	[%o2]ASI_BLK_P, %d48
4522	membar	#Sync
4523
4524	ba,pt	%ncc, 1f
4525	  wr	%o3, 0, %fprs		! restore fprs
4526
45274:
4528	FZERO				! zero all of the fpregs
4529	wr	%o3, 0, %fprs		! restore fprs
4530
45311:
4532	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
4533	membar	#Sync				! sync error barrier
4534	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4535	ret
4536	restore	%g0, 0, %o0
4537.copyin_err:
4538	ldn	[THREAD_REG + T_COPYOPS], %o4
4539	brz	%o4, 2f
4540	nop
4541	ldn	[%o4 + CP_COPYIN], %g2
4542	jmp	%g2
4543	nop
45442:
4545	retl
4546	mov	-1, %o0
4547	SET_SIZE(copyin)
4548
/*
 * xcopyin - entry point that reuses the common .do_copyin path (defined
 * outside this block) with its own fault-handler address.
 *
 * The address of .xcopyin_err is built in REAL_LOFAULT and control is
 * transferred to .do_copyin; the incoming %o registers are not touched
 * here.
 *
 * .xcopyin_err: if curthread's T_COPYOPS pointer is non-NULL, jump to the
 * routine stored at offset CP_XCOPYIN in that table; otherwise return
 * the value held in %g1.
 * NOTE(review): %g1 is presumably the error code supplied by the fault
 * path that reaches this label - confirm against the lofault handler.
 */
4549	ENTRY(xcopyin)
4550	sethi	%hi(.xcopyin_err), REAL_LOFAULT
4551	b	.do_copyin
	/* delay slot: add the low bits of the handler address */
4552	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
4553.xcopyin_err:
	/* %o4 = curthread's T_COPYOPS pointer; zero means no override */
4554	ldn	[THREAD_REG + T_COPYOPS], %o4
4555	brz	%o4, 2f
4556	nop
	/* hand off to the CP_XCOPYIN entry of the copyops table */
4557	ldn	[%o4 + CP_XCOPYIN], %g2
4558	jmp	%g2
4559	nop
45602:
4561	retl
	/* delay slot: return value = %g1 */
4562	mov	%g1, %o0
4563	SET_SIZE(xcopyin)
4564
/*
 * xcopyin_little - byte-at-a-time copy that reads the source through
 * ASI_AIUSL and writes the destination in reversed byte order.
 * NOTE(review): ASI_AIUSL is presumably the "as if user, little-endian"
 * address space - confirm in sys/asi.h.
 *
 * In:	%o0 = source address (read with lduba ... ASI_AIUSL)
 *	%o1 = destination address (written with ordinary stb)
 *	%o2 = byte count
 * Out:	%o0 = 0 on success, or the value found in %g1 when .little_err
 *	is entered.
 * Uses: %o3 (index), %o4 (scratch/data), %o5 (saved t_lofault).
 *
 * curthread's t_lofault is pointed at .little_err for the duration of
 * the copy; the previous value is kept in %o5 and restored on both exits.
 *
 * Indexing: %o3 runs from -count up to 0 while %o0 is decremented by 2
 * on every iteration, so the load address (%o0 + %o3) walks backwards
 * one byte at a time from src + count - 1, while the store address
 * (%o1 + %o3) walks forwards from dst.  Hence dst[i] = src[count-1-i].
 */
4565	ENTRY(xcopyin_little)
4566	sethi	%hi(.little_err), %o4
4567	ldn	[THREAD_REG + T_LOFAULT], %o5
4568	or	%o4, %lo(.little_err), %o4
4569	membar	#Sync				! sync error barrier
4570	stn	%o4, [THREAD_REG + T_LOFAULT]

	/* %o3 = -count; the condition codes also tell us if count == 0 */
4572	subcc	%g0, %o2, %o3
4573	add	%o0, %o2, %o0
4574	bz,pn	%ncc, 2f		! check for zero bytes
	/* delay slot (always executed): %o4 = count - 1 */
4575	sub	%o2, 1, %o4
4576	add	%o0, %o4, %o0		! start w/last byte
4577	add	%o1, %o2, %o1
	/* first load: %o0 + %o3 == src + count - 1 */
4578	lduba	[%o0+%o3]ASI_AIUSL, %o4

45801:	stb	%o4, [%o1+%o3]
	/* inccc sets carry only when %o3 wraps from -1 to 0 (all bytes done) */
4581	inccc	%o3
4582	sub	%o0, 2, %o0		! get next byte
	/* annulled delay slot: the next load is issued only if we loop */
4583	bcc,a,pt %ncc, 1b
4584	  lduba	[%o0+%o3]ASI_AIUSL, %o4

45862:	membar	#Sync				! sync error barrier
4587	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4588	retl
4589	mov	%g0, %o0		! return (0)

	/* Fault exit: restore the saved t_lofault and return %g1. */
4591.little_err:
4592	membar	#Sync				! sync error barrier
4593	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
4594	retl
4595	mov	%g1, %o0
4596	SET_SIZE(xcopyin_little)
4597
4598
4599/*
4600 * Copy a block of storage - must not overlap (from + len <= to).
4601 * No fault handler installed (to be called under on_fault())
4602 */
4603
/*
 * copyin_noerr enters the common .do_copyin path (defined outside this
 * block) with the address of .copyio_noerr loaded into REAL_LOFAULT.
 *
 * .copyio_noerr, which copyout_noerr also passes as its handler, does
 * nothing but jump to the address held in SAVED_LOFAULT.
 * NOTE(review): SAVED_LOFAULT presumably holds the t_lofault value that
 * was current on entry, i.e. the caller's on_fault() handler - confirm
 * against the common copy path.
 */
4604	ENTRY(copyin_noerr)
4605	sethi	%hi(.copyio_noerr), REAL_LOFAULT
4606	b	.do_copyin
	/* delay slot: add the low bits of the handler address */
4607	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
4608.copyio_noerr:
4609	jmp	SAVED_LOFAULT
4610	  nop
4611	SET_SIZE(copyin_noerr)
4612
4613/*
4614 * Copy a block of storage - must not overlap (from + len <= to).
4615 * No fault handler installed (to be called under on_fault())
4616 */
4617
/*
 * copyout_noerr enters the common .do_copyout path (defined outside this
 * block) with the address of .copyio_noerr loaded into REAL_LOFAULT.
 * .copyio_noerr itself is the label defined inside copyin_noerr.
 */
4618	ENTRY(copyout_noerr)
4619	sethi	%hi(.copyio_noerr), REAL_LOFAULT
4620	b	.do_copyout
	/* delay slot: add the low bits of the handler address */
4621	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
4622	SET_SIZE(copyout_noerr)
4623
/*
 * Tunables for the hardware-assisted copy/zero paths.  Each is one
 * 32-bit word.
 *
 * use_hw_bcopy, use_hw_copyio and use_hw_bzero are initialised to 1.
 * The hwblkclr header comment requires its caller to check use_hw_bzero.
 * NOTE(review): the other two are presumably the matching enable flags
 * for bcopy and copyin/copyout; their readers are not in this part of
 * the file.
 *
 * hw_copy_limit_{1,2,4,8} are initialised to 0.  The copyin dispatch
 * code reads hw_copy_limit_4 and hw_copy_limit_2: a value of zero sends
 * the copy down the non-HW loops, otherwise only copies larger than the
 * limit branch to .big_copyin.
 * NOTE(review): hw_copy_limit_1 and hw_copy_limit_8 presumably play the
 * same role for byte- and 8-byte-aligned copies - confirm at their users.
 */
4624	.align	4
4625	DGDEF(use_hw_bcopy)
4626	.word	1
4627	DGDEF(use_hw_copyio)
4628	.word	1
4629	DGDEF(use_hw_bzero)
4630	.word	1
4631	DGDEF(hw_copy_limit_1)
4632	.word	0
4633	DGDEF(hw_copy_limit_2)
4634	.word	0
4635	DGDEF(hw_copy_limit_4)
4636	.word	0
4637	DGDEF(hw_copy_limit_8)
4638	.word	0
4639
	/* The code that follows is placed in .text on a 64-byte boundary. */
4640	.align	64
4641	.section ".text"
4642
4643
4644/*
4645 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
4646 * longer than 256 bytes in length using spitfire's block stores.  If
4647 * the criteria for using this routine are not met then it calls bzero
4648 * and returns 1.  Otherwise 0 is returned indicating success.
4649 * Caller is responsible for ensuring use_hw_bzero is true and that
4650 * kpreempt_disable() has been called.
4651 */
4652	! %i0 - start address
4653	! %i1 - length of region (multiple of 64)
4654	! %l0 - saved fprs
4655	! %l1 - pointer to saved %d0 block
4656	! %l2 - documented as saved curthread->t_lwp, but not referenced by hwblkclr
4657
/*
 * Flow summary for hwblkclr:
 *   1. Fall back to bzero and return 1 unless %i0 is 64-byte aligned,
 *      %i1 >= 256 and %i1 is a multiple of 64.
 *   2. If %fprs has FPRS_FEF set, save the %d0 block on the stack.
 *   3. Zero %d0-%d14 and store that block 256 bytes per pass, then
 *      finish a 64/128/192-byte remainder with a computed jump into
 *      the tail of the store sequence.
 *   4. Restore the saved %d0 block (if one was saved) and %fprs, then
 *      return 0.
 */
4658	ENTRY(hwblkclr)
4659	! get another window w/space for one aligned block of saved fpregs
4660	save	%sp, -SA(MINFRAME + 2*64), %sp
4661
4662	! Must be block-aligned
4663	andcc	%i0, (64-1), %g0
4664	bnz,pn	%ncc, 1f
4665	  nop
4666
4667	! ... and must be 256 bytes or more
4668	cmp	%i1, 256
4669	blu,pn	%ncc, 1f
4670	  nop
4671
4672	! ... and length must be a multiple of 64
4673	andcc	%i1, (64-1), %g0
4674	bz,pn	%ncc, 2f
4675	  nop
4676
46771:	! punt, call bzero but notify the caller that bzero was used
4678	mov	%i0, %o0
4679	call	bzero
	/* delay slot: second bzero argument = length */
4680	  mov	%i1, %o1
4681	ret
4682	restore	%g0, 1, %o0	! return (1) - did not use block operations
4683
	/* %l0 keeps the caller's %fprs until the exit path restores it. */
46842:	rd	%fprs, %l0		! check for unused fp
4685	btst	FPRS_FEF, %l0
4686	bz	1f
4687	  nop
4688
4689	! save in-use fpregs on stack
4690	membar	#Sync
	/*
	 * %l1 = (%fp + STACK_BIAS - 65) rounded down to a multiple of 64:
	 * a 64-byte aligned slot lying wholly within the 128 bytes just
	 * below %fp + STACK_BIAS, which is why the save above asks for
	 * 2*64 bytes.
	 */
4691	add	%fp, STACK_BIAS - 65, %l1
4692	and	%l1, -64, %l1
4693	stda	%d0, [%l1]ASI_BLK_P
4694
46951:	membar	#StoreStore|#StoreLoad|#LoadStore
4696	wr	%g0, FPRS_FEF, %fprs
	/* %asi = ASI_BLK_P so the "stda ... %asi" stores below are block stores */
4697	wr	%g0, ASI_BLK_P, %asi
4698
4699	! Clear block
4700	fzero	%d0
4701	fzero	%d2
4702	fzero	%d4
4703	fzero	%d6
4704	fzero	%d8
4705	fzero	%d10
4706	fzero	%d12
4707	fzero	%d14
4708
	/* %i3 = bytes cleared per pass of the main loop (4 blocks of 64) */
4709	mov	256, %i3
4710	ba	.pz_doblock
4711	  nop
4712
	/*
	 * Main loop body: four block stores clear 256 bytes.  The first
	 * store (offset 192) is issued from the annulled delay slot of the
	 * branch at .pz_doblock.
	 */
4713.pz_blkstart:
4714      ! stda	%d0, [%i0+192]%asi  ! in dly slot of branch that got us here
4715	stda	%d0, [%i0+128]%asi
4716	stda	%d0, [%i0+64]%asi
4717	stda	%d0, [%i0]%asi
	/* advance the address and shrink the remaining length by %i3 */
4718.pz_zinst:
4719	add	%i0, %i3, %i0
4720	sub	%i1, %i3, %i1
4721.pz_doblock:
4722	cmp	%i1, 256
4723	bgeu,a	%ncc, .pz_blkstart
4724	  stda	%d0, [%i0+192]%asi
4725
	/*
	 * Fewer than 256 bytes remain.  Note that the andn below sits in
	 * the delay slot of the blu and so also runs when we branch to
	 * .pz_finish; that is harmless because %i3 is not read there.
	 */
4726	cmp	%i1, 64
4727	blu	%ncc, .pz_finish
4728
4729	andn	%i1, (64-1), %i3
	/*
	 * %i3 = remaining bytes rounded down to whole blocks.  %i2 = %i3/16
	 * is 4 bytes (one instruction) per remaining block, so jumping to
	 * .pz_zinst - %i2 executes exactly the last %i3/64 stda
	 * instructions ahead of .pz_zinst.
	 */
4730	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
4731	set	.pz_zinst, %i4
4732	sub	%i4, %i2, %i4
4733	jmp	%i4
4734	  nop
4735
4736.pz_finish:
4737	membar	#Sync
	/*
	 * If FPRS_FEF was clear on entry nothing was saved: the annulled
	 * delay slot just restores %fprs and we skip the reload.
	 */
4738	btst	FPRS_FEF, %l0
4739	bz,a	.pz_finished
4740	  wr	%l0, 0, %fprs		! restore fprs
4741
4742	! restore fpregs from stack
4743	ldda	[%l1]ASI_BLK_P, %d0
4744	membar	#Sync
4745	wr	%l0, 0, %fprs		! restore fprs
4746
	/*
	 * Success exit: block stores were used, so the value returned in
	 * the caller's %o0 is always 0 here (the bzero fallback above is
	 * the path that returns 1).
	 */
4747.pz_finished:
4748	ret
4749	restore	%g0, 0, %o0		! return (bzero or not)
4750	SET_SIZE(hwblkclr)
4751
4752	/*
4753	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
4754	 * using physical addresses.
	 *
	 * All accesses are 64-bit ldxa/stxa through ASI_MEM.  The four
	 * loads are issued before the first store.
	 *
	 * PSTATE_IE is cleared in %pstate for the duration of the copy and
	 * the original %pstate (kept in %g1) is written back in the delay
	 * slot of the return.
	 * NOTE(review): presumably this keeps interrupts from being taken
	 * in the middle of the copy - confirm the intent with callers.
	 *
	 * Clobbers: %g1, %g2, %o2-%o5; %o0 and %o1 are each advanced by 24.
	 * NOTE(review): assumes src and dst are 8-byte aligned, as 64-bit
	 * loads/stores normally require - TODO confirm.
4755	 */
4756	ENTRY_NP(hw_pa_bcopy32)
	/* %g1 = original %pstate; %g2 = same value with PSTATE_IE cleared */
4757	rdpr    %pstate, %g1
4758	andn    %g1, PSTATE_IE, %g2
4759	wrpr    %g0, %g2, %pstate
4760
	/* load the four source doublewords into %o2-%o5 */
4761	ldxa    [%o0]ASI_MEM, %o2
4762	add     %o0, 8, %o0
4763	ldxa    [%o0]ASI_MEM, %o3
4764	add     %o0, 8, %o0
4765	ldxa    [%o0]ASI_MEM, %o4
4766	add     %o0, 8, %o0
4767	ldxa    [%o0]ASI_MEM, %o5
	/* store them to the destination in the same order */
4768	stxa    %o2, [%o1]ASI_MEM
4769	add     %o1, 8, %o1
4770	stxa    %o3, [%o1]ASI_MEM
4771	add     %o1, 8, %o1
4772	stxa    %o4, [%o1]ASI_MEM
4773	add     %o1, 8, %o1
4774	stxa    %o5, [%o1]ASI_MEM
4775
4776	membar	#Sync
4777	retl
	/* delay slot: put back the %pstate value saved on entry */
4778	  wrpr    %g0, %g1, %pstate
4779	SET_SIZE(hw_pa_bcopy32)
4780