xref: /titanic_41/usr/src/uts/sun4u/cpu/opl_olympus_copy.s (revision 4445fffbbb1ea25fd0e9ea68b9380dd7a6709025)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#include <sys/param.h>
27#include <sys/errno.h>
28#include <sys/asm_linkage.h>
29#include <sys/vtrace.h>
30#include <sys/machthread.h>
31#include <sys/clock.h>
32#include <sys/asi.h>
33#include <sys/fsr.h>
34#include <sys/privregs.h>
35
36#if !defined(lint)
37#include "assym.h"
38#endif	/* lint */
39
40/*
41 * Pseudo-code to aid in understanding the control flow of the
42 * bcopy/copyin/copyout routines.
43 *
44 * On entry:
45 *
46 * 	! Determine whether to use the FP register version
47 * 	! or the leaf routine version depending on size
48 * 	! of copy and flags.  Set up error handling accordingly.
49 *	! The transition point depends on whether the src and
50 * 	! dst addresses can be aligned to long word, word,
51 * 	! half word, or byte boundaries.
52 *	!
53 *	! WARNING: <Register usage convention>
54 *	! For FP version, %l6 holds previous error handling and
55 *	! a flag: TRAMP_FLAG (low bits)
56 *	! for leaf routine version, %o4 holds those values.
57 *	! So either %l6 or %o4 is reserved and not available for
58 *	! any other use.
59 *
60 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
61 * 		go to small_copy;		! to speed short copies
62 *
63 * 	if (src,dst long word alignable) {
64 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
65 * 			go to small_copy;
66 *		if (length <= hw_copy_limit_8)
67 * 			go to small_copy;
68 * 		go to FPBLK_copy;
69 * 	}
70 * 	if (src,dst not alignable) {
71 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
72 * 			go to small_copy;
73 *		if (length <= hw_copy_limit_1)
74 * 			go to small_copy;
75 * 		go to FPBLK_copy;
76 * 	}
77 * 	if (src,dst halfword alignable) {
78 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
79 * 			go to small_copy;
80 *		if (length <= hw_copy_limit_2)
81 * 			go to small_copy;
82 * 		go to FPBLK_copy;
83 * 	}
84 * 	if (src,dst word alignable) {
85 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
86 * 			go to small_copy;
87 *		if (length <= hw_copy_limit_4)
88 * 			go to small_copy;
89 * 		go to FPBLK_copy;
90 * 	}
91 *
92 * small_copy:
93 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
94 *
95 *	if (count <= 3)				! fast path for tiny copies
96 *		go to sm_left;			! special finish up code
97 *	else
98 *		if (count > CHKSIZE)		! medium sized copies
99 *			go to sm_med		! tuned by alignment
100 *		if(src&dst not both word aligned) {
101 *	sm_movebytes:
102 *			move byte by byte in 4-way unrolled loop
103 *			fall into sm_left;
104 *	sm_left:
105 *			move 0-3 bytes byte at a time as needed.
106 *			restore error handler and exit.
107 *
108 * 		} else {	! src&dst are word aligned
109 *			check for at least 8 bytes left,
110 *			move word at a time, unrolled by 2
111 *			when fewer than 8 bytes left,
112 *	sm_half:	move half word at a time while 2 or more bytes left
113 *	sm_byte:	move final byte if necessary
114 *	sm_exit:
115 *			restore error handler and exit.
116 *		}
117 *
118 * ! Medium length cases with at least CHKSIZE bytes available
119 * ! method: line up src and dst as best possible, then
120 * ! move data in 4-way unrolled loops.
121 *
122 * sm_med:
123 *	if(src&dst unalignable)
124 * 		go to sm_movebytes
125 *	if(src&dst halfword alignable)
126 *		go to sm_movehalf
127 *	if(src&dst word alignable)
128 *		go to sm_moveword
129 * ! fall into long word movement
130 *	move bytes until src is word aligned
131 *	if not long word aligned, move a word
132 *	move long words in 4-way unrolled loop until < 32 bytes left
133 *      move long words in 1-way unrolled loop until < 8 bytes left
134 *	if zero bytes left, goto sm_exit
135 *	if one byte left, go to sm_byte
136 *	else go to sm_half
137 *
138 * sm_moveword:
139 *	move bytes until src is word aligned
140 *	move words in 4-way unrolled loop until < 16 bytes left
141 *      move words in 1-way unrolled loop until < 4 bytes left
142 *	if zero bytes left, goto sm_exit
143 *	if one byte left, go to sm_byte
144 *	else go to sm_half
145 *
146 * sm_movehalf:
147 *	move a byte if needed to align src on halfword
148 *	move halfwords in 4-way unrolled loop until < 8 bytes left
149 *	if zero bytes left, goto sm_exit
150 *	if one byte left, go to sm_byte
151 *	else go to sm_half
152 *
153 *
154 * FPBLK_copy:
155 * 	%l6 = curthread->t_lofault;
156 * 	if (%l6 != NULL) {
157 * 		membar #Sync
158 * 		curthread->t_lofault = .copyerr;
159 * 		caller_error_handler = TRUE             ! %l6 |= 2
160 * 	}
161 *
162 *	! for FPU testing we must not migrate cpus
163 * 	if (curthread->t_lwp == NULL) {
164 *		! Kernel threads do not have pcb's in which to store
165 *		! the floating point state, so disallow preemption during
166 *		! the copy.  This also prevents cpu migration.
167 * 		kpreempt_disable(curthread);
168 *	} else {
169 *		thread_nomigrate();
170 *	}
171 *
172 * 	old_fprs = %fprs;
173 * 	old_gsr = %gsr;
174 * 	if (%fprs.fef) {
175 * 		%fprs.fef = 1;
176 * 		save current fpregs on stack using blockstore
177 * 	} else {
178 * 		%fprs.fef = 1;
179 * 	}
180 *
181 *
182 * 	do_blockcopy_here;
183 *
184 * In lofault handler:
185 *	curthread->t_lofault = .copyerr2;
186 *	Continue on with the normal exit handler
187 *
188 * On normal exit:
189 * 	%gsr = old_gsr;
190 * 	if (old_fprs & FPRS_FEF)
191 * 		restore fpregs from stack using blockload
192 *	else
193 *		zero fpregs
194 * 	%fprs = old_fprs;
195 * 	membar #Sync
196 * 	curthread->t_lofault = (%l6 & ~3);
197 *	! following test omitted from copyin/copyout as they
198 *	! will always have a current thread
199 * 	if (curthread->t_lwp == NULL)
200 *		kpreempt_enable(curthread);
201 *	else
202 *		thread_allowmigrate();
203 * 	return (0)
204 *
205 * In second lofault handler (.copyerr2):
206 *	We've tried to restore fp state from the stack and failed.  To
207 *	prevent from returning with a corrupted fp state, we will panic.
208 */
209
210/*
211 * Comments about optimization choices
212 *
213 * The initial optimization decision in this code is to determine
214 * whether to use the FP registers for a copy or not.  If we don't
215 * use the FP registers, we can execute the copy as a leaf routine,
216 * saving a register save and restore.  Also, less elaborate setup
217 * is required, allowing short copies to be completed more quickly.
218 * For longer copies, especially unaligned ones (where the src and
219 * dst do not align to allow simple ldx,stx operation), the FP
220 * registers allow much faster copy operations.
221 *
222 * The estimated extra cost of the FP path will vary depending on
223 * src/dst alignment, dst offset from the next 64 byte FPblock store
224 * boundary, remaining src data after the last full dst cache line is
225 * moved, whether the FP registers need to be saved, and some other
226 * minor issues.  The average additional overhead is estimated to be
227 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
228 * around 10 clocks, elaborate calculation would slow down all
229 * longer copies and only benefit a small portion of medium sized
230 * copies.  Rather than incur such cost, we chose fixed transition
231 * points for each of the alignment choices.
232 *
233 * For the inner loop, here is a comparison of the per cache line
234 * costs for each alignment when src&dst are in cache:
235 *
236 * byte aligned:  108 clocks slower for non-FPBLK
237 * half aligned:   44 clocks slower for non-FPBLK
238 * word aligned:   12 clocks slower for non-FPBLK
239 * long aligned:    4 clocks >>faster<< for non-FPBLK
240 *
241 * The long aligned loop runs faster because it does no prefetching.
242 * That wins if the data is in cache or there is too little
243 * data to gain much benefit from prefetching.  But when there
244 * is more data and that data is not in cache, failing to prefetch
245 * can run much slower.  In addition, there is a 2 Kbyte store queue
246 * which will cause the non-FPBLK inner loop to slow for larger copies.
247 * The exact tradeoff is strongly load and application dependent, with
248 * increasing risk of a customer visible performance regression if the
249 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
250 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
251 * upper limit for the non-FPBLK code.  To minimize performance regression
252 * risk while still gaining the primary benefits of the improvements to
253 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
254 * hw_copy_limit_*.  Later experimental studies using different values
255 * of hw_copy_limit_* can be used to make further adjustments if
256 * appropriate.
257 *
258 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
259 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
260 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
261 * hw_copy_limit_8 = src and dst are longword aligned
262 *
263 * To say that src and dst are word aligned means that after
264 * some initial alignment activity of moving 0 to 3 bytes,
265 * both the src and dst will be on word boundaries so that
266 * word loads and stores may be used.
267 *
268 * Default values at May,2005 are:
269 * hw_copy_limit_1 =  256
270 * hw_copy_limit_2 =  512
271 * hw_copy_limit_4 = 1024
272 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
273 *
274 *
275 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
276 * disabled for that alignment choice.
277 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
278 * the value of VIS_COPY_THRESHOLD is used.
279 * It is not envisioned that hw_copy_limit_? will be changed in the field
280 * It is provided to allow for disabling FPBLK copies and to allow
281 * easy testing of alternate values on future HW implementations
282 * that might have different cache sizes, clock rates or instruction
283 * timing rules.
284 *
285 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
286 * threshold to speedup all shorter copies (less than 256).  That
287 * saves an alignment test, memory reference, and enabling test
288 * for all short copies, or an estimated 24 clocks.
289 *
290 * The order in which these limits are checked does matter since each
291 * non-predicted tst and branch costs around 10 clocks.
292 * If src and dst are randomly selected addresses,
293 * 4 of 8 will not be alignable.
294 * 2 of 8 will be half word alignable.
295 * 1 of 8 will be word alignable.
296 * 1 of 8 will be long word alignable.
297 * But, tests on running kernels show that src and dst to copy code
298 * are typically not on random alignments.  Structure copies and
299 * copies of larger data sizes are often on long word boundaries.
300 * So we test the long word alignment case first, then
301 * the byte alignment, then halfword, then word alignment.
302 *
303 * Several times, tests for length are made to split the code
304 * into subcases.  These tests often allow later tests to be
305 * avoided.  For example, within the non-FPBLK copy, we first
306 * check for tiny copies of 3 bytes or less.  That allows us
307 * to use a 4-way unrolled loop for the general byte copy case
308 * without a test on loop entry.
309 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
310 * vs longer cases.  For the really short case, we don't attempt
311 * to align src and dst.  We try to minimize special case tests in
312 * the shortest loops as each test adds a significant percentage
313 * to the total time.
314 *
315 * For the medium sized cases, we allow ourselves to adjust the
316 * src and dst alignment and provide special cases for each of
317 * the four adjusted alignment cases. The CHKSIZE that was used
318 * to decide between short and medium size was chosen to be 39
319 * as that allows for the worst case of 7 bytes of alignment
320 * shift and 4 times 8 bytes for the first long word unrolling.
321 * That knowledge saves an initial test for length on entry into
322 * the medium cases.  If the general loop unrolling factor were
323 * to be increased, this number would also need to be adjusted.
324 *
325 * For all cases in the non-FPBLK code where it is known that at
326 * least 4 chunks of data are available for movement, the
327 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
328 * or 2 clocks per data element.
329 *
330 * Instruction alignment is forced by use of .align 16 directives
331 * and nops which are not executed in the code.  This
332 * combination of operations shifts the alignment of following
333 * loops to ensure that loops are aligned so that their instructions
334 * fall within the minimum number of 4 instruction fetch groups.
335 * If instructions are inserted or removed between the .align
336 * instruction and the unrolled loops, then the alignment needs
337 * to be readjusted.  Misaligned loops can add a clock per loop
338 * iteration to the loop timing.
339 *
340 * In a few cases, code is duplicated to avoid a branch.  Since
341 * a non-predicted tst and branch takes 10 clocks, this savings
342 * is judged an appropriate time-space tradeoff.
343 *
344 * Within the FPBLK-code, the prefetch method in the inner
345 * loop needs to be explained as it is not standard.  Two
346 * prefetches are issued for each cache line instead of one.
347 * The primary one is at the maximum reach of 8 cache lines.
348 * Most of the time, that maximum prefetch reach gives the
349 * cache line more time to reach the processor for systems with
350 * higher processor clocks.  But, sometimes memory interference
351 * can cause that prefetch to be dropped.  Putting a second
352 * prefetch at a reach of 5 cache lines catches the drops
353 * three iterations later and shows a measured improvement
354 * in performance over any similar loop with a single prefetch.
355 * The prefetches are placed in the loop so they overlap with
356 * non-memory instructions, so that there is no extra cost
357 * when the data is already in-cache.
358 *
359 */
360
361/*
362 * Notes on preserving existing fp state and on membars.
363 *
364 * When a copyOP decides to use fp we may have to preserve existing
365 * floating point state.  It is not the caller's state that we need to
366 * preserve - the rest of the kernel does not use fp and, anyway, fp
367 * registers are volatile across a call.  Some examples:
368 *
369 *	- userland has fp state and is interrupted (device interrupt
370 *	  or trap) and within the interrupt/trap handling we use
371 *	  bcopy()
372 *	- another (higher level) interrupt or trap handler uses bcopy
373 *	  while a bcopy from an earlier interrupt is still active
374 *	- an asynchronous error trap occurs while fp state exists (in
375 *	  userland or in kernel copy) and the tl0 component of the handling
376 *	  uses bcopy
377 *	- a user process with fp state incurs a copy-on-write fault and
378 *	  hwblkpagecopy always uses fp
379 *
380 * We therefore need a per-call place in which to preserve fp state -
381 * using our stack is ideal (and since fp copy cannot be leaf optimized
382 * because of calls it makes, this is no hardship).
383 *
384 * When we have finished fp copy (with its repeated block stores)
385 * we must membar #Sync so that our block stores may complete before
386 * we either restore the original fp state into the fp registers or
387 * return to a caller which may initiate other fp operations that could
388 * modify the fp regs we used before the block stores complete.
389 *
390 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
391 * t_lofault is not NULL will not panic but will instead trampoline
392 * to the registered lofault handler.  There is no need for any
393 * membars for these - eg, our store to t_lofault will always be visible to
394 * ourselves and it is our cpu which will take any trap.
395 *
396 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
397 * while t_lofault is not NULL will also not panic.  Since we're copying
398 * to or from userland the extent of the damage is known - the destination
399 * buffer is incomplete.  So trap handlers will trampoline to the lofault
400 * handler in this case which should take some form of error action to
401 * avoid using the incomplete buffer.  The trap handler also flags the
402 * fault so that later return-from-trap handling (for the trap that brought
403 * this thread into the kernel in the first place) can notify the process
404 * and reboot the system (or restart the service with Greenline/Contracts).
405 *
406 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
407 * result in deferred error traps - the trap is taken sometime after
408 * the event and the trap PC may not be the PC of the faulting access.
409 * Delivery of such pending traps can be forced by a membar #Sync, acting
410 * as an "error barrier" in this role.  To accurately apply the user/kernel
411 * separation described in the preceding paragraph we must force delivery
412 * of deferred traps affecting kernel state before we install a lofault
413 * handler (if we interpose a new lofault handler on an existing one there
414 * is no need to repeat this), and we must force delivery of deferred
415 * errors affecting the lofault-protected region before we clear t_lofault.
416 * Failure to do so results in lost kernel state being interpreted as
417 * affecting a copyin/copyout only, or of an error that really only
418 * affects copy data being interpreted as losing kernel state.
419 *
420 * Since the copy operations may preserve and later restore floating
421 * point state that does not belong to the caller (see examples above),
422 * we must be careful in how we do this in order to prevent corruption
423 * of another program.
424 *
425 * To make sure that floating point state is always saved and restored
426 * correctly, the following "big rules" must be followed when the floating
427 * point registers will be used:
428 *
429 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
430 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
431 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
432 *    lofault handler was set coming in.
433 *
434 * 2. The FPUSED flag indicates that all FP state has been successfully stored
435 *    on the stack.  It should not be set until this save has been completed.
436 *
437 * 3. The FPUSED flag should not be cleared on exit until all FP state has
438 *    been restored from the stack.  If an error occurs while restoring
439 *    data from the stack, the error handler can check this flag to see if
440 *    a restore is necessary.
441 *
442 * 4. Code run under the new lofault handler must be kept to a minimum.  In
443 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
444 *    to kpreempt(), should not be made until after the lofault handler has
445 *    been restored.
446 */
447
448/*
449 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
450 * to "break even" using FP/VIS-accelerated memory operations.
451 * The FPBLK code assumes a minimum number of bytes are available
452 * to be moved on entry.  Check that code carefully before
453 * reducing VIS_COPY_THRESHOLD below 256.
454 */
455/*
456 * This shadows sys/machsystm.h which can't be included due to the lack of
457 * _ASM guards in include files it references. Change it here, change it there.
458 */
459#define VIS_COPY_THRESHOLD 256	/* bytes; kcopy sends count <= this straight to .kcopy_small */
460
461/*
462 * TEST for very short copies
463 * Be aware that the maximum unroll for the short unaligned case
464 * is SHORTCOPY+1
465 */
466#define SHORTCOPY 3	/* per the overview comment: counts <= 3 take the sm_left finish-up path */
467#define CHKSIZE  39	/* per the overview comment: 7 alignment bytes + 4 * 8 unrolled bytes; larger counts go to sm_med */
468
469/*
470 * Indicates that we're to trampoline to the error handler.
471 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
472 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 *
 * The paragraph above describes TRAMP_FLAG.  All three values below are
 * bit masks applied to the register that holds the saved t_lofault value
 * (%l6 in .copyerr, %o4 in .sm_copyerr).
473 */
474#define	FPUSED_FLAG	1	/* .copyerr restores/zeroes FP state only if this is set in %l6 */
475#define	TRAMP_FLAG	2	/* error paths jump to the saved handler if set, else return errno */
476#define	MASK_FLAGS	3	/* FPUSED_FLAG | TRAMP_FLAG; cleared to recover the handler address */
477
478/*
479 * Number of outstanding prefetches.
480 * first prefetch moves data from L2 to L1 (n_reads)
481 * second prefetch moves data from memory to L2 (one_read)
482 */
/*
 * NOTE(review): neither prefetch constant is referenced in the part of
 * the file reviewed here; their units and where they are consumed
 * should be confirmed against the FP block-copy loops.
 */
483#define	OLYMPUS_C_PREFETCH	24
484#define	OLYMPUS_C_2ND_PREFETCH	12
485
486#define	VIS_BLOCKSIZE		64	/* bytes; alignment and step of the block stores/loads in the BST_/BLD_ macros */
487
488/*
489 * Size of stack frame in order to accommodate a 64-byte aligned
490 * floating-point register save area and 2 64-bit temp locations.
491 * All copy functions use two quadrants of fp registers; to assure a
492 * block-aligned two block buffer in which to save we must reserve
493 * three blocks on stack.  Not all functions preserve %fprs on stack
494 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
495 *
496 *    _______________________________________ <-- %fp + STACK_BIAS
497 *    | We may need to preserve 2 quadrants |
498 *    | of fp regs, but since we do so with |
499 *    | BST/BLD we need room in which to    |
500 *    | align to VIS_BLOCKSIZE bytes.  So   |
501 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
502 *    |-------------------------------------|
503 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
504 *    |-------------------------------------|
505 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
506 *    ---------------------------------------
507 */
/*
 * All offsets below are subtracted from %fp + STACK_BIAS.
 *
 * SAVED_FPREGS_ADJUST is what the BST_/BLD_ macros subtract before
 * rounding down to a VIS_BLOCKSIZE boundary.  The rounded address is at
 * most SAVED_FPREGS_ADJUST + VIS_BLOCKSIZE - 1 = 190 bytes below
 * %fp + STACK_BIAS, so the two-block save area stays inside the
 * 3 * VIS_BLOCKSIZE region.
 */
508#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))	/* 208 bytes */
509#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)	/* 192: size of the FP save region */
510#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)	/* 127: see above */
511#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)	/* 200: slot holding saved %fprs */
512#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)	/* 208: slot holding saved %gsr */
513
514/*
515 * Common macros used by the various versions of the block copy
516 * routines in this file.
517 */
518
519/*
520 * In FP copies if we do not have preserved data to restore over
521 * the fp regs we used then we must zero those regs to avoid
522 * exposing portions of the data to later threads (data security).
523 *
524 * Copy functions use either quadrants 1 and 3 or 2 and 4.
525 *
526 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
527 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
528 *
529 * The instructions below are quicker than repeated fzero instructions
530 * since they can dispatch down two fp pipelines.
531 */
/*
 * FZEROQ1Q3: zero the even-numbered (double) registers %f0 - %f14 and
 * %f32 - %f46.  %f0 is cleared with fzero and then copied into each of
 * the other fifteen registers.  No integer registers are touched.
 */
532#define	FZEROQ1Q3			\
533	fzero	%f0			;\
534	fmovd	%f0, %f2		;\
535	fmovd	%f0, %f4		;\
536	fmovd	%f0, %f6		;\
537	fmovd	%f0, %f8		;\
538	fmovd	%f0, %f10		;\
539	fmovd	%f0, %f12		;\
540	fmovd	%f0, %f14		;\
541	fmovd	%f0, %f32		;\
542	fmovd	%f0, %f34		;\
543	fmovd	%f0, %f36		;\
544	fmovd	%f0, %f38		;\
545	fmovd	%f0, %f40		;\
546	fmovd	%f0, %f42		;\
547	fmovd	%f0, %f44		;\
548	fmovd	%f0, %f46
549
/*
 * FZEROQ2Q4: zero the even-numbered (double) registers %f16 - %f30 and
 * %f48 - %f62.  %f16 is cleared with fzero and is then the source of
 * every fmovd, so the macro only reads and writes registers inside
 * quadrants 2 and 4.  (Copying from %f0 instead would propagate
 * whatever quadrant 1 happens to hold, since this macro never zeroes
 * %f0.)  No integer registers are touched.
 */
550#define	FZEROQ2Q4			\
551	fzero	%f16			;\
552	fmovd	%f16, %f18		;\
553	fmovd	%f16, %f20		;\
554	fmovd	%f16, %f22		;\
555	fmovd	%f16, %f24		;\
556	fmovd	%f16, %f26		;\
557	fmovd	%f16, %f28		;\
558	fmovd	%f16, %f30		;\
559	fmovd	%f16, %f48		;\
560	fmovd	%f16, %f50		;\
561	fmovd	%f16, %f52		;\
562	fmovd	%f16, %f54		;\
563	fmovd	%f16, %f56		;\
564	fmovd	%f16, %f58		;\
565	fmovd	%f16, %f60		;\
566	fmovd	%f16, %f62
567
568/*
569 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
570 * Used to save and restore in-use fp registers when we want to use FP
571 * and find fp already in use and copy size still large enough to justify
572 * the additional overhead of this save and restore.
573 *
574 * A membar #Sync is needed before save to sync fp ops initiated before
575 * the call to the copy function (by whoever has fp in use); for example
576 * an earlier block load to the quadrant we are about to save may still be
577 * "in flight".  A membar #Sync is required at the end of the save to
578 * sync our block store (the copy code is about to begin ldd's to the
579 * first quadrant).
580 *
581 * Similarly: a membar #Sync before restore allows the block stores of
582 * the copy operation to complete before we fill the quadrants with their
583 * original data, and a membar #Sync after restore lets the block loads
584 * of the restore complete before we return to whoever has the fp regs
585 * in use.  To avoid repeated membar #Sync we make it the responsibility
586 * of the copy code to membar #Sync immediately after copy is complete
587 * and before using the BLD_*_FROMSTACK macro.
588 */
589#if !defined(lint)
/*
 * BST_FPQ1Q3_TOSTACK(tmp1): block-store the register groups starting at
 * %f0 and %f32 into two consecutive VIS_BLOCKSIZE-aligned blocks below
 * %fp + STACK_BIAS, then membar #Sync.  tmp1 is clobbered (left pointing
 * at the second block).
 * NOTE(review): the leading membar #Sync is commented out in the macro
 * body; presumably callers issue it themselves - confirm at call sites.
 */
590#define BST_FPQ1Q3_TOSTACK(tmp1)				\
591	/* membar #Sync	*/					;\
592	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
593	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
594	stda	%f0, [tmp1]ASI_BLK_P				;\
595	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
596	stda	%f32, [tmp1]ASI_BLK_P				;\
597	membar	#Sync
598
/*
 * BLD_FPQ1Q3_FROMSTACK(tmp1): block-load the register groups starting at
 * %f0 and %f32 from the same two aligned blocks that BST_FPQ1Q3_TOSTACK
 * computes (identical address arithmetic), then membar #Sync.  tmp1 is
 * clobbered.  The macro itself issues no membar before the loads.
 */
599#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
600	/* membar #Sync - provided at copy completion */	;\
601	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
602	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
603	ldda	[tmp1]ASI_BLK_P, %f0				;\
604	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
605	ldda	[tmp1]ASI_BLK_P, %f32				;\
606	membar	#Sync
607
/*
 * BST_FPQ2Q4_TOSTACK(tmp1): block-store the register groups starting at
 * %f16 and %f48 into two consecutive VIS_BLOCKSIZE-aligned blocks below
 * %fp + STACK_BIAS, then membar #Sync.  tmp1 is clobbered.
 * NOTE(review): the leading membar #Sync is commented out in the macro
 * body; presumably callers issue it themselves - confirm at call sites.
 */
608#define BST_FPQ2Q4_TOSTACK(tmp1)				\
609	/* membar #Sync */					;\
610	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
611	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
612	stda	%f16, [tmp1]ASI_BLK_P				;\
613	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
614	stda	%f48, [tmp1]ASI_BLK_P				;\
615	membar	#Sync
616
/*
 * BLD_FPQ2Q4_FROMSTACK(tmp1): block-load the register groups starting at
 * %f16 and %f48 from the same two aligned blocks that BST_FPQ2Q4_TOSTACK
 * computes (identical address arithmetic), then membar #Sync.  tmp1 is
 * clobbered.  The macro itself issues no membar before the loads.
 */
617#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
618	/* membar #Sync - provided at copy completion */	;\
619	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
620	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
621	ldda	[tmp1]ASI_BLK_P, %f16				;\
622	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
623	ldda	[tmp1]ASI_BLK_P, %f48				;\
624	membar	#Sync
625#endif
626
627/*
628 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
629 * prevent preemption if there is no t_lwp to save FP state to on context
630 * switch) before commencing a FP copy, and reallow it on completion or
631 * in error trampoline paths when we were using FP copy.
632 *
633 * Both macros may call other functions, so be aware that all outputs are
634 * forfeit after using these macros.  For this reason we do not pass registers
635 * to use - we just use any outputs we want.
636 *
637 * Pseudo code:
638 *
639 * FP_NOMIGRATE:
640 *
641 * if (curthread->t_lwp) {
642 *	thread_nomigrate();
643 * } else {
644 *	kpreempt_disable();
645 * }
646 *
647 * FP_ALLOWMIGRATE:
648 *
649 * if (curthread->t_lwp) {
650 *	thread_allowmigrate();
651 * } else {
652 *	kpreempt_enable();
653 * }
654 */
655
/*
 * FP_NOMIGRATE(label1, label2)
 *
 * label1/label2 are numeric local labels; both are defined by the macro
 * and referenced with the "f" (forward) suffix, so they must not clash
 * with other numeric labels in forward reach of the expansion site.
 *
 * t_lwp != NULL: calls thread_nomigrate().
 * t_lwp == NULL: the annulled delay slot loads t_preempt only on this
 *                path; it is then incremented and stored back inline.
 * Uses %o0 and %o1 directly.
 */
656#define	FP_NOMIGRATE(label1, label2)				\
657	ldn	[THREAD_REG + T_LWP], %o0			;\
658	brz,a,pn %o0, label1/**/f				;\
659	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
660	call	thread_nomigrate				;\
661	  nop							;\
662	ba	label2/**/f					;\
663	  nop							;\
664label1:								;\
665	inc	%o1						;\
666	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
667label2:
668
/*
 * FP_ALLOWMIGRATE(label1, label2)
 *
 * label1/label2 are numeric local labels defined by the macro and
 * referenced with the "f" (forward) suffix.
 *
 * t_lwp != NULL: calls thread_allowmigrate().
 * t_lwp == NULL: decrements t_preempt and stores it back (the store sits
 *                in the delay slot of the brnz and is always executed).
 *                If the new count is zero and the current cpu's
 *                CPU_KPRUNRUN byte is non-zero, calls kpreempt() with the
 *                current %pil in %o0 (read in the call's delay slot).
 * Uses %o0 and %o1 directly.
 */
669#define	FP_ALLOWMIGRATE(label1, label2)			\
670	ldn	[THREAD_REG + T_LWP], %o0			;\
671	brz,a,pn %o0, label1/**/f				;\
672	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
673	call thread_allowmigrate				;\
674	  nop							;\
675	ba	label2/**/f					;\
676	  nop							;\
677label1:								;\
678	dec	%o1						;\
679	brnz,pn	%o1, label2/**/f				;\
680	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
681	ldn	[THREAD_REG + T_CPU], %o0			;\
682	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
683	brz,pt	%o0, label2/**/f				;\
684	  nop							;\
685	call	kpreempt					;\
686	  rdpr	%pil, %o0					;\
687label2:
688
689/*
690 * Copy a block of storage, returning an error code if `from' or
691 * `to' takes a kernel pagefault which cannot be resolved.
692 * Returns errno value on pagefault error, 0 if all ok
693 */
694
695#if defined(lint)
696
697/* ARGSUSED */
698int
699kcopy(const void *from, void *to, size_t count)
700{ return(0); }	/* lint-only stub; the assembly ENTRY(kcopy) in the #else branch is the real code */
701
702#else	/* lint */
703
704	.seg	".text"
705	.align	4
706
707	ENTRY(kcopy)
	!
	! In:	%o0 = from, %o1 = to, %o2 = count
	!	(argument order of the lint prototype of kcopy)
	!
	! Chooses between the leaf (non-FP) copy and the FP block copy:
	!   count <= VIS_COPY_THRESHOLD			-> .kcopy_small
	!   otherwise pick hw_copy_limit_{8,4,2,1} from the low bits of
	!   (from ^ to); a zero limit or count <= limit	-> .kcopy_small
	!   else					-> .kcopy_more
	! Both paths save the old t_lofault, install their own handler
	! and branch to common copy code (.sm_do_copy / .do_copy), which
	! lies outside this section of the file.
	!
708
709	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
710	bleu,pt	%ncc, .kcopy_small		! count <= threshold: leaf (non-FP) copy
711	  xor	%o0, %o1, %o3			! are src, dst alignable?
712	btst	7, %o3				! low 3 address bits identical?
713	bz,pt	%ncc, .kcopy_8			! check for longword alignment
714	  nop
715	btst	1, %o3				! low address bit identical?
716	bz,pt	%ncc, .kcopy_2			! check for half-word
717	  nop
718	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
719	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
720	tst	%o3
721	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
722	  cmp	%o2, %o3			! if length <= limit
723	bleu,pt	%ncc, .kcopy_small		! go to small copy
724	  nop
725	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
726	  nop
727.kcopy_2:
728	btst	3, %o3				! %o3 is still from ^ to here
729	bz,pt	%ncc, .kcopy_4			! check for word alignment
730	  nop
731	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
732	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
733	tst	%o3
734	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
735	  cmp	%o2, %o3			! if length <= limit
736	bleu,pt	%ncc, .kcopy_small		! go to small copy
737	  nop
738	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
739	  nop
740.kcopy_4:
741	! already checked longword, must be word aligned
742	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
743	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
744	tst	%o3
745	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
746	  cmp	%o2, %o3			! if length <= limit
747	bleu,pt	%ncc, .kcopy_small		! go to small copy
748	  nop
749	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
750	  nop
751.kcopy_8:
752	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
753	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
754	tst	%o3
755	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
756	  cmp	%o2, %o3			! if length <= limit
757	bleu,pt	%ncc, .kcopy_small		! go to small copy
758	  nop
759	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
760	  nop
761
	! Leaf path: no register window is allocated; the previous
	! t_lofault is kept in %o4 and .sm_copyerr becomes the handler.
762.kcopy_small:
763	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
764	or	%o5, %lo(.sm_copyerr), %o5
765	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
766	membar	#Sync				! sync error barrier
767	ba,pt	%ncc, .sm_do_copy		! common code
768	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
769
	! FP path: allocate a window with room for HWCOPYFRAMESIZE bytes;
	! the previous t_lofault is kept in %l6 and .copyerr becomes the
	! handler.
770.kcopy_more:
771	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
772	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
773	or	%l7, %lo(.copyerr), %l7
774	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
775	membar	#Sync				! sync error barrier
776	ba,pt	%ncc, .do_copy			! common code
777	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
778
779
780/*
781 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
782 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
783 */
784.copyerr:
	!
	! In:	%g1 = errno value, %l6 = saved t_lofault | flag bits.
	! %l0 receives the TRAMP_FLAG bit of %l6 and selects the exit:
	! return %g1 to the caller, or jump to the saved handler.
	!
785	set	.copyerr2, %l0
786	membar	#Sync				! sync error barrier
787	stn	%l0, [THREAD_REG + T_LOFAULT]	! a fault from here on goes to .copyerr2
788	btst	FPUSED_FLAG, %l6
789	bz	%ncc, 1f			! flag clear: skip the FP restore
790	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
791
792	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
793	wr	%o2, 0, %gsr
794
795	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3	! saved %fprs
796	btst	FPRS_FEF, %o3
797	bz,pt	%icc, 4f			! FEF clear in saved %fprs: zero the regs instead
798	  nop
799
	! FEF was set in the saved %fprs: reload both quadrants from the stack
800	BLD_FPQ1Q3_FROMSTACK(%o2)
801
802	ba,pt	%ncc, 1f
803	  wr	%o3, 0, %fprs		! restore fprs
804
8054:
806	FZEROQ1Q3
807	wr	%o3, 0, %fprs		! restore fprs
808
809	!
810	! Need to cater for the different expectations of kcopy
811	! and bcopy. kcopy will *always* set a t_lofault handler
812	! If it fires, we're expected to just return the error code
813	! and *not* to invoke any existing error handler. As far as
814	! bcopy is concerned, we only set t_lofault if there was an
815	! existing lofault handler. In that case we're expected to
816	! invoke the previously existing handler after resetting the
817	! t_lofault value.
818	!
8191:
820	andn	%l6, MASK_FLAGS, %l6		! strip both flag bits, leaving the handler address
821	membar	#Sync				! sync error barrier
822	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	! invoked only after t_lofault has been restored
823	FP_ALLOWMIGRATE(5, 6)
824
825	btst	TRAMP_FLAG, %l0
826	bnz,pn	%ncc, 3f
827	  nop
828	ret
829	  restore	%g1, 0, %o0		! return errno from %g1 in the caller's %o0
830
8313:
832	!
833	! We're here via bcopy. There *must* have been an error handler
834	! in place otherwise we would have died a nasty death already.
835	!
836	jmp	%l6				! goto real handler
837	  restore	%g0, 0, %o0		! dispose of copy window
838
839/*
840 * We got here because of a fault in .copyerr.  We can't safely restore fp
841 * state, so we panic.
842 */
843fp_panic_msg:
844	.asciz	"Unable to restore fp state after copy operation"
845
846	.align	4
847.copyerr2:
848	set	fp_panic_msg, %o0
849	call	panic
850	  nop
851
852/*
853 * We got here because of a fault during a small kcopy or bcopy.
854 * No floating point registers are used by the small copies.
855 * Errno value is in %g1.
856 */
857.sm_copyerr:
8581:
859	btst	TRAMP_FLAG, %o4
860	membar	#Sync
861	andn	%o4, TRAMP_FLAG, %o4
862	bnz,pn	%ncc, 3f
863	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
864	retl
865	  mov	%g1, %o0
8663:
867	jmp	%o4				! goto real handler
868	  mov	%g0, %o0			!
869
870	SET_SIZE(kcopy)
871#endif	/* lint */
872
873
874/*
875 * Copy a block of storage - must not overlap (from + len <= to).
876 * Registers: l6 - saved t_lofault
877 * (for short copies, o4 - saved t_lofault)
878 *
879 * NOTE(review): the remark "Copy a page of memory.  Assumes double word
880 * alignment and a count >= 256." looks stale; bcopy takes any alignment/count.
881 */
882#if defined(lint)
883
/* Lint-only C stub; the real bcopy is the assembly under "#else" below. */
884/* ARGSUSED */
885void
886bcopy(const void *from, void *to, size_t count)
887{}
888
889#else	/* lint */
890
891	ENTRY(bcopy)
	!
	! Entry dispatch for bcopy.  %o0 = from, %o1 = to, %o2 = count,
	! %o3 = scratch.
	!
	! Copies of at most VIS_COPY_THRESHOLD bytes go to .bcopy_small.
	! For anything longer, the low bits of (from ^ to) pick one of the
	! hw_copy_limit_{1,2,4,8} tunables:
	!	(from ^ to) & 7 == 0	-> hw_copy_limit_8
	!	(from ^ to) & 1 != 0	-> hw_copy_limit_1
	!	(from ^ to) & 3 != 0	-> hw_copy_limit_2
	!	otherwise		-> hw_copy_limit_4
	! A limit of zero, or a count that does not exceed the limit, also
	! selects .bcopy_small; only larger copies go to .bcopy_more.
	!
	! The limits are loaded with a 32-bit "ld", hence the %icc test for
	! zero; the count comparison itself uses %ncc.
	!
892
893	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
894	bleu,pt	%ncc, .bcopy_small		! count <= threshold: small copy
895	  xor	%o0, %o1, %o3			! are src, dst alignable?
896	btst	7, %o3				! do src, dst agree mod 8?
897	bz,pt	%ncc, .bcopy_8			! check for longword alignment
898	  nop
899	btst	1, %o3				! do src, dst agree mod 2?
900	bz,pt	%ncc, .bcopy_2			! check for half-word
901	  nop
902	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
903	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
904	tst	%o3
905	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
906	  cmp	%o2, %o3			! if length <= limit
907	bleu,pt	%ncc, .bcopy_small		! go to small copy
908	  nop
909	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
910	  nop
911.bcopy_2:
912	btst	3, %o3				! do src, dst agree mod 4?
913	bz,pt	%ncc, .bcopy_4			! check for word alignment
914	  nop
915	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
916	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
917	tst	%o3
918	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
919	  cmp	%o2, %o3			! if length <= limit
920	bleu,pt	%ncc, .bcopy_small		! go to small copy
921	  nop
922	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
923	  nop
924.bcopy_4:
925	! already checked longword, must be word aligned
926	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
927	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
928	tst	%o3
929	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
930	  cmp	%o2, %o3			! if length <= limit
931	bleu,pt	%ncc, .bcopy_small		! go to small copy
932	  nop
933	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
934	  nop
935.bcopy_8:
936	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
937	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
938	tst	%o3
939	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
940	  cmp	%o2, %o3			! if length <= limit
941	bleu,pt	%ncc, .bcopy_small		! go to small copy
942	  nop
943	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
944	  nop
945
946	.align	16
947.bcopy_small:
	!
	! Small-copy prologue for bcopy.  %o4 receives the caller's
	! t_lofault.  If it is zero, no handler is installed and the copy
	! starts immediately.  Otherwise .sm_copyerr is installed as the
	! fault handler and TRAMP_FLAG is or'ed into %o4 so that a fault
	! trampolines to the saved handler.  Falls through into .sm_do_copy.
	!
	! t_lofault is a handler address (it is jumped to in .sm_copyerr),
	! so the zero test uses %ncc, as bcopy_more does; %icc would look
	! at the low 32 bits only.
	!
948	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
949	tst	%o4
950	bz,pt	%ncc, .sm_do_copy		! no handler set: just copy
951	  nop
952	sethi	%hi(.sm_copyerr), %o5
953	or	%o5, %lo(.sm_copyerr), %o5
954	membar	#Sync				! sync error barrier
955	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
956	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
957.sm_do_copy:
	!
	! Leaf (no register window) copy loops, entered from .bcopy_small
	! and from .kcopy_small.
	!	%o0 = src, %o1 = dst, %o2 = count, %o3 = scratch data,
	!	%o4 = t_lofault value to put back at .bc_sm_exit.
	!
	!	count <= SHORTCOPY		-> .bc_sm_left (moves <= 3 bytes)
	!	count >  CHKSIZE		-> .bc_med
	!	src and dst both 4-aligned	-> .bc_sm_word
	!	otherwise			-> byte loop .bc_sm_notalign4
	!
	! Every path ends at .bc_sm_exit, which returns 0 in %o0.
	!
958	cmp	%o2, SHORTCOPY		! check for really short case
959	bleu,pt	%ncc, .bc_sm_left	!
960	  cmp	%o2, CHKSIZE		! check for medium length cases
961	bgu,pn	%ncc, .bc_med		!
962	  or	%o0, %o1, %o3		! prepare alignment check
963	andcc	%o3, 0x3, %g0		! test for alignment
964	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
	! The "sub" below sits in the delay slot of the branch above, so it
	! also runs when the branch to .bc_sm_word is taken.
965.bc_sm_movebytes:
966	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
967.bc_sm_notalign4:
968	ldub	[%o0], %o3		! read byte
969	stb	%o3, [%o1]		! write byte
970	subcc	%o2, 4, %o2		! reduce count by 4
971	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
972	add	%o0, 4, %o0		! advance SRC by 4
973	stb	%o3, [%o1 + 1]
974	ldub	[%o0 - 2], %o3
975	add	%o1, 4, %o1		! advance DST by 4
976	stb	%o3, [%o1 - 2]
977	ldub	[%o0 - 1], %o3
978	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
979	  stb	%o3, [%o1 - 1]
980	add	%o2, 3, %o2		! restore count
	! Move the final 0-3 bytes one at a time.
981.bc_sm_left:
982	tst	%o2
983	bz,pt	%ncc, .bc_sm_exit	! check for zero length
984	  deccc	%o2			! reduce count for cc test
985	ldub	[%o0], %o3		! move one byte
986	bz,pt	%ncc, .bc_sm_exit
987	  stb	%o3, [%o1]
988	ldub	[%o0 + 1], %o3		! move another byte
989	deccc	%o2			! check for more
990	bz,pt	%ncc, .bc_sm_exit
991	  stb	%o3, [%o1 + 1]
992	ldub	[%o0 + 2], %o3		! move final byte
993	ba,pt   %ncc, .bc_sm_exit
994	  stb	%o3, [%o1 + 2]
995	.align	16
996	nop				! instruction alignment
997					! see discussion at start of file
	! Word loop: 8 bytes (two words) per iteration.  On entry at
	! .bc_sm_wordx the first word is already in %o3.
998.bc_sm_words:
999	lduw	[%o0], %o3		! read word
1000.bc_sm_wordx:
1001	subcc	%o2, 8, %o2		! update count
1002	stw	%o3, [%o1]		! write word
1003	add	%o0, 8, %o0		! update SRC
1004	lduw	[%o0 - 4], %o3		! read word
1005	add	%o1, 8, %o1		! update DST
1006	bgt,pt	%ncc, .bc_sm_words	! loop til done
1007	  stw	%o3, [%o1 - 4]		! write word
1008	addcc	%o2, 7, %o2		! restore count
1009	bz,pt	%ncc, .bc_sm_exit
1010	  deccc	%o2
1011	bz,pt	%ncc, .bc_sm_byte
	! As above, the "subcc" is the delay slot of the preceding branch as
	! well as the first instruction of the halfword loop.
1012.bc_sm_half:
1013	  subcc	%o2, 2, %o2		! reduce count by 2
1014	add	%o0, 2, %o0		! advance SRC by 2
1015	lduh	[%o0 - 2], %o3		! read half word
1016	add	%o1, 2, %o1		! advance DST by 2
1017	bgt,pt	%ncc, .bc_sm_half	! loop til done
1018	  sth	%o3, [%o1 - 2]		! write half word
1019	addcc	%o2, 1, %o2		! restore count
1020	bz,pt	%ncc, .bc_sm_exit
1021	  nop
1022.bc_sm_byte:
1023	ldub	[%o0], %o3		! move the last byte
1024	ba,pt   %ncc, .bc_sm_exit
1025	  stb	%o3, [%o1]
1026
	! src and dst are both word aligned; %o2 has already been reduced
	! by 3 in the delay slot at .bc_sm_movebytes.
1027.bc_sm_word:
1028	subcc	%o2, 4, %o2		! update count
1029	bgt,pt	%ncc, .bc_sm_wordx
1030	  lduw	[%o0], %o3		! read word
1031	addcc	%o2, 3, %o2		! restore count
1032	bz,pt	%ncc, .bc_sm_exit
1033	  stw	%o3, [%o1]		! write word
1034	deccc	%o2			! reduce count for cc test
1035	ldub	[%o0 + 4], %o3		! load one byte
1036	bz,pt	%ncc, .bc_sm_exit
1037	  stb	%o3, [%o1 + 4]		! store one byte
1038	ldub	[%o0 + 5], %o3		! load second byte
1039	deccc	%o2
1040	bz,pt	%ncc, .bc_sm_exit
1041	  stb	%o3, [%o1 + 5]		! store second byte
1042	ldub	[%o0 + 6], %o3		! load third byte
1043	stb	%o3, [%o1 + 6]		! store third byte
	! Common exit.  If t_lofault is currently non-zero, put back the
	! value saved in %o4 with TRAMP_FLAG stripped; then return 0.
1044.bc_sm_exit:
1045	ldn     [THREAD_REG + T_LOFAULT], %o3
1046	brz,pt  %o3, .bc_sm_done	! nothing installed, nothing to undo
1047	  nop
1048	membar	#Sync				! sync error barrier
1049	andn	%o4, TRAMP_FLAG, %o4
1050	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1051.bc_sm_done:
1052	retl
1053	  mov	%g0, %o0		! return 0
1054
1055	.align 16
1056.bc_med:
	!
	! Medium-length leaf copy (count > CHKSIZE).  The low bits of
	! (src ^ dst) choose the widest unit both pointers can be aligned to:
	!	bit 0 set		-> byte loop (.bc_sm_movebytes)
	!	bit 1 set		-> halfword loop (.bc_med_half)
	!	bit 2 set		-> word loop (.bc_med_word)
	!	all three clear		-> long word loop (.bc_med_long)
	! Each variant first copies single bytes (and, for the long word
	! case, possibly one word) until src is aligned, runs a 4x unrolled
	! main loop, then finishes through .bc_sm_half / .bc_sm_byte /
	! .bc_sm_exit.  %o0 = src, %o1 = dst, %o2 = count, %o3 = scratch.
	!
1057	xor	%o0, %o1, %o3		! setup alignment check
1058	btst	1, %o3
1059	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
1060	  nop
1061	btst	3, %o3
1062	bnz,pt	%ncc, .bc_med_half	! halfword aligned
1063	  nop
1064	btst	7, %o3
1065	bnz,pt	%ncc, .bc_med_word	! word aligned
1066	  nop
1067.bc_med_long:
1068	btst	3, %o0			! check for
1069	bz,pt	%ncc, .bc_med_long1	! word alignment
1070	  nop
1071.bc_med_long0:
1072	ldub	[%o0], %o3		! load one byte
1073	inc	%o0
1074	stb	%o3,[%o1]		! store byte
1075	inc	%o1
1076	btst	3, %o0
1077	bnz,pt	%ncc, .bc_med_long0
1078	  dec	%o2
1079.bc_med_long1:			! word aligned
1080	btst	7, %o0			! check for long word
1081	bz,pt	%ncc, .bc_med_long2
1082	  nop
1083	lduw	[%o0], %o3		! load word
1084	add	%o0, 4, %o0		! advance SRC by 4
1085	stw	%o3, [%o1]		! store word
1086	add	%o1, 4, %o1		! advance DST by 4
1087	sub	%o2, 4, %o2		! reduce count by 4
1088!
1089!  Now long word aligned and have at least 32 bytes to move
1090!
1091.bc_med_long2:
1092	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1093.bc_med_lmove:
1094	ldx	[%o0], %o3		! read long word
1095	stx	%o3, [%o1]		! write long word
1096	subcc	%o2, 32, %o2		! reduce count by 32
1097	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
1098	add	%o0, 32, %o0		! advance SRC by 32
1099	stx	%o3, [%o1 + 8]
1100	ldx	[%o0 - 16], %o3
1101	add	%o1, 32, %o1		! advance DST by 32
1102	stx	%o3, [%o1 - 16]
1103	ldx	[%o0 - 8], %o3
1104	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
1105	  stx	%o3, [%o1 - 8]
1106	addcc	%o2, 24, %o2		! restore count to long word offset
1107	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
1108	  nop
1109.bc_med_lword:
1110	ldx	[%o0], %o3		! read long word
1111	subcc	%o2, 8, %o2		! reduce count by 8
1112	stx	%o3, [%o1]		! write long word
1113	add	%o0, 8, %o0		! advance SRC by 8
1114	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
1115	  add	%o1, 8, %o1		! advance DST by 8
1116.bc_med_lextra:
1117	addcc	%o2, 7, %o2		! restore rest of count
1118	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1119	  deccc	%o2
1120	bz,pt	%ncc, .bc_sm_byte	! exactly one byte left
1121	  nop
1122	ba,pt	%ncc, .bc_sm_half	! two or more bytes left
1123	  nop
1124
1125	.align 16
1126.bc_med_word:
1127	btst	3, %o0			! check for
1128	bz,pt	%ncc, .bc_med_word1	! word alignment
1129	  nop
1130.bc_med_word0:
1131	ldub	[%o0], %o3		! load one byte
1132	inc	%o0
1133	stb	%o3,[%o1]		! store byte
1134	inc	%o1
1135	btst	3, %o0
1136	bnz,pt	%ncc, .bc_med_word0
1137	  dec	%o2
1138!
1139!  Now word aligned and have at least 36 bytes to move
1140!
1141.bc_med_word1:
1142	sub	%o2, 15, %o2		! adjust count to allow cc zero test
1143.bc_med_wmove:
1144	lduw	[%o0], %o3		! read word
1145	stw	%o3, [%o1]		! write word
1146	subcc	%o2, 16, %o2		! reduce count by 16
1147	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
1148	add	%o0, 16, %o0		! advance SRC by 16
1149	stw	%o3, [%o1 + 4]
1150	lduw	[%o0 - 8], %o3
1151	add	%o1, 16, %o1		! advance DST by 16
1152	stw	%o3, [%o1 - 8]
1153	lduw	[%o0 - 4], %o3
1154	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
1155	  stw	%o3, [%o1 - 4]
1156	addcc	%o2, 12, %o2		! restore count to word offset
1157	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
1158	  nop
1159.bc_med_word2:
1160	lduw	[%o0], %o3		! read word
1161	subcc	%o2, 4, %o2		! reduce count by 4
1162	stw	%o3, [%o1]		! write word
1163	add	%o0, 4, %o0		! advance SRC by 4
1164	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
1165	  add	%o1, 4, %o1		! advance DST by 4
1166.bc_med_wextra:
1167	addcc	%o2, 3, %o2		! restore rest of count
1168	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1169	  deccc	%o2
1170	bz,pt	%ncc, .bc_sm_byte	! exactly one byte left
1171	  nop
1172	ba,pt	%ncc, .bc_sm_half	! two or more bytes left
1173	  nop
1174
1175	.align 16
1176.bc_med_half:
1177	btst	1, %o0			! check for
1178	bz,pt	%ncc, .bc_med_half1	! half word alignment
1179	  nop
1180	ldub	[%o0], %o3		! load one byte
1181	inc	%o0
1182	stb	%o3,[%o1]		! store byte
1183	inc	%o1
1184	dec	%o2
1185!
1186!  Now half word aligned and have at least 38 bytes to move
1187!
1188.bc_med_half1:
1189	sub	%o2, 7, %o2		! adjust count to allow cc zero test
1190.bc_med_hmove:
1191	lduh	[%o0], %o3		! read half word
1192	sth	%o3, [%o1]		! write half word
1193	subcc	%o2, 8, %o2		! reduce count by 8
1194	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
1195	add	%o0, 8, %o0		! advance SRC by 8
1196	sth	%o3, [%o1 + 2]
1197	lduh	[%o0 - 4], %o3
1198	add	%o1, 8, %o1		! advance DST by 8
1199	sth	%o3, [%o1 - 4]
1200	lduh	[%o0 - 2], %o3
1201	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
1202	  sth	%o3, [%o1 - 2]
1203	addcc	%o2, 7, %o2		! restore count
1204	bz,pt	%ncc, .bc_sm_exit
1205	  deccc	%o2
1206	bz,pt	%ncc, .bc_sm_byte	! exactly one byte left
1207	  nop
1208	ba,pt	%ncc, .bc_sm_half	! two or more bytes left
1209	  nop
1210
1211	SET_SIZE(bcopy)
1212
1213/*
1214 * The _more entry points are not intended to be used directly by
1215 * any caller from outside this file.  They are provided to allow
1216 * profiling and dtrace of the portions of the copy code that uses
1217 * the floating point registers.
1218 * This entry is particularly important as DTRACE (at least as of
1219 * 4/2004) does not support leaf functions.
1220 */
1221
1222	ENTRY(bcopy_more)
1223.bcopy_more:
	!
	! Large-copy path of bcopy: takes a register window and copies with
	! the FP registers and 64-byte block stores (stda ... ASI_BLK_P).
	! After the save: %i0 = src (REALSRC), %i1 = dst (DST), %i2 = count
	! (CNT); %l6 = saved t_lofault plus TRAMP_FLAG / FPUSED_FLAG bits.
	! If a t_lofault handler was set on entry, .copyerr is installed in
	! its place and TRAMP_FLAG is recorded in %l6.  Returns 0 in %o0.
	!
1224	prefetch [%o0], #n_reads
1225	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1226	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
1227	tst	%l6
1228	bz,pt	%ncc, .do_copy
1229	  nop
1230	sethi	%hi(.copyerr), %o2
1231	or	%o2, %lo(.copyerr), %o2
1232	membar	#Sync				! sync error barrier
1233	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
1234	!
1235	! We've already captured whether t_lofault was zero on entry.
1236	! We need to mark ourselves as being from bcopy since both
1237	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1238	! and the saved lofault was zero, we won't reset lofault on
1239	! returning.
1240	!
1241	or	%l6, TRAMP_FLAG, %l6
1242
1243/*
1244 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1245 * Also, use of FP registers has been tested to be enabled
1246 */
1247.do_copy:
	! Also entered from .kcopy_more.  The original %fprs is saved in the
	! frame.  If FPRS_FEF was clear, the FPU is simply enabled (annulled
	! delay slot, runs only when the branch is taken); otherwise the
	! BST_FPQ1Q3_TOSTACK macro is invoked first (defined elsewhere;
	! presumably it saves the FP registers used below - confirm).
1248	FP_NOMIGRATE(6, 7)
1249
1250	rd	%fprs, %o2		! check for unused fp
1251	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1252	btst	FPRS_FEF, %o2
1253	bz,a,pt	%icc, .do_blockcopy
1254	  wr	%g0, FPRS_FEF, %fprs
1255
1256	BST_FPQ1Q3_TOSTACK(%o2)
1257
1258.do_blockcopy:
1259	rd	%gsr, %o2
1260	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
1261	or	%l6, FPUSED_FLAG, %l6	! tell the fault handlers FP is in use
1262
1263#define	REALSRC	%i0
1264#define	DST	%i1
1265#define	CNT	%i2
1266#define	SRC	%i3
1267#define	TMP	%i5
1268
	! Copy single bytes until DST is VIS_BLOCKSIZE aligned.
1269	andcc	DST, VIS_BLOCKSIZE - 1, TMP
1270	bz,pt	%ncc, 2f		! DST already block aligned
1271	  neg	TMP
1272	add	TMP, VIS_BLOCKSIZE, TMP
1273
1274	! TMP = bytes required to align DST on FP_BLOCK boundary
1275	! Using SRC as a tmp here
1276	cmp	TMP, 3
1277	bleu,pt	%ncc, 1f
1278	  sub	CNT,TMP,CNT		! adjust main count
1279	sub	TMP, 3, TMP		! adjust for end of loop test
1280.bc_blkalign:
1281	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
1282	stb	SRC, [DST]
1283	subcc	TMP, 4, TMP
1284	ldub	[REALSRC + 1], SRC
1285	add	REALSRC, 4, REALSRC
1286	stb	SRC, [DST + 1]
1287	ldub	[REALSRC - 2], SRC
1288	add	DST, 4, DST
1289	stb	SRC, [DST - 2]
1290	ldub	[REALSRC - 1], SRC
1291	bgu,pt	%ncc, .bc_blkalign
1292	  stb	SRC, [DST - 1]
1293
1294	addcc	TMP, 3, TMP		! restore count adjustment
1295	bz,pt	%ncc, 2f		! no bytes left?
1296	  nop
12971:	ldub	[REALSRC], SRC
1298	inc	REALSRC
1299	inc	DST
1300	deccc	TMP
1301	bgu	%ncc, 1b
1302	  stb	SRC, [DST - 1]
1303
13042:
1305	membar	#StoreLoad
1306	andn	REALSRC, 0x7, SRC
1307
1308	! SRC - 8-byte aligned
1309	! DST - 64-byte aligned
	! Prime the pipeline: load the first source block into %f0-%f14,
	! start realigning it into %f32-%f42 (faligndata uses the offset set
	! by alignaddr), and load the first double of the next block.
1310	ldd	[SRC], %f0
1311	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1312	alignaddr REALSRC, %g0, %g0
1313	ldd	[SRC + 0x08], %f2
1314	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1315	faligndata %f0, %f2, %f32
1316	ldd	[SRC + 0x10], %f4
1317	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1318	faligndata %f2, %f4, %f34
1319	ldd	[SRC + 0x18], %f6
1320	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1321	faligndata %f4, %f6, %f36
1322	ldd	[SRC + 0x20], %f8
1323	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1324	faligndata %f6, %f8, %f38
1325	ldd	[SRC + 0x28], %f10
1326	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1327	faligndata %f8, %f10, %f40
1328	ldd	[SRC + 0x30], %f12
1329	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1330	faligndata %f10, %f12, %f42
1331	ldd	[SRC + 0x38], %f14
1332	ldd	[SRC + VIS_BLOCKSIZE], %f0
1333	sub	CNT, VIS_BLOCKSIZE, CNT
1334	add	SRC, VIS_BLOCKSIZE, SRC
1335	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1336	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1337	ba,pt	%ncc, 1f
1338	  prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1339	.align	32
	! Main loop: finish aligning the previous block (%f44, %f46), block
	! store it, and meanwhile load and align the next block.  Runs while
	! CNT > VIS_BLOCKSIZE + 8.
13401:
1341	ldd	[SRC + 0x08], %f2
1342	faligndata %f12, %f14, %f44
1343	ldd	[SRC + 0x10], %f4
1344	faligndata %f14, %f0, %f46
1345	stda	%f32, [DST]ASI_BLK_P
1346	ldd	[SRC + 0x18], %f6
1347	faligndata %f0, %f2, %f32
1348	ldd	[SRC + 0x20], %f8
1349	faligndata %f2, %f4, %f34
1350	ldd	[SRC + 0x28], %f10
1351	faligndata %f4, %f6, %f36
1352	ldd	[SRC + 0x30], %f12
1353	faligndata %f6, %f8, %f38
1354	sub	CNT, VIS_BLOCKSIZE, CNT
1355	ldd	[SRC + 0x38], %f14
1356	faligndata %f8, %f10, %f40
1357	add	DST, VIS_BLOCKSIZE, DST
1358	ldd	[SRC + VIS_BLOCKSIZE], %f0
1359	faligndata %f10, %f12, %f42
1360	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1361	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1362	add	SRC, VIS_BLOCKSIZE, SRC
1363	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1364	cmp	CNT, VIS_BLOCKSIZE + 8
1365	bgu,pt	%ncc, 1b
1366	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1367
	! If exactly one block remains and REALSRC is 8-byte aligned, take
	! the straight-copy path at 2: below; otherwise store the pending
	! block at 3: and finish the remainder one byte at a time.
1368	! only if REALSRC & 0x7 is 0
1369	cmp	CNT, VIS_BLOCKSIZE
1370	bne	%ncc, 3f
1371	  andcc	REALSRC, 0x7, %g0
1372	bz,pt	%ncc, 2f
1373	  nop
13743:
1375	faligndata %f12, %f14, %f44
1376	faligndata %f14, %f0, %f46
1377	stda	%f32, [DST]ASI_BLK_P
1378	add	DST, VIS_BLOCKSIZE, DST
1379	ba,pt	%ncc, 3f
1380	  nop
13812:
1382	ldd	[SRC + 0x08], %f2
1383	fsrc1	%f12, %f44
1384	ldd	[SRC + 0x10], %f4
1385	fsrc1	%f14, %f46
1386	stda	%f32, [DST]ASI_BLK_P
1387	ldd	[SRC + 0x18], %f6
1388	fsrc1	%f0, %f32
1389	ldd	[SRC + 0x20], %f8
1390	fsrc1	%f2, %f34
1391	ldd	[SRC + 0x28], %f10
1392	fsrc1	%f4, %f36
1393	ldd	[SRC + 0x30], %f12
1394	fsrc1	%f6, %f38
1395	ldd	[SRC + 0x38], %f14
1396	fsrc1	%f8, %f40
1397	sub	CNT, VIS_BLOCKSIZE, CNT
1398	add	DST, VIS_BLOCKSIZE, DST
1399	add	SRC, VIS_BLOCKSIZE, SRC
1400	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1401	fsrc1	%f10, %f42
1402	fsrc1	%f12, %f44
1403	fsrc1	%f14, %f46
1404	stda	%f32, [DST]ASI_BLK_P
1405	add	DST, VIS_BLOCKSIZE, DST
1406	ba,a,pt	%ncc, .bcb_exit
1407	  nop
1408
14093:	tst	CNT
1410	bz,a,pt	%ncc, .bcb_exit
1411	  nop
1412
	! Byte loop for whatever is left after the last block store.
14135:	ldub	[REALSRC], TMP
1414	inc	REALSRC
1415	inc	DST
1416	deccc	CNT
1417	bgu	%ncc, 5b
1418	  stb	TMP, [DST - 1]
	! Epilogue: restore %gsr and %fprs from the frame.  If FP was enabled
	! on entry the BLD_FPQ1Q3_FROMSTACK macro is invoked (presumably the
	! inverse of BST_FPQ1Q3_TOSTACK - confirm), otherwise FZEROQ1Q3.
	! Then put back t_lofault with the flag bits cleared and return 0.
1419.bcb_exit:
1420	membar	#Sync
1421
1422	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
1423	wr	%o2, 0, %gsr
1424
1425	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1426	btst	FPRS_FEF, %o3
1427	bz,pt	%icc, 4f
1428	  nop
1429
1430	BLD_FPQ1Q3_FROMSTACK(%o2)
1431
1432	ba,pt	%ncc, 2f
1433	  wr	%o3, 0, %fprs		! restore fprs
14344:
1435	FZEROQ1Q3
1436	wr	%o3, 0, %fprs		! restore fprs
14372:
1438	membar	#Sync				! sync error barrier
1439	andn	%l6, MASK_FLAGS, %l6
1440	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1441	FP_ALLOWMIGRATE(5, 6)
1442	ret
1443	  restore	%g0, 0, %o0
1444
1445	SET_SIZE(bcopy_more)
1446
1447#endif	/* lint */
1448
1449/*
1450 * Block copy with possibly overlapped operands.
1451 */
1452
1453#if defined(lint)
1454
/* Lint-only C stub; the real ovbcopy is the assembly under "#else" below. */
1455/*ARGSUSED*/
1456void
1457ovbcopy(const void *from, void *to, size_t count)
1458{}
1459
1460#else	/* lint */
1461
1462	ENTRY(ovbcopy)
	!
	! Leaf routine.  %o0 = from, %o1 = to, %o2 = count, %o3 = scratch.
	! Returns at once when count is zero.  If count <= |from - to| the
	! regions do not overlap and control branches (not calls) to bcopy.
	! Otherwise the bytes are moved one at a time: front to back unless
	! from < to, in which case back to front.
	!
1463	tst	%o2			! check count
	! Annulled branch: the subcc in the delay slot runs only when the
	! branch to 1: is taken.
1464	bgu,a	%ncc, 1f		! nothing to do or bad arguments
1465	  subcc	%o0, %o1, %o3		! difference of from and to address
1466
1467	retl				! return
1468	  nop
14691:
1470	bneg,a	%ncc, 2f
1471	  neg	%o3			! if < 0, make it positive
14722:	cmp	%o2, %o3		! cmp size and abs(from - to)
1473	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1474	  .empty				!   no overlap
1475	  cmp	%o0, %o1		! compare from and to addresses
1476	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1477	  nop
1478	!
1479	! Copy forwards.
1480	!
1481.ov_fwd:
1482	ldub	[%o0], %o3		! read from address
1483	inc	%o0			! inc from address
1484	stb	%o3, [%o1]		! write to address
1485	deccc	%o2			! dec count
1486	bgu	%ncc, .ov_fwd		! loop till done
1487	  inc	%o1			! inc to address
1488
1489	retl				! return
1490	  nop
1491	!
1492	! Copy backwards.
1493	!
	! %o2 doubles as the byte index, counting down from count - 1 to 0.
1494.ov_bkwd:
1495	deccc	%o2			! dec count
1496	ldub	[%o0 + %o2], %o3	! get byte at end of src
1497	bgu	%ncc, .ov_bkwd		! loop till done
1498	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1499
1500	retl				! return
1501	  nop
1502
1503	SET_SIZE(ovbcopy)
1504
1505#endif	/* lint */
1506
1507
1508/*
1509 * hwblkpagecopy()
1510 *
1511 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1512 * has already disabled kernel preemption and has checked
1513 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1514 */
1515#ifdef lint
/* Lint-only C stub; the real hwblkpagecopy is the assembly under "#else". */
1516/*ARGSUSED*/
1517void
1518hwblkpagecopy(const void *src, void *dst)
1519{ }
1520#else /* lint */
1521	ENTRY(hwblkpagecopy)
	!
	! Copies PAGESIZE bytes from %i0 to %i1 with FP registers and
	! 64-byte block stores.  No t_lofault handler is installed here and
	! %gsr is not touched: the data moves with fmovd/fsrc1 rather than
	! faligndata, so src is used as given (no realignment).
	! REALSRC/DST/CNT/SRC are the %i0/%i1/%i2/%i3 aliases #defined in
	! bcopy_more above.  Returns 0 in %o0.
	!
1522	! get another window w/space for three aligned blocks of saved fpregs
1523	prefetch [%o0], #n_reads
1524	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1525
1526	! %i0 - source address (arg)
1527	! %i1 - destination address (arg)
1528	! %i2 - length of region (not arg)
1529	! %l0 - saved fprs
1530	! %l1 - pointer to saved fpregs
1531
1532	rd	%fprs, %l0		! check for unused fp
1533	btst	FPRS_FEF, %l0
	! Annulled branch: if FP was not enabled, just enable it (the delay
	! slot runs only when the branch is taken) and skip the save macro.
1534	bz,a,pt	%icc, 1f
1535	  wr	%g0, FPRS_FEF, %fprs
1536
1537	BST_FPQ1Q3_TOSTACK(%l1)
1538
15391:	set	PAGESIZE, CNT
1540	mov	REALSRC, SRC
1541
	! Prime the pipeline: first block into %f0-%f14, copied on into
	! %f32-%f42, plus the first double of the second block.
1542	ldd	[SRC], %f0
1543	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1544	ldd	[SRC + 0x08], %f2
1545	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1546	fmovd	%f0, %f32
1547	ldd	[SRC + 0x10], %f4
1548	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1549	fmovd	%f2, %f34
1550	ldd	[SRC + 0x18], %f6
1551	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1552	fmovd	%f4, %f36
1553	ldd	[SRC + 0x20], %f8
1554	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1555	fmovd	%f6, %f38
1556	ldd	[SRC + 0x28], %f10
1557	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1558	fmovd	%f8, %f40
1559	ldd	[SRC + 0x30], %f12
1560	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1561	fmovd	%f10, %f42
1562	ldd	[SRC + 0x38], %f14
1563	ldd	[SRC + VIS_BLOCKSIZE], %f0
1564	sub	CNT, VIS_BLOCKSIZE, CNT
1565	add	SRC, VIS_BLOCKSIZE, SRC
1566	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1567	ba,pt	%ncc, 2f
1568	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read	! (delay slot)
1569	.align	32
	! Main loop: store the previous block while loading the next one.
	! Runs while CNT > VIS_BLOCKSIZE + 8.
15702:
1571	ldd	[SRC + 0x08], %f2
1572	fmovd	%f12, %f44
1573	ldd	[SRC + 0x10], %f4
1574	fmovd	%f14, %f46
1575	stda	%f32, [DST]ASI_BLK_P
1576	ldd	[SRC + 0x18], %f6
1577	fmovd	%f0, %f32
1578	ldd	[SRC + 0x20], %f8
1579	fmovd	%f2, %f34
1580	ldd	[SRC + 0x28], %f10
1581	fmovd	%f4, %f36
1582	ldd	[SRC + 0x30], %f12
1583	fmovd	%f6, %f38
1584	ldd	[SRC + 0x38], %f14
1585	fmovd	%f8, %f40
1586	ldd	[SRC + VIS_BLOCKSIZE], %f0
1587	fmovd	%f10, %f42
1588	sub	CNT, VIS_BLOCKSIZE, CNT
1589	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1590	add	DST, VIS_BLOCKSIZE, DST
1591	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1592	add	SRC, VIS_BLOCKSIZE, SRC
1593	cmp	CNT, VIS_BLOCKSIZE + 8
1594	bgu,pt	%ncc, 2b
1595	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1596
	! Drain: store the pending block, then load and store the last one
	! without reading beyond it.
1597	! trailing block
1598	ldd	[SRC + 0x08], %f2
1599	fsrc1	%f12, %f44
1600	ldd	[SRC + 0x10], %f4
1601	fsrc1	%f14, %f46
1602	stda	%f32, [DST]ASI_BLK_P
1603	ldd	[SRC + 0x18], %f6
1604	fsrc1	%f0, %f32
1605	ldd	[SRC + 0x20], %f8
1606	fsrc1	%f2, %f34
1607	ldd	[SRC + 0x28], %f10
1608	fsrc1	%f4, %f36
1609	ldd	[SRC + 0x30], %f12
1610	fsrc1	%f6, %f38
1611	ldd	[SRC + 0x38], %f14
1612	fsrc1	%f8, %f40
1613	sub	CNT, VIS_BLOCKSIZE, CNT
1614	add	DST, VIS_BLOCKSIZE, DST
1615	add	SRC, VIS_BLOCKSIZE, SRC
1616	fsrc1	%f10, %f42
1617	fsrc1	%f12, %f44
1618	fsrc1	%f14, %f46
1619	stda	%f32, [DST]ASI_BLK_P
1620
1621	membar	#Sync
1622
	! If FP was enabled on entry, invoke BLD_FPQ1Q3_FROMSTACK (presumably
	! the inverse of BST_FPQ1Q3_TOSTACK - confirm); otherwise FZEROQ1Q3.
	! Either way the original %fprs is written back before returning.
1623	btst	FPRS_FEF, %l0
1624	bz,pt	%icc, 2f
1625	  nop
1626
1627	BLD_FPQ1Q3_FROMSTACK(%l3)
1628	ba	3f
1629	  nop
1630
16312:	FZEROQ1Q3
1632
16333:	wr	%l0, 0, %fprs		! restore fprs
1634	ret
1635	  restore	%g0, 0, %o0
1636
1637	SET_SIZE(hwblkpagecopy)
1638#endif	/* lint */
1639
1640
1641/*
1642 * Transfer data to and from user space -
1643 * Note that these routines can cause faults
1644 * It is assumed that the kernel has nothing at
1645 * less than KERNELBASE in the virtual address space.
1646 *
1647 * Note that copyin(9F) and copyout(9F) are part of the
1648 * DDI/DKI which specifies that they return '-1' on "errors."
1649 *
1650 * Sigh.
1651 *
1652 * So there are two extremely similar routines - xcopyin() and xcopyout()
1653 * which return the errno that we've faithfully computed.  This
1654 * allows other callers (e.g. uiomove(9F)) to work correctly.
1655 * Given that these are used pretty heavily, we expand the calling
1656 * sequences inline for all flavours (rather than making wrappers).
1657 *
1658 * There are also stub routines for xcopyout_little and xcopyin_little,
1659 * which currently are intended to handle requests of <= 16 bytes from
1660 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1661 * is left as an exercise...
1662 */
1663
1664/*
1665 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1666 *
1667 * General theory of operation:
1668 *
1669 * The only difference between copy{in,out} and
1670 * xcopy{in,out} is in the error handling routine they invoke
1671 * when a memory access error occurs. xcopyOP returns the errno
1672 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1673 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1674 * if they are called with a fault handler already in place. That flag
1675 * causes the default handlers to trampoline to the previous handler
1676 * upon an error.
1677 *
1678 * None of the copyops routines grab a window until it's decided that
1679 * we need to do a HW block copy operation. This saves a window
1680 * spill/fill when we're called during socket ops. The typical IO
1681 * path won't cause spill/fill traps.
1682 *
1683 * This code uses a set of 4 limits for the maximum size that will
1684 * be copied given a particular input/output address alignment.
1685 * If the value for a particular limit is zero, the copy will be performed
1686 * by the plain copy loops rather than FPBLK.
1687 *
1688 * See the description of bcopy above for more details of the
1689 * data copying algorithm and the default limits.
1690 *
1691 */
1692
1693/*
1694 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1695 */
1696
1697#if defined(lint)
1698
1699
1700#else	/* lint */
1701/*
1702 * We save the arguments in the following registers in case of a fault:
1703 *	kaddr - %l1
1704 *	uaddr - %l2
1705 *	count - %l3
1706 */
/* Window-based (FP) copy path: original arguments, restored by copyio_fault. */
1707#define SAVE_SRC	%l1
1708#define SAVE_DST	%l2
1709#define SAVE_COUNT	%l3
1710
/* Leaf (small copy) path: .sm_do_copyout stashes %o0/%o1/%o2 in these. */
1711#define SM_SAVE_SRC		%g4
1712#define SM_SAVE_DST		%g5
1713#define SM_SAVE_COUNT		%o5
/* copyio_fault copies the errno it receives in %g1 into this register. */
1714#define ERRNO		%l5
1715
1716
/* copyio_fault jumps to the address held in this register when it is done. */
1717#define REAL_LOFAULT	%l4
1718/*
1719 * Generic copyio fault handler.  This is the first line of defense when a
1720 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1721 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1722 * This allows us to share common code for all the flavors of the copy
1723 * operations, including the _noerr versions.
1724 *
1725 * Note that this function will restore the original input parameters before
1726 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1727 * member of the t_copyop structure, if needed.
1728 */
1729	ENTRY(copyio_fault)
	!
	! In:  %g1 = errno, %l6 = saved t_lofault (plus FPUSED_FLAG if the
	!      FP path was active), SAVE_SRC/SAVE_DST/SAVE_COUNT = original
	!      arguments, REAL_LOFAULT = address to continue at.
	! Out: ERRNO = errno, %i0-%i2 = original arguments, t_lofault put
	!      back to the saved value, control transferred to REAL_LOFAULT.
	! When FPUSED_FLAG is set, %gsr and %fprs are restored from the
	! frame first, and the Q2Q4 macros are invoked to restore or zero
	! FP registers (macros defined elsewhere - confirm exact effect).
	!
1730	membar	#Sync
1731	mov	%g1,ERRNO			! save errno in ERRNO
1732	btst	FPUSED_FLAG, %l6
1733	bz	%ncc, 1f		! FP not in use: skip FP restore
1734	  nop
1735
1736	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1737	wr	%o2, 0, %gsr    	! restore gsr
1738
1739	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1740	btst	FPRS_FEF, %o3
1741	bz,pt	%icc, 4f		! FP was off on entry: just zero regs
1742	  nop
1743
1744	BLD_FPQ2Q4_FROMSTACK(%o2)
1745
1746	ba,pt	%ncc, 1f
1747	  wr	%o3, 0, %fprs   	! restore fprs
1748
17494:
1750	FZEROQ2Q4
1751	wr	%o3, 0, %fprs   	! restore fprs
1752
17531:
1754	andn	%l6, FPUSED_FLAG, %l6
1755	membar	#Sync
1756	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1757	FP_ALLOWMIGRATE(5, 6)
1758
1759	mov	SAVE_SRC, %i0
1760	mov	SAVE_DST, %i1
1761	jmp	REAL_LOFAULT
1762	  mov	SAVE_COUNT, %i2		! (delay slot) third original argument
1763
1764	SET_SIZE(copyio_fault)
1765
1766
1767#endif
1768
1769#if defined(lint)
1770
/* Lint-only C stub; the real copyout is the assembly under "#else" below. */
1771/*ARGSUSED*/
1772int
1773copyout(const void *kaddr, void *uaddr, size_t count)
1774{ return (0); }
1775
1776#else	/* lint */
1777
1778	ENTRY(copyout)
1779
1780	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
1781	bleu,pt	%ncc, .copyout_small		! go to larger cases
1782	  xor	%o0, %o1, %o3			! are src, dst alignable?
1783	btst	7, %o3				!
1784	bz,pt	%ncc, .copyout_8		! check for longword alignment
1785	  nop
1786	btst	1, %o3				!
1787	bz,pt	%ncc, .copyout_2		! check for half-word
1788	  nop
1789	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
1790	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1791	tst	%o3
1792	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1793	  cmp	%o2, %o3			! if length <= limit
1794	bleu,pt	%ncc, .copyout_small		! go to small copy
1795	  nop
1796	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1797	  nop
1798.copyout_2:
1799	btst	3, %o3				!
1800	bz,pt	%ncc, .copyout_4		! check for word alignment
1801	  nop
1802	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
1803	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1804	tst	%o3
1805	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1806	  cmp	%o2, %o3			! if length <= limit
1807	bleu,pt	%ncc, .copyout_small		! go to small copy
1808	  nop
1809	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1810	  nop
1811.copyout_4:
1812	! already checked longword, must be word aligned
1813	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
1814	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1815	tst	%o3
1816	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1817	  cmp	%o2, %o3			! if length <= limit
1818	bleu,pt	%ncc, .copyout_small		! go to small copy
1819	  nop
1820	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1821	  nop
1822.copyout_8:
1823	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
1824	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1825	tst	%o3
1826	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1827	  cmp	%o2, %o3			! if length <= limit
1828	bleu,pt	%ncc, .copyout_small		! go to small copy
1829	  nop
1830	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1831	  nop
1832
1833	.align	16
1834	nop				! instruction alignment
1835					! see discussion at start of file
1836.copyout_small:
1837	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
1838	or	%o5, %lo(.sm_copyout_err), %o5
1839	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
1840	membar	#Sync				! sync error barrier
1841	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
1842.sm_do_copyout:
1843	mov	%o0, SM_SAVE_SRC
1844	mov	%o1, SM_SAVE_DST
1845	cmp	%o2, SHORTCOPY		! check for really short case
1846	bleu,pt	%ncc, .co_sm_left	!
1847	  mov	%o2, SM_SAVE_COUNT
1848	cmp	%o2, CHKSIZE		! check for medium length cases
1849	bgu,pn	%ncc, .co_med		!
1850	  or	%o0, %o1, %o3		! prepare alignment check
1851	andcc	%o3, 0x3, %g0		! test for alignment
1852	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
1853.co_sm_movebytes:
1854	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
1855.co_sm_notalign4:
1856	ldub	[%o0], %o3		! read byte
1857	subcc	%o2, 4, %o2		! reduce count by 4
1858	stba	%o3, [%o1]ASI_USER	! write byte
1859	inc	%o1			! advance DST by 1
1860	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
1861	add	%o0, 4, %o0		! advance SRC by 4
1862	stba	%o3, [%o1]ASI_USER
1863	inc	%o1			! advance DST by 1
1864	ldub	[%o0 - 2], %o3
1865	stba	%o3, [%o1]ASI_USER
1866	inc	%o1			! advance DST by 1
1867	ldub	[%o0 - 1], %o3
1868	stba	%o3, [%o1]ASI_USER
1869	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
1870	  inc	%o1			! advance DST by 1
1871	add	%o2, 3, %o2		! restore count
1872.co_sm_left:
1873	tst	%o2
1874	bz,pt	%ncc, .co_sm_exit	! check for zero length
1875	  nop
1876	ldub	[%o0], %o3		! load one byte
1877	deccc	%o2			! reduce count for cc test
1878	bz,pt	%ncc, .co_sm_exit
1879	  stba	%o3,[%o1]ASI_USER	! store one byte
1880	ldub	[%o0 + 1], %o3		! load second byte
1881	deccc	%o2
1882	inc	%o1
1883	bz,pt	%ncc, .co_sm_exit
1884	  stba	%o3,[%o1]ASI_USER	! store second byte
1885	ldub	[%o0 + 2], %o3		! load third byte
1886	inc	%o1
1887	stba	%o3,[%o1]ASI_USER	! store third byte
1888	membar	#Sync				! sync error barrier
1889	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1890	retl
1891	  mov	%g0, %o0		! return 0
1892	.align	16
1893.co_sm_words:
1894	lduw	[%o0], %o3		! read word
1895.co_sm_wordx:
1896	subcc	%o2, 8, %o2		! update count
1897	stwa	%o3, [%o1]ASI_USER	! write word
1898	add	%o0, 8, %o0		! update SRC
1899	lduw	[%o0 - 4], %o3		! read word
1900	add	%o1, 4, %o1		! update DST
1901	stwa	%o3, [%o1]ASI_USER	! write word
1902	bgt,pt	%ncc, .co_sm_words	! loop til done
1903	  add	%o1, 4, %o1		! update DST
1904	addcc	%o2, 7, %o2		! restore count
1905	bz,pt	%ncc, .co_sm_exit
1906	  nop
1907	deccc	%o2
1908	bz,pt	%ncc, .co_sm_byte
1909.co_sm_half:
1910	  subcc	%o2, 2, %o2		! reduce count by 2
1911	lduh	[%o0], %o3		! read half word
1912	add	%o0, 2, %o0		! advance SRC by 2
1913	stha	%o3, [%o1]ASI_USER	! write half word
1914	bgt,pt	%ncc, .co_sm_half	! loop til done
1915	  add	%o1, 2, %o1		! advance DST by 2
1916	addcc	%o2, 1, %o2		! restore count
1917	bz,pt	%ncc, .co_sm_exit
1918	  nop
1919.co_sm_byte:
1920	ldub	[%o0], %o3
1921	stba	%o3, [%o1]ASI_USER
1922	membar	#Sync				! sync error barrier
1923	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1924	retl
1925	  mov	%g0, %o0		! return 0
1926	.align 16
1927.co_sm_word:
1928	subcc	%o2, 4, %o2		! update count
1929	bgt,pt	%ncc, .co_sm_wordx
1930	  lduw	[%o0], %o3		! read word
1931	addcc	%o2, 3, %o2		! restore count
1932	bz,pt	%ncc, .co_sm_exit
1933	  stwa	%o3, [%o1]ASI_USER	! write word
1934	deccc	%o2			! reduce count for cc test
1935	ldub	[%o0 + 4], %o3		! load one byte
1936	add	%o1, 4, %o1
1937	bz,pt	%ncc, .co_sm_exit
1938	  stba	%o3, [%o1]ASI_USER	! store one byte
1939	ldub	[%o0 + 5], %o3		! load second byte
1940	deccc	%o2
1941	inc	%o1
1942	bz,pt	%ncc, .co_sm_exit
1943	  stba	%o3, [%o1]ASI_USER	! store second byte
1944	ldub	[%o0 + 6], %o3		! load third byte
1945	inc	%o1
1946	stba	%o3, [%o1]ASI_USER	! store third byte
1947.co_sm_exit:
1948	  membar	#Sync				! sync error barrier
1949	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1950	retl
1951	  mov	%g0, %o0		! return 0
1952
1953	.align 16
1954.co_med:
1955	xor	%o0, %o1, %o3		! setup alignment check
1956	btst	1, %o3
1957	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
1958	  nop
1959	btst	3, %o3
1960	bnz,pt	%ncc, .co_med_half	! halfword aligned
1961	  nop
1962	btst	7, %o3
1963	bnz,pt	%ncc, .co_med_word	! word aligned
1964	  nop
1965.co_med_long:
1966	btst	3, %o0			! check for
1967	bz,pt	%ncc, .co_med_long1	! word alignment
1968	  nop
1969.co_med_long0:
1970	ldub	[%o0], %o3		! load one byte
1971	inc	%o0
1972	stba	%o3,[%o1]ASI_USER	! store byte
1973	inc	%o1
1974	btst	3, %o0
1975	bnz,pt	%ncc, .co_med_long0
1976	  dec	%o2
1977.co_med_long1:			! word aligned
1978	btst	7, %o0			! check for long word
1979	bz,pt	%ncc, .co_med_long2
1980	  nop
1981	lduw	[%o0], %o3		! load word
1982	add	%o0, 4, %o0		! advance SRC by 4
1983	stwa	%o3, [%o1]ASI_USER	! store word
1984	add	%o1, 4, %o1		! advance DST by 4
1985	sub	%o2, 4, %o2		! reduce count by 4
1986!
1987!  Now long word aligned and have at least 32 bytes to move
1988!
1989.co_med_long2:
1990	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1991	sub	%o1, 8, %o1		! adjust pointer to allow store in
1992					! branch delay slot instead of add
1993.co_med_lmove:
1994	add	%o1, 8, %o1		! advance DST by 8
1995	ldx	[%o0], %o3		! read long word
1996	subcc	%o2, 32, %o2		! reduce count by 32
1997	stxa	%o3, [%o1]ASI_USER	! write long word
1998	add	%o1, 8, %o1		! advance DST by 8
1999	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
2000	add	%o0, 32, %o0		! advance SRC by 32
2001	stxa	%o3, [%o1]ASI_USER
2002	ldx	[%o0 - 16], %o3
2003	add	%o1, 8, %o1		! advance DST by 8
2004	stxa	%o3, [%o1]ASI_USER
2005	ldx	[%o0 - 8], %o3
2006	add	%o1, 8, %o1		! advance DST by 8
2007	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
2008	  stxa	%o3, [%o1]ASI_USER
2009	add	%o1, 8, %o1		! advance DST by 8
2010	addcc	%o2, 24, %o2		! restore count to long word offset
2011	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
2012	  nop
2013.co_med_lword:
2014	ldx	[%o0], %o3		! read long word
2015	subcc	%o2, 8, %o2		! reduce count by 8
2016	stxa	%o3, [%o1]ASI_USER	! write long word
2017	add	%o0, 8, %o0		! advance SRC by 8
2018	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
2019	  add	%o1, 8, %o1		! advance DST by 8
2020.co_med_lextra:
2021	addcc	%o2, 7, %o2		! restore rest of count
2022	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2023	  deccc	%o2
2024	bz,pt	%ncc, .co_sm_byte
2025	  nop
2026	ba,pt	%ncc, .co_sm_half
2027	  nop
2028
2029	.align 16
2030	nop				! instruction alignment
2031					! see discussion at start of file
2032.co_med_word:
2033	btst	3, %o0			! check for
2034	bz,pt	%ncc, .co_med_word1	! word alignment
2035	  nop
2036.co_med_word0:
2037	ldub	[%o0], %o3		! load one byte
2038	inc	%o0
2039	stba	%o3,[%o1]ASI_USER	! store byte
2040	inc	%o1
2041	btst	3, %o0
2042	bnz,pt	%ncc, .co_med_word0
2043	  dec	%o2
2044!
2045!  Now word aligned and have at least 36 bytes to move
2046!
2047.co_med_word1:
2048	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2049.co_med_wmove:
2050	lduw	[%o0], %o3		! read word
2051	subcc	%o2, 16, %o2		! reduce count by 16
2052	stwa	%o3, [%o1]ASI_USER	! write word
2053	add	%o1, 4, %o1		! advance DST by 4
2054	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
2055	add	%o0, 16, %o0		! advance SRC by 16
2056	stwa	%o3, [%o1]ASI_USER
2057	add	%o1, 4, %o1		! advance DST by 4
2058	lduw	[%o0 - 8], %o3
2059	stwa	%o3, [%o1]ASI_USER
2060	add	%o1, 4, %o1		! advance DST by 4
2061	lduw	[%o0 - 4], %o3
2062	stwa	%o3, [%o1]ASI_USER
2063	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
2064	  add	%o1, 4, %o1		! advance DST by 4
2065	addcc	%o2, 12, %o2		! restore count to word offset
2066	ble,pt	%ncc, .co_med_wextra	! check for more words to move
2067	  nop
2068.co_med_word2:
2069	lduw	[%o0], %o3		! read word
2070	subcc	%o2, 4, %o2		! reduce count by 4
2071	stwa	%o3, [%o1]ASI_USER	! write word
2072	add	%o0, 4, %o0		! advance SRC by 4
2073	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
2074	  add	%o1, 4, %o1		! advance DST by 4
2075.co_med_wextra:
2076	addcc	%o2, 3, %o2		! restore rest of count
2077	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2078	  deccc	%o2
2079	bz,pt	%ncc, .co_sm_byte
2080	  nop
2081	ba,pt	%ncc, .co_sm_half
2082	  nop
2083
2084	.align 16
2085	nop				! instruction alignment
2086	nop				! see discussion at start of file
2087	nop
2088.co_med_half:
2089	btst	1, %o0			! check for
2090	bz,pt	%ncc, .co_med_half1	! half word alignment
2091	  nop
2092	ldub	[%o0], %o3		! load one byte
2093	inc	%o0
2094	stba	%o3,[%o1]ASI_USER	! store byte
2095	inc	%o1
2096	dec	%o2
2097!
2098!  Now half word aligned and have at least 38 bytes to move
2099!
2100.co_med_half1:
2101	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2102.co_med_hmove:
2103	lduh	[%o0], %o3		! read half word
2104	subcc	%o2, 8, %o2		! reduce count by 8
2105	stha	%o3, [%o1]ASI_USER	! write half word
2106	add	%o1, 2, %o1		! advance DST by 2
2107	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
2108	add	%o0, 8, %o0		! advance SRC by 8
2109	stha	%o3, [%o1]ASI_USER
2110	add	%o1, 2, %o1		! advance DST by 2
2111	lduh	[%o0 - 4], %o3
2112	stha	%o3, [%o1]ASI_USER
2113	add	%o1, 2, %o1		! advance DST by 2
2114	lduh	[%o0 - 2], %o3
2115	stha	%o3, [%o1]ASI_USER
2116	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
2117	  add	%o1, 2, %o1		! advance DST by 2
2118	addcc	%o2, 7, %o2		! restore count
2119	bz,pt	%ncc, .co_sm_exit
2120	  deccc	%o2
2121	bz,pt	%ncc, .co_sm_byte
2122	  nop
2123	ba,pt	%ncc, .co_sm_half
2124	  nop
2125
2126/*
2127 * We got here because of a fault during short copyout.
2128 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2129 */
2130.sm_copyout_err:
2131	membar	#Sync
2132	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2133	mov	SM_SAVE_SRC, %o0
2134	mov	SM_SAVE_DST, %o1
2135	mov	SM_SAVE_COUNT, %o2
2136	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2137	tst	%o3
2138	bz,pt	%ncc, 3f			! if not, return error
2139	  nop
2140	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2141	jmp	%o5				! original arguments
2142	  nop
21433:
2144	retl
2145	  or	%g0, -1, %o0		! return error value
2146
2147	SET_SIZE(copyout)
2148
2149/*
2150 * The _more entry points are not intended to be used directly by
2151 * any caller from outside this file.  They are provided to allow
2152 * profiling and dtrace of the portions of the copy code that uses
2153 * the floating point registers.
2154 * This entry is particularly important as DTRACE (at least as of
2155 * 4/2004) does not support leaf functions.
2156 */
2157
/*
 * copyout_more: floating-point block-copy path for large kernel-to-user
 * copies.  Entered at .copyout_more from the size/alignment dispatch;
 * xcopyout joins at .do_copyout after doing its own save and
 * REAL_LOFAULT setup.
 *
 * Outline:
 *   - install copyio_fault as t_lofault, keeping the old value in %l6
 *   - save %fprs and %gsr in the frame (plus FP registers if the FPU
 *     was already enabled)
 *   - byte-copy until DST is aligned to VIS_BLOCKSIZE
 *   - per iteration: eight ldd loads from the 8-byte-aligned source,
 *     faligndata to realign, one block store through ASI_BLK_AIUS
 *   - byte-copy whatever is left, restore state, return 0
 * .copyout_err is the routine-specific fault exit: it hands off to the
 * thread copyops handler when one is installed, otherwise returns -1.
 * NOTE(review): copyio_fault is defined outside this view; presumably it
 * restores FP state and jumps to REAL_LOFAULT -- confirm.
 */
2158	ENTRY(copyout_more)
2159.copyout_more:
2160	prefetch [%o0], #n_reads		! start fetching the source
2161	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp ! frame holds saved fprs/gsr
2162	set	.copyout_err, REAL_LOFAULT	! routine-specific fault exit
2163
2164/*
2165 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2166 */
2167.do_copyout:
2168        set     copyio_fault, %l7		! copyio_fault is lofault val
2169
2170	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2171	membar	#Sync				! sync error barrier
2172	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2173
2174	mov	%i0, SAVE_SRC			! keep the incoming arguments
2175	mov	%i1, SAVE_DST
2176	mov	%i2, SAVE_COUNT
2177
2178	FP_NOMIGRATE(6, 7)
2179
2180	rd	%fprs, %o2		! check for unused fp
2181	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2182	btst	FPRS_FEF, %o2
2183	bz,a,pt	%icc, .do_blockcopyout	! FEF clear: nothing to preserve
2184	  wr	%g0, FPRS_FEF, %fprs	! (annulled unless taken) enable fp
2185
	! FEF was set: the FPU holds live data.
	! NOTE(review): BST_FPQ2Q4_TOSTACK is defined elsewhere; presumably it
	! spills the FP registers used below to the frame -- confirm.
2186	BST_FPQ2Q4_TOSTACK(%o2)
2187
2188.do_blockcopyout:
2189	rd	%gsr, %o2
2190	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2191	or	%l6, FPUSED_FLAG, %l6	! flag rides in the saved-handler word
2192
2193	andcc	DST, VIS_BLOCKSIZE - 1, TMP	! TMP = DST offset within a block
2194	mov	ASI_USER, %asi		! stba ...%asi below use ASI_USER
2195	bz,pt	%ncc, 2f		! DST already block aligned
2196	  neg	TMP
2197	add	TMP, VIS_BLOCKSIZE, TMP
2198
2199	! TMP = bytes required to align DST on FP_BLOCK boundary
2200	! Using SRC as a tmp here
2201	cmp	TMP, 3
2202	bleu,pt	%ncc, 1f		! 3 or fewer: byte-at-a-time loop only
2203	  sub	CNT,TMP,CNT		! adjust main count
2204	sub	TMP, 3, TMP		! adjust for end of loop test
2205.co_blkalign:
2206	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
2207	stba	SRC, [DST]%asi
2208	subcc	TMP, 4, TMP		! cc consumed by the bgu below
2209	ldub	[REALSRC + 1], SRC
2210	add	REALSRC, 4, REALSRC
2211	stba	SRC, [DST + 1]%asi
2212	ldub	[REALSRC - 2], SRC
2213	add	DST, 4, DST
2214	stba	SRC, [DST - 2]%asi
2215	ldub	[REALSRC - 1], SRC
2216	bgu,pt	%ncc, .co_blkalign
2217	  stba	SRC, [DST - 1]%asi
2218
2219	addcc	TMP, 3, TMP		! restore count adjustment
2220	bz,pt	%ncc, 2f		! no bytes left?
2221	  nop
22221:	ldub	[REALSRC], SRC
2223	inc	REALSRC
2224	inc	DST
2225	deccc	TMP
2226	bgu	%ncc, 1b
2227	  stba	SRC, [DST - 1]%asi	! (delay slot) store the byte just loaded
2228
22292:
2230	membar	#StoreLoad
2231	andn	REALSRC, 0x7, SRC	! SRC = REALSRC rounded down to 8 bytes
2232
2233	! SRC - 8-byte aligned
2234	! DST - 64-byte aligned
	! Prime the pipeline: load the first source block and pre-align the
	! first six output doublewords into %f48-%f58.
2235	ldd	[SRC], %f16
2236	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2237	alignaddr REALSRC, %g0, %g0	! GSR align offset = REALSRC & 7
2238	ldd	[SRC + 0x08], %f18
2239	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2240	faligndata %f16, %f18, %f48
2241	ldd	[SRC + 0x10], %f20
2242	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2243	faligndata %f18, %f20, %f50
2244	ldd	[SRC + 0x18], %f22
2245	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2246	faligndata %f20, %f22, %f52
2247	ldd	[SRC + 0x20], %f24
2248	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
2249	faligndata %f22, %f24, %f54
2250	ldd	[SRC + 0x28], %f26
2251	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
2252	faligndata %f24, %f26, %f56
2253	ldd	[SRC + 0x30], %f28
2254	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
2255	faligndata %f26, %f28, %f58
2256	ldd	[SRC + 0x38], %f30
2257	ldd	[SRC + VIS_BLOCKSIZE], %f16	! first doubleword of next block
2258	sub	CNT, VIS_BLOCKSIZE, CNT
2259	add	SRC, VIS_BLOCKSIZE, SRC
2260	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
2261	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2262	ba,pt	%ncc, 1f
2263	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read ! (delay slot)
2264	.align	32
	! Main loop: finish the pending block (%f60, %f62), block-store it, and
	! interleave the loads and alignment for the following block.
22651:
2266	ldd	[SRC + 0x08], %f18
2267	faligndata %f28, %f30, %f60
2268	ldd	[SRC + 0x10], %f20
2269	faligndata %f30, %f16, %f62
2270	stda	%f48, [DST]ASI_BLK_AIUS	! block store of %f48-%f62
2271	ldd	[SRC + 0x18], %f22
2272	faligndata %f16, %f18, %f48
2273	ldd	[SRC + 0x20], %f24
2274	faligndata %f18, %f20, %f50
2275	ldd	[SRC + 0x28], %f26
2276	faligndata %f20, %f22, %f52
2277	ldd	[SRC + 0x30], %f28
2278	faligndata %f22, %f24, %f54
2279	sub	CNT, VIS_BLOCKSIZE, CNT
2280	ldd	[SRC + 0x38], %f30
2281	faligndata %f24, %f26, %f56
2282	add	DST, VIS_BLOCKSIZE, DST
2283	ldd	[SRC + VIS_BLOCKSIZE], %f16	! reads 8 bytes past this block
2284	faligndata %f26, %f28, %f58
2285	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2286	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2287	add	SRC, VIS_BLOCKSIZE, SRC
2288	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2289	cmp	CNT, VIS_BLOCKSIZE + 8	! +8 covers the look-ahead ldd above
2290	bgu,pt	%ncc, 1b
2291	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2292
	! Loop done.  If exactly one block remains and the source is 8-byte
	! aligned, take the 2: path and move it with fsrc1 (no realignment).
	! Otherwise fall into the first 3: to flush the pending block.
2293	! only if REALSRC & 0x7 is 0
2294	cmp	CNT, VIS_BLOCKSIZE
2295	bne	%ncc, 3f
2296	  andcc	REALSRC, 0x7, %g0
2297	bz,pt	%ncc, 2f
2298	  nop
22993:
2300	faligndata %f28, %f30, %f60
2301	faligndata %f30, %f16, %f62
2302	stda	%f48, [DST]ASI_BLK_AIUS
2303	add	DST, VIS_BLOCKSIZE, DST
2304	ba,pt	%ncc, 3f		! on to the second 3: (byte tail)
2305	  nop
23062:
2307	ldd	[SRC + 0x08], %f18
2308	fsrc1	%f28, %f60
2309	ldd	[SRC + 0x10], %f20
2310	fsrc1	%f30, %f62
2311	stda	%f48, [DST]ASI_BLK_AIUS	! store the pending block
2312	ldd	[SRC + 0x18], %f22
2313	fsrc1	%f16, %f48
2314	ldd	[SRC + 0x20], %f24
2315	fsrc1	%f18, %f50
2316	ldd	[SRC + 0x28], %f26
2317	fsrc1	%f20, %f52
2318	ldd	[SRC + 0x30], %f28
2319	fsrc1	%f22, %f54
2320	ldd	[SRC + 0x38], %f30
2321	fsrc1	%f24, %f56
2322	sub	CNT, VIS_BLOCKSIZE, CNT
2323	add	DST, VIS_BLOCKSIZE, DST
2324	add	SRC, VIS_BLOCKSIZE, SRC
2325	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2326	fsrc1	%f26, %f58
2327	fsrc1	%f28, %f60
2328	fsrc1	%f30, %f62
2329	stda	%f48, [DST]ASI_BLK_AIUS	! store the final block
2330	add	DST, VIS_BLOCKSIZE, DST
2331	ba,a,pt	%ncc, 4f		! annulled: the nop is not executed
2332	  nop
2333
23343:	tst	CNT			! any tail bytes left?
2335	bz,a	%ncc, 4f
2336	  nop
2337
23385:	ldub	[REALSRC], TMP		! byte-at-a-time tail copy
2339	inc	REALSRC
2340	inc	DST
2341	deccc	CNT
2342	bgu	%ncc, 5b
2343	  stba	TMP, [DST - 1]%asi
23444:
2345
2346.copyout_exit:
2347	membar	#Sync
2348
2349	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2350	wr	%o2, 0, %gsr		! restore gsr
2351
2352	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2353	btst	FPRS_FEF, %o3
2354	bz,pt	%icc, 4f		! FPU was not in use on entry
2355	  nop
2356
	! NOTE(review): BLD_FPQ2Q4_FROMSTACK is defined elsewhere; presumably
	! the inverse of BST_FPQ2Q4_TOSTACK -- confirm.
2357	BLD_FPQ2Q4_FROMSTACK(%o2)
2358
2359	ba,pt	%ncc, 1f
2360	  wr	%o3, 0, %fprs		! restore fprs
2361
23624:
	! NOTE(review): FZEROQ2Q4 is defined elsewhere; presumably clears the
	! FP registers this routine used -- confirm.
2363	FZEROQ2Q4
2364	wr	%o3, 0, %fprs		! restore fprs
2365
23661:
2367	membar	#Sync
2368	andn	%l6, FPUSED_FLAG, %l6	! strip the flag to recover the handler
2369	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2370	FP_ALLOWMIGRATE(5, 6)
2371	ret
2372	  restore	%g0, 0, %o0	! return 0 in the caller's %o0
2373
2374/*
2375 * We got here because of a fault during copyout.
2376 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2377 */
2378.copyout_err:
2379	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2380	tst	%o4
2381	bz,pt	%ncc, 2f			! if not, return error
2382	  nop
2383	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
2384	jmp	%g2				! original arguments
2385	  restore %g0, 0, %g0			! dispose of copy window
23862:
2387        ret
2388	  restore %g0, -1, %o0			! return error value
2389
2390
2391	SET_SIZE(copyout_more)
2392
2393#endif	/* lint */
2394
2395
2396#ifdef	lint
2397
/*
 * Lint-only C stub for xcopyout.  It is compiled only under "#ifdef lint";
 * the real routine is the assembly under the matching #else.
 */
2398/*ARGSUSED*/
2399int
2400xcopyout(const void *kaddr, void *uaddr, size_t count)
2401{ return (0); }
2402
2403#else	/* lint */
2404
/*
 * xcopyout(kaddr, uaddr, count): %o0 = kernel source, %o1 = user
 * destination, %o2 = byte count.
 *
 * Picks between the leaf (integer register) copy and the FP block copy:
 *   - count <= VIS_COPY_THRESHOLD always takes the leaf path
 *   - otherwise the low bits of (src ^ dst) select which hw_copy_limit_N
 *     applies (8, 4, 2 or 1); a zero limit disables the FP path, and a
 *     count at or below the limit also stays on the leaf path
 * Both paths reuse the copyout code (.sm_do_copyout / .do_copyout) and
 * differ only in the fault handlers installed here.  On a fault with no
 * copyops handler installed, these handlers return an errno value
 * (ERRNO or %g1) instead of -1.
 */
2405	ENTRY(xcopyout)
2406	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2407	bleu,pt	%ncc, .xcopyout_small		! small enough: leaf routine
2408	  xor	%o0, %o1, %o3			! are src, dst alignable?
2409	btst	7, %o3				!
2410	bz,pt	%ncc, .xcopyout_8		! low 3 bits agree: longword
2411	  nop
2412	btst	1, %o3				!
2413	bz,pt	%ncc, .xcopyout_2		! check for half-word
2414	  nop
2415	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2416	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2417	tst	%o3
2418	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2419	  cmp	%o2, %o3			! if length <= limit
2420	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2421	  nop
2422	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2423	  nop
2424.xcopyout_2:
2425	btst	3, %o3				! %o3 is still src ^ dst
2426	bz,pt	%ncc, .xcopyout_4		! check for word alignment
2427	  nop
2428	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2429	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2430	tst	%o3
2431	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2432	  cmp	%o2, %o3			! if length <= limit
2433	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2434	  nop
2435	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2436	  nop
2437.xcopyout_4:
2438	! already checked longword, must be word aligned
2439	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2440	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2441	tst	%o3
2442	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2443	  cmp	%o2, %o3			! if length <= limit
2444	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2445	  nop
2446	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2447	  nop
2448.xcopyout_8:
2449	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2450	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2451	tst	%o3
2452	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2453	  cmp	%o2, %o3			! if length <= limit
2454	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2455	  nop
2456	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2457	  nop
2458
	! Leaf path: same register roles as .copyout_small (%o4 = old
	! t_lofault, %o5 = new handler), then share the copyout leaf code.
2459.xcopyout_small:
2460	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
2461	or	%o5, %lo(.sm_xcopyout_err), %o5
2462	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
2463	membar	#Sync				! sync error barrier
2464	ba,pt	%ncc, .sm_do_copyout		! common code
2465	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
2466
	! FP path: open the same frame as copyout_more, then join it at
	! .do_copyout with the xcopyout fault exit in REAL_LOFAULT.
2467.xcopyout_more:
2468	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2469	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2470	ba,pt	%ncc, .do_copyout		! common code
2471	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2472
2473/*
2474 * We got here because of fault during xcopyout
2475 * Errno value is in ERRNO
2476 */
2477.xcopyout_err:
2478	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2479	tst	%o4
2480	bz,pt	%ncc, 2f			! if not, return error
2481	  nop
2482	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
2483	jmp	%g2				! original arguments
2484	  restore %g0, 0, %g0			! dispose of copy window
24852:
2486        ret
2487	  restore ERRNO, 0, %o0			! return errno value
2488
	! Fault exit for the leaf path.  No register window was opened, so
	! the arguments are recovered from the SM_SAVE_* registers.
	! NOTE(review): %g1 is presumably the errno left by the trap
	! handler -- confirm.
2489.sm_xcopyout_err:
2490
2491	membar	#Sync
2492	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2493	mov	SM_SAVE_SRC, %o0
2494	mov	SM_SAVE_DST, %o1
2495	mov	SM_SAVE_COUNT, %o2
2496	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2497	tst	%o3
2498	bz,pt	%ncc, 3f			! if not, return error
2499	  nop
2500	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
2501	jmp	%o5				! original arguments
2502	  nop
25033:
2504	retl
2505	  or	%g1, 0, %o0		! return errno value
2506
2507	SET_SIZE(xcopyout)
2508
2509#endif	/* lint */
2510
2511#ifdef	lint
2512
/*
 * Lint-only C stub for xcopyout_little.  It is compiled only under
 * "#ifdef lint"; the real routine is the assembly under the matching #else.
 */
2513/*ARGSUSED*/
2514int
2515xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2516{ return (0); }
2517
2518#else	/* lint */
2519
/*
 * xcopyout_little(kaddr, uaddr, count): %o0 = kernel source, %o1 = user
 * destination, %o2 = byte count.  Leaf routine; returns 0 on the normal
 * path.  Faults go to .xcopyio_err, which is defined outside this view.
 *
 * Bytes are written in reverse order: user byte i receives kernel byte
 * (count - 1 - i).  Stores use ASI_AIUSL.
 *
 * Loop bookkeeping:
 *   %o3 runs from -count up to 0; the carry out of the final inccc
 *       ends the loop
 *   %o1 = uaddr + count, so [%o1 + %o3] walks the destination forward
 *   %o0 starts at kaddr + 2*count - 1 and drops by 2 per iteration, so
 *       [%o0 + %o3] walks the source backward one byte at a time
 *   %o5 holds the previous t_lofault for the exit path
 */
2520	ENTRY(xcopyout_little)
2521	sethi	%hi(.xcopyio_err), %o5
2522	or	%o5, %lo(.xcopyio_err), %o5
2523	ldn	[THREAD_REG + T_LOFAULT], %o4	! %o4 = existing handler
2524	membar	#Sync				! sync error barrier
2525	stn	%o5, [THREAD_REG + T_LOFAULT]	! install .xcopyio_err
2526	mov	%o4, %o5			! keep old handler; %o4 is reused
2527
2528	subcc	%g0, %o2, %o3		! %o3 = -count, sets Z when count is 0
2529	add	%o0, %o2, %o0
2530	bz,pn	%ncc, 2f		! check for zero bytes
2531	  sub	%o2, 1, %o4
2532	add	%o0, %o4, %o0		! start w/last byte
2533	add	%o1, %o2, %o1
2534	ldub	[%o0 + %o3], %o4	! first load: kaddr[count - 1]
2535
25361:	stba	%o4, [%o1 + %o3]ASI_AIUSL
2537	inccc	%o3			! carry set once %o3 reaches 0
2538	sub	%o0, 2, %o0		! get next byte
2539	bcc,a,pt %ncc, 1b
2540	  ldub	[%o0 + %o3], %o4	! (annulled on exit) load next byte
2541
25422:
2543	membar	#Sync				! sync error barrier
2544	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2545	retl
2546	  mov	%g0, %o0		! return (0)
2547
2548	SET_SIZE(xcopyout_little)
2549
2550#endif	/* lint */
2551
2552/*
2553 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2554 */
2555
2556#if defined(lint)
2557
/*
 * Lint-only C stub for copyin.  It is compiled only under
 * "#if defined(lint)"; the real routine is the assembly under the #else.
 */
2558/*ARGSUSED*/
2559int
2560copyin(const void *uaddr, void *kaddr, size_t count)
2561{ return (0); }
2562
2563#else	/* lint */
2564
/*
 * copyin(uaddr, kaddr, count): %o0 = user source, %o1 = kernel
 * destination, %o2 = byte count.  Returns 0 on success.  On a fault it
 * tail-calls the thread copyops CP_COPYIN handler if one is installed,
 * otherwise it returns -1.
 *
 * Dispatch:
 *   - count <= VIS_COPY_THRESHOLD takes the leaf path (.copyin_small)
 *   - otherwise the low bits of (src ^ dst) select which hw_copy_limit_N
 *     applies (8, 4, 2 or 1); a zero limit disables the FP path, and a
 *     count at or below the limit also stays on the leaf path
 *   - anything larger goes to .copyin_more (FP block copy)
 *
 * Leaf path register use: %o3 = scratch data, %o4 = previous t_lofault,
 * %o5 = new handler address; the original arguments are kept in
 * SM_SAVE_SRC, SM_SAVE_DST and SM_SAVE_COUNT for the fault path.  All
 * user loads use the ASI_USER alternate-space forms (lduba, lduha,
 * lduwa, ldxa).
 *
 * Several labels sit on a branch delay slot (.ci_sm_movebytes,
 * .ci_sm_half).  Those instructions run both as the delay slot of the
 * branch above them and as the first instruction when entered by label.
 */
2565	ENTRY(copyin)
2566	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2567	bleu,pt	%ncc, .copyin_small		! small enough: leaf routine
2568	  xor	%o0, %o1, %o3			! are src, dst alignable?
2569	btst	7, %o3				!
2570	bz,pt	%ncc, .copyin_8			! check for longword alignment
2571	  nop
2572	btst	1, %o3				!
2573	bz,pt	%ncc, .copyin_2			! check for half-word
2574	  nop
2575	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2576	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2577	tst	%o3
2578	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2579	  cmp	%o2, %o3			! if length <= limit
2580	bleu,pt	%ncc, .copyin_small		! go to small copy
2581	  nop
2582	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2583	  nop
2584.copyin_2:
2585	btst	3, %o3				! %o3 is still src ^ dst
2586	bz,pt	%ncc, .copyin_4			! check for word alignment
2587	  nop
2588	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2589	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2590	tst	%o3
2591	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2592	  cmp	%o2, %o3			! if length <= limit
2593	bleu,pt	%ncc, .copyin_small		! go to small copy
2594	  nop
2595	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2596	  nop
2597.copyin_4:
2598	! already checked longword, must be word aligned
2599	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2600	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2601	tst	%o3
2602	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2603	  cmp	%o2, %o3			! if length <= limit
2604	bleu,pt	%ncc, .copyin_small		! go to small copy
2605	  nop
2606	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2607	  nop
2608.copyin_8:
2609	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2610	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2611	tst	%o3
2612	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2613	  cmp	%o2, %o3			! if length <= limit
2614	bleu,pt	%ncc, .copyin_small		! go to small copy
2615	  nop
2616	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2617	  nop
2618
2619	.align	16
2620	nop				! instruction alignment
2621					! see discussion at start of file
2622.copyin_small:
2623	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
2624	or	%o5, %lo(.sm_copyin_err), %o5
2625	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
2626	membar	#Sync				! sync error barrier
2627	stn	%o5, [THREAD_REG + T_LOFAULT]
2628.sm_do_copyin:
2629	mov	%o0, SM_SAVE_SRC
2630	mov	%o1, SM_SAVE_DST
	! NOTE(review): .ci_sm_left moves at most three bytes, so this branch
	! presumably relies on SHORTCOPY being 3 -- confirm at its definition.
2631	cmp	%o2, SHORTCOPY		! check for really short case
2632	bleu,pt	%ncc, .ci_sm_left	!
2633	  mov	%o2, SM_SAVE_COUNT
2634	cmp	%o2, CHKSIZE		! check for medium length cases
2635	bgu,pn	%ncc, .ci_med		!
2636	  or	%o0, %o1, %o3		! prepare alignment check
2637	andcc	%o3, 0x3, %g0		! test for alignment
2638	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
	! The sub below is the delay slot of the bz above, so it also runs
	! on the way to .ci_sm_word.
2639.ci_sm_movebytes:
2640	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
2641.ci_sm_notalign4:
2642	lduba	[%o0]ASI_USER, %o3	! read byte
2643	subcc	%o2, 4, %o2		! reduce count by 4
2644	stb	%o3, [%o1]		! write byte
2645	add	%o0, 1, %o0		! advance SRC by 1
2646	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
2647	add	%o0, 1, %o0		! advance SRC by 1
2648	stb	%o3, [%o1 + 1]
2649	add	%o1, 4, %o1		! advance DST by 4
2650	lduba	[%o0]ASI_USER, %o3
2651	add	%o0, 1, %o0		! advance SRC by 1
2652	stb	%o3, [%o1 - 2]
2653	lduba	[%o0]ASI_USER, %o3
2654	add	%o0, 1, %o0		! advance SRC by 1
2655	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
2656	  stb	%o3, [%o1 - 1]
2657	add	%o2, 3, %o2		! restore count
2658.ci_sm_left:
2659	tst	%o2
2660	bz,pt	%ncc, .ci_sm_exit	! check for zero length
2661	  nop
2662	lduba	[%o0]ASI_USER, %o3		! load one byte
2663	deccc	%o2			! reduce count for cc test
2664	bz,pt	%ncc, .ci_sm_exit
2665	  stb	%o3,[%o1]		! store one byte
2666	inc	%o0
2667	lduba	[%o0]ASI_USER, %o3	! load second byte
2668	deccc	%o2
2669	bz,pt	%ncc, .ci_sm_exit
2670	  stb	%o3,[%o1 + 1]		! store second byte
2671	inc	%o0
2672	lduba	[%o0]ASI_USER, %o3	! load third byte
2673	stb	%o3,[%o1 + 2]		! store third byte
2674	membar	#Sync				! sync error barrier
2675	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2676	retl
2677	  mov	%g0, %o0		! return 0
2678	.align	16
2679.ci_sm_words:
2680	lduwa	[%o0]ASI_USER, %o3		! read word
2681.ci_sm_wordx:
2682	subcc	%o2, 8, %o2		! update count
2683	stw	%o3, [%o1]		! write word
2684	add	%o0, 4, %o0		! update SRC
2685	add	%o1, 8, %o1		! update DST
2686	lduwa	[%o0]ASI_USER, %o3	! read word
2687	add	%o0, 4, %o0		! update SRC
2688	bgt,pt	%ncc, .ci_sm_words	! loop til done
2689	  stw	%o3, [%o1 - 4]		! write word
2690	addcc	%o2, 7, %o2		! restore count
2691	bz,pt	%ncc, .ci_sm_exit
2692	  nop
2693	deccc	%o2
2694	bz,pt	%ncc, .ci_sm_byte
	! The subcc below is the delay slot of the bz above and also the
	! entry point used by the medium-copy tails.
2695.ci_sm_half:
2696	  subcc	%o2, 2, %o2		! reduce count by 2
2697	lduha	[%o0]ASI_USER, %o3	! read half word
2698	add	%o0, 2, %o0		! advance SRC by 2
2699	add	%o1, 2, %o1		! advance DST by 2
2700	bgt,pt	%ncc, .ci_sm_half	! loop til done
2701	  sth	%o3, [%o1 - 2]		! write half word
2702	addcc	%o2, 1, %o2		! restore count
2703	bz,pt	%ncc, .ci_sm_exit
2704	  nop
2705.ci_sm_byte:
2706	lduba	[%o0]ASI_USER, %o3	! copy the single trailing byte
2707	stb	%o3, [%o1]
2708	membar	#Sync				! sync error barrier
2709	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2710	retl
2711	  mov	%g0, %o0		! return 0
2712	.align	16
	! Word-aligned small copy.  %o2 arrives here already reduced by 3 by
	! the delay-slot sub at .ci_sm_movebytes.
2713.ci_sm_word:
2714	subcc	%o2, 4, %o2		! update count
2715	bgt,pt	%ncc, .ci_sm_wordx
2716	  lduwa	[%o0]ASI_USER, %o3		! read word
2717	addcc	%o2, 3, %o2		! restore count
2718	bz,pt	%ncc, .ci_sm_exit
2719	  stw	%o3, [%o1]		! write word
2720	deccc	%o2			! reduce count for cc test
2721	add	%o0, 4, %o0
2722	lduba	[%o0]ASI_USER, %o3	! load one byte
2723	bz,pt	%ncc, .ci_sm_exit
2724	  stb	%o3, [%o1 + 4]		! store one byte
2725	inc	%o0
2726	lduba	[%o0]ASI_USER, %o3	! load second byte
2727	deccc	%o2
2728	bz,pt	%ncc, .ci_sm_exit
2729	  stb	%o3, [%o1 + 5]		! store second byte
2730	inc	%o0
2731	lduba	[%o0]ASI_USER, %o3	! load third byte
2732	stb	%o3, [%o1 + 6]		! store third byte
2733.ci_sm_exit:
2734	membar	#Sync				! sync error barrier
2735	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2736	retl
2737	  mov	%g0, %o0		! return 0
2738
2739	.align 16
	! Medium copies (count > CHKSIZE): use the widest access that the
	! relative alignment of src and dst (src ^ dst) allows.
2740.ci_med:
2741	xor	%o0, %o1, %o3		! setup alignment check
2742	btst	1, %o3
2743	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
2744	  nop
2745	btst	3, %o3
2746	bnz,pt	%ncc, .ci_med_half	! halfword aligned
2747	  nop
2748	btst	7, %o3
2749	bnz,pt	%ncc, .ci_med_word	! word aligned
2750	  nop
2751.ci_med_long:
2752	btst	3, %o0			! check for
2753	bz,pt	%ncc, .ci_med_long1	! word alignment
2754	  nop
2755.ci_med_long0:
2756	lduba	[%o0]ASI_USER, %o3		! load one byte
2757	inc	%o0
2758	stb	%o3,[%o1]		! store byte
2759	inc	%o1
2760	btst	3, %o0
2761	bnz,pt	%ncc, .ci_med_long0
2762	  dec	%o2
2763.ci_med_long1:			! word aligned
2764	btst	7, %o0			! check for long word
2765	bz,pt	%ncc, .ci_med_long2
2766	  nop
2767	lduwa	[%o0]ASI_USER, %o3	! load word
2768	add	%o0, 4, %o0		! advance SRC by 4
2769	stw	%o3, [%o1]		! store word
2770	add	%o1, 4, %o1		! advance DST by 4
2771	sub	%o2, 4, %o2		! reduce count by 4
2772!
2773!  Now long word aligned and have at least 32 bytes to move
2774!
2775.ci_med_long2:
2776	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2777.ci_med_lmove:
2778	ldxa	[%o0]ASI_USER, %o3	! read long word
2779	subcc	%o2, 32, %o2		! reduce count by 32
2780	stx	%o3, [%o1]		! write long word
2781	add	%o0, 8, %o0		! advance SRC by 8
2782	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
2783	add	%o0, 8, %o0		! advance SRC by 8
2784	stx	%o3, [%o1 + 8]
2785	add	%o1, 32, %o1		! advance DST by 32
2786	ldxa	[%o0]ASI_USER, %o3
2787	add	%o0, 8, %o0		! advance SRC by 8
2788	stx	%o3, [%o1 - 16]
2789	ldxa	[%o0]ASI_USER, %o3
2790	add	%o0, 8, %o0		! advance SRC by 8
2791	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
2792	  stx	%o3, [%o1 - 8]
2793	addcc	%o2, 24, %o2		! restore count to long word offset
2794	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
2795	  nop
2796.ci_med_lword:
2797	ldxa	[%o0]ASI_USER, %o3	! read long word
2798	subcc	%o2, 8, %o2		! reduce count by 8
2799	stx	%o3, [%o1]		! write long word
2800	add	%o0, 8, %o0		! advance SRC by 8
2801	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
2802	  add	%o1, 8, %o1		! advance DST by 8
2803.ci_med_lextra:
2804	addcc	%o2, 7, %o2		! restore rest of count
2805	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2806	  deccc	%o2
2807	bz,pt	%ncc, .ci_sm_byte	! exactly one byte left
2808	  nop
2809	ba,pt	%ncc, .ci_sm_half	! finish with halfword/byte tail
2810	  nop
2811
2812	.align 16
2813	nop				! instruction alignment
2814					! see discussion at start of file
2815.ci_med_word:
2816	btst	3, %o0			! check for
2817	bz,pt	%ncc, .ci_med_word1	! word alignment
2818	  nop
2819.ci_med_word0:
2820	lduba	[%o0]ASI_USER, %o3	! load one byte
2821	inc	%o0
2822	stb	%o3,[%o1]		! store byte
2823	inc	%o1
2824	btst	3, %o0
2825	bnz,pt	%ncc, .ci_med_word0
2826	  dec	%o2
2827!
2828!  Now word aligned and have at least 36 bytes to move
2829!
2830.ci_med_word1:
2831	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2832.ci_med_wmove:
2833	lduwa	[%o0]ASI_USER, %o3	! read word
2834	subcc	%o2, 16, %o2		! reduce count by 16
2835	stw	%o3, [%o1]		! write word
2836	add	%o0, 4, %o0		! advance SRC by 4
2837	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
2838	add	%o0, 4, %o0		! advance SRC by 4
2839	stw	%o3, [%o1 + 4]
2840	add	%o1, 16, %o1		! advance DST by 16
2841	lduwa	[%o0]ASI_USER, %o3
2842	add	%o0, 4, %o0		! advance SRC by 4
2843	stw	%o3, [%o1 - 8]
2844	lduwa	[%o0]ASI_USER, %o3
2845	add	%o0, 4, %o0		! advance SRC by 4
2846	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
2847	  stw	%o3, [%o1 - 4]
2848	addcc	%o2, 12, %o2		! restore count to word offset
2849	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
2850	  nop
2851.ci_med_word2:
2852	lduwa	[%o0]ASI_USER, %o3	! read word
2853	subcc	%o2, 4, %o2		! reduce count by 4
2854	stw	%o3, [%o1]		! write word
2855	add	%o0, 4, %o0		! advance SRC by 4
2856	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
2857	  add	%o1, 4, %o1		! advance DST by 4
2858.ci_med_wextra:
2859	addcc	%o2, 3, %o2		! restore rest of count
2860	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2861	  deccc	%o2
2862	bz,pt	%ncc, .ci_sm_byte	! exactly one byte left
2863	  nop
2864	ba,pt	%ncc, .ci_sm_half	! finish with halfword/byte tail
2865	  nop
2866
2867	.align 16
2868	nop				! instruction alignment
2869					! see discussion at start of file
2870.ci_med_half:
2871	btst	1, %o0			! check for
2872	bz,pt	%ncc, .ci_med_half1	! half word alignment
2873	  nop
2874	lduba	[%o0]ASI_USER, %o3	! load one byte
2875	inc	%o0
2876	stb	%o3,[%o1]		! store byte
2877	inc	%o1
2878	dec	%o2
2879!
2880!  Now half word aligned and have at least 38 bytes to move
2881!
2882.ci_med_half1:
2883	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2884.ci_med_hmove:
2885	lduha	[%o0]ASI_USER, %o3	! read half word
2886	subcc	%o2, 8, %o2		! reduce count by 8
2887	sth	%o3, [%o1]		! write half word
2888	add	%o0, 2, %o0		! advance SRC by 2
2889	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
2890	add	%o0, 2, %o0		! advance SRC by 2
2891	sth	%o3, [%o1 + 2]
2892	add	%o1, 8, %o1		! advance DST by 8
2893	lduha	[%o0]ASI_USER, %o3
2894	add	%o0, 2, %o0		! advance SRC by 2
2895	sth	%o3, [%o1 - 4]
2896	lduha	[%o0]ASI_USER, %o3
2897	add	%o0, 2, %o0		! advance SRC by 2
2898	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
2899	  sth	%o3, [%o1 - 2]
2900	addcc	%o2, 7, %o2		! restore count
2901	bz,pt	%ncc, .ci_sm_exit
2902	  deccc	%o2
2903	bz,pt	%ncc, .ci_sm_byte	! exactly one byte left
2904	  nop
2905	ba,pt	%ncc, .ci_sm_half	! finish with halfword/byte tail
2906	  nop
2907
	! Fault exit for the leaf path: put back the old t_lofault, recover
	! the original arguments, then defer to the copyops handler if the
	! thread has one, else fail with -1.
2908.sm_copyin_err:
2909	membar	#Sync
2910	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2911	mov	SM_SAVE_SRC, %o0
2912	mov	SM_SAVE_DST, %o1
2913	mov	SM_SAVE_COUNT, %o2
2914	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2915	tst	%o3
2916	bz,pt	%ncc, 3f			! if not, return error
2917	  nop
2918	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
2919	jmp	%o5				! original arguments
2920	  nop
29213:
2922	retl
2923	  or	%g0, -1, %o0		! return error value (-1)
2924
2925	SET_SIZE(copyin)
2926
2927
2928/*
2929 * The _more entry points are not intended to be used directly by
2930 * any caller from outside this file.  They are provided to allow
2931 * profiling and dtrace of the portions of the copy code that use
2932 * the floating point registers.
2933 * This entry is particularly important as DTRACE (at least as of
2934 * 4/2004) does not support leaf functions.
2935 */
2936
2937	ENTRY(copyin_more)
2938.copyin_more:
/*
 * Large-copy (FP register) path for copyin.  Takes a new register window,
 * records .copyin_err in REAL_LOFAULT and falls through to the shared
 * .do_copyin body below.
 */
2939	prefetch [%o0], #n_reads
2940	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2941	set	.copyin_err, REAL_LOFAULT
2942
2943/*
2944 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2945 */
2946.do_copyin:
/*
 * Shared body: .xcopyin_more and .copyin_noerr_more also branch here after
 * setting their own REAL_LOFAULT.  The previous t_lofault is kept in %l6
 * and copyio_fault is installed in its place for the duration of the copy.
 */
2947	set	copyio_fault, %l7		! .copyio_fault is lofault val
2948
2949	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2950	membar	#Sync				! sync error barrier
2951	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2952
2953	mov	%i0, SAVE_SRC
2954	mov	%i1, SAVE_DST
2955	mov	%i2, SAVE_COUNT
2956
2957	FP_NOMIGRATE(6, 7)
2958
/*
 * Save the caller's %fprs.  If FPRS_FEF was clear the FP unit was unused:
 * the annulled delay slot enables it and the register save is skipped.
 * Otherwise BST_FPQ2Q4_TOSTACK (macro defined elsewhere) runs first.
 */
2959	rd	%fprs, %o2		! check for unused fp
2960	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2961	btst	FPRS_FEF, %o2
2962	bz,a,pt	%icc, .do_blockcopyin
2963	  wr	%g0, FPRS_FEF, %fprs
2964
2965	BST_FPQ2Q4_TOSTACK(%o2)
2966
2967.do_blockcopyin:
2968	rd	%gsr, %o2
2969	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2970	or	%l6, FPUSED_FLAG, %l6
2971
/*
 * Copy leading bytes until DST is VIS_BLOCKSIZE aligned: four bytes per
 * pass in .ci_blkalign, then any remainder one byte at a time at 1:.
 */
2972	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2973	mov	ASI_USER, %asi
2974	bz,pt	%ncc, 2f
2975	  neg	TMP
2976	add	TMP, VIS_BLOCKSIZE, TMP
2977
2978	! TMP = bytes required to align DST on FP_BLOCK boundary
2979	! Using SRC as a tmp here
2980	cmp	TMP, 3
2981	bleu,pt	%ncc, 1f
2982	  sub	CNT,TMP,CNT		! adjust main count
2983	sub	TMP, 3, TMP		! adjust for end of loop test
2984.ci_blkalign:
2985	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
2986	stb	SRC, [DST]
2987	subcc	TMP, 4, TMP
2988	lduba	[REALSRC + 1]%asi, SRC
2989	add	REALSRC, 4, REALSRC
2990	stb	SRC, [DST + 1]
2991	lduba	[REALSRC - 2]%asi, SRC
2992	add	DST, 4, DST
2993	stb	SRC, [DST - 2]
2994	lduba	[REALSRC - 1]%asi, SRC
2995	bgu,pt	%ncc, .ci_blkalign
2996	  stb	SRC, [DST - 1]
2997
2998	addcc	TMP, 3, TMP		! restore count adjustment
2999	bz,pt	%ncc, 2f		! no bytes left?
3000	  nop
30011:	lduba	[REALSRC]%asi, SRC
3002	inc	REALSRC
3003	inc	DST
3004	deccc	TMP
3005	bgu	%ncc, 1b
3006	  stb	SRC, [DST - 1]
3007
30082:
3009	membar	#StoreLoad
3010	andn	REALSRC, 0x7, SRC
3011
3012	! SRC - 8-byte aligned
3013	! DST - 64-byte aligned
/*
 * Prime the pipeline: load the first eight 8-byte pieces, realign them
 * with faligndata into %f48 and up, and start the load of the next block.
 */
3014	ldda	[SRC]%asi, %f16
3015	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
3016	alignaddr REALSRC, %g0, %g0
3017	ldda	[SRC + 0x08]%asi, %f18
3018	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
3019	faligndata %f16, %f18, %f48
3020	ldda	[SRC + 0x10]%asi, %f20
3021	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3022	faligndata %f18, %f20, %f50
3023	ldda	[SRC + 0x18]%asi, %f22
3024	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3025	faligndata %f20, %f22, %f52
3026	ldda	[SRC + 0x20]%asi, %f24
3027	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
3028	faligndata %f22, %f24, %f54
3029	ldda	[SRC + 0x28]%asi, %f26
3030	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
3031	faligndata %f24, %f26, %f56
3032	ldda	[SRC + 0x30]%asi, %f28
3033	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
3034	faligndata %f26, %f28, %f58
3035	ldda	[SRC + 0x38]%asi, %f30
3036	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3037	sub	CNT, VIS_BLOCKSIZE, CNT
3038	add	SRC, VIS_BLOCKSIZE, SRC
3039	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
3040	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3041	ba,pt	%ncc, 1f
3042	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
/*
 * Main loop: each pass finishes the pending block (%f60, %f62), writes it
 * out with one block store, and loads/realigns the next block.  It
 * repeats while CNT is greater than VIS_BLOCKSIZE + 8.
 */
3043	.align	32
30441:
3045	ldda	[SRC + 0x08]%asi, %f18
3046	faligndata %f28, %f30, %f60
3047	ldda	[SRC + 0x10]%asi, %f20
3048	faligndata %f30, %f16, %f62
3049	stda	%f48, [DST]ASI_BLK_P
3050	ldda	[SRC + 0x18]%asi, %f22
3051	faligndata %f16, %f18, %f48
3052	ldda	[SRC + 0x20]%asi, %f24
3053	faligndata %f18, %f20, %f50
3054	ldda	[SRC + 0x28]%asi, %f26
3055	faligndata %f20, %f22, %f52
3056	ldda	[SRC + 0x30]%asi, %f28
3057	faligndata %f22, %f24, %f54
3058	sub	CNT, VIS_BLOCKSIZE, CNT
3059	ldda	[SRC + 0x38]%asi, %f30
3060	faligndata %f24, %f26, %f56
3061	add	DST, VIS_BLOCKSIZE, DST
3062	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3063	faligndata %f26, %f28, %f58
3064	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3065	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3066	add	SRC, VIS_BLOCKSIZE, SRC
3067	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3068	cmp	CNT, VIS_BLOCKSIZE + 8
3069	bgu,pt	%ncc, 1b
3070	  prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3071
/*
 * Tail selection: when exactly VIS_BLOCKSIZE bytes remain and REALSRC is
 * 8-byte aligned, 2: moves the last two blocks with fsrc1 (no realignment).
 * Otherwise 3: stores the pending block and the rest is copied bytewise.
 */
3072	! only if REALSRC & 0x7 is 0
3073	cmp	CNT, VIS_BLOCKSIZE
3074	bne	%ncc, 3f
3075	  andcc	REALSRC, 0x7, %g0
3076	bz,pt	%ncc, 2f
3077	  nop
30783:
3079	faligndata %f28, %f30, %f60
3080	faligndata %f30, %f16, %f62
3081	stda	%f48, [DST]ASI_BLK_P
3082	add	DST, VIS_BLOCKSIZE, DST
3083	ba,pt	%ncc, 3f
3084	  nop
30852:
3086	ldda	[SRC + 0x08]%asi, %f18
3087	fsrc1	%f28, %f60
3088	ldda	[SRC + 0x10]%asi, %f20
3089	fsrc1	%f30, %f62
3090	stda	%f48, [DST]ASI_BLK_P
3091	ldda	[SRC + 0x18]%asi, %f22
3092	fsrc1	%f16, %f48
3093	ldda	[SRC + 0x20]%asi, %f24
3094	fsrc1	%f18, %f50
3095	ldda	[SRC + 0x28]%asi, %f26
3096	fsrc1	%f20, %f52
3097	ldda	[SRC + 0x30]%asi, %f28
3098	fsrc1	%f22, %f54
3099	ldda	[SRC + 0x38]%asi, %f30
3100	fsrc1	%f24, %f56
3101	sub	CNT, VIS_BLOCKSIZE, CNT
3102	add	DST, VIS_BLOCKSIZE, DST
3103	add	SRC, VIS_BLOCKSIZE, SRC
3104	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3105	fsrc1	%f26, %f58
3106	fsrc1	%f28, %f60
3107	fsrc1	%f30, %f62
3108	stda	%f48, [DST]ASI_BLK_P
3109	add	DST, VIS_BLOCKSIZE, DST
3110	ba,a,pt	%ncc, 4f
3111	  nop
3112
31133:	tst	CNT
3114	bz,a	%ncc, 4f
3115	  nop
3116
31175:	lduba	[REALSRC]ASI_USER, TMP
3118	inc	REALSRC
3119	inc	DST
3120	deccc	CNT
3121	bgu	%ncc, 5b
3122	  stb	TMP, [DST - 1]
31234:
3124
3125.copyin_exit:
/*
 * Common exit: restore %gsr, then either reload the saved FP registers
 * (FP was in use on entry) or zero them with FZEROQ2Q4, restore %fprs,
 * put the old t_lofault back with FPUSED_FLAG cleared, and return 0.
 */
3126	membar	#Sync
3127
3128	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
3129	wr	%o2, 0, %gsr
3130
3131	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3132	btst	FPRS_FEF, %o3
3133	bz,pt	%icc, 4f
3134	  nop
3135
3136	BLD_FPQ2Q4_FROMSTACK(%o2)
3137
3138	ba,pt	%ncc, 1f
3139	  wr	%o3, 0, %fprs		! restore fprs
3140
31414:
3142	FZEROQ2Q4
3143	wr	%o3, 0, %fprs		! restore fprs
3144
31451:
3146	membar	#Sync				! sync error barrier
3147	andn	%l6, FPUSED_FLAG, %l6
3148	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3149	FP_ALLOWMIGRATE(5, 6)
3150	ret
3151	  restore	%g0, 0, %o0
3152/*
3153 * We got here because of a fault during copyin
3154 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3155 */
3156.copyin_err:
3157	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3158	tst	%o4
3159	bz,pt	%ncc, 2f			! if not, return error
3160	nop
3161	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
3162	jmp	%g2				! original arguments
3163	restore %g0, 0, %g0			! dispose of copy window
31642:
3165	ret
3166	restore %g0, -1, %o0			! return error value
3167
3168
3169	SET_SIZE(copyin_more)
3170
3171#endif	/* lint */
3172
3173#ifdef	lint
3174
3175/*ARGSUSED*/
3176int
3177xcopyin(const void *uaddr, void *kaddr, size_t count)
3178{ return (0); }
3179
3180#else	/* lint */
3181
3182	ENTRY(xcopyin)
3183
/*
 * Path selection.  Copies of VIS_COPY_THRESHOLD bytes or fewer always take
 * .xcopyin_small.  For larger copies %o3 = src ^ dst picks the limit to
 * test: low three bits clear -> hw_copy_limit_8, bit 0 set ->
 * hw_copy_limit_1, low two bits clear -> hw_copy_limit_4, otherwise
 * hw_copy_limit_2.  A zero limit, or a count not above the limit, takes
 * .xcopyin_small; anything else goes to .xcopyin_more.
 */
3184	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3185	bleu,pt	%ncc, .xcopyin_small		! go to larger cases
3186	  xor	%o0, %o1, %o3			! are src, dst alignable?
3187	btst	7, %o3				!
3188	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
3189	  nop
3190	btst	1, %o3				!
3191	bz,pt	%ncc, .xcopyin_2		! check for half-word
3192	  nop
3193	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3194	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3195	tst	%o3
3196	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3197	  cmp	%o2, %o3			! if length <= limit
3198	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3199	  nop
3200	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3201	  nop
3202.xcopyin_2:
3203	btst	3, %o3				!
3204	bz,pt	%ncc, .xcopyin_4		! check for word alignment
3205	  nop
3206	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3207	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3208	tst	%o3
3209	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3210	  cmp	%o2, %o3			! if length <= limit
3211	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3212	  nop
3213	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3214	  nop
3215.xcopyin_4:
3216	! already checked longword, must be word aligned
3217	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3218	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3219	tst	%o3
3220	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3221	  cmp	%o2, %o3			! if length <= limit
3222	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3223	  nop
3224	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3225	  nop
3226.xcopyin_8:
3227	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3228	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3229	tst	%o3
3230	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3231	  cmp	%o2, %o3			! if length <= limit
3232	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3233	  nop
3234	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3235	  nop
3236
3237.xcopyin_small:
/*
 * Leaf path: remember the current t_lofault in %o4, install
 * .sm_xcopyin_err, and join the small copyin code at .sm_do_copyin
 * (defined outside this section).
 */
3238	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3239	or	%o5, %lo(.sm_xcopyin_err), %o5
3240	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofaul
3241	membar	#Sync				! sync error barrier
3242	ba,pt	%ncc, .sm_do_copyin		! common code
3243	  stn	%o5, [THREAD_REG + T_LOFAULT]
3244
3245.xcopyin_more:
/*
 * FP path: new window, REAL_LOFAULT = .xcopyin_err, then the shared
 * .do_copyin body.
 */
3246	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3247	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
3248	ba,pt	%ncc, .do_copyin
3249	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3250
3251/*
3252 * We got here because of fault during xcopyin
3253 * Errno value is in ERRNO
3254 */
3255.xcopyin_err:
3256	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3257	tst	%o4
3258	bz,pt	%ncc, 2f			! if not, return error
3259	  nop
3260	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
3261	jmp	%g2				! original arguments
3262	  restore %g0, 0, %g0			! dispose of copy window
32632:
3264        ret
3265	  restore ERRNO, 0, %o0			! return errno value
3266
3267.sm_xcopyin_err:
/*
 * Fault on the leaf path: restore t_lofault from %o4 and reload the
 * original arguments from SM_SAVE_SRC/DST/COUNT.  With a copyops vector
 * present, jump to its CP_XCOPYIN entry; otherwise return %g1.
 */
3268
3269	membar	#Sync
3270	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3271	mov	SM_SAVE_SRC, %o0
3272	mov	SM_SAVE_DST, %o1
3273	mov	SM_SAVE_COUNT, %o2
3274	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
3275	tst	%o3
3276	bz,pt	%ncc, 3f			! if not, return error
3277	  nop
3278	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
3279	jmp	%o5				! original arguments
3280	  nop
32813:
3282	retl
3283	  or	%g1, 0, %o0		! return errno value
3284
3285	SET_SIZE(xcopyin)
3286
3287#endif	/* lint */
3288
3289#ifdef	lint
3290
3291/*ARGSUSED*/
3292int
3293xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3294{ return (0); }
3295
3296#else	/* lint */
3297
3298	ENTRY(xcopyin_little)
/*
 * Byte-reversing copy from user space: the loop reads src[count-1] first
 * and walks backwards while writing dst forwards, so dst[i] receives
 * src[count-1-i].  Loads use ASI_AIUSL.  .xcopyio_err is installed as
 * t_lofault for the duration; the previous value is held in %o5.
 * Returns 0, or the value in %g1 if .xcopyio_err is entered.
 */
3299	sethi	%hi(.xcopyio_err), %o5
3300	or	%o5, %lo(.xcopyio_err), %o5
3301	ldn	[THREAD_REG + T_LOFAULT], %o4
3302	membar	#Sync				! sync error barrier
3303	stn	%o5, [THREAD_REG + T_LOFAULT]
3304	mov	%o4, %o5
3305
/*
 * %o3 = -count is the running index (counts up to zero).  %o0 is biased
 * to src + 2*count - 1 and drops by 2 per pass, so %o0 + %o3 steps back
 * one byte each time; %o1 + %o3 steps forward from dst.
 */
3306	subcc	%g0, %o2, %o3
3307	add	%o0, %o2, %o0
3308	bz,pn	%ncc, 2f		! check for zero bytes
3309	  sub	%o2, 1, %o4
3310	add	%o0, %o4, %o0		! start w/last byte
3311	add	%o1, %o2, %o1
3312	lduba	[%o0 + %o3]ASI_AIUSL, %o4
3313
33141:	stb	%o4, [%o1 + %o3]
3315	inccc	%o3
3316	sub	%o0, 2, %o0		! get next byte
3317	bcc,a,pt %ncc, 1b
3318	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
3319
33202:
3321	membar	#Sync				! sync error barrier
3322	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3323	retl
3324	  mov	%g0, %o0		! return (0)
3325
3326.xcopyio_err:
3327	membar	#Sync				! sync error barrier
3328	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3329	retl
3330	  mov	%g1, %o0
3331
3332	SET_SIZE(xcopyin_little)
3333
3334#endif	/* lint */
3335
3336
3337/*
3338 * Copy a block of storage - must not overlap (from + len <= to).
3339 * No fault handler installed (to be called under on_fault())
3340 */
3341#if defined(lint)
3342
3343/* ARGSUSED */
3344void
3345copyin_noerr(const void *ufrom, void *kto, size_t count)
3346{}
3347
3348#else	/* lint */
3349	ENTRY(copyin_noerr)
3350
/*
 * Path selection: counts of VIS_COPY_THRESHOLD or fewer, counts at or
 * below the hw_copy_limit_N chosen from the low bits of src ^ dst, and
 * any case where that limit is zero go to .copyin_ne_small.  Everything
 * else goes to .copyin_noerr_more.
 */
3351	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3352	bleu,pt	%ncc, .copyin_ne_small		! go to larger cases
3353	  xor	%o0, %o1, %o3			! are src, dst alignable?
3354	btst	7, %o3				!
3355	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
3356	  nop
3357	btst	1, %o3				!
3358	bz,pt	%ncc, .copyin_ne_2		! check for half-word
3359	  nop
3360	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3361	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3362	tst	%o3
3363	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3364	  cmp	%o2, %o3			! if length <= limit
3365	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3366	  nop
3367	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3368	  nop
3369.copyin_ne_2:
3370	btst	3, %o3				!
3371	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
3372	  nop
3373	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3374	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3375	tst	%o3
3376	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3377	  cmp	%o2, %o3			! if length <= limit
3378	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3379	  nop
3380	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3381	  nop
3382.copyin_ne_4:
3383	! already checked longword, must be word aligned
3384	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3385	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3386	tst	%o3
3387	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3388	  cmp	%o2, %o3			! if length <= limit
3389	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3390	  nop
3391	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3392	  nop
3393.copyin_ne_8:
3394	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3395	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3396	tst	%o3
3397	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3398	  cmp	%o2, %o3			! if length <= limit
3399	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3400	  nop
3401	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3402	  nop
3403
3404.copyin_ne_small:
/*
 * Leaf path.  If t_lofault is currently zero, no handler is installed and
 * the copy starts right away; otherwise .sm_copyio_noerr is installed and
 * the previous handler stays in %o4.
 */
3405	ldn	[THREAD_REG + T_LOFAULT], %o4
3406	tst	%o4
3407	bz,pn	%ncc, .sm_do_copyin
3408	  nop
3409	sethi	%hi(.sm_copyio_noerr), %o5
3410	or	%o5, %lo(.sm_copyio_noerr), %o5
3411	membar	#Sync				! sync error barrier
3412	ba,pt	%ncc, .sm_do_copyin
3413	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3414
3415.copyin_noerr_more:
3416	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3417	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3418	ba,pt	%ncc, .do_copyin
3419	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3420
/*
 * Fault on the FP path: jump to the address held in %l6, dropping this
 * window in the delay slot.  NOTE(review): %l6 is presumably the caller's
 * saved t_lofault as cleaned up by copyio_fault -- confirm there.
 */
3421.copyio_noerr:
3422	jmp	%l6
3423	  restore %g0,0,%g0
3424
/*
 * Fault on the leaf path: put the previous t_lofault (%o4) back and
 * jump to it.
 */
3425.sm_copyio_noerr:
3426	membar	#Sync
3427	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
3428	jmp	%o4
3429	  nop
3430
3431	SET_SIZE(copyin_noerr)
3432#endif /* lint */
3433
3434/*
3435 * Copy a block of storage - must not overlap (from + len <= to).
3436 * No fault handler installed (to be called under on_fault())
3437 */
3438
3439#if defined(lint)
3440
3441/* ARGSUSED */
3442void
3443copyout_noerr(const void *kfrom, void *uto, size_t count)
3444{}
3445
3446#else	/* lint */
3447	ENTRY(copyout_noerr)
3448
/*
 * Path selection: counts of VIS_COPY_THRESHOLD or fewer, counts at or
 * below the hw_copy_limit_N chosen from the low bits of src ^ dst, and
 * any case where that limit is zero go to .copyout_ne_small.  Everything
 * else goes to .copyout_noerr_more.
 */
3449	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3450	bleu,pt	%ncc, .copyout_ne_small		! go to larger cases
3451	  xor	%o0, %o1, %o3			! are src, dst alignable?
3452	btst	7, %o3				!
3453	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
3454	  nop
3455	btst	1, %o3				!
3456	bz,pt	%ncc, .copyout_ne_2		! check for half-word
3457	  nop
3458	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3459	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3460	tst	%o3
3461	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3462	  cmp	%o2, %o3			! if length <= limit
3463	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3464	  nop
3465	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3466	  nop
3467.copyout_ne_2:
3468	btst	3, %o3				!
3469	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
3470	  nop
3471	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3472	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3473	tst	%o3
3474	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3475	  cmp	%o2, %o3			! if length <= limit
3476	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3477	  nop
3478	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3479	  nop
3480.copyout_ne_4:
3481	! already checked longword, must be word aligned
3482	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3483	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3484	tst	%o3
3485	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3486	  cmp	%o2, %o3			! if length <= limit
3487	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3488	  nop
3489	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3490	  nop
3491.copyout_ne_8:
3492	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3493	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3494	tst	%o3
3495	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3496	  cmp	%o2, %o3			! if length <= limit
3497	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3498	  nop
3499	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3500	  nop
3501
3502.copyout_ne_small:
/*
 * Leaf path.  If t_lofault is currently zero, no handler is installed and
 * the copy starts right away at .sm_do_copyout; otherwise
 * .sm_copyio_noerr is installed (in the branch delay slot) and the
 * previous handler stays in %o4.
 */
3503	ldn	[THREAD_REG + T_LOFAULT], %o4
3504	tst	%o4
3505	bz,pn	%ncc, .sm_do_copyout
3506	  nop
3507	sethi	%hi(.sm_copyio_noerr), %o5
3508	or	%o5, %lo(.sm_copyio_noerr), %o5
3509	membar	#Sync				! sync error barrier
3510	ba,pt	%ncc, .sm_do_copyout
3511	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3512
3513.copyout_noerr_more:
/*
 * FP path: new window, REAL_LOFAULT = .copyio_noerr, then the shared
 * .do_copyout body (defined outside this section).
 */
3514	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3515	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3516	ba,pt	%ncc, .do_copyout
3517	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3518
3519	SET_SIZE(copyout_noerr)
3520#endif /* lint */
3521
3522
3523/*
3524 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3525 * at least 256 bytes in length using spitfire's block stores.  If
3526 * the criteria for using this routine are not met then it calls bzero
3527 * and returns 1.  Otherwise 0 is returned indicating success.
3528 * Caller is responsible for ensuring use_hw_bzero is true and that
3529 * kpreempt_disable() has been called.
3530 */
3531#ifdef lint
3532/*ARGSUSED*/
3533int
3534hwblkclr(void *addr, size_t len)
3535{
3536	return(0);
3537}
3538#else /* lint */
3539	! %i0 - start address
3540	! %i1 - length of region (multiple of 64)
3541	! %l0 - saved fprs
3542	! %l1 - pointer to saved %d0 block
3543	! %l2 - saved curthread->t_lwp
3544
3545	ENTRY(hwblkclr)
3546	! get another window w/space for one aligned block of saved fpregs
3547	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3548
/*
 * Argument checks.  A failed check reaches 1: below, which calls bzero
 * on the region and returns 1.  Only when all three pass does control
 * branch to 2:.
 */
3549	! Must be block-aligned
3550	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
3551	bnz,pn	%ncc, 1f
3552	  nop
3553
3554	! ... and must be 256 bytes or more
3555	cmp	%i1, 256
3556	blu,pn	%ncc, 1f
3557	  nop
3558
3559	! ... and length must be a multiple of VIS_BLOCKSIZE
3560	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
3561	bz,pn	%ncc, 2f
3562	  nop
3563
35641:	! punt, call bzero but notify the caller that bzero was used
3565	mov	%i0, %o0
3566	call	bzero
3567	mov	%i1, %o1
3568	ret
3569	  restore	%g0, 1, %o0 ! return (1) - did not use block operations
3570
/*
 * %l0 keeps the caller's %fprs.  If the FP unit was in use, the block
 * starting at %d0 is saved to a VIS_BLOCKSIZE-aligned slot inside this
 * frame (%l1) before those registers are overwritten with zeroes.
 */
35712:	rd	%fprs, %l0		! check for unused fp
3572	btst	FPRS_FEF, %l0
3573	bz,pt	%icc, 1f
3574	  nop
3575
3576	! save in-use fpregs on stack
3577	membar	#Sync
3578	add	%fp, STACK_BIAS - 65, %l1
3579	and	%l1, -VIS_BLOCKSIZE, %l1
3580	stda	%d0, [%l1]ASI_BLK_P
3581
35821:	membar	#StoreStore|#StoreLoad|#LoadStore
3583	wr	%g0, FPRS_FEF, %fprs
3584	wr	%g0, ASI_BLK_P, %asi
3585
3586	! Clear block
3587	fzero	%d0
3588	fzero	%d2
3589	fzero	%d4
3590	fzero	%d6
3591	fzero	%d8
3592	fzero	%d10
3593	fzero	%d12
3594	fzero	%d14
3595
3596	mov	256, %i3
3597	ba,pt	%ncc, .pz_doblock
3598	  nop
3599
/*
 * Clearing loop.  While 256 or more bytes remain, four block stores are
 * issued per pass (the first one sits in the branch delay slot).  For a
 * remainder of 64..192 bytes the code jumps backwards from .pz_zinst by
 * one instruction per remaining block, so only that many stda
 * instructions run before the pointer and length are adjusted by %i3.
 */
3600.pz_blkstart:
3601      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
3602	stda	%d0, [%i0 + 128]%asi
3603	stda	%d0, [%i0 + 64]%asi
3604	stda	%d0, [%i0]%asi
3605.pz_zinst:
3606	add	%i0, %i3, %i0
3607	sub	%i1, %i3, %i1
3608.pz_doblock:
3609	cmp	%i1, 256
3610	bgeu,a	%ncc, .pz_blkstart
3611	  stda	%d0, [%i0 + 192]%asi
3612
3613	cmp	%i1, 64
3614	blu	%ncc, .pz_finish
3615
3616	  andn	%i1, (64-1), %i3
3617	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
3618	set	.pz_zinst, %i4
3619	sub	%i4, %i2, %i4
3620	jmp	%i4
3621	  nop
3622
/*
 * Done: if the FP unit was idle on entry, just restore %fprs (annulled
 * delay slot, executed only when the branch is taken); otherwise reload
 * the saved block from %l1 first.  Returns 0 on this path.
 */
3623.pz_finish:
3624	membar	#Sync
3625	btst	FPRS_FEF, %l0
3626	bz,a	.pz_finished
3627	  wr	%l0, 0, %fprs		! restore fprs
3628
3629	! restore fpregs from stack
3630	ldda	[%l1]ASI_BLK_P, %d0
3631	membar	#Sync
3632	wr	%l0, 0, %fprs		! restore fprs
3633
3634.pz_finished:
3635	ret
3636	  restore	%g0, 0, %o0		! return (bzero or not)
3637
3638	SET_SIZE(hwblkclr)
3639#endif	/* lint */
3640
3641#ifdef lint
3642/*ARGSUSED*/
3643void
3644hw_pa_bcopy32(uint64_t src, uint64_t dst)
3645{}
3646#else /*!lint */
3647	/*
3648	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3649	 * using physical addresses.
3650	 */
3651	ENTRY_NP(hw_pa_bcopy32)
/*
 * Interrupts are disabled (PSTATE_IE cleared) around the copy; the
 * original %pstate is kept in %g1 and written back in the retl delay
 * slot.  The four 8-byte loads through ASI_MEM are separated from the
 * four stores by a membar #Sync.  %o0 and %o1 are advanced as the copy
 * proceeds; %g1, %g2 and %o2-%o5 are used as scratch.
 */
3652	rdpr	%pstate, %g1
3653	andn	%g1, PSTATE_IE, %g2
3654	wrpr	%g0, %g2, %pstate
3655
/* NOTE(review): result discarded into %g0; purpose not shown here -- confirm */
3656	rdpr	%pstate, %g0
3657	ldxa	[%o0]ASI_MEM, %o2
3658	add	%o0, 8, %o0
3659	ldxa	[%o0]ASI_MEM, %o3
3660	add	%o0, 8, %o0
3661	ldxa	[%o0]ASI_MEM, %o4
3662	add	%o0, 8, %o0
3663	ldxa	[%o0]ASI_MEM, %o5
3664	membar	#Sync
3665
3666	stxa	%o2, [%o1]ASI_MEM
3667	add	%o1, 8, %o1
3668	stxa	%o3, [%o1]ASI_MEM
3669	add	%o1, 8, %o1
3670	stxa	%o4, [%o1]ASI_MEM
3671	add	%o1, 8, %o1
3672	stxa	%o5, [%o1]ASI_MEM
3673
3674	retl
3675	  wrpr	  %g0, %g1, %pstate
3676
3677	SET_SIZE(hw_pa_bcopy32)
3678
3679#endif /* lint */
3680
3681#if defined(lint)
3682
3683int use_hw_bcopy = 1;
3684int use_hw_bzero = 1;
3685uint_t hw_copy_limit_1 = 0;
3686uint_t hw_copy_limit_2 = 0;
3687uint_t hw_copy_limit_4 = 0;
3688uint_t hw_copy_limit_8 = 0;
3689
3690#else /* !lint */
3691
/*
 * Tunables.  use_hw_bcopy and use_hw_bzero default to 1; the four
 * hw_copy_limit_N words default to 0.  The copy entry points in this
 * file treat a zero limit as "do not use the FP copy path" for that
 * alignment class, and otherwise use the FP path only for counts above
 * the limit.
 */
3692	DGDEF(use_hw_bcopy)
3693	.word	1
3694	DGDEF(use_hw_bzero)
3695	.word	1
3696	DGDEF(hw_copy_limit_1)
3697	.word	0
3698	DGDEF(hw_copy_limit_2)
3699	.word	0
3700	DGDEF(hw_copy_limit_4)
3701	.word	0
3702	DGDEF(hw_copy_limit_8)
3703	.word	0
3704
/* Switch back to the text section, 64-byte aligned, after the data above. */
3705	.align	64
3706	.section ".text"
3707#endif /* !lint */
3708