xref: /titanic_44/usr/src/uts/sun4u/cpu/cheetah_copy.s (revision 24db46411fd54f70c35b94bb952eb7ba040e43b4)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma	ident	"%Z%%M%	%I%	%E% SMI"
28
29#include <sys/param.h>
30#include <sys/errno.h>
31#include <sys/asm_linkage.h>
32#include <sys/vtrace.h>
33#include <sys/machthread.h>
34#include <sys/clock.h>
35#include <sys/asi.h>
36#include <sys/fsr.h>
37#include <sys/privregs.h>
38#include <sys/fpras_impl.h>
39
40#if !defined(lint)
41#include "assym.h"
42#endif	/* lint */
43
44/*
45 * Pseudo-code to aid in understanding the control flow of the
46 * bcopy/copyin/copyout routines.
47 *
48 * On entry:
49 *
50 * 	! Determine whether to use the FP register version
51 * 	! or the leaf routine version depending on size
52 * 	! of copy and flags.  Set up error handling accordingly.
53 *	! The transition point depends on whether the src and
54 * 	! dst addresses can be aligned to long word, word,
55 * 	! half word, or byte boundaries.
56 *	!
57 *	! WARNING: <Register usage convention>
58 *	! For FP version, %l6 holds previous error handling and
59 *	! a flag: TRAMP_FLAG (low bits)
60 *	! for leaf routine version, %o4 holds those values.
61 *	! So either %l6 or %o4 is reserved and not available for
62 *	! any other use.
63 *
64 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
65 * 		go to small_copy;		! to speed short copies
66 *
67 * 	if (src,dst long word alignable) {
68 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
69 * 			go to small_copy;
70 *		if (length <= hw_copy_limit_8)
71 * 			go to small_copy;
72 * 		go to FPBLK_copy;
73 * 	}
74 * 	if (src,dst not alignable) {
75 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
76 * 			go to small_copy;
77 *		if (length <= hw_copy_limit_1)
78 * 			go to small_copy;
79 * 		go to FPBLK_copy;
80 * 	}
81 * 	if (src,dst halfword alignable) {
82 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
83 * 			go to small_copy;
84 *		if (length <= hw_copy_limit_2)
85 * 			go to small_copy;
86 * 		go to FPBLK_copy;
87 * 	}
88 * 	if (src,dst word alignable) {
89 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
90 * 			go to small_copy;
91 *		if (length <= hw_copy_limit_4)
92 * 			go to small_copy;
93 * 		go to FPBLK_copy;
94 * 	}
95 *
96 * small_copy:
97 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
98 *
99 *	if (count <= 3)				! fast path for tiny copies
100 *		go to sm_left;			! special finish up code
101 *	else
102 *		if (count > CHKSIZE)		! medium sized copies
103 *			go to sm_med		! tuned by alignment
104 *		if(src&dst not both word aligned) {
105 *	sm_movebytes:
106 *			move byte by byte in 4-way unrolled loop
107 *			fall into sm_left;
108 *	sm_left:
109 *			move 0-3 bytes byte at a time as needed.
110 *			restore error handler and exit.
111 *
112 * 		} else {	! src&dst are word aligned
113 *			check for at least 8 bytes left,
114 *			move word at a time, unrolled by 2
115 *			when fewer than 8 bytes left,
116 *	sm_half:	move half word at a time while 2 or more bytes left
117 *	sm_byte:	move final byte if necessary
118 *	sm_exit:
119 *			restore error handler and exit.
120 *		}
121 *
122 * ! Medium length cases with at least CHKSIZE bytes available
123 * ! method: line up src and dst as best possible, then
124 * ! move data in 4-way unrolled loops.
125 *
126 * sm_med:
127 *	if(src&dst unalignable)
128 * 		go to sm_movebytes
129 *	if(src&dst halfword alignable)
130 *		go to sm_movehalf
131 *	if(src&dst word alignable)
132 *		go to sm_moveword
133 * ! fall into long word movement
134 *	move bytes until src is word aligned
135 *	if not long word aligned, move a word
136 *	move long words in 4-way unrolled loop until < 32 bytes left
137 *      move long words in 1-way unrolled loop until < 8 bytes left
138 *	if zero bytes left, goto sm_exit
139 *	if one byte left, go to sm_byte
140 *	else go to sm_half
141 *
142 * sm_moveword:
143 *	move bytes until src is word aligned
144 *	move words in 4-way unrolled loop until < 16 bytes left
145 *      move words in 1-way unrolled loop until < 4 bytes left
146 *	if zero bytes left, goto sm_exit
147 *	if one byte left, go to sm_byte
148 *	else go to sm_half
149 *
150 * sm_movehalf:
151 *	move a byte if needed to align src on halfword
152 *	move halfwords in 4-way unrolled loop until < 8 bytes left
153 *	if zero bytes left, goto sm_exit
154 *	if one byte left, go to sm_byte
155 *	else go to sm_half
156 *
157 *
158 * FPBLK_copy:
159 * 	%l6 = curthread->t_lofault;
160 * 	if (%l6 != NULL) {
161 * 		membar #Sync
162 * 		curthread->t_lofault = .copyerr;
163 * 		caller_error_handler = TRUE             ! %l6 |= 2
164 * 	}
165 *
166 *	! for FPU testing we must not migrate cpus
167 * 	if (curthread->t_lwp == NULL) {
168 *		! Kernel threads do not have pcb's in which to store
169 *		! the floating point state, so disallow preemption during
170 *		! the copy.  This also prevents cpu migration.
171 * 		kpreempt_disable(curthread);
172 *	} else {
173 *		thread_nomigrate();
174 *	}
175 *
176 * 	old_fprs = %fprs;
177 * 	old_gsr = %gsr;
178 * 	if (%fprs.fef) {
179 * 		%fprs.fef = 1;
180 * 		save current fpregs on stack using blockstore
181 * 	} else {
182 * 		%fprs.fef = 1;
183 * 	}
184 *
185 *
186 * 	do_blockcopy_here;
187 *
188 * In lofault handler:
189 *	curthread->t_lofault = .copyerr2;
190 *	Continue on with the normal exit handler
191 *
192 * On normal exit:
193 * 	%gsr = old_gsr;
194 * 	if (old_fprs & FPRS_FEF)
195 * 		restore fpregs from stack using blockload
196 *	else
197 *		zero fpregs
198 * 	%fprs = old_fprs;
199 * 	membar #Sync
200 * 	curthread->t_lofault = (%l6 & ~3);
201 *	! following test omitted from copyin/copyout as they
202 *	! will always have a current thread
203 * 	if (curthread->t_lwp == NULL)
204 *		kpreempt_enable(curthread);
205 *	else
206 *		thread_allowmigrate();
207 * 	return (0)
208 *
209 * In second lofault handler (.copyerr2):
210 *	We've tried to restore fp state from the stack and failed.  To
211 *	prevent from returning with a corrupted fp state, we will panic.
212 */
213
214/*
215 * Comments about optimization choices
216 *
217 * The initial optimization decision in this code is to determine
218 * whether to use the FP registers for a copy or not.  If we don't
219 * use the FP registers, we can execute the copy as a leaf routine,
220 * saving a register save and restore.  Also, less elaborate setup
221 * is required, allowing short copies to be completed more quickly.
222 * For longer copies, especially unaligned ones (where the src and
223 * dst do not align to allow simple ldx,stx operation), the FP
224 * registers allow much faster copy operations.
225 *
226 * The estimated extra cost of the FP path will vary depending on
227 * src/dst alignment, dst offset from the next 64 byte FPblock store
228 * boundary, remaining src data after the last full dst cache line is
229 * moved, whether the FP registers need to be saved, and some other
230 * minor issues.  The average additional overhead is estimated to be
231 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
232 * around 10 clocks, elaborate calculation would slow down all
233 * longer copies and only benefit a small portion of medium sized
234 * copies.  Rather than incur such cost, we chose fixed transition
235 * points for each of the alignment choices.
236 *
237 * For the inner loop, here is a comparison of the per cache line
238 * costs for each alignment when src&dst are in cache:
239 *
240 * byte aligned:  108 clocks slower for non-FPBLK
241 * half aligned:   44 clocks slower for non-FPBLK
242 * word aligned:   12 clocks slower for non-FPBLK
243 * long aligned:    4 clocks >>faster<< for non-FPBLK
244 *
245 * The long aligned loop runs faster because it does no prefetching.
246 * That wins if the data is already in cache or there is too little
247 * data to gain much benefit from prefetching.  But when there
248 * is more data and that data is not in cache, failing to prefetch
249 * can run much slower.  In addition, there is a 2 Kbyte store queue
250 * which will cause the non-FPBLK inner loop to slow for larger copies.
251 * The exact tradeoff is strongly load and application dependent, with
252 * increasing risk of a customer visible performance regression if the
253 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
254 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
255 * upper limit for the non-FPBLK code.  To minimize performance regression
256 * risk while still gaining the primary benefits of the improvements to
257 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
258 * hw_copy_limit_*.  Later experimental studies using different values
259 * of hw_copy_limit_* can be used to make further adjustments if
260 * appropriate.
261 *
262 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
263 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
264 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
265 * hw_copy_limit_8 = src and dst are longword aligned
266 *
267 * To say that src and dst are word aligned means that after
268 * some initial alignment activity of moving 0 to 3 bytes,
269 * both the src and dst will be on word boundaries so that
270 * word loads and stores may be used.
271 *
272 * Recommended initial values as of Mar 2004, includes testing
273 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar(1050MHz):
274 * hw_copy_limit_1 =  256
275 * hw_copy_limit_2 =  512
276 * hw_copy_limit_4 = 1024
277 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
278 *
279 *
280 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
281 * disabled for that alignment choice.
282 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
283 * the value of VIS_COPY_THRESHOLD is used.
284 * It is not envisioned that hw_copy_limit_? will be changed in the field
285 * It is provided to allow for disabling FPBLK copies and to allow
286 * easy testing of alternate values on future HW implementations
287 * that might have different cache sizes, clock rates or instruction
288 * timing rules.
289 *
290 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
291 * threshold to speedup all shorter copies (less than 256).  That
292 * saves an alignment test, memory reference, and enabling test
293 * for all short copies, or an estimated 24 clocks.
294 *
295 * The order in which these limits are checked does matter since each
296 * non-predicted tst and branch costs around 10 clocks.
297 * If src and dst are randomly selected addresses,
298 * 4 of 8 will not be alignable.
299 * 2 of 8 will be half word alignable.
300 * 1 of 8 will be word alignable.
301 * 1 of 8 will be long word alignable.
302 * But, tests on running kernels show that src and dst to copy code
303 * are typically not on random alignments.  Structure copies and
304 * copies of larger data sizes are often on long word boundaries.
305 * So we test the long word alignment case first, then
306 * the byte alignment, then halfword, then word alignment.
307 *
308 * Several times, tests for length are made to split the code
309 * into subcases.  These tests often allow later tests to be
310 * avoided.  For example, within the non-FPBLK copy, we first
311 * check for tiny copies of 3 bytes or less.  That allows us
312 * to use a 4-way unrolled loop for the general byte copy case
313 * without a test on loop entry.
314 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
315 * vs longer cases.  For the really short case, we don't attempt
316 * to align src and dst.  We try to minimize special case tests in
317 * the shortest loops as each test adds a significant percentage
318 * to the total time.
319 *
320 * For the medium sized cases, we allow ourselves to adjust the
321 * src and dst alignment and provide special cases for each of
322 * the four adjusted alignment cases. The CHKSIZE that was used
323 * to decide between short and medium size was chosen to be 39
324 * as that allows for the worst case of 7 bytes of alignment
325 * shift and 4 times 8 bytes for the first long word unrolling.
326 * That knowledge saves an initial test for length on entry into
327 * the medium cases.  If the general loop unrolling factor were
328 * to be increased, this number would also need to be adjusted.
329 *
330 * For all cases in the non-FPBLK code where it is known that at
331 * least 4 chunks of data are available for movement, the
332 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
333 * or 2 clocks per data element.  Due to limitations of the
334 * branch instruction on Cheetah, Jaguar, and Panther, the
335 * minimum time for a small, tight loop is 3 clocks.  So
336 * the 4-way loop runs 50% faster than the fastest non-unrolled
337 * loop.
338 *
339 * Instruction alignment is forced by use of .align 16 directives
340 * and nops which are not executed in the code.  This
341 * combination of operations shifts the alignment of following
342 * loops to ensure that loops are aligned so that their instructions
343 * fall within the minimum number of 4 instruction fetch groups.
344 * If instructions are inserted or removed between the .align
345 * instruction and the unrolled loops, then the alignment needs
346 * to be readjusted.  Misaligned loops can add a clock per loop
347 * iteration to the loop timing.
348 *
349 * In a few cases, code is duplicated to avoid a branch.  Since
350 * a non-predicted tst and branch takes 10 clocks, this savings
351 * is judged an appropriate time-space tradeoff.
352 *
353 * Within the FPBLK-code, the prefetch method in the inner
354 * loop needs to be explained as it is not standard.  Two
355 * prefetches are issued for each cache line instead of one.
356 * The primary one is at the maximum reach of 8 cache lines.
357 * Most of the time, that maximum prefetch reach gives the
358 * cache line more time to reach the processor for systems with
359 * higher processor clocks.  But, sometimes memory interference
360 * can cause that prefetch to be dropped.  Putting a second
361 * prefetch at a reach of 5 cache lines catches the drops
362 * three iterations later and shows a measured improvement
363 * in performance over any similar loop with a single prefetch.
364 * The prefetches are placed in the loop so they overlap with
365 * non-memory instructions, so that there is no extra cost
366 * when the data is already in-cache.
367 *
368 */
369
370/*
371 * Notes on preserving existing fp state and on membars.
372 *
373 * When a copyOP decides to use fp we may have to preserve existing
374 * floating point state.  It is not the caller's state that we need to
375 * preserve - the rest of the kernel does not use fp and, anyway, fp
376 * registers are volatile across a call.  Some examples:
377 *
378 *	- userland has fp state and is interrupted (device interrupt
379 *	  or trap) and within the interrupt/trap handling we use
380 *	  bcopy()
381 *	- another (higher level) interrupt or trap handler uses bcopy
382 *	  while a bcopy from an earlier interrupt is still active
383 *	- an asynchronous error trap occurs while fp state exists (in
384 *	  userland or in kernel copy) and the tl0 component of the handling
385 *	  uses bcopy
386 *	- a user process with fp state incurs a copy-on-write fault and
387 *	  hwblkpagecopy always uses fp
388 *
389 * We therefore need a per-call place in which to preserve fp state -
390 * using our stack is ideal (and since fp copy cannot be leaf optimized
391 * because of calls it makes, this is no hardship).
392 *
393 * The following membar BLD/BST discussion is Cheetah pipeline specific.
394 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
395 * nops (those semantics always apply) and #StoreLoad is implemented
396 * as a membar #Sync.
397 *
398 * It is possible that the owner of the fp state has a block load or
399 * block store still "in flight" at the time we come to preserve that
400 * state.  Block loads are blocking in Cheetah pipelines so we do not
401 * need to sync with them.  In preserving fp regs we will use block stores
402 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
403 * after storing state (so that our subsequent use of those registers
404 * does not modify them before the block stores complete);  this membar
405 * also serves to sync with block stores the owner of the fp state has
406 * initiated.
407 *
408 * When we have finished fp copy (with its repeated block stores)
409 * we must membar #Sync so that our block stores may complete before
410 * we either restore the original fp state into the fp registers or
411 * return to a caller which may initiate other fp operations that could
412 * modify the fp regs we used before the block stores complete.
413 *
414 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
415 * t_lofault is not NULL will not panic but will instead trampoline
416 * to the registered lofault handler.  There is no need for any
417 * membars for these - eg, our store to t_lofault will always be visible to
418 * ourselves and it is our cpu which will take any trap.
419 *
420 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
421 * while t_lofault is not NULL will also not panic.  Since we're copying
422 * to or from userland the extent of the damage is known - the destination
423 * buffer is incomplete.  So trap handlers will trampoline to the lofault
424 * handler in this case which should take some form of error action to
425 * avoid using the incomplete buffer.  The trap handler also flags the
426 * fault so that later return-from-trap handling (for the trap that brought
427 * this thread into the kernel in the first place) can notify the process
428 * and reboot the system (or restart the service with Greenline/Contracts).
429 *
430 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
431 * result in deferred error traps - the trap is taken sometime after
432 * the event and the trap PC may not be the PC of the faulting access.
433 * Delivery of such pending traps can be forced by a membar #Sync, acting
434 * as an "error barrier" in this role.  To accurately apply the user/kernel
435 * separation described in the preceding paragraph we must force delivery
436 * of deferred traps affecting kernel state before we install a lofault
437 * handler (if we interpose a new lofault handler on an existing one there
438 * is no need to repeat this), and we must force delivery of deferred
439 * errors affecting the lofault-protected region before we clear t_lofault.
440 * Failure to do so results in lost kernel state being interpreted as
441 * affecting a copyin/copyout only, or of an error that really only
442 * affects copy data being interpreted as losing kernel state.
443 *
444 * Since the copy operations may preserve and later restore floating
445 * point state that does not belong to the caller (see examples above),
446 * we must be careful in how we do this in order to prevent corruption
447 * of another program.
448 *
449 * To make sure that floating point state is always saved and restored
450 * correctly, the following "big rules" must be followed when the floating
451 * point registers will be used:
452 *
453 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
454 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
455 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
456 *    lofault handler was set coming in.
457 *
458 * 2. The FPUSED flag indicates that all FP state has been successfully stored
459 *    on the stack.  It should not be set until this save has been completed.
460 *
461 * 3. The FPUSED flag should not be cleared on exit until all FP state has
462 *    been restored from the stack.  If an error occurs while restoring
463 *    data from the stack, the error handler can check this flag to see if
464 *    a restore is necessary.
465 *
466 * 4. Code run under the new lofault handler must be kept to a minimum.  In
467 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
468 *    to kpreempt(), should not be made until after the lofault handler has
469 *    been restored.
470 */
471
472/*
473 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
474 * to "break even" using FP/VIS-accelerated memory operations.
475 * The FPBLK code assumes a minimum number of bytes are available
476 * to be moved on entry.  Check that code carefully before
477 * reducing VIS_COPY_THRESHOLD below 256.
478 */
479/*
480 * This shadows sys/machsystm.h which can't be included due to the lack of
481 * _ASM guards in include files it references. Change it here, change it there.
482 */
483#define VIS_COPY_THRESHOLD 256	/* bytes; kcopy takes its leaf path at or below this */
484
485/*
486 * TEST for very short copies
487 * Be aware that the maximum unroll for the short unaligned case
488 * is SHORTCOPY+1
489 */
490#define SHORTCOPY 3	/* byte count treated as a "very short" copy */
491#define CHKSIZE  39	/* 7 alignment bytes + 4 * 8 bytes of unrolled long words */
492
493/*
494 * TRAMP_FLAG indicates that we're to trampoline to the error handler.
495 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
496 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
497 */
498#define	FPUSED_FLAG	1	/* fp regs in use; set only once fp state is saved on stack */
499#define	TRAMP_FLAG	2	/* trampoline to the error handler on fault */
500#define	MASK_FLAGS	3	/* FPUSED_FLAG | TRAMP_FLAG */
501
502/*
503 * Number of outstanding prefetches.
504 * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
505 * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
506 * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
507 * of 5% for large copies as compared to a single prefetch.  The reason
508 * for the improvement is that with Cheetah and Jaguar, some prefetches
509 * are dropped due to the prefetch queue being full.  The second prefetch
510 * reduces the number of cache lines that are dropped.
511 * Do not remove the double prefetch or change either CHEETAH_PREFETCH
512 * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
513 * there is no loss of performance.
514 */
515#define	CHEETAH_PREFETCH	8	/* primary prefetch reach, in cache lines */
516#define	CHEETAH_2ND_PREFETCH	5	/* second prefetch reach, in cache lines */
517
518#define	VIS_BLOCKSIZE		64	/* bytes; alignment and size of one fp block */
519
520/*
521 * Size of stack frame in order to accommodate a 64-byte aligned
522 * floating-point register save area and 2 64-bit temp locations.
523 * All copy functions use two quadrants of fp registers; to assure a
524 * block-aligned two block buffer in which to save we must reserve
525 * three blocks on stack.  Not all functions preserve %fprs on stack
526 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
527 *
528 *    _______________________________________ <-- %fp + STACK_BIAS
529 *    | We may need to preserve 2 quadrants |
530 *    | of fp regs, but since we do so with |
531 *    | BST/BLD we need room in which to    |
532 *    | align to VIS_BLOCKSIZE bytes.  So   |
533 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
534 *    |-------------------------------------|
535 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
536 *    |-------------------------------------|
537 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
538 *    ---------------------------------------
539 */
540#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))	/* 3 blocks + %fprs and %gsr slots */
541#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)	/* distance below %fp + STACK_BIAS */
542#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)	/* subtracted before rounding down to a block boundary */
543#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)	/* 8-byte %fprs slot */
544#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)	/* 8-byte %gsr slot */
545
546/*
547 * Common macros used by the various versions of the block copy
548 * routines in this file.
549 */
550
551/*
552 * In FP copies if we do not have preserved data to restore over
553 * the fp regs we used then we must zero those regs to avoid
554 * exposing portions of the data to later threads (data security).
555 *
556 * Copy functions use either quadrants 1 and 3 or 2 and 4.
557 *
558 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
559 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
560 *
561 * The instructions below are quicker than repeated fzero instructions
562 * since they can dispatch down two fp pipelines.
563 */
564#define	FZEROQ1Q3	/* zero %f0 - %f15 and %f32 - %f47 */	\
565	fzero	%f0	/* zero operand for the ops below */	;\
566	fzero	%f2	/* zero operand for the ops below */	;\
567	faddd	%f0, %f2, %f4	/* 0 + 0; add alternates with mul */	;\
568	fmuld	%f0, %f2, %f6	/* 0 * 0; so both fp pipelines are used */	;\
569	faddd	%f0, %f2, %f8		;\
570	fmuld	%f0, %f2, %f10		;\
571	faddd	%f0, %f2, %f12		;\
572	fmuld	%f0, %f2, %f14		;\
573	faddd	%f0, %f2, %f32	/* quadrant 3 starts here */	;\
574	fmuld	%f0, %f2, %f34		;\
575	faddd	%f0, %f2, %f36		;\
576	fmuld	%f0, %f2, %f38		;\
577	faddd	%f0, %f2, %f40		;\
578	fmuld	%f0, %f2, %f42		;\
579	faddd	%f0, %f2, %f44		;\
580	fmuld	%f0, %f2, %f46
581
582#define	FZEROQ2Q4	/* zero %f16 - %f31 and %f48 - %f63 */	\
583	fzero	%f16	/* zero operand for the ops below */	;\
584	fzero	%f18	/* zero operand for the ops below */	;\
585	faddd	%f16, %f18, %f20	/* 0 + 0; add alternates with mul */	;\
586	fmuld	%f16, %f18, %f22	/* 0 * 0; so both fp pipelines are used */	;\
587	faddd	%f16, %f18, %f24	;\
588	fmuld	%f16, %f18, %f26	;\
589	faddd	%f16, %f18, %f28	;\
590	fmuld	%f16, %f18, %f30	;\
591	faddd	%f16, %f18, %f48	/* quadrant 4 starts here */	;\
592	fmuld	%f16, %f18, %f50	;\
593	faddd	%f16, %f18, %f52	;\
594	fmuld	%f16, %f18, %f54	;\
595	faddd	%f16, %f18, %f56	;\
596	fmuld	%f16, %f18, %f58	;\
597	faddd	%f16, %f18, %f60	;\
598	fmuld	%f16, %f18, %f62
599
600/*
601 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
602 * Used to save and restore in-use fp registers when we want to use FP
603 * and find fp already in use and copy size still large enough to justify
604 * the additional overhead of this save and restore.
605 *
606 * A membar #Sync is needed before save to sync fp ops initiated before
607 * the call to the copy function (by whoever has fp in use); for example
608 * an earlier block load to the quadrant we are about to save may still be
609 * "in flight".  A membar #Sync is required at the end of the save to
610 * sync our block store (the copy code is about to begin ldd's to the
611 * first quadrant).  Note, however, that since Cheetah pipeline block load
612 * is blocking we can omit the initial membar before saving fp state (they're
613 * commented below in case of future porting to a chip that does not block
614 * on block load).
615 *
616 * Similarly: a membar #Sync before restore allows the block stores of
617 * the copy operation to complete before we fill the quadrants with their
618 * original data, and a membar #Sync after restore lets the block loads
619 * of the restore complete before we return to whoever has the fp regs
620 * in use.  To avoid repeated membar #Sync we make it the responsibility
621 * of the copy code to membar #Sync immediately after copy is complete
622 * and before using the BLD_*_FROMSTACK macro.
623 */
624#if !defined(lint)
625#define BST_FPQ1Q3_TOSTACK(tmp1)	/* tmp1: scratch register, clobbered */	\
626	/* membar #Sync	*/					;\
627	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
628	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
629	stda	%f0, [tmp1]ASI_BLK_P	/* save quadrant 1 (%f0 - %f15) */	;\
630	add	tmp1, VIS_BLOCKSIZE, tmp1	/* second block of save area */	;\
631	stda	%f32, [tmp1]ASI_BLK_P	/* save quadrant 3 (%f32 - %f47) */	;\
632	membar	#Sync	/* block stores must complete before the regs are reused */
633
634#define	BLD_FPQ1Q3_FROMSTACK(tmp1)	/* tmp1: scratch register, clobbered */	\
635	/* membar #Sync - provided at copy completion */	;\
636	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
637	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
638	ldda	[tmp1]ASI_BLK_P, %f0	/* restore quadrant 1 (%f0 - %f15) */	;\
639	add	tmp1, VIS_BLOCKSIZE, tmp1	/* second block of save area */	;\
640	ldda	[tmp1]ASI_BLK_P, %f32	/* restore quadrant 3 (%f32 - %f47) */	;\
641	membar	#Sync	/* let the block loads complete before returning */
642
643#define BST_FPQ2Q4_TOSTACK(tmp1)	/* tmp1: scratch register, clobbered */	\
644	/* membar #Sync */					;\
645	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
646	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
647	stda	%f16, [tmp1]ASI_BLK_P	/* save quadrant 2 (%f16 - %f31) */	;\
648	add	tmp1, VIS_BLOCKSIZE, tmp1	/* second block of save area */	;\
649	stda	%f48, [tmp1]ASI_BLK_P	/* save quadrant 4 (%f48 - %f63) */	;\
650	membar	#Sync	/* block stores must complete before the regs are reused */
651
652#define	BLD_FPQ2Q4_FROMSTACK(tmp1)	/* tmp1: scratch register, clobbered */	\
653	/* membar #Sync - provided at copy completion */	;\
654	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
655	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
656	ldda	[tmp1]ASI_BLK_P, %f16	/* restore quadrant 2 (%f16 - %f31) */	;\
657	add	tmp1, VIS_BLOCKSIZE, tmp1	/* second block of save area */	;\
658	ldda	[tmp1]ASI_BLK_P, %f48	/* restore quadrant 4 (%f48 - %f63) */	;\
659	membar	#Sync	/* let the block loads complete before returning */
660#endif
661
662/*
663 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
664 * prevent preemption if there is no t_lwp to save FP state to on context
665 * switch) before commencing a FP copy, and reallow it on completion or
666 * in error trampoline paths when we were using FP copy.
667 *
668 * Both macros may call other functions, so be aware that all outputs are
669 * forfeit after using these macros.  For this reason we do not pass registers
670 * to use - we just use any outputs we want.
671 *
672 * For fpRAS we need to perform the fpRAS mechanism test on the same
673 * CPU as we use for the copy operation, both so that we validate the
674 * CPU we perform the copy on and so that we know which CPU failed
675 * if a failure is detected.  Hence we need to be bound to "our" CPU.
676 * This could be achieved through disabling preemption (and we have to do it
677 * that way for threads with no t_lwp) but for larger copies this may hold
678 * higher priority threads off of cpu for too long (eg, realtime).  So we
679 * make use of the lightweight t_nomigrate mechanism where we can (ie, when
680 * we have a t_lwp).
681 *
682 * Pseudo code:
683 *
684 * FP_NOMIGRATE:
685 *
686 * if (curthread->t_lwp) {
687 *	thread_nomigrate();
688 * } else {
689 *	kpreempt_disable();
690 * }
691 *
692 * FP_ALLOWMIGRATE:
693 *
694 * if (curthread->t_lwp) {
695 *	thread_allowmigrate();
696 * } else {
697 *	kpreempt_enable();
698 * }
699 */
700
701#define	FP_NOMIGRATE(label1, label2)	/* numeric local labels; clobbers %o0, %o1 */	\
702	ldn	[THREAD_REG + T_LWP], %o0	/* %o0 = curthread->t_lwp */	;\
703	brz,a,pn %o0, label1/**/f	/* no lwp: disable preemption instead */	;\
704	  ldsb	[THREAD_REG + T_PREEMPT], %o1	/* annulled unless branch is taken */	;\
705	call	thread_nomigrate				;\
706	  nop							;\
707	ba	label2/**/f	/* skip the no-lwp path */	;\
708	  nop							;\
709label1:								;\
710	inc	%o1	/* byte loaded from T_PREEMPT, plus one */	;\
711	stb	%o1, [THREAD_REG + T_PREEMPT]	/* write the incremented count back */	;\
712label2:
713
714#define	FP_ALLOWMIGRATE(label1, label2)	/* numeric local labels; clobbers %o0, %o1 */	\
715	ldn	[THREAD_REG + T_LWP], %o0	/* %o0 = curthread->t_lwp */	;\
716	brz,a,pn %o0, label1/**/f	/* no lwp: re-enable preemption instead */	;\
717	  ldsb	[THREAD_REG + T_PREEMPT], %o1	/* annulled unless branch is taken */	;\
718	call thread_allowmigrate				;\
719	  nop							;\
720	ba	label2/**/f	/* skip the no-lwp path */	;\
721	  nop							;\
722label1:								;\
723	dec	%o1	/* byte loaded from T_PREEMPT, minus one */	;\
724	brnz,pn	%o1, label2/**/f	/* count still nonzero: done */	;\
725	  stb	%o1, [THREAD_REG + T_PREEMPT]	/* delay slot: stored on both paths */	;\
726	ldn	[THREAD_REG + T_CPU], %o0	/* %o0 = pointer loaded from T_CPU */	;\
727	ldub	[%o0 + CPU_KPRUNRUN], %o0	/* %o0 = byte at CPU_KPRUNRUN */	;\
728	brz,pt	%o0, label2/**/f	/* zero: skip the kpreempt call */	;\
729	  nop							;\
730	call	kpreempt					;\
731	  rdpr	%pil, %o0	/* delay slot: current %pil is the argument */	;\
732label2:
733
734/*
735 * Copy a block of storage, returning an error code if `from' or
736 * `to' takes a kernel pagefault which cannot be resolved.
737 * Returns errno value on pagefault error, 0 if all ok
738 */
739
740#if defined(lint)
741
742/* ARGSUSED */
743int
744kcopy(const void *from, void *to, size_t count)
745{ return(0); }	/* lint-only stub; the real kcopy is the assembly in the #else branch */
746
747#else	/* lint */
748
749	.seg	".text"
750	.align	4
751
752	ENTRY(kcopy)
753
754	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
755	bleu,pt	%ncc, .kcopy_small		! go to larger cases
756	  xor	%o0, %o1, %o3			! are src, dst alignable?
757	btst	7, %o3				!
758	bz,pt	%ncc, .kcopy_8			! check for longword alignment
759	  nop
760	btst	1, %o3				!
761	bz,pt	%ncc, .kcopy_2			! check for half-word
762	  nop
763	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
764	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
765	tst	%o3
766	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
767	  cmp	%o2, %o3			! if length <= limit
768	bleu,pt	%ncc, .kcopy_small		! go to small copy
769	  nop
770	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
771	  nop
772.kcopy_2:
773	btst	3, %o3				!
774	bz,pt	%ncc, .kcopy_4			! check for word alignment
775	  nop
776	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
777	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
778	tst	%o3
779	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
780	  cmp	%o2, %o3			! if length <= limit
781	bleu,pt	%ncc, .kcopy_small		! go to small copy
782	  nop
783	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
784	  nop
785.kcopy_4:
786	! already checked longword, must be word aligned
787	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
788	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
789	tst	%o3
790	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
791	  cmp	%o2, %o3			! if length <= limit
792	bleu,pt	%ncc, .kcopy_small		! go to small copy
793	  nop
794	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
795	  nop
796.kcopy_8:
797	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
798	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
799	tst	%o3
800	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
801	  cmp	%o2, %o3			! if length <= limit
802	bleu,pt	%ncc, .kcopy_small		! go to small copy
803	  nop
804	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
805	  nop
806
807.kcopy_small:
808	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
809	or	%o5, %lo(.sm_copyerr), %o5
810	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
811	membar	#Sync				! sync error barrier
812	ba,pt	%ncc, .sm_do_copy		! common code
813	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
814
815.kcopy_more:
816	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
817	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
818	or	%l7, %lo(.copyerr), %l7
819	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
820	membar	#Sync				! sync error barrier
821	ba,pt	%ncc, .do_copy			! common code
822	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
823
824
825/*
826 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
827 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
828 */
/*
 * Register use in this handler: %l6 holds the saved t_lofault value with
 * FPUSED_FLAG/TRAMP_FLAG possibly or'ed in; %l0 first carries the address
 * of .copyerr2 and is then reused for the TRAMP_FLAG bit copied out of
 * %l6 (the "and" is in a non-annulled delay slot, so it runs on both
 * paths).  While FP state is being restored t_lofault points at
 * .copyerr2, so a nested fault ends in the panic there.
 */
829.copyerr:
830	set	.copyerr2, %l0
831	membar	#Sync				! sync error barrier
832	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
833	btst	FPUSED_FLAG, %l6
834	bz	%ncc, 1f
835	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
836
837	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
838	wr	%o2, 0, %gsr
839
	/*
	 * If the saved %fprs had FPRS_FEF set, the FP registers that were
	 * saved on the stack are reloaded; otherwise they are zeroed.
	 * Either way the saved %fprs value is written back afterwards.
	 */
840	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
841	btst	FPRS_FEF, %o3
842	bz,pt	%icc, 4f
843	  nop
844
845	BLD_FPQ1Q3_FROMSTACK(%o2)
846
847	ba,pt	%ncc, 1f
848	  wr	%o3, 0, %fprs		! restore fprs
849
8504:
851	FZEROQ1Q3
852	wr	%o3, 0, %fprs		! restore fprs
853
854	!
855	! Need to cater for the different expectations of kcopy
856	! and bcopy. kcopy will *always* set a t_lofault handler
857	! If it fires, we're expected to just return the error code
858	! and *not* to invoke any existing error handler. As far as
859	! bcopy is concerned, we only set t_lofault if there was an
860	! existing lofault handler. In that case we're expected to
861	! invoke the previously existing handler after resetting the
862	! t_lofault value.
863	!
8641:
865	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
866	membar	#Sync				! sync error barrier
867	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
868	FP_ALLOWMIGRATE(5, 6)
869
	/*
	 * No TRAMP_FLAG: return to the caller with the errno from %g1 as
	 * the result (the restore moves it into the caller's %o0).
	 */
870	btst	TRAMP_FLAG, %l0
871	bnz,pn	%ncc, 3f
872	  nop
873	ret
874	  restore	%g1, 0, %o0
875
8763:
877	!
878	! We're here via bcopy. There *must* have been an error handler
879	! in place otherwise we would have died a nasty death already.
880	!
881	jmp	%l6				! goto real handler
882	  restore	%g0, 0, %o0		! dispose of copy window
883
884/*
885 * We got here because of a fault in .copyerr.  We can't safely restore fp
886 * state, so we panic.
887 */
/*
 * The message string is placed in line with the code, hence the .align 4
 * before the handler.  Nothing follows the call, so this handler relies
 * on panic not returning; the nop only fills the call delay slot.
 */
888fp_panic_msg:
889	.asciz	"Unable to restore fp state after copy operation"
890
891	.align	4
892.copyerr2:
893	set	fp_panic_msg, %o0
894	call	panic
895	  nop
896
897/*
898 * We got here because of a fault during a small kcopy or bcopy.
899 * No floating point registers are used by the small copies.
900 * Errno value is in %g1.
901 */
/*
 * %o4 holds the t_lofault value saved on entry, with TRAMP_FLAG or'ed in
 * when bcopy found a handler already installed.  The flag is tested
 * first; the membar and andn that follow do not touch the condition
 * codes, so the bnz still sees the btst result.  The cleaned value is
 * stored back to t_lofault in the branch delay slot, i.e. on both paths.
 * Without the flag the errno is returned in %o0; with it, control jumps
 * to the previous handler with %o0 set to zero.
 */
902.sm_copyerr:
9031:
904	btst	TRAMP_FLAG, %o4
905	membar	#Sync
906	andn	%o4, TRAMP_FLAG, %o4
907	bnz,pn	%ncc, 3f
908	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
909	retl
910	  mov	%g1, %o0
9113:
912	jmp	%o4				! goto real handler
913	  mov	%g0, %o0			!
914
915	SET_SIZE(kcopy)
916#endif	/* lint */
917
918
919/*
920 * Copy a block of storage - must not overlap (from + len <= to).
921 * Registers: l6 - saved t_lofault
922 * (for short copies, o4 - saved t_lofault)
923 *
924 * Copy a page of memory.
925 * Assumes double word alignment and a count >= 256.
 *
 * NOTE(review): the "Copy a page of memory" lines above do not describe
 * the code that follows, which handles any alignment and any count
 * (including zero); they look like a leftover from another routine.
926 */
927#if defined(lint)
928
929/* ARGSUSED */
930void
931bcopy(const void *from, void *to, size_t count)
932{}
933
934#else	/* lint */
935
936	ENTRY(bcopy)
937
	/*
	 * Entry: %o0 = from, %o1 = to, %o2 = count.
	 *
	 * Dispatch mirrors kcopy: counts of VIS_COPY_THRESHOLD or less take
	 * the leaf path; larger counts pick hw_copy_limit_{1,2,4,8} from the
	 * low bits of (from ^ to).  A zero limit, or a count not above the
	 * limit, selects .bcopy_small; everything else goes to .bcopy_more.
	 * Each "cmp %o2, %o3" sits in the delay slot of the preceding bz and
	 * feeds the bleu after it.
	 */
938	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
939	bleu,pt	%ncc, .bcopy_small		! small counts take leaf path
940	  xor	%o0, %o1, %o3			! are src, dst alignable?
941	btst	7, %o3				!
942	bz,pt	%ncc, .bcopy_8			! check for longword alignment
943	  nop
944	btst	1, %o3				!
945	bz,pt	%ncc, .bcopy_2			! check for half-word
946	  nop
947	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
948	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
949	tst	%o3
950	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
951	  cmp	%o2, %o3			! if length <= limit
952	bleu,pt	%ncc, .bcopy_small		! go to small copy
953	  nop
954	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
955	  nop
956.bcopy_2:
957	btst	3, %o3				!
958	bz,pt	%ncc, .bcopy_4			! check for word alignment
959	  nop
960	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
961	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
962	tst	%o3
963	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
964	  cmp	%o2, %o3			! if length <= limit
965	bleu,pt	%ncc, .bcopy_small		! go to small copy
966	  nop
967	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
968	  nop
969.bcopy_4:
970	! already checked longword, must be word aligned
971	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
972	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
973	tst	%o3
974	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
975	  cmp	%o2, %o3			! if length <= limit
976	bleu,pt	%ncc, .bcopy_small		! go to small copy
977	  nop
978	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
979	  nop
980.bcopy_8:
981	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
982	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
983	tst	%o3
984	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
985	  cmp	%o2, %o3			! if length <= limit
986	bleu,pt	%ncc, .bcopy_small		! go to small copy
987	  nop
988	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
989	  nop
990
991	.align	16
	/*
	 * Leaf path.  Unlike kcopy, a fault handler is installed only when
	 * one was already present: the old value is kept in %o4 and tagged
	 * with TRAMP_FLAG so that .sm_copyerr chains to it.  With no handler
	 * installed %o4 stays zero and t_lofault is left untouched.
	 *
	 * t_lofault is a pointer-sized value (loaded with ldn), so the zero
	 * test uses %ncc, matching the same test in bcopy_more; %icc would
	 * look at the low 32 bits only.
	 */
992.bcopy_small:
993	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
994	tst	%o4
995	bz,pt	%ncc, .sm_do_copy
996	  nop
997	sethi	%hi(.sm_copyerr), %o5
998	or	%o5, %lo(.sm_copyerr), %o5
999	membar	#Sync				! sync error barrier
1000	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
1001	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
	/*
	 * Leaf copy shared by kcopy and bcopy: %o0 = src, %o1 = dst,
	 * %o2 = count, %o4 = saved t_lofault, %o3 = data scratch.  Every
	 * exit below clears TRAMP_FLAG from %o4, stores it to t_lofault and
	 * returns 0 in %o0.
	 *
	 * Counts up to SHORTCOPY go straight to the byte tail, counts above
	 * CHKSIZE go to .bc_med, and the rest are copied by words when
	 * (src | dst) is 4-byte aligned, else bytewise.  SHORTCOPY and
	 * CHKSIZE are defined outside this view.
	 *
	 * .bc_sm_movebytes and .bc_sm_half label instructions that sit in a
	 * branch delay slot, so those instructions run both when the branch
	 * above them is taken and when the label is reached directly.
	 */
1002.sm_do_copy:
1003	cmp	%o2, SHORTCOPY		! check for really short case
1004	bleu,pt	%ncc, .bc_sm_left	!
1005	  cmp	%o2, CHKSIZE		! check for medium length cases
1006	bgu,pn	%ncc, .bc_med		!
1007	  or	%o0, %o1, %o3		! prepare alignment check
1008	andcc	%o3, 0x3, %g0		! test for alignment
1009	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
1010.bc_sm_movebytes:
1011	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
1012.bc_sm_notalign4:
1013	ldub	[%o0], %o3		! read byte
1014	stb	%o3, [%o1]		! write byte
1015	subcc	%o2, 4, %o2		! reduce count by 4
1016	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
1017	add	%o0, 4, %o0		! advance SRC by 4
1018	stb	%o3, [%o1 + 1]
1019	ldub	[%o0 - 2], %o3
1020	add	%o1, 4, %o1		! advance DST by 4
1021	stb	%o3, [%o1 - 2]
1022	ldub	[%o0 - 1], %o3
1023	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
1024	  stb	%o3, [%o1 - 1]
1025	add	%o2, 3, %o2		! restore count
	/*
	 * Byte tail: copies zero to three bytes.  It is also the target for
	 * counts up to SHORTCOPY, so SHORTCOPY is presumably no larger than
	 * 3 - confirm against its definition.
	 */
1026.bc_sm_left:
1027	tst	%o2
1028	bz,pt	%ncc, .bc_sm_exit	! check for zero length
1029	  deccc	%o2			! reduce count for cc test
1030	ldub	[%o0], %o3		! move one byte
1031	bz,pt	%ncc, .bc_sm_exit
1032	  stb	%o3, [%o1]
1033	ldub	[%o0 + 1], %o3		! move another byte
1034	deccc	%o2			! check for more
1035	bz,pt	%ncc, .bc_sm_exit
1036	  stb	%o3, [%o1 + 1]
1037	ldub	[%o0 + 2], %o3		! move final byte
1038	stb	%o3, [%o1 + 2]
1039	membar	#Sync				! sync error barrier
1040	andn	%o4, TRAMP_FLAG, %o4
1041	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1042	retl
1043	  mov	%g0, %o0		! return 0
1044	.align	16
1045	nop				! instruction alignment
1046					! see discussion at start of file
	/*
	 * Word loop: two words (8 bytes) per iteration.  %o2 is kept at
	 * (bytes remaining - 7) so the subcc result doubles as the loop
	 * test; the addcc after the loop restores the true remainder (0-7).
	 * The remainder is finished by .bc_sm_half and .bc_sm_byte, which
	 * are also entered from the .bc_med paths.
	 */
1047.bc_sm_words:
1048	lduw	[%o0], %o3		! read word
1049.bc_sm_wordx:
1050	subcc	%o2, 8, %o2		! update count
1051	stw	%o3, [%o1]		! write word
1052	add	%o0, 8, %o0		! update SRC
1053	lduw	[%o0 - 4], %o3		! read word
1054	add	%o1, 8, %o1		! update DST
1055	bgt,pt	%ncc, .bc_sm_words	! loop til done
1056	  stw	%o3, [%o1 - 4]		! write word
1057	addcc	%o2, 7, %o2		! restore count
1058	bz,pt	%ncc, .bc_sm_exit
1059	  deccc	%o2
1060	bz,pt	%ncc, .bc_sm_byte
1061.bc_sm_half:
1062	  subcc	%o2, 2, %o2		! reduce count by 2
1063	add	%o0, 2, %o0		! advance SRC by 2
1064	lduh	[%o0 - 2], %o3		! read half word
1065	add	%o1, 2, %o1		! advance DST by 2
1066	bgt,pt	%ncc, .bc_sm_half	! loop til done
1067	  sth	%o3, [%o1 - 2]		! write half word
1068	addcc	%o2, 1, %o2		! restore count
1069	bz,pt	%ncc, .bc_sm_exit
1070	  nop
1071.bc_sm_byte:
1072	ldub	[%o0], %o3
1073	stb	%o3, [%o1]
1074	membar	#Sync				! sync error barrier
1075	andn	%o4, TRAMP_FLAG, %o4
1076	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1077	retl
1078	  mov	%g0, %o0		! return 0
1079
	/*
	 * Word-aligned entry.  %o2 arrives already reduced by 3 (the
	 * delay-slot sub at .bc_sm_movebytes), so after the subcc it holds
	 * count - 7.  With 8 or more bytes the word loop is entered at
	 * .bc_sm_wordx with the first word already loaded; otherwise one
	 * word is copied followed by up to three single bytes.
	 */
1080.bc_sm_word:
1081	subcc	%o2, 4, %o2		! update count
1082	bgt,pt	%ncc, .bc_sm_wordx
1083	  lduw	[%o0], %o3		! read word
1084	addcc	%o2, 3, %o2		! restore count
1085	bz,pt	%ncc, .bc_sm_exit
1086	  stw	%o3, [%o1]		! write word
1087	deccc	%o2			! reduce count for cc test
1088	ldub	[%o0 + 4], %o3		! load one byte
1089	bz,pt	%ncc, .bc_sm_exit
1090	  stb	%o3, [%o1 + 4]		! store one byte
1091	ldub	[%o0 + 5], %o3		! load second byte
1092	deccc	%o2
1093	bz,pt	%ncc, .bc_sm_exit
1094	  stb	%o3, [%o1 + 5]		! store second byte
1095	ldub	[%o0 + 6], %o3		! load third byte
1096	stb	%o3, [%o1 + 6]		! store third byte
1097.bc_sm_exit:
1098	membar	#Sync				! sync error barrier
1099	andn	%o4, TRAMP_FLAG, %o4
1100	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1101	retl
1102	  mov	%g0, %o0		! return 0
1103
1104	.align 16
	/*
	 * Medium leaf copies (count above CHKSIZE).  The low bits of
	 * (src ^ dst) give the widest unit both pointers can be aligned to
	 * together: an odd difference uses the byte loop, otherwise the
	 * halfword, word or long word path is chosen.  Registers are as in
	 * .sm_do_copy (%o0 src, %o1 dst, %o2 count, %o3 scratch, %o4 saved
	 * t_lofault).
	 */
1105.bc_med:
1106	xor	%o0, %o1, %o3		! setup alignment check
1107	btst	1, %o3
1108	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
1109	  nop
1110	btst	3, %o3
1111	bnz,pt	%ncc, .bc_med_half	! halfword aligned
1112	  nop
1113	btst	7, %o3
1114	bnz,pt	%ncc, .bc_med_word	! word aligned
1115	  nop
	/*
	 * Long word path: copy single bytes until src is word aligned, then
	 * one word if src is not yet 8-byte aligned.
	 */
1116.bc_med_long:
1117	btst	3, %o0			! check for
1118	bz,pt	%ncc, .bc_med_long1	! word alignment
1119	  nop
1120.bc_med_long0:
1121	ldub	[%o0], %o3		! load one byte
1122	inc	%o0
1123	stb	%o3,[%o1]		! store byte
1124	inc	%o1
1125	btst	3, %o0
1126	bnz,pt	%ncc, .bc_med_long0
1127	  dec	%o2
1128.bc_med_long1:			! word aligned
1129	btst	7, %o0			! check for long word
1130	bz,pt	%ncc, .bc_med_long2
1131	  nop
1132	lduw	[%o0], %o3		! load word
1133	add	%o0, 4, %o0		! advance SRC by 4
1134	stw	%o3, [%o1]		! store word
1135	add	%o1, 4, %o1		! advance DST by 4
1136	sub	%o2, 4, %o2		! reduce count by 4
1137!
1138!  Now long word aligned and have at least 32 bytes to move
1139!
	/*
	 * Main loop moves 32 bytes per iteration with %o2 held at
	 * (remaining - 31); .bc_med_lword then moves 8 bytes per iteration
	 * with %o2 at (remaining - 7).
	 */
1140.bc_med_long2:
1141	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1142.bc_med_lmove:
1143	ldx	[%o0], %o3		! read long word
1144	stx	%o3, [%o1]		! write long word
1145	subcc	%o2, 32, %o2		! reduce count by 32
1146	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
1147	add	%o0, 32, %o0		! advance SRC by 32
1148	stx	%o3, [%o1 + 8]
1149	ldx	[%o0 - 16], %o3
1150	add	%o1, 32, %o1		! advance DST by 32
1151	stx	%o3, [%o1 - 16]
1152	ldx	[%o0 - 8], %o3
1153	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
1154	  stx	%o3, [%o1 - 8]
1155	addcc	%o2, 24, %o2		! restore count to long word offset
1156	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
1157	  nop
1158.bc_med_lword:
1159	ldx	[%o0], %o3		! read long word
1160	subcc	%o2, 8, %o2		! reduce count by 8
1161	stx	%o3, [%o1]		! write long word
1162	add	%o0, 8, %o0		! advance SRC by 8
1163	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
1164	  add	%o1, 8, %o1		! advance DST by 8
	/*
	 * Zero to seven bytes remain.  The deccc in the delay slot leaves
	 * (remaining - 1) in %o2, which is the value .bc_sm_half and
	 * .bc_sm_byte expect on entry.
	 */
1165.bc_med_lextra:
1166	addcc	%o2, 7, %o2		! restore rest of count
1167	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1168	  deccc	%o2
1169	bz,pt	%ncc, .bc_sm_byte
1170	  nop
1171	ba,pt	%ncc, .bc_sm_half
1172	  nop
1173
1174	.align 16
	/*
	 * Word path of the medium leaf copy: src and dst differ by a
	 * multiple of 4 but not of 8.  Bytes are copied until src is word
	 * aligned, then 16 bytes per iteration with %o2 held at
	 * (remaining - 15), then single words with %o2 at (remaining - 3).
	 * The last zero to three bytes go through .bc_sm_half and
	 * .bc_sm_byte, entered with (remaining - 1) in %o2.
	 */
1175.bc_med_word:
1176	btst	3, %o0			! check for
1177	bz,pt	%ncc, .bc_med_word1	! word alignment
1178	  nop
1179.bc_med_word0:
1180	ldub	[%o0], %o3		! load one byte
1181	inc	%o0
1182	stb	%o3,[%o1]		! store byte
1183	inc	%o1
1184	btst	3, %o0
1185	bnz,pt	%ncc, .bc_med_word0
1186	  dec	%o2
1187!
1188!  Now word aligned and have at least 36 bytes to move
1189!
1190.bc_med_word1:
1191	sub	%o2, 15, %o2		! adjust count to allow cc zero test
1192.bc_med_wmove:
1193	lduw	[%o0], %o3		! read word
1194	stw	%o3, [%o1]		! write word
1195	subcc	%o2, 16, %o2		! reduce count by 16
1196	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
1197	add	%o0, 16, %o0		! advance SRC by 16
1198	stw	%o3, [%o1 + 4]
1199	lduw	[%o0 - 8], %o3
1200	add	%o1, 16, %o1		! advance DST by 16
1201	stw	%o3, [%o1 - 8]
1202	lduw	[%o0 - 4], %o3
1203	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
1204	  stw	%o3, [%o1 - 4]
1205	addcc	%o2, 12, %o2		! restore count to word offset
1206	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
1207	  nop
1208.bc_med_word2:
1209	lduw	[%o0], %o3		! read word
1210	subcc	%o2, 4, %o2		! reduce count by 4
1211	stw	%o3, [%o1]		! write word
1212	add	%o0, 4, %o0		! advance SRC by 4
1213	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
1214	  add	%o1, 4, %o1		! advance DST by 4
1215.bc_med_wextra:
1216	addcc	%o2, 3, %o2		! restore rest of count
1217	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1218	  deccc	%o2
1219	bz,pt	%ncc, .bc_sm_byte
1220	  nop
1221	ba,pt	%ncc, .bc_sm_half
1222	  nop
1223
1224	.align 16
	/*
	 * Halfword path of the medium leaf copy: src and dst differ by a
	 * multiple of 2 but not of 4.  At most one byte is copied to make
	 * src halfword aligned, then four halfwords (8 bytes) are moved per
	 * iteration with %o2 held at (remaining - 7).  The last zero to
	 * seven bytes go through .bc_sm_half and .bc_sm_byte, entered with
	 * (remaining - 1) in %o2.
	 */
1225.bc_med_half:
1226	btst	1, %o0			! check for
1227	bz,pt	%ncc, .bc_med_half1	! half word alignment
1228	  nop
1229	ldub	[%o0], %o3		! load one byte
1230	inc	%o0
1231	stb	%o3,[%o1]		! store byte
1232	inc	%o1
1233	dec	%o2
1234!
1235!  Now half word aligned and have at least 38 bytes to move
1236!
1237.bc_med_half1:
1238	sub	%o2, 7, %o2		! adjust count to allow cc zero test
1239.bc_med_hmove:
1240	lduh	[%o0], %o3		! read half word
1241	sth	%o3, [%o1]		! write half word
1242	subcc	%o2, 8, %o2		! reduce count by 8
1243	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
1244	add	%o0, 8, %o0		! advance SRC by 8
1245	sth	%o3, [%o1 + 2]
1246	lduh	[%o0 - 4], %o3
1247	add	%o1, 8, %o1		! advance DST by 8
1248	sth	%o3, [%o1 - 4]
1249	lduh	[%o0 - 2], %o3
1250	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
1251	  sth	%o3, [%o1 - 2]
1252	addcc	%o2, 7, %o2		! restore count
1253	bz,pt	%ncc, .bc_sm_exit
1254	  deccc	%o2
1255	bz,pt	%ncc, .bc_sm_byte
1256	  nop
1257	ba,pt	%ncc, .bc_sm_half
1258	  nop
1259
1260	SET_SIZE(bcopy)
1261
1262/*
1263 * The _more entry points are not intended to be used directly by
1264 * any caller from outside this file.  They are provided to allow
1265 * profiling and dtrace of the portions of the copy code that uses
1266 * the floating point registers.
1267 * This entry is particularly important as DTRACE (at least as of
1268 * 4/2004) does not support leaf functions.
1269 */
1270
	/*
	 * FP path of bcopy.  A register window is taken, so the arguments
	 * are now %i0 = from, %i1 = to, %i2 = count.  %l6 holds the saved
	 * t_lofault: when it is non-zero, .copyerr is installed and
	 * TRAMP_FLAG is or'ed into %l6 so a fault chains to the previous
	 * handler; when it is zero, no handler is installed.
	 */
1271	ENTRY(bcopy_more)
1272.bcopy_more:
1273	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1274	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
1275	tst	%l6
1276	bz,pt	%ncc, .do_copy
1277	  nop
1278	sethi	%hi(.copyerr), %o2
1279	or	%o2, %lo(.copyerr), %o2
1280	membar	#Sync				! sync error barrier
1281	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
1282	!
1283	! We've already captured whether t_lofault was zero on entry.
1284	! We need to mark ourselves as being from bcopy since both
1285	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1286	! and the saved lofault was zero, we won't reset lofault on
1287	! returning.
1288	!
1289	or	%l6, TRAMP_FLAG, %l6
1290
1291/*
1292 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1293 * Also, use of FP registers has been tested to be enabled
1294 */
	/*
	 * Common to kcopy and bcopy.  Migration is blocked first, then the
	 * current %fprs is saved in the frame.  If FPRS_FEF was clear, the
	 * annulled branch is taken, its delay slot sets FPRS_FEF, and the
	 * FP register save is skipped; otherwise the registers are saved on
	 * the stack by BST_FPQ1Q3_TOSTACK.
	 */
1295.do_copy:
1296	FP_NOMIGRATE(6, 7)
1297
1298	rd	%fprs, %o2		! check for unused fp
1299	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1300	btst	FPRS_FEF, %o2
1301	bz,a,pt	%icc, .do_blockcopy
1302	  wr	%g0, FPRS_FEF, %fprs
1303
1304	BST_FPQ1Q3_TOSTACK(%o2)
1305
	/*
	 * %gsr is saved in the frame and FPUSED_FLAG is or'ed into %l6;
	 * .copyerr tests that flag to decide whether FP state has to be
	 * restored.
	 */
1306.do_blockcopy:
1307	rd	%gsr, %o2
1308	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
1309	or	%l6, FPUSED_FLAG, %l6
1310
/*
 * Register aliases for the block copy code.  REALSRC, DST and CNT are the
 * incoming arguments of the windowed routines (%i0-%i2); SRC and TMP are
 * scratch.  hwblkpagecopy further down reuses the same aliases.
 */
1311#define	REALSRC	%i0
1312#define	DST	%i1
1313#define	CNT	%i2
1314#define	SRC	%i3
1315#define	TMP	%i5
1316
	/*
	 * Bring DST up to a VIS_BLOCKSIZE boundary.  The neg in the delay
	 * slot runs on both paths; on the fall-through path the add turns
	 * it into VIS_BLOCKSIZE - (DST & (VIS_BLOCKSIZE - 1)).  CNT is
	 * reduced by that amount in the delay slot of the bleu.  Four or
	 * more bytes are moved four at a time (TMP biased by -3), the rest
	 * one at a time at label 1.
	 */
1317	andcc	DST, VIS_BLOCKSIZE - 1, TMP
1318	bz,pt	%ncc, 2f
1319	  neg	TMP
1320	add	TMP, VIS_BLOCKSIZE, TMP
1321
1322	! TMP = bytes required to align DST on FP_BLOCK boundary
1323	! Using SRC as a tmp here
1324	cmp	TMP, 3
1325	bleu,pt	%ncc, 1f
1326	  sub	CNT,TMP,CNT		! adjust main count
1327	sub	TMP, 3, TMP		! adjust for end of loop test
1328.bc_blkalign:
1329	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
1330	stb	SRC, [DST]
1331	subcc	TMP, 4, TMP
1332	ldub	[REALSRC + 1], SRC
1333	add	REALSRC, 4, REALSRC
1334	stb	SRC, [DST + 1]
1335	ldub	[REALSRC - 2], SRC
1336	add	DST, 4, DST
1337	stb	SRC, [DST - 2]
1338	ldub	[REALSRC - 1], SRC
1339	bgu,pt	%ncc, .bc_blkalign
1340	  stb	SRC, [DST - 1]
1341
1342	addcc	TMP, 3, TMP		! restore count adjustment
1343	bz,pt	%ncc, 2f		! no bytes left?
1344	  nop
13451:	ldub	[REALSRC], SRC
1346	inc	REALSRC
1347	inc	DST
1348	deccc	TMP
1349	bgu	%ncc, 1b
1350	  stb	SRC, [DST - 1]
1351
	/*
	 * Main block copy.  SRC is REALSRC rounded down to 8 bytes and
	 * alignaddr records the byte offset of REALSRC for faligndata.
	 * Each block is read as eight doublewords into %f0-%f14, realigned
	 * by faligndata into %f32-%f46, and written with one block store.
	 * The loop is software pipelined: the priming code below loads the
	 * first block and starts aligning it, and each loop iteration
	 * finishes and stores the previous block while loading the next.
	 * %f0 is reloaded from the start of the following block before the
	 * pointers advance.
	 */
13522:
1353	andn	REALSRC, 0x7, SRC
1354	alignaddr REALSRC, %g0, %g0
1355
1356	! SRC - 8-byte aligned
1357	! DST - 64-byte aligned
1358	prefetch [SRC], #one_read
1359	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1360	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1361	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1362	ldd	[SRC], %f0
1363#if CHEETAH_PREFETCH > 4
1364	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1365#endif
1366	ldd	[SRC + 0x08], %f2
1367#if CHEETAH_PREFETCH > 5
1368	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1369#endif
1370	ldd	[SRC + 0x10], %f4
1371#if CHEETAH_PREFETCH > 6
1372	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1373#endif
1374	faligndata %f0, %f2, %f32
1375	ldd	[SRC + 0x18], %f6
1376#if CHEETAH_PREFETCH > 7
1377	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1378#endif
1379	faligndata %f2, %f4, %f34
1380	ldd	[SRC + 0x20], %f8
1381	faligndata %f4, %f6, %f36
1382	ldd	[SRC + 0x28], %f10
1383	faligndata %f6, %f8, %f38
1384	ldd	[SRC + 0x30], %f12
1385	faligndata %f8, %f10, %f40
1386	ldd	[SRC + 0x38], %f14
1387	faligndata %f10, %f12, %f42
1388	ldd	[SRC + VIS_BLOCKSIZE], %f0
1389	sub	CNT, VIS_BLOCKSIZE, CNT
1390	add	SRC, VIS_BLOCKSIZE, SRC
1391	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1392	ba,a,pt	%ncc, 1f
1393	  nop
1394	.align	16
	/*
	 * Steady state: runs while more than VIS_BLOCKSIZE + 8 bytes remain
	 * in CNT.  The SRC advance is in the delay slot of the loop branch.
	 */
13951:
1396	ldd	[SRC + 0x08], %f2
1397	faligndata %f12, %f14, %f44
1398	ldd	[SRC + 0x10], %f4
1399	faligndata %f14, %f0, %f46
1400	stda	%f32, [DST]ASI_BLK_P
1401	ldd	[SRC + 0x18], %f6
1402	faligndata %f0, %f2, %f32
1403	ldd	[SRC + 0x20], %f8
1404	faligndata %f2, %f4, %f34
1405	ldd	[SRC + 0x28], %f10
1406	faligndata %f4, %f6, %f36
1407	ldd	[SRC + 0x30], %f12
1408	faligndata %f6, %f8, %f38
1409	ldd	[SRC + 0x38], %f14
1410	faligndata %f8, %f10, %f40
1411	sub	CNT, VIS_BLOCKSIZE, CNT
1412	ldd	[SRC + VIS_BLOCKSIZE], %f0
1413	faligndata %f10, %f12, %f42
1414	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1415	add	DST, VIS_BLOCKSIZE, DST
1416	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1417	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1418	cmp	CNT, VIS_BLOCKSIZE + 8
1419	bgu,pt	%ncc, 1b
1420	  add	SRC, VIS_BLOCKSIZE, SRC
1421
	/*
	 * Loop exit.  One aligned block is still pending in the FP
	 * registers.  If exactly VIS_BLOCKSIZE bytes remain and REALSRC is
	 * 8-byte aligned, the code at label 2 stores the pending block and
	 * copies one more whole block with plain register moves (fsrc1).
	 * Otherwise the first label 3 finishes and stores the pending block
	 * and the second label 3 copies whatever is left in CNT one byte at
	 * a time.  Note the two distinct "3:" labels: the bne reaches the
	 * first, the ba after the block store reaches the second.
	 */
1422	! only if REALSRC & 0x7 is 0
1423	cmp	CNT, VIS_BLOCKSIZE
1424	bne	%ncc, 3f
1425	  andcc	REALSRC, 0x7, %g0
1426	bz,pt	%ncc, 2f
1427	  nop
14283:
1429	faligndata %f12, %f14, %f44
1430	faligndata %f14, %f0, %f46
1431	stda	%f32, [DST]ASI_BLK_P
1432	add	DST, VIS_BLOCKSIZE, DST
1433	ba,pt	%ncc, 3f
1434	  nop
14352:
1436	ldd	[SRC + 0x08], %f2
1437	fsrc1	%f12, %f44
1438	ldd	[SRC + 0x10], %f4
1439	fsrc1	%f14, %f46
1440	stda	%f32, [DST]ASI_BLK_P
1441	ldd	[SRC + 0x18], %f6
1442	fsrc1	%f0, %f32
1443	ldd	[SRC + 0x20], %f8
1444	fsrc1	%f2, %f34
1445	ldd	[SRC + 0x28], %f10
1446	fsrc1	%f4, %f36
1447	ldd	[SRC + 0x30], %f12
1448	fsrc1	%f6, %f38
1449	ldd	[SRC + 0x38], %f14
1450	fsrc1	%f8, %f40
1451	sub	CNT, VIS_BLOCKSIZE, CNT
1452	add	DST, VIS_BLOCKSIZE, DST
1453	add	SRC, VIS_BLOCKSIZE, SRC
1454	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1455	fsrc1	%f10, %f42
1456	fsrc1	%f12, %f44
1457	fsrc1	%f14, %f46
1458	stda	%f32, [DST]ASI_BLK_P
1459	add	DST, VIS_BLOCKSIZE, DST
1460	ba,a,pt	%ncc, .bcb_exit
1461	  nop
1462
14633:	tst	CNT
1464	bz,a,pt	%ncc, .bcb_exit
1465	  nop
1466
14675:	ldub	[REALSRC], TMP
1468	inc	REALSRC
1469	inc	DST
1470	deccc	CNT
1471	bgu	%ncc, 5b
1472	  stb	TMP, [DST - 1]
	/*
	 * Normal completion of the FP copy.  After the membar the FPRAS_*
	 * macros run (defined in sys/fpras_impl.h, not visible here;
	 * presumably the periodic FP copy self-check - confirm there).
	 * Then %gsr is restored, the FP registers are reloaded from the
	 * stack if FPRS_FEF was set on entry and zeroed otherwise, and the
	 * saved %fprs is written back.  Finally the flag bits are stripped
	 * from %l6, t_lofault is restored, migration is re-enabled, and 0
	 * is returned in the caller's %o0.
	 */
1473.bcb_exit:
1474	membar	#Sync
1475
1476	FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
1477	FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
1478	FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)	! outputs lost
1479
1480	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
1481	wr	%o2, 0, %gsr
1482
1483	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1484	btst	FPRS_FEF, %o3
1485	bz,pt	%icc, 4f
1486	  nop
1487
1488	BLD_FPQ1Q3_FROMSTACK(%o2)
1489
1490	ba,pt	%ncc, 2f
1491	  wr	%o3, 0, %fprs		! restore fprs
14924:
1493	FZEROQ1Q3
1494	wr	%o3, 0, %fprs		! restore fprs
14952:
1496	membar	#Sync				! sync error barrier
1497	andn	%l6, MASK_FLAGS, %l6
1498	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1499	FP_ALLOWMIGRATE(5, 6)
1500	ret
1501	  restore	%g0, 0, %o0
1502
1503	SET_SIZE(bcopy_more)
1504
1505#endif	/* lint */
1506
1507/*
1508 * Block copy with possibly overlapped operands.
1509 */
1510
1511#if defined(lint)
1512
1513/*ARGSUSED*/
1514void
1515ovbcopy(const void *from, void *to, size_t count)
1516{}
1517
1518#else	/* lint */
1519
	/*
	 * Entry: %o0 = from, %o1 = to, %o2 = count.  Leaf routine; no
	 * t_lofault handler is installed here.
	 *
	 * A zero count returns immediately.  Otherwise %o3 becomes
	 * abs(from - to): the subcc sits in an annulled delay slot and only
	 * runs when the count is non-zero, and the neg only runs when the
	 * difference is negative.  If count <= abs(from - to) the regions
	 * cannot overlap and control branches (not calls) to bcopy, which
	 * then returns directly to our caller.  Overlapping regions are
	 * copied one byte at a time: forwards when from >= to, backwards
	 * (indexing with the decremented count) when from < to.
	 *
	 * The .empty directive presumably tells the assembler that the
	 * instruction following it is an intended delay-slot instruction.
	 */
1520	ENTRY(ovbcopy)
1521	tst	%o2			! check count
1522	bgu,a	%ncc, 1f		! nothing to do or bad arguments
1523	  subcc	%o0, %o1, %o3		! difference of from and to address
1524
1525	retl				! return
1526	  nop
15271:
1528	bneg,a	%ncc, 2f
1529	  neg	%o3			! if < 0, make it positive
15302:	cmp	%o2, %o3		! cmp size and abs(from - to)
1531	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1532	  .empty				!   no overlap
1533	  cmp	%o0, %o1		! compare from and to addresses
1534	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1535	  nop
1536	!
1537	! Copy forwards.
1538	!
1539.ov_fwd:
1540	ldub	[%o0], %o3		! read from address
1541	inc	%o0			! inc from address
1542	stb	%o3, [%o1]		! write to address
1543	deccc	%o2			! dec count
1544	bgu	%ncc, .ov_fwd		! loop till done
1545	  inc	%o1			! inc to address
1546
1547	retl				! return
1548	  nop
1549	!
1550	! Copy backwards.
1551	!
1552.ov_bkwd:
1553	deccc	%o2			! dec count
1554	ldub	[%o0 + %o2], %o3	! get byte at end of src
1555	bgu	%ncc, .ov_bkwd		! loop till done
1556	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1557
1558	retl				! return
1559	  nop
1560
1561	SET_SIZE(ovbcopy)
1562
1563#endif	/* lint */
1564
1565
1566/*
1567 * hwblkpagecopy()
1568 *
1569 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1570 * has already disabled kernel preemption and has checked
1571 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1572 */
1573#ifdef lint
1574/*ARGSUSED*/
1575void
1576hwblkpagecopy(const void *src, void *dst)
1577{ }
1578#else /* lint */
	/*
	 * Uses the REALSRC/DST/CNT/SRC aliases defined in the bcopy_more
	 * code.  No t_lofault handler is installed and no alignment fix-up
	 * is done: the source is read with plain ldd and moved with fsrc1
	 * rather than faligndata, and the destination is written with block
	 * stores.  The routine therefore presumably relies on both
	 * addresses being page aligned - confirm against the caller.
	 */
1579	ENTRY(hwblkpagecopy)
1580	! get another window w/space for three aligned blocks of saved fpregs
1581	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1582
1583	! %i0 - source address (arg)
1584	! %i1 - destination address (arg)
1585	! %i2 - length of region (not arg)
1586	! %l0 - saved fprs
1587	! %l1 - pointer to saved fpregs
1588
	/*
	 * If FPRS_FEF is clear the annulled branch is taken, its delay slot
	 * sets FPRS_FEF, and the FP register save is skipped.
	 */
1589	rd	%fprs, %l0		! check for unused fp
1590	btst	FPRS_FEF, %l0
1591	bz,a,pt	%icc, 1f
1592	  wr	%g0, FPRS_FEF, %fprs
1593
1594	BST_FPQ1Q3_TOSTACK(%l1)
1595
15961:	set	PAGESIZE, CNT
1597	mov	REALSRC, SRC
1598
	/*
	 * Prime the pipeline: load the first block into %f0-%f14, start
	 * moving it into %f32-%f42, and preload %f0 from the next block.
	 */
1599	prefetch [SRC], #one_read
1600	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
1601	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
1602	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1603	ldd	[SRC], %f0
1604#if CHEETAH_PREFETCH > 4
1605	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1606#endif
1607	ldd	[SRC + 0x08], %f2
1608#if CHEETAH_PREFETCH > 5
1609	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1610#endif
1611	ldd	[SRC + 0x10], %f4
1612#if CHEETAH_PREFETCH > 6
1613	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1614#endif
1615	fsrc1	%f0, %f32
1616	ldd	[SRC + 0x18], %f6
1617#if CHEETAH_PREFETCH > 7
1618	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1619#endif
1620	fsrc1	%f2, %f34
1621	ldd	[SRC + 0x20], %f8
1622	fsrc1	%f4, %f36
1623	ldd	[SRC + 0x28], %f10
1624	fsrc1	%f6, %f38
1625	ldd	[SRC + 0x30], %f12
1626	fsrc1	%f8, %f40
1627	ldd	[SRC + 0x38], %f14
1628	fsrc1	%f10, %f42
1629	ldd	[SRC + VIS_BLOCKSIZE], %f0
1630	sub	CNT, VIS_BLOCKSIZE, CNT
1631	add	SRC, VIS_BLOCKSIZE, SRC
1632	ba,a,pt	%ncc, 2f
1633	  nop
1634	.align	16
	/*
	 * Steady state of the page copy: each iteration finishes and block
	 * stores the previous block while loading the next one, and
	 * preloads %f0 from the block after that.  It runs while CNT is
	 * above VIS_BLOCKSIZE + 8; the SRC advance is in the delay slot of
	 * the loop branch.
	 */
16352:
1636	ldd	[SRC + 0x08], %f2
1637	fsrc1	%f12, %f44
1638	ldd	[SRC + 0x10], %f4
1639	fsrc1	%f14, %f46
1640	stda	%f32, [DST]ASI_BLK_P
1641	ldd	[SRC + 0x18], %f6
1642	fsrc1	%f0, %f32
1643	ldd	[SRC + 0x20], %f8
1644	fsrc1	%f2, %f34
1645	ldd	[SRC + 0x28], %f10
1646	fsrc1	%f4, %f36
1647	ldd	[SRC + 0x30], %f12
1648	fsrc1	%f6, %f38
1649	ldd	[SRC + 0x38], %f14
1650	fsrc1	%f8, %f40
1651	ldd	[SRC + VIS_BLOCKSIZE], %f0
1652	fsrc1	%f10, %f42
1653	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
1654	sub	CNT, VIS_BLOCKSIZE, CNT
1655	add	DST, VIS_BLOCKSIZE, DST
1656	cmp	CNT, VIS_BLOCKSIZE + 8
1657	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1658	bgu,pt	%ncc, 2b
1659	  add	SRC, VIS_BLOCKSIZE, SRC
1660
	/*
	 * Last block: same sequence as the loop body, but without the
	 * read-ahead of the doubleword at SRC + VIS_BLOCKSIZE.  The pending
	 * block is stored first, then the final block is moved and stored.
	 */
1661	! trailing block
1662	ldd	[SRC + 0x08], %f2
1663	fsrc1	%f12, %f44
1664	ldd	[SRC + 0x10], %f4
1665	fsrc1	%f14, %f46
1666	stda	%f32, [DST]ASI_BLK_P
1667	ldd	[SRC + 0x18], %f6
1668	fsrc1	%f0, %f32
1669	ldd	[SRC + 0x20], %f8
1670	fsrc1	%f2, %f34
1671	ldd	[SRC + 0x28], %f10
1672	fsrc1	%f4, %f36
1673	ldd	[SRC + 0x30], %f12
1674	fsrc1	%f6, %f38
1675	ldd	[SRC + 0x38], %f14
1676	fsrc1	%f8, %f40
1677	sub	CNT, VIS_BLOCKSIZE, CNT
1678	add	DST, VIS_BLOCKSIZE, DST
1679	add	SRC, VIS_BLOCKSIZE, SRC
1680	fsrc1	%f10, %f42
1681	fsrc1	%f12, %f44
1682	fsrc1	%f14, %f46
1683	stda	%f32, [DST]ASI_BLK_P
1684
	/*
	 * Epilogue of the page copy.  After the membar the FPRAS_* macros
	 * run (defined in sys/fpras_impl.h, not visible here).  The FP
	 * registers are then reloaded from the stack if FPRS_FEF was set on
	 * entry (%l0) and zeroed otherwise, the saved %fprs is written
	 * back, and 0 is placed in the caller's %o0 by the restore.
	 *
	 * NOTE(review): the save used %l1 and the reload uses %l3 as the
	 * macro argument; presumably each macro recomputes the save-area
	 * address into its argument - confirm against the macro
	 * definitions.
	 */
1685	membar	#Sync
1686
1687	FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
1688	FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
1689	FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)	! lose outputs
1690
1691	btst	FPRS_FEF, %l0
1692	bz,pt	%icc, 2f
1693	  nop
1694
1695	BLD_FPQ1Q3_FROMSTACK(%l3)
1696	ba	3f
1697	  nop
1698
16992:	FZEROQ1Q3
1700
17013:	wr	%l0, 0, %fprs		! restore fprs
1702	ret
1703	  restore	%g0, 0, %o0
1704
1705	SET_SIZE(hwblkpagecopy)
1706#endif	/* lint */
1707
1708
1709/*
1710 * Transfer data to and from user space -
1711 * Note that these routines can cause faults
1712 * It is assumed that the kernel has nothing at
1713 * less than KERNELBASE in the virtual address space.
1714 *
1715 * Note that copyin(9F) and copyout(9F) are part of the
1716 * DDI/DKI which specifies that they return '-1' on "errors."
1717 *
1718 * Sigh.
1719 *
1720 * So there are two extremely similar routines - xcopyin() and xcopyout()
1721 * which return the errno that we've faithfully computed.  This
1722 * allows other callers (e.g. uiomove(9F)) to work correctly.
1723 * Given that these are used pretty heavily, we expand the calling
1724 * sequences inline for all flavours (rather than making wrappers).
1725 *
1726 * There are also stub routines for xcopyout_little and xcopyin_little,
1727 * which currently are intended to handle requests of <= 16 bytes from
1728 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1729 * is left as an exercise...
1730 */
1731
1732/*
1733 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1734 *
1735 * General theory of operation:
1736 *
1737 * The only difference between copy{in,out} and
1738 * xcopy{in,out} is in the error handling routine they invoke
1739 * when a memory access error occurs. xcopyOP returns the errno
1740 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1741 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1742 * if they are called with a fault handler already in place. That flag
1743 * causes the default handlers to trampoline to the previous handler
1744 * upon an error.
1745 *
1746 * None of the copyops routines grab a window until it's decided that
1747 * we need to do a HW block copy operation. This saves a window
1748 * spill/fill when we're called during socket ops. The typical IO
1749 * path won't cause spill/fill traps.
1750 *
1751 * This code uses a set of 4 limits for the maximum size that will
1752 * be copied given a particular input/output address alignment.
1753 * If the value for a particular limit is zero, the copy will be performed
1754 * by the plain copy loops rather than FPBLK.
1755 *
1756 * See the description of bcopy above for more details of the
1757 * data copying algorithm and the default limits.
1758 *
1759 */
1760
1761/*
1762 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1763 */
1764
1765#if defined(lint)
1766
1767
1768#else	/* lint */
1769/*
1770 * We save the arguments in the following registers in case of a fault:
1771 *	kaddr - %l1
1772 *	uaddr - %l2
1773 *	count - %l3
1774 */
1775#define SAVE_SRC	%l1
1776#define SAVE_DST	%l2
1777#define SAVE_COUNT	%l3
1778
/*
 * SM_SAVE_* name global/out registers instead of locals; presumably they
 * hold the same three arguments on the leaf (small copy) paths, which own
 * no register window - their users are outside this view.  ERRNO is where
 * copyio_fault parks the errno it receives in %g1.
 */
1779#define SM_SAVE_SRC		%g4
1780#define SM_SAVE_DST		%g5
1781#define SM_SAVE_COUNT		%o5
1782#define ERRNO		%l5
1783
1784
1785#define REAL_LOFAULT	%l4
1786/*
1787 * Generic copyio fault handler.  This is the first line of defense when a
1788 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1789 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1790 * This allows us to share common code for all the flavors of the copy
1791 * operations, including the _noerr versions.
1792 *
1793 * Note that this function will restore the original input parameters before
1794 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1795 * member of the t_copyop structure, if needed.
1796 */
	/*
	 * On entry %g1 holds the errno and %l6 the saved t_lofault value,
	 * possibly with FPUSED_FLAG or'ed in.  When the flag is set, %gsr
	 * is restored, the FP registers are reloaded from the stack if the
	 * saved %fprs had FPRS_FEF set (zeroed otherwise), and %fprs is
	 * written back.  This handler uses the Q2Q4 macro variants, unlike
	 * the Q1Q3 variants in .copyerr.
	 */
1797	ENTRY(copyio_fault)
1798	membar	#Sync
1799	mov	%g1,ERRNO			! save errno in ERRNO
1800	btst	FPUSED_FLAG, %l6
1801	bz	%ncc, 1f
1802	  nop
1803
1804	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1805	wr	%o2, 0, %gsr    	! restore gsr
1806
1807	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1808	btst	FPRS_FEF, %o3
1809	bz,pt	%icc, 4f
1810	  nop
1811
1812	BLD_FPQ2Q4_FROMSTACK(%o2)
1813
1814	ba,pt	%ncc, 1f
1815	  wr	%o3, 0, %fprs   	! restore fprs
1816
18174:
1818	FZEROQ2Q4
1819	wr	%o3, 0, %fprs   	! restore fprs
1820
	/*
	 * Only FPUSED_FLAG is cleared from %l6 before it is written back to
	 * t_lofault.  Migration is then re-enabled, the saved arguments are
	 * put back in %i0-%i2 (the count in the jmp delay slot), and
	 * control passes to REAL_LOFAULT with the window still in place.
	 */
18211:
1822	andn	%l6, FPUSED_FLAG, %l6
1823	membar	#Sync
1824	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1825	FP_ALLOWMIGRATE(5, 6)
1826
1827	mov	SAVE_SRC, %i0
1828	mov	SAVE_DST, %i1
1829	jmp	REAL_LOFAULT
1830	  mov	SAVE_COUNT, %i2
1831
1832	SET_SIZE(copyio_fault)
1833
1834
1835#endif
1836
1837#if defined(lint)
1838
/*
 * Stub that is only compiled when lint is defined; the real copyout is
 * the assembly routine in the #else branch that follows.
 */
1839/*ARGSUSED*/
1840int
1841copyout(const void *kaddr, void *uaddr, size_t count)
1842{ return (0); }
1843
1844#else	/* lint */
1845
/*
 * copyout entry: %o0 = source (kaddr), %o1 = destination (uaddr),
 * %o2 = byte count.
 *
 * Dispatch only; no data is moved here.
 *   - count <= VIS_COPY_THRESHOLD goes straight to .copyout_small.
 *   - Otherwise the low bits of (src ^ dst) select which of
 *     hw_copy_limit_8/4/2/1 applies.  A limit of zero, or a count that
 *     is <= the limit, selects .copyout_small; a larger count goes to
 *     .copyout_more.
 * %o3 is scratch throughout.
 */
1846	ENTRY(copyout)
1847
1848	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	/* the branch below is the small case; larger counts fall through */
1849	bleu,pt	%ncc, .copyout_small		! go to larger cases
1850	  xor	%o0, %o1, %o3			! are src, dst alignable?
1851	btst	7, %o3				!
1852	bz,pt	%ncc, .copyout_8		! check for longword alignment
1853	  nop
1854	btst	1, %o3				!
1855	bz,pt	%ncc, .copyout_2		! check for half-word
1856	  nop
	/* src ^ dst is odd: use the byte limit */
1857	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
1858	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1859	tst	%o3
1860	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1861	  cmp	%o2, %o3			! if length <= limit
1862	bleu,pt	%ncc, .copyout_small		! go to small copy
1863	  nop
1864	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1865	  nop
1866.copyout_2:
	/* %o3 still holds src ^ dst here; bit 0 is known to be clear */
1867	btst	3, %o3				!
1868	bz,pt	%ncc, .copyout_4		! check for word alignment
1869	  nop
1870	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
1871	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1872	tst	%o3
1873	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1874	  cmp	%o2, %o3			! if length <= limit
1875	bleu,pt	%ncc, .copyout_small		! go to small copy
1876	  nop
1877	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1878	  nop
1879.copyout_4:
1880	! already checked longword, must be word aligned
1881	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
1882	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1883	tst	%o3
1884	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1885	  cmp	%o2, %o3			! if length <= limit
1886	bleu,pt	%ncc, .copyout_small		! go to small copy
1887	  nop
1888	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1889	  nop
1890.copyout_8:
1891	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
1892	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1893	tst	%o3
1894	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1895	  cmp	%o2, %o3			! if length <= limit
1896	bleu,pt	%ncc, .copyout_small		! go to small copy
1897	  nop
1898	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1899	  nop
1900
/*
 * Leaf (no register window) copyout path.
 *
 * .copyout_small installs .sm_copyout_err in t_lofault and keeps the
 * previous value in %o4; every exit below stores %o4 back.
 * .sm_do_copyout is also entered from .xcopyout_small, which installs
 * its own handler first.
 *
 * Register use: %o0 = src, %o1 = dst, %o2 = remaining count,
 * %o3 = data scratch.  The original arguments are parked in
 * SM_SAVE_SRC/SM_SAVE_DST/SM_SAVE_COUNT for the error handlers.
 * All stores to the destination go through ASI_USER.
 */
1901	.align	16
1902	nop				! instruction alignment
1903					! see discussion at start of file
1904.copyout_small:
1905	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
1906	or	%o5, %lo(.sm_copyout_err), %o5
1907	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
1908	membar	#Sync				! sync error barrier
1909	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
1910.sm_do_copyout:
1911	mov	%o0, SM_SAVE_SRC
1912	mov	%o1, SM_SAVE_DST
1913	cmp	%o2, SHORTCOPY		! check for really short case
1914	bleu,pt	%ncc, .co_sm_left	!
1915	  mov	%o2, SM_SAVE_COUNT
1916	cmp	%o2, CHKSIZE		! check for medium length cases
1917	bgu,pn	%ncc, .co_med		!
1918	  or	%o0, %o1, %o3		! prepare alignment check
	/*
	 * The "sub %o2, 3" under .co_sm_movebytes sits in the delay slot of
	 * the bz below, so it is executed on the way to .co_sm_word as well.
	 * The restore constants used on the word path (3 and 7 after
	 * subtracting 4 and 8) already account for that extra bias of 3.
	 */
1919	andcc	%o3, 0x3, %g0		! test for alignment
1920	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
1921.co_sm_movebytes:
1922	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
1923.co_sm_notalign4:
	/* four single-byte moves per iteration; condition codes come from the subcc */
1924	ldub	[%o0], %o3		! read byte
1925	subcc	%o2, 4, %o2		! reduce count by 4
1926	stba	%o3, [%o1]ASI_USER	! write byte
1927	inc	%o1			! advance DST by 1
1928	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
1929	add	%o0, 4, %o0		! advance SRC by 4
1930	stba	%o3, [%o1]ASI_USER
1931	inc	%o1			! advance DST by 1
1932	ldub	[%o0 - 2], %o3
1933	stba	%o3, [%o1]ASI_USER
1934	inc	%o1			! advance DST by 1
1935	ldub	[%o0 - 1], %o3
1936	stba	%o3, [%o1]ASI_USER
1937	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
1938	  inc	%o1			! advance DST by 1
1939	add	%o2, 3, %o2		! restore count
1940.co_sm_left:
	/* 0 to 3 bytes remain; each store is in the delay slot of the exit test */
1941	tst	%o2
1942	bz,pt	%ncc, .co_sm_exit	! check for zero length
1943	  nop
1944	ldub	[%o0], %o3		! load one byte
1945	deccc	%o2			! reduce count for cc test
1946	bz,pt	%ncc, .co_sm_exit
1947	  stba	%o3,[%o1]ASI_USER	! store one byte
1948	ldub	[%o0 + 1], %o3		! load second byte
1949	deccc	%o2
1950	inc	%o1
1951	bz,pt	%ncc, .co_sm_exit
1952	  stba	%o3,[%o1]ASI_USER	! store second byte
1953	ldub	[%o0 + 2], %o3		! load third byte
1954	inc	%o1
1955	stba	%o3,[%o1]ASI_USER	! store third byte
1956	membar	#Sync				! sync error barrier
1957	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1958	retl
1959	  mov	%g0, %o0		! return 0
1960	.align	16
1961.co_sm_words:
1962	lduw	[%o0], %o3		! read word
1963.co_sm_wordx:
	/* entered from .co_sm_word with the first word already in %o3; two words per iteration */
1964	subcc	%o2, 8, %o2		! update count
1965	stwa	%o3, [%o1]ASI_USER	! write word
1966	add	%o0, 8, %o0		! update SRC
1967	lduw	[%o0 - 4], %o3		! read word
1968	add	%o1, 4, %o1		! update DST
1969	stwa	%o3, [%o1]ASI_USER	! write word
1970	bgt,pt	%ncc, .co_sm_words	! loop til done
1971	  add	%o1, 4, %o1		! update DST
1972	addcc	%o2, 7, %o2		! restore count
1973	bz,pt	%ncc, .co_sm_exit
1974	  nop
	/*
	 * The subcc under .co_sm_half is the delay slot of the bz below, so
	 * it also runs when the branch to .co_sm_byte is taken; .co_sm_byte
	 * does not read %o2.  .co_sm_half is also a branch target of the
	 * .co_med_* tails, which arrive with %o2 already decremented by 1.
	 */
1975	deccc	%o2
1976	bz,pt	%ncc, .co_sm_byte
1977.co_sm_half:
1978	  subcc	%o2, 2, %o2		! reduce count by 2
1979	lduh	[%o0], %o3		! read half word
1980	add	%o0, 2, %o0		! advance SRC by 2
1981	stha	%o3, [%o1]ASI_USER	! write half word
1982	bgt,pt	%ncc, .co_sm_half	! loop til done
1983	  add	%o1, 2, %o1		! advance DST by 2
1984	addcc	%o2, 1, %o2		! restore count
1985	bz,pt	%ncc, .co_sm_exit
1986	  nop
1987.co_sm_byte:
1988	ldub	[%o0], %o3
1989	stba	%o3, [%o1]ASI_USER
1990	membar	#Sync				! sync error barrier
1991	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1992	retl
1993	  mov	%g0, %o0		! return 0
1994	.align 16
1995.co_sm_word:
	/* reached with %o2 = count - 3 (see the delay slot at .co_sm_movebytes) */
1996	subcc	%o2, 4, %o2		! update count
1997	bgt,pt	%ncc, .co_sm_wordx
1998	  lduw	[%o0], %o3		! read word
1999	addcc	%o2, 3, %o2		! restore count
2000	bz,pt	%ncc, .co_sm_exit
2001	  stwa	%o3, [%o1]ASI_USER	! write word
2002	deccc	%o2			! reduce count for cc test
2003	ldub	[%o0 + 4], %o3		! load one byte
2004	add	%o1, 4, %o1
2005	bz,pt	%ncc, .co_sm_exit
2006	  stba	%o3, [%o1]ASI_USER	! store one byte
2007	ldub	[%o0 + 5], %o3		! load second byte
2008	deccc	%o2
2009	inc	%o1
2010	bz,pt	%ncc, .co_sm_exit
2011	  stba	%o3, [%o1]ASI_USER	! store second byte
2012	ldub	[%o0 + 6], %o3		! load third byte
2013	inc	%o1
2014	stba	%o3, [%o1]ASI_USER	! store third byte
2015.co_sm_exit:
	/* common success exit: put the saved handler back and return 0 */
2016	  membar	#Sync				! sync error barrier
2017	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2018	retl
2019	  mov	%g0, %o0		! return 0
2020
/*
 * Medium-length leaf copyout (count > CHKSIZE).
 *
 * (src ^ dst) picks the widest unit both pointers can be aligned to:
 *	bit 0 set	-> byte loop (.co_sm_movebytes)
 *	bits 0-1 set	-> .co_med_half
 *	bits 0-2 set	-> .co_med_word
 *	otherwise	-> .co_med_long (falls through below)
 *
 * .co_med_long first moves bytes, then at most one word, until %o0 is
 * 8-byte aligned, then moves 32 bytes per iteration, then single long
 * words, and leaves the last 0-7 bytes to .co_sm_half / .co_sm_byte.
 */
2021	.align 16
2022.co_med:
2023	xor	%o0, %o1, %o3		! setup alignment check
2024	btst	1, %o3
2025	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
2026	  nop
2027	btst	3, %o3
2028	bnz,pt	%ncc, .co_med_half	! halfword aligned
2029	  nop
2030	btst	7, %o3
2031	bnz,pt	%ncc, .co_med_word	! word aligned
2032	  nop
2033.co_med_long:
2034	btst	3, %o0			! check for
2035	bz,pt	%ncc, .co_med_long1	! word alignment
2036	  nop
2037.co_med_long0:
2038	ldub	[%o0], %o3		! load one byte
2039	inc	%o0
2040	stba	%o3,[%o1]ASI_USER	! store byte
2041	inc	%o1
2042	btst	3, %o0
2043	bnz,pt	%ncc, .co_med_long0
2044	  dec	%o2
2045.co_med_long1:			! word aligned
2046	btst	7, %o0			! check for long word
2047	bz,pt	%ncc, .co_med_long2
2048	  nop
2049	lduw	[%o0], %o3		! load word
2050	add	%o0, 4, %o0		! advance SRC by 4
2051	stwa	%o3, [%o1]ASI_USER	! store word
2052	add	%o1, 4, %o1		! advance DST by 4
2053	sub	%o2, 4, %o2		! reduce count by 4
2054!
2055!  Now long word aligned and have at least 32 bytes to move
2056!
2057.co_med_long2:
2058	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2059	sub	%o1, 8, %o1		! adjust pointer to allow store in
2060					! branch delay slot instead of add
2061.co_med_lmove:
	/*
	 * DST is pre-decremented by 8 above and advanced at the top of each
	 * pass; the fourth store is in the delay slot of the loop branch, so
	 * the matching advance is the add that follows the loop.
	 */
2062	add	%o1, 8, %o1		! advance DST by 8
2063	ldx	[%o0], %o3		! read long word
2064	subcc	%o2, 32, %o2		! reduce count by 32
2065	stxa	%o3, [%o1]ASI_USER	! write long word
2066	add	%o1, 8, %o1		! advance DST by 8
2067	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
2068	add	%o0, 32, %o0		! advance SRC by 32
2069	stxa	%o3, [%o1]ASI_USER
2070	ldx	[%o0 - 16], %o3
2071	add	%o1, 8, %o1		! advance DST by 8
2072	stxa	%o3, [%o1]ASI_USER
2073	ldx	[%o0 - 8], %o3
2074	add	%o1, 8, %o1		! advance DST by 8
2075	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
2076	  stxa	%o3, [%o1]ASI_USER
2077	add	%o1, 8, %o1		! advance DST by 8
	/* %o2 is (remaining - 31) here; adding 24 leaves (remaining - 7) */
2078	addcc	%o2, 24, %o2		! restore count to long word offset
2079	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
2080	  nop
2081.co_med_lword:
2082	ldx	[%o0], %o3		! read long word
2083	subcc	%o2, 8, %o2		! reduce count by 8
2084	stxa	%o3, [%o1]ASI_USER	! write long word
2085	add	%o0, 8, %o0		! advance SRC by 8
2086	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
2087	  add	%o1, 8, %o1		! advance DST by 8
2088.co_med_lextra:
	/* the deccc in the delay slot prepares the "exactly one byte left" test */
2089	addcc	%o2, 7, %o2		! restore rest of count
2090	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2091	  deccc	%o2
2092	bz,pt	%ncc, .co_sm_byte
2093	  nop
2094	ba,pt	%ncc, .co_sm_half
2095	  nop
2096
/*
 * Medium-length leaf copyout when src and dst agree only in their low
 * two address bits.  Bytes are moved until %o0 is word aligned, then
 * 16 bytes (four words) per iteration, then single words; the last 0-3
 * bytes are left to .co_sm_half / .co_sm_byte.
 */
2097	.align 16
2098	nop				! instruction alignment
2099					! see discussion at start of file
2100.co_med_word:
2101	btst	3, %o0			! check for
2102	bz,pt	%ncc, .co_med_word1	! word alignment
2103	  nop
2104.co_med_word0:
2105	ldub	[%o0], %o3		! load one byte
2106	inc	%o0
2107	stba	%o3,[%o1]ASI_USER	! store byte
2108	inc	%o1
2109	btst	3, %o0
2110	bnz,pt	%ncc, .co_med_word0
2111	  dec	%o2
2112!
2113!  Now word aligned and have at least 36 bytes to move
2114!
2115.co_med_word1:
2116	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2117.co_med_wmove:
2118	lduw	[%o0], %o3		! read word
2119	subcc	%o2, 16, %o2		! reduce count by 16
2120	stwa	%o3, [%o1]ASI_USER	! write word
2121	add	%o1, 4, %o1		! advance DST by 4
2122	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
2123	add	%o0, 16, %o0		! advance SRC by 16
2124	stwa	%o3, [%o1]ASI_USER
2125	add	%o1, 4, %o1		! advance DST by 4
2126	lduw	[%o0 - 8], %o3
2127	stwa	%o3, [%o1]ASI_USER
2128	add	%o1, 4, %o1		! advance DST by 4
2129	lduw	[%o0 - 4], %o3
2130	stwa	%o3, [%o1]ASI_USER
2131	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
2132	  add	%o1, 4, %o1		! advance DST by 4
	/* %o2 is (remaining - 15) here; adding 12 leaves (remaining - 3) */
2133	addcc	%o2, 12, %o2		! restore count to word offset
2134	ble,pt	%ncc, .co_med_wextra	! check for more words to move
2135	  nop
2136.co_med_word2:
2137	lduw	[%o0], %o3		! read word
2138	subcc	%o2, 4, %o2		! reduce count by 4
2139	stwa	%o3, [%o1]ASI_USER	! write word
2140	add	%o0, 4, %o0		! advance SRC by 4
2141	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
2142	  add	%o1, 4, %o1		! advance DST by 4
2143.co_med_wextra:
2144	addcc	%o2, 3, %o2		! restore rest of count
2145	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2146	  deccc	%o2
2147	bz,pt	%ncc, .co_sm_byte
2148	  nop
2149	ba,pt	%ncc, .co_sm_half
2150	  nop
2151
/*
 * Medium-length leaf copyout when src and dst agree only in their low
 * address bit.  At most one byte is moved to make %o0 halfword aligned,
 * then 8 bytes (four halfwords) per iteration; the last 0-7 bytes are
 * left to .co_sm_half / .co_sm_byte.
 */
2152	.align 16
2153	nop				! instruction alignment
2154	nop				! see discussion at start of file
2155	nop
2156.co_med_half:
2157	btst	1, %o0			! check for
2158	bz,pt	%ncc, .co_med_half1	! half word alignment
2159	  nop
2160	ldub	[%o0], %o3		! load one byte
2161	inc	%o0
2162	stba	%o3,[%o1]ASI_USER	! store byte
2163	inc	%o1
2164	dec	%o2
2165!
2166!  Now half word aligned and have at least 38 bytes to move
2167!
2168.co_med_half1:
2169	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2170.co_med_hmove:
2171	lduh	[%o0], %o3		! read half word
2172	subcc	%o2, 8, %o2		! reduce count by 8
2173	stha	%o3, [%o1]ASI_USER	! write half word
2174	add	%o1, 2, %o1		! advance DST by 2
2175	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
2176	add	%o0, 8, %o0		! advance SRC by 8
2177	stha	%o3, [%o1]ASI_USER
2178	add	%o1, 2, %o1		! advance DST by 2
2179	lduh	[%o0 - 4], %o3
2180	stha	%o3, [%o1]ASI_USER
2181	add	%o1, 2, %o1		! advance DST by 2
2182	lduh	[%o0 - 2], %o3
2183	stha	%o3, [%o1]ASI_USER
2184	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
2185	  add	%o1, 2, %o1		! advance DST by 2
2186	addcc	%o2, 7, %o2		! restore count
2187	bz,pt	%ncc, .co_sm_exit
2188	  deccc	%o2
2189	bz,pt	%ncc, .co_sm_byte
2190	  nop
2191	ba,pt	%ncc, .co_sm_half
2192	  nop
2193
2194/*
2195 * We got here because of a fault during short copyout.
2196 * ERRNO is not valid on this path: it is only filled in by copyio_fault,
 * and the short copy installs .sm_copyout_err in t_lofault directly.
 * The errno value is presumably still in %g1 (the sibling
 * .sm_xcopyout_err returns %g1), but DDI/DKI says return -1 (sigh).
 *
 * %o4 holds the t_lofault value saved by .copyout_small.  The original
 * arguments are moved back into %o0-%o2 so that a copyops handler, if
 * the thread has one, can be entered as if it had been called directly.
2197 */
2198.sm_copyout_err:
2199	membar	#Sync
2200	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2201	mov	SM_SAVE_SRC, %o0
2202	mov	SM_SAVE_DST, %o1
2203	mov	SM_SAVE_COUNT, %o2
2204	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2205	tst	%o3
2206	bz,pt	%ncc, 3f			! if not, return error
2207	  nop
	/* leaf routine, so a plain jmp leaves the return address in %o7 untouched */
2208	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2209	jmp	%o5				! original arguments
2210	  nop
22113:
2212	retl
2213	  or	%g0, -1, %o0		! return error value
2214
2215	SET_SIZE(copyout)
2216
2217/*
2218 * The _more entry points are not intended to be used directly by
2219 * any caller from outside this file.  They are provided to allow
2220 * profiling and dtrace of the portions of the copy code that use
2221 * the floating point registers.
2222 * This entry is particularly important as DTRACE (at least as of
2223 * 4/2004) does not support leaf functions.
2224 */
2225
/*
 * copyout_more / .do_copyout: set-up for the FP-register copy.
 *
 * A new register window is taken, so the arguments are in %i0-%i2 from
 * here on.  REAL_LOFAULT is loaded by the caller of .do_copyout
 * (.copyout_err here, .xcopyout_err from .xcopyout_more).
 *
 * State established before the copy starts:
 *	t_lofault	copyio_fault; the previous value is kept in %l6
 *	SAVE_*		copies of the original arguments
 *	frame slots	original %fprs at SAVED_FPRS_OFFSET; the FP
 *			registers too (BST_FPQ2Q4_TOSTACK) when FPRS_FEF
 *			was already set
 */
2226	ENTRY(copyout_more)
2227.copyout_more:
2228	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2229	set	.copyout_err, REAL_LOFAULT
2230
2231/*
2232 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2233 */
2234.do_copyout:
2235        set     copyio_fault, %l7		! .copyio_fault is lofault val
2236
2237	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2238	membar	#Sync				! sync error barrier
2239	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2240
2241	mov	%i0, SAVE_SRC
2242	mov	%i1, SAVE_DST
2243	mov	%i2, SAVE_COUNT
2244
2245	FP_NOMIGRATE(6, 7)
2246
	/*
	 * If FPRS_FEF was clear, the annulled branch enables it in the delay
	 * slot and skips the register save; otherwise fall through and save.
	 */
2247	rd	%fprs, %o2		! check for unused fp
2248	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2249	btst	FPRS_FEF, %o2
2250	bz,a,pt	%icc, .do_blockcopyout
2251	  wr	%g0, FPRS_FEF, %fprs
2252
2253	BST_FPQ2Q4_TOSTACK(%o2)
2254
/*
 * .do_blockcopyout: the FP-register copy proper.
 *
 *  1. Save %gsr in the frame and set FPUSED_FLAG in %l6 so that
 *     copyio_fault and .copyout_exit know the FP state must be undone.
 *  2. Move single bytes (four per pass in .co_blkalign, then one per
 *     pass) until DST is a multiple of VIS_BLOCKSIZE; CNT is reduced by
 *     the number of bytes moved.
 *  3. Main loop (second label 1:): per pass, load eight doublewords from
 *     the 8-byte aligned SRC, merge neighbouring pairs with faligndata
 *     into %f48-%f62 and store that block with ASI_BLK_AIUS.  The loads
 *     for the next block are issued while the current one is merged.
 *  4. Tail: store the block still pending in %f48-%f62, then move what
 *     is left a byte at a time (label 5:).  When exactly VIS_BLOCKSIZE
 *     bytes remain and REALSRC is 8-byte aligned, one more whole block
 *     is moved with fsrc1 instead (second label 2:).
 *
 * DST, TMP, CNT, SRC and REALSRC are register macros defined outside
 * this part of the file.
 */
2255.do_blockcopyout:
2256	rd	%gsr, %o2
2257	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2258	or	%l6, FPUSED_FLAG, %l6
2259
	/* TMP = DST mod VIS_BLOCKSIZE; when non-zero it becomes VIS_BLOCKSIZE - TMP */
2260	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2261	mov	ASI_USER, %asi
2262	bz,pt	%ncc, 2f
2263	  neg	TMP
2264	add	TMP, VIS_BLOCKSIZE, TMP
2265
2266	! TMP = bytes required to align DST on FP_BLOCK boundary
2267	! Using SRC as a tmp here
2268	cmp	TMP, 3
2269	bleu,pt	%ncc, 1f
2270	  sub	CNT,TMP,CNT		! adjust main count
2271	sub	TMP, 3, TMP		! adjust for end of loop test
2272.co_blkalign:
2273	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
2274	stba	SRC, [DST]%asi
2275	subcc	TMP, 4, TMP
2276	ldub	[REALSRC + 1], SRC
2277	add	REALSRC, 4, REALSRC
2278	stba	SRC, [DST + 1]%asi
2279	ldub	[REALSRC - 2], SRC
2280	add	DST, 4, DST
2281	stba	SRC, [DST - 2]%asi
2282	ldub	[REALSRC - 1], SRC
2283	bgu,pt	%ncc, .co_blkalign
2284	  stba	SRC, [DST - 1]%asi
2285
2286	addcc	TMP, 3, TMP		! restore count adjustment
2287	bz,pt	%ncc, 2f		! no bytes left?
2288	  nop
22891:	ldub	[REALSRC], SRC
2290	inc	REALSRC
2291	inc	DST
2292	deccc	TMP
2293	bgu	%ncc, 1b
2294	  stba	SRC, [DST - 1]%asi
2295
22962:
	/* SRC = REALSRC rounded down to 8 bytes; alignaddr is given REALSRC for the later faligndata merges */
2297	andn	REALSRC, 0x7, SRC
2298	alignaddr REALSRC, %g0, %g0
2299
2300	! SRC - 8-byte aligned
2301	! DST - 64-byte aligned
2302	prefetch [SRC], #one_read
2303	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
2304	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
2305	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
2306	ldd	[SRC], %f16
2307#if CHEETAH_PREFETCH > 4
2308	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2309#endif
2310	ldd	[SRC + 0x08], %f18
2311#if CHEETAH_PREFETCH > 5
2312	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
2313#endif
2314	ldd	[SRC + 0x10], %f20
2315#if CHEETAH_PREFETCH > 6
2316	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
2317#endif
2318	faligndata %f16, %f18, %f48
2319	ldd	[SRC + 0x18], %f22
2320#if CHEETAH_PREFETCH > 7
2321	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
2322#endif
2323	faligndata %f18, %f20, %f50
2324	ldd	[SRC + 0x20], %f24
2325	faligndata %f20, %f22, %f52
2326	ldd	[SRC + 0x28], %f26
2327	faligndata %f22, %f24, %f54
2328	ldd	[SRC + 0x30], %f28
2329	faligndata %f24, %f26, %f56
2330	ldd	[SRC + 0x38], %f30
2331	faligndata %f26, %f28, %f58
2332	ldd	[SRC + VIS_BLOCKSIZE], %f16
2333	sub	CNT, VIS_BLOCKSIZE, CNT
2334	add	SRC, VIS_BLOCKSIZE, SRC
2335	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2336	ba,a,pt	%ncc, 1f
2337	  nop
2338	.align	16
23391:
	/* on entry %f48-%f58 hold six merged doublewords; the last two are produced below before the block store */
2340	ldd	[SRC + 0x08], %f18
2341	faligndata %f28, %f30, %f60
2342	ldd	[SRC + 0x10], %f20
2343	faligndata %f30, %f16, %f62
2344	stda	%f48, [DST]ASI_BLK_AIUS
2345	ldd	[SRC + 0x18], %f22
2346	faligndata %f16, %f18, %f48
2347	ldd	[SRC + 0x20], %f24
2348	faligndata %f18, %f20, %f50
2349	ldd	[SRC + 0x28], %f26
2350	faligndata %f20, %f22, %f52
2351	ldd	[SRC + 0x30], %f28
2352	faligndata %f22, %f24, %f54
2353	ldd	[SRC + 0x38], %f30
2354	faligndata %f24, %f26, %f56
2355	sub	CNT, VIS_BLOCKSIZE, CNT
2356	ldd	[SRC + VIS_BLOCKSIZE], %f16
2357	faligndata %f26, %f28, %f58
2358	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
2359	add	DST, VIS_BLOCKSIZE, DST
2360	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2361	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2362	cmp	CNT, VIS_BLOCKSIZE + 8
2363	bgu,pt	%ncc, 1b
2364	  add	SRC, VIS_BLOCKSIZE, SRC
2365
2366	! only if REALSRC & 0x7 is 0
2367	cmp	CNT, VIS_BLOCKSIZE
2368	bne	%ncc, 3f
2369	  andcc	REALSRC, 0x7, %g0
2370	bz,pt	%ncc, 2f
2371	  nop
23723:
	/* finish and store the block already held in %f48-%f58, then go to the byte tail */
2373	faligndata %f28, %f30, %f60
2374	faligndata %f30, %f16, %f62
2375	stda	%f48, [DST]ASI_BLK_AIUS
2376	add	DST, VIS_BLOCKSIZE, DST
2377	ba,pt	%ncc, 3f
2378	  nop
23792:
	/* aligned source with exactly one block left: plain register moves (fsrc1) replace faligndata */
2380	ldd	[SRC + 0x08], %f18
2381	fsrc1	%f28, %f60
2382	ldd	[SRC + 0x10], %f20
2383	fsrc1	%f30, %f62
2384	stda	%f48, [DST]ASI_BLK_AIUS
2385	ldd	[SRC + 0x18], %f22
2386	fsrc1	%f16, %f48
2387	ldd	[SRC + 0x20], %f24
2388	fsrc1	%f18, %f50
2389	ldd	[SRC + 0x28], %f26
2390	fsrc1	%f20, %f52
2391	ldd	[SRC + 0x30], %f28
2392	fsrc1	%f22, %f54
2393	ldd	[SRC + 0x38], %f30
2394	fsrc1	%f24, %f56
2395	sub	CNT, VIS_BLOCKSIZE, CNT
2396	add	DST, VIS_BLOCKSIZE, DST
2397	add	SRC, VIS_BLOCKSIZE, SRC
2398	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2399	fsrc1	%f26, %f58
2400	fsrc1	%f28, %f60
2401	fsrc1	%f30, %f62
2402	stda	%f48, [DST]ASI_BLK_AIUS
2403	add	DST, VIS_BLOCKSIZE, DST
2404	ba,a,pt	%ncc, 4f
2405	  nop
2406
24073:	tst	CNT
2408	bz,a	%ncc, 4f
2409	  nop
2410
	/* byte tail: the store is in the delay slot, after DST has been advanced */
24115:	ldub	[REALSRC], TMP
2412	inc	REALSRC
2413	inc	DST
2414	deccc	CNT
2415	bgu	%ncc, 5b
2416	  stba	TMP, [DST - 1]%asi
24174:
2418
/*
 * Success exit of the FP-register copyout.  After the FPRAS macros,
 * this undoes the set-up in the same way copyio_fault does: %gsr from
 * the frame, FP registers reloaded (saved FPRS_FEF set) or FZEROQ2Q4
 * (clear), saved %fprs written back, FPUSED_FLAG stripped from %l6 and
 * %l6 stored into t_lofault.  Returns 0.
 */
2419.copyout_exit:
2420	membar	#Sync
2421
2422	FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8)
2423	FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9)
2424	FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9)	! lose outputs
2425
2426	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2427	wr	%o2, 0, %gsr		! restore gsr
2428
2429	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2430	btst	FPRS_FEF, %o3
2431	bz,pt	%icc, 4f
2432	  nop
2433
2434	BLD_FPQ2Q4_FROMSTACK(%o2)
2435
2436	ba,pt	%ncc, 1f
2437	  wr	%o3, 0, %fprs		! restore fprs
2438
24394:
2440	FZEROQ2Q4
2441	wr	%o3, 0, %fprs		! restore fprs
2442
24431:
2444	membar	#Sync
2445	andn	%l6, FPUSED_FLAG, %l6
2446	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2447	FP_ALLOWMIGRATE(5, 6)
	/* the restore in the delay slot pops the window and leaves 0 in the caller %o0 */
2448	ret
2449	  restore	%g0, 0, %o0
2450
2451/*
2452 * We got here because of a fault during copyout.
2453 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 *
 * Entered by the jmp at the end of copyio_fault, which has already put
 * the original arguments back in %i0-%i2 and restored t_lofault.  If
 * the thread has a copyops vector, its CP_COPYOUT entry is entered with
 * the window restored, so those arguments appear in %o0-%o2; otherwise
 * -1 is returned.
2454 */
2455.copyout_err:
2456	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2457	tst	%o4
2458	bz,pt	%ncc, 2f			! if not, return error
2459	  nop
	/* %g2 is used for the target because it is unaffected by the restore in the delay slot */
2460	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
2461	jmp	%g2				! original arguments
2462	  restore %g0, 0, %g0			! dispose of copy window
24632:
2464        ret
2465	  restore %g0, -1, %o0			! return error value
2466
2467
2468	SET_SIZE(copyout_more)
2469
2470#endif	/* lint */
2471
2472
2473#ifdef	lint
2474
/*
 * Stub that is only compiled when lint is defined; the real xcopyout is
 * the assembly routine in the #else branch that follows.
 */
2475/*ARGSUSED*/
2476int
2477xcopyout(const void *kaddr, void *uaddr, size_t count)
2478{ return (0); }
2479
2480#else	/* lint */
2481
/*
 * xcopyout entry: %o0 = source (kaddr), %o1 = destination (uaddr),
 * %o2 = byte count.
 *
 * Same size/alignment dispatch as copyout, using the same
 * VIS_COPY_THRESHOLD and hw_copy_limit_1/2/4/8 tests, but targeting
 * .xcopyout_small and .xcopyout_more so that the xcopyout error
 * handlers are installed.
 */
2482	ENTRY(xcopyout)
2483	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	/* the branch below is the small case; larger counts fall through */
2484	bleu,pt	%ncc, .xcopyout_small		! go to larger cases
2485	  xor	%o0, %o1, %o3			! are src, dst alignable?
2486	btst	7, %o3				!
2487	bz,pt	%ncc, .xcopyout_8		!
2488	  nop
2489	btst	1, %o3				!
2490	bz,pt	%ncc, .xcopyout_2		! check for half-word
2491	  nop
2492	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2493	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2494	tst	%o3
2495	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2496	  cmp	%o2, %o3			! if length <= limit
2497	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2498	  nop
2499	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2500	  nop
2501.xcopyout_2:
2502	btst	3, %o3				!
2503	bz,pt	%ncc, .xcopyout_4		! check for word alignment
2504	  nop
2505	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2506	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2507	tst	%o3
2508	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2509	  cmp	%o2, %o3			! if length <= limit
2510	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2511	  nop
2512	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2513	  nop
2514.xcopyout_4:
2515	! already checked longword, must be word aligned
2516	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2517	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2518	tst	%o3
2519	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2520	  cmp	%o2, %o3			! if length <= limit
2521	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2522	  nop
2523	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2524	  nop
2525.xcopyout_8:
2526	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2527	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2528	tst	%o3
2529	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2530	  cmp	%o2, %o3			! if length <= limit
2531	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2532	  nop
2533	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2534	  nop
2535
/*
 * Both xcopyout paths reuse the copyout code and differ only in the
 * handler they install first:
 *   .xcopyout_small - puts .sm_xcopyout_err in t_lofault (old value in
 *		       %o4) and joins the leaf copy at .sm_do_copyout;
 *		       the store is in the delay slot of the branch.
 *   .xcopyout_more  - takes a register window, loads .xcopyout_err into
 *		       REAL_LOFAULT and joins .do_copyout; the low half
 *		       of the address is or-ed in from the delay slot.
 */
2536.xcopyout_small:
2537	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
2538	or	%o5, %lo(.sm_xcopyout_err), %o5
2539	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
2540	membar	#Sync				! sync error barrier
2541	ba,pt	%ncc, .sm_do_copyout		! common code
2542	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
2543
2544.xcopyout_more:
2545	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2546	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2547	ba,pt	%ncc, .do_copyout		! common code
2548	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2549
2550/*
2551 * We got here because of fault during xcopyout
2552 * Errno value is in ERRNO
 *
 * Entered by the jmp at the end of copyio_fault.  Unlike .copyout_err,
 * the fall-back return value is the errno held in ERRNO rather than -1.
 * If the thread has a copyops vector, its CP_XCOPYOUT entry is entered
 * instead, with the register window restored.
2553 */
2554.xcopyout_err:
2555	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2556	tst	%o4
2557	bz,pt	%ncc, 2f			! if not, return error
2558	  nop
2559	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
2560	jmp	%g2				! original arguments
2561	  restore %g0, 0, %g0			! dispose of copy window
25622:
2563        ret
2564	  restore ERRNO, 0, %o0			! return errno value
2565
/*
 * Fault handler for the leaf (small) xcopyout path.  %o4 holds the
 * t_lofault value saved by .xcopyout_small.  The original arguments are
 * moved back into %o0-%o2; if the thread has a copyops vector its
 * CP_XCOPYOUT entry is entered, otherwise the value in %g1 is returned.
 */
2566.sm_xcopyout_err:
2567
2568	membar	#Sync
2569	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2570	mov	SM_SAVE_SRC, %o0
2571	mov	SM_SAVE_DST, %o1
2572	mov	SM_SAVE_COUNT, %o2
2573	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2574	tst	%o3
2575	bz,pt	%ncc, 3f			! if not, return error
2576	  nop
2577	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
2578	jmp	%o5				! original arguments
2579	  nop
25803:
2581	retl
2582	  or	%g1, 0, %o0		! return errno value
2583
2584	SET_SIZE(xcopyout)
2585
2586#endif	/* lint */
2587
2588#ifdef	lint
2589
/*
 * Stub that is only compiled when lint is defined; the real
 * xcopyout_little is the assembly routine in the #else branch that
 * follows.
 */
2590/*ARGSUSED*/
2591int
2592xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2593{ return (0); }
2594
2595#else	/* lint */
2596
/*
 * xcopyout_little: %o0 = source (kaddr), %o1 = destination (uaddr),
 * %o2 = byte count.
 *
 * Leaf routine that moves one byte per pass and reverses the byte
 * order: the byte stored at uaddr[0] is kaddr[count - 1], the next is
 * kaddr[count - 2], and so on.  Stores use ASI_AIUSL.
 *
 * .xcopyio_err (defined outside this part of the file) is installed in
 * t_lofault for the duration; the previous value is kept in %o5.
 *
 * Index scheme: %o3 runs from -count up to 0 and is added to both
 * biased pointers.  %o1 is biased to uaddr + count, so it walks
 * forwards; %o0 is biased to kaddr + 2*count - 1 and stepped back by 2
 * per pass, so it walks backwards.
 */
2597	ENTRY(xcopyout_little)
2598	sethi	%hi(.xcopyio_err), %o5
2599	or	%o5, %lo(.xcopyio_err), %o5
2600	ldn	[THREAD_REG + T_LOFAULT], %o4
2601	membar	#Sync				! sync error barrier
2602	stn	%o5, [THREAD_REG + T_LOFAULT]
2603	mov	%o4, %o5
2604
2605	subcc	%g0, %o2, %o3
2606	add	%o0, %o2, %o0
2607	bz,pn	%ncc, 2f		! check for zero bytes
2608	  sub	%o2, 1, %o4
2609	add	%o0, %o4, %o0		! start w/last byte
2610	add	%o1, %o2, %o1
2611	ldub	[%o0 + %o3], %o4
2612
	/* loop ends when the inccc carries, i.e. when %o3 reaches 0; the annulled delay-slot load only runs when looping */
26131:	stba	%o4, [%o1 + %o3]ASI_AIUSL
2614	inccc	%o3
2615	sub	%o0, 2, %o0		! get next byte
2616	bcc,a,pt %ncc, 1b
2617	  ldub	[%o0 + %o3], %o4
2618
26192:
2620	membar	#Sync				! sync error barrier
2621	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2622	retl
2623	  mov	%g0, %o0		! return (0)
2624
2625	SET_SIZE(xcopyout_little)
2626
2627#endif	/* lint */
2628
2629/*
2630 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2631 */
2632
2633#if defined(lint)
2634
/*
 * Stub that is only compiled when lint is defined; the real copyin is
 * the assembly routine in the #else branch that follows.
 */
2635/*ARGSUSED*/
2636int
2637copyin(const void *uaddr, void *kaddr, size_t count)
2638{ return (0); }
2639
2640#else	/* lint */
2641
2642	ENTRY(copyin)
2643	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2644	bleu,pt	%ncc, .copyin_small		! go to larger cases
2645	  xor	%o0, %o1, %o3			! are src, dst alignable?
2646	btst	7, %o3				!
2647	bz,pt	%ncc, .copyin_8			! check for longword alignment
2648	  nop
2649	btst	1, %o3				!
2650	bz,pt	%ncc, .copyin_2			! check for half-word
2651	  nop
2652	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2653	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2654	tst	%o3
2655	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2656	  cmp	%o2, %o3			! if length <= limit
2657	bleu,pt	%ncc, .copyin_small		! go to small copy
2658	  nop
2659	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2660	  nop
2661.copyin_2:
2662	btst	3, %o3				!
2663	bz,pt	%ncc, .copyin_4			! check for word alignment
2664	  nop
2665	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2666	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2667	tst	%o3
2668	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2669	  cmp	%o2, %o3			! if length <= limit
2670	bleu,pt	%ncc, .copyin_small		! go to small copy
2671	  nop
2672	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2673	  nop
2674.copyin_4:
2675	! already checked longword, must be word aligned
2676	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2677	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2678	tst	%o3
2679	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2680	  cmp	%o2, %o3			! if length <= limit
2681	bleu,pt	%ncc, .copyin_small		! go to small copy
2682	  nop
2683	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2684	  nop
2685.copyin_8:
2686	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2687	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2688	tst	%o3
2689	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2690	  cmp	%o2, %o3			! if length <= limit
2691	bleu,pt	%ncc, .copyin_small		! go to small copy
2692	  nop
2693	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2694	  nop
2695
2696	.align	16
2697	nop				! instruction alignment
2698					! see discussion at start of file
2699.copyin_small:
2700	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
2701	or	%o5, %lo(.sm_copyin_err), %o5
2702	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
2703	membar	#Sync				! sync error barrier
2704	stn	%o5, [THREAD_REG + T_LOFAULT]
2705.sm_do_copyin:
2706	mov	%o0, SM_SAVE_SRC
2707	mov	%o1, SM_SAVE_DST
2708	cmp	%o2, SHORTCOPY		! check for really short case
2709	bleu,pt	%ncc, .ci_sm_left	!
2710	  mov	%o2, SM_SAVE_COUNT
2711	cmp	%o2, CHKSIZE		! check for medium length cases
2712	bgu,pn	%ncc, .ci_med		!
2713	  or	%o0, %o1, %o3		! prepare alignment check
2714	andcc	%o3, 0x3, %g0		! test for alignment
2715	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
2716.ci_sm_movebytes:
2717	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
2718.ci_sm_notalign4:
2719	lduba	[%o0]ASI_USER, %o3	! read byte
2720	subcc	%o2, 4, %o2		! reduce count by 4
2721	stb	%o3, [%o1]		! write byte
2722	add	%o0, 1, %o0		! advance SRC by 1
2723	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
2724	add	%o0, 1, %o0		! advance SRC by 1
2725	stb	%o3, [%o1 + 1]
2726	add	%o1, 4, %o1		! advance DST by 4
2727	lduba	[%o0]ASI_USER, %o3
2728	add	%o0, 1, %o0		! advance SRC by 1
2729	stb	%o3, [%o1 - 2]
2730	lduba	[%o0]ASI_USER, %o3
2731	add	%o0, 1, %o0		! advance SRC by 1
2732	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
2733	  stb	%o3, [%o1 - 1]
2734	add	%o2, 3, %o2		! restore count
2735.ci_sm_left:
2736	tst	%o2
2737	bz,pt	%ncc, .ci_sm_exit
2738	  nop
2739	lduba	[%o0]ASI_USER, %o3		! load one byte
2740	deccc	%o2			! reduce count for cc test
2741	bz,pt	%ncc, .ci_sm_exit
2742	  stb	%o3,[%o1]		! store one byte
2743	inc	%o0
2744	lduba	[%o0]ASI_USER, %o3	! load second byte
2745	deccc	%o2
2746	bz,pt	%ncc, .ci_sm_exit
2747	  stb	%o3,[%o1 + 1]		! store second byte
2748	inc	%o0
2749	lduba	[%o0]ASI_USER, %o3	! load third byte
2750	stb	%o3,[%o1 + 2]		! store third byte
2751	membar	#Sync				! sync error barrier
2752	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2753	retl
2754	  mov	%g0, %o0		! return 0
2755	.align	16
2756.ci_sm_words:
2757	lduwa	[%o0]ASI_USER, %o3		! read word
2758.ci_sm_wordx:
2759	subcc	%o2, 8, %o2		! update count
2760	stw	%o3, [%o1]		! write word
2761	add	%o0, 4, %o0		! update SRC
2762	add	%o1, 8, %o1		! update DST
2763	lduwa	[%o0]ASI_USER, %o3	! read word
2764	add	%o0, 4, %o0		! update SRC
2765	bgt,pt	%ncc, .ci_sm_words	! loop til done
2766	  stw	%o3, [%o1 - 4]		! write word
2767	addcc	%o2, 7, %o2		! restore count
2768	bz,pt	%ncc, .ci_sm_exit
2769	  nop
2770	deccc	%o2
2771	bz,pt	%ncc, .ci_sm_byte
2772.ci_sm_half:
2773	  subcc	%o2, 2, %o2		! reduce count by 2
2774	lduha	[%o0]ASI_USER, %o3	! read half word
2775	add	%o0, 2, %o0		! advance SRC by 2
2776	add	%o1, 2, %o1		! advance DST by 2
2777	bgt,pt	%ncc, .ci_sm_half	! loop til done
2778	  sth	%o3, [%o1 - 2]		! write half word
2779	addcc	%o2, 1, %o2		! restore count
2780	bz,pt	%ncc, .ci_sm_exit
2781	  nop
2782.ci_sm_byte:
2783	lduba	[%o0]ASI_USER, %o3
2784	stb	%o3, [%o1]
2785	membar	#Sync				! sync error barrier
2786	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2787	retl
2788	  mov	%g0, %o0		! return 0
2789	.align	16
2790.ci_sm_word:
2791	subcc	%o2, 4, %o2		! update count
2792	bgt,pt	%ncc, .ci_sm_wordx
2793	  lduwa	[%o0]ASI_USER, %o3		! read word
2794	addcc	%o2, 3, %o2		! restore count
2795	bz,pt	%ncc, .ci_sm_exit
2796	  stw	%o3, [%o1]		! write word
2797	deccc	%o2			! reduce count for cc test
2798	add	%o0, 4, %o0
2799	lduba	[%o0]ASI_USER, %o3	! load one byte
2800	bz,pt	%ncc, .ci_sm_exit
2801	  stb	%o3, [%o1 + 4]		! store one byte
2802	inc	%o0
2803	lduba	[%o0]ASI_USER, %o3	! load second byte
2804	deccc	%o2
2805	bz,pt	%ncc, .ci_sm_exit
2806	  stb	%o3, [%o1 + 5]		! store second byte
2807	inc	%o0
2808	lduba	[%o0]ASI_USER, %o3	! load third byte
2809	stb	%o3, [%o1 + 6]		! store third byte
2810.ci_sm_exit:
2811	membar	#Sync				! sync error barrier
2812	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2813	retl
2814	  mov	%g0, %o0		! return 0
2815
2816	.align 16
2817.ci_med:
2818	xor	%o0, %o1, %o3		! setup alignment check
2819	btst	1, %o3
2820	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
2821	  nop
2822	btst	3, %o3
2823	bnz,pt	%ncc, .ci_med_half	! halfword aligned
2824	  nop
2825	btst	7, %o3
2826	bnz,pt	%ncc, .ci_med_word	! word aligned
2827	  nop
2828.ci_med_long:
2829	btst	3, %o0			! check for
2830	bz,pt	%ncc, .ci_med_long1	! word alignment
2831	  nop
2832.ci_med_long0:
2833	lduba	[%o0]ASI_USER, %o3		! load one byte
2834	inc	%o0
2835	stb	%o3,[%o1]		! store byte
2836	inc	%o1
2837	btst	3, %o0
2838	bnz,pt	%ncc, .ci_med_long0
2839	  dec	%o2
2840.ci_med_long1:			! word aligned
2841	btst	7, %o0			! check for long word
2842	bz,pt	%ncc, .ci_med_long2
2843	  nop
2844	lduwa	[%o0]ASI_USER, %o3	! load word
2845	add	%o0, 4, %o0		! advance SRC by 4
2846	stw	%o3, [%o1]		! store word
2847	add	%o1, 4, %o1		! advance DST by 4
2848	sub	%o2, 4, %o2		! reduce count by 4
2849!
2850!  Now long word aligned and have at least 32 bytes to move
2851!
2852.ci_med_long2:
2853	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2854.ci_med_lmove:
2855	ldxa	[%o0]ASI_USER, %o3	! read long word
2856	subcc	%o2, 32, %o2		! reduce count by 32
2857	stx	%o3, [%o1]		! write long word
2858	add	%o0, 8, %o0		! advance SRC by 8
2859	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
2860	add	%o0, 8, %o0		! advance SRC by 8
2861	stx	%o3, [%o1 + 8]
2862	add	%o1, 32, %o1		! advance DST by 32
2863	ldxa	[%o0]ASI_USER, %o3
2864	add	%o0, 8, %o0		! advance SRC by 8
2865	stx	%o3, [%o1 - 16]
2866	ldxa	[%o0]ASI_USER, %o3
2867	add	%o0, 8, %o0		! advance SRC by 8
2868	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
2869	  stx	%o3, [%o1 - 8]
2870	addcc	%o2, 24, %o2		! restore count to long word offset
2871	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
2872	  nop
2873.ci_med_lword:
2874	ldxa	[%o0]ASI_USER, %o3	! read long word
2875	subcc	%o2, 8, %o2		! reduce count by 8
2876	stx	%o3, [%o1]		! write long word
2877	add	%o0, 8, %o0		! advance SRC by 8
2878	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
2879	  add	%o1, 8, %o1		! advance DST by 8
2880.ci_med_lextra:
2881	addcc	%o2, 7, %o2		! restore rest of count
2882	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2883	  deccc	%o2
2884	bz,pt	%ncc, .ci_sm_byte
2885	  nop
2886	ba,pt	%ncc, .ci_sm_half
2887	  nop
2888
2889	.align 16
2890	nop				! instruction alignment
2891					! see discussion at start of file
2892.ci_med_word:
2893	btst	3, %o0			! check for
2894	bz,pt	%ncc, .ci_med_word1	! word alignment
2895	  nop
2896.ci_med_word0:
2897	lduba	[%o0]ASI_USER, %o3	! load one byte
2898	inc	%o0
2899	stb	%o3,[%o1]		! store byte
2900	inc	%o1
2901	btst	3, %o0
2902	bnz,pt	%ncc, .ci_med_word0
2903	  dec	%o2
2904!
2905!  Now word aligned and have at least 36 bytes to move
2906!
2907.ci_med_word1:
2908	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2909.ci_med_wmove:
2910	lduwa	[%o0]ASI_USER, %o3	! read word
2911	subcc	%o2, 16, %o2		! reduce count by 16
2912	stw	%o3, [%o1]		! write word
2913	add	%o0, 4, %o0		! advance SRC by 4
2914	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
2915	add	%o0, 4, %o0		! advance SRC by 4
2916	stw	%o3, [%o1 + 4]
2917	add	%o1, 16, %o1		! advance DST by 16
2918	lduwa	[%o0]ASI_USER, %o3
2919	add	%o0, 4, %o0		! advance SRC by 4
2920	stw	%o3, [%o1 - 8]
2921	lduwa	[%o0]ASI_USER, %o3
2922	add	%o0, 4, %o0		! advance SRC by 4
2923	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
2924	  stw	%o3, [%o1 - 4]
2925	addcc	%o2, 12, %o2		! restore count to word offset
2926	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
2927	  nop
2928.ci_med_word2:
2929	lduwa	[%o0]ASI_USER, %o3	! read word
2930	subcc	%o2, 4, %o2		! reduce count by 4
2931	stw	%o3, [%o1]		! write word
2932	add	%o0, 4, %o0		! advance SRC by 4
2933	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
2934	  add	%o1, 4, %o1		! advance DST by 4
2935.ci_med_wextra:
2936	addcc	%o2, 3, %o2		! restore rest of count
2937	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2938	  deccc	%o2
2939	bz,pt	%ncc, .ci_sm_byte
2940	  nop
2941	ba,pt	%ncc, .ci_sm_half
2942	  nop
2943
2944	.align 16
2945	nop				! instruction alignment
2946					! see discussion at start of file
2947.ci_med_half:
2948	btst	1, %o0			! check for
2949	bz,pt	%ncc, .ci_med_half1	! half word alignment
2950	  nop
2951	lduba	[%o0]ASI_USER, %o3	! load one byte
2952	inc	%o0
2953	stb	%o3,[%o1]		! store byte
2954	inc	%o1
2955	dec	%o2
2956!
2957!  Now half word aligned and have at least 38 bytes to move
2958!
2959.ci_med_half1:
2960	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2961.ci_med_hmove:
2962	lduha	[%o0]ASI_USER, %o3	! read half word
2963	subcc	%o2, 8, %o2		! reduce count by 8
2964	sth	%o3, [%o1]		! write half word
2965	add	%o0, 2, %o0		! advance SRC by 2
2966	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
2967	add	%o0, 2, %o0		! advance SRC by 2
2968	sth	%o3, [%o1 + 2]
2969	add	%o1, 8, %o1		! advance DST by 8
2970	lduha	[%o0]ASI_USER, %o3
2971	add	%o0, 2, %o0		! advance SRC by 2
2972	sth	%o3, [%o1 - 4]
2973	lduha	[%o0]ASI_USER, %o3
2974	add	%o0, 2, %o0		! advance SRC by 2
2975	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
2976	  sth	%o3, [%o1 - 2]
2977	addcc	%o2, 7, %o2		! restore count
2978	bz,pt	%ncc, .ci_sm_exit
2979	  deccc	%o2
2980	bz,pt	%ncc, .ci_sm_byte
2981	  nop
2982	ba,pt	%ncc, .ci_sm_half
2983	  nop
2984
2985.sm_copyin_err:
2986	membar	#Sync
2987	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2988	mov	SM_SAVE_SRC, %o0
2989	mov	SM_SAVE_DST, %o1
2990	mov	SM_SAVE_COUNT, %o2
2991	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2992	tst	%o3
2993	bz,pt	%ncc, 3f			! if not, return error
2994	  nop
2995	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
2996	jmp	%o5				! original arguments
2997	  nop
29983:
2999	retl
3000	  or	%g0, -1, %o0		! return errno value
3001
3002	SET_SIZE(copyin)
3003
3004
3005/*
3006 * The _more entry points are not intended to be used directly by
3007 * any caller from outside this file.  They are provided to allow
3008 * profiling and dtrace of the portions of the copy code that use
3009 * the floating point registers.
3010 * This entry is particularly important as DTRACE (at least as of
3011 * 4/2004) does not support leaf functions.
 *
 * copyin_more takes a new register window (with HWCOPYFRAMESIZE bytes
 * of extra stack), records .copyin_err in REAL_LOFAULT and falls into
 * .do_copyin.  .do_copyin is also reached from .xcopyin_more and
 * .copyin_noerr_more, each of which supplies its own REAL_LOFAULT.
 * The normal exit path returns 0.  .copyin_err returns -1, unless the
 * thread has a T_COPYOPS handler, in which case the window is released
 * and that handler's CP_COPYIN entry is jumped to instead.
3012 */
3013
3014	ENTRY(copyin_more)
3015.copyin_more:
3016	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3017	set	.copyin_err, REAL_LOFAULT
3018
3019/*
3020 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
 *
 * The previous t_lofault is kept in %l6 and copyio_fault is installed
 * as the handler for the duration of the copy.  The incoming src, dst
 * and count arguments are stashed in SAVE_SRC, SAVE_DST and SAVE_COUNT.
3021 */
3022.do_copyin:
3023	set	copyio_fault, %l7		! .copyio_fault is lofault val
3024
3025	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
3026	membar	#Sync				! sync error barrier
3027	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
3028
3029	mov	%i0, SAVE_SRC
3030	mov	%i1, SAVE_DST
3031	mov	%i2, SAVE_COUNT
3032
3033	FP_NOMIGRATE(6, 7)
3034
/*
 * Save the caller's %fprs on the stack.  If FPRS_FEF was clear the FPU
 * is simply enabled (the annulled delay slot runs only when the branch
 * is taken).  Otherwise BST_FPQ2Q4_TOSTACK runs first - presumably it
 * preserves the live FP registers this routine overwrites (the macro
 * is defined elsewhere; confirm).
 */
3035	rd	%fprs, %o2		! check for unused fp
3036	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
3037	btst	FPRS_FEF, %o2
3038	bz,a,pt	%icc, .do_blockcopyin
3039	  wr	%g0, FPRS_FEF, %fprs
3040
3041	BST_FPQ2Q4_TOSTACK(%o2)
3042
/*
 * Save %gsr, tag %l6 with FPUSED_FLAG and make ASI_USER the default
 * %asi.  Then copy bytes until DST is VIS_BLOCKSIZE aligned: four
 * bytes per pass in .ci_blkalign while more than 3 remain, then one
 * byte per pass at 1:.  CNT is reduced by the whole alignment byte
 * count up front, in the delay slot of the "cmp TMP, 3" branch.
 */
3043.do_blockcopyin:
3044	rd	%gsr, %o2
3045	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
3046	or	%l6, FPUSED_FLAG, %l6
3047
3048	andcc	DST, VIS_BLOCKSIZE - 1, TMP
3049	mov	ASI_USER, %asi
3050	bz,pt	%ncc, 2f
3051	  neg	TMP
3052	add	TMP, VIS_BLOCKSIZE, TMP
3053
3054	! TMP = bytes required to align DST on FP_BLOCK boundary
3055	! Using SRC as a tmp here
3056	cmp	TMP, 3
3057	bleu,pt	%ncc, 1f
3058	  sub	CNT,TMP,CNT		! adjust main count
3059	sub	TMP, 3, TMP		! adjust for end of loop test
3060.ci_blkalign:
3061	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
3062	stb	SRC, [DST]
3063	subcc	TMP, 4, TMP
3064	lduba	[REALSRC + 1]%asi, SRC
3065	add	REALSRC, 4, REALSRC
3066	stb	SRC, [DST + 1]
3067	lduba	[REALSRC - 2]%asi, SRC
3068	add	DST, 4, DST
3069	stb	SRC, [DST - 2]
3070	lduba	[REALSRC - 1]%asi, SRC
3071	bgu,pt	%ncc, .ci_blkalign
3072	  stb	SRC, [DST - 1]
3073
3074	addcc	TMP, 3, TMP		! restore count adjustment
3075	bz,pt	%ncc, 2f		! no bytes left?
3076	  nop
30771:	lduba	[REALSRC]%asi, SRC
3078	inc	REALSRC
3079	inc	DST
3080	deccc	TMP
3081	bgu	%ncc, 1b
3082	  stb	SRC, [DST - 1]
3083
/*
 * DST is now block aligned.  SRC becomes REALSRC rounded down to an
 * 8-byte boundary, and alignaddr records the low bits of REALSRC for
 * the faligndata instructions that follow.  Issue the prefetches, load
 * the first source block into %f16-%f30 and start realigning it into
 * %f48-%f58.
 */
30842:
3085	andn	REALSRC, 0x7, SRC
3086	alignaddr REALSRC, %g0, %g0
3087
3088	! SRC - 8-byte aligned
3089	! DST - 64-byte aligned
3090	prefetcha [SRC]%asi, #one_read
3091	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
3092	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
3093	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
3094	ldda	[SRC]%asi, %f16
3095#if CHEETAH_PREFETCH > 4
3096	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3097#endif
3098	ldda	[SRC + 0x08]%asi, %f18
3099#if CHEETAH_PREFETCH > 5
3100	prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
3101#endif
3102	ldda	[SRC + 0x10]%asi, %f20
3103#if CHEETAH_PREFETCH > 6
3104	prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
3105#endif
3106	faligndata %f16, %f18, %f48
3107	ldda	[SRC + 0x18]%asi, %f22
3108#if CHEETAH_PREFETCH > 7
3109	prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
3110#endif
3111	faligndata %f18, %f20, %f50
3112	ldda	[SRC + 0x20]%asi, %f24
3113	faligndata %f20, %f22, %f52
3114	ldda	[SRC + 0x28]%asi, %f26
3115	faligndata %f22, %f24, %f54
3116	ldda	[SRC + 0x30]%asi, %f28
3117	faligndata %f24, %f26, %f56
3118	ldda	[SRC + 0x38]%asi, %f30
3119	faligndata %f26, %f28, %f58
3120	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3121	sub	CNT, VIS_BLOCKSIZE, CNT
3122	add	SRC, VIS_BLOCKSIZE, SRC
3123	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3124	ba,a,pt	%ncc, 1f
3125	  nop
/*
 * Main loop.  Each pass finishes realigning the previous source block
 * into %f48-%f62, block-stores it to DST through ASI_BLK_P, and loads
 * and begins realigning the next source block.  The loop repeats while
 * CNT is greater than VIS_BLOCKSIZE + 8.
 */
3126	.align	16
31271:
3128	ldda	[SRC + 0x08]%asi, %f18
3129	faligndata %f28, %f30, %f60
3130	ldda	[SRC + 0x10]%asi, %f20
3131	faligndata %f30, %f16, %f62
3132	stda	%f48, [DST]ASI_BLK_P
3133	ldda	[SRC + 0x18]%asi, %f22
3134	faligndata %f16, %f18, %f48
3135	ldda	[SRC + 0x20]%asi, %f24
3136	faligndata %f18, %f20, %f50
3137	ldda	[SRC + 0x28]%asi, %f26
3138	faligndata %f20, %f22, %f52
3139	ldda	[SRC + 0x30]%asi, %f28
3140	faligndata %f22, %f24, %f54
3141	ldda	[SRC + 0x38]%asi, %f30
3142	faligndata %f24, %f26, %f56
3143	sub	CNT, VIS_BLOCKSIZE, CNT
3144	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3145	faligndata %f26, %f28, %f58
3146	prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
3147	add	DST, VIS_BLOCKSIZE, DST
3148	prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3149	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3150	cmp	CNT, VIS_BLOCKSIZE + 8
3151	bgu,pt	%ncc, 1b
3152	  add	SRC, VIS_BLOCKSIZE, SRC
3153
/*
 * Loop exit.  When exactly VIS_BLOCKSIZE bytes remain and REALSRC is
 * 8-byte aligned, 2: stores the pending block and then copies one more
 * whole block using fsrc1 (a plain register move, no realignment).
 * In every other case the first 3: stores the pending block and the
 * second 3: copies whatever CNT remains one byte at a time.
 */
3154	! only if REALSRC & 0x7 is 0
3155	cmp	CNT, VIS_BLOCKSIZE
3156	bne	%ncc, 3f
3157	  andcc	REALSRC, 0x7, %g0
3158	bz,pt	%ncc, 2f
3159	  nop
31603:
3161	faligndata %f28, %f30, %f60
3162	faligndata %f30, %f16, %f62
3163	stda	%f48, [DST]ASI_BLK_P
3164	add	DST, VIS_BLOCKSIZE, DST
3165	ba,pt	%ncc, 3f
3166	  nop
31672:
3168	ldda	[SRC + 0x08]%asi, %f18
3169	fsrc1	%f28, %f60
3170	ldda	[SRC + 0x10]%asi, %f20
3171	fsrc1	%f30, %f62
3172	stda	%f48, [DST]ASI_BLK_P
3173	ldda	[SRC + 0x18]%asi, %f22
3174	fsrc1	%f16, %f48
3175	ldda	[SRC + 0x20]%asi, %f24
3176	fsrc1	%f18, %f50
3177	ldda	[SRC + 0x28]%asi, %f26
3178	fsrc1	%f20, %f52
3179	ldda	[SRC + 0x30]%asi, %f28
3180	fsrc1	%f22, %f54
3181	ldda	[SRC + 0x38]%asi, %f30
3182	fsrc1	%f24, %f56
3183	sub	CNT, VIS_BLOCKSIZE, CNT
3184	add	DST, VIS_BLOCKSIZE, DST
3185	add	SRC, VIS_BLOCKSIZE, SRC
3186	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3187	fsrc1	%f26, %f58
3188	fsrc1	%f28, %f60
3189	fsrc1	%f30, %f62
3190	stda	%f48, [DST]ASI_BLK_P
3191	add	DST, VIS_BLOCKSIZE, DST
3192	ba,a,pt	%ncc, 4f
3193	  nop
3194
31953:	tst	CNT
3196	bz,a	%ncc, 4f
3197	  nop
3198
31995:	lduba	[REALSRC]ASI_USER, TMP
3200	inc	REALSRC
3201	inc	DST
3202	deccc	CNT
3203	bgu	%ncc, 5b
3204	  stb	TMP, [DST - 1]
32054:
3206
/*
 * Common successful exit.  Run the FPRAS macros and restore %gsr.
 * If FPRS_FEF was set in the %fprs saved on entry, reload the FP
 * registers with BLD_FPQ2Q4_FROMSTACK; otherwise apply FZEROQ2Q4.
 * Then restore %fprs, write the old t_lofault back with FPUSED_FLAG
 * cleared, and return 0.
 */
3207.copyin_exit:
3208	membar	#Sync
3209
3210	FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8)
3211	FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9)
3212	FPRAS_CHECK(FPRAS_COPYIN, %l5, 9)	! lose outputs
3213
3214	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
3215	wr	%o2, 0, %gsr
3216
3217	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3218	btst	FPRS_FEF, %o3
3219	bz,pt	%icc, 4f
3220	  nop
3221
3222	BLD_FPQ2Q4_FROMSTACK(%o2)
3223
3224	ba,pt	%ncc, 1f
3225	  wr	%o3, 0, %fprs		! restore fprs
3226
32274:
3228	FZEROQ2Q4
3229	wr	%o3, 0, %fprs		! restore fprs
3230
32311:
3232	membar	#Sync				! sync error barrier
3233	andn	%l6, FPUSED_FLAG, %l6
3234	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3235	FP_ALLOWMIGRATE(5, 6)
3236	ret
3237	  restore	%g0, 0, %o0
3238/*
3239 * We got here because of a fault during copyin
3240 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3241 */
3242.copyin_err:
3243	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3244	tst	%o4
3245	bz,pt	%ncc, 2f			! if not, return error
3246	nop
3247	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
3248	jmp	%g2				! original arguments
3249	restore %g0, 0, %g0			! dispose of copy window
32502:
3251	ret
3252	restore %g0, -1, %o0			! return error value
3253
3254
3255	SET_SIZE(copyin_more)
3256
3257#endif	/* lint */
3258
3259#ifdef	lint
3260
/* Lint-only stub; the real xcopyin is the assembly routine below. */
3261/*ARGSUSED*/
3262int
3263xcopyin(const void *uaddr, void *kaddr, size_t count)
3264{ return (0); }
3265
3266#else	/* lint */
3267
3268	ENTRY(xcopyin)
3269
/*
 * Choose between the leaf (small) copy and the FP block copy.
 * Counts at or below VIS_COPY_THRESHOLD always take .xcopyin_small.
 * Otherwise src ^ dst gives the best alignment the two addresses can
 * share (8, 4, 2 or 1 bytes) and the matching hw_copy_limit_N is
 * read: a limit of zero, or a count that does not exceed the limit,
 * also selects .xcopyin_small; anything larger goes to .xcopyin_more.
 */
3270	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3271	bleu,pt	%ncc, .xcopyin_small		! go to larger cases
3272	  xor	%o0, %o1, %o3			! are src, dst alignable?
3273	btst	7, %o3				!
3274	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
3275	  nop
3276	btst	1, %o3				!
3277	bz,pt	%ncc, .xcopyin_2		! check for half-word
3278	  nop
3279	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3280	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3281	tst	%o3
3282	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3283	  cmp	%o2, %o3			! if length <= limit
3284	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3285	  nop
3286	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3287	  nop
3288.xcopyin_2:
3289	btst	3, %o3				!
3290	bz,pt	%ncc, .xcopyin_4		! check for word alignment
3291	  nop
3292	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3293	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3294	tst	%o3
3295	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3296	  cmp	%o2, %o3			! if length <= limit
3297	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3298	  nop
3299	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3300	  nop
3301.xcopyin_4:
3302	! already checked longword, must be word aligned
3303	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3304	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3305	tst	%o3
3306	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3307	  cmp	%o2, %o3			! if length <= limit
3308	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3309	  nop
3310	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3311	  nop
3312.xcopyin_8:
3313	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3314	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3315	tst	%o3
3316	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3317	  cmp	%o2, %o3			! if length <= limit
3318	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3319	  nop
3320	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3321	  nop
3322
/*
 * Small path: the old t_lofault is kept in %o4, .sm_xcopyin_err is
 * installed as the handler (in the branch delay slot), and the shared
 * .sm_do_copyin code performs the copy.
 */
3323.xcopyin_small:
3324	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3325	or	%o5, %lo(.sm_xcopyin_err), %o5
3326	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofaul
3327	membar	#Sync				! sync error barrier
3328	ba,pt	%ncc, .sm_do_copyin		! common code
3329	  stn	%o5, [THREAD_REG + T_LOFAULT]
3330
/*
 * Large path: take a new register window, set REAL_LOFAULT to
 * .xcopyin_err and join the shared .do_copyin code.
 */
3331.xcopyin_more:
3332	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3333	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
3334	ba,pt	%ncc, .do_copyin
3335	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3336
3337/*
3338 * We got here because of fault during xcopyin
3339 * Errno value is in ERRNO
3340 */
3341.xcopyin_err:
3342	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3343	tst	%o4
3344	bz,pt	%ncc, 2f			! if not, return error
3345	  nop
3346	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
3347	jmp	%g2				! original arguments
3348	  restore %g0, 0, %g0			! dispose of copy window
33492:
3350        ret
3351	  restore ERRNO, 0, %o0			! return errno value
3352
/*
 * Fault on the small path: restore t_lofault from %o4, reload the
 * arguments from SM_SAVE_SRC, SM_SAVE_DST and SM_SAVE_COUNT, then
 * either jump to the thread's CP_XCOPYIN copyops handler or return
 * the value found in %g1.
 */
3353.sm_xcopyin_err:
3354
3355	membar	#Sync
3356	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3357	mov	SM_SAVE_SRC, %o0
3358	mov	SM_SAVE_DST, %o1
3359	mov	SM_SAVE_COUNT, %o2
3360	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
3361	tst	%o3
3362	bz,pt	%ncc, 3f			! if not, return error
3363	  nop
3364	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
3365	jmp	%o5				! original arguments
3366	  nop
33673:
3368	retl
3369	  or	%g1, 0, %o0		! return errno value
3370
3371	SET_SIZE(xcopyin)
3372
3373#endif	/* lint */
3374
3375#ifdef	lint
3376
/* Lint-only stub; the real xcopyin_little is the assembly routine below. */
3377/*ARGSUSED*/
3378int
3379xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3380{ return (0); }
3381
3382#else	/* lint */
3383
3384	ENTRY(xcopyin_little)
/*
 * Byte-at-a-time copy from user space through ASI_AIUSL.  The source
 * is read from its last byte down to its first while the destination
 * is filled from its first byte upward, so kaddr receives the count
 * bytes of uaddr in reverse order.  %o3 runs from -count up to zero
 * and the loop ends when inccc produces a carry.  A count of zero
 * skips the loop entirely.
 *
 * .xcopyio_err is installed as t_lofault for the duration; the old
 * value is held in %o5 and restored on both exits.  Returns 0 on
 * success; .xcopyio_err returns the value found in %g1.
 */
3385	sethi	%hi(.xcopyio_err), %o5
3386	or	%o5, %lo(.xcopyio_err), %o5
3387	ldn	[THREAD_REG + T_LOFAULT], %o4
3388	membar	#Sync				! sync error barrier
3389	stn	%o5, [THREAD_REG + T_LOFAULT]
3390	mov	%o4, %o5
3391
3392	subcc	%g0, %o2, %o3
3393	add	%o0, %o2, %o0
3394	bz,pn	%ncc, 2f		! check for zero bytes
3395	  sub	%o2, 1, %o4
3396	add	%o0, %o4, %o0		! start w/last byte
3397	add	%o1, %o2, %o1
3398	lduba	[%o0 + %o3]ASI_AIUSL, %o4
3399
34001:	stb	%o4, [%o1 + %o3]
3401	inccc	%o3
3402	sub	%o0, 2, %o0		! get next byte
3403	bcc,a,pt %ncc, 1b
3404	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
3405
34062:
3407	membar	#Sync				! sync error barrier
3408	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3409	retl
3410	  mov	%g0, %o0		! return (0)
3411
3412.xcopyio_err:
3413	membar	#Sync				! sync error barrier
3414	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3415	retl
3416	  mov	%g1, %o0
3417
3418	SET_SIZE(xcopyin_little)
3419
3420#endif	/* lint */
3421
3422
3423/*
3424 * Copy a block of storage - must not overlap (from + len <= to).
3425 * No fault handler installed (to be called under on_fault())
 *
 * Path selection matches the other copyin entry points: counts at or
 * below VIS_COPY_THRESHOLD, a zero hw_copy_limit_N for the alignment
 * class given by src ^ dst, or a count not above that limit, take
 * .copyin_ne_small; larger copies take .copyin_noerr_more.
 *
 * Small path: if t_lofault is already zero nothing is installed.
 * Otherwise .sm_copyio_noerr is installed; on a fault it writes the
 * previous t_lofault (kept in %o4) back and jumps to it.
 * Large path: joins .do_copyin with REAL_LOFAULT = .copyio_noerr,
 * which jumps to the address in %l6 and releases the register window.
 * Both .copyio_noerr and .sm_copyio_noerr are also used by
 * copyout_noerr.
3426 */
3427#if defined(lint)
3428
3429/* ARGSUSED */
3430void
3431copyin_noerr(const void *ufrom, void *kto, size_t count)
3432{}
3433
3434#else	/* lint */
3435	ENTRY(copyin_noerr)
3436
3437	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3438	bleu,pt	%ncc, .copyin_ne_small		! go to larger cases
3439	  xor	%o0, %o1, %o3			! are src, dst alignable?
3440	btst	7, %o3				!
3441	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
3442	  nop
3443	btst	1, %o3				!
3444	bz,pt	%ncc, .copyin_ne_2		! check for half-word
3445	  nop
3446	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3447	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3448	tst	%o3
3449	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3450	  cmp	%o2, %o3			! if length <= limit
3451	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3452	  nop
3453	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3454	  nop
3455.copyin_ne_2:
3456	btst	3, %o3				!
3457	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
3458	  nop
3459	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3460	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3461	tst	%o3
3462	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3463	  cmp	%o2, %o3			! if length <= limit
3464	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3465	  nop
3466	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3467	  nop
3468.copyin_ne_4:
3469	! already checked longword, must be word aligned
3470	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3471	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3472	tst	%o3
3473	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3474	  cmp	%o2, %o3			! if length <= limit
3475	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3476	  nop
3477	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3478	  nop
3479.copyin_ne_8:
3480	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3481	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3482	tst	%o3
3483	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3484	  cmp	%o2, %o3			! if length <= limit
3485	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3486	  nop
3487	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3488	  nop
3489
3490.copyin_ne_small:
3491	ldn	[THREAD_REG + T_LOFAULT], %o4
3492	tst	%o4
3493	bz,pn	%ncc, .sm_do_copyin
3494	  nop
3495	sethi	%hi(.sm_copyio_noerr), %o5
3496	or	%o5, %lo(.sm_copyio_noerr), %o5
3497	membar	#Sync				! sync error barrier
3498	ba,pt	%ncc, .sm_do_copyin
3499	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3500
3501.copyin_noerr_more:
3502	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3503	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3504	ba,pt	%ncc, .do_copyin
3505	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3506
3507.copyio_noerr:
3508	jmp	%l6
3509	  restore %g0,0,%g0
3510
3511.sm_copyio_noerr:
3512	membar	#Sync
3513	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
3514	jmp	%o4
3515	  nop
3516
3517	SET_SIZE(copyin_noerr)
3518#endif /* lint */
3519
3520/*
3521 * Copy a block of storage - must not overlap (from + len <= to).
3522 * No fault handler installed (to be called under on_fault())
 *
 * Path selection: counts at or below VIS_COPY_THRESHOLD, a zero
 * hw_copy_limit_N for the alignment class given by src ^ dst, or a
 * count not above that limit, take .copyout_ne_small; larger copies
 * take .copyout_noerr_more.
 *
 * Small path: if t_lofault is already zero nothing is installed and
 * control goes straight to .sm_do_copyout.  Otherwise .sm_copyio_noerr
 * is installed first (in the branch delay slot).
 * Large path: takes a new register window and joins .do_copyout with
 * REAL_LOFAULT = .copyio_noerr.
3523 */
3524
3525#if defined(lint)
3526
3527/* ARGSUSED */
3528void
3529copyout_noerr(const void *kfrom, void *uto, size_t count)
3530{}
3531
3532#else	/* lint */
3533	ENTRY(copyout_noerr)
3534
3535	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3536	bleu,pt	%ncc, .copyout_ne_small		! go to larger cases
3537	  xor	%o0, %o1, %o3			! are src, dst alignable?
3538	btst	7, %o3				!
3539	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
3540	  nop
3541	btst	1, %o3				!
3542	bz,pt	%ncc, .copyout_ne_2		! check for half-word
3543	  nop
3544	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3545	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3546	tst	%o3
3547	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3548	  cmp	%o2, %o3			! if length <= limit
3549	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3550	  nop
3551	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3552	  nop
3553.copyout_ne_2:
3554	btst	3, %o3				!
3555	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
3556	  nop
3557	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3558	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3559	tst	%o3
3560	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3561	  cmp	%o2, %o3			! if length <= limit
3562	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3563	  nop
3564	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3565	  nop
3566.copyout_ne_4:
3567	! already checked longword, must be word aligned
3568	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3569	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3570	tst	%o3
3571	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3572	  cmp	%o2, %o3			! if length <= limit
3573	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3574	  nop
3575	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3576	  nop
3577.copyout_ne_8:
3578	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3579	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3580	tst	%o3
3581	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3582	  cmp	%o2, %o3			! if length <= limit
3583	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3584	  nop
3585	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3586	  nop
3587
3588.copyout_ne_small:
3589	ldn	[THREAD_REG + T_LOFAULT], %o4
3590	tst	%o4
3591	bz,pn	%ncc, .sm_do_copyout
3592	  nop
3593	sethi	%hi(.sm_copyio_noerr), %o5
3594	or	%o5, %lo(.sm_copyio_noerr), %o5
3595	membar	#Sync				! sync error barrier
3596	ba,pt	%ncc, .sm_do_copyout
3597	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3598
3599.copyout_noerr_more:
3600	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3601	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3602	ba,pt	%ncc, .do_copyout
3603	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3604
3605	SET_SIZE(copyout_noerr)
3606#endif /* lint */
3607
3608
3609/*
3610 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3611 * longer than 256 bytes in length using spitfire's block stores.  If
3612 * the criteria for using this routine are not met then it calls bzero
3613 * and returns 1.  Otherwise 0 is returned indicating success.
3614 * Caller is responsible for ensuring use_hw_bzero is true and that
3615 * kpreempt_disable() has been called.
 *
 * As coded, the block path is taken when addr is VIS_BLOCKSIZE
 * aligned, len is at least 256 and len is a multiple of VIS_BLOCKSIZE.
 * If %fprs has FPRS_FEF set on entry, the %d0 block is saved to a
 * VIS_BLOCKSIZE-aligned stack slot first and reloaded before return;
 * %fprs is restored in either case.
3616 */
3617#ifdef lint
3618/*ARGSUSED*/
3619int
3620hwblkclr(void *addr, size_t len)
3621{
3622	return(0);
3623}
3624#else /* lint */
3625	! %i0 - start address
3626	! %i1 - length of region (multiple of 64)
3627	! %l0 - saved fprs
3628	! %l1 - pointer to saved %d0 block
3629	! %l2 - saved curthread->t_lwp
3630
3631	ENTRY(hwblkclr)
3632	! get another window w/space for one aligned block of saved fpregs
3633	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3634
3635	! Must be block-aligned
3636	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
3637	bnz,pn	%ncc, 1f
3638	  nop
3639
3640	! ... and must be 256 bytes or more
3641	cmp	%i1, 256
3642	blu,pn	%ncc, 1f
3643	  nop
3644
3645	! ... and length must be a multiple of VIS_BLOCKSIZE
3646	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
3647	bz,pn	%ncc, 2f
3648	  nop
3649
36501:	! punt, call bzero but notify the caller that bzero was used
3651	mov	%i0, %o0
3652	call	bzero
3653	mov	%i1, %o1
3654	ret
3655	  restore	%g0, 1, %o0 ! return (1) - did not use block operations
3656
36572:	rd	%fprs, %l0		! check for unused fp
3658	btst	FPRS_FEF, %l0
3659	bz,pt	%icc, 1f
3660	  nop
3661
3662	! save in-use fpregs on stack
3663	membar	#Sync
3664	add	%fp, STACK_BIAS - 65, %l1
3665	and	%l1, -VIS_BLOCKSIZE, %l1
3666	stda	%d0, [%l1]ASI_BLK_P
3667
36681:	membar	#StoreStore|#StoreLoad|#LoadStore
3669	wr	%g0, FPRS_FEF, %fprs
3670	wr	%g0, ASI_BLK_P, %asi
3671
3672	! Clear block
3673	fzero	%d0
3674	fzero	%d2
3675	fzero	%d4
3676	fzero	%d6
3677	fzero	%d8
3678	fzero	%d10
3679	fzero	%d12
3680	fzero	%d14
3681
3682	mov	256, %i3
3683	ba,pt	%ncc, .pz_doblock
3684	  nop
3685
/*
 * Store loop.  While at least 256 bytes remain, .pz_doblock branches
 * to .pz_blkstart with the store to [%i0 + 192] in its annulled delay
 * slot, so four blocks are written per pass and %i0/%i1 advance by
 * %i3 (256).
 *
 * With 64 to 192 bytes left, %i3 is set to the remaining length
 * rounded down to a multiple of 64 and the code jumps to
 * .pz_zinst - (%i3 / 16).  Each stda is one 4-byte instruction per
 * 64-byte block, so this lands on exactly the last 1, 2 or 3 stores
 * before .pz_zinst, which then advances %i0/%i1 by %i3.
 * Do not insert or remove instructions between .pz_blkstart and
 * .pz_zinst without adjusting this computation.
 */
3686.pz_blkstart:
3687      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
3688	stda	%d0, [%i0 + 128]%asi
3689	stda	%d0, [%i0 + 64]%asi
3690	stda	%d0, [%i0]%asi
3691.pz_zinst:
3692	add	%i0, %i3, %i0
3693	sub	%i1, %i3, %i1
3694.pz_doblock:
3695	cmp	%i1, 256
3696	bgeu,a	%ncc, .pz_blkstart
3697	  stda	%d0, [%i0 + 192]%asi
3698
3699	cmp	%i1, 64
3700	blu	%ncc, .pz_finish
3701
3702	  andn	%i1, (64-1), %i3
3703	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
3704	set	.pz_zinst, %i4
3705	sub	%i4, %i2, %i4
3706	jmp	%i4
3707	  nop
3708
/*
 * Done.  If FPRS_FEF was clear on entry only %fprs needs restoring
 * (done in the annulled delay slot); otherwise reload the saved %d0
 * block from the stack slot at %l1 first.  This path returns 0.
 */
3709.pz_finish:
3710	membar	#Sync
3711	btst	FPRS_FEF, %l0
3712	bz,a	.pz_finished
3713	  wr	%l0, 0, %fprs		! restore fprs
3714
3715	! restore fpregs from stack
3716	ldda	[%l1]ASI_BLK_P, %d0
3717	membar	#Sync
3718	wr	%l0, 0, %fprs		! restore fprs
3719
3720.pz_finished:
3721	ret
3722	  restore	%g0, 0, %o0		! return (bzero or not)
3723
3724	SET_SIZE(hwblkclr)
3725#endif	/* lint */
3726
3727#ifdef lint
3728/*ARGSUSED*/
3729void
3730hw_pa_bcopy32(uint64_t src, uint64_t dst)
3731{}
3732#else /*!lint */
3733	/*
3734	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3735	 * using physical addresses.
	 *
	 * PSTATE_IE is cleared for the duration of the copy; the
	 * original %pstate is kept in %g1 and written back in the
	 * retl delay slot.  All four 8-byte ASI_MEM loads are issued
	 * before the ASI_DC_INVAL store on dst, the membar #Sync and
	 * the four ASI_MEM stores.  %o0 and %o1 are advanced as the
	 * copy proceeds; %g2 and %o2-%o5 are used as scratch.
	 * NOTE(review): the "rdpr %pstate, %g0" discards its result;
	 * presumably it is there to let the %pstate write take
	 * effect before the loads - confirm.
3736	 */
3737	ENTRY_NP(hw_pa_bcopy32)
3738	rdpr	%pstate, %g1
3739	andn	%g1, PSTATE_IE, %g2
3740	wrpr	%g0, %g2, %pstate
3741
3742	rdpr	%pstate, %g0
3743	ldxa	[%o0]ASI_MEM, %o2
3744	add	%o0, 8, %o0
3745	ldxa	[%o0]ASI_MEM, %o3
3746	add	%o0, 8, %o0
3747	ldxa	[%o0]ASI_MEM, %o4
3748	add	%o0, 8, %o0
3749	ldxa	[%o0]ASI_MEM, %o5
3750
3751    	stxa	%g0, [%o1]ASI_DC_INVAL
3752	membar	#Sync
3753
3754	stxa	%o2, [%o1]ASI_MEM
3755	add	%o1, 8, %o1
3756	stxa	%o3, [%o1]ASI_MEM
3757	add	%o1, 8, %o1
3758	stxa	%o4, [%o1]ASI_MEM
3759	add	%o1, 8, %o1
3760	stxa	%o5, [%o1]ASI_MEM
3761
3762	retl
3763	  wrpr	  %g0, %g1, %pstate
3764
3765	SET_SIZE(hw_pa_bcopy32)
3766
3767#endif /* lint */
3768
/*
 * Copy/zero tunables.  use_hw_bcopy and use_hw_bzero are initialized
 * to 1.  hw_copy_limit_1, _2, _4 and _8 are initialized to 0; the
 * copy entry points in this file read the limit that matches the
 * mutual alignment of src and dst and fall back to the small (leaf)
 * copy when that limit is zero or the count does not exceed it.
 * The lint branch declares the same names as C variables; the
 * non-lint branch emits one data word per name via DGDEF and then
 * switches back to a 64-byte aligned ".text" section.
 */
3769#if defined(lint)
3770
3771int use_hw_bcopy = 1;
3772int use_hw_bzero = 1;
3773uint_t hw_copy_limit_1 = 0;
3774uint_t hw_copy_limit_2 = 0;
3775uint_t hw_copy_limit_4 = 0;
3776uint_t hw_copy_limit_8 = 0;
3777
3778#else /* !lint */
3779
3780	DGDEF(use_hw_bcopy)
3781	.word	1
3782	DGDEF(use_hw_bzero)
3783	.word	1
3784	DGDEF(hw_copy_limit_1)
3785	.word	0
3786	DGDEF(hw_copy_limit_2)
3787	.word	0
3788	DGDEF(hw_copy_limit_4)
3789	.word	0
3790	DGDEF(hw_copy_limit_8)
3791	.word	0
3792
3793	.align	64
3794	.section ".text"
3795#endif /* !lint */
3796