xref: /titanic_51/usr/src/uts/sun4u/cpu/opl_olympus_copy.s (revision e64c6c3f1a2cffb126880e90fb7670805896f505)
125cf1a30Sjl139090/*
225cf1a30Sjl139090 * CDDL HEADER START
325cf1a30Sjl139090 *
425cf1a30Sjl139090 * The contents of this file are subject to the terms of the
525cf1a30Sjl139090 * Common Development and Distribution License (the "License").
625cf1a30Sjl139090 * You may not use this file except in compliance with the License.
725cf1a30Sjl139090 *
825cf1a30Sjl139090 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
925cf1a30Sjl139090 * or http://www.opensolaris.org/os/licensing.
1025cf1a30Sjl139090 * See the License for the specific language governing permissions
1125cf1a30Sjl139090 * and limitations under the License.
1225cf1a30Sjl139090 *
1325cf1a30Sjl139090 * When distributing Covered Code, include this CDDL HEADER in each
1425cf1a30Sjl139090 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
1525cf1a30Sjl139090 * If applicable, add the following below this CDDL HEADER, with the
1625cf1a30Sjl139090 * fields enclosed by brackets "[]" replaced with your own identifying
1725cf1a30Sjl139090 * information: Portions Copyright [yyyy] [name of copyright owner]
1825cf1a30Sjl139090 *
1925cf1a30Sjl139090 * CDDL HEADER END
2025cf1a30Sjl139090 */
2125cf1a30Sjl139090/*
22*e64c6c3fSMichael Bergknoff * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
2325cf1a30Sjl139090 * Use is subject to license terms.
2425cf1a30Sjl139090 */
2525cf1a30Sjl139090
2625cf1a30Sjl139090#include <sys/param.h>
2725cf1a30Sjl139090#include <sys/errno.h>
2825cf1a30Sjl139090#include <sys/asm_linkage.h>
2925cf1a30Sjl139090#include <sys/vtrace.h>
3025cf1a30Sjl139090#include <sys/machthread.h>
3125cf1a30Sjl139090#include <sys/clock.h>
3225cf1a30Sjl139090#include <sys/asi.h>
3325cf1a30Sjl139090#include <sys/fsr.h>
3425cf1a30Sjl139090#include <sys/privregs.h>
3525cf1a30Sjl139090
3625cf1a30Sjl139090#if !defined(lint)
3725cf1a30Sjl139090#include "assym.h"
3825cf1a30Sjl139090#endif	/* lint */
3925cf1a30Sjl139090
4025cf1a30Sjl139090/*
4125cf1a30Sjl139090 * Pseudo-code to aid in understanding the control flow of the
4225cf1a30Sjl139090 * bcopy/copyin/copyout routines.
4325cf1a30Sjl139090 *
4425cf1a30Sjl139090 * On entry:
4525cf1a30Sjl139090 *
4625cf1a30Sjl139090 * 	! Determine whether to use the FP register version
4725cf1a30Sjl139090 * 	! or the leaf routine version depending on size
4825cf1a30Sjl139090 * 	! of copy and flags.  Set up error handling accordingly.
4925cf1a30Sjl139090 *	! The transition point depends on whether the src and
5025cf1a30Sjl139090 * 	! dst addresses can be aligned to long word, word,
5125cf1a30Sjl139090 * 	! half word, or byte boundaries.
5225cf1a30Sjl139090 *	!
5325cf1a30Sjl139090 *	! WARNING: <Register usage convention>
5425cf1a30Sjl139090 *	! For FP version, %l6 holds previous error handling and
5525cf1a30Sjl139090 *	! a flag: TRAMP_FLAG (low bits)
5625cf1a30Sjl139090 *	! for leaf routine version, %o4 holds those values.
5725cf1a30Sjl139090 *	! So either %l6 or %o4 is reserved and not available for
5825cf1a30Sjl139090 *	! any other use.
5925cf1a30Sjl139090 *
6025cf1a30Sjl139090 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
6125cf1a30Sjl139090 * 		go to small_copy;		! to speed short copies
6225cf1a30Sjl139090 *
6325cf1a30Sjl139090 * 	if (src,dst long word alignable) {
6425cf1a30Sjl139090 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
6525cf1a30Sjl139090 * 			go to small_copy;
6625cf1a30Sjl139090 *		if (length <= hw_copy_limit_8)
6725cf1a30Sjl139090 * 			go to small_copy;
6825cf1a30Sjl139090 * 		go to FPBLK_copy;
6925cf1a30Sjl139090 * 	}
7025cf1a30Sjl139090 * 	if (src,dst not alignable) {
7125cf1a30Sjl139090 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
7225cf1a30Sjl139090 * 			go to small_copy;
7325cf1a30Sjl139090 *		if (length <= hw_copy_limit_1)
7425cf1a30Sjl139090 * 			go to small_copy;
7525cf1a30Sjl139090 * 		go to FPBLK_copy;
7625cf1a30Sjl139090 * 	}
7725cf1a30Sjl139090 * 	if (src,dst halfword alignable) {
7825cf1a30Sjl139090 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
7925cf1a30Sjl139090 * 			go to small_copy;
8025cf1a30Sjl139090 *		if (length <= hw_copy_limit_2)
8125cf1a30Sjl139090 * 			go to small_copy;
8225cf1a30Sjl139090 * 		go to FPBLK_copy;
8325cf1a30Sjl139090 * 	}
8425cf1a30Sjl139090 * 	if (src,dst word alignable) {
8525cf1a30Sjl139090 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
8625cf1a30Sjl139090 * 			go to small_copy;
8725cf1a30Sjl139090 *		if (length <= hw_copy_limit_4)
8825cf1a30Sjl139090 * 			go to small_copy;
8925cf1a30Sjl139090 * 		go to FPBLK_copy;
9025cf1a30Sjl139090 * 	}
9125cf1a30Sjl139090 *
9225cf1a30Sjl139090 * small_copy:
9325cf1a30Sjl139090 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
9425cf1a30Sjl139090 *
9525cf1a30Sjl139090 *	if (count <= 3)				! fast path for tiny copies
9625cf1a30Sjl139090 *		go to sm_left;			! special finish up code
9725cf1a30Sjl139090 *	else
9825cf1a30Sjl139090 *		if (count > CHKSIZE)		! medium sized copies
9925cf1a30Sjl139090 *			go to sm_med		! tuned by alignment
10025cf1a30Sjl139090 *		if(src&dst not both word aligned) {
10125cf1a30Sjl139090 *	sm_movebytes:
10225cf1a30Sjl139090 *			move byte by byte in 4-way unrolled loop
10325cf1a30Sjl139090 *			fall into sm_left;
10425cf1a30Sjl139090 *	sm_left:
10525cf1a30Sjl139090 *			move 0-3 bytes byte at a time as needed.
10625cf1a30Sjl139090 *			restore error handler and exit.
10725cf1a30Sjl139090 *
10825cf1a30Sjl139090 * 		} else {	! src&dst are word aligned
10925cf1a30Sjl139090 *			check for at least 8 bytes left,
11025cf1a30Sjl139090 *			move word at a time, unrolled by 2
11125cf1a30Sjl139090 *			when fewer than 8 bytes left,
11225cf1a30Sjl139090 *	sm_half:	move half word at a time while 2 or more bytes left
11325cf1a30Sjl139090 *	sm_byte:	move final byte if necessary
11425cf1a30Sjl139090 *	sm_exit:
11525cf1a30Sjl139090 *			restore error handler and exit.
11625cf1a30Sjl139090 *		}
11725cf1a30Sjl139090 *
11825cf1a30Sjl139090 * ! Medium length cases with at least CHKSIZE bytes available
11925cf1a30Sjl139090 * ! method: line up src and dst as best possible, then
12025cf1a30Sjl139090 * ! move data in 4-way unrolled loops.
12125cf1a30Sjl139090 *
12225cf1a30Sjl139090 * sm_med:
12325cf1a30Sjl139090 *	if(src&dst unalignable)
12425cf1a30Sjl139090 * 		go to sm_movebytes
12525cf1a30Sjl139090 *	if(src&dst halfword alignable)
12625cf1a30Sjl139090 *		go to sm_movehalf
12725cf1a30Sjl139090 *	if(src&dst word alignable)
12825cf1a30Sjl139090 *		go to sm_moveword
12925cf1a30Sjl139090 * ! fall into long word movement
13025cf1a30Sjl139090 *	move bytes until src is word aligned
13125cf1a30Sjl139090 *	if not long word aligned, move a word
13225cf1a30Sjl139090 *	move long words in 4-way unrolled loop until < 32 bytes left
13325cf1a30Sjl139090 *      move long words in 1-way unrolled loop until < 8 bytes left
13425cf1a30Sjl139090 *	if zero bytes left, goto sm_exit
13525cf1a30Sjl139090 *	if one byte left, go to sm_byte
13625cf1a30Sjl139090 *	else go to sm_half
13725cf1a30Sjl139090 *
13825cf1a30Sjl139090 * sm_moveword:
13925cf1a30Sjl139090 *	move bytes until src is word aligned
14025cf1a30Sjl139090 *	move words in 4-way unrolled loop until < 16 bytes left
14125cf1a30Sjl139090 *      move words in 1-way unrolled loop until < 4 bytes left
14225cf1a30Sjl139090 *	if zero bytes left, goto sm_exit
14325cf1a30Sjl139090 *	if one byte left, go to sm_byte
14425cf1a30Sjl139090 *	else go to sm_half
14525cf1a30Sjl139090 *
14625cf1a30Sjl139090 * sm_movehalf:
14725cf1a30Sjl139090 *	move a byte if needed to align src on halfword
14825cf1a30Sjl139090 *	move halfwords in 4-way unrolled loop until < 8 bytes left
14925cf1a30Sjl139090 *	if zero bytes left, goto sm_exit
15025cf1a30Sjl139090 *	if one byte left, go to sm_byte
15125cf1a30Sjl139090 *	else go to sm_half
15225cf1a30Sjl139090 *
15325cf1a30Sjl139090 *
15425cf1a30Sjl139090 * FPBLK_copy:
15525cf1a30Sjl139090 * 	%l6 = curthread->t_lofault;
15625cf1a30Sjl139090 * 	if (%l6 != NULL) {
15725cf1a30Sjl139090 * 		membar #Sync
15825cf1a30Sjl139090 * 		curthread->t_lofault = .copyerr;
15925cf1a30Sjl139090 * 		caller_error_handler = TRUE             ! %l6 |= 2
16025cf1a30Sjl139090 * 	}
16125cf1a30Sjl139090 *
16225cf1a30Sjl139090 *	! for FPU testing we must not migrate cpus
16325cf1a30Sjl139090 * 	if (curthread->t_lwp == NULL) {
16425cf1a30Sjl139090 *		! Kernel threads do not have pcb's in which to store
16525cf1a30Sjl139090 *		! the floating point state, so disallow preemption during
16625cf1a30Sjl139090 *		! the copy.  This also prevents cpu migration.
16725cf1a30Sjl139090 * 		kpreempt_disable(curthread);
16825cf1a30Sjl139090 *	} else {
16925cf1a30Sjl139090 *		thread_nomigrate();
17025cf1a30Sjl139090 *	}
17125cf1a30Sjl139090 *
17225cf1a30Sjl139090 * 	old_fprs = %fprs;
17325cf1a30Sjl139090 * 	old_gsr = %gsr;
17425cf1a30Sjl139090 * 	if (%fprs.fef) {
17525cf1a30Sjl139090 * 		%fprs.fef = 1;
17625cf1a30Sjl139090 * 		save current fpregs on stack using blockstore
17725cf1a30Sjl139090 * 	} else {
17825cf1a30Sjl139090 * 		%fprs.fef = 1;
17925cf1a30Sjl139090 * 	}
18025cf1a30Sjl139090 *
18125cf1a30Sjl139090 *
18225cf1a30Sjl139090 * 	do_blockcopy_here;
18325cf1a30Sjl139090 *
18425cf1a30Sjl139090 * In lofault handler:
18525cf1a30Sjl139090 *	curthread->t_lofault = .copyerr2;
18625cf1a30Sjl139090 *	Continue on with the normal exit handler
18725cf1a30Sjl139090 *
18825cf1a30Sjl139090 * On normal exit:
18925cf1a30Sjl139090 * 	%gsr = old_gsr;
19025cf1a30Sjl139090 * 	if (old_fprs & FPRS_FEF)
19125cf1a30Sjl139090 * 		restore fpregs from stack using blockload
19225cf1a30Sjl139090 *	else
19325cf1a30Sjl139090 *		zero fpregs
19425cf1a30Sjl139090 * 	%fprs = old_fprs;
19525cf1a30Sjl139090 * 	membar #Sync
19625cf1a30Sjl139090 * 	curthread->t_lofault = (%l6 & ~3);
19725cf1a30Sjl139090 *	! following test omitted from copyin/copyout as they
19825cf1a30Sjl139090 *	! will always have a current thread
19925cf1a30Sjl139090 * 	if (curthread->t_lwp == NULL)
20025cf1a30Sjl139090 *		kpreempt_enable(curthread);
20125cf1a30Sjl139090 *	else
20225cf1a30Sjl139090 *		thread_allowmigrate();
20325cf1a30Sjl139090 * 	return (0)
20425cf1a30Sjl139090 *
20525cf1a30Sjl139090 * In second lofault handler (.copyerr2):
20625cf1a30Sjl139090 *	We've tried to restore fp state from the stack and failed.  To
20725cf1a30Sjl139090 *	avoid returning with a corrupted fp state, we will panic.
20825cf1a30Sjl139090 */
20925cf1a30Sjl139090
21025cf1a30Sjl139090/*
21125cf1a30Sjl139090 * Comments about optimization choices
21225cf1a30Sjl139090 *
21325cf1a30Sjl139090 * The initial optimization decision in this code is to determine
21425cf1a30Sjl139090 * whether to use the FP registers for a copy or not.  If we don't
21525cf1a30Sjl139090 * use the FP registers, we can execute the copy as a leaf routine,
21625cf1a30Sjl139090 * saving a register save and restore.  Also, less elaborate setup
21725cf1a30Sjl139090 * is required, allowing short copies to be completed more quickly.
21825cf1a30Sjl139090 * For longer copies, especially unaligned ones (where the src and
21925cf1a30Sjl139090 * dst do not align to allow simple ldx,stx operation), the FP
22025cf1a30Sjl139090 * registers allow much faster copy operations.
22125cf1a30Sjl139090 *
22225cf1a30Sjl139090 * The estimated extra cost of the FP path will vary depending on
22325cf1a30Sjl139090 * src/dst alignment, dst offset from the next 64 byte FPblock store
22425cf1a30Sjl139090 * boundary, remaining src data after the last full dst cache line is
22525cf1a30Sjl139090 * moved, whether the FP registers need to be saved, and some other
22625cf1a30Sjl139090 * minor issues.  The average additional overhead is estimated to be
22725cf1a30Sjl139090 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
22825cf1a30Sjl139090 * around 10 clocks, elaborate calculation would slow down all
22925cf1a30Sjl139090 * longer copies and only benefit a small portion of medium sized
23025cf1a30Sjl139090 * copies.  Rather than incur such cost, we chose fixed transition
23125cf1a30Sjl139090 * points for each of the alignment choices.
23225cf1a30Sjl139090 *
23325cf1a30Sjl139090 * For the inner loop, here is a comparison of the per cache line
23425cf1a30Sjl139090 * costs for each alignment when src&dst are in cache:
23525cf1a30Sjl139090 *
23625cf1a30Sjl139090 * byte aligned:  108 clocks slower for non-FPBLK
23725cf1a30Sjl139090 * half aligned:   44 clocks slower for non-FPBLK
23825cf1a30Sjl139090 * word aligned:   12 clocks slower for non-FPBLK
23925cf1a30Sjl139090 * long aligned:    4 clocks >>faster<< for non-FPBLK
24025cf1a30Sjl139090 *
24125cf1a30Sjl139090 * The long aligned loop runs faster because it does no prefetching.
24225cf1a30Sjl139090 * That wins if the data is already in cache or there is too little
24325cf1a30Sjl139090 * data to gain much benefit from prefetching.  But when there
24425cf1a30Sjl139090 * is more data and that data is not in cache, failing to prefetch
24525cf1a30Sjl139090 * can run much slower.  In addition, there is a 2 Kbyte store queue
24625cf1a30Sjl139090 * which will cause the non-FPBLK inner loop to slow for larger copies.
24725cf1a30Sjl139090 * The exact tradeoff is strongly load and application dependent, with
24825cf1a30Sjl139090 * increasing risk of a customer visible performance regression if the
24925cf1a30Sjl139090 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
25025cf1a30Sjl139090 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
25125cf1a30Sjl139090 * upper limit for the non-FPBLK code.  To minimize performance regression
25225cf1a30Sjl139090 * risk while still gaining the primary benefits of the improvements to
25325cf1a30Sjl139090 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
25425cf1a30Sjl139090 * hw_copy_limit_*.  Later experimental studies using different values
25525cf1a30Sjl139090 * of hw_copy_limit_* can be used to make further adjustments if
25625cf1a30Sjl139090 * appropriate.
25725cf1a30Sjl139090 *
25825cf1a30Sjl139090 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
25925cf1a30Sjl139090 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
26025cf1a30Sjl139090 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
26125cf1a30Sjl139090 * hw_copy_limit_8 = src and dst are longword aligned
26225cf1a30Sjl139090 *
26325cf1a30Sjl139090 * To say that src and dst are word aligned means that after
26425cf1a30Sjl139090 * some initial alignment activity of moving 0 to 3 bytes,
26525cf1a30Sjl139090 * both the src and dst will be on word boundaries so that
26625cf1a30Sjl139090 * word loads and stores may be used.
26725cf1a30Sjl139090 *
26825cf1a30Sjl139090 * Default values at May,2005 are:
26925cf1a30Sjl139090 * hw_copy_limit_1 =  256
27025cf1a30Sjl139090 * hw_copy_limit_2 =  512
27125cf1a30Sjl139090 * hw_copy_limit_4 = 1024
27225cf1a30Sjl139090 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
27325cf1a30Sjl139090 *
27425cf1a30Sjl139090 *
27525cf1a30Sjl139090 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
27625cf1a30Sjl139090 * disabled for that alignment choice.
27725cf1a30Sjl139090 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
27825cf1a30Sjl139090 * the value of VIS_COPY_THRESHOLD is used.
27925cf1a30Sjl139090 * It is not envisioned that hw_copy_limit_? will be changed in the field.
28025cf1a30Sjl139090 * It is provided to allow for disabling FPBLK copies and to allow
28125cf1a30Sjl139090 * easy testing of alternate values on future HW implementations
28225cf1a30Sjl139090 * that might have different cache sizes, clock rates or instruction
28325cf1a30Sjl139090 * timing rules.
28425cf1a30Sjl139090 *
28525cf1a30Sjl139090 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
28625cf1a30Sjl139090 * threshold to speed up all shorter copies (less than 256).  That
28725cf1a30Sjl139090 * saves an alignment test, memory reference, and enabling test
28825cf1a30Sjl139090 * for all short copies, or an estimated 24 clocks.
28925cf1a30Sjl139090 *
29025cf1a30Sjl139090 * The order in which these limits are checked does matter since each
29125cf1a30Sjl139090 * non-predicted tst and branch costs around 10 clocks.
29225cf1a30Sjl139090 * If src and dst are randomly selected addresses,
29325cf1a30Sjl139090 * 4 of 8 will not be alignable.
29425cf1a30Sjl139090 * 2 of 8 will be half word alignable.
29525cf1a30Sjl139090 * 1 of 8 will be word alignable.
29625cf1a30Sjl139090 * 1 of 8 will be long word alignable.
29725cf1a30Sjl139090 * But, tests on running kernels show that src and dst to copy code
29825cf1a30Sjl139090 * are typically not on random alignments.  Structure copies and
29925cf1a30Sjl139090 * copies of larger data sizes are often on long word boundaries.
30025cf1a30Sjl139090 * So we test the long word alignment case first, then
30125cf1a30Sjl139090 * the byte alignment, then halfword, then word alignment.
30225cf1a30Sjl139090 *
30325cf1a30Sjl139090 * Several times, tests for length are made to split the code
30425cf1a30Sjl139090 * into subcases.  These tests often allow later tests to be
30525cf1a30Sjl139090 * avoided.  For example, within the non-FPBLK copy, we first
30625cf1a30Sjl139090 * check for tiny copies of 3 bytes or less.  That allows us
30725cf1a30Sjl139090 * to use a 4-way unrolled loop for the general byte copy case
30825cf1a30Sjl139090 * without a test on loop entry.
30925cf1a30Sjl139090 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
31025cf1a30Sjl139090 * vs longer cases.  For the really short case, we don't attempt to
31125cf1a30Sjl139090 * align src and dst.  We try to minimize special case tests in
31225cf1a30Sjl139090 * the shortest loops as each test adds a significant percentage
31325cf1a30Sjl139090 * to the total time.
31425cf1a30Sjl139090 *
31525cf1a30Sjl139090 * For the medium sized cases, we allow ourselves to adjust the
31625cf1a30Sjl139090 * src and dst alignment and provide special cases for each of
31725cf1a30Sjl139090 * the four adjusted alignment cases. The CHKSIZE that was used
31825cf1a30Sjl139090 * to decide between short and medium size was chosen to be 39
31925cf1a30Sjl139090 * as that allows for the worst case of 7 bytes of alignment
32025cf1a30Sjl139090 * shift and 4 times 8 bytes for the first long word unrolling.
32125cf1a30Sjl139090 * That knowledge saves an initial test for length on entry into
32225cf1a30Sjl139090 * the medium cases.  If the general loop unrolling factor were
32325cf1a30Sjl139090 * to be increased, this number would also need to be adjusted.
32425cf1a30Sjl139090 *
32525cf1a30Sjl139090 * For all cases in the non-FPBLK code where it is known that at
32625cf1a30Sjl139090 * least 4 chunks of data are available for movement, the
32725cf1a30Sjl139090 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
32825cf1a30Sjl139090 * or 2 clocks per data element.
32925cf1a30Sjl139090 *
33025cf1a30Sjl139090 * Instruction alignment is forced by use of .align 16 directives
33125cf1a30Sjl139090 * and nops which are not executed in the code.  This
33225cf1a30Sjl139090 * combination of operations shifts the alignment of following
33325cf1a30Sjl139090 * loops to ensure that loops are aligned so that their instructions
33425cf1a30Sjl139090 * fall within the minimum number of 4 instruction fetch groups.
33525cf1a30Sjl139090 * If instructions are inserted or removed between the .align
33625cf1a30Sjl139090 * instruction and the unrolled loops, then the alignment needs
33725cf1a30Sjl139090 * to be readjusted.  Misaligned loops can add a clock per loop
33825cf1a30Sjl139090 * iteration to the loop timing.
33925cf1a30Sjl139090 *
34025cf1a30Sjl139090 * In a few cases, code is duplicated to avoid a branch.  Since
34125cf1a30Sjl139090 * a non-predicted tst and branch takes 10 clocks, this savings
34225cf1a30Sjl139090 * is judged an appropriate time-space tradeoff.
34325cf1a30Sjl139090 *
34425cf1a30Sjl139090 * Within the FPBLK-code, the prefetch method in the inner
34525cf1a30Sjl139090 * loop needs to be explained as it is not standard.  Two
34625cf1a30Sjl139090 * prefetches are issued for each cache line instead of one.
34725cf1a30Sjl139090 * The primary one is at the maximum reach of 8 cache lines.
34825cf1a30Sjl139090 * Most of the time, that maximum prefetch reach gives the
34925cf1a30Sjl139090 * cache line more time to reach the processor for systems with
35025cf1a30Sjl139090 * higher processor clocks.  But, sometimes memory interference
35125cf1a30Sjl139090 * can cause that prefetch to be dropped.  Putting a second
35225cf1a30Sjl139090 * prefetch at a reach of 5 cache lines catches the drops
35325cf1a30Sjl139090 * three iterations later and shows a measured improvement
35425cf1a30Sjl139090 * in performance over any similar loop with a single prefetch.
35525cf1a30Sjl139090 * The prefetches are placed in the loop so they overlap with
35625cf1a30Sjl139090 * non-memory instructions, so that there is no extra cost
35725cf1a30Sjl139090 * when the data is already in-cache.
35825cf1a30Sjl139090 *
35925cf1a30Sjl139090 */
36025cf1a30Sjl139090
36125cf1a30Sjl139090/*
36225cf1a30Sjl139090 * Notes on preserving existing fp state and on membars.
36325cf1a30Sjl139090 *
36425cf1a30Sjl139090 * When a copyOP decides to use fp we may have to preserve existing
36525cf1a30Sjl139090 * floating point state.  It is not the caller's state that we need to
36625cf1a30Sjl139090 * preserve - the rest of the kernel does not use fp and, anyway, fp
36725cf1a30Sjl139090 * registers are volatile across a call.  Some examples:
36825cf1a30Sjl139090 *
36925cf1a30Sjl139090 *	- userland has fp state and is interrupted (device interrupt
37025cf1a30Sjl139090 *	  or trap) and within the interrupt/trap handling we use
37125cf1a30Sjl139090 *	  bcopy()
37225cf1a30Sjl139090 *	- another (higher level) interrupt or trap handler uses bcopy
37325cf1a30Sjl139090 *	  while a bcopy from an earlier interrupt is still active
37425cf1a30Sjl139090 *	- an asynchronous error trap occurs while fp state exists (in
37525cf1a30Sjl139090 *	  userland or in kernel copy) and the tl0 component of the handling
37625cf1a30Sjl139090 *	  uses bcopy
37725cf1a30Sjl139090 *	- a user process with fp state incurs a copy-on-write fault and
37825cf1a30Sjl139090 *	  hwblkpagecopy always uses fp
37925cf1a30Sjl139090 *
38025cf1a30Sjl139090 * We therefore need a per-call place in which to preserve fp state -
38125cf1a30Sjl139090 * using our stack is ideal (and since fp copy cannot be leaf optimized
38225cf1a30Sjl139090 * because of calls it makes, this is no hardship).
38325cf1a30Sjl139090 *
38425cf1a30Sjl139090 * When we have finished fp copy (with its repeated block stores)
38525cf1a30Sjl139090 * we must membar #Sync so that our block stores may complete before
38625cf1a30Sjl139090 * we either restore the original fp state into the fp registers or
38725cf1a30Sjl139090 * return to a caller which may initiate other fp operations that could
38825cf1a30Sjl139090 * modify the fp regs we used before the block stores complete.
38925cf1a30Sjl139090 *
39025cf1a30Sjl139090 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
39125cf1a30Sjl139090 * t_lofault is not NULL will not panic but will instead trampoline
39225cf1a30Sjl139090 * to the registered lofault handler.  There is no need for any
39325cf1a30Sjl139090 * membars for these - eg, our store to t_lofault will always be visible to
39425cf1a30Sjl139090 * ourselves and it is our cpu which will take any trap.
39525cf1a30Sjl139090 *
39625cf1a30Sjl139090 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
39725cf1a30Sjl139090 * while t_lofault is not NULL will also not panic.  Since we're copying
39825cf1a30Sjl139090 * to or from userland the extent of the damage is known - the destination
39925cf1a30Sjl139090 * buffer is incomplete.  So trap handlers will trampoline to the lofault
40025cf1a30Sjl139090 * handler in this case which should take some form of error action to
40125cf1a30Sjl139090 * avoid using the incomplete buffer.  The trap handler also flags the
40225cf1a30Sjl139090 * fault so that later return-from-trap handling (for the trap that brought
40325cf1a30Sjl139090 * this thread into the kernel in the first place) can notify the process
40425cf1a30Sjl139090 * and reboot the system (or restart the service with Greenline/Contracts).
40525cf1a30Sjl139090 *
40625cf1a30Sjl139090 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
40725cf1a30Sjl139090 * result in deferred error traps - the trap is taken sometime after
40825cf1a30Sjl139090 * the event and the trap PC may not be the PC of the faulting access.
40925cf1a30Sjl139090 * Delivery of such pending traps can be forced by a membar #Sync, acting
41025cf1a30Sjl139090 * as an "error barrier" in this role.  To accurately apply the user/kernel
41125cf1a30Sjl139090 * separation described in the preceding paragraph we must force delivery
41225cf1a30Sjl139090 * of deferred traps affecting kernel state before we install a lofault
41325cf1a30Sjl139090 * handler (if we interpose a new lofault handler on an existing one there
41425cf1a30Sjl139090 * is no need to repeat this), and we must force delivery of deferred
41525cf1a30Sjl139090 * errors affecting the lofault-protected region before we clear t_lofault.
41625cf1a30Sjl139090 * Failure to do so results in lost kernel state being interpreted as
41725cf1a30Sjl139090 * affecting a copyin/copyout only, or of an error that really only
41825cf1a30Sjl139090 * affects copy data being interpreted as losing kernel state.
41925cf1a30Sjl139090 *
42025cf1a30Sjl139090 * Since the copy operations may preserve and later restore floating
42125cf1a30Sjl139090 * point state that does not belong to the caller (see examples above),
42225cf1a30Sjl139090 * we must be careful in how we do this in order to prevent corruption
42325cf1a30Sjl139090 * of another program.
42425cf1a30Sjl139090 *
42525cf1a30Sjl139090 * To make sure that floating point state is always saved and restored
42625cf1a30Sjl139090 * correctly, the following "big rules" must be followed when the floating
42725cf1a30Sjl139090 * point registers will be used:
42825cf1a30Sjl139090 *
42925cf1a30Sjl139090 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
43025cf1a30Sjl139090 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
43125cf1a30Sjl139090 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
43225cf1a30Sjl139090 *    lofault handler was set coming in.
43325cf1a30Sjl139090 *
43425cf1a30Sjl139090 * 2. The FPUSED flag indicates that all FP state has been successfully stored
43525cf1a30Sjl139090 *    on the stack.  It should not be set until this save has been completed.
43625cf1a30Sjl139090 *
43725cf1a30Sjl139090 * 3. The FPUSED flag should not be cleared on exit until all FP state has
43825cf1a30Sjl139090 *    been restored from the stack.  If an error occurs while restoring
43925cf1a30Sjl139090 *    data from the stack, the error handler can check this flag to see if
44025cf1a30Sjl139090 *    a restore is necessary.
44125cf1a30Sjl139090 *
44225cf1a30Sjl139090 * 4. Code run under the new lofault handler must be kept to a minimum.  In
44325cf1a30Sjl139090 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
44425cf1a30Sjl139090 *    to kpreempt(), should not be made until after the lofault handler has
44525cf1a30Sjl139090 *    been restored.
44625cf1a30Sjl139090 */
44725cf1a30Sjl139090
44825cf1a30Sjl139090/*
44925cf1a30Sjl139090 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
45025cf1a30Sjl139090 * to "break even" using FP/VIS-accelerated memory operations.
45125cf1a30Sjl139090 * The FPBLK code assumes a minimum number of bytes are available
45225cf1a30Sjl139090 * to be moved on entry.  Check that code carefully before
45325cf1a30Sjl139090 * reducing VIS_COPY_THRESHOLD below 256.
45425cf1a30Sjl139090 */
45525cf1a30Sjl139090/*
45625cf1a30Sjl139090 * This shadows sys/machsystm.h which can't be included due to the lack of
45725cf1a30Sjl139090 * _ASM guards in include files it references. Change it here, change it there.
45825cf1a30Sjl139090 */
45925cf1a30Sjl139090#define VIS_COPY_THRESHOLD 256	/* bytes; length <= this skips the FP path */
46025cf1a30Sjl139090
46125cf1a30Sjl139090/*
46225cf1a30Sjl139090 * TEST for very short copies
46325cf1a30Sjl139090 * Be aware that the maximum unroll for the short unaligned case
46425cf1a30Sjl139090 * is SHORTCOPY+1
46525cf1a30Sjl139090 */
46625cf1a30Sjl139090#define SHORTCOPY 3	/* bytes; count <= this takes the tiny-copy path */
46725cf1a30Sjl139090#define CHKSIZE  39	/* bytes; 7 (worst-case align) + 4 * 8 (first unroll) */
46825cf1a30Sjl139090
46925cf1a30Sjl139090/*
47025cf1a30Sjl139090 * Indicates that we're to trampoline to the error handler.
47125cf1a30Sjl139090 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
47225cf1a30Sjl139090 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
47325cf1a30Sjl139090 */
47425cf1a30Sjl139090#define	FPUSED_FLAG	1	/* fp regs in use; FP state saved on stack */
47525cf1a30Sjl139090#define	TRAMP_FLAG	2	/* trampoline to the caller's lofault handler */
47625cf1a30Sjl139090#define	MASK_FLAGS	3	/* FPUSED_FLAG | TRAMP_FLAG */
47725cf1a30Sjl139090
47825cf1a30Sjl139090/*
47925cf1a30Sjl139090 * Number of outstanding prefetches.
480c8a722abSpm145316 * first prefetch moves data from L2 to L1 (n_reads)
481c8a722abSpm145316 * second prefetch moves data from memory to L2 (one_read)
48225cf1a30Sjl139090 */
483c8a722abSpm145316#define	OLYMPUS_C_PREFETCH	24	/* first prefetch: L2 to L1 (n_reads) */
484c8a722abSpm145316#define	OLYMPUS_C_2ND_PREFETCH	12	/* second prefetch: memory to L2 (one_read) */
48525cf1a30Sjl139090
48625cf1a30Sjl139090#define	VIS_BLOCKSIZE		64	/* bytes; FP block store size and BST/BLD alignment */
48725cf1a30Sjl139090
48825cf1a30Sjl139090/*
48925cf1a30Sjl139090 * Size of stack frame in order to accommodate a 64-byte aligned
49025cf1a30Sjl139090 * floating-point register save area and 2 64-bit temp locations.
49125cf1a30Sjl139090 * All copy functions use two quadrants of fp registers; to assure a
49225cf1a30Sjl139090 * block-aligned two block buffer in which to save we must reserve
49325cf1a30Sjl139090 * three blocks on stack.  Not all functions preserve %fprs on stack
49425cf1a30Sjl139090 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
49525cf1a30Sjl139090 *
49625cf1a30Sjl139090 *    _______________________________________ <-- %fp + STACK_BIAS
49725cf1a30Sjl139090 *    | We may need to preserve 2 quadrants |
49825cf1a30Sjl139090 *    | of fp regs, but since we do so with |
49925cf1a30Sjl139090 *    | BST/BLD we need room in which to    |
50025cf1a30Sjl139090 *    | align to VIS_BLOCKSIZE bytes.  So   |
50125cf1a30Sjl139090 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
50225cf1a30Sjl139090 *    |-------------------------------------|
50325cf1a30Sjl139090 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
50425cf1a30Sjl139090 *    |-------------------------------------|
50525cf1a30Sjl139090 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
50625cf1a30Sjl139090 *    ---------------------------------------
50725cf1a30Sjl139090 */
50825cf1a30Sjl139090#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))	/* 208: 3 blocks + %fprs + %gsr */
50925cf1a30Sjl139090#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)	/* 192 below %fp + STACK_BIAS */
51025cf1a30Sjl139090#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)	/* 127; NOTE(review): use not visible here, confirm */
51125cf1a30Sjl139090#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)	/* 200 below %fp + STACK_BIAS */
51225cf1a30Sjl139090#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)	/* 208 below %fp + STACK_BIAS */
51325cf1a30Sjl139090
51425cf1a30Sjl139090/*
51525cf1a30Sjl139090 * Common macros used by the various versions of the block copy
51625cf1a30Sjl139090 * routines in this file.
51725cf1a30Sjl139090 */
51825cf1a30Sjl139090
51925cf1a30Sjl139090/*
52025cf1a30Sjl139090 * In FP copies if we do not have preserved data to restore over
52125cf1a30Sjl139090 * the fp regs we used then we must zero those regs to avoid
52225cf1a30Sjl139090 * exposing portions of the data to later threads (data security).
52325cf1a30Sjl139090 *
52425cf1a30Sjl139090 * Copy functions use either quadrants 1 and 3 or 2 and 4.
52525cf1a30Sjl139090 *
52625cf1a30Sjl139090 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
52725cf1a30Sjl139090 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
52825cf1a30Sjl139090 *
52925cf1a30Sjl139090 * The instructions below are quicker than repeated fzero instructions
53025cf1a30Sjl139090 * since they can dispatch down two fp pipelines.
53125cf1a30Sjl139090 */
/*
 * FZEROQ1Q3: clear %f0 with fzero, then copy that zero with fmovd into the
 * remaining double registers %f2 - %f14 and %f32 - %f46.
 */
53225cf1a30Sjl139090#define	FZEROQ1Q3			\
53325cf1a30Sjl139090	fzero	%f0			;\
53425cf1a30Sjl139090	fmovd	%f0, %f2		;\
53525cf1a30Sjl139090	fmovd	%f0, %f4		;\
53625cf1a30Sjl139090	fmovd	%f0, %f6		;\
53725cf1a30Sjl139090	fmovd	%f0, %f8		;\
53825cf1a30Sjl139090	fmovd	%f0, %f10		;\
53925cf1a30Sjl139090	fmovd	%f0, %f12		;\
54025cf1a30Sjl139090	fmovd	%f0, %f14		;\
54125cf1a30Sjl139090	fmovd	%f0, %f32		;\
54225cf1a30Sjl139090	fmovd	%f0, %f34		;\
54325cf1a30Sjl139090	fmovd	%f0, %f36		;\
54425cf1a30Sjl139090	fmovd	%f0, %f38		;\
54525cf1a30Sjl139090	fmovd	%f0, %f40		;\
54625cf1a30Sjl139090	fmovd	%f0, %f42		;\
54725cf1a30Sjl139090	fmovd	%f0, %f44		;\
54825cf1a30Sjl139090	fmovd	%f0, %f46
54925cf1a30Sjl139090
/*
 * FZEROQ2Q4: clear %f16 with fzero, then copy that zero with fmovd into the
 * remaining double registers of quadrants 2 and 4 (%f18 - %f30 and
 * %f48 - %f62).  The source of every fmovd must be %f16: %f0 belongs to
 * quadrant 1, which this macro does not zero, so copying from %f0 would
 * fill these registers with whatever %f0 happens to hold instead of zero.
 */
55025cf1a30Sjl139090#define	FZEROQ2Q4			\
55125cf1a30Sjl139090	fzero	%f16			;\
55225cf1a30Sjl139090	fmovd	%f16, %f18		;\
55325cf1a30Sjl139090	fmovd	%f16, %f20		;\
55425cf1a30Sjl139090	fmovd	%f16, %f22		;\
55525cf1a30Sjl139090	fmovd	%f16, %f24		;\
55625cf1a30Sjl139090	fmovd	%f16, %f26		;\
55725cf1a30Sjl139090	fmovd	%f16, %f28		;\
55825cf1a30Sjl139090	fmovd	%f16, %f30		;\
55925cf1a30Sjl139090	fmovd	%f16, %f48		;\
56025cf1a30Sjl139090	fmovd	%f16, %f50		;\
56125cf1a30Sjl139090	fmovd	%f16, %f52		;\
56225cf1a30Sjl139090	fmovd	%f16, %f54		;\
56325cf1a30Sjl139090	fmovd	%f16, %f56		;\
56425cf1a30Sjl139090	fmovd	%f16, %f58		;\
56525cf1a30Sjl139090	fmovd	%f16, %f60		;\
56625cf1a30Sjl139090	fmovd	%f16, %f62
56725cf1a30Sjl139090
56825cf1a30Sjl139090/*
56925cf1a30Sjl139090 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
57025cf1a30Sjl139090 * Used to save and restore in-use fp registers when we want to use FP
57125cf1a30Sjl139090 * and find fp already in use and copy size still large enough to justify
57225cf1a30Sjl139090 * the additional overhead of this save and restore.
57325cf1a30Sjl139090 *
57425cf1a30Sjl139090 * A membar #Sync is needed before save to sync fp ops initiated before
57525cf1a30Sjl139090 * the call to the copy function (by whoever has fp in use); for example
57625cf1a30Sjl139090 * an earlier block load to the quadrant we are about to save may still be
57725cf1a30Sjl139090 * "in flight".  A membar #Sync is required at the end of the save to
57825cf1a30Sjl139090 * sync our block store (the copy code is about to begin ldd's to the
57925cf1a30Sjl139090 * first quadrant).
58025cf1a30Sjl139090 *
58125cf1a30Sjl139090 * Similarly: a membar #Sync before restore allows the block stores of
58225cf1a30Sjl139090 * the copy operation to complete before we fill the quadrants with their
58325cf1a30Sjl139090 * original data, and a membar #Sync after restore lets the block loads
58425cf1a30Sjl139090 * of the restore complete before we return to whoever has the fp regs
58525cf1a30Sjl139090 * in use.  To avoid repeated membar #Sync we make it the responsibility
58625cf1a30Sjl139090 * of the copy code to membar #Sync immediately after copy is complete
58725cf1a30Sjl139090 * and before using the BLD_*_FROMSTACK macro.
58825cf1a30Sjl139090 */
/*
 * Each macro takes one scratch register, tmp1, which is overwritten.  tmp1
 * is set to (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) rounded down to a
 * VIS_BLOCKSIZE boundary; the first quadrant is block-stored (BST_*) or
 * block-loaded (BLD_*) there through ASI_BLK_P, and the second quadrant at
 * the next VIS_BLOCKSIZE bytes.  The leading membar #Sync in every macro is
 * present only as a comment, so no barrier is emitted before the first
 * block access; every macro ends with a real membar #Sync.
 * NOTE(review): the BST_* macros therefore appear to rely on the caller
 * having already issued the pre-save membar #Sync - confirm at use sites.
 */
58925cf1a30Sjl139090#if !defined(lint)
59025cf1a30Sjl139090#define BST_FPQ1Q3_TOSTACK(tmp1)				\
59125cf1a30Sjl139090	/* membar #Sync	*/					;\
59225cf1a30Sjl139090	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
59325cf1a30Sjl139090	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
59425cf1a30Sjl139090	stda	%f0, [tmp1]ASI_BLK_P				;\
59525cf1a30Sjl139090	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
59625cf1a30Sjl139090	stda	%f32, [tmp1]ASI_BLK_P				;\
59725cf1a30Sjl139090	membar	#Sync
59825cf1a30Sjl139090
59925cf1a30Sjl139090#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
60025cf1a30Sjl139090	/* membar #Sync - provided at copy completion */	;\
60125cf1a30Sjl139090	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
60225cf1a30Sjl139090	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
60325cf1a30Sjl139090	ldda	[tmp1]ASI_BLK_P, %f0				;\
60425cf1a30Sjl139090	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
60525cf1a30Sjl139090	ldda	[tmp1]ASI_BLK_P, %f32				;\
60625cf1a30Sjl139090	membar	#Sync
60725cf1a30Sjl139090
60825cf1a30Sjl139090#define BST_FPQ2Q4_TOSTACK(tmp1)				\
60925cf1a30Sjl139090	/* membar #Sync */					;\
61025cf1a30Sjl139090	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
61125cf1a30Sjl139090	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
61225cf1a30Sjl139090	stda	%f16, [tmp1]ASI_BLK_P				;\
61325cf1a30Sjl139090	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
61425cf1a30Sjl139090	stda	%f48, [tmp1]ASI_BLK_P				;\
61525cf1a30Sjl139090	membar	#Sync
61625cf1a30Sjl139090
61725cf1a30Sjl139090#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
61825cf1a30Sjl139090	/* membar #Sync - provided at copy completion */	;\
61925cf1a30Sjl139090	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
62025cf1a30Sjl139090	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
62125cf1a30Sjl139090	ldda	[tmp1]ASI_BLK_P, %f16				;\
62225cf1a30Sjl139090	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
62325cf1a30Sjl139090	ldda	[tmp1]ASI_BLK_P, %f48				;\
62425cf1a30Sjl139090	membar	#Sync
62525cf1a30Sjl139090#endif
62625cf1a30Sjl139090
62725cf1a30Sjl139090/*
62825cf1a30Sjl139090 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
62925cf1a30Sjl139090 * prevent preemption if there is no t_lwp to save FP state to on context
63025cf1a30Sjl139090 * switch) before commencing a FP copy, and reallow it on completion or
63125cf1a30Sjl139090 * in error trampoline paths when we were using FP copy.
63225cf1a30Sjl139090 *
63325cf1a30Sjl139090 * Both macros may call other functions, so be aware that all outputs are
63425cf1a30Sjl139090 * forfeit after using these macros.  For this reason we do not pass registers
63525cf1a30Sjl139090 * to use - we just use any outputs we want.
63625cf1a30Sjl139090 *
63725cf1a30Sjl139090 * Pseudo code:
63825cf1a30Sjl139090 *
63925cf1a30Sjl139090 * FP_NOMIGRATE:
64025cf1a30Sjl139090 *
64125cf1a30Sjl139090 * if (curthread->t_lwp) {
64225cf1a30Sjl139090 *	thread_nomigrate();
64325cf1a30Sjl139090 * } else {
64425cf1a30Sjl139090 *	kpreempt_disable();
64525cf1a30Sjl139090 * }
64625cf1a30Sjl139090 *
64725cf1a30Sjl139090 * FP_ALLOWMIGRATE:
64825cf1a30Sjl139090 *
64925cf1a30Sjl139090 * if (curthread->t_lwp) {
65025cf1a30Sjl139090 *	thread_allowmigrate();
65125cf1a30Sjl139090 * } else {
65225cf1a30Sjl139090 *	kpreempt_enable();
65325cf1a30Sjl139090 * }
65425cf1a30Sjl139090 */
65525cf1a30Sjl139090
/*
 * FP_NOMIGRATE(label1, label2): label1 and label2 are local labels supplied
 * by the caller; both are defined by the expansion and referenced with the
 * forward ("f") suffix.  If the pointer at THREAD_REG + T_LWP is non-zero
 * this calls thread_nomigrate(); otherwise it increments the byte at
 * THREAD_REG + T_PREEMPT.  The load of that byte sits in the annulled delay
 * slot of the brz, so it is performed only when the branch to label1 is
 * taken.  %o0 and %o1 are used as scratch.
 */
65625cf1a30Sjl139090#define	FP_NOMIGRATE(label1, label2)				\
65725cf1a30Sjl139090	ldn	[THREAD_REG + T_LWP], %o0			;\
65825cf1a30Sjl139090	brz,a,pn %o0, label1/**/f				;\
65925cf1a30Sjl139090	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
66025cf1a30Sjl139090	call	thread_nomigrate				;\
66125cf1a30Sjl139090	  nop							;\
66225cf1a30Sjl139090	ba	label2/**/f					;\
66325cf1a30Sjl139090	  nop							;\
66425cf1a30Sjl139090label1:								;\
66525cf1a30Sjl139090	inc	%o1						;\
66625cf1a30Sjl139090	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
66725cf1a30Sjl139090label2:
66825cf1a30Sjl139090
/*
 * FP_ALLOWMIGRATE(label1, label2): counterpart of FP_NOMIGRATE with the same
 * label conventions.  If the pointer at THREAD_REG + T_LWP is non-zero this
 * calls thread_allowmigrate().  Otherwise it decrements the byte at
 * THREAD_REG + T_PREEMPT and stores it back (the store is in the delay slot
 * of the brnz and so happens on both paths); when the decremented value is
 * zero and the CPU_KPRUNRUN byte of the CPU found via T_CPU is non-zero, it
 * calls kpreempt() with the current %pil value in %o0.
 */
66925cf1a30Sjl139090#define	FP_ALLOWMIGRATE(label1, label2)			\
67025cf1a30Sjl139090	ldn	[THREAD_REG + T_LWP], %o0			;\
67125cf1a30Sjl139090	brz,a,pn %o0, label1/**/f				;\
67225cf1a30Sjl139090	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
67325cf1a30Sjl139090	call thread_allowmigrate				;\
67425cf1a30Sjl139090	  nop							;\
67525cf1a30Sjl139090	ba	label2/**/f					;\
67625cf1a30Sjl139090	  nop							;\
67725cf1a30Sjl139090label1:								;\
67825cf1a30Sjl139090	dec	%o1						;\
67925cf1a30Sjl139090	brnz,pn	%o1, label2/**/f				;\
68025cf1a30Sjl139090	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
68125cf1a30Sjl139090	ldn	[THREAD_REG + T_CPU], %o0			;\
68225cf1a30Sjl139090	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
68325cf1a30Sjl139090	brz,pt	%o0, label2/**/f				;\
68425cf1a30Sjl139090	  nop							;\
68525cf1a30Sjl139090	call	kpreempt					;\
68625cf1a30Sjl139090	  rdpr	%pil, %o0					;\
68725cf1a30Sjl139090label2:
68825cf1a30Sjl139090
68925cf1a30Sjl139090/*
69025cf1a30Sjl139090 * Copy a block of storage, returning an error code if `from' or
69125cf1a30Sjl139090 * `to' takes a kernel pagefault which cannot be resolved.
69225cf1a30Sjl139090 * Returns errno value on pagefault error, 0 if all ok
69325cf1a30Sjl139090 */
69425cf1a30Sjl139090
69525cf1a30Sjl139090#if defined(lint)
69625cf1a30Sjl139090
/*
 * Lint-only C stub for kcopy; it always returns 0.  The real implementation
 * is the assembly routine in the !lint branch that follows.
 */
69725cf1a30Sjl139090/* ARGSUSED */
69825cf1a30Sjl139090int
69925cf1a30Sjl139090kcopy(const void *from, void *to, size_t count)
70025cf1a30Sjl139090{ return(0); }
70125cf1a30Sjl139090
70225cf1a30Sjl139090#else	/* lint */
70325cf1a30Sjl139090
70425cf1a30Sjl139090	.seg	".text"
70525cf1a30Sjl139090	.align	4
70625cf1a30Sjl139090
/*
 * kcopy entry.  On entry %o0 = from, %o1 = to, %o2 = count.
 *
 * Counts of VIS_COPY_THRESHOLD or less always take the small (leaf) path.
 * Otherwise the low bits of (from ^ to) select which hw_copy_limit_N
 * tunable applies: _8 when the low three bits agree, _4 when only the low
 * two agree, _2 when only the lowest agrees, else _1.  A limit of zero, or a
 * count no greater than the limit, also takes the small path; anything
 * larger goes to .kcopy_more.
 *
 * .kcopy_small installs .sm_copyerr in t_lofault, keeping the previous
 * value in %o4, and joins .sm_do_copy.  .kcopy_more allocates a register
 * window with room for HWCOPYFRAMESIZE, installs .copyerr in t_lofault,
 * keeping the previous value in %l6, and branches to .do_copy.
 */
70725cf1a30Sjl139090	ENTRY(kcopy)
70825cf1a30Sjl139090
70925cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
71025cf1a30Sjl139090	bleu,pt	%ncc, .kcopy_small		! go to larger cases
71125cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
71225cf1a30Sjl139090	btst	7, %o3				!
71325cf1a30Sjl139090	bz,pt	%ncc, .kcopy_8			! check for longword alignment
71425cf1a30Sjl139090	  nop
71525cf1a30Sjl139090	btst	1, %o3				!
71625cf1a30Sjl139090	bz,pt	%ncc, .kcopy_2			! check for half-word
71725cf1a30Sjl139090	  nop
71825cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
71925cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
72025cf1a30Sjl139090	tst	%o3
72125cf1a30Sjl139090	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
72225cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
72325cf1a30Sjl139090	bleu,pt	%ncc, .kcopy_small		! go to small copy
72425cf1a30Sjl139090	  nop
72525cf1a30Sjl139090	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
72625cf1a30Sjl139090	  nop
72725cf1a30Sjl139090.kcopy_2:
72825cf1a30Sjl139090	btst	3, %o3				!
72925cf1a30Sjl139090	bz,pt	%ncc, .kcopy_4			! check for word alignment
73025cf1a30Sjl139090	  nop
73125cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
73225cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
73325cf1a30Sjl139090	tst	%o3
73425cf1a30Sjl139090	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
73525cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
73625cf1a30Sjl139090	bleu,pt	%ncc, .kcopy_small		! go to small copy
73725cf1a30Sjl139090	  nop
73825cf1a30Sjl139090	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
73925cf1a30Sjl139090	  nop
74025cf1a30Sjl139090.kcopy_4:
74125cf1a30Sjl139090	! already checked longword, must be word aligned
74225cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
74325cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
74425cf1a30Sjl139090	tst	%o3
74525cf1a30Sjl139090	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
74625cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
74725cf1a30Sjl139090	bleu,pt	%ncc, .kcopy_small		! go to small copy
74825cf1a30Sjl139090	  nop
74925cf1a30Sjl139090	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
75025cf1a30Sjl139090	  nop
75125cf1a30Sjl139090.kcopy_8:
75225cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
75325cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
75425cf1a30Sjl139090	tst	%o3
75525cf1a30Sjl139090	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
75625cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
75725cf1a30Sjl139090	bleu,pt	%ncc, .kcopy_small		! go to small copy
75825cf1a30Sjl139090	  nop
75925cf1a30Sjl139090	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
76025cf1a30Sjl139090	  nop
76125cf1a30Sjl139090
76225cf1a30Sjl139090.kcopy_small:
76325cf1a30Sjl139090	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
76425cf1a30Sjl139090	or	%o5, %lo(.sm_copyerr), %o5
76525cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
76625cf1a30Sjl139090	membar	#Sync				! sync error barrier
76725cf1a30Sjl139090	ba,pt	%ncc, .sm_do_copy		! common code
76825cf1a30Sjl139090	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
76925cf1a30Sjl139090
77025cf1a30Sjl139090.kcopy_more:
77125cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
77225cf1a30Sjl139090	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
77325cf1a30Sjl139090	or	%l7, %lo(.copyerr), %l7
77425cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
77525cf1a30Sjl139090	membar	#Sync				! sync error barrier
77625cf1a30Sjl139090	ba,pt	%ncc, .do_copy			! common code
77725cf1a30Sjl139090	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
77825cf1a30Sjl139090
77925cf1a30Sjl139090
78025cf1a30Sjl139090/*
78125cf1a30Sjl139090 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
78225cf1a30Sjl139090 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
78325cf1a30Sjl139090 */
/*
 * Recovery sequence: point t_lofault at .copyerr2 while fp state is being
 * restored.  If FPUSED_FLAG is set in the saved handler word (%l6), reload
 * %gsr and the saved %fprs value from the frame; when that saved %fprs has
 * FPRS_FEF set the two quadrants are block-loaded back from the stack,
 * otherwise they are zeroed with FZEROQ1Q3.  Then the original t_lofault
 * (flag bits masked off) is restored and FP_ALLOWMIGRATE is run.  If
 * TRAMP_FLAG was not set the errno in %g1 is returned in %o0; if it was
 * set, control jumps to the previous handler with %o0 = 0.
 */
78425cf1a30Sjl139090.copyerr:
78525cf1a30Sjl139090	set	.copyerr2, %l0
78625cf1a30Sjl139090	membar	#Sync				! sync error barrier
78725cf1a30Sjl139090	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
78825cf1a30Sjl139090	btst	FPUSED_FLAG, %l6
78925cf1a30Sjl139090	bz	%ncc, 1f
79025cf1a30Sjl139090	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
79125cf1a30Sjl139090
79225cf1a30Sjl139090	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
79325cf1a30Sjl139090	wr	%o2, 0, %gsr
79425cf1a30Sjl139090
79525cf1a30Sjl139090	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
79625cf1a30Sjl139090	btst	FPRS_FEF, %o3
79725cf1a30Sjl139090	bz,pt	%icc, 4f
79825cf1a30Sjl139090	  nop
79925cf1a30Sjl139090
80025cf1a30Sjl139090	BLD_FPQ1Q3_FROMSTACK(%o2)
80125cf1a30Sjl139090
80225cf1a30Sjl139090	ba,pt	%ncc, 1f
80325cf1a30Sjl139090	  wr	%o3, 0, %fprs		! restore fprs
80425cf1a30Sjl139090
80525cf1a30Sjl1390904:
80625cf1a30Sjl139090	FZEROQ1Q3
80725cf1a30Sjl139090	wr	%o3, 0, %fprs		! restore fprs
80825cf1a30Sjl139090
80925cf1a30Sjl139090	!
81025cf1a30Sjl139090	! Need to cater for the different expectations of kcopy
81125cf1a30Sjl139090	! and bcopy. kcopy will *always* set a t_lofault handler
81225cf1a30Sjl139090	! If it fires, we're expected to just return the error code
81325cf1a30Sjl139090	! and *not* to invoke any existing error handler. As far as
81425cf1a30Sjl139090	! bcopy is concerned, we only set t_lofault if there was an
81525cf1a30Sjl139090	! existing lofault handler. In that case we're expected to
81625cf1a30Sjl139090	! invoke the previously existing handler after resetting the
81725cf1a30Sjl139090	! t_lofault value.
81825cf1a30Sjl139090	!
81925cf1a30Sjl1390901:
82025cf1a30Sjl139090	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
82125cf1a30Sjl139090	membar	#Sync				! sync error barrier
82225cf1a30Sjl139090	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
82325cf1a30Sjl139090	FP_ALLOWMIGRATE(5, 6)
82425cf1a30Sjl139090
82525cf1a30Sjl139090	btst	TRAMP_FLAG, %l0
82625cf1a30Sjl139090	bnz,pn	%ncc, 3f
82725cf1a30Sjl139090	  nop
82825cf1a30Sjl139090	ret
82925cf1a30Sjl139090	  restore	%g1, 0, %o0
83025cf1a30Sjl139090
83125cf1a30Sjl1390903:
83225cf1a30Sjl139090	!
83325cf1a30Sjl139090	! We're here via bcopy. There *must* have been an error handler
83425cf1a30Sjl139090	! in place otherwise we would have died a nasty death already.
83525cf1a30Sjl139090	!
83625cf1a30Sjl139090	jmp	%l6				! goto real handler
83725cf1a30Sjl139090	  restore	%g0, 0, %o0		! dispose of copy window
83825cf1a30Sjl139090
83925cf1a30Sjl139090/*
84025cf1a30Sjl139090 * We got here because of a fault in .copyerr.  We can't safely restore fp
84125cf1a30Sjl139090 * state, so we panic.
84225cf1a30Sjl139090 */
/*
 * fp_panic_msg is the NUL-terminated panic string; .copyerr2 loads its
 * address into %o0 and calls panic().
 */
84325cf1a30Sjl139090fp_panic_msg:
84425cf1a30Sjl139090	.asciz	"Unable to restore fp state after copy operation"
84525cf1a30Sjl139090
84625cf1a30Sjl139090	.align	4
84725cf1a30Sjl139090.copyerr2:
84825cf1a30Sjl139090	set	fp_panic_msg, %o0
84925cf1a30Sjl139090	call	panic
85025cf1a30Sjl139090	  nop
85125cf1a30Sjl139090
85225cf1a30Sjl139090/*
85325cf1a30Sjl139090 * We got here because of a fault during a small kcopy or bcopy.
85425cf1a30Sjl139090 * No floating point registers are used by the small copies.
85525cf1a30Sjl139090 * Errno value is in %g1.
85625cf1a30Sjl139090 */
/*
 * Restore the t_lofault value saved in %o4 with TRAMP_FLAG cleared.  If
 * TRAMP_FLAG was set, jump to that previous handler with %o0 = 0;
 * otherwise return to the caller with the errno from %g1 in %o0.
 */
85725cf1a30Sjl139090.sm_copyerr:
85825cf1a30Sjl1390901:
85925cf1a30Sjl139090	btst	TRAMP_FLAG, %o4
86025cf1a30Sjl139090	membar	#Sync
86125cf1a30Sjl139090	andn	%o4, TRAMP_FLAG, %o4
86225cf1a30Sjl139090	bnz,pn	%ncc, 3f
86325cf1a30Sjl139090	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
86425cf1a30Sjl139090	retl
86525cf1a30Sjl139090	  mov	%g1, %o0
86625cf1a30Sjl1390903:
86725cf1a30Sjl139090	jmp	%o4				! goto real handler
86825cf1a30Sjl139090	  mov	%g0, %o0			!
86925cf1a30Sjl139090
87025cf1a30Sjl139090	SET_SIZE(kcopy)
87125cf1a30Sjl139090#endif	/* lint */
87225cf1a30Sjl139090
87325cf1a30Sjl139090
87425cf1a30Sjl139090/*
87525cf1a30Sjl139090 * Copy a block of storage - must not overlap (from + len <= to).
87625cf1a30Sjl139090 * Registers: l6 - saved t_lofault
87725cf1a30Sjl139090 * (for short copies, o4 - saved t_lofault)
87825cf1a30Sjl139090 *
87925cf1a30Sjl139090 * Copy a page of memory.
88025cf1a30Sjl139090 * Assumes double word alignment and a count >= 256.
88125cf1a30Sjl139090 */
88225cf1a30Sjl139090#if defined(lint)
88325cf1a30Sjl139090
/*
 * Lint-only C stub for bcopy; the body is empty.  The real implementation
 * is the assembly routine in the !lint branch that follows.
 */
88425cf1a30Sjl139090/* ARGSUSED */
88525cf1a30Sjl139090void
88625cf1a30Sjl139090bcopy(const void *from, void *to, size_t count)
88725cf1a30Sjl139090{}
88825cf1a30Sjl139090
88925cf1a30Sjl139090#else	/* lint */
89025cf1a30Sjl139090
/*
 * bcopy entry.  On entry %o0 = from, %o1 = to, %o2 = count.
 *
 * Counts of VIS_COPY_THRESHOLD or less always go to .bcopy_small.
 * Otherwise the low bits of (from ^ to) select which hw_copy_limit_N
 * tunable applies: _8 when the low three bits agree, _4 when only the low
 * two agree, _2 when only the lowest agrees, else _1.  A limit of zero, or a
 * count no greater than the limit, also goes to .bcopy_small; anything
 * larger goes to .bcopy_more.
 */
89125cf1a30Sjl139090	ENTRY(bcopy)
89225cf1a30Sjl139090
89325cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
89425cf1a30Sjl139090	bleu,pt	%ncc, .bcopy_small		! go to larger cases
89525cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
89625cf1a30Sjl139090	btst	7, %o3				!
89725cf1a30Sjl139090	bz,pt	%ncc, .bcopy_8			! check for longword alignment
89825cf1a30Sjl139090	  nop
89925cf1a30Sjl139090	btst	1, %o3				!
90025cf1a30Sjl139090	bz,pt	%ncc, .bcopy_2			! check for half-word
90125cf1a30Sjl139090	  nop
90225cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
90325cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
90425cf1a30Sjl139090	tst	%o3
90525cf1a30Sjl139090	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
90625cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
90725cf1a30Sjl139090	bleu,pt	%ncc, .bcopy_small		! go to small copy
90825cf1a30Sjl139090	  nop
90925cf1a30Sjl139090	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
91025cf1a30Sjl139090	  nop
91125cf1a30Sjl139090.bcopy_2:
91225cf1a30Sjl139090	btst	3, %o3				!
91325cf1a30Sjl139090	bz,pt	%ncc, .bcopy_4			! check for word alignment
91425cf1a30Sjl139090	  nop
91525cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
91625cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
91725cf1a30Sjl139090	tst	%o3
91825cf1a30Sjl139090	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
91925cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
92025cf1a30Sjl139090	bleu,pt	%ncc, .bcopy_small		! go to small copy
92125cf1a30Sjl139090	  nop
92225cf1a30Sjl139090	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
92325cf1a30Sjl139090	  nop
92425cf1a30Sjl139090.bcopy_4:
92525cf1a30Sjl139090	! already checked longword, must be word aligned
92625cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
92725cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
92825cf1a30Sjl139090	tst	%o3
92925cf1a30Sjl139090	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
93025cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
93125cf1a30Sjl139090	bleu,pt	%ncc, .bcopy_small		! go to small copy
93225cf1a30Sjl139090	  nop
93325cf1a30Sjl139090	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
93425cf1a30Sjl139090	  nop
93525cf1a30Sjl139090.bcopy_8:
93625cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
93725cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
93825cf1a30Sjl139090	tst	%o3
93925cf1a30Sjl139090	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
94025cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
94125cf1a30Sjl139090	bleu,pt	%ncc, .bcopy_small		! go to small copy
94225cf1a30Sjl139090	  nop
94325cf1a30Sjl139090	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
94425cf1a30Sjl139090	  nop
94525cf1a30Sjl139090
/*
 * Small-copy path shared by bcopy and kcopy (kcopy enters at .sm_do_copy
 * with its own handler already installed).  .bcopy_small installs
 * .sm_copyerr only when a t_lofault handler is already set, and records
 * that by or-ing TRAMP_FLAG into the saved value in %o4.
 *
 * .sm_do_copy picks a strategy by count: SHORTCOPY or fewer bytes go
 * straight to .bc_sm_left (which moves at most three bytes), more than
 * CHKSIZE bytes go to .bc_med, and the rest are copied a word at a time when
 * both addresses are 4-byte aligned, or four bytes per loop otherwise.
 *
 * All paths finish at .bc_sm_exit: if t_lofault is currently non-zero the
 * saved value in %o4 (TRAMP_FLAG cleared) is written back; the routine then
 * returns 0 in %o0.
 */
94625cf1a30Sjl139090	.align	16
94725cf1a30Sjl139090.bcopy_small:
94825cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
94925cf1a30Sjl139090	tst	%o4
95025cf1a30Sjl139090	bz,pt	%icc, .sm_do_copy
95125cf1a30Sjl139090	  nop
95225cf1a30Sjl139090	sethi	%hi(.sm_copyerr), %o5
95325cf1a30Sjl139090	or	%o5, %lo(.sm_copyerr), %o5
95425cf1a30Sjl139090	membar	#Sync				! sync error barrier
95525cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
95625cf1a30Sjl139090	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
95725cf1a30Sjl139090.sm_do_copy:
95825cf1a30Sjl139090	cmp	%o2, SHORTCOPY		! check for really short case
95925cf1a30Sjl139090	bleu,pt	%ncc, .bc_sm_left	!
96025cf1a30Sjl139090	  cmp	%o2, CHKSIZE		! check for medium length cases
96125cf1a30Sjl139090	bgu,pn	%ncc, .bc_med		!
96225cf1a30Sjl139090	  or	%o0, %o1, %o3		! prepare alignment check
96325cf1a30Sjl139090	andcc	%o3, 0x3, %g0		! test for alignment
96425cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
96525cf1a30Sjl139090.bc_sm_movebytes:
96625cf1a30Sjl139090	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
96725cf1a30Sjl139090.bc_sm_notalign4:
96825cf1a30Sjl139090	ldub	[%o0], %o3		! read byte
96925cf1a30Sjl139090	stb	%o3, [%o1]		! write byte
97025cf1a30Sjl139090	subcc	%o2, 4, %o2		! reduce count by 4
97125cf1a30Sjl139090	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
97225cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
97325cf1a30Sjl139090	stb	%o3, [%o1 + 1]
97425cf1a30Sjl139090	ldub	[%o0 - 2], %o3
97525cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
97625cf1a30Sjl139090	stb	%o3, [%o1 - 2]
97725cf1a30Sjl139090	ldub	[%o0 - 1], %o3
97825cf1a30Sjl139090	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
97925cf1a30Sjl139090	  stb	%o3, [%o1 - 1]
98025cf1a30Sjl139090	add	%o2, 3, %o2		! restore count
98125cf1a30Sjl139090.bc_sm_left:
98225cf1a30Sjl139090	tst	%o2
98325cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit	! check for zero length
98425cf1a30Sjl139090	  deccc	%o2			! reduce count for cc test
98525cf1a30Sjl139090	ldub	[%o0], %o3		! move one byte
98625cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
98725cf1a30Sjl139090	  stb	%o3, [%o1]
98825cf1a30Sjl139090	ldub	[%o0 + 1], %o3		! move another byte
98925cf1a30Sjl139090	deccc	%o2			! check for more
99025cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
99125cf1a30Sjl139090	  stb	%o3, [%o1 + 1]
99225cf1a30Sjl139090	ldub	[%o0 + 2], %o3		! move final byte
993*e64c6c3fSMichael Bergknoff	ba,pt   %ncc, .bc_sm_exit
99425cf1a30Sjl139090	  stb	%o3, [%o1 + 2]
99525cf1a30Sjl139090	.align	16
99625cf1a30Sjl139090	nop				! instruction alignment
99725cf1a30Sjl139090					! see discussion at start of file
99825cf1a30Sjl139090.bc_sm_words:
99925cf1a30Sjl139090	lduw	[%o0], %o3		! read word
100025cf1a30Sjl139090.bc_sm_wordx:
100125cf1a30Sjl139090	subcc	%o2, 8, %o2		! update count
100225cf1a30Sjl139090	stw	%o3, [%o1]		! write word
100325cf1a30Sjl139090	add	%o0, 8, %o0		! update SRC
100425cf1a30Sjl139090	lduw	[%o0 - 4], %o3		! read word
100525cf1a30Sjl139090	add	%o1, 8, %o1		! update DST
100625cf1a30Sjl139090	bgt,pt	%ncc, .bc_sm_words	! loop til done
100725cf1a30Sjl139090	  stw	%o3, [%o1 - 4]		! write word
100825cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore count
100925cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
101025cf1a30Sjl139090	  deccc	%o2
101125cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_byte
101225cf1a30Sjl139090.bc_sm_half:
101325cf1a30Sjl139090	  subcc	%o2, 2, %o2		! reduce count by 2
101425cf1a30Sjl139090	add	%o0, 2, %o0		! advance SRC by 2
101525cf1a30Sjl139090	lduh	[%o0 - 2], %o3		! read half word
101625cf1a30Sjl139090	add	%o1, 2, %o1		! advance DST by 2
101725cf1a30Sjl139090	bgt,pt	%ncc, .bc_sm_half	! loop til done
101825cf1a30Sjl139090	  sth	%o3, [%o1 - 2]		! write half word
101925cf1a30Sjl139090	addcc	%o2, 1, %o2		! restore count
102025cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
102125cf1a30Sjl139090	  nop
102225cf1a30Sjl139090.bc_sm_byte:
102325cf1a30Sjl139090	ldub	[%o0], %o3
1024*e64c6c3fSMichael Bergknoff	ba,pt   %ncc, .bc_sm_exit
102525cf1a30Sjl139090	  stb	%o3, [%o1]
102625cf1a30Sjl139090
102725cf1a30Sjl139090.bc_sm_word:
102825cf1a30Sjl139090	subcc	%o2, 4, %o2		! update count
102925cf1a30Sjl139090	bgt,pt	%ncc, .bc_sm_wordx
103025cf1a30Sjl139090	  lduw	[%o0], %o3		! read word
103125cf1a30Sjl139090	addcc	%o2, 3, %o2		! restore count
103225cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
103325cf1a30Sjl139090	  stw	%o3, [%o1]		! write word
103425cf1a30Sjl139090	deccc	%o2			! reduce count for cc test
103525cf1a30Sjl139090	ldub	[%o0 + 4], %o3		! load one byte
103625cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
103725cf1a30Sjl139090	  stb	%o3, [%o1 + 4]		! store one byte
103825cf1a30Sjl139090	ldub	[%o0 + 5], %o3		! load second byte
103925cf1a30Sjl139090	deccc	%o2
104025cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
104125cf1a30Sjl139090	  stb	%o3, [%o1 + 5]		! store second byte
104225cf1a30Sjl139090	ldub	[%o0 + 6], %o3		! load third byte
104325cf1a30Sjl139090	stb	%o3, [%o1 + 6]		! store third byte
104425cf1a30Sjl139090.bc_sm_exit:
1045*e64c6c3fSMichael Bergknoff	ldn     [THREAD_REG + T_LOFAULT], %o3
1046*e64c6c3fSMichael Bergknoff	brz,pt  %o3, .bc_sm_done
10470090fbabSkm84432	  nop
104825cf1a30Sjl139090	membar	#Sync				! sync error barrier
104925cf1a30Sjl139090	andn	%o4, TRAMP_FLAG, %o4
105025cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
10510090fbabSkm84432.bc_sm_done:
105225cf1a30Sjl139090	retl
105325cf1a30Sjl139090	  mov	%g0, %o0		! return 0
105425cf1a30Sjl139090
/*
 * Medium-length copies.  The low bits of (from ^ to) choose the widest unit
 * to which both pointers can be aligned together: bytes
 * (.bc_sm_movebytes), halfwords (.bc_med_half), words (.bc_med_word) or
 * long words (.bc_med_long).
 *
 * The long-word path copies single bytes until the source is 4-byte
 * aligned, then one word if needed to reach 8-byte alignment.  It then moves
 * 32 bytes per iteration in .bc_med_lmove, 8 bytes per iteration in
 * .bc_med_lword, and hands any remainder to .bc_sm_half / .bc_sm_byte, or
 * goes directly to .bc_sm_exit when nothing is left.
 */
105525cf1a30Sjl139090	.align 16
105625cf1a30Sjl139090.bc_med:
105725cf1a30Sjl139090	xor	%o0, %o1, %o3		! setup alignment check
105825cf1a30Sjl139090	btst	1, %o3
105925cf1a30Sjl139090	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
106025cf1a30Sjl139090	  nop
106125cf1a30Sjl139090	btst	3, %o3
106225cf1a30Sjl139090	bnz,pt	%ncc, .bc_med_half	! halfword aligned
106325cf1a30Sjl139090	  nop
106425cf1a30Sjl139090	btst	7, %o3
106525cf1a30Sjl139090	bnz,pt	%ncc, .bc_med_word	! word aligned
106625cf1a30Sjl139090	  nop
106725cf1a30Sjl139090.bc_med_long:
106825cf1a30Sjl139090	btst	3, %o0			! check for
106925cf1a30Sjl139090	bz,pt	%ncc, .bc_med_long1	! word alignment
107025cf1a30Sjl139090	  nop
107125cf1a30Sjl139090.bc_med_long0:
107225cf1a30Sjl139090	ldub	[%o0], %o3		! load one byte
107325cf1a30Sjl139090	inc	%o0
107425cf1a30Sjl139090	stb	%o3,[%o1]		! store byte
107525cf1a30Sjl139090	inc	%o1
107625cf1a30Sjl139090	btst	3, %o0
107725cf1a30Sjl139090	bnz,pt	%ncc, .bc_med_long0
107825cf1a30Sjl139090	  dec	%o2
107925cf1a30Sjl139090.bc_med_long1:			! word aligned
108025cf1a30Sjl139090	btst	7, %o0			! check for long word
108125cf1a30Sjl139090	bz,pt	%ncc, .bc_med_long2
108225cf1a30Sjl139090	  nop
108325cf1a30Sjl139090	lduw	[%o0], %o3		! load word
108425cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
108525cf1a30Sjl139090	stw	%o3, [%o1]		! store word
108625cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
108725cf1a30Sjl139090	sub	%o2, 4, %o2		! reduce count by 4
108825cf1a30Sjl139090!
108925cf1a30Sjl139090!  Now long word aligned and have at least 32 bytes to move
109025cf1a30Sjl139090!
109125cf1a30Sjl139090.bc_med_long2:
109225cf1a30Sjl139090	sub	%o2, 31, %o2		! adjust count to allow cc zero test
109325cf1a30Sjl139090.bc_med_lmove:
109425cf1a30Sjl139090	ldx	[%o0], %o3		! read long word
109525cf1a30Sjl139090	stx	%o3, [%o1]		! write long word
109625cf1a30Sjl139090	subcc	%o2, 32, %o2		! reduce count by 32
109725cf1a30Sjl139090	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
109825cf1a30Sjl139090	add	%o0, 32, %o0		! advance SRC by 32
109925cf1a30Sjl139090	stx	%o3, [%o1 + 8]
110025cf1a30Sjl139090	ldx	[%o0 - 16], %o3
110125cf1a30Sjl139090	add	%o1, 32, %o1		! advance DST by 32
110225cf1a30Sjl139090	stx	%o3, [%o1 - 16]
110325cf1a30Sjl139090	ldx	[%o0 - 8], %o3
110425cf1a30Sjl139090	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
110525cf1a30Sjl139090	  stx	%o3, [%o1 - 8]
110625cf1a30Sjl139090	addcc	%o2, 24, %o2		! restore count to long word offset
110725cf1a30Sjl139090	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
110825cf1a30Sjl139090	  nop
110925cf1a30Sjl139090.bc_med_lword:
111025cf1a30Sjl139090	ldx	[%o0], %o3		! read long word
111125cf1a30Sjl139090	subcc	%o2, 8, %o2		! reduce count by 8
111225cf1a30Sjl139090	stx	%o3, [%o1]		! write long word
111325cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
111425cf1a30Sjl139090	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
111525cf1a30Sjl139090	  add	%o1, 8, %o1		! advance DST by 8
111625cf1a30Sjl139090.bc_med_lextra:
111725cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore rest of count
111825cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
111925cf1a30Sjl139090	  deccc	%o2
112025cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_byte
112125cf1a30Sjl139090	  nop
112225cf1a30Sjl139090	ba,pt	%ncc, .bc_sm_half
112325cf1a30Sjl139090	  nop
112425cf1a30Sjl139090
/*
 * Medium copy, word-alignable case.  Single bytes are copied until the
 * source is 4-byte aligned; then 16 bytes per iteration are moved in
 * .bc_med_wmove and 4 bytes per iteration in .bc_med_word2.  Any remainder
 * is handed to .bc_sm_half / .bc_sm_byte, or control goes directly to
 * .bc_sm_exit when nothing is left.
 */
112525cf1a30Sjl139090	.align 16
112625cf1a30Sjl139090.bc_med_word:
112725cf1a30Sjl139090	btst	3, %o0			! check for
112825cf1a30Sjl139090	bz,pt	%ncc, .bc_med_word1	! word alignment
112925cf1a30Sjl139090	  nop
113025cf1a30Sjl139090.bc_med_word0:
113125cf1a30Sjl139090	ldub	[%o0], %o3		! load one byte
113225cf1a30Sjl139090	inc	%o0
113325cf1a30Sjl139090	stb	%o3,[%o1]		! store byte
113425cf1a30Sjl139090	inc	%o1
113525cf1a30Sjl139090	btst	3, %o0
113625cf1a30Sjl139090	bnz,pt	%ncc, .bc_med_word0
113725cf1a30Sjl139090	  dec	%o2
113825cf1a30Sjl139090!
113925cf1a30Sjl139090!  Now word aligned and have at least 36 bytes to move
114025cf1a30Sjl139090!
114125cf1a30Sjl139090.bc_med_word1:
114225cf1a30Sjl139090	sub	%o2, 15, %o2		! adjust count to allow cc zero test
114325cf1a30Sjl139090.bc_med_wmove:
114425cf1a30Sjl139090	lduw	[%o0], %o3		! read word
114525cf1a30Sjl139090	stw	%o3, [%o1]		! write word
114625cf1a30Sjl139090	subcc	%o2, 16, %o2		! reduce count by 16
114725cf1a30Sjl139090	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
114825cf1a30Sjl139090	add	%o0, 16, %o0		! advance SRC by 16
114925cf1a30Sjl139090	stw	%o3, [%o1 + 4]
115025cf1a30Sjl139090	lduw	[%o0 - 8], %o3
115125cf1a30Sjl139090	add	%o1, 16, %o1		! advance DST by 16
115225cf1a30Sjl139090	stw	%o3, [%o1 - 8]
115325cf1a30Sjl139090	lduw	[%o0 - 4], %o3
115425cf1a30Sjl139090	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
115525cf1a30Sjl139090	  stw	%o3, [%o1 - 4]
115625cf1a30Sjl139090	addcc	%o2, 12, %o2		! restore count to word offset
115725cf1a30Sjl139090	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
115825cf1a30Sjl139090	  nop
115925cf1a30Sjl139090.bc_med_word2:
116025cf1a30Sjl139090	lduw	[%o0], %o3		! read word
116125cf1a30Sjl139090	subcc	%o2, 4, %o2		! reduce count by 4
116225cf1a30Sjl139090	stw	%o3, [%o1]		! write word
116325cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
116425cf1a30Sjl139090	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
116525cf1a30Sjl139090	  add	%o1, 4, %o1		! advance DST by 4
116625cf1a30Sjl139090.bc_med_wextra:
116725cf1a30Sjl139090	addcc	%o2, 3, %o2		! restore rest of count
116825cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
116925cf1a30Sjl139090	  deccc	%o2
117025cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_byte
117125cf1a30Sjl139090	  nop
117225cf1a30Sjl139090	ba,pt	%ncc, .bc_sm_half
117325cf1a30Sjl139090	  nop
117425cf1a30Sjl139090
/*
 * Medium copy, halfword-alignable case.  One byte is copied first if the
 * source address is odd; then 8 bytes per iteration are moved as four
 * halfwords in .bc_med_hmove.  Any remainder is handed to .bc_sm_half /
 * .bc_sm_byte, or control goes directly to .bc_sm_exit when nothing is
 * left.  SET_SIZE(bcopy) closes the bcopy routine.
 */
117525cf1a30Sjl139090	.align 16
117625cf1a30Sjl139090.bc_med_half:
117725cf1a30Sjl139090	btst	1, %o0			! check for
117825cf1a30Sjl139090	bz,pt	%ncc, .bc_med_half1	! half word alignment
117925cf1a30Sjl139090	  nop
118025cf1a30Sjl139090	ldub	[%o0], %o3		! load one byte
118125cf1a30Sjl139090	inc	%o0
118225cf1a30Sjl139090	stb	%o3,[%o1]		! store byte
118325cf1a30Sjl139090	inc	%o1
118425cf1a30Sjl139090	dec	%o2
118525cf1a30Sjl139090!
118625cf1a30Sjl139090!  Now half word aligned and have at least 38 bytes to move
118725cf1a30Sjl139090!
118825cf1a30Sjl139090.bc_med_half1:
118925cf1a30Sjl139090	sub	%o2, 7, %o2		! adjust count to allow cc zero test
119025cf1a30Sjl139090.bc_med_hmove:
119125cf1a30Sjl139090	lduh	[%o0], %o3		! read half word
119225cf1a30Sjl139090	sth	%o3, [%o1]		! write half word
119325cf1a30Sjl139090	subcc	%o2, 8, %o2		! reduce count by 8
119425cf1a30Sjl139090	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
119525cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
119625cf1a30Sjl139090	sth	%o3, [%o1 + 2]
119725cf1a30Sjl139090	lduh	[%o0 - 4], %o3
119825cf1a30Sjl139090	add	%o1, 8, %o1		! advance DST by 8
119925cf1a30Sjl139090	sth	%o3, [%o1 - 4]
120025cf1a30Sjl139090	lduh	[%o0 - 2], %o3
120125cf1a30Sjl139090	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
120225cf1a30Sjl139090	  sth	%o3, [%o1 - 2]
120325cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore count
120425cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_exit
120525cf1a30Sjl139090	  deccc	%o2
120625cf1a30Sjl139090	bz,pt	%ncc, .bc_sm_byte
120725cf1a30Sjl139090	  nop
120825cf1a30Sjl139090	ba,pt	%ncc, .bc_sm_half
120925cf1a30Sjl139090	  nop
121025cf1a30Sjl139090
121125cf1a30Sjl139090	SET_SIZE(bcopy)
121225cf1a30Sjl139090
121325cf1a30Sjl139090/*
121425cf1a30Sjl139090 * The _more entry points are not intended to be used directly by
121525cf1a30Sjl139090 * any caller from outside this file.  They are provided to allow
121625cf1a30Sjl139090 * profiling and dtrace of the portions of the copy code that use
121725cf1a30Sjl139090 * the floating point registers.
121825cf1a30Sjl139090 * This entry is particularly important as DTRACE (at least as of
121925cf1a30Sjl139090 * 4/2004) does not support leaf functions.
122025cf1a30Sjl139090 */
122125cf1a30Sjl139090
/*
 * bcopy_more: copy path that moves data through the floating point
 * registers and writes the destination with 64-byte block stores
 * (stda ... ASI_BLK_P).
 *
 * Register use after the save:
 *	%i0 (REALSRC)	current source address
 *	%i1 (DST)	current destination address
 *	%i2 (CNT)	bytes still to copy
 *	%i3 (SRC)	REALSRC rounded down to 8 bytes; also used as a
 *			byte temporary in the DST alignment loops
 *	%i5 (TMP)	scratch
 *	%l6		t_lofault value read on entry, with TRAMP_FLAG
 *			and FPUSED_FLAG or'ed in as the code proceeds
 *
 * Outline:
 *	1. If t_lofault was non-zero on entry, install .copyerr as the
 *	   handler and set TRAMP_FLAG in %l6.
 *	2. Save %fprs and %gsr in the frame; if FPRS_FEF was already set,
 *	   run BST_FPQ1Q3_TOSTACK first (presumably saving the FP
 *	   registers used below -- see the macro definition).
 *	3. Copy bytes until DST is VIS_BLOCKSIZE aligned.
 *	4. Run the block loop, then copy any tail a byte at a time.
 *	5. Restore %gsr, the FP registers/%fprs and t_lofault.
 * %o0 is 0 on return (restore %g0, 0, %o0).
 */
122225cf1a30Sjl139090	ENTRY(bcopy_more)
122325cf1a30Sjl139090.bcopy_more:
122425cf1a30Sjl139090	prefetch [%o0], #n_reads
122525cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
122625cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
122725cf1a30Sjl139090	tst	%l6
122825cf1a30Sjl139090	bz,pt	%ncc, .do_copy
122925cf1a30Sjl139090	  nop
123025cf1a30Sjl139090	sethi	%hi(.copyerr), %o2
123125cf1a30Sjl139090	or	%o2, %lo(.copyerr), %o2
123225cf1a30Sjl139090	membar	#Sync				! sync error barrier
123325cf1a30Sjl139090	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
123425cf1a30Sjl139090	!
123525cf1a30Sjl139090	! We've already captured whether t_lofault was zero on entry.
123625cf1a30Sjl139090	! We need to mark ourselves as being from bcopy since both
123725cf1a30Sjl139090	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
123825cf1a30Sjl139090	! and the saved lofault was zero, we won't reset lofault on
123925cf1a30Sjl139090	! returning.
124025cf1a30Sjl139090	!
124125cf1a30Sjl139090	or	%l6, TRAMP_FLAG, %l6
124225cf1a30Sjl139090
124325cf1a30Sjl139090/*
124425cf1a30Sjl139090 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
124525cf1a30Sjl139090 * Also, use of FP registers has been tested to be enabled
124625cf1a30Sjl139090 */
124725cf1a30Sjl139090.do_copy:
124825cf1a30Sjl139090	FP_NOMIGRATE(6, 7)
124925cf1a30Sjl139090
	/*
	 * The branch below is annulled (",a"), so the wr in its delay slot
	 * runs only when the branch is taken, i.e. when FPRS_FEF was clear:
	 * the FPU is enabled and the register save is skipped.  When
	 * FPRS_FEF was already set we fall through to BST_FPQ1Q3_TOSTACK.
	 */
125025cf1a30Sjl139090	rd	%fprs, %o2		! check for unused fp
125125cf1a30Sjl139090	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
125225cf1a30Sjl139090	btst	FPRS_FEF, %o2
125325cf1a30Sjl139090	bz,a,pt	%icc, .do_blockcopy
125425cf1a30Sjl139090	  wr	%g0, FPRS_FEF, %fprs
125525cf1a30Sjl139090
125625cf1a30Sjl139090	BST_FPQ1Q3_TOSTACK(%o2)
125725cf1a30Sjl139090
125825cf1a30Sjl139090.do_blockcopy:
125925cf1a30Sjl139090	rd	%gsr, %o2
126025cf1a30Sjl139090	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
126125cf1a30Sjl139090	or	%l6, FPUSED_FLAG, %l6
126225cf1a30Sjl139090
126325cf1a30Sjl139090#define	REALSRC	%i0
126425cf1a30Sjl139090#define	DST	%i1
126525cf1a30Sjl139090#define	CNT	%i2
126625cf1a30Sjl139090#define	SRC	%i3
126725cf1a30Sjl139090#define	TMP	%i5
126825cf1a30Sjl139090
	/*
	 * Align DST to VIS_BLOCKSIZE.  If it is already aligned go straight
	 * to 2:.  Otherwise TMP = VIS_BLOCKSIZE - (DST & (VIS_BLOCKSIZE - 1))
	 * bytes are copied, four per pass in .bc_blkalign and then one per
	 * pass at 1:, and CNT is reduced by the same amount.
	 */
126925cf1a30Sjl139090	andcc	DST, VIS_BLOCKSIZE - 1, TMP
127025cf1a30Sjl139090	bz,pt	%ncc, 2f
127125cf1a30Sjl139090	  neg	TMP
127225cf1a30Sjl139090	add	TMP, VIS_BLOCKSIZE, TMP
127325cf1a30Sjl139090
127425cf1a30Sjl139090	! TMP = bytes required to align DST on FP_BLOCK boundary
127525cf1a30Sjl139090	! Using SRC as a tmp here
127625cf1a30Sjl139090	cmp	TMP, 3
127725cf1a30Sjl139090	bleu,pt	%ncc, 1f
127825cf1a30Sjl139090	  sub	CNT,TMP,CNT		! adjust main count
127925cf1a30Sjl139090	sub	TMP, 3, TMP		! adjust for end of loop test
128025cf1a30Sjl139090.bc_blkalign:
128125cf1a30Sjl139090	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
128225cf1a30Sjl139090	stb	SRC, [DST]
128325cf1a30Sjl139090	subcc	TMP, 4, TMP
128425cf1a30Sjl139090	ldub	[REALSRC + 1], SRC
128525cf1a30Sjl139090	add	REALSRC, 4, REALSRC
128625cf1a30Sjl139090	stb	SRC, [DST + 1]
128725cf1a30Sjl139090	ldub	[REALSRC - 2], SRC
128825cf1a30Sjl139090	add	DST, 4, DST
128925cf1a30Sjl139090	stb	SRC, [DST - 2]
129025cf1a30Sjl139090	ldub	[REALSRC - 1], SRC
129125cf1a30Sjl139090	bgu,pt	%ncc, .bc_blkalign
129225cf1a30Sjl139090	  stb	SRC, [DST - 1]
129325cf1a30Sjl139090
129425cf1a30Sjl139090	addcc	TMP, 3, TMP		! restore count adjustment
129525cf1a30Sjl139090	bz,pt	%ncc, 2f		! no bytes left?
129625cf1a30Sjl139090	  nop
129725cf1a30Sjl1390901:	ldub	[REALSRC], SRC
129825cf1a30Sjl139090	inc	REALSRC
129925cf1a30Sjl139090	inc	DST
130025cf1a30Sjl139090	deccc	TMP
130125cf1a30Sjl139090	bgu	%ncc, 1b
130225cf1a30Sjl139090	  stb	SRC, [DST - 1]
130325cf1a30Sjl139090
	/*
	 * Prologue of the block loop: SRC is REALSRC rounded down to 8
	 * bytes, alignaddr records the low bits of REALSRC for the
	 * faligndata instructions, and the first 64 source bytes (plus the
	 * first 8 of the next block) are loaded while %f32-%f42 are filled.
	 */
130425cf1a30Sjl1390902:
130525cf1a30Sjl139090	membar	#StoreLoad
130625cf1a30Sjl139090	andn	REALSRC, 0x7, SRC
130725cf1a30Sjl139090
130825cf1a30Sjl139090	! SRC - 8-byte aligned
130925cf1a30Sjl139090	! DST - 64-byte aligned
131025cf1a30Sjl139090	ldd	[SRC], %f0
131125cf1a30Sjl139090	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
131225cf1a30Sjl139090	alignaddr REALSRC, %g0, %g0
131325cf1a30Sjl139090	ldd	[SRC + 0x08], %f2
131425cf1a30Sjl139090	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
131525cf1a30Sjl139090	faligndata %f0, %f2, %f32
131625cf1a30Sjl139090	ldd	[SRC + 0x10], %f4
1317c8a722abSpm145316	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
131825cf1a30Sjl139090	faligndata %f2, %f4, %f34
131925cf1a30Sjl139090	ldd	[SRC + 0x18], %f6
132025cf1a30Sjl139090	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
132125cf1a30Sjl139090	faligndata %f4, %f6, %f36
132225cf1a30Sjl139090	ldd	[SRC + 0x20], %f8
1323c8a722abSpm145316	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
132425cf1a30Sjl139090	faligndata %f6, %f8, %f38
132525cf1a30Sjl139090	ldd	[SRC + 0x28], %f10
1326c8a722abSpm145316	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
132725cf1a30Sjl139090	faligndata %f8, %f10, %f40
132825cf1a30Sjl139090	ldd	[SRC + 0x30], %f12
1329c8a722abSpm145316	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
133025cf1a30Sjl139090	faligndata %f10, %f12, %f42
133125cf1a30Sjl139090	ldd	[SRC + 0x38], %f14
133225cf1a30Sjl139090	ldd	[SRC + VIS_BLOCKSIZE], %f0
133325cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
133425cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
1335c8a722abSpm145316	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
133625cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
133725cf1a30Sjl139090	ba,pt	%ncc, 1f
1338c8a722abSpm145316	  prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	/*
	 * Main loop.  Each pass finishes aligning the previous 64 bytes
	 * into %f32-%f46, block-stores them to DST, and loads and starts
	 * aligning the next 64 bytes.  It repeats while
	 * CNT > VIS_BLOCKSIZE + 8.
	 */
133925cf1a30Sjl139090	.align	32
134025cf1a30Sjl1390901:
134125cf1a30Sjl139090	ldd	[SRC + 0x08], %f2
134225cf1a30Sjl139090	faligndata %f12, %f14, %f44
134325cf1a30Sjl139090	ldd	[SRC + 0x10], %f4
134425cf1a30Sjl139090	faligndata %f14, %f0, %f46
134525cf1a30Sjl139090	stda	%f32, [DST]ASI_BLK_P
134625cf1a30Sjl139090	ldd	[SRC + 0x18], %f6
134725cf1a30Sjl139090	faligndata %f0, %f2, %f32
134825cf1a30Sjl139090	ldd	[SRC + 0x20], %f8
134925cf1a30Sjl139090	faligndata %f2, %f4, %f34
135025cf1a30Sjl139090	ldd	[SRC + 0x28], %f10
135125cf1a30Sjl139090	faligndata %f4, %f6, %f36
135225cf1a30Sjl139090	ldd	[SRC + 0x30], %f12
135325cf1a30Sjl139090	faligndata %f6, %f8, %f38
135425cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
1355c8a722abSpm145316	ldd	[SRC + 0x38], %f14
1356c8a722abSpm145316	faligndata %f8, %f10, %f40
135725cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
1358c8a722abSpm145316	ldd	[SRC + VIS_BLOCKSIZE], %f0
1359c8a722abSpm145316	faligndata %f10, %f12, %f42
136025cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1361c8a722abSpm145316	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1362c8a722abSpm145316	add	SRC, VIS_BLOCKSIZE, SRC
1363c8a722abSpm145316	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
136425cf1a30Sjl139090	cmp	CNT, VIS_BLOCKSIZE + 8
136525cf1a30Sjl139090	bgu,pt	%ncc, 1b
1366c8a722abSpm145316	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
136725cf1a30Sjl139090
	/*
	 * Loop exit.  If exactly VIS_BLOCKSIZE bytes remain and REALSRC is
	 * 8-byte aligned, take the 2: path, which stores the pending block
	 * and then the final block using fsrc1 moves instead of faligndata.
	 * Otherwise fall into the first 3:, which stores only the pending
	 * block and then branches to the second 3: below to finish the
	 * remaining CNT bytes singly.  Note that the label 3 is used twice:
	 * "bne 3f" reaches the first one, the later "ba 3f" the second.
	 */
136825cf1a30Sjl139090	! only if REALSRC & 0x7 is 0
136925cf1a30Sjl139090	cmp	CNT, VIS_BLOCKSIZE
137025cf1a30Sjl139090	bne	%ncc, 3f
137125cf1a30Sjl139090	  andcc	REALSRC, 0x7, %g0
137225cf1a30Sjl139090	bz,pt	%ncc, 2f
137325cf1a30Sjl139090	  nop
137425cf1a30Sjl1390903:
137525cf1a30Sjl139090	faligndata %f12, %f14, %f44
137625cf1a30Sjl139090	faligndata %f14, %f0, %f46
137725cf1a30Sjl139090	stda	%f32, [DST]ASI_BLK_P
137825cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
137925cf1a30Sjl139090	ba,pt	%ncc, 3f
138025cf1a30Sjl139090	  nop
138125cf1a30Sjl1390902:
138225cf1a30Sjl139090	ldd	[SRC + 0x08], %f2
138325cf1a30Sjl139090	fsrc1	%f12, %f44
138425cf1a30Sjl139090	ldd	[SRC + 0x10], %f4
138525cf1a30Sjl139090	fsrc1	%f14, %f46
138625cf1a30Sjl139090	stda	%f32, [DST]ASI_BLK_P
138725cf1a30Sjl139090	ldd	[SRC + 0x18], %f6
138825cf1a30Sjl139090	fsrc1	%f0, %f32
138925cf1a30Sjl139090	ldd	[SRC + 0x20], %f8
139025cf1a30Sjl139090	fsrc1	%f2, %f34
139125cf1a30Sjl139090	ldd	[SRC + 0x28], %f10
139225cf1a30Sjl139090	fsrc1	%f4, %f36
139325cf1a30Sjl139090	ldd	[SRC + 0x30], %f12
139425cf1a30Sjl139090	fsrc1	%f6, %f38
139525cf1a30Sjl139090	ldd	[SRC + 0x38], %f14
139625cf1a30Sjl139090	fsrc1	%f8, %f40
139725cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
139825cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
139925cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
140025cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
140125cf1a30Sjl139090	fsrc1	%f10, %f42
140225cf1a30Sjl139090	fsrc1	%f12, %f44
140325cf1a30Sjl139090	fsrc1	%f14, %f46
140425cf1a30Sjl139090	stda	%f32, [DST]ASI_BLK_P
140525cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
140625cf1a30Sjl139090	ba,a,pt	%ncc, .bcb_exit
140725cf1a30Sjl139090	  nop
140825cf1a30Sjl139090
	/* Tail: copy whatever is left in CNT one byte at a time. */
140925cf1a30Sjl1390903:	tst	CNT
141025cf1a30Sjl139090	bz,a,pt	%ncc, .bcb_exit
141125cf1a30Sjl139090	  nop
141225cf1a30Sjl139090
141325cf1a30Sjl1390905:	ldub	[REALSRC], TMP
141425cf1a30Sjl139090	inc	REALSRC
141525cf1a30Sjl139090	inc	DST
141625cf1a30Sjl139090	deccc	CNT
141725cf1a30Sjl139090	bgu	%ncc, 5b
141825cf1a30Sjl139090	  stb	TMP, [DST - 1]
	/*
	 * Common exit: restore %gsr, then either run BLD_FPQ1Q3_FROMSTACK
	 * (FPRS_FEF was set on entry) or FZEROQ1Q3 (it was clear), restore
	 * %fprs, strip MASK_FLAGS from %l6 and write it back to t_lofault.
	 */
141925cf1a30Sjl139090.bcb_exit:
142025cf1a30Sjl139090	membar	#Sync
142125cf1a30Sjl139090
142225cf1a30Sjl139090	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
142325cf1a30Sjl139090	wr	%o2, 0, %gsr
142425cf1a30Sjl139090
142525cf1a30Sjl139090	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
142625cf1a30Sjl139090	btst	FPRS_FEF, %o3
142725cf1a30Sjl139090	bz,pt	%icc, 4f
142825cf1a30Sjl139090	  nop
142925cf1a30Sjl139090
143025cf1a30Sjl139090	BLD_FPQ1Q3_FROMSTACK(%o2)
143125cf1a30Sjl139090
143225cf1a30Sjl139090	ba,pt	%ncc, 2f
143325cf1a30Sjl139090	  wr	%o3, 0, %fprs		! restore fprs
143425cf1a30Sjl1390904:
143525cf1a30Sjl139090	FZEROQ1Q3
143625cf1a30Sjl139090	wr	%o3, 0, %fprs		! restore fprs
143725cf1a30Sjl1390902:
143825cf1a30Sjl139090	membar	#Sync				! sync error barrier
143925cf1a30Sjl139090	andn	%l6, MASK_FLAGS, %l6
144025cf1a30Sjl139090	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
144125cf1a30Sjl139090	FP_ALLOWMIGRATE(5, 6)
144225cf1a30Sjl139090	ret
144325cf1a30Sjl139090	  restore	%g0, 0, %o0
144425cf1a30Sjl139090
144525cf1a30Sjl139090	SET_SIZE(bcopy_more)
144625cf1a30Sjl139090
144725cf1a30Sjl139090#endif	/* lint */
144825cf1a30Sjl139090
144925cf1a30Sjl139090/*
145025cf1a30Sjl139090 * Block copy with possibly overlapped operands.
145125cf1a30Sjl139090 */
145225cf1a30Sjl139090
145325cf1a30Sjl139090#if defined(lint)
145425cf1a30Sjl139090
145525cf1a30Sjl139090/*ARGSUSED*/
145625cf1a30Sjl139090void
145725cf1a30Sjl139090ovbcopy(const void *from, void *to, size_t count)
145825cf1a30Sjl139090{}
145925cf1a30Sjl139090
146025cf1a30Sjl139090#else	/* lint */
146125cf1a30Sjl139090
/*
 * ovbcopy(from, to, count)
 *	%o0 = from, %o1 = to, %o2 = count; %o3 is scratch.
 *
 * A zero count returns immediately.  Otherwise |from - to| is computed
 * in %o3; when count <= |from - to| the regions cannot overlap and the
 * routine branches to bcopy.  If they do overlap, bytes are copied one
 * at a time: in ascending order (.ov_fwd) when from >= to, in descending
 * order (.ov_bkwd) when from < to.
 *
 * This is a leaf routine: no register window is taken and it returns
 * with retl.
 */
146225cf1a30Sjl139090	ENTRY(ovbcopy)
146325cf1a30Sjl139090	tst	%o2			! check count
146425cf1a30Sjl139090	bgu,a	%ncc, 1f		! nothing to do or bad arguments
146525cf1a30Sjl139090	  subcc	%o0, %o1, %o3		! difference of from and to address
146625cf1a30Sjl139090
146725cf1a30Sjl139090	retl				! return
146825cf1a30Sjl139090	  nop
146925cf1a30Sjl1390901:
	/* Annulled branch: the neg runs only when the difference was negative. */
147025cf1a30Sjl139090	bneg,a	%ncc, 2f
147125cf1a30Sjl139090	  neg	%o3			! if < 0, make it positive
	/*
	 * The cmp of from and to sits in the delay slot of the branch to
	 * bcopy, so it executes on both paths; its result is only consumed
	 * by the blu that follows when the branch is not taken.
	 * NOTE(review): .empty presumably suppresses the assembler's
	 * delay-slot diagnostic here -- confirm in the assembler manual.
	 */
147225cf1a30Sjl1390902:	cmp	%o2, %o3		! cmp size and abs(from - to)
147325cf1a30Sjl139090	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
147425cf1a30Sjl139090	  .empty				!   no overlap
147525cf1a30Sjl139090	  cmp	%o0, %o1		! compare from and to addresses
147625cf1a30Sjl139090	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
147725cf1a30Sjl139090	  nop
147825cf1a30Sjl139090	!
147925cf1a30Sjl139090	! Copy forwards.
148025cf1a30Sjl139090	!
148125cf1a30Sjl139090.ov_fwd:
148225cf1a30Sjl139090	ldub	[%o0], %o3		! read from address
148325cf1a30Sjl139090	inc	%o0			! inc from address
148425cf1a30Sjl139090	stb	%o3, [%o1]		! write to address
148525cf1a30Sjl139090	deccc	%o2			! dec count
148625cf1a30Sjl139090	bgu	%ncc, .ov_fwd		! loop till done
148725cf1a30Sjl139090	  inc	%o1			! inc to address
148825cf1a30Sjl139090
148925cf1a30Sjl139090	retl				! return
149025cf1a30Sjl139090	  nop
149125cf1a30Sjl139090	!
149225cf1a30Sjl139090	! Copy backwards.
149325cf1a30Sjl139090	!
	/*
	 * %o2 is decremented first and then used as the index, so the bytes
	 * at offsets count-1 down to 0 are moved; the pointers themselves
	 * are never changed.
	 */
149425cf1a30Sjl139090.ov_bkwd:
149525cf1a30Sjl139090	deccc	%o2			! dec count
149625cf1a30Sjl139090	ldub	[%o0 + %o2], %o3	! get byte at end of src
149725cf1a30Sjl139090	bgu	%ncc, .ov_bkwd		! loop till done
149825cf1a30Sjl139090	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
149925cf1a30Sjl139090
150025cf1a30Sjl139090	retl				! return
150125cf1a30Sjl139090	  nop
150225cf1a30Sjl139090
150325cf1a30Sjl139090	SET_SIZE(ovbcopy)
150425cf1a30Sjl139090
150525cf1a30Sjl139090#endif	/* lint */
150625cf1a30Sjl139090
150725cf1a30Sjl139090
150825cf1a30Sjl139090/*
150925cf1a30Sjl139090 * hwblkpagecopy()
151025cf1a30Sjl139090 *
151125cf1a30Sjl139090 * Copies exactly one page.  This routine assumes the caller (ppcopy)
151225cf1a30Sjl139090 * has already disabled kernel preemption and has checked
151325cf1a30Sjl139090 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
151425cf1a30Sjl139090 */
151525cf1a30Sjl139090#ifdef lint
151625cf1a30Sjl139090/*ARGSUSED*/
151725cf1a30Sjl139090void
151825cf1a30Sjl139090hwblkpagecopy(const void *src, void *dst)
151925cf1a30Sjl139090{ }
152025cf1a30Sjl139090#else /* lint */
/*
 * Implementation notes:
 *  - REALSRC, DST, CNT and SRC are the %i0, %i1, %i2 and %i3 aliases
 *    #defined earlier in this file.  CNT is set to PAGESIZE here; it is
 *    not an argument.
 *  - No alignaddr/faligndata is used.  Data is loaded into %f0-%f14,
 *    moved to %f32-%f46 with fmovd/fsrc1, and written with 64-byte block
 *    stores (stda ... ASI_BLK_P).
 *    NOTE(review): this presumes src and dst are suitably aligned for
 *    ldd and block stores (page addresses) -- confirm against the caller.
 *  - If FPRS_FEF was set on entry, BST_FPQ1Q3_TOSTACK runs before the
 *    copy and BLD_FPQ1Q3_FROMSTACK after it; otherwise the FPU is
 *    enabled for the copy and FZEROQ1Q3 runs afterwards.  The macro
 *    argument differs between the two (%l1 vs %l3).
 *    NOTE(review): presumably it is only a scratch register that each
 *    macro sets up itself -- confirm in the macro definitions.
 *  - No t_lofault handler is installed by this routine.
 *  - %o0 is cleared on return (restore %g0, 0, %o0) although the lint
 *    prototype is void.
 */
152125cf1a30Sjl139090	ENTRY(hwblkpagecopy)
152225cf1a30Sjl139090	! get another window w/space for three aligned blocks of saved fpregs
152325cf1a30Sjl139090	prefetch [%o0], #n_reads
152425cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
152525cf1a30Sjl139090
152625cf1a30Sjl139090	! %i0 - source address (arg)
152725cf1a30Sjl139090	! %i1 - destination address (arg)
152825cf1a30Sjl139090	! %i2 - length of region (not arg)
152925cf1a30Sjl139090	! %l0 - saved fprs
153025cf1a30Sjl139090	! %l1 - pointer to saved fpregs
153125cf1a30Sjl139090
	/*
	 * Annulled branch: the wr in the delay slot runs only when
	 * FPRS_FEF was clear, enabling the FPU and skipping the save.
	 */
153225cf1a30Sjl139090	rd	%fprs, %l0		! check for unused fp
153325cf1a30Sjl139090	btst	FPRS_FEF, %l0
153425cf1a30Sjl139090	bz,a,pt	%icc, 1f
153525cf1a30Sjl139090	  wr	%g0, FPRS_FEF, %fprs
153625cf1a30Sjl139090
153725cf1a30Sjl139090	BST_FPQ1Q3_TOSTACK(%l1)
153825cf1a30Sjl139090
	/*
	 * Prologue: load the first 64 bytes (and the first 8 of the second
	 * block) and start filling %f32-%f42.
	 */
153925cf1a30Sjl1390901:	set	PAGESIZE, CNT
154025cf1a30Sjl139090	mov	REALSRC, SRC
154125cf1a30Sjl139090
154225cf1a30Sjl139090	ldd	[SRC], %f0
154325cf1a30Sjl139090	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
154425cf1a30Sjl139090	ldd	[SRC + 0x08], %f2
154525cf1a30Sjl139090	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
154625cf1a30Sjl139090	fmovd	%f0, %f32
154725cf1a30Sjl139090	ldd	[SRC + 0x10], %f4
1548c8a722abSpm145316	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
154925cf1a30Sjl139090	fmovd	%f2, %f34
155025cf1a30Sjl139090	ldd	[SRC + 0x18], %f6
155125cf1a30Sjl139090	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
155225cf1a30Sjl139090	fmovd	%f4, %f36
155325cf1a30Sjl139090	ldd	[SRC + 0x20], %f8
1554c8a722abSpm145316	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
155525cf1a30Sjl139090	fmovd	%f6, %f38
155625cf1a30Sjl139090	ldd	[SRC + 0x28], %f10
1557c8a722abSpm145316	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
155825cf1a30Sjl139090	fmovd	%f8, %f40
155925cf1a30Sjl139090	ldd	[SRC + 0x30], %f12
1560c8a722abSpm145316	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
156125cf1a30Sjl139090	fmovd	%f10, %f42
156225cf1a30Sjl139090	ldd	[SRC + 0x38], %f14
156325cf1a30Sjl139090	ldd	[SRC + VIS_BLOCKSIZE], %f0
156425cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
156525cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
1566c8a722abSpm145316	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
156725cf1a30Sjl139090	ba,pt	%ncc, 2f
1568c8a722abSpm145316	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	/*
	 * Main loop.  Each pass completes the pending block in %f32-%f46,
	 * block-stores it to DST, and loads the next 64 source bytes.  It
	 * repeats while CNT > VIS_BLOCKSIZE + 8.  (The prefetch directly
	 * above is the delay slot of the ba.)
	 */
156925cf1a30Sjl139090	.align	32
157025cf1a30Sjl1390902:
157125cf1a30Sjl139090	ldd	[SRC + 0x08], %f2
157225cf1a30Sjl139090	fmovd	%f12, %f44
157325cf1a30Sjl139090	ldd	[SRC + 0x10], %f4
157425cf1a30Sjl139090	fmovd	%f14, %f46
157525cf1a30Sjl139090	stda	%f32, [DST]ASI_BLK_P
157625cf1a30Sjl139090	ldd	[SRC + 0x18], %f6
157725cf1a30Sjl139090	fmovd	%f0, %f32
157825cf1a30Sjl139090	ldd	[SRC + 0x20], %f8
157925cf1a30Sjl139090	fmovd	%f2, %f34
158025cf1a30Sjl139090	ldd	[SRC + 0x28], %f10
158125cf1a30Sjl139090	fmovd	%f4, %f36
158225cf1a30Sjl139090	ldd	[SRC + 0x30], %f12
158325cf1a30Sjl139090	fmovd	%f6, %f38
158425cf1a30Sjl139090	ldd	[SRC + 0x38], %f14
158525cf1a30Sjl139090	fmovd	%f8, %f40
158625cf1a30Sjl139090	ldd	[SRC + VIS_BLOCKSIZE], %f0
158725cf1a30Sjl139090	fmovd	%f10, %f42
158825cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
1589c8a722abSpm145316	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
159025cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
1591c8a722abSpm145316	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1592c8a722abSpm145316	add	SRC, VIS_BLOCKSIZE, SRC
159325cf1a30Sjl139090	cmp	CNT, VIS_BLOCKSIZE + 8
159425cf1a30Sjl139090	bgu,pt	%ncc, 2b
1595c8a722abSpm145316	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
159625cf1a30Sjl139090
	/*
	 * Epilogue: store the pending block, then load the rest of the
	 * final block (its first 8 bytes are already in %f0) and store it.
	 * Unlike the loop, nothing is loaded beyond the final block.
	 */
159725cf1a30Sjl139090	! trailing block
159825cf1a30Sjl139090	ldd	[SRC + 0x08], %f2
159925cf1a30Sjl139090	fsrc1	%f12, %f44
160025cf1a30Sjl139090	ldd	[SRC + 0x10], %f4
160125cf1a30Sjl139090	fsrc1	%f14, %f46
160225cf1a30Sjl139090	stda	%f32, [DST]ASI_BLK_P
160325cf1a30Sjl139090	ldd	[SRC + 0x18], %f6
160425cf1a30Sjl139090	fsrc1	%f0, %f32
160525cf1a30Sjl139090	ldd	[SRC + 0x20], %f8
160625cf1a30Sjl139090	fsrc1	%f2, %f34
160725cf1a30Sjl139090	ldd	[SRC + 0x28], %f10
160825cf1a30Sjl139090	fsrc1	%f4, %f36
160925cf1a30Sjl139090	ldd	[SRC + 0x30], %f12
161025cf1a30Sjl139090	fsrc1	%f6, %f38
161125cf1a30Sjl139090	ldd	[SRC + 0x38], %f14
161225cf1a30Sjl139090	fsrc1	%f8, %f40
161325cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
161425cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
161525cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
161625cf1a30Sjl139090	fsrc1	%f10, %f42
161725cf1a30Sjl139090	fsrc1	%f12, %f44
161825cf1a30Sjl139090	fsrc1	%f14, %f46
161925cf1a30Sjl139090	stda	%f32, [DST]ASI_BLK_P
162025cf1a30Sjl139090
162125cf1a30Sjl139090	membar	#Sync
162225cf1a30Sjl139090
	/* %l0 still holds the %fprs value read on entry. */
162325cf1a30Sjl139090	btst	FPRS_FEF, %l0
162425cf1a30Sjl139090	bz,pt	%icc, 2f
162525cf1a30Sjl139090	  nop
162625cf1a30Sjl139090
162725cf1a30Sjl139090	BLD_FPQ1Q3_FROMSTACK(%l3)
162825cf1a30Sjl139090	ba	3f
162925cf1a30Sjl139090	  nop
163025cf1a30Sjl139090
163125cf1a30Sjl1390902:	FZEROQ1Q3
163225cf1a30Sjl139090
163325cf1a30Sjl1390903:	wr	%l0, 0, %fprs		! restore fprs
163425cf1a30Sjl139090	ret
163525cf1a30Sjl139090	  restore	%g0, 0, %o0
163625cf1a30Sjl139090
163725cf1a30Sjl139090	SET_SIZE(hwblkpagecopy)
163825cf1a30Sjl139090#endif	/* lint */
163925cf1a30Sjl139090
164025cf1a30Sjl139090
164125cf1a30Sjl139090/*
164225cf1a30Sjl139090 * Transfer data to and from user space -
164325cf1a30Sjl139090 * Note that these routines can cause faults
164425cf1a30Sjl139090 * It is assumed that the kernel has nothing at
164525cf1a30Sjl139090 * less than KERNELBASE in the virtual address space.
164625cf1a30Sjl139090 *
164725cf1a30Sjl139090 * Note that copyin(9F) and copyout(9F) are part of the
164825cf1a30Sjl139090 * DDI/DKI which specifies that they return '-1' on "errors."
164925cf1a30Sjl139090 *
165025cf1a30Sjl139090 * Sigh.
165125cf1a30Sjl139090 *
165225cf1a30Sjl139090 * So there are two extremely similar routines - xcopyin() and xcopyout()
165325cf1a30Sjl139090 * which return the errno that we've faithfully computed.  This
165425cf1a30Sjl139090 * allows other callers (e.g. uiomove(9F)) to work correctly.
165525cf1a30Sjl139090 * Given that these are used pretty heavily, we expand the calling
165625cf1a30Sjl139090 * sequences inline for all flavours (rather than making wrappers).
165725cf1a30Sjl139090 *
165825cf1a30Sjl139090 * There are also stub routines for xcopyout_little and xcopyin_little,
165925cf1a30Sjl139090 * which currently are intended to handle requests of <= 16 bytes from
166025cf1a30Sjl139090 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
166125cf1a30Sjl139090 * is left as an exercise...
166225cf1a30Sjl139090 */
166325cf1a30Sjl139090
166425cf1a30Sjl139090/*
166525cf1a30Sjl139090 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
166625cf1a30Sjl139090 *
166725cf1a30Sjl139090 * General theory of operation:
166825cf1a30Sjl139090 *
166925cf1a30Sjl139090 * The only difference between copy{in,out} and
167025cf1a30Sjl139090 * xcopy{in,out} is in the error handling routine they invoke
167125cf1a30Sjl139090 * when a memory access error occurs. xcopyOP returns the errno
167225cf1a30Sjl139090 * while copyOP returns -1 (see above). copy{in,out}_noerr set
167325cf1a30Sjl139090 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
167425cf1a30Sjl139090 * if they are called with a fault handler already in place. That flag
167525cf1a30Sjl139090 * causes the default handlers to trampoline to the previous handler
167625cf1a30Sjl139090 * upon an error.
167725cf1a30Sjl139090 *
167825cf1a30Sjl139090 * None of the copyops routines grab a window until it's decided that
167925cf1a30Sjl139090 * we need to do a HW block copy operation. This saves a window
168025cf1a30Sjl139090 * spill/fill when we're called during socket ops. The typical IO
168125cf1a30Sjl139090 * path won't cause spill/fill traps.
168225cf1a30Sjl139090 *
168325cf1a30Sjl139090 * This code uses a set of 4 limits for the maximum size that will
168425cf1a30Sjl139090 * be copied given a particular input/output address alignment.
168525cf1a30Sjl139090 * If the value for a particular limit is zero, the copy will be performed
168625cf1a30Sjl139090 * by the plain copy loops rather than FPBLK.
168725cf1a30Sjl139090 *
168825cf1a30Sjl139090 * See the description of bcopy above for more details of the
168925cf1a30Sjl139090 * data copying algorithm and the default limits.
169025cf1a30Sjl139090 *
169125cf1a30Sjl139090 */
169225cf1a30Sjl139090
169325cf1a30Sjl139090/*
169425cf1a30Sjl139090 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
169525cf1a30Sjl139090 */
169625cf1a30Sjl139090
169725cf1a30Sjl139090#if defined(lint)
169825cf1a30Sjl139090
169925cf1a30Sjl139090
170025cf1a30Sjl139090#else	/* lint */
170125cf1a30Sjl139090/*
170225cf1a30Sjl139090 * We save the arguments in the following registers in case of a fault:
170325cf1a30Sjl139090 *	kaddr - %l1
170425cf1a30Sjl139090 *	uaddr - %l2
170525cf1a30Sjl139090 *	count - %l3
170625cf1a30Sjl139090 */
170725cf1a30Sjl139090#define SAVE_SRC	%l1
170825cf1a30Sjl139090#define SAVE_DST	%l2
170925cf1a30Sjl139090#define SAVE_COUNT	%l3
171025cf1a30Sjl139090
/*
 * SM_SAVE_* use global/out registers rather than locals.
 * NOTE(review): presumably these are for the small-copy leaf paths,
 * which run without a register window of their own -- confirm against
 * the users of these names.
 */
171125cf1a30Sjl139090#define SM_SAVE_SRC		%g4
171225cf1a30Sjl139090#define SM_SAVE_DST		%g5
171325cf1a30Sjl139090#define SM_SAVE_COUNT		%o5
171425cf1a30Sjl139090#define ERRNO		%l5
171525cf1a30Sjl139090
171625cf1a30Sjl139090
171725cf1a30Sjl139090#define REAL_LOFAULT	%l4
171825cf1a30Sjl139090/*
171925cf1a30Sjl139090 * Generic copyio fault handler.  This is the first line of defense when a
172025cf1a30Sjl139090 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
172125cf1a30Sjl139090 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
172225cf1a30Sjl139090 * This allows us to share common code for all the flavors of the copy
172325cf1a30Sjl139090 * operations, including the _noerr versions.
172425cf1a30Sjl139090 *
172525cf1a30Sjl139090 * Note that this function will restore the original input parameters before
172625cf1a30Sjl139090 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
172725cf1a30Sjl139090 * member of the t_copyop structure, if needed.
172825cf1a30Sjl139090 */
/*
 * Registers read here: %g1 (copied to ERRNO), %l6 (saved t_lofault value
 * plus flag bits), SAVE_SRC/SAVE_DST/SAVE_COUNT and REAL_LOFAULT.
 * The routine does not return to its caller; it ends with a jmp to
 * REAL_LOFAULT.
 */
172925cf1a30Sjl139090	ENTRY(copyio_fault)
173025cf1a30Sjl139090	membar	#Sync
173125cf1a30Sjl139090	mov	%g1,ERRNO			! save errno in ERRNO
	/*
	 * FP state is only restored if FPUSED_FLAG is set in %l6;
	 * otherwise skip straight to 1:.
	 */
173225cf1a30Sjl139090	btst	FPUSED_FLAG, %l6
173325cf1a30Sjl139090	bz	%ncc, 1f
173425cf1a30Sjl139090	  nop
173525cf1a30Sjl139090
173625cf1a30Sjl139090	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
173725cf1a30Sjl139090	wr	%o2, 0, %gsr    	! restore gsr
173825cf1a30Sjl139090
	/*
	 * If the saved %fprs had FPRS_FEF set, run BLD_FPQ2Q4_FROMSTACK;
	 * otherwise run FZEROQ2Q4.  Either way the saved %fprs value is
	 * written back.
	 */
173925cf1a30Sjl139090	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
174025cf1a30Sjl139090	btst	FPRS_FEF, %o3
174125cf1a30Sjl139090	bz,pt	%icc, 4f
174225cf1a30Sjl139090	  nop
174325cf1a30Sjl139090
174425cf1a30Sjl139090	BLD_FPQ2Q4_FROMSTACK(%o2)
174525cf1a30Sjl139090
174625cf1a30Sjl139090	ba,pt	%ncc, 1f
174725cf1a30Sjl139090	  wr	%o3, 0, %fprs   	! restore fprs
174825cf1a30Sjl139090
174925cf1a30Sjl1390904:
175025cf1a30Sjl139090	FZEROQ2Q4
175125cf1a30Sjl139090	wr	%o3, 0, %fprs   	! restore fprs
175225cf1a30Sjl139090
	/*
	 * Only FPUSED_FLAG is cleared from %l6 before it is stored to
	 * t_lofault; any other bits in %l6 are written back unchanged.
	 */
175325cf1a30Sjl1390901:
175425cf1a30Sjl139090	andn	%l6, FPUSED_FLAG, %l6
175525cf1a30Sjl139090	membar	#Sync
175625cf1a30Sjl139090	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
175725cf1a30Sjl139090	FP_ALLOWMIGRATE(5, 6)
175825cf1a30Sjl139090
	/*
	 * Put the saved arguments back in %i0-%i2 and jump to the real
	 * handler; the last mov executes in the jmp delay slot.
	 */
175925cf1a30Sjl139090	mov	SAVE_SRC, %i0
176025cf1a30Sjl139090	mov	SAVE_DST, %i1
176125cf1a30Sjl139090	jmp	REAL_LOFAULT
176225cf1a30Sjl139090	  mov	SAVE_COUNT, %i2
176325cf1a30Sjl139090
176425cf1a30Sjl139090	SET_SIZE(copyio_fault)
176525cf1a30Sjl139090
176625cf1a30Sjl139090
176725cf1a30Sjl139090#endif
176825cf1a30Sjl139090
176925cf1a30Sjl139090#if defined(lint)
177025cf1a30Sjl139090
177125cf1a30Sjl139090/*ARGSUSED*/
177225cf1a30Sjl139090int
177325cf1a30Sjl139090copyout(const void *kaddr, void *uaddr, size_t count)
177425cf1a30Sjl139090{ return (0); }
177525cf1a30Sjl139090
177625cf1a30Sjl139090#else	/* lint */
177725cf1a30Sjl139090
177825cf1a30Sjl139090	ENTRY(copyout)
177925cf1a30Sjl139090
178025cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
178125cf1a30Sjl139090	bleu,pt	%ncc, .copyout_small		! go to larger cases
178225cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
178325cf1a30Sjl139090	btst	7, %o3				!
178425cf1a30Sjl139090	bz,pt	%ncc, .copyout_8		! check for longword alignment
178525cf1a30Sjl139090	  nop
178625cf1a30Sjl139090	btst	1, %o3				!
178725cf1a30Sjl139090	bz,pt	%ncc, .copyout_2		! check for half-word
178825cf1a30Sjl139090	  nop
178925cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
179025cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
179125cf1a30Sjl139090	tst	%o3
179225cf1a30Sjl139090	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
179325cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
179425cf1a30Sjl139090	bleu,pt	%ncc, .copyout_small		! go to small copy
179525cf1a30Sjl139090	  nop
179625cf1a30Sjl139090	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
179725cf1a30Sjl139090	  nop
179825cf1a30Sjl139090.copyout_2:
179925cf1a30Sjl139090	btst	3, %o3				!
180025cf1a30Sjl139090	bz,pt	%ncc, .copyout_4		! check for word alignment
180125cf1a30Sjl139090	  nop
180225cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
180325cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
180425cf1a30Sjl139090	tst	%o3
180525cf1a30Sjl139090	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
180625cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
180725cf1a30Sjl139090	bleu,pt	%ncc, .copyout_small		! go to small copy
180825cf1a30Sjl139090	  nop
180925cf1a30Sjl139090	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
181025cf1a30Sjl139090	  nop
181125cf1a30Sjl139090.copyout_4:
181225cf1a30Sjl139090	! already checked longword, must be word aligned
181325cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
181425cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
181525cf1a30Sjl139090	tst	%o3
181625cf1a30Sjl139090	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
181725cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
181825cf1a30Sjl139090	bleu,pt	%ncc, .copyout_small		! go to small copy
181925cf1a30Sjl139090	  nop
182025cf1a30Sjl139090	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
182125cf1a30Sjl139090	  nop
182225cf1a30Sjl139090.copyout_8:
182325cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
182425cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
182525cf1a30Sjl139090	tst	%o3
182625cf1a30Sjl139090	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
182725cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
182825cf1a30Sjl139090	bleu,pt	%ncc, .copyout_small		! go to small copy
182925cf1a30Sjl139090	  nop
183025cf1a30Sjl139090	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
183125cf1a30Sjl139090	  nop
183225cf1a30Sjl139090
183325cf1a30Sjl139090	.align	16
183425cf1a30Sjl139090	nop				! instruction alignment
183525cf1a30Sjl139090					! see discussion at start of file
/*
 * Small-copy path of copyout, run as a leaf routine on the caller
 * register window.  The .sm_do_copyout label is also entered from
 * .xcopyout_small, which installs its own fault handler first.
 *
 * Register use visible in this section:
 *   %o0  source pointer, read with ordinary loads
 *   %o1  destination pointer, written through ASI_USER stores
 *   %o2  remaining byte count
 *   %o3  scratch for the datum being moved
 *   %o4  t_lofault value that was in place on entry
 *   %o5  address of the handler installed in t_lofault
 *
 * The original arguments are parked in SM_SAVE_SRC, SM_SAVE_DST and
 * SM_SAVE_COUNT so the fault handler can hand them to a copyops
 * routine.  Every successful exit issues membar #Sync, stores %o4
 * back to t_lofault and returns 0 in %o0.
 *
 * Dispatch: count <= SHORTCOPY goes to .co_sm_left (at most three
 * bytes are moved there); count > CHKSIZE goes to .co_med; otherwise
 * word-aligned pointers use .co_sm_word and anything else uses the
 * four-bytes-per-pass loop at .co_sm_notalign4.
 * NOTE(review): SHORTCOPY, CHKSIZE and the SM_SAVE_* registers are
 * defined outside this section - confirm their values there.
 */
183625cf1a30Sjl139090.copyout_small:
183725cf1a30Sjl139090	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
183825cf1a30Sjl139090	or	%o5, %lo(.sm_copyout_err), %o5
183925cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
184025cf1a30Sjl139090	membar	#Sync				! sync error barrier
184125cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
184225cf1a30Sjl139090.sm_do_copyout:
184325cf1a30Sjl139090	mov	%o0, SM_SAVE_SRC
184425cf1a30Sjl139090	mov	%o1, SM_SAVE_DST
184525cf1a30Sjl139090	cmp	%o2, SHORTCOPY		! check for really short case
184625cf1a30Sjl139090	bleu,pt	%ncc, .co_sm_left	!
184725cf1a30Sjl139090	  mov	%o2, SM_SAVE_COUNT
184825cf1a30Sjl139090	cmp	%o2, CHKSIZE		! check for medium length cases
184925cf1a30Sjl139090	bgu,pn	%ncc, .co_med		!
185025cf1a30Sjl139090	  or	%o0, %o1, %o3		! prepare alignment check
/*
 * The bz below is not annulled, so its delay slot (the sub at
 * .co_sm_movebytes) also runs when the branch to .co_sm_word is
 * taken: .co_sm_word starts with the count already reduced by 3.
 */
185125cf1a30Sjl139090	andcc	%o3, 0x3, %g0		! test for alignment
185225cf1a30Sjl139090	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
185325cf1a30Sjl139090.co_sm_movebytes:
185425cf1a30Sjl139090	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
185525cf1a30Sjl139090.co_sm_notalign4:
185625cf1a30Sjl139090	ldub	[%o0], %o3		! read byte
185725cf1a30Sjl139090	subcc	%o2, 4, %o2		! reduce count by 4
185825cf1a30Sjl139090	stba	%o3, [%o1]ASI_USER	! write byte
185925cf1a30Sjl139090	inc	%o1			! advance DST by 1
186025cf1a30Sjl139090	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
186125cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
186225cf1a30Sjl139090	stba	%o3, [%o1]ASI_USER
186325cf1a30Sjl139090	inc	%o1			! advance DST by 1
186425cf1a30Sjl139090	ldub	[%o0 - 2], %o3
186525cf1a30Sjl139090	stba	%o3, [%o1]ASI_USER
186625cf1a30Sjl139090	inc	%o1			! advance DST by 1
186725cf1a30Sjl139090	ldub	[%o0 - 1], %o3
186825cf1a30Sjl139090	stba	%o3, [%o1]ASI_USER
186925cf1a30Sjl139090	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
187025cf1a30Sjl139090	  inc	%o1			! advance DST by 1
187125cf1a30Sjl139090	add	%o2, 3, %o2		! restore count
/*
 * Tail: move the remaining bytes one at a time (at most three) and
 * return 0.  The third byte has its own inline exit sequence rather
 * than branching to .co_sm_exit.
 */
187225cf1a30Sjl139090.co_sm_left:
187325cf1a30Sjl139090	tst	%o2
187425cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit	! check for zero length
187525cf1a30Sjl139090	  nop
187625cf1a30Sjl139090	ldub	[%o0], %o3		! load one byte
187725cf1a30Sjl139090	deccc	%o2			! reduce count for cc test
187825cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
187925cf1a30Sjl139090	  stba	%o3,[%o1]ASI_USER	! store one byte
188025cf1a30Sjl139090	ldub	[%o0 + 1], %o3		! load second byte
188125cf1a30Sjl139090	deccc	%o2
188225cf1a30Sjl139090	inc	%o1
188325cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
188425cf1a30Sjl139090	  stba	%o3,[%o1]ASI_USER	! store second byte
188525cf1a30Sjl139090	ldub	[%o0 + 2], %o3		! load third byte
188625cf1a30Sjl139090	inc	%o1
188725cf1a30Sjl139090	stba	%o3,[%o1]ASI_USER	! store third byte
188825cf1a30Sjl139090	membar	#Sync				! sync error barrier
188925cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
189025cf1a30Sjl139090	retl
189125cf1a30Sjl139090	  mov	%g0, %o0		! return 0
189225cf1a30Sjl139090	.align	16
/*
 * Word loop: two 32-bit words per pass.  First entry is at
 * .co_sm_wordx (from .co_sm_word) with the first word already in %o3;
 * later passes re-enter at .co_sm_words, which reloads %o3.
 * After the loop the count is restored and what is left is finished
 * by .co_sm_half (halfwords) and/or .co_sm_byte (one byte).
 * .co_sm_half labels the delay-slot subcc, so jumps from the medium
 * paths land directly on the count adjustment.
 */
189325cf1a30Sjl139090.co_sm_words:
189425cf1a30Sjl139090	lduw	[%o0], %o3		! read word
189525cf1a30Sjl139090.co_sm_wordx:
189625cf1a30Sjl139090	subcc	%o2, 8, %o2		! update count
189725cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER	! write word
189825cf1a30Sjl139090	add	%o0, 8, %o0		! update SRC
189925cf1a30Sjl139090	lduw	[%o0 - 4], %o3		! read word
190025cf1a30Sjl139090	add	%o1, 4, %o1		! update DST
190125cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER	! write word
190225cf1a30Sjl139090	bgt,pt	%ncc, .co_sm_words	! loop til done
190325cf1a30Sjl139090	  add	%o1, 4, %o1		! update DST
190425cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore count
190525cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
190625cf1a30Sjl139090	  nop
190725cf1a30Sjl139090	deccc	%o2
190825cf1a30Sjl139090	bz,pt	%ncc, .co_sm_byte
190925cf1a30Sjl139090.co_sm_half:
191025cf1a30Sjl139090	  subcc	%o2, 2, %o2		! reduce count by 2
191125cf1a30Sjl139090	lduh	[%o0], %o3		! read half word
191225cf1a30Sjl139090	add	%o0, 2, %o0		! advance SRC by 2
191325cf1a30Sjl139090	stha	%o3, [%o1]ASI_USER	! write half word
191425cf1a30Sjl139090	bgt,pt	%ncc, .co_sm_half	! loop til done
191525cf1a30Sjl139090	  add	%o1, 2, %o1		! advance DST by 2
191625cf1a30Sjl139090	addcc	%o2, 1, %o2		! restore count
191725cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
191825cf1a30Sjl139090	  nop
191925cf1a30Sjl139090.co_sm_byte:
192025cf1a30Sjl139090	ldub	[%o0], %o3
192125cf1a30Sjl139090	stba	%o3, [%o1]ASI_USER
192225cf1a30Sjl139090	membar	#Sync				! sync error barrier
192325cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
192425cf1a30Sjl139090	retl
192525cf1a30Sjl139090	  mov	%g0, %o0		! return 0
192625cf1a30Sjl139090	.align 16
/*
 * Word-aligned entry (source and destination both multiples of 4).
 * If more than one word remains the lduw in the delay slot feeds
 * .co_sm_wordx; otherwise a single word is stored and up to three
 * trailing bytes follow before falling into the common exit.
 */
192725cf1a30Sjl139090.co_sm_word:
192825cf1a30Sjl139090	subcc	%o2, 4, %o2		! update count
192925cf1a30Sjl139090	bgt,pt	%ncc, .co_sm_wordx
193025cf1a30Sjl139090	  lduw	[%o0], %o3		! read word
193125cf1a30Sjl139090	addcc	%o2, 3, %o2		! restore count
193225cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
193325cf1a30Sjl139090	  stwa	%o3, [%o1]ASI_USER	! write word
193425cf1a30Sjl139090	deccc	%o2			! reduce count for cc test
193525cf1a30Sjl139090	ldub	[%o0 + 4], %o3		! load one byte
193625cf1a30Sjl139090	add	%o1, 4, %o1
193725cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
193825cf1a30Sjl139090	  stba	%o3, [%o1]ASI_USER	! store one byte
193925cf1a30Sjl139090	ldub	[%o0 + 5], %o3		! load second byte
194025cf1a30Sjl139090	deccc	%o2
194125cf1a30Sjl139090	inc	%o1
194225cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
194325cf1a30Sjl139090	  stba	%o3, [%o1]ASI_USER	! store second byte
194425cf1a30Sjl139090	ldub	[%o0 + 6], %o3		! load third byte
194525cf1a30Sjl139090	inc	%o1
194625cf1a30Sjl139090	stba	%o3, [%o1]ASI_USER	! store third byte
/* Common success exit: put the saved handler back and return 0. */
194725cf1a30Sjl139090.co_sm_exit:
194825cf1a30Sjl139090	  membar	#Sync				! sync error barrier
194925cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
195025cf1a30Sjl139090	retl
195125cf1a30Sjl139090	  mov	%g0, %o0		! return 0
195225cf1a30Sjl139090
195325cf1a30Sjl139090	.align 16
/*
 * Medium-length copyout, reached from .sm_do_copyout when the count
 * exceeds CHKSIZE.  The xor of source and destination selects the
 * widest unit to which both pointers can be aligned together:
 *   bit 0 set                  -> byte loop at .co_sm_movebytes
 *   bit 1 set (bit 0 clear)    -> halfword loop at .co_med_half
 *   bit 2 set (bits 0-1 clear) -> word loop at .co_med_word
 *   low three bits all clear   -> 8-byte loop below (.co_med_long)
 *
 * .co_med_long copies single bytes until the source is word aligned,
 * then one word if it is not yet 8-byte aligned, then 32 bytes per
 * pass (.co_med_lmove) followed by 8 bytes per pass (.co_med_lword).
 * The count is kept biased (by 31, then by 7) so the subcc result
 * can drive the loop branch directly.  Whatever remains is handed
 * to .co_sm_exit, .co_sm_byte or .co_sm_half in the small path.
 */
195425cf1a30Sjl139090.co_med:
195525cf1a30Sjl139090	xor	%o0, %o1, %o3		! setup alignment check
195625cf1a30Sjl139090	btst	1, %o3
195725cf1a30Sjl139090	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
195825cf1a30Sjl139090	  nop
195925cf1a30Sjl139090	btst	3, %o3
196025cf1a30Sjl139090	bnz,pt	%ncc, .co_med_half	! halfword aligned
196125cf1a30Sjl139090	  nop
196225cf1a30Sjl139090	btst	7, %o3
196325cf1a30Sjl139090	bnz,pt	%ncc, .co_med_word	! word aligned
196425cf1a30Sjl139090	  nop
196525cf1a30Sjl139090.co_med_long:
196625cf1a30Sjl139090	btst	3, %o0			! check for
196725cf1a30Sjl139090	bz,pt	%ncc, .co_med_long1	! word alignment
196825cf1a30Sjl139090	  nop
196925cf1a30Sjl139090.co_med_long0:
197025cf1a30Sjl139090	ldub	[%o0], %o3		! load one byte
197125cf1a30Sjl139090	inc	%o0
197225cf1a30Sjl139090	stba	%o3,[%o1]ASI_USER	! store byte
197325cf1a30Sjl139090	inc	%o1
197425cf1a30Sjl139090	btst	3, %o0
197525cf1a30Sjl139090	bnz,pt	%ncc, .co_med_long0
197625cf1a30Sjl139090	  dec	%o2
197725cf1a30Sjl139090.co_med_long1:			! word aligned
197825cf1a30Sjl139090	btst	7, %o0			! check for long word
197925cf1a30Sjl139090	bz,pt	%ncc, .co_med_long2
198025cf1a30Sjl139090	  nop
198125cf1a30Sjl139090	lduw	[%o0], %o3		! load word
198225cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
198325cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER	! store word
198425cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
198525cf1a30Sjl139090	sub	%o2, 4, %o2		! reduce count by 4
198625cf1a30Sjl139090!
198725cf1a30Sjl139090!  Now long word aligned and have at least 32 bytes to move
198825cf1a30Sjl139090!
198925cf1a30Sjl139090.co_med_long2:
199025cf1a30Sjl139090	sub	%o2, 31, %o2		! adjust count to allow cc zero test
199125cf1a30Sjl139090	sub	%o1, 8, %o1		! adjust pointer to allow store in
199225cf1a30Sjl139090					! branch delay slot instead of add
199325cf1a30Sjl139090.co_med_lmove:
199425cf1a30Sjl139090	add	%o1, 8, %o1		! advance DST by 8
199525cf1a30Sjl139090	ldx	[%o0], %o3		! read long word
199625cf1a30Sjl139090	subcc	%o2, 32, %o2		! reduce count by 32
199725cf1a30Sjl139090	stxa	%o3, [%o1]ASI_USER	! write long word
199825cf1a30Sjl139090	add	%o1, 8, %o1		! advance DST by 8
199925cf1a30Sjl139090	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
200025cf1a30Sjl139090	add	%o0, 32, %o0		! advance SRC by 32
200125cf1a30Sjl139090	stxa	%o3, [%o1]ASI_USER
200225cf1a30Sjl139090	ldx	[%o0 - 16], %o3
200325cf1a30Sjl139090	add	%o1, 8, %o1		! advance DST by 8
200425cf1a30Sjl139090	stxa	%o3, [%o1]ASI_USER
200525cf1a30Sjl139090	ldx	[%o0 - 8], %o3
200625cf1a30Sjl139090	add	%o1, 8, %o1		! advance DST by 8
200725cf1a30Sjl139090	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
200825cf1a30Sjl139090	  stxa	%o3, [%o1]ASI_USER
200925cf1a30Sjl139090	add	%o1, 8, %o1		! advance DST by 8
201025cf1a30Sjl139090	addcc	%o2, 24, %o2		! restore count to long word offset
201125cf1a30Sjl139090	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
201225cf1a30Sjl139090	  nop
201325cf1a30Sjl139090.co_med_lword:
201425cf1a30Sjl139090	ldx	[%o0], %o3		! read long word
201525cf1a30Sjl139090	subcc	%o2, 8, %o2		! reduce count by 8
201625cf1a30Sjl139090	stxa	%o3, [%o1]ASI_USER	! write long word
201725cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
201825cf1a30Sjl139090	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
201925cf1a30Sjl139090	  add	%o1, 8, %o1		! advance DST by 8
/*
 * Remainder dispatch.  The deccc sits in the delay slot of the first
 * bz, so it always executes; its condition codes are what the second
 * bz tests (exactly one byte left -> .co_sm_byte).
 */
202025cf1a30Sjl139090.co_med_lextra:
202125cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore rest of count
202225cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit	! if zero, then done
202325cf1a30Sjl139090	  deccc	%o2
202425cf1a30Sjl139090	bz,pt	%ncc, .co_sm_byte
202525cf1a30Sjl139090	  nop
202625cf1a30Sjl139090	ba,pt	%ncc, .co_sm_half
202725cf1a30Sjl139090	  nop
202825cf1a30Sjl139090
202925cf1a30Sjl139090	.align 16
203025cf1a30Sjl139090	nop				! instruction alignment
203125cf1a30Sjl139090					! see discussion at start of file
/*
 * Medium-length copyout for pointers that agree in their low two
 * bits but differ in bit 2 (see the dispatch at .co_med).
 * Single bytes are copied until the source is word aligned, then
 * four words per pass (.co_med_wmove, count biased by 15), then one
 * word per pass (.co_med_word2, count biased by 3).  The remainder
 * goes to .co_sm_exit, .co_sm_byte or .co_sm_half; as in the 8-byte
 * path, the deccc in the first bz delay slot always executes.
 */
203225cf1a30Sjl139090.co_med_word:
203325cf1a30Sjl139090	btst	3, %o0			! check for
203425cf1a30Sjl139090	bz,pt	%ncc, .co_med_word1	! word alignment
203525cf1a30Sjl139090	  nop
203625cf1a30Sjl139090.co_med_word0:
203725cf1a30Sjl139090	ldub	[%o0], %o3		! load one byte
203825cf1a30Sjl139090	inc	%o0
203925cf1a30Sjl139090	stba	%o3,[%o1]ASI_USER	! store byte
204025cf1a30Sjl139090	inc	%o1
204125cf1a30Sjl139090	btst	3, %o0
204225cf1a30Sjl139090	bnz,pt	%ncc, .co_med_word0
204325cf1a30Sjl139090	  dec	%o2
204425cf1a30Sjl139090!
204525cf1a30Sjl139090!  Now word aligned and have at least 36 bytes to move
204625cf1a30Sjl139090!
204725cf1a30Sjl139090.co_med_word1:
204825cf1a30Sjl139090	sub	%o2, 15, %o2		! adjust count to allow cc zero test
204925cf1a30Sjl139090.co_med_wmove:
205025cf1a30Sjl139090	lduw	[%o0], %o3		! read word
205125cf1a30Sjl139090	subcc	%o2, 16, %o2		! reduce count by 16
205225cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER	! write word
205325cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
205425cf1a30Sjl139090	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
205525cf1a30Sjl139090	add	%o0, 16, %o0		! advance SRC by 16
205625cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER
205725cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
205825cf1a30Sjl139090	lduw	[%o0 - 8], %o3
205925cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER
206025cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
206125cf1a30Sjl139090	lduw	[%o0 - 4], %o3
206225cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER
206325cf1a30Sjl139090	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
206425cf1a30Sjl139090	  add	%o1, 4, %o1		! advance DST by 4
206525cf1a30Sjl139090	addcc	%o2, 12, %o2		! restore count to word offset
206625cf1a30Sjl139090	ble,pt	%ncc, .co_med_wextra	! check for more words to move
206725cf1a30Sjl139090	  nop
206825cf1a30Sjl139090.co_med_word2:
206925cf1a30Sjl139090	lduw	[%o0], %o3		! read word
207025cf1a30Sjl139090	subcc	%o2, 4, %o2		! reduce count by 4
207125cf1a30Sjl139090	stwa	%o3, [%o1]ASI_USER	! write word
207225cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
207325cf1a30Sjl139090	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
207425cf1a30Sjl139090	  add	%o1, 4, %o1		! advance DST by 4
207525cf1a30Sjl139090.co_med_wextra:
207625cf1a30Sjl139090	addcc	%o2, 3, %o2		! restore rest of count
207725cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit	! if zero, then done
207825cf1a30Sjl139090	  deccc	%o2
207925cf1a30Sjl139090	bz,pt	%ncc, .co_sm_byte
208025cf1a30Sjl139090	  nop
208125cf1a30Sjl139090	ba,pt	%ncc, .co_sm_half
208225cf1a30Sjl139090	  nop
208325cf1a30Sjl139090
208425cf1a30Sjl139090	.align 16
208525cf1a30Sjl139090	nop				! instruction alignment
208625cf1a30Sjl139090	nop				! see discussion at start of file
208725cf1a30Sjl139090	nop
/*
 * Medium-length copyout for pointers that agree in bit 0 but differ
 * in bit 1 (see the dispatch at .co_med).
 * One byte is copied first if the source address is odd, then four
 * halfwords per pass (.co_med_hmove, count biased by 7).  The
 * remainder goes to .co_sm_exit, .co_sm_byte or .co_sm_half; the
 * deccc in the first bz delay slot always executes and feeds the
 * second bz.
 */
208825cf1a30Sjl139090.co_med_half:
208925cf1a30Sjl139090	btst	1, %o0			! check for
209025cf1a30Sjl139090	bz,pt	%ncc, .co_med_half1	! half word alignment
209125cf1a30Sjl139090	  nop
209225cf1a30Sjl139090	ldub	[%o0], %o3		! load one byte
209325cf1a30Sjl139090	inc	%o0
209425cf1a30Sjl139090	stba	%o3,[%o1]ASI_USER	! store byte
209525cf1a30Sjl139090	inc	%o1
209625cf1a30Sjl139090	dec	%o2
209725cf1a30Sjl139090!
209825cf1a30Sjl139090!  Now half word aligned and have at least 38 bytes to move
209925cf1a30Sjl139090!
210025cf1a30Sjl139090.co_med_half1:
210125cf1a30Sjl139090	sub	%o2, 7, %o2		! adjust count to allow cc zero test
210225cf1a30Sjl139090.co_med_hmove:
210325cf1a30Sjl139090	lduh	[%o0], %o3		! read half word
210425cf1a30Sjl139090	subcc	%o2, 8, %o2		! reduce count by 8
210525cf1a30Sjl139090	stha	%o3, [%o1]ASI_USER	! write half word
210625cf1a30Sjl139090	add	%o1, 2, %o1		! advance DST by 2
210725cf1a30Sjl139090	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
210825cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
210925cf1a30Sjl139090	stha	%o3, [%o1]ASI_USER
211025cf1a30Sjl139090	add	%o1, 2, %o1		! advance DST by 2
211125cf1a30Sjl139090	lduh	[%o0 - 4], %o3
211225cf1a30Sjl139090	stha	%o3, [%o1]ASI_USER
211325cf1a30Sjl139090	add	%o1, 2, %o1		! advance DST by 2
211425cf1a30Sjl139090	lduh	[%o0 - 2], %o3
211525cf1a30Sjl139090	stha	%o3, [%o1]ASI_USER
211625cf1a30Sjl139090	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
211725cf1a30Sjl139090	  add	%o1, 2, %o1		! advance DST by 2
211825cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore count
211925cf1a30Sjl139090	bz,pt	%ncc, .co_sm_exit
212025cf1a30Sjl139090	  deccc	%o2
212125cf1a30Sjl139090	bz,pt	%ncc, .co_sm_byte
212225cf1a30Sjl139090	  nop
212325cf1a30Sjl139090	ba,pt	%ncc, .co_sm_half
212425cf1a30Sjl139090	  nop
212525cf1a30Sjl139090
212625cf1a30Sjl139090/*
212725cf1a30Sjl139090 * We got here because of a fault during short copyout.
212825cf1a30Sjl139090 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
212925cf1a30Sjl139090 */
/*
 * Fault handler installed in t_lofault by .copyout_small.
 * It restores the previous t_lofault value from %o4 and reloads the
 * original arguments from SM_SAVE_SRC, SM_SAVE_DST and SM_SAVE_COUNT
 * into %o0-%o2.  If the thread has a non-NULL t_copyops pointer,
 * control jumps to its CP_COPYOUT entry; because this is a leaf
 * routine and the return address is untouched, that routine returns
 * straight to the caller of copyout.  Otherwise -1 is returned.
 */
213025cf1a30Sjl139090.sm_copyout_err:
213125cf1a30Sjl139090	membar	#Sync
213225cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
213325cf1a30Sjl139090	mov	SM_SAVE_SRC, %o0
213425cf1a30Sjl139090	mov	SM_SAVE_DST, %o1
213525cf1a30Sjl139090	mov	SM_SAVE_COUNT, %o2
213625cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
213725cf1a30Sjl139090	tst	%o3
213825cf1a30Sjl139090	bz,pt	%ncc, 3f			! if not, return error
213925cf1a30Sjl139090	  nop
214025cf1a30Sjl139090	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
214125cf1a30Sjl139090	jmp	%o5				! original arguments
214225cf1a30Sjl139090	  nop
214325cf1a30Sjl1390903:
214425cf1a30Sjl139090	retl
214525cf1a30Sjl139090	  or	%g0, -1, %o0		! return error value
214625cf1a30Sjl139090
214725cf1a30Sjl139090	SET_SIZE(copyout)
214825cf1a30Sjl139090
214925cf1a30Sjl139090/*
215025cf1a30Sjl139090 * The _more entry points are not intended to be used directly by
215125cf1a30Sjl139090 * any caller from outside this file.  They are provided to allow
215225cf1a30Sjl139090 * profiling and dtrace of the portions of the copy code that uses
215325cf1a30Sjl139090 * the floating point registers.
215425cf1a30Sjl139090 * This entry is particularly important as DTRACE (at least as of
215525cf1a30Sjl139090 * 4/2004) does not support leaf functions.
215625cf1a30Sjl139090 */
215725cf1a30Sjl139090
/*
 * copyout_more: large-copy path of copyout that moves data with VIS
 * block stores and therefore borrows the floating point registers.
 * It saves a register window, so the arguments arrive in %i0-%i2 and
 * are also kept in SAVE_SRC, SAVE_DST and SAVE_COUNT.
 *
 * Outline:
 *   1. Record .copyout_err in REAL_LOFAULT, install copyio_fault as
 *      t_lofault and keep the previous handler in %l6.
 *   2. Save %fprs and %gsr in the stack frame.  When FPRS_FEF was
 *      already set the BST_FPQ2Q4_TOSTACK macro runs first; otherwise
 *      the annulled delay slot just turns FPRS_FEF on.
 *   3. Copy single bytes until DST is a multiple of VIS_BLOCKSIZE.
 *   4. Stream VIS_BLOCKSIZE-byte blocks: ldd from the 8-byte aligned
 *      SRC, faligndata into %f48-%f62, stda through ASI_BLK_AIUS.
 *   5. Finish the tail, restore %gsr, %fprs and t_lofault, return 0.
 *
 * The .do_copyout label is shared with xcopyout, which arrives from
 * .xcopyout_more with its own REAL_LOFAULT value.
 * NOTE(review): copyio_fault, the FP_*, BST_*, BLD_* and FZEROQ2Q4
 * macros and the SRC/DST/CNT/TMP/REALSRC register aliases are defined
 * outside this section - confirm their exact behavior there.
 */
215825cf1a30Sjl139090	ENTRY(copyout_more)
215925cf1a30Sjl139090.copyout_more:
216025cf1a30Sjl139090	prefetch [%o0], #n_reads
216125cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
216225cf1a30Sjl139090	set	.copyout_err, REAL_LOFAULT
216325cf1a30Sjl139090
216425cf1a30Sjl139090/*
216525cf1a30Sjl139090 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
216625cf1a30Sjl139090 */
216725cf1a30Sjl139090.do_copyout:
216825cf1a30Sjl139090        set     copyio_fault, %l7		! .copyio_fault is lofault val
216925cf1a30Sjl139090
217025cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
217125cf1a30Sjl139090	membar	#Sync				! sync error barrier
217225cf1a30Sjl139090	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
217325cf1a30Sjl139090
217425cf1a30Sjl139090	mov	%i0, SAVE_SRC
217525cf1a30Sjl139090	mov	%i1, SAVE_DST
217625cf1a30Sjl139090	mov	%i2, SAVE_COUNT
217725cf1a30Sjl139090
217825cf1a30Sjl139090	FP_NOMIGRATE(6, 7)
217925cf1a30Sjl139090
/*
 * bz,a: the wr in the delay slot runs only when the branch is taken,
 * that is when FPRS_FEF was clear.  In that case the FP save macro
 * below is skipped entirely.
 */
218025cf1a30Sjl139090	rd	%fprs, %o2		! check for unused fp
218125cf1a30Sjl139090	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
218225cf1a30Sjl139090	btst	FPRS_FEF, %o2
218325cf1a30Sjl139090	bz,a,pt	%icc, .do_blockcopyout
218425cf1a30Sjl139090	  wr	%g0, FPRS_FEF, %fprs
218525cf1a30Sjl139090
218625cf1a30Sjl139090	BST_FPQ2Q4_TOSTACK(%o2)
218725cf1a30Sjl139090
218825cf1a30Sjl139090.do_blockcopyout:
218925cf1a30Sjl139090	rd	%gsr, %o2
219025cf1a30Sjl139090	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
219125cf1a30Sjl139090	or	%l6, FPUSED_FLAG, %l6
219225cf1a30Sjl139090
/*
 * TMP = (-DST) mod VIS_BLOCKSIZE, the byte count needed to bring DST
 * to a block boundary (zero skips straight to label 2).  Those bytes
 * are moved four per pass at .co_blkalign and singly at label 1.
 */
219325cf1a30Sjl139090	andcc	DST, VIS_BLOCKSIZE - 1, TMP
219425cf1a30Sjl139090	mov	ASI_USER, %asi
219525cf1a30Sjl139090	bz,pt	%ncc, 2f
219625cf1a30Sjl139090	  neg	TMP
219725cf1a30Sjl139090	add	TMP, VIS_BLOCKSIZE, TMP
219825cf1a30Sjl139090
219925cf1a30Sjl139090	! TMP = bytes required to align DST on FP_BLOCK boundary
220025cf1a30Sjl139090	! Using SRC as a tmp here
220125cf1a30Sjl139090	cmp	TMP, 3
220225cf1a30Sjl139090	bleu,pt	%ncc, 1f
220325cf1a30Sjl139090	  sub	CNT,TMP,CNT		! adjust main count
220425cf1a30Sjl139090	sub	TMP, 3, TMP		! adjust for end of loop test
220525cf1a30Sjl139090.co_blkalign:
220625cf1a30Sjl139090	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
220725cf1a30Sjl139090	stba	SRC, [DST]%asi
220825cf1a30Sjl139090	subcc	TMP, 4, TMP
220925cf1a30Sjl139090	ldub	[REALSRC + 1], SRC
221025cf1a30Sjl139090	add	REALSRC, 4, REALSRC
221125cf1a30Sjl139090	stba	SRC, [DST + 1]%asi
221225cf1a30Sjl139090	ldub	[REALSRC - 2], SRC
221325cf1a30Sjl139090	add	DST, 4, DST
221425cf1a30Sjl139090	stba	SRC, [DST - 2]%asi
221525cf1a30Sjl139090	ldub	[REALSRC - 1], SRC
221625cf1a30Sjl139090	bgu,pt	%ncc, .co_blkalign
221725cf1a30Sjl139090	  stba	SRC, [DST - 1]%asi
221825cf1a30Sjl139090
221925cf1a30Sjl139090	addcc	TMP, 3, TMP		! restore count adjustment
222025cf1a30Sjl139090	bz,pt	%ncc, 2f		! no bytes left?
222125cf1a30Sjl139090	  nop
222225cf1a30Sjl1390901:	ldub	[REALSRC], SRC
222325cf1a30Sjl139090	inc	REALSRC
222425cf1a30Sjl139090	inc	DST
222525cf1a30Sjl139090	deccc	TMP
222625cf1a30Sjl139090	bgu	%ncc, 1b
222725cf1a30Sjl139090	  stba	SRC, [DST - 1]%asi
222825cf1a30Sjl139090
/*
 * Prime the pipeline.  SRC is REALSRC rounded down to 8 bytes and
 * alignaddr loads the byte offset of REALSRC into the GSR so that
 * faligndata can extract the unaligned stream.  The first block is
 * assembled into %f48-%f58 here; %f60/%f62 are completed at the top
 * of the loop, just before the block store.
 */
222925cf1a30Sjl1390902:
223025cf1a30Sjl139090	membar	#StoreLoad
223125cf1a30Sjl139090	andn	REALSRC, 0x7, SRC
223225cf1a30Sjl139090
223325cf1a30Sjl139090	! SRC - 8-byte aligned
223425cf1a30Sjl139090	! DST - 64-byte aligned
223525cf1a30Sjl139090	ldd	[SRC], %f16
223625cf1a30Sjl139090	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
223725cf1a30Sjl139090	alignaddr REALSRC, %g0, %g0
223825cf1a30Sjl139090	ldd	[SRC + 0x08], %f18
223925cf1a30Sjl139090	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
224025cf1a30Sjl139090	faligndata %f16, %f18, %f48
224125cf1a30Sjl139090	ldd	[SRC + 0x10], %f20
2242c8a722abSpm145316	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
224325cf1a30Sjl139090	faligndata %f18, %f20, %f50
224425cf1a30Sjl139090	ldd	[SRC + 0x18], %f22
224525cf1a30Sjl139090	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
224625cf1a30Sjl139090	faligndata %f20, %f22, %f52
224725cf1a30Sjl139090	ldd	[SRC + 0x20], %f24
2248c8a722abSpm145316	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
224925cf1a30Sjl139090	faligndata %f22, %f24, %f54
225025cf1a30Sjl139090	ldd	[SRC + 0x28], %f26
2251c8a722abSpm145316	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
225225cf1a30Sjl139090	faligndata %f24, %f26, %f56
225325cf1a30Sjl139090	ldd	[SRC + 0x30], %f28
2254c8a722abSpm145316	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
225525cf1a30Sjl139090	faligndata %f26, %f28, %f58
225625cf1a30Sjl139090	ldd	[SRC + 0x38], %f30
225725cf1a30Sjl139090	ldd	[SRC + VIS_BLOCKSIZE], %f16
225825cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
225925cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
2260c8a722abSpm145316	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
226125cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
226225cf1a30Sjl139090	ba,pt	%ncc, 1f
2263c8a722abSpm145316	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
226425cf1a30Sjl139090	.align	32
/*
 * Main loop: finish and store the pending block, then assemble the
 * next one while loading ahead.  Runs while CNT > VIS_BLOCKSIZE + 8.
 */
226525cf1a30Sjl1390901:
226625cf1a30Sjl139090	ldd	[SRC + 0x08], %f18
226725cf1a30Sjl139090	faligndata %f28, %f30, %f60
226825cf1a30Sjl139090	ldd	[SRC + 0x10], %f20
226925cf1a30Sjl139090	faligndata %f30, %f16, %f62
227025cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_AIUS
227125cf1a30Sjl139090	ldd	[SRC + 0x18], %f22
227225cf1a30Sjl139090	faligndata %f16, %f18, %f48
227325cf1a30Sjl139090	ldd	[SRC + 0x20], %f24
227425cf1a30Sjl139090	faligndata %f18, %f20, %f50
227525cf1a30Sjl139090	ldd	[SRC + 0x28], %f26
227625cf1a30Sjl139090	faligndata %f20, %f22, %f52
227725cf1a30Sjl139090	ldd	[SRC + 0x30], %f28
227825cf1a30Sjl139090	faligndata %f22, %f24, %f54
227925cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
2280c8a722abSpm145316	ldd	[SRC + 0x38], %f30
2281c8a722abSpm145316	faligndata %f24, %f26, %f56
228225cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
2283c8a722abSpm145316	ldd	[SRC + VIS_BLOCKSIZE], %f16
2284c8a722abSpm145316	faligndata %f26, %f28, %f58
228525cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2286c8a722abSpm145316	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2287c8a722abSpm145316	add	SRC, VIS_BLOCKSIZE, SRC
2288c8a722abSpm145316	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
228925cf1a30Sjl139090	cmp	CNT, VIS_BLOCKSIZE + 8
229025cf1a30Sjl139090	bgu,pt	%ncc, 1b
2291c8a722abSpm145316	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
229225cf1a30Sjl139090
/*
 * Tail selection.  If exactly VIS_BLOCKSIZE bytes remain and REALSRC
 * is 8-byte aligned, label 2 stores the pending block and one more
 * block built with plain fsrc1 register copies, then skips to label
 * 4.  In every other case the first label 3 completes and stores
 * only the pending block, and the second label 3 plus the byte loop
 * at label 5 move whatever CNT still holds.
 */
229325cf1a30Sjl139090	! only if REALSRC & 0x7 is 0
229425cf1a30Sjl139090	cmp	CNT, VIS_BLOCKSIZE
229525cf1a30Sjl139090	bne	%ncc, 3f
229625cf1a30Sjl139090	  andcc	REALSRC, 0x7, %g0
229725cf1a30Sjl139090	bz,pt	%ncc, 2f
229825cf1a30Sjl139090	  nop
229925cf1a30Sjl1390903:
230025cf1a30Sjl139090	faligndata %f28, %f30, %f60
230125cf1a30Sjl139090	faligndata %f30, %f16, %f62
230225cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_AIUS
230325cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
230425cf1a30Sjl139090	ba,pt	%ncc, 3f
230525cf1a30Sjl139090	  nop
230625cf1a30Sjl1390902:
230725cf1a30Sjl139090	ldd	[SRC + 0x08], %f18
230825cf1a30Sjl139090	fsrc1	%f28, %f60
230925cf1a30Sjl139090	ldd	[SRC + 0x10], %f20
231025cf1a30Sjl139090	fsrc1	%f30, %f62
231125cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_AIUS
231225cf1a30Sjl139090	ldd	[SRC + 0x18], %f22
231325cf1a30Sjl139090	fsrc1	%f16, %f48
231425cf1a30Sjl139090	ldd	[SRC + 0x20], %f24
231525cf1a30Sjl139090	fsrc1	%f18, %f50
231625cf1a30Sjl139090	ldd	[SRC + 0x28], %f26
231725cf1a30Sjl139090	fsrc1	%f20, %f52
231825cf1a30Sjl139090	ldd	[SRC + 0x30], %f28
231925cf1a30Sjl139090	fsrc1	%f22, %f54
232025cf1a30Sjl139090	ldd	[SRC + 0x38], %f30
232125cf1a30Sjl139090	fsrc1	%f24, %f56
232225cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
232325cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
232425cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
232525cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
232625cf1a30Sjl139090	fsrc1	%f26, %f58
232725cf1a30Sjl139090	fsrc1	%f28, %f60
232825cf1a30Sjl139090	fsrc1	%f30, %f62
232925cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_AIUS
233025cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
233125cf1a30Sjl139090	ba,a,pt	%ncc, 4f
233225cf1a30Sjl139090	  nop
233325cf1a30Sjl139090
233425cf1a30Sjl1390903:	tst	CNT
233525cf1a30Sjl139090	bz,a	%ncc, 4f
233625cf1a30Sjl139090	  nop
233725cf1a30Sjl139090
233825cf1a30Sjl1390905:	ldub	[REALSRC], TMP
233925cf1a30Sjl139090	inc	REALSRC
234025cf1a30Sjl139090	inc	DST
234125cf1a30Sjl139090	deccc	CNT
234225cf1a30Sjl139090	bgu	%ncc, 5b
234325cf1a30Sjl139090	  stba	TMP, [DST - 1]%asi
234425cf1a30Sjl1390904:
234525cf1a30Sjl139090
/*
 * Success exit.  %gsr is restored from the frame.  If the saved
 * %fprs had FPRS_FEF set, BLD_FPQ2Q4_FROMSTACK runs before %fprs is
 * written back; otherwise FZEROQ2Q4 runs first.  FPUSED_FLAG is then
 * cleared from %l6 before it is stored back to t_lofault, and 0 is
 * returned through the restore in the ret delay slot.
 */
234625cf1a30Sjl139090.copyout_exit:
234725cf1a30Sjl139090	membar	#Sync
234825cf1a30Sjl139090
234925cf1a30Sjl139090	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
235025cf1a30Sjl139090	wr	%o2, 0, %gsr		! restore gsr
235125cf1a30Sjl139090
235225cf1a30Sjl139090	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
235325cf1a30Sjl139090	btst	FPRS_FEF, %o3
235425cf1a30Sjl139090	bz,pt	%icc, 4f
235525cf1a30Sjl139090	  nop
235625cf1a30Sjl139090
235725cf1a30Sjl139090	BLD_FPQ2Q4_FROMSTACK(%o2)
235825cf1a30Sjl139090
235925cf1a30Sjl139090	ba,pt	%ncc, 1f
236025cf1a30Sjl139090	  wr	%o3, 0, %fprs		! restore fprs
236125cf1a30Sjl139090
236225cf1a30Sjl1390904:
236325cf1a30Sjl139090	FZEROQ2Q4
236425cf1a30Sjl139090	wr	%o3, 0, %fprs		! restore fprs
236525cf1a30Sjl139090
236625cf1a30Sjl1390901:
236725cf1a30Sjl139090	membar	#Sync
236825cf1a30Sjl139090	andn	%l6, FPUSED_FLAG, %l6
236925cf1a30Sjl139090	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
237025cf1a30Sjl139090	FP_ALLOWMIGRATE(5, 6)
237125cf1a30Sjl139090	ret
237225cf1a30Sjl139090	  restore	%g0, 0, %o0
237325cf1a30Sjl139090
237425cf1a30Sjl139090/*
237525cf1a30Sjl139090 * We got here because of a fault during copyout.
237625cf1a30Sjl139090 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
237725cf1a30Sjl139090 */
/*
 * This label is the value placed in REAL_LOFAULT at entry.
 * NOTE(review): presumably copyio_fault cleans up and then jumps
 * here - confirm in its definition.  With a non-NULL t_copyops the
 * CP_COPYOUT routine is entered after the restore has popped this
 * window; otherwise the function returns -1.
 */
237825cf1a30Sjl139090.copyout_err:
237925cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
238025cf1a30Sjl139090	tst	%o4
238125cf1a30Sjl139090	bz,pt	%ncc, 2f			! if not, return error
238225cf1a30Sjl139090	  nop
238325cf1a30Sjl139090	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
238425cf1a30Sjl139090	jmp	%g2				! original arguments
238525cf1a30Sjl139090	  restore %g0, 0, %g0			! dispose of copy window
238625cf1a30Sjl1390902:
238725cf1a30Sjl139090        ret
238825cf1a30Sjl139090	  restore %g0, -1, %o0			! return error value
238925cf1a30Sjl139090
239025cf1a30Sjl139090
239125cf1a30Sjl139090	SET_SIZE(copyout_more)
239225cf1a30Sjl139090
239325cf1a30Sjl139090#endif	/* lint */
239425cf1a30Sjl139090
239525cf1a30Sjl139090
239625cf1a30Sjl139090#ifdef	lint
239725cf1a30Sjl139090
/*
 * Lint-only stub for xcopyout: it gives lint a C prototype and simply
 * returns 0.  The real routine is the assembly in the non-lint branch
 * of this conditional.
 */
239825cf1a30Sjl139090/*ARGSUSED*/
239925cf1a30Sjl139090int
240025cf1a30Sjl139090xcopyout(const void *kaddr, void *uaddr, size_t count)
240125cf1a30Sjl139090{ return (0); }
240225cf1a30Sjl139090
240325cf1a30Sjl139090#else	/* lint */
240425cf1a30Sjl139090
/*
 * xcopyout: copy %o2 bytes from kernel address %o0 to user address
 * %o1 (argument roles per the lint prototype).  Returns 0 on success.
 * On a fault the handlers at the end of this routine return an errno
 * value (ERRNO, or %g1 on the small path) where the copyout handlers
 * return -1, unless the thread has a t_copyops vector, in which case
 * its CP_XCOPYOUT routine is entered with the original arguments.
 *
 * Path selection:
 *   count <= VIS_COPY_THRESHOLD          -> .xcopyout_small
 *   otherwise pick a limit from (src xor dst):
 *     low three bits clear               -> hw_copy_limit_8
 *     bit 0 set                          -> hw_copy_limit_1
 *     bit 0 clear, bit 1 set             -> hw_copy_limit_2
 *     bits 0-1 clear, bit 2 set          -> hw_copy_limit_4
 *   limit == 0 or count <= limit         -> .xcopyout_small
 *   count > limit                        -> .xcopyout_more
 *
 * Both paths only install their own fault handler and then join the
 * copyout code at .sm_do_copyout or .do_copyout.
 *
 * NOTE: the inline comment on the first bleu is misleading; that
 * branch goes to the small path and it is the fall-through that
 * continues to the larger cases.
 */
240525cf1a30Sjl139090	ENTRY(xcopyout)
240625cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
240725cf1a30Sjl139090	bleu,pt	%ncc, .xcopyout_small		! go to larger cases
240825cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
240925cf1a30Sjl139090	btst	7, %o3				!
241025cf1a30Sjl139090	bz,pt	%ncc, .xcopyout_8		!
241125cf1a30Sjl139090	  nop
241225cf1a30Sjl139090	btst	1, %o3				!
241325cf1a30Sjl139090	bz,pt	%ncc, .xcopyout_2		! check for half-word
241425cf1a30Sjl139090	  nop
241525cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
241625cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
241725cf1a30Sjl139090	tst	%o3
241825cf1a30Sjl139090	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
241925cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
242025cf1a30Sjl139090	bleu,pt	%ncc, .xcopyout_small		! go to small copy
242125cf1a30Sjl139090	  nop
242225cf1a30Sjl139090	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
242325cf1a30Sjl139090	  nop
242425cf1a30Sjl139090.xcopyout_2:
242525cf1a30Sjl139090	btst	3, %o3				!
242625cf1a30Sjl139090	bz,pt	%ncc, .xcopyout_4		! check for word alignment
242725cf1a30Sjl139090	  nop
242825cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
242925cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
243025cf1a30Sjl139090	tst	%o3
243125cf1a30Sjl139090	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
243225cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
243325cf1a30Sjl139090	bleu,pt	%ncc, .xcopyout_small		! go to small copy
243425cf1a30Sjl139090	  nop
243525cf1a30Sjl139090	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
243625cf1a30Sjl139090	  nop
243725cf1a30Sjl139090.xcopyout_4:
243825cf1a30Sjl139090	! already checked longword, must be word aligned
243925cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
244025cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
244125cf1a30Sjl139090	tst	%o3
244225cf1a30Sjl139090	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
244325cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
244425cf1a30Sjl139090	bleu,pt	%ncc, .xcopyout_small		! go to small copy
244525cf1a30Sjl139090	  nop
244625cf1a30Sjl139090	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
244725cf1a30Sjl139090	  nop
244825cf1a30Sjl139090.xcopyout_8:
244925cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
245025cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
245125cf1a30Sjl139090	tst	%o3
245225cf1a30Sjl139090	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
245325cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
245425cf1a30Sjl139090	bleu,pt	%ncc, .xcopyout_small		! go to small copy
245525cf1a30Sjl139090	  nop
245625cf1a30Sjl139090	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
245725cf1a30Sjl139090	  nop
245825cf1a30Sjl139090
/*
 * Small path: same prologue as .copyout_small but with
 * .sm_xcopyout_err as the handler; the store to t_lofault rides in
 * the delay slot of the jump into the shared code.
 */
245925cf1a30Sjl139090.xcopyout_small:
246025cf1a30Sjl139090	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
246125cf1a30Sjl139090	or	%o5, %lo(.sm_xcopyout_err), %o5
246225cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
246325cf1a30Sjl139090	membar	#Sync				! sync error barrier
246425cf1a30Sjl139090	ba,pt	%ncc, .sm_do_copyout		! common code
246525cf1a30Sjl139090	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
246625cf1a30Sjl139090
/*
 * Large path: allocate the same frame as copyout_more, point
 * REAL_LOFAULT at .xcopyout_err and join .do_copyout.
 */
246725cf1a30Sjl139090.xcopyout_more:
246825cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
246925cf1a30Sjl139090	sethi	%hi(.xcopyout_err), REAL_LOFAULT
247025cf1a30Sjl139090	ba,pt	%ncc, .do_copyout		! common code
247125cf1a30Sjl139090	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
247225cf1a30Sjl139090
247325cf1a30Sjl139090/*
247425cf1a30Sjl139090 * We got here because of fault during xcopyout
247525cf1a30Sjl139090 * Errno value is in ERRNO
247625cf1a30Sjl139090 */
247725cf1a30Sjl139090.xcopyout_err:
247825cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
247925cf1a30Sjl139090	tst	%o4
248025cf1a30Sjl139090	bz,pt	%ncc, 2f			! if not, return error
248125cf1a30Sjl139090	  nop
248225cf1a30Sjl139090	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
248325cf1a30Sjl139090	jmp	%g2				! original arguments
248425cf1a30Sjl139090	  restore %g0, 0, %g0			! dispose of copy window
248525cf1a30Sjl1390902:
248625cf1a30Sjl139090        ret
248725cf1a30Sjl139090	  restore ERRNO, 0, %o0			! return errno value
248825cf1a30Sjl139090
/*
 * Fault handler for the small path.  It restores t_lofault from %o4,
 * reloads the saved arguments into %o0-%o2, and either jumps to the
 * CP_XCOPYOUT routine of t_copyops or returns the value found in %g1.
 * NOTE(review): %g1 is presumably the errno left by the trap code
 * (the ERRNO alias used above) - confirm against its definition.
 */
248925cf1a30Sjl139090.sm_xcopyout_err:
249025cf1a30Sjl139090
249125cf1a30Sjl139090	membar	#Sync
249225cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
249325cf1a30Sjl139090	mov	SM_SAVE_SRC, %o0
249425cf1a30Sjl139090	mov	SM_SAVE_DST, %o1
249525cf1a30Sjl139090	mov	SM_SAVE_COUNT, %o2
249625cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
249725cf1a30Sjl139090	tst	%o3
249825cf1a30Sjl139090	bz,pt	%ncc, 3f			! if not, return error
249925cf1a30Sjl139090	  nop
250025cf1a30Sjl139090	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
250125cf1a30Sjl139090	jmp	%o5				! original arguments
250225cf1a30Sjl139090	  nop
250325cf1a30Sjl1390903:
250425cf1a30Sjl139090	retl
250525cf1a30Sjl139090	  or	%g1, 0, %o0		! return errno value
250625cf1a30Sjl139090
250725cf1a30Sjl139090	SET_SIZE(xcopyout)
250825cf1a30Sjl139090
250925cf1a30Sjl139090#endif	/* lint */
251025cf1a30Sjl139090
251125cf1a30Sjl139090#ifdef	lint
251225cf1a30Sjl139090
/*
 * Lint-only stub for xcopyout_little: it gives lint a C prototype and
 * simply returns 0.  The real routine is the assembly in the non-lint
 * branch of this conditional.
 */
251325cf1a30Sjl139090/*ARGSUSED*/
251425cf1a30Sjl139090int
251525cf1a30Sjl139090xcopyout_little(const void *kaddr, void *uaddr, size_t count)
251625cf1a30Sjl139090{ return (0); }
251725cf1a30Sjl139090
251825cf1a30Sjl139090#else	/* lint */
251925cf1a30Sjl139090
/*
 * xcopyout_little: leaf routine that copies %o2 bytes from %o0 to the
 * user address %o1 one byte at a time, in reversed order: the first
 * destination byte receives the last source byte.  Stores go through
 * ASI_AIUSL.  Returns 0; a zero count skips straight to the exit.
 *
 * The fault handler is .xcopyio_err (defined outside this section);
 * the previous t_lofault value is held in %o5 for the whole copy.
 *
 * Index scheme: %o3 runs from -count up to 0 and the loop ends when
 * the inccc carries out.  %o1 is biased to dst + count so that
 * %o1 + %o3 walks forward through the destination.  %o0 is biased to
 * src + 2*count - 1 and drops by 2 per pass, so %o0 + %o3 walks
 * backward through the source.  The ldub in the annulled delay slot
 * runs only when the loop branch is taken.
 */
252025cf1a30Sjl139090	ENTRY(xcopyout_little)
252125cf1a30Sjl139090	sethi	%hi(.xcopyio_err), %o5
252225cf1a30Sjl139090	or	%o5, %lo(.xcopyio_err), %o5
252325cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4
252425cf1a30Sjl139090	membar	#Sync				! sync error barrier
252525cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]
252625cf1a30Sjl139090	mov	%o4, %o5
252725cf1a30Sjl139090
252825cf1a30Sjl139090	subcc	%g0, %o2, %o3
252925cf1a30Sjl139090	add	%o0, %o2, %o0
253025cf1a30Sjl139090	bz,pn	%ncc, 2f		! check for zero bytes
253125cf1a30Sjl139090	  sub	%o2, 1, %o4
253225cf1a30Sjl139090	add	%o0, %o4, %o0		! start w/last byte
253325cf1a30Sjl139090	add	%o1, %o2, %o1
253425cf1a30Sjl139090	ldub	[%o0 + %o3], %o4
253525cf1a30Sjl139090
253625cf1a30Sjl1390901:	stba	%o4, [%o1 + %o3]ASI_AIUSL
253725cf1a30Sjl139090	inccc	%o3
253825cf1a30Sjl139090	sub	%o0, 2, %o0		! get next byte
253925cf1a30Sjl139090	bcc,a,pt %ncc, 1b
254025cf1a30Sjl139090	  ldub	[%o0 + %o3], %o4
254125cf1a30Sjl139090
254225cf1a30Sjl1390902:
254325cf1a30Sjl139090	membar	#Sync				! sync error barrier
254425cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
254525cf1a30Sjl139090	retl
254625cf1a30Sjl139090	  mov	%g0, %o0		! return (0)
254725cf1a30Sjl139090
254825cf1a30Sjl139090	SET_SIZE(xcopyout_little)
254925cf1a30Sjl139090
255025cf1a30Sjl139090#endif	/* lint */
255125cf1a30Sjl139090
255225cf1a30Sjl139090/*
255325cf1a30Sjl139090 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
255425cf1a30Sjl139090 */
255525cf1a30Sjl139090
255625cf1a30Sjl139090#if defined(lint)
255725cf1a30Sjl139090
/*
 * Lint-only stub that gives copyin a C signature.  The routine itself is
 * the assembly in the non-lint branch that follows.
 */
255825cf1a30Sjl139090/*ARGSUSED*/
255925cf1a30Sjl139090int
256025cf1a30Sjl139090copyin(const void *uaddr, void *kaddr, size_t count)
256125cf1a30Sjl139090{ return (0); }
256225cf1a30Sjl139090
256325cf1a30Sjl139090#else	/* lint */
256425cf1a30Sjl139090
/*
 * copyin(uaddr, kaddr, count) -- entry dispatch.
 *
 * Chooses between the leaf small-copy code (.copyin_small) and the
 * floating point block-copy code (.copyin_more):
 *
 *  - count <= VIS_COPY_THRESHOLD always goes to .copyin_small.
 *  - Otherwise the low bits of (src ^ dst) select which tunable applies:
 *	hw_copy_limit_8 when bits 0-2 of src and dst agree,
 *	hw_copy_limit_4 when only bits 0-1 agree,
 *	hw_copy_limit_2 when only bit 0 agrees,
 *	hw_copy_limit_1 otherwise.
 *  - A limit of zero, or count <= limit, goes to .copyin_small; anything
 *    larger goes to .copyin_more.
 *
 * In each limit check the cmp sits in the delay slot of the bz, so it runs
 * whether or not the bz is taken and supplies the condition codes for the
 * bleu that follows.
 */
256525cf1a30Sjl139090	ENTRY(copyin)
256625cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	/* the taken branch is the small case; falling through handles larger counts */
256725cf1a30Sjl139090	bleu,pt	%ncc, .copyin_small		! go to larger cases
256825cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
256925cf1a30Sjl139090	btst	7, %o3				!
257025cf1a30Sjl139090	bz,pt	%ncc, .copyin_8			! check for longword alignment
257125cf1a30Sjl139090	  nop
257225cf1a30Sjl139090	btst	1, %o3				!
257325cf1a30Sjl139090	bz,pt	%ncc, .copyin_2			! check for half-word
257425cf1a30Sjl139090	  nop
257525cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
257625cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
257725cf1a30Sjl139090	tst	%o3
257825cf1a30Sjl139090	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
257925cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
258025cf1a30Sjl139090	bleu,pt	%ncc, .copyin_small		! go to small copy
258125cf1a30Sjl139090	  nop
258225cf1a30Sjl139090	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
258325cf1a30Sjl139090	  nop
	/* %o3 still holds src ^ dst here; bit 0 is known to be clear */
258425cf1a30Sjl139090.copyin_2:
258525cf1a30Sjl139090	btst	3, %o3				!
258625cf1a30Sjl139090	bz,pt	%ncc, .copyin_4			! check for word alignment
258725cf1a30Sjl139090	  nop
258825cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
258925cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
259025cf1a30Sjl139090	tst	%o3
259125cf1a30Sjl139090	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
259225cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
259325cf1a30Sjl139090	bleu,pt	%ncc, .copyin_small		! go to small copy
259425cf1a30Sjl139090	  nop
259525cf1a30Sjl139090	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
259625cf1a30Sjl139090	  nop
259725cf1a30Sjl139090.copyin_4:
259825cf1a30Sjl139090	! already checked longword, must be word aligned
259925cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
260025cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
260125cf1a30Sjl139090	tst	%o3
260225cf1a30Sjl139090	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
260325cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
260425cf1a30Sjl139090	bleu,pt	%ncc, .copyin_small		! go to small copy
260525cf1a30Sjl139090	  nop
260625cf1a30Sjl139090	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
260725cf1a30Sjl139090	  nop
260825cf1a30Sjl139090.copyin_8:
260925cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
261025cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
261125cf1a30Sjl139090	tst	%o3
261225cf1a30Sjl139090	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
261325cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
261425cf1a30Sjl139090	bleu,pt	%ncc, .copyin_small		! go to small copy
261525cf1a30Sjl139090	  nop
261625cf1a30Sjl139090	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
261725cf1a30Sjl139090	  nop
261825cf1a30Sjl139090
/*
 * Small copy path for copyin.  xcopyin also branches to .sm_do_copyin
 * after installing its own handler.
 *
 * .sm_copyin_err is installed in t_lofault and the previous value is kept
 * in %o4.  The original arguments are stashed in SM_SAVE_SRC, SM_SAVE_DST
 * and SM_SAVE_COUNT, which .sm_copyin_err reads back.  The source is read
 * with ASI_USER loads and the destination is written with ordinary stores.
 * Every non-faulting exit writes %o4 back to t_lofault and returns 0.
 *
 *   count <= SHORTCOPY		-> .ci_sm_left (copies up to 3 bytes)
 *   count >  CHKSIZE		-> .ci_med
 *   (src | dst) & 3 == 0	-> .ci_sm_word (word copies)
 *   otherwise			-> .ci_sm_movebytes (4 bytes per loop)
 */
261925cf1a30Sjl139090	.align	16
262025cf1a30Sjl139090	nop				! instruction alignment
262125cf1a30Sjl139090					! see discussion at start of file
262225cf1a30Sjl139090.copyin_small:
262325cf1a30Sjl139090	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
262425cf1a30Sjl139090	or	%o5, %lo(.sm_copyin_err), %o5
262525cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
262625cf1a30Sjl139090	membar	#Sync				! sync error barrier
262725cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]
262825cf1a30Sjl139090.sm_do_copyin:
262925cf1a30Sjl139090	mov	%o0, SM_SAVE_SRC
263025cf1a30Sjl139090	mov	%o1, SM_SAVE_DST
263125cf1a30Sjl139090	cmp	%o2, SHORTCOPY		! check for really short case
263225cf1a30Sjl139090	bleu,pt	%ncc, .ci_sm_left	!
263325cf1a30Sjl139090	  mov	%o2, SM_SAVE_COUNT
263425cf1a30Sjl139090	cmp	%o2, CHKSIZE		! check for medium length cases
263525cf1a30Sjl139090	bgu,pn	%ncc, .ci_med		!
263625cf1a30Sjl139090	  or	%o0, %o1, %o3		! prepare alignment check
263725cf1a30Sjl139090	andcc	%o3, 0x3, %g0		! test for alignment
	/*
	 * The sub under .ci_sm_movebytes is both the delay slot of this bz
	 * and the entry point that .ci_med jumps to.  It therefore also runs
	 * on the way to .ci_sm_word, whose count arithmetic (subtract 4, add
	 * back 3) allows for that bias of 3.
	 */
263825cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
263925cf1a30Sjl139090.ci_sm_movebytes:
264025cf1a30Sjl139090	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
264125cf1a30Sjl139090.ci_sm_notalign4:
264225cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! read byte
264325cf1a30Sjl139090	subcc	%o2, 4, %o2		! reduce count by 4
264425cf1a30Sjl139090	stb	%o3, [%o1]		! write byte
264525cf1a30Sjl139090	add	%o0, 1, %o0		! advance SRC by 1
264625cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
264725cf1a30Sjl139090	add	%o0, 1, %o0		! advance SRC by 1
264825cf1a30Sjl139090	stb	%o3, [%o1 + 1]
264925cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
265025cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3
265125cf1a30Sjl139090	add	%o0, 1, %o0		! advance SRC by 1
265225cf1a30Sjl139090	stb	%o3, [%o1 - 2]
265325cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3
265425cf1a30Sjl139090	add	%o0, 1, %o0		! advance SRC by 1
265525cf1a30Sjl139090	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
265625cf1a30Sjl139090	  stb	%o3, [%o1 - 1]
265725cf1a30Sjl139090	add	%o2, 3, %o2		! restore count
	/* copy the remaining 0 to 3 bytes one at a time */
265825cf1a30Sjl139090.ci_sm_left:
265925cf1a30Sjl139090	tst	%o2
266025cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
266125cf1a30Sjl139090	  nop
266225cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3		! load one byte
266325cf1a30Sjl139090	deccc	%o2			! reduce count for cc test
266425cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
266525cf1a30Sjl139090	  stb	%o3,[%o1]		! store one byte
266625cf1a30Sjl139090	inc	%o0
266725cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! load second byte
266825cf1a30Sjl139090	deccc	%o2
266925cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
267025cf1a30Sjl139090	  stb	%o3,[%o1 + 1]		! store second byte
267125cf1a30Sjl139090	inc	%o0
267225cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! load third byte
267325cf1a30Sjl139090	stb	%o3,[%o1 + 2]		! store third byte
267425cf1a30Sjl139090	membar	#Sync				! sync error barrier
267525cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
267625cf1a30Sjl139090	retl
267725cf1a30Sjl139090	  mov	%g0, %o0		! return 0
267825cf1a30Sjl139090	.align	16
	/*
	 * Word loop: two words (8 bytes) per pass.  .ci_sm_wordx is entered
	 * from .ci_sm_word with the first word already loaded in %o3.
	 */
267925cf1a30Sjl139090.ci_sm_words:
268025cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3		! read word
268125cf1a30Sjl139090.ci_sm_wordx:
268225cf1a30Sjl139090	subcc	%o2, 8, %o2		! update count
268325cf1a30Sjl139090	stw	%o3, [%o1]		! write word
268425cf1a30Sjl139090	add	%o0, 4, %o0		! update SRC
268525cf1a30Sjl139090	add	%o1, 8, %o1		! update DST
268625cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3	! read word
268725cf1a30Sjl139090	add	%o0, 4, %o0		! update SRC
268825cf1a30Sjl139090	bgt,pt	%ncc, .ci_sm_words	! loop til done
268925cf1a30Sjl139090	  stw	%o3, [%o1 - 4]		! write word
269025cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore count
269125cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
269225cf1a30Sjl139090	  nop
269325cf1a30Sjl139090	deccc	%o2
	/*
	 * The subcc under .ci_sm_half is the delay slot of this bz as well
	 * as the head of the halfword loop that the medium paths jump to.
	 */
269425cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_byte
269525cf1a30Sjl139090.ci_sm_half:
269625cf1a30Sjl139090	  subcc	%o2, 2, %o2		! reduce count by 2
269725cf1a30Sjl139090	lduha	[%o0]ASI_USER, %o3	! read half word
269825cf1a30Sjl139090	add	%o0, 2, %o0		! advance SRC by 2
269925cf1a30Sjl139090	add	%o1, 2, %o1		! advance DST by 2
270025cf1a30Sjl139090	bgt,pt	%ncc, .ci_sm_half	! loop til done
270125cf1a30Sjl139090	  sth	%o3, [%o1 - 2]		! write half word
270225cf1a30Sjl139090	addcc	%o2, 1, %o2		! restore count
270325cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
270425cf1a30Sjl139090	  nop
	/* copy the single trailing byte, then leave */
270525cf1a30Sjl139090.ci_sm_byte:
270625cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3
270725cf1a30Sjl139090	stb	%o3, [%o1]
270825cf1a30Sjl139090	membar	#Sync				! sync error barrier
270925cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
271025cf1a30Sjl139090	retl
271125cf1a30Sjl139090	  mov	%g0, %o0		! return 0
271225cf1a30Sjl139090	.align	16
	/*
	 * Both pointers are word aligned.  On entry %o2 is count - 3 because
	 * the delay slot at .ci_sm_movebytes has already run.
	 */
271325cf1a30Sjl139090.ci_sm_word:
271425cf1a30Sjl139090	subcc	%o2, 4, %o2		! update count
271525cf1a30Sjl139090	bgt,pt	%ncc, .ci_sm_wordx
271625cf1a30Sjl139090	  lduwa	[%o0]ASI_USER, %o3		! read word
271725cf1a30Sjl139090	addcc	%o2, 3, %o2		! restore count
271825cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
271925cf1a30Sjl139090	  stw	%o3, [%o1]		! write word
272025cf1a30Sjl139090	deccc	%o2			! reduce count for cc test
272125cf1a30Sjl139090	add	%o0, 4, %o0
272225cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! load one byte
272325cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
272425cf1a30Sjl139090	  stb	%o3, [%o1 + 4]		! store one byte
272525cf1a30Sjl139090	inc	%o0
272625cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! load second byte
272725cf1a30Sjl139090	deccc	%o2
272825cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
272925cf1a30Sjl139090	  stb	%o3, [%o1 + 5]		! store second byte
273025cf1a30Sjl139090	inc	%o0
273125cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! load third byte
273225cf1a30Sjl139090	stb	%o3, [%o1 + 6]		! store third byte
	/* common success exit: restore the saved t_lofault and return 0 */
273325cf1a30Sjl139090.ci_sm_exit:
273425cf1a30Sjl139090	membar	#Sync				! sync error barrier
273525cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
273625cf1a30Sjl139090	retl
273725cf1a30Sjl139090	  mov	%g0, %o0		! return 0
273825cf1a30Sjl139090
/*
 * Medium copy path, reached from .sm_do_copyin when count > CHKSIZE.
 *
 * (src ^ dst) picks the widest access size that both pointers can be
 * brought into alignment for:
 *
 *   bit 0 differs	-> .ci_sm_movebytes (byte copies)
 *   bit 1 differs	-> .ci_med_half (halfword copies)
 *   bit 2 differs	-> .ci_med_word (word copies)
 *   otherwise		-> .ci_med_long (long word copies)
 *
 * Each variant first copies leading bytes until the source pointer is
 * suitably aligned and then runs a four-way unrolled loop.  The long and
 * word variants follow that with a single-unit loop.  Whatever is left is
 * handed to .ci_sm_half, .ci_sm_byte or .ci_sm_exit.
 */
273925cf1a30Sjl139090	.align 16
274025cf1a30Sjl139090.ci_med:
274125cf1a30Sjl139090	xor	%o0, %o1, %o3		! setup alignment check
274225cf1a30Sjl139090	btst	1, %o3
274325cf1a30Sjl139090	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
274425cf1a30Sjl139090	  nop
274525cf1a30Sjl139090	btst	3, %o3
274625cf1a30Sjl139090	bnz,pt	%ncc, .ci_med_half	! halfword aligned
274725cf1a30Sjl139090	  nop
274825cf1a30Sjl139090	btst	7, %o3
274925cf1a30Sjl139090	bnz,pt	%ncc, .ci_med_word	! word aligned
275025cf1a30Sjl139090	  nop
275125cf1a30Sjl139090.ci_med_long:
275225cf1a30Sjl139090	btst	3, %o0			! check for
275325cf1a30Sjl139090	bz,pt	%ncc, .ci_med_long1	! word alignment
275425cf1a30Sjl139090	  nop
	/* copy single bytes until the source pointer is word aligned */
275525cf1a30Sjl139090.ci_med_long0:
275625cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3		! load one byte
275725cf1a30Sjl139090	inc	%o0
275825cf1a30Sjl139090	stb	%o3,[%o1]		! store byte
275925cf1a30Sjl139090	inc	%o1
276025cf1a30Sjl139090	btst	3, %o0
276125cf1a30Sjl139090	bnz,pt	%ncc, .ci_med_long0
276225cf1a30Sjl139090	  dec	%o2
276325cf1a30Sjl139090.ci_med_long1:			! word aligned
276425cf1a30Sjl139090	btst	7, %o0			! check for long word
276525cf1a30Sjl139090	bz,pt	%ncc, .ci_med_long2
276625cf1a30Sjl139090	  nop
	/* one word copy brings the source up to long word alignment */
276725cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3	! load word
276825cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
276925cf1a30Sjl139090	stw	%o3, [%o1]		! store word
277025cf1a30Sjl139090	add	%o1, 4, %o1		! advance DST by 4
277125cf1a30Sjl139090	sub	%o2, 4, %o2		! reduce count by 4
277225cf1a30Sjl139090!
277325cf1a30Sjl139090!  Now long word aligned and have at least 32 bytes to move
277425cf1a30Sjl139090!
277525cf1a30Sjl139090.ci_med_long2:
277625cf1a30Sjl139090	sub	%o2, 31, %o2		! adjust count to allow cc zero test
	/* unrolled loop: four long words (32 bytes) per pass */
277725cf1a30Sjl139090.ci_med_lmove:
277825cf1a30Sjl139090	ldxa	[%o0]ASI_USER, %o3	! read long word
277925cf1a30Sjl139090	subcc	%o2, 32, %o2		! reduce count by 32
278025cf1a30Sjl139090	stx	%o3, [%o1]		! write long word
278125cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
278225cf1a30Sjl139090	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
278325cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
278425cf1a30Sjl139090	stx	%o3, [%o1 + 8]
278525cf1a30Sjl139090	add	%o1, 32, %o1		! advance DST by 32
278625cf1a30Sjl139090	ldxa	[%o0]ASI_USER, %o3
278725cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
278825cf1a30Sjl139090	stx	%o3, [%o1 - 16]
278925cf1a30Sjl139090	ldxa	[%o0]ASI_USER, %o3
279025cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
279125cf1a30Sjl139090	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
279225cf1a30Sjl139090	  stx	%o3, [%o1 - 8]
279325cf1a30Sjl139090	addcc	%o2, 24, %o2		! restore count to long word offset
279425cf1a30Sjl139090	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
279525cf1a30Sjl139090	  nop
	/* single long word loop for what the unrolled loop left over */
279625cf1a30Sjl139090.ci_med_lword:
279725cf1a30Sjl139090	ldxa	[%o0]ASI_USER, %o3	! read long word
279825cf1a30Sjl139090	subcc	%o2, 8, %o2		! reduce count by 8
279925cf1a30Sjl139090	stx	%o3, [%o1]		! write long word
280025cf1a30Sjl139090	add	%o0, 8, %o0		! advance SRC by 8
280125cf1a30Sjl139090	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
280225cf1a30Sjl139090	  add	%o1, 8, %o1		! advance DST by 8
	/*
	 * The deccc in the delay slot of the first bz runs on both outcomes;
	 * when execution falls through, its result feeds the second bz.
	 */
280325cf1a30Sjl139090.ci_med_lextra:
280425cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore rest of count
280525cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
280625cf1a30Sjl139090	  deccc	%o2
280725cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_byte
280825cf1a30Sjl139090	  nop
280925cf1a30Sjl139090	ba,pt	%ncc, .ci_sm_half
281025cf1a30Sjl139090	  nop
281125cf1a30Sjl139090
281225cf1a30Sjl139090	.align 16
281325cf1a30Sjl139090	nop				! instruction alignment
281425cf1a30Sjl139090					! see discussion at start of file
281525cf1a30Sjl139090.ci_med_word:
281625cf1a30Sjl139090	btst	3, %o0			! check for
281725cf1a30Sjl139090	bz,pt	%ncc, .ci_med_word1	! word alignment
281825cf1a30Sjl139090	  nop
	/* copy single bytes until the source pointer is word aligned */
281925cf1a30Sjl139090.ci_med_word0:
282025cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! load one byte
282125cf1a30Sjl139090	inc	%o0
282225cf1a30Sjl139090	stb	%o3,[%o1]		! store byte
282325cf1a30Sjl139090	inc	%o1
282425cf1a30Sjl139090	btst	3, %o0
282525cf1a30Sjl139090	bnz,pt	%ncc, .ci_med_word0
282625cf1a30Sjl139090	  dec	%o2
282725cf1a30Sjl139090!
282825cf1a30Sjl139090!  Now word aligned and have at least 36 bytes to move
282925cf1a30Sjl139090!
283025cf1a30Sjl139090.ci_med_word1:
283125cf1a30Sjl139090	sub	%o2, 15, %o2		! adjust count to allow cc zero test
	/* unrolled loop: four words (16 bytes) per pass */
283225cf1a30Sjl139090.ci_med_wmove:
283325cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3	! read word
283425cf1a30Sjl139090	subcc	%o2, 16, %o2		! reduce count by 16
283525cf1a30Sjl139090	stw	%o3, [%o1]		! write word
283625cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
283725cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
283825cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
283925cf1a30Sjl139090	stw	%o3, [%o1 + 4]
284025cf1a30Sjl139090	add	%o1, 16, %o1		! advance DST by 16
284125cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3
284225cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
284325cf1a30Sjl139090	stw	%o3, [%o1 - 8]
284425cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3
284525cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
284625cf1a30Sjl139090	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
284725cf1a30Sjl139090	  stw	%o3, [%o1 - 4]
284825cf1a30Sjl139090	addcc	%o2, 12, %o2		! restore count to word offset
284925cf1a30Sjl139090	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
285025cf1a30Sjl139090	  nop
	/* single word loop for what the unrolled loop left over */
285125cf1a30Sjl139090.ci_med_word2:
285225cf1a30Sjl139090	lduwa	[%o0]ASI_USER, %o3	! read word
285325cf1a30Sjl139090	subcc	%o2, 4, %o2		! reduce count by 4
285425cf1a30Sjl139090	stw	%o3, [%o1]		! write word
285525cf1a30Sjl139090	add	%o0, 4, %o0		! advance SRC by 4
285625cf1a30Sjl139090	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
285725cf1a30Sjl139090	  add	%o1, 4, %o1		! advance DST by 4
285825cf1a30Sjl139090.ci_med_wextra:
285925cf1a30Sjl139090	addcc	%o2, 3, %o2		! restore rest of count
286025cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
286125cf1a30Sjl139090	  deccc	%o2
286225cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_byte
286325cf1a30Sjl139090	  nop
286425cf1a30Sjl139090	ba,pt	%ncc, .ci_sm_half
286525cf1a30Sjl139090	  nop
286625cf1a30Sjl139090
286725cf1a30Sjl139090	.align 16
286825cf1a30Sjl139090	nop				! instruction alignment
286925cf1a30Sjl139090					! see discussion at start of file
287025cf1a30Sjl139090.ci_med_half:
287125cf1a30Sjl139090	btst	1, %o0			! check for
287225cf1a30Sjl139090	bz,pt	%ncc, .ci_med_half1	! half word alignment
287325cf1a30Sjl139090	  nop
	/* one byte copy brings an odd source pointer to halfword alignment */
287425cf1a30Sjl139090	lduba	[%o0]ASI_USER, %o3	! load one byte
287525cf1a30Sjl139090	inc	%o0
287625cf1a30Sjl139090	stb	%o3,[%o1]		! store byte
287725cf1a30Sjl139090	inc	%o1
287825cf1a30Sjl139090	dec	%o2
287925cf1a30Sjl139090!
288025cf1a30Sjl139090!  Now half word aligned and have at least 38 bytes to move
288125cf1a30Sjl139090!
288225cf1a30Sjl139090.ci_med_half1:
288325cf1a30Sjl139090	sub	%o2, 7, %o2		! adjust count to allow cc zero test
	/* unrolled loop: four halfwords (8 bytes) per pass */
288425cf1a30Sjl139090.ci_med_hmove:
288525cf1a30Sjl139090	lduha	[%o0]ASI_USER, %o3	! read half word
288625cf1a30Sjl139090	subcc	%o2, 8, %o2		! reduce count by 8
288725cf1a30Sjl139090	sth	%o3, [%o1]		! write half word
288825cf1a30Sjl139090	add	%o0, 2, %o0		! advance SRC by 2
288925cf1a30Sjl139090	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
289025cf1a30Sjl139090	add	%o0, 2, %o0		! advance SRC by 2
289125cf1a30Sjl139090	sth	%o3, [%o1 + 2]
289225cf1a30Sjl139090	add	%o1, 8, %o1		! advance DST by 8
289325cf1a30Sjl139090	lduha	[%o0]ASI_USER, %o3
289425cf1a30Sjl139090	add	%o0, 2, %o0		! advance SRC by 2
289525cf1a30Sjl139090	sth	%o3, [%o1 - 4]
289625cf1a30Sjl139090	lduha	[%o0]ASI_USER, %o3
289725cf1a30Sjl139090	add	%o0, 2, %o0		! advance SRC by 2
289825cf1a30Sjl139090	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
289925cf1a30Sjl139090	  sth	%o3, [%o1 - 2]
290025cf1a30Sjl139090	addcc	%o2, 7, %o2		! restore count
290125cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_exit
290225cf1a30Sjl139090	  deccc	%o2
290325cf1a30Sjl139090	bz,pt	%ncc, .ci_sm_byte
290425cf1a30Sjl139090	  nop
290525cf1a30Sjl139090	ba,pt	%ncc, .ci_sm_half
290625cf1a30Sjl139090	  nop
290725cf1a30Sjl139090
/*
 * t_lofault handler installed by .copyin_small.
 *
 * Writes the saved t_lofault (%o4) back, reloads the original arguments
 * from SM_SAVE_SRC/SM_SAVE_DST/SM_SAVE_COUNT into %o0-%o2, and checks the
 * thread's T_COPYOPS pointer.  If it is non-NULL, control jumps to its
 * CP_COPYIN entry with those arguments.  The jmp does not write %o7, so
 * that routine returns to copyin's caller.  If it is NULL, -1 is returned.
 */
290825cf1a30Sjl139090.sm_copyin_err:
290925cf1a30Sjl139090	membar	#Sync
291025cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
291125cf1a30Sjl139090	mov	SM_SAVE_SRC, %o0
291225cf1a30Sjl139090	mov	SM_SAVE_DST, %o1
291325cf1a30Sjl139090	mov	SM_SAVE_COUNT, %o2
291425cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
291525cf1a30Sjl139090	tst	%o3
291625cf1a30Sjl139090	bz,pt	%ncc, 3f			! if not, return error
291725cf1a30Sjl139090	  nop
291825cf1a30Sjl139090	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
291925cf1a30Sjl139090	jmp	%o5				! original arguments
292025cf1a30Sjl139090	  nop
	/* no copyops handler: the value placed in %o0 is the constant -1, not an errno */
292125cf1a30Sjl1390903:
292225cf1a30Sjl139090	retl
292325cf1a30Sjl139090	  or	%g0, -1, %o0		! return errno value
292425cf1a30Sjl139090
292525cf1a30Sjl139090	SET_SIZE(copyin)
292625cf1a30Sjl139090
292725cf1a30Sjl139090
292825cf1a30Sjl139090/*
292925cf1a30Sjl139090 * The _more entry points are not intended to be used directly by
293025cf1a30Sjl139090 * any caller from outside this file.  They are provided to allow
293125cf1a30Sjl139090 * profiling and dtrace of the portions of the copy code that uses
293225cf1a30Sjl139090 * the floating point registers.
293325cf1a30Sjl139090 * This entry is particularly important as DTRACE (at least as of
293425cf1a30Sjl139090 * 4/2004) does not support leaf functions.
293525cf1a30Sjl139090 */
293625cf1a30Sjl139090
/*
 * copyin_more: large copyin using the floating point registers and block
 * stores.  The register aliases used here (SRC, REALSRC, DST, CNT, TMP,
 * SAVE_SRC, SAVE_DST, SAVE_COUNT, REAL_LOFAULT) and the FP_NOMIGRATE and
 * BST_FPQ2Q4_TOSTACK macros are defined outside this section.
 *
 * Outline:
 *   1. Open a register window with HWCOPYFRAMESIZE bytes of extra frame,
 *      set REAL_LOFAULT to .copyin_err, install copyio_fault in t_lofault
 *      (old value kept in %l6) and save the arguments in SAVE_*.
 *   2. Save %fprs and %gsr in the frame.  If FPRS_FEF was already set, run
 *      BST_FPQ2Q4_TOSTACK (presumably saving the FP registers this code
 *      overwrites -- confirm against the macro); otherwise just enable
 *      the FPU.
 *   3. Copy single bytes until DST is VIS_BLOCKSIZE aligned.
 *   4. Main loop: load 8-byte pieces from user space through %asi, realign
 *      them with alignaddr/faligndata into %f48-%f62, and write each
 *      VIS_BLOCKSIZE block with one ASI_BLK_P store.
 *   5. Flush the last assembled block and copy trailing bytes singly.
 */
293725cf1a30Sjl139090	ENTRY(copyin_more)
293825cf1a30Sjl139090.copyin_more:
293925cf1a30Sjl139090	prefetch [%o0], #n_reads
294025cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
294125cf1a30Sjl139090	set	.copyin_err, REAL_LOFAULT
294225cf1a30Sjl139090
294325cf1a30Sjl139090/*
294425cf1a30Sjl139090 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
294525cf1a30Sjl139090 */
294625cf1a30Sjl139090.do_copyin:
294725cf1a30Sjl139090	set	copyio_fault, %l7		! .copyio_fault is lofault val
294825cf1a30Sjl139090
294925cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
295025cf1a30Sjl139090	membar	#Sync				! sync error barrier
295125cf1a30Sjl139090	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
295225cf1a30Sjl139090
295325cf1a30Sjl139090	mov	%i0, SAVE_SRC
295425cf1a30Sjl139090	mov	%i1, SAVE_DST
295525cf1a30Sjl139090	mov	%i2, SAVE_COUNT
295625cf1a30Sjl139090
	/* NOTE(review): presumably pins the thread while FP state is live -- confirm against the macro */
295725cf1a30Sjl139090	FP_NOMIGRATE(6, 7)
295825cf1a30Sjl139090
295925cf1a30Sjl139090	rd	%fprs, %o2		! check for unused fp
296025cf1a30Sjl139090	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
296125cf1a30Sjl139090	btst	FPRS_FEF, %o2
	/*
	 * Annulled branch: the wr in the delay slot executes only when the
	 * branch is taken, i.e. when FPRS_FEF was clear.  In that case the
	 * FPU is enabled and the register save below is skipped.
	 */
296225cf1a30Sjl139090	bz,a,pt	%icc, .do_blockcopyin
296325cf1a30Sjl139090	  wr	%g0, FPRS_FEF, %fprs
296425cf1a30Sjl139090
296525cf1a30Sjl139090	BST_FPQ2Q4_TOSTACK(%o2)
296625cf1a30Sjl139090
296725cf1a30Sjl139090.do_blockcopyin:
296825cf1a30Sjl139090	rd	%gsr, %o2
296925cf1a30Sjl139090	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	/* tag the saved t_lofault value with FPUSED_FLAG; .copyin_exit strips it again */
297025cf1a30Sjl139090	or	%l6, FPUSED_FLAG, %l6
297125cf1a30Sjl139090
	/*
	 * TMP = DST & (VIS_BLOCKSIZE - 1).  If that is zero DST is already
	 * block aligned and the byte copy is skipped.  The neg in the delay
	 * slot runs on both outcomes, so on fall-through TMP becomes
	 * VIS_BLOCKSIZE - (DST & (VIS_BLOCKSIZE - 1)).
	 */
297225cf1a30Sjl139090	andcc	DST, VIS_BLOCKSIZE - 1, TMP
297325cf1a30Sjl139090	mov	ASI_USER, %asi
297425cf1a30Sjl139090	bz,pt	%ncc, 2f
297525cf1a30Sjl139090	  neg	TMP
297625cf1a30Sjl139090	add	TMP, VIS_BLOCKSIZE, TMP
297725cf1a30Sjl139090
297825cf1a30Sjl139090	! TMP = bytes required to align DST on FP_BLOCK boundary
297925cf1a30Sjl139090	! Using SRC as a tmp here
298025cf1a30Sjl139090	cmp	TMP, 3
298125cf1a30Sjl139090	bleu,pt	%ncc, 1f
298225cf1a30Sjl139090	  sub	CNT,TMP,CNT		! adjust main count
298325cf1a30Sjl139090	sub	TMP, 3, TMP		! adjust for end of loop test
298425cf1a30Sjl139090.ci_blkalign:
298525cf1a30Sjl139090	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
298625cf1a30Sjl139090	stb	SRC, [DST]
298725cf1a30Sjl139090	subcc	TMP, 4, TMP
298825cf1a30Sjl139090	lduba	[REALSRC + 1]%asi, SRC
298925cf1a30Sjl139090	add	REALSRC, 4, REALSRC
299025cf1a30Sjl139090	stb	SRC, [DST + 1]
299125cf1a30Sjl139090	lduba	[REALSRC - 2]%asi, SRC
299225cf1a30Sjl139090	add	DST, 4, DST
299325cf1a30Sjl139090	stb	SRC, [DST - 2]
299425cf1a30Sjl139090	lduba	[REALSRC - 1]%asi, SRC
299525cf1a30Sjl139090	bgu,pt	%ncc, .ci_blkalign
299625cf1a30Sjl139090	  stb	SRC, [DST - 1]
299725cf1a30Sjl139090
299825cf1a30Sjl139090	addcc	TMP, 3, TMP		! restore count adjustment
299925cf1a30Sjl139090	bz,pt	%ncc, 2f		! no bytes left?
300025cf1a30Sjl139090	  nop
	/* copy the last 1 to 3 alignment bytes one at a time */
300125cf1a30Sjl1390901:	lduba	[REALSRC]%asi, SRC
300225cf1a30Sjl139090	inc	REALSRC
300325cf1a30Sjl139090	inc	DST
300425cf1a30Sjl139090	deccc	TMP
300525cf1a30Sjl139090	bgu	%ncc, 1b
300625cf1a30Sjl139090	  stb	SRC, [DST - 1]
300725cf1a30Sjl139090
	/*
	 * Prime the pipeline: SRC is REALSRC rounded down to 8 bytes and
	 * alignaddr records REALSRC's low bits for faligndata.  The first
	 * source block is loaded and partly realigned, and the first
	 * double word of the next block is already in %f16 when the loop
	 * is entered.
	 */
300825cf1a30Sjl1390902:
300925cf1a30Sjl139090	membar	#StoreLoad
301025cf1a30Sjl139090	andn	REALSRC, 0x7, SRC
301125cf1a30Sjl139090
301225cf1a30Sjl139090	! SRC - 8-byte aligned
301325cf1a30Sjl139090	! DST - 64-byte aligned
301425cf1a30Sjl139090	ldda	[SRC]%asi, %f16
301525cf1a30Sjl139090	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
301625cf1a30Sjl139090	alignaddr REALSRC, %g0, %g0
301725cf1a30Sjl139090	ldda	[SRC + 0x08]%asi, %f18
301825cf1a30Sjl139090	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
301925cf1a30Sjl139090	faligndata %f16, %f18, %f48
302025cf1a30Sjl139090	ldda	[SRC + 0x10]%asi, %f20
3021c8a722abSpm145316	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
302225cf1a30Sjl139090	faligndata %f18, %f20, %f50
302325cf1a30Sjl139090	ldda	[SRC + 0x18]%asi, %f22
302425cf1a30Sjl139090	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
302525cf1a30Sjl139090	faligndata %f20, %f22, %f52
302625cf1a30Sjl139090	ldda	[SRC + 0x20]%asi, %f24
3027c8a722abSpm145316	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
302825cf1a30Sjl139090	faligndata %f22, %f24, %f54
302925cf1a30Sjl139090	ldda	[SRC + 0x28]%asi, %f26
3030c8a722abSpm145316	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
303125cf1a30Sjl139090	faligndata %f24, %f26, %f56
303225cf1a30Sjl139090	ldda	[SRC + 0x30]%asi, %f28
3033c8a722abSpm145316	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
303425cf1a30Sjl139090	faligndata %f26, %f28, %f58
303525cf1a30Sjl139090	ldda	[SRC + 0x38]%asi, %f30
303625cf1a30Sjl139090	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
303725cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
303825cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
3039c8a722abSpm145316	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
304025cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
304125cf1a30Sjl139090	ba,pt	%ncc, 1f
3042c8a722abSpm145316	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
304325cf1a30Sjl139090	.align	32
	/*
	 * Main loop, one VIS_BLOCKSIZE block per pass: finish realigning the
	 * block already loaded (%f60, %f62), store %f48-%f62 with a single
	 * block store, then load and begin realigning the next block.  The
	 * loop continues while CNT > VIS_BLOCKSIZE + 8.
	 */
304425cf1a30Sjl1390901:
304525cf1a30Sjl139090	ldda	[SRC + 0x08]%asi, %f18
304625cf1a30Sjl139090	faligndata %f28, %f30, %f60
304725cf1a30Sjl139090	ldda	[SRC + 0x10]%asi, %f20
304825cf1a30Sjl139090	faligndata %f30, %f16, %f62
304925cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_P
305025cf1a30Sjl139090	ldda	[SRC + 0x18]%asi, %f22
305125cf1a30Sjl139090	faligndata %f16, %f18, %f48
305225cf1a30Sjl139090	ldda	[SRC + 0x20]%asi, %f24
305325cf1a30Sjl139090	faligndata %f18, %f20, %f50
305425cf1a30Sjl139090	ldda	[SRC + 0x28]%asi, %f26
305525cf1a30Sjl139090	faligndata %f20, %f22, %f52
305625cf1a30Sjl139090	ldda	[SRC + 0x30]%asi, %f28
305725cf1a30Sjl139090	faligndata %f22, %f24, %f54
305825cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
3059c8a722abSpm145316	ldda	[SRC + 0x38]%asi, %f30
3060c8a722abSpm145316	faligndata %f24, %f26, %f56
306125cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
3062c8a722abSpm145316	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3063c8a722abSpm145316	faligndata %f26, %f28, %f58
306425cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3065c8a722abSpm145316	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3066c8a722abSpm145316	add	SRC, VIS_BLOCKSIZE, SRC
3067c8a722abSpm145316	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
306825cf1a30Sjl139090	cmp	CNT, VIS_BLOCKSIZE + 8
306925cf1a30Sjl139090	bgu,pt	%ncc, 1b
3070c8a722abSpm145316	  prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
307125cf1a30Sjl139090
	/*
	 * Loop exit.  If exactly one block remains and REALSRC is 8-byte
	 * aligned, take the "2:" path, which stores the pending block and
	 * then moves the final block with plain register copies (fsrc1).
	 * Otherwise fall into the first "3:", which stores the pending block
	 * and leaves the remainder to the byte loop at "5:".
	 * Note the numeric label 3 is defined twice below; each "3f" binds
	 * to the next definition after it.
	 */
307225cf1a30Sjl139090	! only if REALSRC & 0x7 is 0
307325cf1a30Sjl139090	cmp	CNT, VIS_BLOCKSIZE
307425cf1a30Sjl139090	bne	%ncc, 3f
307525cf1a30Sjl139090	  andcc	REALSRC, 0x7, %g0
307625cf1a30Sjl139090	bz,pt	%ncc, 2f
307725cf1a30Sjl139090	  nop
307825cf1a30Sjl1390903:
307925cf1a30Sjl139090	faligndata %f28, %f30, %f60
308025cf1a30Sjl139090	faligndata %f30, %f16, %f62
308125cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_P
308225cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
308325cf1a30Sjl139090	ba,pt	%ncc, 3f
308425cf1a30Sjl139090	  nop
308525cf1a30Sjl1390902:
308625cf1a30Sjl139090	ldda	[SRC + 0x08]%asi, %f18
308725cf1a30Sjl139090	fsrc1	%f28, %f60
308825cf1a30Sjl139090	ldda	[SRC + 0x10]%asi, %f20
308925cf1a30Sjl139090	fsrc1	%f30, %f62
309025cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_P
309125cf1a30Sjl139090	ldda	[SRC + 0x18]%asi, %f22
309225cf1a30Sjl139090	fsrc1	%f16, %f48
309325cf1a30Sjl139090	ldda	[SRC + 0x20]%asi, %f24
309425cf1a30Sjl139090	fsrc1	%f18, %f50
309525cf1a30Sjl139090	ldda	[SRC + 0x28]%asi, %f26
309625cf1a30Sjl139090	fsrc1	%f20, %f52
309725cf1a30Sjl139090	ldda	[SRC + 0x30]%asi, %f28
309825cf1a30Sjl139090	fsrc1	%f22, %f54
309925cf1a30Sjl139090	ldda	[SRC + 0x38]%asi, %f30
310025cf1a30Sjl139090	fsrc1	%f24, %f56
310125cf1a30Sjl139090	sub	CNT, VIS_BLOCKSIZE, CNT
310225cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
310325cf1a30Sjl139090	add	SRC, VIS_BLOCKSIZE, SRC
310425cf1a30Sjl139090	add	REALSRC, VIS_BLOCKSIZE, REALSRC
310525cf1a30Sjl139090	fsrc1	%f26, %f58
310625cf1a30Sjl139090	fsrc1	%f28, %f60
310725cf1a30Sjl139090	fsrc1	%f30, %f62
310825cf1a30Sjl139090	stda	%f48, [DST]ASI_BLK_P
310925cf1a30Sjl139090	add	DST, VIS_BLOCKSIZE, DST
	/* ba with the annul bit set never executes its delay slot */
311025cf1a30Sjl139090	ba,a,pt	%ncc, 4f
311125cf1a30Sjl139090	  nop
311225cf1a30Sjl139090
311325cf1a30Sjl1390903:	tst	CNT
311425cf1a30Sjl139090	bz,a	%ncc, 4f
311525cf1a30Sjl139090	  nop
311625cf1a30Sjl139090
	/* trailing bytes, one at a time, until CNT reaches zero */
311725cf1a30Sjl1390905:	lduba	[REALSRC]ASI_USER, TMP
311825cf1a30Sjl139090	inc	REALSRC
311925cf1a30Sjl139090	inc	DST
312025cf1a30Sjl139090	deccc	CNT
312125cf1a30Sjl139090	bgu	%ncc, 5b
312225cf1a30Sjl139090	  stb	TMP, [DST - 1]
312325cf1a30Sjl1390904:
312425cf1a30Sjl139090
/*
 * Successful completion of the block copy.
 *
 * Reloads %gsr from the frame, then looks at the %fprs value saved on
 * entry.  If FPRS_FEF was set then, BLD_FPQ2Q4_FROMSTACK runs (presumably
 * reloading the registers saved by BST_FPQ2Q4_TOSTACK -- confirm against
 * the macro).  Otherwise FZEROQ2Q4 runs (presumably clearing the FP
 * registers this code used -- confirm against the macro).  Either way the
 * saved %fprs is written back, t_lofault is restored from %l6 with
 * FPUSED_FLAG cleared, and the routine returns 0 in the caller's %o0.
 */
312525cf1a30Sjl139090.copyin_exit:
312625cf1a30Sjl139090	membar	#Sync
312725cf1a30Sjl139090
312825cf1a30Sjl139090	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
312925cf1a30Sjl139090	wr	%o2, 0, %gsr
313025cf1a30Sjl139090
313125cf1a30Sjl139090	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
313225cf1a30Sjl139090	btst	FPRS_FEF, %o3
313325cf1a30Sjl139090	bz,pt	%icc, 4f
313425cf1a30Sjl139090	  nop
313525cf1a30Sjl139090
313625cf1a30Sjl139090	BLD_FPQ2Q4_FROMSTACK(%o2)
313725cf1a30Sjl139090
313825cf1a30Sjl139090	ba,pt	%ncc, 1f
313925cf1a30Sjl139090	  wr	%o3, 0, %fprs		! restore fprs
314025cf1a30Sjl139090
314125cf1a30Sjl1390904:
314225cf1a30Sjl139090	FZEROQ2Q4
314325cf1a30Sjl139090	wr	%o3, 0, %fprs		! restore fprs
314425cf1a30Sjl139090
314525cf1a30Sjl1390901:
314625cf1a30Sjl139090	membar	#Sync				! sync error barrier
314725cf1a30Sjl139090	andn	%l6, FPUSED_FLAG, %l6
314825cf1a30Sjl139090	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	/* NOTE(review): presumably the counterpart of FP_NOMIGRATE at entry -- confirm against the macro */
314925cf1a30Sjl139090	FP_ALLOWMIGRATE(5, 6)
315025cf1a30Sjl139090	ret
315125cf1a30Sjl139090	  restore	%g0, 0, %o0
315225cf1a30Sjl139090/*
315325cf1a30Sjl139090 * We got here because of a fault during copyin
315425cf1a30Sjl139090 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
315525cf1a30Sjl139090 */
/*
 * This is the address copyin_more places in REAL_LOFAULT.  How control
 * gets here from copyio_fault is not visible in this section.
 *
 * If the thread's T_COPYOPS pointer is non-NULL, jump to its CP_COPYIN
 * entry while popping this routine's register window; otherwise return -1.
 * The instructions after each branch, jmp and ret below are delay slots
 * even though they are not indented.
 */
315625cf1a30Sjl139090.copyin_err:
315725cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
315825cf1a30Sjl139090	tst	%o4
315925cf1a30Sjl139090	bz,pt	%ncc, 2f			! if not, return error
316025cf1a30Sjl139090	nop
	/* the target goes in a global register so it is still valid after the restore in the delay slot */
316125cf1a30Sjl139090	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
316225cf1a30Sjl139090	jmp	%g2				! original arguments
316325cf1a30Sjl139090	restore %g0, 0, %g0			! dispose of copy window
316425cf1a30Sjl1390902:
316525cf1a30Sjl139090	ret
316625cf1a30Sjl139090	restore %g0, -1, %o0			! return error value
316725cf1a30Sjl139090
316825cf1a30Sjl139090
316925cf1a30Sjl139090	SET_SIZE(copyin_more)
317025cf1a30Sjl139090
317125cf1a30Sjl139090#endif	/* lint */
317225cf1a30Sjl139090
317325cf1a30Sjl139090#ifdef	lint
317425cf1a30Sjl139090
/*
 * Lint-only stub that gives xcopyin a C signature.  The routine itself is
 * the assembly in the non-lint branch that follows.
 */
317525cf1a30Sjl139090/*ARGSUSED*/
317625cf1a30Sjl139090int
317725cf1a30Sjl139090xcopyin(const void *uaddr, void *kaddr, size_t count)
317825cf1a30Sjl139090{ return (0); }
317925cf1a30Sjl139090
318025cf1a30Sjl139090#else	/* lint */
318125cf1a30Sjl139090
/*
 * xcopyin(uaddr = %o0, kaddr = %o1, count = %o2)
 *
 * This block is only the entry dispatch and the fault exits; the copy
 * itself is done by .sm_do_copyin (leaf path) or .do_copyin (windowed
 * path), both defined outside this block.
 *
 * The leaf path is taken when count is at most VIS_COPY_THRESHOLD, when
 * the hw_copy_limit_N chosen from the relative alignment of uaddr and
 * kaddr is zero, or when count is at most that limit.  Otherwise a
 * register window is obtained and .do_copyin runs with .xcopyin_err as
 * its lofault target.
 *
 * On a fault, both error exits pass the original arguments to the
 * thread's CP_XCOPYIN copyops handler when T_COPYOPS is non-NULL, and
 * otherwise return the errno value.
 */
318225cf1a30Sjl139090	ENTRY(xcopyin)
318325cf1a30Sjl139090
318425cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
318525cf1a30Sjl139090	bleu,pt	%ncc, .xcopyin_small		! small count: use leaf copy
318625cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
318725cf1a30Sjl139090	btst	7, %o3				!
318825cf1a30Sjl139090	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
318925cf1a30Sjl139090	  nop
319025cf1a30Sjl139090	btst	1, %o3				!
319125cf1a30Sjl139090	bz,pt	%ncc, .xcopyin_2		! check for half-word
319225cf1a30Sjl139090	  nop
	! src and dst differ in bit 0: only byte alignment is possible
319325cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
319425cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
319525cf1a30Sjl139090	tst	%o3
319625cf1a30Sjl139090	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
319725cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
319825cf1a30Sjl139090	bleu,pt	%ncc, .xcopyin_small		! go to small copy
319925cf1a30Sjl139090	  nop
320025cf1a30Sjl139090	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
320125cf1a30Sjl139090	  nop
320225cf1a30Sjl139090.xcopyin_2:
320325cf1a30Sjl139090	btst	3, %o3				!
320425cf1a30Sjl139090	bz,pt	%ncc, .xcopyin_4		! check for word alignment
320525cf1a30Sjl139090	  nop
320625cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
320725cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
320825cf1a30Sjl139090	tst	%o3
320925cf1a30Sjl139090	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
321025cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
321125cf1a30Sjl139090	bleu,pt	%ncc, .xcopyin_small		! go to small copy
321225cf1a30Sjl139090	  nop
321325cf1a30Sjl139090	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
321425cf1a30Sjl139090	  nop
321525cf1a30Sjl139090.xcopyin_4:
321625cf1a30Sjl139090	! already checked longword, must be word aligned
321725cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
321825cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
321925cf1a30Sjl139090	tst	%o3
322025cf1a30Sjl139090	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
322125cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
322225cf1a30Sjl139090	bleu,pt	%ncc, .xcopyin_small		! go to small copy
322325cf1a30Sjl139090	  nop
322425cf1a30Sjl139090	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
322525cf1a30Sjl139090	  nop
322625cf1a30Sjl139090.xcopyin_8:
322725cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
322825cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
322925cf1a30Sjl139090	tst	%o3
323025cf1a30Sjl139090	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
323125cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
323225cf1a30Sjl139090	bleu,pt	%ncc, .xcopyin_small		! go to small copy
323325cf1a30Sjl139090	  nop
323425cf1a30Sjl139090	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
323525cf1a30Sjl139090	  nop
323625cf1a30Sjl139090
	! Leaf path: no register window; old t_lofault is kept in %o4.
323725cf1a30Sjl139090.xcopyin_small:
323825cf1a30Sjl139090	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
323925cf1a30Sjl139090	or	%o5, %lo(.sm_xcopyin_err), %o5
324025cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4	! save old t_lofault in %o4
324125cf1a30Sjl139090	membar	#Sync				! sync error barrier
324225cf1a30Sjl139090	ba,pt	%ncc, .sm_do_copyin		! common code
324325cf1a30Sjl139090	  stn	%o5, [THREAD_REG + T_LOFAULT]	! delay slot: set new t_lofault
324425cf1a30Sjl139090
	! Windowed path: .do_copyin is entered with REAL_LOFAULT set to
	! the address of .xcopyin_err.
324525cf1a30Sjl139090.xcopyin_more:
324625cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
324725cf1a30Sjl139090	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
324825cf1a30Sjl139090	ba,pt	%ncc, .do_copyin
324925cf1a30Sjl139090	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
325025cf1a30Sjl139090
325125cf1a30Sjl139090/*
325225cf1a30Sjl139090 * We got here because of a fault during the windowed xcopyin path.
325325cf1a30Sjl139090 * Errno value is in ERRNO
325425cf1a30Sjl139090 */
325525cf1a30Sjl139090.xcopyin_err:
325625cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
325725cf1a30Sjl139090	tst	%o4
325825cf1a30Sjl139090	bz,pt	%ncc, 2f			! if not, return error
325925cf1a30Sjl139090	  nop
326025cf1a30Sjl139090	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
326125cf1a30Sjl139090	jmp	%g2				! original arguments
326225cf1a30Sjl139090	  restore %g0, 0, %g0			! dispose of copy window
326325cf1a30Sjl1390902:
326425cf1a30Sjl139090        ret
326525cf1a30Sjl139090	  restore ERRNO, 0, %o0			! return errno value
326625cf1a30Sjl139090
	! Fault on the leaf path.  The arguments are reloaded from the
	! SM_SAVE_* registers before the copyops handler is tried.
326725cf1a30Sjl139090.sm_xcopyin_err:
326825cf1a30Sjl139090
326925cf1a30Sjl139090	membar	#Sync
327025cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
327125cf1a30Sjl139090	mov	SM_SAVE_SRC, %o0
327225cf1a30Sjl139090	mov	SM_SAVE_DST, %o1
327325cf1a30Sjl139090	mov	SM_SAVE_COUNT, %o2
327425cf1a30Sjl139090	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
327525cf1a30Sjl139090	tst	%o3
327625cf1a30Sjl139090	bz,pt	%ncc, 3f			! if not, return error
327725cf1a30Sjl139090	  nop
327825cf1a30Sjl139090	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
327925cf1a30Sjl139090	jmp	%o5				! original arguments
328025cf1a30Sjl139090	  nop
328125cf1a30Sjl1390903:
328225cf1a30Sjl139090	retl
328325cf1a30Sjl139090	  or	%g1, 0, %o0		! return errno value
328425cf1a30Sjl139090
328525cf1a30Sjl139090	SET_SIZE(xcopyin)
328625cf1a30Sjl139090
328725cf1a30Sjl139090#endif	/* lint */
328825cf1a30Sjl139090
328925cf1a30Sjl139090#ifdef	lint
329025cf1a30Sjl139090
/* Lint-only stub; the real routine is the assembly in the #else arm. */
329125cf1a30Sjl139090/*ARGSUSED*/
329225cf1a30Sjl139090int
329325cf1a30Sjl139090xcopyin_little(const void *uaddr, void *kaddr, size_t count)
329425cf1a30Sjl139090{ return (0); }
329525cf1a30Sjl139090
329625cf1a30Sjl139090#else	/* lint */
329725cf1a30Sjl139090
/*
 * xcopyin_little(uaddr = %o0, kaddr = %o1, count = %o2)
 *
 * Byte-at-a-time copy from uaddr (read with lduba through ASI_AIUSL)
 * to kaddr.  The source is walked from its last byte down to its first
 * while the destination is filled from its first byte up, so the bytes
 * arrive in reversed order.
 *
 * Returns 0 on success.  A fault vectors to .xcopyio_err, which returns
 * the value left in %g1.  t_lofault is saved on entry (kept in %o5) and
 * restored on every exit path.
 */
329825cf1a30Sjl139090	ENTRY(xcopyin_little)
329925cf1a30Sjl139090	sethi	%hi(.xcopyio_err), %o5
330025cf1a30Sjl139090	or	%o5, %lo(.xcopyio_err), %o5
330125cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4
330225cf1a30Sjl139090	membar	#Sync				! sync error barrier
330325cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]
330425cf1a30Sjl139090	mov	%o4, %o5		! keep old t_lofault; %o4 is scratch below
330525cf1a30Sjl139090
330625cf1a30Sjl139090	subcc	%g0, %o2, %o3		! %o3 = -count, counts up to zero
330725cf1a30Sjl139090	add	%o0, %o2, %o0
330825cf1a30Sjl139090	bz,pn	%ncc, 2f		! check for zero bytes
330925cf1a30Sjl139090	  sub	%o2, 1, %o4
331025cf1a30Sjl139090	add	%o0, %o4, %o0		! start w/last byte
331125cf1a30Sjl139090	add	%o1, %o2, %o1		! %o1 + %o3 now addresses kaddr[0]
331225cf1a30Sjl139090	lduba	[%o0 + %o3]ASI_AIUSL, %o4
331325cf1a30Sjl139090
	! Each pass stores one byte, then moves the source address back by
	! one: %o3 goes up by 1 while %o0 goes down by 2.  inccc sets carry
	! when %o3 reaches zero, which ends the loop; the annulled delay-slot
	! load only executes when the branch is taken.
331425cf1a30Sjl1390901:	stb	%o4, [%o1 + %o3]
331525cf1a30Sjl139090	inccc	%o3
331625cf1a30Sjl139090	sub	%o0, 2, %o0		! get next byte
331725cf1a30Sjl139090	bcc,a,pt %ncc, 1b
331825cf1a30Sjl139090	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
331925cf1a30Sjl139090
332025cf1a30Sjl1390902:
332125cf1a30Sjl139090	membar	#Sync				! sync error barrier
332225cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
332325cf1a30Sjl139090	retl
332425cf1a30Sjl139090	  mov	%g0, %o0		! return (0)
332525cf1a30Sjl139090
	! Fault exit: undo the t_lofault change and return %g1.
332625cf1a30Sjl139090.xcopyio_err:
332725cf1a30Sjl139090	membar	#Sync				! sync error barrier
332825cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
332925cf1a30Sjl139090	retl
333025cf1a30Sjl139090	  mov	%g1, %o0		! return value left in %g1
333125cf1a30Sjl139090
333225cf1a30Sjl139090	SET_SIZE(xcopyin_little)
333325cf1a30Sjl139090
333425cf1a30Sjl139090#endif	/* lint */
333525cf1a30Sjl139090
333625cf1a30Sjl139090
333725cf1a30Sjl139090/*
333825cf1a30Sjl139090 * Copy a block of storage - must not overlap (from + len <= to).
333925cf1a30Sjl139090 * No fault handler installed (to be called under on_fault())
334025cf1a30Sjl139090 */
334125cf1a30Sjl139090#if defined(lint)
334225cf1a30Sjl139090
/* Lint-only stub; the real routine is the assembly in the #else arm. */
334325cf1a30Sjl139090/* ARGSUSED */
334425cf1a30Sjl139090void
334525cf1a30Sjl139090copyin_noerr(const void *ufrom, void *kto, size_t count)
334625cf1a30Sjl139090{}
334725cf1a30Sjl139090
334825cf1a30Sjl139090#else	/* lint */
/*
 * copyin_noerr(ufrom = %o0, kto = %o1, count = %o2)
 *
 * Entry dispatch only; the copy is done by .sm_do_copyin (leaf path) or
 * .do_copyin (windowed path), both defined outside this block.  The
 * leaf path is taken when count is at most VIS_COPY_THRESHOLD, when the
 * hw_copy_limit_N chosen from the relative alignment of the two
 * addresses is zero, or when count is at most that limit.
 *
 * No error value is returned.  On the leaf path, a handler of our own
 * (.sm_copyio_noerr) is installed only if t_lofault was already
 * non-zero; on a fault it puts the previous t_lofault back and jumps to
 * it.  The windowed path uses .copyio_noerr, which jumps through %l6.
 */
334925cf1a30Sjl139090	ENTRY(copyin_noerr)
335025cf1a30Sjl139090
335125cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
335225cf1a30Sjl139090	bleu,pt	%ncc, .copyin_ne_small		! small count: use leaf copy
335325cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
335425cf1a30Sjl139090	btst	7, %o3				!
335525cf1a30Sjl139090	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
335625cf1a30Sjl139090	  nop
335725cf1a30Sjl139090	btst	1, %o3				!
335825cf1a30Sjl139090	bz,pt	%ncc, .copyin_ne_2		! check for half-word
335925cf1a30Sjl139090	  nop
	! src and dst differ in bit 0: only byte alignment is possible
336025cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
336125cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
336225cf1a30Sjl139090	tst	%o3
336325cf1a30Sjl139090	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
336425cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
336525cf1a30Sjl139090	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
336625cf1a30Sjl139090	  nop
336725cf1a30Sjl139090	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
336825cf1a30Sjl139090	  nop
336925cf1a30Sjl139090.copyin_ne_2:
337025cf1a30Sjl139090	btst	3, %o3				!
337125cf1a30Sjl139090	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
337225cf1a30Sjl139090	  nop
337325cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
337425cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
337525cf1a30Sjl139090	tst	%o3
337625cf1a30Sjl139090	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
337725cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
337825cf1a30Sjl139090	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
337925cf1a30Sjl139090	  nop
338025cf1a30Sjl139090	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
338125cf1a30Sjl139090	  nop
338225cf1a30Sjl139090.copyin_ne_4:
338325cf1a30Sjl139090	! already checked longword, must be word aligned
338425cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
338525cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
338625cf1a30Sjl139090	tst	%o3
338725cf1a30Sjl139090	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
338825cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
338925cf1a30Sjl139090	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
339025cf1a30Sjl139090	  nop
339125cf1a30Sjl139090	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
339225cf1a30Sjl139090	  nop
339325cf1a30Sjl139090.copyin_ne_8:
339425cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
339525cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
339625cf1a30Sjl139090	tst	%o3
339725cf1a30Sjl139090	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
339825cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
339925cf1a30Sjl139090	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
340025cf1a30Sjl139090	  nop
340125cf1a30Sjl139090	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
340225cf1a30Sjl139090	  nop
340325cf1a30Sjl139090
	! Leaf path.  If t_lofault is zero the copy runs with it left at
	! zero; otherwise the old value stays in %o4 and .sm_copyio_noerr
	! is installed in its place.
340425cf1a30Sjl139090.copyin_ne_small:
340525cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4
340625cf1a30Sjl139090	tst	%o4
340725cf1a30Sjl139090	bz,pn	%ncc, .sm_do_copyin
340825cf1a30Sjl139090	  nop
340925cf1a30Sjl139090	sethi	%hi(.sm_copyio_noerr), %o5
341025cf1a30Sjl139090	or	%o5, %lo(.sm_copyio_noerr), %o5
341125cf1a30Sjl139090	membar	#Sync				! sync error barrier
341225cf1a30Sjl139090	ba,pt	%ncc, .sm_do_copyin
341325cf1a30Sjl139090	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
341425cf1a30Sjl139090
341525cf1a30Sjl139090.copyin_noerr_more:
341625cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
341725cf1a30Sjl139090	sethi	%hi(.copyio_noerr), REAL_LOFAULT
341825cf1a30Sjl139090	ba,pt	%ncc, .do_copyin
341925cf1a30Sjl139090	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
342025cf1a30Sjl139090
	! Fault on the windowed path (also used by copyout_noerr).
	! NOTE(review): %l6 is presumably the caller's saved t_lofault set up
	! by the common .do_copyin/.do_copyout code -- confirm there.
342125cf1a30Sjl139090.copyio_noerr:
342225cf1a30Sjl139090	jmp	%l6
342325cf1a30Sjl139090	  restore %g0,0,%g0
342425cf1a30Sjl139090
	! Fault on the leaf path (also used by copyout_noerr): put the
	! previous handler back and transfer control to it.
342525cf1a30Sjl139090.sm_copyio_noerr:
342625cf1a30Sjl139090	membar	#Sync
342725cf1a30Sjl139090	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
342825cf1a30Sjl139090	jmp	%o4
342925cf1a30Sjl139090	  nop
343025cf1a30Sjl139090
343125cf1a30Sjl139090	SET_SIZE(copyin_noerr)
343225cf1a30Sjl139090#endif /* lint */
343325cf1a30Sjl139090
343425cf1a30Sjl139090/*
343525cf1a30Sjl139090 * Copy a block of storage - must not overlap (from + len <= to).
343625cf1a30Sjl139090 * No fault handler installed (to be called under on_fault())
343725cf1a30Sjl139090 */
343825cf1a30Sjl139090
343925cf1a30Sjl139090#if defined(lint)
344025cf1a30Sjl139090
/* Lint-only stub; the real routine is the assembly in the #else arm. */
344125cf1a30Sjl139090/* ARGSUSED */
344225cf1a30Sjl139090void
344325cf1a30Sjl139090copyout_noerr(const void *kfrom, void *uto, size_t count)
344425cf1a30Sjl139090{}
344525cf1a30Sjl139090
344625cf1a30Sjl139090#else	/* lint */
/*
 * copyout_noerr(kfrom = %o0, uto = %o1, count = %o2)
 *
 * Entry dispatch only; the copy is done by .sm_do_copyout (leaf path)
 * or .do_copyout (windowed path), both defined outside this block.  The
 * leaf path is taken when count is at most VIS_COPY_THRESHOLD, when the
 * hw_copy_limit_N chosen from the relative alignment of the two
 * addresses is zero, or when count is at most that limit.
 *
 * No error value is returned.  The fault targets .sm_copyio_noerr and
 * .copyio_noerr are the labels defined in copyin_noerr.
 */
344725cf1a30Sjl139090	ENTRY(copyout_noerr)
344825cf1a30Sjl139090
344925cf1a30Sjl139090	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
345025cf1a30Sjl139090	bleu,pt	%ncc, .copyout_ne_small		! small count: use leaf copy
345125cf1a30Sjl139090	  xor	%o0, %o1, %o3			! are src, dst alignable?
345225cf1a30Sjl139090	btst	7, %o3				!
345325cf1a30Sjl139090	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
345425cf1a30Sjl139090	  nop
345525cf1a30Sjl139090	btst	1, %o3				!
345625cf1a30Sjl139090	bz,pt	%ncc, .copyout_ne_2		! check for half-word
345725cf1a30Sjl139090	  nop
	! src and dst differ in bit 0: only byte alignment is possible
345825cf1a30Sjl139090	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
345925cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
346025cf1a30Sjl139090	tst	%o3
346125cf1a30Sjl139090	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
346225cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
346325cf1a30Sjl139090	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
346425cf1a30Sjl139090	  nop
346525cf1a30Sjl139090	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
346625cf1a30Sjl139090	  nop
346725cf1a30Sjl139090.copyout_ne_2:
346825cf1a30Sjl139090	btst	3, %o3				!
346925cf1a30Sjl139090	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
347025cf1a30Sjl139090	  nop
347125cf1a30Sjl139090	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
347225cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
347325cf1a30Sjl139090	tst	%o3
347425cf1a30Sjl139090	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
347525cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
347625cf1a30Sjl139090	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
347725cf1a30Sjl139090	  nop
347825cf1a30Sjl139090	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
347925cf1a30Sjl139090	  nop
348025cf1a30Sjl139090.copyout_ne_4:
348125cf1a30Sjl139090	! already checked longword, must be word aligned
348225cf1a30Sjl139090	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
348325cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
348425cf1a30Sjl139090	tst	%o3
348525cf1a30Sjl139090	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
348625cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
348725cf1a30Sjl139090	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
348825cf1a30Sjl139090	  nop
348925cf1a30Sjl139090	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
349025cf1a30Sjl139090	  nop
349125cf1a30Sjl139090.copyout_ne_8:
349225cf1a30Sjl139090	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
349325cf1a30Sjl139090	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
349425cf1a30Sjl139090	tst	%o3
349525cf1a30Sjl139090	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
349625cf1a30Sjl139090	  cmp	%o2, %o3			! if length <= limit
349725cf1a30Sjl139090	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
349825cf1a30Sjl139090	  nop
349925cf1a30Sjl139090	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
350025cf1a30Sjl139090	  nop
350125cf1a30Sjl139090
	! Leaf path.  If t_lofault is zero the copy runs with it left at
	! zero; otherwise the old value stays in %o4 and .sm_copyio_noerr
	! is installed in its place.
350225cf1a30Sjl139090.copyout_ne_small:
350325cf1a30Sjl139090	ldn	[THREAD_REG + T_LOFAULT], %o4
350425cf1a30Sjl139090	tst	%o4
350525cf1a30Sjl139090	bz,pn	%ncc, .sm_do_copyout
350625cf1a30Sjl139090	  nop
350725cf1a30Sjl139090	sethi	%hi(.sm_copyio_noerr), %o5
350825cf1a30Sjl139090	or	%o5, %lo(.sm_copyio_noerr), %o5
350925cf1a30Sjl139090	membar	#Sync				! sync error barrier
351025cf1a30Sjl139090	ba,pt	%ncc, .sm_do_copyout
	! The next instruction is the delay slot of the ba above, even though
	! it is not indented like the other delay slots in this file.
351125cf1a30Sjl139090	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
351225cf1a30Sjl139090
351325cf1a30Sjl139090.copyout_noerr_more:
351425cf1a30Sjl139090	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
351525cf1a30Sjl139090	sethi	%hi(.copyio_noerr), REAL_LOFAULT
351625cf1a30Sjl139090	ba,pt	%ncc, .do_copyout
351725cf1a30Sjl139090	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
351825cf1a30Sjl139090
351925cf1a30Sjl139090	SET_SIZE(copyout_noerr)
352025cf1a30Sjl139090#endif /* lint */
352125cf1a30Sjl139090
352225cf1a30Sjl139090
352325cf1a30Sjl139090/*
352425cf1a30Sjl139090 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
352525cf1a30Sjl139090 * at least 256 bytes in length using spitfire's block stores.  If
352625cf1a30Sjl139090 * the criteria for using this routine are not met then it calls bzero
352725cf1a30Sjl139090 * and returns 1.  Otherwise 0 is returned indicating success.
352825cf1a30Sjl139090 * Caller is responsible for ensuring use_hw_bzero is true and that
352925cf1a30Sjl139090 * kpreempt_disable() has been called.
353025cf1a30Sjl139090 */
353125cf1a30Sjl139090#ifdef lint
/* Lint-only stub; the real routine is the assembly in the #else arm. */
353225cf1a30Sjl139090/*ARGSUSED*/
353325cf1a30Sjl139090int
353425cf1a30Sjl139090hwblkclr(void *addr, size_t len)
353525cf1a30Sjl139090{
353625cf1a30Sjl139090	return(0);
353725cf1a30Sjl139090}
353825cf1a30Sjl139090#else /* lint */
353925cf1a30Sjl139090	! %i0 - start address
354025cf1a30Sjl139090	! %i1 - length of region (multiple of 64)
354125cf1a30Sjl139090	! %l0 - saved fprs
354225cf1a30Sjl139090	! %l1 - pointer to saved %d0 block
354325cf1a30Sjl139090	! %l2 - saved curthread->t_lwp
	!       (%l2 is not referenced anywhere in this routine)
	! %i3 - number of bytes cleared by the current pass
354425cf1a30Sjl139090
	/*
	 * Flow: validate the arguments (falling back to bzero and
	 * returning 1 if they do not qualify), save the %d0 block if the
	 * FPU is in use, zero %d0-%d14, clear 256 bytes per pass with
	 * four block stores, clear any remaining 64-byte blocks by
	 * jumping part-way into the same store sequence, then restore
	 * the FP state and return 0.
	 */
354525cf1a30Sjl139090	ENTRY(hwblkclr)
354625cf1a30Sjl139090	! get another window w/space for one aligned block of saved fpregs
354725cf1a30Sjl139090	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
354825cf1a30Sjl139090
354925cf1a30Sjl139090	! Must be block-aligned
355025cf1a30Sjl139090	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
355125cf1a30Sjl139090	bnz,pn	%ncc, 1f
355225cf1a30Sjl139090	  nop
355325cf1a30Sjl139090
355425cf1a30Sjl139090	! ... and must be 256 bytes or more
355525cf1a30Sjl139090	cmp	%i1, 256
355625cf1a30Sjl139090	blu,pn	%ncc, 1f
355725cf1a30Sjl139090	  nop
355825cf1a30Sjl139090
355925cf1a30Sjl139090	! ... and length must be a multiple of VIS_BLOCKSIZE
356025cf1a30Sjl139090	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
356125cf1a30Sjl139090	bz,pn	%ncc, 2f
356225cf1a30Sjl139090	  nop
356325cf1a30Sjl139090
356425cf1a30Sjl1390901:	! punt, call bzero but notify the caller that bzero was used
356525cf1a30Sjl139090	mov	%i0, %o0
356625cf1a30Sjl139090	call	bzero
356725cf1a30Sjl139090	mov	%i1, %o1		! delay slot: second bzero argument
356825cf1a30Sjl139090	ret
356925cf1a30Sjl139090	  restore	%g0, 1, %o0 ! return (1) - did not use block operations
357025cf1a30Sjl139090
357125cf1a30Sjl1390902:	rd	%fprs, %l0		! check for unused fp
357225cf1a30Sjl139090	btst	FPRS_FEF, %l0
357325cf1a30Sjl139090	bz,pt	%icc, 1f		! fp not enabled: nothing to save
357425cf1a30Sjl139090	  nop
357525cf1a30Sjl139090
357625cf1a30Sjl139090	! save in-use fpregs on stack
	! %l1 is rounded down to a VIS_BLOCKSIZE boundary just below %fp.
357725cf1a30Sjl139090	membar	#Sync
357825cf1a30Sjl139090	add	%fp, STACK_BIAS - 65, %l1
357925cf1a30Sjl139090	and	%l1, -VIS_BLOCKSIZE, %l1
358025cf1a30Sjl139090	stda	%d0, [%l1]ASI_BLK_P
358125cf1a30Sjl139090
358225cf1a30Sjl1390901:	membar	#StoreStore|#StoreLoad|#LoadStore
358325cf1a30Sjl139090	wr	%g0, FPRS_FEF, %fprs
358425cf1a30Sjl139090	wr	%g0, ASI_BLK_P, %asi
358525cf1a30Sjl139090
358625cf1a30Sjl139090	! Clear block
358725cf1a30Sjl139090	fzero	%d0
358825cf1a30Sjl139090	fzero	%d2
358925cf1a30Sjl139090	fzero	%d4
359025cf1a30Sjl139090	fzero	%d6
359125cf1a30Sjl139090	fzero	%d8
359225cf1a30Sjl139090	fzero	%d10
359325cf1a30Sjl139090	fzero	%d12
359425cf1a30Sjl139090	fzero	%d14
359525cf1a30Sjl139090
359625cf1a30Sjl139090	mov	256, %i3		! full passes advance by 256 bytes
359725cf1a30Sjl139090	ba,pt	%ncc, .pz_doblock
359825cf1a30Sjl139090	  nop
359925cf1a30Sjl139090
	! The layout of the three stores below and .pz_zinst is relied on by
	! the computed jump further down; do not insert instructions here.
360025cf1a30Sjl139090.pz_blkstart:
360125cf1a30Sjl139090      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
360225cf1a30Sjl139090	stda	%d0, [%i0 + 128]%asi
360325cf1a30Sjl139090	stda	%d0, [%i0 + 64]%asi
360425cf1a30Sjl139090	stda	%d0, [%i0]%asi
360525cf1a30Sjl139090.pz_zinst:
360625cf1a30Sjl139090	add	%i0, %i3, %i0		! advance past what was just cleared
360725cf1a30Sjl139090	sub	%i1, %i3, %i1
360825cf1a30Sjl139090.pz_doblock:
360925cf1a30Sjl139090	cmp	%i1, 256
361025cf1a30Sjl139090	bgeu,a	%ncc, .pz_blkstart
361125cf1a30Sjl139090	  stda	%d0, [%i0 + 192]%asi	! annulled unless the branch is taken
361225cf1a30Sjl139090
361325cf1a30Sjl139090	cmp	%i1, 64
361425cf1a30Sjl139090	blu	%ncc, .pz_finish
361525cf1a30Sjl139090
	! The andn is the delay slot of the blu above and always executes.
	! Fewer than 256 bytes remain: %i3 = remaining bytes in whole
	! 64-byte blocks.  Each stda is one 4-byte instruction clearing 64
	! bytes, so backing up %i3/16 bytes from .pz_zinst runs exactly
	! %i3/64 of the stores above before falling into .pz_zinst.
361625cf1a30Sjl139090	  andn	%i1, (64-1), %i3
361725cf1a30Sjl139090	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
361825cf1a30Sjl139090	set	.pz_zinst, %i4
361925cf1a30Sjl139090	sub	%i4, %i2, %i4
362025cf1a30Sjl139090	jmp	%i4
362125cf1a30Sjl139090	  nop
362225cf1a30Sjl139090
362325cf1a30Sjl139090.pz_finish:
362425cf1a30Sjl139090	membar	#Sync
362525cf1a30Sjl139090	btst	FPRS_FEF, %l0
362625cf1a30Sjl139090	bz,a	.pz_finished		! fp was not in use: no regs to reload
362725cf1a30Sjl139090	  wr	%l0, 0, %fprs		! restore fprs
362825cf1a30Sjl139090
362925cf1a30Sjl139090	! restore fpregs from stack
363025cf1a30Sjl139090	ldda	[%l1]ASI_BLK_P, %d0
363125cf1a30Sjl139090	membar	#Sync
363225cf1a30Sjl139090	wr	%l0, 0, %fprs		! restore fprs
363325cf1a30Sjl139090
363425cf1a30Sjl139090.pz_finished:
363525cf1a30Sjl139090	ret
363625cf1a30Sjl139090	  restore	%g0, 0, %o0		! return (0) - block stores were used
363725cf1a30Sjl139090
363825cf1a30Sjl139090	SET_SIZE(hwblkclr)
363925cf1a30Sjl139090#endif	/* lint */
364025cf1a30Sjl139090
364125cf1a30Sjl139090#ifdef lint
/* Lint-only stub; the real routine is the assembly in the #else arm. */
364225cf1a30Sjl139090/*ARGSUSED*/
364325cf1a30Sjl139090void
364425cf1a30Sjl139090hw_pa_bcopy32(uint64_t src, uint64_t dst)
364525cf1a30Sjl139090{}
364625cf1a30Sjl139090#else /*!lint */
364725cf1a30Sjl139090	/*
364825cf1a30Sjl139090	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
364925cf1a30Sjl139090	 * using physical addresses.
	 *
	 * PSTATE_IE is cleared for the duration of the copy and the
	 * original %pstate is written back in the delay slot of the
	 * return.  All four 8-byte loads complete (membar #Sync) before
	 * the first store is issued.  %o0 and %o1 are each advanced by 24
	 * on return; %o2-%o5, %g1 and %g2 are used as scratch.
365025cf1a30Sjl139090	 */
365125cf1a30Sjl139090	ENTRY_NP(hw_pa_bcopy32)
365225cf1a30Sjl139090	rdpr	%pstate, %g1		! %g1 = original pstate
365325cf1a30Sjl139090	andn	%g1, PSTATE_IE, %g2
365425cf1a30Sjl139090	wrpr	%g0, %g2, %pstate	! clear PSTATE_IE
365525cf1a30Sjl139090
	! NOTE(review): the read into %g0 discards its result; presumably it
	! is there to make sure the wrpr above has taken effect -- confirm.
365625cf1a30Sjl139090	rdpr	%pstate, %g0
365725cf1a30Sjl139090	ldxa	[%o0]ASI_MEM, %o2
365825cf1a30Sjl139090	add	%o0, 8, %o0
365925cf1a30Sjl139090	ldxa	[%o0]ASI_MEM, %o3
366025cf1a30Sjl139090	add	%o0, 8, %o0
366125cf1a30Sjl139090	ldxa	[%o0]ASI_MEM, %o4
366225cf1a30Sjl139090	add	%o0, 8, %o0
366325cf1a30Sjl139090	ldxa	[%o0]ASI_MEM, %o5
366425cf1a30Sjl139090	membar	#Sync
366525cf1a30Sjl139090
366625cf1a30Sjl139090	stxa	%o2, [%o1]ASI_MEM
366725cf1a30Sjl139090	add	%o1, 8, %o1
366825cf1a30Sjl139090	stxa	%o3, [%o1]ASI_MEM
366925cf1a30Sjl139090	add	%o1, 8, %o1
367025cf1a30Sjl139090	stxa	%o4, [%o1]ASI_MEM
367125cf1a30Sjl139090	add	%o1, 8, %o1
367225cf1a30Sjl139090	stxa	%o5, [%o1]ASI_MEM
367325cf1a30Sjl139090
367425cf1a30Sjl139090	retl
367525cf1a30Sjl139090	  wrpr	  %g0, %g1, %pstate	! delay slot: restore original pstate
367625cf1a30Sjl139090
367725cf1a30Sjl139090	SET_SIZE(hw_pa_bcopy32)
367825cf1a30Sjl139090
367925cf1a30Sjl139090#endif /* lint */
368025cf1a30Sjl139090
368125cf1a30Sjl139090#if defined(lint)
368225cf1a30Sjl139090
/*
 * Tunables, as seen by lint.
 *
 * hw_copy_limit_{1,2,4,8} are read by the copy entry points in this
 * file, selected by the relative alignment of source and destination.
 * A value of zero sends every copy down the small-copy path; otherwise
 * copies of at most that many bytes take the small-copy path.
 *
 * hwblkclr's header comment says its caller must check use_hw_bzero.
 * NOTE(review): use_hw_bcopy is not referenced in this part of the
 * file -- confirm where it is consumed.
 */
368325cf1a30Sjl139090int use_hw_bcopy = 1;
368425cf1a30Sjl139090int use_hw_bzero = 1;
368525cf1a30Sjl139090uint_t hw_copy_limit_1 = 0;
368625cf1a30Sjl139090uint_t hw_copy_limit_2 = 0;
368725cf1a30Sjl139090uint_t hw_copy_limit_4 = 0;
368825cf1a30Sjl139090uint_t hw_copy_limit_8 = 0;
368925cf1a30Sjl139090
369025cf1a30Sjl139090#else /* !lint */
369125cf1a30Sjl139090
	/*
	 * The same tunables as real data words, with the same initial
	 * values as the lint declarations above.
	 * NOTE(review): the limits are zero here, which keeps every copy
	 * on the small-copy path; presumably they are given real values
	 * elsewhere -- confirm.
	 */
369225cf1a30Sjl139090	DGDEF(use_hw_bcopy)
369325cf1a30Sjl139090	.word	1
369425cf1a30Sjl139090	DGDEF(use_hw_bzero)
369525cf1a30Sjl139090	.word	1
369625cf1a30Sjl139090	DGDEF(hw_copy_limit_1)
369725cf1a30Sjl139090	.word	0
369825cf1a30Sjl139090	DGDEF(hw_copy_limit_2)
369925cf1a30Sjl139090	.word	0
370025cf1a30Sjl139090	DGDEF(hw_copy_limit_4)
370125cf1a30Sjl139090	.word	0
370225cf1a30Sjl139090	DGDEF(hw_copy_limit_8)
370325cf1a30Sjl139090	.word	0
370425cf1a30Sjl139090
	! Return to the text section for anything that follows.
370525cf1a30Sjl139090	.align	64
370625cf1a30Sjl139090	.section ".text"
370725cf1a30Sjl139090#endif /* !lint */
3708