/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/fpras_impl.h>

#include "assym.h"

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 * 	! Determine whether to use the FP register version
 * 	! or the leaf routine version depending on size
 * 	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 * 	! dst addresses can be aligned to long word, word,
 * 	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
 * 		go to small_copy;		! to speed short copies
 *
 * 	if (src,dst long word alignable) {
 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst not alignable) {
 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst halfword alignable) {
 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 * 	if (src,dst word alignable) {
 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
 * 			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 * 			go to small_copy;
 * 		go to FPBLK_copy;
 * 	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
 *
 *	if (count <= 3)				! fast path for tiny copies
 *		go to sm_left;			! special finish up code
 *	else
 *		if (count > CHKSIZE)		! medium sized copies
 *			go to sm_med		! tuned by alignment
 *		if(src&dst not both word aligned) {
 *	sm_movebytes:
 *			move byte by byte in 4-way unrolled loop
 *			fall into sm_left;
 *	sm_left:
 *			move 0-3 bytes byte at a time as needed.
 *			restore error handler and exit.
 *
 * 		} else {	! src&dst are word aligned
 *			check for at least 8 bytes left,
 *			move word at a time, unrolled by 2
 *			when fewer than 8 bytes left,
 *	sm_half:	move half word at a time while 2 or more bytes left
 *	sm_byte:	move final byte if necessary
 *	sm_exit:
 *			restore error handler and exit.
 *		}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if(src&dst unalignable)
 * 		go to sm_movebytes
 *	if(src&dst halfword alignable)
 *		go to sm_movehalf
 *	if(src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *      move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *      move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 * 	%l6 = curthread->t_lofault;
 * 	if (%l6 != NULL) {
 * 		membar #Sync
 * 		curthread->t_lofault = .copyerr;
 * 		caller_error_handler = TRUE             ! %l6 |= 2
 * 	}
 *
 *	! for FPU testing we must not migrate cpus
 * 	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 * 		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 * 	old_fprs = %fprs;
 * 	old_gsr = %gsr;
 * 	if (%fprs.fef) {
 * 		%fprs.fef = 1;
 * 		save current fpregs on stack using blockstore
 * 	} else {
 * 		%fprs.fef = 1;
 * 	}
 *
 *
 * 	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 * 	%gsr = old_gsr;
 * 	if (old_fprs & FPRS_FEF)
 * 		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 * 	%fprs = old_fprs;
 * 	membar #Sync
 * 	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 * 	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 * 	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent returning with a corrupted fp state, we panic.
 */

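/*
 * The dispatch above, restated as a C sketch.  This is purely
 * illustrative: small_copy and FPBLK_copy name the two code paths,
 * not real functions.
 *
 *	uintptr_t diff = (uintptr_t)src ^ (uintptr_t)dst;
 *	uint_t limit;
 *
 *	if (len <= VIS_COPY_THRESHOLD)
 *		goto small_copy;
 *	if ((diff & 0x7) == 0)
 *		limit = hw_copy_limit_8;	! long word alignable
 *	else if (diff & 0x1)
 *		limit = hw_copy_limit_1;	! not even halfword alignable
 *	else if (diff & 0x3)
 *		limit = hw_copy_limit_2;	! halfword alignable
 *	else
 *		limit = hw_copy_limit_4;	! word alignable
 *	if (limit == 0)				! zero disables FPBLK copy
 *		goto small_copy;
 *	if (len <= limit)
 *		goto small_copy;
 *	goto FPBLK_copy;
 */
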
/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, an elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 * byte aligned:  108 clocks slower for non-FPBLK
 * half aligned:   44 clocks slower for non-FPBLK
 * word aligned:   12 clocks slower for non-FPBLK
 * long aligned:    4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 * hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Recommended initial values as of Mar 2004, based on testing
 * on Cheetah+ (900MHz), Cheetah++ (1200MHz), and Jaguar (1050MHz):
 * hw_copy_limit_1 =  256
 * hw_copy_limit_2 =  512
 * hw_copy_limit_4 = 1024
 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256),
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 * 4 of 8 will not be alignable.
 * 2 of 8 will be half word alignable.
 * 1 of 8 will be word alignable.
 * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases. The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.  Due to limitations of the
 * branch instruction on Cheetah, Jaguar, and Panther, the
 * minimum time for a small, tight loop is 3 clocks.  So
 * the 4-way loop runs 50% faster than the fastest non-unrolled
 * loop.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */

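/*
 * As a concrete illustration of the 4-way unrolling and of the
 * count-biasing trick used throughout ("adjust count to allow cc
 * zero test"), here is the byte loop of .bc_sm_movebytes below in C.
 * This is a sketch only; the entry condition count > SHORTCOPY
 * guarantees the first pass is safe:
 *
 *	len -= 3;			! bias count for the zero test
 *	do {
 *		dst[0] = src[0];	! four moves per iteration
 *		dst[1] = src[1];
 *		dst[2] = src[2];
 *		dst[3] = src[3];
 *		src += 4;
 *		dst += 4;
 *		len -= 4;		! subcc in the real loop
 *	} while ((ssize_t)len > 0);	! until 3 or fewer bytes remain
 *	len += 3;			! restore count; 0-3 bytes left
 */
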
/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the handling
 *	  uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * The following membar BLD/BST discussion is Cheetah pipeline specific.
 * In Cheetah BLD is blocking, #LoadLoad/#LoadStore/#StoreStore are
 * nops (those semantics always apply) and #StoreLoad is implemented
 * as a membar #Sync.
 *
 * It is possible that the owner of the fp state has a block load or
 * block store still "in flight" at the time we come to preserve that
 * state.  Block loads are blocking in Cheetah pipelines so we do not
 * need to sync with them.  In preserving fp regs we will use block stores
 * (which are not blocking in Cheetah pipelines) so we require a membar #Sync
 * after storing state (so that our subsequent use of those registers
 * does not modify them before the block stores complete);  this membar
 * also serves to sync with block stores the owner of the fp state has
 * initiated.
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or in an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */

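/*
 * The flag encoding in the saved lofault value (%l6, or %o4 in the
 * leaf routines), shown as a C sketch using the FPUSED_FLAG,
 * TRAMP_FLAG and MASK_FLAGS definitions below:
 *
 *	uintptr_t l6 = (uintptr_t)curthread->t_lofault;
 *	if (l6 != 0)
 *		l6 |= TRAMP_FLAG;	! bcopy: trampoline to old handler
 *	...
 *	l6 |= FPUSED_FLAG;		! only after the fp save completes
 *	...
 *	! on exit, and in the error paths:
 *	curthread->t_lofault = (void *)(l6 & ~MASK_FLAGS);
 */
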
/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references. Change it here, change it there.
 */
#define VIS_COPY_THRESHOLD 256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define SHORTCOPY 3
#define CHKSIZE  39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3

/*
 * Number of outstanding prefetches.
 * Testing with 1200 MHz Cheetah+ and Jaguar gives best results with
 * two prefetches, one with a reach of 8*BLOCK_SIZE+8 and one with a
 * reach of 5*BLOCK_SIZE.  The double prefetch gives a typical improvement
 * of 5% for large copies as compared to a single prefetch.  The reason
 * for the improvement is that with Cheetah and Jaguar, some prefetches
 * are dropped due to the prefetch queue being full.  The second prefetch
 * reduces the number of cache lines that are dropped.
 * Do not remove the double prefetch or change either CHEETAH_PREFETCH
 * or CHEETAH_2ND_PREFETCH without extensive performance tests to prove
 * there is no loss of performance.
 */
#define	CHEETAH_PREFETCH	8
#define	CHEETAH_2ND_PREFETCH	5

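/*
 * In C terms, each iteration of the FPBLK inner loop therefore issues
 * the equivalent of (sketch; prefetch_read_once() stands in for the
 * prefetch instruction with the #one_read function):
 *
 *	prefetch_read_once(src + CHEETAH_PREFETCH * VIS_BLOCKSIZE + 8);
 *	prefetch_read_once(src + CHEETAH_2ND_PREFETCH * VIS_BLOCKSIZE);
 */
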
#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)

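/*
 * How the BST/BLD macros below locate the block-aligned save area
 * within this frame, as a C sketch (fp stands for %fp):
 *
 *	caddr_t tmp = fp + STACK_BIAS - SAVED_FPREGS_ADJUST;
 *	tmp = (caddr_t)((uintptr_t)tmp & -VIS_BLOCKSIZE);  ! align down
 *	! tmp now addresses 2 * VIS_BLOCKSIZE bytes of block-aligned
 *	! space that lies wholly within the 3 * VIS_BLOCKSIZE area
 *	! reserved above.
 */
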
/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f32		;\
	fmuld	%f0, %f2, %f34		;\
	faddd	%f0, %f2, %f36		;\
	fmuld	%f0, %f2, %f38		;\
	faddd	%f0, %f2, %f40		;\
	fmuld	%f0, %f2, %f42		;\
	faddd	%f0, %f2, %f44		;\
	fmuld	%f0, %f2, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fzero	%f18			;\
	faddd	%f16, %f18, %f20	;\
	fmuld	%f16, %f18, %f22	;\
	faddd	%f16, %f18, %f24	;\
	fmuld	%f16, %f18, %f26	;\
	faddd	%f16, %f18, %f28	;\
	fmuld	%f16, %f18, %f30	;\
	faddd	%f16, %f18, %f48	;\
	fmuld	%f16, %f18, %f50	;\
	faddd	%f16, %f18, %f52	;\
	fmuld	%f16, %f18, %f54	;\
	faddd	%f16, %f18, %f56	;\
	fmuld	%f16, %f18, %f58	;\
	faddd	%f16, %f18, %f60	;\
	fmuld	%f16, %f18, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).  Note, however, that since Cheetah pipeline block load
 * is blocking we can omit the initial membar before saving fp state (they're
 * commented below in case of future porting to a chip that does not block
 * on block load).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
#define BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass registers
 * to use - we just use any outputs we want.
 *
 * For fpRAS we need to perform the fpRAS mechanism test on the same
 * CPU as we use for the copy operation, both so that we validate the
 * CPU we perform the copy on and so that we know which CPU failed
 * if a failure is detected.  Hence we need to be bound to "our" CPU.
 * This could be achieved through disabling preemption (and we do it that
 * way for threads with no t_lwp) but for larger copies this may hold
 * higher priority threads off cpu for too long (eg, realtime).  So we
 * make use of the lightweight t_nomigrate mechanism where we can (ie, when
 * we have a t_lwp).
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_nomigrate();
 * } else {
 *	kpreempt_disable();
 * }
 *
 * FP_ALLOWMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_allowmigrate();
 * } else {
 *	kpreempt_enable();
 * }
 */

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1##f					;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	  nop							;\
	ba	label2##f					;\
	  nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1##f					;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call thread_allowmigrate				;\
	  nop							;\
	ba	label2##f					;\
	  nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2##f					;\
	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2##f					;\
	  nop							;\
	call	kpreempt					;\
	  rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */

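/*
 * Typical use from C, as a sketch (the error value is commonly
 * EFAULT):
 *
 *	int error;
 *
 *	if ((error = kcopy(from, to, count)) != 0)
 *		return (error);		! fault while copying
 */
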
	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy. kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler. As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler. In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	  nop
	ret
	  restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	  mov	%g0, %o0			!

	SET_SIZE(kcopy)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	  nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	  nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	  cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	  or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	  stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	  deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	stb	%o3, [%o1 + 2]
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	  stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	  subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	  sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  nop
.bc_sm_byte:
	ldub	[%o0], %o3
	stb	%o3, [%o1]
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	  lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	  stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	membar	#Sync				! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g0, %o0		! return 0

	.align 16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	  nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	  nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	  nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	  nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	  dec	%o2
.bc_med_long1:			! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	  nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	  stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	  nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	  add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	.align 16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	  nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	  dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	  stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	  nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	  add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	.align 16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	  nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3,[%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	  sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	  deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	  nop
	ba,pt	%ncc, .bc_sm_half
	  nop

	SET_SIZE(bcopy)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and DTrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTrace (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(bcopy_more)
.bcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	  nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
 * Also, use of FP registers has been tested to be enabled
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	  wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	  neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	  sub	CNT,TMP,CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	  stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	  nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	  stb	SRC, [DST - 1]

2:
	andn	REALSRC, 0x7, SRC
	alignaddr REALSRC, %g0, %g0

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f0
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f2
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f4
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x18], %f6
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x20], %f8
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x28], %f10
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x30], %f12
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x38], %f14
	faligndata %f10, %f12, %f42
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,a,pt	%ncc, 1f
	  nop
	.align	16
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	  add	SRC, VIS_BLOCKSIZE, SRC

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	  andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	  nop
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	  nop
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	  nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	  nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	  stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	FPRAS_INTERVAL(FPRAS_BCOPY, 0, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE2Q1(0, %l5, %o2, %o3, 8, 9)
	FPRAS_CHECK(FPRAS_BCOPY, %l5, 9)	! outputs lost

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	  wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync				! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	  restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)

/*
 * Block copy with possibly overlapped operands.
 */

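/*
 * The overlap decision below, restated as a C sketch (abs() here is
 * illustrative pointer arithmetic, not a real call):
 *
 *	if (count == 0)
 *		return;				! nothing to do
 *	if (count <= abs(from - to))
 *		bcopy(from, to, count);		! regions cannot overlap
 *	else if (from < to)
 *		copy backwards, byte at a time;	! .ov_bkwd
 *	else
 *		copy forwards, byte at a time;	! .ov_fwd
 */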
	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! branch if count > 0
	  subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return; nothing to do or bad arguments
	  nop
1:
	bneg,a	%ncc, 2f
	  neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	  .empty				!   no overlap
	  cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	  nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	  inc	%o1			! inc to address

	retl				! return
	  nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	  nop

	SET_SIZE(ovbcopy)


/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
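/*
 * The caller contract, as a C sketch (ppcopy is the real caller; this
 * is not a literal quote of its code):
 *
 *	kpreempt_disable();		! also prevents cpu migration
 *	if (use_hw_bcopy)
 *		hwblkpagecopy(from, to);	! copies exactly PAGESIZE
 *	else
 *		bcopy(from, to, PAGESIZE);
 *	kpreempt_enable();
 */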
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	  wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	prefetch [SRC], #one_read
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
	ldd	[SRC], %f0
#if CHEETAH_PREFETCH > 4
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x08], %f2
#if CHEETAH_PREFETCH > 5
	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
#endif
	ldd	[SRC + 0x10], %f4
#if CHEETAH_PREFETCH > 6
	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f0, %f32
	ldd	[SRC + 0x18], %f6
#if CHEETAH_PREFETCH > 7
	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
#endif
	fsrc1	%f2, %f34
	ldd	[SRC + 0x20], %f8
	fsrc1	%f4, %f36
	ldd	[SRC + 0x28], %f10
	fsrc1	%f6, %f38
	ldd	[SRC + 0x30], %f12
	fsrc1	%f8, %f40
	ldd	[SRC + 0x38], %f14
	fsrc1	%f10, %f42
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	ba,a,pt	%ncc, 2f
	  nop
	.align	16
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fsrc1	%f10, %f42
	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	cmp	CNT, VIS_BLOCKSIZE + 8
	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	bgu,pt	%ncc, 2b
	  add	SRC, VIS_BLOCKSIZE, SRC

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	FPRAS_INTERVAL(FPRAS_PGCOPY, 1, %l5, %o2, %o3, %o4, %o5, 8)
	FPRAS_REWRITE_TYPE1(1, %l5, %f32, %o2, 9)
	FPRAS_CHECK(FPRAS_PGCOPY, %l5, 9)	! lose outputs

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	  nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	  restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)


1665/*
1666 * Transfer data to and from user space -
1667 * Note that these routines can cause faults.
1668 * It is assumed that the kernel has nothing mapped
1669 * below KERNELBASE in the virtual address space.
1670 *
1671 * Note that copyin(9F) and copyout(9F) are part of the
1672 * DDI/DKI, which specifies that they return '-1' on "errors."
1673 *
1674 * Sigh.
1675 *
1676 * So there are two extremely similar routines, xcopyin() and xcopyout(),
1677 * which return the errno that we've faithfully computed.  This
1678 * allows other callers (e.g. uiomove(9F)) to work correctly.
1679 * Given that these are used pretty heavily, we expand the calling
1680 * sequences inline for all flavours (rather than making wrappers).
1681 *
1682 * There are also stub routines for xcopyout_little and xcopyin_little,
1683 * which currently are intended to handle requests of <= 16 bytes from
1684 * do_unaligned.  A future enhancement to make them handle 8k pages
1685 * efficiently is left as an exercise...
1686 */
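/*
 * An illustrative (hypothetical) caller's-eye view of the two
 * return conventions:
 *
 *	if (copyout(kaddr, uaddr, len) != 0)
 *		return (EFAULT);		! copyout only reports -1
 *
 *	if ((error = xcopyout(kaddr, uaddr, len)) != 0)
 *		return (error);			! xcopyout reports the errno
 */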
1687
1688/*
1689 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1690 *
1691 * General theory of operation:
1692 *
1693 * The only difference between copy{in,out} and
1694 * xcopy{in,out} is in the error handling routine they invoke
1695 * when a memory access error occurs. xcopyOP returns the errno
1696 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1697 * a special flag (by ORing the TRAMP_FLAG into the fault handler address)
1698 * if they are called with a fault handler already in place. That flag
1699 * causes the default handlers to trampoline to the previous handler
1700 * upon an error.
1701 *
1702 * None of the copyops routines grabs a window until it's decided that
1703 * we need to do a HW block copy operation. This saves a window
1704 * spill/fill when we're called during socket ops. The typical IO
1705 * path won't cause spill/fill traps.
1706 *
1707 * This code uses a set of 4 limits for the maximum size that will
1708 * be copied given a particular input/output address alignment.
1709 * If the value for a particular limit is zero, the copy will be performed
1710 * by the plain copy loops rather than FPBLK.
1711 *
1712 * See the description of bcopy above for more details of the
1713 * data copying algorithm and the default limits.
1714 *
1715 */
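/*
 * A sketch of the lofault hand-off described above (illustrative,
 * not literal code):
 *
 *	old = curthread->t_lofault;
 *	new = default_fault_handler;
 *	if (noerr_variant && old != 0)
 *		new |= TRAMP_FLAG;
 *	curthread->t_lofault = new;
 *	... perform the copy ...
 *	! On a fault, the default handler runs; if TRAMP_FLAG was set
 *	! it strips the flag and trampolines to the previous handler
 *	! rather than computing a return value itself.
 */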
1716
1717/*
1718 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1719 */
1720
1721/*
1722 * We save the arguments in the following registers in case of a fault:
1723 *	kaddr - %l1
1724 *	uaddr - %l2
1725 *	count - %l3
1726 */
1727#define SAVE_SRC	%l1
1728#define SAVE_DST	%l2
1729#define SAVE_COUNT	%l3
1730
1731#define SM_SAVE_SRC		%g4
1732#define SM_SAVE_DST		%g5
1733#define SM_SAVE_COUNT		%o5
1734#define ERRNO		%l5
1735
1736
1737#define REAL_LOFAULT	%l4
1738/*
1739 * Generic copyio fault handler.  This is the first line of defense when a
1740 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1741 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1742 * This allows us to share common code for all the flavors of the copy
1743 * operations, including the _noerr versions.
1744 *
1745 * Note that this function will restore the original input parameters before
1746 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1747 * member of the t_copyop structure, if needed.
1748 */
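/*
 * In outline (illustrative):
 *
 *	copyio_fault:
 *		ERRNO = %g1		! errno supplied by the trap code
 *		if (%l6 & FPUSED_FLAG)
 *			restore %gsr and %fprs; reload the saved fp regs
 *			(or zero them if the fp unit was unused on entry)
 *		clear FPUSED_FLAG, restore the old t_lofault
 *		(%i0, %i1, %i2) = (SAVE_SRC, SAVE_DST, SAVE_COUNT)
 *		goto REAL_LOFAULT
 */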
1749	ENTRY(copyio_fault)
1750	membar	#Sync
1751	mov	%g1,ERRNO			! save errno in ERRNO
1752	btst	FPUSED_FLAG, %l6
1753	bz	%ncc, 1f
1754	  nop
1755
1756	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1757	wr	%o2, 0, %gsr    	! restore gsr
1758
1759	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1760	btst	FPRS_FEF, %o3
1761	bz,pt	%icc, 4f
1762	  nop
1763
1764	BLD_FPQ2Q4_FROMSTACK(%o2)
1765
1766	ba,pt	%ncc, 1f
1767	  wr	%o3, 0, %fprs   	! restore fprs
1768
17694:
1770	FZEROQ2Q4
1771	wr	%o3, 0, %fprs   	! restore fprs
1772
17731:
1774	andn	%l6, FPUSED_FLAG, %l6
1775	membar	#Sync
1776	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1777	FP_ALLOWMIGRATE(5, 6)
1778
1779	mov	SAVE_SRC, %i0
1780	mov	SAVE_DST, %i1
1781	jmp	REAL_LOFAULT
1782	  mov	SAVE_COUNT, %i2
1783
1784	SET_SIZE(copyio_fault)
1785
1786
1787	ENTRY(copyout)
1788
1789	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
1790	bleu,pt	%ncc, .copyout_small		! go to small copy cases
1791	  xor	%o0, %o1, %o3			! are src, dst alignable?
1792	btst	7, %o3				!
1793	bz,pt	%ncc, .copyout_8		! check for longword alignment
1794	  nop
1795	btst	1, %o3				!
1796	bz,pt	%ncc, .copyout_2		! check for half-word
1797	  nop
1798	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
1799	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1800	tst	%o3
1801	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1802	  cmp	%o2, %o3			! if length <= limit
1803	bleu,pt	%ncc, .copyout_small		! go to small copy
1804	  nop
1805	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1806	  nop
1807.copyout_2:
1808	btst	3, %o3				!
1809	bz,pt	%ncc, .copyout_4		! check for word alignment
1810	  nop
1811	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
1812	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1813	tst	%o3
1814	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1815	  cmp	%o2, %o3			! if length <= limit
1816	bleu,pt	%ncc, .copyout_small		! go to small copy
1817	  nop
1818	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1819	  nop
1820.copyout_4:
1821	! already checked longword, must be word aligned
1822	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
1823	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1824	tst	%o3
1825	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1826	  cmp	%o2, %o3			! if length <= limit
1827	bleu,pt	%ncc, .copyout_small		! go to small copy
1828	  nop
1829	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1830	  nop
1831.copyout_8:
1832	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
1833	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1834	tst	%o3
1835	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1836	  cmp	%o2, %o3			! if length <= limit
1837	bleu,pt	%ncc, .copyout_small		! go to small copy
1838	  nop
1839	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1840	  nop
1841
1842	.align	16
1843	nop				! instruction alignment
1844					! see discussion at start of file
1845.copyout_small:
1846	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
1847	or	%o5, %lo(.sm_copyout_err), %o5
1848	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
1849	membar	#Sync				! sync error barrier
1850	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
1851.sm_do_copyout:
1852	mov	%o0, SM_SAVE_SRC
1853	mov	%o1, SM_SAVE_DST
1854	cmp	%o2, SHORTCOPY		! check for really short case
1855	bleu,pt	%ncc, .co_sm_left	!
1856	  mov	%o2, SM_SAVE_COUNT
1857	cmp	%o2, CHKSIZE		! check for medium length cases
1858	bgu,pn	%ncc, .co_med		!
1859	  or	%o0, %o1, %o3		! prepare alignment check
1860	andcc	%o3, 0x3, %g0		! test for alignment
1861	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
1862.co_sm_movebytes:
1863	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
1864.co_sm_notalign4:
1865	ldub	[%o0], %o3		! read byte
1866	subcc	%o2, 4, %o2		! reduce count by 4
1867	stba	%o3, [%o1]ASI_USER	! write byte
1868	inc	%o1			! advance DST by 1
1869	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
1870	add	%o0, 4, %o0		! advance SRC by 4
1871	stba	%o3, [%o1]ASI_USER
1872	inc	%o1			! advance DST by 1
1873	ldub	[%o0 - 2], %o3
1874	stba	%o3, [%o1]ASI_USER
1875	inc	%o1			! advance DST by 1
1876	ldub	[%o0 - 1], %o3
1877	stba	%o3, [%o1]ASI_USER
1878	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
1879	  inc	%o1			! advance DST by 1
1880	add	%o2, 3, %o2		! restore count
1881.co_sm_left:
1882	tst	%o2
1883	bz,pt	%ncc, .co_sm_exit	! check for zero length
1884	  nop
1885	ldub	[%o0], %o3		! load one byte
1886	deccc	%o2			! reduce count for cc test
1887	bz,pt	%ncc, .co_sm_exit
1888	  stba	%o3,[%o1]ASI_USER	! store one byte
1889	ldub	[%o0 + 1], %o3		! load second byte
1890	deccc	%o2
1891	inc	%o1
1892	bz,pt	%ncc, .co_sm_exit
1893	  stba	%o3,[%o1]ASI_USER	! store second byte
1894	ldub	[%o0 + 2], %o3		! load third byte
1895	inc	%o1
1896	stba	%o3,[%o1]ASI_USER	! store third byte
1897	membar	#Sync				! sync error barrier
1898	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1899	retl
1900	  mov	%g0, %o0		! return 0
1901	.align	16
1902.co_sm_words:
1903	lduw	[%o0], %o3		! read word
1904.co_sm_wordx:
1905	subcc	%o2, 8, %o2		! update count
1906	stwa	%o3, [%o1]ASI_USER	! write word
1907	add	%o0, 8, %o0		! update SRC
1908	lduw	[%o0 - 4], %o3		! read word
1909	add	%o1, 4, %o1		! update DST
1910	stwa	%o3, [%o1]ASI_USER	! write word
1911	bgt,pt	%ncc, .co_sm_words	! loop til done
1912	  add	%o1, 4, %o1		! update DST
1913	addcc	%o2, 7, %o2		! restore count
1914	bz,pt	%ncc, .co_sm_exit
1915	  nop
1916	deccc	%o2
1917	bz,pt	%ncc, .co_sm_byte
1918.co_sm_half:
1919	  subcc	%o2, 2, %o2		! reduce count by 2
1920	lduh	[%o0], %o3		! read half word
1921	add	%o0, 2, %o0		! advance SRC by 2
1922	stha	%o3, [%o1]ASI_USER	! write half word
1923	bgt,pt	%ncc, .co_sm_half	! loop til done
1924	  add	%o1, 2, %o1		! advance DST by 2
1925	addcc	%o2, 1, %o2		! restore count
1926	bz,pt	%ncc, .co_sm_exit
1927	  nop
1928.co_sm_byte:
1929	ldub	[%o0], %o3
1930	stba	%o3, [%o1]ASI_USER
1931	membar	#Sync				! sync error barrier
1932	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1933	retl
1934	  mov	%g0, %o0		! return 0
1935	.align 16
1936.co_sm_word:
1937	subcc	%o2, 4, %o2		! update count
1938	bgt,pt	%ncc, .co_sm_wordx
1939	  lduw	[%o0], %o3		! read word
1940	addcc	%o2, 3, %o2		! restore count
1941	bz,pt	%ncc, .co_sm_exit
1942	  stwa	%o3, [%o1]ASI_USER	! write word
1943	deccc	%o2			! reduce count for cc test
1944	ldub	[%o0 + 4], %o3		! load one byte
1945	add	%o1, 4, %o1
1946	bz,pt	%ncc, .co_sm_exit
1947	  stba	%o3, [%o1]ASI_USER	! store one byte
1948	ldub	[%o0 + 5], %o3		! load second byte
1949	deccc	%o2
1950	inc	%o1
1951	bz,pt	%ncc, .co_sm_exit
1952	  stba	%o3, [%o1]ASI_USER	! store second byte
1953	ldub	[%o0 + 6], %o3		! load third byte
1954	inc	%o1
1955	stba	%o3, [%o1]ASI_USER	! store third byte
1956.co_sm_exit:
1957	  membar	#Sync				! sync error barrier
1958	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1959	retl
1960	  mov	%g0, %o0		! return 0
1961
1962	.align 16
1963.co_med:
1964	xor	%o0, %o1, %o3		! setup alignment check
1965	btst	1, %o3
1966	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
1967	  nop
1968	btst	3, %o3
1969	bnz,pt	%ncc, .co_med_half	! halfword aligned
1970	  nop
1971	btst	7, %o3
1972	bnz,pt	%ncc, .co_med_word	! word aligned
1973	  nop
1974.co_med_long:
1975	btst	3, %o0			! check for
1976	bz,pt	%ncc, .co_med_long1	! word alignment
1977	  nop
1978.co_med_long0:
1979	ldub	[%o0], %o3		! load one byte
1980	inc	%o0
1981	stba	%o3,[%o1]ASI_USER	! store byte
1982	inc	%o1
1983	btst	3, %o0
1984	bnz,pt	%ncc, .co_med_long0
1985	  dec	%o2
1986.co_med_long1:			! word aligned
1987	btst	7, %o0			! check for long word
1988	bz,pt	%ncc, .co_med_long2
1989	  nop
1990	lduw	[%o0], %o3		! load word
1991	add	%o0, 4, %o0		! advance SRC by 4
1992	stwa	%o3, [%o1]ASI_USER	! store word
1993	add	%o1, 4, %o1		! advance DST by 4
1994	sub	%o2, 4, %o2		! reduce count by 4
1995!
1996!  Now long word aligned and have at least 32 bytes to move
1997!
1998.co_med_long2:
1999	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2000	sub	%o1, 8, %o1		! adjust pointer to allow store in
2001					! branch delay slot instead of add
2002.co_med_lmove:
2003	add	%o1, 8, %o1		! advance DST by 8
2004	ldx	[%o0], %o3		! read long word
2005	subcc	%o2, 32, %o2		! reduce count by 32
2006	stxa	%o3, [%o1]ASI_USER	! write long word
2007	add	%o1, 8, %o1		! advance DST by 8
2008	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
2009	add	%o0, 32, %o0		! advance SRC by 32
2010	stxa	%o3, [%o1]ASI_USER
2011	ldx	[%o0 - 16], %o3
2012	add	%o1, 8, %o1		! advance DST by 8
2013	stxa	%o3, [%o1]ASI_USER
2014	ldx	[%o0 - 8], %o3
2015	add	%o1, 8, %o1		! advance DST by 8
2016	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
2017	  stxa	%o3, [%o1]ASI_USER
2018	add	%o1, 8, %o1		! advance DST by 8
2019	addcc	%o2, 24, %o2		! restore count to long word offset
2020	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
2021	  nop
2022.co_med_lword:
2023	ldx	[%o0], %o3		! read long word
2024	subcc	%o2, 8, %o2		! reduce count by 8
2025	stxa	%o3, [%o1]ASI_USER	! write long word
2026	add	%o0, 8, %o0		! advance SRC by 8
2027	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
2028	  add	%o1, 8, %o1		! advance DST by 8
2029.co_med_lextra:
2030	addcc	%o2, 7, %o2		! restore rest of count
2031	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2032	  deccc	%o2
2033	bz,pt	%ncc, .co_sm_byte
2034	  nop
2035	ba,pt	%ncc, .co_sm_half
2036	  nop
2037
2038	.align 16
2039	nop				! instruction alignment
2040					! see discussion at start of file
2041.co_med_word:
2042	btst	3, %o0			! check for
2043	bz,pt	%ncc, .co_med_word1	! word alignment
2044	  nop
2045.co_med_word0:
2046	ldub	[%o0], %o3		! load one byte
2047	inc	%o0
2048	stba	%o3,[%o1]ASI_USER	! store byte
2049	inc	%o1
2050	btst	3, %o0
2051	bnz,pt	%ncc, .co_med_word0
2052	  dec	%o2
2053!
2054!  Now word aligned and have at least 36 bytes to move
2055!
2056.co_med_word1:
2057	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2058.co_med_wmove:
2059	lduw	[%o0], %o3		! read word
2060	subcc	%o2, 16, %o2		! reduce count by 16
2061	stwa	%o3, [%o1]ASI_USER	! write word
2062	add	%o1, 4, %o1		! advance DST by 4
2063	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
2064	add	%o0, 16, %o0		! advance SRC by 16
2065	stwa	%o3, [%o1]ASI_USER
2066	add	%o1, 4, %o1		! advance DST by 4
2067	lduw	[%o0 - 8], %o3
2068	stwa	%o3, [%o1]ASI_USER
2069	add	%o1, 4, %o1		! advance DST by 4
2070	lduw	[%o0 - 4], %o3
2071	stwa	%o3, [%o1]ASI_USER
2072	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
2073	  add	%o1, 4, %o1		! advance DST by 4
2074	addcc	%o2, 12, %o2		! restore count to word offset
2075	ble,pt	%ncc, .co_med_wextra	! check for more words to move
2076	  nop
2077.co_med_word2:
2078	lduw	[%o0], %o3		! read word
2079	subcc	%o2, 4, %o2		! reduce count by 4
2080	stwa	%o3, [%o1]ASI_USER	! write word
2081	add	%o0, 4, %o0		! advance SRC by 4
2082	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
2083	  add	%o1, 4, %o1		! advance DST by 4
2084.co_med_wextra:
2085	addcc	%o2, 3, %o2		! restore rest of count
2086	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2087	  deccc	%o2
2088	bz,pt	%ncc, .co_sm_byte
2089	  nop
2090	ba,pt	%ncc, .co_sm_half
2091	  nop
2092
2093	.align 16
2094	nop				! instruction alignment
2095	nop				! see discussion at start of file
2096	nop
2097.co_med_half:
2098	btst	1, %o0			! check for
2099	bz,pt	%ncc, .co_med_half1	! half word alignment
2100	  nop
2101	ldub	[%o0], %o3		! load one byte
2102	inc	%o0
2103	stba	%o3,[%o1]ASI_USER	! store byte
2104	inc	%o1
2105	dec	%o2
2106!
2107!  Now half word aligned and have at least 38 bytes to move
2108!
2109.co_med_half1:
2110	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2111.co_med_hmove:
2112	lduh	[%o0], %o3		! read half word
2113	subcc	%o2, 8, %o2		! reduce count by 8
2114	stha	%o3, [%o1]ASI_USER	! write half word
2115	add	%o1, 2, %o1		! advance DST by 2
2116	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
2117	add	%o0, 8, %o0		! advance SRC by 8
2118	stha	%o3, [%o1]ASI_USER
2119	add	%o1, 2, %o1		! advance DST by 2
2120	lduh	[%o0 - 4], %o3
2121	stha	%o3, [%o1]ASI_USER
2122	add	%o1, 2, %o1		! advance DST by 2
2123	lduh	[%o0 - 2], %o3
2124	stha	%o3, [%o1]ASI_USER
2125	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
2126	  add	%o1, 2, %o1		! advance DST by 2
2127	addcc	%o2, 7, %o2		! restore count
2128	bz,pt	%ncc, .co_sm_exit
2129	  deccc	%o2
2130	bz,pt	%ncc, .co_sm_byte
2131	  nop
2132	ba,pt	%ncc, .co_sm_half
2133	  nop
2134
2135/*
2136 * We got here because of a fault during short copyout.
2137 * Errno value is in %g1, but DDI/DKI says return -1 (sigh).
2138 */
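/*
 * Equivalent C sketch of the dispatch below (illustrative):
 *
 *	if (curthread->t_copyops != NULL)
 *		return (curthread->t_copyops->cp_copyout(kaddr, uaddr, cnt));
 *	return (-1);
 */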
2139.sm_copyout_err:
2140	membar	#Sync
2141	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2142	mov	SM_SAVE_SRC, %o0
2143	mov	SM_SAVE_DST, %o1
2144	mov	SM_SAVE_COUNT, %o2
2145	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2146	tst	%o3
2147	bz,pt	%ncc, 3f			! if not, return error
2148	  nop
2149	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2150	jmp	%o5				! original arguments
2151	  nop
21523:
2153	retl
2154	  or	%g0, -1, %o0		! return error value
2155
2156	SET_SIZE(copyout)
2157
2158/*
2159 * The _more entry points are not intended to be used directly by
2160 * any caller from outside this file.  They are provided to allow
2161 * profiling and DTrace of the portions of the copy code that use
2162 * the floating point registers.
2163 * This entry is particularly important as DTrace (at least as of
2164 * 4/2004) does not support leaf functions.
2165 */
2166
2167	ENTRY(copyout_more)
2168.copyout_more:
2169	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2170	set	.copyout_err, REAL_LOFAULT
2171
2172/*
2173 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2174 */
2175.do_copyout:
2176	set	copyio_fault, %l7		! copyio_fault is lofault val
2177
2178	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2179	membar	#Sync				! sync error barrier
2180	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2181
2182	mov	%i0, SAVE_SRC
2183	mov	%i1, SAVE_DST
2184	mov	%i2, SAVE_COUNT
2185
2186	FP_NOMIGRATE(6, 7)
2187
2188	rd	%fprs, %o2		! check for unused fp
2189	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2190	btst	FPRS_FEF, %o2
2191	bz,a,pt	%icc, .do_blockcopyout
2192	  wr	%g0, FPRS_FEF, %fprs
2193
2194	BST_FPQ2Q4_TOSTACK(%o2)
2195
2196.do_blockcopyout:
2197	rd	%gsr, %o2
2198	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2199	or	%l6, FPUSED_FLAG, %l6
2200
2201	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2202	mov	ASI_USER, %asi
2203	bz,pt	%ncc, 2f
2204	  neg	TMP
2205	add	TMP, VIS_BLOCKSIZE, TMP
2206
2207	! TMP = bytes required to align DST on FP_BLOCK boundary
2208	! Using SRC as a tmp here
2209	cmp	TMP, 3
2210	bleu,pt	%ncc, 1f
2211	  sub	CNT,TMP,CNT		! adjust main count
2212	sub	TMP, 3, TMP		! adjust for end of loop test
2213.co_blkalign:
2214	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
2215	stba	SRC, [DST]%asi
2216	subcc	TMP, 4, TMP
2217	ldub	[REALSRC + 1], SRC
2218	add	REALSRC, 4, REALSRC
2219	stba	SRC, [DST + 1]%asi
2220	ldub	[REALSRC - 2], SRC
2221	add	DST, 4, DST
2222	stba	SRC, [DST - 2]%asi
2223	ldub	[REALSRC - 1], SRC
2224	bgu,pt	%ncc, .co_blkalign
2225	  stba	SRC, [DST - 1]%asi
2226
2227	addcc	TMP, 3, TMP		! restore count adjustment
2228	bz,pt	%ncc, 2f		! no bytes left?
2229	  nop
22301:	ldub	[REALSRC], SRC
2231	inc	REALSRC
2232	inc	DST
2233	deccc	TMP
2234	bgu	%ncc, 1b
2235	  stba	SRC, [DST - 1]%asi
2236
22372:
2238	andn	REALSRC, 0x7, SRC
2239	alignaddr REALSRC, %g0, %g0
2240
2241	! SRC - 8-byte aligned
2242	! DST - 64-byte aligned
2243	prefetch [SRC], #one_read
2244	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #one_read
2245	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #one_read
2246	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
2247	ldd	[SRC], %f16
2248#if CHEETAH_PREFETCH > 4
2249	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2250#endif
2251	ldd	[SRC + 0x08], %f18
2252#if CHEETAH_PREFETCH > 5
2253	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
2254#endif
2255	ldd	[SRC + 0x10], %f20
2256#if CHEETAH_PREFETCH > 6
2257	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
2258#endif
2259	faligndata %f16, %f18, %f48
2260	ldd	[SRC + 0x18], %f22
2261#if CHEETAH_PREFETCH > 7
2262	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
2263#endif
2264	faligndata %f18, %f20, %f50
2265	ldd	[SRC + 0x20], %f24
2266	faligndata %f20, %f22, %f52
2267	ldd	[SRC + 0x28], %f26
2268	faligndata %f22, %f24, %f54
2269	ldd	[SRC + 0x30], %f28
2270	faligndata %f24, %f26, %f56
2271	ldd	[SRC + 0x38], %f30
2272	faligndata %f26, %f28, %f58
2273	ldd	[SRC + VIS_BLOCKSIZE], %f16
2274	sub	CNT, VIS_BLOCKSIZE, CNT
2275	add	SRC, VIS_BLOCKSIZE, SRC
2276	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2277	ba,a,pt	%ncc, 1f
2278	  nop
2279	.align	16
22801:
2281	ldd	[SRC + 0x08], %f18
2282	faligndata %f28, %f30, %f60
2283	ldd	[SRC + 0x10], %f20
2284	faligndata %f30, %f16, %f62
2285	stda	%f48, [DST]ASI_BLK_AIUS
2286	ldd	[SRC + 0x18], %f22
2287	faligndata %f16, %f18, %f48
2288	ldd	[SRC + 0x20], %f24
2289	faligndata %f18, %f20, %f50
2290	ldd	[SRC + 0x28], %f26
2291	faligndata %f20, %f22, %f52
2292	ldd	[SRC + 0x30], %f28
2293	faligndata %f22, %f24, %f54
2294	ldd	[SRC + 0x38], %f30
2295	faligndata %f24, %f26, %f56
2296	sub	CNT, VIS_BLOCKSIZE, CNT
2297	ldd	[SRC + VIS_BLOCKSIZE], %f16
2298	faligndata %f26, %f28, %f58
2299	prefetch [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8], #one_read
2300	add	DST, VIS_BLOCKSIZE, DST
2301	prefetch [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2302	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2303	cmp	CNT, VIS_BLOCKSIZE + 8
2304	bgu,pt	%ncc, 1b
2305	  add	SRC, VIS_BLOCKSIZE, SRC
2306
2307	! only if REALSRC & 0x7 is 0
2308	cmp	CNT, VIS_BLOCKSIZE
2309	bne	%ncc, 3f
2310	  andcc	REALSRC, 0x7, %g0
2311	bz,pt	%ncc, 2f
2312	  nop
23133:
2314	faligndata %f28, %f30, %f60
2315	faligndata %f30, %f16, %f62
2316	stda	%f48, [DST]ASI_BLK_AIUS
2317	add	DST, VIS_BLOCKSIZE, DST
2318	ba,pt	%ncc, 3f
2319	  nop
23202:
2321	ldd	[SRC + 0x08], %f18
2322	fsrc1	%f28, %f60
2323	ldd	[SRC + 0x10], %f20
2324	fsrc1	%f30, %f62
2325	stda	%f48, [DST]ASI_BLK_AIUS
2326	ldd	[SRC + 0x18], %f22
2327	fsrc1	%f16, %f48
2328	ldd	[SRC + 0x20], %f24
2329	fsrc1	%f18, %f50
2330	ldd	[SRC + 0x28], %f26
2331	fsrc1	%f20, %f52
2332	ldd	[SRC + 0x30], %f28
2333	fsrc1	%f22, %f54
2334	ldd	[SRC + 0x38], %f30
2335	fsrc1	%f24, %f56
2336	sub	CNT, VIS_BLOCKSIZE, CNT
2337	add	DST, VIS_BLOCKSIZE, DST
2338	add	SRC, VIS_BLOCKSIZE, SRC
2339	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2340	fsrc1	%f26, %f58
2341	fsrc1	%f28, %f60
2342	fsrc1	%f30, %f62
2343	stda	%f48, [DST]ASI_BLK_AIUS
2344	add	DST, VIS_BLOCKSIZE, DST
2345	ba,a,pt	%ncc, 4f
2346	  nop
2347
23483:	tst	CNT
2349	bz,a	%ncc, 4f
2350	  nop
2351
23525:	ldub	[REALSRC], TMP
2353	inc	REALSRC
2354	inc	DST
2355	deccc	CNT
2356	bgu	%ncc, 5b
2357	  stba	TMP, [DST - 1]%asi
23584:
2359
2360.copyout_exit:
2361	membar	#Sync
2362
2363	FPRAS_INTERVAL(FPRAS_COPYOUT, 0, %l5, %o2, %o3, %o4, %o5, 8)
2364	FPRAS_REWRITE_TYPE2Q2(0, %l5, %o2, %o3, 8, 9)
2365	FPRAS_CHECK(FPRAS_COPYOUT, %l5, 9)	! lose outputs
2366
2367	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2368	wr	%o2, 0, %gsr		! restore gsr
2369
2370	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2371	btst	FPRS_FEF, %o3
2372	bz,pt	%icc, 4f
2373	  nop
2374
2375	BLD_FPQ2Q4_FROMSTACK(%o2)
2376
2377	ba,pt	%ncc, 1f
2378	  wr	%o3, 0, %fprs		! restore fprs
2379
23804:
2381	FZEROQ2Q4
2382	wr	%o3, 0, %fprs		! restore fprs
2383
23841:
2385	membar	#Sync
2386	andn	%l6, FPUSED_FLAG, %l6
2387	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2388	FP_ALLOWMIGRATE(5, 6)
2389	ret
2390	  restore	%g0, 0, %o0
2391
2392/*
2393 * We got here because of a fault during copyout.
2394 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2395 */
2396.copyout_err:
2397	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2398	tst	%o4
2399	bz,pt	%ncc, 2f			! if not, return error
2400	  nop
2401	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
2402	jmp	%g2				! original arguments
2403	  restore %g0, 0, %g0			! dispose of copy window
24042:
2405	ret
2406	  restore %g0, -1, %o0			! return error value
2407
2408
2409	SET_SIZE(copyout_more)
2410
2411
2412	ENTRY(xcopyout)
2413	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2414	bleu,pt	%ncc, .xcopyout_small		! go to small copy cases
2415	  xor	%o0, %o1, %o3			! are src, dst alignable?
2416	btst	7, %o3				!
2417	bz,pt	%ncc, .xcopyout_8		! check for longword alignment
2418	  nop
2419	btst	1, %o3				!
2420	bz,pt	%ncc, .xcopyout_2		! check for half-word
2421	  nop
2422	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2423	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2424	tst	%o3
2425	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2426	  cmp	%o2, %o3			! if length <= limit
2427	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2428	  nop
2429	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2430	  nop
2431.xcopyout_2:
2432	btst	3, %o3				!
2433	bz,pt	%ncc, .xcopyout_4		! check for word alignment
2434	  nop
2435	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2436	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2437	tst	%o3
2438	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2439	  cmp	%o2, %o3			! if length <= limit
2440	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2441	  nop
2442	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2443	  nop
2444.xcopyout_4:
2445	! already checked longword, must be word aligned
2446	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2447	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2448	tst	%o3
2449	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2450	  cmp	%o2, %o3			! if length <= limit
2451	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2452	  nop
2453	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2454	  nop
2455.xcopyout_8:
2456	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2457	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2458	tst	%o3
2459	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2460	  cmp	%o2, %o3			! if length <= limit
2461	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2462	  nop
2463	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2464	  nop
2465
2466.xcopyout_small:
2467	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
2468	or	%o5, %lo(.sm_xcopyout_err), %o5
2469	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
2470	membar	#Sync				! sync error barrier
2471	ba,pt	%ncc, .sm_do_copyout		! common code
2472	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
2473
2474.xcopyout_more:
2475	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2476	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2477	ba,pt	%ncc, .do_copyout		! common code
2478	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2479
2480/*
2481 * We got here because of fault during xcopyout
2482 * Errno value is in ERRNO
2483 */
2484.xcopyout_err:
2485	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2486	tst	%o4
2487	bz,pt	%ncc, 2f			! if not, return error
2488	  nop
2489	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
2490	jmp	%g2				! original arguments
2491	  restore %g0, 0, %g0			! dispose of copy window
24922:
2493	ret
2494	  restore ERRNO, 0, %o0			! return errno value
2495
2496.sm_xcopyout_err:
2497
2498	membar	#Sync
2499	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2500	mov	SM_SAVE_SRC, %o0
2501	mov	SM_SAVE_DST, %o1
2502	mov	SM_SAVE_COUNT, %o2
2503	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2504	tst	%o3
2505	bz,pt	%ncc, 3f			! if not, return error
2506	  nop
2507	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
2508	jmp	%o5				! original arguments
2509	  nop
25103:
2511	retl
2512	  or	%g1, 0, %o0		! return errno value
2513
2514	SET_SIZE(xcopyout)
2515
2516	ENTRY(xcopyout_little)
2517	sethi	%hi(.xcopyio_err), %o5
2518	or	%o5, %lo(.xcopyio_err), %o5
2519	ldn	[THREAD_REG + T_LOFAULT], %o4
2520	membar	#Sync				! sync error barrier
2521	stn	%o5, [THREAD_REG + T_LOFAULT]
2522	mov	%o4, %o5
2523
2524	subcc	%g0, %o2, %o3
2525	add	%o0, %o2, %o0
2526	bz,pn	%ncc, 2f		! check for zero bytes
2527	  sub	%o2, 1, %o4
2528	add	%o0, %o4, %o0		! start w/last byte
2529	add	%o1, %o2, %o1
2530	ldub	[%o0 + %o3], %o4
2531
25321:	stba	%o4, [%o1 + %o3]ASI_AIUSL
2533	inccc	%o3
2534	sub	%o0, 2, %o0		! get next byte
2535	bcc,a,pt %ncc, 1b
2536	  ldub	[%o0 + %o3], %o4
2537
25382:
2539	membar	#Sync				! sync error barrier
2540	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2541	retl
2542	  mov	%g0, %o0		! return (0)
2543
2544	SET_SIZE(xcopyout_little)
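/*
 * Byte-level behaviour of xcopyout_little above (and of xcopyin_little
 * later in this file), sketched in C: the user-side accesses go through
 * the little-endian user ASI (ASI_AIUSL), and the bytes are transferred
 * in reverse order:
 *
 *	for (i = 0; i < count; i++)
 *		dst[i] = src[count - 1 - i];
 */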
2545
2546/*
2547 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2548 */
2549
2550	ENTRY(copyin)
2551	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2552	bleu,pt	%ncc, .copyin_small		! go to small copy cases
2553	  xor	%o0, %o1, %o3			! are src, dst alignable?
2554	btst	7, %o3				!
2555	bz,pt	%ncc, .copyin_8			! check for longword alignment
2556	  nop
2557	btst	1, %o3				!
2558	bz,pt	%ncc, .copyin_2			! check for half-word
2559	  nop
2560	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2561	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2562	tst	%o3
2563	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2564	  cmp	%o2, %o3			! if length <= limit
2565	bleu,pt	%ncc, .copyin_small		! go to small copy
2566	  nop
2567	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2568	  nop
2569.copyin_2:
2570	btst	3, %o3				!
2571	bz,pt	%ncc, .copyin_4			! check for word alignment
2572	  nop
2573	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2574	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2575	tst	%o3
2576	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2577	  cmp	%o2, %o3			! if length <= limit
2578	bleu,pt	%ncc, .copyin_small		! go to small copy
2579	  nop
2580	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2581	  nop
2582.copyin_4:
2583	! already checked longword, must be word aligned
2584	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2585	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2586	tst	%o3
2587	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2588	  cmp	%o2, %o3			! if length <= limit
2589	bleu,pt	%ncc, .copyin_small		! go to small copy
2590	  nop
2591	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2592	  nop
2593.copyin_8:
2594	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2595	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2596	tst	%o3
2597	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2598	  cmp	%o2, %o3			! if length <= limit
2599	bleu,pt	%ncc, .copyin_small		! go to small copy
2600	  nop
2601	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2602	  nop
2603
2604	.align	16
2605	nop				! instruction alignment
2606					! see discussion at start of file
2607.copyin_small:
2608	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
2609	or	%o5, %lo(.sm_copyin_err), %o5
2610	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
2611	membar	#Sync				! sync error barrier
2612	stn	%o5, [THREAD_REG + T_LOFAULT]
2613.sm_do_copyin:
2614	mov	%o0, SM_SAVE_SRC
2615	mov	%o1, SM_SAVE_DST
2616	cmp	%o2, SHORTCOPY		! check for really short case
2617	bleu,pt	%ncc, .ci_sm_left	!
2618	  mov	%o2, SM_SAVE_COUNT
2619	cmp	%o2, CHKSIZE		! check for medium length cases
2620	bgu,pn	%ncc, .ci_med		!
2621	  or	%o0, %o1, %o3		! prepare alignment check
2622	andcc	%o3, 0x3, %g0		! test for alignment
2623	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
2624.ci_sm_movebytes:
2625	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
2626.ci_sm_notalign4:
2627	lduba	[%o0]ASI_USER, %o3	! read byte
2628	subcc	%o2, 4, %o2		! reduce count by 4
2629	stb	%o3, [%o1]		! write byte
2630	add	%o0, 1, %o0		! advance SRC by 1
2631	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
2632	add	%o0, 1, %o0		! advance SRC by 1
2633	stb	%o3, [%o1 + 1]
2634	add	%o1, 4, %o1		! advance DST by 4
2635	lduba	[%o0]ASI_USER, %o3
2636	add	%o0, 1, %o0		! advance SRC by 1
2637	stb	%o3, [%o1 - 2]
2638	lduba	[%o0]ASI_USER, %o3
2639	add	%o0, 1, %o0		! advance SRC by 1
2640	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
2641	  stb	%o3, [%o1 - 1]
2642	add	%o2, 3, %o2		! restore count
2643.ci_sm_left:
2644	tst	%o2
2645	bz,pt	%ncc, .ci_sm_exit
2646	  nop
2647	lduba	[%o0]ASI_USER, %o3		! load one byte
2648	deccc	%o2			! reduce count for cc test
2649	bz,pt	%ncc, .ci_sm_exit
2650	  stb	%o3,[%o1]		! store one byte
2651	inc	%o0
2652	lduba	[%o0]ASI_USER, %o3	! load second byte
2653	deccc	%o2
2654	bz,pt	%ncc, .ci_sm_exit
2655	  stb	%o3,[%o1 + 1]		! store second byte
2656	inc	%o0
2657	lduba	[%o0]ASI_USER, %o3	! load third byte
2658	stb	%o3,[%o1 + 2]		! store third byte
2659	membar	#Sync				! sync error barrier
2660	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2661	retl
2662	  mov	%g0, %o0		! return 0
2663	.align	16
2664.ci_sm_words:
2665	lduwa	[%o0]ASI_USER, %o3		! read word
2666.ci_sm_wordx:
2667	subcc	%o2, 8, %o2		! update count
2668	stw	%o3, [%o1]		! write word
2669	add	%o0, 4, %o0		! update SRC
2670	add	%o1, 8, %o1		! update DST
2671	lduwa	[%o0]ASI_USER, %o3	! read word
2672	add	%o0, 4, %o0		! update SRC
2673	bgt,pt	%ncc, .ci_sm_words	! loop til done
2674	  stw	%o3, [%o1 - 4]		! write word
2675	addcc	%o2, 7, %o2		! restore count
2676	bz,pt	%ncc, .ci_sm_exit
2677	  nop
2678	deccc	%o2
2679	bz,pt	%ncc, .ci_sm_byte
2680.ci_sm_half:
2681	  subcc	%o2, 2, %o2		! reduce count by 2
2682	lduha	[%o0]ASI_USER, %o3	! read half word
2683	add	%o0, 2, %o0		! advance SRC by 2
2684	add	%o1, 2, %o1		! advance DST by 2
2685	bgt,pt	%ncc, .ci_sm_half	! loop til done
2686	  sth	%o3, [%o1 - 2]		! write half word
2687	addcc	%o2, 1, %o2		! restore count
2688	bz,pt	%ncc, .ci_sm_exit
2689	  nop
2690.ci_sm_byte:
2691	lduba	[%o0]ASI_USER, %o3
2692	stb	%o3, [%o1]
2693	membar	#Sync				! sync error barrier
2694	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2695	retl
2696	  mov	%g0, %o0		! return 0
2697	.align	16
2698.ci_sm_word:
2699	subcc	%o2, 4, %o2		! update count
2700	bgt,pt	%ncc, .ci_sm_wordx
2701	  lduwa	[%o0]ASI_USER, %o3		! read word
2702	addcc	%o2, 3, %o2		! restore count
2703	bz,pt	%ncc, .ci_sm_exit
2704	  stw	%o3, [%o1]		! write word
2705	deccc	%o2			! reduce count for cc test
2706	add	%o0, 4, %o0
2707	lduba	[%o0]ASI_USER, %o3	! load one byte
2708	bz,pt	%ncc, .ci_sm_exit
2709	  stb	%o3, [%o1 + 4]		! store one byte
2710	inc	%o0
2711	lduba	[%o0]ASI_USER, %o3	! load second byte
2712	deccc	%o2
2713	bz,pt	%ncc, .ci_sm_exit
2714	  stb	%o3, [%o1 + 5]		! store second byte
2715	inc	%o0
2716	lduba	[%o0]ASI_USER, %o3	! load third byte
2717	stb	%o3, [%o1 + 6]		! store third byte
2718.ci_sm_exit:
2719	membar	#Sync				! sync error barrier
2720	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2721	retl
2722	  mov	%g0, %o0		! return 0
2723
2724	.align 16
2725.ci_med:
2726	xor	%o0, %o1, %o3		! setup alignment check
2727	btst	1, %o3
2728	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
2729	  nop
2730	btst	3, %o3
2731	bnz,pt	%ncc, .ci_med_half	! halfword aligned
2732	  nop
2733	btst	7, %o3
2734	bnz,pt	%ncc, .ci_med_word	! word aligned
2735	  nop
2736.ci_med_long:
2737	btst	3, %o0			! check for
2738	bz,pt	%ncc, .ci_med_long1	! word alignment
2739	  nop
2740.ci_med_long0:
2741	lduba	[%o0]ASI_USER, %o3		! load one byte
2742	inc	%o0
2743	stb	%o3,[%o1]		! store byte
2744	inc	%o1
2745	btst	3, %o0
2746	bnz,pt	%ncc, .ci_med_long0
2747	  dec	%o2
2748.ci_med_long1:			! word aligned
2749	btst	7, %o0			! check for long word
2750	bz,pt	%ncc, .ci_med_long2
2751	  nop
2752	lduwa	[%o0]ASI_USER, %o3	! load word
2753	add	%o0, 4, %o0		! advance SRC by 4
2754	stw	%o3, [%o1]		! store word
2755	add	%o1, 4, %o1		! advance DST by 4
2756	sub	%o2, 4, %o2		! reduce count by 4
2757!
2758!  Now long word aligned and have at least 32 bytes to move
2759!
2760.ci_med_long2:
2761	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2762.ci_med_lmove:
2763	ldxa	[%o0]ASI_USER, %o3	! read long word
2764	subcc	%o2, 32, %o2		! reduce count by 32
2765	stx	%o3, [%o1]		! write long word
2766	add	%o0, 8, %o0		! advance SRC by 8
2767	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
2768	add	%o0, 8, %o0		! advance SRC by 8
2769	stx	%o3, [%o1 + 8]
2770	add	%o1, 32, %o1		! advance DST by 32
2771	ldxa	[%o0]ASI_USER, %o3
2772	add	%o0, 8, %o0		! advance SRC by 8
2773	stx	%o3, [%o1 - 16]
2774	ldxa	[%o0]ASI_USER, %o3
2775	add	%o0, 8, %o0		! advance SRC by 8
2776	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
2777	  stx	%o3, [%o1 - 8]
2778	addcc	%o2, 24, %o2		! restore count to long word offset
2779	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
2780	  nop
2781.ci_med_lword:
2782	ldxa	[%o0]ASI_USER, %o3	! read long word
2783	subcc	%o2, 8, %o2		! reduce count by 8
2784	stx	%o3, [%o1]		! write long word
2785	add	%o0, 8, %o0		! advance SRC by 8
2786	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
2787	  add	%o1, 8, %o1		! advance DST by 8
2788.ci_med_lextra:
2789	addcc	%o2, 7, %o2		! restore rest of count
2790	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2791	  deccc	%o2
2792	bz,pt	%ncc, .ci_sm_byte
2793	  nop
2794	ba,pt	%ncc, .ci_sm_half
2795	  nop
2796
2797	.align 16
2798	nop				! instruction alignment
2799					! see discussion at start of file
2800.ci_med_word:
2801	btst	3, %o0			! check for
2802	bz,pt	%ncc, .ci_med_word1	! word alignment
2803	  nop
2804.ci_med_word0:
2805	lduba	[%o0]ASI_USER, %o3	! load one byte
2806	inc	%o0
2807	stb	%o3,[%o1]		! store byte
2808	inc	%o1
2809	btst	3, %o0
2810	bnz,pt	%ncc, .ci_med_word0
2811	  dec	%o2
2812!
2813!  Now word aligned and have at least 36 bytes to move
2814!
2815.ci_med_word1:
2816	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2817.ci_med_wmove:
2818	lduwa	[%o0]ASI_USER, %o3	! read word
2819	subcc	%o2, 16, %o2		! reduce count by 16
2820	stw	%o3, [%o1]		! write word
2821	add	%o0, 4, %o0		! advance SRC by 4
2822	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
2823	add	%o0, 4, %o0		! advance SRC by 4
2824	stw	%o3, [%o1 + 4]
2825	add	%o1, 16, %o1		! advance DST by 16
2826	lduwa	[%o0]ASI_USER, %o3
2827	add	%o0, 4, %o0		! advance SRC by 4
2828	stw	%o3, [%o1 - 8]
2829	lduwa	[%o0]ASI_USER, %o3
2830	add	%o0, 4, %o0		! advance SRC by 4
2831	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
2832	  stw	%o3, [%o1 - 4]
2833	addcc	%o2, 12, %o2		! restore count to word offset
2834	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
2835	  nop
2836.ci_med_word2:
2837	lduwa	[%o0]ASI_USER, %o3	! read word
2838	subcc	%o2, 4, %o2		! reduce count by 4
2839	stw	%o3, [%o1]		! write word
2840	add	%o0, 4, %o0		! advance SRC by 4
2841	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
2842	  add	%o1, 4, %o1		! advance DST by 4
2843.ci_med_wextra:
2844	addcc	%o2, 3, %o2		! restore rest of count
2845	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2846	  deccc	%o2
2847	bz,pt	%ncc, .ci_sm_byte
2848	  nop
2849	ba,pt	%ncc, .ci_sm_half
2850	  nop
2851
2852	.align 16
2853	nop				! instruction alignment
2854					! see discussion at start of file
2855.ci_med_half:
2856	btst	1, %o0			! check for
2857	bz,pt	%ncc, .ci_med_half1	! half word alignment
2858	  nop
2859	lduba	[%o0]ASI_USER, %o3	! load one byte
2860	inc	%o0
2861	stb	%o3,[%o1]		! store byte
2862	inc	%o1
2863	dec	%o2
2864!
2865!  Now half word aligned and have at least 38 bytes to move
2866!
2867.ci_med_half1:
2868	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2869.ci_med_hmove:
2870	lduha	[%o0]ASI_USER, %o3	! read half word
2871	subcc	%o2, 8, %o2		! reduce count by 8
2872	sth	%o3, [%o1]		! write half word
2873	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
2874	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
2875	add	%o0, 2, %o0		! advance SRC by 2
2876	sth	%o3, [%o1 + 2]
2877	add	%o1, 8, %o1		! advance DST by 8
2878	lduha	[%o0]ASI_USER, %o3
2879	add	%o0, 2, %o0		! advance SRC by 2
2880	sth	%o3, [%o1 - 4]
2881	lduha	[%o0]ASI_USER, %o3
2882	add	%o0, 2, %o0		! advance SRC by 2
2883	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
2884	  sth	%o3, [%o1 - 2]
2885	addcc	%o2, 7, %o2		! restore count
2886	bz,pt	%ncc, .ci_sm_exit
2887	  deccc	%o2
2888	bz,pt	%ncc, .ci_sm_byte
2889	  nop
2890	ba,pt	%ncc, .ci_sm_half
2891	  nop
2892
2893.sm_copyin_err:
2894	membar	#Sync
2895	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2896	mov	SM_SAVE_SRC, %o0
2897	mov	SM_SAVE_DST, %o1
2898	mov	SM_SAVE_COUNT, %o2
2899	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2900	tst	%o3
2901	bz,pt	%ncc, 3f			! if not, return error
2902	  nop
2903	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
2904	jmp	%o5				! original arguments
2905	  nop
29063:
2907	retl
2908	  or	%g0, -1, %o0		! return errno value
2909
2910	SET_SIZE(copyin)
2911
2912
2913/*
2914 * The _more entry points are not intended to be used directly by
2915 * any caller from outside this file.  They are provided to allow
2916 * profiling and DTrace of the portions of the copy code that use
2917 * the floating point registers.
2918 * This entry is particularly important as DTrace (at least as of
2919 * 4/2004) does not support leaf functions.
2920 */
2921
2922	ENTRY(copyin_more)
2923.copyin_more:
2924	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2925	set	.copyin_err, REAL_LOFAULT
2926
2927/*
2928 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2929 */
2930.do_copyin:
2931	set	copyio_fault, %l7		! copyio_fault is lofault val
2932
2933	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2934	membar	#Sync				! sync error barrier
2935	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2936
2937	mov	%i0, SAVE_SRC
2938	mov	%i1, SAVE_DST
2939	mov	%i2, SAVE_COUNT
2940
2941	FP_NOMIGRATE(6, 7)
2942
2943	rd	%fprs, %o2		! check for unused fp
2944	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2945	btst	FPRS_FEF, %o2
2946	bz,a,pt	%icc, .do_blockcopyin
2947	  wr	%g0, FPRS_FEF, %fprs
2948
2949	BST_FPQ2Q4_TOSTACK(%o2)
2950
2951.do_blockcopyin:
2952	rd	%gsr, %o2
2953	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2954	or	%l6, FPUSED_FLAG, %l6
2955
2956	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2957	mov	ASI_USER, %asi
2958	bz,pt	%ncc, 2f
2959	  neg	TMP
2960	add	TMP, VIS_BLOCKSIZE, TMP
2961
2962	! TMP = bytes required to align DST on FP_BLOCK boundary
2963	! Using SRC as a tmp here
2964	cmp	TMP, 3
2965	bleu,pt	%ncc, 1f
2966	  sub	CNT,TMP,CNT		! adjust main count
2967	sub	TMP, 3, TMP		! adjust for end of loop test
2968.ci_blkalign:
2969	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
2970	stb	SRC, [DST]
2971	subcc	TMP, 4, TMP
2972	lduba	[REALSRC + 1]%asi, SRC
2973	add	REALSRC, 4, REALSRC
2974	stb	SRC, [DST + 1]
2975	lduba	[REALSRC - 2]%asi, SRC
2976	add	DST, 4, DST
2977	stb	SRC, [DST - 2]
2978	lduba	[REALSRC - 1]%asi, SRC
2979	bgu,pt	%ncc, .ci_blkalign
2980	  stb	SRC, [DST - 1]
2981
2982	addcc	TMP, 3, TMP		! restore count adjustment
2983	bz,pt	%ncc, 2f		! no bytes left?
2984	  nop
29851:	lduba	[REALSRC]%asi, SRC
2986	inc	REALSRC
2987	inc	DST
2988	deccc	TMP
2989	bgu	%ncc, 1b
2990	  stb	SRC, [DST - 1]
2991
29922:
2993	andn	REALSRC, 0x7, SRC
2994	alignaddr REALSRC, %g0, %g0
2995
2996	! SRC - 8-byte aligned
2997	! DST - 64-byte aligned
2998	prefetcha [SRC]%asi, #one_read
2999	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #one_read
3000	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #one_read
3001	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
3002	ldda	[SRC]%asi, %f16
3003#if CHEETAH_PREFETCH > 4
3004	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3005#endif
3006	ldda	[SRC + 0x08]%asi, %f18
3007#if CHEETAH_PREFETCH > 5
3008	prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
3009#endif
3010	ldda	[SRC + 0x10]%asi, %f20
3011#if CHEETAH_PREFETCH > 6
3012	prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
3013#endif
3014	faligndata %f16, %f18, %f48
3015	ldda	[SRC + 0x18]%asi, %f22
3016#if CHEETAH_PREFETCH > 7
3017	prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
3018#endif
3019	faligndata %f18, %f20, %f50
3020	ldda	[SRC + 0x20]%asi, %f24
3021	faligndata %f20, %f22, %f52
3022	ldda	[SRC + 0x28]%asi, %f26
3023	faligndata %f22, %f24, %f54
3024	ldda	[SRC + 0x30]%asi, %f28
3025	faligndata %f24, %f26, %f56
3026	ldda	[SRC + 0x38]%asi, %f30
3027	faligndata %f26, %f28, %f58
3028	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3029	sub	CNT, VIS_BLOCKSIZE, CNT
3030	add	SRC, VIS_BLOCKSIZE, SRC
3031	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3032	ba,a,pt	%ncc, 1f
3033	  nop
3034	.align	16
30351:
3036	ldda	[SRC + 0x08]%asi, %f18
3037	faligndata %f28, %f30, %f60
3038	ldda	[SRC + 0x10]%asi, %f20
3039	faligndata %f30, %f16, %f62
3040	stda	%f48, [DST]ASI_BLK_P
3041	ldda	[SRC + 0x18]%asi, %f22
3042	faligndata %f16, %f18, %f48
3043	ldda	[SRC + 0x20]%asi, %f24
3044	faligndata %f18, %f20, %f50
3045	ldda	[SRC + 0x28]%asi, %f26
3046	faligndata %f20, %f22, %f52
3047	ldda	[SRC + 0x30]%asi, %f28
3048	faligndata %f22, %f24, %f54
3049	ldda	[SRC + 0x38]%asi, %f30
3050	faligndata %f24, %f26, %f56
3051	sub	CNT, VIS_BLOCKSIZE, CNT
3052	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3053	faligndata %f26, %f28, %f58
3054	prefetcha [SRC + ((CHEETAH_PREFETCH) * VIS_BLOCKSIZE) + 8]%asi, #one_read
3055	add	DST, VIS_BLOCKSIZE, DST
3056	prefetcha [SRC + ((CHEETAH_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3057	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3058	cmp	CNT, VIS_BLOCKSIZE + 8
3059	bgu,pt	%ncc, 1b
3060	  add	SRC, VIS_BLOCKSIZE, SRC
3061
3062	! only if REALSRC & 0x7 is 0
3063	cmp	CNT, VIS_BLOCKSIZE
3064	bne	%ncc, 3f
3065	  andcc	REALSRC, 0x7, %g0
3066	bz,pt	%ncc, 2f
3067	  nop
30683:
3069	faligndata %f28, %f30, %f60
3070	faligndata %f30, %f16, %f62
3071	stda	%f48, [DST]ASI_BLK_P
3072	add	DST, VIS_BLOCKSIZE, DST
3073	ba,pt	%ncc, 3f
3074	  nop
30752:
3076	ldda	[SRC + 0x08]%asi, %f18
3077	fsrc1	%f28, %f60
3078	ldda	[SRC + 0x10]%asi, %f20
3079	fsrc1	%f30, %f62
3080	stda	%f48, [DST]ASI_BLK_P
3081	ldda	[SRC + 0x18]%asi, %f22
3082	fsrc1	%f16, %f48
3083	ldda	[SRC + 0x20]%asi, %f24
3084	fsrc1	%f18, %f50
3085	ldda	[SRC + 0x28]%asi, %f26
3086	fsrc1	%f20, %f52
3087	ldda	[SRC + 0x30]%asi, %f28
3088	fsrc1	%f22, %f54
3089	ldda	[SRC + 0x38]%asi, %f30
3090	fsrc1	%f24, %f56
3091	sub	CNT, VIS_BLOCKSIZE, CNT
3092	add	DST, VIS_BLOCKSIZE, DST
3093	add	SRC, VIS_BLOCKSIZE, SRC
3094	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3095	fsrc1	%f26, %f58
3096	fsrc1	%f28, %f60
3097	fsrc1	%f30, %f62
3098	stda	%f48, [DST]ASI_BLK_P
3099	add	DST, VIS_BLOCKSIZE, DST
3100	ba,a,pt	%ncc, 4f
3101	  nop
3102
31033:	tst	CNT
3104	bz,a	%ncc, 4f
3105	  nop
3106
31075:	lduba	[REALSRC]ASI_USER, TMP
3108	inc	REALSRC
3109	inc	DST
3110	deccc	CNT
3111	bgu	%ncc, 5b
3112	  stb	TMP, [DST - 1]
31134:
3114
3115.copyin_exit:
3116	membar	#Sync
3117
3118	FPRAS_INTERVAL(FPRAS_COPYIN, 1, %l5, %o2, %o3, %o4, %o5, 8)
3119	FPRAS_REWRITE_TYPE1(1, %l5, %f48, %o2, 9)
3120	FPRAS_CHECK(FPRAS_COPYIN, %l5, 9)	! lose outputs
3121
3122	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
3123	wr	%o2, 0, %gsr
3124
3125	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3126	btst	FPRS_FEF, %o3
3127	bz,pt	%icc, 4f
3128	  nop
3129
3130	BLD_FPQ2Q4_FROMSTACK(%o2)
3131
3132	ba,pt	%ncc, 1f
3133	  wr	%o3, 0, %fprs		! restore fprs
3134
31354:
3136	FZEROQ2Q4
3137	wr	%o3, 0, %fprs		! restore fprs
3138
31391:
3140	membar	#Sync				! sync error barrier
3141	andn	%l6, FPUSED_FLAG, %l6
3142	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3143	FP_ALLOWMIGRATE(5, 6)
3144	ret
3145	  restore	%g0, 0, %o0
3146/*
3147 * We got here because of a fault during copyin
3148 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3149 */
3150.copyin_err:
3151	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3152	tst	%o4
3153	bz,pt	%ncc, 2f			! if not, return error
3154	  nop
3155	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
3156	jmp	%g2				! original arguments
3157	  restore %g0, 0, %g0			! dispose of copy window
31582:
3159	ret
3160	  restore %g0, -1, %o0			! return error value
3161
3162
3163	SET_SIZE(copyin_more)
3164
3165	ENTRY(xcopyin)
3166
3167	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3168	bleu,pt	%ncc, .xcopyin_small		! go to small copy cases
3169	  xor	%o0, %o1, %o3			! are src, dst alignable?
3170	btst	7, %o3				!
3171	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
3172	  nop
3173	btst	1, %o3				!
3174	bz,pt	%ncc, .xcopyin_2		! check for half-word
3175	  nop
3176	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3177	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3178	tst	%o3
3179	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3180	  cmp	%o2, %o3			! if length <= limit
3181	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3182	  nop
3183	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3184	  nop
3185.xcopyin_2:
3186	btst	3, %o3				!
3187	bz,pt	%ncc, .xcopyin_4		! check for word alignment
3188	  nop
3189	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3190	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3191	tst	%o3
3192	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3193	  cmp	%o2, %o3			! if length <= limit
3194	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3195	  nop
3196	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3197	  nop
3198.xcopyin_4:
3199	! already checked longword, must be word aligned
3200	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3201	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3202	tst	%o3
3203	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3204	  cmp	%o2, %o3			! if length <= limit
3205	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3206	  nop
3207	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3208	  nop
3209.xcopyin_8:
3210	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3211	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3212	tst	%o3
3213	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3214	  cmp	%o2, %o3			! if length <= limit
3215	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3216	  nop
3217	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3218	  nop
3219
3220.xcopyin_small:
3221	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3222	or	%o5, %lo(.sm_xcopyin_err), %o5
3223	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault
3224	membar	#Sync				! sync error barrier
3225	ba,pt	%ncc, .sm_do_copyin		! common code
3226	  stn	%o5, [THREAD_REG + T_LOFAULT]
3227
3228.xcopyin_more:
3229	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3230	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
3231	ba,pt	%ncc, .do_copyin
3232	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3233
3234/*
3235 * We got here because of fault during xcopyin
3236 * Errno value is in ERRNO
3237 */
3238.xcopyin_err:
3239	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3240	tst	%o4
3241	bz,pt	%ncc, 2f			! if not, return error
3242	  nop
3243	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
3244	jmp	%g2				! original arguments
3245	  restore %g0, 0, %g0			! dispose of copy window
32462:
3247	ret
3248	  restore ERRNO, 0, %o0			! return errno value
3249
3250.sm_xcopyin_err:
3251
3252	membar	#Sync
3253	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3254	mov	SM_SAVE_SRC, %o0
3255	mov	SM_SAVE_DST, %o1
3256	mov	SM_SAVE_COUNT, %o2
3257	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
3258	tst	%o3
3259	bz,pt	%ncc, 3f			! if not, return error
3260	  nop
3261	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
3262	jmp	%o5				! original arguments
3263	  nop
32643:
3265	retl
3266	  or	%g1, 0, %o0		! return errno value
3267
3268	SET_SIZE(xcopyin)
3269
3270	ENTRY(xcopyin_little)
3271	sethi	%hi(.xcopyio_err), %o5
3272	or	%o5, %lo(.xcopyio_err), %o5
3273	ldn	[THREAD_REG + T_LOFAULT], %o4
3274	membar	#Sync				! sync error barrier
3275	stn	%o5, [THREAD_REG + T_LOFAULT]
3276	mov	%o4, %o5
3277
3278	subcc	%g0, %o2, %o3
3279	add	%o0, %o2, %o0
3280	bz,pn	%ncc, 2f		! check for zero bytes
3281	  sub	%o2, 1, %o4
3282	add	%o0, %o4, %o0		! start w/last byte
3283	add	%o1, %o2, %o1
3284	lduba	[%o0 + %o3]ASI_AIUSL, %o4
3285
32861:	stb	%o4, [%o1 + %o3]
3287	inccc	%o3
3288	sub	%o0, 2, %o0		! get next byte
3289	bcc,a,pt %ncc, 1b
3290	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
3291
32922:
3293	membar	#Sync				! sync error barrier
3294	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3295	retl
3296	  mov	%g0, %o0		! return (0)
3297
3298.xcopyio_err:
3299	membar	#Sync				! sync error barrier
3300	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3301	retl
3302	  mov	%g1, %o0
3303
3304	SET_SIZE(xcopyin_little)
3305
3306
3307/*
3308 * Copy a block of storage - must not overlap (from + len <= to).
3309 * No fault handler installed (to be called under on_fault())
3310 */
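/*
 * Lofault wiring for the _noerr variants, sketched (illustrative):
 *
 *	old = curthread->t_lofault;
 *	if (old != 0)			! caller used on_fault()
 *		install a handler that trampolines back to old;
 *	! otherwise run the copy with no handler installed at all
 */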
3311	ENTRY(copyin_noerr)
3312
3313	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3314	bleu,pt	%ncc, .copyin_ne_small		! go to small copy cases
3315	  xor	%o0, %o1, %o3			! are src, dst alignable?
3316	btst	7, %o3				!
3317	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
3318	  nop
3319	btst	1, %o3				!
3320	bz,pt	%ncc, .copyin_ne_2		! check for half-word
3321	  nop
3322	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3323	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3324	tst	%o3
3325	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3326	  cmp	%o2, %o3			! if length <= limit
3327	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3328	  nop
3329	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3330	  nop
3331.copyin_ne_2:
3332	btst	3, %o3				!
3333	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
3334	  nop
3335	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3336	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3337	tst	%o3
3338	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3339	  cmp	%o2, %o3			! if length <= limit
3340	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3341	  nop
3342	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3343	  nop
3344.copyin_ne_4:
3345	! already checked longword, must be word aligned
3346	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3347	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3348	tst	%o3
3349	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3350	  cmp	%o2, %o3			! if length <= limit
3351	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3352	  nop
3353	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3354	  nop
3355.copyin_ne_8:
3356	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3357	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3358	tst	%o3
3359	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3360	  cmp	%o2, %o3			! if length <= limit
3361	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3362	  nop
3363	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3364	  nop
3365
3366.copyin_ne_small:
3367	ldn	[THREAD_REG + T_LOFAULT], %o4
3368	tst	%o4
3369	bz,pn	%ncc, .sm_do_copyin
3370	  nop
3371	sethi	%hi(.sm_copyio_noerr), %o5
3372	or	%o5, %lo(.sm_copyio_noerr), %o5
3373	membar	#Sync				! sync error barrier
3374	ba,pt	%ncc, .sm_do_copyin
3375	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3376
3377.copyin_noerr_more:
3378	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3379	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3380	ba,pt	%ncc, .do_copyin
3381	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3382
3383.copyio_noerr:
3384	jmp	%l6
3385	  restore %g0,0,%g0
3386
3387.sm_copyio_noerr:
3388	membar	#Sync
3389	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
3390	jmp	%o4
3391	  nop
3392
3393	SET_SIZE(copyin_noerr)
3394
3395/*
3396 * Copy a block of storage - must not overlap (from + len <= to).
3397 * No fault handler installed (to be called under on_fault())
3398 */
3399
3400	ENTRY(copyout_noerr)
3401
3402	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3403	bleu,pt	%ncc, .copyout_ne_small		! go to small copy code
3404	  xor	%o0, %o1, %o3			! are src, dst alignable?
3405	btst	7, %o3				!
3406	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
3407	  nop
3408	btst	1, %o3				!
3409	bz,pt	%ncc, .copyout_ne_2		! check for half-word
3410	  nop
3411	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3412	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3413	tst	%o3
3414	bz,pn	%icc, .copyout_ne_small		! if zero, HW copy disabled
3415	  cmp	%o2, %o3			! if length <= limit
3416	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3417	  nop
3418	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3419	  nop
3420.copyout_ne_2:
3421	btst	3, %o3				!
3422	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
3423	  nop
3424	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3425	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3426	tst	%o3
3427	bz,pn	%icc, .copyout_ne_small		! if zero, HW copy disabled
3428	  cmp	%o2, %o3			! if length <= limit
3429	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3430	  nop
3431	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3432	  nop
3433.copyout_ne_4:
3434	! already checked longword, must be word aligned
3435	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3436	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3437	tst	%o3
3438	bz,pn	%icc, .copyout_ne_small		! if zero, HW copy disabled
3439	  cmp	%o2, %o3			! if length <= limit
3440	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3441	  nop
3442	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3443	  nop
3444.copyout_ne_8:
3445	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3446	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3447	tst	%o3
3448	bz,pn	%icc, .copyout_ne_small		! if zero, HW copy disabled
3449	  cmp	%o2, %o3			! if length <= limit
3450	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3451	  nop
3452	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3453	  nop
3454
3455.copyout_ne_small:
3456	ldn	[THREAD_REG + T_LOFAULT], %o4
3457	tst	%o4
3458	bz,pn	%ncc, .sm_do_copyout		! no lofault handler installed
3459	  nop
3460	sethi	%hi(.sm_copyio_noerr), %o5
3461	or	%o5, %lo(.sm_copyio_noerr), %o5
3462	membar	#Sync				! sync error barrier
3463	ba,pt	%ncc, .sm_do_copyout
3464	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3465
3466.copyout_noerr_more:
3467	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3468	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3469	ba,pt	%ncc, .do_copyout
3470	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3471
3472	SET_SIZE(copyout_noerr)
3473
3474
3475/*
3476 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3477 * at least 256 bytes long, using block stores.  If the criteria for
3478 * using this routine are not met, it calls bzero and returns 1;
3479 * otherwise 0 is returned, indicating success.
3480 * Caller is responsible for ensuring use_hw_bzero is true and that
3481 * kpreempt_disable() has been called.
3482 */
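	! A hedged C sketch of the entry criteria and return convention,
	! assuming VIS_BLOCKSIZE is 64 (illustrative only):
	!
	!	if (((uintptr_t)addr & 63) != 0 ||	! block alignment
	!	    len < 256 ||			! minimum size
	!	    (len & 63) != 0) {			! block multiple
	!		bzero(addr, len);
	!		return (1);			! punted to bzero
	!	}
	!	... clear with block stores ...
	!	return (0);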
3483	! %i0 - start address
3484	! %i1 - length of region (multiple of 64)
3485	! %l0 - saved fprs
3486	! %l1 - pointer to saved %d0 block
3487	! %l2 - saved curthread->t_lwp
3488
3489	ENTRY(hwblkclr)
3490	! get another window w/space for one aligned block of saved fpregs
3491	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3492
3493	! Must be block-aligned
3494	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
3495	bnz,pn	%ncc, 1f
3496	  nop
3497
3498	! ... and must be 256 bytes or more
3499	cmp	%i1, 256
3500	blu,pn	%ncc, 1f
3501	  nop
3502
3503	! ... and length must be a multiple of VIS_BLOCKSIZE
3504	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
3505	bz,pn	%ncc, 2f
3506	  nop
3507
35081:	! punt, call bzero but notify the caller that bzero was used
3509	mov	%i0, %o0
3510	call	bzero
3511	  mov	%i1, %o1		! (delay slot) length arg for bzero
3512	ret
3513	  restore	%g0, 1, %o0 ! return (1) - did not use block operations
3514
35152:	rd	%fprs, %l0		! check for unused fp
3516	btst	FPRS_FEF, %l0
3517	bz,pt	%icc, 1f
3518	  nop
3519
3520	! save in-use fpregs on stack
3521	membar	#Sync
3522	add	%fp, STACK_BIAS - 65, %l1	! carve a block-aligned slot
3523	and	%l1, -VIS_BLOCKSIZE, %l1	!   out of the 2-block save area
3524	stda	%d0, [%l1]ASI_BLK_P
3525
35261:	membar	#StoreStore|#StoreLoad|#LoadStore
3527	wr	%g0, FPRS_FEF, %fprs
3528	wr	%g0, ASI_BLK_P, %asi
3529
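	! A hedged sketch of the FP save/restore protocol used here:
	!
	!	fprs = %fprs;
	!	if (fprs & FPRS_FEF)		! fp regs are live
	!		save %d0-%d14 to the block-aligned stack slot;
	!	%fprs = FPRS_FEF;		! enable fp for our use
	!	... zero the region with block stores ...
	!	if (fprs & FPRS_FEF)
	!		restore %d0-%d14 from the slot;
	!	%fprs = fprs;			! original state back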
3530	! Clear block
3531	fzero	%d0
3532	fzero	%d2
3533	fzero	%d4
3534	fzero	%d6
3535	fzero	%d8
3536	fzero	%d10
3537	fzero	%d12
3538	fzero	%d14
3539
3540	mov	256, %i3		! full passes clear 256 bytes each
3541	ba,pt	%ncc, .pz_doblock
3542	  nop
3543
3544.pz_blkstart:
3545	! stda	%d0, [%i0 + 192]%asi	! in dly slot of branch that got us here
3546	stda	%d0, [%i0 + 128]%asi
3547	stda	%d0, [%i0 + 64]%asi
3548	stda	%d0, [%i0]%asi
3549.pz_zinst:
3550	add	%i0, %i3, %i0
3551	sub	%i1, %i3, %i1
3552.pz_doblock:
3553	cmp	%i1, 256
3554	bgeu,a	%ncc, .pz_blkstart
3555	  stda	%d0, [%i0 + 192]%asi
3556
3557	cmp	%i1, 64
3558	blu	%ncc, .pz_finish
3559	  andn	%i1, (64-1), %i3	! (delay slot) round tail down to a 64-multiple
3561	srl	%i3, 4, %i2		! code offset: one 4-byte stda per 64 data bytes
3562	set	.pz_zinst, %i4
3563	sub	%i4, %i2, %i4		! back up into the stda sequence
3564	jmp	%i4			! run just the block stores we need
3565	  nop
3566
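	! The computed jump above acts like a Duff's device: back up one
	! 4-byte stda per 64 remaining bytes, so exactly the needed number
	! of block stores run before falling into .pz_zinst.  Roughly:
	!
	!	switch (remaining / 64) {	! remaining is 64, 128 or 192
	!	case 3: blk_store_zero(dst + 128);
	!	case 2: blk_store_zero(dst + 64);
	!	case 1: blk_store_zero(dst);
	!	}
	!
	! (blk_store_zero is an illustrative name for one 64-byte stda.)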
3567.pz_finish:
3568	membar	#Sync
3569	btst	FPRS_FEF, %l0
3570	bz,a	.pz_finished
3571	  wr	%l0, 0, %fprs		! restore fprs
3572
3573	! restore fpregs from stack
3574	ldda	[%l1]ASI_BLK_P, %d0
3575	membar	#Sync
3576	wr	%l0, 0, %fprs		! restore fprs
3577
3578.pz_finished:
3579	ret
3580	  restore	%g0, 0, %o0		! return (bzero or not)
3581
3582	SET_SIZE(hwblkclr)
3583
3584	/*
3585	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3586	 * using physical addresses.
3587	 */
3588	ENTRY_NP(hw_pa_bcopy32)
3589	rdpr	%pstate, %g1
3590	andn	%g1, PSTATE_IE, %g2	! run with interrupts disabled
3591	wrpr	%g0, %g2, %pstate
3592
3593	rdpr	%pstate, %g0		! (presumably serializes the wrpr above)
3594	ldxa	[%o0]ASI_MEM, %o2
3595	add	%o0, 8, %o0
3596	ldxa	[%o0]ASI_MEM, %o3
3597	add	%o0, 8, %o0
3598	ldxa	[%o0]ASI_MEM, %o4
3599	add	%o0, 8, %o0
3600	ldxa	[%o0]ASI_MEM, %o5
3601
3602	stxa	%g0, [%o1]ASI_DC_INVAL	! invalidate D$ line for dst
3603	membar	#Sync
3604
3605	stxa	%o2, [%o1]ASI_MEM
3606	add	%o1, 8, %o1
3607	stxa	%o3, [%o1]ASI_MEM
3608	add	%o1, 8, %o1
3609	stxa	%o4, [%o1]ASI_MEM
3610	add	%o1, 8, %o1
3611	stxa	%o5, [%o1]ASI_MEM
3612
3613	retl
3614	  wrpr	  %g0, %g1, %pstate
3615
3616	SET_SIZE(hw_pa_bcopy32)
3617
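	/*
	 * A hedged C sketch of hw_pa_bcopy32; ldphys()/stphys() are
	 * illustrative stand-ins for the ASI_MEM accesses:
	 *
	 *	void
	 *	hw_pa_bcopy32(uint64_t src_pa, uint64_t dst_pa)
	 *	{
	 *		disable interrupts (clear PSTATE_IE);
	 *		x0 = ldphys(src_pa);		! 4 x 8-byte loads
	 *		x1 = ldphys(src_pa + 8);
	 *		x2 = ldphys(src_pa + 16);
	 *		x3 = ldphys(src_pa + 24);
	 *		invalidate dst D$ line; membar #Sync;
	 *		stphys(dst_pa, x0);		! 4 x 8-byte stores
	 *		stphys(dst_pa + 8, x1);
	 *		stphys(dst_pa + 16, x2);
	 *		stphys(dst_pa + 24, x3);
	 *		restore %pstate;
	 *	}
	 */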
3618	DGDEF(use_hw_bcopy)
3619	.word	1
3620	DGDEF(use_hw_bzero)
3621	.word	1
3622	DGDEF(hw_copy_limit_1)
3623	.word	0
3624	DGDEF(hw_copy_limit_2)
3625	.word	0
3626	DGDEF(hw_copy_limit_4)
3627	.word	0
3628	DGDEF(hw_copy_limit_8)
3629	.word	0
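	! The hw_copy_limit_* values are byte counts: as the checks above
	! show, zero keeps the FP block-copy path disabled for that
	! alignment class, so those copies stay on the leaf routines.
	! Being global variables, they may be tuned at boot from
	! /etc/system, e.g. (illustrative values only):
	!
	!	set hw_copy_limit_8 = 0x400
	!	set use_hw_bzero = 0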
3630
3631	.align	64
3632	.section ".text"
3633