xref: /titanic_41/usr/src/uts/sun4u/cpu/opl_olympus_copy.s (revision 890e8ff10cfc85bc7d33064a9a30c3e8477b4813)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/asm_linkage.h>
31#include <sys/vtrace.h>
32#include <sys/machthread.h>
33#include <sys/clock.h>
34#include <sys/asi.h>
35#include <sys/fsr.h>
36#include <sys/privregs.h>
37
38#if !defined(lint)
39#include "assym.h"
40#endif	/* lint */
41
42/*
43 * Pseudo-code to aid in understanding the control flow of the
44 * bcopy/copyin/copyout routines.
45 *
46 * On entry:
47 *
48 * 	! Determine whether to use the FP register version
49 * 	! or the leaf routine version depending on size
50 * 	! of copy and flags.  Set up error handling accordingly.
51 *	! The transition point depends on whether the src and
52 * 	! dst addresses can be aligned to long word, word,
53 * 	! half word, or byte boundaries.
54 *	!
55 *	! WARNING: <Register usage convention>
56 *	! For FP version, %l6 holds previous error handling and
57 *	! a flag: TRAMP_FLAG (low bits)
58 *	! for leaf routine version, %o4 holds those values.
59 *	! So either %l6 or %o4 is reserved and not available for
60 *	! any other use.
61 *
62 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
63 * 		go to small_copy;		! to speed short copies
64 *
65 * 	if (src,dst long word alignable) {
66 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
67 * 			go to small_copy;
68 *		if (length <= hw_copy_limit_8)
69 * 			go to small_copy;
70 * 		go to FPBLK_copy;
71 * 	}
72 * 	if (src,dst not alignable) {
73 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
74 * 			go to small_copy;
75 *		if (length <= hw_copy_limit_1)
76 * 			go to small_copy;
77 * 		go to FPBLK_copy;
78 * 	}
79 * 	if (src,dst halfword alignable) {
80 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
81 * 			go to small_copy;
82 *		if (length <= hw_copy_limit_2)
83 * 			go to small_copy;
84 * 		go to FPBLK_copy;
85 * 	}
86 * 	if (src,dst word alignable) {
87 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
88 * 			go to small_copy;
89 *		if (length <= hw_copy_limit_4)
90 * 			go to small_copy;
91 * 		go to FPBLK_copy;
92 * 	}
93 *
94 * small_copy:
95 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
96 *
97 *	if (count <= 3)				! fast path for tiny copies
98 *		go to sm_left;			! special finish up code
99 *	else
100 *		if (count > CHKSIZE)		! medium sized copies
101 *			go to sm_med		! tuned by alignment
102 *		if(src&dst not both word aligned) {
103 *	sm_movebytes:
104 *			move byte by byte in 4-way unrolled loop
105 *			fall into sm_left;
106 *	sm_left:
107 *			move 0-3 bytes byte at a time as needed.
108 *			restore error handler and exit.
109 *
110 * 		} else {	! src&dst are word aligned
111 *			check for at least 8 bytes left,
112 *			move word at a time, unrolled by 2
113 *			when fewer than 8 bytes left,
114 *	sm_half:	move half word at a time while 2 or more bytes left
115 *	sm_byte:	move final byte if necessary
116 *	sm_exit:
117 *			restore error handler and exit.
118 *		}
119 *
120 * ! Medium length cases with at least CHKSIZE bytes available
121 * ! method: line up src and dst as best possible, then
122 * ! move data in 4-way unrolled loops.
123 *
124 * sm_med:
125 *	if(src&dst unalignable)
126 * 		go to sm_movebytes
127 *	if(src&dst halfword alignable)
128 *		go to sm_movehalf
129 *	if(src&dst word alignable)
130 *		go to sm_moveword
131 * ! fall into long word movement
132 *	move bytes until src is word aligned
133 *	if not long word aligned, move a word
134 *	move long words in 4-way unrolled loop until < 32 bytes left
135 *      move long words in 1-way unrolled loop until < 8 bytes left
136 *	if zero bytes left, goto sm_exit
137 *	if one byte left, go to sm_byte
138 *	else go to sm_half
139 *
140 * sm_moveword:
141 *	move bytes until src is word aligned
142 *	move words in 4-way unrolled loop until < 16 bytes left
143 *      move words in 1-way unrolled loop until < 4 bytes left
144 *	if zero bytes left, goto sm_exit
145 *	if one byte left, go to sm_byte
146 *	else go to sm_half
147 *
148 * sm_movehalf:
149 *	move a byte if needed to align src on halfword
150 *	move halfwords in 4-way unrolled loop until < 8 bytes left
151 *	if zero bytes left, goto sm_exit
152 *	if one byte left, go to sm_byte
153 *	else go to sm_half
154 *
155 *
156 * FPBLK_copy:
157 * 	%l6 = curthread->t_lofault;
158 * 	if (%l6 != NULL) {
159 * 		membar #Sync
160 * 		curthread->t_lofault = .copyerr;
161 * 		caller_error_handler = TRUE             ! %l6 |= 2
162 * 	}
163 *
164 *	! for FPU testing we must not migrate cpus
165 * 	if (curthread->t_lwp == NULL) {
166 *		! Kernel threads do not have pcb's in which to store
167 *		! the floating point state, so disallow preemption during
168 *		! the copy.  This also prevents cpu migration.
169 * 		kpreempt_disable(curthread);
170 *	} else {
171 *		thread_nomigrate();
172 *	}
173 *
174 * 	old_fprs = %fprs;
175 * 	old_gsr = %gsr;
176 * 	if (%fprs.fef) {
177 * 		%fprs.fef = 1;
178 * 		save current fpregs on stack using blockstore
179 * 	} else {
180 * 		%fprs.fef = 1;
181 * 	}
182 *
183 *
184 * 	do_blockcopy_here;
185 *
186 * In lofault handler:
187 *	curthread->t_lofault = .copyerr2;
188 *	Continue on with the normal exit handler
189 *
190 * On normal exit:
191 * 	%gsr = old_gsr;
192 * 	if (old_fprs & FPRS_FEF)
193 * 		restore fpregs from stack using blockload
194 *	else
195 *		zero fpregs
196 * 	%fprs = old_fprs;
197 * 	membar #Sync
198 * 	curthread->t_lofault = (%l6 & ~3);
199 *	! following test omitted from copyin/copyout as they
200 *	! will always have a current thread
201 * 	if (curthread->t_lwp == NULL)
202 *		kpreempt_enable(curthread);
203 *	else
204 *		thread_allowmigrate();
205 * 	return (0)
206 *
207 * In second lofault handler (.copyerr2):
208 *	We've tried to restore fp state from the stack and failed.  To
209 *	prevent from returning with a corrupted fp state, we will panic.
210 */
211
212/*
213 * Comments about optimization choices
214 *
215 * The initial optimization decision in this code is to determine
216 * whether to use the FP registers for a copy or not.  If we don't
217 * use the FP registers, we can execute the copy as a leaf routine,
218 * saving a register save and restore.  Also, less elaborate setup
219 * is required, allowing short copies to be completed more quickly.
220 * For longer copies, especially unaligned ones (where the src and
221 * dst do not align to allow simple ldx,stx operation), the FP
222 * registers allow much faster copy operations.
223 *
224 * The estimated extra cost of the FP path will vary depending on
225 * src/dst alignment, dst offset from the next 64 byte FPblock store
226 * boundary, remaining src data after the last full dst cache line is
227 * moved whether the FP registers need to be saved, and some other
228 * minor issues.  The average additional overhead is estimated to be
229 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
230 * around 10 clocks, elaborate calculation would slow down to all
231 * longer copies and only benefit a small portion of medium sized
232 * copies.  Rather than incur such cost, we chose fixed transition
233 * points for each of the alignment choices.
234 *
235 * For the inner loop, here is a comparison of the per cache line
236 * costs for each alignment when src&dst are in cache:
237 *
238 * byte aligned:  108 clocks slower for non-FPBLK
239 * half aligned:   44 clocks slower for non-FPBLK
240 * word aligned:   12 clocks slower for non-FPBLK
241 * long aligned:    4 clocks >>faster<< for non-FPBLK
242 *
243 * The long aligned loop runs faster because it does no prefetching.
244 * That wins if the data is in cache or there is too little
245 * data to gain much benefit from prefetching.  But when there
246 * is more data and that data is not in cache, failing to prefetch
247 * can run much slower.  In addition, there is a 2 Kbyte store queue
248 * which will cause the non-FPBLK inner loop to slow for larger copies.
249 * The exact tradeoff is strongly load and application dependent, with
250 * increasing risk of a customer visible performance regression if the
251 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
252 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
253 * upper limit for the non-FPBLK code.  To minimize performance regression
254 * risk while still gaining the primary benefits of the improvements to
255 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
256 * hw_copy_limit_*.  Later experimental studies using different values
257 * of hw_copy_limit_* can be used to make further adjustments if
258 * appropriate.
259 *
260 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
261 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
262 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
263 * hw_copy_limit_8 = src and dst are longword aligned
264 *
265 * To say that src and dst are word aligned means that after
266 * some initial alignment activity of moving 0 to 3 bytes,
267 * both the src and dst will be on word boundaries so that
268 * word loads and stores may be used.
269 *
270 * Default values at May,2005 are:
271 * hw_copy_limit_1 =  256
272 * hw_copy_limit_2 =  512
273 * hw_copy_limit_4 = 1024
274 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
275 *
276 *
277 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
278 * disabled for that alignment choice.
279 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
280 * the value of VIS_COPY_THRESHOLD is used.
281 * It is not envisioned that hw_copy_limit_? will be changed in the field
282 * It is provided to allow for disabling FPBLK copies and to allow
283 * easy testing of alternate values on future HW implementations
284 * that might have different cache sizes, clock rates or instruction
285 * timing rules.
286 *
287 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
288 * threshold to speedup all shorter copies (less than 256).  That
289 * saves an alignment test, memory reference, and enabling test
290 * for all short copies, or an estimated 24 clocks.
291 *
292 * The order in which these limits are checked does matter since each
293 * non-predicted tst and branch costs around 10 clocks.
294 * If src and dst are randomly selected addresses,
295 * 4 of 8 will not be alignable.
296 * 2 of 8 will be half word alignable.
297 * 1 of 8 will be word alignable.
298 * 1 of 8 will be long word alignable.
299 * But, tests on running kernels show that src and dst to copy code
300 * are typically not on random alignments.  Structure copies and
301 * copies of larger data sizes are often on long word boundaries.
302 * So we test the long word alignment case first, then
303 * the byte alignment, then halfword, then word alignment.
304 *
305 * Several times, tests for length are made to split the code
306 * into subcases.  These tests often allow later tests to be
307 * avoided.  For example, within the non-FPBLK copy, we first
308 * check for tiny copies of 3 bytes or less.  That allows us
309 * to use a 4-way unrolled loop for the general byte copy case
310 * without a test on loop entry.
311 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
312 * vs longer cases.  For the really short case, we don't attempt
313 * align src and dst.  We try to minimize special case tests in
314 * the shortest loops as each test adds a significant percentage
315 * to the total time.
316 *
317 * For the medium sized cases, we allow ourselves to adjust the
318 * src and dst alignment and provide special cases for each of
319 * the four adjusted alignment cases. The CHKSIZE that was used
320 * to decide between short and medium size was chosen to be 39
321 * as that allows for the worst case of 7 bytes of alignment
322 * shift and 4 times 8 bytes for the first long word unrolling.
323 * That knowledge saves an initial test for length on entry into
324 * the medium cases.  If the general loop unrolling factor were
325 * to be increased, this number would also need to be adjusted.
326 *
327 * For all cases in the non-FPBLK code where it is known that at
328 * least 4 chunks of data are available for movement, the
329 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
330 * or 2 clocks per data element.
331 *
332 * Instruction alignment is forced by use of .align 16 directives
333 * and nops which are not executed in the code.  This
334 * combination of operations shifts the alignment of following
335 * loops to insure that loops are aligned so that their instructions
336 * fall within the minimum number of 4 instruction fetch groups.
337 * If instructions are inserted or removed between the .align
338 * instruction and the unrolled loops, then the alignment needs
339 * to be readjusted.  Misaligned loops can add a clock per loop
340 * iteration to the loop timing.
341 *
342 * In a few cases, code is duplicated to avoid a branch.  Since
343 * a non-predicted tst and branch takes 10 clocks, this savings
344 * is judged an appropriate time-space tradeoff.
345 *
346 * Within the FPBLK-code, the prefetch method in the inner
347 * loop needs to be explained as it is not standard.  Two
348 * prefetches are issued for each cache line instead of one.
349 * The primary one is at the maximum reach of 8 cache lines.
350 * Most of the time, that maximum prefetch reach gives the
351 * cache line more time to reach the processor for systems with
352 * higher processor clocks.  But, sometimes memory interference
353 * can cause that prefetch to be dropped.  Putting a second
354 * prefetch at a reach of 5 cache lines catches the drops
355 * three iterations later and shows a measured improvement
356 * in performance over any similar loop with a single prefetch.
357 * The prefetches are placed in the loop so they overlap with
358 * non-memory instructions, so that there is no extra cost
359 * when the data is already in-cache.
360 *
361 */
362
363/*
364 * Notes on preserving existing fp state and on membars.
365 *
366 * When a copyOP decides to use fp we may have to preserve existing
367 * floating point state.  It is not the caller's state that we need to
368 * preserve - the rest of the kernel does not use fp and, anyway, fp
369 * registers are volatile across a call.  Some examples:
370 *
371 *	- userland has fp state and is interrupted (device interrupt
372 *	  or trap) and within the interrupt/trap handling we use
373 *	  bcopy()
374 *	- another (higher level) interrupt or trap handler uses bcopy
375 *	  while a bcopy from an earlier interrupt is still active
376 *	- an asynchronous error trap occurs while fp state exists (in
377 *	  userland or in kernel copy) and the tl0 component of the handling
378 *	  uses bcopy
379 *	- a user process with fp state incurs a copy-on-write fault and
380 *	  hwblkpagecopy always uses fp
381 *
382 * We therefore need a per-call place in which to preserve fp state -
383 * using our stack is ideal (and since fp copy cannot be leaf optimized
384 * because of calls it makes, this is no hardship).
385 *
386 * When we have finished fp copy (with its repeated block stores)
387 * we must membar #Sync so that our block stores may complete before
388 * we either restore the original fp state into the fp registers or
389 * return to a caller which may initiate other fp operations that could
390 * modify the fp regs we used before the block stores complete.
391 *
392 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
393 * t_lofault is not NULL will not panic but will instead trampoline
394 * to the registered lofault handler.  There is no need for any
395 * membars for these - eg, our store to t_lofault will always be visible to
396 * ourselves and it is our cpu which will take any trap.
397 *
398 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
399 * while t_lofault is not NULL will also not panic.  Since we're copying
400 * to or from userland the extent of the damage is known - the destination
401 * buffer is incomplete.  So trap handlers will trampoline to the lofault
402 * handler in this case which should take some form of error action to
403 * avoid using the incomplete buffer.  The trap handler also flags the
404 * fault so that later return-from-trap handling (for the trap that brought
405 * this thread into the kernel in the first place) can notify the process
406 * and reboot the system (or restart the service with Greenline/Contracts).
407 *
408 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
409 * result in deferred error traps - the trap is taken sometime after
410 * the event and the trap PC may not be the PC of the faulting access.
411 * Delivery of such pending traps can be forced by a membar #Sync, acting
412 * as an "error barrier" in this role.  To accurately apply the user/kernel
413 * separation described in the preceding paragraph we must force delivery
414 * of deferred traps affecting kernel state before we install a lofault
415 * handler (if we interpose a new lofault handler on an existing one there
416 * is no need to repeat this), and we must force delivery of deferred
417 * errors affecting the lofault-protected region before we clear t_lofault.
418 * Failure to do so results in lost kernel state being interpreted as
419 * affecting a copyin/copyout only, or of an error that really only
420 * affects copy data being interpreted as losing kernel state.
421 *
422 * Since the copy operations may preserve and later restore floating
423 * point state that does not belong to the caller (see examples above),
424 * we must be careful in how we do this in order to prevent corruption
425 * of another program.
426 *
427 * To make sure that floating point state is always saved and restored
428 * correctly, the following "big rules" must be followed when the floating
429 * point registers will be used:
430 *
431 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
432 *    bit 0 (FPUSED_FLAG, value 1) indicates that the floating point registers
433 *    are in use.  Bit 1 (TRAMP_FLAG, value 2) indicates that the call was to
434 *    bcopy, and a lofault handler was set coming in.
435 *
436 * 2. The FPUSED flag indicates that all FP state has been successfully stored
437 *    on the stack.  It should not be set until this save has been completed.
438 *
439 * 3. The FPUSED flag should not be cleared on exit until all FP state has
440 *    been restored from the stack.  If an error occurs while restoring
441 *    data from the stack, the error handler can check this flag to see if
442 *    a restore is necessary.
443 *
444 * 4. Code run under the new lofault handler must be kept to a minimum.  In
445 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
446 *    to kpreempt(), should not be made until after the lofault handler has
447 *    been restored.
448 */
449
450/*
451 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
452 * to "break even" using FP/VIS-accelerated memory operations.
453 * The FPBLK code assumes a minimum number of bytes are available
454 * to be moved on entry.  Check that code carefully before
455 * reducing VIS_COPY_THRESHOLD below 256.
 *
 * The entry code (see kcopy) compares the length against this value
 * first: copies of VIS_COPY_THRESHOLD bytes or fewer go straight to the
 * leaf (small copy) path without examining alignment or the
 * hw_copy_limit_* variables.
456 */
457/*
458 * This shadows sys/machsystm.h which can't be included due to the lack of
459 * _ASM guards in include files it references. Change it here, change it there.
460 */
461#define VIS_COPY_THRESHOLD 256
462
463/*
464 * TEST for very short copies
465 * Be aware that the maximum unroll for the short unaligned case
466 * is SHORTCOPY+1
 *
 * CHKSIZE is the boundary between the short and the medium sized
 * non-FPBLK copies.  39 allows for the worst case of 7 bytes of
 * alignment shift plus 4 times 8 bytes for the first unrolled long
 * word loop; it must be revisited if the unrolling factor changes.
467 */
468#define SHORTCOPY 3
469#define CHKSIZE  39
470
471/*
 * Flags kept in the low bits of the saved lofault handler value
 * (%l6 in the FP version, %o4 in the leaf routine version).
 *
 * FPUSED_FLAG:
 * Indicates that the floating point registers are in use by the copy
 * and that the previous FP state has been saved on the stack.
 *
 * TRAMP_FLAG:
472 * Indicates that we're to trampoline to the error handler.
473 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
474 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 *
 * MASK_FLAGS:
 * Both flag bits; cleared from the saved value (see .copyerr) to
 * recover the original t_lofault handler address.
475 */
476#define	FPUSED_FLAG	1
477#define	TRAMP_FLAG	2
478#define	MASK_FLAGS	3
479
480/*
481 * Number of outstanding prefetches.
482 * We may need more tuning when Olympus-C processor is available.
 *
 * NOTE(review): the units of these two values (presumably prefetch
 * distances expressed in VIS_BLOCKSIZE blocks) cannot be determined from
 * this part of the file - confirm against the FPBLK copy loops.
483 */
484#define	OLYMPUS_C_PREFETCH	4
485#define	OLYMPUS_C_2ND_PREFETCH	10
486
/* Size in bytes of one FP block store/load (BST/BLD) transfer. */
487#define	VIS_BLOCKSIZE		64
488
489/*
490 * Size of stack frame in order to accommodate a 64-byte aligned
491 * floating-point register save area and 2 64-bit temp locations.
492 * All copy functions use two quadrants of fp registers; to assure a
493 * block-aligned two block buffer in which to save we must reserve
494 * three blocks on stack.  Not all functions preserve %fprs on stack
495 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
496 *
497 *    _______________________________________ <-- %fp + STACK_BIAS
498 *    | We may need to preserve 2 quadrants |
499 *    | of fp regs, but since we do so with |
500 *    | BST/BLD we need room in which to    |
501 *    | align to VIS_BLOCKSIZE bytes.  So   |
502 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
503 *    |-------------------------------------|
504 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
505 *    |-------------------------------------|
506 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
507 *    ---------------------------------------
508 */
/*
 * HWCOPYFRAMESIZE: three VIS_BLOCKSIZE blocks for the fp register save
 * area plus two 8-byte slots (one for %fprs, one for %gsr).
 */
509#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
/* Distance below %fp + STACK_BIAS of the bottom of the 3-block save area. */
510#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
/*
 * Subtracted from %fp + STACK_BIAS before rounding down to a
 * VIS_BLOCKSIZE boundary (see the BST_ and BLD_ macros) to locate the
 * block-aligned two-block buffer inside that area.
 */
511#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
/* Distances below %fp + STACK_BIAS of the saved %fprs and %gsr slots. */
512#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
513#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
514
515/*
516 * Common macros used by the various versions of the block copy
517 * routines in this file.
518 */
519
520/*
521 * In FP copies if we do not have preserved data to restore over
522 * the fp regs we used then we must zero those regs to avoid
523 * exposing portions of the data to later threads (data security).
524 *
525 * Copy functions use either quadrants 1 and 3 or 2 and 4.
526 *
527 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
528 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
529 *
530 * The instructions below are quicker than repeated fzero instructions
531 * since they can dispatch down two fp pipelines.
532 */
/*
 * FZEROQ1Q3: zero fp quadrants 1 and 3 (%f0 - %f14 and %f32 - %f46 as
 * double registers).  %f0 is cleared with fzero and then replicated
 * into every other double register of the two quadrants with fmovd.
 * Takes no arguments; modifies only those fp registers.
 */
533#define	FZEROQ1Q3			\
534	fzero	%f0			;\
535	fmovd	%f0, %f2		;\
536	fmovd	%f0, %f4		;\
537	fmovd	%f0, %f6		;\
538	fmovd	%f0, %f8		;\
539	fmovd	%f0, %f10		;\
540	fmovd	%f0, %f12		;\
541	fmovd	%f0, %f14		;\
542	fmovd	%f0, %f32		;\
543	fmovd	%f0, %f34		;\
544	fmovd	%f0, %f36		;\
545	fmovd	%f0, %f38		;\
546	fmovd	%f0, %f40		;\
547	fmovd	%f0, %f42		;\
548	fmovd	%f0, %f44		;\
549	fmovd	%f0, %f46
550
/*
 * FZEROQ2Q4: zero fp quadrants 2 and 4 (%f16 - %f30 and %f48 - %f62 as
 * double registers).  %f16 is cleared with fzero and then replicated
 * into every other double register of the two quadrants with fmovd.
 *
 * The fmovd source must be %f16.  %f0 belongs to quadrant 1, which this
 * macro does not clear, so copying from %f0 would fill quadrants 2 and 4
 * with whatever %f0 happens to contain instead of zeroes.
 * Takes no arguments; modifies only those fp registers.
 */
551#define	FZEROQ2Q4			\
552	fzero	%f16			;\
553	fmovd	%f16, %f18		;\
554	fmovd	%f16, %f20		;\
555	fmovd	%f16, %f22		;\
556	fmovd	%f16, %f24		;\
557	fmovd	%f16, %f26		;\
558	fmovd	%f16, %f28		;\
559	fmovd	%f16, %f30		;\
560	fmovd	%f16, %f48		;\
561	fmovd	%f16, %f50		;\
562	fmovd	%f16, %f52		;\
563	fmovd	%f16, %f54		;\
564	fmovd	%f16, %f56		;\
565	fmovd	%f16, %f58		;\
566	fmovd	%f16, %f60		;\
567	fmovd	%f16, %f62
568
569/*
570 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
571 * Used to save and restore in-use fp registers when we want to use FP
572 * and find fp already in use and copy size still large enough to justify
573 * the additional overhead of this save and restore.
574 *
575 * A membar #Sync is needed before save to sync fp ops initiated before
576 * the call to the copy function (by whoever has fp in use); for example
577 * an earlier block load to the quadrant we are about to save may still be
578 * "in flight".  A membar #Sync is required at the end of the save to
579 * sync our block store (the copy code is about to begin ldd's to the
580 * first quadrant).
581 *
582 * Similarly: a membar #Sync before restore allows the block stores of
583 * the copy operation to complete before we fill the quadrants with their
584 * original data, and a membar #Sync after restore lets the block loads
585 * of the restore complete before we return to whoever has the fp regs
586 * in use.  To avoid repeated membar #Sync we make it the responsibility
587 * of the copy code to membar #Sync immediately after copy is complete
588 * and before using the BLD_*_FROMSTACK macro.
589 */
590#if !defined(lint)
/*
 * Each macro below takes one scratch register, tmp1, which it overwrites.
 * The save area address is formed by rounding
 * (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) down to a VIS_BLOCKSIZE
 * boundary.  The first quadrant is transferred at that address and the
 * second one VIS_BLOCKSIZE bytes above it, using block stores (BST_*) or
 * block loads (BLD_*) through ASI_BLK_P, followed by a membar #Sync.
 * On exit tmp1 holds the address of the second block.
 *
 * The leading membar #Sync is only present as a comment in each macro;
 * the BLD_* variants rely on the copy code having issued it.
 */
591#define BST_FPQ1Q3_TOSTACK(tmp1)				\
592	/* membar #Sync	*/					;\
593	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
594	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
595	stda	%f0, [tmp1]ASI_BLK_P				;\
596	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
597	stda	%f32, [tmp1]ASI_BLK_P				;\
598	membar	#Sync
599
600#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
601	/* membar #Sync - provided at copy completion */	;\
602	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
603	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
604	ldda	[tmp1]ASI_BLK_P, %f0				;\
605	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
606	ldda	[tmp1]ASI_BLK_P, %f32				;\
607	membar	#Sync
608
609#define BST_FPQ2Q4_TOSTACK(tmp1)				\
610	/* membar #Sync */					;\
611	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
612	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
613	stda	%f16, [tmp1]ASI_BLK_P				;\
614	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
615	stda	%f48, [tmp1]ASI_BLK_P				;\
616	membar	#Sync
617
618#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
619	/* membar #Sync - provided at copy completion */	;\
620	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
621	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
622	ldda	[tmp1]ASI_BLK_P, %f16				;\
623	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
624	ldda	[tmp1]ASI_BLK_P, %f48				;\
625	membar	#Sync
626#endif
627
628/*
629 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
630 * prevent preemption if there is no t_lwp to save FP state to on context
631 * switch) before commencing a FP copy, and reallow it on completion or
632 * in error trampoline paths when we were using FP copy.
633 *
634 * Both macros may call other functions, so be aware that all outputs are
635 * forfeit after using these macros.  For this reason we do not pass registers
636 * to use - we just use any outputs we want.
637 *
638 * Pseudo code:
639 *
640 * FP_NOMIGRATE:
641 *
642 * if (curthread->t_lwp) {
643 *	thread_nomigrate();
644 * } else {
645 *	kpreempt_disable();
646 * }
647 *
648 * FP_ALLOWMIGRATE:
649 *
650 * if (curthread->t_lwp) {
651 *	thread_allowmigrate();
652 * } else {
653 *	kpreempt_enable();
654 * }
655 */
656
/*
 * FP_NOMIGRATE(label1, label2)
 *
 * label1 and label2 are numeric local labels chosen by the caller; they
 * are defined inside the macro and referenced as forward ("f") labels.
 *
 * If curthread->t_lwp is non-NULL, calls thread_nomigrate().  Otherwise
 * the byte at curthread + T_PREEMPT is loaded (in the annulled delay
 * slot, so only when the branch is taken), incremented and stored back.
 * Uses %o0 and %o1; the call may change further output registers.
 */
657#define	FP_NOMIGRATE(label1, label2)				\
658	ldn	[THREAD_REG + T_LWP], %o0			;\
659	brz,a,pn %o0, label1/**/f				;\
660	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
661	call	thread_nomigrate				;\
662	  nop							;\
663	ba	label2/**/f					;\
664	  nop							;\
665label1:								;\
666	inc	%o1						;\
667	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
668label2:
669
/*
 * FP_ALLOWMIGRATE(label1, label2)
 *
 * label1 and label2 are numeric local labels chosen by the caller; they
 * are defined inside the macro and referenced as forward ("f") labels.
 *
 * If curthread->t_lwp is non-NULL, calls thread_allowmigrate().
 * Otherwise the byte at curthread + T_PREEMPT is decremented and stored
 * back (the store sits in the delay slot of the brnz, so it always
 * happens).  When the count reaches zero and the current cpu's
 * CPU_KPRUNRUN byte is non-zero, kpreempt() is called with the current
 * %pil as its argument (read in the delay slot of the call).
 * Uses %o0 and %o1; the calls may change further output registers.
 */
670#define	FP_ALLOWMIGRATE(label1, label2)			\
671	ldn	[THREAD_REG + T_LWP], %o0			;\
672	brz,a,pn %o0, label1/**/f				;\
673	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
674	call thread_allowmigrate				;\
675	  nop							;\
676	ba	label2/**/f					;\
677	  nop							;\
678label1:								;\
679	dec	%o1						;\
680	brnz,pn	%o1, label2/**/f				;\
681	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
682	ldn	[THREAD_REG + T_CPU], %o0			;\
683	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
684	brz,pt	%o0, label2/**/f				;\
685	  nop							;\
686	call	kpreempt					;\
687	  rdpr	%pil, %o0					;\
688label2:
689
690/*
691 * Copy a block of storage, returning an error code if `from' or
692 * `to' takes a kernel pagefault which cannot be resolved.
693 * Returns errno value on pagefault error, 0 if all ok
694 */
695
696#if defined(lint)
697
/*
 * Lint-only stub: supplies the C prototype of kcopy when this file is
 * processed with lint defined.  The real routine is the assembly in the
 * #else branch.
 */
698/* ARGSUSED */
699int
700kcopy(const void *from, void *to, size_t count)
701{ return(0); }
702
703#else	/* lint */
704
705	.seg	".text"
706	.align	4
707
708	ENTRY(kcopy)
	/*
	 * Entry dispatch.  Per the prototype, %o0 = from, %o1 = to and
	 * %o2 = count.  %o3 first holds from ^ to, whose low bits tell
	 * which alignment src and dst can both be brought to; it is then
	 * reused for the hw_copy_limit_* value (a 32-bit load) of the
	 * selected alignment class.
	 *
	 * A copy goes to .kcopy_small (leaf routine, no FP) when
	 * count <= VIS_COPY_THRESHOLD, when the selected limit is zero, or
	 * when count <= limit.  Otherwise it goes to .kcopy_more (FP path).
	 */
709
710	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
711	bleu,pt	%ncc, .kcopy_small		! short copy: use leaf routine
712	  xor	%o0, %o1, %o3			! are src, dst alignable?
713	btst	7, %o3				! low 3 bits of src^dst clear?
714	bz,pt	%ncc, .kcopy_8			! check for longword alignment
715	  nop
716	btst	1, %o3				! low bit of src^dst clear?
717	bz,pt	%ncc, .kcopy_2			! check for half-word
718	  nop
719	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
720	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
721	tst	%o3
722	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
723	  cmp	%o2, %o3			! if length <= limit
724	bleu,pt	%ncc, .kcopy_small		! go to small copy
725	  nop
726	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
727	  nop
728.kcopy_2:
729	btst	3, %o3				! low 2 bits of src^dst clear?
730	bz,pt	%ncc, .kcopy_4			! check for word alignment
731	  nop
732	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
733	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
734	tst	%o3
735	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
736	  cmp	%o2, %o3			! if length <= limit
737	bleu,pt	%ncc, .kcopy_small		! go to small copy
738	  nop
739	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
740	  nop
741.kcopy_4:
742	! already checked longword, must be word aligned
743	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
744	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
745	tst	%o3
746	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
747	  cmp	%o2, %o3			! if length <= limit
748	bleu,pt	%ncc, .kcopy_small		! go to small copy
749	  nop
750	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
751	  nop
752.kcopy_8:
753	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
754	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
755	tst	%o3
756	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
757	  cmp	%o2, %o3			! if length <= limit
758	bleu,pt	%ncc, .kcopy_small		! go to small copy
759	  nop
760	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
761	  nop
762
	/*
	 * Leaf path: no register window is allocated.  The previous
	 * t_lofault is kept in %o4 (no TRAMP_FLAG is set for kcopy) and
	 * .sm_copyerr is installed in the delay slot of the branch to
	 * .sm_do_copy, which is defined outside this section.
	 */
763.kcopy_small:
764	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
765	or	%o5, %lo(.sm_copyerr), %o5
766	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
767	membar	#Sync				! sync error barrier
768	ba,pt	%ncc, .sm_do_copy		! common code
769	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
770
	/*
	 * FP path: allocate a window whose frame has room for
	 * HWCOPYFRAMESIZE bytes of fp save area.  The previous t_lofault
	 * is kept in %l6 and .copyerr is installed in the delay slot of
	 * the branch to .do_copy, which is defined outside this section.
	 */
771.kcopy_more:
772	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
773	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
774	or	%l7, %lo(.copyerr), %l7
775	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
776	membar	#Sync				! sync error barrier
777	ba,pt	%ncc, .do_copy			! common code
778	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
779
780
781/*
782 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
783 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
784 */
785.copyerr:
	/*
	 * Install .copyerr2 first, so that a fault taken while the fp
	 * state is being restored below ends in a panic.
	 */
786	set	.copyerr2, %l0
787	membar	#Sync				! sync error barrier
788	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	/*
	 * If FPUSED_FLAG is clear, skip the fp restore.  The delay slot
	 * always executes, so %l0 holds the TRAMP_FLAG bit on both paths.
	 */
789	btst	FPUSED_FLAG, %l6
790	bz	%ncc, 1f
791	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
792
793	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
794	wr	%o2, 0, %gsr
795
	/*
	 * The saved %fprs decides how quadrants 1 and 3 are cleaned up:
	 * with FPRS_FEF set they are reloaded from the stack, otherwise
	 * they are zeroed (label 4).
	 */
796	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
797	btst	FPRS_FEF, %o3
798	bz,pt	%icc, 4f
799	  nop
800
801	BLD_FPQ1Q3_FROMSTACK(%o2)
802
803	ba,pt	%ncc, 1f
804	  wr	%o3, 0, %fprs		! restore fprs
805
8064:
807	FZEROQ1Q3
808	wr	%o3, 0, %fprs		! restore fprs
809
810	!
811	! Need to cater for the different expectations of kcopy
812	! and bcopy. kcopy will *always* set a t_lofault handler
813	! If it fires, we're expected to just return the error code
814	! and *not* to invoke any existing error handler. As far as
815	! bcopy is concerned, we only set t_lofault if there was an
816	! existing lofault handler. In that case we're expected to
817	! invoke the previously existing handler after resetting the
818	! t_lofault value.
819	!
8201:
	/*
	 * Strip the flag bits to recover the previous handler address and
	 * put it back in t_lofault before FP_ALLOWMIGRATE, which may call
	 * other functions.
	 */
821	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
822	membar	#Sync				! sync error barrier
823	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
824	FP_ALLOWMIGRATE(5, 6)
825
	/*
	 * Without TRAMP_FLAG: return, passing the errno value in %g1 to the
	 * caller's %o0 via the restore in the delay slot.
	 */
826	btst	TRAMP_FLAG, %l0
827	bnz,pn	%ncc, 3f
828	  nop
829	ret
830	  restore	%g1, 0, %o0
831
8323:
833	!
834	! We're here via bcopy. There *must* have been an error handler
835	! in place otherwise we would have died a nasty death already.
836	!
837	jmp	%l6				! goto real handler
838	  restore	%g0, 0, %o0		! dispose of copy window
839
840/*
841 * We got here because of a fault in .copyerr.  We can't safely restore fp
842 * state, so we panic.
843 */
844fp_panic_msg:
845	.asciz	"Unable to restore fp state after copy operation"
846
847	.align	4
848.copyerr2:
849	set	fp_panic_msg, %o0
850	call	panic
851	  nop
852
853/*
854 * We got here because of a fault during a small kcopy or bcopy.
855 * No floating point registers are used by the small copies.
856 * Errno value is in %g1.
857 */
858.sm_copyerr:
8591:
860	btst	TRAMP_FLAG, %o4
861	membar	#Sync
862	andn	%o4, TRAMP_FLAG, %o4
863	bnz,pn	%ncc, 3f
864	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
865	retl
866	  mov	%g1, %o0
8673:
868	jmp	%o4				! goto real handler
869	  mov	%g0, %o0			!
870
871	SET_SIZE(kcopy)
872#endif	/* lint */
873
874
875/*
876 * Copy a block of storage - must not overlap (from + len <= to).
877 * Registers: l6 - saved t_lofault
878 * (for short copies, o4 - saved t_lofault)
879 *
880 * bcopy accepts any alignment and any count; copying a page (double word
881 * aligned, count >= 256) is only one of the cases it handles.
882 */
883#if defined(lint)
884
/* Lint-only stub that gives bcopy a C prototype; the assembly version is under #else. */
885/* ARGSUSED */
886void
887bcopy(const void *from, void *to, size_t count)
888{}
889
890#else	/* lint */
891
/*
 * bcopy: %o0 = from, %o1 = to, %o2 = count.
 *
 * Dispatch: a count <= VIS_COPY_THRESHOLD, a count <= the hw_copy_limit_N
 * chosen from the relative alignment of from/to, or a limit of zero, is
 * handled by the leaf loops below, which use only %o registers and take
 * no register window.  Anything larger branches to .bcopy_more.
 *
 * The leaf path installs .sm_copyerr as t_lofault only if a handler was
 * already present; the previous handler is kept in %o4 with TRAMP_FLAG
 * or'ed in.  Every leaf exit clears TRAMP_FLAG, stores %o4 back to
 * t_lofault and returns 0 in %o0.
 */
892	ENTRY(bcopy)
893
894	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
895	bleu,pt	%ncc, .bcopy_small		! at or below threshold: leaf copy
896	  xor	%o0, %o1, %o3			! are src, dst alignable?
897	btst	7, %o3				!
898	bz,pt	%ncc, .bcopy_8			! check for longword alignment
899	  nop
900	btst	1, %o3				!
901	bz,pt	%ncc, .bcopy_2			! check for half-word
902	  nop
903	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
904	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
905	tst	%o3
906	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
907	  cmp	%o2, %o3			! if length <= limit
908	bleu,pt	%ncc, .bcopy_small		! go to small copy
909	  nop
910	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
911	  nop
912.bcopy_2:
913	btst	3, %o3				!
914	bz,pt	%ncc, .bcopy_4			! check for word alignment
915	  nop
916	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
917	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
918	tst	%o3
919	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
920	  cmp	%o2, %o3			! if length <= limit
921	bleu,pt	%ncc, .bcopy_small		! go to small copy
922	  nop
923	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
924	  nop
925.bcopy_4:
926	! already checked longword, must be word aligned
927	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
928	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
929	tst	%o3
930	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
931	  cmp	%o2, %o3			! if length <= limit
932	bleu,pt	%ncc, .bcopy_small		! go to small copy
933	  nop
934	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
935	  nop
936.bcopy_8:
937	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
938	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
939	tst	%o3
940	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
941	  cmp	%o2, %o3			! if length <= limit
942	bleu,pt	%ncc, .bcopy_small		! go to small copy
943	  nop
944	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
945	  nop
946
947	.align	16
948.bcopy_small:
	/*
	 * Shadow t_lofault only when the caller already has a handler.
	 * NOTE(review): this test uses %icc, i.e. only the low 32 bits of
	 * the value loaded with ldn, while .bcopy_more tests the same value
	 * with %ncc - confirm the difference is intentional.
	 */
949	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
950	tst	%o4
951	bz,pt	%icc, .sm_do_copy
952	  nop
953	sethi	%hi(.sm_copyerr), %o5
954	or	%o5, %lo(.sm_copyerr), %o5
955	membar	#Sync				! sync error barrier
956	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
957	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
	/*
	 * Common leaf copy, also entered from .kcopy_small with %o4 holding
	 * the old handler and no TRAMP_FLAG.
	 */
958.sm_do_copy:
959	cmp	%o2, SHORTCOPY		! check for really short case
960	bleu,pt	%ncc, .bc_sm_left	!
961	  cmp	%o2, CHKSIZE		! check for medium length cases
962	bgu,pn	%ncc, .bc_med		!
963	  or	%o0, %o1, %o3		! prepare alignment check
964	andcc	%o3, 0x3, %g0		! test for alignment
	/*
	 * The sub at .bc_sm_movebytes sits in the delay slot of this
	 * non-annulled branch, so it also runs when the branch is taken:
	 * .bc_sm_word is entered with the count already biased by -3 and
	 * its add-backs (3 and 7) account for that.
	 */
965	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
966.bc_sm_movebytes:
967	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
968.bc_sm_notalign4:
969	ldub	[%o0], %o3		! read byte
970	stb	%o3, [%o1]		! write byte
971	subcc	%o2, 4, %o2		! reduce count by 4
972	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
973	add	%o0, 4, %o0		! advance SRC by 4
974	stb	%o3, [%o1 + 1]
975	ldub	[%o0 - 2], %o3
976	add	%o1, 4, %o1		! advance DST by 4
977	stb	%o3, [%o1 - 2]
978	ldub	[%o0 - 1], %o3
979	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
980	  stb	%o3, [%o1 - 1]
981	add	%o2, 3, %o2		! restore count
	/* Copy the last 0-3 bytes one at a time. */
982.bc_sm_left:
983	tst	%o2
984	bz,pt	%ncc, .bc_sm_exit	! check for zero length
985	  deccc	%o2			! reduce count for cc test
986	ldub	[%o0], %o3		! move one byte
987	bz,pt	%ncc, .bc_sm_exit
988	  stb	%o3, [%o1]
989	ldub	[%o0 + 1], %o3		! move another byte
990	deccc	%o2			! check for more
991	bz,pt	%ncc, .bc_sm_exit
992	  stb	%o3, [%o1 + 1]
993	ldub	[%o0 + 2], %o3		! move final byte
994	stb	%o3, [%o1 + 2]
	/* Inline copy of the .bc_sm_exit sequence. */
995	membar	#Sync				! sync error barrier
996	andn	%o4, TRAMP_FLAG, %o4
997	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
998	retl
999	  mov	%g0, %o0		! return 0
1000	.align	16
1001	nop				! instruction alignment
1002					! see discussion at start of file
	/*
	 * Word loop: two words per pass.  .bc_sm_wordx is the entry used by
	 * .bc_sm_word, which has already loaded the first word into %o3.
	 */
1003.bc_sm_words:
1004	lduw	[%o0], %o3		! read word
1005.bc_sm_wordx:
1006	subcc	%o2, 8, %o2		! update count
1007	stw	%o3, [%o1]		! write word
1008	add	%o0, 8, %o0		! update SRC
1009	lduw	[%o0 - 4], %o3		! read word
1010	add	%o1, 8, %o1		! update DST
1011	bgt,pt	%ncc, .bc_sm_words	! loop til done
1012	  stw	%o3, [%o1 - 4]		! write word
1013	addcc	%o2, 7, %o2		! restore count
1014	bz,pt	%ncc, .bc_sm_exit
1015	  deccc	%o2
	/*
	 * The subcc at .bc_sm_half is the delay slot of this branch; it is
	 * executed even when going to .bc_sm_byte, where %o2 is not used.
	 */
1016	bz,pt	%ncc, .bc_sm_byte
1017.bc_sm_half:
1018	  subcc	%o2, 2, %o2		! reduce count by 2
1019	add	%o0, 2, %o0		! advance SRC by 2
1020	lduh	[%o0 - 2], %o3		! read half word
1021	add	%o1, 2, %o1		! advance DST by 2
1022	bgt,pt	%ncc, .bc_sm_half	! loop til done
1023	  sth	%o3, [%o1 - 2]		! write half word
1024	addcc	%o2, 1, %o2		! restore count
1025	bz,pt	%ncc, .bc_sm_exit
1026	  nop
1027.bc_sm_byte:
1028	ldub	[%o0], %o3
1029	stb	%o3, [%o1]
1030	membar	#Sync				! sync error barrier
1031	andn	%o4, TRAMP_FLAG, %o4
1032	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1033	retl
1034	  mov	%g0, %o0		! return 0
1035
	/*
	 * src and dst both word aligned, SHORTCOPY < count <= CHKSIZE.
	 * %o2 arrives biased by -3 (see the branch to this label).
	 */
1036.bc_sm_word:
1037	subcc	%o2, 4, %o2		! update count
1038	bgt,pt	%ncc, .bc_sm_wordx
1039	  lduw	[%o0], %o3		! read word
1040	addcc	%o2, 3, %o2		! restore count
1041	bz,pt	%ncc, .bc_sm_exit
1042	  stw	%o3, [%o1]		! write word
1043	deccc	%o2			! reduce count for cc test
1044	ldub	[%o0 + 4], %o3		! load one byte
1045	bz,pt	%ncc, .bc_sm_exit
1046	  stb	%o3, [%o1 + 4]		! store one byte
1047	ldub	[%o0 + 5], %o3		! load second byte
1048	deccc	%o2
1049	bz,pt	%ncc, .bc_sm_exit
1050	  stb	%o3, [%o1 + 5]		! store second byte
1051	ldub	[%o0 + 6], %o3		! load third byte
1052	stb	%o3, [%o1 + 6]		! store third byte
	/* Common leaf exit: drop TRAMP_FLAG, restore t_lofault, return 0. */
1053.bc_sm_exit:
1054	membar	#Sync				! sync error barrier
1055	andn	%o4, TRAMP_FLAG, %o4
1056	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1057	retl
1058	  mov	%g0, %o0		! return 0
1059
1060	.align 16
	/*
	 * Medium copies (count > CHKSIZE): pick the widest access size that
	 * the relative alignment of src and dst (src ^ dst) permits.
	 */
1061.bc_med:
1062	xor	%o0, %o1, %o3		! setup alignment check
1063	btst	1, %o3
1064	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
1065	  nop
1066	btst	3, %o3
1067	bnz,pt	%ncc, .bc_med_half	! halfword aligned
1068	  nop
1069	btst	7, %o3
1070	bnz,pt	%ncc, .bc_med_word	! word aligned
1071	  nop
1072.bc_med_long:
1073	btst	3, %o0			! check for
1074	bz,pt	%ncc, .bc_med_long1	! word alignment
1075	  nop
1076.bc_med_long0:
1077	ldub	[%o0], %o3		! load one byte
1078	inc	%o0
1079	stb	%o3,[%o1]		! store byte
1080	inc	%o1
1081	btst	3, %o0
1082	bnz,pt	%ncc, .bc_med_long0
1083	  dec	%o2
1084.bc_med_long1:			! word aligned
1085	btst	7, %o0			! check for long word
1086	bz,pt	%ncc, .bc_med_long2
1087	  nop
1088	lduw	[%o0], %o3		! load word
1089	add	%o0, 4, %o0		! advance SRC by 4
1090	stw	%o3, [%o1]		! store word
1091	add	%o1, 4, %o1		! advance DST by 4
1092	sub	%o2, 4, %o2		! reduce count by 4
1093!
1094!  Now long word aligned and have at least 32 bytes to move
1095!
1096.bc_med_long2:
1097	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1098.bc_med_lmove:
1099	ldx	[%o0], %o3		! read long word
1100	stx	%o3, [%o1]		! write long word
1101	subcc	%o2, 32, %o2		! reduce count by 32
1102	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
1103	add	%o0, 32, %o0		! advance SRC by 32
1104	stx	%o3, [%o1 + 8]
1105	ldx	[%o0 - 16], %o3
1106	add	%o1, 32, %o1		! advance DST by 32
1107	stx	%o3, [%o1 - 16]
1108	ldx	[%o0 - 8], %o3
1109	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
1110	  stx	%o3, [%o1 - 8]
1111	addcc	%o2, 24, %o2		! restore count to long word offset
1112	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
1113	  nop
1114.bc_med_lword:
1115	ldx	[%o0], %o3		! read long word
1116	subcc	%o2, 8, %o2		! reduce count by 8
1117	stx	%o3, [%o1]		! write long word
1118	add	%o0, 8, %o0		! advance SRC by 8
1119	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
1120	  add	%o1, 8, %o1		! advance DST by 8
1121.bc_med_lextra:
1122	addcc	%o2, 7, %o2		! restore rest of count
1123	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1124	  deccc	%o2
1125	bz,pt	%ncc, .bc_sm_byte
1126	  nop
1127	ba,pt	%ncc, .bc_sm_half
1128	  nop
1129
1130	.align 16
1131.bc_med_word:
1132	btst	3, %o0			! check for
1133	bz,pt	%ncc, .bc_med_word1	! word alignment
1134	  nop
1135.bc_med_word0:
1136	ldub	[%o0], %o3		! load one byte
1137	inc	%o0
1138	stb	%o3,[%o1]		! store byte
1139	inc	%o1
1140	btst	3, %o0
1141	bnz,pt	%ncc, .bc_med_word0
1142	  dec	%o2
1143!
1144!  Now word aligned and have at least 36 bytes to move
1145!
1146.bc_med_word1:
1147	sub	%o2, 15, %o2		! adjust count to allow cc zero test
1148.bc_med_wmove:
1149	lduw	[%o0], %o3		! read word
1150	stw	%o3, [%o1]		! write word
1151	subcc	%o2, 16, %o2		! reduce count by 16
1152	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
1153	add	%o0, 16, %o0		! advance SRC by 16
1154	stw	%o3, [%o1 + 4]
1155	lduw	[%o0 - 8], %o3
1156	add	%o1, 16, %o1		! advance DST by 16
1157	stw	%o3, [%o1 - 8]
1158	lduw	[%o0 - 4], %o3
1159	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
1160	  stw	%o3, [%o1 - 4]
1161	addcc	%o2, 12, %o2		! restore count to word offset
1162	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
1163	  nop
1164.bc_med_word2:
1165	lduw	[%o0], %o3		! read word
1166	subcc	%o2, 4, %o2		! reduce count by 4
1167	stw	%o3, [%o1]		! write word
1168	add	%o0, 4, %o0		! advance SRC by 4
1169	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
1170	  add	%o1, 4, %o1		! advance DST by 4
1171.bc_med_wextra:
1172	addcc	%o2, 3, %o2		! restore rest of count
1173	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1174	  deccc	%o2
1175	bz,pt	%ncc, .bc_sm_byte
1176	  nop
1177	ba,pt	%ncc, .bc_sm_half
1178	  nop
1179
1180	.align 16
1181.bc_med_half:
1182	btst	1, %o0			! check for
1183	bz,pt	%ncc, .bc_med_half1	! half word alignment
1184	  nop
1185	ldub	[%o0], %o3		! load one byte
1186	inc	%o0
1187	stb	%o3,[%o1]		! store byte
1188	inc	%o1
1189	dec	%o2
1190!
1191!  Now half word aligned and have at least 38 bytes to move
1192!
1193.bc_med_half1:
1194	sub	%o2, 7, %o2		! adjust count to allow cc zero test
1195.bc_med_hmove:
1196	lduh	[%o0], %o3		! read half word
1197	sth	%o3, [%o1]		! write half word
1198	subcc	%o2, 8, %o2		! reduce count by 8
1199	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
1200	add	%o0, 8, %o0		! advance SRC by 8
1201	sth	%o3, [%o1 + 2]
1202	lduh	[%o0 - 4], %o3
1203	add	%o1, 8, %o1		! advance DST by 8
1204	sth	%o3, [%o1 - 4]
1205	lduh	[%o0 - 2], %o3
1206	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
1207	  sth	%o3, [%o1 - 2]
1208	addcc	%o2, 7, %o2		! restore count
1209	bz,pt	%ncc, .bc_sm_exit
1210	  deccc	%o2
1211	bz,pt	%ncc, .bc_sm_byte
1212	  nop
1213	ba,pt	%ncc, .bc_sm_half
1214	  nop
1215
1216	SET_SIZE(bcopy)
1217
1218/*
1219 * The _more entry points are not intended to be used directly by
1220 * any caller from outside this file.  They are provided to allow
1221 * profiling and dtrace of the portions of the copy code that use
1222 * the floating point registers.
1223 * This entry is particularly important as DTRACE (at least as of
1224 * 4/2004) does not support leaf functions.
1225 */
1226
/*
 * Large-copy path using FP registers and 64-byte block stores.
 * In:  %o0 = src, %o1 = dst, %o2 = count (become %i0-%i2 after the save).
 * bcopy enters at .bcopy_more; .kcopy_more takes its own window, installs
 * .copyerr and enters at .do_copy.
 * %l6 carries the saved t_lofault value plus TRAMP_FLAG / FPUSED_FLAG bits.
 * On the normal exit %gsr, %fprs and t_lofault are restored, and the FP
 * registers used are either reloaded from the stack or zeroed;
 * 0 is returned in %o0.
 */
1227	ENTRY(bcopy_more)
1228.bcopy_more:
1229	prefetch [%o0], #n_reads
1230	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1231	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
1232	tst	%l6
1233	bz,pt	%ncc, .do_copy
1234	  nop
1235	sethi	%hi(.copyerr), %o2
1236	or	%o2, %lo(.copyerr), %o2
1237	membar	#Sync				! sync error barrier
1238	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
1239	!
1240	! We've already captured whether t_lofault was zero on entry.
1241	! We need to mark ourselves as being from bcopy since both
1242	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1243	! and the saved lofault was zero, we won't reset lofault on
1244	! returning.
1245	!
1246	or	%l6, TRAMP_FLAG, %l6
1247
1248/*
1249 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1250 * Also, use of FP registers has been tested to be enabled
1251 */
1252.do_copy:
1253	FP_NOMIGRATE(6, 7)
1254
	/*
	 * Save the caller's %fprs.  If FPRS_FEF was clear, the annulled
	 * branch enables FP in its delay slot and skips the register save;
	 * otherwise the FP registers are spilled to the stack first.
	 */
1255	rd	%fprs, %o2		! check for unused fp
1256	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1257	btst	FPRS_FEF, %o2
1258	bz,a,pt	%icc, .do_blockcopy
1259	  wr	%g0, FPRS_FEF, %fprs
1260
1261	BST_FPQ1Q3_TOSTACK(%o2)
1262
1263.do_blockcopy:
1264	rd	%gsr, %o2
1265	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
1266	or	%l6, FPUSED_FLAG, %l6
1267
	/* Register aliases; hwblkpagecopy further down uses them too. */
1268#define	REALSRC	%i0
1269#define	DST	%i1
1270#define	CNT	%i2
1271#define	SRC	%i3
1272#define	TMP	%i5
1273
	/* Byte-copy until DST is VIS_BLOCKSIZE aligned (skipped if it is). */
1274	andcc	DST, VIS_BLOCKSIZE - 1, TMP
1275	bz,pt	%ncc, 2f
1276	  neg	TMP
1277	add	TMP, VIS_BLOCKSIZE, TMP
1278
1279	! TMP = bytes required to align DST on FP_BLOCK boundary
1280	! Using SRC as a tmp here
1281	cmp	TMP, 3
1282	bleu,pt	%ncc, 1f
1283	  sub	CNT,TMP,CNT		! adjust main count
1284	sub	TMP, 3, TMP		! adjust for end of loop test
1285.bc_blkalign:
1286	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
1287	stb	SRC, [DST]
1288	subcc	TMP, 4, TMP
1289	ldub	[REALSRC + 1], SRC
1290	add	REALSRC, 4, REALSRC
1291	stb	SRC, [DST + 1]
1292	ldub	[REALSRC - 2], SRC
1293	add	DST, 4, DST
1294	stb	SRC, [DST - 2]
1295	ldub	[REALSRC - 1], SRC
1296	bgu,pt	%ncc, .bc_blkalign
1297	  stb	SRC, [DST - 1]
1298
1299	addcc	TMP, 3, TMP		! restore count adjustment
1300	bz,pt	%ncc, 2f		! no bytes left?
1301	  nop
13021:	ldub	[REALSRC], SRC
1303	inc	REALSRC
1304	inc	DST
1305	deccc	TMP
1306	bgu	%ncc, 1b
1307	  stb	SRC, [DST - 1]
1308
13092:
1310	membar	#StoreLoad
1311	andn	REALSRC, 0x7, SRC
1312
1313	! SRC - 8-byte aligned
1314	! DST - 64-byte aligned
	/*
	 * Prime the loop: load the first source block into %f0-%f14, set
	 * the alignment offset in %gsr from REALSRC, start building the
	 * first output block in %f32-%f42, and fetch the first doubleword
	 * of the next block.
	 */
1315	ldd	[SRC], %f0
1316	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1317	alignaddr REALSRC, %g0, %g0
1318	ldd	[SRC + 0x08], %f2
1319	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1320	faligndata %f0, %f2, %f32
1321	ldd	[SRC + 0x10], %f4
1322	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1323	faligndata %f2, %f4, %f34
1324	ldd	[SRC + 0x18], %f6
1325	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1326	faligndata %f4, %f6, %f36
1327	ldd	[SRC + 0x20], %f8
1328	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1329	faligndata %f6, %f8, %f38
1330	ldd	[SRC + 0x28], %f10
1331	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1332	faligndata %f8, %f10, %f40
1333	ldd	[SRC + 0x30], %f12
1334	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1335	faligndata %f10, %f12, %f42
1336	ldd	[SRC + 0x38], %f14
1337	ldd	[SRC + VIS_BLOCKSIZE], %f0
1338	sub	CNT, VIS_BLOCKSIZE, CNT
1339	add	SRC, VIS_BLOCKSIZE, SRC
1340	prefetch [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read
1341	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1342	ba,pt	%ncc, 1f
1343	  prefetch [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read
1344	.align	32
	/*
	 * Main loop: finish and block-store the pending output block in
	 * %f32-%f46 while loading the next source block into %f0-%f14 and
	 * aligning it into %f32-%f42.  Runs while CNT > VIS_BLOCKSIZE + 8.
	 */
13451:
1346	ldd	[SRC + 0x08], %f2
1347	faligndata %f12, %f14, %f44
1348	ldd	[SRC + 0x10], %f4
1349	faligndata %f14, %f0, %f46
1350	stda	%f32, [DST]ASI_BLK_P
1351	ldd	[SRC + 0x18], %f6
1352	faligndata %f0, %f2, %f32
1353	ldd	[SRC + 0x20], %f8
1354	faligndata %f2, %f4, %f34
1355	ldd	[SRC + 0x28], %f10
1356	faligndata %f4, %f6, %f36
1357	ldd	[SRC + 0x30], %f12
1358	faligndata %f6, %f8, %f38
1359	ldd	[SRC + 0x38], %f14
1360	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #n_reads
1361	faligndata %f8, %f10, %f40
1362	ldd	[SRC + VIS_BLOCKSIZE], %f0
1363	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1364	faligndata %f10, %f12, %f42
1365	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #n_reads
1366	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #one_read
1367	sub	CNT, VIS_BLOCKSIZE, CNT
1368	add	DST, VIS_BLOCKSIZE, DST
1369	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1370	cmp	CNT, VIS_BLOCKSIZE + 8
1371	bgu,pt	%ncc, 1b
1372	  add	SRC, VIS_BLOCKSIZE, SRC
1373
	/*
	 * Tail selection: the two-block sequence at 2: is used only when
	 * exactly VIS_BLOCKSIZE bytes remain and REALSRC is 8-byte aligned.
	 * Otherwise the pending block is stored at 3: and the remaining
	 * CNT bytes are copied one at a time.
	 */
1374	! only if REALSRC & 0x7 is 0
1375	cmp	CNT, VIS_BLOCKSIZE
1376	bne	%ncc, 3f
1377	  andcc	REALSRC, 0x7, %g0
1378	bz,pt	%ncc, 2f
1379	  nop
13803:
1381	faligndata %f12, %f14, %f44
1382	faligndata %f14, %f0, %f46
1383	stda	%f32, [DST]ASI_BLK_P
1384	add	DST, VIS_BLOCKSIZE, DST
1385	ba,pt	%ncc, 3f
1386	  nop
13872:
1388	ldd	[SRC + 0x08], %f2
1389	fsrc1	%f12, %f44
1390	ldd	[SRC + 0x10], %f4
1391	fsrc1	%f14, %f46
1392	stda	%f32, [DST]ASI_BLK_P
1393	ldd	[SRC + 0x18], %f6
1394	fsrc1	%f0, %f32
1395	ldd	[SRC + 0x20], %f8
1396	fsrc1	%f2, %f34
1397	ldd	[SRC + 0x28], %f10
1398	fsrc1	%f4, %f36
1399	ldd	[SRC + 0x30], %f12
1400	fsrc1	%f6, %f38
1401	ldd	[SRC + 0x38], %f14
1402	fsrc1	%f8, %f40
1403	sub	CNT, VIS_BLOCKSIZE, CNT
1404	add	DST, VIS_BLOCKSIZE, DST
1405	add	SRC, VIS_BLOCKSIZE, SRC
1406	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1407	fsrc1	%f10, %f42
1408	fsrc1	%f12, %f44
1409	fsrc1	%f14, %f46
1410	stda	%f32, [DST]ASI_BLK_P
1411	add	DST, VIS_BLOCKSIZE, DST
1412	ba,a,pt	%ncc, .bcb_exit
1413	  nop
1414
14153:	tst	CNT
1416	bz,a,pt	%ncc, .bcb_exit
1417	  nop
1418
	/* Copy the remaining CNT bytes one at a time. */
14195:	ldub	[REALSRC], TMP
1420	inc	REALSRC
1421	inc	DST
1422	deccc	CNT
1423	bgu	%ncc, 5b
1424	  stb	TMP, [DST - 1]
	/*
	 * Normal exit: restore %gsr, then either reload the saved FP
	 * registers (caller had FPRS_FEF set) or zero the ones used, and
	 * restore %fprs and t_lofault with the flag bits stripped.
	 */
1425.bcb_exit:
1426	membar	#Sync
1427
1428	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
1429	wr	%o2, 0, %gsr
1430
1431	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1432	btst	FPRS_FEF, %o3
1433	bz,pt	%icc, 4f
1434	  nop
1435
1436	BLD_FPQ1Q3_FROMSTACK(%o2)
1437
1438	ba,pt	%ncc, 2f
1439	  wr	%o3, 0, %fprs		! restore fprs
14404:
1441	FZEROQ1Q3
1442	wr	%o3, 0, %fprs		! restore fprs
14432:
1444	membar	#Sync				! sync error barrier
1445	andn	%l6, MASK_FLAGS, %l6
1446	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1447	FP_ALLOWMIGRATE(5, 6)
1448	ret
1449	  restore	%g0, 0, %o0
1450
1451	SET_SIZE(bcopy_more)
1452
1453#endif	/* lint */
1454
1455/*
1456 * Block copy with possibly overlapped operands.
1457 */
1458
1459#if defined(lint)
1460
/* Lint-only stub that gives ovbcopy a C prototype; the assembly version is under #else. */
1461/*ARGSUSED*/
1462void
1463ovbcopy(const void *from, void *to, size_t count)
1464{}
1465
1466#else	/* lint */
1467
/*
 * ovbcopy: %o0 = from, %o1 = to, %o2 = count.  Leaf routine.
 * count == 0 returns immediately.  If count <= |from - to| the regions
 * cannot overlap and control transfers to bcopy.  Otherwise bytes are
 * copied one at a time: forwards when from >= to, backwards (highest
 * address first) when from < to.
 */
1468	ENTRY(ovbcopy)
1469	tst	%o2			! check count
1470	bgu,a	%ncc, 1f		! count != 0: go compare addresses
1471	  subcc	%o0, %o1, %o3		! difference of from and to address
1472
1473	retl				! count == 0: nothing to do
1474	  nop
14751:
1476	bneg,a	%ncc, 2f
1477	  neg	%o3			! if < 0, make it positive
14782:	cmp	%o2, %o3		! cmp size and abs(from - to)
1479	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1480	  .empty				!   no overlap
1481	  cmp	%o0, %o1		! compare from and to addresses
1482	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1483	  nop
1484	!
1485	! Copy forwards.
1486	!
1487.ov_fwd:
1488	ldub	[%o0], %o3		! read from address
1489	inc	%o0			! inc from address
1490	stb	%o3, [%o1]		! write to address
1491	deccc	%o2			! dec count
1492	bgu	%ncc, .ov_fwd		! loop till done
1493	  inc	%o1			! inc to address
1494
1495	retl				! return
1496	  nop
1497	!
1498	! Copy backwards.
1499	!
	/* %o2 is used as the index, so from/to themselves are not modified. */
1500.ov_bkwd:
1501	deccc	%o2			! dec count
1502	ldub	[%o0 + %o2], %o3	! get byte at end of src
1503	bgu	%ncc, .ov_bkwd		! loop till done
1504	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1505
1506	retl				! return
1507	  nop
1508
1509	SET_SIZE(ovbcopy)
1510
1511#endif	/* lint */
1512
1513
1514/*
1515 * hwblkpagecopy()
1516 *
1517 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1518 * has already disabled kernel preemption and has checked
1519 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1520 */
1521#ifdef lint
/* Lint-only stub that gives hwblkpagecopy a C prototype; the assembly version is under #else. */
1522/*ARGSUSED*/
1523void
1524hwblkpagecopy(const void *src, void *dst)
1525{ }
1526#else /* lint */
/*
 * hwblkpagecopy: %o0 = src, %o1 = dst.
 * Copies PAGESIZE bytes with 8-byte FP loads and VIS_BLOCKSIZE block
 * stores.  No t_lofault handler is installed here and %gsr is not
 * touched (no alignaddr/faligndata is used).  The block stores to DST
 * presumably require a VIS_BLOCKSIZE-aligned dst - confirm with callers.
 * Uses the REALSRC/DST/CNT/SRC register aliases #defined in bcopy_more.
 */
1527	ENTRY(hwblkpagecopy)
1528	! get another window w/space for three aligned blocks of saved fpregs
1529	prefetch [%o0], #n_reads
1530	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1531
1532	! %i0 - source address (arg)
1533	! %i1 - destination address (arg)
1534	! %i2 - length of region (not arg)
1535	! %l0 - saved fprs
1536	! %l1 - pointer to saved fpregs
	/*
	 * NOTE(review): the save macro below is given %l1 and the restore
	 * macro at the end is given %l3; presumably the argument is only a
	 * scratch register - confirm against the macro definitions.
	 */
1537
1538	rd	%fprs, %l0		! check for unused fp
1539	btst	FPRS_FEF, %l0
1540	bz,a,pt	%icc, 1f
1541	  wr	%g0, FPRS_FEF, %fprs
1542
1543	BST_FPQ1Q3_TOSTACK(%l1)
1544
15451:	set	PAGESIZE, CNT
1546	mov	REALSRC, SRC
1547
	/*
	 * Prime the loop: first block into %f0-%f14, start the first output
	 * block in %f32-%f42, fetch the first doubleword of the next block.
	 */
1548	ldd	[SRC], %f0
1549	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1550	ldd	[SRC + 0x08], %f2
1551	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1552	fmovd	%f0, %f32
1553	ldd	[SRC + 0x10], %f4
1554	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
1555	fmovd	%f2, %f34
1556	ldd	[SRC + 0x18], %f6
1557	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1558	fmovd	%f4, %f36
1559	ldd	[SRC + 0x20], %f8
1560	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
1561	fmovd	%f6, %f38
1562	ldd	[SRC + 0x28], %f10
1563	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
1564	fmovd	%f8, %f40
1565	ldd	[SRC + 0x30], %f12
1566	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
1567	fmovd	%f10, %f42
1568	ldd	[SRC + 0x38], %f14
1569	ldd	[SRC + VIS_BLOCKSIZE], %f0
1570	sub	CNT, VIS_BLOCKSIZE, CNT
1571	add	SRC, VIS_BLOCKSIZE, SRC
1572	prefetch [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read
1573	ba,pt	%ncc, 2f
1574	  prefetch [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read
1575	.align	32
	/*
	 * Main loop: store the pending block from %f32-%f46 while loading
	 * the next one into %f0-%f14.  Runs while CNT > VIS_BLOCKSIZE + 8.
	 */
15762:
1577	ldd	[SRC + 0x08], %f2
1578	fmovd	%f12, %f44
1579	ldd	[SRC + 0x10], %f4
1580	fmovd	%f14, %f46
1581	stda	%f32, [DST]ASI_BLK_P
1582	ldd	[SRC + 0x18], %f6
1583	fmovd	%f0, %f32
1584	ldd	[SRC + 0x20], %f8
1585	fmovd	%f2, %f34
1586	ldd	[SRC + 0x28], %f10
1587	fmovd	%f4, %f36
1588	ldd	[SRC + 0x30], %f12
1589	fmovd	%f6, %f38
1590	ldd	[SRC + 0x38], %f14
1591	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #n_reads
1592	fmovd	%f8, %f40
1593	ldd	[SRC + VIS_BLOCKSIZE], %f0
1594	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1595	fmovd	%f10, %f42
1596	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #n_reads
1597	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #one_read
1598	sub	CNT, VIS_BLOCKSIZE, CNT
1599	add	DST, VIS_BLOCKSIZE, DST
1600	cmp	CNT, VIS_BLOCKSIZE + 8
1601	bgu,pt	%ncc, 2b
1602	  add	SRC, VIS_BLOCKSIZE, SRC
1603
	/*
	 * Store the pending block, then load and store the final block.
	 * Unlike the loop, no doubleword beyond this block is read.
	 */
1604	! trailing block
1605	ldd	[SRC + 0x08], %f2
1606	fsrc1	%f12, %f44
1607	ldd	[SRC + 0x10], %f4
1608	fsrc1	%f14, %f46
1609	stda	%f32, [DST]ASI_BLK_P
1610	ldd	[SRC + 0x18], %f6
1611	fsrc1	%f0, %f32
1612	ldd	[SRC + 0x20], %f8
1613	fsrc1	%f2, %f34
1614	ldd	[SRC + 0x28], %f10
1615	fsrc1	%f4, %f36
1616	ldd	[SRC + 0x30], %f12
1617	fsrc1	%f6, %f38
1618	ldd	[SRC + 0x38], %f14
1619	fsrc1	%f8, %f40
1620	sub	CNT, VIS_BLOCKSIZE, CNT
1621	add	DST, VIS_BLOCKSIZE, DST
1622	add	SRC, VIS_BLOCKSIZE, SRC
1623	fsrc1	%f10, %f42
1624	fsrc1	%f12, %f44
1625	fsrc1	%f14, %f46
1626	stda	%f32, [DST]ASI_BLK_P
1627
1628	membar	#Sync
1629
	/*
	 * Caller had FP enabled: reload the saved registers.  Otherwise
	 * zero the registers that were used.  Then restore %fprs.
	 */
1630	btst	FPRS_FEF, %l0
1631	bz,pt	%icc, 2f
1632	  nop
1633
1634	BLD_FPQ1Q3_FROMSTACK(%l3)
1635	ba	3f
1636	  nop
1637
16382:	FZEROQ1Q3
1639
16403:	wr	%l0, 0, %fprs		! restore fprs
1641	ret
1642	  restore	%g0, 0, %o0
1643
1644	SET_SIZE(hwblkpagecopy)
1645#endif	/* lint */
1646
1647
1648/*
1649 * Transfer data to and from user space -
1650 * Note that these routines can cause faults
1651 * It is assumed that the kernel has nothing at
1652 * less than KERNELBASE in the virtual address space.
1653 *
1654 * Note that copyin(9F) and copyout(9F) are part of the
1655 * DDI/DKI which specifies that they return '-1' on "errors."
1656 *
1657 * Sigh.
1658 *
1659 * So there's two extremely similar routines - xcopyin() and xcopyout()
1660 * which return the errno that we've faithfully computed.  This
1661 * allows other callers (e.g. uiomove(9F)) to work correctly.
1662 * Given that these are used pretty heavily, we expand the calling
1663 * sequences inline for all flavours (rather than making wrappers).
1664 *
1665 * There are also stub routines for xcopyout_little and xcopyin_little,
1666 * which currently are intended to handle requests of <= 16 bytes from
1667 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1668 * is left as an exercise...
1669 */
1670
1671/*
1672 * Copy data between user and kernel space (copyOP/xcopyOP/copyOP_noerr)
1673 *
1674 * General theory of operation:
1675 *
1676 * The only difference between copy{in,out} and
1677 * xcopy{in,out} is in the error handling routine they invoke
1678 * when a memory access error occurs. xcopyOP returns the errno
1679 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1680 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1681 * if they are called with a fault handler already in place. That flag
1682 * causes the default handlers to trampoline to the previous handler
1683 * upon an error.
1684 *
1685 * None of the copyops routines grab a window until it's decided that
1686 * we need to do a HW block copy operation. This saves a window
1687 * spill/fill when we're called during socket ops. The typical IO
1688 * path won't cause spill/fill traps.
1689 *
1690 * This code uses a set of 4 limits for the maximum size that will
1691 * be copied given a particular input/output address alignment.
1692 * If the value for a particular limit is zero, the copy will be performed
1693 * by the plain copy loops rather than FPBLK.
1694 *
1695 * See the description of bcopy above for more details of the
1696 * data copying algorithm and the default limits.
1697 *
1698 */
1699
1700/*
1701 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1702 */
1703
1704#if defined(lint)
1705
1706
1707#else	/* lint */
1708/*
1709 * We save the arguments in the following registers in case of a fault:
1710 *	kaddr - %l1
1711 *	uaddr - %l2
1712 *	count - %l3
1713 */
1714#define SAVE_SRC	%l1
1715#define SAVE_DST	%l2
1716#define SAVE_COUNT	%l3
1717
1718#define SM_SAVE_SRC		%g4
1719#define SM_SAVE_DST		%g5
1720#define SM_SAVE_COUNT		%o5
1721#define ERRNO		%l5
1722
1723
1724#define REAL_LOFAULT	%l4
1725/*
1726 * Generic copyio fault handler.  This is the first line of defense when a
1727 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1728 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1729 * This allows us to share common code for all the flavors of the copy
1730 * operations, including the _noerr versions.
1731 *
1732 * Note that this function will restore the original input parameters before
1733 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1734 * member of the t_copyop structure, if needed.
1735 */
/*
 * Entered as the t_lofault handler with the errno in %g1.
 * Expects: %l6 = saved t_lofault (plus FPUSED_FLAG if FP state was saved),
 * REAL_LOFAULT = handler to continue in, SAVE_SRC/SAVE_DST/SAVE_COUNT =
 * original arguments.  Leaves the errno in ERRNO and the original
 * arguments in %i0-%i2 before jumping to REAL_LOFAULT.
 */
1736	ENTRY(copyio_fault)
1737	membar	#Sync
1738	mov	%g1,ERRNO			! save errno in ERRNO
1739	btst	FPUSED_FLAG, %l6
1740	bz	%ncc, 1f			! FP state never saved: skip restore
1741	  nop
1742
1743	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1744	wr	%o2, 0, %gsr    	! restore gsr
1745
	/* Reload the FP registers if the caller had FP enabled, else zero them. */
1746	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1747	btst	FPRS_FEF, %o3
1748	bz,pt	%icc, 4f
1749	  nop
1750
1751	BLD_FPQ2Q4_FROMSTACK(%o2)
1752
1753	ba,pt	%ncc, 1f
1754	  wr	%o3, 0, %fprs   	! restore fprs
1755
17564:
1757	FZEROQ2Q4
1758	wr	%o3, 0, %fprs   	! restore fprs
1759
17601:
	/* Only FPUSED_FLAG is cleared before %l6 is written back. */
1761	andn	%l6, FPUSED_FLAG, %l6
1762	membar	#Sync
1763	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1764	FP_ALLOWMIGRATE(5, 6)
1765
1766	mov	SAVE_SRC, %i0
1767	mov	SAVE_DST, %i1
1768	jmp	REAL_LOFAULT
1769	  mov	SAVE_COUNT, %i2		! delay slot: third original argument
1770
1771	SET_SIZE(copyio_fault)
1772
1773
1774#endif
1775
1776#if defined(lint)
1777
/* Lint-only stub that gives copyout a C prototype; the assembly version is under #else. */
1778/*ARGSUSED*/
1779int
1780copyout(const void *kaddr, void *uaddr, size_t count)
1781{ return (0); }
1782
1783#else	/* lint */
1784
1785	ENTRY(copyout)
1786
1787	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
1788	bleu,pt	%ncc, .copyout_small		! go to larger cases
1789	  xor	%o0, %o1, %o3			! are src, dst alignable?
1790	btst	7, %o3				!
1791	bz,pt	%ncc, .copyout_8		! check for longword alignment
1792	  nop
1793	btst	1, %o3				!
1794	bz,pt	%ncc, .copyout_2		! check for half-word
1795	  nop
1796	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
1797	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1798	tst	%o3
1799	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1800	  cmp	%o2, %o3			! if length <= limit
1801	bleu,pt	%ncc, .copyout_small		! go to small copy
1802	  nop
1803	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1804	  nop
1805.copyout_2:
1806	btst	3, %o3				!
1807	bz,pt	%ncc, .copyout_4		! check for word alignment
1808	  nop
1809	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
1810	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1811	tst	%o3
1812	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1813	  cmp	%o2, %o3			! if length <= limit
1814	bleu,pt	%ncc, .copyout_small		! go to small copy
1815	  nop
1816	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1817	  nop
1818.copyout_4:
1819	! already checked longword, must be word aligned
1820	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
1821	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1822	tst	%o3
1823	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1824	  cmp	%o2, %o3			! if length <= limit
1825	bleu,pt	%ncc, .copyout_small		! go to small copy
1826	  nop
1827	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1828	  nop
1829.copyout_8:
1830	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
1831	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1832	tst	%o3
1833	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1834	  cmp	%o2, %o3			! if length <= limit
1835	bleu,pt	%ncc, .copyout_small		! go to small copy
1836	  nop
1837	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1838	  nop
1839
1840	.align	16
1841	nop				! instruction alignment
1842					! see discussion at start of file
1843.copyout_small:
1844	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
1845	or	%o5, %lo(.sm_copyout_err), %o5
1846	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
1847	membar	#Sync				! sync error barrier
1848	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
1849.sm_do_copyout:
1850	mov	%o0, SM_SAVE_SRC
1851	mov	%o1, SM_SAVE_DST
1852	cmp	%o2, SHORTCOPY		! check for really short case
1853	bleu,pt	%ncc, .co_sm_left	!
1854	  mov	%o2, SM_SAVE_COUNT
1855	cmp	%o2, CHKSIZE		! check for medium length cases
1856	bgu,pn	%ncc, .co_med		!
1857	  or	%o0, %o1, %o3		! prepare alignment check
1858	andcc	%o3, 0x3, %g0		! test for alignment
1859	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
1860.co_sm_movebytes:
1861	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
1862.co_sm_notalign4:
1863	ldub	[%o0], %o3		! read byte
1864	subcc	%o2, 4, %o2		! reduce count by 4
1865	stba	%o3, [%o1]ASI_USER	! write byte
1866	inc	%o1			! advance DST by 1
1867	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
1868	add	%o0, 4, %o0		! advance SRC by 4
1869	stba	%o3, [%o1]ASI_USER
1870	inc	%o1			! advance DST by 1
1871	ldub	[%o0 - 2], %o3
1872	stba	%o3, [%o1]ASI_USER
1873	inc	%o1			! advance DST by 1
1874	ldub	[%o0 - 1], %o3
1875	stba	%o3, [%o1]ASI_USER
1876	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
1877	  inc	%o1			! advance DST by 1
1878	add	%o2, 3, %o2		! restore count
1879.co_sm_left:
1880	tst	%o2
1881	bz,pt	%ncc, .co_sm_exit	! check for zero length
1882	  nop
1883	ldub	[%o0], %o3		! load one byte
1884	deccc	%o2			! reduce count for cc test
1885	bz,pt	%ncc, .co_sm_exit
1886	  stba	%o3,[%o1]ASI_USER	! store one byte
1887	ldub	[%o0 + 1], %o3		! load second byte
1888	deccc	%o2
1889	inc	%o1
1890	bz,pt	%ncc, .co_sm_exit
1891	  stba	%o3,[%o1]ASI_USER	! store second byte
1892	ldub	[%o0 + 2], %o3		! load third byte
1893	inc	%o1
1894	stba	%o3,[%o1]ASI_USER	! store third byte
1895	membar	#Sync				! sync error barrier
1896	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1897	retl
1898	  mov	%g0, %o0		! return 0
1899	.align	16
1900.co_sm_words:
1901	lduw	[%o0], %o3		! read word
1902.co_sm_wordx:
1903	subcc	%o2, 8, %o2		! update count
1904	stwa	%o3, [%o1]ASI_USER	! write word
1905	add	%o0, 8, %o0		! update SRC
1906	lduw	[%o0 - 4], %o3		! read word
1907	add	%o1, 4, %o1		! update DST
1908	stwa	%o3, [%o1]ASI_USER	! write word
1909	bgt,pt	%ncc, .co_sm_words	! loop til done
1910	  add	%o1, 4, %o1		! update DST
1911	addcc	%o2, 7, %o2		! restore count
1912	bz,pt	%ncc, .co_sm_exit
1913	  nop
1914	deccc	%o2
1915	bz,pt	%ncc, .co_sm_byte
1916.co_sm_half:
1917	  subcc	%o2, 2, %o2		! reduce count by 2
1918	lduh	[%o0], %o3		! read half word
1919	add	%o0, 2, %o0		! advance SRC by 2
1920	stha	%o3, [%o1]ASI_USER	! write half word
1921	bgt,pt	%ncc, .co_sm_half	! loop til done
1922	  add	%o1, 2, %o1		! advance DST by 2
1923	addcc	%o2, 1, %o2		! restore count
1924	bz,pt	%ncc, .co_sm_exit
1925	  nop
1926.co_sm_byte:
1927	ldub	[%o0], %o3
1928	stba	%o3, [%o1]ASI_USER
1929	membar	#Sync				! sync error barrier
1930	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1931	retl
1932	  mov	%g0, %o0		! return 0
1933	.align 16
1934.co_sm_word:
1935	subcc	%o2, 4, %o2		! update count
1936	bgt,pt	%ncc, .co_sm_wordx
1937	  lduw	[%o0], %o3		! read word
1938	addcc	%o2, 3, %o2		! restore count
1939	bz,pt	%ncc, .co_sm_exit
1940	  stwa	%o3, [%o1]ASI_USER	! write word
1941	deccc	%o2			! reduce count for cc test
1942	ldub	[%o0 + 4], %o3		! load one byte
1943	add	%o1, 4, %o1
1944	bz,pt	%ncc, .co_sm_exit
1945	  stba	%o3, [%o1]ASI_USER	! store one byte
1946	ldub	[%o0 + 5], %o3		! load second byte
1947	deccc	%o2
1948	inc	%o1
1949	bz,pt	%ncc, .co_sm_exit
1950	  stba	%o3, [%o1]ASI_USER	! store second byte
1951	ldub	[%o0 + 6], %o3		! load third byte
1952	inc	%o1
1953	stba	%o3, [%o1]ASI_USER	! store third byte
1954.co_sm_exit:
1955	  membar	#Sync				! sync error barrier
1956	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1957	retl
1958	  mov	%g0, %o0		! return 0
1959
1960	.align 16
1961.co_med:
1962	xor	%o0, %o1, %o3		! setup alignment check
1963	btst	1, %o3
1964	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
1965	  nop
1966	btst	3, %o3
1967	bnz,pt	%ncc, .co_med_half	! halfword aligned
1968	  nop
1969	btst	7, %o3
1970	bnz,pt	%ncc, .co_med_word	! word aligned
1971	  nop
1972.co_med_long:
1973	btst	3, %o0			! check for
1974	bz,pt	%ncc, .co_med_long1	! word alignment
1975	  nop
1976.co_med_long0:
1977	ldub	[%o0], %o3		! load one byte
1978	inc	%o0
1979	stba	%o3,[%o1]ASI_USER	! store byte
1980	inc	%o1
1981	btst	3, %o0
1982	bnz,pt	%ncc, .co_med_long0
1983	  dec	%o2
1984.co_med_long1:			! word aligned
1985	btst	7, %o0			! check for long word
1986	bz,pt	%ncc, .co_med_long2
1987	  nop
1988	lduw	[%o0], %o3		! load word
1989	add	%o0, 4, %o0		! advance SRC by 4
1990	stwa	%o3, [%o1]ASI_USER	! store word
1991	add	%o1, 4, %o1		! advance DST by 4
1992	sub	%o2, 4, %o2		! reduce count by 4
1993!
1994!  Now long word aligned and have at least 32 bytes to move
1995!
1996.co_med_long2:
1997	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1998	sub	%o1, 8, %o1		! adjust pointer to allow store in
1999					! branch delay slot instead of add
2000.co_med_lmove:
2001	add	%o1, 8, %o1		! advance DST by 8
2002	ldx	[%o0], %o3		! read long word
2003	subcc	%o2, 32, %o2		! reduce count by 32
2004	stxa	%o3, [%o1]ASI_USER	! write long word
2005	add	%o1, 8, %o1		! advance DST by 8
2006	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
2007	add	%o0, 32, %o0		! advance SRC by 32
2008	stxa	%o3, [%o1]ASI_USER
2009	ldx	[%o0 - 16], %o3
2010	add	%o1, 8, %o1		! advance DST by 8
2011	stxa	%o3, [%o1]ASI_USER
2012	ldx	[%o0 - 8], %o3
2013	add	%o1, 8, %o1		! advance DST by 8
2014	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
2015	  stxa	%o3, [%o1]ASI_USER
2016	add	%o1, 8, %o1		! advance DST by 8
2017	addcc	%o2, 24, %o2		! restore count to long word offset
2018	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
2019	  nop
2020.co_med_lword:
2021	ldx	[%o0], %o3		! read long word
2022	subcc	%o2, 8, %o2		! reduce count by 8
2023	stxa	%o3, [%o1]ASI_USER	! write long word
2024	add	%o0, 8, %o0		! advance SRC by 8
2025	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
2026	  add	%o1, 8, %o1		! advance DST by 8
2027.co_med_lextra:
2028	addcc	%o2, 7, %o2		! restore rest of count
2029	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2030	  deccc	%o2
2031	bz,pt	%ncc, .co_sm_byte
2032	  nop
2033	ba,pt	%ncc, .co_sm_half
2034	  nop
2035
2036	.align 16
2037	nop				! instruction alignment
2038					! see discussion at start of file
2039.co_med_word:
2040	btst	3, %o0			! check for
2041	bz,pt	%ncc, .co_med_word1	! word alignment
2042	  nop
2043.co_med_word0:
2044	ldub	[%o0], %o3		! load one byte
2045	inc	%o0
2046	stba	%o3,[%o1]ASI_USER	! store byte
2047	inc	%o1
2048	btst	3, %o0
2049	bnz,pt	%ncc, .co_med_word0
2050	  dec	%o2
2051!
2052!  Now word aligned and have at least 36 bytes to move
2053!
2054.co_med_word1:
2055	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2056.co_med_wmove:
2057	lduw	[%o0], %o3		! read word
2058	subcc	%o2, 16, %o2		! reduce count by 16
2059	stwa	%o3, [%o1]ASI_USER	! write word
2060	add	%o1, 4, %o1		! advance DST by 4
2061	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
2062	add	%o0, 16, %o0		! advance SRC by 16
2063	stwa	%o3, [%o1]ASI_USER
2064	add	%o1, 4, %o1		! advance DST by 4
2065	lduw	[%o0 - 8], %o3
2066	stwa	%o3, [%o1]ASI_USER
2067	add	%o1, 4, %o1		! advance DST by 4
2068	lduw	[%o0 - 4], %o3
2069	stwa	%o3, [%o1]ASI_USER
2070	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
2071	  add	%o1, 4, %o1		! advance DST by 4
2072	addcc	%o2, 12, %o2		! restore count to word offset
2073	ble,pt	%ncc, .co_med_wextra	! check for more words to move
2074	  nop
2075.co_med_word2:
2076	lduw	[%o0], %o3		! read word
2077	subcc	%o2, 4, %o2		! reduce count by 4
2078	stwa	%o3, [%o1]ASI_USER	! write word
2079	add	%o0, 4, %o0		! advance SRC by 4
2080	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
2081	  add	%o1, 4, %o1		! advance DST by 4
2082.co_med_wextra:
2083	addcc	%o2, 3, %o2		! restore rest of count
2084	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2085	  deccc	%o2
2086	bz,pt	%ncc, .co_sm_byte
2087	  nop
2088	ba,pt	%ncc, .co_sm_half
2089	  nop
2090
2091	.align 16
2092	nop				! instruction alignment
2093	nop				! see discussion at start of file
2094	nop
2095.co_med_half:
2096	btst	1, %o0			! check for
2097	bz,pt	%ncc, .co_med_half1	! half word alignment
2098	  nop
2099	ldub	[%o0], %o3		! load one byte
2100	inc	%o0
2101	stba	%o3,[%o1]ASI_USER	! store byte
2102	inc	%o1
2103	dec	%o2
2104!
2105!  Now half word aligned and have at least 38 bytes to move
2106!
2107.co_med_half1:
2108	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2109.co_med_hmove:
2110	lduh	[%o0], %o3		! read half word
2111	subcc	%o2, 8, %o2		! reduce count by 8
2112	stha	%o3, [%o1]ASI_USER	! write half word
2113	add	%o1, 2, %o1		! advance DST by 2
2114	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
2115	add	%o0, 8, %o0		! advance SRC by 8
2116	stha	%o3, [%o1]ASI_USER
2117	add	%o1, 2, %o1		! advance DST by 2
2118	lduh	[%o0 - 4], %o3
2119	stha	%o3, [%o1]ASI_USER
2120	add	%o1, 2, %o1		! advance DST by 2
2121	lduh	[%o0 - 2], %o3
2122	stha	%o3, [%o1]ASI_USER
2123	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
2124	  add	%o1, 2, %o1		! advance DST by 2
2125	addcc	%o2, 7, %o2		! restore count
2126	bz,pt	%ncc, .co_sm_exit
2127	  deccc	%o2
2128	bz,pt	%ncc, .co_sm_byte
2129	  nop
2130	ba,pt	%ncc, .co_sm_half
2131	  nop
2132
2133/*
2134 * We got here because of a fault during short copyout.
2135 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2136 */
2137.sm_copyout_err:
2138	membar	#Sync
2139	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2140	mov	SM_SAVE_SRC, %o0
2141	mov	SM_SAVE_DST, %o1
2142	mov	SM_SAVE_COUNT, %o2
2143	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2144	tst	%o3
2145	bz,pt	%ncc, 3f			! if not, return error
2146	  nop
2147	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2148	jmp	%o5				! original arguments
2149	  nop
21503:
2151	retl
2152	  or	%g0, -1, %o0		! return error value
2153
2154	SET_SIZE(copyout)
2155
2156/*
2157 * The _more entry points are not intended to be used directly by
2158 * any caller from outside this file.  They are provided to allow
2159 * profiling and dtrace of the portions of the copy code that uses
2160 * the floating point registers.
2161 * This entry is particularly important as DTRACE (at least as of
2162 * 4/2004) does not support leaf functions.
2163 */
2164
/*
 * copyout_more: large-copy path of copyout.  Data is read from the
 * source with ordinary loads, realigned in the floating point registers
 * with faligndata, and written to the user destination with 64-byte
 * block stores (stda ... ASI_BLK_AIUS).
 *
 * Outline of the code below:
 *   1. save a register window; REAL_LOFAULT = .copyout_err
 *   2. .do_copyout (xcopyout also branches here): install copyio_fault
 *      as t_lofault, keeping the previous handler in %l6
 *   3. save %fprs and %gsr on the stack; if the FPU was already enabled
 *      also run BST_FPQ2Q4_TOSTACK, otherwise just enable the FPU
 *   4. copy single bytes until DST is VIS_BLOCKSIZE aligned
 *   5. block loop: one VIS_BLOCKSIZE block per iteration
 *   6. copy the tail bytes, restore FP state and t_lofault, return 0
 *
 * NOTE(review): copyio_fault is defined outside this section; presumably
 * it unwinds the FP state and then jumps to the handler recorded in
 * REAL_LOFAULT (.copyout_err or .xcopyout_err) - confirm there.
 */
2165	ENTRY(copyout_more)
2166.copyout_more:
2167	prefetch [%o0], #n_reads		! start fetching at the address in %o0
2168	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2169	set	.copyout_err, REAL_LOFAULT	! final error exit for this entry point
2170
2171/*
2172 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2173 */
2174.do_copyout:
2175        set     copyio_fault, %l7		! copyio_fault is lofault val
2176
2177	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2178	membar	#Sync				! sync error barrier
2179	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2180
	! keep the incoming arguments; the copy loops below modify the
	! working copies
2181	mov	%i0, SAVE_SRC
2182	mov	%i1, SAVE_DST
2183	mov	%i2, SAVE_COUNT
2184
2185	FP_NOMIGRATE(6, 7)
2186
2187	rd	%fprs, %o2		! check for unused fp
2188	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2189	btst	FPRS_FEF, %o2
	! annulled branch: the wr in the delay slot runs only when the
	! branch is taken, i.e. only when FPRS_FEF was clear
2190	bz,a,pt	%icc, .do_blockcopyout
2191	  wr	%g0, FPRS_FEF, %fprs
2192
	! FPU was already enabled: preserve the FP registers we are about
	! to use (going by the macro name - defined outside this section)
2193	BST_FPQ2Q4_TOSTACK(%o2)
2194
2195.do_blockcopyout:
2196	rd	%gsr, %o2
2197	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2198	or	%l6, FPUSED_FLAG, %l6	! flag kept in the saved-handler word
2199
2200	andcc	DST, VIS_BLOCKSIZE - 1, TMP	! offset of DST within a block
2201	mov	ASI_USER, %asi
2202	bz,pt	%ncc, 2f		! already block aligned
2203	  neg	TMP			! delay slot: runs on both paths
2204	add	TMP, VIS_BLOCKSIZE, TMP
2205
2206	! TMP = bytes required to align DST on FP_BLOCK boundary
2207	! Using SRC as a tmp here
2208	cmp	TMP, 3
2209	bleu,pt	%ncc, 1f		! 3 or fewer: byte loop only
2210	  sub	CNT,TMP,CNT		! adjust main count
2211	sub	TMP, 3, TMP		! adjust for end of loop test
2212.co_blkalign:
2213	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
2214	stba	SRC, [DST]%asi
2215	subcc	TMP, 4, TMP		! sets cc for the bgu below
2216	ldub	[REALSRC + 1], SRC
2217	add	REALSRC, 4, REALSRC
2218	stba	SRC, [DST + 1]%asi
2219	ldub	[REALSRC - 2], SRC
2220	add	DST, 4, DST
2221	stba	SRC, [DST - 2]%asi
2222	ldub	[REALSRC - 1], SRC
2223	bgu,pt	%ncc, .co_blkalign
2224	  stba	SRC, [DST - 1]%asi
2225
2226	addcc	TMP, 3, TMP		! restore count adjustment
2227	bz,pt	%ncc, 2f		! no bytes left?
2228	  nop
	! one byte per iteration for the remaining alignment bytes
22291:	ldub	[REALSRC], SRC
2230	inc	REALSRC
2231	inc	DST
2232	deccc	TMP
2233	bgu	%ncc, 1b
2234	  stba	SRC, [DST - 1]%asi
2235
22362:
2237	membar	#StoreLoad
2238	andn	REALSRC, 0x7, SRC	! SRC = REALSRC rounded down to 8 bytes
2239
2240	! SRC - 8-byte aligned
2241	! DST - 64-byte aligned
	! Prime the pipeline: load the first source block into %f16-%f30
	! and build the first seven output doublewords in %f48-%f58.
2242	ldd	[SRC], %f16
2243	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2244	alignaddr REALSRC, %g0, %g0	! byte offset used by faligndata
2245	ldd	[SRC + 0x08], %f18
2246	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2247	faligndata %f16, %f18, %f48
2248	ldd	[SRC + 0x10], %f20
2249	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read
2250	faligndata %f18, %f20, %f50
2251	ldd	[SRC + 0x18], %f22
2252	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2253	faligndata %f20, %f22, %f52
2254	ldd	[SRC + 0x20], %f24
2255	prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read
2256	faligndata %f22, %f24, %f54
2257	ldd	[SRC + 0x28], %f26
2258	prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read
2259	faligndata %f24, %f26, %f56
2260	ldd	[SRC + 0x30], %f28
2261	prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read
2262	faligndata %f26, %f28, %f58
2263	ldd	[SRC + 0x38], %f30
2264	ldd	[SRC + VIS_BLOCKSIZE], %f16	! first doubleword of next block
2265	sub	CNT, VIS_BLOCKSIZE, CNT
2266	add	SRC, VIS_BLOCKSIZE, SRC
2267	prefetch [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read
2268	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2269	ba,pt	%ncc, 1f
2270	  prefetch [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read
2271	.align	32
	! Main loop: finish the pending output block (%f60, %f62), store it
	! with one block store, and start building the next block while
	! loading the following source doublewords.
22721:
2273	ldd	[SRC + 0x08], %f18
2274	faligndata %f28, %f30, %f60
2275	ldd	[SRC + 0x10], %f20
2276	faligndata %f30, %f16, %f62
2277	stda	%f48, [DST]ASI_BLK_AIUS	! block store of %f48-%f62
2278	ldd	[SRC + 0x18], %f22
2279	faligndata %f16, %f18, %f48
2280	ldd	[SRC + 0x20], %f24
2281	faligndata %f18, %f20, %f50
2282	ldd	[SRC + 0x28], %f26
2283	faligndata %f20, %f22, %f52
2284	ldd	[SRC + 0x30], %f28
2285	faligndata %f22, %f24, %f54
2286	ldd	[SRC + 0x38], %f30
2287	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #n_reads
2288	faligndata %f24, %f26, %f56
2289	ldd	[SRC + VIS_BLOCKSIZE], %f16
2290	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2291	faligndata %f26, %f28, %f58
2292	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #n_reads
2293	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #one_read
2294	sub	CNT, VIS_BLOCKSIZE, CNT
2295	add	DST, VIS_BLOCKSIZE, DST
2296	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2297	cmp	CNT, VIS_BLOCKSIZE + 8
2298	bgu,pt	%ncc, 1b		! loop while CNT > VIS_BLOCKSIZE + 8
2299	  add	SRC, VIS_BLOCKSIZE, SRC
2300
	! If exactly one block remains and the source is 8-byte aligned,
	! label 2 below moves it with plain register copies (fsrc1);
	! otherwise label 3 stores the pending block and the rest is
	! copied a byte at a time.
2301	! only if REALSRC & 0x7 is 0
2302	cmp	CNT, VIS_BLOCKSIZE
2303	bne	%ncc, 3f
2304	  andcc	REALSRC, 0x7, %g0	! delay slot: sets cc for the bz below
2305	bz,pt	%ncc, 2f
2306	  nop
23073:
2308	faligndata %f28, %f30, %f60
2309	faligndata %f30, %f16, %f62
2310	stda	%f48, [DST]ASI_BLK_AIUS
2311	add	DST, VIS_BLOCKSIZE, DST
2312	ba,pt	%ncc, 3f		! on to the byte tail
2313	  nop
23142:
2315	ldd	[SRC + 0x08], %f18
2316	fsrc1	%f28, %f60
2317	ldd	[SRC + 0x10], %f20
2318	fsrc1	%f30, %f62
2319	stda	%f48, [DST]ASI_BLK_AIUS	! pending block
2320	ldd	[SRC + 0x18], %f22
2321	fsrc1	%f16, %f48
2322	ldd	[SRC + 0x20], %f24
2323	fsrc1	%f18, %f50
2324	ldd	[SRC + 0x28], %f26
2325	fsrc1	%f20, %f52
2326	ldd	[SRC + 0x30], %f28
2327	fsrc1	%f22, %f54
2328	ldd	[SRC + 0x38], %f30
2329	fsrc1	%f24, %f56
2330	sub	CNT, VIS_BLOCKSIZE, CNT
2331	add	DST, VIS_BLOCKSIZE, DST
2332	add	SRC, VIS_BLOCKSIZE, SRC
2333	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2334	fsrc1	%f26, %f58
2335	fsrc1	%f28, %f60
2336	fsrc1	%f30, %f62
2337	stda	%f48, [DST]ASI_BLK_AIUS	! final block
2338	add	DST, VIS_BLOCKSIZE, DST
2339	ba,a,pt	%ncc, 4f
2340	  nop
2341
	! byte tail: copy whatever CNT still holds, one byte at a time
23423:	tst	CNT
2343	bz,a	%ncc, 4f
2344	  nop
2345
23465:	ldub	[REALSRC], TMP
2347	inc	REALSRC
2348	inc	DST
2349	deccc	CNT
2350	bgu	%ncc, 5b
2351	  stba	TMP, [DST - 1]%asi
23524:
2353
	! Success exit: put back %gsr, the FP registers and %fprs, then the
	! previous t_lofault, and return 0.
2354.copyout_exit:
2355	membar	#Sync
2356
2357	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2358	wr	%o2, 0, %gsr		! restore gsr
2359
2360	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2361	btst	FPRS_FEF, %o3
2362	bz,pt	%icc, 4f		! FPU was not enabled on entry
2363	  nop
2364
	! FPU was in use on entry: reload what BST_FPQ2Q4_TOSTACK saved
2365	BLD_FPQ2Q4_FROMSTACK(%o2)
2366
2367	ba,pt	%ncc, 1f
2368	  wr	%o3, 0, %fprs		! restore fprs
2369
23704:
	! FPU was not in use on entry: clear the registers we used
	! (going by the macro name) instead of restoring them
2371	FZEROQ2Q4
2372	wr	%o3, 0, %fprs		! restore fprs
2373
23741:
2375	membar	#Sync
2376	andn	%l6, FPUSED_FLAG, %l6	! strip the flag set at .do_blockcopyout
2377	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2378	FP_ALLOWMIGRATE(5, 6)
2379	ret
2380	  restore	%g0, 0, %o0	! return 0
2381
2382/*
2383 * We got here because of a fault during copyout.
2384 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 * If the thread has a copyops vector (t_copyops is non-NULL), jump to
 * its CP_COPYOUT entry after discarding our register window; otherwise
 * return -1.
2385 */
2386.copyout_err:
2387	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2388	tst	%o4
2389	bz,pt	%ncc, 2f			! if not, return error
2390	  nop
2391	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
2392	jmp	%g2				! original arguments
2393	  restore %g0, 0, %g0			! dispose of copy window
23942:
2395        ret
2396	  restore %g0, -1, %o0			! return error value
2397
2398
2399	SET_SIZE(copyout_more)
2400
2401#endif	/* lint */
2402
2403
2404#ifdef	lint
2405
/*
 * Prototype stub seen only when lint is defined; the assembly under the
 * #else branch below is the real xcopyout.
 */
2406/*ARGSUSED*/
2407int
2408xcopyout(const void *kaddr, void *uaddr, size_t count)
2409{ return (0); }
2410
2411#else	/* lint */
2412
/*
 * xcopyout: copy %o2 bytes from the kernel address in %o0 to the user
 * address in %o1.
 *
 * This entry point only chooses a strategy and installs its own fault
 * handlers; the copying itself is done by code shared with copyout
 * (.sm_do_copyout for the leaf path, .do_copyout for the FP block path).
 *
 * Strategy selection:
 *   - count <= VIS_COPY_THRESHOLD: leaf (small) path
 *   - otherwise pick hw_copy_limit_{1,2,4,8} from the low bits of
 *     (src ^ dst), i.e. the largest unit both addresses can be aligned
 *     to; a limit of zero, or count <= limit, selects the small path,
 *     anything larger goes to .xcopyout_more
 *
 * On a fault the handlers below hand off to the thread's copyops
 * CP_XCOPYOUT entry if t_copyops is non-NULL, otherwise they return the
 * errno value (not -1).  Returns 0 on success via the shared exit code.
 */
2413	ENTRY(xcopyout)
2414	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2415	bleu,pt	%ncc, .xcopyout_small		! small enough: use leaf routine
2416	  xor	%o0, %o1, %o3			! are src, dst alignable?
2417	btst	7, %o3				!
2418	bz,pt	%ncc, .xcopyout_8		!
2419	  nop
2420	btst	1, %o3				!
2421	bz,pt	%ncc, .xcopyout_2		! check for half-word
2422	  nop
	! byte alignable only
2423	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2424	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2425	tst	%o3
2426	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2427	  cmp	%o2, %o3			! if length <= limit
2428	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2429	  nop
2430	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2431	  nop
2432.xcopyout_2:
2433	btst	3, %o3				!
2434	bz,pt	%ncc, .xcopyout_4		! check for word alignment
2435	  nop
	! half-word alignable
2436	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2437	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2438	tst	%o3
2439	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2440	  cmp	%o2, %o3			! if length <= limit
2441	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2442	  nop
2443	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2444	  nop
2445.xcopyout_4:
2446	! already checked longword, must be word aligned
2447	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2448	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2449	tst	%o3
2450	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2451	  cmp	%o2, %o3			! if length <= limit
2452	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2453	  nop
2454	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2455	  nop
2456.xcopyout_8:
	! long word alignable
2457	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2458	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2459	tst	%o3
2460	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2461	  cmp	%o2, %o3			! if length <= limit
2462	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2463	  nop
2464	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2465	  nop
2466
	! Leaf path: same register set-up as .copyout_small (old handler in
	! %o4, new handler stored to t_lofault) but with the xcopyout error
	! handler, then join the shared small-copy code.
2467.xcopyout_small:
2468	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
2469	or	%o5, %lo(.sm_xcopyout_err), %o5
2470	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
2471	membar	#Sync				! sync error barrier
2472	ba,pt	%ncc, .sm_do_copyout		! common code
2473	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
2474
	! FP block path: new register window, record the xcopyout error
	! exit in REAL_LOFAULT, then join the shared large-copy code.
2475.xcopyout_more:
2476	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2477	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2478	ba,pt	%ncc, .do_copyout		! common code
2479	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2480
2481/*
2482 * We got here because of fault during xcopyout
2483 * Errno value is in ERRNO
2484 */
2485.xcopyout_err:
2486	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2487	tst	%o4
2488	bz,pt	%ncc, 2f			! if not, return error
2489	  nop
2490	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
2491	jmp	%g2				! original arguments
2492	  restore %g0, 0, %g0			! dispose of copy window
24932:
2494        ret
2495	  restore ERRNO, 0, %o0			! return errno value
2496
	! Fault on the leaf path.  No register window was saved here, so the
	! original arguments are recovered from the SM_SAVE_* registers that
	! .sm_do_copyout filled in, and %o4 still holds the old t_lofault.
	! NOTE(review): %g1 is taken to hold the errno value on arrival -
	! it is set outside this section; confirm against the trap path.
2497.sm_xcopyout_err:
2498
2499	membar	#Sync
2500	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2501	mov	SM_SAVE_SRC, %o0
2502	mov	SM_SAVE_DST, %o1
2503	mov	SM_SAVE_COUNT, %o2
2504	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2505	tst	%o3
2506	bz,pt	%ncc, 3f			! if not, return error
2507	  nop
2508	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
2509	jmp	%o5				! original arguments
2510	  nop
25113:
2512	retl
2513	  or	%g1, 0, %o0		! return errno value
2514
2515	SET_SIZE(xcopyout)
2516
2517#endif	/* lint */
2518
2519#ifdef	lint
2520
/*
 * Prototype stub seen only when lint is defined; the assembly under the
 * #else branch below is the real xcopyout_little.
 */
2521/*ARGSUSED*/
2522int
2523xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2524{ return (0); }
2525
2526#else	/* lint */
2527
/*
 * xcopyout_little: copy %o2 bytes, one byte at a time, from the kernel
 * address in %o0 to the user address in %o1, storing through ASI_AIUSL.
 *
 * The source is read from its last byte down to its first while the
 * destination is written from its first byte upward, so destination
 * byte i receives source byte (count - 1 - i).
 *
 * Register use inside the loop:
 *   %o3  index running from -count up to 0
 *   %o1  dst + count, so [%o1 + %o3] walks dst upward
 *   %o0  starts at src + 2*count - 1 and drops by 2 per byte, so
 *        [%o0 + %o3] walks src downward
 *   %o4  byte in flight
 *   %o5  previous t_lofault, restored on exit
 *
 * Faults vector to .xcopyio_err, which is defined outside this section.
 * Returns 0 on success; a zero count copies nothing.
 */
2528	ENTRY(xcopyout_little)
2529	sethi	%hi(.xcopyio_err), %o5
2530	or	%o5, %lo(.xcopyio_err), %o5
2531	ldn	[THREAD_REG + T_LOFAULT], %o4	! previous handler
2532	membar	#Sync				! sync error barrier
2533	stn	%o5, [THREAD_REG + T_LOFAULT]	! install .xcopyio_err
2534	mov	%o4, %o5			! keep previous handler in %o5
2535
2536	subcc	%g0, %o2, %o3		! %o3 = -count, cc set for the bz
2537	add	%o0, %o2, %o0
2538	bz,pn	%ncc, 2f		! check for zero bytes
2539	  sub	%o2, 1, %o4		! delay slot: runs on both paths
2540	add	%o0, %o4, %o0		! start w/last byte
2541	add	%o1, %o2, %o1
2542	ldub	[%o0 + %o3], %o4	! last source byte
2543
25441:	stba	%o4, [%o1 + %o3]ASI_AIUSL
2545	inccc	%o3			! carry is set when the index reaches 0
2546	sub	%o0, 2, %o0		! get next byte
2547	bcc,a,pt %ncc, 1b
2548	  ldub	[%o0 + %o3], %o4	! annulled on loop exit: no extra load
2549
25502:
2551	membar	#Sync				! sync error barrier
2552	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2553	retl
2554	  mov	%g0, %o0		! return (0)
2555
2556	SET_SIZE(xcopyout_little)
2557
2558#endif	/* lint */
2559
2560/*
2561 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2562 */
2563
2564#if defined(lint)
2565
/*
 * Prototype stub seen only when lint is defined; the assembly under the
 * #else branch below is the real copyin.
 */
2566/*ARGSUSED*/
2567int
2568copyin(const void *uaddr, void *kaddr, size_t count)
2569{ return (0); }
2570
2571#else	/* lint */
2572
/*
 * copyin: copy %o2 bytes from the user address in %o0 to the kernel
 * address in %o1.  User memory is read with the alternate-space loads
 * (lduba/lduha/lduwa/ldxa ... ASI_USER); kernel memory is written with
 * ordinary stores.
 *
 * Strategy selection (same shape as the copyout entry points):
 *   - count <= VIS_COPY_THRESHOLD: leaf (small) path
 *   - otherwise pick hw_copy_limit_{1,2,4,8} from the low bits of
 *     (src ^ dst); a limit of zero, or count <= limit, selects the
 *     small path, anything larger goes to .copyin_more
 *
 * Leaf path register use:
 *   %o0/%o1/%o2  running src / dst / count (count is kept biased inside
 *                the unrolled loops so one subcc drives the loop branch)
 *   %o3          data in flight and scratch
 *   %o4          previous t_lofault, restored on every exit
 *   SM_SAVE_*    original arguments, for the fault handler
 *
 * Returns 0 on success.  On a fault .sm_copyin_err hands off to the
 * thread's copyops CP_COPYIN entry if t_copyops is non-NULL, otherwise
 * it returns -1.
 */
2573	ENTRY(copyin)
2574	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2575	bleu,pt	%ncc, .copyin_small		! small enough: use leaf routine
2576	  xor	%o0, %o1, %o3			! are src, dst alignable?
2577	btst	7, %o3				!
2578	bz,pt	%ncc, .copyin_8			! check for longword alignment
2579	  nop
2580	btst	1, %o3				!
2581	bz,pt	%ncc, .copyin_2			! check for half-word
2582	  nop
	! byte alignable only
2583	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2584	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2585	tst	%o3
2586	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2587	  cmp	%o2, %o3			! if length <= limit
2588	bleu,pt	%ncc, .copyin_small		! go to small copy
2589	  nop
2590	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2591	  nop
2592.copyin_2:
2593	btst	3, %o3				!
2594	bz,pt	%ncc, .copyin_4			! check for word alignment
2595	  nop
	! half-word alignable
2596	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2597	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2598	tst	%o3
2599	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2600	  cmp	%o2, %o3			! if length <= limit
2601	bleu,pt	%ncc, .copyin_small		! go to small copy
2602	  nop
2603	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2604	  nop
2605.copyin_4:
2606	! already checked longword, must be word aligned
2607	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2608	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2609	tst	%o3
2610	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2611	  cmp	%o2, %o3			! if length <= limit
2612	bleu,pt	%ncc, .copyin_small		! go to small copy
2613	  nop
2614	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2615	  nop
2616.copyin_8:
	! long word alignable
2617	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2618	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2619	tst	%o3
2620	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2621	  cmp	%o2, %o3			! if length <= limit
2622	bleu,pt	%ncc, .copyin_small		! go to small copy
2623	  nop
2624	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2625	  nop
2626
2627	.align	16
2628	nop				! instruction alignment
2629					! see discussion at start of file
2630.copyin_small:
2631	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
2632	or	%o5, %lo(.sm_copyin_err), %o5
2633	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
2634	membar	#Sync				! sync error barrier
2635	stn	%o5, [THREAD_REG + T_LOFAULT]
	! Entered with t_lofault already installed and the old value in %o4.
2636.sm_do_copyin:
2637	mov	%o0, SM_SAVE_SRC
2638	mov	%o1, SM_SAVE_DST
2639	cmp	%o2, SHORTCOPY		! check for really short case
2640	bleu,pt	%ncc, .ci_sm_left	!
2641	  mov	%o2, SM_SAVE_COUNT	! delay slot: runs on both paths
2642	cmp	%o2, CHKSIZE		! check for medium length cases
2643	bgu,pn	%ncc, .ci_med		!
2644	  or	%o0, %o1, %o3		! prepare alignment check
2645	andcc	%o3, 0x3, %g0		! test for alignment
2646	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
	! The sub below sits in the delay slot of the bz above, so it runs
	! whether or not the branch is taken; .ci_sm_word expects the count
	! to be biased by -3 already.  It is also the first instruction
	! executed when .ci_med branches to .ci_sm_movebytes.
2647.ci_sm_movebytes:
2648	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
2649.ci_sm_notalign4:
2650	lduba	[%o0]ASI_USER, %o3	! read byte
2651	subcc	%o2, 4, %o2		! reduce count by 4
2652	stb	%o3, [%o1]		! write byte
2653	add	%o0, 1, %o0		! advance SRC by 1
2654	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
2655	add	%o0, 1, %o0		! advance SRC by 1
2656	stb	%o3, [%o1 + 1]
2657	add	%o1, 4, %o1		! advance DST by 4
2658	lduba	[%o0]ASI_USER, %o3
2659	add	%o0, 1, %o0		! advance SRC by 1
2660	stb	%o3, [%o1 - 2]
2661	lduba	[%o0]ASI_USER, %o3
2662	add	%o0, 1, %o0		! advance SRC by 1
2663	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
2664	  stb	%o3, [%o1 - 1]
2665	add	%o2, 3, %o2		! restore count
	! 0 to 3 bytes left: copy them one at a time
2666.ci_sm_left:
2667	tst	%o2
2668	bz,pt	%ncc, .ci_sm_exit
2669	  nop
2670	lduba	[%o0]ASI_USER, %o3		! load one byte
2671	deccc	%o2			! reduce count for cc test
2672	bz,pt	%ncc, .ci_sm_exit
2673	  stb	%o3,[%o1]		! store one byte
2674	inc	%o0
2675	lduba	[%o0]ASI_USER, %o3	! load second byte
2676	deccc	%o2
2677	bz,pt	%ncc, .ci_sm_exit
2678	  stb	%o3,[%o1 + 1]		! store second byte
2679	inc	%o0
2680	lduba	[%o0]ASI_USER, %o3	! load third byte
2681	stb	%o3,[%o1 + 2]		! store third byte
2682	membar	#Sync				! sync error barrier
2683	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2684	retl
2685	  mov	%g0, %o0		! return 0
2686	.align	16
	! Word-aligned small copy, two words per iteration.  Entry is at
	! .ci_sm_wordx with the first word already loaded in %o3.
2687.ci_sm_words:
2688	lduwa	[%o0]ASI_USER, %o3		! read word
2689.ci_sm_wordx:
2690	subcc	%o2, 8, %o2		! update count
2691	stw	%o3, [%o1]		! write word
2692	add	%o0, 4, %o0		! update SRC
2693	add	%o1, 8, %o1		! update DST
2694	lduwa	[%o0]ASI_USER, %o3	! read word
2695	add	%o0, 4, %o0		! update SRC
2696	bgt,pt	%ncc, .ci_sm_words	! loop til done
2697	  stw	%o3, [%o1 - 4]		! write word
2698	addcc	%o2, 7, %o2		! restore count
2699	bz,pt	%ncc, .ci_sm_exit
2700	  nop
2701	deccc	%o2
2702	bz,pt	%ncc, .ci_sm_byte
	! The subcc below is the delay slot of the bz above and also the
	! loop head; the medium paths branch here with the count already
	! reduced by one.
2703.ci_sm_half:
2704	  subcc	%o2, 2, %o2		! reduce count by 2
2705	lduha	[%o0]ASI_USER, %o3	! read half word
2706	add	%o0, 2, %o0		! advance SRC by 2
2707	add	%o1, 2, %o1		! advance DST by 2
2708	bgt,pt	%ncc, .ci_sm_half	! loop til done
2709	  sth	%o3, [%o1 - 2]		! write half word
2710	addcc	%o2, 1, %o2		! restore count
2711	bz,pt	%ncc, .ci_sm_exit
2712	  nop
	! exactly one byte left
2713.ci_sm_byte:
2714	lduba	[%o0]ASI_USER, %o3
2715	stb	%o3, [%o1]
2716	membar	#Sync				! sync error barrier
2717	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2718	retl
2719	  mov	%g0, %o0		! return 0
2720	.align	16
	! Word-aligned entry for SHORTCOPY < count <= CHKSIZE.  The count
	! arrives biased by -3 (see the delay slot at .ci_sm_movebytes).
2721.ci_sm_word:
2722	subcc	%o2, 4, %o2		! update count
2723	bgt,pt	%ncc, .ci_sm_wordx
2724	  lduwa	[%o0]ASI_USER, %o3		! read word
2725	addcc	%o2, 3, %o2		! restore count
2726	bz,pt	%ncc, .ci_sm_exit
2727	  stw	%o3, [%o1]		! write word
2728	deccc	%o2			! reduce count for cc test
2729	add	%o0, 4, %o0
2730	lduba	[%o0]ASI_USER, %o3	! load one byte
2731	bz,pt	%ncc, .ci_sm_exit
2732	  stb	%o3, [%o1 + 4]		! store one byte
2733	inc	%o0
2734	lduba	[%o0]ASI_USER, %o3	! load second byte
2735	deccc	%o2
2736	bz,pt	%ncc, .ci_sm_exit
2737	  stb	%o3, [%o1 + 5]		! store second byte
2738	inc	%o0
2739	lduba	[%o0]ASI_USER, %o3	! load third byte
2740	stb	%o3, [%o1 + 6]		! store third byte
	! common success exit for the leaf path
2741.ci_sm_exit:
2742	membar	#Sync				! sync error barrier
2743	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2744	retl
2745	  mov	%g0, %o0		! return 0
2746
2747	.align 16
	! Medium copies (count > CHKSIZE).  The low bits of (src ^ dst) give
	! the widest unit both pointers can be aligned to at the same time:
	! bit 0 differs -> bytes, bits 0-1 differ -> half words,
	! bits 0-2 differ -> words, otherwise long words.
2748.ci_med:
2749	xor	%o0, %o1, %o3		! setup alignment check
2750	btst	1, %o3
2751	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
2752	  nop
2753	btst	3, %o3
2754	bnz,pt	%ncc, .ci_med_half	! halfword aligned
2755	  nop
2756	btst	7, %o3
2757	bnz,pt	%ncc, .ci_med_word	! word aligned
2758	  nop
2759.ci_med_long:
2760	btst	3, %o0			! check for
2761	bz,pt	%ncc, .ci_med_long1	! word alignment
2762	  nop
	! byte copy until SRC is word aligned
2763.ci_med_long0:
2764	lduba	[%o0]ASI_USER, %o3		! load one byte
2765	inc	%o0
2766	stb	%o3,[%o1]		! store byte
2767	inc	%o1
2768	btst	3, %o0
2769	bnz,pt	%ncc, .ci_med_long0
2770	  dec	%o2
2771.ci_med_long1:			! word aligned
2772	btst	7, %o0			! check for long word
2773	bz,pt	%ncc, .ci_med_long2
2774	  nop
	! one word copy brings SRC to long word alignment
2775	lduwa	[%o0]ASI_USER, %o3	! load word
2776	add	%o0, 4, %o0		! advance SRC by 4
2777	stw	%o3, [%o1]		! store word
2778	add	%o1, 4, %o1		! advance DST by 4
2779	sub	%o2, 4, %o2		! reduce count by 4
2780!
2781!  Now long word aligned and have at least 32 bytes to move
2782!
2783.ci_med_long2:
2784	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2785.ci_med_lmove:
2786	ldxa	[%o0]ASI_USER, %o3	! read long word
2787	subcc	%o2, 32, %o2		! reduce count by 32
2788	stx	%o3, [%o1]		! write long word
2789	add	%o0, 8, %o0		! advance SRC by 8
2790	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
2791	add	%o0, 8, %o0		! advance SRC by 8
2792	stx	%o3, [%o1 + 8]
2793	add	%o1, 32, %o1		! advance DST by 32
2794	ldxa	[%o0]ASI_USER, %o3
2795	add	%o0, 8, %o0		! advance SRC by 8
2796	stx	%o3, [%o1 - 16]
2797	ldxa	[%o0]ASI_USER, %o3
2798	add	%o0, 8, %o0		! advance SRC by 8
2799	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
2800	  stx	%o3, [%o1 - 8]
2801	addcc	%o2, 24, %o2		! restore count to long word offset
2802	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
2803	  nop
2804.ci_med_lword:
2805	ldxa	[%o0]ASI_USER, %o3	! read long word
2806	subcc	%o2, 8, %o2		! reduce count by 8
2807	stx	%o3, [%o1]		! write long word
2808	add	%o0, 8, %o0		! advance SRC by 8
2809	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
2810	  add	%o1, 8, %o1		! advance DST by 8
2811.ci_med_lextra:
2812	addcc	%o2, 7, %o2		! restore rest of count
2813	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2814	  deccc	%o2			! delay slot: sets cc for next bz
2815	bz,pt	%ncc, .ci_sm_byte
2816	  nop
2817	ba,pt	%ncc, .ci_sm_half
2818	  nop
2819
2820	.align 16
2821	nop				! instruction alignment
2822					! see discussion at start of file
2823.ci_med_word:
2824	btst	3, %o0			! check for
2825	bz,pt	%ncc, .ci_med_word1	! word alignment
2826	  nop
	! byte copy until SRC is word aligned
2827.ci_med_word0:
2828	lduba	[%o0]ASI_USER, %o3	! load one byte
2829	inc	%o0
2830	stb	%o3,[%o1]		! store byte
2831	inc	%o1
2832	btst	3, %o0
2833	bnz,pt	%ncc, .ci_med_word0
2834	  dec	%o2
2835!
2836!  Now word aligned and have at least 36 bytes to move
2837!
2838.ci_med_word1:
2839	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2840.ci_med_wmove:
2841	lduwa	[%o0]ASI_USER, %o3	! read word
2842	subcc	%o2, 16, %o2		! reduce count by 16
2843	stw	%o3, [%o1]		! write word
2844	add	%o0, 4, %o0		! advance SRC by 4
2845	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
2846	add	%o0, 4, %o0		! advance SRC by 4
2847	stw	%o3, [%o1 + 4]
2848	add	%o1, 16, %o1		! advance DST by 16
2849	lduwa	[%o0]ASI_USER, %o3
2850	add	%o0, 4, %o0		! advance SRC by 4
2851	stw	%o3, [%o1 - 8]
2852	lduwa	[%o0]ASI_USER, %o3
2853	add	%o0, 4, %o0		! advance SRC by 4
2854	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
2855	  stw	%o3, [%o1 - 4]
2856	addcc	%o2, 12, %o2		! restore count to word offset
2857	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
2858	  nop
2859.ci_med_word2:
2860	lduwa	[%o0]ASI_USER, %o3	! read word
2861	subcc	%o2, 4, %o2		! reduce count by 4
2862	stw	%o3, [%o1]		! write word
2863	add	%o0, 4, %o0		! advance SRC by 4
2864	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
2865	  add	%o1, 4, %o1		! advance DST by 4
2866.ci_med_wextra:
2867	addcc	%o2, 3, %o2		! restore rest of count
2868	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2869	  deccc	%o2			! delay slot: sets cc for next bz
2870	bz,pt	%ncc, .ci_sm_byte
2871	  nop
2872	ba,pt	%ncc, .ci_sm_half
2873	  nop
2874
2875	.align 16
2876	nop				! instruction alignment
2877					! see discussion at start of file
2878.ci_med_half:
2879	btst	1, %o0			! check for
2880	bz,pt	%ncc, .ci_med_half1	! half word alignment
2881	  nop
	! a single byte copy brings SRC to half word alignment
2882	lduba	[%o0]ASI_USER, %o3	! load one byte
2883	inc	%o0
2884	stb	%o3,[%o1]		! store byte
2885	inc	%o1
2886	dec	%o2
2887!
2888!  Now half word aligned and have at least 38 bytes to move
2889!
2890.ci_med_half1:
2891	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2892.ci_med_hmove:
2893	lduha	[%o0]ASI_USER, %o3	! read half word
2894	subcc	%o2, 8, %o2		! reduce count by 8
2895	sth	%o3, [%o1]		! write half word
2896	add	%o0, 2, %o0		! advance SRC by 2
2897	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
2898	add	%o0, 2, %o0		! advance SRC by 2
2899	sth	%o3, [%o1 + 2]
2900	add	%o1, 8, %o1		! advance DST by 8
2901	lduha	[%o0]ASI_USER, %o3
2902	add	%o0, 2, %o0		! advance SRC by 2
2903	sth	%o3, [%o1 - 4]
2904	lduha	[%o0]ASI_USER, %o3
2905	add	%o0, 2, %o0		! advance SRC by 2
2906	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
2907	  sth	%o3, [%o1 - 2]
2908	addcc	%o2, 7, %o2		! restore count
2909	bz,pt	%ncc, .ci_sm_exit
2910	  deccc	%o2			! delay slot: sets cc for next bz
2911	bz,pt	%ncc, .ci_sm_byte
2912	  nop
2913	ba,pt	%ncc, .ci_sm_half
2914	  nop
2915
	! Fault on the leaf path.  %o4 still holds the old t_lofault and the
	! SM_SAVE_* registers hold the original arguments.  With a copyops
	! vector installed, jump to its CP_COPYIN entry (jmp leaves %o7
	! untouched); otherwise return -1.
2916.sm_copyin_err:
2917	membar	#Sync
2918	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2919	mov	SM_SAVE_SRC, %o0
2920	mov	SM_SAVE_DST, %o1
2921	mov	SM_SAVE_COUNT, %o2
2922	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2923	tst	%o3
2924	bz,pt	%ncc, 3f			! if not, return error
2925	  nop
2926	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
2927	jmp	%o5				! original arguments
2928	  nop
29293:
2930	retl
2931	  or	%g0, -1, %o0		! return error value (-1)
2932
2933	SET_SIZE(copyin)
2934
2935
2936/*
2937 * The _more entry points are not intended to be used directly by
2938 * any caller from outside this file.  They are provided to allow
2939 * profiling and dtrace of the portions of the copy code that uses
2940 * the floating point registers.
2941 * This entry is particularly important as DTRACE (at least as of
2942 * 4/2004) does not support leaf functions.
2943 */
2944
2945	ENTRY(copyin_more)
2946.copyin_more:
2947	prefetch [%o0], #n_reads
2948	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2949	set	.copyin_err, REAL_LOFAULT
2950
2951/*
2952 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2953 */
2954.do_copyin:
2955	set	copyio_fault, %l7		! .copyio_fault is lofault val
2956
2957	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2958	membar	#Sync				! sync error barrier
2959	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2960
2961	mov	%i0, SAVE_SRC
2962	mov	%i1, SAVE_DST
2963	mov	%i2, SAVE_COUNT
2964
2965	FP_NOMIGRATE(6, 7)
2966
2967	rd	%fprs, %o2		! check for unused fp
2968	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2969	btst	FPRS_FEF, %o2
2970	bz,a,pt	%icc, .do_blockcopyin
2971	  wr	%g0, FPRS_FEF, %fprs
2972
2973	BST_FPQ2Q4_TOSTACK(%o2)
2974
2975.do_blockcopyin:
2976	rd	%gsr, %o2
2977	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2978	or	%l6, FPUSED_FLAG, %l6
2979
2980	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2981	mov	ASI_USER, %asi
2982	bz,pt	%ncc, 2f
2983	  neg	TMP
2984	add	TMP, VIS_BLOCKSIZE, TMP
2985
2986	! TMP = bytes required to align DST on FP_BLOCK boundary
2987	! Using SRC as a tmp here
2988	cmp	TMP, 3
2989	bleu,pt	%ncc, 1f
2990	  sub	CNT,TMP,CNT		! adjust main count
2991	sub	TMP, 3, TMP		! adjust for end of loop test
2992.ci_blkalign:
2993	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
2994	stb	SRC, [DST]
2995	subcc	TMP, 4, TMP
2996	lduba	[REALSRC + 1]%asi, SRC
2997	add	REALSRC, 4, REALSRC
2998	stb	SRC, [DST + 1]
2999	lduba	[REALSRC - 2]%asi, SRC
3000	add	DST, 4, DST
3001	stb	SRC, [DST - 2]
3002	lduba	[REALSRC - 1]%asi, SRC
3003	bgu,pt	%ncc, .ci_blkalign
3004	  stb	SRC, [DST - 1]
3005
3006	addcc	TMP, 3, TMP		! restore count adjustment
3007	bz,pt	%ncc, 2f		! no bytes left?
3008	  nop
30091:	lduba	[REALSRC]%asi, SRC
3010	inc	REALSRC
3011	inc	DST
3012	deccc	TMP
3013	bgu	%ncc, 1b
3014	  stb	SRC, [DST - 1]
3015
30162:
3017	membar	#StoreLoad
3018	andn	REALSRC, 0x7, SRC
3019
3020	! SRC - 8-byte aligned
3021	! DST - 64-byte aligned
3022	ldda	[SRC]%asi, %f16
3023	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
3024	alignaddr REALSRC, %g0, %g0
3025	ldda	[SRC + 0x08]%asi, %f18
3026	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
3027	faligndata %f16, %f18, %f48
3028	ldda	[SRC + 0x10]%asi, %f20
3029	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read
3030	faligndata %f18, %f20, %f50
3031	ldda	[SRC + 0x18]%asi, %f22
3032	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3033	faligndata %f20, %f22, %f52
3034	ldda	[SRC + 0x20]%asi, %f24
3035	prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read
3036	faligndata %f22, %f24, %f54
3037	ldda	[SRC + 0x28]%asi, %f26
3038	prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read
3039	faligndata %f24, %f26, %f56
3040	ldda	[SRC + 0x30]%asi, %f28
3041	prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read
3042	faligndata %f26, %f28, %f58
3043	ldda	[SRC + 0x38]%asi, %f30
3044	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3045	sub	CNT, VIS_BLOCKSIZE, CNT
3046	add	SRC, VIS_BLOCKSIZE, SRC
3047	prefetcha [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE]%asi, #one_read
3048	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3049	ba,pt	%ncc, 1f
3050	  prefetcha [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE]%asi, #one_read
3051	.align	32
30521:
3053	ldda	[SRC + 0x08]%asi, %f18
3054	faligndata %f28, %f30, %f60
3055	ldda	[SRC + 0x10]%asi, %f20
3056	faligndata %f30, %f16, %f62
3057	stda	%f48, [DST]ASI_BLK_P
3058	ldda	[SRC + 0x18]%asi, %f22
3059	faligndata %f16, %f18, %f48
3060	ldda	[SRC + 0x20]%asi, %f24
3061	faligndata %f18, %f20, %f50
3062	ldda	[SRC + 0x28]%asi, %f26
3063	faligndata %f20, %f22, %f52
3064	ldda	[SRC + 0x30]%asi, %f28
3065	faligndata %f22, %f24, %f54
3066	ldda	[SRC + 0x38]%asi, %f30
3067	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #n_reads
3068	faligndata %f24, %f26, %f56
3069	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3070	prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3071	faligndata %f26, %f28, %f58
3072	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20]%asi, #n_reads
3073	prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20]%asi, #one_read
3074	sub	CNT, VIS_BLOCKSIZE, CNT
3075	add	DST, VIS_BLOCKSIZE, DST
3076	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3077	cmp	CNT, VIS_BLOCKSIZE + 8
3078	bgu,pt	%ncc, 1b
3079	  add	SRC, VIS_BLOCKSIZE, SRC
3080
3081	! only if REALSRC & 0x7 is 0
3082	cmp	CNT, VIS_BLOCKSIZE
3083	bne	%ncc, 3f
3084	  andcc	REALSRC, 0x7, %g0
3085	bz,pt	%ncc, 2f
3086	  nop
30873:
3088	faligndata %f28, %f30, %f60
3089	faligndata %f30, %f16, %f62
3090	stda	%f48, [DST]ASI_BLK_P
3091	add	DST, VIS_BLOCKSIZE, DST
3092	ba,pt	%ncc, 3f
3093	  nop
30942:
3095	ldda	[SRC + 0x08]%asi, %f18
3096	fsrc1	%f28, %f60
3097	ldda	[SRC + 0x10]%asi, %f20
3098	fsrc1	%f30, %f62
3099	stda	%f48, [DST]ASI_BLK_P
3100	ldda	[SRC + 0x18]%asi, %f22
3101	fsrc1	%f16, %f48
3102	ldda	[SRC + 0x20]%asi, %f24
3103	fsrc1	%f18, %f50
3104	ldda	[SRC + 0x28]%asi, %f26
3105	fsrc1	%f20, %f52
3106	ldda	[SRC + 0x30]%asi, %f28
3107	fsrc1	%f22, %f54
3108	ldda	[SRC + 0x38]%asi, %f30
3109	fsrc1	%f24, %f56
3110	sub	CNT, VIS_BLOCKSIZE, CNT
3111	add	DST, VIS_BLOCKSIZE, DST
3112	add	SRC, VIS_BLOCKSIZE, SRC
3113	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3114	fsrc1	%f26, %f58
3115	fsrc1	%f28, %f60
3116	fsrc1	%f30, %f62
3117	stda	%f48, [DST]ASI_BLK_P
3118	add	DST, VIS_BLOCKSIZE, DST
3119	ba,a,pt	%ncc, 4f
3120	  nop
3121
31223:	tst	CNT
3123	bz,a	%ncc, 4f
3124	  nop
3125
31265:	lduba	[REALSRC]ASI_USER, TMP
3127	inc	REALSRC
3128	inc	DST
3129	deccc	CNT
3130	bgu	%ncc, 5b
3131	  stb	TMP, [DST - 1]
31324:
3133
3134.copyin_exit:
3135	membar	#Sync
3136
3137	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
3138	wr	%o2, 0, %gsr
3139
3140	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3141	btst	FPRS_FEF, %o3
3142	bz,pt	%icc, 4f
3143	  nop
3144
3145	BLD_FPQ2Q4_FROMSTACK(%o2)
3146
3147	ba,pt	%ncc, 1f
3148	  wr	%o3, 0, %fprs		! restore fprs
3149
31504:
3151	FZEROQ2Q4
3152	wr	%o3, 0, %fprs		! restore fprs
3153
31541:
3155	membar	#Sync				! sync error barrier
3156	andn	%l6, FPUSED_FLAG, %l6
3157	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3158	FP_ALLOWMIGRATE(5, 6)
3159	ret
3160	  restore	%g0, 0, %o0
3161/*
3162 * We got here because of a fault during copyin
3163 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3164 */
3165.copyin_err:
3166	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3167	tst	%o4
3168	bz,pt	%ncc, 2f			! if not, return error
3169	nop
3170	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
3171	jmp	%g2				! original arguments
3172	restore %g0, 0, %g0			! dispose of copy window
31732:
3174	ret
3175	restore %g0, -1, %o0			! return error value
3176
3177
3178	SET_SIZE(copyin_more)
3179
3180#endif	/* lint */
3181
3182#ifdef	lint
3183
3184/*ARGSUSED*/
3185int
3186xcopyin(const void *uaddr, void *kaddr, size_t count)
3187{ return (0); }
3188
3189#else	/* lint */
3190
3191	ENTRY(xcopyin)
3192
3193	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3194	bleu,pt	%ncc, .xcopyin_small		! go to larger cases
3195	  xor	%o0, %o1, %o3			! are src, dst alignable?
3196	btst	7, %o3				!
3197	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
3198	  nop
3199	btst	1, %o3				!
3200	bz,pt	%ncc, .xcopyin_2		! check for half-word
3201	  nop
3202	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3203	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3204	tst	%o3
3205	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3206	  cmp	%o2, %o3			! if length <= limit
3207	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3208	  nop
3209	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3210	  nop
3211.xcopyin_2:
3212	btst	3, %o3				!
3213	bz,pt	%ncc, .xcopyin_4		! check for word alignment
3214	  nop
3215	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3216	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3217	tst	%o3
3218	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3219	  cmp	%o2, %o3			! if length <= limit
3220	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3221	  nop
3222	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3223	  nop
3224.xcopyin_4:
3225	! already checked longword, must be word aligned
3226	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3227	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3228	tst	%o3
3229	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3230	  cmp	%o2, %o3			! if length <= limit
3231	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3232	  nop
3233	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3234	  nop
3235.xcopyin_8:
3236	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3237	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3238	tst	%o3
3239	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3240	  cmp	%o2, %o3			! if length <= limit
3241	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3242	  nop
3243	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3244	  nop
3245
3246.xcopyin_small:
3247	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3248	or	%o5, %lo(.sm_xcopyin_err), %o5
3249	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofaul
3250	membar	#Sync				! sync error barrier
3251	ba,pt	%ncc, .sm_do_copyin		! common code
3252	  stn	%o5, [THREAD_REG + T_LOFAULT]
3253
3254.xcopyin_more:
3255	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3256	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
3257	ba,pt	%ncc, .do_copyin
3258	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3259
3260/*
3261 * We got here because of fault during xcopyin
3262 * Errno value is in ERRNO
3263 */
3264.xcopyin_err:
3265	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3266	tst	%o4
3267	bz,pt	%ncc, 2f			! if not, return error
3268	  nop
3269	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
3270	jmp	%g2				! original arguments
3271	  restore %g0, 0, %g0			! dispose of copy window
32722:
3273        ret
3274	  restore ERRNO, 0, %o0			! return errno value
3275
3276.sm_xcopyin_err:
3277
3278	membar	#Sync
3279	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3280	mov	SM_SAVE_SRC, %o0
3281	mov	SM_SAVE_DST, %o1
3282	mov	SM_SAVE_COUNT, %o2
3283	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
3284	tst	%o3
3285	bz,pt	%ncc, 3f			! if not, return error
3286	  nop
3287	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
3288	jmp	%o5				! original arguments
3289	  nop
32903:
3291	retl
3292	  or	%g1, 0, %o0		! return errno value
3293
3294	SET_SIZE(xcopyin)
3295
3296#endif	/* lint */
3297
3298#ifdef	lint
3299
3300/*ARGSUSED*/
3301int
3302xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3303{ return (0); }
3304
3305#else	/* lint */
3306
3307	ENTRY(xcopyin_little)
3308	sethi	%hi(.xcopyio_err), %o5
3309	or	%o5, %lo(.xcopyio_err), %o5
3310	ldn	[THREAD_REG + T_LOFAULT], %o4
3311	membar	#Sync				! sync error barrier
3312	stn	%o5, [THREAD_REG + T_LOFAULT]
3313	mov	%o4, %o5
3314
3315	subcc	%g0, %o2, %o3
3316	add	%o0, %o2, %o0
3317	bz,pn	%ncc, 2f		! check for zero bytes
3318	  sub	%o2, 1, %o4
3319	add	%o0, %o4, %o0		! start w/last byte
3320	add	%o1, %o2, %o1
3321	lduba	[%o0 + %o3]ASI_AIUSL, %o4
3322
33231:	stb	%o4, [%o1 + %o3]
3324	inccc	%o3
3325	sub	%o0, 2, %o0		! get next byte
3326	bcc,a,pt %ncc, 1b
3327	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
3328
33292:
3330	membar	#Sync				! sync error barrier
3331	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3332	retl
3333	  mov	%g0, %o0		! return (0)
3334
3335.xcopyio_err:
3336	membar	#Sync				! sync error barrier
3337	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3338	retl
3339	  mov	%g1, %o0
3340
3341	SET_SIZE(xcopyin_little)
3342
3343#endif	/* lint */
3344
3345
3346/*
3347 * Copy a block of storage - must not overlap (from + len <= to).
3348 * No fault handler installed (to be called under on_fault())
3349 */
3350#if defined(lint)
3351
3352/* ARGSUSED */
3353void
3354copyin_noerr(const void *ufrom, void *kto, size_t count)
3355{}
3356
3357#else	/* lint */
3358	ENTRY(copyin_noerr)
3359
3360	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3361	bleu,pt	%ncc, .copyin_ne_small		! go to larger cases
3362	  xor	%o0, %o1, %o3			! are src, dst alignable?
3363	btst	7, %o3				!
3364	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
3365	  nop
3366	btst	1, %o3				!
3367	bz,pt	%ncc, .copyin_ne_2		! check for half-word
3368	  nop
3369	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3370	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3371	tst	%o3
3372	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3373	  cmp	%o2, %o3			! if length <= limit
3374	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3375	  nop
3376	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3377	  nop
3378.copyin_ne_2:
3379	btst	3, %o3				!
3380	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
3381	  nop
3382	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3383	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3384	tst	%o3
3385	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3386	  cmp	%o2, %o3			! if length <= limit
3387	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3388	  nop
3389	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3390	  nop
3391.copyin_ne_4:
3392	! already checked longword, must be word aligned
3393	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3394	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3395	tst	%o3
3396	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3397	  cmp	%o2, %o3			! if length <= limit
3398	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3399	  nop
3400	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3401	  nop
3402.copyin_ne_8:
3403	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3404	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3405	tst	%o3
3406	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3407	  cmp	%o2, %o3			! if length <= limit
3408	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3409	  nop
3410	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3411	  nop
3412
3413.copyin_ne_small:
3414	ldn	[THREAD_REG + T_LOFAULT], %o4
3415	tst	%o4
3416	bz,pn	%ncc, .sm_do_copyin
3417	  nop
3418	sethi	%hi(.sm_copyio_noerr), %o5
3419	or	%o5, %lo(.sm_copyio_noerr), %o5
3420	membar	#Sync				! sync error barrier
3421	ba,pt	%ncc, .sm_do_copyin
3422	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3423
3424.copyin_noerr_more:
3425	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3426	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3427	ba,pt	%ncc, .do_copyin
3428	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3429
3430.copyio_noerr:
3431	jmp	%l6
3432	  restore %g0,0,%g0
3433
3434.sm_copyio_noerr:
3435	membar	#Sync
3436	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
3437	jmp	%o4
3438	  nop
3439
3440	SET_SIZE(copyin_noerr)
3441#endif /* lint */
3442
3443/*
3444 * Copy a block of storage - must not overlap (from + len <= to).
3445 * No fault handler installed (to be called under on_fault())
3446 */
3447
3448#if defined(lint)
3449
3450/* ARGSUSED */
3451void
3452copyout_noerr(const void *kfrom, void *uto, size_t count)
3453{}
3454
3455#else	/* lint */
3456	ENTRY(copyout_noerr)
3457
3458	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3459	bleu,pt	%ncc, .copyout_ne_small		! go to larger cases
3460	  xor	%o0, %o1, %o3			! are src, dst alignable?
3461	btst	7, %o3				!
3462	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
3463	  nop
3464	btst	1, %o3				!
3465	bz,pt	%ncc, .copyout_ne_2		! check for half-word
3466	  nop
3467	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3468	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3469	tst	%o3
3470	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3471	  cmp	%o2, %o3			! if length <= limit
3472	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3473	  nop
3474	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3475	  nop
3476.copyout_ne_2:
3477	btst	3, %o3				!
3478	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
3479	  nop
3480	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3481	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3482	tst	%o3
3483	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3484	  cmp	%o2, %o3			! if length <= limit
3485	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3486	  nop
3487	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3488	  nop
3489.copyout_ne_4:
3490	! already checked longword, must be word aligned
3491	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3492	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3493	tst	%o3
3494	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3495	  cmp	%o2, %o3			! if length <= limit
3496	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3497	  nop
3498	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3499	  nop
3500.copyout_ne_8:
3501	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3502	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3503	tst	%o3
3504	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3505	  cmp	%o2, %o3			! if length <= limit
3506	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3507	  nop
3508	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3509	  nop
3510
3511.copyout_ne_small:
3512	ldn	[THREAD_REG + T_LOFAULT], %o4
3513	tst	%o4
3514	bz,pn	%ncc, .sm_do_copyout
3515	  nop
3516	sethi	%hi(.sm_copyio_noerr), %o5
3517	or	%o5, %lo(.sm_copyio_noerr), %o5
3518	membar	#Sync				! sync error barrier
3519	ba,pt	%ncc, .sm_do_copyout
3520	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3521
3522.copyout_noerr_more:
3523	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3524	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3525	ba,pt	%ncc, .do_copyout
3526	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3527
3528	SET_SIZE(copyout_noerr)
3529#endif /* lint */
3530
3531
3532/*
3533 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3534 * at least 256 bytes in length using spitfire's block stores.  If
3535 * the criteria for using this routine are not met then it calls bzero
3536 * and returns 1.  Otherwise 0 is returned indicating success.
3537 * Caller is responsible for ensuring use_hw_bzero is true and that
3538 * kpreempt_disable() has been called.
3539 */
3540#ifdef lint
3541/*ARGSUSED*/
3542int
3543hwblkclr(void *addr, size_t len)
3544{
3545	return(0);
3546}
3547#else /* lint */
3548	! %i0 - start address
3549	! %i1 - length of region (multiple of 64)
3550	! %l0 - saved fprs
3551	! %l1 - pointer to saved %d0 block
3552	! %l2 - saved curthread->t_lwp
3553
3554	ENTRY(hwblkclr)
3555	! get another window w/space for one aligned block of saved fpregs
3556	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3557
3558	! Must be block-aligned
3559	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
3560	bnz,pn	%ncc, 1f
3561	  nop
3562
3563	! ... and must be 256 bytes or more
3564	cmp	%i1, 256
3565	blu,pn	%ncc, 1f
3566	  nop
3567
3568	! ... and length must be a multiple of VIS_BLOCKSIZE
3569	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
3570	bz,pn	%ncc, 2f
3571	  nop
3572
35731:	! punt, call bzero but notify the caller that bzero was used
3574	mov	%i0, %o0
3575	call	bzero
3576	mov	%i1, %o1
3577	ret
3578	  restore	%g0, 1, %o0 ! return (1) - did not use block operations
3579
35802:	rd	%fprs, %l0		! check for unused fp
3581	btst	FPRS_FEF, %l0
3582	bz,pt	%icc, 1f
3583	  nop
3584
3585	! save in-use fpregs on stack
3586	membar	#Sync
3587	add	%fp, STACK_BIAS - 65, %l1
3588	and	%l1, -VIS_BLOCKSIZE, %l1
3589	stda	%d0, [%l1]ASI_BLK_P
3590
35911:	membar	#StoreStore|#StoreLoad|#LoadStore
3592	wr	%g0, FPRS_FEF, %fprs
3593	wr	%g0, ASI_BLK_P, %asi
3594
3595	! Clear block
3596	fzero	%d0
3597	fzero	%d2
3598	fzero	%d4
3599	fzero	%d6
3600	fzero	%d8
3601	fzero	%d10
3602	fzero	%d12
3603	fzero	%d14
3604
3605	mov	256, %i3
3606	ba,pt	%ncc, .pz_doblock
3607	  nop
3608
3609.pz_blkstart:
3610      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
3611	stda	%d0, [%i0 + 128]%asi
3612	stda	%d0, [%i0 + 64]%asi
3613	stda	%d0, [%i0]%asi
3614.pz_zinst:
3615	add	%i0, %i3, %i0
3616	sub	%i1, %i3, %i1
3617.pz_doblock:
3618	cmp	%i1, 256
3619	bgeu,a	%ncc, .pz_blkstart
3620	  stda	%d0, [%i0 + 192]%asi
3621
3622	cmp	%i1, 64
3623	blu	%ncc, .pz_finish
3624
3625	  andn	%i1, (64-1), %i3
3626	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
3627	set	.pz_zinst, %i4
3628	sub	%i4, %i2, %i4
3629	jmp	%i4
3630	  nop
3631
3632.pz_finish:
3633	membar	#Sync
3634	btst	FPRS_FEF, %l0
3635	bz,a	.pz_finished
3636	  wr	%l0, 0, %fprs		! restore fprs
3637
3638	! restore fpregs from stack
3639	ldda	[%l1]ASI_BLK_P, %d0
3640	membar	#Sync
3641	wr	%l0, 0, %fprs		! restore fprs
3642
3643.pz_finished:
3644	ret
3645	  restore	%g0, 0, %o0		! return (bzero or not)
3646
3647	SET_SIZE(hwblkclr)
3648#endif	/* lint */
3649
3650#ifdef lint
3651/*ARGSUSED*/
3652void
3653hw_pa_bcopy32(uint64_t src, uint64_t dst)
3654{}
3655#else /*!lint */
3656	/*
3657	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3658	 * using physical addresses.
3659	 */
3660	ENTRY_NP(hw_pa_bcopy32)
3661	rdpr	%pstate, %g1
3662	andn	%g1, PSTATE_IE, %g2
3663	wrpr	%g0, %g2, %pstate
3664
3665	rdpr	%pstate, %g0
3666	ldxa	[%o0]ASI_MEM, %o2
3667	add	%o0, 8, %o0
3668	ldxa	[%o0]ASI_MEM, %o3
3669	add	%o0, 8, %o0
3670	ldxa	[%o0]ASI_MEM, %o4
3671	add	%o0, 8, %o0
3672	ldxa	[%o0]ASI_MEM, %o5
3673	membar	#Sync
3674
3675	stxa	%o2, [%o1]ASI_MEM
3676	add	%o1, 8, %o1
3677	stxa	%o3, [%o1]ASI_MEM
3678	add	%o1, 8, %o1
3679	stxa	%o4, [%o1]ASI_MEM
3680	add	%o1, 8, %o1
3681	stxa	%o5, [%o1]ASI_MEM
3682
3683	retl
3684	  wrpr	  %g0, %g1, %pstate
3685
3686	SET_SIZE(hw_pa_bcopy32)
3687
3688#endif /* lint */
3689
3690#if defined(lint)
3691
3692int use_hw_bcopy = 1;
3693int use_hw_bzero = 1;
3694uint_t hw_copy_limit_1 = 0;
3695uint_t hw_copy_limit_2 = 0;
3696uint_t hw_copy_limit_4 = 0;
3697uint_t hw_copy_limit_8 = 0;
3698
3699#else /* !lint */
3700
3701	DGDEF(use_hw_bcopy)
3702	.word	1
3703	DGDEF(use_hw_bzero)
3704	.word	1
3705	DGDEF(hw_copy_limit_1)
3706	.word	0
3707	DGDEF(hw_copy_limit_2)
3708	.word	0
3709	DGDEF(hw_copy_limit_4)
3710	.word	0
3711	DGDEF(hw_copy_limit_8)
3712	.word	0
3713
3714	.align	64
3715	.section ".text"
3716#endif /* !lint */
3717