xref: /titanic_50/usr/src/uts/sun4u/cpu/opl_olympus_copy.s (revision ff3124eff995e6cd8ebd8c6543648e0670920034)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/asm_linkage.h>
31#include <sys/vtrace.h>
32#include <sys/machthread.h>
33#include <sys/clock.h>
34#include <sys/asi.h>
35#include <sys/fsr.h>
36#include <sys/privregs.h>
37
38#if !defined(lint)
39#include "assym.h"
40#endif	/* lint */
41
42/*
43 * Pseudo-code to aid in understanding the control flow of the
44 * bcopy/copyin/copyout routines.
45 *
46 * On entry:
47 *
48 * 	! Determine whether to use the FP register version
49 * 	! or the leaf routine version depending on size
50 * 	! of copy and flags.  Set up error handling accordingly.
51 *	! The transition point depends on whether the src and
52 * 	! dst addresses can be aligned to long word, word,
53 * 	! half word, or byte boundaries.
54 *	!
55 *	! WARNING: <Register usage convention>
56 *	! For FP version, %l6 holds previous error handling and
57 *	! a flag: TRAMP_FLAG (low bits)
58 *	! for leaf routine version, %o4 holds those values.
59 *	! So either %l6 or %o4 is reserved and not available for
60 *	! any other use.
61 *
62 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
63 * 		go to small_copy;		! to speed short copies
64 *
65 * 	if (src,dst long word alignable) {
66 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
67 * 			go to small_copy;
68 *		if (length <= hw_copy_limit_8)
69 * 			go to small_copy;
70 * 		go to FPBLK_copy;
71 * 	}
72 * 	if (src,dst not alignable) {
73 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
74 * 			go to small_copy;
75 *		if (length <= hw_copy_limit_1)
76 * 			go to small_copy;
77 * 		go to FPBLK_copy;
78 * 	}
79 * 	if (src,dst halfword alignable) {
80 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
81 * 			go to small_copy;
82 *		if (length <= hw_copy_limit_2)
83 * 			go to small_copy;
84 * 		go to FPBLK_copy;
85 * 	}
86 * 	if (src,dst word alignable) {
87 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
88 * 			go to small_copy;
89 *		if (length <= hw_copy_limit_4)
90 * 			go to small_copy;
91 * 		go to FPBLK_copy;
92 * 	}
93 *
94 * small_copy:
95 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
96 *
97 *	if (count <= 3)				! fast path for tiny copies
98 *		go to sm_left;			! special finish up code
99 *	else
100 *		if (count > CHKSIZE)		! medium sized copies
101 *			go to sm_med		! tuned by alignment
102 *		if(src&dst not both word aligned) {
103 *	sm_movebytes:
104 *			move byte by byte in 4-way unrolled loop
105 *			fall into sm_left;
106 *	sm_left:
107 *			move 0-3 bytes byte at a time as needed.
108 *			restore error handler and exit.
109 *
110 * 		} else {	! src&dst are word aligned
111 *			check for at least 8 bytes left,
112 *			move word at a time, unrolled by 2
113 *			when fewer than 8 bytes left,
114 *	sm_half:	move half word at a time while 2 or more bytes left
115 *	sm_byte:	move final byte if necessary
116 *	sm_exit:
117 *			restore error handler and exit.
118 *		}
119 *
120 * ! Medium length cases with at least CHKSIZE bytes available
121 * ! method: line up src and dst as best possible, then
122 * ! move data in 4-way unrolled loops.
123 *
124 * sm_med:
125 *	if(src&dst unalignable)
126 * 		go to sm_movebytes
127 *	if(src&dst halfword alignable)
128 *		go to sm_movehalf
129 *	if(src&dst word alignable)
130 *		go to sm_moveword
131 * ! fall into long word movement
132 *	move bytes until src is word aligned
133 *	if not long word aligned, move a word
134 *	move long words in 4-way unrolled loop until < 32 bytes left
135 *      move long words in 1-way unrolled loop until < 8 bytes left
136 *	if zero bytes left, goto sm_exit
137 *	if one byte left, go to sm_byte
138 *	else go to sm_half
139 *
140 * sm_moveword:
141 *	move bytes until src is word aligned
142 *	move words in 4-way unrolled loop until < 16 bytes left
143 *      move words in 1-way unrolled loop until < 4 bytes left
144 *	if zero bytes left, goto sm_exit
145 *	if one byte left, go to sm_byte
146 *	else go to sm_half
147 *
148 * sm_movehalf:
149 *	move a byte if needed to align src on halfword
150 *	move halfwords in 4-way unrolled loop until < 8 bytes left
151 *	if zero bytes left, goto sm_exit
152 *	if one byte left, go to sm_byte
153 *	else go to sm_half
154 *
155 *
156 * FPBLK_copy:
157 * 	%l6 = curthread->t_lofault;
158 * 	if (%l6 != NULL) {
159 * 		membar #Sync
160 * 		curthread->t_lofault = .copyerr;
161 * 		caller_error_handler = TRUE             ! %l6 |= 2
162 * 	}
163 *
164 *	! for FPU testing we must not migrate cpus
165 * 	if (curthread->t_lwp == NULL) {
166 *		! Kernel threads do not have pcb's in which to store
167 *		! the floating point state, so disallow preemption during
168 *		! the copy.  This also prevents cpu migration.
169 * 		kpreempt_disable(curthread);
170 *	} else {
171 *		thread_nomigrate();
172 *	}
173 *
174 * 	old_fprs = %fprs;
175 * 	old_gsr = %gsr;
176 * 	if (%fprs.fef) {
177 * 		%fprs.fef = 1;
178 * 		save current fpregs on stack using blockstore
179 * 	} else {
180 * 		%fprs.fef = 1;
181 * 	}
182 *
183 *
184 * 	do_blockcopy_here;
185 *
186 * In lofault handler:
187 *	curthread->t_lofault = .copyerr2;
188 *	Continue on with the normal exit handler
189 *
190 * On normal exit:
191 * 	%gsr = old_gsr;
192 * 	if (old_fprs & FPRS_FEF)
193 * 		restore fpregs from stack using blockload
194 *	else
195 *		zero fpregs
196 * 	%fprs = old_fprs;
197 * 	membar #Sync
198 * 	curthread->t_lofault = (%l6 & ~3);
199 *	! following test omitted from copyin/copyout as they
200 *	! will always have a current thread
201 * 	if (curthread->t_lwp == NULL)
202 *		kpreempt_enable(curthread);
203 *	else
204 *		thread_allowmigrate();
205 * 	return (0)
206 *
207 * In second lofault handler (.copyerr2):
208 *	We've tried to restore fp state from the stack and failed.  To
209 *	prevent from returning with a corrupted fp state, we will panic.
210 */
211
212/*
213 * Comments about optimization choices
214 *
215 * The initial optimization decision in this code is to determine
216 * whether to use the FP registers for a copy or not.  If we don't
217 * use the FP registers, we can execute the copy as a leaf routine,
218 * saving a register save and restore.  Also, less elaborate setup
219 * is required, allowing short copies to be completed more quickly.
220 * For longer copies, especially unaligned ones (where the src and
221 * dst do not align to allow simple ldx,stx operation), the FP
222 * registers allow much faster copy operations.
223 *
224 * The estimated extra cost of the FP path will vary depending on
225 * src/dst alignment, dst offset from the next 64 byte FPblock store
226 * boundary, remaining src data after the last full dst cache line is
227 * moved, whether the FP registers need to be saved, and some other
228 * minor issues.  The average additional overhead is estimated to be
229 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
230 * around 10 clocks, elaborate calculation would slow down all
231 * longer copies and only benefit a small portion of medium sized
232 * copies.  Rather than incur such cost, we chose fixed transition
233 * points for each of the alignment choices.
234 *
235 * For the inner loop, here is a comparison of the per cache line
236 * costs for each alignment when src&dst are in cache:
237 *
238 * byte aligned:  108 clocks slower for non-FPBLK
239 * half aligned:   44 clocks slower for non-FPBLK
240 * word aligned:   12 clocks slower for non-FPBLK
241 * long aligned:    4 clocks >>faster<< for non-FPBLK
242 *
243 * The long aligned loop runs faster because it does no prefetching.
244 * That wins if the data is already in cache or there is too little
245 * data to gain much benefit from prefetching.  But when there
246 * is more data and that data is not in cache, failing to prefetch
247 * can run much slower.  In addition, there is a 2 Kbyte store queue
248 * which will cause the non-FPBLK inner loop to slow for larger copies.
249 * The exact tradeoff is strongly load and application dependent, with
250 * increasing risk of a customer visible performance regression if the
251 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
252 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
253 * upper limit for the non-FPBLK code.  To minimize performance regression
254 * risk while still gaining the primary benefits of the improvements to
255 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
256 * hw_copy_limit_*.  Later experimental studies using different values
257 * of hw_copy_limit_* can be used to make further adjustments if
258 * appropriate.
259 *
260 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
261 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
262 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
263 * hw_copy_limit_8 = src and dst are longword aligned
264 *
265 * To say that src and dst are word aligned means that after
266 * some initial alignment activity of moving 0 to 3 bytes,
267 * both the src and dst will be on word boundaries so that
268 * word loads and stores may be used.
269 *
270 * Default values at May,2005 are:
271 * hw_copy_limit_1 =  256
272 * hw_copy_limit_2 =  512
273 * hw_copy_limit_4 = 1024
274 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
275 *
276 *
277 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
278 * disabled for that alignment choice.
279 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
280 * the value of VIS_COPY_THRESHOLD is used.
281 * It is not envisioned that hw_copy_limit_? will be changed in the field
282 * It is provided to allow for disabling FPBLK copies and to allow
283 * easy testing of alternate values on future HW implementations
284 * that might have different cache sizes, clock rates or instruction
285 * timing rules.
286 *
287 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
288 * threshold to speedup all shorter copies (less than 256).  That
289 * saves an alignment test, memory reference, and enabling test
290 * for all short copies, or an estimated 24 clocks.
291 *
292 * The order in which these limits are checked does matter since each
293 * non-predicted tst and branch costs around 10 clocks.
294 * If src and dst are randomly selected addresses,
295 * 4 of 8 will not be alignable.
296 * 2 of 8 will be half word alignable.
297 * 1 of 8 will be word alignable.
298 * 1 of 8 will be long word alignable.
299 * But, tests on running kernels show that src and dst to copy code
300 * are typically not on random alignments.  Structure copies and
301 * copies of larger data sizes are often on long word boundaries.
302 * So we test the long word alignment case first, then
303 * the byte alignment, then halfword, then word alignment.
304 *
305 * Several times, tests for length are made to split the code
306 * into subcases.  These tests often allow later tests to be
307 * avoided.  For example, within the non-FPBLK copy, we first
308 * check for tiny copies of 3 bytes or less.  That allows us
309 * to use a 4-way unrolled loop for the general byte copy case
310 * without a test on loop entry.
311 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
312 * vs longer cases.  For the really short case, we don't attempt to
313 * align src and dst.  We try to minimize special case tests in
314 * the shortest loops as each test adds a significant percentage
315 * to the total time.
316 *
317 * For the medium sized cases, we allow ourselves to adjust the
318 * src and dst alignment and provide special cases for each of
319 * the four adjusted alignment cases. The CHKSIZE that was used
320 * to decide between short and medium size was chosen to be 39
321 * as that allows for the worst case of 7 bytes of alignment
322 * shift and 4 times 8 bytes for the first long word unrolling.
323 * That knowledge saves an initial test for length on entry into
324 * the medium cases.  If the general loop unrolling factor were
325 * to be increased, this number would also need to be adjusted.
326 *
327 * For all cases in the non-FPBLK code where it is known that at
328 * least 4 chunks of data are available for movement, the
329 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
330 * or 2 clocks per data element.
331 *
332 * Instruction alignment is forced by use of .align 16 directives
333 * and nops which are not executed in the code.  This
334 * combination of operations shifts the alignment of following
335 * loops to ensure that loops are aligned so that their instructions
336 * fall within the minimum number of 4 instruction fetch groups.
337 * If instructions are inserted or removed between the .align
338 * instruction and the unrolled loops, then the alignment needs
339 * to be readjusted.  Misaligned loops can add a clock per loop
340 * iteration to the loop timing.
341 *
342 * In a few cases, code is duplicated to avoid a branch.  Since
343 * a non-predicted tst and branch takes 10 clocks, this savings
344 * is judged an appropriate time-space tradeoff.
345 *
346 * Within the FPBLK-code, the prefetch method in the inner
347 * loop needs to be explained as it is not standard.  Two
348 * prefetches are issued for each cache line instead of one.
349 * The primary one is at the maximum reach of 8 cache lines.
350 * Most of the time, that maximum prefetch reach gives the
351 * cache line more time to reach the processor for systems with
352 * higher processor clocks.  But, sometimes memory interference
353 * can cause that prefetch to be dropped.  Putting a second
354 * prefetch at a reach of 5 cache lines catches the drops
355 * three iterations later and shows a measured improvement
356 * in performance over any similar loop with a single prefetch.
357 * The prefetches are placed in the loop so they overlap with
358 * non-memory instructions, so that there is no extra cost
359 * when the data is already in-cache.
360 *
361 */
362
363/*
364 * Notes on preserving existing fp state and on membars.
365 *
366 * When a copyOP decides to use fp we may have to preserve existing
367 * floating point state.  It is not the caller's state that we need to
368 * preserve - the rest of the kernel does not use fp and, anyway, fp
369 * registers are volatile across a call.  Some examples:
370 *
371 *	- userland has fp state and is interrupted (device interrupt
372 *	  or trap) and within the interrupt/trap handling we use
373 *	  bcopy()
374 *	- another (higher level) interrupt or trap handler uses bcopy
375 *	  while a bcopy from an earlier interrupt is still active
376 *	- an asynchronous error trap occurs while fp state exists (in
377 *	  userland or in kernel copy) and the tl0 component of the handling
378 *	  uses bcopy
379 *	- a user process with fp state incurs a copy-on-write fault and
380 *	  hwblkpagecopy always uses fp
381 *
382 * We therefore need a per-call place in which to preserve fp state -
383 * using our stack is ideal (and since fp copy cannot be leaf optimized
384 * because of calls it makes, this is no hardship).
385 *
386 * When we have finished fp copy (with its repeated block stores)
387 * we must membar #Sync so that our block stores may complete before
388 * we either restore the original fp state into the fp registers or
389 * return to a caller which may initiate other fp operations that could
390 * modify the fp regs we used before the block stores complete.
391 *
392 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
393 * t_lofault is not NULL will not panic but will instead trampoline
394 * to the registered lofault handler.  There is no need for any
395 * membars for these - eg, our store to t_lofault will always be visible to
396 * ourselves and it is our cpu which will take any trap.
397 *
398 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
399 * while t_lofault is not NULL will also not panic.  Since we're copying
400 * to or from userland the extent of the damage is known - the destination
401 * buffer is incomplete.  So trap handlers will trampoline to the lofault
402 * handler in this case which should take some form of error action to
403 * avoid using the incomplete buffer.  The trap handler also flags the
404 * fault so that later return-from-trap handling (for the trap that brought
405 * this thread into the kernel in the first place) can notify the process
406 * and reboot the system (or restart the service with Greenline/Contracts).
407 *
408 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
409 * result in deferred error traps - the trap is taken sometime after
410 * the event and the trap PC may not be the PC of the faulting access.
411 * Delivery of such pending traps can be forced by a membar #Sync, acting
412 * as an "error barrier" in this role.  To accurately apply the user/kernel
413 * separation described in the preceding paragraph we must force delivery
414 * of deferred traps affecting kernel state before we install a lofault
415 * handler (if we interpose a new lofault handler on an existing one there
416 * is no need to repeat this), and we must force delivery of deferred
417 * errors affecting the lofault-protected region before we clear t_lofault.
418 * Failure to do so results in lost kernel state being interpreted as
419 * affecting a copyin/copyout only, or of an error that really only
420 * affects copy data being interpreted as losing kernel state.
421 *
422 * Since the copy operations may preserve and later restore floating
423 * point state that does not belong to the caller (see examples above),
424 * we must be careful in how we do this in order to prevent corruption
425 * of another program.
426 *
427 * To make sure that floating point state is always saved and restored
428 * correctly, the following "big rules" must be followed when the floating
429 * point registers will be used:
430 *
431 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
432 *    bit 0 (FPUSED_FLAG, value 1) indicates that the floating point registers
433 *    are in use.  Bit 1 (TRAMP_FLAG, value 2) indicates that the call was to
434 *    bcopy, and a lofault handler was set coming in.
435 *
436 * 2. The FPUSED flag indicates that all FP state has been successfully stored
437 *    on the stack.  It should not be set until this save has been completed.
438 *
439 * 3. The FPUSED flag should not be cleared on exit until all FP state has
440 *    been restored from the stack.  If an error occurs while restoring
441 *    data from the stack, the error handler can check this flag to see if
442 *    a restore is necessary.
443 *
444 * 4. Code run under the new lofault handler must be kept to a minimum.  In
445 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
446 *    to kpreempt(), should not be made until after the lofault handler has
447 *    been restored.
448 */
449
450/*
451 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
452 * to "break even" using FP/VIS-accelerated memory operations.
453 * The FPBLK code assumes a minimum number of bytes are available
454 * to be moved on entry.  Check that code carefully before
455 * reducing VIS_COPY_THRESHOLD below 256.
 *
 * kcopy compares the byte count against this value first and sends
 * any copy of VIS_COPY_THRESHOLD bytes or fewer to the leaf-routine
 * (non-FP) path without looking at alignment or hw_copy_limit_?.
456 */
457/*
458 * This shadows sys/machsystm.h which can't be included due to the lack of
459 * _ASM guards in include files it references. Change it here, change it there.
460 */
461#define VIS_COPY_THRESHOLD 256
462
463/*
464 * TEST for very short copies
465 * Be aware that the maximum unroll for the short unaligned case
466 * is SHORTCOPY+1
 *
 * SHORTCOPY: presumably the bound used by the "count <= 3" tiny-copy
 * test described in the pseudo-code at the top of this file (the use
 * site is outside this part of the file - confirm there).
 *
 * CHKSIZE: boundary between the short non-FPBLK copies, for which no
 * attempt is made to align src and dst, and the medium sized copies
 * that are tuned by alignment.  39 = 7 bytes of worst-case alignment
 * shift + 4 * 8 bytes for the first 4-way unrolled long word pass, so
 * this value must be revisited if the unrolling factor is increased.
467 */
468#define SHORTCOPY 3
469#define CHKSIZE  39
470
471/*
 * Flags kept in the low bits of the saved lofault handler value
 * (%l6 in the FP version; %o4 in the leaf version, which only uses
 * TRAMP_FLAG).
 *
 * TRAMP_FLAG:
472 * Indicates that we're to trampoline to the error handler.
473 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
474 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 *
 * FPUSED_FLAG:
 * Indicates that the FP state has been saved on the stack; .copyerr
 * tests it to decide whether %gsr, %fprs and the fp registers must be
 * restored.
 *
 * MASK_FLAGS:
 * Both flag bits; .copyerr clears them from %l6 to recover the
 * original t_lofault value before storing it back.
475 */
476#define	FPUSED_FLAG	1
477#define	TRAMP_FLAG	2
478#define	MASK_FLAGS	3
479
480/*
481 * Number of outstanding prefetches.
482 * first prefetch moves data from L2 to L1 (n_reads)
483 * second prefetch moves data from memory to L2 (one_read)
 *
 * NOTE(review): neither constant is referenced in this part of the
 * file; which constant pairs with which prefetch, and the units of
 * the values, should be confirmed at the use site.
484 */
485#define	OLYMPUS_C_PREFETCH	24
486#define	OLYMPUS_C_2ND_PREFETCH	12
487
/* Size in bytes of one FP block load/store (ASI_BLK_P) transfer. */
488#define	VIS_BLOCKSIZE		64
489
490/*
491 * Size of stack frame in order to accommodate a 64-byte aligned
492 * floating-point register save area and 2 64-bit temp locations.
493 * All copy functions use two quadrants of fp registers; to assure a
494 * block-aligned two block buffer in which to save we must reserve
495 * three blocks on stack.  Not all functions preserve %fprs on stack
496 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
497 *
498 *    _______________________________________ <-- %fp + STACK_BIAS
499 *    | We may need to preserve 2 quadrants |
500 *    | of fp regs, but since we do so with |
501 *    | BST/BLD we need room in which to    |
502 *    | align to VIS_BLOCKSIZE bytes.  So   |
503 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
504 *    |-------------------------------------|
505 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
506 *    |-------------------------------------|
507 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
508 *    ---------------------------------------
 *
 * All offsets are subtracted from %fp + STACK_BIAS.
 *
 * SAVED_FPREGS_ADJUST (two blocks minus one byte) is what the
 * BST_/BLD_ macros subtract from %fp + STACK_BIAS before rounding down
 * to a VIS_BLOCKSIZE boundary; the result is the address of the first
 * of the two block-aligned save blocks inside the three-block area.
 *
 * SAVED_GSR_OFFSET works out equal to HWCOPYFRAMESIZE, i.e. the %gsr
 * slot is the lowest-addressed 8 bytes of the area.
509 */
510#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
511#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
512#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
513#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
514#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
515
516/*
517 * Common macros used by the various versions of the block copy
518 * routines in this file.
519 */
520
521/*
522 * In FP copies if we do not have preserved data to restore over
523 * the fp regs we used then we must zero those regs to avoid
524 * exposing portions of the data to later threads (data security).
525 *
526 * Copy functions use either quadrants 1 and 3 or 2 and 4.
527 *
528 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
529 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
530 *
531 * The instructions below are quicker than repeated fzero instructions
532 * since they can dispatch down two fp pipelines.
533 */
534#define	FZEROQ1Q3			\
535	fzero	%f0	/* %f0 = 0; source for every copy below */	;\
536	fmovd	%f0, %f2	/* rest of quadrant 1: %f2 - %f14 */	;\
537	fmovd	%f0, %f4		;\
538	fmovd	%f0, %f6		;\
539	fmovd	%f0, %f8		;\
540	fmovd	%f0, %f10		;\
541	fmovd	%f0, %f12		;\
542	fmovd	%f0, %f14		;\
543	fmovd	%f0, %f32	/* quadrant 3: %f32 - %f46 */		;\
544	fmovd	%f0, %f34		;\
545	fmovd	%f0, %f36		;\
546	fmovd	%f0, %f38		;\
547	fmovd	%f0, %f40		;\
548	fmovd	%f0, %f42		;\
549	fmovd	%f0, %f44		;\
550	fmovd	%f0, %f46
551
/*
 * Zero quadrants 2 and 4 (%f16 - %f31 and %f48 - %f63).
 * %f16 is zeroed with fzero and then used as the source of every
 * fmovd.  The source must be %f16 and not %f0: %f0 belongs to
 * quadrant 1, is not zeroed by this macro, and copying it would
 * spread its stale contents over the registers we are meant to clear.
 */
552#define	FZEROQ2Q4			\
553	fzero	%f16			;\
554	fmovd	%f16, %f18		;\
555	fmovd	%f16, %f20		;\
556	fmovd	%f16, %f22		;\
557	fmovd	%f16, %f24		;\
558	fmovd	%f16, %f26		;\
559	fmovd	%f16, %f28		;\
560	fmovd	%f16, %f30		;\
561	fmovd	%f16, %f48		;\
562	fmovd	%f16, %f50		;\
563	fmovd	%f16, %f52		;\
564	fmovd	%f16, %f54		;\
565	fmovd	%f16, %f56		;\
566	fmovd	%f16, %f58		;\
567	fmovd	%f16, %f60		;\
568	fmovd	%f16, %f62
569
570/*
571 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
572 * Used to save and restore in-use fp registers when we want to use FP
573 * and find fp already in use and copy size still large enough to justify
574 * the additional overhead of this save and restore.
575 *
576 * A membar #Sync is needed before save to sync fp ops initiated before
577 * the call to the copy function (by whoever has fp in use); for example
578 * an earlier block load to the quadrant we are about to save may still be
579 * "in flight".  A membar #Sync is required at the end of the save to
580 * sync our block store (the copy code is about to begin ldd's to the
581 * first quadrant).
582 *
583 * Similarly: a membar #Sync before restore allows the block stores of
584 * the copy operation to complete before we fill the quadrants with their
585 * original data, and a membar #Sync after restore lets the block loads
586 * of the restore complete before we return to whoever has the fp regs
587 * in use.  To avoid repeated membar #Sync we make it the responsibility
588 * of the copy code to membar #Sync immediately after copy is complete
589 * and before using the BLD_*_FROMSTACK macro.
590 */
591#if !defined(lint)
/*
 * Each macro takes one scratch register (tmp1), which is clobbered.
 * tmp1 is set to (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) rounded down
 * to a VIS_BLOCKSIZE boundary; the first quadrant is stored/loaded
 * there and the second quadrant one block higher.
 */
592#define BST_FPQ1Q3_TOSTACK(tmp1)				\
593	/* membar #Sync	- not emitted; NOTE(review): the comment above says a sync is needed before the save, confirm callers provide it */	;\
594	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
595	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
596	stda	%f0, [tmp1]ASI_BLK_P	/* save quadrant 1 */	;\
597	add	tmp1, VIS_BLOCKSIZE, tmp1 /* next block */	;\
598	stda	%f32, [tmp1]ASI_BLK_P	/* save quadrant 3 */	;\
599	membar	#Sync
600
601#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
602	/* membar #Sync - provided at copy completion */	;\
603	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
604	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
605	ldda	[tmp1]ASI_BLK_P, %f0	/* reload quadrant 1 */	;\
606	add	tmp1, VIS_BLOCKSIZE, tmp1 /* next block */	;\
607	ldda	[tmp1]ASI_BLK_P, %f32	/* reload quadrant 3 */	;\
608	membar	#Sync
609
610#define BST_FPQ2Q4_TOSTACK(tmp1)				\
611	/* membar #Sync - not emitted; NOTE(review): see BST_FPQ1Q3_TOSTACK */	;\
612	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
613	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
614	stda	%f16, [tmp1]ASI_BLK_P	/* save quadrant 2 */	;\
615	add	tmp1, VIS_BLOCKSIZE, tmp1 /* next block */	;\
616	stda	%f48, [tmp1]ASI_BLK_P	/* save quadrant 4 */	;\
617	membar	#Sync
618
619#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
620	/* membar #Sync - provided at copy completion */	;\
621	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
622	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
623	ldda	[tmp1]ASI_BLK_P, %f16	/* reload quadrant 2 */	;\
624	add	tmp1, VIS_BLOCKSIZE, tmp1 /* next block */	;\
625	ldda	[tmp1]ASI_BLK_P, %f48	/* reload quadrant 4 */	;\
626	membar	#Sync
627#endif
628
629/*
630 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
631 * prevent preemption if there is no t_lwp to save FP state to on context
632 * switch) before commencing a FP copy, and reallow it on completion or
633 * in error trampoline paths when we were using FP copy.
634 *
635 * Both macros may call other functions, so be aware that all outputs are
636 * forfeit after using these macros.  For this reason we do not pass registers
637 * to use - we just use any outputs we want.
638 *
639 * Pseudo code:
640 *
641 * FP_NOMIGRATE:
642 *
643 * if (curthread->t_lwp) {
644 *	thread_nomigrate();
645 * } else {
646 *	kpreempt_disable();
647 * }
648 *
649 * FP_ALLOWMIGRATE:
650 *
651 * if (curthread->t_lwp) {
652 *	thread_allowmigrate();
653 * } else {
654 *	kpreempt_enable();
655 * }
656 */
657
/*
 * label1 and label2 are numeric local labels supplied by the caller;
 * they are referenced forward (label/f) and defined inside the macro.
 * Uses %o0 and %o1 directly.
 */
658#define	FP_NOMIGRATE(label1, label2)				\
659	ldn	[THREAD_REG + T_LWP], %o0 /* %o0 = t_lwp */	;\
660	brz,a,pn %o0, label1/**/f /* no lwp: bump t_preempt */	;\
661	  ldsb	[THREAD_REG + T_PREEMPT], %o1 /* annulled slot: only when branch taken */ ;\
662	call	thread_nomigrate				;\
663	  nop							;\
664	ba	label2/**/f					;\
665	  nop							;\
666label1:								;\
667	inc	%o1		/* t_preempt + 1 */		;\
668	stb	%o1, [THREAD_REG + T_PREEMPT] /* i.e. kpreempt_disable() */ ;\
669label2:
670
/*
 * label1 and label2 are numeric local labels supplied by the caller;
 * they are referenced forward (label/f) and defined inside the macro.
 * Uses %o0 and %o1 directly.  When there is no lwp, the preempt count
 * is decremented and, if it reaches zero and the cpu's kprunrun byte
 * is set, kpreempt is called with the current %pil.
 */
671#define	FP_ALLOWMIGRATE(label1, label2)			\
672	ldn	[THREAD_REG + T_LWP], %o0 /* %o0 = t_lwp */	;\
673	brz,a,pn %o0, label1/**/f /* no lwp: drop t_preempt */	;\
674	  ldsb	[THREAD_REG + T_PREEMPT], %o1 /* annulled slot: only when branch taken */ ;\
675	call thread_allowmigrate				;\
676	  nop							;\
677	ba	label2/**/f					;\
678	  nop							;\
679label1:								;\
680	dec	%o1		/* t_preempt - 1 */		;\
681	brnz,pn	%o1, label2/**/f /* still disabled: done */	;\
682	  stb	%o1, [THREAD_REG + T_PREEMPT] /* delay slot: always stores new count */ ;\
683	ldn	[THREAD_REG + T_CPU], %o0 /* %o0 = t_cpu */	;\
684	ldub	[%o0 + CPU_KPRUNRUN], %o0 /* preemption pending? */ ;\
685	brz,pt	%o0, label2/**/f				;\
686	  nop							;\
687	call	kpreempt					;\
688	  rdpr	%pil, %o0	/* delay slot: argument = current %pil */ ;\
689label2:
690
691/*
692 * Copy a block of storage, returning an error code if `from' or
693 * `to' takes a kernel pagefault which cannot be resolved.
694 * Returns errno value on pagefault error, 0 if all ok
 *
 * from  - source address
 * to    - destination address
 * count - number of bytes to copy
 *
 * kcopy always installs its own t_lofault handler for the duration
 * of the copy; if that handler fires, the error code is returned and
 * any previously installed handler is not invoked.
695 */
696
697#if defined(lint)
698
/* Stub seen only by lint; the real implementation is the assembly in the #else branch. */
699/* ARGSUSED */
700int
701kcopy(const void *from, void *to, size_t count)
702{ return(0); }
703
704#else	/* lint */
705
706	.seg	".text"
707	.align	4
708
	!
	! kcopy entry and dispatch.
	! In: %o0 = from, %o1 = to, %o2 = count.
	! Copies of at most VIS_COPY_THRESHOLD bytes go to .kcopy_small.
	! Longer copies are classified by the low bits of (from ^ to) and
	! compared with the matching hw_copy_limit_N: a limit of zero, or
	! count <= limit, selects .kcopy_small, otherwise .kcopy_more.
	! .kcopy_small keeps the old t_lofault in %o4 and stays a leaf;
	! .kcopy_more takes a register window and keeps it in %l6.
	!
709	ENTRY(kcopy)
710
711	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
712	bleu,pt	%ncc, .kcopy_small		! count <= threshold: small copy
713	  xor	%o0, %o1, %o3			! are src, dst alignable?
714	btst	7, %o3				! low 3 bits of src^dst
715	bz,pt	%ncc, .kcopy_8			! check for longword alignment
716	  nop
717	btst	1, %o3				! bit 0 of src^dst
718	bz,pt	%ncc, .kcopy_2			! check for half-word
719	  nop
720	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
721	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
722	tst	%o3
723	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
724	  cmp	%o2, %o3			! if length <= limit
725	bleu,pt	%ncc, .kcopy_small		! go to small copy
726	  nop
727	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
728	  nop
729.kcopy_2:
730	btst	3, %o3				! low 2 bits of src^dst
731	bz,pt	%ncc, .kcopy_4			! check for word alignment
732	  nop
733	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
734	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
735	tst	%o3
736	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
737	  cmp	%o2, %o3			! if length <= limit
738	bleu,pt	%ncc, .kcopy_small		! go to small copy
739	  nop
740	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
741	  nop
742.kcopy_4:
743	! already checked longword, must be word aligned
744	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
745	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
746	tst	%o3
747	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
748	  cmp	%o2, %o3			! if length <= limit
749	bleu,pt	%ncc, .kcopy_small		! go to small copy
750	  nop
751	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
752	  nop
753.kcopy_8:
754	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
755	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
756	tst	%o3
757	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
758	  cmp	%o2, %o3			! if length <= limit
759	bleu,pt	%ncc, .kcopy_small		! go to small copy
760	  nop
761	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
762	  nop
763
764.kcopy_small:
765	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
766	or	%o5, %lo(.sm_copyerr), %o5
767	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
768	membar	#Sync				! sync error barrier
769	ba,pt	%ncc, .sm_do_copy		! common code
770	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
771
772.kcopy_more:
773	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp	! frame with fp save area
774	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
775	or	%l7, %lo(.copyerr), %l7
776	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
777	membar	#Sync				! sync error barrier
778	ba,pt	%ncc, .do_copy			! common code
779	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
780
781
782/*
783 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
784 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 *
 * %l6 holds the saved t_lofault value plus FPUSED_FLAG/TRAMP_FLAG.
 * Sequence: install .copyerr2 as the handler, restore %gsr, the fp
 * registers (or zero them) and %fprs if FPUSED_FLAG is set, put the
 * old t_lofault back, re-allow migration, then either return the
 * errno (no TRAMP_FLAG) or jump to the old handler (TRAMP_FLAG).
785 */
786.copyerr:
787	set	.copyerr2, %l0			! a fault from here on panics
788	membar	#Sync				! sync error barrier
789	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
790	btst	FPUSED_FLAG, %l6		! was fp state saved?
791	bz	%ncc, 1f			! no: nothing to restore
792	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0
793
794	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
795	wr	%o2, 0, %gsr
796
797	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3	! %o3 = saved fprs
798	btst	FPRS_FEF, %o3			! fp enabled before the copy?
799	bz,pt	%icc, 4f			! no: zero the regs instead
800	  nop
801
802	BLD_FPQ1Q3_FROMSTACK(%o2)
803
804	ba,pt	%ncc, 1f
805	  wr	%o3, 0, %fprs		! restore fprs
806
8074:
808	FZEROQ1Q3
809	wr	%o3, 0, %fprs		! restore fprs
810
811	!
812	! Need to cater for the different expectations of kcopy
813	! and bcopy. kcopy will *always* set a t_lofault handler
814	! If it fires, we're expected to just return the error code
815	! and *not* to invoke any existing error handler. As far as
816	! bcopy is concerned, we only set t_lofault if there was an
817	! existing lofault handler. In that case we're expected to
818	! invoke the previously existing handler after resetting the
819	! t_lofault value.
820	!
8211:
822	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
823	membar	#Sync				! sync error barrier
824	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	! old handler is back in place, so the macro below (which may
	! call kpreempt) no longer runs under our lofault handler
825	FP_ALLOWMIGRATE(5, 6)
826
827	btst	TRAMP_FLAG, %l0			! flag copied into %l0 above
828	bnz,pn	%ncc, 3f			! set: go to the old handler
829	  nop
830	ret
831	  restore	%g1, 0, %o0		! return errno from %g1
832
8333:
834	!
835	! We're here via bcopy. There *must* have been an error handler
836	! in place otherwise we would have died a nasty death already.
837	!
838	jmp	%l6				! goto real handler
839	  restore	%g0, 0, %o0		! dispose of copy window
840
841/*
842 * We got here because of a fault in .copyerr.  We can't safely restore fp
843 * state, so we panic.
844 */
845fp_panic_msg:
846	.asciz	"Unable to restore fp state after copy operation"
847
848	.align	4
849.copyerr2:
850	set	fp_panic_msg, %o0
851	call	panic
852	  nop
853
854/*
855 * We got here because of a fault during a small kcopy or bcopy.
856 * No floating point registers are used by the small copies.
857 * Errno value is in %g1.
858 */
859.sm_copyerr:
8601:
861	btst	TRAMP_FLAG, %o4
862	membar	#Sync
863	andn	%o4, TRAMP_FLAG, %o4
864	bnz,pn	%ncc, 3f
865	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
866	retl
867	  mov	%g1, %o0
8683:
869	jmp	%o4				! goto real handler
870	  mov	%g0, %o0			!
871
872	SET_SIZE(kcopy)
873#endif	/* lint */
874
875
876/*
877 * Copy a block of storage - must not overlap (from + len <= to).
878 * Registers: l6 - saved t_lofault
879 * (for short copies, o4 - saved t_lofault)
880 *
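881 * Copy a page of memory.
882 * Assumes double word alignment and a count >= 256.
 *
 * NOTE(review): the two lines above do not describe bcopy as implemented
 * below, which dispatches on any src/dst alignment and any count.  They
 * look like a leftover from a page-copy routine -- confirm and remove.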
883 */
884#if defined(lint)
885
886/* ARGSUSED */
887void
888bcopy(const void *from, void *to, size_t count)
889{}
890
891#else	/* lint */
892
/*
 * bcopy(from = %o0, to = %o1, count = %o2)
 *
 * Entry dispatch: a count of VIS_COPY_THRESHOLD or less branches to
 * .bcopy_small (the fall-through is the path to the larger cases).
 * Otherwise the low bits of (from ^ to) select one of
 * hw_copy_limit_1/2/4/8; a zero limit, or a count that does not exceed
 * the limit, also selects .bcopy_small.  Anything larger branches to
 * .bcopy_more.
 *
 * The small/medium path runs as a leaf routine (no save/restore):
 *	%o3	scratch / data being moved
 *	%o4	t_lofault value found on entry, possibly with TRAMP_FLAG
 * Every successful exit returns 0 in %o0.
 *
 * NOTE(review): .bcopy_small tests the saved handler with %icc while the
 * equivalent test in bcopy_more uses %ncc -- confirm this is intended.
 */
893	ENTRY(bcopy)
894
895	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
896	bleu,pt	%ncc, .bcopy_small		! go to larger cases
897	  xor	%o0, %o1, %o3			! are src, dst alignable?
898	btst	7, %o3				!
899	bz,pt	%ncc, .bcopy_8			! check for longword alignment
900	  nop
901	btst	1, %o3				!
902	bz,pt	%ncc, .bcopy_2			! check for half-word
903	  nop
904	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
905	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
906	tst	%o3
907	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
908	  cmp	%o2, %o3			! if length <= limit
909	bleu,pt	%ncc, .bcopy_small		! go to small copy
910	  nop
911	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
912	  nop
913.bcopy_2:
914	btst	3, %o3				!
915	bz,pt	%ncc, .bcopy_4			! check for word alignment
916	  nop
917	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
918	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
919	tst	%o3
920	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
921	  cmp	%o2, %o3			! if length <= limit
922	bleu,pt	%ncc, .bcopy_small		! go to small copy
923	  nop
924	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
925	  nop
926.bcopy_4:
927	! already checked longword, must be word aligned
928	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
929	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
930	tst	%o3
931	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
932	  cmp	%o2, %o3			! if length <= limit
933	bleu,pt	%ncc, .bcopy_small		! go to small copy
934	  nop
935	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
936	  nop
937.bcopy_8:
938	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
939	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
940	tst	%o3
941	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
942	  cmp	%o2, %o3			! if length <= limit
943	bleu,pt	%ncc, .bcopy_small		! go to small copy
944	  nop
945	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
946	  nop
947
/*
 * Leaf-routine copy.  %o4 = t_lofault on entry.  When it tests non-zero,
 * .sm_copyerr is installed as the new t_lofault and TRAMP_FLAG is or'ed
 * into %o4; otherwise t_lofault is left untouched.
 */
948	.align	16
949.bcopy_small:
950	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
951	tst	%o4
952	bz,pt	%icc, .sm_do_copy
953	  nop
954	sethi	%hi(.sm_copyerr), %o5
955	or	%o5, %lo(.sm_copyerr), %o5
956	membar	#Sync				! sync error barrier
957	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
958	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
/*
 * Common small-copy code, also entered from .kcopy_small.
 *	count <= SHORTCOPY	-> .bc_sm_left
 *	count >  CHKSIZE	-> .bc_med
 *	otherwise		-> .bc_sm_word if (src | dst) is 4-byte
 *				   aligned, else the byte loop below.
 * The "sub %o2, 3" at .bc_sm_movebytes sits in a non-annulled delay
 * slot, so it also executes on the taken branch to .bc_sm_word.
 */
959.sm_do_copy:
960	cmp	%o2, SHORTCOPY		! check for really short case
961	bleu,pt	%ncc, .bc_sm_left	!
962	  cmp	%o2, CHKSIZE		! check for medium length cases
963	bgu,pn	%ncc, .bc_med		!
964	  or	%o0, %o1, %o3		! prepare alignment check
965	andcc	%o3, 0x3, %g0		! test for alignment
966	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
967.bc_sm_movebytes:
968	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
969.bc_sm_notalign4:
970	ldub	[%o0], %o3		! read byte
971	stb	%o3, [%o1]		! write byte
972	subcc	%o2, 4, %o2		! reduce count by 4
973	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
974	add	%o0, 4, %o0		! advance SRC by 4
975	stb	%o3, [%o1 + 1]
976	ldub	[%o0 - 2], %o3
977	add	%o1, 4, %o1		! advance DST by 4
978	stb	%o3, [%o1 - 2]
979	ldub	[%o0 - 1], %o3
980	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
981	  stb	%o3, [%o1 - 1]
982	add	%o2, 3, %o2		! restore count
/*
 * Copy up to three remaining bytes.  The zero/one/two byte cases leave
 * through .bc_sm_exit; the three byte case restores t_lofault inline.
 */
983.bc_sm_left:
984	tst	%o2
985	bz,pt	%ncc, .bc_sm_exit	! check for zero length
986	  deccc	%o2			! reduce count for cc test
987	ldub	[%o0], %o3		! move one byte
988	bz,pt	%ncc, .bc_sm_exit
989	  stb	%o3, [%o1]
990	ldub	[%o0 + 1], %o3		! move another byte
991	deccc	%o2			! check for more
992	bz,pt	%ncc, .bc_sm_exit
993	  stb	%o3, [%o1 + 1]
994	ldub	[%o0 + 2], %o3		! move final byte
995	stb	%o3, [%o1 + 2]
996	membar	#Sync				! sync error barrier
997	andn	%o4, TRAMP_FLAG, %o4
998	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
999	retl
1000	  mov	%g0, %o0		! return 0
1001	.align	16
1002	nop				! instruction alignment
1003					! see discussion at start of file
/*
 * Word loop: two 4-byte words per pass.  .bc_sm_wordx is entered from
 * .bc_sm_word with the first word already loaded in %o3.  The loop falls
 * into the halfword/byte tail, which the .bc_med_* paths also branch to.
 */
1004.bc_sm_words:
1005	lduw	[%o0], %o3		! read word
1006.bc_sm_wordx:
1007	subcc	%o2, 8, %o2		! update count
1008	stw	%o3, [%o1]		! write word
1009	add	%o0, 8, %o0		! update SRC
1010	lduw	[%o0 - 4], %o3		! read word
1011	add	%o1, 8, %o1		! update DST
1012	bgt,pt	%ncc, .bc_sm_words	! loop til done
1013	  stw	%o3, [%o1 - 4]		! write word
1014	addcc	%o2, 7, %o2		! restore count
1015	bz,pt	%ncc, .bc_sm_exit
1016	  deccc	%o2
1017	bz,pt	%ncc, .bc_sm_byte
1018.bc_sm_half:
1019	  subcc	%o2, 2, %o2		! reduce count by 2
1020	add	%o0, 2, %o0		! advance SRC by 2
1021	lduh	[%o0 - 2], %o3		! read half word
1022	add	%o1, 2, %o1		! advance DST by 2
1023	bgt,pt	%ncc, .bc_sm_half	! loop til done
1024	  sth	%o3, [%o1 - 2]		! write half word
1025	addcc	%o2, 1, %o2		! restore count
1026	bz,pt	%ncc, .bc_sm_exit
1027	  nop
1028.bc_sm_byte:
1029	ldub	[%o0], %o3
1030	stb	%o3, [%o1]
1031	membar	#Sync				! sync error barrier
1032	andn	%o4, TRAMP_FLAG, %o4
1033	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1034	retl
1035	  mov	%g0, %o0		! return 0
1036
/*
 * Entry for 4-byte aligned src and dst; %o2 arrives as count - 3.
 * More than 7 bytes goes to the word loop with the first word preloaded.
 * Otherwise one word is copied, followed by up to three single bytes.
 */
1037.bc_sm_word:
1038	subcc	%o2, 4, %o2		! update count
1039	bgt,pt	%ncc, .bc_sm_wordx
1040	  lduw	[%o0], %o3		! read word
1041	addcc	%o2, 3, %o2		! restore count
1042	bz,pt	%ncc, .bc_sm_exit
1043	  stw	%o3, [%o1]		! write word
1044	deccc	%o2			! reduce count for cc test
1045	ldub	[%o0 + 4], %o3		! load one byte
1046	bz,pt	%ncc, .bc_sm_exit
1047	  stb	%o3, [%o1 + 4]		! store one byte
1048	ldub	[%o0 + 5], %o3		! load second byte
1049	deccc	%o2
1050	bz,pt	%ncc, .bc_sm_exit
1051	  stb	%o3, [%o1 + 5]		! store second byte
1052	ldub	[%o0 + 6], %o3		! load third byte
1053	stb	%o3, [%o1 + 6]		! store third byte
/*
 * Common exit: t_lofault is written back (with TRAMP_FLAG cleared) only
 * when %o4 is non-zero; a zero %o4 skips the store.
 */
1054.bc_sm_exit:
1055	brz,pt  %o4, .bc_sm_done
1056	  nop
1057	membar	#Sync				! sync error barrier
1058	andn	%o4, TRAMP_FLAG, %o4
1059	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1060.bc_sm_done:
1061	retl
1062	  mov	%g0, %o0		! return 0
1063
/*
 * Medium copies (count > CHKSIZE).  (src ^ dst) decides the widest unit
 * that can be used once src is aligned: bytes (.bc_sm_movebytes),
 * halfwords, words or long words.
 */
1064	.align 16
1065.bc_med:
1066	xor	%o0, %o1, %o3		! setup alignment check
1067	btst	1, %o3
1068	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
1069	  nop
1070	btst	3, %o3
1071	bnz,pt	%ncc, .bc_med_half	! halfword aligned
1072	  nop
1073	btst	7, %o3
1074	bnz,pt	%ncc, .bc_med_word	! word aligned
1075	  nop
/*
 * Long word case: byte-copy until src is 4-byte aligned, copy one word
 * if that is needed to reach 8-byte alignment, then move 32 bytes per
 * pass, then 8 bytes per pass, and finish in the shared tail.
 */
1076.bc_med_long:
1077	btst	3, %o0			! check for
1078	bz,pt	%ncc, .bc_med_long1	! word alignment
1079	  nop
1080.bc_med_long0:
1081	ldub	[%o0], %o3		! load one byte
1082	inc	%o0
1083	stb	%o3,[%o1]		! store byte
1084	inc	%o1
1085	btst	3, %o0
1086	bnz,pt	%ncc, .bc_med_long0
1087	  dec	%o2
1088.bc_med_long1:			! word aligned
1089	btst	7, %o0			! check for long word
1090	bz,pt	%ncc, .bc_med_long2
1091	  nop
1092	lduw	[%o0], %o3		! load word
1093	add	%o0, 4, %o0		! advance SRC by 4
1094	stw	%o3, [%o1]		! store word
1095	add	%o1, 4, %o1		! advance DST by 4
1096	sub	%o2, 4, %o2		! reduce count by 4
1097!
1098!  Now long word aligned and have at least 32 bytes to move
1099!
1100.bc_med_long2:
1101	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1102.bc_med_lmove:
1103	ldx	[%o0], %o3		! read long word
1104	stx	%o3, [%o1]		! write long word
1105	subcc	%o2, 32, %o2		! reduce count by 32
1106	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
1107	add	%o0, 32, %o0		! advance SRC by 32
1108	stx	%o3, [%o1 + 8]
1109	ldx	[%o0 - 16], %o3
1110	add	%o1, 32, %o1		! advance DST by 32
1111	stx	%o3, [%o1 - 16]
1112	ldx	[%o0 - 8], %o3
1113	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
1114	  stx	%o3, [%o1 - 8]
1115	addcc	%o2, 24, %o2		! restore count to long word offset
1116	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
1117	  nop
1118.bc_med_lword:
1119	ldx	[%o0], %o3		! read long word
1120	subcc	%o2, 8, %o2		! reduce count by 8
1121	stx	%o3, [%o1]		! write long word
1122	add	%o0, 8, %o0		! advance SRC by 8
1123	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
1124	  add	%o1, 8, %o1		! advance DST by 8
1125.bc_med_lextra:
1126	addcc	%o2, 7, %o2		! restore rest of count
1127	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1128	  deccc	%o2
1129	bz,pt	%ncc, .bc_sm_byte
1130	  nop
1131	ba,pt	%ncc, .bc_sm_half
1132	  nop
1133
/*
 * Word case: byte-copy until src is 4-byte aligned, then move 16 bytes
 * per pass, then 4 bytes per pass, and finish in the shared tail.
 */
1134	.align 16
1135.bc_med_word:
1136	btst	3, %o0			! check for
1137	bz,pt	%ncc, .bc_med_word1	! word alignment
1138	  nop
1139.bc_med_word0:
1140	ldub	[%o0], %o3		! load one byte
1141	inc	%o0
1142	stb	%o3,[%o1]		! store byte
1143	inc	%o1
1144	btst	3, %o0
1145	bnz,pt	%ncc, .bc_med_word0
1146	  dec	%o2
1147!
1148!  Now word aligned and have at least 36 bytes to move
1149!
1150.bc_med_word1:
1151	sub	%o2, 15, %o2		! adjust count to allow cc zero test
1152.bc_med_wmove:
1153	lduw	[%o0], %o3		! read word
1154	stw	%o3, [%o1]		! write word
1155	subcc	%o2, 16, %o2		! reduce count by 16
1156	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
1157	add	%o0, 16, %o0		! advance SRC by 16
1158	stw	%o3, [%o1 + 4]
1159	lduw	[%o0 - 8], %o3
1160	add	%o1, 16, %o1		! advance DST by 16
1161	stw	%o3, [%o1 - 8]
1162	lduw	[%o0 - 4], %o3
1163	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
1164	  stw	%o3, [%o1 - 4]
1165	addcc	%o2, 12, %o2		! restore count to word offset
1166	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
1167	  nop
1168.bc_med_word2:
1169	lduw	[%o0], %o3		! read word
1170	subcc	%o2, 4, %o2		! reduce count by 4
1171	stw	%o3, [%o1]		! write word
1172	add	%o0, 4, %o0		! advance SRC by 4
1173	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
1174	  add	%o1, 4, %o1		! advance DST by 4
1175.bc_med_wextra:
1176	addcc	%o2, 3, %o2		! restore rest of count
1177	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1178	  deccc	%o2
1179	bz,pt	%ncc, .bc_sm_byte
1180	  nop
1181	ba,pt	%ncc, .bc_sm_half
1182	  nop
1183
/*
 * Halfword case: copy one byte if src is odd, then move 8 bytes per
 * pass as four halfwords, and finish in the shared tail.
 */
1184	.align 16
1185.bc_med_half:
1186	btst	1, %o0			! check for
1187	bz,pt	%ncc, .bc_med_half1	! half word alignment
1188	  nop
1189	ldub	[%o0], %o3		! load one byte
1190	inc	%o0
1191	stb	%o3,[%o1]		! store byte
1192	inc	%o1
1193	dec	%o2
1194!
1195!  Now half word aligned and have at least 38 bytes to move
1196!
1197.bc_med_half1:
1198	sub	%o2, 7, %o2		! adjust count to allow cc zero test
1199.bc_med_hmove:
1200	lduh	[%o0], %o3		! read half word
1201	sth	%o3, [%o1]		! write half word
1202	subcc	%o2, 8, %o2		! reduce count by 8
1203	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
1204	add	%o0, 8, %o0		! advance SRC by 8
1205	sth	%o3, [%o1 + 2]
1206	lduh	[%o0 - 4], %o3
1207	add	%o1, 8, %o1		! advance DST by 8
1208	sth	%o3, [%o1 - 4]
1209	lduh	[%o0 - 2], %o3
1210	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
1211	  sth	%o3, [%o1 - 2]
1212	addcc	%o2, 7, %o2		! restore count
1213	bz,pt	%ncc, .bc_sm_exit
1214	  deccc	%o2
1215	bz,pt	%ncc, .bc_sm_byte
1216	  nop
1217	ba,pt	%ncc, .bc_sm_half
1218	  nop
1219
1220	SET_SIZE(bcopy)
1221
1222/*
1223 * The _more entry points are not intended to be used directly by
1224 * any caller from outside this file.  They are provided to allow
1225 * profiling and dtrace of the portions of the copy code that uses
1226 * the floating point registers.
1227 * This entry is particularly important as DTRACE (at least as of
1228 * 4/2004) does not support leaf functions.
1229 */
1230
/*
 * bcopy_more: large-copy path using FP registers and block stores.
 * bcopy branches here at .bcopy_more; .kcopy_more branches to .do_copy
 * after doing its own save and t_lofault setup.
 *
 * Register use after the save:
 *	%i0 (REALSRC)	current source address
 *	%i1 (DST)	current destination address
 *	%i2 (CNT)	bytes remaining
 *	%i3 (SRC)	source rounded down to 8 bytes in the block loop
 *			(byte scratch while DST is being aligned)
 *	%i5 (TMP)	scratch
 *	%l6		saved t_lofault, or'ed with TRAMP_FLAG when a
 *			handler was present on entry from bcopy and with
 *			FPUSED_FLAG once %fprs/%gsr have been saved
 *
 * The normal exit restores %gsr, the FP registers, %fprs and t_lofault
 * and returns 0 in the caller's %o0.
 */
1231	ENTRY(bcopy_more)
1232.bcopy_more:
1233	prefetch [%o0], #n_reads
1234	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1235	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
1236	tst	%l6
1237	bz,pt	%ncc, .do_copy
1238	  nop
1239	sethi	%hi(.copyerr), %o2
1240	or	%o2, %lo(.copyerr), %o2
1241	membar	#Sync				! sync error barrier
1242	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
1243	!
1244	! We've already captured whether t_lofault was zero on entry.
1245	! We need to mark ourselves as being from bcopy since both
1246	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1247	! and the saved lofault was zero, we won't reset lofault on
1248	! returning.
1249	!
1250	or	%l6, TRAMP_FLAG, %l6
1251
1252/*
1253 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1254 * Also, use of FP registers has been tested to be enabled
1255 */
/*
 * The original %fprs is saved on the stack.  If FPRS_FEF was clear the
 * annulled delay slot enables the FPU and the register save is skipped;
 * otherwise BST_FPQ1Q3_TOSTACK saves the FP registers used below.
 */
1256.do_copy:
1257	FP_NOMIGRATE(6, 7)
1258
1259	rd	%fprs, %o2		! check for unused fp
1260	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1261	btst	FPRS_FEF, %o2
1262	bz,a,pt	%icc, .do_blockcopy
1263	  wr	%g0, FPRS_FEF, %fprs
1264
1265	BST_FPQ1Q3_TOSTACK(%o2)
1266
1267.do_blockcopy:
1268	rd	%gsr, %o2
1269	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
1270	or	%l6, FPUSED_FLAG, %l6
1271
1272#define	REALSRC	%i0
1273#define	DST	%i1
1274#define	CNT	%i2
1275#define	SRC	%i3
1276#define	TMP	%i5
1277
/*
 * Byte-copy until DST is VIS_BLOCKSIZE aligned: four bytes per pass
 * while more than three are needed, then one byte at a time.
 */
1278	andcc	DST, VIS_BLOCKSIZE - 1, TMP
1279	bz,pt	%ncc, 2f
1280	  neg	TMP
1281	add	TMP, VIS_BLOCKSIZE, TMP
1282
1283	! TMP = bytes required to align DST on FP_BLOCK boundary
1284	! Using SRC as a tmp here
1285	cmp	TMP, 3
1286	bleu,pt	%ncc, 1f
1287	  sub	CNT,TMP,CNT		! adjust main count
1288	sub	TMP, 3, TMP		! adjust for end of loop test
1289.bc_blkalign:
1290	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
1291	stb	SRC, [DST]
1292	subcc	TMP, 4, TMP
1293	ldub	[REALSRC + 1], SRC
1294	add	REALSRC, 4, REALSRC
1295	stb	SRC, [DST + 1]
1296	ldub	[REALSRC - 2], SRC
1297	add	DST, 4, DST
1298	stb	SRC, [DST - 2]
1299	ldub	[REALSRC - 1], SRC
1300	bgu,pt	%ncc, .bc_blkalign
1301	  stb	SRC, [DST - 1]
1302
1303	addcc	TMP, 3, TMP		! restore count adjustment
1304	bz,pt	%ncc, 2f		! no bytes left?
1305	  nop
13061:	ldub	[REALSRC], SRC
1307	inc	REALSRC
1308	inc	DST
1309	deccc	TMP
1310	bgu	%ncc, 1b
1311	  stb	SRC, [DST - 1]
1312
/*
 * Software-pipelined block copy.  The code at 2: primes the pipeline by
 * loading the first eight doublewords from SRC and starting to realign
 * them with faligndata into %f32-%f42.  Each pass of the loop at 1:
 * finishes %f44/%f46, writes %f32-%f46 with one block store, and loads
 * and realigns the next eight doublewords.
 */
13132:
1314	membar	#StoreLoad
1315	andn	REALSRC, 0x7, SRC
1316
1317	! SRC - 8-byte aligned
1318	! DST - 64-byte aligned
1319	ldd	[SRC], %f0
1320	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1321	alignaddr REALSRC, %g0, %g0
1322	ldd	[SRC + 0x08], %f2
1323	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1324	faligndata %f0, %f2, %f32
1325	ldd	[SRC + 0x10], %f4
1326	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1327	faligndata %f2, %f4, %f34
1328	ldd	[SRC + 0x18], %f6
1329	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1330	faligndata %f4, %f6, %f36
1331	ldd	[SRC + 0x20], %f8
1332	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1333	faligndata %f6, %f8, %f38
1334	ldd	[SRC + 0x28], %f10
1335	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1336	faligndata %f8, %f10, %f40
1337	ldd	[SRC + 0x30], %f12
1338	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1339	faligndata %f10, %f12, %f42
1340	ldd	[SRC + 0x38], %f14
1341	ldd	[SRC + VIS_BLOCKSIZE], %f0
1342	sub	CNT, VIS_BLOCKSIZE, CNT
1343	add	SRC, VIS_BLOCKSIZE, SRC
1344	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1345	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1346	ba,pt	%ncc, 1f
1347	  prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1348	.align	32
13491:
1350	ldd	[SRC + 0x08], %f2
1351	faligndata %f12, %f14, %f44
1352	ldd	[SRC + 0x10], %f4
1353	faligndata %f14, %f0, %f46
1354	stda	%f32, [DST]ASI_BLK_P
1355	ldd	[SRC + 0x18], %f6
1356	faligndata %f0, %f2, %f32
1357	ldd	[SRC + 0x20], %f8
1358	faligndata %f2, %f4, %f34
1359	ldd	[SRC + 0x28], %f10
1360	faligndata %f4, %f6, %f36
1361	ldd	[SRC + 0x30], %f12
1362	faligndata %f6, %f8, %f38
1363	sub	CNT, VIS_BLOCKSIZE, CNT
1364	ldd	[SRC + 0x38], %f14
1365	faligndata %f8, %f10, %f40
1366	add	DST, VIS_BLOCKSIZE, DST
1367	ldd	[SRC + VIS_BLOCKSIZE], %f0
1368	faligndata %f10, %f12, %f42
1369	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1370	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1371	add	SRC, VIS_BLOCKSIZE, SRC
1372	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1373	cmp	CNT, VIS_BLOCKSIZE + 8
1374	bgu,pt	%ncc, 1b
1375	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1376
/*
 * Loop exit.  One block is still held in registers.  If exactly
 * VIS_BLOCKSIZE bytes remain and REALSRC is 8-byte aligned, the code at
 * 2: stores the held block and copies the final block with plain
 * register moves (fsrc1).  Otherwise the first 3: stores the held block
 * and the second 3: copies the remaining CNT bytes one at a time.
 */
1377	! only if REALSRC & 0x7 is 0
1378	cmp	CNT, VIS_BLOCKSIZE
1379	bne	%ncc, 3f
1380	  andcc	REALSRC, 0x7, %g0
1381	bz,pt	%ncc, 2f
1382	  nop
13833:
1384	faligndata %f12, %f14, %f44
1385	faligndata %f14, %f0, %f46
1386	stda	%f32, [DST]ASI_BLK_P
1387	add	DST, VIS_BLOCKSIZE, DST
1388	ba,pt	%ncc, 3f
1389	  nop
13902:
1391	ldd	[SRC + 0x08], %f2
1392	fsrc1	%f12, %f44
1393	ldd	[SRC + 0x10], %f4
1394	fsrc1	%f14, %f46
1395	stda	%f32, [DST]ASI_BLK_P
1396	ldd	[SRC + 0x18], %f6
1397	fsrc1	%f0, %f32
1398	ldd	[SRC + 0x20], %f8
1399	fsrc1	%f2, %f34
1400	ldd	[SRC + 0x28], %f10
1401	fsrc1	%f4, %f36
1402	ldd	[SRC + 0x30], %f12
1403	fsrc1	%f6, %f38
1404	ldd	[SRC + 0x38], %f14
1405	fsrc1	%f8, %f40
1406	sub	CNT, VIS_BLOCKSIZE, CNT
1407	add	DST, VIS_BLOCKSIZE, DST
1408	add	SRC, VIS_BLOCKSIZE, SRC
1409	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1410	fsrc1	%f10, %f42
1411	fsrc1	%f12, %f44
1412	fsrc1	%f14, %f46
1413	stda	%f32, [DST]ASI_BLK_P
1414	add	DST, VIS_BLOCKSIZE, DST
1415	ba,a,pt	%ncc, .bcb_exit
1416	  nop
1417
14183:	tst	CNT
1419	bz,a,pt	%ncc, .bcb_exit
1420	  nop
1421
14225:	ldub	[REALSRC], TMP
1423	inc	REALSRC
1424	inc	DST
1425	deccc	CNT
1426	bgu	%ncc, 5b
1427	  stb	TMP, [DST - 1]
/*
 * Exit: restore %gsr; reload the saved FP registers if FPRS_FEF was set
 * in the saved %fprs, otherwise run FZEROQ1Q3 (presumably clears the
 * registers used here -- macro not visible in this section); restore
 * %fprs; then write the saved t_lofault back with the flag bits removed.
 */
1428.bcb_exit:
1429	membar	#Sync
1430
1431	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
1432	wr	%o2, 0, %gsr
1433
1434	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1435	btst	FPRS_FEF, %o3
1436	bz,pt	%icc, 4f
1437	  nop
1438
1439	BLD_FPQ1Q3_FROMSTACK(%o2)
1440
1441	ba,pt	%ncc, 2f
1442	  wr	%o3, 0, %fprs		! restore fprs
14434:
1444	FZEROQ1Q3
1445	wr	%o3, 0, %fprs		! restore fprs
14462:
1447	membar	#Sync				! sync error barrier
1448	andn	%l6, MASK_FLAGS, %l6
1449	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1450	FP_ALLOWMIGRATE(5, 6)
1451	ret
1452	  restore	%g0, 0, %o0
1453
1454	SET_SIZE(bcopy_more)
1455
1456#endif	/* lint */
1457
1458/*
1459 * Block copy with possibly overlapped operands.
1460 */
1461
1462#if defined(lint)
1463
1464/*ARGSUSED*/
1465void
1466ovbcopy(const void *from, void *to, size_t count)
1467{}
1468
1469#else	/* lint */
1470
/*
 * ovbcopy(from = %o0, to = %o1, count = %o2) -- leaf routine.
 *
 * A zero count returns immediately.  Otherwise %o3 = |from - to|; when
 * count <= %o3 the regions cannot overlap and control branches straight
 * into bcopy with the arguments still in %o0-%o2.  The "cmp %o0, %o1"
 * sits in that branch's delay slot, so it also runs on the way to bcopy;
 * bcopy sets the condition codes itself before testing them.
 *
 * Overlapping regions are copied one byte at a time: backwards (highest
 * offset first) when from < to, forwards otherwise.
 */
1471	ENTRY(ovbcopy)
1472	tst	%o2			! check count
1473	bgu,a	%ncc, 1f		! nothing to do or bad arguments
1474	  subcc	%o0, %o1, %o3		! difference of from and to address
1475
1476	retl				! return
1477	  nop
14781:
1479	bneg,a	%ncc, 2f
1480	  neg	%o3			! if < 0, make it positive
14812:	cmp	%o2, %o3		! cmp size and abs(from - to)
1482	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1483	  .empty				!   no overlap
1484	  cmp	%o0, %o1		! compare from and to addresses
1485	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1486	  nop
1487	!
1488	! Copy forwards.
1489	!
1490.ov_fwd:
1491	ldub	[%o0], %o3		! read from address
1492	inc	%o0			! inc from address
1493	stb	%o3, [%o1]		! write to address
1494	deccc	%o2			! dec count
1495	bgu	%ncc, .ov_fwd		! loop till done
1496	  inc	%o1			! inc to address
1497
1498	retl				! return
1499	  nop
1500	!
1501	! Copy backwards.
1502	!
/*
 * %o2 is decremented first and then used as the byte offset, so the
 * bytes at offsets count-1 down to 0 are moved in that order.
 */
1503.ov_bkwd:
1504	deccc	%o2			! dec count
1505	ldub	[%o0 + %o2], %o3	! get byte at end of src
1506	bgu	%ncc, .ov_bkwd		! loop till done
1507	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1508
1509	retl				! return
1510	  nop
1511
1512	SET_SIZE(ovbcopy)
1513
1514#endif	/* lint */
1515
1516
1517/*
1518 * hwblkpagecopy()
1519 *
1520 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1521 * has already disabled kernel preemption and has checked
1522 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1523 */
1524#ifdef lint
1525/*ARGSUSED*/
1526void
1527hwblkpagecopy(const void *src, void *dst)
1528{ }
1529#else /* lint */
/*
 * hwblkpagecopy(src, dst): after the save, src is %i0 and dst is %i1.
 *
 * Copies PAGESIZE bytes using 8-byte loads and block stores.  REALSRC,
 * DST, CNT and SRC are the register aliases #defined in bcopy_more.
 * Differences from bcopy_more visible here:
 *	- no t_lofault handler is installed;
 *	- %gsr is neither saved nor used (fmovd/fsrc1 move the data, there
 *	  is no alignaddr/faligndata);
 *	- the source is read with ldd straight from the caller's address
 *	  and the destination is block-stored with no alignment step, so
 *	  both are presumably page aligned -- TODO confirm with callers.
 * The original %fprs is kept in %l0 rather than on the stack.
 */
1530	ENTRY(hwblkpagecopy)
1531	! get another window w/space for three aligned blocks of saved fpregs
1532	prefetch [%o0], #n_reads
1533	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1534
1535	! %i0 - source address (arg)
1536	! %i1 - destination address (arg)
1537	! %i2 - length of region (not arg)
1538	! %l0 - saved fprs
1539	! %l1 - pointer to saved fpregs
1540
1541	rd	%fprs, %l0		! check for unused fp
1542	btst	FPRS_FEF, %l0
1543	bz,a,pt	%icc, 1f
1544	  wr	%g0, FPRS_FEF, %fprs
1545
1546	BST_FPQ1Q3_TOSTACK(%l1)
1547
/*
 * Prime the pipeline: load the first eight doublewords, start moving
 * them into %f32-%f42 and fetch the first doubleword of the next block.
 */
15481:	set	PAGESIZE, CNT
1549	mov	REALSRC, SRC
1550
1551	ldd	[SRC], %f0
1552	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1553	ldd	[SRC + 0x08], %f2
1554	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1555	fmovd	%f0, %f32
1556	ldd	[SRC + 0x10], %f4
1557	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1558	fmovd	%f2, %f34
1559	ldd	[SRC + 0x18], %f6
1560	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1561	fmovd	%f4, %f36
1562	ldd	[SRC + 0x20], %f8
1563	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1564	fmovd	%f6, %f38
1565	ldd	[SRC + 0x28], %f10
1566	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1567	fmovd	%f8, %f40
1568	ldd	[SRC + 0x30], %f12
1569	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1570	fmovd	%f10, %f42
1571	ldd	[SRC + 0x38], %f14
1572	ldd	[SRC + VIS_BLOCKSIZE], %f0
1573	sub	CNT, VIS_BLOCKSIZE, CNT
1574	add	SRC, VIS_BLOCKSIZE, SRC
1575	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1576	ba,pt	%ncc, 2f
1577	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
/*
 * Main loop: each pass block-stores the previously loaded block from
 * %f32-%f46 and loads the next one.  It runs while CNT is greater than
 * VIS_BLOCKSIZE + 8; the final source block is handled after the loop
 * without the look-ahead load of [SRC + VIS_BLOCKSIZE].
 */
1578	.align	32
15792:
1580	ldd	[SRC + 0x08], %f2
1581	fmovd	%f12, %f44
1582	ldd	[SRC + 0x10], %f4
1583	fmovd	%f14, %f46
1584	stda	%f32, [DST]ASI_BLK_P
1585	ldd	[SRC + 0x18], %f6
1586	fmovd	%f0, %f32
1587	ldd	[SRC + 0x20], %f8
1588	fmovd	%f2, %f34
1589	ldd	[SRC + 0x28], %f10
1590	fmovd	%f4, %f36
1591	ldd	[SRC + 0x30], %f12
1592	fmovd	%f6, %f38
1593	ldd	[SRC + 0x38], %f14
1594	fmovd	%f8, %f40
1595	ldd	[SRC + VIS_BLOCKSIZE], %f0
1596	fmovd	%f10, %f42
1597	sub	CNT, VIS_BLOCKSIZE, CNT
1598	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1599	add	DST, VIS_BLOCKSIZE, DST
1600	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1601	add	SRC, VIS_BLOCKSIZE, SRC
1602	cmp	CNT, VIS_BLOCKSIZE + 8
1603	bgu,pt	%ncc, 2b
1604	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1605
1606	! trailing block
1607	ldd	[SRC + 0x08], %f2
1608	fsrc1	%f12, %f44
1609	ldd	[SRC + 0x10], %f4
1610	fsrc1	%f14, %f46
1611	stda	%f32, [DST]ASI_BLK_P
1612	ldd	[SRC + 0x18], %f6
1613	fsrc1	%f0, %f32
1614	ldd	[SRC + 0x20], %f8
1615	fsrc1	%f2, %f34
1616	ldd	[SRC + 0x28], %f10
1617	fsrc1	%f4, %f36
1618	ldd	[SRC + 0x30], %f12
1619	fsrc1	%f6, %f38
1620	ldd	[SRC + 0x38], %f14
1621	fsrc1	%f8, %f40
1622	sub	CNT, VIS_BLOCKSIZE, CNT
1623	add	DST, VIS_BLOCKSIZE, DST
1624	add	SRC, VIS_BLOCKSIZE, SRC
1625	fsrc1	%f10, %f42
1626	fsrc1	%f12, %f44
1627	fsrc1	%f14, %f46
1628	stda	%f32, [DST]ASI_BLK_P
1629
1630	membar	#Sync
1631
/*
 * Put the FP state back: reload the saved registers if FPRS_FEF was set
 * on entry (%l0), otherwise run FZEROQ1Q3; then restore %fprs.
 * NOTE(review): the save above passes %l1 to its macro and the reload
 * passes %l3 -- presumably just a scratch register; confirm against the
 * macro definitions.
 */
1632	btst	FPRS_FEF, %l0
1633	bz,pt	%icc, 2f
1634	  nop
1635
1636	BLD_FPQ1Q3_FROMSTACK(%l3)
1637	ba	3f
1638	  nop
1639
16402:	FZEROQ1Q3
1641
16423:	wr	%l0, 0, %fprs		! restore fprs
1643	ret
1644	  restore	%g0, 0, %o0
1645
1646	SET_SIZE(hwblkpagecopy)
1647#endif	/* lint */
1648
1649
1650/*
1651 * Transfer data to and from user space -
1652 * Note that these routines can cause faults
1653 * It is assumed that the kernel has nothing at
1654 * less than KERNELBASE in the virtual address space.
1655 *
1656 * Note that copyin(9F) and copyout(9F) are part of the
1657 * DDI/DKI which specifies that they return '-1' on "errors."
1658 *
1659 * Sigh.
1660 *
1661 * So there are two extremely similar routines - xcopyin() and xcopyout()
1662 * which return the errno that we've faithfully computed.  This
1663 * allows other callers (e.g. uiomove(9F)) to work correctly.
1664 * Given that these are used pretty heavily, we expand the calling
1665 * sequences inline for all flavours (rather than making wrappers).
1666 *
1667 * There are also stub routines for xcopyout_little and xcopyin_little,
1668 * which currently are intended to handle requests of <= 16 bytes from
1669 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1670 * is left as an exercise...
1671 */
1672
1673/*
1674 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1675 *
1676 * General theory of operation:
1677 *
1678 * The only difference between copy{in,out} and
1679 * xcopy{in,out} is in the error handling routine they invoke
1680 * when a memory access error occurs. xcopyOP returns the errno
1681 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1682 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1683 * if they are called with a fault handler already in place. That flag
1684 * causes the default handlers to trampoline to the previous handler
1685 * upon an error.
1686 *
1687 * None of the copyops routines grab a window until it's decided that
1688 * we need to do a HW block copy operation. This saves a window
1689 * spill/fill when we're called during socket ops. The typical IO
1690 * path won't cause spill/fill traps.
1691 *
1692 * This code uses a set of 4 limits for the maximum size that will
1693 * be copied given a particular input/output address alignment.
1694 * If the value for a particular limit is zero, the copy will be performed
1695 * by the plain copy loops rather than FPBLK.
1696 *
1697 * See the description of bcopy above for more details of the
1698 * data copying algorithm and the default limits.
1699 *
1700 */
1701
1702/*
1703 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1704 */
1705
1706#if defined(lint)
1707
1708
1709#else	/* lint */
1710/*
1711 * We save the arguments in the following registers in case of a fault:
1712 *	kaddr - %l1
1713 *	uaddr - %l2
1714 *	count - %l3
1715 */
1716#define SAVE_SRC	%l1
1717#define SAVE_DST	%l2
1718#define SAVE_COUNT	%l3
1719
1720#define SM_SAVE_SRC		%g4
1721#define SM_SAVE_DST		%g5
1722#define SM_SAVE_COUNT		%o5
1723#define ERRNO		%l5
1724
1725
1726#define REAL_LOFAULT	%l4
1727/*
1728 * Generic copyio fault handler.  This is the first line of defense when a
1729 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1730 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1731 * This allows us to share common code for all the flavors of the copy
1732 * operations, including the _noerr versions.
1733 *
1734 * Note that this function will restore the original input parameters before
1735 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1736 * member of the t_copyop structure, if needed.
1737 */
/*
 * Register state expected on entry (set up by the faulting copy routine):
 *	%g1			errno, copied to ERRNO (%l5) below
 *	%l6			saved t_lofault, with FPUSED_FLAG or'ed in
 *				when FP state was saved on the stack
 *	SAVE_SRC/DST/COUNT	original arguments (%l1/%l2/%l3)
 *	REAL_LOFAULT (%l4)	handler to jump to once state is restored
 *
 * When FPUSED_FLAG is set, %gsr and %fprs are restored from the stack;
 * the FP registers are reloaded with BLD_FPQ2Q4_FROMSTACK if FPRS_FEF was
 * set in the saved %fprs, otherwise FZEROQ2Q4 is run instead.
 */
1738	ENTRY(copyio_fault)
1739	membar	#Sync
1740	mov	%g1,ERRNO			! save errno in ERRNO
1741	btst	FPUSED_FLAG, %l6
1742	bz	%ncc, 1f
1743	  nop
1744
1745	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1746	wr	%o2, 0, %gsr    	! restore gsr
1747
1748	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1749	btst	FPRS_FEF, %o3
1750	bz,pt	%icc, 4f
1751	  nop
1752
1753	BLD_FPQ2Q4_FROMSTACK(%o2)
1754
1755	ba,pt	%ncc, 1f
1756	  wr	%o3, 0, %fprs   	! restore fprs
1757
17584:
1759	FZEROQ2Q4
1760	wr	%o3, 0, %fprs   	! restore fprs
1761
/*
 * Clear FPUSED_FLAG, reinstate the previous t_lofault, then reload the
 * original arguments into %i0-%i2 and jump to the real handler (the
 * count is loaded in the jmp delay slot).
 */
17621:
1763	andn	%l6, FPUSED_FLAG, %l6
1764	membar	#Sync
1765	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1766	FP_ALLOWMIGRATE(5, 6)
1767
1768	mov	SAVE_SRC, %i0
1769	mov	SAVE_DST, %i1
1770	jmp	REAL_LOFAULT
1771	  mov	SAVE_COUNT, %i2
1772
1773	SET_SIZE(copyio_fault)
1774
1775
1776#endif
1777
1778#if defined(lint)
1779
1780/*ARGSUSED*/
1781int
1782copyout(const void *kaddr, void *uaddr, size_t count)
1783{ return (0); }
1784
1785#else	/* lint */
1786
1787	ENTRY(copyout)
1788
1789	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
1790	bleu,pt	%ncc, .copyout_small		! go to larger cases
1791	  xor	%o0, %o1, %o3			! are src, dst alignable?
1792	btst	7, %o3				!
1793	bz,pt	%ncc, .copyout_8		! check for longword alignment
1794	  nop
1795	btst	1, %o3				!
1796	bz,pt	%ncc, .copyout_2		! check for half-word
1797	  nop
1798	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
1799	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1800	tst	%o3
1801	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1802	  cmp	%o2, %o3			! if length <= limit
1803	bleu,pt	%ncc, .copyout_small		! go to small copy
1804	  nop
1805	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1806	  nop
1807.copyout_2:
1808	btst	3, %o3				!
1809	bz,pt	%ncc, .copyout_4		! check for word alignment
1810	  nop
1811	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
1812	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1813	tst	%o3
1814	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1815	  cmp	%o2, %o3			! if length <= limit
1816	bleu,pt	%ncc, .copyout_small		! go to small copy
1817	  nop
1818	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1819	  nop
1820.copyout_4:
1821	! already checked longword, must be word aligned
1822	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
1823	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1824	tst	%o3
1825	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1826	  cmp	%o2, %o3			! if length <= limit
1827	bleu,pt	%ncc, .copyout_small		! go to small copy
1828	  nop
1829	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1830	  nop
1831.copyout_8:
1832	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
1833	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1834	tst	%o3
1835	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1836	  cmp	%o2, %o3			! if length <= limit
1837	bleu,pt	%ncc, .copyout_small		! go to small copy
1838	  nop
1839	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1840	  nop
1841
1842	.align	16
1843	nop				! instruction alignment
1844					! see discussion at start of file
1845.copyout_small:
1846	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
1847	or	%o5, %lo(.sm_copyout_err), %o5
1848	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
1849	membar	#Sync				! sync error barrier
1850	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
1851.sm_do_copyout:
1852	mov	%o0, SM_SAVE_SRC
1853	mov	%o1, SM_SAVE_DST
1854	cmp	%o2, SHORTCOPY		! check for really short case
1855	bleu,pt	%ncc, .co_sm_left	!
1856	  mov	%o2, SM_SAVE_COUNT
1857	cmp	%o2, CHKSIZE		! check for medium length cases
1858	bgu,pn	%ncc, .co_med		!
1859	  or	%o0, %o1, %o3		! prepare alignment check
1860	andcc	%o3, 0x3, %g0		! test for alignment
1861	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
1862.co_sm_movebytes:
1863	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
1864.co_sm_notalign4:
1865	ldub	[%o0], %o3		! read byte
1866	subcc	%o2, 4, %o2		! reduce count by 4
1867	stba	%o3, [%o1]ASI_USER	! write byte
1868	inc	%o1			! advance DST by 1
1869	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
1870	add	%o0, 4, %o0		! advance SRC by 4
1871	stba	%o3, [%o1]ASI_USER
1872	inc	%o1			! advance DST by 1
1873	ldub	[%o0 - 2], %o3
1874	stba	%o3, [%o1]ASI_USER
1875	inc	%o1			! advance DST by 1
1876	ldub	[%o0 - 1], %o3
1877	stba	%o3, [%o1]ASI_USER
1878	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
1879	  inc	%o1			! advance DST by 1
1880	add	%o2, 3, %o2		! restore count
1881.co_sm_left:
1882	tst	%o2
1883	bz,pt	%ncc, .co_sm_exit	! check for zero length
1884	  nop
1885	ldub	[%o0], %o3		! load one byte
1886	deccc	%o2			! reduce count for cc test
1887	bz,pt	%ncc, .co_sm_exit
1888	  stba	%o3,[%o1]ASI_USER	! store one byte
1889	ldub	[%o0 + 1], %o3		! load second byte
1890	deccc	%o2
1891	inc	%o1
1892	bz,pt	%ncc, .co_sm_exit
1893	  stba	%o3,[%o1]ASI_USER	! store second byte
1894	ldub	[%o0 + 2], %o3		! load third byte
1895	inc	%o1
1896	stba	%o3,[%o1]ASI_USER	! store third byte
1897	membar	#Sync				! sync error barrier
1898	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1899	retl
1900	  mov	%g0, %o0		! return 0
1901	.align	16
1902.co_sm_words:
1903	lduw	[%o0], %o3		! read word
1904.co_sm_wordx:
1905	subcc	%o2, 8, %o2		! update count
1906	stwa	%o3, [%o1]ASI_USER	! write word
1907	add	%o0, 8, %o0		! update SRC
1908	lduw	[%o0 - 4], %o3		! read word
1909	add	%o1, 4, %o1		! update DST
1910	stwa	%o3, [%o1]ASI_USER	! write word
1911	bgt,pt	%ncc, .co_sm_words	! loop til done
1912	  add	%o1, 4, %o1		! update DST
1913	addcc	%o2, 7, %o2		! restore count
1914	bz,pt	%ncc, .co_sm_exit
1915	  nop
1916	deccc	%o2
1917	bz,pt	%ncc, .co_sm_byte
1918.co_sm_half:
1919	  subcc	%o2, 2, %o2		! reduce count by 2
1920	lduh	[%o0], %o3		! read half word
1921	add	%o0, 2, %o0		! advance SRC by 2
1922	stha	%o3, [%o1]ASI_USER	! write half word
1923	bgt,pt	%ncc, .co_sm_half	! loop til done
1924	  add	%o1, 2, %o1		! advance DST by 2
1925	addcc	%o2, 1, %o2		! restore count
1926	bz,pt	%ncc, .co_sm_exit
1927	  nop
1928.co_sm_byte:
1929	ldub	[%o0], %o3
1930	stba	%o3, [%o1]ASI_USER
1931	membar	#Sync				! sync error barrier
1932	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1933	retl
1934	  mov	%g0, %o0		! return 0
1935	.align 16
1936.co_sm_word:
1937	subcc	%o2, 4, %o2		! update count
1938	bgt,pt	%ncc, .co_sm_wordx
1939	  lduw	[%o0], %o3		! read word
1940	addcc	%o2, 3, %o2		! restore count
1941	bz,pt	%ncc, .co_sm_exit
1942	  stwa	%o3, [%o1]ASI_USER	! write word
1943	deccc	%o2			! reduce count for cc test
1944	ldub	[%o0 + 4], %o3		! load one byte
1945	add	%o1, 4, %o1
1946	bz,pt	%ncc, .co_sm_exit
1947	  stba	%o3, [%o1]ASI_USER	! store one byte
1948	ldub	[%o0 + 5], %o3		! load second byte
1949	deccc	%o2
1950	inc	%o1
1951	bz,pt	%ncc, .co_sm_exit
1952	  stba	%o3, [%o1]ASI_USER	! store second byte
1953	ldub	[%o0 + 6], %o3		! load third byte
1954	inc	%o1
1955	stba	%o3, [%o1]ASI_USER	! store third byte
1956.co_sm_exit:
1957	  membar	#Sync				! sync error barrier
1958	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1959	retl
1960	  mov	%g0, %o0		! return 0
1961
1962	.align 16
1963.co_med:
1964	xor	%o0, %o1, %o3		! setup alignment check
1965	btst	1, %o3
1966	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
1967	  nop
1968	btst	3, %o3
1969	bnz,pt	%ncc, .co_med_half	! halfword aligned
1970	  nop
1971	btst	7, %o3
1972	bnz,pt	%ncc, .co_med_word	! word aligned
1973	  nop
1974.co_med_long:
1975	btst	3, %o0			! check for
1976	bz,pt	%ncc, .co_med_long1	! word alignment
1977	  nop
1978.co_med_long0:
1979	ldub	[%o0], %o3		! load one byte
1980	inc	%o0
1981	stba	%o3,[%o1]ASI_USER	! store byte
1982	inc	%o1
1983	btst	3, %o0
1984	bnz,pt	%ncc, .co_med_long0
1985	  dec	%o2
1986.co_med_long1:			! word aligned
1987	btst	7, %o0			! check for long word
1988	bz,pt	%ncc, .co_med_long2
1989	  nop
1990	lduw	[%o0], %o3		! load word
1991	add	%o0, 4, %o0		! advance SRC by 4
1992	stwa	%o3, [%o1]ASI_USER	! store word
1993	add	%o1, 4, %o1		! advance DST by 4
1994	sub	%o2, 4, %o2		! reduce count by 4
1995!
1996!  Now long word aligned and have at least 32 bytes to move
1997!
1998.co_med_long2:
1999	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2000	sub	%o1, 8, %o1		! adjust pointer to allow store in
2001					! branch delay slot instead of add
2002.co_med_lmove:
2003	add	%o1, 8, %o1		! advance DST by 8
2004	ldx	[%o0], %o3		! read long word
2005	subcc	%o2, 32, %o2		! reduce count by 32
2006	stxa	%o3, [%o1]ASI_USER	! write long word
2007	add	%o1, 8, %o1		! advance DST by 8
2008	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
2009	add	%o0, 32, %o0		! advance SRC by 32
2010	stxa	%o3, [%o1]ASI_USER
2011	ldx	[%o0 - 16], %o3
2012	add	%o1, 8, %o1		! advance DST by 8
2013	stxa	%o3, [%o1]ASI_USER
2014	ldx	[%o0 - 8], %o3
2015	add	%o1, 8, %o1		! advance DST by 8
2016	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
2017	  stxa	%o3, [%o1]ASI_USER
2018	add	%o1, 8, %o1		! advance DST by 8
2019	addcc	%o2, 24, %o2		! restore count to long word offset
2020	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
2021	  nop
2022.co_med_lword:
2023	ldx	[%o0], %o3		! read long word
2024	subcc	%o2, 8, %o2		! reduce count by 8
2025	stxa	%o3, [%o1]ASI_USER	! write long word
2026	add	%o0, 8, %o0		! advance SRC by 8
2027	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
2028	  add	%o1, 8, %o1		! advance DST by 8
2029.co_med_lextra:
2030	addcc	%o2, 7, %o2		! restore rest of count
2031	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2032	  deccc	%o2
2033	bz,pt	%ncc, .co_sm_byte
2034	  nop
2035	ba,pt	%ncc, .co_sm_half
2036	  nop
2037
2038	.align 16
2039	nop				! instruction alignment
2040					! see discussion at start of file
2041.co_med_word:
2042	btst	3, %o0			! check for
2043	bz,pt	%ncc, .co_med_word1	! word alignment
2044	  nop
2045.co_med_word0:
2046	ldub	[%o0], %o3		! load one byte
2047	inc	%o0
2048	stba	%o3,[%o1]ASI_USER	! store byte
2049	inc	%o1
2050	btst	3, %o0
2051	bnz,pt	%ncc, .co_med_word0
2052	  dec	%o2
2053!
2054!  Now word aligned and have at least 36 bytes to move
2055!
2056.co_med_word1:
2057	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2058.co_med_wmove:
2059	lduw	[%o0], %o3		! read word
2060	subcc	%o2, 16, %o2		! reduce count by 16
2061	stwa	%o3, [%o1]ASI_USER	! write word
2062	add	%o1, 4, %o1		! advance DST by 4
2063	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
2064	add	%o0, 16, %o0		! advance SRC by 16
2065	stwa	%o3, [%o1]ASI_USER
2066	add	%o1, 4, %o1		! advance DST by 4
2067	lduw	[%o0 - 8], %o3
2068	stwa	%o3, [%o1]ASI_USER
2069	add	%o1, 4, %o1		! advance DST by 4
2070	lduw	[%o0 - 4], %o3
2071	stwa	%o3, [%o1]ASI_USER
2072	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
2073	  add	%o1, 4, %o1		! advance DST by 4
2074	addcc	%o2, 12, %o2		! restore count to word offset
2075	ble,pt	%ncc, .co_med_wextra	! check for more words to move
2076	  nop
2077.co_med_word2:
2078	lduw	[%o0], %o3		! read word
2079	subcc	%o2, 4, %o2		! reduce count by 4
2080	stwa	%o3, [%o1]ASI_USER	! write word
2081	add	%o0, 4, %o0		! advance SRC by 4
2082	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
2083	  add	%o1, 4, %o1		! advance DST by 4
2084.co_med_wextra:
2085	addcc	%o2, 3, %o2		! restore rest of count
2086	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2087	  deccc	%o2
2088	bz,pt	%ncc, .co_sm_byte
2089	  nop
2090	ba,pt	%ncc, .co_sm_half
2091	  nop
2092
2093	.align 16
2094	nop				! instruction alignment
2095	nop				! see discussion at start of file
2096	nop
2097.co_med_half:
2098	btst	1, %o0			! check for
2099	bz,pt	%ncc, .co_med_half1	! half word alignment
2100	  nop
2101	ldub	[%o0], %o3		! load one byte
2102	inc	%o0
2103	stba	%o3,[%o1]ASI_USER	! store byte
2104	inc	%o1
2105	dec	%o2
2106!
2107!  Now half word aligned and have at least 38 bytes to move
2108!
2109.co_med_half1:
2110	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2111.co_med_hmove:
2112	lduh	[%o0], %o3		! read half word
2113	subcc	%o2, 8, %o2		! reduce count by 8
2114	stha	%o3, [%o1]ASI_USER	! write half word
2115	add	%o1, 2, %o1		! advance DST by 2
2116	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
2117	add	%o0, 8, %o0		! advance SRC by 8
2118	stha	%o3, [%o1]ASI_USER
2119	add	%o1, 2, %o1		! advance DST by 2
2120	lduh	[%o0 - 4], %o3
2121	stha	%o3, [%o1]ASI_USER
2122	add	%o1, 2, %o1		! advance DST by 2
2123	lduh	[%o0 - 2], %o3
2124	stha	%o3, [%o1]ASI_USER
2125	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
2126	  add	%o1, 2, %o1		! advance DST by 2
2127	addcc	%o2, 7, %o2		! restore count
2128	bz,pt	%ncc, .co_sm_exit
2129	  deccc	%o2
2130	bz,pt	%ncc, .co_sm_byte
2131	  nop
2132	ba,pt	%ncc, .co_sm_half
2133	  nop
2134
2135/*
2136 * We got here because of a fault during short copyout.
2137 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2138 */
2139.sm_copyout_err:
2140	membar	#Sync
2141	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2142	mov	SM_SAVE_SRC, %o0
2143	mov	SM_SAVE_DST, %o1
2144	mov	SM_SAVE_COUNT, %o2
2145	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2146	tst	%o3
2147	bz,pt	%ncc, 3f			! if not, return error
2148	  nop
2149	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2150	jmp	%o5				! original arguments
2151	  nop
21523:
2153	retl
2154	  or	%g0, -1, %o0		! return error value
2155
2156	SET_SIZE(copyout)
2157
2158/*
2159 * The _more entry points are not intended to be used directly by
2160 * any caller from outside this file.  They are provided to allow
2161 * profiling and dtrace of the portions of the copy code that use
2162 * the floating point registers.
2163 * This entry is particularly important as DTRACE (at least as of
2164 * 4/2004) does not support leaf functions.
2165 */
2166
/*
 * copyout_more - large-copy path of copyout (kernel source, user
 * destination) that moves the data through the floating point registers.
 *
 * Reached from the size/alignment dispatch in copyout with the caller's
 * arguments still in %o0 (src), %o1 (dst) and %o2 (count).  It takes a
 * register window (save) and uses the new frame to keep the entry
 * values of %fprs and %gsr.
 *
 * Returns 0 on success.  The address of .copyout_err is kept in
 * REAL_LOFAULT; that handler either passes the request to the thread's
 * copyops handler or returns -1.
 *
 * NOTE(review): SRC, REALSRC, DST, CNT, TMP, SAVE_SRC/SAVE_DST/SAVE_COUNT,
 * REAL_LOFAULT and the FP_NOMIGRATE, FP_ALLOWMIGRATE, BST_FPQ2Q4_TOSTACK,
 * BLD_FPQ2Q4_FROMSTACK and FZEROQ2Q4 macros are defined outside this
 * section; the comments below describe only what is visible here.
 */
2167	ENTRY(copyout_more)
2168.copyout_more:
2169	prefetch [%o0], #n_reads
2170	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2171	set	.copyout_err, REAL_LOFAULT
2172
2173/*
2174 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2175 */
/*
 * .do_copyout is also entered from .xcopyout_more, which differs only in
 * the REAL_LOFAULT value it supplies.  copyio_fault becomes t_lofault for
 * the duration of the copy; the previous value is held in %l6.
 */
2176.do_copyout:
2177        set     copyio_fault, %l7		! .copyio_fault is lofault val
2178
2179	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2180	membar	#Sync				! sync error barrier
2181	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2182
2183	mov	%i0, SAVE_SRC
2184	mov	%i1, SAVE_DST
2185	mov	%i2, SAVE_COUNT
2186
2187	FP_NOMIGRATE(6, 7)
2188
/*
 * Record the entry value of %fprs on the stack.  If FPRS_FEF was clear
 * the branch is taken and its annulled delay slot enables the FPU, so
 * BST_FPQ2Q4_TOSTACK is skipped; otherwise the macro runs first
 * (presumably spilling the FP registers used below - body not visible
 * here, confirm).
 */
2189	rd	%fprs, %o2		! check for unused fp
2190	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2191	btst	FPRS_FEF, %o2
2192	bz,a,pt	%icc, .do_blockcopyout
2193	  wr	%g0, FPRS_FEF, %fprs
2194
2195	BST_FPQ2Q4_TOSTACK(%o2)
2196
2197.do_blockcopyout:
2198	rd	%gsr, %o2
2199	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2200	or	%l6, FPUSED_FLAG, %l6
2201
/*
 * TMP = DST & (VIS_BLOCKSIZE - 1).  When that is non-zero, TMP becomes
 * VIS_BLOCKSIZE - TMP and that many bytes are copied (four per pass of
 * .co_blkalign, then one per pass of 1:) so that DST reaches a
 * VIS_BLOCKSIZE boundary.  CNT is reduced by TMP in the delay slot of
 * the bleu.  The byte stores go through %asi, set to ASI_USER here.
 */
2202	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2203	mov	ASI_USER, %asi
2204	bz,pt	%ncc, 2f
2205	  neg	TMP
2206	add	TMP, VIS_BLOCKSIZE, TMP
2207
2208	! TMP = bytes required to align DST on FP_BLOCK boundary
2209	! Using SRC as a tmp here
2210	cmp	TMP, 3
2211	bleu,pt	%ncc, 1f
2212	  sub	CNT,TMP,CNT		! adjust main count
2213	sub	TMP, 3, TMP		! adjust for end of loop test
2214.co_blkalign:
2215	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
2216	stba	SRC, [DST]%asi
2217	subcc	TMP, 4, TMP
2218	ldub	[REALSRC + 1], SRC
2219	add	REALSRC, 4, REALSRC
2220	stba	SRC, [DST + 1]%asi
2221	ldub	[REALSRC - 2], SRC
2222	add	DST, 4, DST
2223	stba	SRC, [DST - 2]%asi
2224	ldub	[REALSRC - 1], SRC
2225	bgu,pt	%ncc, .co_blkalign
2226	  stba	SRC, [DST - 1]%asi
2227
2228	addcc	TMP, 3, TMP		! restore count adjustment
2229	bz,pt	%ncc, 2f		! no bytes left?
2230	  nop
22311:	ldub	[REALSRC], SRC
2232	inc	REALSRC
2233	inc	DST
2234	deccc	TMP
2235	bgu	%ncc, 1b
2236	  stba	SRC, [DST - 1]%asi
2237
/*
 * Prime the pipeline.  SRC is REALSRC rounded down to 8 bytes, and
 * alignaddr latches REALSRC's low bits so that each faligndata extracts
 * eight bytes at the true source offset from a pair of aligned
 * doublewords.  The first source block is loaded into %f16-%f30 and the
 * first six output doublewords are formed in %f48-%f58; %f16 is then
 * reloaded with the first doubleword of the following block and
 * CNT/SRC/REALSRC are stepped by one block.
 */
22382:
2239	membar	#StoreLoad
2240	andn	REALSRC, 0x7, SRC
2241
2242	! SRC - 8-byte aligned
2243	! DST - 64-byte aligned
2244	ldd	[SRC], %f16
2245	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2246	alignaddr REALSRC, %g0, %g0
2247	ldd	[SRC + 0x08], %f18
2248	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2249	faligndata %f16, %f18, %f48
2250	ldd	[SRC + 0x10], %f20
2251	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2252	faligndata %f18, %f20, %f50
2253	ldd	[SRC + 0x18], %f22
2254	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2255	faligndata %f20, %f22, %f52
2256	ldd	[SRC + 0x20], %f24
2257	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
2258	faligndata %f22, %f24, %f54
2259	ldd	[SRC + 0x28], %f26
2260	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
2261	faligndata %f24, %f26, %f56
2262	ldd	[SRC + 0x30], %f28
2263	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
2264	faligndata %f26, %f28, %f58
2265	ldd	[SRC + 0x38], %f30
2266	ldd	[SRC + VIS_BLOCKSIZE], %f16
2267	sub	CNT, VIS_BLOCKSIZE, CNT
2268	add	SRC, VIS_BLOCKSIZE, SRC
2269	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
2270	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2271	ba,pt	%ncc, 1f
2272	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
2273	.align	32
/*
 * Main loop.  Each pass completes the pending output block (%f60 and
 * %f62), writes it to user space with a single block store, and overlaps
 * that with the loads and faligndata steps for the next block.  The loop
 * repeats while CNT > VIS_BLOCKSIZE + 8 (unsigned compare).
 */
22741:
2275	ldd	[SRC + 0x08], %f18
2276	faligndata %f28, %f30, %f60
2277	ldd	[SRC + 0x10], %f20
2278	faligndata %f30, %f16, %f62
2279	stda	%f48, [DST]ASI_BLK_AIUS
2280	ldd	[SRC + 0x18], %f22
2281	faligndata %f16, %f18, %f48
2282	ldd	[SRC + 0x20], %f24
2283	faligndata %f18, %f20, %f50
2284	ldd	[SRC + 0x28], %f26
2285	faligndata %f20, %f22, %f52
2286	ldd	[SRC + 0x30], %f28
2287	faligndata %f22, %f24, %f54
2288	sub	CNT, VIS_BLOCKSIZE, CNT
2289	ldd	[SRC + 0x38], %f30
2290	faligndata %f24, %f26, %f56
2291	add	DST, VIS_BLOCKSIZE, DST
2292	ldd	[SRC + VIS_BLOCKSIZE], %f16
2293	faligndata %f26, %f28, %f58
2294	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2295	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2296	add	SRC, VIS_BLOCKSIZE, SRC
2297	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2298	cmp	CNT, VIS_BLOCKSIZE + 8
2299	bgu,pt	%ncc, 1b
2300	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2301
/*
 * Tail.  If exactly VIS_BLOCKSIZE bytes remain and REALSRC is 8-byte
 * aligned, take 2: below - the last block is loaded directly and moved
 * with fsrc1, so two block stores finish the copy.  In every other case
 * take 3: - the pending block is stored and whatever CNT remains is
 * copied one byte at a time by the 5: loop.
 */
2302	! only if REALSRC & 0x7 is 0
2303	cmp	CNT, VIS_BLOCKSIZE
2304	bne	%ncc, 3f
2305	  andcc	REALSRC, 0x7, %g0
2306	bz,pt	%ncc, 2f
2307	  nop
23083:
2309	faligndata %f28, %f30, %f60
2310	faligndata %f30, %f16, %f62
2311	stda	%f48, [DST]ASI_BLK_AIUS
2312	add	DST, VIS_BLOCKSIZE, DST
2313	ba,pt	%ncc, 3f
2314	  nop
23152:
2316	ldd	[SRC + 0x08], %f18
2317	fsrc1	%f28, %f60
2318	ldd	[SRC + 0x10], %f20
2319	fsrc1	%f30, %f62
2320	stda	%f48, [DST]ASI_BLK_AIUS
2321	ldd	[SRC + 0x18], %f22
2322	fsrc1	%f16, %f48
2323	ldd	[SRC + 0x20], %f24
2324	fsrc1	%f18, %f50
2325	ldd	[SRC + 0x28], %f26
2326	fsrc1	%f20, %f52
2327	ldd	[SRC + 0x30], %f28
2328	fsrc1	%f22, %f54
2329	ldd	[SRC + 0x38], %f30
2330	fsrc1	%f24, %f56
2331	sub	CNT, VIS_BLOCKSIZE, CNT
2332	add	DST, VIS_BLOCKSIZE, DST
2333	add	SRC, VIS_BLOCKSIZE, SRC
2334	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2335	fsrc1	%f26, %f58
2336	fsrc1	%f28, %f60
2337	fsrc1	%f30, %f62
2338	stda	%f48, [DST]ASI_BLK_AIUS
2339	add	DST, VIS_BLOCKSIZE, DST
2340	ba,a,pt	%ncc, 4f
2341	  nop
2342
23433:	tst	CNT
2344	bz,a	%ncc, 4f
2345	  nop
2346
23475:	ldub	[REALSRC], TMP
2348	inc	REALSRC
2349	inc	DST
2350	deccc	CNT
2351	bgu	%ncc, 5b
2352	  stba	TMP, [DST - 1]%asi
23534:
2354
/*
 * Common successful exit.  %gsr is restored from the stack.  If
 * FPRS_FEF was set on entry BLD_FPQ2Q4_FROMSTACK runs (presumably the
 * inverse of the save made on the way in - confirm); otherwise
 * FZEROQ2Q4 runs instead.  %fprs is then restored, FPUSED_FLAG is
 * cleared from the saved handler value in %l6, that value goes back
 * into t_lofault, and the routine returns 0.
 */
2355.copyout_exit:
2356	membar	#Sync
2357
2358	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2359	wr	%o2, 0, %gsr		! restore gsr
2360
2361	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2362	btst	FPRS_FEF, %o3
2363	bz,pt	%icc, 4f
2364	  nop
2365
2366	BLD_FPQ2Q4_FROMSTACK(%o2)
2367
2368	ba,pt	%ncc, 1f
2369	  wr	%o3, 0, %fprs		! restore fprs
2370
23714:
2372	FZEROQ2Q4
2373	wr	%o3, 0, %fprs		! restore fprs
2374
23751:
2376	membar	#Sync
2377	andn	%l6, FPUSED_FLAG, %l6
2378	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2379	FP_ALLOWMIGRATE(5, 6)
2380	ret
2381	  restore	%g0, 0, %o0
2382
2383/*
2384 * We got here because of a fault during copyout.
2385 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2386 */
/*
 * This code still runs in the window taken by the save at entry, so both
 * outcomes use restore: the copyops hand-off discards the window in the
 * delay slot of the jmp, and the plain error return writes -1 into the
 * caller's %o0.
 */
2387.copyout_err:
2388	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2389	tst	%o4
2390	bz,pt	%ncc, 2f			! if not, return error
2391	  nop
2392	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
2393	jmp	%g2				! original arguments
2394	  restore %g0, 0, %g0			! dispose of copy window
23952:
2396        ret
2397	  restore %g0, -1, %o0			! return error value
2398
2399
2400	SET_SIZE(copyout_more)
2401
2402#endif	/* lint */
2403
2404
2405#ifdef	lint
2406
/*
 * Lint-only stub: supplies the C prototype of xcopyout when the file is
 * processed with lint defined.  The real routine is the assembly
 * ENTRY(xcopyout) in the non-lint branch that follows.
 */
2407/*ARGSUSED*/
2408int
2409xcopyout(const void *kaddr, void *uaddr, size_t count)
2410{ return (0); }
2411
2412#else	/* lint */
2413
/*
 * xcopyout - copy count (%o2) bytes from kernel address %o0 to user
 * address %o1.  The copy loops are shared with copyout (.sm_do_copyout
 * for the leaf path, .do_copyout for the FP path); only the fault
 * handlers installed here differ: where the copyout handlers return -1,
 * the xcopyout handlers return an errno value.
 *
 * Path selection:
 *   count <= VIS_COPY_THRESHOLD		-> leaf (small) copy
 *   otherwise hw_copy_limit_1, _2, _4 or _8 is chosen from the low bits
 *   of src ^ dst (byte, half word, word or long word alignable) and
 *	limit == 0 or count <= limit	-> leaf (small) copy
 *	count > limit			-> FP (large) copy
 */
2414	ENTRY(xcopyout)
2415	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2416	bleu,pt	%ncc, .xcopyout_small		! small enough for leaf copy
2417	  xor	%o0, %o1, %o3			! are src, dst alignable?
2418	btst	7, %o3				!
2419	bz,pt	%ncc, .xcopyout_8		! check for longword alignment
2420	  nop
2421	btst	1, %o3				!
2422	bz,pt	%ncc, .xcopyout_2		! check for half-word
2423	  nop
2424	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2425	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2426	tst	%o3
2427	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2428	  cmp	%o2, %o3			! if length <= limit
2429	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2430	  nop
2431	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2432	  nop
2433.xcopyout_2:
2434	btst	3, %o3				!
2435	bz,pt	%ncc, .xcopyout_4		! check for word alignment
2436	  nop
2437	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2438	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2439	tst	%o3
2440	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2441	  cmp	%o2, %o3			! if length <= limit
2442	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2443	  nop
2444	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2445	  nop
2446.xcopyout_4:
2447	! already checked longword, must be word aligned
2448	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2449	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2450	tst	%o3
2451	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2452	  cmp	%o2, %o3			! if length <= limit
2453	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2454	  nop
2455	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2456	  nop
2457.xcopyout_8:
2458	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2459	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2460	tst	%o3
2461	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2462	  cmp	%o2, %o3			! if length <= limit
2463	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2464	  nop
2465	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2466	  nop
2467
/*
 * Leaf path: install .sm_xcopyout_err as t_lofault (previous value kept
 * in %o4; the store sits in the branch delay slot) and join the copyout
 * leaf code at .sm_do_copyout.
 */
2468.xcopyout_small:
2469	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
2470	or	%o5, %lo(.sm_xcopyout_err), %o5
2471	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
2472	membar	#Sync				! sync error barrier
2473	ba,pt	%ncc, .sm_do_copyout		! common code
2474	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
2475
/*
 * FP path: take the register window here, put the address of
 * .xcopyout_err in REAL_LOFAULT (low half set in the delay slot) and
 * join the copyout FP code at .do_copyout.
 */
2476.xcopyout_more:
2477	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2478	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2479	ba,pt	%ncc, .do_copyout		! common code
2480	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2481
2482/*
2483 * We got here because of fault during xcopyout
2484 * Errno value is in ERRNO
2485 */
2486.xcopyout_err:
2487	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2488	tst	%o4
2489	bz,pt	%ncc, 2f			! if not, return error
2490	  nop
2491	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
2492	jmp	%g2				! original arguments
2493	  restore %g0, 0, %g0			! dispose of copy window
24942:
2495        ret
2496	  restore ERRNO, 0, %o0			! return errno value
2497
/*
 * Fault during the leaf (small) xcopyout.  %o4 still holds the previous
 * t_lofault and SM_SAVE_SRC/SM_SAVE_DST/SM_SAVE_COUNT the original
 * arguments.  With no copyops handler the value in %g1 is returned as
 * the errno.  NOTE(review): %g1 is presumably loaded with the errno
 * before control is vectored here - confirm against the trap code.
 */
2498.sm_xcopyout_err:
2499
2500	membar	#Sync
2501	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2502	mov	SM_SAVE_SRC, %o0
2503	mov	SM_SAVE_DST, %o1
2504	mov	SM_SAVE_COUNT, %o2
2505	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2506	tst	%o3
2507	bz,pt	%ncc, 3f			! if not, return error
2508	  nop
2509	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
2510	jmp	%o5				! original arguments
2511	  nop
25123:
2513	retl
2514	  or	%g1, 0, %o0		! return errno value
2515
2516	SET_SIZE(xcopyout)
2517
2518#endif	/* lint */
2519
2520#ifdef	lint
2521
/*
 * Lint-only stub: supplies the C prototype of xcopyout_little when the
 * file is processed with lint defined.  The real routine is the assembly
 * ENTRY(xcopyout_little) in the non-lint branch that follows.
 */
2522/*ARGSUSED*/
2523int
2524xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2525{ return (0); }
2526
2527#else	/* lint */
2528
/*
 * xcopyout_little - copy count (%o2) bytes from kernel address %o0 to
 * user address %o1 one byte at a time, reversing the byte order: the
 * last source byte is stored at the first destination byte, and so on.
 * The user stores use ASI_AIUSL.  Returns 0.
 *
 * .xcopyio_err (defined outside this section) is installed as t_lofault
 * for the duration; the previous value is moved to %o5 because %o4 is
 * reused as the data register.
 *
 * Register use in the loop:
 *	%o3	negative index, counts from -count up to 0; the carry out
 *		of the final inccc ends the loop
 *	%o1	uaddr + count, so %o1 + %o3 walks forward through dst
 *	%o0	kaddr + 2 * count - 1 at first and reduced by 2 per pass,
 *		so %o0 + %o3 walks backward from the last source byte
 *	%o4	byte in flight
 */
2529	ENTRY(xcopyout_little)
2530	sethi	%hi(.xcopyio_err), %o5
2531	or	%o5, %lo(.xcopyio_err), %o5
2532	ldn	[THREAD_REG + T_LOFAULT], %o4
2533	membar	#Sync				! sync error barrier
2534	stn	%o5, [THREAD_REG + T_LOFAULT]
2535	mov	%o4, %o5
2536
2537	subcc	%g0, %o2, %o3
2538	add	%o0, %o2, %o0
2539	bz,pn	%ncc, 2f		! check for zero bytes
2540	  sub	%o2, 1, %o4
2541	add	%o0, %o4, %o0		! start w/last byte
2542	add	%o1, %o2, %o1
2543	ldub	[%o0 + %o3], %o4
2544
/*
 * The branch is annulled, so the load in its delay slot is performed only
 * when the loop repeats; no source byte is read after the last store.
 */
25451:	stba	%o4, [%o1 + %o3]ASI_AIUSL
2546	inccc	%o3
2547	sub	%o0, 2, %o0		! get next byte
2548	bcc,a,pt %ncc, 1b
2549	  ldub	[%o0 + %o3], %o4
2550
25512:
2552	membar	#Sync				! sync error barrier
2553	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2554	retl
2555	  mov	%g0, %o0		! return (0)
2556
2557	SET_SIZE(xcopyout_little)
2558
2559#endif	/* lint */
2560
2561/*
2562 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2563 */
2564
2565#if defined(lint)
2566
/*
 * Lint-only stub: supplies the C prototype of copyin when the file is
 * processed with lint defined.  The real routine is the assembly
 * ENTRY(copyin) in the non-lint branch that follows.
 */
2567/*ARGSUSED*/
2568int
2569copyin(const void *uaddr, void *kaddr, size_t count)
2570{ return (0); }
2571
2572#else	/* lint */
2573
/*
 * copyin - copy count (%o2) bytes from user address %o0 to kernel
 * address %o1.  Returns 0 on success; a fault on the leaf path ends in
 * .sm_copyin_err, which either hands the request to the thread's copyops
 * handler or returns -1.
 *
 * Path selection:
 *   count <= VIS_COPY_THRESHOLD		-> leaf (small) copy
 *   otherwise hw_copy_limit_1, _2, _4 or _8 is chosen from the low bits
 *   of src ^ dst (byte, half word, word or long word alignable) and
 *	limit == 0 or count <= limit	-> leaf (small) copy
 *	count > limit			-> .copyin_more (large copy)
 *
 * In the leaf code every user access is a load through ASI_USER
 * (lduba/lduha/lduwa/ldxa); the kernel stores are ordinary stores.
 */
2574	ENTRY(copyin)
2575	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2576	bleu,pt	%ncc, .copyin_small		! small enough for leaf copy
2577	  xor	%o0, %o1, %o3			! are src, dst alignable?
2578	btst	7, %o3				!
2579	bz,pt	%ncc, .copyin_8			! check for longword alignment
2580	  nop
2581	btst	1, %o3				!
2582	bz,pt	%ncc, .copyin_2			! check for half-word
2583	  nop
2584	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2585	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2586	tst	%o3
2587	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2588	  cmp	%o2, %o3			! if length <= limit
2589	bleu,pt	%ncc, .copyin_small		! go to small copy
2590	  nop
2591	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2592	  nop
2593.copyin_2:
2594	btst	3, %o3				!
2595	bz,pt	%ncc, .copyin_4			! check for word alignment
2596	  nop
2597	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2598	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2599	tst	%o3
2600	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2601	  cmp	%o2, %o3			! if length <= limit
2602	bleu,pt	%ncc, .copyin_small		! go to small copy
2603	  nop
2604	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2605	  nop
2606.copyin_4:
2607	! already checked longword, must be word aligned
2608	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2609	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2610	tst	%o3
2611	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2612	  cmp	%o2, %o3			! if length <= limit
2613	bleu,pt	%ncc, .copyin_small		! go to small copy
2614	  nop
2615	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2616	  nop
2617.copyin_8:
2618	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2619	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2620	tst	%o3
2621	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2622	  cmp	%o2, %o3			! if length <= limit
2623	bleu,pt	%ncc, .copyin_small		! go to small copy
2624	  nop
2625	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2626	  nop
2627
2628	.align	16
2629	nop				! instruction alignment
2630					! see discussion at start of file
2631.copyin_small:
2632	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
2633	or	%o5, %lo(.sm_copyin_err), %o5
2634	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
2635	membar	#Sync				! sync error barrier
2636	stn	%o5, [THREAD_REG + T_LOFAULT]
/*
 * Leaf copy proper.  The original arguments are parked in SM_SAVE_SRC,
 * SM_SAVE_DST and SM_SAVE_COUNT for the fault handler, then:
 *   count <= SHORTCOPY			-> .ci_sm_left (moves at most 3 bytes)
 *   count >  CHKSIZE			-> .ci_med
 *   src and dst both word aligned	-> .ci_sm_word
 *   otherwise				-> .ci_sm_movebytes (falls through)
 * The sub at .ci_sm_movebytes sits in the delay slot of the branch to
 * .ci_sm_word and is not annulled, so it runs on both outcomes: the word
 * path starts with the count already reduced by 3 and its "restore
 * count" adds take that into account.
 */
2637.sm_do_copyin:
2638	mov	%o0, SM_SAVE_SRC
2639	mov	%o1, SM_SAVE_DST
2640	cmp	%o2, SHORTCOPY		! check for really short case
2641	bleu,pt	%ncc, .ci_sm_left	!
2642	  mov	%o2, SM_SAVE_COUNT
2643	cmp	%o2, CHKSIZE		! check for medium length cases
2644	bgu,pn	%ncc, .ci_med		!
2645	  or	%o0, %o1, %o3		! prepare alignment check
2646	andcc	%o3, 0x3, %g0		! test for alignment
2647	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
2648.ci_sm_movebytes:
2649	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
2650.ci_sm_notalign4:
2651	lduba	[%o0]ASI_USER, %o3	! read byte
2652	subcc	%o2, 4, %o2		! reduce count by 4
2653	stb	%o3, [%o1]		! write byte
2654	add	%o0, 1, %o0		! advance SRC by 1
2655	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
2656	add	%o0, 1, %o0		! advance SRC by 1
2657	stb	%o3, [%o1 + 1]
2658	add	%o1, 4, %o1		! advance DST by 4
2659	lduba	[%o0]ASI_USER, %o3
2660	add	%o0, 1, %o0		! advance SRC by 1
2661	stb	%o3, [%o1 - 2]
2662	lduba	[%o0]ASI_USER, %o3
2663	add	%o0, 1, %o0		! advance SRC by 1
2664	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
2665	  stb	%o3, [%o1 - 1]
2666	add	%o2, 3, %o2		! restore count
2667.ci_sm_left:
2668	tst	%o2
2669	bz,pt	%ncc, .ci_sm_exit
2670	  nop
2671	lduba	[%o0]ASI_USER, %o3		! load one byte
2672	deccc	%o2			! reduce count for cc test
2673	bz,pt	%ncc, .ci_sm_exit
2674	  stb	%o3,[%o1]		! store one byte
2675	inc	%o0
2676	lduba	[%o0]ASI_USER, %o3	! load second byte
2677	deccc	%o2
2678	bz,pt	%ncc, .ci_sm_exit
2679	  stb	%o3,[%o1 + 1]		! store second byte
2680	inc	%o0
2681	lduba	[%o0]ASI_USER, %o3	! load third byte
2682	stb	%o3,[%o1 + 2]		! store third byte
2683	membar	#Sync				! sync error barrier
2684	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2685	retl
2686	  mov	%g0, %o0		! return 0
2687	.align	16
2688.ci_sm_words:
2689	lduwa	[%o0]ASI_USER, %o3		! read word
2690.ci_sm_wordx:
2691	subcc	%o2, 8, %o2		! update count
2692	stw	%o3, [%o1]		! write word
2693	add	%o0, 4, %o0		! update SRC
2694	add	%o1, 8, %o1		! update DST
2695	lduwa	[%o0]ASI_USER, %o3	! read word
2696	add	%o0, 4, %o0		! update SRC
2697	bgt,pt	%ncc, .ci_sm_words	! loop til done
2698	  stw	%o3, [%o1 - 4]		! write word
2699	addcc	%o2, 7, %o2		! restore count
2700	bz,pt	%ncc, .ci_sm_exit
2701	  nop
2702	deccc	%o2
2703	bz,pt	%ncc, .ci_sm_byte
/*
 * .ci_sm_half is both the delay slot of the branch above (its subcc is
 * harmless when the branch to .ci_sm_byte is taken, since %o2 is not
 * used there) and the entry used by the medium paths, which arrive with
 * the remaining count already decremented by one.
 */
2704.ci_sm_half:
2705	  subcc	%o2, 2, %o2		! reduce count by 2
2706	lduha	[%o0]ASI_USER, %o3	! read half word
2707	add	%o0, 2, %o0		! advance SRC by 2
2708	add	%o1, 2, %o1		! advance DST by 2
2709	bgt,pt	%ncc, .ci_sm_half	! loop til done
2710	  sth	%o3, [%o1 - 2]		! write half word
2711	addcc	%o2, 1, %o2		! restore count
2712	bz,pt	%ncc, .ci_sm_exit
2713	  nop
2714.ci_sm_byte:
2715	lduba	[%o0]ASI_USER, %o3
2716	stb	%o3, [%o1]
2717	membar	#Sync				! sync error barrier
2718	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2719	retl
2720	  mov	%g0, %o0		! return 0
2721	.align	16
2722.ci_sm_word:
2723	subcc	%o2, 4, %o2		! update count
2724	bgt,pt	%ncc, .ci_sm_wordx
2725	  lduwa	[%o0]ASI_USER, %o3		! read word
2726	addcc	%o2, 3, %o2		! restore count
2727	bz,pt	%ncc, .ci_sm_exit
2728	  stw	%o3, [%o1]		! write word
2729	deccc	%o2			! reduce count for cc test
2730	add	%o0, 4, %o0
2731	lduba	[%o0]ASI_USER, %o3	! load one byte
2732	bz,pt	%ncc, .ci_sm_exit
2733	  stb	%o3, [%o1 + 4]		! store one byte
2734	inc	%o0
2735	lduba	[%o0]ASI_USER, %o3	! load second byte
2736	deccc	%o2
2737	bz,pt	%ncc, .ci_sm_exit
2738	  stb	%o3, [%o1 + 5]		! store second byte
2739	inc	%o0
2740	lduba	[%o0]ASI_USER, %o3	! load third byte
2741	stb	%o3, [%o1 + 6]		! store third byte
2742.ci_sm_exit:
2743	membar	#Sync				! sync error barrier
2744	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2745	retl
2746	  mov	%g0, %o0		! return 0
2747
2748	.align 16
/*
 * Medium copies (count > CHKSIZE).  The low bits of src ^ dst select the
 * widest unit to which both pointers can be aligned together.  Each
 * variant first copies enough leading data to align the source, then
 * runs a loop unrolled four times, and leaves the remainder to smaller
 * loops that finish in .ci_sm_half, .ci_sm_byte or .ci_sm_exit.  If not
 * even half word alignment is possible the byte loop at
 * .ci_sm_movebytes is used.
 */
2749.ci_med:
2750	xor	%o0, %o1, %o3		! setup alignment check
2751	btst	1, %o3
2752	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
2753	  nop
2754	btst	3, %o3
2755	bnz,pt	%ncc, .ci_med_half	! halfword aligned
2756	  nop
2757	btst	7, %o3
2758	bnz,pt	%ncc, .ci_med_word	! word aligned
2759	  nop
2760.ci_med_long:
2761	btst	3, %o0			! check for
2762	bz,pt	%ncc, .ci_med_long1	! word alignment
2763	  nop
2764.ci_med_long0:
2765	lduba	[%o0]ASI_USER, %o3		! load one byte
2766	inc	%o0
2767	stb	%o3,[%o1]		! store byte
2768	inc	%o1
2769	btst	3, %o0
2770	bnz,pt	%ncc, .ci_med_long0
2771	  dec	%o2
2772.ci_med_long1:			! word aligned
2773	btst	7, %o0			! check for long word
2774	bz,pt	%ncc, .ci_med_long2
2775	  nop
2776	lduwa	[%o0]ASI_USER, %o3	! load word
2777	add	%o0, 4, %o0		! advance SRC by 4
2778	stw	%o3, [%o1]		! store word
2779	add	%o1, 4, %o1		! advance DST by 4
2780	sub	%o2, 4, %o2		! reduce count by 4
2781!
2782!  Now long word aligned and have at least 32 bytes to move
2783!
2784.ci_med_long2:
2785	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2786.ci_med_lmove:
2787	ldxa	[%o0]ASI_USER, %o3	! read long word
2788	subcc	%o2, 32, %o2		! reduce count by 32
2789	stx	%o3, [%o1]		! write long word
2790	add	%o0, 8, %o0		! advance SRC by 8
2791	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
2792	add	%o0, 8, %o0		! advance SRC by 8
2793	stx	%o3, [%o1 + 8]
2794	add	%o1, 32, %o1		! advance DST by 32
2795	ldxa	[%o0]ASI_USER, %o3
2796	add	%o0, 8, %o0		! advance SRC by 8
2797	stx	%o3, [%o1 - 16]
2798	ldxa	[%o0]ASI_USER, %o3
2799	add	%o0, 8, %o0		! advance SRC by 8
2800	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
2801	  stx	%o3, [%o1 - 8]
2802	addcc	%o2, 24, %o2		! restore count to long word offset
2803	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
2804	  nop
2805.ci_med_lword:
2806	ldxa	[%o0]ASI_USER, %o3	! read long word
2807	subcc	%o2, 8, %o2		! reduce count by 8
2808	stx	%o3, [%o1]		! write long word
2809	add	%o0, 8, %o0		! advance SRC by 8
2810	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
2811	  add	%o1, 8, %o1		! advance DST by 8
2812.ci_med_lextra:
2813	addcc	%o2, 7, %o2		! restore rest of count
2814	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2815	  deccc	%o2
2816	bz,pt	%ncc, .ci_sm_byte
2817	  nop
2818	ba,pt	%ncc, .ci_sm_half
2819	  nop
2820
2821	.align 16
2822	nop				! instruction alignment
2823					! see discussion at start of file
2824.ci_med_word:
2825	btst	3, %o0			! check for
2826	bz,pt	%ncc, .ci_med_word1	! word alignment
2827	  nop
2828.ci_med_word0:
2829	lduba	[%o0]ASI_USER, %o3	! load one byte
2830	inc	%o0
2831	stb	%o3,[%o1]		! store byte
2832	inc	%o1
2833	btst	3, %o0
2834	bnz,pt	%ncc, .ci_med_word0
2835	  dec	%o2
2836!
2837!  Now word aligned and have at least 36 bytes to move
2838!
2839.ci_med_word1:
2840	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2841.ci_med_wmove:
2842	lduwa	[%o0]ASI_USER, %o3	! read word
2843	subcc	%o2, 16, %o2		! reduce count by 16
2844	stw	%o3, [%o1]		! write word
2845	add	%o0, 4, %o0		! advance SRC by 4
2846	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
2847	add	%o0, 4, %o0		! advance SRC by 4
2848	stw	%o3, [%o1 + 4]
2849	add	%o1, 16, %o1		! advance DST by 16
2850	lduwa	[%o0]ASI_USER, %o3
2851	add	%o0, 4, %o0		! advance SRC by 4
2852	stw	%o3, [%o1 - 8]
2853	lduwa	[%o0]ASI_USER, %o3
2854	add	%o0, 4, %o0		! advance SRC by 4
2855	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
2856	  stw	%o3, [%o1 - 4]
2857	addcc	%o2, 12, %o2		! restore count to word offset
2858	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
2859	  nop
2860.ci_med_word2:
2861	lduwa	[%o0]ASI_USER, %o3	! read word
2862	subcc	%o2, 4, %o2		! reduce count by 4
2863	stw	%o3, [%o1]		! write word
2864	add	%o0, 4, %o0		! advance SRC by 4
2865	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
2866	  add	%o1, 4, %o1		! advance DST by 4
2867.ci_med_wextra:
2868	addcc	%o2, 3, %o2		! restore rest of count
2869	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2870	  deccc	%o2
2871	bz,pt	%ncc, .ci_sm_byte
2872	  nop
2873	ba,pt	%ncc, .ci_sm_half
2874	  nop
2875
2876	.align 16
2877	nop				! instruction alignment
2878					! see discussion at start of file
2879.ci_med_half:
2880	btst	1, %o0			! check for
2881	bz,pt	%ncc, .ci_med_half1	! half word alignment
2882	  nop
2883	lduba	[%o0]ASI_USER, %o3	! load one byte
2884	inc	%o0
2885	stb	%o3,[%o1]		! store byte
2886	inc	%o1
2887	dec	%o2
2888!
2889!  Now half word aligned and have at least 38 bytes to move
2890!
2891.ci_med_half1:
2892	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2893.ci_med_hmove:
2894	lduha	[%o0]ASI_USER, %o3	! read half word
2895	subcc	%o2, 8, %o2		! reduce count by 8
2896	sth	%o3, [%o1]		! write half word
2897	add	%o0, 2, %o0		! advance SRC by 2
2898	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
2899	add	%o0, 2, %o0		! advance SRC by 2
2900	sth	%o3, [%o1 + 2]
2901	add	%o1, 8, %o1		! advance DST by 8
2902	lduha	[%o0]ASI_USER, %o3
2903	add	%o0, 2, %o0		! advance SRC by 2
2904	sth	%o3, [%o1 - 4]
2905	lduha	[%o0]ASI_USER, %o3
2906	add	%o0, 2, %o0		! advance SRC by 2
2907	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
2908	  sth	%o3, [%o1 - 2]
2909	addcc	%o2, 7, %o2		! restore count
2910	bz,pt	%ncc, .ci_sm_exit
2911	  deccc	%o2
2912	bz,pt	%ncc, .ci_sm_byte
2913	  nop
2914	ba,pt	%ncc, .ci_sm_half
2915	  nop
2916
/*
 * Fault during the leaf (small) copyin: put back the previous t_lofault
 * (%o4) and the original arguments, then either jump to the thread's
 * copyops CP_COPYIN handler or return -1.
 */
2917.sm_copyin_err:
2918	membar	#Sync
2919	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2920	mov	SM_SAVE_SRC, %o0
2921	mov	SM_SAVE_DST, %o1
2922	mov	SM_SAVE_COUNT, %o2
2923	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2924	tst	%o3
2925	bz,pt	%ncc, 3f			! if not, return error
2926	  nop
2927	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
2928	jmp	%o5				! original arguments
2929	  nop
29303:
2931	retl
2932	  or	%g0, -1, %o0		! return -1 (error value, not errno)
2933
2934	SET_SIZE(copyin)
2935
2936
2937/*
2938 * The _more entry points are not intended to be used directly by
2939 * any caller from outside this file.  They are provided to allow
2940 * profiling and dtrace of the portions of the copy code that use
2941 * the floating point registers.
2942 * This entry is particularly important as DTRACE (at least as of
2943 * 4/2004) does not support leaf functions.
2944 */
2945
/*
 * copyin_more - floating-point (VIS block store) path for copyin.
 *
 * Takes a new register window and stack frame with `save`, records
 * .copyin_err in REAL_LOFAULT and falls into .do_copyin.  xcopyin and
 * copyin_noerr also branch to .do_copyin after doing their own `save`
 * and loading their own REAL_LOFAULT value.
 *
 * SRC, REALSRC, DST, CNT, TMP, SAVE_SRC, SAVE_DST, SAVE_COUNT and
 * REAL_LOFAULT are register aliases defined outside this block.
 *
 * .do_copyin, in order:
 *   - installs copyio_fault as t_lofault, keeping the previous value in %l6
 *   - saves %fprs and %gsr in the frame and sets FPUSED_FLAG in %l6
 *   - copies single bytes until DST is VIS_BLOCKSIZE aligned
 *   - moves one VIS_BLOCKSIZE block per loop pass: ldda through %asi
 *     (ASI_USER), faligndata, stda to ASI_BLK_P
 *   - copies whatever is left one byte at a time
 *   - restores %gsr, %fprs and t_lofault and returns 0
 *
 * .copyin_err (the value placed in REAL_LOFAULT here) jumps to the
 * thread's CP_COPYIN copyops entry when one exists, else returns -1.
 * NOTE(review): how control gets from copyio_fault to REAL_LOFAULT is not
 * visible in this block.
 */
2946	ENTRY(copyin_more)
2947.copyin_more:
2948	prefetch [%o0], #n_reads
2949	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2950	set	.copyin_err, REAL_LOFAULT
2951
2952/*
2953 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2954 */
2955.do_copyin:
/*
 * Common body.  Entered with a fresh register window and with
 * REAL_LOFAULT already loaded by copyin_more, .xcopyin_more or
 * .copyin_noerr_more.
 */
2956	set	copyio_fault, %l7		! .copyio_fault is lofault val
2957
2958	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2959	membar	#Sync				! sync error barrier
2960	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2961
2962	mov	%i0, SAVE_SRC
2963	mov	%i1, SAVE_DST
2964	mov	%i2, SAVE_COUNT
2965
2966	FP_NOMIGRATE(6, 7)
2967
/*
 * The entry %fprs is stored in the frame.  bz,a annuls its delay slot when
 * the branch is not taken, so the `wr` that sets FPRS_FEF runs only when
 * FPRS_FEF was clear.  When it was already set, BST_FPQ2Q4_TOSTACK runs
 * instead (presumably spilling the FP registers used below; the macro is
 * defined elsewhere).
 */
2968	rd	%fprs, %o2		! check for unused fp
2969	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2970	btst	FPRS_FEF, %o2
2971	bz,a,pt	%icc, .do_blockcopyin
2972	  wr	%g0, FPRS_FEF, %fprs
2973
2974	BST_FPQ2Q4_TOSTACK(%o2)
2975
2976.do_blockcopyin:
2977	rd	%gsr, %o2
2978	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2979	or	%l6, FPUSED_FLAG, %l6
2980
/*
 * TMP = DST & (VIS_BLOCKSIZE - 1).  Zero means DST is already block
 * aligned and the byte loops are skipped.  The `neg` is in a non-annulled
 * delay slot; together with the add that follows it TMP becomes
 * VIS_BLOCKSIZE - (DST & (VIS_BLOCKSIZE - 1)).  %asi is set to ASI_USER
 * for the loads from the source that follow.
 */
2981	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2982	mov	ASI_USER, %asi
2983	bz,pt	%ncc, 2f
2984	  neg	TMP
2985	add	TMP, VIS_BLOCKSIZE, TMP
2986
2987	! TMP = bytes required to align DST on FP_BLOCK boundary
2988	! Using SRC as a tmp here
2989	cmp	TMP, 3
2990	bleu,pt	%ncc, 1f
2991	  sub	CNT,TMP,CNT		! adjust main count
2992	sub	TMP, 3, TMP		! adjust for end of loop test
2993.ci_blkalign:
2994	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
2995	stb	SRC, [DST]
2996	subcc	TMP, 4, TMP
2997	lduba	[REALSRC + 1]%asi, SRC
2998	add	REALSRC, 4, REALSRC
2999	stb	SRC, [DST + 1]
3000	lduba	[REALSRC - 2]%asi, SRC
3001	add	DST, 4, DST
3002	stb	SRC, [DST - 2]
3003	lduba	[REALSRC - 1]%asi, SRC
3004	bgu,pt	%ncc, .ci_blkalign
3005	  stb	SRC, [DST - 1]
3006
3007	addcc	TMP, 3, TMP		! restore count adjustment
3008	bz,pt	%ncc, 2f		! no bytes left?
3009	  nop
30101:	lduba	[REALSRC]%asi, SRC
3011	inc	REALSRC
3012	inc	DST
3013	deccc	TMP
3014	bgu	%ncc, 1b
3015	  stb	SRC, [DST - 1]
3016
/*
 * DST is block aligned from here on.  SRC becomes REALSRC rounded down to
 * 8 bytes, and alignaddr records the low three bits of REALSRC in %gsr so
 * that each faligndata extracts the 8 wanted bytes from two consecutive
 * doublewords.  The straight-line code below loads the first source
 * block, starts the first output block in %f48-%f58, pre-loads %f16 with
 * the first doubleword of the next block and subtracts one block from CNT
 * before anything has been stored.
 */
30172:
3018	membar	#StoreLoad
3019	andn	REALSRC, 0x7, SRC
3020
3021	! SRC - 8-byte aligned
3022	! DST - 64-byte aligned
3023	ldda	[SRC]%asi, %f16
3024	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
3025	alignaddr REALSRC, %g0, %g0
3026	ldda	[SRC + 0x08]%asi, %f18
3027	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
3028	faligndata %f16, %f18, %f48
3029	ldda	[SRC + 0x10]%asi, %f20
3030	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3031	faligndata %f18, %f20, %f50
3032	ldda	[SRC + 0x18]%asi, %f22
3033	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3034	faligndata %f20, %f22, %f52
3035	ldda	[SRC + 0x20]%asi, %f24
3036	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
3037	faligndata %f22, %f24, %f54
3038	ldda	[SRC + 0x28]%asi, %f26
3039	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
3040	faligndata %f24, %f26, %f56
3041	ldda	[SRC + 0x30]%asi, %f28
3042	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
3043	faligndata %f26, %f28, %f58
3044	ldda	[SRC + 0x38]%asi, %f30
3045	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3046	sub	CNT, VIS_BLOCKSIZE, CNT
3047	add	SRC, VIS_BLOCKSIZE, SRC
3048	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
3049	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3050	ba,pt	%ncc, 1f
3051	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
3052	.align	32
/*
 * Main loop, one VIS_BLOCKSIZE block per pass: complete the pending
 * output block (%f60, %f62), block-store %f48-%f62 to DST, and interleave
 * the loads and faligndata that build the next output block.  Each pass
 * also loads the first doubleword of the following block into %f16.
 * The loop repeats while CNT > VIS_BLOCKSIZE + 8 (unsigned compare).
 */
30531:
3054	ldda	[SRC + 0x08]%asi, %f18
3055	faligndata %f28, %f30, %f60
3056	ldda	[SRC + 0x10]%asi, %f20
3057	faligndata %f30, %f16, %f62
3058	stda	%f48, [DST]ASI_BLK_P
3059	ldda	[SRC + 0x18]%asi, %f22
3060	faligndata %f16, %f18, %f48
3061	ldda	[SRC + 0x20]%asi, %f24
3062	faligndata %f18, %f20, %f50
3063	ldda	[SRC + 0x28]%asi, %f26
3064	faligndata %f20, %f22, %f52
3065	ldda	[SRC + 0x30]%asi, %f28
3066	faligndata %f22, %f24, %f54
3067	sub	CNT, VIS_BLOCKSIZE, CNT
3068	ldda	[SRC + 0x38]%asi, %f30
3069	faligndata %f24, %f26, %f56
3070	add	DST, VIS_BLOCKSIZE, DST
3071	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3072	faligndata %f26, %f28, %f58
3073	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3074	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3075	add	SRC, VIS_BLOCKSIZE, SRC
3076	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3077	cmp	CNT, VIS_BLOCKSIZE + 8
3078	bgu,pt	%ncc, 1b
3079	  prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3080
/*
 * Loop exit.  When CNT == VIS_BLOCKSIZE and REALSRC is 8-byte aligned
 * (the andcc executes in the non-annulled bne delay slot), the code at 2:
 * stores the pending block plus one more, using fsrc1 register moves in
 * place of faligndata, and leaves CNT at zero.  In every other case the
 * first 3: completes and stores only the pending block, and the second 3:
 * and 5: copy the CNT remaining bytes one at a time.
 */
3081	! only if REALSRC & 0x7 is 0
3082	cmp	CNT, VIS_BLOCKSIZE
3083	bne	%ncc, 3f
3084	  andcc	REALSRC, 0x7, %g0
3085	bz,pt	%ncc, 2f
3086	  nop
30873:
3088	faligndata %f28, %f30, %f60
3089	faligndata %f30, %f16, %f62
3090	stda	%f48, [DST]ASI_BLK_P
3091	add	DST, VIS_BLOCKSIZE, DST
3092	ba,pt	%ncc, 3f
3093	  nop
30942:
3095	ldda	[SRC + 0x08]%asi, %f18
3096	fsrc1	%f28, %f60
3097	ldda	[SRC + 0x10]%asi, %f20
3098	fsrc1	%f30, %f62
3099	stda	%f48, [DST]ASI_BLK_P
3100	ldda	[SRC + 0x18]%asi, %f22
3101	fsrc1	%f16, %f48
3102	ldda	[SRC + 0x20]%asi, %f24
3103	fsrc1	%f18, %f50
3104	ldda	[SRC + 0x28]%asi, %f26
3105	fsrc1	%f20, %f52
3106	ldda	[SRC + 0x30]%asi, %f28
3107	fsrc1	%f22, %f54
3108	ldda	[SRC + 0x38]%asi, %f30
3109	fsrc1	%f24, %f56
3110	sub	CNT, VIS_BLOCKSIZE, CNT
3111	add	DST, VIS_BLOCKSIZE, DST
3112	add	SRC, VIS_BLOCKSIZE, SRC
3113	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3114	fsrc1	%f26, %f58
3115	fsrc1	%f28, %f60
3116	fsrc1	%f30, %f62
3117	stda	%f48, [DST]ASI_BLK_P
3118	add	DST, VIS_BLOCKSIZE, DST
3119	ba,a,pt	%ncc, 4f
3120	  nop
3121
31223:	tst	CNT
3123	bz,a	%ncc, 4f
3124	  nop
3125
31265:	lduba	[REALSRC]ASI_USER, TMP
3127	inc	REALSRC
3128	inc	DST
3129	deccc	CNT
3130	bgu	%ncc, 5b
3131	  stb	TMP, [DST - 1]
31324:
3133
3134.copyin_exit:
/*
 * Normal exit.  %gsr is reloaded from the frame.  If FPRS_FEF was set in
 * the saved %fprs, BLD_FPQ2Q4_FROMSTACK runs (presumably reloading what
 * BST_FPQ2Q4_TOSTACK stored; macro not shown); otherwise FZEROQ2Q4 runs.
 * Both paths then write the saved %fprs back.  Finally FPUSED_FLAG is
 * cleared from %l6, that value is stored back into t_lofault, and 0 is
 * returned through the `restore`.
 */
3135	membar	#Sync
3136
3137	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
3138	wr	%o2, 0, %gsr
3139
3140	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3141	btst	FPRS_FEF, %o3
3142	bz,pt	%icc, 4f
3143	  nop
3144
3145	BLD_FPQ2Q4_FROMSTACK(%o2)
3146
3147	ba,pt	%ncc, 1f
3148	  wr	%o3, 0, %fprs		! restore fprs
3149
31504:
3151	FZEROQ2Q4
3152	wr	%o3, 0, %fprs		! restore fprs
3153
31541:
3155	membar	#Sync				! sync error barrier
3156	andn	%l6, FPUSED_FLAG, %l6
3157	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3158	FP_ALLOWMIGRATE(5, 6)
3159	ret
3160	  restore	%g0, 0, %o0
3161/*
3162 * We got here because of a fault during copyin
3163 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3164 */
3165.copyin_err:
/*
 * If the thread has a copyops vector (T_COPYOPS non-zero), jump to its
 * CP_COPYIN entry; the `restore` in the jmp delay slot pops this register
 * window first.  Otherwise return -1 in the caller's %o0.
 */
3166	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3167	tst	%o4
3168	bz,pt	%ncc, 2f			! if not, return error
3169	nop
3170	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
3171	jmp	%g2				! original arguments
3172	restore %g0, 0, %g0			! dispose of copy window
31732:
3174	ret
3175	restore %g0, -1, %o0			! return error value
3176
3177
3178	SET_SIZE(copyin_more)
3179
3180#endif	/* lint */
3181
/*
 * xcopyin(uaddr = %o0, kaddr = %o1, count = %o2)
 *
 * Entry-point dispatch for a user-to-kernel copy that reports an errno
 * value on failure.  The copy itself is done by code shared with copyin:
 * .sm_do_copyin (leaf path, defined outside this block) or .do_copyin
 * (FP block-copy path).
 *
 * Path selection:
 *   - count <= VIS_COPY_THRESHOLD: leaf path.
 *   - otherwise the low bits of (src ^ dst) select hw_copy_limit_8, _4, _2
 *     or _1; a limit of zero, or count <= limit, selects the leaf path, and
 *     anything larger goes to .xcopyin_more.
 *
 * Fault handling: both error labels hand the request to the thread's
 * CP_XCOPYIN copyops entry when T_COPYOPS is non-zero.  Otherwise the
 * FP path returns ERRNO and the leaf path returns %g1.
 * NOTE(review): the code that places the errno value in ERRNO / %g1 is
 * not visible in this block.
 */
3182#ifdef	lint
3183
3184/*ARGSUSED*/
3185int
3186xcopyin(const void *uaddr, void *kaddr, size_t count)
3187{ return (0); }
3188
3189#else	/* lint */
3190
3191	ENTRY(xcopyin)
3192
/*
 * The first bleu sends count <= VIS_COPY_THRESHOLD to the leaf path
 * (.xcopyin_small); only larger counts fall through.  The xor in its
 * delay slot always executes and leaves src ^ dst in %o3: if the low
 * n bits of that value are zero, src and dst have the same offset within
 * a 2^n-byte unit.
 */
3193	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3194	bleu,pt	%ncc, .xcopyin_small		! go to larger cases
3195	  xor	%o0, %o1, %o3			! are src, dst alignable?
3196	btst	7, %o3				!
3197	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
3198	  nop
3199	btst	1, %o3				!
3200	bz,pt	%ncc, .xcopyin_2		! check for half-word
3201	  nop
/*
 * Each limit test below has the same shape: load the 32-bit limit, take
 * the leaf path if it is zero, and otherwise compare count against it.
 * The cmp sits in the delay slot of the zero test and sets the condition
 * codes consumed by the following bleu.
 */
3202	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3203	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3204	tst	%o3
3205	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3206	  cmp	%o2, %o3			! if length <= limit
3207	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3208	  nop
3209	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3210	  nop
3211.xcopyin_2:
3212	btst	3, %o3				!
3213	bz,pt	%ncc, .xcopyin_4		! check for word alignment
3214	  nop
3215	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3216	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3217	tst	%o3
3218	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3219	  cmp	%o2, %o3			! if length <= limit
3220	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3221	  nop
3222	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3223	  nop
3224.xcopyin_4:
3225	! already checked longword, must be word aligned
3226	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3227	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3228	tst	%o3
3229	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3230	  cmp	%o2, %o3			! if length <= limit
3231	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3232	  nop
3233	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3234	  nop
3235.xcopyin_8:
3236	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3237	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3238	tst	%o3
3239	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3240	  cmp	%o2, %o3			! if length <= limit
3241	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3242	  nop
3243	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3244	  nop
3245
/*
 * Leaf path: the previous t_lofault is kept in %o4 and .sm_xcopyin_err is
 * installed in its place (the stn executes in the ba delay slot) before
 * joining .sm_do_copyin.
 */
3246.xcopyin_small:
3247	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3248	or	%o5, %lo(.sm_xcopyin_err), %o5
3249	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofaul
3250	membar	#Sync				! sync error barrier
3251	ba,pt	%ncc, .sm_do_copyin		! common code
3252	  stn	%o5, [THREAD_REG + T_LOFAULT]
3253
/*
 * FP path: take a new register window, load the address of .xcopyin_err
 * into REAL_LOFAULT (the `or` that completes it is in the ba delay slot)
 * and join .do_copyin.
 */
3254.xcopyin_more:
3255	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3256	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
3257	ba,pt	%ncc, .do_copyin
3258	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3259
3260/*
3261 * We got here because of fault during xcopyin
3262 * Errno value is in ERRNO
3263 */
3264.xcopyin_err:
3265	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3266	tst	%o4
3267	bz,pt	%ncc, 2f			! if not, return error
3268	  nop
3269	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
3270	jmp	%g2				! original arguments
3271	  restore %g0, 0, %g0			! dispose of copy window
32722:
3273        ret
3274	  restore ERRNO, 0, %o0			! return errno value
3275
/*
 * Leaf-path fault: put back the t_lofault value saved in %o4 and reload
 * the arguments from SM_SAVE_SRC / SM_SAVE_DST / SM_SAVE_COUNT.  With a
 * copyops vector present, jump to its CP_XCOPYIN entry; jmp does not
 * write %o7, so that routine returns to xcopyin's caller.  Otherwise
 * return the value in %g1.
 */
3276.sm_xcopyin_err:
3277
3278	membar	#Sync
3279	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3280	mov	SM_SAVE_SRC, %o0
3281	mov	SM_SAVE_DST, %o1
3282	mov	SM_SAVE_COUNT, %o2
3283	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
3284	tst	%o3
3285	bz,pt	%ncc, 3f			! if not, return error
3286	  nop
3287	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
3288	jmp	%o5				! original arguments
3289	  nop
32903:
3291	retl
3292	  or	%g1, 0, %o0		! return errno value
3293
3294	SET_SIZE(xcopyin)
3295
3296#endif	/* lint */
3297
/*
 * xcopyin_little(uaddr = %o0, kaddr = %o1, count = %o2)
 *
 * Leaf routine that copies count bytes one at a time, loading each byte
 * through ASI_AIUSL.  The source is read from its last byte down to its
 * first while the destination is written from its first byte up, so the
 * destination receives the source bytes in reverse order.
 *
 * .xcopyio_err is installed as t_lofault for the duration of the copy and
 * the previous value is kept in %o5.  Returns 0 on success; the fault
 * path returns whatever is in %g1.
 */
3298#ifdef	lint
3299
3300/*ARGSUSED*/
3301int
3302xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3303{ return (0); }
3304
3305#else	/* lint */
3306
3307	ENTRY(xcopyin_little)
3308	sethi	%hi(.xcopyio_err), %o5
3309	or	%o5, %lo(.xcopyio_err), %o5
3310	ldn	[THREAD_REG + T_LOFAULT], %o4
3311	membar	#Sync				! sync error barrier
3312	stn	%o5, [THREAD_REG + T_LOFAULT]
3313	mov	%o4, %o5
3314
/*
 * Index setup (n = count):
 *   %o3 = -n, counted up towards zero by the loop
 *   %o0 = uaddr + 2n - 1, so %o0 + %o3 addresses the last source byte
 *   %o1 = kaddr + n,      so %o1 + %o3 addresses the first destination byte
 * Each pass adds 1 to %o3 and subtracts 2 from %o0, which moves the
 * source address down by one and the destination address up by one.
 * The `sub` that computes n - 1 is in a non-annulled delay slot and also
 * executes when n is zero; its result is unused in that case.
 */
3315	subcc	%g0, %o2, %o3
3316	add	%o0, %o2, %o0
3317	bz,pn	%ncc, 2f		! check for zero bytes
3318	  sub	%o2, 1, %o4
3319	add	%o0, %o4, %o0		! start w/last byte
3320	add	%o1, %o2, %o1
3321	lduba	[%o0 + %o3]ASI_AIUSL, %o4
3322
/*
 * inccc sets carry only when %o3 wraps from -1 to 0, which ends the loop.
 * The next load is in the annulled delay slot of bcc, so it is issued
 * only when another pass follows.
 */
33231:	stb	%o4, [%o1 + %o3]
3324	inccc	%o3
3325	sub	%o0, 2, %o0		! get next byte
3326	bcc,a,pt %ncc, 1b
3327	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
3328
33292:
3330	membar	#Sync				! sync error barrier
3331	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3332	retl
3333	  mov	%g0, %o0		! return (0)
3334
/*
 * Fault path: reinstate the saved t_lofault (%o5) and return %g1.
 * NOTE(review): %g1 presumably carries the errno value, as in xcopyin;
 * the code that sets it is not visible here.
 */
3335.xcopyio_err:
3336	membar	#Sync				! sync error barrier
3337	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3338	retl
3339	  mov	%g1, %o0
3340
3341	SET_SIZE(xcopyin_little)
3342
3343#endif	/* lint */
3344
3345
3346/*
3347 * Copy a block of storage - must not overlap (from + len <= to).
3348 * No fault handler installed (to be called under on_fault())
3349 */
/*
 * copyin_noerr(ufrom = %o0, kto = %o1, count = %o2)
 *
 * Dispatches to the same copy code as xcopyin (.sm_do_copyin for the leaf
 * path, .do_copyin for the FP path) using the same VIS_COPY_THRESHOLD and
 * hw_copy_limit_{1,2,4,8} tests, but never returns an error itself:
 *
 *   - Leaf path: if t_lofault is zero on entry it is left untouched.
 *     Otherwise its value is kept in %o4 and .sm_copyio_noerr is
 *     installed; on a fault that label puts the saved value back and
 *     jumps to it.
 *   - FP path: REAL_LOFAULT is set to .copyio_noerr, which pops the
 *     register window and jumps to the address in %l6.  %l6 is loaded
 *     from t_lofault at .do_copyin; presumably FPUSED_FLAG has been
 *     cleared from it by the time .copyio_noerr runs -- that code is not
 *     visible here.
 *
 * .copyio_noerr and .sm_copyio_noerr are also referenced by
 * copyout_noerr.
 */
3350#if defined(lint)
3351
3352/* ARGSUSED */
3353void
3354copyin_noerr(const void *ufrom, void *kto, size_t count)
3355{}
3356
3357#else	/* lint */
3358	ENTRY(copyin_noerr)
3359
/*
 * The first bleu sends count <= VIS_COPY_THRESHOLD to the leaf path
 * (.copyin_ne_small); only larger counts fall through.  The xor in its
 * delay slot leaves src ^ dst in %o3 for the alignment tests.
 */
3360	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3361	bleu,pt	%ncc, .copyin_ne_small		! go to larger cases
3362	  xor	%o0, %o1, %o3			! are src, dst alignable?
3363	btst	7, %o3				!
3364	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
3365	  nop
3366	btst	1, %o3				!
3367	bz,pt	%ncc, .copyin_ne_2		! check for half-word
3368	  nop
3369	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3370	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3371	tst	%o3
3372	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3373	  cmp	%o2, %o3			! if length <= limit
3374	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3375	  nop
3376	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3377	  nop
3378.copyin_ne_2:
3379	btst	3, %o3				!
3380	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
3381	  nop
3382	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3383	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3384	tst	%o3
3385	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3386	  cmp	%o2, %o3			! if length <= limit
3387	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3388	  nop
3389	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3390	  nop
3391.copyin_ne_4:
3392	! already checked longword, must be word aligned
3393	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3394	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3395	tst	%o3
3396	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3397	  cmp	%o2, %o3			! if length <= limit
3398	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3399	  nop
3400	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3401	  nop
3402.copyin_ne_8:
3403	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3404	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3405	tst	%o3
3406	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3407	  cmp	%o2, %o3			! if length <= limit
3408	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3409	  nop
3410	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3411	  nop
3412
/*
 * Leaf path.  %o4 = t_lofault on entry.  If it is zero, go straight to
 * .sm_do_copyin without touching t_lofault; otherwise install
 * .sm_copyio_noerr (the stn is in the ba delay slot).
 */
3413.copyin_ne_small:
3414	ldn	[THREAD_REG + T_LOFAULT], %o4
3415	tst	%o4
3416	bz,pn	%ncc, .sm_do_copyin
3417	  nop
3418	sethi	%hi(.sm_copyio_noerr), %o5
3419	or	%o5, %lo(.sm_copyio_noerr), %o5
3420	membar	#Sync				! sync error barrier
3421	ba,pt	%ncc, .sm_do_copyin
3422	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3423
3424.copyin_noerr_more:
3425	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3426	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3427	ba,pt	%ncc, .do_copyin
3428	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3429
/*
 * FP-path fault target: jump to the address in %l6; the `restore` in the
 * delay slot pops the window taken at .copyin_noerr_more (or
 * .copyout_noerr_more).
 */
3430.copyio_noerr:
3431	jmp	%l6
3432	  restore %g0,0,%g0
3433
/*
 * Leaf-path fault target: store the saved handler (%o4) back into
 * t_lofault and jump to it.
 */
3434.sm_copyio_noerr:
3435	membar	#Sync
3436	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
3437	jmp	%o4
3438	  nop
3439
3440	SET_SIZE(copyin_noerr)
3441#endif /* lint */
3442
3443/*
3444 * Copy a block of storage - must not overlap (from + len <= to).
3445 * No fault handler installed (to be called under on_fault())
3446 */
3447
/*
 * copyout_noerr(kfrom = %o0, uto = %o1, count = %o2)
 *
 * Entry-point dispatch only; the copy is done by .sm_do_copyout (leaf
 * path) or .do_copyout (FP path), both defined outside this block.
 *
 * Path selection:
 *   - count <= VIS_COPY_THRESHOLD: leaf path.
 *   - otherwise the low bits of (src ^ dst) select hw_copy_limit_8, _4, _2
 *     or _1; a limit of zero, or count <= limit, selects the leaf path, and
 *     anything larger goes to .copyout_noerr_more.
 *
 * Fault handling uses the labels defined in copyin_noerr:
 * .sm_copyio_noerr for the leaf path (installed only when t_lofault is
 * non-zero on entry) and .copyio_noerr, via REAL_LOFAULT, for the FP path.
 */
3448#if defined(lint)
3449
3450/* ARGSUSED */
3451void
3452copyout_noerr(const void *kfrom, void *uto, size_t count)
3453{}
3454
3455#else	/* lint */
3456	ENTRY(copyout_noerr)
3457
/*
 * The first bleu sends count <= VIS_COPY_THRESHOLD to the leaf path
 * (.copyout_ne_small); only larger counts fall through.  The xor in its
 * delay slot leaves src ^ dst in %o3 for the alignment tests.
 */
3458	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3459	bleu,pt	%ncc, .copyout_ne_small		! go to larger cases
3460	  xor	%o0, %o1, %o3			! are src, dst alignable?
3461	btst	7, %o3				!
3462	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
3463	  nop
3464	btst	1, %o3				!
3465	bz,pt	%ncc, .copyout_ne_2		! check for half-word
3466	  nop
3467	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3468	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3469	tst	%o3
3470	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3471	  cmp	%o2, %o3			! if length <= limit
3472	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3473	  nop
3474	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3475	  nop
3476.copyout_ne_2:
3477	btst	3, %o3				!
3478	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
3479	  nop
3480	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3481	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3482	tst	%o3
3483	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3484	  cmp	%o2, %o3			! if length <= limit
3485	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3486	  nop
3487	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3488	  nop
3489.copyout_ne_4:
3490	! already checked longword, must be word aligned
3491	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3492	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3493	tst	%o3
3494	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3495	  cmp	%o2, %o3			! if length <= limit
3496	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3497	  nop
3498	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3499	  nop
3500.copyout_ne_8:
3501	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3502	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3503	tst	%o3
3504	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3505	  cmp	%o2, %o3			! if length <= limit
3506	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3507	  nop
3508	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3509	  nop
3510
/*
 * Leaf path.  %o4 = t_lofault on entry.  If it is zero, go straight to
 * .sm_do_copyout without touching t_lofault; otherwise install
 * .sm_copyio_noerr.  The final stn is the delay slot of the ba even
 * though it is not indented like the other delay slots in this file.
 */
3511.copyout_ne_small:
3512	ldn	[THREAD_REG + T_LOFAULT], %o4
3513	tst	%o4
3514	bz,pn	%ncc, .sm_do_copyout
3515	  nop
3516	sethi	%hi(.sm_copyio_noerr), %o5
3517	or	%o5, %lo(.sm_copyio_noerr), %o5
3518	membar	#Sync				! sync error barrier
3519	ba,pt	%ncc, .sm_do_copyout
3520	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3521
/*
 * FP path: new register window, REAL_LOFAULT = .copyio_noerr (the `or`
 * completing the address is in the ba delay slot), then join .do_copyout.
 */
3522.copyout_noerr_more:
3523	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3524	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3525	ba,pt	%ncc, .do_copyout
3526	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3527
3528	SET_SIZE(copyout_noerr)
3529#endif /* lint */
3530
3531
3532/*
3533 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3534 * at least 256 bytes in length using spitfire's block stores.  If
3535 * the criteria for using this routine are not met then it calls bzero
3536 * and returns 1.  Otherwise 0 is returned indicating success.
3537 * Caller is responsible for ensuring use_hw_bzero is true and that
3538 * kpreempt_disable() has been called.
3539 */
/*
 * hwblkclr(addr = %i0, len = %i1)
 *
 * Returns 1 after handing the request to bzero when addr is not
 * VIS_BLOCKSIZE aligned, len is below 256, or len is not a multiple of
 * VIS_BLOCKSIZE.  Otherwise the region is zeroed with block stores of
 * %d0 through ASI_BLK_P and 0 is returned.
 *
 * If FPRS_FEF was set on entry, the block of FP registers starting at %d0
 * is saved in the stack frame before being zeroed and reloaded before
 * returning.  %fprs is restored to its entry value in both cases.
 */
3540#ifdef lint
3541/*ARGSUSED*/
3542int
3543hwblkclr(void *addr, size_t len)
3544{
3545	return(0);
3546}
3547#else /* lint */
3548	! %i0 - start address
3549	! %i1 - length of region (multiple of 64)
3550	! %l0 - saved fprs
3551	! %l1 - pointer to saved %d0 block
3552	! %l2 - saved curthread->t_lwp
	/* NOTE(review): %l2 is not referenced anywhere in the code below. */
3553
3554	ENTRY(hwblkclr)
3555	! get another window w/space for one aligned block of saved fpregs
3556	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3557
3558	! Must be block-aligned
3559	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
3560	bnz,pn	%ncc, 1f
3561	  nop
3562
3563	! ... and must be 256 bytes or more
3564	cmp	%i1, 256
3565	blu,pn	%ncc, 1f
3566	  nop
3567
3568	! ... and length must be a multiple of VIS_BLOCKSIZE
3569	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
3570	bz,pn	%ncc, 2f
3571	  nop
3572
35731:	! punt, call bzero but notify the caller that bzero was used
3574	mov	%i0, %o0
3575	call	bzero
3576	mov	%i1, %o1
3577	ret
3578	  restore	%g0, 1, %o0 ! return (1) - did not use block operations
3579
35802:	rd	%fprs, %l0		! check for unused fp
3581	btst	FPRS_FEF, %l0
3582	bz,pt	%icc, 1f
3583	  nop
3584
3585	! save in-use fpregs on stack
	/*
	 * %l1 = (%fp + STACK_BIAS - 65) rounded down to a VIS_BLOCKSIZE
	 * boundary.  Assuming VIS_BLOCKSIZE is 64 (the literal used by the
	 * tail code further down), that block lies inside the
	 * 2*VIS_BLOCKSIZE bytes reserved by the save above.
	 */
3586	membar	#Sync
3587	add	%fp, STACK_BIAS - 65, %l1
3588	and	%l1, -VIS_BLOCKSIZE, %l1
3589	stda	%d0, [%l1]ASI_BLK_P
3590
35911:	membar	#StoreStore|#StoreLoad|#LoadStore
3592	wr	%g0, FPRS_FEF, %fprs
3593	wr	%g0, ASI_BLK_P, %asi
3594
3595	! Clear block
3596	fzero	%d0
3597	fzero	%d2
3598	fzero	%d4
3599	fzero	%d6
3600	fzero	%d8
3601	fzero	%d10
3602	fzero	%d12
3603	fzero	%d14
3604
	/*
	 * %i3 is the number of bytes cleared by the pass in flight:
	 * .pz_zinst adds it to the address and subtracts it from the
	 * remaining length.  It is 256 for full passes.
	 */
3605	mov	256, %i3
3606	ba,pt	%ncc, .pz_doblock
3607	  nop
3608
	/*
	 * Four block stores clear 256 bytes, highest offset first; the
	 * first of them is issued from the annulled delay slot at
	 * .pz_doblock.  The tail code jumps into the middle of this run
	 * by address arithmetic, so no instruction may be added between
	 * .pz_blkstart and .pz_zinst.
	 */
3609.pz_blkstart:
3610      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
3611	stda	%d0, [%i0 + 128]%asi
3612	stda	%d0, [%i0 + 64]%asi
3613	stda	%d0, [%i0]%asi
3614.pz_zinst:
3615	add	%i0, %i3, %i0
3616	sub	%i1, %i3, %i1
3617.pz_doblock:
3618	cmp	%i1, 256
3619	bgeu,a	%ncc, .pz_blkstart
3620	  stda	%d0, [%i0 + 192]%asi
3621
	/*
	 * Fewer than 256 bytes remain.  Below 64 the clearing is done.
	 * Otherwise %i3 = remaining length rounded down to a multiple of 64
	 * (the andn executes in the non-annulled blu delay slot) and
	 * %i2 = %i3 / 16, i.e. 4 bytes of instruction for every 64-byte
	 * block.  Jumping to .pz_zinst - %i2 therefore executes exactly
	 * %i3 / 64 of the stda instructions above before falling into
	 * .pz_zinst.
	 */
3622	cmp	%i1, 64
3623	blu	%ncc, .pz_finish
3624
3625	  andn	%i1, (64-1), %i3
3626	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
3627	set	.pz_zinst, %i4
3628	sub	%i4, %i2, %i4
3629	jmp	%i4
3630	  nop
3631
3632.pz_finish:
	/*
	 * If FPRS_FEF was clear on entry nothing was saved: %fprs is put
	 * back from the annulled delay slot, which executes only when the
	 * branch is taken.  Otherwise the saved %d0 block is reloaded
	 * before %fprs is restored.
	 */
3633	membar	#Sync
3634	btst	FPRS_FEF, %l0
3635	bz,a	.pz_finished
3636	  wr	%l0, 0, %fprs		! restore fprs
3637
3638	! restore fpregs from stack
3639	ldda	[%l1]ASI_BLK_P, %d0
3640	membar	#Sync
3641	wr	%l0, 0, %fprs		! restore fprs
3642
3643.pz_finished:
3644	ret
3645	  restore	%g0, 0, %o0		! return (bzero or not)
3646
3647	SET_SIZE(hwblkclr)
3648#endif	/* lint */
3649
/*
 * hw_pa_bcopy32(src = %o0, dst = %o1)
 *
 * Leaf routine (no register window is taken).  Clears PSTATE_IE for the
 * duration of the copy, reads four 8-byte values through ASI_MEM, issues
 * membar #Sync, writes the four values through ASI_MEM, and writes the
 * entry %pstate back in the retl delay slot.
 *
 * Uses %g1 (entry %pstate), %g2 and %o2-%o5 as scratch and leaves %o0 and
 * %o1 advanced by 24.
 */
3650#ifdef lint
3651/*ARGSUSED*/
3652void
3653hw_pa_bcopy32(uint64_t src, uint64_t dst)
3654{}
3655#else /*!lint */
3656	/*
3657	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3658	 * using physical addresses.
3659	 */
3660	ENTRY_NP(hw_pa_bcopy32)
3661	rdpr	%pstate, %g1
3662	andn	%g1, PSTATE_IE, %g2
3663	wrpr	%g0, %g2, %pstate
3664
	/*
	 * The value read here is discarded (%g0).
	 * NOTE(review): presumably this read makes sure the %pstate write
	 * above has taken effect before the loads start -- confirm.
	 */
3665	rdpr	%pstate, %g0
3666	ldxa	[%o0]ASI_MEM, %o2
3667	add	%o0, 8, %o0
3668	ldxa	[%o0]ASI_MEM, %o3
3669	add	%o0, 8, %o0
3670	ldxa	[%o0]ASI_MEM, %o4
3671	add	%o0, 8, %o0
3672	ldxa	[%o0]ASI_MEM, %o5
3673	membar	#Sync
3674
	/* All 32 bytes are held in %o2-%o5 before the first store is issued. */
3675	stxa	%o2, [%o1]ASI_MEM
3676	add	%o1, 8, %o1
3677	stxa	%o3, [%o1]ASI_MEM
3678	add	%o1, 8, %o1
3679	stxa	%o4, [%o1]ASI_MEM
3680	add	%o1, 8, %o1
3681	stxa	%o5, [%o1]ASI_MEM
3682
3683	retl
3684	  wrpr	  %g0, %g1, %pstate
3685
3686	SET_SIZE(hw_pa_bcopy32)
3687
3688#endif /* lint */
3689
/*
 * Global tunables defined by this file.  The lint branch gives their C
 * declarations; the assembly branch emits each one as a single .word via
 * the DGDEF macro (not defined in the visible code).  Both branches use
 * the same initial values.
 *
 *   use_hw_bcopy, use_hw_bzero	initially 1.  The hwblkclr header comment
 *				makes the caller responsible for checking
 *				use_hw_bzero.
 *   hw_copy_limit_1/2/4/8	initially 0.  The copy entry points load
 *				these as 32-bit values and treat zero as
 *				"never use the FP copy path" for that
 *				alignment class; otherwise only copies
 *				longer than the limit take the FP path.
 *
 * NOTE(review): with all limits at zero the FP path is never selected,
 * so the limits are presumably assigned elsewhere at run time -- confirm.
 *
 * The trailing .align / .section directives switch back to the text
 * section after the data words.
 */
3690#if defined(lint)
3691
3692int use_hw_bcopy = 1;
3693int use_hw_bzero = 1;
3694uint_t hw_copy_limit_1 = 0;
3695uint_t hw_copy_limit_2 = 0;
3696uint_t hw_copy_limit_4 = 0;
3697uint_t hw_copy_limit_8 = 0;
3698
3699#else /* !lint */
3700
3701	DGDEF(use_hw_bcopy)
3702	.word	1
3703	DGDEF(use_hw_bzero)
3704	.word	1
3705	DGDEF(hw_copy_limit_1)
3706	.word	0
3707	DGDEF(hw_copy_limit_2)
3708	.word	0
3709	DGDEF(hw_copy_limit_4)
3710	.word	0
3711	DGDEF(hw_copy_limit_8)
3712	.word	0
3713
3714	.align	64
3715	.section ".text"
3716#endif /* !lint */
3717