xref: /titanic_52/usr/src/uts/sun4u/cpu/opl_olympus_copy.s (revision 5accf66f88a4d513d122f3df4103820499970a82)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23 * Use is subject to license terms.
24 */
25
26#pragma ident	"%Z%%M%	%I%	%E% SMI"
27
28#include <sys/param.h>
29#include <sys/errno.h>
30#include <sys/asm_linkage.h>
31#include <sys/vtrace.h>
32#include <sys/machthread.h>
33#include <sys/clock.h>
34#include <sys/asi.h>
35#include <sys/fsr.h>
36#include <sys/privregs.h>
37
38#if !defined(lint)
39#include "assym.h"
40#endif	/* lint */
41
42/*
43 * Pseudo-code to aid in understanding the control flow of the
44 * bcopy/copyin/copyout routines.
45 *
46 * On entry:
47 *
48 * 	! Determine whether to use the FP register version
49 * 	! or the leaf routine version depending on size
50 * 	! of copy and flags.  Set up error handling accordingly.
51 *	! The transition point depends on whether the src and
52 * 	! dst addresses can be aligned to long word, word,
53 * 	! half word, or byte boundaries.
54 *	!
55 *	! WARNING: <Register usage convention>
56 *	! For FP version, %l6 holds previous error handling and
57 *	! a flag: TRAMP_FLAG (low bits)
58 *	! for leaf routine version, %o4 holds those values.
59 *	! So either %l6 or %o4 is reserved and not available for
60 *	! any other use.
61 *
62 * 	if (length <= VIS_COPY_THRESHOLD) 	! start with a quick test
63 * 		go to small_copy;		! to speed short copies
64 *
 * 	if (src,dst long word alignable) {
66 * 		if (hw_copy_limit_8 == 0) 	! hw_copy disabled
67 * 			go to small_copy;
68 *		if (length <= hw_copy_limit_8)
69 * 			go to small_copy;
70 * 		go to FPBLK_copy;
71 * 	}
72 * 	if (src,dst not alignable) {
73 * 		if (hw_copy_limit_1 == 0) 	! hw_copy disabled
74 * 			go to small_copy;
75 *		if (length <= hw_copy_limit_1)
76 * 			go to small_copy;
77 * 		go to FPBLK_copy;
78 * 	}
79 * 	if (src,dst halfword alignable) {
80 * 		if (hw_copy_limit_2 == 0) 	! hw_copy disabled
81 * 			go to small_copy;
82 *		if (length <= hw_copy_limit_2)
83 * 			go to small_copy;
84 * 		go to FPBLK_copy;
85 * 	}
86 * 	if (src,dst word alignable) {
87 * 		if (hw_copy_limit_4 == 0) 	! hw_copy disabled
88 * 			go to small_copy;
89 *		if (length <= hw_copy_limit_4)
90 * 			go to small_copy;
91 * 		go to FPBLK_copy;
92 * 	}
93 *
94 * small_copy:
95 *	Setup_leaf_rtn_error_handler; 		! diffs for each entry point
96 *
97 *	if (count <= 3)				! fast path for tiny copies
98 *		go to sm_left;			! special finish up code
99 *	else
100 *		if (count > CHKSIZE)		! medium sized copies
101 *			go to sm_med		! tuned by alignment
102 *		if(src&dst not both word aligned) {
103 *	sm_movebytes:
104 *			move byte by byte in 4-way unrolled loop
105 *			fall into sm_left;
106 *	sm_left:
107 *			move 0-3 bytes byte at a time as needed.
108 *			restore error handler and exit.
109 *
110 * 		} else {	! src&dst are word aligned
111 *			check for at least 8 bytes left,
112 *			move word at a time, unrolled by 2
113 *			when fewer than 8 bytes left,
114 *	sm_half:	move half word at a time while 2 or more bytes left
115 *	sm_byte:	move final byte if necessary
116 *	sm_exit:
117 *			restore error handler and exit.
118 *		}
119 *
120 * ! Medium length cases with at least CHKSIZE bytes available
121 * ! method: line up src and dst as best possible, then
122 * ! move data in 4-way unrolled loops.
123 *
124 * sm_med:
125 *	if(src&dst unalignable)
126 * 		go to sm_movebytes
127 *	if(src&dst halfword alignable)
128 *		go to sm_movehalf
129 *	if(src&dst word alignable)
130 *		go to sm_moveword
131 * ! fall into long word movement
132 *	move bytes until src is word aligned
133 *	if not long word aligned, move a word
134 *	move long words in 4-way unrolled loop until < 32 bytes left
135 *      move long words in 1-way unrolled loop until < 8 bytes left
136 *	if zero bytes left, goto sm_exit
137 *	if one byte left, go to sm_byte
138 *	else go to sm_half
139 *
140 * sm_moveword:
141 *	move bytes until src is word aligned
142 *	move words in 4-way unrolled loop until < 16 bytes left
143 *      move words in 1-way unrolled loop until < 4 bytes left
144 *	if zero bytes left, goto sm_exit
145 *	if one byte left, go to sm_byte
146 *	else go to sm_half
147 *
148 * sm_movehalf:
149 *	move a byte if needed to align src on halfword
150 *	move halfwords in 4-way unrolled loop until < 8 bytes left
151 *	if zero bytes left, goto sm_exit
152 *	if one byte left, go to sm_byte
153 *	else go to sm_half
154 *
155 *
156 * FPBLK_copy:
157 * 	%l6 = curthread->t_lofault;
158 * 	if (%l6 != NULL) {
159 * 		membar #Sync
160 * 		curthread->t_lofault = .copyerr;
161 * 		caller_error_handler = TRUE             ! %l6 |= 2
162 * 	}
163 *
164 *	! for FPU testing we must not migrate cpus
165 * 	if (curthread->t_lwp == NULL) {
166 *		! Kernel threads do not have pcb's in which to store
167 *		! the floating point state, so disallow preemption during
168 *		! the copy.  This also prevents cpu migration.
169 * 		kpreempt_disable(curthread);
170 *	} else {
171 *		thread_nomigrate();
172 *	}
173 *
174 * 	old_fprs = %fprs;
175 * 	old_gsr = %gsr;
176 * 	if (%fprs.fef) {
177 * 		%fprs.fef = 1;
178 * 		save current fpregs on stack using blockstore
179 * 	} else {
180 * 		%fprs.fef = 1;
181 * 	}
182 *
183 *
184 * 	do_blockcopy_here;
185 *
186 * In lofault handler:
187 *	curthread->t_lofault = .copyerr2;
188 *	Continue on with the normal exit handler
189 *
190 * On normal exit:
191 * 	%gsr = old_gsr;
192 * 	if (old_fprs & FPRS_FEF)
193 * 		restore fpregs from stack using blockload
194 *	else
195 *		zero fpregs
196 * 	%fprs = old_fprs;
197 * 	membar #Sync
198 * 	curthread->t_lofault = (%l6 & ~3);
199 *	! following test omitted from copyin/copyout as they
200 *	! will always have a current thread
201 * 	if (curthread->t_lwp == NULL)
202 *		kpreempt_enable(curthread);
203 *	else
204 *		thread_allowmigrate();
205 * 	return (0)
206 *
207 * In second lofault handler (.copyerr2):
208 *	We've tried to restore fp state from the stack and failed.  To
209 *	prevent from returning with a corrupted fp state, we will panic.
210 */
211
212/*
213 * Comments about optimization choices
214 *
215 * The initial optimization decision in this code is to determine
216 * whether to use the FP registers for a copy or not.  If we don't
217 * use the FP registers, we can execute the copy as a leaf routine,
218 * saving a register save and restore.  Also, less elaborate setup
219 * is required, allowing short copies to be completed more quickly.
220 * For longer copies, especially unaligned ones (where the src and
221 * dst do not align to allow simple ldx,stx operation), the FP
222 * registers allow much faster copy operations.
223 *
224 * The estimated extra cost of the FP path will vary depending on
225 * src/dst alignment, dst offset from the next 64 byte FPblock store
226 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
231 * longer copies and only benefit a small portion of medium sized
232 * copies.  Rather than incur such cost, we chose fixed transition
233 * points for each of the alignment choices.
234 *
235 * For the inner loop, here is a comparison of the per cache line
236 * costs for each alignment when src&dst are in cache:
237 *
238 * byte aligned:  108 clocks slower for non-FPBLK
239 * half aligned:   44 clocks slower for non-FPBLK
240 * word aligned:   12 clocks slower for non-FPBLK
241 * long aligned:    4 clocks >>faster<< for non-FPBLK
242 *
243 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is already in cache or there is too little
245 * data to gain much benefit from prefetching.  But when there
246 * is more data and that data is not in cache, failing to prefetch
247 * can run much slower.  In addition, there is a 2 Kbyte store queue
248 * which will cause the non-FPBLK inner loop to slow for larger copies.
249 * The exact tradeoff is strongly load and application dependent, with
250 * increasing risk of a customer visible performance regression if the
251 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache
252 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
253 * upper limit for the non-FPBLK code.  To minimize performance regression
254 * risk while still gaining the primary benefits of the improvements to
255 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
256 * hw_copy_limit_*.  Later experimental studies using different values
257 * of hw_copy_limit_* can be used to make further adjustments if
258 * appropriate.
259 *
260 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
261 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
262 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
263 * hw_copy_limit_8 = src and dst are longword aligned
264 *
265 * To say that src and dst are word aligned means that after
266 * some initial alignment activity of moving 0 to 3 bytes,
267 * both the src and dst will be on word boundaries so that
268 * word loads and stores may be used.
269 *
270 * Default values at May,2005 are:
271 * hw_copy_limit_1 =  256
272 * hw_copy_limit_2 =  512
273 * hw_copy_limit_4 = 1024
274 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
275 *
276 *
277 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
278 * disabled for that alignment choice.
279 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
280 * the value of VIS_COPY_THRESHOLD is used.
281 * It is not envisioned that hw_copy_limit_? will be changed in the field
282 * It is provided to allow for disabling FPBLK copies and to allow
283 * easy testing of alternate values on future HW implementations
284 * that might have different cache sizes, clock rates or instruction
285 * timing rules.
286 *
287 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
288 * threshold to speedup all shorter copies (less than 256).  That
289 * saves an alignment test, memory reference, and enabling test
290 * for all short copies, or an estimated 24 clocks.
291 *
292 * The order in which these limits are checked does matter since each
293 * non-predicted tst and branch costs around 10 clocks.
294 * If src and dst are randomly selected addresses,
295 * 4 of 8 will not be alignable.
296 * 2 of 8 will be half word alignable.
297 * 1 of 8 will be word alignable.
298 * 1 of 8 will be long word alignable.
299 * But, tests on running kernels show that src and dst to copy code
300 * are typically not on random alignments.  Structure copies and
301 * copies of larger data sizes are often on long word boundaries.
302 * So we test the long word alignment case first, then
303 * the byte alignment, then halfword, then word alignment.
304 *
305 * Several times, tests for length are made to split the code
306 * into subcases.  These tests often allow later tests to be
307 * avoided.  For example, within the non-FPBLK copy, we first
308 * check for tiny copies of 3 bytes or less.  That allows us
309 * to use a 4-way unrolled loop for the general byte copy case
310 * without a test on loop entry.
311 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
312 * vs longer cases.  For the really short case, we don't attempt
313 * align src and dst.  We try to minimize special case tests in
314 * the shortest loops as each test adds a significant percentage
315 * to the total time.
316 *
317 * For the medium sized cases, we allow ourselves to adjust the
318 * src and dst alignment and provide special cases for each of
319 * the four adjusted alignment cases. The CHKSIZE that was used
320 * to decide between short and medium size was chosen to be 39
321 * as that allows for the worst case of 7 bytes of alignment
322 * shift and 4 times 8 bytes for the first long word unrolling.
323 * That knowledge saves an initial test for length on entry into
324 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
326 *
327 * For all cases in the non-FPBLK code where it is known that at
328 * least 4 chunks of data are available for movement, the
329 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
330 * or 2 clocks per data element.
331 *
 * Instruction alignment is forced by use of .align 16 directives
333 * and nops which are not executed in the code.  This
334 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
336 * fall within the minimum number of 4 instruction fetch groups.
337 * If instructions are inserted or removed between the .align
338 * instruction and the unrolled loops, then the alignment needs
339 * to be readjusted.  Misaligned loops can add a clock per loop
340 * iteration to the loop timing.
341 *
342 * In a few cases, code is duplicated to avoid a branch.  Since
343 * a non-predicted tst and branch takes 10 clocks, this savings
344 * is judged an appropriate time-space tradeoff.
345 *
346 * Within the FPBLK-code, the prefetch method in the inner
347 * loop needs to be explained as it is not standard.  Two
348 * prefetches are issued for each cache line instead of one.
349 * The primary one is at the maximum reach of 8 cache lines.
350 * Most of the time, that maximum prefetch reach gives the
351 * cache line more time to reach the processor for systems with
352 * higher processor clocks.  But, sometimes memory interference
353 * can cause that prefetch to be dropped.  Putting a second
354 * prefetch at a reach of 5 cache lines catches the drops
355 * three iterations later and shows a measured improvement
356 * in performance over any similar loop with a single prefetch.
357 * The prefetches are placed in the loop so they overlap with
358 * non-memory instructions, so that there is no extra cost
359 * when the data is already in-cache.
360 *
361 */
362
363/*
364 * Notes on preserving existing fp state and on membars.
365 *
366 * When a copyOP decides to use fp we may have to preserve existing
367 * floating point state.  It is not the caller's state that we need to
368 * preserve - the rest of the kernel does not use fp and, anyway, fp
369 * registers are volatile across a call.  Some examples:
370 *
371 *	- userland has fp state and is interrupted (device interrupt
372 *	  or trap) and within the interrupt/trap handling we use
373 *	  bcopy()
374 *	- another (higher level) interrupt or trap handler uses bcopy
375 *	  while a bcopy from an earlier interrupt is still active
376 *	- an asynchronous error trap occurs while fp state exists (in
377 *	  userland or in kernel copy) and the tl0 component of the handling
378 *	  uses bcopy
379 *	- a user process with fp state incurs a copy-on-write fault and
380 *	  hwblkpagecopy always uses fp
381 *
382 * We therefore need a per-call place in which to preserve fp state -
383 * using our stack is ideal (and since fp copy cannot be leaf optimized
384 * because of calls it makes, this is no hardship).
385 *
 * When we have finished fp copy (with its repeated block stores)
387 * we must membar #Sync so that our block stores may complete before
388 * we either restore the original fp state into the fp registers or
389 * return to a caller which may initiate other fp operations that could
390 * modify the fp regs we used before the block stores complete.
391 *
392 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
393 * t_lofault is not NULL will not panic but will instead trampoline
394 * to the registered lofault handler.  There is no need for any
395 * membars for these - eg, our store to t_lofault will always be visible to
396 * ourselves and it is our cpu which will take any trap.
397 *
398 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
399 * while t_lofault is not NULL will also not panic.  Since we're copying
400 * to or from userland the extent of the damage is known - the destination
401 * buffer is incomplete.  So trap handlers will trampoline to the lofault
402 * handler in this case which should take some form of error action to
403 * avoid using the incomplete buffer.  The trap handler also flags the
404 * fault so that later return-from-trap handling (for the trap that brought
405 * this thread into the kernel in the first place) can notify the process
406 * and reboot the system (or restart the service with Greenline/Contracts).
407 *
408 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
409 * result in deferred error traps - the trap is taken sometime after
410 * the event and the trap PC may not be the PC of the faulting access.
411 * Delivery of such pending traps can be forced by a membar #Sync, acting
412 * as an "error barrier" in this role.  To accurately apply the user/kernel
413 * separation described in the preceding paragraph we must force delivery
414 * of deferred traps affecting kernel state before we install a lofault
415 * handler (if we interpose a new lofault handler on an existing one there
416 * is no need to repeat this), and we must force delivery of deferred
417 * errors affecting the lofault-protected region before we clear t_lofault.
418 * Failure to do so results in lost kernel state being interpreted as
419 * affecting a copyin/copyout only, or of an error that really only
420 * affects copy data being interpreted as losing kernel state.
421 *
422 * Since the copy operations may preserve and later restore floating
423 * point state that does not belong to the caller (see examples above),
424 * we must be careful in how we do this in order to prevent corruption
425 * of another program.
426 *
427 * To make sure that floating point state is always saved and restored
428 * correctly, the following "big rules" must be followed when the floating
429 * point registers will be used:
430 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    bit 0 (FPUSED_FLAG, value 1) indicates that the floating point registers
 *    are in use.  Bit 1 (TRAMP_FLAG, value 2) indicates that the call was to
 *    bcopy, and a lofault handler was set coming in.
435 *
436 * 2. The FPUSED flag indicates that all FP state has been successfully stored
437 *    on the stack.  It should not be set until this save has been completed.
438 *
439 * 3. The FPUSED flag should not be cleared on exit until all FP state has
440 *    been restored from the stack.  If an error occurs while restoring
441 *    data from the stack, the error handler can check this flag to see if
442 *    a restore is necessary.
443 *
444 * 4. Code run under the new lofault handler must be kept to a minimum.  In
445 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
446 *    to kpreempt(), should not be made until after the lofault handler has
447 *    been restored.
448 */
449
450/*
451 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
452 * to "break even" using FP/VIS-accelerated memory operations.
453 * The FPBLK code assumes a minimum number of bytes are available
454 * to be moved on entry.  Check that code carefully before
455 * reducing VIS_COPY_THRESHOLD below 256.
456 */
457/*
458 * This shadows sys/machsystm.h which can't be included due to the lack of
459 * _ASM guards in include files it references. Change it here, change it there.
460 */
/* Copies of at most this many bytes always take the non-FP leaf path. */
#define VIS_COPY_THRESHOLD 256
462
463/*
464 * TEST for very short copies
465 * Be aware that the maximum unroll for the short unaligned case
466 * is SHORTCOPY+1
467 */
#define SHORTCOPY 3	/* counts up to this go straight to the finish-up code */
#define CHKSIZE  39	/* 7 bytes of alignment shift + 4 * 8 unrolled bytes */
470
471/*
472 * Indicates that we're to trampoline to the error handler.
473 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
474 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
475 */
/*
 * Flags kept in the low two bits of the saved lofault handler value
 * (%l6 in the FP version, %o4 in the leaf version).
 */
#define	FPUSED_FLAG	1	/* FP state saved on stack; FP regs in use */
#define	TRAMP_FLAG	2	/* on fault, jump to the saved handler */
#define	MASK_FLAGS	3	/* FPUSED_FLAG | TRAMP_FLAG */
479
480/*
481 * Number of outstanding prefetches.
482 * first prefetch moves data from L2 to L1 (n_reads)
483 * second prefetch moves data from memory to L2 (one_read)
484 */
/* NOTE(review): the users of the two prefetch values are outside this chunk. */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

/* Size and required alignment, in bytes, of one FP block load/store. */
#define	VIS_BLOCKSIZE		64
489
490/*
 * Size of stack frame in order to accommodate a 64-byte aligned
492 * floating-point register save area and 2 64-bit temp locations.
493 * All copy functions use two quadrants of fp registers; to assure a
494 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
496 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
497 *
498 *    _______________________________________ <-- %fp + STACK_BIAS
499 *    | We may need to preserve 2 quadrants |
500 *    | of fp regs, but since we do so with |
501 *    | BST/BLD we need room in which to    |
502 *    | align to VIS_BLOCKSIZE bytes.  So   |
503 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
504 *    |-------------------------------------|
505 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
506 *    |-------------------------------------|
507 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
508 *    ---------------------------------------
509 */
/*
 * All offsets are subtracted from %fp + STACK_BIAS.  With
 * VIS_BLOCKSIZE == 64 the values are 208, 192, 127, 200 and 208.
 */
/* three blocks of save area plus two 8-byte slots */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
/* low edge of the three-block FP register save area */
#define SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
/* subtract this, then round down to a block: two aligned blocks fit */
#define SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
/* 8-byte slot holding the saved %fprs */
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
/* 8-byte slot holding the saved %gsr */
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
515
516/*
517 * Common macros used by the various versions of the block copy
518 * routines in this file.
519 */
520
521/*
522 * In FP copies if we do not have preserved data to restore over
523 * the fp regs we used then we must zero those regs to avoid
524 * exposing portions of the data to later threads (data security).
525 *
526 * Copy functions use either quadrants 1 and 3 or 2 and 4.
527 *
528 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
529 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
530 *
531 * The instructions below are quicker than repeated fzero instructions
532 * since they can dispatch down two fp pipelines.
533 */
/*
 * Zero quadrants 1 and 3 (%f0 - %f15 and %f32 - %f47).  %f0 is cleared
 * with fzero and then copied into every other double register of the
 * two quadrants.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46
551
/*
 * Zero quadrants 2 and 4 (%f16 - %f31 and %f48 - %f63).  %f16 is cleared
 * with fzero and then copied into every other double register of the
 * two quadrants.  The copy source must be %f16, not %f0: this macro
 * never zeroes %f0, so copying from it would spread whatever %f0 holds
 * across both quadrants instead of clearing them.
 */
#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62
569
570/*
571 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
572 * Used to save and restore in-use fp registers when we want to use FP
573 * and find fp already in use and copy size still large enough to justify
574 * the additional overhead of this save and restore.
575 *
576 * A membar #Sync is needed before save to sync fp ops initiated before
577 * the call to the copy function (by whoever has fp in use); for example
578 * an earlier block load to the quadrant we are about to save may still be
579 * "in flight".  A membar #Sync is required at the end of the save to
580 * sync our block store (the copy code is about to begin ldd's to the
581 * first quadrant).
582 *
583 * Similarly: a membar #Sync before restore allows the block stores of
584 * the copy operation to complete before we fill the quadrants with their
585 * original data, and a membar #Sync after restore lets the block loads
586 * of the restore complete before we return to whoever has the fp regs
587 * in use.  To avoid repeated membar #Sync we make it the responsibility
588 * of the copy code to membar #Sync immediately after copy is complete
589 * and before using the BLD_*_FROMSTACK macro.
590 */
591#if !defined(lint)
/*
 * Block-store quadrants 1 and 3 (starting at %f0 and %f32) into the
 * block-aligned save area below %fp + STACK_BIAS.  tmp1 is a scratch
 * register and is clobbered.  No membar is issued ahead of the stores
 * (it is left as a comment below); a membar #Sync follows them.
 */
#define BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1 /* second block */	;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync
600
/*
 * Block-load quadrants 1 and 3 (starting at %f0 and %f32) back from the
 * block-aligned save area below %fp + STACK_BIAS, using the same address
 * computation as BST_FPQ1Q3_TOSTACK.  tmp1 is clobbered.  The copy code
 * is expected to have issued membar #Sync beforehand; one follows the
 * loads here.
 */
#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1 /* second block */	;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync
609
/*
 * Block-store quadrants 2 and 4 (starting at %f16 and %f48) into the
 * block-aligned save area below %fp + STACK_BIAS.  tmp1 is a scratch
 * register and is clobbered.  No membar is issued ahead of the stores
 * (it is left as a comment below); a membar #Sync follows them.
 */
#define BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1 /* second block */	;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync
618
/*
 * Block-load quadrants 2 and 4 (starting at %f16 and %f48) back from the
 * block-aligned save area below %fp + STACK_BIAS, using the same address
 * computation as BST_FPQ2Q4_TOSTACK.  tmp1 is clobbered.  The copy code
 * is expected to have issued membar #Sync beforehand; one follows the
 * loads here.
 */
#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1 /* second block */	;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
627#endif
628
629/*
630 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
631 * prevent preemption if there is no t_lwp to save FP state to on context
632 * switch) before commencing a FP copy, and reallow it on completion or
633 * in error trampoline paths when we were using FP copy.
634 *
635 * Both macros may call other functions, so be aware that all outputs are
636 * forfeit after using these macros.  For this reason we do not pass registers
637 * to use - we just use any outputs we want.
638 *
639 * Pseudo code:
640 *
641 * FP_NOMIGRATE:
642 *
643 * if (curthread->t_lwp) {
644 *	thread_nomigrate();
645 * } else {
646 *	kpreempt_disable();
647 * }
648 *
649 * FP_ALLOWMIGRATE:
650 *
651 * if (curthread->t_lwp) {
652 *	thread_allowmigrate();
653 * } else {
654 *	kpreempt_enable();
655 * }
656 */
657
/*
 * FP_NOMIGRATE(label1, label2)
 *
 * If curthread->t_lwp is non-NULL, call thread_nomigrate(); otherwise
 * increment curthread->t_preempt in place.  label1 and label2 are numeric
 * local labels that must be unused at the expansion site.  Clobbers %o0
 * and %o1 directly, plus whatever the called function clobbers.
 *
 * The ldsb sits in an annulled delay slot, so t_preempt is loaded only
 * when the branch to label1 is taken.
 */
#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:
670
/*
 * FP_ALLOWMIGRATE(label1, label2)
 *
 * If curthread->t_lwp is non-NULL, call thread_allowmigrate().  Otherwise
 * decrement curthread->t_preempt and store it back; if the count reached
 * zero and the current cpu's CPU_KPRUNRUN byte is non-zero, call
 * kpreempt() with the current %pil as its argument.  label1 and label2
 * are numeric local labels that must be unused at the expansion site.
 * Clobbers %o0 and %o1 directly, plus whatever a called function clobbers.
 *
 * The ldsb is in an annulled delay slot (executed only when branching to
 * label1); the stb is in a non-annulled delay slot and always executes.
 */
#define	FP_ALLOWMIGRATE(label1, label2)			\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call thread_allowmigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	  nop							;\
	call	kpreempt					;\
	  rdpr	%pil, %o0					;\
label2:
690
691/*
692 * Copy a block of storage, returning an error code if `from' or
693 * `to' takes a kernel pagefault which cannot be resolved.
694 * Returns errno value on pagefault error, 0 if all ok
695 */
696
697#if defined(lint)
698
/*
 * Lint-only stand-in for the assembly implementation: supplies the
 * prototype for type checking and always reports success.
 */
/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{
	return (0);
}
703
704#else	/* lint */
705
706	.seg	".text"
707	.align	4
708
709	ENTRY(kcopy)
710
711	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
712	bleu,pt	%ncc, .kcopy_small		! go to larger cases
713	  xor	%o0, %o1, %o3			! are src, dst alignable?
714	btst	7, %o3				!
715	bz,pt	%ncc, .kcopy_8			! check for longword alignment
716	  nop
717	btst	1, %o3				!
718	bz,pt	%ncc, .kcopy_2			! check for half-word
719	  nop
720	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
721	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
722	tst	%o3
723	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
724	  cmp	%o2, %o3			! if length <= limit
725	bleu,pt	%ncc, .kcopy_small		! go to small copy
726	  nop
727	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
728	  nop
729.kcopy_2:
730	btst	3, %o3				!
731	bz,pt	%ncc, .kcopy_4			! check for word alignment
732	  nop
733	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
734	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
735	tst	%o3
736	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
737	  cmp	%o2, %o3			! if length <= limit
738	bleu,pt	%ncc, .kcopy_small		! go to small copy
739	  nop
740	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
741	  nop
742.kcopy_4:
743	! already checked longword, must be word aligned
744	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
745	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
746	tst	%o3
747	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
748	  cmp	%o2, %o3			! if length <= limit
749	bleu,pt	%ncc, .kcopy_small		! go to small copy
750	  nop
751	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
752	  nop
753.kcopy_8:
754	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
755	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
756	tst	%o3
757	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
758	  cmp	%o2, %o3			! if length <= limit
759	bleu,pt	%ncc, .kcopy_small		! go to small copy
760	  nop
761	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
762	  nop
763
764.kcopy_small:
765	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
766	or	%o5, %lo(.sm_copyerr), %o5
767	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
768	membar	#Sync				! sync error barrier
769	ba,pt	%ncc, .sm_do_copy		! common code
770	 stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
771
772.kcopy_more:
773	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
774	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
775	or	%l7, %lo(.copyerr), %l7
776	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
777	membar	#Sync				! sync error barrier
778	ba,pt	%ncc, .do_copy			! common code
779	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
780
781
/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 *
 * On entry %l6 holds the previous t_lofault value with FPUSED_FLAG and
 * TRAMP_FLAG in its low bits.  The handler installs .copyerr2 as the
 * lofault handler while it restores FP state, then puts back the previous
 * handler and either returns the errno (TRAMP_FLAG clear) or jumps to the
 * previous handler (TRAMP_FLAG set).
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f			! no FP state to restore
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f			! saved %fprs had FEF clear
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	! Nothing was saved to reload, so zero the registers we used.
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy. kcopy will *always* set a t_lofault handler
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler. As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler. In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)			! only after t_lofault is restored

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	  nop
	ret
	  restore	%g1, 0, %o0		! return errno from %g1

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window
840
841/*
842 * We got here because of a fault in .copyerr.  We can't safely restore fp
843 * state, so we panic.
844 */
845fp_panic_msg:
846	.asciz	"Unable to restore fp state after copy operation"
847
848	.align	4
849.copyerr2:
850	set	fp_panic_msg, %o0
851	call	panic
852	  nop
853
854/*
855 * We got here because of a fault during a small kcopy or bcopy.
856 * No floating point registers are used by the small copies.
857 * Errno value is in %g1.
858 */
859.sm_copyerr:
8601:
861	btst	TRAMP_FLAG, %o4
862	membar	#Sync
863	andn	%o4, TRAMP_FLAG, %o4
864	bnz,pn	%ncc, 3f
865	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
866	retl
867	  mov	%g1, %o0
8683:
869	jmp	%o4				! goto real handler
870	  mov	%g0, %o0			!
871
872	SET_SIZE(kcopy)
873#endif	/* lint */
874
875
876/*
877 * Copy a block of storage - must not overlap (from + len <= to).
878 * Registers: l6 - saved t_lofault
879 * (for short copies, o4 - saved t_lofault)
880 *
881 * Copy a page of memory.
882 * Assumes double word alignment and a count >= 256.
883 */
884#if defined(lint)
885
/*
 * Lint-only C stub for bcopy().  It is seen only when "lint" is defined and
 * has an empty body; the implementation is the assembly in the #else branch.
 */
886/* ARGSUSED */
887void
888bcopy(const void *from, void *to, size_t count)
889{}
890
891#else	/* lint */
892
893	ENTRY(bcopy)
	!
	! bcopy(from, to, count)
	!	%o0 = from, %o1 = to, %o2 = count
	!
	! Leaf routine: no register window is taken on this path.
	! Working registers:
	!	%o3 - scratch (alignment bits, copy limit, data being moved)
	!	%o4 - t_lofault value found on entry (TRAMP_FLAG or-ed in when
	!	      a handler was already installed)
	!	%o5 - address of .sm_copyerr while it is being installed
	!
	! Dispatch:
	!   1. count <= VIS_COPY_THRESHOLD goes to .bcopy_small.
	!   2. Otherwise from ^ to selects hw_copy_limit_{8,4,2,1}.  A zero
	!	limit, or count <= limit, goes to .bcopy_small; anything
	!	larger goes to .bcopy_more.
	!
	! Every exit in this block restores t_lofault from %o4 (with
	! TRAMP_FLAG cleared) and returns 0 in %o0.
	!
894
895	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
896	bleu,pt	%ncc, .bcopy_small		! count <= threshold: small copy
897	  xor	%o0, %o1, %o3			! are src, dst alignable?
898	btst	7, %o3				!
899	bz,pt	%ncc, .bcopy_8			! check for longword alignment
900	  nop
901	btst	1, %o3				!
902	bz,pt	%ncc, .bcopy_2			! check for half-word
903	  nop
904	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
905	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
906	tst	%o3
907	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
908	  cmp	%o2, %o3			! if length <= limit
909	bleu,pt	%ncc, .bcopy_small		! go to small copy
910	  nop
911	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
912	  nop
913.bcopy_2:
914	btst	3, %o3				!
915	bz,pt	%ncc, .bcopy_4			! check for word alignment
916	  nop
917	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
918	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
919	tst	%o3
920	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
921	  cmp	%o2, %o3			! if length <= limit
922	bleu,pt	%ncc, .bcopy_small		! go to small copy
923	  nop
924	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
925	  nop
926.bcopy_4:
927	! already checked longword, must be word aligned
928	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
929	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
930	tst	%o3
931	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
932	  cmp	%o2, %o3			! if length <= limit
933	bleu,pt	%ncc, .bcopy_small		! go to small copy
934	  nop
935	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
936	  nop
937.bcopy_8:
938	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
939	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
940	tst	%o3
941	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
942	  cmp	%o2, %o3			! if length <= limit
943	bleu,pt	%ncc, .bcopy_small		! go to small copy
944	  nop
945	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
946	  nop
947
948	.align	16
949.bcopy_small:
	!
	! Install .sm_copyerr only when a lofault handler already exists.
	! The saved handler is a pointer loaded with ldn, so it is tested
	! with %ncc (as bcopy_more does) rather than %icc, which would
	! examine only its low 32 bits.
	!
950	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
951	tst	%o4
952	bz,pt	%ncc, .sm_do_copy		! no handler: nothing to install
953	  nop
954	sethi	%hi(.sm_copyerr), %o5
955	or	%o5, %lo(.sm_copyerr), %o5
956	membar	#Sync				! sync error barrier
957	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
958	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
	!
	! Common small-copy code.  .kcopy_small also branches here with its
	! saved handler in %o4.
	!
959.sm_do_copy:
960	cmp	%o2, SHORTCOPY		! check for really short case
961	bleu,pt	%ncc, .bc_sm_left	!
962	  cmp	%o2, CHKSIZE		! check for medium length cases
963	bgu,pn	%ncc, .bc_med		!
964	  or	%o0, %o1, %o3		! prepare alignment check
965	andcc	%o3, 0x3, %g0		! test for alignment
966	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
967.bc_sm_movebytes:
968	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
969.bc_sm_notalign4:
970	ldub	[%o0], %o3		! read byte
971	stb	%o3, [%o1]		! write byte
972	subcc	%o2, 4, %o2		! reduce count by 4
973	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
974	add	%o0, 4, %o0		! advance SRC by 4
975	stb	%o3, [%o1 + 1]
976	ldub	[%o0 - 2], %o3
977	add	%o1, 4, %o1		! advance DST by 4
978	stb	%o3, [%o1 - 2]
979	ldub	[%o0 - 1], %o3
980	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
981	  stb	%o3, [%o1 - 1]
982	add	%o2, 3, %o2		! restore count
983.bc_sm_left:
984	tst	%o2
985	bz,pt	%ncc, .bc_sm_exit	! check for zero length
986	  deccc	%o2			! reduce count for cc test
987	ldub	[%o0], %o3		! move one byte
988	bz,pt	%ncc, .bc_sm_exit
989	  stb	%o3, [%o1]
990	ldub	[%o0 + 1], %o3		! move another byte
991	deccc	%o2			! check for more
992	bz,pt	%ncc, .bc_sm_exit
993	  stb	%o3, [%o1 + 1]
994	ldub	[%o0 + 2], %o3		! move final byte
995	stb	%o3, [%o1 + 2]
996	membar	#Sync				! sync error barrier
997	andn	%o4, TRAMP_FLAG, %o4
998	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
999	retl
1000	  mov	%g0, %o0		! return 0
1001	.align	16
1002	nop				! instruction alignment
1003					! see discussion at start of file
1004.bc_sm_words:
1005	lduw	[%o0], %o3		! read word
1006.bc_sm_wordx:
1007	subcc	%o2, 8, %o2		! update count
1008	stw	%o3, [%o1]		! write word
1009	add	%o0, 8, %o0		! update SRC
1010	lduw	[%o0 - 4], %o3		! read word
1011	add	%o1, 8, %o1		! update DST
1012	bgt,pt	%ncc, .bc_sm_words	! loop til done
1013	  stw	%o3, [%o1 - 4]		! write word
1014	addcc	%o2, 7, %o2		! restore count
1015	bz,pt	%ncc, .bc_sm_exit
1016	  deccc	%o2
1017	bz,pt	%ncc, .bc_sm_byte
1018.bc_sm_half:
1019	  subcc	%o2, 2, %o2		! reduce count by 2
1020	add	%o0, 2, %o0		! advance SRC by 2
1021	lduh	[%o0 - 2], %o3		! read half word
1022	add	%o1, 2, %o1		! advance DST by 2
1023	bgt,pt	%ncc, .bc_sm_half	! loop til done
1024	  sth	%o3, [%o1 - 2]		! write half word
1025	addcc	%o2, 1, %o2		! restore count
1026	bz,pt	%ncc, .bc_sm_exit
1027	  nop
1028.bc_sm_byte:
1029	ldub	[%o0], %o3
1030	stb	%o3, [%o1]
1031	membar	#Sync				! sync error barrier
1032	andn	%o4, TRAMP_FLAG, %o4
1033	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1034	retl
1035	  mov	%g0, %o0		! return 0
1036
1037.bc_sm_word:
1038	subcc	%o2, 4, %o2		! update count
1039	bgt,pt	%ncc, .bc_sm_wordx
1040	  lduw	[%o0], %o3		! read word
1041	addcc	%o2, 3, %o2		! restore count
1042	bz,pt	%ncc, .bc_sm_exit
1043	  stw	%o3, [%o1]		! write word
1044	deccc	%o2			! reduce count for cc test
1045	ldub	[%o0 + 4], %o3		! load one byte
1046	bz,pt	%ncc, .bc_sm_exit
1047	  stb	%o3, [%o1 + 4]		! store one byte
1048	ldub	[%o0 + 5], %o3		! load second byte
1049	deccc	%o2
1050	bz,pt	%ncc, .bc_sm_exit
1051	  stb	%o3, [%o1 + 5]		! store second byte
1052	ldub	[%o0 + 6], %o3		! load third byte
1053	stb	%o3, [%o1 + 6]		! store third byte
	!
	! Common exit: clear TRAMP_FLAG, put the entry t_lofault back and
	! return 0.
	!
1054.bc_sm_exit:
1055	membar	#Sync				! sync error barrier
1056	andn	%o4, TRAMP_FLAG, %o4
1057	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1058	retl
1059	  mov	%g0, %o0		! return 0
1060
1061	.align 16
	!
	! Medium copies (count > CHKSIZE): pick the widest access that the
	! relative alignment of from and to allows.
	!
1062.bc_med:
1063	xor	%o0, %o1, %o3		! setup alignment check
1064	btst	1, %o3
1065	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
1066	  nop
1067	btst	3, %o3
1068	bnz,pt	%ncc, .bc_med_half	! halfword aligned
1069	  nop
1070	btst	7, %o3
1071	bnz,pt	%ncc, .bc_med_word	! word aligned
1072	  nop
1073.bc_med_long:
1074	btst	3, %o0			! check for
1075	bz,pt	%ncc, .bc_med_long1	! word alignment
1076	  nop
1077.bc_med_long0:
1078	ldub	[%o0], %o3		! load one byte
1079	inc	%o0
1080	stb	%o3,[%o1]		! store byte
1081	inc	%o1
1082	btst	3, %o0
1083	bnz,pt	%ncc, .bc_med_long0
1084	  dec	%o2
1085.bc_med_long1:			! word aligned
1086	btst	7, %o0			! check for long word
1087	bz,pt	%ncc, .bc_med_long2
1088	  nop
1089	lduw	[%o0], %o3		! load word
1090	add	%o0, 4, %o0		! advance SRC by 4
1091	stw	%o3, [%o1]		! store word
1092	add	%o1, 4, %o1		! advance DST by 4
1093	sub	%o2, 4, %o2		! reduce count by 4
1094!
1095!  Now long word aligned and have at least 32 bytes to move
1096!
1097.bc_med_long2:
1098	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1099.bc_med_lmove:
1100	ldx	[%o0], %o3		! read long word
1101	stx	%o3, [%o1]		! write long word
1102	subcc	%o2, 32, %o2		! reduce count by 32
1103	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
1104	add	%o0, 32, %o0		! advance SRC by 32
1105	stx	%o3, [%o1 + 8]
1106	ldx	[%o0 - 16], %o3
1107	add	%o1, 32, %o1		! advance DST by 32
1108	stx	%o3, [%o1 - 16]
1109	ldx	[%o0 - 8], %o3
1110	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
1111	  stx	%o3, [%o1 - 8]
1112	addcc	%o2, 24, %o2		! restore count to long word offset
1113	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
1114	  nop
1115.bc_med_lword:
1116	ldx	[%o0], %o3		! read long word
1117	subcc	%o2, 8, %o2		! reduce count by 8
1118	stx	%o3, [%o1]		! write long word
1119	add	%o0, 8, %o0		! advance SRC by 8
1120	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
1121	  add	%o1, 8, %o1		! advance DST by 8
1122.bc_med_lextra:
1123	addcc	%o2, 7, %o2		! restore rest of count
1124	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1125	  deccc	%o2
1126	bz,pt	%ncc, .bc_sm_byte
1127	  nop
1128	ba,pt	%ncc, .bc_sm_half
1129	  nop
1130
1131	.align 16
1132.bc_med_word:
1133	btst	3, %o0			! check for
1134	bz,pt	%ncc, .bc_med_word1	! word alignment
1135	  nop
1136.bc_med_word0:
1137	ldub	[%o0], %o3		! load one byte
1138	inc	%o0
1139	stb	%o3,[%o1]		! store byte
1140	inc	%o1
1141	btst	3, %o0
1142	bnz,pt	%ncc, .bc_med_word0
1143	  dec	%o2
1144!
1145!  Now word aligned and have at least 36 bytes to move
1146!
1147.bc_med_word1:
1148	sub	%o2, 15, %o2		! adjust count to allow cc zero test
1149.bc_med_wmove:
1150	lduw	[%o0], %o3		! read word
1151	stw	%o3, [%o1]		! write word
1152	subcc	%o2, 16, %o2		! reduce count by 16
1153	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
1154	add	%o0, 16, %o0		! advance SRC by 16
1155	stw	%o3, [%o1 + 4]
1156	lduw	[%o0 - 8], %o3
1157	add	%o1, 16, %o1		! advance DST by 16
1158	stw	%o3, [%o1 - 8]
1159	lduw	[%o0 - 4], %o3
1160	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
1161	  stw	%o3, [%o1 - 4]
1162	addcc	%o2, 12, %o2		! restore count to word offset
1163	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
1164	  nop
1165.bc_med_word2:
1166	lduw	[%o0], %o3		! read word
1167	subcc	%o2, 4, %o2		! reduce count by 4
1168	stw	%o3, [%o1]		! write word
1169	add	%o0, 4, %o0		! advance SRC by 4
1170	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
1171	  add	%o1, 4, %o1		! advance DST by 4
1172.bc_med_wextra:
1173	addcc	%o2, 3, %o2		! restore rest of count
1174	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
1175	  deccc	%o2
1176	bz,pt	%ncc, .bc_sm_byte
1177	  nop
1178	ba,pt	%ncc, .bc_sm_half
1179	  nop
1180
1181	.align 16
1182.bc_med_half:
1183	btst	1, %o0			! check for
1184	bz,pt	%ncc, .bc_med_half1	! half word alignment
1185	  nop
1186	ldub	[%o0], %o3		! load one byte
1187	inc	%o0
1188	stb	%o3,[%o1]		! store byte
1189	inc	%o1
1190	dec	%o2
1191!
1192!  Now half word aligned and have at least 38 bytes to move
1193!
1194.bc_med_half1:
1195	sub	%o2, 7, %o2		! adjust count to allow cc zero test
1196.bc_med_hmove:
1197	lduh	[%o0], %o3		! read half word
1198	sth	%o3, [%o1]		! write half word
1199	subcc	%o2, 8, %o2		! reduce count by 8
1200	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
1201	add	%o0, 8, %o0		! advance SRC by 8
1202	sth	%o3, [%o1 + 2]
1203	lduh	[%o0 - 4], %o3
1204	add	%o1, 8, %o1		! advance DST by 8
1205	sth	%o3, [%o1 - 4]
1206	lduh	[%o0 - 2], %o3
1207	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
1208	  sth	%o3, [%o1 - 2]
1209	addcc	%o2, 7, %o2		! restore count
1210	bz,pt	%ncc, .bc_sm_exit
1211	  deccc	%o2
1212	bz,pt	%ncc, .bc_sm_byte
1213	  nop
1214	ba,pt	%ncc, .bc_sm_half
1215	  nop
1216
1217	SET_SIZE(bcopy)
1218
1219/*
1220 * The _more entry points are not intended to be used directly by
1221 * any caller from outside this file.  They are provided to allow
1222 * profiling and dtrace of the portions of the copy code that use
1223 * the floating point registers.
1224 * This entry is particularly important as DTRACE (at least as of
1225 * 4/2004) does not support leaf functions.
1226 */
1227
1228	ENTRY(bcopy_more)
	!
	! Large-copy path of bcopy, using FP registers and block stores.
	! On entry %o0 = from, %o1 = to, %o2 = count; after the save these
	! are %i0-%i2 (REALSRC, DST, CNT below).
	!
	!	%l6 - t_lofault found on entry, with TRAMP_FLAG and
	!	      FPUSED_FLAG or-ed into it as the copy proceeds
	!
	! .do_copy is also entered from .kcopy_more, which has already done
	! its own save and installed .copyerr.
	!
1229.bcopy_more:
1230	prefetch [%o0], #n_reads
1231	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1232	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
1233	tst	%l6
1234	bz,pt	%ncc, .do_copy		! no handler on entry: leave it alone
1235	  nop
1236	sethi	%hi(.copyerr), %o2
1237	or	%o2, %lo(.copyerr), %o2
1238	membar	#Sync				! sync error barrier
1239	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
1240	!
1241	! We've already captured whether t_lofault was zero on entry.
1242	! We need to mark ourselves as being from bcopy since both
1243	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
1244	! and the saved lofault was zero, we won't reset lofault on
1245	! returning.
1246	!
1247	or	%l6, TRAMP_FLAG, %l6
1248
1249/*
1250 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
1251 * Also, use of FP registers has been tested to be enabled
1252 */
1253.do_copy:
1254	FP_NOMIGRATE(6, 7)
1255
	! Save the caller %fprs.  If FPRS_FEF was clear, the annulled delay
	! slot enables the FPU and the register save is skipped; otherwise
	! BST_FPQ1Q3_TOSTACK runs (presumably spilling the FP registers this
	! routine uses; the macro is defined outside this block).
1256	rd	%fprs, %o2		! check for unused fp
1257	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
1258	btst	FPRS_FEF, %o2
1259	bz,a,pt	%icc, .do_blockcopy
1260	  wr	%g0, FPRS_FEF, %fprs
1261
1262	BST_FPQ1Q3_TOSTACK(%o2)
1263
1264.do_blockcopy:
1265	rd	%gsr, %o2
1266	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
1267	or	%l6, FPUSED_FLAG, %l6	! fault/exit paths must restore FP state
1268
	! Register names for the block-copy code (hwblkpagecopy uses them too).
1269#define	REALSRC	%i0
1270#define	DST	%i1
1271#define	CNT	%i2
1272#define	SRC	%i3
1273#define	TMP	%i5
1274
	! Byte-copy until DST is a multiple of VIS_BLOCKSIZE.
1275	andcc	DST, VIS_BLOCKSIZE - 1, TMP
1276	bz,pt	%ncc, 2f
1277	  neg	TMP
1278	add	TMP, VIS_BLOCKSIZE, TMP
1279
1280	! TMP = bytes required to align DST on FP_BLOCK boundary
1281	! Using SRC as a tmp here
1282	cmp	TMP, 3
1283	bleu,pt	%ncc, 1f
1284	  sub	CNT,TMP,CNT		! adjust main count
1285	sub	TMP, 3, TMP		! adjust for end of loop test
1286.bc_blkalign:
1287	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
1288	stb	SRC, [DST]
1289	subcc	TMP, 4, TMP
1290	ldub	[REALSRC + 1], SRC
1291	add	REALSRC, 4, REALSRC
1292	stb	SRC, [DST + 1]
1293	ldub	[REALSRC - 2], SRC
1294	add	DST, 4, DST
1295	stb	SRC, [DST - 2]
1296	ldub	[REALSRC - 1], SRC
1297	bgu,pt	%ncc, .bc_blkalign
1298	  stb	SRC, [DST - 1]
1299
1300	addcc	TMP, 3, TMP		! restore count adjustment
1301	bz,pt	%ncc, 2f		! no bytes left?
1302	  nop
	! Move the last 1-3 alignment bytes one at a time.
13031:	ldub	[REALSRC], SRC
1304	inc	REALSRC
1305	inc	DST
1306	deccc	TMP
1307	bgu	%ncc, 1b
1308	  stb	SRC, [DST - 1]
1309
13102:
1311	membar	#StoreLoad
1312	andn	REALSRC, 0x7, SRC
1313
1314	! SRC - 8-byte aligned
1315	! DST - 64-byte aligned
	!
	! Prologue: load the first block from SRC into %f0-%f14 and use
	! alignaddr/faligndata to build the realigned data in %f32-%f42.
	! CNT is reduced by this block here, so inside the loop CNT does
	! not include the block that is loaded but not yet stored.
1316	ldd	[SRC], %f0
1317	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1318	alignaddr REALSRC, %g0, %g0
1319	ldd	[SRC + 0x08], %f2
1320	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1321	faligndata %f0, %f2, %f32
1322	ldd	[SRC + 0x10], %f4
1323	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1324	faligndata %f2, %f4, %f34
1325	ldd	[SRC + 0x18], %f6
1326	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1327	faligndata %f4, %f6, %f36
1328	ldd	[SRC + 0x20], %f8
1329	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1330	faligndata %f6, %f8, %f38
1331	ldd	[SRC + 0x28], %f10
1332	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1333	faligndata %f8, %f10, %f40
1334	ldd	[SRC + 0x30], %f12
1335	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1336	faligndata %f10, %f12, %f42
1337	ldd	[SRC + 0x38], %f14
1338	ldd	[SRC + VIS_BLOCKSIZE], %f0
1339	sub	CNT, VIS_BLOCKSIZE, CNT
1340	add	SRC, VIS_BLOCKSIZE, SRC
1341	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1342	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1343	ba,pt	%ncc, 1f
1344	  prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1345	.align	32
	! Main loop: finish and block-store the pending block (%f32-%f46)
	! while loading and realigning the next one.  Runs while
	! CNT > VIS_BLOCKSIZE + 8.
13461:
1347	ldd	[SRC + 0x08], %f2
1348	faligndata %f12, %f14, %f44
1349	ldd	[SRC + 0x10], %f4
1350	faligndata %f14, %f0, %f46
1351	stda	%f32, [DST]ASI_BLK_P
1352	ldd	[SRC + 0x18], %f6
1353	faligndata %f0, %f2, %f32
1354	ldd	[SRC + 0x20], %f8
1355	faligndata %f2, %f4, %f34
1356	ldd	[SRC + 0x28], %f10
1357	faligndata %f4, %f6, %f36
1358	ldd	[SRC + 0x30], %f12
1359	faligndata %f6, %f8, %f38
1360	sub	CNT, VIS_BLOCKSIZE, CNT
1361	ldd	[SRC + 0x38], %f14
1362	faligndata %f8, %f10, %f40
1363	add	DST, VIS_BLOCKSIZE, DST
1364	ldd	[SRC + VIS_BLOCKSIZE], %f0
1365	faligndata %f10, %f12, %f42
1366	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1367	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1368	add	SRC, VIS_BLOCKSIZE, SRC
1369	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1370	cmp	CNT, VIS_BLOCKSIZE + 8
1371	bgu,pt	%ncc, 1b
1372	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1373
	! If exactly one more block remains and REALSRC is 8-byte aligned,
	! take the path at 2: which stores two blocks; otherwise fall into
	! 3: which stores only the pending block.
1374	! only if REALSRC & 0x7 is 0
1375	cmp	CNT, VIS_BLOCKSIZE
1376	bne	%ncc, 3f
1377	  andcc	REALSRC, 0x7, %g0
1378	bz,pt	%ncc, 2f
1379	  nop
13803:
1381	faligndata %f12, %f14, %f44
1382	faligndata %f14, %f0, %f46
1383	stda	%f32, [DST]ASI_BLK_P
1384	add	DST, VIS_BLOCKSIZE, DST
1385	ba,pt	%ncc, 3f		! go copy any remaining bytes
1386	  nop
	! Aligned final block: store the pending block, then load the last
	! block and move it with fsrc1 (no realignment) before storing it.
13872:
1388	ldd	[SRC + 0x08], %f2
1389	fsrc1	%f12, %f44
1390	ldd	[SRC + 0x10], %f4
1391	fsrc1	%f14, %f46
1392	stda	%f32, [DST]ASI_BLK_P
1393	ldd	[SRC + 0x18], %f6
1394	fsrc1	%f0, %f32
1395	ldd	[SRC + 0x20], %f8
1396	fsrc1	%f2, %f34
1397	ldd	[SRC + 0x28], %f10
1398	fsrc1	%f4, %f36
1399	ldd	[SRC + 0x30], %f12
1400	fsrc1	%f6, %f38
1401	ldd	[SRC + 0x38], %f14
1402	fsrc1	%f8, %f40
1403	sub	CNT, VIS_BLOCKSIZE, CNT
1404	add	DST, VIS_BLOCKSIZE, DST
1405	add	SRC, VIS_BLOCKSIZE, SRC
1406	add	REALSRC, VIS_BLOCKSIZE, REALSRC
1407	fsrc1	%f10, %f42
1408	fsrc1	%f12, %f44
1409	fsrc1	%f14, %f46
1410	stda	%f32, [DST]ASI_BLK_P
1411	add	DST, VIS_BLOCKSIZE, DST
1412	ba,a,pt	%ncc, .bcb_exit
1413	  nop
1414
	! Copy the remaining CNT bytes (if any) one at a time from REALSRC.
14153:	tst	CNT
1416	bz,a,pt	%ncc, .bcb_exit
1417	  nop
1418
14195:	ldub	[REALSRC], TMP
1420	inc	REALSRC
1421	inc	DST
1422	deccc	CNT
1423	bgu	%ncc, 5b
1424	  stb	TMP, [DST - 1]
	! Common exit: restore %gsr, then either reload the FP registers
	! from the stack (FPRS_FEF was set on entry) or clear them with
	! FZEROQ1Q3, restore %fprs and t_lofault, and return 0.
1425.bcb_exit:
1426	membar	#Sync
1427
1428	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
1429	wr	%o2, 0, %gsr
1430
1431	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1432	btst	FPRS_FEF, %o3
1433	bz,pt	%icc, 4f
1434	  nop
1435
1436	BLD_FPQ1Q3_FROMSTACK(%o2)
1437
1438	ba,pt	%ncc, 2f
1439	  wr	%o3, 0, %fprs		! restore fprs
14404:
1441	FZEROQ1Q3
1442	wr	%o3, 0, %fprs		! restore fprs
14432:
1444	membar	#Sync				! sync error barrier
1445	andn	%l6, MASK_FLAGS, %l6	! strip flag bits from saved handler
1446	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1447	FP_ALLOWMIGRATE(5, 6)
1448	ret
1449	  restore	%g0, 0, %o0	! return 0 in the caller %o0
1450
1451	SET_SIZE(bcopy_more)
1452
1453#endif	/* lint */
1454
1455/*
1456 * Block copy with possibly overlapped operands.
1457 */
1458
1459#if defined(lint)
1460
/*
 * Lint-only C stub for ovbcopy().  It is seen only when "lint" is defined and
 * has an empty body; the implementation is the assembly in the #else branch.
 */
1461/*ARGSUSED*/
1462void
1463ovbcopy(const void *from, void *to, size_t count)
1464{}
1465
1466#else	/* lint */
1467
1468	ENTRY(ovbcopy)
	!
	! ovbcopy(from, to, count): %o0 = from, %o1 = to, %o2 = count.
	! Leaf routine; %o3 is scratch.
	!
	!   count == 0			return at once
	!   count <= |from - to|	regions do not overlap: branch to bcopy
	!   from < to (unsigned)	copy bytes from the last to the first
	!   otherwise			copy bytes from the first to the last
	!
	! The byte loops here return without setting %o0.
	!
1469	tst	%o2			! check count
1470	bgu,a	%ncc, 1f		! count != 0: go work out the direction
1471	  subcc	%o0, %o1, %o3		! difference of from and to address
1472
1473	retl				! count == 0: nothing to do
1474	  nop
14751:
1476	bneg,a	%ncc, 2f
1477	  neg	%o3			! if < 0, make it positive
14782:	cmp	%o2, %o3		! cmp size and abs(from - to)
1479	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
1480	  .empty				!   no overlap
1481	  cmp	%o0, %o1		! compare from and to addresses
1482	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
1483	  nop
1484	!
1485	! Copy forwards.
1486	!
1487.ov_fwd:
1488	ldub	[%o0], %o3		! read from address
1489	inc	%o0			! inc from address
1490	stb	%o3, [%o1]		! write to address
1491	deccc	%o2			! dec count
1492	bgu	%ncc, .ov_fwd		! loop till done
1493	  inc	%o1			! inc to address
1494
1495	retl				! return
1496	  nop
1497	!
1498	! Copy backwards.
1499	!
	! %o2 is decremented first and then used as the byte index, so the
	! bytes moved run from offset count-1 down to offset 0.
1500.ov_bkwd:
1501	deccc	%o2			! dec count
1502	ldub	[%o0 + %o2], %o3	! get byte at end of src
1503	bgu	%ncc, .ov_bkwd		! loop till done
1504	  stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst
1505
1506	retl				! return
1507	  nop
1508
1509	SET_SIZE(ovbcopy)
1510
1511#endif	/* lint */
1512
1513
1514/*
1515 * hwblkpagecopy()
1516 *
1517 * Copies exactly one page.  This routine assumes the caller (ppcopy)
1518 * has already disabled kernel preemption and has checked
1519 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
1520 */
1521#ifdef lint
/*
 * Lint-only C stub for hwblkpagecopy().  It is seen only when "lint" is
 * defined and has an empty body; the implementation is the assembly in the
 * #else branch.
 */
1522/*ARGSUSED*/
1523void
1524hwblkpagecopy(const void *src, void *dst)
1525{ }
1526#else /* lint */
1527	ENTRY(hwblkpagecopy)
	!
	! hwblkpagecopy(src, dst): %o0 = src, %o1 = dst on entry.
	! Copies PAGESIZE bytes with FP loads and block stores.  No
	! t_lofault handler is installed in this routine, and no
	! alignaddr/faligndata realignment is done: data is moved with
	! fmovd/fsrc1 only.  REALSRC, SRC, DST and CNT are the register
	! names defined in bcopy_more.
	!
1528	! get another window w/space for three aligned blocks of saved fpregs
1529	prefetch [%o0], #n_reads
1530	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
1531
1532	! %i0 - source address (arg)
1533	! %i1 - destination address (arg)
1534	! %i2 - length of region (not arg)
1535	! %l0 - saved fprs
1536	! %l1 - pointer to saved fpregs
	! NOTE(review): the restore at the end passes %l3, not %l1, to
	! BLD_FPQ1Q3_FROMSTACK; presumably the macro argument is only a
	! scratch register - confirm against the macro definition.
1537
	! If FPRS_FEF was clear, the annulled delay slot enables the FPU
	! and the save of the FP registers is skipped.
1538	rd	%fprs, %l0		! check for unused fp
1539	btst	FPRS_FEF, %l0
1540	bz,a,pt	%icc, 1f
1541	  wr	%g0, FPRS_FEF, %fprs
1542
1543	BST_FPQ1Q3_TOSTACK(%l1)
1544
15451:	set	PAGESIZE, CNT
1546	mov	REALSRC, SRC
1547
	! Prologue: load the first block and start moving it to %f32-%f42.
1548	ldd	[SRC], %f0
1549	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
1550	ldd	[SRC + 0x08], %f2
1551	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
1552	fmovd	%f0, %f32
1553	ldd	[SRC + 0x10], %f4
1554	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1555	fmovd	%f2, %f34
1556	ldd	[SRC + 0x18], %f6
1557	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
1558	fmovd	%f4, %f36
1559	ldd	[SRC + 0x20], %f8
1560	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
1561	fmovd	%f6, %f38
1562	ldd	[SRC + 0x28], %f10
1563	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
1564	fmovd	%f8, %f40
1565	ldd	[SRC + 0x30], %f12
1566	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
1567	fmovd	%f10, %f42
1568	ldd	[SRC + 0x38], %f14
1569	ldd	[SRC + VIS_BLOCKSIZE], %f0
1570	sub	CNT, VIS_BLOCKSIZE, CNT
1571	add	SRC, VIS_BLOCKSIZE, SRC
1572	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
1573	ba,pt	%ncc, 2f
1574	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
1575	.align	32
	! Main loop: block-store the pending block while loading the next
	! one.  Runs while CNT > VIS_BLOCKSIZE + 8.
15762:
1577	ldd	[SRC + 0x08], %f2
1578	fmovd	%f12, %f44
1579	ldd	[SRC + 0x10], %f4
1580	fmovd	%f14, %f46
1581	stda	%f32, [DST]ASI_BLK_P
1582	ldd	[SRC + 0x18], %f6
1583	fmovd	%f0, %f32
1584	ldd	[SRC + 0x20], %f8
1585	fmovd	%f2, %f34
1586	ldd	[SRC + 0x28], %f10
1587	fmovd	%f4, %f36
1588	ldd	[SRC + 0x30], %f12
1589	fmovd	%f6, %f38
1590	ldd	[SRC + 0x38], %f14
1591	fmovd	%f8, %f40
1592	ldd	[SRC + VIS_BLOCKSIZE], %f0
1593	fmovd	%f10, %f42
1594	sub	CNT, VIS_BLOCKSIZE, CNT
1595	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
1596	add	DST, VIS_BLOCKSIZE, DST
1597	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1598	add	SRC, VIS_BLOCKSIZE, SRC
1599	cmp	CNT, VIS_BLOCKSIZE + 8
1600	bgu,pt	%ncc, 2b
1601	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
1602
	! Store the pending block, then load and store the final block.
	! Unlike the loop, this part does not load from SRC + VIS_BLOCKSIZE.
1603	! trailing block
1604	ldd	[SRC + 0x08], %f2
1605	fsrc1	%f12, %f44
1606	ldd	[SRC + 0x10], %f4
1607	fsrc1	%f14, %f46
1608	stda	%f32, [DST]ASI_BLK_P
1609	ldd	[SRC + 0x18], %f6
1610	fsrc1	%f0, %f32
1611	ldd	[SRC + 0x20], %f8
1612	fsrc1	%f2, %f34
1613	ldd	[SRC + 0x28], %f10
1614	fsrc1	%f4, %f36
1615	ldd	[SRC + 0x30], %f12
1616	fsrc1	%f6, %f38
1617	ldd	[SRC + 0x38], %f14
1618	fsrc1	%f8, %f40
1619	sub	CNT, VIS_BLOCKSIZE, CNT
1620	add	DST, VIS_BLOCKSIZE, DST
1621	add	SRC, VIS_BLOCKSIZE, SRC
1622	fsrc1	%f10, %f42
1623	fsrc1	%f12, %f44
1624	fsrc1	%f14, %f46
1625	stda	%f32, [DST]ASI_BLK_P
1626
1627	membar	#Sync
1628
	! Reload the FP registers from the stack if FPRS_FEF was set on
	! entry, otherwise clear them with FZEROQ1Q3; then restore %fprs.
1629	btst	FPRS_FEF, %l0
1630	bz,pt	%icc, 2f
1631	  nop
1632
1633	BLD_FPQ1Q3_FROMSTACK(%l3)
1634	ba	3f
1635	  nop
1636
16372:	FZEROQ1Q3
1638
16393:	wr	%l0, 0, %fprs		! restore fprs
1640	ret
1641	  restore	%g0, 0, %o0	! %o0 = 0 in the caller window
1642
1643	SET_SIZE(hwblkpagecopy)
1644#endif	/* lint */
1645
1646
1647/*
1648 * Transfer data to and from user space -
1649 * Note that these routines can cause faults
1650 * It is assumed that the kernel has nothing at
1651 * less than KERNELBASE in the virtual address space.
1652 *
1653 * Note that copyin(9F) and copyout(9F) are part of the
1654 * DDI/DKI which specifies that they return '-1' on "errors."
1655 *
1656 * Sigh.
1657 *
1658 * So there's two extremely similar routines - xcopyin() and xcopyout()
1659 * which return the errno that we've faithfully computed.  This
1660 * allows other callers (e.g. uiomove(9F)) to work correctly.
1661 * Given that these are used pretty heavily, we expand the calling
1662 * sequences inline for all flavours (rather than making wrappers).
1663 *
1664 * There are also stub routines for xcopyout_little and xcopyin_little,
1665 * which currently are intended to handle requests of <= 16 bytes from
1666 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
1667 * is left as an exercise...
1668 */
1669
1670/*
1671 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
1672 *
1673 * General theory of operation:
1674 *
1675 * The only difference between copy{in,out} and
1676 * xcopy{in,out} is in the error handling routine they invoke
1677 * when a memory access error occurs. xcopyOP returns the errno
1678 * while copyOP returns -1 (see above). copy{in,out}_noerr set
1679 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
1680 * if they are called with a fault handler already in place. That flag
1681 * causes the default handlers to trampoline to the previous handler
1682 * upon an error.
1683 *
1684 * None of the copyops routines grab a window until it's decided that
1685 * we need to do a HW block copy operation. This saves a window
1686 * spill/fill when we're called during socket ops. The typical IO
1687 * path won't cause spill/fill traps.
1688 *
1689 * This code uses a set of 4 limits for the maximum size that will
1690 * be copied given a particular input/output address alignment.
1691 * If the value for a particular limit is zero, the copy will be performed
1692 * by the plain copy loops rather than FPBLK.
1693 *
1694 * See the description of bcopy above for more details of the
1695 * data copying algorithm and the default limits.
1696 *
1697 */
1698
1699/*
1700 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
1701 */
1702
1703#if defined(lint)
1704
1705
1706#else	/* lint */
1707/*
1708 * We save the arguments in the following registers in case of a fault:
1709 *	kaddr - %l1
1710 *	uaddr - %l2
1711 *	count - %l3
1712 */
/*
 * Copies of the original arguments.  copyio_fault moves these back into
 * %i0-%i2 before it jumps to REAL_LOFAULT.
 */
1713#define SAVE_SRC	%l1
1714#define SAVE_DST	%l2
1715#define SAVE_COUNT	%l3
1716
/*
 * Argument copies used by the small leaf copy paths (set, for example, at
 * .sm_do_copyout), which do not take a register window.
 */
1717#define SM_SAVE_SRC		%g4
1718#define SM_SAVE_DST		%g5
1719#define SM_SAVE_COUNT		%o5
/* copyio_fault saves the errno it receives in %g1 here. */
1720#define ERRNO		%l5
1721
1722
/* Address that copyio_fault jumps to once state has been restored. */
1723#define REAL_LOFAULT	%l4
1724/*
1725 * Generic copyio fault handler.  This is the first line of defense when a
1726 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
1727 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
1728 * This allows us to share common code for all the flavors of the copy
1729 * operations, including the _noerr versions.
1730 *
1731 * Note that this function will restore the original input parameters before
1732 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
1733 * member of the t_copyop structure, if needed.
1734 */
1735	ENTRY(copyio_fault)
	!
	! Expects, from the faulting copy routine:
	!	%g1		errno for the fault
	!	%l6		saved t_lofault, possibly with FPUSED_FLAG set
	!	SAVE_SRC, SAVE_DST, SAVE_COUNT	original arguments
	!	REAL_LOFAULT	handler to continue in
	! Restores FP state if FPUSED_FLAG is set, restores t_lofault, puts
	! the original arguments back in %i0-%i2 and jumps to REAL_LOFAULT
	! with the errno in ERRNO.
	!
1736	membar	#Sync
1737	mov	%g1,ERRNO			! save errno in ERRNO
1738	btst	FPUSED_FLAG, %l6
1739	bz	%ncc, 1f		! FP not used: no FP state to restore
1740	  nop
1741
1742	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
1743	wr	%o2, 0, %gsr    	! restore gsr
1744
	! Reload the FP registers from the stack if the saved %fprs had
	! FPRS_FEF set, otherwise clear them with FZEROQ2Q4.
1745	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
1746	btst	FPRS_FEF, %o3
1747	bz,pt	%icc, 4f
1748	  nop
1749
1750	BLD_FPQ2Q4_FROMSTACK(%o2)
1751
1752	ba,pt	%ncc, 1f
1753	  wr	%o3, 0, %fprs   	! restore fprs
1754
17554:
1756	FZEROQ2Q4
1757	wr	%o3, 0, %fprs   	! restore fprs
1758
17591:
1760	andn	%l6, FPUSED_FLAG, %l6	! only FPUSED_FLAG is cleared here
1761	membar	#Sync
1762	stn	%l6, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
1763	FP_ALLOWMIGRATE(5, 6)
1764
1765	mov	SAVE_SRC, %i0
1766	mov	SAVE_DST, %i1
1767	jmp	REAL_LOFAULT
1768	  mov	SAVE_COUNT, %i2		! delay slot: third original argument
1769
1770	SET_SIZE(copyio_fault)
1771
1772
1773#endif
1774
1775#if defined(lint)
1776
/*
 * Lint-only C stub for copyout().  It is seen only when "lint" is defined and
 * always returns 0; the implementation is the assembly in the #else branch.
 */
1777/*ARGSUSED*/
1778int
1779copyout(const void *kaddr, void *uaddr, size_t count)
1780{ return (0); }
1781
1782#else	/* lint */
1783
1784	ENTRY(copyout)
1785
1786	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
1787	bleu,pt	%ncc, .copyout_small		! go to larger cases
1788	  xor	%o0, %o1, %o3			! are src, dst alignable?
1789	btst	7, %o3				!
1790	bz,pt	%ncc, .copyout_8		! check for longword alignment
1791	  nop
1792	btst	1, %o3				!
1793	bz,pt	%ncc, .copyout_2		! check for half-word
1794	  nop
1795	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
1796	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
1797	tst	%o3
1798	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1799	  cmp	%o2, %o3			! if length <= limit
1800	bleu,pt	%ncc, .copyout_small		! go to small copy
1801	  nop
1802	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1803	  nop
1804.copyout_2:
1805	btst	3, %o3				!
1806	bz,pt	%ncc, .copyout_4		! check for word alignment
1807	  nop
1808	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
1809	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
1810	tst	%o3
1811	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1812	  cmp	%o2, %o3			! if length <= limit
1813	bleu,pt	%ncc, .copyout_small		! go to small copy
1814	  nop
1815	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1816	  nop
1817.copyout_4:
1818	! already checked longword, must be word aligned
1819	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
1820	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
1821	tst	%o3
1822	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1823	  cmp	%o2, %o3			! if length <= limit
1824	bleu,pt	%ncc, .copyout_small		! go to small copy
1825	  nop
1826	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1827	  nop
1828.copyout_8:
1829	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
1830	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
1831	tst	%o3
1832	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
1833	  cmp	%o2, %o3			! if length <= limit
1834	bleu,pt	%ncc, .copyout_small		! go to small copy
1835	  nop
1836	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
1837	  nop
1838
1839	.align	16
1840	nop				! instruction alignment
1841					! see discussion at start of file
1842.copyout_small:
1843	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
1844	or	%o5, %lo(.sm_copyout_err), %o5
1845	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
1846	membar	#Sync				! sync error barrier
1847	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
1848.sm_do_copyout:
1849	mov	%o0, SM_SAVE_SRC
1850	mov	%o1, SM_SAVE_DST
1851	cmp	%o2, SHORTCOPY		! check for really short case
1852	bleu,pt	%ncc, .co_sm_left	!
1853	  mov	%o2, SM_SAVE_COUNT
1854	cmp	%o2, CHKSIZE		! check for medium length cases
1855	bgu,pn	%ncc, .co_med		!
1856	  or	%o0, %o1, %o3		! prepare alignment check
1857	andcc	%o3, 0x3, %g0		! test for alignment
1858	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
1859.co_sm_movebytes:
1860	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
1861.co_sm_notalign4:
1862	ldub	[%o0], %o3		! read byte
1863	subcc	%o2, 4, %o2		! reduce count by 4
1864	stba	%o3, [%o1]ASI_USER	! write byte
1865	inc	%o1			! advance DST by 1
1866	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
1867	add	%o0, 4, %o0		! advance SRC by 4
1868	stba	%o3, [%o1]ASI_USER
1869	inc	%o1			! advance DST by 1
1870	ldub	[%o0 - 2], %o3
1871	stba	%o3, [%o1]ASI_USER
1872	inc	%o1			! advance DST by 1
1873	ldub	[%o0 - 1], %o3
1874	stba	%o3, [%o1]ASI_USER
1875	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
1876	  inc	%o1			! advance DST by 1
1877	add	%o2, 3, %o2		! restore count
1878.co_sm_left:
1879	tst	%o2
1880	bz,pt	%ncc, .co_sm_exit	! check for zero length
1881	  nop
1882	ldub	[%o0], %o3		! load one byte
1883	deccc	%o2			! reduce count for cc test
1884	bz,pt	%ncc, .co_sm_exit
1885	  stba	%o3,[%o1]ASI_USER	! store one byte
1886	ldub	[%o0 + 1], %o3		! load second byte
1887	deccc	%o2
1888	inc	%o1
1889	bz,pt	%ncc, .co_sm_exit
1890	  stba	%o3,[%o1]ASI_USER	! store second byte
1891	ldub	[%o0 + 2], %o3		! load third byte
1892	inc	%o1
1893	stba	%o3,[%o1]ASI_USER	! store third byte
1894	membar	#Sync				! sync error barrier
1895	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1896	retl
1897	  mov	%g0, %o0		! return 0
1898	.align	16
1899.co_sm_words:
1900	lduw	[%o0], %o3		! read word
1901.co_sm_wordx:
1902	subcc	%o2, 8, %o2		! update count
1903	stwa	%o3, [%o1]ASI_USER	! write word
1904	add	%o0, 8, %o0		! update SRC
1905	lduw	[%o0 - 4], %o3		! read word
1906	add	%o1, 4, %o1		! update DST
1907	stwa	%o3, [%o1]ASI_USER	! write word
1908	bgt,pt	%ncc, .co_sm_words	! loop til done
1909	  add	%o1, 4, %o1		! update DST
1910	addcc	%o2, 7, %o2		! restore count
1911	bz,pt	%ncc, .co_sm_exit
1912	  nop
1913	deccc	%o2
1914	bz,pt	%ncc, .co_sm_byte
1915.co_sm_half:
1916	  subcc	%o2, 2, %o2		! reduce count by 2
1917	lduh	[%o0], %o3		! read half word
1918	add	%o0, 2, %o0		! advance SRC by 2
1919	stha	%o3, [%o1]ASI_USER	! write half word
1920	bgt,pt	%ncc, .co_sm_half	! loop til done
1921	  add	%o1, 2, %o1		! advance DST by 2
1922	addcc	%o2, 1, %o2		! restore count
1923	bz,pt	%ncc, .co_sm_exit
1924	  nop
1925.co_sm_byte:
1926	ldub	[%o0], %o3
1927	stba	%o3, [%o1]ASI_USER
1928	membar	#Sync				! sync error barrier
1929	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1930	retl
1931	  mov	%g0, %o0		! return 0
1932	.align 16
1933.co_sm_word:
1934	subcc	%o2, 4, %o2		! update count
1935	bgt,pt	%ncc, .co_sm_wordx
1936	  lduw	[%o0], %o3		! read word
1937	addcc	%o2, 3, %o2		! restore count
1938	bz,pt	%ncc, .co_sm_exit
1939	  stwa	%o3, [%o1]ASI_USER	! write word
1940	deccc	%o2			! reduce count for cc test
1941	ldub	[%o0 + 4], %o3		! load one byte
1942	add	%o1, 4, %o1
1943	bz,pt	%ncc, .co_sm_exit
1944	  stba	%o3, [%o1]ASI_USER	! store one byte
1945	ldub	[%o0 + 5], %o3		! load second byte
1946	deccc	%o2
1947	inc	%o1
1948	bz,pt	%ncc, .co_sm_exit
1949	  stba	%o3, [%o1]ASI_USER	! store second byte
1950	ldub	[%o0 + 6], %o3		! load third byte
1951	inc	%o1
1952	stba	%o3, [%o1]ASI_USER	! store third byte
1953.co_sm_exit:
1954	  membar	#Sync				! sync error barrier
1955	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1956	retl
1957	  mov	%g0, %o0		! return 0
1958
1959	.align 16
1960.co_med:
1961	xor	%o0, %o1, %o3		! setup alignment check
1962	btst	1, %o3
1963	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
1964	  nop
1965	btst	3, %o3
1966	bnz,pt	%ncc, .co_med_half	! halfword aligned
1967	  nop
1968	btst	7, %o3
1969	bnz,pt	%ncc, .co_med_word	! word aligned
1970	  nop
1971.co_med_long:
1972	btst	3, %o0			! check for
1973	bz,pt	%ncc, .co_med_long1	! word alignment
1974	  nop
1975.co_med_long0:
1976	ldub	[%o0], %o3		! load one byte
1977	inc	%o0
1978	stba	%o3,[%o1]ASI_USER	! store byte
1979	inc	%o1
1980	btst	3, %o0
1981	bnz,pt	%ncc, .co_med_long0
1982	  dec	%o2
1983.co_med_long1:			! word aligned
1984	btst	7, %o0			! check for long word
1985	bz,pt	%ncc, .co_med_long2
1986	  nop
1987	lduw	[%o0], %o3		! load word
1988	add	%o0, 4, %o0		! advance SRC by 4
1989	stwa	%o3, [%o1]ASI_USER	! store word
1990	add	%o1, 4, %o1		! advance DST by 4
1991	sub	%o2, 4, %o2		! reduce count by 4
1992!
1993!  Now long word aligned and have at least 32 bytes to move
1994!
1995.co_med_long2:
1996	sub	%o2, 31, %o2		! adjust count to allow cc zero test
1997	sub	%o1, 8, %o1		! adjust pointer to allow store in
1998					! branch delay slot instead of add
1999.co_med_lmove:
2000	add	%o1, 8, %o1		! advance DST by 8
2001	ldx	[%o0], %o3		! read long word
2002	subcc	%o2, 32, %o2		! reduce count by 32
2003	stxa	%o3, [%o1]ASI_USER	! write long word
2004	add	%o1, 8, %o1		! advance DST by 8
2005	ldx	[%o0 + 8], %o3		! repeat for a total for 4 long words
2006	add	%o0, 32, %o0		! advance SRC by 32
2007	stxa	%o3, [%o1]ASI_USER
2008	ldx	[%o0 - 16], %o3
2009	add	%o1, 8, %o1		! advance DST by 8
2010	stxa	%o3, [%o1]ASI_USER
2011	ldx	[%o0 - 8], %o3
2012	add	%o1, 8, %o1		! advance DST by 8
2013	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
2014	  stxa	%o3, [%o1]ASI_USER
2015	add	%o1, 8, %o1		! advance DST by 8
2016	addcc	%o2, 24, %o2		! restore count to long word offset
2017	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
2018	  nop
2019.co_med_lword:
2020	ldx	[%o0], %o3		! read long word
2021	subcc	%o2, 8, %o2		! reduce count by 8
2022	stxa	%o3, [%o1]ASI_USER	! write long word
2023	add	%o0, 8, %o0		! advance SRC by 8
2024	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
2025	  add	%o1, 8, %o1		! advance DST by 8
2026.co_med_lextra:
2027	addcc	%o2, 7, %o2		! restore rest of count
2028	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2029	  deccc	%o2
2030	bz,pt	%ncc, .co_sm_byte
2031	  nop
2032	ba,pt	%ncc, .co_sm_half
2033	  nop
2034
2035	.align 16
2036	nop				! instruction alignment
2037					! see discussion at start of file
2038.co_med_word:
2039	btst	3, %o0			! check for
2040	bz,pt	%ncc, .co_med_word1	! word alignment
2041	  nop
2042.co_med_word0:
2043	ldub	[%o0], %o3		! load one byte
2044	inc	%o0
2045	stba	%o3,[%o1]ASI_USER	! store byte
2046	inc	%o1
2047	btst	3, %o0
2048	bnz,pt	%ncc, .co_med_word0
2049	  dec	%o2
2050!
2051!  Now word aligned and have at least 36 bytes to move
2052!
2053.co_med_word1:
2054	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2055.co_med_wmove:
2056	lduw	[%o0], %o3		! read word
2057	subcc	%o2, 16, %o2		! reduce count by 16
2058	stwa	%o3, [%o1]ASI_USER	! write word
2059	add	%o1, 4, %o1		! advance DST by 4
2060	lduw	[%o0 + 4], %o3		! repeat for a total for 4 words
2061	add	%o0, 16, %o0		! advance SRC by 16
2062	stwa	%o3, [%o1]ASI_USER
2063	add	%o1, 4, %o1		! advance DST by 4
2064	lduw	[%o0 - 8], %o3
2065	stwa	%o3, [%o1]ASI_USER
2066	add	%o1, 4, %o1		! advance DST by 4
2067	lduw	[%o0 - 4], %o3
2068	stwa	%o3, [%o1]ASI_USER
2069	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
2070	  add	%o1, 4, %o1		! advance DST by 4
2071	addcc	%o2, 12, %o2		! restore count to word offset
2072	ble,pt	%ncc, .co_med_wextra	! check for more words to move
2073	  nop
2074.co_med_word2:
2075	lduw	[%o0], %o3		! read word
2076	subcc	%o2, 4, %o2		! reduce count by 4
2077	stwa	%o3, [%o1]ASI_USER	! write word
2078	add	%o0, 4, %o0		! advance SRC by 4
2079	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
2080	  add	%o1, 4, %o1		! advance DST by 4
2081.co_med_wextra:
2082	addcc	%o2, 3, %o2		! restore rest of count
2083	bz,pt	%ncc, .co_sm_exit	! if zero, then done
2084	  deccc	%o2
2085	bz,pt	%ncc, .co_sm_byte
2086	  nop
2087	ba,pt	%ncc, .co_sm_half
2088	  nop
2089
2090	.align 16
2091	nop				! instruction alignment
2092	nop				! see discussion at start of file
2093	nop
2094.co_med_half:
2095	btst	1, %o0			! check for
2096	bz,pt	%ncc, .co_med_half1	! half word alignment
2097	  nop
2098	ldub	[%o0], %o3		! load one byte
2099	inc	%o0
2100	stba	%o3,[%o1]ASI_USER	! store byte
2101	inc	%o1
2102	dec	%o2
2103!
2104!  Now half word aligned and have at least 38 bytes to move
2105!
2106.co_med_half1:
2107	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2108.co_med_hmove:
2109	lduh	[%o0], %o3		! read half word
2110	subcc	%o2, 8, %o2		! reduce count by 8
2111	stha	%o3, [%o1]ASI_USER	! write half word
2112	add	%o1, 2, %o1		! advance DST by 2
2113	lduh	[%o0 + 2], %o3		! repeat for a total for 4 halfwords
2114	add	%o0, 8, %o0		! advance SRC by 8
2115	stha	%o3, [%o1]ASI_USER
2116	add	%o1, 2, %o1		! advance DST by 2
2117	lduh	[%o0 - 4], %o3
2118	stha	%o3, [%o1]ASI_USER
2119	add	%o1, 2, %o1		! advance DST by 2
2120	lduh	[%o0 - 2], %o3
2121	stha	%o3, [%o1]ASI_USER
2122	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
2123	  add	%o1, 2, %o1		! advance DST by 2
2124	addcc	%o2, 7, %o2		! restore count
2125	bz,pt	%ncc, .co_sm_exit
2126	  deccc	%o2
2127	bz,pt	%ncc, .co_sm_byte
2128	  nop
2129	ba,pt	%ncc, .co_sm_half
2130	  nop
2131
2132/*
2133 * We got here because of a fault during short copyout.
2134 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2135 */
2136.sm_copyout_err:
2137	membar	#Sync
2138	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2139	mov	SM_SAVE_SRC, %o0
2140	mov	SM_SAVE_DST, %o1
2141	mov	SM_SAVE_COUNT, %o2
2142	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2143	tst	%o3
2144	bz,pt	%ncc, 3f			! if not, return error
2145	  nop
2146	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
2147	jmp	%o5				! original arguments
2148	  nop
21493:
2150	retl
2151	  or	%g0, -1, %o0		! return error value
2152
2153	SET_SIZE(copyout)
2154
2155/*
2156 * The _more entry points are not intended to be used directly by
2157 * any caller from outside this file.  They are provided to allow
2158 * profiling and dtrace of the portions of the copy code that use
2159 * the floating point registers.
2160 * This entry is particularly important as DTRACE (at least as of
2161 * 4/2004) does not support leaf functions.
2162 */
2163
/*
 * copyout_more - large-copy path of copyout (kernel source, user
 * destination) that moves the bulk of the data through the floating
 * point registers using 64-byte block stores.
 *
 * Reached by branch from the copyout size/alignment dispatch at
 * .copyout_more, and at .do_copyout from .xcopyout_more, which has
 * already executed its own save and loaded REAL_LOFAULT.
 *
 * Outline:
 *   1. save a register window; record the routine-specific error label
 *      in REAL_LOFAULT and install copyio_fault as t_lofault
 *   2. keep the incoming arguments in SAVE_SRC/SAVE_DST/SAVE_COUNT
 *   3. save %fprs and %gsr on the stack (plus FP registers when the
 *      FPU was already enabled)
 *   4. byte-copy until DST is VIS_BLOCKSIZE aligned
 *   5. main loop: 8-byte loads, faligndata, stda to ASI_BLK_AIUS
 *   6. byte-copy whatever is left, restore FP state and t_lofault,
 *      return 0
 *
 * NOTE(review): SRC, DST, CNT, TMP, REALSRC, REAL_LOFAULT and SAVE_*
 * are register aliases, and FP_NOMIGRATE / FP_ALLOWMIGRATE /
 * BST_FPQ2Q4_TOSTACK / BLD_FPQ2Q4_FROMSTACK / FZEROQ2Q4 are macros,
 * all defined outside this block; the descriptions of them below are
 * inferred from their names and use here - confirm at the definitions.
 */
2164	ENTRY(copyout_more)
2165.copyout_more:
2166	prefetch [%o0], #n_reads
2167	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2168	set	.copyout_err, REAL_LOFAULT
2169
2170/*
2171 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
2172 */
/*
 * The handler stored into t_lofault below is the symbol copyio_fault
 * (no leading dot, unlike the spelling in the trailing comment on the
 * set).  The previous t_lofault value is kept in %l6.
 */
2173.do_copyout:
2174        set     copyio_fault, %l7		! .copyio_fault is lofault val
2175
2176	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2177	membar	#Sync				! sync error barrier
2178	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2179
2180	mov	%i0, SAVE_SRC
2181	mov	%i1, SAVE_DST
2182	mov	%i2, SAVE_COUNT
2183
2184	FP_NOMIGRATE(6, 7)
2185
	/*
	 * Save the entry value of %fprs.  If FPRS_FEF was clear the
	 * branch is taken and its annulled delay slot (executed only
	 * when the branch is taken) turns the FPU on.  If FPRS_FEF was
	 * set, fall through to BST_FPQ2Q4_TOSTACK first.
	 */
2186	rd	%fprs, %o2		! check for unused fp
2187	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2188	btst	FPRS_FEF, %o2
2189	bz,a,pt	%icc, .do_blockcopyout
2190	  wr	%g0, FPRS_FEF, %fprs
2191
2192	BST_FPQ2Q4_TOSTACK(%o2)
2193
	/*
	 * Save %gsr and set FPUSED_FLAG in the saved t_lofault value held
	 * in %l6; the flag is cleared again before %l6 is written back
	 * on the normal exit path.
	 */
2194.do_blockcopyout:
2195	rd	%gsr, %o2
2196	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2197	or	%l6, FPUSED_FLAG, %l6
2198
	/*
	 * TMP = DST & (VIS_BLOCKSIZE - 1).  Zero means DST is already
	 * block aligned, so skip to 2:.  Otherwise TMP becomes
	 * VIS_BLOCKSIZE - TMP.  %asi is loaded with ASI_USER for the
	 * "stba ... %asi" stores used by the byte loops.
	 */
2199	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2200	mov	ASI_USER, %asi
2201	bz,pt	%ncc, 2f
2202	  neg	TMP
2203	add	TMP, VIS_BLOCKSIZE, TMP
2204
2205	! TMP = bytes required to align DST on FP_BLOCK boundary
2206	! Using SRC as a tmp here
2207	cmp	TMP, 3
2208	bleu,pt	%ncc, 1f
2209	  sub	CNT,TMP,CNT		! adjust main count
2210	sub	TMP, 3, TMP		! adjust for end of loop test
2211.co_blkalign:
2212	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
2213	stba	SRC, [DST]%asi
2214	subcc	TMP, 4, TMP
2215	ldub	[REALSRC + 1], SRC
2216	add	REALSRC, 4, REALSRC
2217	stba	SRC, [DST + 1]%asi
2218	ldub	[REALSRC - 2], SRC
2219	add	DST, 4, DST
2220	stba	SRC, [DST - 2]%asi
2221	ldub	[REALSRC - 1], SRC
2222	bgu,pt	%ncc, .co_blkalign
2223	  stba	SRC, [DST - 1]%asi
2224
2225	addcc	TMP, 3, TMP		! restore count adjustment
2226	bz,pt	%ncc, 2f		! no bytes left?
2227	  nop
	/* Finish the alignment one byte at a time. */
22281:	ldub	[REALSRC], SRC
2229	inc	REALSRC
2230	inc	DST
2231	deccc	TMP
2232	bgu	%ncc, 1b
2233	  stba	SRC, [DST - 1]%asi
2234
	/*
	 * Prime the block loop.  SRC is REALSRC rounded down to 8 bytes;
	 * alignaddr takes the byte offset from REALSRC for the
	 * faligndata instructions.  The first source block is loaded
	 * into %f16-%f30 and the first seven output doublewords are
	 * formed in %f48-%f58.  %f16 is then reloaded from the next
	 * source block and SRC/REALSRC/CNT are advanced by one block.
	 */
22352:
2236	membar	#StoreLoad
2237	andn	REALSRC, 0x7, SRC
2238
2239	! SRC - 8-byte aligned
2240	! DST - 64-byte aligned
2241	ldd	[SRC], %f16
2242	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
2243	alignaddr REALSRC, %g0, %g0
2244	ldd	[SRC + 0x08], %f18
2245	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
2246	faligndata %f16, %f18, %f48
2247	ldd	[SRC + 0x10], %f20
2248	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2249	faligndata %f18, %f20, %f50
2250	ldd	[SRC + 0x18], %f22
2251	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
2252	faligndata %f20, %f22, %f52
2253	ldd	[SRC + 0x20], %f24
2254	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
2255	faligndata %f22, %f24, %f54
2256	ldd	[SRC + 0x28], %f26
2257	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
2258	faligndata %f24, %f26, %f56
2259	ldd	[SRC + 0x30], %f28
2260	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
2261	faligndata %f26, %f28, %f58
2262	ldd	[SRC + 0x38], %f30
2263	ldd	[SRC + VIS_BLOCKSIZE], %f16
2264	sub	CNT, VIS_BLOCKSIZE, CNT
2265	add	SRC, VIS_BLOCKSIZE, SRC
2266	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
2267	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2268	ba,pt	%ncc, 1f
2269	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
2270	.align	32
	/*
	 * Main loop.  Each pass completes the pending output block
	 * (%f60, %f62), block-stores %f48-%f62 to the user destination,
	 * and starts forming the next output block while loading the
	 * following source block.  Repeats while CNT > VIS_BLOCKSIZE + 8
	 * (unsigned).
	 */
22711:
2272	ldd	[SRC + 0x08], %f18
2273	faligndata %f28, %f30, %f60
2274	ldd	[SRC + 0x10], %f20
2275	faligndata %f30, %f16, %f62
2276	stda	%f48, [DST]ASI_BLK_AIUS
2277	ldd	[SRC + 0x18], %f22
2278	faligndata %f16, %f18, %f48
2279	ldd	[SRC + 0x20], %f24
2280	faligndata %f18, %f20, %f50
2281	ldd	[SRC + 0x28], %f26
2282	faligndata %f20, %f22, %f52
2283	ldd	[SRC + 0x30], %f28
2284	faligndata %f22, %f24, %f54
2285	sub	CNT, VIS_BLOCKSIZE, CNT
2286	ldd	[SRC + 0x38], %f30
2287	faligndata %f24, %f26, %f56
2288	add	DST, VIS_BLOCKSIZE, DST
2289	ldd	[SRC + VIS_BLOCKSIZE], %f16
2290	faligndata %f26, %f28, %f58
2291	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2292	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
2293	add	SRC, VIS_BLOCKSIZE, SRC
2294	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2295	cmp	CNT, VIS_BLOCKSIZE + 8
2296	bgu,pt	%ncc, 1b
2297	  prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
2298
	/*
	 * One output block is still pending.  If CNT is exactly
	 * VIS_BLOCKSIZE and REALSRC is 8-byte aligned, go to 2: and
	 * move one further whole block with fsrc1 (plain register
	 * copies).  In every other case fall into the first 3:, which
	 * stores the pending block and leaves the remaining CNT bytes
	 * to the byte loop at 5:.  The andcc sits in the delay slot of
	 * a non-annulled bne, so it always executes.
	 */
2299	! only if REALSRC & 0x7 is 0
2300	cmp	CNT, VIS_BLOCKSIZE
2301	bne	%ncc, 3f
2302	  andcc	REALSRC, 0x7, %g0
2303	bz,pt	%ncc, 2f
2304	  nop
23053:
2306	faligndata %f28, %f30, %f60
2307	faligndata %f30, %f16, %f62
2308	stda	%f48, [DST]ASI_BLK_AIUS
2309	add	DST, VIS_BLOCKSIZE, DST
2310	ba,pt	%ncc, 3f
2311	  nop
23122:
2313	ldd	[SRC + 0x08], %f18
2314	fsrc1	%f28, %f60
2315	ldd	[SRC + 0x10], %f20
2316	fsrc1	%f30, %f62
2317	stda	%f48, [DST]ASI_BLK_AIUS
2318	ldd	[SRC + 0x18], %f22
2319	fsrc1	%f16, %f48
2320	ldd	[SRC + 0x20], %f24
2321	fsrc1	%f18, %f50
2322	ldd	[SRC + 0x28], %f26
2323	fsrc1	%f20, %f52
2324	ldd	[SRC + 0x30], %f28
2325	fsrc1	%f22, %f54
2326	ldd	[SRC + 0x38], %f30
2327	fsrc1	%f24, %f56
2328	sub	CNT, VIS_BLOCKSIZE, CNT
2329	add	DST, VIS_BLOCKSIZE, DST
2330	add	SRC, VIS_BLOCKSIZE, SRC
2331	add	REALSRC, VIS_BLOCKSIZE, REALSRC
2332	fsrc1	%f26, %f58
2333	fsrc1	%f28, %f60
2334	fsrc1	%f30, %f62
2335	stda	%f48, [DST]ASI_BLK_AIUS
2336	add	DST, VIS_BLOCKSIZE, DST
2337	ba,a,pt	%ncc, 4f
2338	  nop
2339
	/* Tail: copy the remaining CNT bytes (possibly none) bytewise. */
23403:	tst	CNT
2341	bz,a	%ncc, 4f
2342	  nop
2343
23445:	ldub	[REALSRC], TMP
2345	inc	REALSRC
2346	inc	DST
2347	deccc	CNT
2348	bgu	%ncc, 5b
2349	  stba	TMP, [DST - 1]%asi
23504:
2351
	/*
	 * Normal exit.  Restore %gsr, then look at the %fprs value saved
	 * on entry.  FPRS_FEF clear: run FZEROQ2Q4 (label 4:).
	 * FPRS_FEF set: run BLD_FPQ2Q4_FROMSTACK.  Either way the saved
	 * %fprs is written back.
	 */
2352.copyout_exit:
2353	membar	#Sync
2354
2355	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
2356	wr	%o2, 0, %gsr		! restore gsr
2357
2358	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
2359	btst	FPRS_FEF, %o3
2360	bz,pt	%icc, 4f
2361	  nop
2362
2363	BLD_FPQ2Q4_FROMSTACK(%o2)
2364
2365	ba,pt	%ncc, 1f
2366	  wr	%o3, 0, %fprs		! restore fprs
2367
23684:
2369	FZEROQ2Q4
2370	wr	%o3, 0, %fprs		! restore fprs
2371
	/*
	 * Strip FPUSED_FLAG from the saved handler value, put it back in
	 * t_lofault, and return 0 (the restore writes %g0 + 0 into the
	 * caller's %o0).
	 */
23721:
2372	membar	#Sync
2373	andn	%l6, FPUSED_FLAG, %l6
2374	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2375	FP_ALLOWMIGRATE(5, 6)
2376	ret
2377	  restore	%g0, 0, %o0
2378
2379/*
2380 * We got here because of a fault during copyout.
2381 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
2382 */
/*
 * This label is the value loaded into REAL_LOFAULT at .copyout_more.
 * NOTE(review): presumably copyio_fault transfers here after undoing
 * the FP/t_lofault setup - confirm at its definition.  If the thread
 * has a copyops vector, control is passed to its CP_COPYOUT entry with
 * the register window released in the jmp delay slot.  Otherwise the
 * routine returns -1.
 */
2383.copyout_err:
2384	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2385	tst	%o4
2386	bz,pt	%ncc, 2f			! if not, return error
2387	  nop
2388	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
2389	jmp	%g2				! original arguments
2390	  restore %g0, 0, %g0			! dispose of copy window
23912:
2392        ret
2393	  restore %g0, -1, %o0			! return error value
2394
2395
2396	SET_SIZE(copyout_more)
2398
2399#endif	/* lint */
2400
2401
2402#ifdef	lint
2403
/*
 * Lint-only stub: compiled only when "lint" is defined, so that lint
 * sees a C definition of xcopyout().  The implementation used otherwise
 * is the assembly routine in the #else branch.
 */
2404/*ARGSUSED*/
2405int
2406xcopyout(const void *kaddr, void *uaddr, size_t count)
2407{ return (0); }
2408
2409#else	/* lint */
2410
/*
 * xcopyout - copy count (%o2) bytes from kernel address %o0 to user
 * address %o1.  This routine contains only the entry dispatch and the
 * two fault handlers; the copying itself is done by code it branches
 * to (.sm_do_copyout for small copies, .do_copyout for large ones).
 *
 * Unlike the copyout fault handlers, the handlers here return an errno
 * value (not -1) when the thread has no copyops vector.
 *
 * Dispatch:
 *   count <= VIS_COPY_THRESHOLD          -> .xcopyout_small
 *     (this is the case in which the bleu below is taken; the "go to
 *     larger cases" remark on that line describes the fall-through)
 *   otherwise a limit is chosen from (src ^ dst), tested in this order:
 *     (src ^ dst) & 7 == 0               -> hw_copy_limit_8
 *     (src ^ dst) & 1 != 0               -> hw_copy_limit_1
 *     (src ^ dst) & 3 == 0               -> hw_copy_limit_4
 *     otherwise                          -> hw_copy_limit_2
 *   limit == 0 or count <= limit         -> .xcopyout_small
 *   otherwise                            -> .xcopyout_more
 *
 * In each limit check the cmp sits in the delay slot of a non-annulled
 * bz, so it executes before the bleu on the fall-through path.
 */
2411	ENTRY(xcopyout)
2412	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2413	bleu,pt	%ncc, .xcopyout_small		! go to larger cases
2414	  xor	%o0, %o1, %o3			! are src, dst alignable?
2415	btst	7, %o3				!
2416	bz,pt	%ncc, .xcopyout_8		!
2417	  nop
2418	btst	1, %o3				!
2419	bz,pt	%ncc, .xcopyout_2		! check for half-word
2420	  nop
2421	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2422	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2423	tst	%o3
2424	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2425	  cmp	%o2, %o3			! if length <= limit
2426	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2427	  nop
2428	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2429	  nop
2430.xcopyout_2:
2431	btst	3, %o3				!
2432	bz,pt	%ncc, .xcopyout_4		! check for word alignment
2433	  nop
2434	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2435	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2436	tst	%o3
2437	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2438	  cmp	%o2, %o3			! if length <= limit
2439	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2440	  nop
2441	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2442	  nop
2443.xcopyout_4:
2444	! already checked longword, must be word aligned
2445	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2446	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2447	tst	%o3
2448	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2449	  cmp	%o2, %o3			! if length <= limit
2450	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2451	  nop
2452	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2453	  nop
2454.xcopyout_8:
2455	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2456	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2457	tst	%o3
2458	bz,pn	%icc, .xcopyout_small		! if zero, disable HW copy
2459	  cmp	%o2, %o3			! if length <= limit
2460	bleu,pt	%ncc, .xcopyout_small		! go to small copy
2461	  nop
2462	ba,pt	%ncc, .xcopyout_more		! otherwise go to large copy
2463	  nop
2464
	/*
	 * Small copy: stays a leaf routine.  The old t_lofault is kept
	 * in %o4 and .sm_xcopyout_err is installed by the store in the
	 * branch delay slot before the shared small-copy code runs.
	 */
2465.xcopyout_small:
2466	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
2467	or	%o5, %lo(.sm_xcopyout_err), %o5
2468	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
2469	membar	#Sync				! sync error barrier
2470	ba,pt	%ncc, .sm_do_copyout		! common code
2471	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
2472
	/*
	 * Large copy: take a register window, load REAL_LOFAULT with
	 * .xcopyout_err (the low bits are or'ed in from the delay slot)
	 * and join the shared large-copy code.
	 */
2473.xcopyout_more:
2474	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2475	sethi	%hi(.xcopyout_err), REAL_LOFAULT
2476	ba,pt	%ncc, .do_copyout		! common code
2477	  or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT
2478
2479/*
2480 * We got here because of fault during xcopyout
2481 * Errno value is in ERRNO
2482 */
/*
 * Large-copy fault path (still inside the window taken at
 * .xcopyout_more).  With a copyops vector present, jump to its
 * CP_XCOPYOUT entry and release the window in the delay slot.
 * Otherwise return ERRNO in the caller's %o0.
 */
2483.xcopyout_err:
2484	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
2485	tst	%o4
2486	bz,pt	%ncc, 2f			! if not, return error
2487	  nop
2488	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
2489	jmp	%g2				! original arguments
2490	  restore %g0, 0, %g0			! dispose of copy window
24912:
2492        ret
2493	  restore ERRNO, 0, %o0			! return errno value
2494
/*
 * Small-copy fault path (leaf, no window).  Put the caller's t_lofault
 * back from %o4 and reload the original arguments from SM_SAVE_*.
 * Then either jump to the copyops CP_XCOPYOUT entry or return the
 * value in %g1.
 * NOTE(review): this path names %g1 directly where .xcopyout_err uses
 * ERRNO; presumably ERRNO is an alias for %g1 - confirm.
 */
2495.sm_xcopyout_err:
2496
2497	membar	#Sync
2498	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2499	mov	SM_SAVE_SRC, %o0
2500	mov	SM_SAVE_DST, %o1
2501	mov	SM_SAVE_COUNT, %o2
2502	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2503	tst	%o3
2504	bz,pt	%ncc, 3f			! if not, return error
2505	  nop
2506	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
2507	jmp	%o5				! original arguments
2508	  nop
25093:
2510	retl
2511	  or	%g1, 0, %o0		! return errno value
2512
2513	SET_SIZE(xcopyout)
2514
2515#endif	/* lint */
2516
2517#ifdef	lint
2518
/*
 * Lint-only stub: compiled only when "lint" is defined, so that lint
 * sees a C definition of xcopyout_little().  The implementation used
 * otherwise is the assembly routine in the #else branch.
 */
2519/*ARGSUSED*/
2520int
2521xcopyout_little(const void *kaddr, void *uaddr, size_t count)
2522{ return (0); }
2523
2524#else	/* lint */
2525
/*
 * xcopyout_little - copy count (%o2) bytes from kernel address %o0 to
 * user address %o1 one byte at a time.  The source is read from its
 * last byte backwards while the destination is written at ascending
 * addresses through ASI_AIUSL, so dst[i] receives src[count - 1 - i].
 * Returns 0.
 *
 * .xcopyio_err (defined outside this block) is installed as t_lofault
 * while the copy runs; the previous handler is parked in %o5 and put
 * back on the normal exit.
 *
 * Register use inside the loop:
 *   %o3  negative index, runs from -count up to 0
 *   %o1  dst + count, so [%o1 + %o3] walks dst upwards
 *   %o0  starts at src + 2*count - 1 and drops by 2 per byte, so
 *        [%o0 + %o3] walks src downwards from its last byte
 *   %o4  the byte in flight
 */
2526	ENTRY(xcopyout_little)
2527	sethi	%hi(.xcopyio_err), %o5
2528	or	%o5, %lo(.xcopyio_err), %o5
2529	ldn	[THREAD_REG + T_LOFAULT], %o4
2530	membar	#Sync				! sync error barrier
2531	stn	%o5, [THREAD_REG + T_LOFAULT]
2532	mov	%o4, %o5
2533
	/*
	 * %o3 = -count; a zero count branches straight to the exit.
	 * The sub in the delay slot of the non-annulled bz always
	 * executes and leaves count - 1 in %o4.
	 */
2534	subcc	%g0, %o2, %o3
2535	add	%o0, %o2, %o0
2536	bz,pn	%ncc, 2f		! check for zero bytes
2537	  sub	%o2, 1, %o4
2538	add	%o0, %o4, %o0		! start w/last byte
2539	add	%o1, %o2, %o1
2540	ldub	[%o0 + %o3], %o4
2541
	/*
	 * inccc sets carry only when %o3 wraps from -1 to 0, i.e. after
	 * the final byte has been stored.  Until then bcc is taken and
	 * its annulled delay slot fetches the next (lower) source byte.
	 */
25421:	stba	%o4, [%o1 + %o3]ASI_AIUSL
2543	inccc	%o3
2544	sub	%o0, 2, %o0		! get next byte
2545	bcc,a,pt %ncc, 1b
2546	  ldub	[%o0 + %o3], %o4
2547
25482:
2549	membar	#Sync				! sync error barrier
2550	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2551	retl
2552	  mov	%g0, %o0		! return (0)
2553
2554	SET_SIZE(xcopyout_little)
2555
2556#endif	/* lint */
2557
2558/*
2559 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
2560 */
2561
2562#if defined(lint)
2563
/*
 * Lint-only stub: compiled only when "lint" is defined, so that lint
 * sees a C definition of copyin().  The implementation used otherwise
 * is the assembly routine in the #else branch.
 */
2564/*ARGSUSED*/
2565int
2566copyin(const void *uaddr, void *kaddr, size_t count)
2567{ return (0); }
2568
2569#else	/* lint */
2570
/*
 * copyin - copy count (%o2) bytes from user address %o0 to kernel
 * address %o1.  Returns 0 on success.  On a fault in the small-copy
 * paths, .sm_copyin_err passes the original arguments to the thread's
 * copyops handler if one is installed, and otherwise returns -1.
 *
 * Entry dispatch:
 *   count <= VIS_COPY_THRESHOLD          -> .copyin_small
 *     (this is the case in which the first bleu is taken; the "go to
 *     larger cases" remark on that line describes the fall-through)
 *   otherwise a limit is chosen from (src ^ dst), tested in this order:
 *     (src ^ dst) & 7 == 0               -> hw_copy_limit_8
 *     (src ^ dst) & 1 != 0               -> hw_copy_limit_1
 *     (src ^ dst) & 3 == 0               -> hw_copy_limit_4
 *     otherwise                          -> hw_copy_limit_2
 *   limit == 0 or count <= limit         -> .copyin_small
 *   otherwise                            -> .copyin_more
 *
 * Throughout the small paths every read from the source uses an
 * alternate-space load with ASI_USER, and every store to the
 * destination is an ordinary store.  %o4 holds the caller's t_lofault
 * and is written back on each exit.
 *
 * NOTE(review): SM_SAVE_SRC/SM_SAVE_DST/SM_SAVE_COUNT, SHORTCOPY and
 * CHKSIZE are defined outside this block.  .ci_sm_left moves at most
 * three bytes, so it relies on SHORTCOPY being no larger than 3, and
 * the "at least N bytes" remarks further down rely on CHKSIZE -
 * confirm both against the definitions.
 */
2571	ENTRY(copyin)
2572	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
2573	bleu,pt	%ncc, .copyin_small		! go to larger cases
2574	  xor	%o0, %o1, %o3			! are src, dst alignable?
2575	btst	7, %o3				!
2576	bz,pt	%ncc, .copyin_8			! check for longword alignment
2577	  nop
2578	btst	1, %o3				!
2579	bz,pt	%ncc, .copyin_2			! check for half-word
2580	  nop
2581	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
2582	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
2583	tst	%o3
2584	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2585	  cmp	%o2, %o3			! if length <= limit
2586	bleu,pt	%ncc, .copyin_small		! go to small copy
2587	  nop
2588	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2589	  nop
2590.copyin_2:
2591	btst	3, %o3				!
2592	bz,pt	%ncc, .copyin_4			! check for word alignment
2593	  nop
2594	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
2595	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
2596	tst	%o3
2597	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2598	  cmp	%o2, %o3			! if length <= limit
2599	bleu,pt	%ncc, .copyin_small		! go to small copy
2600	  nop
2601	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2602	  nop
2603.copyin_4:
2604	! already checked longword, must be word aligned
2605	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
2606	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
2607	tst	%o3
2608	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2609	  cmp	%o2, %o3			! if length <= limit
2610	bleu,pt	%ncc, .copyin_small		! go to small copy
2611	  nop
2612	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2613	  nop
2614.copyin_8:
2615	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
2616	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
2617	tst	%o3
2618	bz,pn	%icc, .copyin_small		! if zero, disable HW copy
2619	  cmp	%o2, %o3			! if length <= limit
2620	bleu,pt	%ncc, .copyin_small		! go to small copy
2621	  nop
2622	ba,pt	%ncc, .copyin_more		! otherwise go to large copy
2623	  nop
2624
2625	.align	16
2626	nop				! instruction alignment
2627					! see discussion at start of file
	/*
	 * Small copy.  Install .sm_copyin_err as t_lofault, keeping the
	 * previous handler in %o4.
	 */
2628.copyin_small:
2629	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
2630	or	%o5, %lo(.sm_copyin_err), %o5
2631	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
2632	membar	#Sync				! sync error barrier
2633	stn	%o5, [THREAD_REG + T_LOFAULT]
	/*
	 * Record the original arguments for the fault path, then choose
	 * a strategy:
	 *   count <= SHORTCOPY                    -> .ci_sm_left
	 *   count >  CHKSIZE                      -> .ci_med
	 *   (src | dst) & 3 == 0                  -> .ci_sm_word
	 *   otherwise                             -> byte loop below
	 * The "sub %o2, 3" at .ci_sm_movebytes is also the delay slot of
	 * the non-annulled bz to .ci_sm_word, so it executes on both
	 * paths and .ci_sm_word is entered with count - 3 in %o2.
	 */
2634.sm_do_copyin:
2635	mov	%o0, SM_SAVE_SRC
2636	mov	%o1, SM_SAVE_DST
2637	cmp	%o2, SHORTCOPY		! check for really short case
2638	bleu,pt	%ncc, .ci_sm_left	!
2639	  mov	%o2, SM_SAVE_COUNT
2640	cmp	%o2, CHKSIZE		! check for medium length cases
2641	bgu,pn	%ncc, .ci_med		!
2642	  or	%o0, %o1, %o3		! prepare alignment check
2643	andcc	%o3, 0x3, %g0		! test for alignment
2644	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
2645.ci_sm_movebytes:
2646	  sub	%o2, 3, %o2		! adjust count to allow cc zero test
2647.ci_sm_notalign4:
2648	lduba	[%o0]ASI_USER, %o3	! read byte
2649	subcc	%o2, 4, %o2		! reduce count by 4
2650	stb	%o3, [%o1]		! write byte
2651	add	%o0, 1, %o0		! advance SRC by 1
2652	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
2653	add	%o0, 1, %o0		! advance SRC by 1
2654	stb	%o3, [%o1 + 1]
2655	add	%o1, 4, %o1		! advance DST by 4
2656	lduba	[%o0]ASI_USER, %o3
2657	add	%o0, 1, %o0		! advance SRC by 1
2658	stb	%o3, [%o1 - 2]
2659	lduba	[%o0]ASI_USER, %o3
2660	add	%o0, 1, %o0		! advance SRC by 1
2661	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
2662	  stb	%o3, [%o1 - 1]
2663	add	%o2, 3, %o2		! restore count
	/*
	 * Move the last zero to three bytes.  Each store sits in the
	 * delay slot of the exit test that precedes the next load.
	 */
2664.ci_sm_left:
2665	tst	%o2
2666	bz,pt	%ncc, .ci_sm_exit
2667	  nop
2668	lduba	[%o0]ASI_USER, %o3		! load one byte
2669	deccc	%o2			! reduce count for cc test
2670	bz,pt	%ncc, .ci_sm_exit
2671	  stb	%o3,[%o1]		! store one byte
2672	inc	%o0
2673	lduba	[%o0]ASI_USER, %o3	! load second byte
2674	deccc	%o2
2675	bz,pt	%ncc, .ci_sm_exit
2676	  stb	%o3,[%o1 + 1]		! store second byte
2677	inc	%o0
2678	lduba	[%o0]ASI_USER, %o3	! load third byte
2679	stb	%o3,[%o1 + 2]		! store third byte
2680	membar	#Sync				! sync error barrier
2681	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2682	retl
2683	  mov	%g0, %o0		! return 0
2684	.align	16
	/*
	 * Word-aligned small copy: two words per pass.  .ci_sm_wordx is
	 * the entry used by .ci_sm_word, which has already loaded the
	 * first word into %o3.
	 */
2685.ci_sm_words:
2686	lduwa	[%o0]ASI_USER, %o3		! read word
2687.ci_sm_wordx:
2688	subcc	%o2, 8, %o2		! update count
2689	stw	%o3, [%o1]		! write word
2690	add	%o0, 4, %o0		! update SRC
2691	add	%o1, 8, %o1		! update DST
2692	lduwa	[%o0]ASI_USER, %o3	! read word
2693	add	%o0, 4, %o0		! update SRC
2694	bgt,pt	%ncc, .ci_sm_words	! loop til done
2695	  stw	%o3, [%o1 - 4]		! write word
2696	addcc	%o2, 7, %o2		! restore count
2697	bz,pt	%ncc, .ci_sm_exit
2698	  nop
2699	deccc	%o2
2700	bz,pt	%ncc, .ci_sm_byte
	/*
	 * Shared tail, also reached from the .ci_med_* paths with
	 * (bytes left - 1) in %o2: halfwords while more than one byte is
	 * left, then at most one byte at .ci_sm_byte.  The first subcc
	 * doubles as the delay slot of the bz just above.
	 */
2701.ci_sm_half:
2702	  subcc	%o2, 2, %o2		! reduce count by 2
2703	lduha	[%o0]ASI_USER, %o3	! read half word
2704	add	%o0, 2, %o0		! advance SRC by 2
2705	add	%o1, 2, %o1		! advance DST by 2
2706	bgt,pt	%ncc, .ci_sm_half	! loop til done
2707	  sth	%o3, [%o1 - 2]		! write half word
2708	addcc	%o2, 1, %o2		! restore count
2709	bz,pt	%ncc, .ci_sm_exit
2710	  nop
2711.ci_sm_byte:
2712	lduba	[%o0]ASI_USER, %o3
2713	stb	%o3, [%o1]
2714	membar	#Sync				! sync error barrier
2715	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2716	retl
2717	  mov	%g0, %o0		! return 0
2718	.align	16
	/*
	 * Entry for a word-aligned small copy; %o2 arrives as count - 3.
	 * More than one word left: continue in the two-word loop.
	 * Otherwise store the single word and move up to three trailing
	 * bytes.
	 */
2719.ci_sm_word:
2720	subcc	%o2, 4, %o2		! update count
2721	bgt,pt	%ncc, .ci_sm_wordx
2722	  lduwa	[%o0]ASI_USER, %o3		! read word
2723	addcc	%o2, 3, %o2		! restore count
2724	bz,pt	%ncc, .ci_sm_exit
2725	  stw	%o3, [%o1]		! write word
2726	deccc	%o2			! reduce count for cc test
2727	add	%o0, 4, %o0
2728	lduba	[%o0]ASI_USER, %o3	! load one byte
2729	bz,pt	%ncc, .ci_sm_exit
2730	  stb	%o3, [%o1 + 4]		! store one byte
2731	inc	%o0
2732	lduba	[%o0]ASI_USER, %o3	! load second byte
2733	deccc	%o2
2734	bz,pt	%ncc, .ci_sm_exit
2735	  stb	%o3, [%o1 + 5]		! store second byte
2736	inc	%o0
2737	lduba	[%o0]ASI_USER, %o3	! load third byte
2738	stb	%o3, [%o1 + 6]		! store third byte
	/* Common successful exit: restore t_lofault and return 0. */
2739.ci_sm_exit:
2740	membar	#Sync				! sync error barrier
2741	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
2742	retl
2743	  mov	%g0, %o0		! return 0
2744
2745	.align 16
	/*
	 * Medium copy (count > CHKSIZE).  src ^ dst shows the widest
	 * unit to which both pointers can be aligned together:
	 *   bit 0 differs             -> byte loop (.ci_sm_movebytes)
	 *   bits 1:0 differ           -> halfwords (.ci_med_half)
	 *   bits 2:0 differ           -> words (.ci_med_word)
	 *   otherwise                 -> long words (.ci_med_long)
	 * When the byte loop is entered from here the "sub %o2, 3" at
	 * .ci_sm_movebytes executes as the first instruction of that
	 * path.
	 */
2746.ci_med:
2747	xor	%o0, %o1, %o3		! setup alignment check
2748	btst	1, %o3
2749	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
2750	  nop
2751	btst	3, %o3
2752	bnz,pt	%ncc, .ci_med_half	! halfword aligned
2753	  nop
2754	btst	7, %o3
2755	bnz,pt	%ncc, .ci_med_word	! word aligned
2756	  nop
	/*
	 * Long-word path: copy bytes until src is word aligned, one word
	 * if that is needed to reach 8-byte alignment, then 32 bytes per
	 * pass, then 8 bytes per pass, then the shared tail.
	 */
2757.ci_med_long:
2758	btst	3, %o0			! check for
2759	bz,pt	%ncc, .ci_med_long1	! word alignment
2760	  nop
2761.ci_med_long0:
2762	lduba	[%o0]ASI_USER, %o3		! load one byte
2763	inc	%o0
2764	stb	%o3,[%o1]		! store byte
2765	inc	%o1
2766	btst	3, %o0
2767	bnz,pt	%ncc, .ci_med_long0
2768	  dec	%o2
2769.ci_med_long1:			! word aligned
2770	btst	7, %o0			! check for long word
2771	bz,pt	%ncc, .ci_med_long2
2772	  nop
2773	lduwa	[%o0]ASI_USER, %o3	! load word
2774	add	%o0, 4, %o0		! advance SRC by 4
2775	stw	%o3, [%o1]		! store word
2776	add	%o1, 4, %o1		! advance DST by 4
2777	sub	%o2, 4, %o2		! reduce count by 4
2778!
2779!  Now long word aligned and have at least 32 bytes to move
2780!
2781.ci_med_long2:
2782	sub	%o2, 31, %o2		! adjust count to allow cc zero test
2783.ci_med_lmove:
2784	ldxa	[%o0]ASI_USER, %o3	! read long word
2785	subcc	%o2, 32, %o2		! reduce count by 32
2786	stx	%o3, [%o1]		! write long word
2787	add	%o0, 8, %o0		! advance SRC by 8
2788	ldxa	[%o0]ASI_USER, %o3	! repeat for a total for 4 long words
2789	add	%o0, 8, %o0		! advance SRC by 8
2790	stx	%o3, [%o1 + 8]
2791	add	%o1, 32, %o1		! advance DST by 32
2792	ldxa	[%o0]ASI_USER, %o3
2793	add	%o0, 8, %o0		! advance SRC by 8
2794	stx	%o3, [%o1 - 16]
2795	ldxa	[%o0]ASI_USER, %o3
2796	add	%o0, 8, %o0		! advance SRC by 8
2797	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
2798	  stx	%o3, [%o1 - 8]
2799	addcc	%o2, 24, %o2		! restore count to long word offset
2800	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
2801	  nop
2802.ci_med_lword:
2803	ldxa	[%o0]ASI_USER, %o3	! read long word
2804	subcc	%o2, 8, %o2		! reduce count by 8
2805	stx	%o3, [%o1]		! write long word
2806	add	%o0, 8, %o0		! advance SRC by 8
2807	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
2808	  add	%o1, 8, %o1		! advance DST by 8
	/*
	 * Zero to seven bytes remain.  The deccc is the delay slot of a
	 * non-annulled bz, so it always executes: exactly one byte left
	 * goes to .ci_sm_byte, anything more goes to .ci_sm_half with
	 * (bytes left - 1) in %o2.
	 */
2809.ci_med_lextra:
2810	addcc	%o2, 7, %o2		! restore rest of count
2811	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2812	  deccc	%o2
2813	bz,pt	%ncc, .ci_sm_byte
2814	  nop
2815	ba,pt	%ncc, .ci_sm_half
2816	  nop
2817
2818	.align 16
2819	nop				! instruction alignment
2820					! see discussion at start of file
	/*
	 * Word path: copy bytes until src is word aligned, then 16 bytes
	 * per pass, then 4 bytes per pass, then the shared tail.
	 */
2821.ci_med_word:
2822	btst	3, %o0			! check for
2823	bz,pt	%ncc, .ci_med_word1	! word alignment
2824	  nop
2825.ci_med_word0:
2826	lduba	[%o0]ASI_USER, %o3	! load one byte
2827	inc	%o0
2828	stb	%o3,[%o1]		! store byte
2829	inc	%o1
2830	btst	3, %o0
2831	bnz,pt	%ncc, .ci_med_word0
2832	  dec	%o2
2833!
2834!  Now word aligned and have at least 36 bytes to move
2835!
2836.ci_med_word1:
2837	sub	%o2, 15, %o2		! adjust count to allow cc zero test
2838.ci_med_wmove:
2839	lduwa	[%o0]ASI_USER, %o3	! read word
2840	subcc	%o2, 16, %o2		! reduce count by 16
2841	stw	%o3, [%o1]		! write word
2842	add	%o0, 4, %o0		! advance SRC by 4
2843	lduwa	[%o0]ASI_USER, %o3	! repeat for a total for 4 words
2844	add	%o0, 4, %o0		! advance SRC by 4
2845	stw	%o3, [%o1 + 4]
2846	add	%o1, 16, %o1		! advance DST by 16
2847	lduwa	[%o0]ASI_USER, %o3
2848	add	%o0, 4, %o0		! advance SRC by 4
2849	stw	%o3, [%o1 - 8]
2850	lduwa	[%o0]ASI_USER, %o3
2851	add	%o0, 4, %o0		! advance SRC by 4
2852	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
2853	  stw	%o3, [%o1 - 4]
2854	addcc	%o2, 12, %o2		! restore count to word offset
2855	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
2856	  nop
2857.ci_med_word2:
2858	lduwa	[%o0]ASI_USER, %o3	! read word
2859	subcc	%o2, 4, %o2		! reduce count by 4
2860	stw	%o3, [%o1]		! write word
2861	add	%o0, 4, %o0		! advance SRC by 4
2862	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
2863	  add	%o1, 4, %o1		! advance DST by 4
2864.ci_med_wextra:
2865	addcc	%o2, 3, %o2		! restore rest of count
2866	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
2867	  deccc	%o2
2868	bz,pt	%ncc, .ci_sm_byte
2869	  nop
2870	ba,pt	%ncc, .ci_sm_half
2871	  nop
2872
2873	.align 16
2874	nop				! instruction alignment
2875					! see discussion at start of file
	/*
	 * Halfword path: copy one byte if src is odd, then 8 bytes per
	 * pass as four halfwords, then the shared tail.
	 */
2876.ci_med_half:
2877	btst	1, %o0			! check for
2878	bz,pt	%ncc, .ci_med_half1	! half word alignment
2879	  nop
2880	lduba	[%o0]ASI_USER, %o3	! load one byte
2881	inc	%o0
2882	stb	%o3,[%o1]		! store byte
2883	inc	%o1
2884	dec	%o2
2885!
2886!  Now half word aligned and have at least 38 bytes to move
2887!
2888.ci_med_half1:
2889	sub	%o2, 7, %o2		! adjust count to allow cc zero test
2890.ci_med_hmove:
2891	lduha	[%o0]ASI_USER, %o3	! read half word
2892	subcc	%o2, 8, %o2		! reduce count by 8
2893	sth	%o3, [%o1]		! write half word
2894	add	%o0, 2, %o0		! advance SRC by 2
2895	lduha	[%o0]ASI_USER, %o3	! repeat for a total for 4 halfwords
2896	add	%o0, 2, %o0		! advance SRC by 2
2897	sth	%o3, [%o1 + 2]
2898	add	%o1, 8, %o1		! advance DST by 8
2899	lduha	[%o0]ASI_USER, %o3
2900	add	%o0, 2, %o0		! advance SRC by 2
2901	sth	%o3, [%o1 - 4]
2902	lduha	[%o0]ASI_USER, %o3
2903	add	%o0, 2, %o0		! advance SRC by 2
2904	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
2905	  sth	%o3, [%o1 - 2]
2906	addcc	%o2, 7, %o2		! restore count
2907	bz,pt	%ncc, .ci_sm_exit
2908	  deccc	%o2
2909	bz,pt	%ncc, .ci_sm_byte
2910	  nop
2911	ba,pt	%ncc, .ci_sm_half
2912	  nop
2913
/*
 * Fault during a small copyin (this label is what .copyin_small stores
 * into t_lofault).  Put the caller's t_lofault back from %o4 and
 * reload the original arguments from SM_SAVE_*.  If the thread has a
 * copyops vector, jump to its CP_COPYIN entry.  Otherwise return -1:
 * the instruction in the retl delay slot loads the constant -1, not an
 * errno, whatever its trailing comment says.
 */
2914.sm_copyin_err:
2915	membar	#Sync
2916	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
2917	mov	SM_SAVE_SRC, %o0
2918	mov	SM_SAVE_DST, %o1
2919	mov	SM_SAVE_COUNT, %o2
2920	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
2921	tst	%o3
2922	bz,pt	%ncc, 3f			! if not, return error
2923	  nop
2924	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
2925	jmp	%o5				! original arguments
2926	  nop
29273:
2928	retl
2929	  or	%g0, -1, %o0		! return errno value
2930
2931	SET_SIZE(copyin)
2932
2933
2934/*
2935 * The _more entry points are not intended to be used directly by
2936 * any caller from outside this file.  They are provided to allow
2937 * profiling and dtrace of the portions of the copy code that use
2938 * the floating point registers.
2939 * This entry is particularly important as DTRACE (at least as of
2940 * 4/2004) does not support leaf functions.
2941 */
2942
/*
 * copyin_more: large-copy path of copyin that uses the floating point
 * registers.  It opens a new register window, records .copyin_err in
 * REAL_LOFAULT and falls through into .do_copyin.  .do_copyin is shared:
 * .xcopyin_more and .copyin_noerr_more branch to it after doing their own
 * save and loading their own REAL_LOFAULT value.
 *
 * The success path returns 0 in the caller's %o0.  .copyin_err returns -1
 * unless the thread has a copyops vector, in which case that vector's
 * CP_COPYIN routine is tail-called.
 *
 * Numeric local labels (1:, 2:, 3:, 4:) are defined more than once below;
 * every "Nf"/"Nb" reference binds to the nearest definition in that
 * direction.
 */
2943	ENTRY(copyin_more)
2944.copyin_more:
2945	prefetch [%o0], #n_reads
2946	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
2947	set	.copyin_err, REAL_LOFAULT
2948
2949/*
2950 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
2951 */
2952.do_copyin:
/* Install copyio_fault as t_lofault; the previous handler is kept in %l6. */
2953	set	copyio_fault, %l7		! .copyio_fault is lofault val
2954
2955	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
2956	membar	#Sync				! sync error barrier
2957	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
2958
/*
 * SAVE_SRC/SAVE_DST/SAVE_COUNT are register aliases defined outside this
 * chunk; presumably they preserve the original arguments for the error
 * path -- confirm against their definitions.
 */
2959	mov	%i0, SAVE_SRC
2960	mov	%i1, SAVE_DST
2961	mov	%i2, SAVE_COUNT
2962
2963	FP_NOMIGRATE(6, 7)
2964
/*
 * Record the incoming %fprs on the stack.  If FPRS_FEF is clear the
 * branch is taken and its annulled delay slot (executed only when the
 * branch is taken) sets FPRS_FEF, skipping BST_FPQ2Q4_TOSTACK.  If
 * FPRS_FEF is already set, BST_FPQ2Q4_TOSTACK runs (macro defined
 * outside this chunk).
 */
2965	rd	%fprs, %o2		! check for unused fp
2966	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
2967	btst	FPRS_FEF, %o2
2968	bz,a,pt	%icc, .do_blockcopyin
2969	  wr	%g0, FPRS_FEF, %fprs
2970
2971	BST_FPQ2Q4_TOSTACK(%o2)
2972
2973.do_blockcopyin:
/* Save %gsr (put back in .copyin_exit) and tag %l6 with FPUSED_FLAG. */
2974	rd	%gsr, %o2
2975	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
2976	or	%l6, FPUSED_FLAG, %l6
2977
/*
 * TMP = DST & (VIS_BLOCKSIZE - 1).  Zero means DST is already block
 * aligned and the byte-alignment code is skipped (branch to 2:).
 * %asi is set to ASI_USER for all the "%asi" loads that follow.
 */
2978	andcc	DST, VIS_BLOCKSIZE - 1, TMP
2979	mov	ASI_USER, %asi
2980	bz,pt	%ncc, 2f
2981	  neg	TMP
2982	add	TMP, VIS_BLOCKSIZE, TMP
2983
2984	! TMP = bytes required to align DST on FP_BLOCK boundary
2985	! Using SRC as a tmp here
2986	cmp	TMP, 3
2987	bleu,pt	%ncc, 1f
2988	  sub	CNT,TMP,CNT		! adjust main count
2989	sub	TMP, 3, TMP		! adjust for end of loop test
2990.ci_blkalign:
2991	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
2992	stb	SRC, [DST]
2993	subcc	TMP, 4, TMP
2994	lduba	[REALSRC + 1]%asi, SRC
2995	add	REALSRC, 4, REALSRC
2996	stb	SRC, [DST + 1]
2997	lduba	[REALSRC - 2]%asi, SRC
2998	add	DST, 4, DST
2999	stb	SRC, [DST - 2]
3000	lduba	[REALSRC - 1]%asi, SRC
3001	bgu,pt	%ncc, .ci_blkalign
3002	  stb	SRC, [DST - 1]
3003
3004	addcc	TMP, 3, TMP		! restore count adjustment
3005	bz,pt	%ncc, 2f		! no bytes left?
3006	  nop
/* Copy the remaining (at most 3) alignment bytes one at a time. */
30071:	lduba	[REALSRC]%asi, SRC
3008	inc	REALSRC
3009	inc	DST
3010	deccc	TMP
3011	bgu	%ncc, 1b
3012	  stb	SRC, [DST - 1]
3013
30142:
3015	membar	#StoreLoad
3016	andn	REALSRC, 0x7, SRC
3017
3018	! SRC - 8-byte aligned
3019	! DST - 64-byte aligned
/*
 * Prime the pipeline: load the first 64 source bytes (plus the first
 * doubleword of the next block) and pre-align six of the eight output
 * doublewords (%f48-%f58).  alignaddr sets the shift used by every
 * faligndata from REALSRC's low three bits.
 */
3020	ldda	[SRC]%asi, %f16
3021	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
3022	alignaddr REALSRC, %g0, %g0
3023	ldda	[SRC + 0x08]%asi, %f18
3024	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
3025	faligndata %f16, %f18, %f48
3026	ldda	[SRC + 0x10]%asi, %f20
3027	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3028	faligndata %f18, %f20, %f50
3029	ldda	[SRC + 0x18]%asi, %f22
3030	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
3031	faligndata %f20, %f22, %f52
3032	ldda	[SRC + 0x20]%asi, %f24
3033	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
3034	faligndata %f22, %f24, %f54
3035	ldda	[SRC + 0x28]%asi, %f26
3036	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
3037	faligndata %f24, %f26, %f56
3038	ldda	[SRC + 0x30]%asi, %f28
3039	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
3040	faligndata %f26, %f28, %f58
3041	ldda	[SRC + 0x38]%asi, %f30
3042	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3043	sub	CNT, VIS_BLOCKSIZE, CNT
3044	add	SRC, VIS_BLOCKSIZE, SRC
3045	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
3046	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3047	ba,pt	%ncc, 1f
3048	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
/*
 * Main loop.  Each pass finishes aligning the pending block (%f60, %f62),
 * block-stores %f48-%f62 to DST, and meanwhile loads and aligns the next
 * 64 source bytes.  It repeats while CNT > VIS_BLOCKSIZE + 8.
 */
3049	.align	32
30501:
3051	ldda	[SRC + 0x08]%asi, %f18
3052	faligndata %f28, %f30, %f60
3053	ldda	[SRC + 0x10]%asi, %f20
3054	faligndata %f30, %f16, %f62
3055	stda	%f48, [DST]ASI_BLK_P
3056	ldda	[SRC + 0x18]%asi, %f22
3057	faligndata %f16, %f18, %f48
3058	ldda	[SRC + 0x20]%asi, %f24
3059	faligndata %f18, %f20, %f50
3060	ldda	[SRC + 0x28]%asi, %f26
3061	faligndata %f20, %f22, %f52
3062	ldda	[SRC + 0x30]%asi, %f28
3063	faligndata %f22, %f24, %f54
3064	sub	CNT, VIS_BLOCKSIZE, CNT
3065	ldda	[SRC + 0x38]%asi, %f30
3066	faligndata %f24, %f26, %f56
3067	add	DST, VIS_BLOCKSIZE, DST
3068	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
3069	faligndata %f26, %f28, %f58
3070	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3071	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
3072	add	SRC, VIS_BLOCKSIZE, SRC
3073	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3074	cmp	CNT, VIS_BLOCKSIZE + 8
3075	bgu,pt	%ncc, 1b
3076	  prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
3077
/*
 * Loop exit.  When exactly one more block remains (CNT == VIS_BLOCKSIZE)
 * and REALSRC is 8-byte aligned, the 2: path below is taken: it copies
 * with fsrc1 instead of faligndata and issues two block stores.  In every
 * other case the first 3: stores the pending block and the second 3:
 * copies the remaining CNT bytes one at a time.  The andcc sits in the
 * non-annulled delay slot of the bne, so it executes on both paths.
 */
3078	! only if REALSRC & 0x7 is 0
3079	cmp	CNT, VIS_BLOCKSIZE
3080	bne	%ncc, 3f
3081	  andcc	REALSRC, 0x7, %g0
3082	bz,pt	%ncc, 2f
3083	  nop
30843:
3085	faligndata %f28, %f30, %f60
3086	faligndata %f30, %f16, %f62
3087	stda	%f48, [DST]ASI_BLK_P
3088	add	DST, VIS_BLOCKSIZE, DST
3089	ba,pt	%ncc, 3f
3090	  nop
30912:
3092	ldda	[SRC + 0x08]%asi, %f18
3093	fsrc1	%f28, %f60
3094	ldda	[SRC + 0x10]%asi, %f20
3095	fsrc1	%f30, %f62
3096	stda	%f48, [DST]ASI_BLK_P
3097	ldda	[SRC + 0x18]%asi, %f22
3098	fsrc1	%f16, %f48
3099	ldda	[SRC + 0x20]%asi, %f24
3100	fsrc1	%f18, %f50
3101	ldda	[SRC + 0x28]%asi, %f26
3102	fsrc1	%f20, %f52
3103	ldda	[SRC + 0x30]%asi, %f28
3104	fsrc1	%f22, %f54
3105	ldda	[SRC + 0x38]%asi, %f30
3106	fsrc1	%f24, %f56
3107	sub	CNT, VIS_BLOCKSIZE, CNT
3108	add	DST, VIS_BLOCKSIZE, DST
3109	add	SRC, VIS_BLOCKSIZE, SRC
3110	add	REALSRC, VIS_BLOCKSIZE, REALSRC
3111	fsrc1	%f26, %f58
3112	fsrc1	%f28, %f60
3113	fsrc1	%f30, %f62
3114	stda	%f48, [DST]ASI_BLK_P
3115	add	DST, VIS_BLOCKSIZE, DST
3116	ba,a,pt	%ncc, 4f
3117	  nop
3118
/* Trailing bytes: nothing to do when CNT is zero, else byte loop at 5:. */
31193:	tst	CNT
3120	bz,a	%ncc, 4f
3121	  nop
3122
31235:	lduba	[REALSRC]ASI_USER, TMP
3124	inc	REALSRC
3125	inc	DST
3126	deccc	CNT
3127	bgu	%ncc, 5b
3128	  stb	TMP, [DST - 1]
31294:
3130
/*
 * Common exit: put back %gsr, then either reload the saved FP registers
 * (FPRS_FEF was set on entry) or take the 4: path, and finally restore
 * %fprs and the old t_lofault with FPUSED_FLAG cleared.
 */
3131.copyin_exit:
3132	membar	#Sync
3133
3134	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
3135	wr	%o2, 0, %gsr
3136
3137	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
3138	btst	FPRS_FEF, %o3
3139	bz,pt	%icc, 4f
3140	  nop
3141
3142	BLD_FPQ2Q4_FROMSTACK(%o2)
3143
3144	ba,pt	%ncc, 1f
3145	  wr	%o3, 0, %fprs		! restore fprs
3146
/*
 * FPRS_FEF was clear on entry, so nothing was saved.  FZEROQ2Q4 is a
 * macro defined outside this chunk; by its name it zeroes the FP
 * registers used above -- confirm.
 */
31474:
3148	FZEROQ2Q4
3149	wr	%o3, 0, %fprs		! restore fprs
3150
31511:
3152	membar	#Sync				! sync error barrier
3153	andn	%l6, FPUSED_FLAG, %l6
3154	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3155	FP_ALLOWMIGRATE(5, 6)
3156	ret
3157	  restore	%g0, 0, %o0
3158/*
3159 * We got here because of a fault during copyin
3160 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
3161 */
3162.copyin_err:
/*
 * The restore instructions below occupy the delay slots of jmp and ret
 * (they are just not indented): the copyops handler is entered, or -1 is
 * returned, with this routine's register window already popped.
 */
3163	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3164	tst	%o4
3165	bz,pt	%ncc, 2f			! if not, return error
3166	nop
3167	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
3168	jmp	%g2				! original arguments
3169	restore %g0, 0, %g0			! dispose of copy window
31702:
3171	ret
3172	restore %g0, -1, %o0			! return error value
3173
3174
3175	SET_SIZE(copyin_more)
3176
3177#endif	/* lint */
3178
3179#ifdef	lint
3180
3181/*ARGSUSED*/
3182int
3183xcopyin(const void *uaddr, void *kaddr, size_t count)
3184{ return (0); }
3185
3186#else	/* lint */
3187
/*
 * xcopyin(uaddr = %o0, kaddr = %o1, count = %o2)
 *
 * Dispatch performed at entry:
 *   count <= VIS_COPY_THRESHOLD		-> .xcopyin_small
 *   otherwise the low bits of (src ^ dst) select hw_copy_limit_8, _4,
 *   _2 or _1, i.e. the widest alignment both pointers can share:
 *	limit == 0 (HW copy disabled)		-> .xcopyin_small
 *	count <= limit				-> .xcopyin_small
 *	else					-> .xcopyin_more
 *
 * In each limit check the "cmp %o2, %o3" sits in the non-annulled delay
 * slot of the bz, so it always executes and sets the condition codes
 * tested by the bleu that follows.
 *
 * .xcopyin_small installs .sm_xcopyin_err as t_lofault (old value kept
 * in %o4) and joins .sm_do_copyin.  .xcopyin_more opens a register
 * window, loads REAL_LOFAULT with .xcopyin_err and joins .do_copyin.
 * Both join targets are defined elsewhere in this file.
 *
 * On a fault, a thread with a copyops vector has its CP_XCOPYIN routine
 * invoked; otherwise the errno value is returned (ERRNO on the windowed
 * path, %g1 on the leaf path).
 */
3188	ENTRY(xcopyin)
3189
3190	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3191	bleu,pt	%ncc, .xcopyin_small		! go to larger cases
3192	  xor	%o0, %o1, %o3			! are src, dst alignable?
3193	btst	7, %o3				!
3194	bz,pt	%ncc, .xcopyin_8		! check for longword alignment
3195	  nop
3196	btst	1, %o3				!
3197	bz,pt	%ncc, .xcopyin_2		! check for half-word
3198	  nop
3199	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3200	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3201	tst	%o3
3202	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3203	  cmp	%o2, %o3			! if length <= limit
3204	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3205	  nop
3206	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3207	  nop
3208.xcopyin_2:
3209	btst	3, %o3				!
3210	bz,pt	%ncc, .xcopyin_4		! check for word alignment
3211	  nop
3212	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3213	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3214	tst	%o3
3215	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3216	  cmp	%o2, %o3			! if length <= limit
3217	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3218	  nop
3219	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3220	  nop
3221.xcopyin_4:
3222	! already checked longword, must be word aligned
3223	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3224	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3225	tst	%o3
3226	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3227	  cmp	%o2, %o3			! if length <= limit
3228	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3229	  nop
3230	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3231	  nop
3232.xcopyin_8:
3233	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3234	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3235	tst	%o3
3236	bz,pn	%icc, .xcopyin_small		! if zero, disable HW copy
3237	  cmp	%o2, %o3			! if length <= limit
3238	bleu,pt	%ncc, .xcopyin_small		! go to small copy
3239	  nop
3240	ba,pt	%ncc, .xcopyin_more		! otherwise go to large copy
3241	  nop
3242
/* Leaf path: %o4 = previous t_lofault, %o5 = .sm_xcopyin_err. */
3243.xcopyin_small:
3244	sethi	%hi(.sm_xcopyin_err), %o5  ! .sm_xcopyin_err is lofault value
3245	or	%o5, %lo(.sm_xcopyin_err), %o5
3246	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofaul
3247	membar	#Sync				! sync error barrier
3248	ba,pt	%ncc, .sm_do_copyin		! common code
3249	  stn	%o5, [THREAD_REG + T_LOFAULT]
3250
/* Windowed path: the or in the delay slot completes REAL_LOFAULT. */
3251.xcopyin_more:
3252	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3253	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
3254	ba,pt	%ncc, .do_copyin
3255	  or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
3256
3257/*
3258 * We got here because of fault during xcopyin
3259 * Errno value is in ERRNO
3260 */
3261.xcopyin_err:
3262	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
3263	tst	%o4
3264	bz,pt	%ncc, 2f			! if not, return error
3265	  nop
3266	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
3267	jmp	%g2				! original arguments
3268	  restore %g0, 0, %g0			! dispose of copy window
32692:
3270        ret
3271	  restore ERRNO, 0, %o0			! return errno value
3272
/*
 * Fault handler for the leaf path.  Puts back the old t_lofault from %o4
 * and reloads %o0-%o2 from the SM_SAVE_* registers (aliases defined
 * outside this chunk) before either jumping to the copyops CP_XCOPYIN
 * routine or returning the value in %g1.
 */
3273.sm_xcopyin_err:
3274
3275	membar	#Sync
3276	stn	%o4, [THREAD_REG + T_LOFAULT]   ! restore old t_lofault
3277	mov	SM_SAVE_SRC, %o0
3278	mov	SM_SAVE_DST, %o1
3279	mov	SM_SAVE_COUNT, %o2
3280	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
3281	tst	%o3
3282	bz,pt	%ncc, 3f			! if not, return error
3283	  nop
3284	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
3285	jmp	%o5				! original arguments
3286	  nop
32873:
3288	retl
3289	  or	%g1, 0, %o0		! return errno value
3290
3291	SET_SIZE(xcopyin)
3292
3293#endif	/* lint */
3294
3295#ifdef	lint
3296
3297/*ARGSUSED*/
3298int
3299xcopyin_little(const void *uaddr, void *kaddr, size_t count)
3300{ return (0); }
3301
3302#else	/* lint */
3303
/*
 * xcopyin_little(uaddr = %o0, kaddr = %o1, count = %o2)
 *
 * Byte-at-a-time copy whose loads go through ASI_AIUSL.  The source is
 * read from its last byte back to its first while the destination is
 * written from its first byte forward, so the bytes arrive in reverse
 * order.
 *
 * Register use inside the loop:
 *   %o3  negative index, counts from -count up to 0
 *   %o0  biased source pointer, stepped by -2 per byte so that
 *        %o0 + %o3 moves back one byte while %o3 moves forward one
 *   %o1  kaddr + count, so %o1 + %o3 walks the destination forward
 *   %o4  data byte (the old t_lofault is first moved from %o4 to %o5)
 *   %o5  previous t_lofault, restored on both exit paths
 *
 * Returns 0 on success.  .xcopyio_err restores t_lofault and returns the
 * value found in %g1.
 */
3304	ENTRY(xcopyin_little)
3305	sethi	%hi(.xcopyio_err), %o5
3306	or	%o5, %lo(.xcopyio_err), %o5
3307	ldn	[THREAD_REG + T_LOFAULT], %o4
3308	membar	#Sync				! sync error barrier
3309	stn	%o5, [THREAD_REG + T_LOFAULT]
3310	mov	%o4, %o5
3311
3312	subcc	%g0, %o2, %o3
3313	add	%o0, %o2, %o0
3314	bz,pn	%ncc, 2f		! check for zero bytes
3315	  sub	%o2, 1, %o4
3316	add	%o0, %o4, %o0		! start w/last byte
3317	add	%o1, %o2, %o1
3318	lduba	[%o0 + %o3]ASI_AIUSL, %o4
3319
/*
 * inccc produces a carry only when %o3 wraps to zero.  bcc,a loops while
 * the carry is clear; its annulled delay-slot load runs only when the
 * branch is taken, so no load is issued after the final byte.
 */
33201:	stb	%o4, [%o1 + %o3]
3321	inccc	%o3
3322	sub	%o0, 2, %o0		! get next byte
3323	bcc,a,pt %ncc, 1b
3324	  lduba	[%o0 + %o3]ASI_AIUSL, %o4
3325
33262:
3327	membar	#Sync				! sync error barrier
3328	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3329	retl
3330	  mov	%g0, %o0		! return (0)
3331
3332.xcopyio_err:
3333	membar	#Sync				! sync error barrier
3334	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
3335	retl
3336	  mov	%g1, %o0
3337
3338	SET_SIZE(xcopyin_little)
3339
3340#endif	/* lint */
3341
3342
3343/*
3344 * Copy a block of storage - must not overlap (from + len <= to).
3345 * No fault handler installed (to be called under on_fault())
3346 */
3347#if defined(lint)
3348
3349/* ARGSUSED */
3350void
3351copyin_noerr(const void *ufrom, void *kto, size_t count)
3352{}
3353
3354#else	/* lint */
/*
 * copyin_noerr(ufrom = %o0, kto = %o1, count = %o2)
 *
 * Entry dispatch: counts of VIS_COPY_THRESHOLD or less go to
 * .copyin_ne_small.  Larger counts pick hw_copy_limit_8/_4/_2/_1 from the
 * low bits of (src ^ dst); a zero limit or count <= limit also goes to
 * .copyin_ne_small, anything else to .copyin_noerr_more.  The
 * "cmp %o2, %o3" in each bz delay slot is not annulled and supplies the
 * condition codes for the following bleu.
 *
 * Fault handling differs from copyin: no error value is produced here.
 *   .copyin_ne_small   leaves t_lofault alone when it is zero; otherwise
 *                      it installs .sm_copyio_noerr, which puts the old
 *                      handler (%o4) back and jumps to it.
 *   .copyin_noerr_more loads REAL_LOFAULT with .copyio_noerr, which pops
 *                      the window and jumps to %l6.
 * .sm_do_copyin and .do_copyin are defined elsewhere in this file.
 */
3355	ENTRY(copyin_noerr)
3356
3357	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3358	bleu,pt	%ncc, .copyin_ne_small		! go to larger cases
3359	  xor	%o0, %o1, %o3			! are src, dst alignable?
3360	btst	7, %o3				!
3361	bz,pt	%ncc, .copyin_ne_8		! check for longword alignment
3362	  nop
3363	btst	1, %o3				!
3364	bz,pt	%ncc, .copyin_ne_2		! check for half-word
3365	  nop
3366	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3367	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3368	tst	%o3
3369	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3370	  cmp	%o2, %o3			! if length <= limit
3371	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3372	  nop
3373	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3374	  nop
3375.copyin_ne_2:
3376	btst	3, %o3				!
3377	bz,pt	%ncc, .copyin_ne_4		! check for word alignment
3378	  nop
3379	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3380	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3381	tst	%o3
3382	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3383	  cmp	%o2, %o3			! if length <= limit
3384	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3385	  nop
3386	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3387	  nop
3388.copyin_ne_4:
3389	! already checked longword, must be word aligned
3390	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3391	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3392	tst	%o3
3393	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3394	  cmp	%o2, %o3			! if length <= limit
3395	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3396	  nop
3397	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3398	  nop
3399.copyin_ne_8:
3400	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3401	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3402	tst	%o3
3403	bz,pn	%icc, .copyin_ne_small		! if zero, disable HW copy
3404	  cmp	%o2, %o3			! if length <= limit
3405	bleu,pt	%ncc, .copyin_ne_small		! go to small copy
3406	  nop
3407	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
3408	  nop
3409
/*
 * Leaf path.  %o4 = current t_lofault.  If it is zero, the copy runs with
 * t_lofault untouched; otherwise .sm_copyio_noerr is interposed.
 */
3410.copyin_ne_small:
3411	ldn	[THREAD_REG + T_LOFAULT], %o4
3412	tst	%o4
3413	bz,pn	%ncc, .sm_do_copyin
3414	  nop
3415	sethi	%hi(.sm_copyio_noerr), %o5
3416	or	%o5, %lo(.sm_copyio_noerr), %o5
3417	membar	#Sync				! sync error barrier
3418	ba,pt	%ncc, .sm_do_copyin
3419	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3420
3421.copyin_noerr_more:
3422	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3423	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3424	ba,pt	%ncc, .do_copyin
3425	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3426
/*
 * Shared with copyout_noerr.  %l6 is the register .do_copyin loads the
 * previous t_lofault into; it also ORs FPUSED_FLAG into it.
 * NOTE(review): presumably copyio_fault (outside this chunk) strips the
 * flag bits and restores t_lofault before control reaches here -- confirm.
 */
3427.copyio_noerr:
3428	jmp	%l6
3429	  restore %g0,0,%g0
3430
/* Shared with copyout_noerr: put back the caller's handler and enter it. */
3431.sm_copyio_noerr:
3432	membar	#Sync
3433	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
3434	jmp	%o4
3435	  nop
3436
3437	SET_SIZE(copyin_noerr)
3438#endif /* lint */
3439
3440/*
3441 * Copy a block of storage - must not overlap (from + len <= to).
3442 * No fault handler installed (to be called under on_fault())
3443 */
3444
3445#if defined(lint)
3446
3447/* ARGSUSED */
3448void
3449copyout_noerr(const void *kfrom, void *uto, size_t count)
3450{}
3451
3452#else	/* lint */
/*
 * copyout_noerr(kfrom = %o0, uto = %o1, count = %o2)
 *
 * Entry dispatch: counts of VIS_COPY_THRESHOLD or less go to
 * .copyout_ne_small.  Larger counts pick hw_copy_limit_8/_4/_2/_1 from
 * the low bits of (src ^ dst); a zero limit or count <= limit also goes
 * to .copyout_ne_small, anything else to .copyout_noerr_more.  The
 * "cmp %o2, %o3" in each bz delay slot is not annulled and supplies the
 * condition codes for the following bleu.
 *
 * .copyout_ne_small leaves t_lofault alone when it is zero; otherwise it
 * installs .sm_copyio_noerr (old handler kept in %o4).
 * .copyout_noerr_more opens a window and loads REAL_LOFAULT with
 * .copyio_noerr.  Both handlers live in the copyin_noerr section of this
 * file; .sm_do_copyout and .do_copyout are defined elsewhere in the file.
 */
3453	ENTRY(copyout_noerr)
3454
3455	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
3456	bleu,pt	%ncc, .copyout_ne_small		! go to larger cases
3457	  xor	%o0, %o1, %o3			! are src, dst alignable?
3458	btst	7, %o3				!
3459	bz,pt	%ncc, .copyout_ne_8		! check for longword alignment
3460	  nop
3461	btst	1, %o3				!
3462	bz,pt	%ncc, .copyout_ne_2		! check for half-word
3463	  nop
3464	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
3465	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
3466	tst	%o3
3467	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3468	  cmp	%o2, %o3			! if length <= limit
3469	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3470	  nop
3471	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3472	  nop
3473.copyout_ne_2:
3474	btst	3, %o3				!
3475	bz,pt	%ncc, .copyout_ne_4		! check for word alignment
3476	  nop
3477	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
3478	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
3479	tst	%o3
3480	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3481	  cmp	%o2, %o3			! if length <= limit
3482	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3483	  nop
3484	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3485	  nop
3486.copyout_ne_4:
3487	! already checked longword, must be word aligned
3488	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
3489	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
3490	tst	%o3
3491	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3492	  cmp	%o2, %o3			! if length <= limit
3493	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3494	  nop
3495	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3496	  nop
3497.copyout_ne_8:
3498	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
3499	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
3500	tst	%o3
3501	bz,pn	%icc, .copyout_ne_small		! if zero, disable HW copy
3502	  cmp	%o2, %o3			! if length <= limit
3503	bleu,pt	%ncc, .copyout_ne_small		! go to small copy
3504	  nop
3505	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
3506	  nop
3507
/*
 * Leaf path.  The stn after the final ba is that branch's delay slot
 * (it is just not indented), so t_lofault is set on the way into
 * .sm_do_copyout.
 */
3508.copyout_ne_small:
3509	ldn	[THREAD_REG + T_LOFAULT], %o4
3510	tst	%o4
3511	bz,pn	%ncc, .sm_do_copyout
3512	  nop
3513	sethi	%hi(.sm_copyio_noerr), %o5
3514	or	%o5, %lo(.sm_copyio_noerr), %o5
3515	membar	#Sync				! sync error barrier
3516	ba,pt	%ncc, .sm_do_copyout
3517	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault
3518
3519.copyout_noerr_more:
3520	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
3521	sethi	%hi(.copyio_noerr), REAL_LOFAULT
3522	ba,pt	%ncc, .do_copyout
3523	  or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
3524
3525	SET_SIZE(copyout_noerr)
3526#endif /* lint */
3527
3528
3529/*
3530 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
3531 * at least 256 bytes in length using block stores (ASI_BLK_P).  If
3532 * the criteria for using this routine are not met then it calls bzero
3533 * and returns 1.  Otherwise 0 is returned indicating success.
3534 * Caller is responsible for ensuring use_hw_bzero is true and that
3535 * kpreempt_disable() has been called.
3536 */
3537#ifdef lint
3538/*ARGSUSED*/
3539int
3540hwblkclr(void *addr, size_t len)
3541{
3542	return(0);
3543}
3544#else /* lint */
3545	! %i0 - start address
3546	! %i1 - length of region (multiple of 64)
3547	! %l0 - saved fprs
3548	! %l1 - pointer to saved %d0 block
3549	! %l2 - saved curthread->t_lwp
/*
 * NOTE(review): %l2 is not referenced anywhere in the routine below, so
 * the "%l2" entry above looks stale.
 *
 * Other registers used: %i3 = byte count handled per pass, %i2/%i4 =
 * scratch for the computed jump.
 *
 * The numeric label 1: is defined twice.  The alignment and length
 * checks branch to the first one (punt to bzero, return 1); the FPRS_FEF
 * test branches to the second one (start of the block-clear setup).
 */
3550
3551	ENTRY(hwblkclr)
3552	! get another window w/space for one aligned block of saved fpregs
3553	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp
3554
3555	! Must be block-aligned
3556	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
3557	bnz,pn	%ncc, 1f
3558	  nop
3559
3560	! ... and must be 256 bytes or more
3561	cmp	%i1, 256
3562	blu,pn	%ncc, 1f
3563	  nop
3564
3565	! ... and length must be a multiple of VIS_BLOCKSIZE
3566	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
3567	bz,pn	%ncc, 2f
3568	  nop
3569
/* The "mov %i1, %o1" below is the delay slot of the call to bzero. */
35701:	! punt, call bzero but notify the caller that bzero was used
3571	mov	%i0, %o0
3572	call	bzero
3573	mov	%i1, %o1
3574	ret
3575	  restore	%g0, 1, %o0 ! return (1) - did not use block operations
3576
35772:	rd	%fprs, %l0		! check for unused fp
3578	btst	FPRS_FEF, %l0
3579	bz,pt	%icc, 1f
3580	  nop
3581
/*
 * %l1 = (%fp + STACK_BIAS - 65) rounded down to a VIS_BLOCKSIZE
 * boundary: a block-aligned slot that lies wholly below %fp + STACK_BIAS
 * and within the 2*VIS_BLOCKSIZE reserved by the save above.
 */
3582	! save in-use fpregs on stack
3583	membar	#Sync
3584	add	%fp, STACK_BIAS - 65, %l1
3585	and	%l1, -VIS_BLOCKSIZE, %l1
3586	stda	%d0, [%l1]ASI_BLK_P
3587
35881:	membar	#StoreStore|#StoreLoad|#LoadStore
3589	wr	%g0, FPRS_FEF, %fprs
3590	wr	%g0, ASI_BLK_P, %asi
3591
3592	! Clear block
3593	fzero	%d0
3594	fzero	%d2
3595	fzero	%d4
3596	fzero	%d6
3597	fzero	%d8
3598	fzero	%d10
3599	fzero	%d12
3600	fzero	%d14
3601
/* %i3 = bytes cleared per full pass of the loop (four 64-byte blocks). */
3602	mov	256, %i3
3603	ba,pt	%ncc, .pz_doblock
3604	  nop
3605
/*
 * Store sequence.  The three stda instructions must stay immediately
 * before .pz_zinst, in this order: the computed jump further down enters
 * this sequence by counting instructions back from .pz_zinst.
 */
3606.pz_blkstart:
3607      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
3608	stda	%d0, [%i0 + 128]%asi
3609	stda	%d0, [%i0 + 64]%asi
3610	stda	%d0, [%i0]%asi
3611.pz_zinst:
3612	add	%i0, %i3, %i0
3613	sub	%i1, %i3, %i1
3614.pz_doblock:
3615	cmp	%i1, 256
3616	bgeu,a	%ncc, .pz_blkstart
3617	  stda	%d0, [%i0 + 192]%asi
3618
/*
 * Fewer than 256 bytes remain.  Below 64 we are done.  Otherwise
 * %i3 = remaining length rounded down to a multiple of 64 (the andn is
 * the non-annulled delay slot of the blu, so it runs on both paths) and
 * %i3 >> 4 is 4 bytes, i.e. one instruction, per 64-byte block.  The jmp
 * therefore lands that many stda instructions before .pz_zinst; falling
 * through .pz_zinst then advances %i0 and reduces %i1 by %i3.
 */
3619	cmp	%i1, 64
3620	blu	%ncc, .pz_finish
3621
3622	  andn	%i1, (64-1), %i3
3623	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
3624	set	.pz_zinst, %i4
3625	sub	%i4, %i2, %i4
3626	jmp	%i4
3627	  nop
3628
/*
 * If FPRS_FEF was clear on entry nothing was saved: the annulled delay
 * slot restores %fprs and the reload of %d0 is skipped.
 */
3629.pz_finish:
3630	membar	#Sync
3631	btst	FPRS_FEF, %l0
3632	bz,a	.pz_finished
3633	  wr	%l0, 0, %fprs		! restore fprs
3634
3635	! restore fpregs from stack
3636	ldda	[%l1]ASI_BLK_P, %d0
3637	membar	#Sync
3638	wr	%l0, 0, %fprs		! restore fprs
3639
3640.pz_finished:
3641	ret
3642	  restore	%g0, 0, %o0		! return (bzero or not)
3643
3644	SET_SIZE(hwblkclr)
3645#endif	/* lint */
3646
3647#ifdef lint
3648/*ARGSUSED*/
3649void
3650hw_pa_bcopy32(uint64_t src, uint64_t dst)
3651{}
3652#else /*!lint */
3653	/*
3654	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
3655	 * using physical addresses.
3656	 *
3657	 * Runs with PSTATE_IE cleared: %g1 keeps the original %pstate and
3658	 * is written back in the retl delay slot.  All four doublewords
3659	 * are loaded into %o2-%o5 through ASI_MEM before the first store;
3660	 * a membar #Sync separates the loads from the stores.
3661	 * Clobbers %g1, %g2, %o0-%o5.
3662	 */
3657	ENTRY_NP(hw_pa_bcopy32)
3658	rdpr	%pstate, %g1
3659	andn	%g1, PSTATE_IE, %g2
3660	wrpr	%g0, %g2, %pstate
3661
/*
 * Read of %pstate with the result discarded (%g0).
 * NOTE(review): presumably this just makes sure the wrpr above has taken
 * effect before the physical accesses start -- confirm.
 */
3662	rdpr	%pstate, %g0
3663	ldxa	[%o0]ASI_MEM, %o2
3664	add	%o0, 8, %o0
3665	ldxa	[%o0]ASI_MEM, %o3
3666	add	%o0, 8, %o0
3667	ldxa	[%o0]ASI_MEM, %o4
3668	add	%o0, 8, %o0
3669	ldxa	[%o0]ASI_MEM, %o5
3670	membar	#Sync
3671
3672	stxa	%o2, [%o1]ASI_MEM
3673	add	%o1, 8, %o1
3674	stxa	%o3, [%o1]ASI_MEM
3675	add	%o1, 8, %o1
3676	stxa	%o4, [%o1]ASI_MEM
3677	add	%o1, 8, %o1
3678	stxa	%o5, [%o1]ASI_MEM
3679
3680	retl
3681	  wrpr	  %g0, %g1, %pstate
3682
3683	SET_SIZE(hw_pa_bcopy32)
3684
3685#endif /* lint */
3686
3687#if defined(lint)
3688
3689int use_hw_bcopy = 1;
3690int use_hw_bzero = 1;
3691uint_t hw_copy_limit_1 = 0;
3692uint_t hw_copy_limit_2 = 0;
3693uint_t hw_copy_limit_4 = 0;
3694uint_t hw_copy_limit_8 = 0;
3695
3696#else /* !lint */
3697
/*
 * Global tunables, one 32-bit word each.
 *
 * use_hw_bcopy and use_hw_bzero default to 1.
 *
 * hw_copy_limit_{1,2,4,8} default to 0.  The entry points in this file
 * load the limit matching the common alignment of source and
 * destination; a value of zero sends the copy to the small (leaf) path,
 * and otherwise only counts greater than the limit take the FP-register
 * path.
 * NOTE(review): with these static defaults the FP path is never chosen;
 * presumably the limits are set at boot by code outside this chunk --
 * confirm.
 */
3698	DGDEF(use_hw_bcopy)
3699	.word	1
3700	DGDEF(use_hw_bzero)
3701	.word	1
3702	DGDEF(hw_copy_limit_1)
3703	.word	0
3704	DGDEF(hw_copy_limit_2)
3705	.word	0
3706	DGDEF(hw_copy_limit_4)
3707	.word	0
3708	DGDEF(hw_copy_limit_8)
3709	.word	0
3710
/* Switch back to the text section, 64-byte aligned, for whatever follows. */
3711	.align	64
3712	.section ".text"
3713#endif /* !lint */
3714