/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License, Version 1.0 only * (the "License"). You may not use this file except in compliance * with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include #include #include #include #include #include #include #include "assym.h" /* * Pseudo-code to aid in understanding the control flow of the * bcopy routine. * * On entry to bcopy: * * %l6 = curthread->t_lofault; * used_block_copy = FALSE; ! %l6 |= 1 * if (%l6 != NULL) { * curthread->t_lofault = .copyerr; * caller_error_handler = TRUE ! %l6 |= 2 * } * * if (length < VIS_COPY) * goto regular_copy; * * if (!use_vis) * goto regular_copy; * * if (curthread->t_lwp == NULL) { * ! Kernel threads do not have pcb's in which to store * ! the floating point state, disallow preemption during * ! the copy. * kpreempt_disable(curthread); * } * * old_fprs = %fprs; * old_gsr = %gsr; * if (%fprs.fef) { * ! If we need to save 4 blocks of fpregs then make sure * ! the length is still appropriate for that extra overhead. 
* if (length < (large_length + (64 * 4))) { * if (curthread->t_lwp == NULL) * kpreempt_enable(curthread); * goto regular_copy; * } * %fprs.fef = 1; * save current fpregs on stack using blockstore * } else { * %fprs.fef = 1; * } * * used_block_copy = 1; ! %l6 |= 1 * do_blockcopy_here; * * In lofault handler: * curthread->t_lofault = .copyerr2; * Continue on with the normal exit handler * * On exit: * call_kpreempt = 0; * if (used_block_copy) { ! %l6 & 1 * %gsr = old_gsr; * if (old_fprs & FPRS_FEF) * restore fpregs from stack using blockload * else * zero fpregs * %fprs = old_fprs; * if (curthread->t_lwp == NULL) { * kpreempt_enable(curthread); * call_kpreempt = 1; * } * } * curthread->t_lofault = (%l6 & ~3); * if (call_kpreempt) * kpreempt(%pil); * return (0) * * In second lofault handler (.copyerr2): * We've tried to restore fp state from the stack and failed. To * prevent from returning with a corrupted fp state, we will panic. */ /* * Notes on preserving existing fp state: * * When a copyOP decides to use fp we may have to preserve existing * floating point state. It is not the caller's state that we need to * preserve - the rest of the kernel does not use fp and, anyway, fp * registers are volatile across a call. Some examples: * * - userland has fp state and is interrupted (device interrupt * or trap) and within the interrupt/trap handling we use * bcopy() * - another (higher level) interrupt or trap handler uses bcopy * while a bcopy from an earlier interrupt is still active * - an asynchronous error trap occurs while fp state exists (in * userland or in kernel copy) and the tl0 component of the handling * uses bcopy * - a user process with fp state incurs a copy-on-write fault and * hwblkpagecopy always uses fp * * We therefore need a per-call place in which to preserve fp state - * using our stack is ideal (and since fp copy cannot be leaf optimized * because of calls it makes, this is no hardship). 
* * To make sure that floating point state is always saved and restored * correctly, the following "big rules" must be followed when the floating * point registers will be used: * * 1. %l6 always holds the caller's lofault handler. Also in this register, * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in * use. Bit 2 (BCOPY_FLAG) indicates that the call was to bcopy. * * 2. The FPUSED flag indicates that all FP state has been successfully stored * on the stack. It should not be set until this save has been completed. * * 3. The FPUSED flag should not be cleared on exit until all FP state has * been restored from the stack. If an error occurs while restoring * data from the stack, the error handler can check this flag to see if * a restore is necessary. * * 4. Code run under the new lofault handler must be kept to a minimum. In * particular, any calls to kpreempt() should not be made until after the * lofault handler has been restored. */ /* * This shadows sys/machsystm.h which can't be included due to the lack of * _ASM guards in include files it references. Change it here, change it there. */ #define VIS_COPY_THRESHOLD 900 /* * Less than or equal to this number of bytes we will always copy byte-for-byte */ #define SMALL_LIMIT 7 /* * Flags set in the lower bits of the t_lofault address: * FPUSED_FLAG: The FP registers were in use and must be restored * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls * COPY_FLAGS: Both of the above * * Other flags: * KPREEMPT_FLAG: kpreempt needs to be called */ #define FPUSED_FLAG 1 #define BCOPY_FLAG 2 #define COPY_FLAGS (FPUSED_FLAG | BCOPY_FLAG) #define KPREEMPT_FLAG 4 /* * Size of stack frame in order to accommodate a 64-byte aligned * floating-point register save area and 2 32-bit temp locations. 
*/ #define HWCOPYFRAMESIZE ((64 * 5) + (2 * 4)) #define SAVED_FPREGS_OFFSET (64 * 5) #define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 4) #define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 4) /* * Common macros used by the various versions of the block copy * routines in this file. */ #define FZERO \ fzero %f0 ;\ fzero %f2 ;\ faddd %f0, %f2, %f4 ;\ fmuld %f0, %f2, %f6 ;\ faddd %f0, %f2, %f8 ;\ fmuld %f0, %f2, %f10 ;\ faddd %f0, %f2, %f12 ;\ fmuld %f0, %f2, %f14 ;\ faddd %f0, %f2, %f16 ;\ fmuld %f0, %f2, %f18 ;\ faddd %f0, %f2, %f20 ;\ fmuld %f0, %f2, %f22 ;\ faddd %f0, %f2, %f24 ;\ fmuld %f0, %f2, %f26 ;\ faddd %f0, %f2, %f28 ;\ fmuld %f0, %f2, %f30 ;\ faddd %f0, %f2, %f32 ;\ fmuld %f0, %f2, %f34 ;\ faddd %f0, %f2, %f36 ;\ fmuld %f0, %f2, %f38 ;\ faddd %f0, %f2, %f40 ;\ fmuld %f0, %f2, %f42 ;\ faddd %f0, %f2, %f44 ;\ fmuld %f0, %f2, %f46 ;\ faddd %f0, %f2, %f48 ;\ fmuld %f0, %f2, %f50 ;\ faddd %f0, %f2, %f52 ;\ fmuld %f0, %f2, %f54 ;\ faddd %f0, %f2, %f56 ;\ fmuld %f0, %f2, %f58 ;\ faddd %f0, %f2, %f60 ;\ fmuld %f0, %f2, %f62 #define FALIGN_D0 \ faligndata %d0, %d2, %d48 ;\ faligndata %d2, %d4, %d50 ;\ faligndata %d4, %d6, %d52 ;\ faligndata %d6, %d8, %d54 ;\ faligndata %d8, %d10, %d56 ;\ faligndata %d10, %d12, %d58 ;\ faligndata %d12, %d14, %d60 ;\ faligndata %d14, %d16, %d62 #define FALIGN_D16 \ faligndata %d16, %d18, %d48 ;\ faligndata %d18, %d20, %d50 ;\ faligndata %d20, %d22, %d52 ;\ faligndata %d22, %d24, %d54 ;\ faligndata %d24, %d26, %d56 ;\ faligndata %d26, %d28, %d58 ;\ faligndata %d28, %d30, %d60 ;\ faligndata %d30, %d32, %d62 #define FALIGN_D32 \ faligndata %d32, %d34, %d48 ;\ faligndata %d34, %d36, %d50 ;\ faligndata %d36, %d38, %d52 ;\ faligndata %d38, %d40, %d54 ;\ faligndata %d40, %d42, %d56 ;\ faligndata %d42, %d44, %d58 ;\ faligndata %d44, %d46, %d60 ;\ faligndata %d46, %d0, %d62 #define FALIGN_D2 \ faligndata %d2, %d4, %d48 ;\ faligndata %d4, %d6, %d50 ;\ faligndata %d6, %d8, %d52 ;\ faligndata %d8, %d10, %d54 ;\ faligndata %d10, %d12, %d56 ;\ 
faligndata %d12, %d14, %d58 ;\ faligndata %d14, %d16, %d60 ;\ faligndata %d16, %d18, %d62 #define FALIGN_D18 \ faligndata %d18, %d20, %d48 ;\ faligndata %d20, %d22, %d50 ;\ faligndata %d22, %d24, %d52 ;\ faligndata %d24, %d26, %d54 ;\ faligndata %d26, %d28, %d56 ;\ faligndata %d28, %d30, %d58 ;\ faligndata %d30, %d32, %d60 ;\ faligndata %d32, %d34, %d62 #define FALIGN_D34 \ faligndata %d34, %d36, %d48 ;\ faligndata %d36, %d38, %d50 ;\ faligndata %d38, %d40, %d52 ;\ faligndata %d40, %d42, %d54 ;\ faligndata %d42, %d44, %d56 ;\ faligndata %d44, %d46, %d58 ;\ faligndata %d46, %d0, %d60 ;\ faligndata %d0, %d2, %d62 #define FALIGN_D4 \ faligndata %d4, %d6, %d48 ;\ faligndata %d6, %d8, %d50 ;\ faligndata %d8, %d10, %d52 ;\ faligndata %d10, %d12, %d54 ;\ faligndata %d12, %d14, %d56 ;\ faligndata %d14, %d16, %d58 ;\ faligndata %d16, %d18, %d60 ;\ faligndata %d18, %d20, %d62 #define FALIGN_D20 \ faligndata %d20, %d22, %d48 ;\ faligndata %d22, %d24, %d50 ;\ faligndata %d24, %d26, %d52 ;\ faligndata %d26, %d28, %d54 ;\ faligndata %d28, %d30, %d56 ;\ faligndata %d30, %d32, %d58 ;\ faligndata %d32, %d34, %d60 ;\ faligndata %d34, %d36, %d62 #define FALIGN_D36 \ faligndata %d36, %d38, %d48 ;\ faligndata %d38, %d40, %d50 ;\ faligndata %d40, %d42, %d52 ;\ faligndata %d42, %d44, %d54 ;\ faligndata %d44, %d46, %d56 ;\ faligndata %d46, %d0, %d58 ;\ faligndata %d0, %d2, %d60 ;\ faligndata %d2, %d4, %d62 #define FALIGN_D6 \ faligndata %d6, %d8, %d48 ;\ faligndata %d8, %d10, %d50 ;\ faligndata %d10, %d12, %d52 ;\ faligndata %d12, %d14, %d54 ;\ faligndata %d14, %d16, %d56 ;\ faligndata %d16, %d18, %d58 ;\ faligndata %d18, %d20, %d60 ;\ faligndata %d20, %d22, %d62 #define FALIGN_D22 \ faligndata %d22, %d24, %d48 ;\ faligndata %d24, %d26, %d50 ;\ faligndata %d26, %d28, %d52 ;\ faligndata %d28, %d30, %d54 ;\ faligndata %d30, %d32, %d56 ;\ faligndata %d32, %d34, %d58 ;\ faligndata %d34, %d36, %d60 ;\ faligndata %d36, %d38, %d62 #define FALIGN_D38 \ faligndata %d38, %d40, %d48 ;\ faligndata 
%d40, %d42, %d50 ;\ faligndata %d42, %d44, %d52 ;\ faligndata %d44, %d46, %d54 ;\ faligndata %d46, %d0, %d56 ;\ faligndata %d0, %d2, %d58 ;\ faligndata %d2, %d4, %d60 ;\ faligndata %d4, %d6, %d62 #define FALIGN_D8 \ faligndata %d8, %d10, %d48 ;\ faligndata %d10, %d12, %d50 ;\ faligndata %d12, %d14, %d52 ;\ faligndata %d14, %d16, %d54 ;\ faligndata %d16, %d18, %d56 ;\ faligndata %d18, %d20, %d58 ;\ faligndata %d20, %d22, %d60 ;\ faligndata %d22, %d24, %d62 #define FALIGN_D24 \ faligndata %d24, %d26, %d48 ;\ faligndata %d26, %d28, %d50 ;\ faligndata %d28, %d30, %d52 ;\ faligndata %d30, %d32, %d54 ;\ faligndata %d32, %d34, %d56 ;\ faligndata %d34, %d36, %d58 ;\ faligndata %d36, %d38, %d60 ;\ faligndata %d38, %d40, %d62 #define FALIGN_D40 \ faligndata %d40, %d42, %d48 ;\ faligndata %d42, %d44, %d50 ;\ faligndata %d44, %d46, %d52 ;\ faligndata %d46, %d0, %d54 ;\ faligndata %d0, %d2, %d56 ;\ faligndata %d2, %d4, %d58 ;\ faligndata %d4, %d6, %d60 ;\ faligndata %d6, %d8, %d62 #define FALIGN_D10 \ faligndata %d10, %d12, %d48 ;\ faligndata %d12, %d14, %d50 ;\ faligndata %d14, %d16, %d52 ;\ faligndata %d16, %d18, %d54 ;\ faligndata %d18, %d20, %d56 ;\ faligndata %d20, %d22, %d58 ;\ faligndata %d22, %d24, %d60 ;\ faligndata %d24, %d26, %d62 #define FALIGN_D26 \ faligndata %d26, %d28, %d48 ;\ faligndata %d28, %d30, %d50 ;\ faligndata %d30, %d32, %d52 ;\ faligndata %d32, %d34, %d54 ;\ faligndata %d34, %d36, %d56 ;\ faligndata %d36, %d38, %d58 ;\ faligndata %d38, %d40, %d60 ;\ faligndata %d40, %d42, %d62 #define FALIGN_D42 \ faligndata %d42, %d44, %d48 ;\ faligndata %d44, %d46, %d50 ;\ faligndata %d46, %d0, %d52 ;\ faligndata %d0, %d2, %d54 ;\ faligndata %d2, %d4, %d56 ;\ faligndata %d4, %d6, %d58 ;\ faligndata %d6, %d8, %d60 ;\ faligndata %d8, %d10, %d62 #define FALIGN_D12 \ faligndata %d12, %d14, %d48 ;\ faligndata %d14, %d16, %d50 ;\ faligndata %d16, %d18, %d52 ;\ faligndata %d18, %d20, %d54 ;\ faligndata %d20, %d22, %d56 ;\ faligndata %d22, %d24, %d58 ;\ faligndata %d24, 
%d26, %d60 ;\ faligndata %d26, %d28, %d62 #define FALIGN_D28 \ faligndata %d28, %d30, %d48 ;\ faligndata %d30, %d32, %d50 ;\ faligndata %d32, %d34, %d52 ;\ faligndata %d34, %d36, %d54 ;\ faligndata %d36, %d38, %d56 ;\ faligndata %d38, %d40, %d58 ;\ faligndata %d40, %d42, %d60 ;\ faligndata %d42, %d44, %d62 #define FALIGN_D44 \ faligndata %d44, %d46, %d48 ;\ faligndata %d46, %d0, %d50 ;\ faligndata %d0, %d2, %d52 ;\ faligndata %d2, %d4, %d54 ;\ faligndata %d4, %d6, %d56 ;\ faligndata %d6, %d8, %d58 ;\ faligndata %d8, %d10, %d60 ;\ faligndata %d10, %d12, %d62 #define FALIGN_D14 \ faligndata %d14, %d16, %d48 ;\ faligndata %d16, %d18, %d50 ;\ faligndata %d18, %d20, %d52 ;\ faligndata %d20, %d22, %d54 ;\ faligndata %d22, %d24, %d56 ;\ faligndata %d24, %d26, %d58 ;\ faligndata %d26, %d28, %d60 ;\ faligndata %d28, %d30, %d62 #define FALIGN_D30 \ faligndata %d30, %d32, %d48 ;\ faligndata %d32, %d34, %d50 ;\ faligndata %d34, %d36, %d52 ;\ faligndata %d36, %d38, %d54 ;\ faligndata %d38, %d40, %d56 ;\ faligndata %d40, %d42, %d58 ;\ faligndata %d42, %d44, %d60 ;\ faligndata %d44, %d46, %d62 #define FALIGN_D46 \ faligndata %d46, %d0, %d48 ;\ faligndata %d0, %d2, %d50 ;\ faligndata %d2, %d4, %d52 ;\ faligndata %d4, %d6, %d54 ;\ faligndata %d6, %d8, %d56 ;\ faligndata %d8, %d10, %d58 ;\ faligndata %d10, %d12, %d60 ;\ faligndata %d12, %d14, %d62 /* * Copy a block of storage, returning an error code if `from' or * `to' takes a kernel pagefault which cannot be resolved. * Returns errno value on pagefault error, 0 if all ok */ .seg ".text" .align 4 ENTRY(kcopy) save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp set .copyerr, %l6 ! copyerr is lofault value ldn [THREAD_REG + T_LOFAULT], %l7 ! save existing handler membar #Sync ! sync error barrier (see copy.s) stn %l6, [THREAD_REG + T_LOFAULT] ! set t_lofault ! ! Note that we carefully do *not* flag the setting of ! t_lofault. ! ba,pt %ncc, .do_copy ! 
common code mov %l7, %l6 /* * We got here because of a fault during kcopy or bcopy if a fault * handler existed when bcopy was called. * Errno value is in %g1. */ .copyerr: set .copyerr2, %l1 membar #Sync ! sync error barrier stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault btst FPUSED_FLAG, %l6 bz %icc, 1f and %l6, BCOPY_FLAG, %l1 ! copy flag to %l1 membar #Sync ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr wr %o2, 0, %gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz %icc, 4f nop ! restore fpregs from stack membar #Sync add %fp, STACK_BIAS - 257, %o2 and %o2, -64, %o2 ldda [%o2]ASI_BLK_P, %d0 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d16 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d32 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d48 membar #Sync ba,pt %ncc, 2f wr %o3, 0, %fprs ! restore fprs 4: FZERO ! zero all of the fpregs wr %o3, 0, %fprs ! restore fprs 2: ldn [THREAD_REG + T_LWP], %o2 tst %o2 bnz,pt %ncc, 1f nop ldsb [THREAD_REG + T_PREEMPT], %l0 deccc %l0 bnz,pn %ncc, 1f stb %l0, [THREAD_REG + T_PREEMPT] ! Check for a kernel preemption request ldn [THREAD_REG + T_CPU], %l0 ldub [%l0 + CPU_KPRUNRUN], %l0 tst %l0 bnz,a,pt %ncc, 1f ! Need to call kpreempt? or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag ! ! Need to cater for the different expectations of kcopy ! and bcopy. kcopy will *always* set a t_lofault handler ! If it fires, we're expected to just return the error code ! and *not* to invoke any existing error handler. As far as ! bcopy is concerned, we only set t_lofault if there was an ! existing lofault handler. In that case we're expected to ! invoke the previously existing handler after resetting the ! t_lofault value. ! 1: andn %l6, COPY_FLAGS, %l6 ! remove flags from lofault address membar #Sync ! sync error barrier stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault ! call kpreempt if necessary btst KPREEMPT_FLAG, %l1 bz,pt %icc, 2f nop call kpreempt rdpr %pil, %o0 ! 
pass %pil 2: btst BCOPY_FLAG, %l1 bnz,pn %ncc, 3f nop ret restore %g1, 0, %o0 3: ! ! We're here via bcopy. There *must* have been an error handler ! in place otherwise we would have died a nasty death already. ! jmp %l6 ! goto real handler restore %g0, 0, %o0 ! dispose of copy window /* * We got here because of a fault in .copyerr. We can't safely restore fp * state, so we panic. */ fp_panic_msg: .asciz "Unable to restore fp state after copy operation" .align 4 .copyerr2: set fp_panic_msg, %o0 call panic nop SET_SIZE(kcopy) /* * Copy a block of storage - must not overlap (from + len <= to). * Registers: l6 - saved t_lofault * * Copy a page of memory. * Assumes double word alignment and a count >= 256. */ ENTRY(bcopy) save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault tst %l6 ! ! We've already captured whether t_lofault was zero on entry. ! We need to mark ourselves as being from bcopy since both ! kcopy and bcopy use the same code path. If BCOPY_FLAG is set ! and the saved lofault was zero, we won't reset lofault on ! returning. ! or %l6, BCOPY_FLAG, %l6 bz,pt %ncc, .do_copy sethi %hi(.copyerr), %o2 or %o2, %lo(.copyerr), %o2 membar #Sync ! sync error barrier stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector .do_copy: cmp %i2, 12 ! for small counts blu %ncc, .bytecp ! just copy bytes .empty cmp %i2, VIS_COPY_THRESHOLD ! for large counts blu,pt %ncc, .bcb_punt .empty ! ! Check to see if VIS acceleration is enabled ! sethi %hi(use_hw_bcopy), %o2 ld [%o2 + %lo(use_hw_bcopy)], %o2 tst %o2 bz,pn %icc, .bcb_punt nop subcc %i1, %i0, %i3 bneg,a,pn %ncc, 1f neg %i3 1: /* * Compare against 256 since we should be checking block addresses * and (dest & ~63) - (src & ~63) can be 3 blocks even if * src = dest + (64 * 3) + 63. */ cmp %i3, 256 blu,pn %ncc, .bcb_punt nop ldn [THREAD_REG + T_LWP], %o3 tst %o3 bnz,pt %ncc, 1f nop ! 
kpreempt_disable(); ldsb [THREAD_REG + T_PREEMPT], %o2 inc %o2 stb %o2, [THREAD_REG + T_PREEMPT] 1: rd %fprs, %o2 ! check for unused fp st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs btst FPRS_FEF, %o2 bz,a %icc, .do_blockcopy wr %g0, FPRS_FEF, %fprs .bcb_fpregs_inuse: cmp %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger bgeu %ncc, 1f ! if we have to save the fpregs) nop tst %o3 bnz,pt %ncc, .bcb_punt nop ldsb [THREAD_REG + T_PREEMPT], %l0 deccc %l0 bnz,pn %icc, .bcb_punt stb %l0, [THREAD_REG + T_PREEMPT] ! Check for a kernel preemption request ldn [THREAD_REG + T_CPU], %l0 ldub [%l0 + CPU_KPRUNRUN], %l0 tst %l0 bz,pt %icc, .bcb_punt nop ! Attempt to preempt call kpreempt rdpr %pil, %o0 ! pass %pil ba,pt %ncc, .bcb_punt nop 1: wr %g0, FPRS_FEF, %fprs ! save in-use fpregs on stack membar #Sync add %fp, STACK_BIAS - 257, %o2 and %o2, -64, %o2 stda %d0, [%o2]ASI_BLK_P add %o2, 64, %o2 stda %d16, [%o2]ASI_BLK_P add %o2, 64, %o2 stda %d32, [%o2]ASI_BLK_P add %o2, 64, %o2 stda %d48, [%o2]ASI_BLK_P membar #Sync .do_blockcopy: membar #StoreStore|#StoreLoad|#LoadStore rd %gsr, %o2 st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr ! Set the lower bit in the saved t_lofault to indicate ! that we need to clear the %fprs register on the way ! out or %l6, FPUSED_FLAG, %l6 ! Swap src/dst since the code below is memcpy code ! and memcpy/bcopy have different calling sequences mov %i1, %i5 mov %i0, %i1 mov %i5, %i0 !!! This code is nearly identical to the version in the sun4u !!! libc_psr. Most bugfixes made to that file should be !!! merged into this routine. andcc %i0, 7, %o3 bz,pt %ncc, blkcpy sub %o3, 8, %o3 neg %o3 sub %i2, %o3, %i2 ! Align Destination on double-word boundary 2: ldub [%i1], %o4 inc %i1 inc %i0 deccc %o3 bgu %ncc, 2b stb %o4, [%i0 - 1] blkcpy: andcc %i0, 63, %i3 bz,pn %ncc, blalign ! now block aligned sub %i3, 64, %i3 neg %i3 ! bytes till block aligned sub %i2, %i3, %i2 ! update %i2 with new count ! 
Copy %i3 bytes till dst is block (64 byte) aligned. use ! double word copies. alignaddr %i1, %g0, %g1 ldd [%g1], %d0 add %g1, 8, %g1 6: ldd [%g1], %d2 add %g1, 8, %g1 subcc %i3, 8, %i3 faligndata %d0, %d2, %d8 std %d8, [%i0] add %i1, 8, %i1 bz,pn %ncc, blalign add %i0, 8, %i0 ldd [%g1], %d0 add %g1, 8, %g1 subcc %i3, 8, %i3 faligndata %d2, %d0, %d8 std %d8, [%i0] add %i1, 8, %i1 bgu,pn %ncc, 6b add %i0, 8, %i0 blalign: membar #StoreLoad ! %i2 = total length ! %i3 = blocks (length - 64) / 64 ! %i4 = doubles remaining (length - blocks) sub %i2, 64, %i3 andn %i3, 63, %i3 sub %i2, %i3, %i4 andn %i4, 7, %i4 sub %i4, 16, %i4 sub %i2, %i4, %i2 sub %i2, %i3, %i2 andn %i1, 0x3f, %l7 ! blk aligned address alignaddr %i1, %g0, %g0 ! gen %gsr srl %i1, 3, %l5 ! bits 3,4,5 are now least sig in %l5 andcc %l5, 7, %i5 ! mask everything except bits 1,2 3 add %i1, %i4, %i1 add %i1, %i3, %i1 ldda [%l7]ASI_BLK_P, %d0 add %l7, 64, %l7 ldda [%l7]ASI_BLK_P, %d16 add %l7, 64, %l7 ldda [%l7]ASI_BLK_P, %d32 add %l7, 64, %l7 sub %i3, 128, %i3 ! switch statement to get us to the right 8 byte blk within a ! 64 byte block cmp %i5, 4 bgeu,a hlf cmp %i5, 6 cmp %i5, 2 bgeu,a sqtr nop cmp %i5, 1 be,a seg1 nop ba,pt %ncc, seg0 nop sqtr: be,a seg2 nop ba,pt %ncc, seg3 nop hlf: bgeu,a fqtr nop cmp %i5, 5 be,a seg5 nop ba,pt %ncc, seg4 nop fqtr: be,a seg6 nop ba,pt %ncc, seg7 nop seg0: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D0 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D16 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D32 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg0 0: FALIGN_D16 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D32 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd0 add %i0, 64, %i0 1: FALIGN_D32 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D0 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd16 add %i0, 64, %i0 2: FALIGN_D0 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D16 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd32 add %i0, 64, %i0 seg1: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D2 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D18 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D34 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg1 0: FALIGN_D18 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D34 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd2 add %i0, 64, %i0 1: FALIGN_D34 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D2 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd18 add %i0, 64, %i0 2: FALIGN_D2 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D18 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd34 add %i0, 64, %i0 seg2: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D4 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 
2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D20 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D36 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg2 0: FALIGN_D20 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D36 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd4 add %i0, 64, %i0 1: FALIGN_D36 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D4 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd20 add %i0, 64, %i0 2: FALIGN_D4 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D20 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd36 add %i0, 64, %i0 seg3: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D6 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D22 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D38 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg3 0: FALIGN_D22 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D38 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd6 add %i0, 64, %i0 1: FALIGN_D38 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D6 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd22 add %i0, 64, %i0 2: FALIGN_D6 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D22 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd38 add %i0, 64, %i0 seg4: ! 
1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D8 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D24 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D40 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg4 0: FALIGN_D24 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D40 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd8 add %i0, 64, %i0 1: FALIGN_D40 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D8 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd24 add %i0, 64, %i0 2: FALIGN_D8 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D24 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd40 add %i0, 64, %i0 seg5: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D10 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D26 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D42 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg5 0: FALIGN_D26 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D42 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd10 add %i0, 64, %i0 1: FALIGN_D42 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D10 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd26 add %i0, 64, %i0 2: FALIGN_D10 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D26 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd42 add %i0, 64, %i0 seg6: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D12 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D28 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D44 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg6 0: FALIGN_D28 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D44 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd12 add %i0, 64, %i0 1: FALIGN_D44 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D12 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd28 add %i0, 64, %i0 2: FALIGN_D12 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D28 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd44 add %i0, 64, %i0 seg7: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D14 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 
2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D30 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D46 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_P add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, seg7 0: FALIGN_D30 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D46 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd14 add %i0, 64, %i0 1: FALIGN_D46 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D14 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd30 add %i0, 64, %i0 2: FALIGN_D14 stda %d48, [%i0]ASI_BLK_P add %i0, 64, %i0 membar #Sync FALIGN_D30 stda %d48, [%i0]ASI_BLK_P ba,pt %ncc, blkd46 add %i0, 64, %i0 ! ! dribble out the last partial block ! blkd0: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d0, %d2, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd2: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d2, %d4, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd4: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d4, %d6, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd6: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d6, %d8, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd8: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d8, %d10, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd10: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d10, %d12, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd12: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d12, %d14, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd14: subcc %i4, 8, %i4 blu,pn %ncc, blkdone fsrc1 %d14, %d0 ba,a,pt %ncc, blkleft blkd16: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d16, %d18, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd18: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d18, %d20, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd20: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d20, %d22, %d48 std %d48, [%i0] add %i0, 8, %i0 
blkd22: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d22, %d24, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd24: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d24, %d26, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd26: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d26, %d28, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd28: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d28, %d30, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd30: subcc %i4, 8, %i4 blu,pn %ncc, blkdone fsrc1 %d30, %d0 ba,a,pt %ncc, blkleft blkd32: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d32, %d34, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd34: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d34, %d36, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd36: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d36, %d38, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd38: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d38, %d40, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd40: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d40, %d42, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd42: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d42, %d44, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd44: subcc %i4, 8, %i4 blu,pn %ncc, blkdone faligndata %d44, %d46, %d48 std %d48, [%i0] add %i0, 8, %i0 blkd46: subcc %i4, 8, %i4 blu,pn %ncc, blkdone fsrc1 %d46, %d0 blkleft: 1: ldd [%l7], %d2 add %l7, 8, %l7 subcc %i4, 8, %i4 faligndata %d0, %d2, %d8 std %d8, [%i0] blu,pn %ncc, blkdone add %i0, 8, %i0 ldd [%l7], %d0 add %l7, 8, %l7 subcc %i4, 8, %i4 faligndata %d2, %d0, %d8 std %d8, [%i0] bgeu,pt %ncc, 1b add %i0, 8, %i0 blkdone: tst %i2 bz,pt %ncc, .bcb_exit and %l3, 0x4, %l3 ! fprs.du = fprs.dl = 0 7: ldub [%i1], %i4 inc %i1 inc %i0 deccc %i2 bgu,pt %ncc, 7b stb %i4, [%i0 - 1] .bcb_exit: membar #StoreLoad|#StoreStore btst FPUSED_FLAG, %l6 bz %icc, 1f and %l6, COPY_FLAGS, %l1 ! Store flags in %l1 ! We can't clear the flags from %l6 yet. ! If there's an error, .copyerr will ! 
need them ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr wr %o2, 0, %gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz %icc, 4f nop ! restore fpregs from stack membar #Sync add %fp, STACK_BIAS - 257, %o2 and %o2, -64, %o2 ldda [%o2]ASI_BLK_P, %d0 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d16 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d32 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d48 membar #Sync ba,pt %ncc, 2f wr %o3, 0, %fprs ! restore fprs 4: FZERO ! zero all of the fpregs wr %o3, 0, %fprs ! restore fprs 2: ldn [THREAD_REG + T_LWP], %o2 tst %o2 bnz,pt %ncc, 1f nop ldsb [THREAD_REG + T_PREEMPT], %l0 deccc %l0 bnz,pn %ncc, 1f stb %l0, [THREAD_REG + T_PREEMPT] ! Check for a kernel preemption request ldn [THREAD_REG + T_CPU], %l0 ldub [%l0 + CPU_KPRUNRUN], %l0 tst %l0 bnz,a,pt %ncc, 1f ! Need to call kpreempt? or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag 1: btst BCOPY_FLAG, %l1 bz,pn %icc, 3f andncc %l6, COPY_FLAGS, %l6 ! ! Here via bcopy. Check to see if the handler was NULL. ! If so, just return quietly. Otherwise, reset the ! handler and go home. ! bnz,pn %ncc, 3f nop ! ! Null handler. Check for kpreempt flag, call if necessary, ! then return. ! btst KPREEMPT_FLAG, %l1 bz,pt %icc, 2f nop call kpreempt rdpr %pil, %o0 ! pass %pil 2: ret restore %g0, 0, %o0 ! ! Here via kcopy or bcopy with a handler.Reset the ! fault handler. ! 3: membar #Sync stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault ! call kpreempt if necessary btst KPREEMPT_FLAG, %l1 bz,pt %icc, 4f nop call kpreempt rdpr %pil, %o0 4: ret restore %g0, 0, %o0 .bcb_punt: ! ! use aligned transfers where possible ! xor %i0, %i1, %o4 ! xor from and to address btst 7, %o4 ! if lower three bits zero bz %icc, .aldoubcp ! can align on double boundary .empty ! assembler complaints about label xor %i0, %i1, %o4 ! xor from and to address btst 3, %o4 ! if lower two bits zero bz %icc, .alwordcp ! can align on word boundary btst 3, %i0 ! delay slot, from address unaligned? ! ! 
use aligned reads and writes where possible ! this differs from wordcp in that it copes ! with odd alignment between source and destnation ! using word reads and writes with the proper shifts ! in between to align transfers to and from memory ! i0 - src address, i1 - dest address, i2 - count ! i3, i4 - tmps for used generating complete word ! i5 (word to write) ! l0 size in bits of upper part of source word (US) ! l1 size in bits of lower part of source word (LS = 32 - US) ! l2 size in bits of upper part of destination word (UD) ! l3 size in bits of lower part of destination word (LD = 32 - UD) ! l4 number of bytes leftover after aligned transfers complete ! l5 the number 32 ! mov 32, %l5 ! load an oft-needed constant bz .align_dst_only btst 3, %i1 ! is destnation address aligned? clr %i4 ! clear registers used in either case bz %icc, .align_src_only clr %l0 ! ! both source and destination addresses are unaligned ! 1: ! align source ldub [%i0], %i3 ! read a byte from source address add %i0, 1, %i0 ! increment source address or %i4, %i3, %i4 ! or in with previous bytes (if any) btst 3, %i0 ! is source aligned? add %l0, 8, %l0 ! increment size of upper source (US) bnz,a 1b sll %i4, 8, %i4 ! make room for next byte sub %l5, %l0, %l1 ! generate shift left count (LS) sll %i4, %l1, %i4 ! prepare to get rest ld [%i0], %i3 ! read a word add %i0, 4, %i0 ! increment source address srl %i3, %l0, %i5 ! upper src bits into lower dst bits or %i4, %i5, %i5 ! merge mov 24, %l3 ! align destination 1: srl %i5, %l3, %i4 ! prepare to write a single byte stb %i4, [%i1] ! write a byte add %i1, 1, %i1 ! increment destination address sub %i2, 1, %i2 ! decrement count btst 3, %i1 ! is destination aligned? bnz,a 1b sub %l3, 8, %l3 ! delay slot, decrement shift count (LD) sub %l5, %l3, %l2 ! generate shift left count (UD) sll %i5, %l2, %i5 ! move leftover into upper bytes cmp %l2, %l0 ! cmp # reqd to fill dst w old src left bgu %ncc, .more_needed ! 
need more to fill than we have nop sll %i3, %l1, %i3 ! clear upper used byte(s) srl %i3, %l1, %i3 ! get the odd bytes between alignments sub %l0, %l2, %l0 ! regenerate shift count sub %l5, %l0, %l1 ! generate new shift left count (LS) and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 andn %i2, 3, %i2 ! # of aligned bytes that can be moved srl %i3, %l0, %i4 or %i5, %i4, %i5 st %i5, [%i1] ! write a word subcc %i2, 4, %i2 ! decrement count bz %ncc, .unalign_out add %i1, 4, %i1 ! increment destination address b 2f sll %i3, %l1, %i5 ! get leftover into upper bits .more_needed: sll %i3, %l0, %i3 ! save remaining byte(s) srl %i3, %l0, %i3 sub %l2, %l0, %l1 ! regenerate shift count sub %l5, %l1, %l0 ! generate new shift left count sll %i3, %l1, %i4 ! move to fill empty space b 3f or %i5, %i4, %i5 ! merge to complete word ! ! the source address is aligned and destination is not ! .align_dst_only: ld [%i0], %i4 ! read a word add %i0, 4, %i0 ! increment source address mov 24, %l0 ! initial shift alignment count 1: srl %i4, %l0, %i3 ! prepare to write a single byte stb %i3, [%i1] ! write a byte add %i1, 1, %i1 ! increment destination address sub %i2, 1, %i2 ! decrement count btst 3, %i1 ! is destination aligned? bnz,a 1b sub %l0, 8, %l0 ! delay slot, decrement shift count .xfer: sub %l5, %l0, %l1 ! generate shift left count sll %i4, %l1, %i5 ! get leftover 3: and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 2: ld [%i0], %i3 ! read a source word add %i0, 4, %i0 ! increment source address srl %i3, %l0, %i4 ! upper src bits into lower dst bits or %i5, %i4, %i5 ! merge with upper dest bits (leftover) st %i5, [%i1] ! write a destination word subcc %i2, 4, %i2 ! decrement count bz %ncc, .unalign_out ! check if done add %i1, 4, %i1 ! increment destination address b 2b ! loop sll %i3, %l1, %i5 ! get leftover .unalign_out: tst %l4 ! any bytes leftover? bz %ncc, .cpdone .empty ! 
allow next instruction in delay slot 1: sub %l0, 8, %l0 ! decrement shift srl %i3, %l0, %i4 ! upper src byte into lower dst byte stb %i4, [%i1] ! write a byte subcc %l4, 1, %l4 ! decrement count bz %ncc, .cpdone ! done? add %i1, 1, %i1 ! increment destination tst %l0 ! any more previously read bytes bnz %ncc, 1b ! we have leftover bytes mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants b .dbytecp ! let dbytecp do the rest sub %i0, %i1, %i0 ! i0 gets the difference of src and dst ! ! the destination address is aligned and the source is not ! .align_src_only: ldub [%i0], %i3 ! read a byte from source address add %i0, 1, %i0 ! increment source address or %i4, %i3, %i4 ! or in with previous bytes (if any) btst 3, %i0 ! is source aligned? add %l0, 8, %l0 ! increment shift count (US) bnz,a .align_src_only sll %i4, 8, %i4 ! make room for next byte b,a .xfer ! ! if from address unaligned for double-word moves, ! move bytes till it is, if count is < 56 it could take ! longer to align the thing than to do the transfer ! in word size chunks right away ! .aldoubcp: cmp %i2, 56 ! if count < 56, use wordcp, it takes blu,a %ncc, .alwordcp ! longer to align doubles than words mov 3, %o0 ! mask for word alignment call .alignit ! copy bytes until aligned mov 7, %o0 ! mask for double alignment ! ! source and destination are now double-word aligned ! i3 has aligned count returned by alignit ! and %i2, 7, %i2 ! unaligned leftover count sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 5: ldx [%i0+%i1], %o4 ! read from address stx %o4, [%i1] ! write at destination address subcc %i3, 8, %i3 ! dec count bgu %ncc, 5b add %i1, 8, %i1 ! delay slot, inc to address cmp %i2, 4 ! see if we can copy a word blu %ncc, .dbytecp ! if 3 or less bytes use bytecp .empty ! ! for leftover bytes we fall into wordcp, if needed ! .wordcp: and %i2, 3, %i2 ! unaligned leftover count 5: ld [%i0+%i1], %o4 ! read from address st %o4, [%i1] ! write at destination address subcc %i3, 4, %i3 ! 
dec count bgu %ncc, 5b add %i1, 4, %i1 ! delay slot, inc to address b,a .dbytecp ! we come here to align copies on word boundaries .alwordcp: call .alignit ! go word-align it mov 3, %o0 ! bits that must be zero to be aligned b .wordcp sub %i0, %i1, %i0 ! i0 gets the difference of src and dst ! ! byte copy, works with any alignment ! .bytecp: b .dbytecp sub %i0, %i1, %i0 ! i0 gets difference of src and dst ! ! differenced byte copy, works with any alignment ! assumes dest in %i1 and (source - dest) in %i0 ! 1: stb %o4, [%i1] ! write to address inc %i1 ! inc to address .dbytecp: deccc %i2 ! dec count bgeu,a %ncc, 1b ! loop till done ldub [%i0+%i1], %o4 ! read from address ! ! FPUSED_FLAG will not have been set in any path leading to ! this point. No need to deal with it. ! .cpdone: btst BCOPY_FLAG, %l6 bz,pn %icc, 2f andncc %l6, BCOPY_FLAG, %l6 ! ! Here via bcopy. Check to see if the handler was NULL. ! If so, just return quietly. Otherwise, reset the ! handler and go home. ! bnz,pn %ncc, 2f nop ! ! Null handler. ! ret restore %g0, 0, %o0 ! ! Here via kcopy or bcopy with a handler.Reset the ! fault handler. ! 2: membar #Sync stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault ret restore %g0, 0, %o0 ! return (0) /* * Common code used to align transfers on word and doubleword * boundaries. Aligns source and destination and returns a count * of aligned bytes to transfer in %i3 */ 1: inc %i0 ! inc from stb %o4, [%i1] ! write a byte inc %i1 ! inc to dec %i2 ! dec count .alignit: btst %o0, %i0 ! %o0 is bit mask to check for alignment bnz,a 1b ldub [%i0], %o4 ! read next byte retl andn %i2, %o0, %i3 ! return size of aligned bytes SET_SIZE(bcopy) /* * Block copy with possibly overlapped operands. * * On entry %o0 is the from address, %o1 the to address and %o2 the * byte count. A zero count returns immediately. When the count is * no larger than the distance between the two addresses the regions * cannot overlap, so the work is handed to bcopy. Otherwise the * bytes are moved one at a time: starting with the last byte and * working down when from < to, and starting with the first byte and * working up otherwise, so that no source byte is overwritten before * it has been read. */ ENTRY(ovbcopy) tst %o2 ! check count bgu,a %ncc, 1f ! nothing to do or bad arguments subcc %o0, %o1, %o3 ! difference of from and to address retl ! return nop 1: bneg,a %ncc, 2f neg %o3 ! if < 0, make it positive 2: cmp %o2, %o3 !
cmp size and abs(from - to) bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, .empty ! no overlap cmp %o0, %o1 ! compare from and to addresses blu %ncc, .ov_bkwd ! if from < to, copy backwards nop ! ! Copy forwards. ! .ov_fwd: ldub [%o0], %o3 ! read from address inc %o0 ! inc from address stb %o3, [%o1] ! write to address deccc %o2 ! dec count bgu %ncc, .ov_fwd ! loop till done inc %o1 ! inc to address retl ! return nop ! ! Copy backwards. ! .ov_bkwd: deccc %o2 ! dec count ldub [%o0 + %o2], %o3 ! get byte at end of src bgu %ncc, .ov_bkwd ! loop till done stb %o3, [%o1 + %o2] ! delay slot, store at end of dst retl ! return nop SET_SIZE(ovbcopy) /* * hwblkpagecopy() * * Copies exactly one page. This routine assumes the caller (ppcopy) * has already disabled kernel preemption and has checked * use_hw_bcopy. * * After the save, %i0 is the source and %i1 the destination. If * FPRS_FEF is set in %fprs on entry, fp register blocks %d0, %d16 * and %d32 are block-stored to a 64-byte aligned area in the new * frame and block-loaded again before returning; the entry value of * %fprs is written back in either case. The page is moved 64 bytes * at a time: each block is loaded alternately into %d0 or %d16, * copied to %d32-%d46 with fsrc1 and block-stored to the destination * (the final block is stored directly from %d16). Returns 0. */ ENTRY(hwblkpagecopy) ! get another window w/space for three aligned blocks of saved fpregs save %sp, -SA(MINFRAME + 4*64), %sp ! %i0 - source address (arg) ! %i1 - destination address (arg) ! %i2 - length of region (not arg) ! %l0 - saved fprs ! %l1 - pointer to saved fpregs rd %fprs, %l0 ! check for unused fp btst FPRS_FEF, %l0 bz 1f membar #Sync !
save in-use fpregs on stack add %fp, STACK_BIAS - 193, %l1 and %l1, -64, %l1 stda %d0, [%l1]ASI_BLK_P add %l1, 64, %l3 stda %d16, [%l3]ASI_BLK_P add %l3, 64, %l3 stda %d32, [%l3]ASI_BLK_P membar #Sync 1: wr %g0, FPRS_FEF, %fprs ldda [%i0]ASI_BLK_P, %d0 add %i0, 64, %i0 set PAGESIZE - 64, %i2 2: ldda [%i0]ASI_BLK_P, %d16 fsrc1 %d0, %d32 fsrc1 %d2, %d34 fsrc1 %d4, %d36 fsrc1 %d6, %d38 fsrc1 %d8, %d40 fsrc1 %d10, %d42 fsrc1 %d12, %d44 fsrc1 %d14, %d46 stda %d32, [%i1]ASI_BLK_P add %i0, 64, %i0 subcc %i2, 64, %i2 bz,pn %ncc, 3f add %i1, 64, %i1 ldda [%i0]ASI_BLK_P, %d0 fsrc1 %d16, %d32 fsrc1 %d18, %d34 fsrc1 %d20, %d36 fsrc1 %d22, %d38 fsrc1 %d24, %d40 fsrc1 %d26, %d42 fsrc1 %d28, %d44 fsrc1 %d30, %d46 stda %d32, [%i1]ASI_BLK_P add %i0, 64, %i0 sub %i2, 64, %i2 ba,pt %ncc, 2b add %i1, 64, %i1 3: membar #Sync btst FPRS_FEF, %l0 bz 4f stda %d16, [%i1]ASI_BLK_P ! restore fpregs from stack membar #Sync ldda [%l1]ASI_BLK_P, %d0 add %l1, 64, %l3 ldda [%l3]ASI_BLK_P, %d16 add %l3, 64, %l3 ldda [%l3]ASI_BLK_P, %d32 4: wr %l0, 0, %fprs ! restore fprs membar #Sync ret restore %g0, 0, %o0 SET_SIZE(hwblkpagecopy) /* * Transfer data to and from user space - * Note that these routines can cause faults * It is assumed that the kernel has nothing at * less than KERNELBASE in the virtual address space. * * Note that copyin(9F) and copyout(9F) are part of the * DDI/DKI which specifies that they return '-1' on "errors." * * Sigh. * * So there's two extremely similar routines - xcopyin() and xcopyout() * which return the errno that we've faithfully computed. This * allows other callers (e.g. uiomove(9F)) to work correctly. * Given that these are used pretty heavily, we expand the calling * sequences inline for all flavours (rather than making wrappers). * * There are also stub routines for xcopyout_little and xcopyin_little, * which currently are intended to handle requests of <= 16 bytes from * do_unaligned.
Future enhancement to make them handle 8k pages efficiently * is left as an exercise... */ /* * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) * * General theory of operation: * * The only difference between default_copy{in,out} and * default_xcopy{in,out} is in the error handling routine they invoke * when a memory access error is seen. default_xcopyOP returns the errno * while default_copyOP returns -1 (see above). copy{in,out}_noerr set * a special flag (by oring the value 2 into the fault handler address) * if they are called with a fault handler already in place. That flag * causes the default handlers to trampoline to the previous handler * upon an error. * * None of the copyops routines grab a window until it's decided that * we need to do a HW block copy operation. This saves a window * spill/fill when we're called during socket ops. The typical IO * path won't cause spill/fill traps. * * This code uses a set of 4 limits for the maximum size that will * be copied given a particular input/output address alignment. * the default limits are: * * single byte aligned - 900 (hw_copy_limit_1) * two byte aligned - 1800 (hw_copy_limit_2) * four byte aligned - 3600 (hw_copy_limit_4) * eight byte aligned - 7200 (hw_copy_limit_8) * * If the value for a particular limit is zero, the copy will be done * via the copy loops rather than VIS. * * Flow: * * If count == zero return zero. * * Store the previous lo_fault handler into %g6. * Place our secondary lofault handler into %g5. * Place the address of our nowindow fault handler into %o3. * Place the address of the windowed fault handler into %o4. * --> We'll use this handler if we end up grabbing a window * --> before we use VIS instructions. * * If count is less than or equal to SMALL_LIMIT (7) we * always do a byte for byte copy. * * If count is > SMALL_LIMIT, we check the alignment of the input * and output pointers. 
Based on the alignment we check count * against a soft limit of VIS_COPY_THRESHOLD (900 on spitfire). If * we're larger than VIS_COPY_THRESHOLD, we check against a limit based * on detected alignment. If we exceed the alignment value we copy * via VIS instructions. * * If we don't exceed one of the limits, we store -count in %o3, * we store the number of chunks (8, 4, 2 or 1 byte) operated * on in our basic copy loop in %o2. Following this we branch * to the appropriate copy loop and copy that many chunks. * Since we've been adding the chunk size to %o3 each time through * as well as decrementing %o2, we can tell if any data is * left to be copied by examining %o3. If that is zero, we're * done and can go home. If not, we figure out what the largest * chunk size left to be copied is and branch to that copy loop * unless there's only one byte left. We load that as we're * branching to code that stores it just before we return. * * There is one potential situation in which we start to do a VIS * copy but decide to punt and return to the copy loops. There is * (in the default configuration) a window of 256 bytes between * the single byte aligned copy limit and what VIS treats as its * minimum if floating point is in use in the calling app. We need * to be prepared to handle this. See the .small_copyOP label for * details. * * Fault handlers are invoked if we reference memory that has no * current mapping. All forms share the same copyio_fault handler. * This routine handles fixing up the stack and general housecleaning. * Each copy operation has a simple fault handler that is then called * to do the work specific to the individual operation. The handlers * for default_copyOP and copyOP_noerr are found at the end of * default_copyout. The handlers for default_xcopyOP are found at the * end of xdefault_copyin. */ /* * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
*/ /* * We save the arguments in the following registers in case of a fault: * kaddr - %g2 * uaddr - %g3 * count - %g4 * * REAL_LOFAULT (%g5) holds the operation-specific handler that the * copyio_fault routines jump to. SAVED_LOFAULT (%g6) holds the * t_lofault value that was in place when the copy started, with * FPUSED_FLAG or'ed in once the fp registers are in use. */ #define SAVE_SRC %g2 #define SAVE_DST %g3 #define SAVE_COUNT %g4 #define REAL_LOFAULT %g5 #define SAVED_LOFAULT %g6 /* * Generic copyio fault handler. This is the first line of defense when a * fault occurs in (x)copyin/(x)copyout. In order for this to function * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. * This allows us to share common code for all the flavors of the copy * operations, including the _noerr versions. * * Note that this function will restore the original input parameters before * calling REAL_LOFAULT. So the real handler can vector to the appropriate * member of the t_copyop structure, if needed. * * If FPUSED_FLAG is set in SAVED_LOFAULT it is cleared and the fp state * is put back: %gsr is reloaded from the frame and, depending on * FPRS_FEF in the saved %fprs, the fp registers are either block-loaded * from the stack save area or zeroed (FZERO) before %fprs is rewritten. * The handler then pops its register window with restore, so it is the * handler to use once a window has been taken for the block copy. */ ENTRY(copyio_fault) btst FPUSED_FLAG, SAVED_LOFAULT bz 1f andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT membar #Sync ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 wr %o2, 0, %gsr ! restore gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz 4f nop ! restore fpregs from stack membar #Sync add %fp, STACK_BIAS - 257, %o2 and %o2, -64, %o2 ldda [%o2]ASI_BLK_P, %d0 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d16 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d32 add %o2, 64, %o2 ldda [%o2]ASI_BLK_P, %d48 membar #Sync ba,pt %ncc, 1f wr %o3, 0, %fprs ! restore fprs 4: FZERO ! zero all of the fpregs wr %o3, 0, %fprs ! restore fprs 1: restore mov SAVE_SRC, %o0 mov SAVE_DST, %o1 jmp REAL_LOFAULT mov SAVE_COUNT, %o2 SET_SIZE(copyio_fault) /* * Fault handler used while a copy is still running as a leaf routine, * i.e. before any register window has been taken. It writes * SAVED_LOFAULT back to the thread's t_lofault, reloads the original * arguments into %o0-%o2 and jumps to REAL_LOFAULT. Unlike * copyio_fault it does no restore and touches no fp state. */ ENTRY(copyio_fault_nowindow) membar #Sync stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault mov SAVE_SRC, %o0 mov SAVE_DST, %o1 jmp REAL_LOFAULT mov SAVE_COUNT, %o2 SET_SIZE(copyio_fault_nowindow) ENTRY(copyout) sethi %hi(.copyout_err), REAL_LOFAULT or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT .do_copyout: ! ! Check the length and bail if zero. !
tst %o2 bnz,pt %ncc, 1f nop retl clr %o0 1: sethi %hi(copyio_fault), %o4 or %o4, %lo(copyio_fault), %o4 sethi %hi(copyio_fault_nowindow), %o3 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT or %o3, %lo(copyio_fault_nowindow), %o3 membar #Sync stn %o3, [THREAD_REG + T_LOFAULT] mov %o0, SAVE_SRC mov %o1, SAVE_DST mov %o2, SAVE_COUNT ! ! Check to see if we're more than SMALL_LIMIT (7 bytes). ! Run in leaf mode, using the %o regs as our input regs. ! subcc %o2, SMALL_LIMIT, %o3 bgu,a,pt %ncc, .dco_ns or %o0, %o1, %o3 ! ! What was previously ".small_copyout" ! Do full differenced copy. ! .dcobcp: sub %g0, %o2, %o3 ! negate count add %o0, %o2, %o0 ! make %o0 point at the end add %o1, %o2, %o1 ! make %o1 point at the end ba,pt %ncc, .dcocl ldub [%o0 + %o3], %o4 ! load first byte ! ! %o0 and %o2 point at the end and remain pointing at the end ! of their buffers. We pull things out by adding %o3 (which is ! the negation of the length) to the buffer end which gives us ! the curent location in the buffers. By incrementing %o3 we walk ! through both buffers without having to bump each buffer's ! pointer. A very fast 4 instruction loop. ! .align 16 .dcocl: stba %o4, [%o1 + %o3]ASI_USER inccc %o3 bl,a,pt %ncc, .dcocl ldub [%o0 + %o3], %o4 ! ! We're done. Go home. ! membar #Sync stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] retl clr %o0 ! ! Try aligned copies from here. ! .dco_ns: ! %o0 = kernel addr (to be copied from) ! %o1 = user addr (to be copied to) ! %o2 = length ! %o3 = %o1 | %o2 (used for alignment checking) ! %o4 is alternate lo_fault ! %o5 is original lo_fault ! ! See if we're single byte aligned. If we are, check the ! limit for single byte copies. If we're smaller or equal, ! bounce to the byte for byte copy loop. Otherwise do it in ! HW (if enabled). ! btst 1, %o3 bz,pt %icc, .dcoh8 btst 7, %o3 ! ! Single byte aligned. Do we do it via HW or via ! byte for byte? Do a quick no memory reference ! check to pick up small copies. ! 
subcc %o2, VIS_COPY_THRESHOLD, %o3 bleu,pt %ncc, .dcobcp sethi %hi(hw_copy_limit_1), %o3 ! ! Big enough that we need to check the HW limit for ! this size copy. ! ld [%o3 + %lo(hw_copy_limit_1)], %o3 ! ! Is HW copy on? If not, do everything byte for byte. ! tst %o3 bz,pn %icc, .dcobcp subcc %o3, %o2, %o3 ! ! If we're less than or equal to the single byte copy limit, ! bop to the copy loop. ! bge,pt %ncc, .dcobcp nop ! ! We're big enough and copy is on. Do it with HW. ! ba,pt %ncc, .big_copyout nop .dcoh8: ! ! 8 byte aligned? ! bnz,a %ncc, .dcoh4 btst 3, %o3 ! ! See if we're in the "small range". ! If so, go off and do the copy. ! If not, load the hard limit. %o3 is ! available for reuse. ! subcc %o2, VIS_COPY_THRESHOLD, %o3 bleu,pt %ncc, .dcos8 sethi %hi(hw_copy_limit_8), %o3 ld [%o3 + %lo(hw_copy_limit_8)], %o3 ! ! If it's zero, there's no HW bcopy. ! Bop off to the aligned copy. ! tst %o3 bz,pn %icc, .dcos8 subcc %o3, %o2, %o3 ! ! We're negative if our size is larger than hw_copy_limit_8. ! bge,pt %ncc, .dcos8 nop ! ! HW assist is on and we're large enough. Do it. ! ba,pt %ncc, .big_copyout nop .dcos8: ! ! Housekeeping for copy loops. Uses same idea as in the byte for ! byte copy loop above. ! add %o0, %o2, %o0 add %o1, %o2, %o1 sub %g0, %o2, %o3 ba,pt %ncc, .dodebc srl %o2, 3, %o2 ! Number of 8 byte chunks to copy ! ! 4 byte aligned? ! .dcoh4: bnz,pn %ncc, .dcoh2 ! ! See if we're in the "small range". ! If so, go off an do the copy. ! If not, load the hard limit. %o3 is ! available for reuse. ! subcc %o2, VIS_COPY_THRESHOLD, %o3 bleu,pt %ncc, .dcos4 sethi %hi(hw_copy_limit_4), %o3 ld [%o3 + %lo(hw_copy_limit_4)], %o3 ! ! If it's zero, there's no HW bcopy. ! Bop off to the aligned copy. ! tst %o3 bz,pn %icc, .dcos4 subcc %o3, %o2, %o3 ! ! We're negative if our size is larger than hw_copy_limit_4. ! bge,pt %ncc, .dcos4 nop ! ! HW assist is on and we're large enough. Do it. ! 
ba,pt %ncc, .big_copyout nop .dcos4: add %o0, %o2, %o0 add %o1, %o2, %o1 sub %g0, %o2, %o3 ba,pt %ncc, .dodfbc srl %o2, 2, %o2 ! Number of 4 byte chunks to copy ! ! We must be 2 byte aligned. Off we go. ! The check for small copies was done in the ! delay at .dcoh4 ! .dcoh2: ble %ncc, .dcos2 sethi %hi(hw_copy_limit_2), %o3 ld [%o3 + %lo(hw_copy_limit_2)], %o3 tst %o3 bz,pn %icc, .dcos2 subcc %o3, %o2, %o3 bge,pt %ncc, .dcos2 nop ! ! HW is on and we're big enough. Do it. ! ba,pt %ncc, .big_copyout nop .dcos2: add %o0, %o2, %o0 add %o1, %o2, %o1 sub %g0, %o2, %o3 ba,pt %ncc, .dodtbc srl %o2, 1, %o2 ! Number of 2 byte chunks to copy .small_copyout: ! ! Why are we doing this AGAIN? There are certain conditions in ! big_copyout that will cause us to forego the HW assisted copies ! and bounce back to a non-HW assisted copy. This dispatches those ! copies. Note that we branch around this in the main line code. ! ! We make no check for limits or HW enablement here. We've ! already been told that we're a poster child so just go off ! and do it. ! or %o0, %o1, %o3 btst 1, %o3 bnz %icc, .dcobcp ! Most likely btst 7, %o3 bz %icc, .dcos8 btst 3, %o3 bz %icc, .dcos4 nop ba,pt %ncc, .dcos2 nop .align 32 .dodebc: ldx [%o0 + %o3], %o4 deccc %o2 stxa %o4, [%o1 + %o3]ASI_USER bg,pt %ncc, .dodebc addcc %o3, 8, %o3 ! ! End of copy loop. Check to see if we're done. Most ! eight byte aligned copies end here. ! bz,pt %ncc, .dcofh nop ! ! Something is left - do it byte for byte. ! ba,pt %ncc, .dcocl ldub [%o0 + %o3], %o4 ! load next byte ! ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy. ! .align 32 .dodfbc: lduw [%o0 + %o3], %o4 deccc %o2 sta %o4, [%o1 + %o3]ASI_USER bg,pt %ncc, .dodfbc addcc %o3, 4, %o3 ! ! End of copy loop. Check to see if we're done. Most ! four byte aligned copies end here. ! bz,pt %ncc, .dcofh nop ! ! Something is left. Do it byte for byte. ! ba,pt %ncc, .dcocl ldub [%o0 + %o3], %o4 ! load next byte ! ! two byte aligned copy loop. 
%o2 is the number of 2 byte chunks to ! copy. ! .align 32 .dodtbc: lduh [%o0 + %o3], %o4 deccc %o2 stha %o4, [%o1 + %o3]ASI_USER bg,pt %ncc, .dodtbc addcc %o3, 2, %o3 ! ! End of copy loop. Anything left? ! bz,pt %ncc, .dcofh nop ! ! Deal with the last byte ! ldub [%o0 + %o3], %o4 stba %o4, [%o1 + %o3]ASI_USER .dcofh: membar #Sync stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault retl clr %o0 .big_copyout: ! ! Are we using the FP registers? ! rd %fprs, %o3 ! check for unused fp btst FPRS_FEF, %o3 bnz %icc, .copyout_fpregs_inuse nop ! ! We're going to go off and do a block copy. ! Switch fault hendlers and grab a window. We ! don't do a membar #Sync since we've done only ! kernel data to this point. ! stn %o4, [THREAD_REG + T_LOFAULT] save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp ! ! %o3 is now %i3. Save original %fprs. ! st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ba,pt %ncc, .do_block_copyout ! Not in use. Go off and do it. wr %g0, FPRS_FEF, %fprs ! clear %fprs ! .copyout_fpregs_inuse: ! ! We're here if the FP regs are in use. Need to see if the request ! exceeds our suddenly larger minimum. ! cmp %i2, VIS_COPY_THRESHOLD+(64*4) ! for large counts (larger bl %ncc, .small_copyout nop ! ! We're going to go off and do a block copy. ! Change to the heavy duty fault handler and grab a window first. ! stn %o4, [THREAD_REG + T_LOFAULT] save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp st %i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! ! save in-use fpregs on stack ! wr %g0, FPRS_FEF, %fprs membar #Sync add %fp, STACK_BIAS - 257, %o2 and %o2, -64, %o2 stda %d0, [%o2]ASI_BLK_P add %o2, 64, %o2 stda %d16, [%o2]ASI_BLK_P add %o2, 64, %o2 stda %d32, [%o2]ASI_BLK_P add %o2, 64, %o2 stda %d48, [%o2]ASI_BLK_P membar #Sync .do_block_copyout: membar #StoreStore|#StoreLoad|#LoadStore rd %gsr, %o2 st %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr ! Set the lower bit in the saved t_lofault to indicate ! that we need to clear the %fprs register on the way ! 
out or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT ! Swap src/dst since the code below is memcpy code ! and memcpy/bcopy have different calling sequences mov %i1, %i5 mov %i0, %i1 mov %i5, %i0 !!! This code is nearly identical to the version in the sun4u !!! libc_psr. Most bugfixes made to that file should be !!! merged into this routine. andcc %i0, 7, %o3 bz %ncc, copyout_blkcpy sub %o3, 8, %o3 neg %o3 sub %i2, %o3, %i2 ! Align Destination on double-word boundary 2: ldub [%i1], %o4 inc %i1 stba %o4, [%i0]ASI_USER deccc %o3 bgu %ncc, 2b inc %i0 copyout_blkcpy: andcc %i0, 63, %i3 bz,pn %ncc, copyout_blalign ! now block aligned sub %i3, 64, %i3 neg %i3 ! bytes till block aligned sub %i2, %i3, %i2 ! update %i2 with new count ! Copy %i3 bytes till dst is block (64 byte) aligned. use ! double word copies. alignaddr %i1, %g0, %g1 ldd [%g1], %d0 add %g1, 8, %g1 6: ldd [%g1], %d2 add %g1, 8, %g1 subcc %i3, 8, %i3 faligndata %d0, %d2, %d8 stda %d8, [%i0]ASI_USER add %i1, 8, %i1 bz,pn %ncc, copyout_blalign add %i0, 8, %i0 ldd [%g1], %d0 add %g1, 8, %g1 subcc %i3, 8, %i3 faligndata %d2, %d0, %d8 stda %d8, [%i0]ASI_USER add %i1, 8, %i1 bgu,pn %ncc, 6b add %i0, 8, %i0 copyout_blalign: membar #StoreLoad ! %i2 = total length ! %i3 = blocks (length - 64) / 64 ! %i4 = doubles remaining (length - blocks) sub %i2, 64, %i3 andn %i3, 63, %i3 sub %i2, %i3, %i4 andn %i4, 7, %i4 sub %i4, 16, %i4 sub %i2, %i4, %i2 sub %i2, %i3, %i2 andn %i1, 0x3f, %l7 ! blk aligned address alignaddr %i1, %g0, %g0 ! gen %gsr srl %i1, 3, %l5 ! bits 3,4,5 are now least sig in %l5 andcc %l5, 7, %i5 ! mask everything except bits 1,2 3 add %i1, %i4, %i1 add %i1, %i3, %i1 ldda [%l7]ASI_BLK_P, %d0 add %l7, 64, %l7 ldda [%l7]ASI_BLK_P, %d16 add %l7, 64, %l7 ldda [%l7]ASI_BLK_P, %d32 add %l7, 64, %l7 sub %i3, 128, %i3 ! switch statement to get us to the right 8 byte blk within a ! 
64 byte block cmp %i5, 4 bgeu,a copyout_hlf cmp %i5, 6 cmp %i5, 2 bgeu,a copyout_sqtr nop cmp %i5, 1 be,a copyout_seg1 nop ba,pt %ncc, copyout_seg0 nop copyout_sqtr: be,a copyout_seg2 nop ba,pt %ncc, copyout_seg3 nop copyout_hlf: bgeu,a copyout_fqtr nop cmp %i5, 5 be,a copyout_seg5 nop ba,pt %ncc, copyout_seg4 nop copyout_fqtr: be,a copyout_seg6 nop ba,pt %ncc, copyout_seg7 nop copyout_seg0: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D0 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D16 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D32 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg0 0: FALIGN_D16 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D32 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd0 add %i0, 64, %i0 1: FALIGN_D32 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D0 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd16 add %i0, 64, %i0 2: FALIGN_D0 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D16 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd32 add %i0, 64, %i0 copyout_seg1: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D2 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D18 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D34 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg1 0: FALIGN_D18 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D34 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd2 add %i0, 64, %i0 1: FALIGN_D34 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D2 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd18 add %i0, 64, %i0 2: FALIGN_D2 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D18 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd34 add %i0, 64, %i0 copyout_seg2: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D4 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D20 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D36 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg2 0: FALIGN_D20 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D36 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd4 add %i0, 64, %i0 1: FALIGN_D36 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D4 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd20 add %i0, 64, %i0 2: FALIGN_D4 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D20 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd36 add %i0, 64, %i0 copyout_seg3: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D6 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 
2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D22 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D38 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg3 0: FALIGN_D22 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D38 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd6 add %i0, 64, %i0 1: FALIGN_D38 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D6 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd22 add %i0, 64, %i0 2: FALIGN_D6 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D22 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd38 add %i0, 64, %i0 copyout_seg4: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D8 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D24 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D40 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg4 0: FALIGN_D24 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D40 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd8 add %i0, 64, %i0 1: FALIGN_D40 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D8 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd24 add %i0, 64, %i0 2: FALIGN_D8 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D24 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd40 add %i0, 64, %i0 copyout_seg5: ! 
1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D10 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D26 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D42 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg5 0: FALIGN_D26 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D42 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd10 add %i0, 64, %i0 1: FALIGN_D42 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D10 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd26 add %i0, 64, %i0 2: FALIGN_D10 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D26 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd42 add %i0, 64, %i0 copyout_seg6: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D12 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D28 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 
3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D44 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg6 0: FALIGN_D28 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D44 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd12 add %i0, 64, %i0 1: FALIGN_D44 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D12 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd28 add %i0, 64, %i0 2: FALIGN_D12 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D28 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd44 add %i0, 64, %i0 copyout_seg7: ! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst FALIGN_D14 ldda [%l7]ASI_BLK_P, %d0 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 0f add %i0, 64, %i0 ! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst FALIGN_D30 ldda [%l7]ASI_BLK_P, %d16 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 1f add %i0, 64, %i0 ! 3rd chunk - %d0 high, %d16 pre, %d32 low, %d48 dst FALIGN_D46 ldda [%l7]ASI_BLK_P, %d32 stda %d48, [%i0]ASI_BLK_AIUS add %l7, 64, %l7 subcc %i3, 64, %i3 bz,pn %ncc, 2f add %i0, 64, %i0 ba,a,pt %ncc, copyout_seg7 0: FALIGN_D30 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D46 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd14 add %i0, 64, %i0 1: FALIGN_D46 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D14 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd30 add %i0, 64, %i0 2: FALIGN_D14 stda %d48, [%i0]ASI_BLK_AIUS add %i0, 64, %i0 membar #Sync FALIGN_D30 stda %d48, [%i0]ASI_BLK_AIUS ba,pt %ncc, copyout_blkd46 add %i0, 64, %i0 ! ! dribble out the last partial block ! 
copyout_blkd0: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d0, %d2, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd2: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d2, %d4, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd4: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d4, %d6, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd6: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d6, %d8, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd8: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d8, %d10, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd10: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d10, %d12, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd12: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d12, %d14, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd14: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone fsrc1 %d14, %d0 ba,a,pt %ncc, copyout_blkleft copyout_blkd16: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d16, %d18, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd18: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d18, %d20, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd20: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d20, %d22, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd22: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d22, %d24, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd24: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d24, %d26, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd26: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d26, %d28, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd28: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d28, %d30, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd30: subcc %i4, 8, %i4 blu,pn %ncc, 
copyout_blkdone fsrc1 %d30, %d0 ba,a,pt %ncc, copyout_blkleft copyout_blkd32: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d32, %d34, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd34: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d34, %d36, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd36: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d36, %d38, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd38: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d38, %d40, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd40: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d40, %d42, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd42: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d42, %d44, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd44: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone faligndata %d44, %d46, %d48 stda %d48, [%i0]ASI_USER add %i0, 8, %i0 copyout_blkd46: subcc %i4, 8, %i4 blu,pn %ncc, copyout_blkdone fsrc1 %d46, %d0 copyout_blkleft: 1: ldd [%l7], %d2 add %l7, 8, %l7 subcc %i4, 8, %i4 faligndata %d0, %d2, %d8 stda %d8, [%i0]ASI_USER blu,pn %ncc, copyout_blkdone add %i0, 8, %i0 ldd [%l7], %d0 add %l7, 8, %l7 subcc %i4, 8, %i4 faligndata %d2, %d0, %d8 stda %d8, [%i0]ASI_USER bgeu,pt %ncc, 1b add %i0, 8, %i0 copyout_blkdone: tst %i2 bz,pt %ncc, .copyout_exit and %l3, 0x4, %l3 ! fprs.du = fprs.dl = 0 7: ldub [%i1], %i4 inc %i1 stba %i4, [%i0]ASI_USER inc %i0 deccc %i2 bgu %ncc, 7b nop .copyout_exit: membar #StoreLoad|#StoreStore btst FPUSED_FLAG, SAVED_LOFAULT bz 1f nop ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 wr %o2, 0, %gsr ! restore gsr ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 btst FPRS_FEF, %o3 bz 4f nop ! 
! Tail of the copyout exit path, followed by the xcopyout, xcopyout_little
! and copyin entry points.
	! restore fpregs from stack
	membar	#Sync
	add	%fp, STACK_BIAS - 257, %o2
	and	%o2, -64, %o2		! round down to a 64-byte boundary
	ldda	[%o2]ASI_BLK_P, %d0
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d16
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d32
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d48
	membar	#Sync

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO				! zero all of the fpregs
	wr	%o3, 0, %fprs		! restore fprs

1:
	! Drop the FPUSED_FLAG marker bit before the value goes back into
	! t_lofault.
	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
	membar	#Sync			! sync error barrier
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g0, 0, %o0		! return (0)

	! Error exit for copyout: if the thread has a t_copyops vector,
	! jump to its CP_COPYOUT entry; otherwise return -1.
.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYOUT], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0
	SET_SIZE(copyout)

	! xcopyout: loads .xcopyout_err into REAL_LOFAULT and joins the
	! shared copyout body at .do_copyout (defined outside this span).
	! The "or" completing the address sits in the branch delay slot.
	ENTRY(xcopyout)
	sethi	%hi(.xcopyout_err), REAL_LOFAULT
	b	.do_copyout
	or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT

	! Error exit for xcopyout: same copyops dispatch as .copyout_err,
	! but without a copyops vector it returns whatever is in %g1
	! (presumably the error code left by the fault path - confirm).
.xcopyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_XCOPYOUT], %g2
	jmp	%g2
	nop
2:
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyout)

	! xcopyout_little: %o0 = source, %o1 = destination, %o2 = length.
	! Installs .little_err as t_lofault, then copies one byte at a time.
	! The source is read from its last byte backwards while the
	! destination (stored through ASI_AIUSL) is written forwards, so the
	! byte order is reversed.  Returns 0 on completion.
	ENTRY(xcopyout_little)
	sethi	%hi(.little_err), %o4
	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous handler
	or	%o4, %lo(.little_err), %o4
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]

	subcc	%g0, %o2, %o3		! %o3 = -length, counts up to zero
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	ldub	[%o0+%o3], %o4

	! %o3 goes up by one and %o0 down by two per pass, so %o0+%o3 steps
	! back one source byte while %o1+%o3 steps forward one destination
	! byte.  inccc sets carry when %o3 wraps to zero, ending the loop.
1:	stba	%o4, [%o1+%o3]ASI_AIUSL
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	ldub	[%o0+%o3], %o4

2:	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)
	SET_SIZE(xcopyout_little)

/*
 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
 */

	! copyin: loads .copyin_err into REAL_LOFAULT and falls into the
	! body shared with xcopyin and copyin_noerr.
	ENTRY(copyin)
	sethi	%hi(.copyin_err), REAL_LOFAULT
	or	REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT

.do_copyin:
	!
	! Check the length and bail if zero.
	!
! .do_copyin body: zero-length check, fault handler setup, and the
! non-FP copy paths.  Register use visible below:
!   %o0 = source (read through ASI_USER), %o1 = destination,
!   %o2 = byte count, %o3/%o4 = scratch.
	tst	%o2
	bnz,pt	%ncc, 1f
	nop
	retl
	clr	%o0			! zero length: return (0)
1:
	! %o4 = copyio_fault; it is stored to t_lofault only if the
	! block-copy path is taken.  %o3 = copyio_fault_nowindow, which is
	! installed right here.  The previous handler is kept in
	! SAVED_LOFAULT.
	sethi	%hi(copyio_fault), %o4
	or	%o4, %lo(copyio_fault), %o4
	sethi	%hi(copyio_fault_nowindow), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault_nowindow), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	! Keep the original arguments (SAVE_* are defined outside this
	! span; presumably consumed by the fault handlers - confirm).
	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dci_ns
	or	%o0, %o1, %o3		! (annulled slot) combined alignment bits
	!
	! What was previously ".small_copyin"
	!
.dcibcp:
	sub	%g0, %o2, %o3		! setup for copy loop
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcicl:
	stb	%o4, [%o1 + %o3]
	inccc	%o3
	bl,a,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0

	!
	! Try aligned copies from here.
	!
.dci_ns:
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller, or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,a,pt	%icc, .dcih8
	btst	7, %o3
	!
	! We're single byte aligned.
	!
	subcc	%o2, VIS_COPY_THRESHOLD, %o3
	bleu,pt	%ncc, .dcibcp
	sethi	%hi(hw_copy_limit_1), %o3
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	!
	! Is HW copy on? If not do everything byte for byte.
	!
	tst	%o3
	bz,pn	%icc, .dcibcp
	subcc	%o3, %o2, %o3		! %o3 = limit - size
	!
	! Are we bigger than the HW limit? If not
	! go to byte for byte.
	!
	bge,pt	%ncc, .dcibcp
	nop
	!
	! We're big enough and copy is on. Do it with HW.
	!
	ba,pt	%ncc, .big_copyin
	nop
.dcih8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcih4
	btst	3, %o3
	!
	! We're eight byte aligned.
	!
	subcc	%o2, VIS_COPY_THRESHOLD, %o3
	bleu,pt	%ncc, .dcis8
	sethi	%hi(hw_copy_limit_8), %o3
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis8
	subcc	%o3, %o2, %o3		! %o3 = limit - size
	bge	%ncc, .dcis8
	nop
	ba,pt	%ncc, .big_copyin
	nop
.dcis8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didebc
	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcih4:
	bnz	%ncc, .dcih2
	subcc	%o2, VIS_COPY_THRESHOLD, %o3	! delay slot: runs on both paths
	bleu,pt	%ncc, .dcis4
	sethi	%hi(hw_copy_limit_4), %o3
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis4
	subcc	%o3, %o2, %o3
	!
	! %o3 = hw_copy_limit_4 - size, so we're negative only if our
	! size is larger than hw_copy_limit_4.
	!
	bge	%ncc, .dcis4
	nop
	ba,pt	%ncc, .big_copyin
	nop
.dcis4:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didfbc
	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
.dcih2:
	!
	! We're two byte aligned. Check for "smallness"
	! done in delay at .dcih4
	!
	bleu,pt	%ncc, .dcis2
	sethi	%hi(hw_copy_limit_2), %o3
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis2
	subcc	%o3, %o2, %o3		! %o3 = limit - size
	!
	! Are we larger than the HW limit?
	!
	bge	%ncc, .dcis2
	nop
	!
	! HW assist is on and we're large enough to use it.
	!
	ba,pt	%ncc, .big_copyin
	nop
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
.dcis2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didtbc
	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
	!
.small_copyin:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big copyin that will cause us to forgo the HW assisted copies
	! and bounce back to a non-hw assisted copy. This dispatches
	! those copies. Note that we branch around this in the main line
	! code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	! Each btst below sits in the delay slot of the branch before it
	! and sets the condition codes for the branch after it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcibcp		! Most likely
	btst	7, %o3
	bz	%icc, .dcis8
	btst	3, %o3
	bz	%icc, .dcis4
	nop
	ba,pt	%ncc, .dcis2
	nop
	!
	! Eight byte aligned copies. A steal from the original .small_copyin
	! with modifications. %o2 is number of 8 byte chunks to copy. When
	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
	! to copy.
	!
	.align 32
.didebc:
	ldxa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	stx	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Most 8 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
	!
	.align 32
.didfbc:
	lduwa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	st	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Most 4 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
	! copy.
	!
	.align 32
.didtbc:
	lduha	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	sth	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Most 2 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Deal with the last byte
	!
	lduba	[%o0 + %o3]ASI_USER, %o4
	stb	%o4, [%o1 + %o3]
.dcifh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

.big_copyin:
	!
	! Are we using the FP registers?
	!
	rd	%fprs, %o3		! check for unused fp
	btst	FPRS_FEF, %o3
	bnz	%ncc, .copyin_fpregs_inuse
	nop
	!
	! We're going off to do a block copy.
	! Switch fault handlers and grab a window. We
	!
! .big_copyin (continued): set up for the FP block copy.  On arrival
! %o4 = copyio_fault and %o3 = the %fprs value read at .big_copyin.
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	!
	stn	%o4, [THREAD_REG + T_LOFAULT]
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	!
	! %o3 is %i3 after the save...
	!
	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]
	ba,pt	%ncc, .do_blockcopyin
	wr	%g0, FPRS_FEF, %fprs	! delay slot: enable the FPU
.copyin_fpregs_inuse:
	!
	! We're here if the FP regs are in use. Need to see if the request
	! exceeds our suddenly larger minimum.
	!
	! No register window has been pushed yet (the save is below), so
	! the length is still in %o2.  %i2 at this point belongs to our
	! caller and is unrelated to this copy.
	!
	cmp	%o2, VIS_COPY_THRESHOLD+(64*4)
	bl	%ncc, .small_copyin
	nop
	!
	! We're going off and do a block copy.
	! Change to the heavy duty fault handler and grab a window first.
	! New handler is passed in
	!
	stn	%o4, [THREAD_REG + T_LOFAULT]
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	!
	! %o3 is now %i3
	!
	st	%i3, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]

	! save in-use fpregs on stack
	wr	%g0, FPRS_FEF, %fprs
	membar	#Sync
	add	%fp, STACK_BIAS - 257, %o2
	and	%o2, -64, %o2		! round down to a 64-byte boundary
	stda	%d0, [%o2]ASI_BLK_P
	add	%o2, 64, %o2
	stda	%d16, [%o2]ASI_BLK_P
	add	%o2, 64, %o2
	stda	%d32, [%o2]ASI_BLK_P
	add	%o2, 64, %o2
	stda	%d48, [%o2]ASI_BLK_P
	membar	#Sync

.do_blockcopyin:
	membar	#StoreStore|#StoreLoad|#LoadStore

	rd	%gsr, %o2
	st	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr

	! Set the lower bit in the saved t_lofault to indicate
	! that we need to clear the %fprs register on the way
	! out
	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

!!! This code is nearly identical to the version in the sun4u
!!! libc_psr.  Most bugfixes made to that file should be
!!! merged into this routine.

	! From here on %i0 = destination, %i1 = source (read through
	! ASI_USER), %i2 = remaining byte count.
	andcc	%i0, 7, %o3
	bz	copyin_blkcpy
	sub	%o3, 8, %o3
	neg	%o3			! %o3 = bytes to the next 8-byte boundary
	sub	%i2, %o3, %i2

	! Align Destination on double-word boundary

2:	lduba	[%i1]ASI_USER, %o4
	inc	%i1
	inc	%i0
	deccc	%o3
	bgu	%ncc, 2b
	stb	%o4, [%i0-1]
copyin_blkcpy:
	andcc	%i0, 63, %i3
	bz,pn	%ncc, copyin_blalign	! now block aligned
	sub	%i3, 64, %i3
	neg	%i3			! bytes till block aligned
	sub	%i2, %i3, %i2		!
! copyin block-copy setup: finish aligning the destination to 64 bytes,
! split the remaining length, prime the FP registers and pick the
! copyin_segN loop that matches the source offset.
! %i0 = destination, %i1 = source (read through ASI_USER / ASI_BLK_AIUS).
	! update %i2 with new count

	! Copy %i3 bytes till dst is block (64 byte) aligned. use
	! double word copies.

	alignaddr %i1, %g0, %g1
	ldda	[%g1]ASI_USER, %d0
	add	%g1, 8, %g1
	! Two doublewords per pass; %d0 and %d2 alternate as the
	! "previous" and "next" inputs to faligndata.
6:
	ldda	[%g1]ASI_USER, %d2
	add	%g1, 8, %g1
	subcc	%i3, 8, %i3
	faligndata %d0, %d2, %d8
	std	%d8, [%i0]
	add	%i1, 8, %i1
	bz,pn	%ncc, copyin_blalign
	add	%i0, 8, %i0
	ldda	[%g1]ASI_USER, %d0
	add	%g1, 8, %g1
	subcc	%i3, 8, %i3
	faligndata %d2, %d0, %d8
	std	%d8, [%i0]
	add	%i1, 8, %i1
	bgu,pn	%ncc, 6b
	add	%i0, 8, %i0

copyin_blalign:
	membar	#StoreLoad
	! %i2 = total length
	! %i3 = blocks	(length - 64) / 64
	! %i4 = doubles remaining  (length - blocks)
	sub	%i2, 64, %i3
	andn	%i3, 63, %i3
	sub	%i2, %i3, %i4
	andn	%i4, 7, %i4
	sub	%i4, 16, %i4
	sub	%i2, %i4, %i2
	sub	%i2, %i3, %i2		! %i2 = bytes left for the final byte loop

	andn	%i1, 0x3f, %l7		! blk aligned address
	alignaddr %i1, %g0, %g0		! gen %gsr

	srl	%i1, 3, %l5		! bits 3,4,5 are now least sig in %l5
	andcc	%l5, 7, %i5		! keep only the low three bits (0-2)

	! Advance %i1 past everything the block and doubleword stages
	! will consume.
	add	%i1, %i4, %i1
	add	%i1, %i3, %i1

	! Prime the pipeline with three 64-byte source blocks.
	ldda	[%l7]ASI_BLK_AIUS, %d0
	add	%l7, 64, %l7
	ldda	[%l7]ASI_BLK_AIUS, %d16
	add	%l7, 64, %l7
	ldda	[%l7]ASI_BLK_AIUS, %d32
	add	%l7, 64, %l7
	sub	%i3, 128, %i3

	! switch statement to get us to the right 8 byte blk within a
	! 64 byte block
	!
	! %i5 is 0..7.  The annulled "cmp %i5, 6" only executes when the
	! branch to copyin_hlf is taken, and supplies the condition codes
	! tested there.

	cmp	 %i5, 4
	bgeu,a	 copyin_hlf
	cmp	 %i5, 6
	cmp	 %i5, 2
	bgeu,a	 copyin_sqtr
	nop
	cmp	 %i5, 1
	be,a	 copyin_seg1
	nop
	ba,pt	 %ncc, copyin_seg0
	nop
copyin_sqtr:				! %i5 is 2 or 3
	be,a	 copyin_seg2
	nop
	ba,pt	 %ncc, copyin_seg3
	nop

copyin_hlf:				! %i5 is 4..7, cc from cmp %i5, 6
	bgeu,a	 copyin_fqtr
	nop
	cmp	 %i5, 5
	be,a	 copyin_seg5
	nop
	ba,pt	 %ncc, copyin_seg4
	nop
copyin_fqtr:				! %i5 is 6 or 7
	be,a	 copyin_seg6
	nop
	ba,pt	 %ncc, copyin_seg7
	nop

	! copyin_segN: three-stage rotation over %d0/%d16/%d32.  Each stage
	! runs an FALIGN_D* macro (defined outside this span; presumably it
	! builds the aligned output block in %d48 - confirm), reloads the
	! block just consumed, and block-stores %d48 to the destination.
copyin_seg0:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D0
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D16
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	!
! copyin block-copy loops: remainder of copyin_seg0, copyin_seg1 through
! copyin_seg6, and the first chunk of copyin_seg7.
!
! Every copyin_segN has the same shape and differs only in the FALIGN_D*
! macros used (defined outside this span) and in the copyin_blkd* label it
! exits to.  Per stage: align into %d48, reload one 64-byte source block
! through ASI_BLK_AIUS, block-store %d48 to the destination, and count
! %i3 down by 64.  When %i3 reaches zero the stage branches to local
! label 0, 1 or 2, which drains the two blocks still held in registers
! and jumps into the copyin_blkd* ladder.
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D32
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg0

0:
	FALIGN_D16
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D32
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd0
	add	%i0, 64, %i0

1:
	FALIGN_D32
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D0
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd16
	add	%i0, 64, %i0

2:
	FALIGN_D0
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D16
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd32
	add	%i0, 64, %i0

copyin_seg1:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D2
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D18
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D34
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg1
0:
	FALIGN_D18
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D34
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd2
	add	%i0, 64, %i0

1:
	FALIGN_D34
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D2
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd18
	add	%i0, 64, %i0

2:
	FALIGN_D2
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D18
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd34
	add	%i0, 64, %i0

copyin_seg2:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D4
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D20
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D36
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg2

0:
	FALIGN_D20
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D36
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd4
	add	%i0, 64, %i0

1:
	FALIGN_D36
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D4
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd20
	add	%i0, 64, %i0

2:
	FALIGN_D4
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D20
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd36
	add	%i0, 64, %i0

copyin_seg3:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D6
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D22
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D38
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg3

0:
	FALIGN_D22
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D38
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd6
	add	%i0, 64, %i0

1:
	FALIGN_D38
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D6
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd22
	add	%i0, 64, %i0

2:
	FALIGN_D6
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D22
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd38
	add	%i0, 64, %i0

copyin_seg4:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D8
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D24
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D40
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg4

0:
	FALIGN_D24
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D40
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd8
	add	%i0, 64, %i0

1:
	FALIGN_D40
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D8
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd24
	add	%i0, 64, %i0

2:
	FALIGN_D8
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D24
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd40
	add	%i0, 64, %i0

copyin_seg5:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D10
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D26
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D42
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg5

0:
	FALIGN_D26
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D42
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd10
	add	%i0, 64, %i0

1:
	FALIGN_D42
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D10
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd26
	add	%i0, 64, %i0

2:
	FALIGN_D10
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D26
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd42
	add	%i0, 64, %i0

copyin_seg6:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D12
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D28
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D44
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg6

0:
	FALIGN_D28
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D44
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd12
	add	%i0, 64, %i0

1:
	FALIGN_D44
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D12
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd28
	add	%i0, 64, %i0

2:
	FALIGN_D12
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D28
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd44
	add	%i0, 64, %i0

copyin_seg7:
	! 1st chunk - %d0 low, %d16 high, %d32 pre, %d48 dst
	FALIGN_D14
	ldda	[%l7]ASI_BLK_AIUS, %d0
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 0f
	add	%i0, 64, %i0
	!
! Remainder of copyin_seg7, the partial-block copyin_blkd* ladder, the
! common copyin exit and error paths, and the xcopyin / xcopyin_little
! entry points.
	! 2nd chunk - %d0 pre, %d16 low, %d32 high, %d48 dst
	FALIGN_D30
	ldda	[%l7]ASI_BLK_AIUS, %d16
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 1f
	add	%i0, 64, %i0
	! 3rd chunk -  %d0 high, %d16 pre, %d32 low, %d48 dst
	FALIGN_D46
	ldda	[%l7]ASI_BLK_AIUS, %d32
	stda	%d48, [%i0]ASI_BLK_P
	add	%l7, 64, %l7
	subcc	%i3, 64, %i3
	bz,pn	%ncc, 2f
	add	%i0, 64, %i0
	ba,a,pt	%ncc, copyin_seg7

0:
	FALIGN_D30
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D46
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd14
	add	%i0, 64, %i0

1:
	FALIGN_D46
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D14
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd30
	add	%i0, 64, %i0

2:
	FALIGN_D14
	stda	%d48, [%i0]ASI_BLK_P
	add	%i0, 64, %i0
	membar	#Sync
	FALIGN_D30
	stda	%d48, [%i0]ASI_BLK_P
	ba,pt	%ncc, copyin_blkd46
	add	%i0, 64, %i0

	!
	! dribble out the last partial block
	!
	! Fall-through ladder: each rung counts %i4 down by 8, exits to
	! copyin_blkdone once it underflows, and otherwise stores one more
	! aligned doubleword from data already held in the FP registers.
	! The last rung of each 64-byte group (blkd14/30/46) moves its
	! doubleword into %d0 and continues at copyin_blkleft.
	!
copyin_blkd0:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d0, %d2, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd2:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d2, %d4, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd4:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d4, %d6, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd6:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d6, %d8, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd8:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d8, %d10, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd10:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d10, %d12, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd12:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d12, %d14, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd14:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	fsrc1	%d14, %d0
	ba,a,pt	%ncc, copyin_blkleft

copyin_blkd16:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d16, %d18, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd18:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d18, %d20, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd20:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d20, %d22, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd22:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d22, %d24, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd24:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d24, %d26, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd26:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d26, %d28, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd28:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d28, %d30, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd30:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	fsrc1	%d30, %d0
	ba,a,pt	%ncc, copyin_blkleft
copyin_blkd32:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d32, %d34, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd34:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d34, %d36, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd36:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d36, %d38, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd38:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d38, %d40, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd40:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d40, %d42, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd42:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d42, %d44, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd44:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	faligndata %d44, %d46, %d48
	std	%d48, [%i0]
	add	%i0, 8, %i0
copyin_blkd46:
	subcc	%i4, 8, %i4
	blu,pn	%ncc, copyin_blkdone
	fsrc1	%d46, %d0

	! Remaining doublewords are fetched one at a time through ASI_USER;
	! %d0 and %d2 alternate as the faligndata inputs.
copyin_blkleft:
1:
	ldda	[%l7]ASI_USER, %d2
	add	%l7, 8, %l7
	subcc	%i4, 8, %i4
	faligndata %d0, %d2, %d8
	std	%d8, [%i0]
	blu,pn	%ncc, copyin_blkdone
	add	%i0, 8, %i0
	ldda	[%l7]ASI_USER, %d0
	add	%l7, 8, %l7
	subcc	%i4, 8, %i4
	faligndata %d2, %d0, %d8
	std	%d8, [%i0]
	bgeu,pt	%ncc, 1b
	add	%i0, 8, %i0

	! Copy the trailing %i2 bytes (if any) one at a time.
copyin_blkdone:
	tst	%i2
	bz,pt	%ncc, .copyin_exit
	and	%l3, 0x4, %l3		! fprs.du = fprs.dl = 0

7:	lduba	[%i1]ASI_USER, %i4
	inc	%i1
	inc	%i0
	deccc	%i2
	bgu	%ncc, 7b
	stb	%i4, [%i0 - 1]

	! Common exit for the block-copy path.  If FPUSED_FLAG is set in
	! SAVED_LOFAULT, restore %gsr, then either reload the FP registers
	! saved on the stack (saved %fprs had FEF set) or zero them, and
	! restore %fprs.  Finally restore t_lofault and return 0.
.copyin_exit:
	membar	#StoreLoad|#StoreStore
	btst	FPUSED_FLAG, SAVED_LOFAULT
	bz	%icc, 1f
	nop

	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz	%icc, 4f
	nop

	! restore fpregs from stack
	membar	#Sync
	add	%fp, STACK_BIAS - 257, %o2
	and	%o2, -64, %o2		! round down to a 64-byte boundary
	ldda	[%o2]ASI_BLK_P, %d0
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d16
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d32
	add	%o2, 64, %o2
	ldda	[%o2]ASI_BLK_P, %d48
	membar	#Sync

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO				! zero all of the fpregs
	wr	%o3, 0, %fprs		! restore fprs

1:
	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
	membar	#Sync			! sync error barrier
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g0, 0, %o0		! return (0)

	! Error exit for copyin: if the thread has a t_copyops vector,
	! jump to its CP_COPYIN entry; otherwise return -1.
.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0
	SET_SIZE(copyin)

	! xcopyin: loads .xcopyin_err into REAL_LOFAULT and joins the
	! shared body at .do_copyin.  The "or" completing the address sits
	! in the branch delay slot.
	ENTRY(xcopyin)
	sethi	%hi(.xcopyin_err), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT

	! Error exit for xcopyin: same copyops dispatch as .copyin_err, but
	! without a copyops vector it returns whatever is in %g1
	! (presumably the error code left by the fault path - confirm).
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_XCOPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin)

	! xcopyin_little: %o0 = source (read through ASI_AIUSL),
	! %o1 = destination, %o2 = length.  Installs .little_err as
	! t_lofault, then copies one byte at a time, reading the source
	! from its last byte backwards while writing the destination
	! forwards, so the byte order is reversed.  Returns 0 on completion.
	ENTRY(xcopyin_little)
	sethi	%hi(.little_err), %o4
	ldn	[THREAD_REG + T_LOFAULT], %o5	! %o5 = previous handler
	or	%o4, %lo(.little_err), %o4
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]

	subcc	%g0, %o2, %o3		! %o3 = -length, counts up to zero
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0+%o3]ASI_AIUSL, %o4

	! %o3 goes up by one and %o0 down by two per pass, so %o0+%o3 steps
	! back one source byte while %o1+%o3 steps forward one destination
	! byte.  inccc sets carry when %o3 wraps to zero, ending the loop.
1:	stb	%o4, [%o1+%o3]
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	lduba	[%o0+%o3]ASI_AIUSL, %o4

2:	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

.little_err:
	membar	#Sync			!
! .little_err (continued): fault exit referenced by both xcopyout_little
! and xcopyin_little.  Restores the t_lofault value kept in %o5 and returns
! whatever is in %g1 (presumably the error code from the fault path -
! confirm).
	! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin_little)

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

	! copyin_noerr: the shared copyin body with .copyio_noerr in
	! REAL_LOFAULT; .copyio_noerr simply jumps to the handler address
	! held in SAVED_LOFAULT.
	ENTRY(copyin_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
	jmp	SAVED_LOFAULT
	nop
	SET_SIZE(copyin_noerr)

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

	! copyout_noerr: the shared copyout body (.do_copyout, defined
	! outside this span) with .copyio_noerr in REAL_LOFAULT.
	ENTRY(copyout_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyout
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
	SET_SIZE(copyout_noerr)

	! Tunables, one 32-bit word each.  The copyin dispatch code treats
	! a zero hw_copy_limit_N as "HW-assisted copy off" for that
	! alignment class; all four limits are defined as 0 here.
	.align	4
	DGDEF(use_hw_bcopy)
	.word	1
	DGDEF(use_hw_copyio)
	.word	1
	DGDEF(use_hw_bzero)
	.word	1
	DGDEF(hw_copy_limit_1)
	.word	0
	DGDEF(hw_copy_limit_2)
	.word	0
	DGDEF(hw_copy_limit_4)
	.word	0
	DGDEF(hw_copy_limit_8)
	.word	0

	.align	64
	.section ".text"

/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes in length using spitfire's block stores.  If
 * the criteria for using this routine are not met then it calls bzero
 * and returns 1.  Otherwise 0 is returned indicating success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
	! %i0 - start address
	! %i1 - length of region (multiple of 64)
	! %l0 - saved fprs
	! %l1 - pointer to saved %d0 block
	! %l2 - saved curthread->t_lwp
	! NOTE(review): %l2 is listed above but is not referenced anywhere
	! in this routine.

	ENTRY(hwblkclr)
	! get another window w/space for one aligned block of saved fpregs
	save	%sp, -SA(MINFRAME + 2*64), %sp

	! Must be block-aligned
	andcc	%i0, (64-1), %g0
	bnz,pn	%ncc, 1f
	nop

	! ... and must be 256 bytes or more
	cmp	%i1, 256
	blu,pn	%ncc, 1f
	nop

	! ... and length must be a multiple of 64
	andcc	%i1, (64-1), %g0
	bz,pn	%ncc, 2f
	nop

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

2:	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz	1f
	nop

	! save in-use fpregs on stack
	! (only the %d0 block is saved; only %d0-%d14 are written below)
	membar	#Sync
	add	%fp, STACK_BIAS - 65, %l1
	and	%l1, -64, %l1		! round down to a 64-byte boundary
	stda	%d0, [%l1]ASI_BLK_P

1:	membar	#StoreStore|#StoreLoad|#LoadStore
	wr	%g0, FPRS_FEF, %fprs
	wr	%g0, ASI_BLK_P, %asi

	! Clear block
	fzero	%d0
	fzero	%d2
	fzero	%d4
	fzero	%d6
	fzero	%d8
	fzero	%d10
	fzero	%d12
	fzero	%d14

	mov	256, %i3		! bytes cleared per full pass
	ba	.pz_doblock
	nop

.pz_blkstart:
      ! stda	%d0, [%i0+192]%asi  ! in dly slot of branch that got us here
	stda	%d0, [%i0+128]%asi
	stda	%d0, [%i0+64]%asi
	stda	%d0, [%i0]%asi
.pz_zinst:
	add	%i0, %i3, %i0
	sub	%i1, %i3, %i1
.pz_doblock:
	cmp	%i1, 256
	bgeu,a	%ncc, .pz_blkstart
	stda	%d0, [%i0+192]%asi

	cmp	%i1, 64
	blu	%ncc, .pz_finish

	! Fewer than 256 bytes left: jump into the tail of the stda run
	! above so that exactly %i3/64 block stores execute.  Each stda is
	! one 4-byte instruction, so the entry point is .pz_zinst minus
	! %i3/16 bytes.
	andn	%i1, (64-1), %i3	! (delay slot of the blu above)
	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
	set	.pz_zinst, %i4
	sub	%i4, %i2, %i4
	jmp	%i4
	nop

.pz_finish:
	membar	#Sync
	btst	FPRS_FEF, %l0
	bz,a	.pz_finished
	wr	%l0, 0, %fprs		! restore fprs

	! restore fpregs from stack
	ldda	[%l1]ASI_BLK_P, %d0
	membar	#Sync
	wr	%l0, 0, %fprs		! restore fprs

.pz_finished:
	ret
	restore	%g0, 0, %o0		! return (bzero or not)
	SET_SIZE(hwblkclr)

	/*
	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
	 * using physical addresses.
	 */
	! PSTATE_IE is cleared for the duration of the copy and the
	! original %pstate (kept in %g1) is written back in the retl delay
	! slot.  All four doublewords are loaded before any is stored.
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate

	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	membar	#Sync
	retl
	wrpr	%g0, %g1, %pstate
	SET_SIZE(hw_pa_bcopy32)