125cf1a30Sjl139090/* 225cf1a30Sjl139090 * CDDL HEADER START 325cf1a30Sjl139090 * 425cf1a30Sjl139090 * The contents of this file are subject to the terms of the 525cf1a30Sjl139090 * Common Development and Distribution License (the "License"). 625cf1a30Sjl139090 * You may not use this file except in compliance with the License. 725cf1a30Sjl139090 * 825cf1a30Sjl139090 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 925cf1a30Sjl139090 * or http://www.opensolaris.org/os/licensing. 1025cf1a30Sjl139090 * See the License for the specific language governing permissions 1125cf1a30Sjl139090 * and limitations under the License. 1225cf1a30Sjl139090 * 1325cf1a30Sjl139090 * When distributing Covered Code, include this CDDL HEADER in each 1425cf1a30Sjl139090 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 1525cf1a30Sjl139090 * If applicable, add the following below this CDDL HEADER, with the 1625cf1a30Sjl139090 * fields enclosed by brackets "[]" replaced with your own identifying 1725cf1a30Sjl139090 * information: Portions Copyright [yyyy] [name of copyright owner] 1825cf1a30Sjl139090 * 1925cf1a30Sjl139090 * CDDL HEADER END 2025cf1a30Sjl139090 */ 2125cf1a30Sjl139090/* 22*e64c6c3fSMichael Bergknoff * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 2325cf1a30Sjl139090 * Use is subject to license terms. 
2425cf1a30Sjl139090 */ 2525cf1a30Sjl139090 2625cf1a30Sjl139090#include <sys/param.h> 2725cf1a30Sjl139090#include <sys/errno.h> 2825cf1a30Sjl139090#include <sys/asm_linkage.h> 2925cf1a30Sjl139090#include <sys/vtrace.h> 3025cf1a30Sjl139090#include <sys/machthread.h> 3125cf1a30Sjl139090#include <sys/clock.h> 3225cf1a30Sjl139090#include <sys/asi.h> 3325cf1a30Sjl139090#include <sys/fsr.h> 3425cf1a30Sjl139090#include <sys/privregs.h> 3525cf1a30Sjl139090 3625cf1a30Sjl139090#if !defined(lint) 3725cf1a30Sjl139090#include "assym.h" 3825cf1a30Sjl139090#endif /* lint */ 3925cf1a30Sjl139090 4025cf1a30Sjl139090/* 4125cf1a30Sjl139090 * Pseudo-code to aid in understanding the control flow of the 4225cf1a30Sjl139090 * bcopy/copyin/copyout routines. 4325cf1a30Sjl139090 * 4425cf1a30Sjl139090 * On entry: 4525cf1a30Sjl139090 * 4625cf1a30Sjl139090 * ! Determine whether to use the FP register version 4725cf1a30Sjl139090 * ! or the leaf routine version depending on size 4825cf1a30Sjl139090 * ! of copy and flags. Set up error handling accordingly. 4925cf1a30Sjl139090 * ! The transition point depends on whether the src and 5025cf1a30Sjl139090 * ! dst addresses can be aligned to long word, word, 5125cf1a30Sjl139090 * ! half word, or byte boundaries. 5225cf1a30Sjl139090 * ! 5325cf1a30Sjl139090 * ! WARNING: <Register usage convention> 5425cf1a30Sjl139090 * ! For FP version, %l6 holds previous error handling and 5525cf1a30Sjl139090 * ! a flag: TRAMP_FLAG (low bits) 5625cf1a30Sjl139090 * ! for leaf routine version, %o4 holds those values. 5725cf1a30Sjl139090 * ! So either %l6 or %o4 is reserved and not available for 5825cf1a30Sjl139090 * ! any other use. 5925cf1a30Sjl139090 * 6025cf1a30Sjl139090 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test 6125cf1a30Sjl139090 * go to small_copy; ! to speed short copies 6225cf1a30Sjl139090 * 6325cf1a30Sjl139090 * if (src,dst long word alignable) { 6425cf1a30Sjl139090 * if (hw_copy_limit_8 == 0) ! 
hw_copy disabled 6525cf1a30Sjl139090 * go to small_copy; 6625cf1a30Sjl139090 * if (length <= hw_copy_limit_8) 6725cf1a30Sjl139090 * go to small_copy; 6825cf1a30Sjl139090 * go to FPBLK_copy; 6925cf1a30Sjl139090 * } 7025cf1a30Sjl139090 * if (src,dst not alignable) { 7125cf1a30Sjl139090 * if (hw_copy_limit_1 == 0) ! hw_copy disabled 7225cf1a30Sjl139090 * go to small_copy; 7325cf1a30Sjl139090 * if (length <= hw_copy_limit_1) 7425cf1a30Sjl139090 * go to small_copy; 7525cf1a30Sjl139090 * go to FPBLK_copy; 7625cf1a30Sjl139090 * } 7725cf1a30Sjl139090 * if (src,dst halfword alignable) { 7825cf1a30Sjl139090 * if (hw_copy_limit_2 == 0) ! hw_copy disabled 7925cf1a30Sjl139090 * go to small_copy; 8025cf1a30Sjl139090 * if (length <= hw_copy_limit_2) 8125cf1a30Sjl139090 * go to small_copy; 8225cf1a30Sjl139090 * go to FPBLK_copy; 8325cf1a30Sjl139090 * } 8425cf1a30Sjl139090 * if (src,dst word alignable) { 8525cf1a30Sjl139090 * if (hw_copy_limit_4 == 0) ! hw_copy disabled 8625cf1a30Sjl139090 * go to small_copy; 8725cf1a30Sjl139090 * if (length <= hw_copy_limit_4) 8825cf1a30Sjl139090 * go to small_copy; 8925cf1a30Sjl139090 * go to FPBLK_copy; 9025cf1a30Sjl139090 * } 9125cf1a30Sjl139090 * 9225cf1a30Sjl139090 * small_copy: 9325cf1a30Sjl139090 * Setup_leaf_rtn_error_handler; ! diffs for each entry point 9425cf1a30Sjl139090 * 9525cf1a30Sjl139090 * if (count <= 3) ! fast path for tiny copies 9625cf1a30Sjl139090 * go to sm_left; ! special finish up code 9725cf1a30Sjl139090 * else 9825cf1a30Sjl139090 * if (count > CHKSIZE) ! medium sized copies 9925cf1a30Sjl139090 * go to sm_med ! tuned by alignment 10025cf1a30Sjl139090 * if(src&dst not both word aligned) { 10125cf1a30Sjl139090 * sm_movebytes: 10225cf1a30Sjl139090 * move byte by byte in 4-way unrolled loop 10325cf1a30Sjl139090 * fall into sm_left; 10425cf1a30Sjl139090 * sm_left: 10525cf1a30Sjl139090 * move 0-3 bytes byte at a time as needed. 10625cf1a30Sjl139090 * restore error handler and exit. 
10725cf1a30Sjl139090 * 10825cf1a30Sjl139090 * } else { ! src&dst are word aligned 10925cf1a30Sjl139090 * check for at least 8 bytes left, 11025cf1a30Sjl139090 * move word at a time, unrolled by 2 11125cf1a30Sjl139090 * when fewer than 8 bytes left, 11225cf1a30Sjl139090 * sm_half: move half word at a time while 2 or more bytes left 11325cf1a30Sjl139090 * sm_byte: move final byte if necessary 11425cf1a30Sjl139090 * sm_exit: 11525cf1a30Sjl139090 * restore error handler and exit. 11625cf1a30Sjl139090 * } 11725cf1a30Sjl139090 * 11825cf1a30Sjl139090 * ! Medium length cases with at least CHKSIZE bytes available 11925cf1a30Sjl139090 * ! method: line up src and dst as best possible, then 12025cf1a30Sjl139090 * ! move data in 4-way unrolled loops. 12125cf1a30Sjl139090 * 12225cf1a30Sjl139090 * sm_med: 12325cf1a30Sjl139090 * if(src&dst unalignable) 12425cf1a30Sjl139090 * go to sm_movebytes 12525cf1a30Sjl139090 * if(src&dst halfword alignable) 12625cf1a30Sjl139090 * go to sm_movehalf 12725cf1a30Sjl139090 * if(src&dst word alignable) 12825cf1a30Sjl139090 * go to sm_moveword 12925cf1a30Sjl139090 * ! 
fall into long word movement 13025cf1a30Sjl139090 * move bytes until src is word aligned 13125cf1a30Sjl139090 * if not long word aligned, move a word 13225cf1a30Sjl139090 * move long words in 4-way unrolled loop until < 32 bytes left 13325cf1a30Sjl139090 * move long words in 1-way unrolled loop until < 8 bytes left 13425cf1a30Sjl139090 * if zero bytes left, goto sm_exit 13525cf1a30Sjl139090 * if one byte left, go to sm_byte 13625cf1a30Sjl139090 * else go to sm_half 13725cf1a30Sjl139090 * 13825cf1a30Sjl139090 * sm_moveword: 13925cf1a30Sjl139090 * move bytes until src is word aligned 14025cf1a30Sjl139090 * move words in 4-way unrolled loop until < 16 bytes left 14125cf1a30Sjl139090 * move words in 1-way unrolled loop until < 4 bytes left 14225cf1a30Sjl139090 * if zero bytes left, goto sm_exit 14325cf1a30Sjl139090 * if one byte left, go to sm_byte 14425cf1a30Sjl139090 * else go to sm_half 14525cf1a30Sjl139090 * 14625cf1a30Sjl139090 * sm_movehalf: 14725cf1a30Sjl139090 * move a byte if needed to align src on halfword 14825cf1a30Sjl139090 * move halfwords in 4-way unrolled loop until < 8 bytes left 14925cf1a30Sjl139090 * if zero bytes left, goto sm_exit 15025cf1a30Sjl139090 * if one byte left, go to sm_byte 15125cf1a30Sjl139090 * else go to sm_half 15225cf1a30Sjl139090 * 15325cf1a30Sjl139090 * 15425cf1a30Sjl139090 * FPBLK_copy: 15525cf1a30Sjl139090 * %l6 = curthread->t_lofault; 15625cf1a30Sjl139090 * if (%l6 != NULL) { 15725cf1a30Sjl139090 * membar #Sync 15825cf1a30Sjl139090 * curthread->t_lofault = .copyerr; 15925cf1a30Sjl139090 * caller_error_handler = TRUE ! %l6 |= 2 16025cf1a30Sjl139090 * } 16125cf1a30Sjl139090 * 16225cf1a30Sjl139090 * ! for FPU testing we must not migrate cpus 16325cf1a30Sjl139090 * if (curthread->t_lwp == NULL) { 16425cf1a30Sjl139090 * ! Kernel threads do not have pcb's in which to store 16525cf1a30Sjl139090 * ! the floating point state, so disallow preemption during 16625cf1a30Sjl139090 * ! the copy. This also prevents cpu migration. 
16725cf1a30Sjl139090 * kpreempt_disable(curthread); 16825cf1a30Sjl139090 * } else { 16925cf1a30Sjl139090 * thread_nomigrate(); 17025cf1a30Sjl139090 * } 17125cf1a30Sjl139090 * 17225cf1a30Sjl139090 * old_fprs = %fprs; 17325cf1a30Sjl139090 * old_gsr = %gsr; 17425cf1a30Sjl139090 * if (%fprs.fef) { 17525cf1a30Sjl139090 * %fprs.fef = 1; 17625cf1a30Sjl139090 * save current fpregs on stack using blockstore 17725cf1a30Sjl139090 * } else { 17825cf1a30Sjl139090 * %fprs.fef = 1; 17925cf1a30Sjl139090 * } 18025cf1a30Sjl139090 * 18125cf1a30Sjl139090 * 18225cf1a30Sjl139090 * do_blockcopy_here; 18325cf1a30Sjl139090 * 18425cf1a30Sjl139090 * In lofault handler: 18525cf1a30Sjl139090 * curthread->t_lofault = .copyerr2; 18625cf1a30Sjl139090 * Continue on with the normal exit handler 18725cf1a30Sjl139090 * 18825cf1a30Sjl139090 * On normal exit: 18925cf1a30Sjl139090 * %gsr = old_gsr; 19025cf1a30Sjl139090 * if (old_fprs & FPRS_FEF) 19125cf1a30Sjl139090 * restore fpregs from stack using blockload 19225cf1a30Sjl139090 * else 19325cf1a30Sjl139090 * zero fpregs 19425cf1a30Sjl139090 * %fprs = old_fprs; 19525cf1a30Sjl139090 * membar #Sync 19625cf1a30Sjl139090 * curthread->t_lofault = (%l6 & ~3); 19725cf1a30Sjl139090 * ! following test omitted from copyin/copyout as they 19825cf1a30Sjl139090 * ! will always have a current thread 19925cf1a30Sjl139090 * if (curthread->t_lwp == NULL) 20025cf1a30Sjl139090 * kpreempt_enable(curthread); 20125cf1a30Sjl139090 * else 20225cf1a30Sjl139090 * thread_allowmigrate(); 20325cf1a30Sjl139090 * return (0) 20425cf1a30Sjl139090 * 20525cf1a30Sjl139090 * In second lofault handler (.copyerr2): 20625cf1a30Sjl139090 * We've tried to restore fp state from the stack and failed. To 20725cf1a30Sjl139090 * prevent from returning with a corrupted fp state, we will panic. 
20825cf1a30Sjl139090 */ 20925cf1a30Sjl139090 21025cf1a30Sjl139090/* 21125cf1a30Sjl139090 * Comments about optimization choices 21225cf1a30Sjl139090 * 21325cf1a30Sjl139090 * The initial optimization decision in this code is to determine 21425cf1a30Sjl139090 * whether to use the FP registers for a copy or not. If we don't 21525cf1a30Sjl139090 * use the FP registers, we can execute the copy as a leaf routine, 21625cf1a30Sjl139090 * saving a register save and restore. Also, less elaborate setup 21725cf1a30Sjl139090 * is required, allowing short copies to be completed more quickly. 21825cf1a30Sjl139090 * For longer copies, especially unaligned ones (where the src and 21925cf1a30Sjl139090 * dst do not align to allow simple ldx,stx operation), the FP 22025cf1a30Sjl139090 * registers allow much faster copy operations. 22125cf1a30Sjl139090 * 22225cf1a30Sjl139090 * The estimated extra cost of the FP path will vary depending on 22325cf1a30Sjl139090 * src/dst alignment, dst offset from the next 64 byte FPblock store 22425cf1a30Sjl139090 * boundary, remaining src data after the last full dst cache line is 22525cf1a30Sjl139090 * moved, whether the FP registers need to be saved, and some other 22625cf1a30Sjl139090 * minor issues. The average additional overhead is estimated to be 22725cf1a30Sjl139090 * 400 clocks. Since each non-repeated/predicted tst and branch costs 22825cf1a30Sjl139090 * around 10 clocks, elaborate calculation would slow down all 22925cf1a30Sjl139090 * longer copies and only benefit a small portion of medium sized 23025cf1a30Sjl139090 * copies. Rather than incur such cost, we chose fixed transition 23125cf1a30Sjl139090 * points for each of the alignment choices. 
23225cf1a30Sjl139090 * 23325cf1a30Sjl139090 * For the inner loop, here is a comparison of the per cache line 23425cf1a30Sjl139090 * costs for each alignment when src&dst are in cache: 23525cf1a30Sjl139090 * 23625cf1a30Sjl139090 * byte aligned: 108 clocks slower for non-FPBLK 23725cf1a30Sjl139090 * half aligned: 44 clocks slower for non-FPBLK 23825cf1a30Sjl139090 * word aligned: 12 clocks slower for non-FPBLK 23925cf1a30Sjl139090 * long aligned: 4 clocks >>faster<< for non-FPBLK 24025cf1a30Sjl139090 * 24125cf1a30Sjl139090 * The long aligned loop runs faster because it does no prefetching. 24225cf1a30Sjl139090 * That wins if the data is not in cache or there is too little 24325cf1a30Sjl139090 * data to gain much benefit from prefetching. But when there 24425cf1a30Sjl139090 * is more data and that data is not in cache, failing to prefetch 24525cf1a30Sjl139090 * can run much slower. In addition, there is a 2 Kbyte store queue 24625cf1a30Sjl139090 * which will cause the non-FPBLK inner loop to slow for larger copies. 24725cf1a30Sjl139090 * The exact tradeoff is strongly load and application dependent, with 24825cf1a30Sjl139090 * increasing risk of a customer visible performance regression if the 24925cf1a30Sjl139090 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache 25025cf1a30Sjl139090 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe 25125cf1a30Sjl139090 * upper limit for the non-FPBLK code. To minimize performance regression 25225cf1a30Sjl139090 * risk while still gaining the primary benefits of the improvements to 25325cf1a30Sjl139090 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various 25425cf1a30Sjl139090 * hw_copy_limit_*. Later experimental studies using different values 25525cf1a30Sjl139090 * of hw_copy_limit_* can be used to make further adjustments if 25625cf1a30Sjl139090 * appropriate. 
25725cf1a30Sjl139090 * 25825cf1a30Sjl139090 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned 25925cf1a30Sjl139090 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned 26025cf1a30Sjl139090 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned 26125cf1a30Sjl139090 * hw_copy_limit_8 = src and dst are longword aligned 26225cf1a30Sjl139090 * 26325cf1a30Sjl139090 * To say that src and dst are word aligned means that after 26425cf1a30Sjl139090 * some initial alignment activity of moving 0 to 3 bytes, 26525cf1a30Sjl139090 * both the src and dst will be on word boundaries so that 26625cf1a30Sjl139090 * word loads and stores may be used. 26725cf1a30Sjl139090 * 26825cf1a30Sjl139090 * Default values at May,2005 are: 26925cf1a30Sjl139090 * hw_copy_limit_1 = 256 27025cf1a30Sjl139090 * hw_copy_limit_2 = 512 27125cf1a30Sjl139090 * hw_copy_limit_4 = 1024 27225cf1a30Sjl139090 * hw_copy_limit_8 = 1024 (or 1536 on some systems) 27325cf1a30Sjl139090 * 27425cf1a30Sjl139090 * 27525cf1a30Sjl139090 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is 27625cf1a30Sjl139090 * disabled for that alignment choice. 27725cf1a30Sjl139090 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256) 27825cf1a30Sjl139090 * the value of VIS_COPY_THRESHOLD is used. 27925cf1a30Sjl139090 * It is not envisioned that hw_copy_limit_? will be changed in the field 28025cf1a30Sjl139090 * It is provided to allow for disabling FPBLK copies and to allow 28125cf1a30Sjl139090 * easy testing of alternate values on future HW implementations 28225cf1a30Sjl139090 * that might have different cache sizes, clock rates or instruction 28325cf1a30Sjl139090 * timing rules. 28425cf1a30Sjl139090 * 28525cf1a30Sjl139090 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum 28625cf1a30Sjl139090 * threshold to speedup all shorter copies (less than 256). 
That 28725cf1a30Sjl139090 * saves an alignment test, memory reference, and enabling test 28825cf1a30Sjl139090 * for all short copies, or an estimated 24 clocks. 28925cf1a30Sjl139090 * 29025cf1a30Sjl139090 * The order in which these limits are checked does matter since each 29125cf1a30Sjl139090 * non-predicted tst and branch costs around 10 clocks. 29225cf1a30Sjl139090 * If src and dst are randomly selected addresses, 29325cf1a30Sjl139090 * 4 of 8 will not be alignable. 29425cf1a30Sjl139090 * 2 of 8 will be half word alignable. 29525cf1a30Sjl139090 * 1 of 8 will be word alignable. 29625cf1a30Sjl139090 * 1 of 8 will be long word alignable. 29725cf1a30Sjl139090 * But, tests on running kernels show that src and dst to copy code 29825cf1a30Sjl139090 * are typically not on random alignments. Structure copies and 29925cf1a30Sjl139090 * copies of larger data sizes are often on long word boundaries. 30025cf1a30Sjl139090 * So we test the long word alignment case first, then 30125cf1a30Sjl139090 * the byte alignment, then halfword, then word alignment. 30225cf1a30Sjl139090 * 30325cf1a30Sjl139090 * Several times, tests for length are made to split the code 30425cf1a30Sjl139090 * into subcases. These tests often allow later tests to be 30525cf1a30Sjl139090 * avoided. For example, within the non-FPBLK copy, we first 30625cf1a30Sjl139090 * check for tiny copies of 3 bytes or less. That allows us 30725cf1a30Sjl139090 * to use a 4-way unrolled loop for the general byte copy case 30825cf1a30Sjl139090 * without a test on loop entry. 30925cf1a30Sjl139090 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less 31025cf1a30Sjl139090 * vs longer cases. For the really short case, we don't attempt 31125cf1a30Sjl139090 * to align src and dst. We try to minimize special case tests in 31225cf1a30Sjl139090 * the shortest loops as each test adds a significant percentage 31325cf1a30Sjl139090 * to the total time. 
31425cf1a30Sjl139090 * 31525cf1a30Sjl139090 * For the medium sized cases, we allow ourselves to adjust the 31625cf1a30Sjl139090 * src and dst alignment and provide special cases for each of 31725cf1a30Sjl139090 * the four adjusted alignment cases. The CHKSIZE that was used 31825cf1a30Sjl139090 * to decide between short and medium size was chosen to be 39 31925cf1a30Sjl139090 * as that allows for the worst case of 7 bytes of alignment 32025cf1a30Sjl139090 * shift and 4 times 8 bytes for the first long word unrolling. 32125cf1a30Sjl139090 * That knowledge saves an initial test for length on entry into 32225cf1a30Sjl139090 * the medium cases. If the general loop unrolling factor were 32325cf1a30Sjl139090 * to be increased, this number would also need to be adjusted. 32425cf1a30Sjl139090 * 32525cf1a30Sjl139090 * For all cases in the non-FPBLK code where it is known that at 32625cf1a30Sjl139090 * least 4 chunks of data are available for movement, the 32725cf1a30Sjl139090 * loop is unrolled by four. This 4-way loop runs in 8 clocks 32825cf1a30Sjl139090 * or 2 clocks per data element. 32925cf1a30Sjl139090 * 33025cf1a30Sjl139090 * Instruction alignment is forced by use of .align 16 directives 33125cf1a30Sjl139090 * and nops which are not executed in the code. This 33225cf1a30Sjl139090 * combination of operations shifts the alignment of following 33325cf1a30Sjl139090 * loops to ensure that loops are aligned so that their instructions 33425cf1a30Sjl139090 * fall within the minimum number of 4 instruction fetch groups. 33525cf1a30Sjl139090 * If instructions are inserted or removed between the .align 33625cf1a30Sjl139090 * instruction and the unrolled loops, then the alignment needs 33725cf1a30Sjl139090 * to be readjusted. Misaligned loops can add a clock per loop 33825cf1a30Sjl139090 * iteration to the loop timing. 33925cf1a30Sjl139090 * 34025cf1a30Sjl139090 * In a few cases, code is duplicated to avoid a branch. 
Since 34125cf1a30Sjl139090 * a non-predicted tst and branch takes 10 clocks, this savings 34225cf1a30Sjl139090 * is judged an appropriate time-space tradeoff. 34325cf1a30Sjl139090 * 34425cf1a30Sjl139090 * Within the FPBLK-code, the prefetch method in the inner 34525cf1a30Sjl139090 * loop needs to be explained as it is not standard. Two 34625cf1a30Sjl139090 * prefetches are issued for each cache line instead of one. 34725cf1a30Sjl139090 * The primary one is at the maximum reach of 8 cache lines. 34825cf1a30Sjl139090 * Most of the time, that maximum prefetch reach gives the 34925cf1a30Sjl139090 * cache line more time to reach the processor for systems with 35025cf1a30Sjl139090 * higher processor clocks. But, sometimes memory interference 35125cf1a30Sjl139090 * can cause that prefetch to be dropped. Putting a second 35225cf1a30Sjl139090 * prefetch at a reach of 5 cache lines catches the drops 35325cf1a30Sjl139090 * three iterations later and shows a measured improvement 35425cf1a30Sjl139090 * in performance over any similar loop with a single prefetch. 35525cf1a30Sjl139090 * The prefetches are placed in the loop so they overlap with 35625cf1a30Sjl139090 * non-memory instructions, so that there is no extra cost 35725cf1a30Sjl139090 * when the data is already in-cache. 35825cf1a30Sjl139090 * 35925cf1a30Sjl139090 */ 36025cf1a30Sjl139090 36125cf1a30Sjl139090/* 36225cf1a30Sjl139090 * Notes on preserving existing fp state and on membars. 36325cf1a30Sjl139090 * 36425cf1a30Sjl139090 * When a copyOP decides to use fp we may have to preserve existing 36525cf1a30Sjl139090 * floating point state. It is not the caller's state that we need to 36625cf1a30Sjl139090 * preserve - the rest of the kernel does not use fp and, anyway, fp 36725cf1a30Sjl139090 * registers are volatile across a call. 
Some examples: 36825cf1a30Sjl139090 * 36925cf1a30Sjl139090 * - userland has fp state and is interrupted (device interrupt 37025cf1a30Sjl139090 * or trap) and within the interrupt/trap handling we use 37125cf1a30Sjl139090 * bcopy() 37225cf1a30Sjl139090 * - another (higher level) interrupt or trap handler uses bcopy 37325cf1a30Sjl139090 * while a bcopy from an earlier interrupt is still active 37425cf1a30Sjl139090 * - an asynchronous error trap occurs while fp state exists (in 37525cf1a30Sjl139090 * userland or in kernel copy) and the tl0 component of the handling 37625cf1a30Sjl139090 * uses bcopy 37725cf1a30Sjl139090 * - a user process with fp state incurs a copy-on-write fault and 37825cf1a30Sjl139090 * hwblkpagecopy always uses fp 37925cf1a30Sjl139090 * 38025cf1a30Sjl139090 * We therefore need a per-call place in which to preserve fp state - 38125cf1a30Sjl139090 * using our stack is ideal (and since fp copy cannot be leaf optimized 38225cf1a30Sjl139090 * because of calls it makes, this is no hardship). 38325cf1a30Sjl139090 * 38425cf1a30Sjl139090 * When we have finished fp copy (with its repeated block stores) 38525cf1a30Sjl139090 * we must membar #Sync so that our block stores may complete before 38625cf1a30Sjl139090 * we either restore the original fp state into the fp registers or 38725cf1a30Sjl139090 * return to a caller which may initiate other fp operations that could 38825cf1a30Sjl139090 * modify the fp regs we used before the block stores complete. 38925cf1a30Sjl139090 * 39025cf1a30Sjl139090 * Synchronous faults (eg, unresolvable DMMU miss) that occur while 39125cf1a30Sjl139090 * t_lofault is not NULL will not panic but will instead trampoline 39225cf1a30Sjl139090 * to the registered lofault handler. There is no need for any 39325cf1a30Sjl139090 * membars for these - eg, our store to t_lofault will always be visible to 39425cf1a30Sjl139090 * ourselves and it is our cpu which will take any trap. 
39525cf1a30Sjl139090 * 39625cf1a30Sjl139090 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur 39725cf1a30Sjl139090 * while t_lofault is not NULL will also not panic. Since we're copying 39825cf1a30Sjl139090 * to or from userland the extent of the damage is known - the destination 39925cf1a30Sjl139090 * buffer is incomplete. So trap handlers will trampoline to the lofault 40025cf1a30Sjl139090 * handler in this case which should take some form of error action to 40125cf1a30Sjl139090 * avoid using the incomplete buffer. The trap handler also flags the 40225cf1a30Sjl139090 * fault so that later return-from-trap handling (for the trap that brought 40325cf1a30Sjl139090 * this thread into the kernel in the first place) can notify the process 40425cf1a30Sjl139090 * and reboot the system (or restart the service with Greenline/Contracts). 40525cf1a30Sjl139090 * 40625cf1a30Sjl139090 * Asynchronous faults (eg, uncorrectable ECC error from memory) can 40725cf1a30Sjl139090 * result in deferred error traps - the trap is taken sometime after 40825cf1a30Sjl139090 * the event and the trap PC may not be the PC of the faulting access. 40925cf1a30Sjl139090 * Delivery of such pending traps can be forced by a membar #Sync, acting 41025cf1a30Sjl139090 * as an "error barrier" in this role. To accurately apply the user/kernel 41125cf1a30Sjl139090 * separation described in the preceding paragraph we must force delivery 41225cf1a30Sjl139090 * of deferred traps affecting kernel state before we install a lofault 41325cf1a30Sjl139090 * handler (if we interpose a new lofault handler on an existing one there 41425cf1a30Sjl139090 * is no need to repeat this), and we must force delivery of deferred 41525cf1a30Sjl139090 * errors affecting the lofault-protected region before we clear t_lofault. 
41625cf1a30Sjl139090 * Failure to do so results in lost kernel state being interpreted as 41725cf1a30Sjl139090 * affecting a copyin/copyout only, or of an error that really only 41825cf1a30Sjl139090 * affects copy data being interpreted as losing kernel state. 41925cf1a30Sjl139090 * 42025cf1a30Sjl139090 * Since the copy operations may preserve and later restore floating 42125cf1a30Sjl139090 * point state that does not belong to the caller (see examples above), 42225cf1a30Sjl139090 * we must be careful in how we do this in order to prevent corruption 42325cf1a30Sjl139090 * of another program. 42425cf1a30Sjl139090 * 42525cf1a30Sjl139090 * To make sure that floating point state is always saved and restored 42625cf1a30Sjl139090 * correctly, the following "big rules" must be followed when the floating 42725cf1a30Sjl139090 * point registers will be used: 42825cf1a30Sjl139090 * 42925cf1a30Sjl139090 * 1. %l6 always holds the caller's lofault handler. Also in this register, 43025cf1a30Sjl139090 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in 43125cf1a30Sjl139090 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a 43225cf1a30Sjl139090 * lofault handler was set coming in. 43325cf1a30Sjl139090 * 43425cf1a30Sjl139090 * 2. The FPUSED flag indicates that all FP state has been successfully stored 43525cf1a30Sjl139090 * on the stack. It should not be set until this save has been completed. 43625cf1a30Sjl139090 * 43725cf1a30Sjl139090 * 3. The FPUSED flag should not be cleared on exit until all FP state has 43825cf1a30Sjl139090 * been restored from the stack. If an error occurs while restoring 43925cf1a30Sjl139090 * data from the stack, the error handler can check this flag to see if 44025cf1a30Sjl139090 * a restore is necessary. 44125cf1a30Sjl139090 * 44225cf1a30Sjl139090 * 4. Code run under the new lofault handler must be kept to a minimum. 
In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */

/*
 * VIS_COPY_THRESHOLD is the smallest byte count at which the FP/VIS
 * accelerated path is expected to "break even".  The FPBLK code assumes
 * that a minimum number of bytes are available to be moved on entry, so
 * check that code carefully before reducing the value below 256.
 *
 * This definition shadows the one in sys/machsystm.h, which cannot be
 * included here because include files it references lack _ASM guards.
 * Change it here, change it there.
 */
#define	VIS_COPY_THRESHOLD	256

/*
 * Size tests used by the non-FP copy.
 *
 * SHORTCOPY is the test for very short copies.  Be aware that the maximum
 * unroll for the short unaligned case is SHORTCOPY+1.
 * CHKSIZE separates the short case from the medium sized cases.
 */
#define	SHORTCOPY	3
#define	CHKSIZE		39

/*
 * Flag bits carried in the low bits of the register that holds the saved
 * lofault handler (%l6 for the FP version, %o4 for the leaf version).
 *
 * FPUSED_FLAG	the floating point registers are in use.
 * TRAMP_FLAG	we are to trampoline to the error handler.  Entry points
 *		bcopy, copyin_noerr, and copyout_noerr use this flag;
 *		kcopy, copyout, xcopyout, copyin, and xcopyin do not set it.
 * MASK_FLAGS	both flag bits together.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	(FPUSED_FLAG | TRAMP_FLAG)

/*
 * Number of outstanding prefetches.
 *	first prefetch moves data from L2 to L1 (n_reads)
 *	second prefetch moves data from memory to L2 (one_read)
 */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

/* Block size, in bytes, to which the fp save area is aligned. */
#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 *
 * SAVED_FPREGS_ADJUST is the displacement below %fp + STACK_BIAS from
 * which the BST_/BLD_ stack macros round down to a VIS_BLOCKSIZE boundary
 * to locate the two-block save buffer.
 */
#define	HWCOPYFRAMESIZE		((3 * VIS_BLOCKSIZE) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(3 * VIS_BLOCKSIZE)
#define	SAVED_FPREGS_ADJUST	((2 * VIS_BLOCKSIZE) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
53125cf1a30Sjl139090 */ 53225cf1a30Sjl139090#define FZEROQ1Q3 \ 53325cf1a30Sjl139090 fzero %f0 ;\ 53425cf1a30Sjl139090 fmovd %f0, %f2 ;\ 53525cf1a30Sjl139090 fmovd %f0, %f4 ;\ 53625cf1a30Sjl139090 fmovd %f0, %f6 ;\ 53725cf1a30Sjl139090 fmovd %f0, %f8 ;\ 53825cf1a30Sjl139090 fmovd %f0, %f10 ;\ 53925cf1a30Sjl139090 fmovd %f0, %f12 ;\ 54025cf1a30Sjl139090 fmovd %f0, %f14 ;\ 54125cf1a30Sjl139090 fmovd %f0, %f32 ;\ 54225cf1a30Sjl139090 fmovd %f0, %f34 ;\ 54325cf1a30Sjl139090 fmovd %f0, %f36 ;\ 54425cf1a30Sjl139090 fmovd %f0, %f38 ;\ 54525cf1a30Sjl139090 fmovd %f0, %f40 ;\ 54625cf1a30Sjl139090 fmovd %f0, %f42 ;\ 54725cf1a30Sjl139090 fmovd %f0, %f44 ;\ 54825cf1a30Sjl139090 fmovd %f0, %f46 54925cf1a30Sjl139090 55025cf1a30Sjl139090#define FZEROQ2Q4 \ 55125cf1a30Sjl139090 fzero %f16 ;\ 55225cf1a30Sjl139090 fmovd %f0, %f18 ;\ 55325cf1a30Sjl139090 fmovd %f0, %f20 ;\ 55425cf1a30Sjl139090 fmovd %f0, %f22 ;\ 55525cf1a30Sjl139090 fmovd %f0, %f24 ;\ 55625cf1a30Sjl139090 fmovd %f0, %f26 ;\ 55725cf1a30Sjl139090 fmovd %f0, %f28 ;\ 55825cf1a30Sjl139090 fmovd %f0, %f30 ;\ 55925cf1a30Sjl139090 fmovd %f0, %f48 ;\ 56025cf1a30Sjl139090 fmovd %f0, %f50 ;\ 56125cf1a30Sjl139090 fmovd %f0, %f52 ;\ 56225cf1a30Sjl139090 fmovd %f0, %f54 ;\ 56325cf1a30Sjl139090 fmovd %f0, %f56 ;\ 56425cf1a30Sjl139090 fmovd %f0, %f58 ;\ 56525cf1a30Sjl139090 fmovd %f0, %f60 ;\ 56625cf1a30Sjl139090 fmovd %f0, %f62 56725cf1a30Sjl139090 56825cf1a30Sjl139090/* 56925cf1a30Sjl139090 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack. 57025cf1a30Sjl139090 * Used to save and restore in-use fp registers when we want to use FP 57125cf1a30Sjl139090 * and find fp already in use and copy size still large enough to justify 57225cf1a30Sjl139090 * the additional overhead of this save and restore. 
57325cf1a30Sjl139090 * 57425cf1a30Sjl139090 * A membar #Sync is needed before save to sync fp ops initiated before 57525cf1a30Sjl139090 * the call to the copy function (by whoever has fp in use); for example 57625cf1a30Sjl139090 * an earlier block load to the quadrant we are about to save may still be 57725cf1a30Sjl139090 * "in flight". A membar #Sync is required at the end of the save to 57825cf1a30Sjl139090 * sync our block store (the copy code is about to begin ldd's to the 57925cf1a30Sjl139090 * first quadrant). 58025cf1a30Sjl139090 * 58125cf1a30Sjl139090 * Similarly: a membar #Sync before restore allows the block stores of 58225cf1a30Sjl139090 * the copy operation to complete before we fill the quadrants with their 58325cf1a30Sjl139090 * original data, and a membar #Sync after restore lets the block loads 58425cf1a30Sjl139090 * of the restore complete before we return to whoever has the fp regs 58525cf1a30Sjl139090 * in use. To avoid repeated membar #Sync we make it the responsibility 58625cf1a30Sjl139090 * of the copy code to membar #Sync immediately after copy is complete 58725cf1a30Sjl139090 * and before using the BLD_*_FROMSTACK macro. 
 */
/*
 * BST_FPQ1Q3_TOSTACK(tmp1), BST_FPQ2Q4_TOSTACK(tmp1)
 *	Compute (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) rounded down to a
 *	VIS_BLOCKSIZE boundary, block-store the first quadrant of the pair
 *	(%f0 or %f16) there and the second (%f32 or %f48) one block higher,
 *	then issue membar #Sync.
 *
 * BLD_FPQ1Q3_FROMSTACK(tmp1), BLD_FPQ2Q4_FROMSTACK(tmp1)
 *	Recompute the same address and block-load the two quadrants back,
 *	then issue membar #Sync.  No leading membar is issued here; the
 *	in-macro note says the copy code provides it at copy completion.
 *
 * tmp1 is a scratch register; on exit it holds the address of the
 * second block.
 */
#if !defined(lint)
#define	BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define	BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
#endif

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass registers
 * to use - we just use any outputs we want.
 *
 * label1 and label2 are local labels chosen by the caller; the macros
 * reference them as forward labels (label1/**/f, label2/**/f).  Both
 * macros write %o0 and %o1 directly.
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_nomigrate();
 * } else {
 *	kpreempt_disable();
 * }
 *
 * FP_ALLOWMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_allowmigrate();
 * } else {
 *	kpreempt_enable();
 * }
 *
 * In the no-lwp arm the macros open-code the preemption count: t_preempt
 * is loaded as a byte, incremented (NOMIGRATE) or decremented
 * (ALLOWMIGRATE) and stored back.  When the decrement reaches zero,
 * FP_ALLOWMIGRATE also reads cpu_kprunrun from the current CPU and, if it
 * is non-zero, calls kpreempt with the current %pil in %o0.
 */

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)			\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	  ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	  nop							;\
	ba	label2/**/f					;\
	  nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	  stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	  nop							;\
	call	kpreempt					;\
	  rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 *
 * Register use on entry: %o0 = from, %o1 = to, %o2 = count.
 *
 * The entry sequence picks between the leaf (small) copy and the FP
 * block copy.  Counts at or below VIS_COPY_THRESHOLD always take the small
 * path.  Otherwise the low bits of (from ^ to) select one of
 * hw_copy_limit_1/2/4/8; a limit of zero, or a count at or below the
 * limit, also selects the small path.  Either way kcopy installs its own
 * t_lofault handler (.sm_copyerr or .copyerr) and saves the previous one
 * in %o4 (small path) or %l6 (FP path) before joining the code shared
 * with bcopy at .sm_do_copy / .do_copy.
 */

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! at or below threshold: small copy
	  xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	  nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	  nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	  nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	  cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	  nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	  nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	  stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	  stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy or bcopy.
 * Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 *
 * While fp state is being put back, t_lofault points at .copyerr2 so that
 * a further fault panics.  %l0 keeps the TRAMP_FLAG bit of %l6 across the
 * restore; it selects between returning the errno (kcopy) and jumping to
 * the previously installed handler (bcopy).
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	  and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	  nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	  wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy. kcopy will *always* set a t_lofault handler
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler. As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler. In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	  nop
	ret
	  restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy. There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	  restore	%g0, 0, %o0		! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	  nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 *
 * %o4 holds the saved t_lofault, with TRAMP_FLAG set when the caller
 * was bcopy.  The flag is stripped and the old handler restored; then
 * either the errno is returned (kcopy) or control jumps to the old
 * handler with %o0 = 0 (bcopy).
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	  stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	  mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	  mov	%g0, %o0			!

	SET_SIZE(kcopy)
#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 *
 * NOTE(review): an earlier revision of this comment also read "Copy a
 * page of memory.  Assumes double word alignment and a count >= 256."
 * The code below accepts a zero count and dispatches on byte, half-word,
 * word and long-word alignment, so that statement does not describe bcopy.
88125cf1a30Sjl139090 */ 88225cf1a30Sjl139090#if defined(lint) 88325cf1a30Sjl139090 88425cf1a30Sjl139090/* ARGSUSED */ 88525cf1a30Sjl139090void 88625cf1a30Sjl139090bcopy(const void *from, void *to, size_t count) 88725cf1a30Sjl139090{} 88825cf1a30Sjl139090 88925cf1a30Sjl139090#else /* lint */ 89025cf1a30Sjl139090 89125cf1a30Sjl139090 ENTRY(bcopy) 89225cf1a30Sjl139090 89325cf1a30Sjl139090 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 89425cf1a30Sjl139090 bleu,pt %ncc, .bcopy_small ! go to larger cases 89525cf1a30Sjl139090 xor %o0, %o1, %o3 ! are src, dst alignable? 89625cf1a30Sjl139090 btst 7, %o3 ! 89725cf1a30Sjl139090 bz,pt %ncc, .bcopy_8 ! check for longword alignment 89825cf1a30Sjl139090 nop 89925cf1a30Sjl139090 btst 1, %o3 ! 90025cf1a30Sjl139090 bz,pt %ncc, .bcopy_2 ! check for half-word 90125cf1a30Sjl139090 nop 90225cf1a30Sjl139090 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 90325cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_1)], %o3 90425cf1a30Sjl139090 tst %o3 90525cf1a30Sjl139090 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 90625cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 90725cf1a30Sjl139090 bleu,pt %ncc, .bcopy_small ! go to small copy 90825cf1a30Sjl139090 nop 90925cf1a30Sjl139090 ba,pt %ncc, .bcopy_more ! otherwise go to large copy 91025cf1a30Sjl139090 nop 91125cf1a30Sjl139090.bcopy_2: 91225cf1a30Sjl139090 btst 3, %o3 ! 91325cf1a30Sjl139090 bz,pt %ncc, .bcopy_4 ! check for word alignment 91425cf1a30Sjl139090 nop 91525cf1a30Sjl139090 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 91625cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_2)], %o3 91725cf1a30Sjl139090 tst %o3 91825cf1a30Sjl139090 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 91925cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 92025cf1a30Sjl139090 bleu,pt %ncc, .bcopy_small ! go to small copy 92125cf1a30Sjl139090 nop 92225cf1a30Sjl139090 ba,pt %ncc, .bcopy_more ! 
otherwise go to large copy 92325cf1a30Sjl139090 nop 92425cf1a30Sjl139090.bcopy_4: 92525cf1a30Sjl139090 ! already checked longword, must be word aligned 92625cf1a30Sjl139090 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 92725cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_4)], %o3 92825cf1a30Sjl139090 tst %o3 92925cf1a30Sjl139090 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 93025cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 93125cf1a30Sjl139090 bleu,pt %ncc, .bcopy_small ! go to small copy 93225cf1a30Sjl139090 nop 93325cf1a30Sjl139090 ba,pt %ncc, .bcopy_more ! otherwise go to large copy 93425cf1a30Sjl139090 nop 93525cf1a30Sjl139090.bcopy_8: 93625cf1a30Sjl139090 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 93725cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_8)], %o3 93825cf1a30Sjl139090 tst %o3 93925cf1a30Sjl139090 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 94025cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 94125cf1a30Sjl139090 bleu,pt %ncc, .bcopy_small ! go to small copy 94225cf1a30Sjl139090 nop 94325cf1a30Sjl139090 ba,pt %ncc, .bcopy_more ! otherwise go to large copy 94425cf1a30Sjl139090 nop 94525cf1a30Sjl139090 94625cf1a30Sjl139090 .align 16 94725cf1a30Sjl139090.bcopy_small: 94825cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault 94925cf1a30Sjl139090 tst %o4 95025cf1a30Sjl139090 bz,pt %icc, .sm_do_copy 95125cf1a30Sjl139090 nop 95225cf1a30Sjl139090 sethi %hi(.sm_copyerr), %o5 95325cf1a30Sjl139090 or %o5, %lo(.sm_copyerr), %o5 95425cf1a30Sjl139090 membar #Sync ! sync error barrier 95525cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector 95625cf1a30Sjl139090 or %o4, TRAMP_FLAG, %o4 ! error should trampoline 95725cf1a30Sjl139090.sm_do_copy: 95825cf1a30Sjl139090 cmp %o2, SHORTCOPY ! check for really short case 95925cf1a30Sjl139090 bleu,pt %ncc, .bc_sm_left ! 96025cf1a30Sjl139090 cmp %o2, CHKSIZE ! check for medium length cases 96125cf1a30Sjl139090 bgu,pn %ncc, .bc_med ! 
96225cf1a30Sjl139090 or %o0, %o1, %o3 ! prepare alignment check 96325cf1a30Sjl139090 andcc %o3, 0x3, %g0 ! test for alignment 96425cf1a30Sjl139090 bz,pt %ncc, .bc_sm_word ! branch to word aligned case 96525cf1a30Sjl139090.bc_sm_movebytes: 96625cf1a30Sjl139090 sub %o2, 3, %o2 ! adjust count to allow cc zero test 96725cf1a30Sjl139090.bc_sm_notalign4: 96825cf1a30Sjl139090 ldub [%o0], %o3 ! read byte 96925cf1a30Sjl139090 stb %o3, [%o1] ! write byte 97025cf1a30Sjl139090 subcc %o2, 4, %o2 ! reduce count by 4 97125cf1a30Sjl139090 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 97225cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 97325cf1a30Sjl139090 stb %o3, [%o1 + 1] 97425cf1a30Sjl139090 ldub [%o0 - 2], %o3 97525cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 97625cf1a30Sjl139090 stb %o3, [%o1 - 2] 97725cf1a30Sjl139090 ldub [%o0 - 1], %o3 97825cf1a30Sjl139090 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain 97925cf1a30Sjl139090 stb %o3, [%o1 - 1] 98025cf1a30Sjl139090 add %o2, 3, %o2 ! restore count 98125cf1a30Sjl139090.bc_sm_left: 98225cf1a30Sjl139090 tst %o2 98325cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit ! check for zero length 98425cf1a30Sjl139090 deccc %o2 ! reduce count for cc test 98525cf1a30Sjl139090 ldub [%o0], %o3 ! move one byte 98625cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit 98725cf1a30Sjl139090 stb %o3, [%o1] 98825cf1a30Sjl139090 ldub [%o0 + 1], %o3 ! move another byte 98925cf1a30Sjl139090 deccc %o2 ! check for more 99025cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit 99125cf1a30Sjl139090 stb %o3, [%o1 + 1] 99225cf1a30Sjl139090 ldub [%o0 + 2], %o3 ! move final byte 993*e64c6c3fSMichael Bergknoff ba,pt %ncc, .bc_sm_exit 99425cf1a30Sjl139090 stb %o3, [%o1 + 2] 99525cf1a30Sjl139090 .align 16 99625cf1a30Sjl139090 nop ! instruction alignment 99725cf1a30Sjl139090 ! see discussion at start of file 99825cf1a30Sjl139090.bc_sm_words: 99925cf1a30Sjl139090 lduw [%o0], %o3 ! read word 100025cf1a30Sjl139090.bc_sm_wordx: 100125cf1a30Sjl139090 subcc %o2, 8, %o2 ! 
update count 100225cf1a30Sjl139090 stw %o3, [%o1] ! write word 100325cf1a30Sjl139090 add %o0, 8, %o0 ! update SRC 100425cf1a30Sjl139090 lduw [%o0 - 4], %o3 ! read word 100525cf1a30Sjl139090 add %o1, 8, %o1 ! update DST 100625cf1a30Sjl139090 bgt,pt %ncc, .bc_sm_words ! loop til done 100725cf1a30Sjl139090 stw %o3, [%o1 - 4] ! write word 100825cf1a30Sjl139090 addcc %o2, 7, %o2 ! restore count 100925cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit 101025cf1a30Sjl139090 deccc %o2 101125cf1a30Sjl139090 bz,pt %ncc, .bc_sm_byte 101225cf1a30Sjl139090.bc_sm_half: 101325cf1a30Sjl139090 subcc %o2, 2, %o2 ! reduce count by 2 101425cf1a30Sjl139090 add %o0, 2, %o0 ! advance SRC by 2 101525cf1a30Sjl139090 lduh [%o0 - 2], %o3 ! read half word 101625cf1a30Sjl139090 add %o1, 2, %o1 ! advance DST by 2 101725cf1a30Sjl139090 bgt,pt %ncc, .bc_sm_half ! loop til done 101825cf1a30Sjl139090 sth %o3, [%o1 - 2] ! write half word 101925cf1a30Sjl139090 addcc %o2, 1, %o2 ! restore count 102025cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit 102125cf1a30Sjl139090 nop 102225cf1a30Sjl139090.bc_sm_byte: 102325cf1a30Sjl139090 ldub [%o0], %o3 1024*e64c6c3fSMichael Bergknoff ba,pt %ncc, .bc_sm_exit 102525cf1a30Sjl139090 stb %o3, [%o1] 102625cf1a30Sjl139090 102725cf1a30Sjl139090.bc_sm_word: 102825cf1a30Sjl139090 subcc %o2, 4, %o2 ! update count 102925cf1a30Sjl139090 bgt,pt %ncc, .bc_sm_wordx 103025cf1a30Sjl139090 lduw [%o0], %o3 ! read word 103125cf1a30Sjl139090 addcc %o2, 3, %o2 ! restore count 103225cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit 103325cf1a30Sjl139090 stw %o3, [%o1] ! write word 103425cf1a30Sjl139090 deccc %o2 ! reduce count for cc test 103525cf1a30Sjl139090 ldub [%o0 + 4], %o3 ! load one byte 103625cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit 103725cf1a30Sjl139090 stb %o3, [%o1 + 4] ! store one byte 103825cf1a30Sjl139090 ldub [%o0 + 5], %o3 ! load second byte 103925cf1a30Sjl139090 deccc %o2 104025cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit 104125cf1a30Sjl139090 stb %o3, [%o1 + 5] ! 
store second byte 104225cf1a30Sjl139090 ldub [%o0 + 6], %o3 ! load third byte 104325cf1a30Sjl139090 stb %o3, [%o1 + 6] ! store third byte 104425cf1a30Sjl139090.bc_sm_exit: 1045*e64c6c3fSMichael Bergknoff ldn [THREAD_REG + T_LOFAULT], %o3 1046*e64c6c3fSMichael Bergknoff brz,pt %o3, .bc_sm_done 10470090fbabSkm84432 nop 104825cf1a30Sjl139090 membar #Sync ! sync error barrier 104925cf1a30Sjl139090 andn %o4, TRAMP_FLAG, %o4 105025cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 10510090fbabSkm84432.bc_sm_done: 105225cf1a30Sjl139090 retl 105325cf1a30Sjl139090 mov %g0, %o0 ! return 0 105425cf1a30Sjl139090 105525cf1a30Sjl139090 .align 16 105625cf1a30Sjl139090.bc_med: 105725cf1a30Sjl139090 xor %o0, %o1, %o3 ! setup alignment check 105825cf1a30Sjl139090 btst 1, %o3 105925cf1a30Sjl139090 bnz,pt %ncc, .bc_sm_movebytes ! unaligned 106025cf1a30Sjl139090 nop 106125cf1a30Sjl139090 btst 3, %o3 106225cf1a30Sjl139090 bnz,pt %ncc, .bc_med_half ! halfword aligned 106325cf1a30Sjl139090 nop 106425cf1a30Sjl139090 btst 7, %o3 106525cf1a30Sjl139090 bnz,pt %ncc, .bc_med_word ! word aligned 106625cf1a30Sjl139090 nop 106725cf1a30Sjl139090.bc_med_long: 106825cf1a30Sjl139090 btst 3, %o0 ! check for 106925cf1a30Sjl139090 bz,pt %ncc, .bc_med_long1 ! word alignment 107025cf1a30Sjl139090 nop 107125cf1a30Sjl139090.bc_med_long0: 107225cf1a30Sjl139090 ldub [%o0], %o3 ! load one byte 107325cf1a30Sjl139090 inc %o0 107425cf1a30Sjl139090 stb %o3,[%o1] ! store byte 107525cf1a30Sjl139090 inc %o1 107625cf1a30Sjl139090 btst 3, %o0 107725cf1a30Sjl139090 bnz,pt %ncc, .bc_med_long0 107825cf1a30Sjl139090 dec %o2 107925cf1a30Sjl139090.bc_med_long1: ! word aligned 108025cf1a30Sjl139090 btst 7, %o0 ! check for long word 108125cf1a30Sjl139090 bz,pt %ncc, .bc_med_long2 108225cf1a30Sjl139090 nop 108325cf1a30Sjl139090 lduw [%o0], %o3 ! load word 108425cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 108525cf1a30Sjl139090 stw %o3, [%o1] ! store word 108625cf1a30Sjl139090 add %o1, 4, %o1 ! 
advance DST by 4 108725cf1a30Sjl139090 sub %o2, 4, %o2 ! reduce count by 4 108825cf1a30Sjl139090! 108925cf1a30Sjl139090! Now long word aligned and have at least 32 bytes to move 109025cf1a30Sjl139090! 109125cf1a30Sjl139090.bc_med_long2: 109225cf1a30Sjl139090 sub %o2, 31, %o2 ! adjust count to allow cc zero test 109325cf1a30Sjl139090.bc_med_lmove: 109425cf1a30Sjl139090 ldx [%o0], %o3 ! read long word 109525cf1a30Sjl139090 stx %o3, [%o1] ! write long word 109625cf1a30Sjl139090 subcc %o2, 32, %o2 ! reduce count by 32 109725cf1a30Sjl139090 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words 109825cf1a30Sjl139090 add %o0, 32, %o0 ! advance SRC by 32 109925cf1a30Sjl139090 stx %o3, [%o1 + 8] 110025cf1a30Sjl139090 ldx [%o0 - 16], %o3 110125cf1a30Sjl139090 add %o1, 32, %o1 ! advance DST by 32 110225cf1a30Sjl139090 stx %o3, [%o1 - 16] 110325cf1a30Sjl139090 ldx [%o0 - 8], %o3 110425cf1a30Sjl139090 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left 110525cf1a30Sjl139090 stx %o3, [%o1 - 8] 110625cf1a30Sjl139090 addcc %o2, 24, %o2 ! restore count to long word offset 110725cf1a30Sjl139090 ble,pt %ncc, .bc_med_lextra ! check for more long words to move 110825cf1a30Sjl139090 nop 110925cf1a30Sjl139090.bc_med_lword: 111025cf1a30Sjl139090 ldx [%o0], %o3 ! read long word 111125cf1a30Sjl139090 subcc %o2, 8, %o2 ! reduce count by 8 111225cf1a30Sjl139090 stx %o3, [%o1] ! write long word 111325cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 111425cf1a30Sjl139090 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left 111525cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 111625cf1a30Sjl139090.bc_med_lextra: 111725cf1a30Sjl139090 addcc %o2, 7, %o2 ! restore rest of count 111825cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit ! 
if zero, then done 111925cf1a30Sjl139090 deccc %o2 112025cf1a30Sjl139090 bz,pt %ncc, .bc_sm_byte 112125cf1a30Sjl139090 nop 112225cf1a30Sjl139090 ba,pt %ncc, .bc_sm_half 112325cf1a30Sjl139090 nop 112425cf1a30Sjl139090 112525cf1a30Sjl139090 .align 16 112625cf1a30Sjl139090.bc_med_word: 112725cf1a30Sjl139090 btst 3, %o0 ! check for 112825cf1a30Sjl139090 bz,pt %ncc, .bc_med_word1 ! word alignment 112925cf1a30Sjl139090 nop 113025cf1a30Sjl139090.bc_med_word0: 113125cf1a30Sjl139090 ldub [%o0], %o3 ! load one byte 113225cf1a30Sjl139090 inc %o0 113325cf1a30Sjl139090 stb %o3,[%o1] ! store byte 113425cf1a30Sjl139090 inc %o1 113525cf1a30Sjl139090 btst 3, %o0 113625cf1a30Sjl139090 bnz,pt %ncc, .bc_med_word0 113725cf1a30Sjl139090 dec %o2 113825cf1a30Sjl139090! 113925cf1a30Sjl139090! Now word aligned and have at least 36 bytes to move 114025cf1a30Sjl139090! 114125cf1a30Sjl139090.bc_med_word1: 114225cf1a30Sjl139090 sub %o2, 15, %o2 ! adjust count to allow cc zero test 114325cf1a30Sjl139090.bc_med_wmove: 114425cf1a30Sjl139090 lduw [%o0], %o3 ! read word 114525cf1a30Sjl139090 stw %o3, [%o1] ! write word 114625cf1a30Sjl139090 subcc %o2, 16, %o2 ! reduce count by 16 114725cf1a30Sjl139090 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words 114825cf1a30Sjl139090 add %o0, 16, %o0 ! advance SRC by 16 114925cf1a30Sjl139090 stw %o3, [%o1 + 4] 115025cf1a30Sjl139090 lduw [%o0 - 8], %o3 115125cf1a30Sjl139090 add %o1, 16, %o1 ! advance DST by 16 115225cf1a30Sjl139090 stw %o3, [%o1 - 8] 115325cf1a30Sjl139090 lduw [%o0 - 4], %o3 115425cf1a30Sjl139090 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left 115525cf1a30Sjl139090 stw %o3, [%o1 - 4] 115625cf1a30Sjl139090 addcc %o2, 12, %o2 ! restore count to word offset 115725cf1a30Sjl139090 ble,pt %ncc, .bc_med_wextra ! check for more words to move 115825cf1a30Sjl139090 nop 115925cf1a30Sjl139090.bc_med_word2: 116025cf1a30Sjl139090 lduw [%o0], %o3 ! read word 116125cf1a30Sjl139090 subcc %o2, 4, %o2 ! 
reduce count by 4 116225cf1a30Sjl139090 stw %o3, [%o1] ! write word 116325cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 116425cf1a30Sjl139090 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left 116525cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 116625cf1a30Sjl139090.bc_med_wextra: 116725cf1a30Sjl139090 addcc %o2, 3, %o2 ! restore rest of count 116825cf1a30Sjl139090 bz,pt %ncc, .bc_sm_exit ! if zero, then done 116925cf1a30Sjl139090 deccc %o2 117025cf1a30Sjl139090 bz,pt %ncc, .bc_sm_byte 117125cf1a30Sjl139090 nop 117225cf1a30Sjl139090 ba,pt %ncc, .bc_sm_half 117325cf1a30Sjl139090 nop 117425cf1a30Sjl139090 117525cf1a30Sjl139090 .align 16 117625cf1a30Sjl139090.bc_med_half: 117725cf1a30Sjl139090 btst 1, %o0 ! check for 117825cf1a30Sjl139090 bz,pt %ncc, .bc_med_half1 ! half word alignment 117925cf1a30Sjl139090 nop 118025cf1a30Sjl139090 ldub [%o0], %o3 ! load one byte 118125cf1a30Sjl139090 inc %o0 118225cf1a30Sjl139090 stb %o3,[%o1] ! store byte 118325cf1a30Sjl139090 inc %o1 118425cf1a30Sjl139090 dec %o2 118525cf1a30Sjl139090! 118625cf1a30Sjl139090! Now half word aligned and have at least 38 bytes to move 118725cf1a30Sjl139090! 118825cf1a30Sjl139090.bc_med_half1: 118925cf1a30Sjl139090 sub %o2, 7, %o2 ! adjust count to allow cc zero test 119025cf1a30Sjl139090.bc_med_hmove: 119125cf1a30Sjl139090 lduh [%o0], %o3 ! read half word 119225cf1a30Sjl139090 sth %o3, [%o1] ! write half word 119325cf1a30Sjl139090 subcc %o2, 8, %o2 ! reduce count by 8 119425cf1a30Sjl139090 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords 119525cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 119625cf1a30Sjl139090 sth %o3, [%o1 + 2] 119725cf1a30Sjl139090 lduh [%o0 - 4], %o3 119825cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 119925cf1a30Sjl139090 sth %o3, [%o1 - 4] 120025cf1a30Sjl139090 lduh [%o0 - 2], %o3 120125cf1a30Sjl139090 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left 120225cf1a30Sjl139090 sth %o3, [%o1 - 2] 120325cf1a30Sjl139090 addcc %o2, 7, %o2 ! 
! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	SET_SIZE(bcopy)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

/*
 * bcopy_more: the bcopy path that uses the floating point registers.
 *
 * On entry (before the save):
 *	%o0 - source address
 *	%o1 - destination address
 *	%o2 - byte count
 * After the save these are reached as REALSRC (%i0), DST (%i1) and
 * CNT (%i2); SRC (%i3) and TMP (%i5) are scratch.  %l6 holds the saved
 * t_lofault value plus flag bits for the whole routine.
 *
 * Outline:
 *	1. Read t_lofault into %l6.  If it is non-zero, install .copyerr
 *	   as the handler and set TRAMP_FLAG in %l6.
 *	2. Save %fprs and %gsr on the stack, save the live FP registers
 *	   if FPRS_FEF was already set, and set FPUSED_FLAG in %l6.
 *	3. Copy single bytes until DST is VIS_BLOCKSIZE aligned.
 *	4. Move VIS_BLOCKSIZE bytes per iteration: 8-byte loads from the
 *	   8-byte aligned SRC, faligndata to realign, one block store.
 *	5. Store the pending block and copy any remaining bytes singly.
 *	6. Restore %gsr, %fprs, the FP registers and t_lofault; return 0.
 *
 * NOTE(review): FP_NOMIGRATE, FP_ALLOWMIGRATE, BST_FPQ1Q3_TOSTACK,
 * BLD_FPQ1Q3_FROMSTACK and FZEROQ1Q3 are defined outside this part of
 * the file; the remarks on them below are based on their names and on
 * how they are used here - confirm against their definitions.
 */
	ENTRY(bcopy_more)
.bcopy_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy			! none installed: leave it alone
	nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes
 * Also, use of FP registers has been tested to be enabled
 */
.do_copy:
	FP_NOMIGRATE(6, 7)		! presumably keeps the thread on this CPU

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy	! FEF clear: no FP state to save
	wr	%g0, FPRS_FEF, %fprs	! annulled slot: runs only if taken

	BST_FPQ1Q3_TOSTACK(%o2)		! FP was in use: save regs (see NOTE)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6	! FP state must be restored on exit

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP	! TMP = DST offset in its block
	bz,pt	%ncc, 2f			! DST is already block aligned
	neg	TMP				! delay slot, runs on both paths
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f		! 3 or fewer: single byte loop only
	sub	CNT,TMP,CNT		! adjust main count (delay slot)
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
	! Single byte loop for the last (or only) 1-3 alignment bytes.
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]		! delay slot: DST was already advanced

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC	! SRC = REALSRC rounded down to 8 bytes

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	!
	! Prime the pipeline: load the first block through the aligned SRC,
	! realign into %f32-%f42 with faligndata, and fetch the first
	! doubleword of the next block into %f0.  The last two output
	! doublewords (%f44, %f46) are produced at the top of the loop.
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0	! set the faligndata offset in %gsr
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f10, %f12, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f		! jump over the alignment padding
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
	!
	! Main loop.  Each pass completes and stores the block prepared by
	! the previous pass, then loads and realigns the next one.  It runs
	! while more than VIS_BLOCKSIZE + 8 bytes remain in CNT.
	!
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P	! block store of %f32-%f46
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	! (and exactly VIS_BLOCKSIZE bytes remain) take the path at 2:
	! below, which copies the final block without realignment.
	! Every other case goes to the first 3: label.
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0	! delay slot: test source alignment
	bz,pt	%ncc, 2f
	nop
	! Store the pending block, then let the byte loop finish the rest.
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f		! to the second 3: label (tst CNT)
	nop
	! Aligned source, one block left: store the pending block and copy
	! the last block with plain register moves (fsrc1).
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit		! annulled: the nop below is skipped
	nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit		! nothing left to copy
	nop

	! Copy the remaining CNT bytes one at a time.
5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]		! delay slot: DST was already advanced
.bcb_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f		! FP was unused on entry
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)	! reload the regs saved at .do_copy

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3			! presumably clears the regs we used
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync				! sync error barrier
	andn	%l6, MASK_FLAGS, %l6		! strip flag bits from saved value
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0		! return 0 in the caller's %o0

	SET_SIZE(bcopy_more)

#endif	/* lint */

/*
 * Block copy with possibly overlapped operands.
 *
 *	%o0 - from
 *	%o1 - to
 *	%o2 - count
 *
 * A zero count returns at once.  If count <= abs(from - to) the regions
 * do not overlap and the work is handed to bcopy.  Otherwise the copy is
 * done one byte at a time: backwards (highest offset first) when
 * from < to, forwards otherwise.
 */

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! non-zero count: go do the copy
	subcc	%o0, %o1, %o3		! difference of from and to address
					! (annulled slot: only when taken)

	retl				! return: nothing to do or bad arguments
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
					! (the cmp below sits in the delay slot)
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
	! %o2 is used both as the remaining count and as the byte offset,
	! so the pointers themselves are never modified.
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop

	SET_SIZE(ovbcopy)

#endif	/* lint */


/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else /* lint */
/*
 * Implementation notes:
 *
 * REALSRC, DST, CNT and SRC are the register names #defined inside
 * bcopy_more (%i0, %i1, %i2 and %i3).  CNT is loaded with PAGESIZE
 * here rather than passed in.
 *
 * The source is read with 8-byte ldd loads and written with
 * VIS_BLOCKSIZE block stores; no faligndata realignment is done, so
 * %gsr is neither used nor saved.  NOTE(review): this presumably relies
 * on both addresses being page aligned - confirm against the caller.
 *
 * Unlike bcopy_more, t_lofault is not touched and the FP_NOMIGRATE /
 * FP_ALLOWMIGRATE macros are not used.
 *
 * The routine returns through "restore %g0, 0, %o0", i.e. with 0 in
 * the caller's %o0, although the lint prototype is void.
 */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs
	!
	! NOTE(review): %l1 is handed to the save macro below but %l3 is
	! handed to the restore macro at the end.  That is only correct if
	! the argument is a scratch register that each macro sets up for
	! itself - confirm against the macro definitions.

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f		! FEF clear: no FP state to save
	wr	%g0, FPRS_FEF, %fprs	! annulled slot: runs only if taken

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	! Prime the pipeline: load the first block into %f0-%f14, copy the
	! first six doublewords to %f32-%f42 and fetch the first doubleword
	! of the next block into %f0.
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f0, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f2, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f4, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f6, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f8, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f10, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	ba,pt	%ncc, 2f		! jump over the alignment padding
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
	!
	! Main loop.  Each pass completes and stores the block prepared by
	! the previous pass, then loads the next one.  It runs while more
	! than VIS_BLOCKSIZE + 8 bytes remain in CNT.
	!
2:
	ldd	[SRC + 0x08], %f2
	fmovd	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fmovd	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P	! block store of %f32-%f46
	ldd	[SRC + 0x18], %f6
	fmovd	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fmovd	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fmovd	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fmovd	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fmovd	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fmovd	%f10, %f42
	sub	CNT, VIS_BLOCKSIZE, CNT
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	SRC, VIS_BLOCKSIZE, SRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 2b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! trailing block
	! Store the pending block, then load and store the final block.
	! Nothing is read beyond SRC + 0x38 of that final block.
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f		! FP was unused on entry
	nop

	BLD_FPQ1Q3_FROMSTACK(%l3)	! reload the regs saved on entry
	ba	3f
	nop

2:	FZEROQ1Q3			! presumably clears the regs we used

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	restore	%g0, 0, %o0		! return 0 in the caller's %o0

	SET_SIZE(hwblkpagecopy)
#endif	/* lint */


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
 * which return the errno that we've faithfully computed.  This
 * allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
 * is left as an exercise...
 */

/*
 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
 *
 * General theory of operation:
 *
 * The only difference between copy{in,out} and
 * xcopy{in,out} is in the error handling routine they invoke
 * when a memory access error occurs. xcopyOP returns the errno
 * while copyOP returns -1 (see above). copy{in,out}_noerr set
 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
 * if they are called with a fault handler already in place. That flag
 * causes the default handlers to trampoline to the previous handler
 * upon an error.
 *
 * None of the copyops routines grab a window until it's decided that
 * we need to do a HW block copy operation. This saves a window
 * spill/fill when we're called during socket ops. The typical IO
 * path won't cause spill/fill traps.
 *
 * This code uses a set of 4 limits for the maximum size that will
 * be copied given a particular input/output address alignment.
 * If the value for a particular limit is zero, the copy will be performed
 * by the plain copy loops rather than FPBLK.
 *
 * See the description of bcopy above for more details of the
 * data copying algorithm and the default limits.
 *
 */

/*
 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
 */

#if defined(lint)


#else	/* lint */
/*
 * We save the arguments in the following registers in case of a fault:
 *	kaddr - %l1
 *	uaddr - %l2
 *	count - %l3
 */
#define SAVE_SRC	%l1
#define SAVE_DST	%l2
#define SAVE_COUNT	%l3

/*
 * Argument save registers for the small-copy paths (see .sm_do_copyout),
 * which run without a register window of their own.
 */
#define SM_SAVE_SRC		%g4
#define SM_SAVE_DST		%g5
#define SM_SAVE_COUNT		%o5
#define ERRNO		%l5


#define REAL_LOFAULT	%l4
/*
 * Generic copyio fault handler.  This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT.
 * This allows us to share common code for all the flavors of the copy
 * operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 *
 * Registers, as used by the code below:
 *	%g1		- errno on entry; copied to ERRNO (%l5)
 *	%l6		- value to put back in t_lofault, with FPUSED_FLAG
 *			  set when the FP state on the stack must be
 *			  restored here
 *	SAVE_SRC, SAVE_DST, SAVE_COUNT
 *			- original arguments, copied to %i0-%i2 on exit
 *	REAL_LOFAULT	- address jumped to on exit
 *
 * Only FPUSED_FLAG is cleared from %l6 before it is written back to
 * t_lofault.
 *
 * NOTE(review): the FP restore here uses the Q2Q4 macros while bcopy_more
 * uses Q1Q3; presumably this matches the save done by the copyin/copyout
 * FP paths, which are outside this part of the file - confirm.
 */
	ENTRY(copyio_fault)
	membar	#Sync
	mov	%g1,ERRNO			! save errno in ERRNO
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f			! FP state was never saved
	nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f		! FP was unused before the copy
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	mov	SAVE_SRC, %i0
	mov	SAVE_DST, %i1
	jmp	REAL_LOFAULT
	mov	SAVE_COUNT, %i2		! delay slot: third argument

	SET_SIZE(copyio_fault)


#endif

#if defined(lint)

/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(copyout)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .copyout_small		! at or below threshold: small copy
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .copyout_8		! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .copyout_2		! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	nop
.copyout_2:
	btst	3, %o3				!
	bz,pt	%ncc, .copyout_4		! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	!
Check copy limit 180325cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_2)], %o3 180425cf1a30Sjl139090 tst %o3 180525cf1a30Sjl139090 bz,pn %icc, .copyout_small ! if zero, disable HW copy 180625cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 180725cf1a30Sjl139090 bleu,pt %ncc, .copyout_small ! go to small copy 180825cf1a30Sjl139090 nop 180925cf1a30Sjl139090 ba,pt %ncc, .copyout_more ! otherwise go to large copy 181025cf1a30Sjl139090 nop 181125cf1a30Sjl139090.copyout_4: 181225cf1a30Sjl139090 ! already checked longword, must be word aligned 181325cf1a30Sjl139090 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 181425cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_4)], %o3 181525cf1a30Sjl139090 tst %o3 181625cf1a30Sjl139090 bz,pn %icc, .copyout_small ! if zero, disable HW copy 181725cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 181825cf1a30Sjl139090 bleu,pt %ncc, .copyout_small ! go to small copy 181925cf1a30Sjl139090 nop 182025cf1a30Sjl139090 ba,pt %ncc, .copyout_more ! otherwise go to large copy 182125cf1a30Sjl139090 nop 182225cf1a30Sjl139090.copyout_8: 182325cf1a30Sjl139090 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 182425cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_8)], %o3 182525cf1a30Sjl139090 tst %o3 182625cf1a30Sjl139090 bz,pn %icc, .copyout_small ! if zero, disable HW copy 182725cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 182825cf1a30Sjl139090 bleu,pt %ncc, .copyout_small ! go to small copy 182925cf1a30Sjl139090 nop 183025cf1a30Sjl139090 ba,pt %ncc, .copyout_more ! otherwise go to large copy 183125cf1a30Sjl139090 nop 183225cf1a30Sjl139090 183325cf1a30Sjl139090 .align 16 183425cf1a30Sjl139090 nop ! instruction alignment 183525cf1a30Sjl139090 ! see discussion at start of file 183625cf1a30Sjl139090.copyout_small: 183725cf1a30Sjl139090 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault 183825cf1a30Sjl139090 or %o5, %lo(.sm_copyout_err), %o5 183925cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 ! 
save existing handler 184025cf1a30Sjl139090 membar #Sync ! sync error barrier 184125cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 184225cf1a30Sjl139090.sm_do_copyout: 184325cf1a30Sjl139090 mov %o0, SM_SAVE_SRC 184425cf1a30Sjl139090 mov %o1, SM_SAVE_DST 184525cf1a30Sjl139090 cmp %o2, SHORTCOPY ! check for really short case 184625cf1a30Sjl139090 bleu,pt %ncc, .co_sm_left ! 184725cf1a30Sjl139090 mov %o2, SM_SAVE_COUNT 184825cf1a30Sjl139090 cmp %o2, CHKSIZE ! check for medium length cases 184925cf1a30Sjl139090 bgu,pn %ncc, .co_med ! 185025cf1a30Sjl139090 or %o0, %o1, %o3 ! prepare alignment check 185125cf1a30Sjl139090 andcc %o3, 0x3, %g0 ! test for alignment 185225cf1a30Sjl139090 bz,pt %ncc, .co_sm_word ! branch to word aligned case 185325cf1a30Sjl139090.co_sm_movebytes: 185425cf1a30Sjl139090 sub %o2, 3, %o2 ! adjust count to allow cc zero test 185525cf1a30Sjl139090.co_sm_notalign4: 185625cf1a30Sjl139090 ldub [%o0], %o3 ! read byte 185725cf1a30Sjl139090 subcc %o2, 4, %o2 ! reduce count by 4 185825cf1a30Sjl139090 stba %o3, [%o1]ASI_USER ! write byte 185925cf1a30Sjl139090 inc %o1 ! advance DST by 1 186025cf1a30Sjl139090 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 186125cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 186225cf1a30Sjl139090 stba %o3, [%o1]ASI_USER 186325cf1a30Sjl139090 inc %o1 ! advance DST by 1 186425cf1a30Sjl139090 ldub [%o0 - 2], %o3 186525cf1a30Sjl139090 stba %o3, [%o1]ASI_USER 186625cf1a30Sjl139090 inc %o1 ! advance DST by 1 186725cf1a30Sjl139090 ldub [%o0 - 1], %o3 186825cf1a30Sjl139090 stba %o3, [%o1]ASI_USER 186925cf1a30Sjl139090 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain 187025cf1a30Sjl139090 inc %o1 ! advance DST by 1 187125cf1a30Sjl139090 add %o2, 3, %o2 ! restore count 187225cf1a30Sjl139090.co_sm_left: 187325cf1a30Sjl139090 tst %o2 187425cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit ! check for zero length 187525cf1a30Sjl139090 nop 187625cf1a30Sjl139090 ldub [%o0], %o3 ! 
load one byte 187725cf1a30Sjl139090 deccc %o2 ! reduce count for cc test 187825cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 187925cf1a30Sjl139090 stba %o3,[%o1]ASI_USER ! store one byte 188025cf1a30Sjl139090 ldub [%o0 + 1], %o3 ! load second byte 188125cf1a30Sjl139090 deccc %o2 188225cf1a30Sjl139090 inc %o1 188325cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 188425cf1a30Sjl139090 stba %o3,[%o1]ASI_USER ! store second byte 188525cf1a30Sjl139090 ldub [%o0 + 2], %o3 ! load third byte 188625cf1a30Sjl139090 inc %o1 188725cf1a30Sjl139090 stba %o3,[%o1]ASI_USER ! store third byte 188825cf1a30Sjl139090 membar #Sync ! sync error barrier 188925cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 189025cf1a30Sjl139090 retl 189125cf1a30Sjl139090 mov %g0, %o0 ! return 0 189225cf1a30Sjl139090 .align 16 189325cf1a30Sjl139090.co_sm_words: 189425cf1a30Sjl139090 lduw [%o0], %o3 ! read word 189525cf1a30Sjl139090.co_sm_wordx: 189625cf1a30Sjl139090 subcc %o2, 8, %o2 ! update count 189725cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER ! write word 189825cf1a30Sjl139090 add %o0, 8, %o0 ! update SRC 189925cf1a30Sjl139090 lduw [%o0 - 4], %o3 ! read word 190025cf1a30Sjl139090 add %o1, 4, %o1 ! update DST 190125cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER ! write word 190225cf1a30Sjl139090 bgt,pt %ncc, .co_sm_words ! loop til done 190325cf1a30Sjl139090 add %o1, 4, %o1 ! update DST 190425cf1a30Sjl139090 addcc %o2, 7, %o2 ! restore count 190525cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 190625cf1a30Sjl139090 nop 190725cf1a30Sjl139090 deccc %o2 190825cf1a30Sjl139090 bz,pt %ncc, .co_sm_byte 190925cf1a30Sjl139090.co_sm_half: 191025cf1a30Sjl139090 subcc %o2, 2, %o2 ! reduce count by 2 191125cf1a30Sjl139090 lduh [%o0], %o3 ! read half word 191225cf1a30Sjl139090 add %o0, 2, %o0 ! advance SRC by 2 191325cf1a30Sjl139090 stha %o3, [%o1]ASI_USER ! write half word 191425cf1a30Sjl139090 bgt,pt %ncc, .co_sm_half ! loop til done 191525cf1a30Sjl139090 add %o1, 2, %o1 ! 
advance DST by 2 191625cf1a30Sjl139090 addcc %o2, 1, %o2 ! restore count 191725cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 191825cf1a30Sjl139090 nop 191925cf1a30Sjl139090.co_sm_byte: 192025cf1a30Sjl139090 ldub [%o0], %o3 192125cf1a30Sjl139090 stba %o3, [%o1]ASI_USER 192225cf1a30Sjl139090 membar #Sync ! sync error barrier 192325cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 192425cf1a30Sjl139090 retl 192525cf1a30Sjl139090 mov %g0, %o0 ! return 0 192625cf1a30Sjl139090 .align 16 192725cf1a30Sjl139090.co_sm_word: 192825cf1a30Sjl139090 subcc %o2, 4, %o2 ! update count 192925cf1a30Sjl139090 bgt,pt %ncc, .co_sm_wordx 193025cf1a30Sjl139090 lduw [%o0], %o3 ! read word 193125cf1a30Sjl139090 addcc %o2, 3, %o2 ! restore count 193225cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 193325cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER ! write word 193425cf1a30Sjl139090 deccc %o2 ! reduce count for cc test 193525cf1a30Sjl139090 ldub [%o0 + 4], %o3 ! load one byte 193625cf1a30Sjl139090 add %o1, 4, %o1 193725cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 193825cf1a30Sjl139090 stba %o3, [%o1]ASI_USER ! store one byte 193925cf1a30Sjl139090 ldub [%o0 + 5], %o3 ! load second byte 194025cf1a30Sjl139090 deccc %o2 194125cf1a30Sjl139090 inc %o1 194225cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 194325cf1a30Sjl139090 stba %o3, [%o1]ASI_USER ! store second byte 194425cf1a30Sjl139090 ldub [%o0 + 6], %o3 ! load third byte 194525cf1a30Sjl139090 inc %o1 194625cf1a30Sjl139090 stba %o3, [%o1]ASI_USER ! store third byte 194725cf1a30Sjl139090.co_sm_exit: 194825cf1a30Sjl139090 membar #Sync ! sync error barrier 194925cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 195025cf1a30Sjl139090 retl 195125cf1a30Sjl139090 mov %g0, %o0 ! return 0 195225cf1a30Sjl139090 195325cf1a30Sjl139090 .align 16 195425cf1a30Sjl139090.co_med: 195525cf1a30Sjl139090 xor %o0, %o1, %o3 ! setup alignment check 195625cf1a30Sjl139090 btst 1, %o3 195725cf1a30Sjl139090 bnz,pt %ncc, .co_sm_movebytes ! 
unaligned 195825cf1a30Sjl139090 nop 195925cf1a30Sjl139090 btst 3, %o3 196025cf1a30Sjl139090 bnz,pt %ncc, .co_med_half ! halfword aligned 196125cf1a30Sjl139090 nop 196225cf1a30Sjl139090 btst 7, %o3 196325cf1a30Sjl139090 bnz,pt %ncc, .co_med_word ! word aligned 196425cf1a30Sjl139090 nop 196525cf1a30Sjl139090.co_med_long: 196625cf1a30Sjl139090 btst 3, %o0 ! check for 196725cf1a30Sjl139090 bz,pt %ncc, .co_med_long1 ! word alignment 196825cf1a30Sjl139090 nop 196925cf1a30Sjl139090.co_med_long0: 197025cf1a30Sjl139090 ldub [%o0], %o3 ! load one byte 197125cf1a30Sjl139090 inc %o0 197225cf1a30Sjl139090 stba %o3,[%o1]ASI_USER ! store byte 197325cf1a30Sjl139090 inc %o1 197425cf1a30Sjl139090 btst 3, %o0 197525cf1a30Sjl139090 bnz,pt %ncc, .co_med_long0 197625cf1a30Sjl139090 dec %o2 197725cf1a30Sjl139090.co_med_long1: ! word aligned 197825cf1a30Sjl139090 btst 7, %o0 ! check for long word 197925cf1a30Sjl139090 bz,pt %ncc, .co_med_long2 198025cf1a30Sjl139090 nop 198125cf1a30Sjl139090 lduw [%o0], %o3 ! load word 198225cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 198325cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER ! store word 198425cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 198525cf1a30Sjl139090 sub %o2, 4, %o2 ! reduce count by 4 198625cf1a30Sjl139090! 198725cf1a30Sjl139090! Now long word aligned and have at least 32 bytes to move 198825cf1a30Sjl139090! 198925cf1a30Sjl139090.co_med_long2: 199025cf1a30Sjl139090 sub %o2, 31, %o2 ! adjust count to allow cc zero test 199125cf1a30Sjl139090 sub %o1, 8, %o1 ! adjust pointer to allow store in 199225cf1a30Sjl139090 ! branch delay slot instead of add 199325cf1a30Sjl139090.co_med_lmove: 199425cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 199525cf1a30Sjl139090 ldx [%o0], %o3 ! read long word 199625cf1a30Sjl139090 subcc %o2, 32, %o2 ! reduce count by 32 199725cf1a30Sjl139090 stxa %o3, [%o1]ASI_USER ! write long word 199825cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 199925cf1a30Sjl139090 ldx [%o0 + 8], %o3 ! 
repeat for a total for 4 long words 200025cf1a30Sjl139090 add %o0, 32, %o0 ! advance SRC by 32 200125cf1a30Sjl139090 stxa %o3, [%o1]ASI_USER 200225cf1a30Sjl139090 ldx [%o0 - 16], %o3 200325cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 200425cf1a30Sjl139090 stxa %o3, [%o1]ASI_USER 200525cf1a30Sjl139090 ldx [%o0 - 8], %o3 200625cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 200725cf1a30Sjl139090 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left 200825cf1a30Sjl139090 stxa %o3, [%o1]ASI_USER 200925cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 201025cf1a30Sjl139090 addcc %o2, 24, %o2 ! restore count to long word offset 201125cf1a30Sjl139090 ble,pt %ncc, .co_med_lextra ! check for more long words to move 201225cf1a30Sjl139090 nop 201325cf1a30Sjl139090.co_med_lword: 201425cf1a30Sjl139090 ldx [%o0], %o3 ! read long word 201525cf1a30Sjl139090 subcc %o2, 8, %o2 ! reduce count by 8 201625cf1a30Sjl139090 stxa %o3, [%o1]ASI_USER ! write long word 201725cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 201825cf1a30Sjl139090 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left 201925cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 202025cf1a30Sjl139090.co_med_lextra: 202125cf1a30Sjl139090 addcc %o2, 7, %o2 ! restore rest of count 202225cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit ! if zero, then done 202325cf1a30Sjl139090 deccc %o2 202425cf1a30Sjl139090 bz,pt %ncc, .co_sm_byte 202525cf1a30Sjl139090 nop 202625cf1a30Sjl139090 ba,pt %ncc, .co_sm_half 202725cf1a30Sjl139090 nop 202825cf1a30Sjl139090 202925cf1a30Sjl139090 .align 16 203025cf1a30Sjl139090 nop ! instruction alignment 203125cf1a30Sjl139090 ! see discussion at start of file 203225cf1a30Sjl139090.co_med_word: 203325cf1a30Sjl139090 btst 3, %o0 ! check for 203425cf1a30Sjl139090 bz,pt %ncc, .co_med_word1 ! word alignment 203525cf1a30Sjl139090 nop 203625cf1a30Sjl139090.co_med_word0: 203725cf1a30Sjl139090 ldub [%o0], %o3 ! 
load one byte 203825cf1a30Sjl139090 inc %o0 203925cf1a30Sjl139090 stba %o3,[%o1]ASI_USER ! store byte 204025cf1a30Sjl139090 inc %o1 204125cf1a30Sjl139090 btst 3, %o0 204225cf1a30Sjl139090 bnz,pt %ncc, .co_med_word0 204325cf1a30Sjl139090 dec %o2 204425cf1a30Sjl139090! 204525cf1a30Sjl139090! Now word aligned and have at least 36 bytes to move 204625cf1a30Sjl139090! 204725cf1a30Sjl139090.co_med_word1: 204825cf1a30Sjl139090 sub %o2, 15, %o2 ! adjust count to allow cc zero test 204925cf1a30Sjl139090.co_med_wmove: 205025cf1a30Sjl139090 lduw [%o0], %o3 ! read word 205125cf1a30Sjl139090 subcc %o2, 16, %o2 ! reduce count by 16 205225cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER ! write word 205325cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 205425cf1a30Sjl139090 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words 205525cf1a30Sjl139090 add %o0, 16, %o0 ! advance SRC by 16 205625cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER 205725cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 205825cf1a30Sjl139090 lduw [%o0 - 8], %o3 205925cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER 206025cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 206125cf1a30Sjl139090 lduw [%o0 - 4], %o3 206225cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER 206325cf1a30Sjl139090 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left 206425cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 206525cf1a30Sjl139090 addcc %o2, 12, %o2 ! restore count to word offset 206625cf1a30Sjl139090 ble,pt %ncc, .co_med_wextra ! check for more words to move 206725cf1a30Sjl139090 nop 206825cf1a30Sjl139090.co_med_word2: 206925cf1a30Sjl139090 lduw [%o0], %o3 ! read word 207025cf1a30Sjl139090 subcc %o2, 4, %o2 ! reduce count by 4 207125cf1a30Sjl139090 stwa %o3, [%o1]ASI_USER ! write word 207225cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 207325cf1a30Sjl139090 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left 207425cf1a30Sjl139090 add %o1, 4, %o1 ! 
advance DST by 4 207525cf1a30Sjl139090.co_med_wextra: 207625cf1a30Sjl139090 addcc %o2, 3, %o2 ! restore rest of count 207725cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit ! if zero, then done 207825cf1a30Sjl139090 deccc %o2 207925cf1a30Sjl139090 bz,pt %ncc, .co_sm_byte 208025cf1a30Sjl139090 nop 208125cf1a30Sjl139090 ba,pt %ncc, .co_sm_half 208225cf1a30Sjl139090 nop 208325cf1a30Sjl139090 208425cf1a30Sjl139090 .align 16 208525cf1a30Sjl139090 nop ! instruction alignment 208625cf1a30Sjl139090 nop ! see discussion at start of file 208725cf1a30Sjl139090 nop 208825cf1a30Sjl139090.co_med_half: 208925cf1a30Sjl139090 btst 1, %o0 ! check for 209025cf1a30Sjl139090 bz,pt %ncc, .co_med_half1 ! half word alignment 209125cf1a30Sjl139090 nop 209225cf1a30Sjl139090 ldub [%o0], %o3 ! load one byte 209325cf1a30Sjl139090 inc %o0 209425cf1a30Sjl139090 stba %o3,[%o1]ASI_USER ! store byte 209525cf1a30Sjl139090 inc %o1 209625cf1a30Sjl139090 dec %o2 209725cf1a30Sjl139090! 209825cf1a30Sjl139090! Now half word aligned and have at least 38 bytes to move 209925cf1a30Sjl139090! 210025cf1a30Sjl139090.co_med_half1: 210125cf1a30Sjl139090 sub %o2, 7, %o2 ! adjust count to allow cc zero test 210225cf1a30Sjl139090.co_med_hmove: 210325cf1a30Sjl139090 lduh [%o0], %o3 ! read half word 210425cf1a30Sjl139090 subcc %o2, 8, %o2 ! reduce count by 8 210525cf1a30Sjl139090 stha %o3, [%o1]ASI_USER ! write half word 210625cf1a30Sjl139090 add %o1, 2, %o1 ! advance DST by 2 210725cf1a30Sjl139090 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords 210825cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 210925cf1a30Sjl139090 stha %o3, [%o1]ASI_USER 211025cf1a30Sjl139090 add %o1, 2, %o1 ! advance DST by 2 211125cf1a30Sjl139090 lduh [%o0 - 4], %o3 211225cf1a30Sjl139090 stha %o3, [%o1]ASI_USER 211325cf1a30Sjl139090 add %o1, 2, %o1 ! advance DST by 2 211425cf1a30Sjl139090 lduh [%o0 - 2], %o3 211525cf1a30Sjl139090 stha %o3, [%o1]ASI_USER 211625cf1a30Sjl139090 bgt,pt %ncc, .co_med_hmove ! 
loop til 7 or fewer bytes left 211725cf1a30Sjl139090 add %o1, 2, %o1 ! advance DST by 2 211825cf1a30Sjl139090 addcc %o2, 7, %o2 ! restore count 211925cf1a30Sjl139090 bz,pt %ncc, .co_sm_exit 212025cf1a30Sjl139090 deccc %o2 212125cf1a30Sjl139090 bz,pt %ncc, .co_sm_byte 212225cf1a30Sjl139090 nop 212325cf1a30Sjl139090 ba,pt %ncc, .co_sm_half 212425cf1a30Sjl139090 nop 212525cf1a30Sjl139090 212625cf1a30Sjl139090/* 212725cf1a30Sjl139090 * We got here because of a fault during short copyout. 212825cf1a30Sjl139090 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 212925cf1a30Sjl139090 */ 213025cf1a30Sjl139090.sm_copyout_err: 213125cf1a30Sjl139090 membar #Sync 213225cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 213325cf1a30Sjl139090 mov SM_SAVE_SRC, %o0 213425cf1a30Sjl139090 mov SM_SAVE_DST, %o1 213525cf1a30Sjl139090 mov SM_SAVE_COUNT, %o2 213625cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 213725cf1a30Sjl139090 tst %o3 213825cf1a30Sjl139090 bz,pt %ncc, 3f ! if not, return error 213925cf1a30Sjl139090 nop 214025cf1a30Sjl139090 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with 214125cf1a30Sjl139090 jmp %o5 ! original arguments 214225cf1a30Sjl139090 nop 214325cf1a30Sjl1390903: 214425cf1a30Sjl139090 retl 214525cf1a30Sjl139090 or %g0, -1, %o0 ! return error value 214625cf1a30Sjl139090 214725cf1a30Sjl139090 SET_SIZE(copyout) 214825cf1a30Sjl139090 214925cf1a30Sjl139090/* 215025cf1a30Sjl139090 * The _more entry points are not intended to be used directly by 215125cf1a30Sjl139090 * any caller from outside this file. They are provided to allow 215225cf1a30Sjl139090 * profiling and dtrace of the portions of the copy code that uses 215325cf1a30Sjl139090 * the floating point registers. 215425cf1a30Sjl139090 * This entry is particularly important as DTRACE (at least as of 215525cf1a30Sjl139090 * 4/2004) does not support leaf functions. 
215625cf1a30Sjl139090 */ 215725cf1a30Sjl139090 215825cf1a30Sjl139090 ENTRY(copyout_more) 215925cf1a30Sjl139090.copyout_more: 216025cf1a30Sjl139090 prefetch [%o0], #n_reads 216125cf1a30Sjl139090 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 216225cf1a30Sjl139090 set .copyout_err, REAL_LOFAULT 216325cf1a30Sjl139090 216425cf1a30Sjl139090/* 216525cf1a30Sjl139090 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes 216625cf1a30Sjl139090 */ 216725cf1a30Sjl139090.do_copyout: 216825cf1a30Sjl139090 set copyio_fault, %l7 ! .copyio_fault is lofault val 216925cf1a30Sjl139090 217025cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 217125cf1a30Sjl139090 membar #Sync ! sync error barrier 217225cf1a30Sjl139090 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 217325cf1a30Sjl139090 217425cf1a30Sjl139090 mov %i0, SAVE_SRC 217525cf1a30Sjl139090 mov %i1, SAVE_DST 217625cf1a30Sjl139090 mov %i2, SAVE_COUNT 217725cf1a30Sjl139090 217825cf1a30Sjl139090 FP_NOMIGRATE(6, 7) 217925cf1a30Sjl139090 218025cf1a30Sjl139090 rd %fprs, %o2 ! check for unused fp 218125cf1a30Sjl139090 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 218225cf1a30Sjl139090 btst FPRS_FEF, %o2 218325cf1a30Sjl139090 bz,a,pt %icc, .do_blockcopyout 218425cf1a30Sjl139090 wr %g0, FPRS_FEF, %fprs 218525cf1a30Sjl139090 218625cf1a30Sjl139090 BST_FPQ2Q4_TOSTACK(%o2) 218725cf1a30Sjl139090 218825cf1a30Sjl139090.do_blockcopyout: 218925cf1a30Sjl139090 rd %gsr, %o2 219025cf1a30Sjl139090 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 219125cf1a30Sjl139090 or %l6, FPUSED_FLAG, %l6 219225cf1a30Sjl139090 219325cf1a30Sjl139090 andcc DST, VIS_BLOCKSIZE - 1, TMP 219425cf1a30Sjl139090 mov ASI_USER, %asi 219525cf1a30Sjl139090 bz,pt %ncc, 2f 219625cf1a30Sjl139090 neg TMP 219725cf1a30Sjl139090 add TMP, VIS_BLOCKSIZE, TMP 219825cf1a30Sjl139090 219925cf1a30Sjl139090 ! TMP = bytes required to align DST on FP_BLOCK boundary 220025cf1a30Sjl139090 ! 
Using SRC as a tmp here 220125cf1a30Sjl139090 cmp TMP, 3 220225cf1a30Sjl139090 bleu,pt %ncc, 1f 220325cf1a30Sjl139090 sub CNT,TMP,CNT ! adjust main count 220425cf1a30Sjl139090 sub TMP, 3, TMP ! adjust for end of loop test 220525cf1a30Sjl139090.co_blkalign: 220625cf1a30Sjl139090 ldub [REALSRC], SRC ! move 4 bytes per loop iteration 220725cf1a30Sjl139090 stba SRC, [DST]%asi 220825cf1a30Sjl139090 subcc TMP, 4, TMP 220925cf1a30Sjl139090 ldub [REALSRC + 1], SRC 221025cf1a30Sjl139090 add REALSRC, 4, REALSRC 221125cf1a30Sjl139090 stba SRC, [DST + 1]%asi 221225cf1a30Sjl139090 ldub [REALSRC - 2], SRC 221325cf1a30Sjl139090 add DST, 4, DST 221425cf1a30Sjl139090 stba SRC, [DST - 2]%asi 221525cf1a30Sjl139090 ldub [REALSRC - 1], SRC 221625cf1a30Sjl139090 bgu,pt %ncc, .co_blkalign 221725cf1a30Sjl139090 stba SRC, [DST - 1]%asi 221825cf1a30Sjl139090 221925cf1a30Sjl139090 addcc TMP, 3, TMP ! restore count adjustment 222025cf1a30Sjl139090 bz,pt %ncc, 2f ! no bytes left? 222125cf1a30Sjl139090 nop 222225cf1a30Sjl1390901: ldub [REALSRC], SRC 222325cf1a30Sjl139090 inc REALSRC 222425cf1a30Sjl139090 inc DST 222525cf1a30Sjl139090 deccc TMP 222625cf1a30Sjl139090 bgu %ncc, 1b 222725cf1a30Sjl139090 stba SRC, [DST - 1]%asi 222825cf1a30Sjl139090 222925cf1a30Sjl1390902: 223025cf1a30Sjl139090 membar #StoreLoad 223125cf1a30Sjl139090 andn REALSRC, 0x7, SRC 223225cf1a30Sjl139090 223325cf1a30Sjl139090 ! SRC - 8-byte aligned 223425cf1a30Sjl139090 ! 
DST - 64-byte aligned 223525cf1a30Sjl139090 ldd [SRC], %f16 223625cf1a30Sjl139090 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 223725cf1a30Sjl139090 alignaddr REALSRC, %g0, %g0 223825cf1a30Sjl139090 ldd [SRC + 0x08], %f18 223925cf1a30Sjl139090 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 224025cf1a30Sjl139090 faligndata %f16, %f18, %f48 224125cf1a30Sjl139090 ldd [SRC + 0x10], %f20 2242c8a722abSpm145316 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 224325cf1a30Sjl139090 faligndata %f18, %f20, %f50 224425cf1a30Sjl139090 ldd [SRC + 0x18], %f22 224525cf1a30Sjl139090 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 224625cf1a30Sjl139090 faligndata %f20, %f22, %f52 224725cf1a30Sjl139090 ldd [SRC + 0x20], %f24 2248c8a722abSpm145316 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read 224925cf1a30Sjl139090 faligndata %f22, %f24, %f54 225025cf1a30Sjl139090 ldd [SRC + 0x28], %f26 2251c8a722abSpm145316 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read 225225cf1a30Sjl139090 faligndata %f24, %f26, %f56 225325cf1a30Sjl139090 ldd [SRC + 0x30], %f28 2254c8a722abSpm145316 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read 225525cf1a30Sjl139090 faligndata %f26, %f28, %f58 225625cf1a30Sjl139090 ldd [SRC + 0x38], %f30 225725cf1a30Sjl139090 ldd [SRC + VIS_BLOCKSIZE], %f16 225825cf1a30Sjl139090 sub CNT, VIS_BLOCKSIZE, CNT 225925cf1a30Sjl139090 add SRC, VIS_BLOCKSIZE, SRC 2260c8a722abSpm145316 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read 226125cf1a30Sjl139090 add REALSRC, VIS_BLOCKSIZE, REALSRC 226225cf1a30Sjl139090 ba,pt %ncc, 1f 2263c8a722abSpm145316 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read 226425cf1a30Sjl139090 .align 32 226525cf1a30Sjl1390901: 226625cf1a30Sjl139090 ldd [SRC + 0x08], %f18 226725cf1a30Sjl139090 faligndata %f28, %f30, %f60 226825cf1a30Sjl139090 ldd [SRC + 0x10], %f20 226925cf1a30Sjl139090 faligndata %f30, %f16, %f62 227025cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_AIUS 227125cf1a30Sjl139090 ldd [SRC + 0x18], %f22 227225cf1a30Sjl139090 faligndata %f16, %f18, %f48 
227325cf1a30Sjl139090 ldd [SRC + 0x20], %f24 227425cf1a30Sjl139090 faligndata %f18, %f20, %f50 227525cf1a30Sjl139090 ldd [SRC + 0x28], %f26 227625cf1a30Sjl139090 faligndata %f20, %f22, %f52 227725cf1a30Sjl139090 ldd [SRC + 0x30], %f28 227825cf1a30Sjl139090 faligndata %f22, %f24, %f54 227925cf1a30Sjl139090 sub CNT, VIS_BLOCKSIZE, CNT 2280c8a722abSpm145316 ldd [SRC + 0x38], %f30 2281c8a722abSpm145316 faligndata %f24, %f26, %f56 228225cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 2283c8a722abSpm145316 ldd [SRC + VIS_BLOCKSIZE], %f16 2284c8a722abSpm145316 faligndata %f26, %f28, %f58 228525cf1a30Sjl139090 add REALSRC, VIS_BLOCKSIZE, REALSRC 2286c8a722abSpm145316 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 2287c8a722abSpm145316 add SRC, VIS_BLOCKSIZE, SRC 2288c8a722abSpm145316 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read 228925cf1a30Sjl139090 cmp CNT, VIS_BLOCKSIZE + 8 229025cf1a30Sjl139090 bgu,pt %ncc, 1b 2291c8a722abSpm145316 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 229225cf1a30Sjl139090 229325cf1a30Sjl139090 ! 
only if REALSRC & 0x7 is 0 229425cf1a30Sjl139090 cmp CNT, VIS_BLOCKSIZE 229525cf1a30Sjl139090 bne %ncc, 3f 229625cf1a30Sjl139090 andcc REALSRC, 0x7, %g0 229725cf1a30Sjl139090 bz,pt %ncc, 2f 229825cf1a30Sjl139090 nop 229925cf1a30Sjl1390903: 230025cf1a30Sjl139090 faligndata %f28, %f30, %f60 230125cf1a30Sjl139090 faligndata %f30, %f16, %f62 230225cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_AIUS 230325cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 230425cf1a30Sjl139090 ba,pt %ncc, 3f 230525cf1a30Sjl139090 nop 230625cf1a30Sjl1390902: 230725cf1a30Sjl139090 ldd [SRC + 0x08], %f18 230825cf1a30Sjl139090 fsrc1 %f28, %f60 230925cf1a30Sjl139090 ldd [SRC + 0x10], %f20 231025cf1a30Sjl139090 fsrc1 %f30, %f62 231125cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_AIUS 231225cf1a30Sjl139090 ldd [SRC + 0x18], %f22 231325cf1a30Sjl139090 fsrc1 %f16, %f48 231425cf1a30Sjl139090 ldd [SRC + 0x20], %f24 231525cf1a30Sjl139090 fsrc1 %f18, %f50 231625cf1a30Sjl139090 ldd [SRC + 0x28], %f26 231725cf1a30Sjl139090 fsrc1 %f20, %f52 231825cf1a30Sjl139090 ldd [SRC + 0x30], %f28 231925cf1a30Sjl139090 fsrc1 %f22, %f54 232025cf1a30Sjl139090 ldd [SRC + 0x38], %f30 232125cf1a30Sjl139090 fsrc1 %f24, %f56 232225cf1a30Sjl139090 sub CNT, VIS_BLOCKSIZE, CNT 232325cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 232425cf1a30Sjl139090 add SRC, VIS_BLOCKSIZE, SRC 232525cf1a30Sjl139090 add REALSRC, VIS_BLOCKSIZE, REALSRC 232625cf1a30Sjl139090 fsrc1 %f26, %f58 232725cf1a30Sjl139090 fsrc1 %f28, %f60 232825cf1a30Sjl139090 fsrc1 %f30, %f62 232925cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_AIUS 233025cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 233125cf1a30Sjl139090 ba,a,pt %ncc, 4f 233225cf1a30Sjl139090 nop 233325cf1a30Sjl139090 233425cf1a30Sjl1390903: tst CNT 233525cf1a30Sjl139090 bz,a %ncc, 4f 233625cf1a30Sjl139090 nop 233725cf1a30Sjl139090 233825cf1a30Sjl1390905: ldub [REALSRC], TMP 233925cf1a30Sjl139090 inc REALSRC 234025cf1a30Sjl139090 inc DST 234125cf1a30Sjl139090 deccc CNT 234225cf1a30Sjl139090 bgu %ncc, 5b 234325cf1a30Sjl139090 stba 
TMP, [DST - 1]%asi 234425cf1a30Sjl1390904: 234525cf1a30Sjl139090 234625cf1a30Sjl139090.copyout_exit: 234725cf1a30Sjl139090 membar #Sync 234825cf1a30Sjl139090 234925cf1a30Sjl139090 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 235025cf1a30Sjl139090 wr %o2, 0, %gsr ! restore gsr 235125cf1a30Sjl139090 235225cf1a30Sjl139090 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 235325cf1a30Sjl139090 btst FPRS_FEF, %o3 235425cf1a30Sjl139090 bz,pt %icc, 4f 235525cf1a30Sjl139090 nop 235625cf1a30Sjl139090 235725cf1a30Sjl139090 BLD_FPQ2Q4_FROMSTACK(%o2) 235825cf1a30Sjl139090 235925cf1a30Sjl139090 ba,pt %ncc, 1f 236025cf1a30Sjl139090 wr %o3, 0, %fprs ! restore fprs 236125cf1a30Sjl139090 236225cf1a30Sjl1390904: 236325cf1a30Sjl139090 FZEROQ2Q4 236425cf1a30Sjl139090 wr %o3, 0, %fprs ! restore fprs 236525cf1a30Sjl139090 236625cf1a30Sjl1390901: 236725cf1a30Sjl139090 membar #Sync 236825cf1a30Sjl139090 andn %l6, FPUSED_FLAG, %l6 236925cf1a30Sjl139090 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 237025cf1a30Sjl139090 FP_ALLOWMIGRATE(5, 6) 237125cf1a30Sjl139090 ret 237225cf1a30Sjl139090 restore %g0, 0, %o0 237325cf1a30Sjl139090 237425cf1a30Sjl139090/* 237525cf1a30Sjl139090 * We got here because of a fault during copyout. 237625cf1a30Sjl139090 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 237725cf1a30Sjl139090 */ 237825cf1a30Sjl139090.copyout_err: 237925cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 238025cf1a30Sjl139090 tst %o4 238125cf1a30Sjl139090 bz,pt %ncc, 2f ! if not, return error 238225cf1a30Sjl139090 nop 238325cf1a30Sjl139090 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with 238425cf1a30Sjl139090 jmp %g2 ! original arguments 238525cf1a30Sjl139090 restore %g0, 0, %g0 ! dispose of copy window 238625cf1a30Sjl1390902: 238725cf1a30Sjl139090 ret 238825cf1a30Sjl139090 restore %g0, -1, %o0 ! 
return error value 238925cf1a30Sjl139090 239025cf1a30Sjl139090 239125cf1a30Sjl139090 SET_SIZE(copyout_more) 239225cf1a30Sjl139090 239325cf1a30Sjl139090#endif /* lint */ 239425cf1a30Sjl139090 239525cf1a30Sjl139090 239625cf1a30Sjl139090#ifdef lint 239725cf1a30Sjl139090 239825cf1a30Sjl139090/*ARGSUSED*/ 239925cf1a30Sjl139090int 240025cf1a30Sjl139090xcopyout(const void *kaddr, void *uaddr, size_t count) 240125cf1a30Sjl139090{ return (0); } 240225cf1a30Sjl139090 240325cf1a30Sjl139090#else /* lint */ 240425cf1a30Sjl139090 240525cf1a30Sjl139090 ENTRY(xcopyout) 240625cf1a30Sjl139090 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 240725cf1a30Sjl139090 bleu,pt %ncc, .xcopyout_small ! go to larger cases 240825cf1a30Sjl139090 xor %o0, %o1, %o3 ! are src, dst alignable? 240925cf1a30Sjl139090 btst 7, %o3 ! 241025cf1a30Sjl139090 bz,pt %ncc, .xcopyout_8 ! 241125cf1a30Sjl139090 nop 241225cf1a30Sjl139090 btst 1, %o3 ! 241325cf1a30Sjl139090 bz,pt %ncc, .xcopyout_2 ! check for half-word 241425cf1a30Sjl139090 nop 241525cf1a30Sjl139090 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 241625cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_1)], %o3 241725cf1a30Sjl139090 tst %o3 241825cf1a30Sjl139090 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 241925cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 242025cf1a30Sjl139090 bleu,pt %ncc, .xcopyout_small ! go to small copy 242125cf1a30Sjl139090 nop 242225cf1a30Sjl139090 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 242325cf1a30Sjl139090 nop 242425cf1a30Sjl139090.xcopyout_2: 242525cf1a30Sjl139090 btst 3, %o3 ! 242625cf1a30Sjl139090 bz,pt %ncc, .xcopyout_4 ! check for word alignment 242725cf1a30Sjl139090 nop 242825cf1a30Sjl139090 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 242925cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_2)], %o3 243025cf1a30Sjl139090 tst %o3 243125cf1a30Sjl139090 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 243225cf1a30Sjl139090 cmp %o2, %o3 ! 
if length <= limit 243325cf1a30Sjl139090 bleu,pt %ncc, .xcopyout_small ! go to small copy 243425cf1a30Sjl139090 nop 243525cf1a30Sjl139090 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 243625cf1a30Sjl139090 nop 243725cf1a30Sjl139090.xcopyout_4: 243825cf1a30Sjl139090 ! already checked longword, must be word aligned 243925cf1a30Sjl139090 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 244025cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_4)], %o3 244125cf1a30Sjl139090 tst %o3 244225cf1a30Sjl139090 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 244325cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 244425cf1a30Sjl139090 bleu,pt %ncc, .xcopyout_small ! go to small copy 244525cf1a30Sjl139090 nop 244625cf1a30Sjl139090 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 244725cf1a30Sjl139090 nop 244825cf1a30Sjl139090.xcopyout_8: 244925cf1a30Sjl139090 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 245025cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_8)], %o3 245125cf1a30Sjl139090 tst %o3 245225cf1a30Sjl139090 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 245325cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 245425cf1a30Sjl139090 bleu,pt %ncc, .xcopyout_small ! go to small copy 245525cf1a30Sjl139090 nop 245625cf1a30Sjl139090 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 245725cf1a30Sjl139090 nop 245825cf1a30Sjl139090 245925cf1a30Sjl139090.xcopyout_small: 246025cf1a30Sjl139090 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault 246125cf1a30Sjl139090 or %o5, %lo(.sm_xcopyout_err), %o5 246225cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 246325cf1a30Sjl139090 membar #Sync ! sync error barrier 246425cf1a30Sjl139090 ba,pt %ncc, .sm_do_copyout ! common code 246525cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! 
set t_lofault 246625cf1a30Sjl139090 246725cf1a30Sjl139090.xcopyout_more: 246825cf1a30Sjl139090 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 246925cf1a30Sjl139090 sethi %hi(.xcopyout_err), REAL_LOFAULT 247025cf1a30Sjl139090 ba,pt %ncc, .do_copyout ! common code 247125cf1a30Sjl139090 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 247225cf1a30Sjl139090 247325cf1a30Sjl139090/* 247425cf1a30Sjl139090 * We got here because of fault during xcopyout 247525cf1a30Sjl139090 * Errno value is in ERRNO 247625cf1a30Sjl139090 */ 247725cf1a30Sjl139090.xcopyout_err: 247825cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 247925cf1a30Sjl139090 tst %o4 248025cf1a30Sjl139090 bz,pt %ncc, 2f ! if not, return error 248125cf1a30Sjl139090 nop 248225cf1a30Sjl139090 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with 248325cf1a30Sjl139090 jmp %g2 ! original arguments 248425cf1a30Sjl139090 restore %g0, 0, %g0 ! dispose of copy window 248525cf1a30Sjl1390902: 248625cf1a30Sjl139090 ret 248725cf1a30Sjl139090 restore ERRNO, 0, %o0 ! return errno value 248825cf1a30Sjl139090 248925cf1a30Sjl139090.sm_xcopyout_err: 249025cf1a30Sjl139090 249125cf1a30Sjl139090 membar #Sync 249225cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 249325cf1a30Sjl139090 mov SM_SAVE_SRC, %o0 249425cf1a30Sjl139090 mov SM_SAVE_DST, %o1 249525cf1a30Sjl139090 mov SM_SAVE_COUNT, %o2 249625cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 249725cf1a30Sjl139090 tst %o3 249825cf1a30Sjl139090 bz,pt %ncc, 3f ! if not, return error 249925cf1a30Sjl139090 nop 250025cf1a30Sjl139090 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with 250125cf1a30Sjl139090 jmp %o5 ! original arguments 250225cf1a30Sjl139090 nop 250325cf1a30Sjl1390903: 250425cf1a30Sjl139090 retl 250525cf1a30Sjl139090 or %g1, 0, %o0 ! 
return errno value 250625cf1a30Sjl139090 250725cf1a30Sjl139090 SET_SIZE(xcopyout) 250825cf1a30Sjl139090 250925cf1a30Sjl139090#endif /* lint */ 251025cf1a30Sjl139090 251125cf1a30Sjl139090#ifdef lint 251225cf1a30Sjl139090 251325cf1a30Sjl139090/*ARGSUSED*/ 251425cf1a30Sjl139090int 251525cf1a30Sjl139090xcopyout_little(const void *kaddr, void *uaddr, size_t count) 251625cf1a30Sjl139090{ return (0); } 251725cf1a30Sjl139090 251825cf1a30Sjl139090#else /* lint */ 251925cf1a30Sjl139090 252025cf1a30Sjl139090 ENTRY(xcopyout_little) 252125cf1a30Sjl139090 sethi %hi(.xcopyio_err), %o5 252225cf1a30Sjl139090 or %o5, %lo(.xcopyio_err), %o5 252325cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 252425cf1a30Sjl139090 membar #Sync ! sync error barrier 252525cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] 252625cf1a30Sjl139090 mov %o4, %o5 252725cf1a30Sjl139090 252825cf1a30Sjl139090 subcc %g0, %o2, %o3 252925cf1a30Sjl139090 add %o0, %o2, %o0 253025cf1a30Sjl139090 bz,pn %ncc, 2f ! check for zero bytes 253125cf1a30Sjl139090 sub %o2, 1, %o4 253225cf1a30Sjl139090 add %o0, %o4, %o0 ! start w/last byte 253325cf1a30Sjl139090 add %o1, %o2, %o1 253425cf1a30Sjl139090 ldub [%o0 + %o3], %o4 253525cf1a30Sjl139090 253625cf1a30Sjl1390901: stba %o4, [%o1 + %o3]ASI_AIUSL 253725cf1a30Sjl139090 inccc %o3 253825cf1a30Sjl139090 sub %o0, 2, %o0 ! get next byte 253925cf1a30Sjl139090 bcc,a,pt %ncc, 1b 254025cf1a30Sjl139090 ldub [%o0 + %o3], %o4 254125cf1a30Sjl139090 254225cf1a30Sjl1390902: 254325cf1a30Sjl139090 membar #Sync ! sync error barrier 254425cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 254525cf1a30Sjl139090 retl 254625cf1a30Sjl139090 mov %g0, %o0 ! 
return (0) 254725cf1a30Sjl139090 254825cf1a30Sjl139090 SET_SIZE(xcopyout_little) 254925cf1a30Sjl139090 255025cf1a30Sjl139090#endif /* lint */ 255125cf1a30Sjl139090 255225cf1a30Sjl139090/* 255325cf1a30Sjl139090 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 255425cf1a30Sjl139090 */ 255525cf1a30Sjl139090 255625cf1a30Sjl139090#if defined(lint) 255725cf1a30Sjl139090 255825cf1a30Sjl139090/*ARGSUSED*/ 255925cf1a30Sjl139090int 256025cf1a30Sjl139090copyin(const void *uaddr, void *kaddr, size_t count) 256125cf1a30Sjl139090{ return (0); } 256225cf1a30Sjl139090 256325cf1a30Sjl139090#else /* lint */ 256425cf1a30Sjl139090 256525cf1a30Sjl139090 ENTRY(copyin) 256625cf1a30Sjl139090 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 256725cf1a30Sjl139090 bleu,pt %ncc, .copyin_small ! go to larger cases 256825cf1a30Sjl139090 xor %o0, %o1, %o3 ! are src, dst alignable? 256925cf1a30Sjl139090 btst 7, %o3 ! 257025cf1a30Sjl139090 bz,pt %ncc, .copyin_8 ! check for longword alignment 257125cf1a30Sjl139090 nop 257225cf1a30Sjl139090 btst 1, %o3 ! 257325cf1a30Sjl139090 bz,pt %ncc, .copyin_2 ! check for half-word 257425cf1a30Sjl139090 nop 257525cf1a30Sjl139090 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 257625cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_1)], %o3 257725cf1a30Sjl139090 tst %o3 257825cf1a30Sjl139090 bz,pn %icc, .copyin_small ! if zero, disable HW copy 257925cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 258025cf1a30Sjl139090 bleu,pt %ncc, .copyin_small ! go to small copy 258125cf1a30Sjl139090 nop 258225cf1a30Sjl139090 ba,pt %ncc, .copyin_more ! otherwise go to large copy 258325cf1a30Sjl139090 nop 258425cf1a30Sjl139090.copyin_2: 258525cf1a30Sjl139090 btst 3, %o3 ! 258625cf1a30Sjl139090 bz,pt %ncc, .copyin_4 ! check for word alignment 258725cf1a30Sjl139090 nop 258825cf1a30Sjl139090 sethi %hi(hw_copy_limit_2), %o3 ! 
Check copy limit 258925cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_2)], %o3 259025cf1a30Sjl139090 tst %o3 259125cf1a30Sjl139090 bz,pn %icc, .copyin_small ! if zero, disable HW copy 259225cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 259325cf1a30Sjl139090 bleu,pt %ncc, .copyin_small ! go to small copy 259425cf1a30Sjl139090 nop 259525cf1a30Sjl139090 ba,pt %ncc, .copyin_more ! otherwise go to large copy 259625cf1a30Sjl139090 nop 259725cf1a30Sjl139090.copyin_4: 259825cf1a30Sjl139090 ! already checked longword, must be word aligned 259925cf1a30Sjl139090 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 260025cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_4)], %o3 260125cf1a30Sjl139090 tst %o3 260225cf1a30Sjl139090 bz,pn %icc, .copyin_small ! if zero, disable HW copy 260325cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 260425cf1a30Sjl139090 bleu,pt %ncc, .copyin_small ! go to small copy 260525cf1a30Sjl139090 nop 260625cf1a30Sjl139090 ba,pt %ncc, .copyin_more ! otherwise go to large copy 260725cf1a30Sjl139090 nop 260825cf1a30Sjl139090.copyin_8: 260925cf1a30Sjl139090 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 261025cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_8)], %o3 261125cf1a30Sjl139090 tst %o3 261225cf1a30Sjl139090 bz,pn %icc, .copyin_small ! if zero, disable HW copy 261325cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 261425cf1a30Sjl139090 bleu,pt %ncc, .copyin_small ! go to small copy 261525cf1a30Sjl139090 nop 261625cf1a30Sjl139090 ba,pt %ncc, .copyin_more ! otherwise go to large copy 261725cf1a30Sjl139090 nop 261825cf1a30Sjl139090 261925cf1a30Sjl139090 .align 16 262025cf1a30Sjl139090 nop ! instruction alignment 262125cf1a30Sjl139090 ! see discussion at start of file 262225cf1a30Sjl139090.copyin_small: 262325cf1a30Sjl139090 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault 262425cf1a30Sjl139090 or %o5, %lo(.sm_copyin_err), %o5 262525cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 ! 
set/save t_lofault, no tramp 262625cf1a30Sjl139090 membar #Sync ! sync error barrier 262725cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] 262825cf1a30Sjl139090.sm_do_copyin: 262925cf1a30Sjl139090 mov %o0, SM_SAVE_SRC 263025cf1a30Sjl139090 mov %o1, SM_SAVE_DST 263125cf1a30Sjl139090 cmp %o2, SHORTCOPY ! check for really short case 263225cf1a30Sjl139090 bleu,pt %ncc, .ci_sm_left ! 263325cf1a30Sjl139090 mov %o2, SM_SAVE_COUNT 263425cf1a30Sjl139090 cmp %o2, CHKSIZE ! check for medium length cases 263525cf1a30Sjl139090 bgu,pn %ncc, .ci_med ! 263625cf1a30Sjl139090 or %o0, %o1, %o3 ! prepare alignment check 263725cf1a30Sjl139090 andcc %o3, 0x3, %g0 ! test for alignment 263825cf1a30Sjl139090 bz,pt %ncc, .ci_sm_word ! branch to word aligned case 263925cf1a30Sjl139090.ci_sm_movebytes: 264025cf1a30Sjl139090 sub %o2, 3, %o2 ! adjust count to allow cc zero test 264125cf1a30Sjl139090.ci_sm_notalign4: 264225cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! read byte 264325cf1a30Sjl139090 subcc %o2, 4, %o2 ! reduce count by 4 264425cf1a30Sjl139090 stb %o3, [%o1] ! write byte 264525cf1a30Sjl139090 add %o0, 1, %o0 ! advance SRC by 1 264625cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes 264725cf1a30Sjl139090 add %o0, 1, %o0 ! advance SRC by 1 264825cf1a30Sjl139090 stb %o3, [%o1 + 1] 264925cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 265025cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 265125cf1a30Sjl139090 add %o0, 1, %o0 ! advance SRC by 1 265225cf1a30Sjl139090 stb %o3, [%o1 - 2] 265325cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 265425cf1a30Sjl139090 add %o0, 1, %o0 ! advance SRC by 1 265525cf1a30Sjl139090 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain 265625cf1a30Sjl139090 stb %o3, [%o1 - 1] 265725cf1a30Sjl139090 add %o2, 3, %o2 ! restore count 265825cf1a30Sjl139090.ci_sm_left: 265925cf1a30Sjl139090 tst %o2 266025cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 266125cf1a30Sjl139090 nop 266225cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! 
load one byte 266325cf1a30Sjl139090 deccc %o2 ! reduce count for cc test 266425cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 266525cf1a30Sjl139090 stb %o3,[%o1] ! store one byte 266625cf1a30Sjl139090 inc %o0 266725cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load second byte 266825cf1a30Sjl139090 deccc %o2 266925cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 267025cf1a30Sjl139090 stb %o3,[%o1 + 1] ! store second byte 267125cf1a30Sjl139090 inc %o0 267225cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load third byte 267325cf1a30Sjl139090 stb %o3,[%o1 + 2] ! store third byte 267425cf1a30Sjl139090 membar #Sync ! sync error barrier 267525cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 267625cf1a30Sjl139090 retl 267725cf1a30Sjl139090 mov %g0, %o0 ! return 0 267825cf1a30Sjl139090 .align 16 267925cf1a30Sjl139090.ci_sm_words: 268025cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 ! read word 268125cf1a30Sjl139090.ci_sm_wordx: 268225cf1a30Sjl139090 subcc %o2, 8, %o2 ! update count 268325cf1a30Sjl139090 stw %o3, [%o1] ! write word 268425cf1a30Sjl139090 add %o0, 4, %o0 ! update SRC 268525cf1a30Sjl139090 add %o1, 8, %o1 ! update DST 268625cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 ! read word 268725cf1a30Sjl139090 add %o0, 4, %o0 ! update SRC 268825cf1a30Sjl139090 bgt,pt %ncc, .ci_sm_words ! loop til done 268925cf1a30Sjl139090 stw %o3, [%o1 - 4] ! write word 269025cf1a30Sjl139090 addcc %o2, 7, %o2 ! restore count 269125cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 269225cf1a30Sjl139090 nop 269325cf1a30Sjl139090 deccc %o2 269425cf1a30Sjl139090 bz,pt %ncc, .ci_sm_byte 269525cf1a30Sjl139090.ci_sm_half: 269625cf1a30Sjl139090 subcc %o2, 2, %o2 ! reduce count by 2 269725cf1a30Sjl139090 lduha [%o0]ASI_USER, %o3 ! read half word 269825cf1a30Sjl139090 add %o0, 2, %o0 ! advance SRC by 2 269925cf1a30Sjl139090 add %o1, 2, %o1 ! advance DST by 2 270025cf1a30Sjl139090 bgt,pt %ncc, .ci_sm_half ! loop til done 270125cf1a30Sjl139090 sth %o3, [%o1 - 2] ! 
write half word 270225cf1a30Sjl139090 addcc %o2, 1, %o2 ! restore count 270325cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 270425cf1a30Sjl139090 nop 270525cf1a30Sjl139090.ci_sm_byte: 270625cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 270725cf1a30Sjl139090 stb %o3, [%o1] 270825cf1a30Sjl139090 membar #Sync ! sync error barrier 270925cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 271025cf1a30Sjl139090 retl 271125cf1a30Sjl139090 mov %g0, %o0 ! return 0 271225cf1a30Sjl139090 .align 16 271325cf1a30Sjl139090.ci_sm_word: 271425cf1a30Sjl139090 subcc %o2, 4, %o2 ! update count 271525cf1a30Sjl139090 bgt,pt %ncc, .ci_sm_wordx 271625cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 ! read word 271725cf1a30Sjl139090 addcc %o2, 3, %o2 ! restore count 271825cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 271925cf1a30Sjl139090 stw %o3, [%o1] ! write word 272025cf1a30Sjl139090 deccc %o2 ! reduce count for cc test 272125cf1a30Sjl139090 add %o0, 4, %o0 272225cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load one byte 272325cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 272425cf1a30Sjl139090 stb %o3, [%o1 + 4] ! store one byte 272525cf1a30Sjl139090 inc %o0 272625cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load second byte 272725cf1a30Sjl139090 deccc %o2 272825cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 272925cf1a30Sjl139090 stb %o3, [%o1 + 5] ! store second byte 273025cf1a30Sjl139090 inc %o0 273125cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load third byte 273225cf1a30Sjl139090 stb %o3, [%o1 + 6] ! store third byte 273325cf1a30Sjl139090.ci_sm_exit: 273425cf1a30Sjl139090 membar #Sync ! sync error barrier 273525cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 273625cf1a30Sjl139090 retl 273725cf1a30Sjl139090 mov %g0, %o0 ! return 0 273825cf1a30Sjl139090 273925cf1a30Sjl139090 .align 16 274025cf1a30Sjl139090.ci_med: 274125cf1a30Sjl139090 xor %o0, %o1, %o3 ! setup alignment check 274225cf1a30Sjl139090 btst 1, %o3 274325cf1a30Sjl139090 bnz,pt %ncc, .ci_sm_movebytes ! 
unaligned 274425cf1a30Sjl139090 nop 274525cf1a30Sjl139090 btst 3, %o3 274625cf1a30Sjl139090 bnz,pt %ncc, .ci_med_half ! halfword aligned 274725cf1a30Sjl139090 nop 274825cf1a30Sjl139090 btst 7, %o3 274925cf1a30Sjl139090 bnz,pt %ncc, .ci_med_word ! word aligned 275025cf1a30Sjl139090 nop 275125cf1a30Sjl139090.ci_med_long: 275225cf1a30Sjl139090 btst 3, %o0 ! check for 275325cf1a30Sjl139090 bz,pt %ncc, .ci_med_long1 ! word alignment 275425cf1a30Sjl139090 nop 275525cf1a30Sjl139090.ci_med_long0: 275625cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load one byte 275725cf1a30Sjl139090 inc %o0 275825cf1a30Sjl139090 stb %o3,[%o1] ! store byte 275925cf1a30Sjl139090 inc %o1 276025cf1a30Sjl139090 btst 3, %o0 276125cf1a30Sjl139090 bnz,pt %ncc, .ci_med_long0 276225cf1a30Sjl139090 dec %o2 276325cf1a30Sjl139090.ci_med_long1: ! word aligned 276425cf1a30Sjl139090 btst 7, %o0 ! check for long word 276525cf1a30Sjl139090 bz,pt %ncc, .ci_med_long2 276625cf1a30Sjl139090 nop 276725cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 ! load word 276825cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 276925cf1a30Sjl139090 stw %o3, [%o1] ! store word 277025cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 277125cf1a30Sjl139090 sub %o2, 4, %o2 ! reduce count by 4 277225cf1a30Sjl139090! 277325cf1a30Sjl139090! Now long word aligned and have at least 32 bytes to move 277425cf1a30Sjl139090! 277525cf1a30Sjl139090.ci_med_long2: 277625cf1a30Sjl139090 sub %o2, 31, %o2 ! adjust count to allow cc zero test 277725cf1a30Sjl139090.ci_med_lmove: 277825cf1a30Sjl139090 ldxa [%o0]ASI_USER, %o3 ! read long word 277925cf1a30Sjl139090 subcc %o2, 32, %o2 ! reduce count by 32 278025cf1a30Sjl139090 stx %o3, [%o1] ! write long word 278125cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 278225cf1a30Sjl139090 ldxa [%o0]ASI_USER, %o3 ! repeat for a total for 4 long words 278325cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 278425cf1a30Sjl139090 stx %o3, [%o1 + 8] 278525cf1a30Sjl139090 add %o1, 32, %o1 ! 
advance DST by 32 278625cf1a30Sjl139090 ldxa [%o0]ASI_USER, %o3 278725cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 278825cf1a30Sjl139090 stx %o3, [%o1 - 16] 278925cf1a30Sjl139090 ldxa [%o0]ASI_USER, %o3 279025cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 279125cf1a30Sjl139090 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left 279225cf1a30Sjl139090 stx %o3, [%o1 - 8] 279325cf1a30Sjl139090 addcc %o2, 24, %o2 ! restore count to long word offset 279425cf1a30Sjl139090 ble,pt %ncc, .ci_med_lextra ! check for more long words to move 279525cf1a30Sjl139090 nop 279625cf1a30Sjl139090.ci_med_lword: 279725cf1a30Sjl139090 ldxa [%o0]ASI_USER, %o3 ! read long word 279825cf1a30Sjl139090 subcc %o2, 8, %o2 ! reduce count by 8 279925cf1a30Sjl139090 stx %o3, [%o1] ! write long word 280025cf1a30Sjl139090 add %o0, 8, %o0 ! advance SRC by 8 280125cf1a30Sjl139090 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left 280225cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 280325cf1a30Sjl139090.ci_med_lextra: 280425cf1a30Sjl139090 addcc %o2, 7, %o2 ! restore rest of count 280525cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit ! if zero, then done 280625cf1a30Sjl139090 deccc %o2 280725cf1a30Sjl139090 bz,pt %ncc, .ci_sm_byte 280825cf1a30Sjl139090 nop 280925cf1a30Sjl139090 ba,pt %ncc, .ci_sm_half 281025cf1a30Sjl139090 nop 281125cf1a30Sjl139090 281225cf1a30Sjl139090 .align 16 281325cf1a30Sjl139090 nop ! instruction alignment 281425cf1a30Sjl139090 ! see discussion at start of file 281525cf1a30Sjl139090.ci_med_word: 281625cf1a30Sjl139090 btst 3, %o0 ! check for 281725cf1a30Sjl139090 bz,pt %ncc, .ci_med_word1 ! word alignment 281825cf1a30Sjl139090 nop 281925cf1a30Sjl139090.ci_med_word0: 282025cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load one byte 282125cf1a30Sjl139090 inc %o0 282225cf1a30Sjl139090 stb %o3,[%o1] ! 
store byte 282325cf1a30Sjl139090 inc %o1 282425cf1a30Sjl139090 btst 3, %o0 282525cf1a30Sjl139090 bnz,pt %ncc, .ci_med_word0 282625cf1a30Sjl139090 dec %o2 282725cf1a30Sjl139090! 282825cf1a30Sjl139090! Now word aligned and have at least 36 bytes to move 282925cf1a30Sjl139090! 283025cf1a30Sjl139090.ci_med_word1: 283125cf1a30Sjl139090 sub %o2, 15, %o2 ! adjust count to allow cc zero test 283225cf1a30Sjl139090.ci_med_wmove: 283325cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 ! read word 283425cf1a30Sjl139090 subcc %o2, 16, %o2 ! reduce count by 16 283525cf1a30Sjl139090 stw %o3, [%o1] ! write word 283625cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 283725cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 ! repeat for a total for 4 words 283825cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 283925cf1a30Sjl139090 stw %o3, [%o1 + 4] 284025cf1a30Sjl139090 add %o1, 16, %o1 ! advance DST by 16 284125cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 284225cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 284325cf1a30Sjl139090 stw %o3, [%o1 - 8] 284425cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 284525cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 284625cf1a30Sjl139090 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left 284725cf1a30Sjl139090 stw %o3, [%o1 - 4] 284825cf1a30Sjl139090 addcc %o2, 12, %o2 ! restore count to word offset 284925cf1a30Sjl139090 ble,pt %ncc, .ci_med_wextra ! check for more words to move 285025cf1a30Sjl139090 nop 285125cf1a30Sjl139090.ci_med_word2: 285225cf1a30Sjl139090 lduwa [%o0]ASI_USER, %o3 ! read word 285325cf1a30Sjl139090 subcc %o2, 4, %o2 ! reduce count by 4 285425cf1a30Sjl139090 stw %o3, [%o1] ! write word 285525cf1a30Sjl139090 add %o0, 4, %o0 ! advance SRC by 4 285625cf1a30Sjl139090 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left 285725cf1a30Sjl139090 add %o1, 4, %o1 ! advance DST by 4 285825cf1a30Sjl139090.ci_med_wextra: 285925cf1a30Sjl139090 addcc %o2, 3, %o2 ! restore rest of count 286025cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit ! 
if zero, then done 286125cf1a30Sjl139090 deccc %o2 286225cf1a30Sjl139090 bz,pt %ncc, .ci_sm_byte 286325cf1a30Sjl139090 nop 286425cf1a30Sjl139090 ba,pt %ncc, .ci_sm_half 286525cf1a30Sjl139090 nop 286625cf1a30Sjl139090 286725cf1a30Sjl139090 .align 16 286825cf1a30Sjl139090 nop ! instruction alignment 286925cf1a30Sjl139090 ! see discussion at start of file 287025cf1a30Sjl139090.ci_med_half: 287125cf1a30Sjl139090 btst 1, %o0 ! check for 287225cf1a30Sjl139090 bz,pt %ncc, .ci_med_half1 ! half word alignment 287325cf1a30Sjl139090 nop 287425cf1a30Sjl139090 lduba [%o0]ASI_USER, %o3 ! load one byte 287525cf1a30Sjl139090 inc %o0 287625cf1a30Sjl139090 stb %o3,[%o1] ! store byte 287725cf1a30Sjl139090 inc %o1 287825cf1a30Sjl139090 dec %o2 287925cf1a30Sjl139090! 288025cf1a30Sjl139090! Now half word aligned and have at least 38 bytes to move 288125cf1a30Sjl139090! 288225cf1a30Sjl139090.ci_med_half1: 288325cf1a30Sjl139090 sub %o2, 7, %o2 ! adjust count to allow cc zero test 288425cf1a30Sjl139090.ci_med_hmove: 288525cf1a30Sjl139090 lduha [%o0]ASI_USER, %o3 ! read half word 288625cf1a30Sjl139090 subcc %o2, 8, %o2 ! reduce count by 8 288725cf1a30Sjl139090 sth %o3, [%o1] ! write half word 288825cf1a30Sjl139090 add %o0, 2, %o0 ! advance SRC by 2 288925cf1a30Sjl139090 lduha [%o0]ASI_USER, %o3 ! repeat for a total for 4 halfwords 289025cf1a30Sjl139090 add %o0, 2, %o0 ! advance SRC by 2 289125cf1a30Sjl139090 sth %o3, [%o1 + 2] 289225cf1a30Sjl139090 add %o1, 8, %o1 ! advance DST by 8 289325cf1a30Sjl139090 lduha [%o0]ASI_USER, %o3 289425cf1a30Sjl139090 add %o0, 2, %o0 ! advance SRC by 2 289525cf1a30Sjl139090 sth %o3, [%o1 - 4] 289625cf1a30Sjl139090 lduha [%o0]ASI_USER, %o3 289725cf1a30Sjl139090 add %o0, 2, %o0 ! advance SRC by 2 289825cf1a30Sjl139090 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left 289925cf1a30Sjl139090 sth %o3, [%o1 - 2] 290025cf1a30Sjl139090 addcc %o2, 7, %o2 ! 
restore count 290125cf1a30Sjl139090 bz,pt %ncc, .ci_sm_exit 290225cf1a30Sjl139090 deccc %o2 290325cf1a30Sjl139090 bz,pt %ncc, .ci_sm_byte 290425cf1a30Sjl139090 nop 290525cf1a30Sjl139090 ba,pt %ncc, .ci_sm_half 290625cf1a30Sjl139090 nop 290725cf1a30Sjl139090 290825cf1a30Sjl139090.sm_copyin_err: 290925cf1a30Sjl139090 membar #Sync 291025cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 291125cf1a30Sjl139090 mov SM_SAVE_SRC, %o0 291225cf1a30Sjl139090 mov SM_SAVE_DST, %o1 291325cf1a30Sjl139090 mov SM_SAVE_COUNT, %o2 291425cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 291525cf1a30Sjl139090 tst %o3 291625cf1a30Sjl139090 bz,pt %ncc, 3f ! if not, return error 291725cf1a30Sjl139090 nop 291825cf1a30Sjl139090 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with 291925cf1a30Sjl139090 jmp %o5 ! original arguments 292025cf1a30Sjl139090 nop 292125cf1a30Sjl1390903: 292225cf1a30Sjl139090 retl 292325cf1a30Sjl139090 or %g0, -1, %o0 ! return errno value 292425cf1a30Sjl139090 292525cf1a30Sjl139090 SET_SIZE(copyin) 292625cf1a30Sjl139090 292725cf1a30Sjl139090 292825cf1a30Sjl139090/* 292925cf1a30Sjl139090 * The _more entry points are not intended to be used directly by 293025cf1a30Sjl139090 * any caller from outside this file. They are provided to allow 293125cf1a30Sjl139090 * profiling and dtrace of the portions of the copy code that uses 293225cf1a30Sjl139090 * the floating point registers. 293325cf1a30Sjl139090 * This entry is particularly important as DTRACE (at least as of 293425cf1a30Sjl139090 * 4/2004) does not support leaf functions. 
293525cf1a30Sjl139090 */ 293625cf1a30Sjl139090 /* copyin_more: FP-register (VIS block) path of copyin, entered via .copyin_more. Takes a register window, records .copyin_err in REAL_LOFAULT and falls into .do_copyin, which is shared with the .xcopyin_more and .copyin_noerr_more paths. .do_copyin saves the current t_lofault in %l6, installs copyio_fault, saves %fprs and %gsr on the stack, byte-copies until DST is VIS_BLOCKSIZE aligned, then moves VIS_BLOCKSIZE-byte blocks with faligndata and ASI_BLK_P stores. The success path (.copyin_exit) returns 0. */ 293725cf1a30Sjl139090 ENTRY(copyin_more) 293825cf1a30Sjl139090.copyin_more: 293925cf1a30Sjl139090 prefetch [%o0], #n_reads 294025cf1a30Sjl139090 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 294125cf1a30Sjl139090 set .copyin_err, REAL_LOFAULT 294225cf1a30Sjl139090 294325cf1a30Sjl139090/* 294425cf1a30Sjl139090 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes 294525cf1a30Sjl139090 */ 294625cf1a30Sjl139090.do_copyin: 294725cf1a30Sjl139090 set copyio_fault, %l7 ! copyio_fault is lofault val 294825cf1a30Sjl139090 294925cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 295025cf1a30Sjl139090 membar #Sync ! sync error barrier 295125cf1a30Sjl139090 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 295225cf1a30Sjl139090 295325cf1a30Sjl139090 mov %i0, SAVE_SRC 295425cf1a30Sjl139090 mov %i1, SAVE_DST 295525cf1a30Sjl139090 mov %i2, SAVE_COUNT 295625cf1a30Sjl139090 295725cf1a30Sjl139090 FP_NOMIGRATE(6, 7) 295825cf1a30Sjl139090 295925cf1a30Sjl139090 rd %fprs, %o2 ! check for unused fp 296025cf1a30Sjl139090 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 296125cf1a30Sjl139090 btst FPRS_FEF, %o2 296225cf1a30Sjl139090 bz,a,pt %icc, .do_blockcopyin 296325cf1a30Sjl139090 wr %g0, FPRS_FEF, %fprs 296425cf1a30Sjl139090 296525cf1a30Sjl139090 BST_FPQ2Q4_TOSTACK(%o2) 296625cf1a30Sjl139090 296725cf1a30Sjl139090.do_blockcopyin: 296825cf1a30Sjl139090 rd %gsr, %o2 296925cf1a30Sjl139090 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 297025cf1a30Sjl139090 or %l6, FPUSED_FLAG, %l6 297125cf1a30Sjl139090 297225cf1a30Sjl139090 andcc DST, VIS_BLOCKSIZE - 1, TMP 297325cf1a30Sjl139090 mov ASI_USER, %asi 297425cf1a30Sjl139090 bz,pt %ncc, 2f 297525cf1a30Sjl139090 neg TMP 297625cf1a30Sjl139090 add TMP, VIS_BLOCKSIZE, TMP 297725cf1a30Sjl139090 297825cf1a30Sjl139090 ! TMP = bytes required to align DST on FP_BLOCK boundary 297925cf1a30Sjl139090 ! 
Using SRC as a tmp here 298025cf1a30Sjl139090 cmp TMP, 3 298125cf1a30Sjl139090 bleu,pt %ncc, 1f 298225cf1a30Sjl139090 sub CNT,TMP,CNT ! adjust main count 298325cf1a30Sjl139090 sub TMP, 3, TMP ! adjust for end of loop test 298425cf1a30Sjl139090.ci_blkalign: 298525cf1a30Sjl139090 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration 298625cf1a30Sjl139090 stb SRC, [DST] 298725cf1a30Sjl139090 subcc TMP, 4, TMP 298825cf1a30Sjl139090 lduba [REALSRC + 1]%asi, SRC 298925cf1a30Sjl139090 add REALSRC, 4, REALSRC 299025cf1a30Sjl139090 stb SRC, [DST + 1] 299125cf1a30Sjl139090 lduba [REALSRC - 2]%asi, SRC 299225cf1a30Sjl139090 add DST, 4, DST 299325cf1a30Sjl139090 stb SRC, [DST - 2] 299425cf1a30Sjl139090 lduba [REALSRC - 1]%asi, SRC 299525cf1a30Sjl139090 bgu,pt %ncc, .ci_blkalign 299625cf1a30Sjl139090 stb SRC, [DST - 1] 299725cf1a30Sjl139090 299825cf1a30Sjl139090 addcc TMP, 3, TMP ! restore count adjustment 299925cf1a30Sjl139090 bz,pt %ncc, 2f ! no bytes left? 300025cf1a30Sjl139090 nop 300125cf1a30Sjl1390901: lduba [REALSRC]%asi, SRC 300225cf1a30Sjl139090 inc REALSRC 300325cf1a30Sjl139090 inc DST 300425cf1a30Sjl139090 deccc TMP 300525cf1a30Sjl139090 bgu %ncc, 1b 300625cf1a30Sjl139090 stb SRC, [DST - 1] 300725cf1a30Sjl139090 300825cf1a30Sjl1390902: 300925cf1a30Sjl139090 membar #StoreLoad 301025cf1a30Sjl139090 andn REALSRC, 0x7, SRC 301125cf1a30Sjl139090 301225cf1a30Sjl139090 ! SRC - 8-byte aligned 301325cf1a30Sjl139090 ! 
DST - 64-byte aligned 301425cf1a30Sjl139090 ldda [SRC]%asi, %f16 301525cf1a30Sjl139090 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads 301625cf1a30Sjl139090 alignaddr REALSRC, %g0, %g0 301725cf1a30Sjl139090 ldda [SRC + 0x08]%asi, %f18 301825cf1a30Sjl139090 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads 301925cf1a30Sjl139090 faligndata %f16, %f18, %f48 302025cf1a30Sjl139090 ldda [SRC + 0x10]%asi, %f20 3021c8a722abSpm145316 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads 302225cf1a30Sjl139090 faligndata %f18, %f20, %f50 302325cf1a30Sjl139090 ldda [SRC + 0x18]%asi, %f22 302425cf1a30Sjl139090 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read 302525cf1a30Sjl139090 faligndata %f20, %f22, %f52 302625cf1a30Sjl139090 ldda [SRC + 0x20]%asi, %f24 3027c8a722abSpm145316 prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read 302825cf1a30Sjl139090 faligndata %f22, %f24, %f54 302925cf1a30Sjl139090 ldda [SRC + 0x28]%asi, %f26 3030c8a722abSpm145316 prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read 303125cf1a30Sjl139090 faligndata %f24, %f26, %f56 303225cf1a30Sjl139090 ldda [SRC + 0x30]%asi, %f28 3033c8a722abSpm145316 prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read 303425cf1a30Sjl139090 faligndata %f26, %f28, %f58 303525cf1a30Sjl139090 ldda [SRC + 0x38]%asi, %f30 303625cf1a30Sjl139090 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 303725cf1a30Sjl139090 sub CNT, VIS_BLOCKSIZE, CNT 303825cf1a30Sjl139090 add SRC, VIS_BLOCKSIZE, SRC 3039c8a722abSpm145316 prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read 304025cf1a30Sjl139090 add REALSRC, VIS_BLOCKSIZE, REALSRC 304125cf1a30Sjl139090 ba,pt %ncc, 1f 3042c8a722abSpm145316 prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read 304325cf1a30Sjl139090 .align 32 /* main loop: each pass stores one VIS_BLOCKSIZE block and repeats while CNT > VIS_BLOCKSIZE + 8 */ 304425cf1a30Sjl1390901: 304525cf1a30Sjl139090 ldda [SRC + 0x08]%asi, %f18 304625cf1a30Sjl139090 faligndata %f28, %f30, %f60 304725cf1a30Sjl139090 ldda [SRC + 0x10]%asi, %f20 304825cf1a30Sjl139090 faligndata %f30, %f16, %f62 304925cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_P 
305025cf1a30Sjl139090 ldda [SRC + 0x18]%asi, %f22 305125cf1a30Sjl139090 faligndata %f16, %f18, %f48 305225cf1a30Sjl139090 ldda [SRC + 0x20]%asi, %f24 305325cf1a30Sjl139090 faligndata %f18, %f20, %f50 305425cf1a30Sjl139090 ldda [SRC + 0x28]%asi, %f26 305525cf1a30Sjl139090 faligndata %f20, %f22, %f52 305625cf1a30Sjl139090 ldda [SRC + 0x30]%asi, %f28 305725cf1a30Sjl139090 faligndata %f22, %f24, %f54 305825cf1a30Sjl139090 sub CNT, VIS_BLOCKSIZE, CNT 3059c8a722abSpm145316 ldda [SRC + 0x38]%asi, %f30 3060c8a722abSpm145316 faligndata %f24, %f26, %f56 306125cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 3062c8a722abSpm145316 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3063c8a722abSpm145316 faligndata %f26, %f28, %f58 306425cf1a30Sjl139090 add REALSRC, VIS_BLOCKSIZE, REALSRC 3065c8a722abSpm145316 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads 3066c8a722abSpm145316 add SRC, VIS_BLOCKSIZE, SRC 3067c8a722abSpm145316 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 306825cf1a30Sjl139090 cmp CNT, VIS_BLOCKSIZE + 8 306925cf1a30Sjl139090 bgu,pt %ncc, 1b 3070c8a722abSpm145316 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 307125cf1a30Sjl139090 307225cf1a30Sjl139090 ! 
only if REALSRC & 0x7 is 0 307325cf1a30Sjl139090 cmp CNT, VIS_BLOCKSIZE 307425cf1a30Sjl139090 bne %ncc, 3f 307525cf1a30Sjl139090 andcc REALSRC, 0x7, %g0 307625cf1a30Sjl139090 bz,pt %ncc, 2f 307725cf1a30Sjl139090 nop 307825cf1a30Sjl1390903: 307925cf1a30Sjl139090 faligndata %f28, %f30, %f60 308025cf1a30Sjl139090 faligndata %f30, %f16, %f62 308125cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_P 308225cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 308325cf1a30Sjl139090 ba,pt %ncc, 3f 308425cf1a30Sjl139090 nop 308525cf1a30Sjl1390902: 308625cf1a30Sjl139090 ldda [SRC + 0x08]%asi, %f18 308725cf1a30Sjl139090 fsrc1 %f28, %f60 308825cf1a30Sjl139090 ldda [SRC + 0x10]%asi, %f20 308925cf1a30Sjl139090 fsrc1 %f30, %f62 309025cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_P 309125cf1a30Sjl139090 ldda [SRC + 0x18]%asi, %f22 309225cf1a30Sjl139090 fsrc1 %f16, %f48 309325cf1a30Sjl139090 ldda [SRC + 0x20]%asi, %f24 309425cf1a30Sjl139090 fsrc1 %f18, %f50 309525cf1a30Sjl139090 ldda [SRC + 0x28]%asi, %f26 309625cf1a30Sjl139090 fsrc1 %f20, %f52 309725cf1a30Sjl139090 ldda [SRC + 0x30]%asi, %f28 309825cf1a30Sjl139090 fsrc1 %f22, %f54 309925cf1a30Sjl139090 ldda [SRC + 0x38]%asi, %f30 310025cf1a30Sjl139090 fsrc1 %f24, %f56 310125cf1a30Sjl139090 sub CNT, VIS_BLOCKSIZE, CNT 310225cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 310325cf1a30Sjl139090 add SRC, VIS_BLOCKSIZE, SRC 310425cf1a30Sjl139090 add REALSRC, VIS_BLOCKSIZE, REALSRC 310525cf1a30Sjl139090 fsrc1 %f26, %f58 310625cf1a30Sjl139090 fsrc1 %f28, %f60 310725cf1a30Sjl139090 fsrc1 %f30, %f62 310825cf1a30Sjl139090 stda %f48, [DST]ASI_BLK_P 310925cf1a30Sjl139090 add DST, VIS_BLOCKSIZE, DST 311025cf1a30Sjl139090 ba,a,pt %ncc, 4f 311125cf1a30Sjl139090 nop 311225cf1a30Sjl139090 311325cf1a30Sjl1390903: tst CNT 311425cf1a30Sjl139090 bz,a %ncc, 4f 311525cf1a30Sjl139090 nop 311625cf1a30Sjl139090 /* copy the remaining CNT bytes one at a time */ 311725cf1a30Sjl1390905: lduba [REALSRC]ASI_USER, TMP 311825cf1a30Sjl139090 inc REALSRC 311925cf1a30Sjl139090 inc DST 312025cf1a30Sjl139090 deccc CNT 312125cf1a30Sjl139090 bgu 
%ncc, 5b 312225cf1a30Sjl139090 stb TMP, [DST - 1] 312325cf1a30Sjl1390904: 312425cf1a30Sjl139090 /* success exit: restore %gsr and %fprs, store %l6 with FPUSED_FLAG cleared back to t_lofault, return 0 */ 312525cf1a30Sjl139090.copyin_exit: 312625cf1a30Sjl139090 membar #Sync 312725cf1a30Sjl139090 312825cf1a30Sjl139090 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 312925cf1a30Sjl139090 wr %o2, 0, %gsr 313025cf1a30Sjl139090 313125cf1a30Sjl139090 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 313225cf1a30Sjl139090 btst FPRS_FEF, %o3 313325cf1a30Sjl139090 bz,pt %icc, 4f 313425cf1a30Sjl139090 nop 313525cf1a30Sjl139090 313625cf1a30Sjl139090 BLD_FPQ2Q4_FROMSTACK(%o2) 313725cf1a30Sjl139090 313825cf1a30Sjl139090 ba,pt %ncc, 1f 313925cf1a30Sjl139090 wr %o3, 0, %fprs ! restore fprs 314025cf1a30Sjl139090 314125cf1a30Sjl1390904: 314225cf1a30Sjl139090 FZEROQ2Q4 314325cf1a30Sjl139090 wr %o3, 0, %fprs ! restore fprs 314425cf1a30Sjl139090 314525cf1a30Sjl1390901: 314625cf1a30Sjl139090 membar #Sync ! sync error barrier 314725cf1a30Sjl139090 andn %l6, FPUSED_FLAG, %l6 314825cf1a30Sjl139090 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 314925cf1a30Sjl139090 FP_ALLOWMIGRATE(5, 6) 315025cf1a30Sjl139090 ret 315125cf1a30Sjl139090 restore %g0, 0, %o0 315225cf1a30Sjl139090/* 315325cf1a30Sjl139090 * We got here because of a fault during copyin 315425cf1a30Sjl139090 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 315525cf1a30Sjl139090 */ 315625cf1a30Sjl139090.copyin_err: 315725cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 315825cf1a30Sjl139090 tst %o4 315925cf1a30Sjl139090 bz,pt %ncc, 2f ! if not, return error 316025cf1a30Sjl139090 nop 316125cf1a30Sjl139090 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with 316225cf1a30Sjl139090 jmp %g2 ! original arguments 316325cf1a30Sjl139090 restore %g0, 0, %g0 ! dispose of copy window 316425cf1a30Sjl1390902: 316525cf1a30Sjl139090 ret 316625cf1a30Sjl139090 restore %g0, -1, %o0 ! 
return error value 316725cf1a30Sjl139090 316825cf1a30Sjl139090 316925cf1a30Sjl139090 SET_SIZE(copyin_more) 317025cf1a30Sjl139090 317125cf1a30Sjl139090#endif /* lint */ 317225cf1a30Sjl139090 317325cf1a30Sjl139090#ifdef lint 317425cf1a30Sjl139090 317525cf1a30Sjl139090/*ARGSUSED*/ 317625cf1a30Sjl139090int 317725cf1a30Sjl139090xcopyin(const void *uaddr, void *kaddr, size_t count) 317825cf1a30Sjl139090{ return (0); } 317925cf1a30Sjl139090 318025cf1a30Sjl139090#else /* lint */ 318125cf1a30Sjl139090 /* xcopyin(uaddr, kaddr, count): picks the leaf (.xcopyin_small) or FP (.xcopyin_more) path from count, the relative alignment of the two addresses and the hw_copy_limit_N tunables, then joins the shared .sm_do_copyin / .do_copyin code. Returns 0 on success. On a fault with no t_copyops handler the small path returns %g1 and the FP path returns ERRNO; otherwise the CP_XCOPYIN handler is invoked with the original arguments. */ 318225cf1a30Sjl139090 ENTRY(xcopyin) 318325cf1a30Sjl139090 318425cf1a30Sjl139090 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 318525cf1a30Sjl139090 bleu,pt %ncc, .xcopyin_small ! leaf case if small, else larger cases 318625cf1a30Sjl139090 xor %o0, %o1, %o3 ! are src, dst alignable? 318725cf1a30Sjl139090 btst 7, %o3 ! 318825cf1a30Sjl139090 bz,pt %ncc, .xcopyin_8 ! check for longword alignment 318925cf1a30Sjl139090 nop 319025cf1a30Sjl139090 btst 1, %o3 ! 319125cf1a30Sjl139090 bz,pt %ncc, .xcopyin_2 ! check for half-word 319225cf1a30Sjl139090 nop 319325cf1a30Sjl139090 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 319425cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_1)], %o3 319525cf1a30Sjl139090 tst %o3 319625cf1a30Sjl139090 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 319725cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 319825cf1a30Sjl139090 bleu,pt %ncc, .xcopyin_small ! go to small copy 319925cf1a30Sjl139090 nop 320025cf1a30Sjl139090 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 320125cf1a30Sjl139090 nop 320225cf1a30Sjl139090.xcopyin_2: 320325cf1a30Sjl139090 btst 3, %o3 ! 320425cf1a30Sjl139090 bz,pt %ncc, .xcopyin_4 ! check for word alignment 320525cf1a30Sjl139090 nop 320625cf1a30Sjl139090 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 320725cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_2)], %o3 320825cf1a30Sjl139090 tst %o3 320925cf1a30Sjl139090 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 321025cf1a30Sjl139090 cmp %o2, %o3 ! 
if length <= limit 321125cf1a30Sjl139090 bleu,pt %ncc, .xcopyin_small ! go to small copy 321225cf1a30Sjl139090 nop 321325cf1a30Sjl139090 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 321425cf1a30Sjl139090 nop 321525cf1a30Sjl139090.xcopyin_4: 321625cf1a30Sjl139090 ! already checked longword, must be word aligned 321725cf1a30Sjl139090 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 321825cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_4)], %o3 321925cf1a30Sjl139090 tst %o3 322025cf1a30Sjl139090 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 322125cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 322225cf1a30Sjl139090 bleu,pt %ncc, .xcopyin_small ! go to small copy 322325cf1a30Sjl139090 nop 322425cf1a30Sjl139090 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 322525cf1a30Sjl139090 nop 322625cf1a30Sjl139090.xcopyin_8: 322725cf1a30Sjl139090 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 322825cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_8)], %o3 322925cf1a30Sjl139090 tst %o3 323025cf1a30Sjl139090 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 323125cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 323225cf1a30Sjl139090 bleu,pt %ncc, .xcopyin_small ! go to small copy 323325cf1a30Sjl139090 nop 323425cf1a30Sjl139090 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 323525cf1a30Sjl139090 nop 323625cf1a30Sjl139090 323725cf1a30Sjl139090.xcopyin_small: 323825cf1a30Sjl139090 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value 323925cf1a30Sjl139090 or %o5, %lo(.sm_xcopyin_err), %o5 324025cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault 324125cf1a30Sjl139090 membar #Sync ! sync error barrier 324225cf1a30Sjl139090 ba,pt %ncc, .sm_do_copyin ! common code 324325cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] 324425cf1a30Sjl139090 324525cf1a30Sjl139090.xcopyin_more: 324625cf1a30Sjl139090 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 324725cf1a30Sjl139090 sethi %hi(.xcopyin_err), REAL_LOFAULT ! 
.xcopyin_err is lofault value 324825cf1a30Sjl139090 ba,pt %ncc, .do_copyin 324925cf1a30Sjl139090 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 325025cf1a30Sjl139090 325125cf1a30Sjl139090/* 325225cf1a30Sjl139090 * We got here because of a fault during xcopyin 325325cf1a30Sjl139090 * Errno value is in ERRNO 325425cf1a30Sjl139090 */ 325525cf1a30Sjl139090.xcopyin_err: 325625cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 325725cf1a30Sjl139090 tst %o4 325825cf1a30Sjl139090 bz,pt %ncc, 2f ! if not, return error 325925cf1a30Sjl139090 nop 326025cf1a30Sjl139090 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with 326125cf1a30Sjl139090 jmp %g2 ! original arguments 326225cf1a30Sjl139090 restore %g0, 0, %g0 ! dispose of copy window 326325cf1a30Sjl1390902: 326425cf1a30Sjl139090 ret 326525cf1a30Sjl139090 restore ERRNO, 0, %o0 ! return errno value 326625cf1a30Sjl139090 326725cf1a30Sjl139090.sm_xcopyin_err: 326825cf1a30Sjl139090 326925cf1a30Sjl139090 membar #Sync 327025cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 327125cf1a30Sjl139090 mov SM_SAVE_SRC, %o0 327225cf1a30Sjl139090 mov SM_SAVE_DST, %o1 327325cf1a30Sjl139090 mov SM_SAVE_COUNT, %o2 327425cf1a30Sjl139090 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 327525cf1a30Sjl139090 tst %o3 327625cf1a30Sjl139090 bz,pt %ncc, 3f ! if not, return error 327725cf1a30Sjl139090 nop 327825cf1a30Sjl139090 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with 327925cf1a30Sjl139090 jmp %o5 ! original arguments 328025cf1a30Sjl139090 nop 328125cf1a30Sjl1390903: 328225cf1a30Sjl139090 retl 328325cf1a30Sjl139090 or %g1, 0, %o0 ! 
return errno value 328425cf1a30Sjl139090 328525cf1a30Sjl139090 SET_SIZE(xcopyin) 328625cf1a30Sjl139090 328725cf1a30Sjl139090#endif /* lint */ 328825cf1a30Sjl139090 328925cf1a30Sjl139090#ifdef lint 329025cf1a30Sjl139090 329125cf1a30Sjl139090/*ARGSUSED*/ 329225cf1a30Sjl139090int 329325cf1a30Sjl139090xcopyin_little(const void *uaddr, void *kaddr, size_t count) 329425cf1a30Sjl139090{ return (0); } 329525cf1a30Sjl139090 329625cf1a30Sjl139090#else /* lint */ 329725cf1a30Sjl139090 /* xcopyin_little(uaddr, kaddr, count): byte loop that reads uaddr through ASI_AIUSL from its last byte down to its first while storing to kaddr in ascending order, so kaddr[i] = uaddr[count - 1 - i]. .xcopyio_err is t_lofault for the duration; the previous value is kept in %o5 because %o4 is reused for the data byte. Returns 0 on success (including count == 0); the fault path returns %g1. */ 329825cf1a30Sjl139090 ENTRY(xcopyin_little) 329925cf1a30Sjl139090 sethi %hi(.xcopyio_err), %o5 330025cf1a30Sjl139090 or %o5, %lo(.xcopyio_err), %o5 330125cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 330225cf1a30Sjl139090 membar #Sync ! sync error barrier 330325cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] 330425cf1a30Sjl139090 mov %o4, %o5 330525cf1a30Sjl139090 330625cf1a30Sjl139090 subcc %g0, %o2, %o3 330725cf1a30Sjl139090 add %o0, %o2, %o0 330825cf1a30Sjl139090 bz,pn %ncc, 2f ! check for zero bytes 330925cf1a30Sjl139090 sub %o2, 1, %o4 331025cf1a30Sjl139090 add %o0, %o4, %o0 ! start w/last byte 331125cf1a30Sjl139090 add %o1, %o2, %o1 331225cf1a30Sjl139090 lduba [%o0 + %o3]ASI_AIUSL, %o4 331325cf1a30Sjl139090 331425cf1a30Sjl1390901: stb %o4, [%o1 + %o3] 331525cf1a30Sjl139090 inccc %o3 331625cf1a30Sjl139090 sub %o0, 2, %o0 ! get next byte 331725cf1a30Sjl139090 bcc,a,pt %ncc, 1b 331825cf1a30Sjl139090 lduba [%o0 + %o3]ASI_AIUSL, %o4 331925cf1a30Sjl139090 332025cf1a30Sjl1390902: 332125cf1a30Sjl139090 membar #Sync ! sync error barrier 332225cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 332325cf1a30Sjl139090 retl 332425cf1a30Sjl139090 mov %g0, %o0 ! return (0) 332525cf1a30Sjl139090 332625cf1a30Sjl139090.xcopyio_err: 332725cf1a30Sjl139090 membar #Sync ! sync error barrier 332825cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault 332925cf1a30Sjl139090 retl 333025cf1a30Sjl139090 mov %g1, %o0 333125cf1a30Sjl139090 333225cf1a30Sjl139090 SET_SIZE(xcopyin_little) 333325cf1a30Sjl139090 333425cf1a30Sjl139090#endif /* lint */ 333525cf1a30Sjl139090 333625cf1a30Sjl139090 333725cf1a30Sjl139090/* 333825cf1a30Sjl139090 * Copy a block of storage - must not overlap (from + len <= to). 333925cf1a30Sjl139090 * No fault handler installed (to be called under on_fault()) 334025cf1a30Sjl139090 */ 334125cf1a30Sjl139090#if defined(lint) 334225cf1a30Sjl139090 334325cf1a30Sjl139090/* ARGSUSED */ 334425cf1a30Sjl139090void 334525cf1a30Sjl139090copyin_noerr(const void *ufrom, void *kto, size_t count) 334625cf1a30Sjl139090{} 334725cf1a30Sjl139090 334825cf1a30Sjl139090#else /* lint */ /* copyin_noerr(ufrom, kto, count): same size/alignment dispatch as xcopyin, but a fault is not turned into a return value here. Small path: if t_lofault is non-zero, .sm_copyio_noerr is installed, and on a fault it restores the old handler and jumps to it; if t_lofault is zero it is left unchanged. FP path: .do_copyin runs with REAL_LOFAULT = .copyio_noerr, which disposes of the register window and jumps to %l6. */ 334925cf1a30Sjl139090 ENTRY(copyin_noerr) 335025cf1a30Sjl139090 335125cf1a30Sjl139090 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 335225cf1a30Sjl139090 bleu,pt %ncc, .copyin_ne_small ! leaf case if small, else larger cases 335325cf1a30Sjl139090 xor %o0, %o1, %o3 ! are src, dst alignable? 335425cf1a30Sjl139090 btst 7, %o3 ! 335525cf1a30Sjl139090 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment 335625cf1a30Sjl139090 nop 335725cf1a30Sjl139090 btst 1, %o3 ! 335825cf1a30Sjl139090 bz,pt %ncc, .copyin_ne_2 ! check for half-word 335925cf1a30Sjl139090 nop 336025cf1a30Sjl139090 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 336125cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_1)], %o3 336225cf1a30Sjl139090 tst %o3 336325cf1a30Sjl139090 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 336425cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 336525cf1a30Sjl139090 bleu,pt %ncc, .copyin_ne_small ! go to small copy 336625cf1a30Sjl139090 nop 336725cf1a30Sjl139090 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 336825cf1a30Sjl139090 nop 336925cf1a30Sjl139090.copyin_ne_2: 337025cf1a30Sjl139090 btst 3, %o3 ! 337125cf1a30Sjl139090 bz,pt %ncc, .copyin_ne_4 ! 
check for word alignment 337225cf1a30Sjl139090 nop 337325cf1a30Sjl139090 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 337425cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_2)], %o3 337525cf1a30Sjl139090 tst %o3 337625cf1a30Sjl139090 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 337725cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 337825cf1a30Sjl139090 bleu,pt %ncc, .copyin_ne_small ! go to small copy 337925cf1a30Sjl139090 nop 338025cf1a30Sjl139090 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 338125cf1a30Sjl139090 nop 338225cf1a30Sjl139090.copyin_ne_4: 338325cf1a30Sjl139090 ! already checked longword, must be word aligned 338425cf1a30Sjl139090 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 338525cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_4)], %o3 338625cf1a30Sjl139090 tst %o3 338725cf1a30Sjl139090 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 338825cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 338925cf1a30Sjl139090 bleu,pt %ncc, .copyin_ne_small ! go to small copy 339025cf1a30Sjl139090 nop 339125cf1a30Sjl139090 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 339225cf1a30Sjl139090 nop 339325cf1a30Sjl139090.copyin_ne_8: 339425cf1a30Sjl139090 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 339525cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_8)], %o3 339625cf1a30Sjl139090 tst %o3 339725cf1a30Sjl139090 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 339825cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 339925cf1a30Sjl139090 bleu,pt %ncc, .copyin_ne_small ! go to small copy 340025cf1a30Sjl139090 nop 340125cf1a30Sjl139090 ba,pt %ncc, .copyin_noerr_more ! 
otherwise go to large copy 340225cf1a30Sjl139090 nop 340325cf1a30Sjl139090 340425cf1a30Sjl139090.copyin_ne_small: 340525cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 340625cf1a30Sjl139090 tst %o4 340725cf1a30Sjl139090 bz,pn %ncc, .sm_do_copyin 340825cf1a30Sjl139090 nop 340925cf1a30Sjl139090 sethi %hi(.sm_copyio_noerr), %o5 341025cf1a30Sjl139090 or %o5, %lo(.sm_copyio_noerr), %o5 341125cf1a30Sjl139090 membar #Sync ! sync error barrier 341225cf1a30Sjl139090 ba,pt %ncc, .sm_do_copyin 341325cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 341425cf1a30Sjl139090 341525cf1a30Sjl139090.copyin_noerr_more: 341625cf1a30Sjl139090 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 341725cf1a30Sjl139090 sethi %hi(.copyio_noerr), REAL_LOFAULT 341825cf1a30Sjl139090 ba,pt %ncc, .do_copyin 341925cf1a30Sjl139090 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 342025cf1a30Sjl139090 342125cf1a30Sjl139090.copyio_noerr: 342225cf1a30Sjl139090 jmp %l6 342325cf1a30Sjl139090 restore %g0,0,%g0 342425cf1a30Sjl139090 342525cf1a30Sjl139090.sm_copyio_noerr: 342625cf1a30Sjl139090 membar #Sync 342725cf1a30Sjl139090 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault 342825cf1a30Sjl139090 jmp %o4 342925cf1a30Sjl139090 nop 343025cf1a30Sjl139090 343125cf1a30Sjl139090 SET_SIZE(copyin_noerr) 343225cf1a30Sjl139090#endif /* lint */ 343325cf1a30Sjl139090 343425cf1a30Sjl139090/* 343525cf1a30Sjl139090 * Copy a block of storage - must not overlap (from + len <= to). 
343625cf1a30Sjl139090 * No fault handler installed (to be called under on_fault()) 343725cf1a30Sjl139090 */ 343825cf1a30Sjl139090 343925cf1a30Sjl139090#if defined(lint) 344025cf1a30Sjl139090 344125cf1a30Sjl139090/* ARGSUSED */ 344225cf1a30Sjl139090void 344325cf1a30Sjl139090copyout_noerr(const void *kfrom, void *uto, size_t count) 344425cf1a30Sjl139090{} 344525cf1a30Sjl139090 344625cf1a30Sjl139090#else /* lint */ 344725cf1a30Sjl139090 ENTRY(copyout_noerr) 344825cf1a30Sjl139090 344925cf1a30Sjl139090 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 345025cf1a30Sjl139090 bleu,pt %ncc, .copyout_ne_small ! go to larger cases 345125cf1a30Sjl139090 xor %o0, %o1, %o3 ! are src, dst alignable? 345225cf1a30Sjl139090 btst 7, %o3 ! 345325cf1a30Sjl139090 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment 345425cf1a30Sjl139090 nop 345525cf1a30Sjl139090 btst 1, %o3 ! 345625cf1a30Sjl139090 bz,pt %ncc, .copyout_ne_2 ! check for half-word 345725cf1a30Sjl139090 nop 345825cf1a30Sjl139090 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 345925cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_1)], %o3 346025cf1a30Sjl139090 tst %o3 346125cf1a30Sjl139090 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 346225cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 346325cf1a30Sjl139090 bleu,pt %ncc, .copyout_ne_small ! go to small copy 346425cf1a30Sjl139090 nop 346525cf1a30Sjl139090 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 346625cf1a30Sjl139090 nop 346725cf1a30Sjl139090.copyout_ne_2: 346825cf1a30Sjl139090 btst 3, %o3 ! 346925cf1a30Sjl139090 bz,pt %ncc, .copyout_ne_4 ! check for word alignment 347025cf1a30Sjl139090 nop 347125cf1a30Sjl139090 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 347225cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_2)], %o3 347325cf1a30Sjl139090 tst %o3 347425cf1a30Sjl139090 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 347525cf1a30Sjl139090 cmp %o2, %o3 ! 
if length <= limit 347625cf1a30Sjl139090 bleu,pt %ncc, .copyout_ne_small ! go to small copy 347725cf1a30Sjl139090 nop 347825cf1a30Sjl139090 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 347925cf1a30Sjl139090 nop 348025cf1a30Sjl139090.copyout_ne_4: 348125cf1a30Sjl139090 ! already checked longword, must be word aligned 348225cf1a30Sjl139090 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 348325cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_4)], %o3 348425cf1a30Sjl139090 tst %o3 348525cf1a30Sjl139090 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 348625cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 348725cf1a30Sjl139090 bleu,pt %ncc, .copyout_ne_small ! go to small copy 348825cf1a30Sjl139090 nop 348925cf1a30Sjl139090 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 349025cf1a30Sjl139090 nop 349125cf1a30Sjl139090.copyout_ne_8: 349225cf1a30Sjl139090 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 349325cf1a30Sjl139090 ld [%o3 + %lo(hw_copy_limit_8)], %o3 349425cf1a30Sjl139090 tst %o3 349525cf1a30Sjl139090 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 349625cf1a30Sjl139090 cmp %o2, %o3 ! if length <= limit 349725cf1a30Sjl139090 bleu,pt %ncc, .copyout_ne_small ! go to small copy 349825cf1a30Sjl139090 nop 349925cf1a30Sjl139090 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 350025cf1a30Sjl139090 nop 350125cf1a30Sjl139090 350225cf1a30Sjl139090.copyout_ne_small: 350325cf1a30Sjl139090 ldn [THREAD_REG + T_LOFAULT], %o4 350425cf1a30Sjl139090 tst %o4 350525cf1a30Sjl139090 bz,pn %ncc, .sm_do_copyout 350625cf1a30Sjl139090 nop 350725cf1a30Sjl139090 sethi %hi(.sm_copyio_noerr), %o5 350825cf1a30Sjl139090 or %o5, %lo(.sm_copyio_noerr), %o5 350925cf1a30Sjl139090 membar #Sync ! sync error barrier 351025cf1a30Sjl139090 ba,pt %ncc, .sm_do_copyout 351125cf1a30Sjl139090 stn %o5, [THREAD_REG + T_LOFAULT] ! 
set/save t_lofault 351225cf1a30Sjl139090 351325cf1a30Sjl139090.copyout_noerr_more: 351425cf1a30Sjl139090 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 351525cf1a30Sjl139090 sethi %hi(.copyio_noerr), REAL_LOFAULT 351625cf1a30Sjl139090 ba,pt %ncc, .do_copyout 351725cf1a30Sjl139090 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 351825cf1a30Sjl139090 351925cf1a30Sjl139090 SET_SIZE(copyout_noerr) 352025cf1a30Sjl139090#endif /* lint */ 352125cf1a30Sjl139090 352225cf1a30Sjl139090 352325cf1a30Sjl139090/* 352425cf1a30Sjl139090 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 352525cf1a30Sjl139090 * longer than 256 bytes in length using spitfire's block stores. If 352625cf1a30Sjl139090 * the criteria for using this routine are not met then it calls bzero 352725cf1a30Sjl139090 * and returns 1. Otherwise 0 is returned indicating success. 352825cf1a30Sjl139090 * Caller is responsible for ensuring use_hw_bzero is true and that 352925cf1a30Sjl139090 * kpreempt_disable() has been called. 353025cf1a30Sjl139090 */ 353125cf1a30Sjl139090#ifdef lint 353225cf1a30Sjl139090/*ARGSUSED*/ 353325cf1a30Sjl139090int 353425cf1a30Sjl139090hwblkclr(void *addr, size_t len) 353525cf1a30Sjl139090{ 353625cf1a30Sjl139090 return(0); 353725cf1a30Sjl139090} 353825cf1a30Sjl139090#else /* lint */ 353925cf1a30Sjl139090 ! %i0 - start address 354025cf1a30Sjl139090 ! %i1 - length of region (multiple of 64) 354125cf1a30Sjl139090 ! %l0 - saved fprs 354225cf1a30Sjl139090 ! %l1 - pointer to saved %d0 block 354325cf1a30Sjl139090 ! %l2 - saved curthread->t_lwp 354425cf1a30Sjl139090 354525cf1a30Sjl139090 ENTRY(hwblkclr) 354625cf1a30Sjl139090 ! get another window w/space for one aligned block of saved fpregs 354725cf1a30Sjl139090 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp 354825cf1a30Sjl139090 354925cf1a30Sjl139090 ! 
Must be block-aligned 355025cf1a30Sjl139090 andcc %i0, (VIS_BLOCKSIZE-1), %g0 355125cf1a30Sjl139090 bnz,pn %ncc, 1f 355225cf1a30Sjl139090 nop 355325cf1a30Sjl139090 355425cf1a30Sjl139090 ! ... and must be 256 bytes or more 355525cf1a30Sjl139090 cmp %i1, 256 355625cf1a30Sjl139090 blu,pn %ncc, 1f 355725cf1a30Sjl139090 nop 355825cf1a30Sjl139090 355925cf1a30Sjl139090 ! ... and length must be a multiple of VIS_BLOCKSIZE 356025cf1a30Sjl139090 andcc %i1, (VIS_BLOCKSIZE-1), %g0 356125cf1a30Sjl139090 bz,pn %ncc, 2f 356225cf1a30Sjl139090 nop 356325cf1a30Sjl139090 356425cf1a30Sjl1390901: ! punt, call bzero but notify the caller that bzero was used 356525cf1a30Sjl139090 mov %i0, %o0 356625cf1a30Sjl139090 call bzero 356725cf1a30Sjl139090 mov %i1, %o1 356825cf1a30Sjl139090 ret 356925cf1a30Sjl139090 restore %g0, 1, %o0 ! return (1) - did not use block operations 357025cf1a30Sjl139090 357125cf1a30Sjl1390902: rd %fprs, %l0 ! check for unused fp 357225cf1a30Sjl139090 btst FPRS_FEF, %l0 357325cf1a30Sjl139090 bz,pt %icc, 1f 357425cf1a30Sjl139090 nop 357525cf1a30Sjl139090 357625cf1a30Sjl139090 ! save in-use fpregs on stack 357725cf1a30Sjl139090 membar #Sync 357825cf1a30Sjl139090 add %fp, STACK_BIAS - 65, %l1 357925cf1a30Sjl139090 and %l1, -VIS_BLOCKSIZE, %l1 358025cf1a30Sjl139090 stda %d0, [%l1]ASI_BLK_P 358125cf1a30Sjl139090 358225cf1a30Sjl1390901: membar #StoreStore|#StoreLoad|#LoadStore 358325cf1a30Sjl139090 wr %g0, FPRS_FEF, %fprs 358425cf1a30Sjl139090 wr %g0, ASI_BLK_P, %asi 358525cf1a30Sjl139090 358625cf1a30Sjl139090 ! 
Clear block 358725cf1a30Sjl139090 fzero %d0 358825cf1a30Sjl139090 fzero %d2 358925cf1a30Sjl139090 fzero %d4 359025cf1a30Sjl139090 fzero %d6 359125cf1a30Sjl139090 fzero %d8 359225cf1a30Sjl139090 fzero %d10 359325cf1a30Sjl139090 fzero %d12 359425cf1a30Sjl139090 fzero %d14 359525cf1a30Sjl139090 359625cf1a30Sjl139090 mov 256, %i3 359725cf1a30Sjl139090 ba,pt %ncc, .pz_doblock 359825cf1a30Sjl139090 nop 359925cf1a30Sjl139090 360025cf1a30Sjl139090.pz_blkstart: 360125cf1a30Sjl139090 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here 360225cf1a30Sjl139090 stda %d0, [%i0 + 128]%asi 360325cf1a30Sjl139090 stda %d0, [%i0 + 64]%asi 360425cf1a30Sjl139090 stda %d0, [%i0]%asi 360525cf1a30Sjl139090.pz_zinst: 360625cf1a30Sjl139090 add %i0, %i3, %i0 360725cf1a30Sjl139090 sub %i1, %i3, %i1 360825cf1a30Sjl139090.pz_doblock: 360925cf1a30Sjl139090 cmp %i1, 256 361025cf1a30Sjl139090 bgeu,a %ncc, .pz_blkstart 361125cf1a30Sjl139090 stda %d0, [%i0 + 192]%asi 361225cf1a30Sjl139090 361325cf1a30Sjl139090 cmp %i1, 64 361425cf1a30Sjl139090 blu %ncc, .pz_finish 361525cf1a30Sjl139090 361625cf1a30Sjl139090 andn %i1, (64-1), %i3 361725cf1a30Sjl139090 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words 361825cf1a30Sjl139090 set .pz_zinst, %i4 361925cf1a30Sjl139090 sub %i4, %i2, %i4 362025cf1a30Sjl139090 jmp %i4 362125cf1a30Sjl139090 nop 362225cf1a30Sjl139090 362325cf1a30Sjl139090.pz_finish: 362425cf1a30Sjl139090 membar #Sync 362525cf1a30Sjl139090 btst FPRS_FEF, %l0 362625cf1a30Sjl139090 bz,a .pz_finished 362725cf1a30Sjl139090 wr %l0, 0, %fprs ! restore fprs 362825cf1a30Sjl139090 362925cf1a30Sjl139090 ! restore fpregs from stack 363025cf1a30Sjl139090 ldda [%l1]ASI_BLK_P, %d0 363125cf1a30Sjl139090 membar #Sync 363225cf1a30Sjl139090 wr %l0, 0, %fprs ! restore fprs 363325cf1a30Sjl139090 363425cf1a30Sjl139090.pz_finished: 363525cf1a30Sjl139090 ret 363625cf1a30Sjl139090 restore %g0, 0, %o0 ! 
return (bzero or not) 363725cf1a30Sjl139090 363825cf1a30Sjl139090 SET_SIZE(hwblkclr) 363925cf1a30Sjl139090#endif /* lint */ 364025cf1a30Sjl139090 364125cf1a30Sjl139090#ifdef lint 364225cf1a30Sjl139090/*ARGSUSED*/ 364325cf1a30Sjl139090void 364425cf1a30Sjl139090hw_pa_bcopy32(uint64_t src, uint64_t dst) 364525cf1a30Sjl139090{} 364625cf1a30Sjl139090#else /*!lint */ 364725cf1a30Sjl139090 /* 364825cf1a30Sjl139090 * Copy 32 bytes of data from src (%o0) to dst (%o1) 364925cf1a30Sjl139090 * using physical addresses. 365025cf1a30Sjl139090 */ 365125cf1a30Sjl139090 ENTRY_NP(hw_pa_bcopy32) 365225cf1a30Sjl139090 rdpr %pstate, %g1 365325cf1a30Sjl139090 andn %g1, PSTATE_IE, %g2 365425cf1a30Sjl139090 wrpr %g0, %g2, %pstate 365525cf1a30Sjl139090 365625cf1a30Sjl139090 rdpr %pstate, %g0 365725cf1a30Sjl139090 ldxa [%o0]ASI_MEM, %o2 365825cf1a30Sjl139090 add %o0, 8, %o0 365925cf1a30Sjl139090 ldxa [%o0]ASI_MEM, %o3 366025cf1a30Sjl139090 add %o0, 8, %o0 366125cf1a30Sjl139090 ldxa [%o0]ASI_MEM, %o4 366225cf1a30Sjl139090 add %o0, 8, %o0 366325cf1a30Sjl139090 ldxa [%o0]ASI_MEM, %o5 366425cf1a30Sjl139090 membar #Sync 366525cf1a30Sjl139090 366625cf1a30Sjl139090 stxa %o2, [%o1]ASI_MEM 366725cf1a30Sjl139090 add %o1, 8, %o1 366825cf1a30Sjl139090 stxa %o3, [%o1]ASI_MEM 366925cf1a30Sjl139090 add %o1, 8, %o1 367025cf1a30Sjl139090 stxa %o4, [%o1]ASI_MEM 367125cf1a30Sjl139090 add %o1, 8, %o1 367225cf1a30Sjl139090 stxa %o5, [%o1]ASI_MEM 367325cf1a30Sjl139090 367425cf1a30Sjl139090 retl 367525cf1a30Sjl139090 wrpr %g0, %g1, %pstate 367625cf1a30Sjl139090 367725cf1a30Sjl139090 SET_SIZE(hw_pa_bcopy32) 367825cf1a30Sjl139090 367925cf1a30Sjl139090#endif /* lint */ 368025cf1a30Sjl139090 368125cf1a30Sjl139090#if defined(lint) 368225cf1a30Sjl139090 368325cf1a30Sjl139090int use_hw_bcopy = 1; 368425cf1a30Sjl139090int use_hw_bzero = 1; 368525cf1a30Sjl139090uint_t hw_copy_limit_1 = 0; 368625cf1a30Sjl139090uint_t hw_copy_limit_2 = 0; 368725cf1a30Sjl139090uint_t hw_copy_limit_4 = 0; 368825cf1a30Sjl139090uint_t 
hw_copy_limit_8 = 0; 368925cf1a30Sjl139090 369025cf1a30Sjl139090#else /* !lint */ 369125cf1a30Sjl139090 369225cf1a30Sjl139090 DGDEF(use_hw_bcopy) 369325cf1a30Sjl139090 .word 1 369425cf1a30Sjl139090 DGDEF(use_hw_bzero) 369525cf1a30Sjl139090 .word 1 369625cf1a30Sjl139090 DGDEF(hw_copy_limit_1) 369725cf1a30Sjl139090 .word 0 369825cf1a30Sjl139090 DGDEF(hw_copy_limit_2) 369925cf1a30Sjl139090 .word 0 370025cf1a30Sjl139090 DGDEF(hw_copy_limit_4) 370125cf1a30Sjl139090 .word 0 370225cf1a30Sjl139090 DGDEF(hw_copy_limit_8) 370325cf1a30Sjl139090 .word 0 370425cf1a30Sjl139090 370525cf1a30Sjl139090 .align 64 370625cf1a30Sjl139090 .section ".text" 370725cf1a30Sjl139090#endif /* !lint */ 3708