1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26#pragma ident "%Z%%M% %I% %E% SMI" 27 28#include <sys/param.h> 29#include <sys/errno.h> 30#include <sys/asm_linkage.h> 31#include <sys/vtrace.h> 32#include <sys/machthread.h> 33#include <sys/clock.h> 34#include <sys/asi.h> 35#include <sys/fsr.h> 36#include <sys/privregs.h> 37 38#if !defined(lint) 39#include "assym.h" 40#endif /* lint */ 41 42/* 43 * Pseudo-code to aid in understanding the control flow of the 44 * bcopy/copyin/copyout routines. 45 * 46 * On entry: 47 * 48 * ! Determine whether to use the FP register version 49 * ! or the leaf routine version depending on size 50 * ! of copy and flags. Set up error handling accordingly. 51 * ! The transition point depends on whether the src and 52 * ! dst addresses can be aligned to long word, word, 53 * ! half word, or byte boundaries. 54 * ! 55 * ! WARNING: <Register usage convention> 56 * ! For FP version, %l6 holds previous error handling and 57 * ! a flag: TRAMP_FLAG (low bits) 58 * ! for leaf routine version, %o4 holds those values. 59 * ! So either %l6 or %o4 is reserved and not available for 60 * ! any other use. 61 * 62 * if (length <= VIS_COPY_THRESHOLD) ! start with a quick test 63 * go to small_copy; ! to speed short copies 64 * 65 * ! src, dst long word alignable 66 * if (hw_copy_limit_8 == 0) ! hw_copy disabled 67 * go to small_copy; 68 * if (length <= hw_copy_limit_8) 69 * go to small_copy; 70 * go to FPBLK_copy; 71 * } 72 * if (src,dst not alignable) { 73 * if (hw_copy_limit_1 == 0) ! hw_copy disabled 74 * go to small_copy; 75 * if (length <= hw_copy_limit_1) 76 * go to small_copy; 77 * go to FPBLK_copy; 78 * } 79 * if (src,dst halfword alignable) { 80 * if (hw_copy_limit_2 == 0) ! hw_copy disabled 81 * go to small_copy; 82 * if (length <= hw_copy_limit_2) 83 * go to small_copy; 84 * go to FPBLK_copy; 85 * } 86 * if (src,dst word alignable) { 87 * if (hw_copy_limit_4 == 0) ! hw_copy disabled 88 * go to small_copy; 89 * if (length <= hw_copy_limit_4) 90 * go to small_copy; 91 * go to FPBLK_copy; 92 * } 93 * 94 * small_copy: 95 * Setup_leaf_rtn_error_handler; ! diffs for each entry point 96 * 97 * if (count <= 3) ! fast path for tiny copies 98 * go to sm_left; ! special finish up code 99 * else 100 * if (count > CHKSIZE) ! medium sized copies 101 * go to sm_med ! tuned by alignment 102 * if(src&dst not both word aligned) { 103 * sm_movebytes: 104 * move byte by byte in 4-way unrolled loop 105 * fall into sm_left; 106 * sm_left: 107 * move 0-3 bytes byte at a time as needed. 108 * restore error handler and exit. 109 * 110 * } else { ! src&dst are word aligned 111 * check for at least 8 bytes left, 112 * move word at a time, unrolled by 2 113 * when fewer than 8 bytes left, 114 * sm_half: move half word at a time while 2 or more bytes left 115 * sm_byte: move final byte if necessary 116 * sm_exit: 117 * restore error handler and exit. 118 * } 119 * 120 * ! Medium length cases with at least CHKSIZE bytes available 121 * ! method: line up src and dst as best possible, then 122 * ! move data in 4-way unrolled loops. 123 * 124 * sm_med: 125 * if(src&dst unalignable) 126 * go to sm_movebytes 127 * if(src&dst halfword alignable) 128 * go to sm_movehalf 129 * if(src&dst word alignable) 130 * go to sm_moveword 131 * ! fall into long word movement 132 * move bytes until src is word aligned 133 * if not long word aligned, move a word 134 * move long words in 4-way unrolled loop until < 32 bytes left 135 * move long words in 1-way unrolled loop until < 8 bytes left 136 * if zero bytes left, goto sm_exit 137 * if one byte left, go to sm_byte 138 * else go to sm_half 139 * 140 * sm_moveword: 141 * move bytes until src is word aligned 142 * move words in 4-way unrolled loop until < 16 bytes left 143 * move words in 1-way unrolled loop until < 4 bytes left 144 * if zero bytes left, goto sm_exit 145 * if one byte left, go to sm_byte 146 * else go to sm_half 147 * 148 * sm_movehalf: 149 * move a byte if needed to align src on halfword 150 * move halfwords in 4-way unrolled loop until < 8 bytes left 151 * if zero bytes left, goto sm_exit 152 * if one byte left, go to sm_byte 153 * else go to sm_half 154 * 155 * 156 * FPBLK_copy: 157 * %l6 = curthread->t_lofault; 158 * if (%l6 != NULL) { 159 * membar #Sync 160 * curthread->t_lofault = .copyerr; 161 * caller_error_handler = TRUE ! %l6 |= 2 162 * } 163 * 164 * ! for FPU testing we must not migrate cpus 165 * if (curthread->t_lwp == NULL) { 166 * ! Kernel threads do not have pcb's in which to store 167 * ! the floating point state, so disallow preemption during 168 * ! the copy. This also prevents cpu migration. 169 * kpreempt_disable(curthread); 170 * } else { 171 * thread_nomigrate(); 172 * } 173 * 174 * old_fprs = %fprs; 175 * old_gsr = %gsr; 176 * if (%fprs.fef) { 177 * %fprs.fef = 1; 178 * save current fpregs on stack using blockstore 179 * } else { 180 * %fprs.fef = 1; 181 * } 182 * 183 * 184 * do_blockcopy_here; 185 * 186 * In lofault handler: 187 * curthread->t_lofault = .copyerr2; 188 * Continue on with the normal exit handler 189 * 190 * On normal exit: 191 * %gsr = old_gsr; 192 * if (old_fprs & FPRS_FEF) 193 * restore fpregs from stack using blockload 194 * else 195 * zero fpregs 196 * %fprs = old_fprs; 197 * membar #Sync 198 * curthread->t_lofault = (%l6 & ~3); 199 * ! following test omitted from copyin/copyout as they 200 * ! will always have a current thread 201 * if (curthread->t_lwp == NULL) 202 * kpreempt_enable(curthread); 203 * else 204 * thread_allowmigrate(); 205 * return (0) 206 * 207 * In second lofault handler (.copyerr2): 208 * We've tried to restore fp state from the stack and failed. To 209 * prevent from returning with a corrupted fp state, we will panic. 210 */ 211 212/* 213 * Comments about optimization choices 214 * 215 * The initial optimization decision in this code is to determine 216 * whether to use the FP registers for a copy or not. If we don't 217 * use the FP registers, we can execute the copy as a leaf routine, 218 * saving a register save and restore. Also, less elaborate setup 219 * is required, allowing short copies to be completed more quickly. 220 * For longer copies, especially unaligned ones (where the src and 221 * dst do not align to allow simple ldx,stx operation), the FP 222 * registers allow much faster copy operations. 223 * 224 * The estimated extra cost of the FP path will vary depending on 225 * src/dst alignment, dst offset from the next 64 byte FPblock store 226 * boundary, remaining src data after the last full dst cache line is 227 * moved whether the FP registers need to be saved, and some other 228 * minor issues. The average additional overhead is estimated to be 229 * 400 clocks. Since each non-repeated/predicted tst and branch costs 230 * around 10 clocks, elaborate calculation would slow down to all 231 * longer copies and only benefit a small portion of medium sized 232 * copies. Rather than incur such cost, we chose fixed transition 233 * points for each of the alignment choices. 234 * 235 * For the inner loop, here is a comparison of the per cache line 236 * costs for each alignment when src&dst are in cache: 237 * 238 * byte aligned: 108 clocks slower for non-FPBLK 239 * half aligned: 44 clocks slower for non-FPBLK 240 * word aligned: 12 clocks slower for non-FPBLK 241 * long aligned: 4 clocks >>faster<< for non-FPBLK 242 * 243 * The long aligned loop runs faster because it does no prefetching. 244 * That wins if the data is not in cache or there is too little 245 * data to gain much benefit from prefetching. But when there 246 * is more data and that data is not in cache, failing to prefetch 247 * can run much slower. In addition, there is a 2 Kbyte store queue 248 * which will cause the non-FPBLK inner loop to slow for larger copies. 249 * The exact tradeoff is strongly load and application dependent, with 250 * increasing risk of a customer visible performance regression if the 251 * non-FPBLK code is used for larger copies. Studies of synthetic in-cache 252 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe 253 * upper limit for the non-FPBLK code. To minimize performance regression 254 * risk while still gaining the primary benefits of the improvements to 255 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various 256 * hw_copy_limit_*. Later experimental studies using different values 257 * of hw_copy_limit_* can be used to make further adjustments if 258 * appropriate. 259 * 260 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned 261 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned 262 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned 263 * hw_copy_limit_8 = src and dst are longword aligned 264 * 265 * To say that src and dst are word aligned means that after 266 * some initial alignment activity of moving 0 to 3 bytes, 267 * both the src and dst will be on word boundaries so that 268 * word loads and stores may be used. 269 * 270 * Default values at May,2005 are: 271 * hw_copy_limit_1 = 256 272 * hw_copy_limit_2 = 512 273 * hw_copy_limit_4 = 1024 274 * hw_copy_limit_8 = 1024 (or 1536 on some systems) 275 * 276 * 277 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is 278 * disabled for that alignment choice. 279 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256) 280 * the value of VIS_COPY_THRESHOLD is used. 281 * It is not envisioned that hw_copy_limit_? will be changed in the field 282 * It is provided to allow for disabling FPBLK copies and to allow 283 * easy testing of alternate values on future HW implementations 284 * that might have different cache sizes, clock rates or instruction 285 * timing rules. 286 * 287 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum 288 * threshold to speedup all shorter copies (less than 256). That 289 * saves an alignment test, memory reference, and enabling test 290 * for all short copies, or an estimated 24 clocks. 291 * 292 * The order in which these limits are checked does matter since each 293 * non-predicted tst and branch costs around 10 clocks. 294 * If src and dst are randomly selected addresses, 295 * 4 of 8 will not be alignable. 296 * 2 of 8 will be half word alignable. 297 * 1 of 8 will be word alignable. 298 * 1 of 8 will be long word alignable. 299 * But, tests on running kernels show that src and dst to copy code 300 * are typically not on random alignments. Structure copies and 301 * copies of larger data sizes are often on long word boundaries. 302 * So we test the long word alignment case first, then 303 * the byte alignment, then halfword, then word alignment. 304 * 305 * Several times, tests for length are made to split the code 306 * into subcases. These tests often allow later tests to be 307 * avoided. For example, within the non-FPBLK copy, we first 308 * check for tiny copies of 3 bytes or less. That allows us 309 * to use a 4-way unrolled loop for the general byte copy case 310 * without a test on loop entry. 311 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less 312 * vs longer cases. For the really short case, we don't attempt 313 * align src and dst. We try to minimize special case tests in 314 * the shortest loops as each test adds a significant percentage 315 * to the total time. 316 * 317 * For the medium sized cases, we allow ourselves to adjust the 318 * src and dst alignment and provide special cases for each of 319 * the four adjusted alignment cases. The CHKSIZE that was used 320 * to decide between short and medium size was chosen to be 39 321 * as that allows for the worst case of 7 bytes of alignment 322 * shift and 4 times 8 bytes for the first long word unrolling. 323 * That knowledge saves an initial test for length on entry into 324 * the medium cases. If the general loop unrolling factor were 325 * to be increases, this number would also need to be adjusted. 326 * 327 * For all cases in the non-FPBLK code where it is known that at 328 * least 4 chunks of data are available for movement, the 329 * loop is unrolled by four. This 4-way loop runs in 8 clocks 330 * or 2 clocks per data element. 331 * 332 * Instruction alignment is forced by used of .align 16 directives 333 * and nops which are not executed in the code. This 334 * combination of operations shifts the alignment of following 335 * loops to insure that loops are aligned so that their instructions 336 * fall within the minimum number of 4 instruction fetch groups. 337 * If instructions are inserted or removed between the .align 338 * instruction and the unrolled loops, then the alignment needs 339 * to be readjusted. Misaligned loops can add a clock per loop 340 * iteration to the loop timing. 341 * 342 * In a few cases, code is duplicated to avoid a branch. Since 343 * a non-predicted tst and branch takes 10 clocks, this savings 344 * is judged an appropriate time-space tradeoff. 345 * 346 * Within the FPBLK-code, the prefetch method in the inner 347 * loop needs to be explained as it is not standard. Two 348 * prefetches are issued for each cache line instead of one. 349 * The primary one is at the maximum reach of 8 cache lines. 350 * Most of the time, that maximum prefetch reach gives the 351 * cache line more time to reach the processor for systems with 352 * higher processor clocks. But, sometimes memory interference 353 * can cause that prefetch to be dropped. Putting a second 354 * prefetch at a reach of 5 cache lines catches the drops 355 * three iterations later and shows a measured improvement 356 * in performance over any similar loop with a single prefetch. 357 * The prefetches are placed in the loop so they overlap with 358 * non-memory instructions, so that there is no extra cost 359 * when the data is already in-cache. 360 * 361 */ 362 363/* 364 * Notes on preserving existing fp state and on membars. 365 * 366 * When a copyOP decides to use fp we may have to preserve existing 367 * floating point state. It is not the caller's state that we need to 368 * preserve - the rest of the kernel does not use fp and, anyway, fp 369 * registers are volatile across a call. Some examples: 370 * 371 * - userland has fp state and is interrupted (device interrupt 372 * or trap) and within the interrupt/trap handling we use 373 * bcopy() 374 * - another (higher level) interrupt or trap handler uses bcopy 375 * while a bcopy from an earlier interrupt is still active 376 * - an asynchronous error trap occurs while fp state exists (in 377 * userland or in kernel copy) and the tl0 component of the handling 378 * uses bcopy 379 * - a user process with fp state incurs a copy-on-write fault and 380 * hwblkpagecopy always uses fp 381 * 382 * We therefore need a per-call place in which to preserve fp state - 383 * using our stack is ideal (and since fp copy cannot be leaf optimized 384 * because of calls it makes, this is no hardship). 385 * 386 * When we have finished fp copy (with it's repeated block stores) 387 * we must membar #Sync so that our block stores may complete before 388 * we either restore the original fp state into the fp registers or 389 * return to a caller which may initiate other fp operations that could 390 * modify the fp regs we used before the block stores complete. 391 * 392 * Synchronous faults (eg, unresolvable DMMU miss) that occur while 393 * t_lofault is not NULL will not panic but will instead trampoline 394 * to the registered lofault handler. There is no need for any 395 * membars for these - eg, our store to t_lofault will always be visible to 396 * ourselves and it is our cpu which will take any trap. 397 * 398 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur 399 * while t_lofault is not NULL will also not panic. Since we're copying 400 * to or from userland the extent of the damage is known - the destination 401 * buffer is incomplete. So trap handlers will trampoline to the lofault 402 * handler in this case which should take some form of error action to 403 * avoid using the incomplete buffer. The trap handler also flags the 404 * fault so that later return-from-trap handling (for the trap that brought 405 * this thread into the kernel in the first place) can notify the process 406 * and reboot the system (or restart the service with Greenline/Contracts). 407 * 408 * Asynchronous faults (eg, uncorrectable ECC error from memory) can 409 * result in deferred error traps - the trap is taken sometime after 410 * the event and the trap PC may not be the PC of the faulting access. 411 * Delivery of such pending traps can be forced by a membar #Sync, acting 412 * as an "error barrier" in this role. To accurately apply the user/kernel 413 * separation described in the preceding paragraph we must force delivery 414 * of deferred traps affecting kernel state before we install a lofault 415 * handler (if we interpose a new lofault handler on an existing one there 416 * is no need to repeat this), and we must force delivery of deferred 417 * errors affecting the lofault-protected region before we clear t_lofault. 418 * Failure to do so results in lost kernel state being interpreted as 419 * affecting a copyin/copyout only, or of an error that really only 420 * affects copy data being interpreted as losing kernel state. 421 * 422 * Since the copy operations may preserve and later restore floating 423 * point state that does not belong to the caller (see examples above), 424 * we must be careful in how we do this in order to prevent corruption 425 * of another program. 426 * 427 * To make sure that floating point state is always saved and restored 428 * correctly, the following "big rules" must be followed when the floating 429 * point registers will be used: 430 * 431 * 1. %l6 always holds the caller's lofault handler. Also in this register, 432 * Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in 433 * use. Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a 434 * lofault handler was set coming in. 435 * 436 * 2. The FPUSED flag indicates that all FP state has been successfully stored 437 * on the stack. It should not be set until this save has been completed. 438 * 439 * 3. The FPUSED flag should not be cleared on exit until all FP state has 440 * been restored from the stack. If an error occurs while restoring 441 * data from the stack, the error handler can check this flag to see if 442 * a restore is necessary. 443 * 444 * 4. Code run under the new lofault handler must be kept to a minimum. In 445 * particular, any calls to FP_ALLOWMIGRATE, which could result in a call 446 * to kpreempt(), should not be made until after the lofault handler has 447 * been restored. 448 */ 449 450/* 451 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed 452 * to "break even" using FP/VIS-accelerated memory operations. 453 * The FPBLK code assumes a minimum number of bytes are available 454 * to be moved on entry. Check that code carefully before 455 * reducing VIS_COPY_THRESHOLD below 256. 456 */ 457/* 458 * This shadows sys/machsystm.h which can't be included due to the lack of 459 * _ASM guards in include files it references. Change it here, change it there. 460 */ 461#define VIS_COPY_THRESHOLD 256 462 463/* 464 * TEST for very short copies 465 * Be aware that the maximum unroll for the short unaligned case 466 * is SHORTCOPY+1 467 */ 468#define SHORTCOPY 3 469#define CHKSIZE 39 470 471/* 472 * Indicates that we're to trampoline to the error handler. 473 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag. 474 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag. 475 */ 476#define FPUSED_FLAG 1 477#define TRAMP_FLAG 2 478#define MASK_FLAGS 3 479 480/* 481 * Number of outstanding prefetches. 482 * We may need more tuning when Olympus-C processor is available. 483 */ 484#define OLYMPUS_C_PREFETCH 4 485#define OLYMPUS_C_2ND_PREFETCH 10 486 487#define VIS_BLOCKSIZE 64 488 489/* 490 * Size of stack frame in order to accomodate a 64-byte aligned 491 * floating-point register save area and 2 64-bit temp locations. 492 * All copy functions use two quadrants of fp registers; to assure a 493 * block-aligned two block buffer in which to save we must reserve 494 * three blocks on stack. Not all functions preserve %pfrs on stack 495 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all. 496 * 497 * _______________________________________ <-- %fp + STACK_BIAS 498 * | We may need to preserve 2 quadrants | 499 * | of fp regs, but since we do so with | 500 * | BST/BLD we need room in which to | 501 * | align to VIS_BLOCKSIZE bytes. So | 502 * | this area is 3 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET 503 * |-------------------------------------| 504 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET 505 * |-------------------------------------| 506 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET 507 * --------------------------------------- 508 */ 509#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8)) 510#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 3) 511#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 2) - 1) 512#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8) 513#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8) 514 515/* 516 * Common macros used by the various versions of the block copy 517 * routines in this file. 518 */ 519 520/* 521 * In FP copies if we do not have preserved data to restore over 522 * the fp regs we used then we must zero those regs to avoid 523 * exposing portions of the data to later threads (data security). 524 * 525 * Copy functions use either quadrants 1 and 3 or 2 and 4. 526 * 527 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47 528 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63 529 * 530 * The instructions below are quicker than repeated fzero instructions 531 * since they can dispatch down two fp pipelines. 532 */ 533#define FZEROQ1Q3 \ 534 fzero %f0 ;\ 535 fmovd %f0, %f2 ;\ 536 fmovd %f0, %f4 ;\ 537 fmovd %f0, %f6 ;\ 538 fmovd %f0, %f8 ;\ 539 fmovd %f0, %f10 ;\ 540 fmovd %f0, %f12 ;\ 541 fmovd %f0, %f14 ;\ 542 fmovd %f0, %f32 ;\ 543 fmovd %f0, %f34 ;\ 544 fmovd %f0, %f36 ;\ 545 fmovd %f0, %f38 ;\ 546 fmovd %f0, %f40 ;\ 547 fmovd %f0, %f42 ;\ 548 fmovd %f0, %f44 ;\ 549 fmovd %f0, %f46 550 551#define FZEROQ2Q4 \ 552 fzero %f16 ;\ 553 fmovd %f0, %f18 ;\ 554 fmovd %f0, %f20 ;\ 555 fmovd %f0, %f22 ;\ 556 fmovd %f0, %f24 ;\ 557 fmovd %f0, %f26 ;\ 558 fmovd %f0, %f28 ;\ 559 fmovd %f0, %f30 ;\ 560 fmovd %f0, %f48 ;\ 561 fmovd %f0, %f50 ;\ 562 fmovd %f0, %f52 ;\ 563 fmovd %f0, %f54 ;\ 564 fmovd %f0, %f56 ;\ 565 fmovd %f0, %f58 ;\ 566 fmovd %f0, %f60 ;\ 567 fmovd %f0, %f62 568 569/* 570 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack. 571 * Used to save and restore in-use fp registers when we want to use FP 572 * and find fp already in use and copy size still large enough to justify 573 * the additional overhead of this save and restore. 574 * 575 * A membar #Sync is needed before save to sync fp ops initiated before 576 * the call to the copy function (by whoever has fp in use); for example 577 * an earlier block load to the quadrant we are about to save may still be 578 * "in flight". A membar #Sync is required at the end of the save to 579 * sync our block store (the copy code is about to begin ldd's to the 580 * first quadrant). 581 * 582 * Similarly: a membar #Sync before restore allows the block stores of 583 * the copy operation to complete before we fill the quadrants with their 584 * original data, and a membar #Sync after restore lets the block loads 585 * of the restore complete before we return to whoever has the fp regs 586 * in use. To avoid repeated membar #Sync we make it the responsibility 587 * of the copy code to membar #Sync immediately after copy is complete 588 * and before using the BLD_*_FROMSTACK macro. 589 */ 590#if !defined(lint) 591#define BST_FPQ1Q3_TOSTACK(tmp1) \ 592 /* membar #Sync */ ;\ 593 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 594 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 595 stda %f0, [tmp1]ASI_BLK_P ;\ 596 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 597 stda %f32, [tmp1]ASI_BLK_P ;\ 598 membar #Sync 599 600#define BLD_FPQ1Q3_FROMSTACK(tmp1) \ 601 /* membar #Sync - provided at copy completion */ ;\ 602 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 603 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 604 ldda [tmp1]ASI_BLK_P, %f0 ;\ 605 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 606 ldda [tmp1]ASI_BLK_P, %f32 ;\ 607 membar #Sync 608 609#define BST_FPQ2Q4_TOSTACK(tmp1) \ 610 /* membar #Sync */ ;\ 611 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 612 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 613 stda %f16, [tmp1]ASI_BLK_P ;\ 614 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 615 stda %f48, [tmp1]ASI_BLK_P ;\ 616 membar #Sync 617 618#define BLD_FPQ2Q4_FROMSTACK(tmp1) \ 619 /* membar #Sync - provided at copy completion */ ;\ 620 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 621 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 622 ldda [tmp1]ASI_BLK_P, %f16 ;\ 623 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 624 ldda [tmp1]ASI_BLK_P, %f48 ;\ 625 membar #Sync 626#endif 627 628/* 629 * FP_NOMIGRATE and FP_ALLOWMIGRATE. Prevent migration (or, stronger, 630 * prevent preemption if there is no t_lwp to save FP state to on context 631 * switch) before commencing a FP copy, and reallow it on completion or 632 * in error trampoline paths when we were using FP copy. 633 * 634 * Both macros may call other functions, so be aware that all outputs are 635 * forfeit after using these macros. For this reason we do not pass registers 636 * to use - we just use any outputs we want. 637 * 638 * Pseudo code: 639 * 640 * FP_NOMIGRATE: 641 * 642 * if (curthread->t_lwp) { 643 * thread_nomigrate(); 644 * } else { 645 * kpreempt_disable(); 646 * } 647 * 648 * FP_ALLOWMIGRATE: 649 * 650 * if (curthread->t_lwp) { 651 * thread_allowmigrate(); 652 * } else { 653 * kpreempt_enable(); 654 * } 655 */ 656 657#define FP_NOMIGRATE(label1, label2) \ 658 ldn [THREAD_REG + T_LWP], %o0 ;\ 659 brz,a,pn %o0, label1/**/f ;\ 660 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ 661 call thread_nomigrate ;\ 662 nop ;\ 663 ba label2/**/f ;\ 664 nop ;\ 665label1: ;\ 666 inc %o1 ;\ 667 stb %o1, [THREAD_REG + T_PREEMPT] ;\ 668label2: 669 670#define FP_ALLOWMIGRATE(label1, label2) \ 671 ldn [THREAD_REG + T_LWP], %o0 ;\ 672 brz,a,pn %o0, label1/**/f ;\ 673 ldsb [THREAD_REG + T_PREEMPT], %o1 ;\ 674 call thread_allowmigrate ;\ 675 nop ;\ 676 ba label2/**/f ;\ 677 nop ;\ 678label1: ;\ 679 dec %o1 ;\ 680 brnz,pn %o1, label2/**/f ;\ 681 stb %o1, [THREAD_REG + T_PREEMPT] ;\ 682 ldn [THREAD_REG + T_CPU], %o0 ;\ 683 ldub [%o0 + CPU_KPRUNRUN], %o0 ;\ 684 brz,pt %o0, label2/**/f ;\ 685 nop ;\ 686 call kpreempt ;\ 687 rdpr %pil, %o0 ;\ 688label2: 689 690/* 691 * Copy a block of storage, returning an error code if `from' or 692 * `to' takes a kernel pagefault which cannot be resolved. 693 * Returns errno value on pagefault error, 0 if all ok 694 */ 695 696#if defined(lint) 697 698/* ARGSUSED */ 699int 700kcopy(const void *from, void *to, size_t count) 701{ return(0); } 702 703#else /* lint */ 704 705 .seg ".text" 706 .align 4 707 708 ENTRY(kcopy) 709 710 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 711 bleu,pt %ncc, .kcopy_small ! go to larger cases 712 xor %o0, %o1, %o3 ! are src, dst alignable? 713 btst 7, %o3 ! 714 bz,pt %ncc, .kcopy_8 ! check for longword alignment 715 nop 716 btst 1, %o3 ! 717 bz,pt %ncc, .kcopy_2 ! check for half-word 718 nop 719 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 720 ld [%o3 + %lo(hw_copy_limit_1)], %o3 721 tst %o3 722 bz,pn %icc, .kcopy_small ! if zero, disable HW copy 723 cmp %o2, %o3 ! if length <= limit 724 bleu,pt %ncc, .kcopy_small ! go to small copy 725 nop 726 ba,pt %ncc, .kcopy_more ! otherwise go to large copy 727 nop 728.kcopy_2: 729 btst 3, %o3 ! 730 bz,pt %ncc, .kcopy_4 ! check for word alignment 731 nop 732 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 733 ld [%o3 + %lo(hw_copy_limit_2)], %o3 734 tst %o3 735 bz,pn %icc, .kcopy_small ! if zero, disable HW copy 736 cmp %o2, %o3 ! if length <= limit 737 bleu,pt %ncc, .kcopy_small ! go to small copy 738 nop 739 ba,pt %ncc, .kcopy_more ! otherwise go to large copy 740 nop 741.kcopy_4: 742 ! already checked longword, must be word aligned 743 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 744 ld [%o3 + %lo(hw_copy_limit_4)], %o3 745 tst %o3 746 bz,pn %icc, .kcopy_small ! if zero, disable HW copy 747 cmp %o2, %o3 ! if length <= limit 748 bleu,pt %ncc, .kcopy_small ! go to small copy 749 nop 750 ba,pt %ncc, .kcopy_more ! otherwise go to large copy 751 nop 752.kcopy_8: 753 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 754 ld [%o3 + %lo(hw_copy_limit_8)], %o3 755 tst %o3 756 bz,pn %icc, .kcopy_small ! if zero, disable HW copy 757 cmp %o2, %o3 ! if length <= limit 758 bleu,pt %ncc, .kcopy_small ! go to small copy 759 nop 760 ba,pt %ncc, .kcopy_more ! otherwise go to large copy 761 nop 762 763.kcopy_small: 764 sethi %hi(.sm_copyerr), %o5 ! sm_copyerr is lofault value 765 or %o5, %lo(.sm_copyerr), %o5 766 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 767 membar #Sync ! sync error barrier 768 ba,pt %ncc, .sm_do_copy ! common code 769 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 770 771.kcopy_more: 772 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 773 sethi %hi(.copyerr), %l7 ! copyerr is lofault value 774 or %l7, %lo(.copyerr), %l7 775 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 776 membar #Sync ! sync error barrier 777 ba,pt %ncc, .do_copy ! common code 778 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 779 780 781/* 782 * We got here because of a fault during bcopy_more, called from kcopy or bcopy. 783 * Errno value is in %g1. bcopy_more uses fp quadrants 1 and 3. 784 */ 785.copyerr: 786 set .copyerr2, %l0 787 membar #Sync ! sync error barrier 788 stn %l0, [THREAD_REG + T_LOFAULT] ! set t_lofault 789 btst FPUSED_FLAG, %l6 790 bz %ncc, 1f 791 and %l6, TRAMP_FLAG, %l0 ! copy trampoline flag to %l0 792 793 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 794 wr %o2, 0, %gsr 795 796 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 797 btst FPRS_FEF, %o3 798 bz,pt %icc, 4f 799 nop 800 801 BLD_FPQ1Q3_FROMSTACK(%o2) 802 803 ba,pt %ncc, 1f 804 wr %o3, 0, %fprs ! restore fprs 805 8064: 807 FZEROQ1Q3 808 wr %o3, 0, %fprs ! restore fprs 809 810 ! 811 ! Need to cater for the different expectations of kcopy 812 ! and bcopy. kcopy will *always* set a t_lofault handler 813 ! If it fires, we're expected to just return the error code 814 ! and *not* to invoke any existing error handler. As far as 815 ! bcopy is concerned, we only set t_lofault if there was an 816 ! existing lofault handler. In that case we're expected to 817 ! invoke the previously existing handler after resetting the 818 ! t_lofault value. 819 ! 8201: 821 andn %l6, MASK_FLAGS, %l6 ! turn trampoline flag off 822 membar #Sync ! sync error barrier 823 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 824 FP_ALLOWMIGRATE(5, 6) 825 826 btst TRAMP_FLAG, %l0 827 bnz,pn %ncc, 3f 828 nop 829 ret 830 restore %g1, 0, %o0 831 8323: 833 ! 834 ! We're here via bcopy. There *must* have been an error handler 835 ! in place otherwise we would have died a nasty death already. 836 ! 837 jmp %l6 ! goto real handler 838 restore %g0, 0, %o0 ! dispose of copy window 839 840/* 841 * We got here because of a fault in .copyerr. We can't safely restore fp 842 * state, so we panic. 843 */ 844fp_panic_msg: 845 .asciz "Unable to restore fp state after copy operation" 846 847 .align 4 848.copyerr2: 849 set fp_panic_msg, %o0 850 call panic 851 nop 852 853/* 854 * We got here because of a fault during a small kcopy or bcopy. 855 * No floating point registers are used by the small copies. 856 * Errno value is in %g1. 857 */ 858.sm_copyerr: 8591: 860 btst TRAMP_FLAG, %o4 861 membar #Sync 862 andn %o4, TRAMP_FLAG, %o4 863 bnz,pn %ncc, 3f 864 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 865 retl 866 mov %g1, %o0 8673: 868 jmp %o4 ! goto real handler 869 mov %g0, %o0 ! 870 871 SET_SIZE(kcopy) 872#endif /* lint */ 873 874 875/* 876 * Copy a block of storage - must not overlap (from + len <= to). 877 * Registers: l6 - saved t_lofault 878 * (for short copies, o4 - saved t_lofault) 879 * 880 * Copy a page of memory. 881 * Assumes double word alignment and a count >= 256. 882 */ 883#if defined(lint) 884 885/* ARGSUSED */ 886void 887bcopy(const void *from, void *to, size_t count) 888{} 889 890#else /* lint */ 891 892 ENTRY(bcopy) 893 894 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 895 bleu,pt %ncc, .bcopy_small ! go to larger cases 896 xor %o0, %o1, %o3 ! are src, dst alignable? 897 btst 7, %o3 ! 898 bz,pt %ncc, .bcopy_8 ! check for longword alignment 899 nop 900 btst 1, %o3 ! 901 bz,pt %ncc, .bcopy_2 ! check for half-word 902 nop 903 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 904 ld [%o3 + %lo(hw_copy_limit_1)], %o3 905 tst %o3 906 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 907 cmp %o2, %o3 ! if length <= limit 908 bleu,pt %ncc, .bcopy_small ! go to small copy 909 nop 910 ba,pt %ncc, .bcopy_more ! otherwise go to large copy 911 nop 912.bcopy_2: 913 btst 3, %o3 ! 914 bz,pt %ncc, .bcopy_4 ! check for word alignment 915 nop 916 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 917 ld [%o3 + %lo(hw_copy_limit_2)], %o3 918 tst %o3 919 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 920 cmp %o2, %o3 ! if length <= limit 921 bleu,pt %ncc, .bcopy_small ! go to small copy 922 nop 923 ba,pt %ncc, .bcopy_more ! otherwise go to large copy 924 nop 925.bcopy_4: 926 ! already checked longword, must be word aligned 927 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 928 ld [%o3 + %lo(hw_copy_limit_4)], %o3 929 tst %o3 930 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 931 cmp %o2, %o3 ! if length <= limit 932 bleu,pt %ncc, .bcopy_small ! go to small copy 933 nop 934 ba,pt %ncc, .bcopy_more ! otherwise go to large copy 935 nop 936.bcopy_8: 937 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 938 ld [%o3 + %lo(hw_copy_limit_8)], %o3 939 tst %o3 940 bz,pn %icc, .bcopy_small ! if zero, disable HW copy 941 cmp %o2, %o3 ! if length <= limit 942 bleu,pt %ncc, .bcopy_small ! go to small copy 943 nop 944 ba,pt %ncc, .bcopy_more ! otherwise go to large copy 945 nop 946 947 .align 16 948.bcopy_small: 949 ldn [THREAD_REG + T_LOFAULT], %o4 ! save t_lofault 950 tst %o4 951 bz,pt %icc, .sm_do_copy 952 nop 953 sethi %hi(.sm_copyerr), %o5 954 or %o5, %lo(.sm_copyerr), %o5 955 membar #Sync ! sync error barrier 956 stn %o5, [THREAD_REG + T_LOFAULT] ! install new vector 957 or %o4, TRAMP_FLAG, %o4 ! error should trampoline 958.sm_do_copy: 959 cmp %o2, SHORTCOPY ! check for really short case 960 bleu,pt %ncc, .bc_sm_left ! 961 cmp %o2, CHKSIZE ! check for medium length cases 962 bgu,pn %ncc, .bc_med ! 963 or %o0, %o1, %o3 ! prepare alignment check 964 andcc %o3, 0x3, %g0 ! test for alignment 965 bz,pt %ncc, .bc_sm_word ! branch to word aligned case 966.bc_sm_movebytes: 967 sub %o2, 3, %o2 ! adjust count to allow cc zero test 968.bc_sm_notalign4: 969 ldub [%o0], %o3 ! read byte 970 stb %o3, [%o1] ! write byte 971 subcc %o2, 4, %o2 ! reduce count by 4 972 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 973 add %o0, 4, %o0 ! advance SRC by 4 974 stb %o3, [%o1 + 1] 975 ldub [%o0 - 2], %o3 976 add %o1, 4, %o1 ! advance DST by 4 977 stb %o3, [%o1 - 2] 978 ldub [%o0 - 1], %o3 979 bgt,pt %ncc, .bc_sm_notalign4 ! loop til 3 or fewer bytes remain 980 stb %o3, [%o1 - 1] 981 add %o2, 3, %o2 ! restore count 982.bc_sm_left: 983 tst %o2 984 bz,pt %ncc, .bc_sm_exit ! check for zero length 985 deccc %o2 ! reduce count for cc test 986 ldub [%o0], %o3 ! move one byte 987 bz,pt %ncc, .bc_sm_exit 988 stb %o3, [%o1] 989 ldub [%o0 + 1], %o3 ! move another byte 990 deccc %o2 ! check for more 991 bz,pt %ncc, .bc_sm_exit 992 stb %o3, [%o1 + 1] 993 ldub [%o0 + 2], %o3 ! move final byte 994 stb %o3, [%o1 + 2] 995 membar #Sync ! sync error barrier 996 andn %o4, TRAMP_FLAG, %o4 997 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 998 retl 999 mov %g0, %o0 ! return 0 1000 .align 16 1001 nop ! instruction alignment 1002 ! see discussion at start of file 1003.bc_sm_words: 1004 lduw [%o0], %o3 ! read word 1005.bc_sm_wordx: 1006 subcc %o2, 8, %o2 ! update count 1007 stw %o3, [%o1] ! write word 1008 add %o0, 8, %o0 ! update SRC 1009 lduw [%o0 - 4], %o3 ! read word 1010 add %o1, 8, %o1 ! update DST 1011 bgt,pt %ncc, .bc_sm_words ! loop til done 1012 stw %o3, [%o1 - 4] ! write word 1013 addcc %o2, 7, %o2 ! restore count 1014 bz,pt %ncc, .bc_sm_exit 1015 deccc %o2 1016 bz,pt %ncc, .bc_sm_byte 1017.bc_sm_half: 1018 subcc %o2, 2, %o2 ! reduce count by 2 1019 add %o0, 2, %o0 ! advance SRC by 2 1020 lduh [%o0 - 2], %o3 ! read half word 1021 add %o1, 2, %o1 ! advance DST by 2 1022 bgt,pt %ncc, .bc_sm_half ! loop til done 1023 sth %o3, [%o1 - 2] ! write half word 1024 addcc %o2, 1, %o2 ! restore count 1025 bz,pt %ncc, .bc_sm_exit 1026 nop 1027.bc_sm_byte: 1028 ldub [%o0], %o3 1029 stb %o3, [%o1] 1030 membar #Sync ! sync error barrier 1031 andn %o4, TRAMP_FLAG, %o4 1032 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1033 retl 1034 mov %g0, %o0 ! return 0 1035 1036.bc_sm_word: 1037 subcc %o2, 4, %o2 ! update count 1038 bgt,pt %ncc, .bc_sm_wordx 1039 lduw [%o0], %o3 ! read word 1040 addcc %o2, 3, %o2 ! restore count 1041 bz,pt %ncc, .bc_sm_exit 1042 stw %o3, [%o1] ! write word 1043 deccc %o2 ! reduce count for cc test 1044 ldub [%o0 + 4], %o3 ! load one byte 1045 bz,pt %ncc, .bc_sm_exit 1046 stb %o3, [%o1 + 4] ! store one byte 1047 ldub [%o0 + 5], %o3 ! load second byte 1048 deccc %o2 1049 bz,pt %ncc, .bc_sm_exit 1050 stb %o3, [%o1 + 5] ! store second byte 1051 ldub [%o0 + 6], %o3 ! load third byte 1052 stb %o3, [%o1 + 6] ! store third byte 1053.bc_sm_exit: 1054 membar #Sync ! sync error barrier 1055 andn %o4, TRAMP_FLAG, %o4 1056 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1057 retl 1058 mov %g0, %o0 ! return 0 1059 1060 .align 16 1061.bc_med: 1062 xor %o0, %o1, %o3 ! setup alignment check 1063 btst 1, %o3 1064 bnz,pt %ncc, .bc_sm_movebytes ! unaligned 1065 nop 1066 btst 3, %o3 1067 bnz,pt %ncc, .bc_med_half ! halfword aligned 1068 nop 1069 btst 7, %o3 1070 bnz,pt %ncc, .bc_med_word ! word aligned 1071 nop 1072.bc_med_long: 1073 btst 3, %o0 ! check for 1074 bz,pt %ncc, .bc_med_long1 ! word alignment 1075 nop 1076.bc_med_long0: 1077 ldub [%o0], %o3 ! load one byte 1078 inc %o0 1079 stb %o3,[%o1] ! store byte 1080 inc %o1 1081 btst 3, %o0 1082 bnz,pt %ncc, .bc_med_long0 1083 dec %o2 1084.bc_med_long1: ! word aligned 1085 btst 7, %o0 ! check for long word 1086 bz,pt %ncc, .bc_med_long2 1087 nop 1088 lduw [%o0], %o3 ! load word 1089 add %o0, 4, %o0 ! advance SRC by 4 1090 stw %o3, [%o1] ! store word 1091 add %o1, 4, %o1 ! advance DST by 4 1092 sub %o2, 4, %o2 ! reduce count by 4 1093! 1094! Now long word aligned and have at least 32 bytes to move 1095! 1096.bc_med_long2: 1097 sub %o2, 31, %o2 ! adjust count to allow cc zero test 1098.bc_med_lmove: 1099 ldx [%o0], %o3 ! read long word 1100 stx %o3, [%o1] ! write long word 1101 subcc %o2, 32, %o2 ! reduce count by 32 1102 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words 1103 add %o0, 32, %o0 ! advance SRC by 32 1104 stx %o3, [%o1 + 8] 1105 ldx [%o0 - 16], %o3 1106 add %o1, 32, %o1 ! advance DST by 32 1107 stx %o3, [%o1 - 16] 1108 ldx [%o0 - 8], %o3 1109 bgt,pt %ncc, .bc_med_lmove ! loop til 31 or fewer bytes left 1110 stx %o3, [%o1 - 8] 1111 addcc %o2, 24, %o2 ! restore count to long word offset 1112 ble,pt %ncc, .bc_med_lextra ! check for more long words to move 1113 nop 1114.bc_med_lword: 1115 ldx [%o0], %o3 ! read long word 1116 subcc %o2, 8, %o2 ! reduce count by 8 1117 stx %o3, [%o1] ! write long word 1118 add %o0, 8, %o0 ! advance SRC by 8 1119 bgt,pt %ncc, .bc_med_lword ! loop til 7 or fewer bytes left 1120 add %o1, 8, %o1 ! advance DST by 8 1121.bc_med_lextra: 1122 addcc %o2, 7, %o2 ! restore rest of count 1123 bz,pt %ncc, .bc_sm_exit ! if zero, then done 1124 deccc %o2 1125 bz,pt %ncc, .bc_sm_byte 1126 nop 1127 ba,pt %ncc, .bc_sm_half 1128 nop 1129 1130 .align 16 1131.bc_med_word: 1132 btst 3, %o0 ! check for 1133 bz,pt %ncc, .bc_med_word1 ! word alignment 1134 nop 1135.bc_med_word0: 1136 ldub [%o0], %o3 ! load one byte 1137 inc %o0 1138 stb %o3,[%o1] ! store byte 1139 inc %o1 1140 btst 3, %o0 1141 bnz,pt %ncc, .bc_med_word0 1142 dec %o2 1143! 1144! Now word aligned and have at least 36 bytes to move 1145! 1146.bc_med_word1: 1147 sub %o2, 15, %o2 ! adjust count to allow cc zero test 1148.bc_med_wmove: 1149 lduw [%o0], %o3 ! read word 1150 stw %o3, [%o1] ! write word 1151 subcc %o2, 16, %o2 ! reduce count by 16 1152 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words 1153 add %o0, 16, %o0 ! advance SRC by 16 1154 stw %o3, [%o1 + 4] 1155 lduw [%o0 - 8], %o3 1156 add %o1, 16, %o1 ! advance DST by 16 1157 stw %o3, [%o1 - 8] 1158 lduw [%o0 - 4], %o3 1159 bgt,pt %ncc, .bc_med_wmove ! loop til 15 or fewer bytes left 1160 stw %o3, [%o1 - 4] 1161 addcc %o2, 12, %o2 ! restore count to word offset 1162 ble,pt %ncc, .bc_med_wextra ! check for more words to move 1163 nop 1164.bc_med_word2: 1165 lduw [%o0], %o3 ! read word 1166 subcc %o2, 4, %o2 ! reduce count by 4 1167 stw %o3, [%o1] ! write word 1168 add %o0, 4, %o0 ! advance SRC by 4 1169 bgt,pt %ncc, .bc_med_word2 ! loop til 3 or fewer bytes left 1170 add %o1, 4, %o1 ! advance DST by 4 1171.bc_med_wextra: 1172 addcc %o2, 3, %o2 ! restore rest of count 1173 bz,pt %ncc, .bc_sm_exit ! if zero, then done 1174 deccc %o2 1175 bz,pt %ncc, .bc_sm_byte 1176 nop 1177 ba,pt %ncc, .bc_sm_half 1178 nop 1179 1180 .align 16 1181.bc_med_half: 1182 btst 1, %o0 ! check for 1183 bz,pt %ncc, .bc_med_half1 ! half word alignment 1184 nop 1185 ldub [%o0], %o3 ! load one byte 1186 inc %o0 1187 stb %o3,[%o1] ! store byte 1188 inc %o1 1189 dec %o2 1190! 1191! Now half word aligned and have at least 38 bytes to move 1192! 1193.bc_med_half1: 1194 sub %o2, 7, %o2 ! adjust count to allow cc zero test 1195.bc_med_hmove: 1196 lduh [%o0], %o3 ! read half word 1197 sth %o3, [%o1] ! write half word 1198 subcc %o2, 8, %o2 ! reduce count by 8 1199 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords 1200 add %o0, 8, %o0 ! advance SRC by 8 1201 sth %o3, [%o1 + 2] 1202 lduh [%o0 - 4], %o3 1203 add %o1, 8, %o1 ! advance DST by 8 1204 sth %o3, [%o1 - 4] 1205 lduh [%o0 - 2], %o3 1206 bgt,pt %ncc, .bc_med_hmove ! loop til 7 or fewer bytes left 1207 sth %o3, [%o1 - 2] 1208 addcc %o2, 7, %o2 ! restore count 1209 bz,pt %ncc, .bc_sm_exit 1210 deccc %o2 1211 bz,pt %ncc, .bc_sm_byte 1212 nop 1213 ba,pt %ncc, .bc_sm_half 1214 nop 1215 1216 SET_SIZE(bcopy) 1217 1218/* 1219 * The _more entry points are not intended to be used directly by 1220 * any caller from outside this file. They are provided to allow 1221 * profiling and dtrace of the portions of the copy code that uses 1222 * the floating point registers. 1223 * This entry is particularly important as DTRACE (at least as of 1224 * 4/2004) does not support leaf functions. 1225 */ 1226 1227 ENTRY(bcopy_more) 1228.bcopy_more: 1229 prefetch [%o0], #n_reads 1230 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 1231 ldn [THREAD_REG + T_LOFAULT], %l6 ! save t_lofault 1232 tst %l6 1233 bz,pt %ncc, .do_copy 1234 nop 1235 sethi %hi(.copyerr), %o2 1236 or %o2, %lo(.copyerr), %o2 1237 membar #Sync ! sync error barrier 1238 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector 1239 ! 1240 ! We've already captured whether t_lofault was zero on entry. 1241 ! We need to mark ourselves as being from bcopy since both 1242 ! kcopy and bcopy use the same code path. If TRAMP_FLAG is set 1243 ! and the saved lofault was zero, we won't reset lofault on 1244 ! returning. 1245 ! 1246 or %l6, TRAMP_FLAG, %l6 1247 1248/* 1249 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes 1250 * Also, use of FP registers has been tested to be enabled 1251 */ 1252.do_copy: 1253 FP_NOMIGRATE(6, 7) 1254 1255 rd %fprs, %o2 ! check for unused fp 1256 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 1257 btst FPRS_FEF, %o2 1258 bz,a,pt %icc, .do_blockcopy 1259 wr %g0, FPRS_FEF, %fprs 1260 1261 BST_FPQ1Q3_TOSTACK(%o2) 1262 1263.do_blockcopy: 1264 rd %gsr, %o2 1265 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 1266 or %l6, FPUSED_FLAG, %l6 1267 1268#define REALSRC %i0 1269#define DST %i1 1270#define CNT %i2 1271#define SRC %i3 1272#define TMP %i5 1273 1274 andcc DST, VIS_BLOCKSIZE - 1, TMP 1275 bz,pt %ncc, 2f 1276 neg TMP 1277 add TMP, VIS_BLOCKSIZE, TMP 1278 1279 ! TMP = bytes required to align DST on FP_BLOCK boundary 1280 ! Using SRC as a tmp here 1281 cmp TMP, 3 1282 bleu,pt %ncc, 1f 1283 sub CNT,TMP,CNT ! adjust main count 1284 sub TMP, 3, TMP ! adjust for end of loop test 1285.bc_blkalign: 1286 ldub [REALSRC], SRC ! move 4 bytes per loop iteration 1287 stb SRC, [DST] 1288 subcc TMP, 4, TMP 1289 ldub [REALSRC + 1], SRC 1290 add REALSRC, 4, REALSRC 1291 stb SRC, [DST + 1] 1292 ldub [REALSRC - 2], SRC 1293 add DST, 4, DST 1294 stb SRC, [DST - 2] 1295 ldub [REALSRC - 1], SRC 1296 bgu,pt %ncc, .bc_blkalign 1297 stb SRC, [DST - 1] 1298 1299 addcc TMP, 3, TMP ! restore count adjustment 1300 bz,pt %ncc, 2f ! no bytes left? 1301 nop 13021: ldub [REALSRC], SRC 1303 inc REALSRC 1304 inc DST 1305 deccc TMP 1306 bgu %ncc, 1b 1307 stb SRC, [DST - 1] 1308 13092: 1310 membar #StoreLoad 1311 andn REALSRC, 0x7, SRC 1312 1313 ! SRC - 8-byte aligned 1314 ! DST - 64-byte aligned 1315 ldd [SRC], %f0 1316 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 1317 alignaddr REALSRC, %g0, %g0 1318 ldd [SRC + 0x08], %f2 1319 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 1320 faligndata %f0, %f2, %f32 1321 ldd [SRC + 0x10], %f4 1322 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read 1323 faligndata %f2, %f4, %f34 1324 ldd [SRC + 0x18], %f6 1325 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 1326 faligndata %f4, %f6, %f36 1327 ldd [SRC + 0x20], %f8 1328 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read 1329 faligndata %f6, %f8, %f38 1330 ldd [SRC + 0x28], %f10 1331 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read 1332 faligndata %f8, %f10, %f40 1333 ldd [SRC + 0x30], %f12 1334 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read 1335 faligndata %f10, %f12, %f42 1336 ldd [SRC + 0x38], %f14 1337 ldd [SRC + VIS_BLOCKSIZE], %f0 1338 sub CNT, VIS_BLOCKSIZE, CNT 1339 add SRC, VIS_BLOCKSIZE, SRC 1340 prefetch [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read 1341 add REALSRC, VIS_BLOCKSIZE, REALSRC 1342 ba,pt %ncc, 1f 1343 prefetch [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read 1344 .align 32 13451: 1346 ldd [SRC + 0x08], %f2 1347 faligndata %f12, %f14, %f44 1348 ldd [SRC + 0x10], %f4 1349 faligndata %f14, %f0, %f46 1350 stda %f32, [DST]ASI_BLK_P 1351 ldd [SRC + 0x18], %f6 1352 faligndata %f0, %f2, %f32 1353 ldd [SRC + 0x20], %f8 1354 faligndata %f2, %f4, %f34 1355 ldd [SRC + 0x28], %f10 1356 faligndata %f4, %f6, %f36 1357 ldd [SRC + 0x30], %f12 1358 faligndata %f6, %f8, %f38 1359 ldd [SRC + 0x38], %f14 1360 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #n_reads 1361 faligndata %f8, %f10, %f40 1362 ldd [SRC + VIS_BLOCKSIZE], %f0 1363 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 1364 faligndata %f10, %f12, %f42 1365 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #n_reads 1366 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #one_read 1367 sub CNT, VIS_BLOCKSIZE, CNT 1368 add DST, VIS_BLOCKSIZE, DST 1369 add REALSRC, VIS_BLOCKSIZE, REALSRC 1370 cmp CNT, VIS_BLOCKSIZE + 8 1371 bgu,pt %ncc, 1b 1372 add SRC, VIS_BLOCKSIZE, SRC 1373 1374 ! only if REALSRC & 0x7 is 0 1375 cmp CNT, VIS_BLOCKSIZE 1376 bne %ncc, 3f 1377 andcc REALSRC, 0x7, %g0 1378 bz,pt %ncc, 2f 1379 nop 13803: 1381 faligndata %f12, %f14, %f44 1382 faligndata %f14, %f0, %f46 1383 stda %f32, [DST]ASI_BLK_P 1384 add DST, VIS_BLOCKSIZE, DST 1385 ba,pt %ncc, 3f 1386 nop 13872: 1388 ldd [SRC + 0x08], %f2 1389 fsrc1 %f12, %f44 1390 ldd [SRC + 0x10], %f4 1391 fsrc1 %f14, %f46 1392 stda %f32, [DST]ASI_BLK_P 1393 ldd [SRC + 0x18], %f6 1394 fsrc1 %f0, %f32 1395 ldd [SRC + 0x20], %f8 1396 fsrc1 %f2, %f34 1397 ldd [SRC + 0x28], %f10 1398 fsrc1 %f4, %f36 1399 ldd [SRC + 0x30], %f12 1400 fsrc1 %f6, %f38 1401 ldd [SRC + 0x38], %f14 1402 fsrc1 %f8, %f40 1403 sub CNT, VIS_BLOCKSIZE, CNT 1404 add DST, VIS_BLOCKSIZE, DST 1405 add SRC, VIS_BLOCKSIZE, SRC 1406 add REALSRC, VIS_BLOCKSIZE, REALSRC 1407 fsrc1 %f10, %f42 1408 fsrc1 %f12, %f44 1409 fsrc1 %f14, %f46 1410 stda %f32, [DST]ASI_BLK_P 1411 add DST, VIS_BLOCKSIZE, DST 1412 ba,a,pt %ncc, .bcb_exit 1413 nop 1414 14153: tst CNT 1416 bz,a,pt %ncc, .bcb_exit 1417 nop 1418 14195: ldub [REALSRC], TMP 1420 inc REALSRC 1421 inc DST 1422 deccc CNT 1423 bgu %ncc, 5b 1424 stb TMP, [DST - 1] 1425.bcb_exit: 1426 membar #Sync 1427 1428 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 1429 wr %o2, 0, %gsr 1430 1431 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 1432 btst FPRS_FEF, %o3 1433 bz,pt %icc, 4f 1434 nop 1435 1436 BLD_FPQ1Q3_FROMSTACK(%o2) 1437 1438 ba,pt %ncc, 2f 1439 wr %o3, 0, %fprs ! restore fprs 14404: 1441 FZEROQ1Q3 1442 wr %o3, 0, %fprs ! restore fprs 14432: 1444 membar #Sync ! sync error barrier 1445 andn %l6, MASK_FLAGS, %l6 1446 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1447 FP_ALLOWMIGRATE(5, 6) 1448 ret 1449 restore %g0, 0, %o0 1450 1451 SET_SIZE(bcopy_more) 1452 1453#endif /* lint */ 1454 1455/* 1456 * Block copy with possibly overlapped operands. 1457 */ 1458 1459#if defined(lint) 1460 1461/*ARGSUSED*/ 1462void 1463ovbcopy(const void *from, void *to, size_t count) 1464{} 1465 1466#else /* lint */ 1467 1468 ENTRY(ovbcopy) 1469 tst %o2 ! check count 1470 bgu,a %ncc, 1f ! nothing to do or bad arguments 1471 subcc %o0, %o1, %o3 ! difference of from and to address 1472 1473 retl ! return 1474 nop 14751: 1476 bneg,a %ncc, 2f 1477 neg %o3 ! if < 0, make it positive 14782: cmp %o2, %o3 ! cmp size and abs(from - to) 1479 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, 1480 .empty ! no overlap 1481 cmp %o0, %o1 ! compare from and to addresses 1482 blu %ncc, .ov_bkwd ! if from < to, copy backwards 1483 nop 1484 ! 1485 ! Copy forwards. 1486 ! 1487.ov_fwd: 1488 ldub [%o0], %o3 ! read from address 1489 inc %o0 ! inc from address 1490 stb %o3, [%o1] ! write to address 1491 deccc %o2 ! dec count 1492 bgu %ncc, .ov_fwd ! loop till done 1493 inc %o1 ! inc to address 1494 1495 retl ! return 1496 nop 1497 ! 1498 ! Copy backwards. 1499 ! 1500.ov_bkwd: 1501 deccc %o2 ! dec count 1502 ldub [%o0 + %o2], %o3 ! get byte at end of src 1503 bgu %ncc, .ov_bkwd ! loop till done 1504 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst 1505 1506 retl ! return 1507 nop 1508 1509 SET_SIZE(ovbcopy) 1510 1511#endif /* lint */ 1512 1513 1514/* 1515 * hwblkpagecopy() 1516 * 1517 * Copies exactly one page. This routine assumes the caller (ppcopy) 1518 * has already disabled kernel preemption and has checked 1519 * use_hw_bcopy. Preventing preemption also prevents cpu migration. 1520 */ 1521#ifdef lint 1522/*ARGSUSED*/ 1523void 1524hwblkpagecopy(const void *src, void *dst) 1525{ } 1526#else /* lint */ 1527 ENTRY(hwblkpagecopy) 1528 ! get another window w/space for three aligned blocks of saved fpregs 1529 prefetch [%o0], #n_reads 1530 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 1531 1532 ! %i0 - source address (arg) 1533 ! %i1 - destination address (arg) 1534 ! %i2 - length of region (not arg) 1535 ! %l0 - saved fprs 1536 ! %l1 - pointer to saved fpregs 1537 1538 rd %fprs, %l0 ! check for unused fp 1539 btst FPRS_FEF, %l0 1540 bz,a,pt %icc, 1f 1541 wr %g0, FPRS_FEF, %fprs 1542 1543 BST_FPQ1Q3_TOSTACK(%l1) 1544 15451: set PAGESIZE, CNT 1546 mov REALSRC, SRC 1547 1548 ldd [SRC], %f0 1549 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 1550 ldd [SRC + 0x08], %f2 1551 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 1552 fmovd %f0, %f32 1553 ldd [SRC + 0x10], %f4 1554 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read 1555 fmovd %f2, %f34 1556 ldd [SRC + 0x18], %f6 1557 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 1558 fmovd %f4, %f36 1559 ldd [SRC + 0x20], %f8 1560 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read 1561 fmovd %f6, %f38 1562 ldd [SRC + 0x28], %f10 1563 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read 1564 fmovd %f8, %f40 1565 ldd [SRC + 0x30], %f12 1566 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read 1567 fmovd %f10, %f42 1568 ldd [SRC + 0x38], %f14 1569 ldd [SRC + VIS_BLOCKSIZE], %f0 1570 sub CNT, VIS_BLOCKSIZE, CNT 1571 add SRC, VIS_BLOCKSIZE, SRC 1572 prefetch [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read 1573 ba,pt %ncc, 2f 1574 prefetch [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read 1575 .align 32 15762: 1577 ldd [SRC + 0x08], %f2 1578 fmovd %f12, %f44 1579 ldd [SRC + 0x10], %f4 1580 fmovd %f14, %f46 1581 stda %f32, [DST]ASI_BLK_P 1582 ldd [SRC + 0x18], %f6 1583 fmovd %f0, %f32 1584 ldd [SRC + 0x20], %f8 1585 fmovd %f2, %f34 1586 ldd [SRC + 0x28], %f10 1587 fmovd %f4, %f36 1588 ldd [SRC + 0x30], %f12 1589 fmovd %f6, %f38 1590 ldd [SRC + 0x38], %f14 1591 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #n_reads 1592 fmovd %f8, %f40 1593 ldd [SRC + VIS_BLOCKSIZE], %f0 1594 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 1595 fmovd %f10, %f42 1596 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #n_reads 1597 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #one_read 1598 sub CNT, VIS_BLOCKSIZE, CNT 1599 add DST, VIS_BLOCKSIZE, DST 1600 cmp CNT, VIS_BLOCKSIZE + 8 1601 bgu,pt %ncc, 2b 1602 add SRC, VIS_BLOCKSIZE, SRC 1603 1604 ! trailing block 1605 ldd [SRC + 0x08], %f2 1606 fsrc1 %f12, %f44 1607 ldd [SRC + 0x10], %f4 1608 fsrc1 %f14, %f46 1609 stda %f32, [DST]ASI_BLK_P 1610 ldd [SRC + 0x18], %f6 1611 fsrc1 %f0, %f32 1612 ldd [SRC + 0x20], %f8 1613 fsrc1 %f2, %f34 1614 ldd [SRC + 0x28], %f10 1615 fsrc1 %f4, %f36 1616 ldd [SRC + 0x30], %f12 1617 fsrc1 %f6, %f38 1618 ldd [SRC + 0x38], %f14 1619 fsrc1 %f8, %f40 1620 sub CNT, VIS_BLOCKSIZE, CNT 1621 add DST, VIS_BLOCKSIZE, DST 1622 add SRC, VIS_BLOCKSIZE, SRC 1623 fsrc1 %f10, %f42 1624 fsrc1 %f12, %f44 1625 fsrc1 %f14, %f46 1626 stda %f32, [DST]ASI_BLK_P 1627 1628 membar #Sync 1629 1630 btst FPRS_FEF, %l0 1631 bz,pt %icc, 2f 1632 nop 1633 1634 BLD_FPQ1Q3_FROMSTACK(%l3) 1635 ba 3f 1636 nop 1637 16382: FZEROQ1Q3 1639 16403: wr %l0, 0, %fprs ! restore fprs 1641 ret 1642 restore %g0, 0, %o0 1643 1644 SET_SIZE(hwblkpagecopy) 1645#endif /* lint */ 1646 1647 1648/* 1649 * Transfer data to and from user space - 1650 * Note that these routines can cause faults 1651 * It is assumed that the kernel has nothing at 1652 * less than KERNELBASE in the virtual address space. 1653 * 1654 * Note that copyin(9F) and copyout(9F) are part of the 1655 * DDI/DKI which specifies that they return '-1' on "errors." 1656 * 1657 * Sigh. 1658 * 1659 * So there's two extremely similar routines - xcopyin() and xcopyout() 1660 * which return the errno that we've faithfully computed. This 1661 * allows other callers (e.g. uiomove(9F)) to work correctly. 1662 * Given that these are used pretty heavily, we expand the calling 1663 * sequences inline for all flavours (rather than making wrappers). 1664 * 1665 * There are also stub routines for xcopyout_little and xcopyin_little, 1666 * which currently are intended to handle requests of <= 16 bytes from 1667 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 1668 * is left as an exercise... 1669 */ 1670 1671/* 1672 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 1673 * 1674 * General theory of operation: 1675 * 1676 * The only difference between copy{in,out} and 1677 * xcopy{in,out} is in the error handling routine they invoke 1678 * when a memory access error occurs. xcopyOP returns the errno 1679 * while copyOP returns -1 (see above). copy{in,out}_noerr set 1680 * a special flag (by oring the TRAMP_FLAG into the fault handler address) 1681 * if they are called with a fault handler already in place. That flag 1682 * causes the default handlers to trampoline to the previous handler 1683 * upon an error. 1684 * 1685 * None of the copyops routines grab a window until it's decided that 1686 * we need to do a HW block copy operation. This saves a window 1687 * spill/fill when we're called during socket ops. The typical IO 1688 * path won't cause spill/fill traps. 1689 * 1690 * This code uses a set of 4 limits for the maximum size that will 1691 * be copied given a particular input/output address alignment. 1692 * If the value for a particular limit is zero, the copy will be performed 1693 * by the plain copy loops rather than FPBLK. 1694 * 1695 * See the description of bcopy above for more details of the 1696 * data copying algorithm and the default limits. 1697 * 1698 */ 1699 1700/* 1701 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 1702 */ 1703 1704#if defined(lint) 1705 1706 1707#else /* lint */ 1708/* 1709 * We save the arguments in the following registers in case of a fault: 1710 * kaddr - %l1 1711 * uaddr - %l2 1712 * count - %l3 1713 */ 1714#define SAVE_SRC %l1 1715#define SAVE_DST %l2 1716#define SAVE_COUNT %l3 1717 1718#define SM_SAVE_SRC %g4 1719#define SM_SAVE_DST %g5 1720#define SM_SAVE_COUNT %o5 1721#define ERRNO %l5 1722 1723 1724#define REAL_LOFAULT %l4 1725/* 1726 * Generic copyio fault handler. This is the first line of defense when a 1727 * fault occurs in (x)copyin/(x)copyout. In order for this to function 1728 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 1729 * This allows us to share common code for all the flavors of the copy 1730 * operations, including the _noerr versions. 1731 * 1732 * Note that this function will restore the original input parameters before 1733 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 1734 * member of the t_copyop structure, if needed. 1735 */ 1736 ENTRY(copyio_fault) 1737 membar #Sync 1738 mov %g1,ERRNO ! save errno in ERRNO 1739 btst FPUSED_FLAG, %l6 1740 bz %ncc, 1f 1741 nop 1742 1743 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 1744 wr %o2, 0, %gsr ! restore gsr 1745 1746 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 1747 btst FPRS_FEF, %o3 1748 bz,pt %icc, 4f 1749 nop 1750 1751 BLD_FPQ2Q4_FROMSTACK(%o2) 1752 1753 ba,pt %ncc, 1f 1754 wr %o3, 0, %fprs ! restore fprs 1755 17564: 1757 FZEROQ2Q4 1758 wr %o3, 0, %fprs ! restore fprs 1759 17601: 1761 andn %l6, FPUSED_FLAG, %l6 1762 membar #Sync 1763 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1764 FP_ALLOWMIGRATE(5, 6) 1765 1766 mov SAVE_SRC, %i0 1767 mov SAVE_DST, %i1 1768 jmp REAL_LOFAULT 1769 mov SAVE_COUNT, %i2 1770 1771 SET_SIZE(copyio_fault) 1772 1773 1774#endif 1775 1776#if defined(lint) 1777 1778/*ARGSUSED*/ 1779int 1780copyout(const void *kaddr, void *uaddr, size_t count) 1781{ return (0); } 1782 1783#else /* lint */ 1784 1785 ENTRY(copyout) 1786 1787 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 1788 bleu,pt %ncc, .copyout_small ! go to larger cases 1789 xor %o0, %o1, %o3 ! are src, dst alignable? 1790 btst 7, %o3 ! 1791 bz,pt %ncc, .copyout_8 ! check for longword alignment 1792 nop 1793 btst 1, %o3 ! 1794 bz,pt %ncc, .copyout_2 ! check for half-word 1795 nop 1796 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 1797 ld [%o3 + %lo(hw_copy_limit_1)], %o3 1798 tst %o3 1799 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1800 cmp %o2, %o3 ! if length <= limit 1801 bleu,pt %ncc, .copyout_small ! go to small copy 1802 nop 1803 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1804 nop 1805.copyout_2: 1806 btst 3, %o3 ! 1807 bz,pt %ncc, .copyout_4 ! check for word alignment 1808 nop 1809 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 1810 ld [%o3 + %lo(hw_copy_limit_2)], %o3 1811 tst %o3 1812 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1813 cmp %o2, %o3 ! if length <= limit 1814 bleu,pt %ncc, .copyout_small ! go to small copy 1815 nop 1816 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1817 nop 1818.copyout_4: 1819 ! already checked longword, must be word aligned 1820 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 1821 ld [%o3 + %lo(hw_copy_limit_4)], %o3 1822 tst %o3 1823 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1824 cmp %o2, %o3 ! if length <= limit 1825 bleu,pt %ncc, .copyout_small ! go to small copy 1826 nop 1827 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1828 nop 1829.copyout_8: 1830 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 1831 ld [%o3 + %lo(hw_copy_limit_8)], %o3 1832 tst %o3 1833 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1834 cmp %o2, %o3 ! if length <= limit 1835 bleu,pt %ncc, .copyout_small ! go to small copy 1836 nop 1837 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1838 nop 1839 1840 .align 16 1841 nop ! instruction alignment 1842 ! see discussion at start of file 1843.copyout_small: 1844 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault 1845 or %o5, %lo(.sm_copyout_err), %o5 1846 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 1847 membar #Sync ! sync error barrier 1848 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 1849.sm_do_copyout: 1850 mov %o0, SM_SAVE_SRC 1851 mov %o1, SM_SAVE_DST 1852 cmp %o2, SHORTCOPY ! check for really short case 1853 bleu,pt %ncc, .co_sm_left ! 1854 mov %o2, SM_SAVE_COUNT 1855 cmp %o2, CHKSIZE ! check for medium length cases 1856 bgu,pn %ncc, .co_med ! 1857 or %o0, %o1, %o3 ! prepare alignment check 1858 andcc %o3, 0x3, %g0 ! test for alignment 1859 bz,pt %ncc, .co_sm_word ! branch to word aligned case 1860.co_sm_movebytes: 1861 sub %o2, 3, %o2 ! adjust count to allow cc zero test 1862.co_sm_notalign4: 1863 ldub [%o0], %o3 ! read byte 1864 subcc %o2, 4, %o2 ! reduce count by 4 1865 stba %o3, [%o1]ASI_USER ! write byte 1866 inc %o1 ! advance DST by 1 1867 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 1868 add %o0, 4, %o0 ! advance SRC by 4 1869 stba %o3, [%o1]ASI_USER 1870 inc %o1 ! advance DST by 1 1871 ldub [%o0 - 2], %o3 1872 stba %o3, [%o1]ASI_USER 1873 inc %o1 ! advance DST by 1 1874 ldub [%o0 - 1], %o3 1875 stba %o3, [%o1]ASI_USER 1876 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain 1877 inc %o1 ! advance DST by 1 1878 add %o2, 3, %o2 ! restore count 1879.co_sm_left: 1880 tst %o2 1881 bz,pt %ncc, .co_sm_exit ! check for zero length 1882 nop 1883 ldub [%o0], %o3 ! load one byte 1884 deccc %o2 ! reduce count for cc test 1885 bz,pt %ncc, .co_sm_exit 1886 stba %o3,[%o1]ASI_USER ! store one byte 1887 ldub [%o0 + 1], %o3 ! load second byte 1888 deccc %o2 1889 inc %o1 1890 bz,pt %ncc, .co_sm_exit 1891 stba %o3,[%o1]ASI_USER ! store second byte 1892 ldub [%o0 + 2], %o3 ! load third byte 1893 inc %o1 1894 stba %o3,[%o1]ASI_USER ! store third byte 1895 membar #Sync ! sync error barrier 1896 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1897 retl 1898 mov %g0, %o0 ! return 0 1899 .align 16 1900.co_sm_words: 1901 lduw [%o0], %o3 ! read word 1902.co_sm_wordx: 1903 subcc %o2, 8, %o2 ! update count 1904 stwa %o3, [%o1]ASI_USER ! write word 1905 add %o0, 8, %o0 ! update SRC 1906 lduw [%o0 - 4], %o3 ! read word 1907 add %o1, 4, %o1 ! update DST 1908 stwa %o3, [%o1]ASI_USER ! write word 1909 bgt,pt %ncc, .co_sm_words ! loop til done 1910 add %o1, 4, %o1 ! update DST 1911 addcc %o2, 7, %o2 ! restore count 1912 bz,pt %ncc, .co_sm_exit 1913 nop 1914 deccc %o2 1915 bz,pt %ncc, .co_sm_byte 1916.co_sm_half: 1917 subcc %o2, 2, %o2 ! reduce count by 2 1918 lduh [%o0], %o3 ! read half word 1919 add %o0, 2, %o0 ! advance SRC by 2 1920 stha %o3, [%o1]ASI_USER ! write half word 1921 bgt,pt %ncc, .co_sm_half ! loop til done 1922 add %o1, 2, %o1 ! advance DST by 2 1923 addcc %o2, 1, %o2 ! restore count 1924 bz,pt %ncc, .co_sm_exit 1925 nop 1926.co_sm_byte: 1927 ldub [%o0], %o3 1928 stba %o3, [%o1]ASI_USER 1929 membar #Sync ! sync error barrier 1930 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1931 retl 1932 mov %g0, %o0 ! return 0 1933 .align 16 1934.co_sm_word: 1935 subcc %o2, 4, %o2 ! update count 1936 bgt,pt %ncc, .co_sm_wordx 1937 lduw [%o0], %o3 ! read word 1938 addcc %o2, 3, %o2 ! restore count 1939 bz,pt %ncc, .co_sm_exit 1940 stwa %o3, [%o1]ASI_USER ! write word 1941 deccc %o2 ! reduce count for cc test 1942 ldub [%o0 + 4], %o3 ! load one byte 1943 add %o1, 4, %o1 1944 bz,pt %ncc, .co_sm_exit 1945 stba %o3, [%o1]ASI_USER ! store one byte 1946 ldub [%o0 + 5], %o3 ! load second byte 1947 deccc %o2 1948 inc %o1 1949 bz,pt %ncc, .co_sm_exit 1950 stba %o3, [%o1]ASI_USER ! store second byte 1951 ldub [%o0 + 6], %o3 ! load third byte 1952 inc %o1 1953 stba %o3, [%o1]ASI_USER ! store third byte 1954.co_sm_exit: 1955 membar #Sync ! sync error barrier 1956 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1957 retl 1958 mov %g0, %o0 ! return 0 1959 1960 .align 16 1961.co_med: 1962 xor %o0, %o1, %o3 ! setup alignment check 1963 btst 1, %o3 1964 bnz,pt %ncc, .co_sm_movebytes ! unaligned 1965 nop 1966 btst 3, %o3 1967 bnz,pt %ncc, .co_med_half ! halfword aligned 1968 nop 1969 btst 7, %o3 1970 bnz,pt %ncc, .co_med_word ! word aligned 1971 nop 1972.co_med_long: 1973 btst 3, %o0 ! check for 1974 bz,pt %ncc, .co_med_long1 ! word alignment 1975 nop 1976.co_med_long0: 1977 ldub [%o0], %o3 ! load one byte 1978 inc %o0 1979 stba %o3,[%o1]ASI_USER ! store byte 1980 inc %o1 1981 btst 3, %o0 1982 bnz,pt %ncc, .co_med_long0 1983 dec %o2 1984.co_med_long1: ! word aligned 1985 btst 7, %o0 ! check for long word 1986 bz,pt %ncc, .co_med_long2 1987 nop 1988 lduw [%o0], %o3 ! load word 1989 add %o0, 4, %o0 ! advance SRC by 4 1990 stwa %o3, [%o1]ASI_USER ! store word 1991 add %o1, 4, %o1 ! advance DST by 4 1992 sub %o2, 4, %o2 ! reduce count by 4 1993! 1994! Now long word aligned and have at least 32 bytes to move 1995! 1996.co_med_long2: 1997 sub %o2, 31, %o2 ! adjust count to allow cc zero test 1998 sub %o1, 8, %o1 ! adjust pointer to allow store in 1999 ! branch delay slot instead of add 2000.co_med_lmove: 2001 add %o1, 8, %o1 ! advance DST by 8 2002 ldx [%o0], %o3 ! read long word 2003 subcc %o2, 32, %o2 ! reduce count by 32 2004 stxa %o3, [%o1]ASI_USER ! write long word 2005 add %o1, 8, %o1 ! advance DST by 8 2006 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words 2007 add %o0, 32, %o0 ! advance SRC by 32 2008 stxa %o3, [%o1]ASI_USER 2009 ldx [%o0 - 16], %o3 2010 add %o1, 8, %o1 ! advance DST by 8 2011 stxa %o3, [%o1]ASI_USER 2012 ldx [%o0 - 8], %o3 2013 add %o1, 8, %o1 ! advance DST by 8 2014 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left 2015 stxa %o3, [%o1]ASI_USER 2016 add %o1, 8, %o1 ! advance DST by 8 2017 addcc %o2, 24, %o2 ! restore count to long word offset 2018 ble,pt %ncc, .co_med_lextra ! check for more long words to move 2019 nop 2020.co_med_lword: 2021 ldx [%o0], %o3 ! read long word 2022 subcc %o2, 8, %o2 ! reduce count by 8 2023 stxa %o3, [%o1]ASI_USER ! write long word 2024 add %o0, 8, %o0 ! advance SRC by 8 2025 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left 2026 add %o1, 8, %o1 ! advance DST by 8 2027.co_med_lextra: 2028 addcc %o2, 7, %o2 ! restore rest of count 2029 bz,pt %ncc, .co_sm_exit ! if zero, then done 2030 deccc %o2 2031 bz,pt %ncc, .co_sm_byte 2032 nop 2033 ba,pt %ncc, .co_sm_half 2034 nop 2035 2036 .align 16 2037 nop ! instruction alignment 2038 ! see discussion at start of file 2039.co_med_word: 2040 btst 3, %o0 ! check for 2041 bz,pt %ncc, .co_med_word1 ! word alignment 2042 nop 2043.co_med_word0: 2044 ldub [%o0], %o3 ! load one byte 2045 inc %o0 2046 stba %o3,[%o1]ASI_USER ! store byte 2047 inc %o1 2048 btst 3, %o0 2049 bnz,pt %ncc, .co_med_word0 2050 dec %o2 2051! 2052! Now word aligned and have at least 36 bytes to move 2053! 2054.co_med_word1: 2055 sub %o2, 15, %o2 ! adjust count to allow cc zero test 2056.co_med_wmove: 2057 lduw [%o0], %o3 ! read word 2058 subcc %o2, 16, %o2 ! reduce count by 16 2059 stwa %o3, [%o1]ASI_USER ! write word 2060 add %o1, 4, %o1 ! advance DST by 4 2061 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words 2062 add %o0, 16, %o0 ! advance SRC by 16 2063 stwa %o3, [%o1]ASI_USER 2064 add %o1, 4, %o1 ! advance DST by 4 2065 lduw [%o0 - 8], %o3 2066 stwa %o3, [%o1]ASI_USER 2067 add %o1, 4, %o1 ! advance DST by 4 2068 lduw [%o0 - 4], %o3 2069 stwa %o3, [%o1]ASI_USER 2070 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left 2071 add %o1, 4, %o1 ! advance DST by 4 2072 addcc %o2, 12, %o2 ! restore count to word offset 2073 ble,pt %ncc, .co_med_wextra ! check for more words to move 2074 nop 2075.co_med_word2: 2076 lduw [%o0], %o3 ! read word 2077 subcc %o2, 4, %o2 ! reduce count by 4 2078 stwa %o3, [%o1]ASI_USER ! write word 2079 add %o0, 4, %o0 ! advance SRC by 4 2080 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left 2081 add %o1, 4, %o1 ! advance DST by 4 2082.co_med_wextra: 2083 addcc %o2, 3, %o2 ! restore rest of count 2084 bz,pt %ncc, .co_sm_exit ! if zero, then done 2085 deccc %o2 2086 bz,pt %ncc, .co_sm_byte 2087 nop 2088 ba,pt %ncc, .co_sm_half 2089 nop 2090 2091 .align 16 2092 nop ! instruction alignment 2093 nop ! see discussion at start of file 2094 nop 2095.co_med_half: 2096 btst 1, %o0 ! check for 2097 bz,pt %ncc, .co_med_half1 ! half word alignment 2098 nop 2099 ldub [%o0], %o3 ! load one byte 2100 inc %o0 2101 stba %o3,[%o1]ASI_USER ! store byte 2102 inc %o1 2103 dec %o2 2104! 2105! Now half word aligned and have at least 38 bytes to move 2106! 2107.co_med_half1: 2108 sub %o2, 7, %o2 ! adjust count to allow cc zero test 2109.co_med_hmove: 2110 lduh [%o0], %o3 ! read half word 2111 subcc %o2, 8, %o2 ! reduce count by 8 2112 stha %o3, [%o1]ASI_USER ! write half word 2113 add %o1, 2, %o1 ! advance DST by 2 2114 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords 2115 add %o0, 8, %o0 ! advance SRC by 8 2116 stha %o3, [%o1]ASI_USER 2117 add %o1, 2, %o1 ! advance DST by 2 2118 lduh [%o0 - 4], %o3 2119 stha %o3, [%o1]ASI_USER 2120 add %o1, 2, %o1 ! advance DST by 2 2121 lduh [%o0 - 2], %o3 2122 stha %o3, [%o1]ASI_USER 2123 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left 2124 add %o1, 2, %o1 ! advance DST by 2 2125 addcc %o2, 7, %o2 ! restore count 2126 bz,pt %ncc, .co_sm_exit 2127 deccc %o2 2128 bz,pt %ncc, .co_sm_byte 2129 nop 2130 ba,pt %ncc, .co_sm_half 2131 nop 2132 2133/* 2134 * We got here because of a fault during short copyout. 2135 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2136 */ 2137.sm_copyout_err: 2138 membar #Sync 2139 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2140 mov SM_SAVE_SRC, %o0 2141 mov SM_SAVE_DST, %o1 2142 mov SM_SAVE_COUNT, %o2 2143 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2144 tst %o3 2145 bz,pt %ncc, 3f ! if not, return error 2146 nop 2147 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with 2148 jmp %o5 ! original arguments 2149 nop 21503: 2151 retl 2152 or %g0, -1, %o0 ! return error value 2153 2154 SET_SIZE(copyout) 2155 2156/* 2157 * The _more entry points are not intended to be used directly by 2158 * any caller from outside this file. They are provided to allow 2159 * profiling and dtrace of the portions of the copy code that uses 2160 * the floating point registers. 2161 * This entry is particularly important as DTRACE (at least as of 2162 * 4/2004) does not support leaf functions. 2163 */ 2164 2165 ENTRY(copyout_more) 2166.copyout_more: 2167 prefetch [%o0], #n_reads 2168 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2169 set .copyout_err, REAL_LOFAULT 2170 2171/* 2172 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes 2173 */ 2174.do_copyout: 2175 set copyio_fault, %l7 ! .copyio_fault is lofault val 2176 2177 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 2178 membar #Sync ! sync error barrier 2179 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 2180 2181 mov %i0, SAVE_SRC 2182 mov %i1, SAVE_DST 2183 mov %i2, SAVE_COUNT 2184 2185 FP_NOMIGRATE(6, 7) 2186 2187 rd %fprs, %o2 ! check for unused fp 2188 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 2189 btst FPRS_FEF, %o2 2190 bz,a,pt %icc, .do_blockcopyout 2191 wr %g0, FPRS_FEF, %fprs 2192 2193 BST_FPQ2Q4_TOSTACK(%o2) 2194 2195.do_blockcopyout: 2196 rd %gsr, %o2 2197 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 2198 or %l6, FPUSED_FLAG, %l6 2199 2200 andcc DST, VIS_BLOCKSIZE - 1, TMP 2201 mov ASI_USER, %asi 2202 bz,pt %ncc, 2f 2203 neg TMP 2204 add TMP, VIS_BLOCKSIZE, TMP 2205 2206 ! TMP = bytes required to align DST on FP_BLOCK boundary 2207 ! Using SRC as a tmp here 2208 cmp TMP, 3 2209 bleu,pt %ncc, 1f 2210 sub CNT,TMP,CNT ! adjust main count 2211 sub TMP, 3, TMP ! adjust for end of loop test 2212.co_blkalign: 2213 ldub [REALSRC], SRC ! move 4 bytes per loop iteration 2214 stba SRC, [DST]%asi 2215 subcc TMP, 4, TMP 2216 ldub [REALSRC + 1], SRC 2217 add REALSRC, 4, REALSRC 2218 stba SRC, [DST + 1]%asi 2219 ldub [REALSRC - 2], SRC 2220 add DST, 4, DST 2221 stba SRC, [DST - 2]%asi 2222 ldub [REALSRC - 1], SRC 2223 bgu,pt %ncc, .co_blkalign 2224 stba SRC, [DST - 1]%asi 2225 2226 addcc TMP, 3, TMP ! restore count adjustment 2227 bz,pt %ncc, 2f ! no bytes left? 2228 nop 22291: ldub [REALSRC], SRC 2230 inc REALSRC 2231 inc DST 2232 deccc TMP 2233 bgu %ncc, 1b 2234 stba SRC, [DST - 1]%asi 2235 22362: 2237 membar #StoreLoad 2238 andn REALSRC, 0x7, SRC 2239 2240 ! SRC - 8-byte aligned 2241 ! DST - 64-byte aligned 2242 ldd [SRC], %f16 2243 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 2244 alignaddr REALSRC, %g0, %g0 2245 ldd [SRC + 0x08], %f18 2246 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 2247 faligndata %f16, %f18, %f48 2248 ldd [SRC + 0x10], %f20 2249 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #one_read 2250 faligndata %f18, %f20, %f50 2251 ldd [SRC + 0x18], %f22 2252 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 2253 faligndata %f20, %f22, %f52 2254 ldd [SRC + 0x20], %f24 2255 prefetch [SRC + (5 * VIS_BLOCKSIZE)], #one_read 2256 faligndata %f22, %f24, %f54 2257 ldd [SRC + 0x28], %f26 2258 prefetch [SRC + (6 * VIS_BLOCKSIZE)], #one_read 2259 faligndata %f24, %f26, %f56 2260 ldd [SRC + 0x30], %f28 2261 prefetch [SRC + (7 * VIS_BLOCKSIZE)], #one_read 2262 faligndata %f26, %f28, %f58 2263 ldd [SRC + 0x38], %f30 2264 ldd [SRC + VIS_BLOCKSIZE], %f16 2265 sub CNT, VIS_BLOCKSIZE, CNT 2266 add SRC, VIS_BLOCKSIZE, SRC 2267 prefetch [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read 2268 add REALSRC, VIS_BLOCKSIZE, REALSRC 2269 ba,pt %ncc, 1f 2270 prefetch [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE], #one_read 2271 .align 32 22721: 2273 ldd [SRC + 0x08], %f18 2274 faligndata %f28, %f30, %f60 2275 ldd [SRC + 0x10], %f20 2276 faligndata %f30, %f16, %f62 2277 stda %f48, [DST]ASI_BLK_AIUS 2278 ldd [SRC + 0x18], %f22 2279 faligndata %f16, %f18, %f48 2280 ldd [SRC + 0x20], %f24 2281 faligndata %f18, %f20, %f50 2282 ldd [SRC + 0x28], %f26 2283 faligndata %f20, %f22, %f52 2284 ldd [SRC + 0x30], %f28 2285 faligndata %f22, %f24, %f54 2286 ldd [SRC + 0x38], %f30 2287 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #n_reads 2288 faligndata %f24, %f26, %f56 2289 ldd [SRC + VIS_BLOCKSIZE], %f16 2290 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 2291 faligndata %f26, %f28, %f58 2292 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #n_reads 2293 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20], #one_read 2294 sub CNT, VIS_BLOCKSIZE, CNT 2295 add DST, VIS_BLOCKSIZE, DST 2296 add REALSRC, VIS_BLOCKSIZE, REALSRC 2297 cmp CNT, VIS_BLOCKSIZE + 8 2298 bgu,pt %ncc, 1b 2299 add SRC, VIS_BLOCKSIZE, SRC 2300 2301 ! only if REALSRC & 0x7 is 0 2302 cmp CNT, VIS_BLOCKSIZE 2303 bne %ncc, 3f 2304 andcc REALSRC, 0x7, %g0 2305 bz,pt %ncc, 2f 2306 nop 23073: 2308 faligndata %f28, %f30, %f60 2309 faligndata %f30, %f16, %f62 2310 stda %f48, [DST]ASI_BLK_AIUS 2311 add DST, VIS_BLOCKSIZE, DST 2312 ba,pt %ncc, 3f 2313 nop 23142: 2315 ldd [SRC + 0x08], %f18 2316 fsrc1 %f28, %f60 2317 ldd [SRC + 0x10], %f20 2318 fsrc1 %f30, %f62 2319 stda %f48, [DST]ASI_BLK_AIUS 2320 ldd [SRC + 0x18], %f22 2321 fsrc1 %f16, %f48 2322 ldd [SRC + 0x20], %f24 2323 fsrc1 %f18, %f50 2324 ldd [SRC + 0x28], %f26 2325 fsrc1 %f20, %f52 2326 ldd [SRC + 0x30], %f28 2327 fsrc1 %f22, %f54 2328 ldd [SRC + 0x38], %f30 2329 fsrc1 %f24, %f56 2330 sub CNT, VIS_BLOCKSIZE, CNT 2331 add DST, VIS_BLOCKSIZE, DST 2332 add SRC, VIS_BLOCKSIZE, SRC 2333 add REALSRC, VIS_BLOCKSIZE, REALSRC 2334 fsrc1 %f26, %f58 2335 fsrc1 %f28, %f60 2336 fsrc1 %f30, %f62 2337 stda %f48, [DST]ASI_BLK_AIUS 2338 add DST, VIS_BLOCKSIZE, DST 2339 ba,a,pt %ncc, 4f 2340 nop 2341 23423: tst CNT 2343 bz,a %ncc, 4f 2344 nop 2345 23465: ldub [REALSRC], TMP 2347 inc REALSRC 2348 inc DST 2349 deccc CNT 2350 bgu %ncc, 5b 2351 stba TMP, [DST - 1]%asi 23524: 2353 2354.copyout_exit: 2355 membar #Sync 2356 2357 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 2358 wr %o2, 0, %gsr ! restore gsr 2359 2360 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 2361 btst FPRS_FEF, %o3 2362 bz,pt %icc, 4f 2363 nop 2364 2365 BLD_FPQ2Q4_FROMSTACK(%o2) 2366 2367 ba,pt %ncc, 1f 2368 wr %o3, 0, %fprs ! restore fprs 2369 23704: 2371 FZEROQ2Q4 2372 wr %o3, 0, %fprs ! restore fprs 2373 23741: 2375 membar #Sync 2376 andn %l6, FPUSED_FLAG, %l6 2377 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2378 FP_ALLOWMIGRATE(5, 6) 2379 ret 2380 restore %g0, 0, %o0 2381 2382/* 2383 * We got here because of a fault during copyout. 2384 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2385 */ 2386.copyout_err: 2387 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2388 tst %o4 2389 bz,pt %ncc, 2f ! if not, return error 2390 nop 2391 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with 2392 jmp %g2 ! original arguments 2393 restore %g0, 0, %g0 ! dispose of copy window 23942: 2395 ret 2396 restore %g0, -1, %o0 ! return error value 2397 2398 2399 SET_SIZE(copyout_more) 2400 2401#endif /* lint */ 2402 2403 2404#ifdef lint 2405 2406/*ARGSUSED*/ 2407int 2408xcopyout(const void *kaddr, void *uaddr, size_t count) 2409{ return (0); } 2410 2411#else /* lint */ 2412 2413 ENTRY(xcopyout) 2414 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2415 bleu,pt %ncc, .xcopyout_small ! go to larger cases 2416 xor %o0, %o1, %o3 ! are src, dst alignable? 2417 btst 7, %o3 ! 2418 bz,pt %ncc, .xcopyout_8 ! 2419 nop 2420 btst 1, %o3 ! 2421 bz,pt %ncc, .xcopyout_2 ! check for half-word 2422 nop 2423 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2424 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2425 tst %o3 2426 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2427 cmp %o2, %o3 ! if length <= limit 2428 bleu,pt %ncc, .xcopyout_small ! go to small copy 2429 nop 2430 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2431 nop 2432.xcopyout_2: 2433 btst 3, %o3 ! 2434 bz,pt %ncc, .xcopyout_4 ! check for word alignment 2435 nop 2436 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2437 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2438 tst %o3 2439 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2440 cmp %o2, %o3 ! if length <= limit 2441 bleu,pt %ncc, .xcopyout_small ! go to small copy 2442 nop 2443 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2444 nop 2445.xcopyout_4: 2446 ! already checked longword, must be word aligned 2447 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2448 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2449 tst %o3 2450 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2451 cmp %o2, %o3 ! if length <= limit 2452 bleu,pt %ncc, .xcopyout_small ! go to small copy 2453 nop 2454 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2455 nop 2456.xcopyout_8: 2457 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2458 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2459 tst %o3 2460 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2461 cmp %o2, %o3 ! if length <= limit 2462 bleu,pt %ncc, .xcopyout_small ! go to small copy 2463 nop 2464 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2465 nop 2466 2467.xcopyout_small: 2468 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault 2469 or %o5, %lo(.sm_xcopyout_err), %o5 2470 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 2471 membar #Sync ! sync error barrier 2472 ba,pt %ncc, .sm_do_copyout ! common code 2473 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 2474 2475.xcopyout_more: 2476 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2477 sethi %hi(.xcopyout_err), REAL_LOFAULT 2478 ba,pt %ncc, .do_copyout ! common code 2479 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 2480 2481/* 2482 * We got here because of fault during xcopyout 2483 * Errno value is in ERRNO 2484 */ 2485.xcopyout_err: 2486 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2487 tst %o4 2488 bz,pt %ncc, 2f ! if not, return error 2489 nop 2490 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with 2491 jmp %g2 ! original arguments 2492 restore %g0, 0, %g0 ! dispose of copy window 24932: 2494 ret 2495 restore ERRNO, 0, %o0 ! return errno value 2496 2497.sm_xcopyout_err: 2498 2499 membar #Sync 2500 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2501 mov SM_SAVE_SRC, %o0 2502 mov SM_SAVE_DST, %o1 2503 mov SM_SAVE_COUNT, %o2 2504 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2505 tst %o3 2506 bz,pt %ncc, 3f ! if not, return error 2507 nop 2508 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with 2509 jmp %o5 ! original arguments 2510 nop 25113: 2512 retl 2513 or %g1, 0, %o0 ! return errno value 2514 2515 SET_SIZE(xcopyout) 2516 2517#endif /* lint */ 2518 2519#ifdef lint 2520 2521/*ARGSUSED*/ 2522int 2523xcopyout_little(const void *kaddr, void *uaddr, size_t count) 2524{ return (0); } 2525 2526#else /* lint */ 2527 2528 ENTRY(xcopyout_little) 2529 sethi %hi(.xcopyio_err), %o5 2530 or %o5, %lo(.xcopyio_err), %o5 2531 ldn [THREAD_REG + T_LOFAULT], %o4 2532 membar #Sync ! sync error barrier 2533 stn %o5, [THREAD_REG + T_LOFAULT] 2534 mov %o4, %o5 2535 2536 subcc %g0, %o2, %o3 2537 add %o0, %o2, %o0 2538 bz,pn %ncc, 2f ! check for zero bytes 2539 sub %o2, 1, %o4 2540 add %o0, %o4, %o0 ! start w/last byte 2541 add %o1, %o2, %o1 2542 ldub [%o0 + %o3], %o4 2543 25441: stba %o4, [%o1 + %o3]ASI_AIUSL 2545 inccc %o3 2546 sub %o0, 2, %o0 ! get next byte 2547 bcc,a,pt %ncc, 1b 2548 ldub [%o0 + %o3], %o4 2549 25502: 2551 membar #Sync ! sync error barrier 2552 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2553 retl 2554 mov %g0, %o0 ! return (0) 2555 2556 SET_SIZE(xcopyout_little) 2557 2558#endif /* lint */ 2559 2560/* 2561 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 2562 */ 2563 2564#if defined(lint) 2565 2566/*ARGSUSED*/ 2567int 2568copyin(const void *uaddr, void *kaddr, size_t count) 2569{ return (0); } 2570 2571#else /* lint */ 2572 2573 ENTRY(copyin) 2574 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2575 bleu,pt %ncc, .copyin_small ! go to larger cases 2576 xor %o0, %o1, %o3 ! are src, dst alignable? 2577 btst 7, %o3 ! 2578 bz,pt %ncc, .copyin_8 ! check for longword alignment 2579 nop 2580 btst 1, %o3 ! 2581 bz,pt %ncc, .copyin_2 ! check for half-word 2582 nop 2583 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2584 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2585 tst %o3 2586 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2587 cmp %o2, %o3 ! if length <= limit 2588 bleu,pt %ncc, .copyin_small ! go to small copy 2589 nop 2590 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2591 nop 2592.copyin_2: 2593 btst 3, %o3 ! 2594 bz,pt %ncc, .copyin_4 ! check for word alignment 2595 nop 2596 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2597 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2598 tst %o3 2599 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2600 cmp %o2, %o3 ! if length <= limit 2601 bleu,pt %ncc, .copyin_small ! go to small copy 2602 nop 2603 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2604 nop 2605.copyin_4: 2606 ! already checked longword, must be word aligned 2607 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2608 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2609 tst %o3 2610 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2611 cmp %o2, %o3 ! if length <= limit 2612 bleu,pt %ncc, .copyin_small ! go to small copy 2613 nop 2614 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2615 nop 2616.copyin_8: 2617 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2618 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2619 tst %o3 2620 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2621 cmp %o2, %o3 ! if length <= limit 2622 bleu,pt %ncc, .copyin_small ! go to small copy 2623 nop 2624 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2625 nop 2626 2627 .align 16 2628 nop ! instruction alignment 2629 ! see discussion at start of file 2630.copyin_small: 2631 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault 2632 or %o5, %lo(.sm_copyin_err), %o5 2633 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp 2634 membar #Sync ! sync error barrier 2635 stn %o5, [THREAD_REG + T_LOFAULT] 2636.sm_do_copyin: 2637 mov %o0, SM_SAVE_SRC 2638 mov %o1, SM_SAVE_DST 2639 cmp %o2, SHORTCOPY ! check for really short case 2640 bleu,pt %ncc, .ci_sm_left ! 2641 mov %o2, SM_SAVE_COUNT 2642 cmp %o2, CHKSIZE ! check for medium length cases 2643 bgu,pn %ncc, .ci_med ! 2644 or %o0, %o1, %o3 ! prepare alignment check 2645 andcc %o3, 0x3, %g0 ! test for alignment 2646 bz,pt %ncc, .ci_sm_word ! branch to word aligned case 2647.ci_sm_movebytes: 2648 sub %o2, 3, %o2 ! adjust count to allow cc zero test 2649.ci_sm_notalign4: 2650 lduba [%o0]ASI_USER, %o3 ! read byte 2651 subcc %o2, 4, %o2 ! reduce count by 4 2652 stb %o3, [%o1] ! write byte 2653 add %o0, 1, %o0 ! advance SRC by 1 2654 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes 2655 add %o0, 1, %o0 ! advance SRC by 1 2656 stb %o3, [%o1 + 1] 2657 add %o1, 4, %o1 ! advance DST by 4 2658 lduba [%o0]ASI_USER, %o3 2659 add %o0, 1, %o0 ! advance SRC by 1 2660 stb %o3, [%o1 - 2] 2661 lduba [%o0]ASI_USER, %o3 2662 add %o0, 1, %o0 ! advance SRC by 1 2663 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain 2664 stb %o3, [%o1 - 1] 2665 add %o2, 3, %o2 ! restore count 2666.ci_sm_left: 2667 tst %o2 2668 bz,pt %ncc, .ci_sm_exit 2669 nop 2670 lduba [%o0]ASI_USER, %o3 ! load one byte 2671 deccc %o2 ! reduce count for cc test 2672 bz,pt %ncc, .ci_sm_exit 2673 stb %o3,[%o1] ! store one byte 2674 inc %o0 2675 lduba [%o0]ASI_USER, %o3 ! load second byte 2676 deccc %o2 2677 bz,pt %ncc, .ci_sm_exit 2678 stb %o3,[%o1 + 1] ! store second byte 2679 inc %o0 2680 lduba [%o0]ASI_USER, %o3 ! load third byte 2681 stb %o3,[%o1 + 2] ! store third byte 2682 membar #Sync ! sync error barrier 2683 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2684 retl 2685 mov %g0, %o0 ! return 0 2686 .align 16 2687.ci_sm_words: 2688 lduwa [%o0]ASI_USER, %o3 ! read word 2689.ci_sm_wordx: 2690 subcc %o2, 8, %o2 ! update count 2691 stw %o3, [%o1] ! write word 2692 add %o0, 4, %o0 ! update SRC 2693 add %o1, 8, %o1 ! update DST 2694 lduwa [%o0]ASI_USER, %o3 ! read word 2695 add %o0, 4, %o0 ! update SRC 2696 bgt,pt %ncc, .ci_sm_words ! loop til done 2697 stw %o3, [%o1 - 4] ! write word 2698 addcc %o2, 7, %o2 ! restore count 2699 bz,pt %ncc, .ci_sm_exit 2700 nop 2701 deccc %o2 2702 bz,pt %ncc, .ci_sm_byte 2703.ci_sm_half: 2704 subcc %o2, 2, %o2 ! reduce count by 2 2705 lduha [%o0]ASI_USER, %o3 ! read half word 2706 add %o0, 2, %o0 ! advance SRC by 2 2707 add %o1, 2, %o1 ! advance DST by 2 2708 bgt,pt %ncc, .ci_sm_half ! loop til done 2709 sth %o3, [%o1 - 2] ! write half word 2710 addcc %o2, 1, %o2 ! restore count 2711 bz,pt %ncc, .ci_sm_exit 2712 nop 2713.ci_sm_byte: 2714 lduba [%o0]ASI_USER, %o3 2715 stb %o3, [%o1] 2716 membar #Sync ! sync error barrier 2717 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2718 retl 2719 mov %g0, %o0 ! return 0 2720 .align 16 2721.ci_sm_word: 2722 subcc %o2, 4, %o2 ! update count 2723 bgt,pt %ncc, .ci_sm_wordx 2724 lduwa [%o0]ASI_USER, %o3 ! read word 2725 addcc %o2, 3, %o2 ! restore count 2726 bz,pt %ncc, .ci_sm_exit 2727 stw %o3, [%o1] ! write word 2728 deccc %o2 ! reduce count for cc test 2729 add %o0, 4, %o0 2730 lduba [%o0]ASI_USER, %o3 ! load one byte 2731 bz,pt %ncc, .ci_sm_exit 2732 stb %o3, [%o1 + 4] ! store one byte 2733 inc %o0 2734 lduba [%o0]ASI_USER, %o3 ! load second byte 2735 deccc %o2 2736 bz,pt %ncc, .ci_sm_exit 2737 stb %o3, [%o1 + 5] ! store second byte 2738 inc %o0 2739 lduba [%o0]ASI_USER, %o3 ! load third byte 2740 stb %o3, [%o1 + 6] ! store third byte 2741.ci_sm_exit: 2742 membar #Sync ! sync error barrier 2743 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2744 retl 2745 mov %g0, %o0 ! return 0 2746 2747 .align 16 2748.ci_med: 2749 xor %o0, %o1, %o3 ! setup alignment check 2750 btst 1, %o3 2751 bnz,pt %ncc, .ci_sm_movebytes ! unaligned 2752 nop 2753 btst 3, %o3 2754 bnz,pt %ncc, .ci_med_half ! halfword aligned 2755 nop 2756 btst 7, %o3 2757 bnz,pt %ncc, .ci_med_word ! word aligned 2758 nop 2759.ci_med_long: 2760 btst 3, %o0 ! check for 2761 bz,pt %ncc, .ci_med_long1 ! word alignment 2762 nop 2763.ci_med_long0: 2764 lduba [%o0]ASI_USER, %o3 ! load one byte 2765 inc %o0 2766 stb %o3,[%o1] ! store byte 2767 inc %o1 2768 btst 3, %o0 2769 bnz,pt %ncc, .ci_med_long0 2770 dec %o2 2771.ci_med_long1: ! word aligned 2772 btst 7, %o0 ! check for long word 2773 bz,pt %ncc, .ci_med_long2 2774 nop 2775 lduwa [%o0]ASI_USER, %o3 ! load word 2776 add %o0, 4, %o0 ! advance SRC by 4 2777 stw %o3, [%o1] ! store word 2778 add %o1, 4, %o1 ! advance DST by 4 2779 sub %o2, 4, %o2 ! reduce count by 4 2780! 2781! Now long word aligned and have at least 32 bytes to move 2782! 2783.ci_med_long2: 2784 sub %o2, 31, %o2 ! adjust count to allow cc zero test 2785.ci_med_lmove: 2786 ldxa [%o0]ASI_USER, %o3 ! read long word 2787 subcc %o2, 32, %o2 ! reduce count by 32 2788 stx %o3, [%o1] ! write long word 2789 add %o0, 8, %o0 ! advance SRC by 8 2790 ldxa [%o0]ASI_USER, %o3 ! repeat for a total for 4 long words 2791 add %o0, 8, %o0 ! advance SRC by 8 2792 stx %o3, [%o1 + 8] 2793 add %o1, 32, %o1 ! advance DST by 32 2794 ldxa [%o0]ASI_USER, %o3 2795 add %o0, 8, %o0 ! advance SRC by 8 2796 stx %o3, [%o1 - 16] 2797 ldxa [%o0]ASI_USER, %o3 2798 add %o0, 8, %o0 ! advance SRC by 8 2799 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left 2800 stx %o3, [%o1 - 8] 2801 addcc %o2, 24, %o2 ! restore count to long word offset 2802 ble,pt %ncc, .ci_med_lextra ! check for more long words to move 2803 nop 2804.ci_med_lword: 2805 ldxa [%o0]ASI_USER, %o3 ! read long word 2806 subcc %o2, 8, %o2 ! reduce count by 8 2807 stx %o3, [%o1] ! write long word 2808 add %o0, 8, %o0 ! advance SRC by 8 2809 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left 2810 add %o1, 8, %o1 ! advance DST by 8 2811.ci_med_lextra: 2812 addcc %o2, 7, %o2 ! restore rest of count 2813 bz,pt %ncc, .ci_sm_exit ! if zero, then done 2814 deccc %o2 2815 bz,pt %ncc, .ci_sm_byte 2816 nop 2817 ba,pt %ncc, .ci_sm_half 2818 nop 2819 2820 .align 16 2821 nop ! instruction alignment 2822 ! see discussion at start of file 2823.ci_med_word: 2824 btst 3, %o0 ! check for 2825 bz,pt %ncc, .ci_med_word1 ! word alignment 2826 nop 2827.ci_med_word0: 2828 lduba [%o0]ASI_USER, %o3 ! load one byte 2829 inc %o0 2830 stb %o3,[%o1] ! store byte 2831 inc %o1 2832 btst 3, %o0 2833 bnz,pt %ncc, .ci_med_word0 2834 dec %o2 2835! 2836! Now word aligned and have at least 36 bytes to move 2837! 2838.ci_med_word1: 2839 sub %o2, 15, %o2 ! adjust count to allow cc zero test 2840.ci_med_wmove: 2841 lduwa [%o0]ASI_USER, %o3 ! read word 2842 subcc %o2, 16, %o2 ! reduce count by 16 2843 stw %o3, [%o1] ! write word 2844 add %o0, 4, %o0 ! advance SRC by 4 2845 lduwa [%o0]ASI_USER, %o3 ! repeat for a total for 4 words 2846 add %o0, 4, %o0 ! advance SRC by 4 2847 stw %o3, [%o1 + 4] 2848 add %o1, 16, %o1 ! advance DST by 16 2849 lduwa [%o0]ASI_USER, %o3 2850 add %o0, 4, %o0 ! advance SRC by 4 2851 stw %o3, [%o1 - 8] 2852 lduwa [%o0]ASI_USER, %o3 2853 add %o0, 4, %o0 ! advance SRC by 4 2854 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left 2855 stw %o3, [%o1 - 4] 2856 addcc %o2, 12, %o2 ! restore count to word offset 2857 ble,pt %ncc, .ci_med_wextra ! check for more words to move 2858 nop 2859.ci_med_word2: 2860 lduwa [%o0]ASI_USER, %o3 ! read word 2861 subcc %o2, 4, %o2 ! reduce count by 4 2862 stw %o3, [%o1] ! write word 2863 add %o0, 4, %o0 ! advance SRC by 4 2864 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left 2865 add %o1, 4, %o1 ! advance DST by 4 2866.ci_med_wextra: 2867 addcc %o2, 3, %o2 ! restore rest of count 2868 bz,pt %ncc, .ci_sm_exit ! if zero, then done 2869 deccc %o2 2870 bz,pt %ncc, .ci_sm_byte 2871 nop 2872 ba,pt %ncc, .ci_sm_half 2873 nop 2874 2875 .align 16 2876 nop ! instruction alignment 2877 ! see discussion at start of file 2878.ci_med_half: 2879 btst 1, %o0 ! check for 2880 bz,pt %ncc, .ci_med_half1 ! half word alignment 2881 nop 2882 lduba [%o0]ASI_USER, %o3 ! load one byte 2883 inc %o0 2884 stb %o3,[%o1] ! store byte 2885 inc %o1 2886 dec %o2 2887! 2888! Now half word aligned and have at least 38 bytes to move 2889! 2890.ci_med_half1: 2891 sub %o2, 7, %o2 ! adjust count to allow cc zero test 2892.ci_med_hmove: 2893 lduha [%o0]ASI_USER, %o3 ! read half word 2894 subcc %o2, 8, %o2 ! reduce count by 8 2895 sth %o3, [%o1] ! write half word 2896 add %o0, 2, %o0 ! advance SRC by 2 2897 lduha [%o0]ASI_USER, %o3 ! repeat for a total for 4 halfwords 2898 add %o0, 2, %o0 ! advance SRC by 2 2899 sth %o3, [%o1 + 2] 2900 add %o1, 8, %o1 ! advance DST by 8 2901 lduha [%o0]ASI_USER, %o3 2902 add %o0, 2, %o0 ! advance SRC by 2 2903 sth %o3, [%o1 - 4] 2904 lduha [%o0]ASI_USER, %o3 2905 add %o0, 2, %o0 ! advance SRC by 2 2906 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left 2907 sth %o3, [%o1 - 2] 2908 addcc %o2, 7, %o2 ! restore count 2909 bz,pt %ncc, .ci_sm_exit 2910 deccc %o2 2911 bz,pt %ncc, .ci_sm_byte 2912 nop 2913 ba,pt %ncc, .ci_sm_half 2914 nop 2915 2916.sm_copyin_err: 2917 membar #Sync 2918 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2919 mov SM_SAVE_SRC, %o0 2920 mov SM_SAVE_DST, %o1 2921 mov SM_SAVE_COUNT, %o2 2922 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2923 tst %o3 2924 bz,pt %ncc, 3f ! if not, return error 2925 nop 2926 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with 2927 jmp %o5 ! original arguments 2928 nop 29293: 2930 retl 2931 or %g0, -1, %o0 ! return errno value 2932 2933 SET_SIZE(copyin) 2934 2935 2936/* 2937 * The _more entry points are not intended to be used directly by 2938 * any caller from outside this file. They are provided to allow 2939 * profiling and dtrace of the portions of the copy code that uses 2940 * the floating point registers. 2941 * This entry is particularly important as DTRACE (at least as of 2942 * 4/2004) does not support leaf functions. 2943 */ 2944 2945 ENTRY(copyin_more) 2946.copyin_more: 2947 prefetch [%o0], #n_reads 2948 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2949 set .copyin_err, REAL_LOFAULT 2950 2951/* 2952 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes 2953 */ 2954.do_copyin: 2955 set copyio_fault, %l7 ! .copyio_fault is lofault val 2956 2957 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 2958 membar #Sync ! sync error barrier 2959 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 2960 2961 mov %i0, SAVE_SRC 2962 mov %i1, SAVE_DST 2963 mov %i2, SAVE_COUNT 2964 2965 FP_NOMIGRATE(6, 7) 2966 2967 rd %fprs, %o2 ! check for unused fp 2968 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 2969 btst FPRS_FEF, %o2 2970 bz,a,pt %icc, .do_blockcopyin 2971 wr %g0, FPRS_FEF, %fprs 2972 2973 BST_FPQ2Q4_TOSTACK(%o2) 2974 2975.do_blockcopyin: 2976 rd %gsr, %o2 2977 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 2978 or %l6, FPUSED_FLAG, %l6 2979 2980 andcc DST, VIS_BLOCKSIZE - 1, TMP 2981 mov ASI_USER, %asi 2982 bz,pt %ncc, 2f 2983 neg TMP 2984 add TMP, VIS_BLOCKSIZE, TMP 2985 2986 ! TMP = bytes required to align DST on FP_BLOCK boundary 2987 ! Using SRC as a tmp here 2988 cmp TMP, 3 2989 bleu,pt %ncc, 1f 2990 sub CNT,TMP,CNT ! adjust main count 2991 sub TMP, 3, TMP ! adjust for end of loop test 2992.ci_blkalign: 2993 lduba [REALSRC]%asi, SRC ! move 4 bytes per loop iteration 2994 stb SRC, [DST] 2995 subcc TMP, 4, TMP 2996 lduba [REALSRC + 1]%asi, SRC 2997 add REALSRC, 4, REALSRC 2998 stb SRC, [DST + 1] 2999 lduba [REALSRC - 2]%asi, SRC 3000 add DST, 4, DST 3001 stb SRC, [DST - 2] 3002 lduba [REALSRC - 1]%asi, SRC 3003 bgu,pt %ncc, .ci_blkalign 3004 stb SRC, [DST - 1] 3005 3006 addcc TMP, 3, TMP ! restore count adjustment 3007 bz,pt %ncc, 2f ! no bytes left? 3008 nop 30091: lduba [REALSRC]%asi, SRC 3010 inc REALSRC 3011 inc DST 3012 deccc TMP 3013 bgu %ncc, 1b 3014 stb SRC, [DST - 1] 3015 30162: 3017 membar #StoreLoad 3018 andn REALSRC, 0x7, SRC 3019 3020 ! SRC - 8-byte aligned 3021 ! DST - 64-byte aligned 3022 ldda [SRC]%asi, %f16 3023 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads 3024 alignaddr REALSRC, %g0, %g0 3025 ldda [SRC + 0x08]%asi, %f18 3026 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads 3027 faligndata %f16, %f18, %f48 3028 ldda [SRC + 0x10]%asi, %f20 3029 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #one_read 3030 faligndata %f18, %f20, %f50 3031 ldda [SRC + 0x18]%asi, %f22 3032 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read 3033 faligndata %f20, %f22, %f52 3034 ldda [SRC + 0x20]%asi, %f24 3035 prefetcha [SRC + (5 * VIS_BLOCKSIZE)]%asi, #one_read 3036 faligndata %f22, %f24, %f54 3037 ldda [SRC + 0x28]%asi, %f26 3038 prefetcha [SRC + (6 * VIS_BLOCKSIZE)]%asi, #one_read 3039 faligndata %f24, %f26, %f56 3040 ldda [SRC + 0x30]%asi, %f28 3041 prefetcha [SRC + (7 * VIS_BLOCKSIZE)]%asi, #one_read 3042 faligndata %f26, %f28, %f58 3043 ldda [SRC + 0x38]%asi, %f30 3044 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3045 sub CNT, VIS_BLOCKSIZE, CNT 3046 add SRC, VIS_BLOCKSIZE, SRC 3047 prefetcha [SRC + (9 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE]%asi, #one_read 3048 add REALSRC, VIS_BLOCKSIZE, REALSRC 3049 ba,pt %ncc, 1f 3050 prefetcha [SRC + (10 * VIS_BLOCKSIZE) - VIS_BLOCKSIZE]%asi, #one_read 3051 .align 32 30521: 3053 ldda [SRC + 0x08]%asi, %f18 3054 faligndata %f28, %f30, %f60 3055 ldda [SRC + 0x10]%asi, %f20 3056 faligndata %f30, %f16, %f62 3057 stda %f48, [DST]ASI_BLK_P 3058 ldda [SRC + 0x18]%asi, %f22 3059 faligndata %f16, %f18, %f48 3060 ldda [SRC + 0x20]%asi, %f24 3061 faligndata %f18, %f20, %f50 3062 ldda [SRC + 0x28]%asi, %f26 3063 faligndata %f20, %f22, %f52 3064 ldda [SRC + 0x30]%asi, %f28 3065 faligndata %f22, %f24, %f54 3066 ldda [SRC + 0x38]%asi, %f30 3067 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #n_reads 3068 faligndata %f24, %f26, %f56 3069 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3070 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 3071 faligndata %f26, %f28, %f58 3072 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE) + 0x20]%asi, #n_reads 3073 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE) + 0x20]%asi, #one_read 3074 sub CNT, VIS_BLOCKSIZE, CNT 3075 add DST, VIS_BLOCKSIZE, DST 3076 add REALSRC, VIS_BLOCKSIZE, REALSRC 3077 cmp CNT, VIS_BLOCKSIZE + 8 3078 bgu,pt %ncc, 1b 3079 add SRC, VIS_BLOCKSIZE, SRC 3080 3081 ! only if REALSRC & 0x7 is 0 3082 cmp CNT, VIS_BLOCKSIZE 3083 bne %ncc, 3f 3084 andcc REALSRC, 0x7, %g0 3085 bz,pt %ncc, 2f 3086 nop 30873: 3088 faligndata %f28, %f30, %f60 3089 faligndata %f30, %f16, %f62 3090 stda %f48, [DST]ASI_BLK_P 3091 add DST, VIS_BLOCKSIZE, DST 3092 ba,pt %ncc, 3f 3093 nop 30942: 3095 ldda [SRC + 0x08]%asi, %f18 3096 fsrc1 %f28, %f60 3097 ldda [SRC + 0x10]%asi, %f20 3098 fsrc1 %f30, %f62 3099 stda %f48, [DST]ASI_BLK_P 3100 ldda [SRC + 0x18]%asi, %f22 3101 fsrc1 %f16, %f48 3102 ldda [SRC + 0x20]%asi, %f24 3103 fsrc1 %f18, %f50 3104 ldda [SRC + 0x28]%asi, %f26 3105 fsrc1 %f20, %f52 3106 ldda [SRC + 0x30]%asi, %f28 3107 fsrc1 %f22, %f54 3108 ldda [SRC + 0x38]%asi, %f30 3109 fsrc1 %f24, %f56 3110 sub CNT, VIS_BLOCKSIZE, CNT 3111 add DST, VIS_BLOCKSIZE, DST 3112 add SRC, VIS_BLOCKSIZE, SRC 3113 add REALSRC, VIS_BLOCKSIZE, REALSRC 3114 fsrc1 %f26, %f58 3115 fsrc1 %f28, %f60 3116 fsrc1 %f30, %f62 3117 stda %f48, [DST]ASI_BLK_P 3118 add DST, VIS_BLOCKSIZE, DST 3119 ba,a,pt %ncc, 4f 3120 nop 3121 31223: tst CNT 3123 bz,a %ncc, 4f 3124 nop 3125 31265: lduba [REALSRC]ASI_USER, TMP 3127 inc REALSRC 3128 inc DST 3129 deccc CNT 3130 bgu %ncc, 5b 3131 stb TMP, [DST - 1] 31324: 3133 3134.copyin_exit: 3135 membar #Sync 3136 3137 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 3138 wr %o2, 0, %gsr 3139 3140 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 3141 btst FPRS_FEF, %o3 3142 bz,pt %icc, 4f 3143 nop 3144 3145 BLD_FPQ2Q4_FROMSTACK(%o2) 3146 3147 ba,pt %ncc, 1f 3148 wr %o3, 0, %fprs ! restore fprs 3149 31504: 3151 FZEROQ2Q4 3152 wr %o3, 0, %fprs ! restore fprs 3153 31541: 3155 membar #Sync ! sync error barrier 3156 andn %l6, FPUSED_FLAG, %l6 3157 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3158 FP_ALLOWMIGRATE(5, 6) 3159 ret 3160 restore %g0, 0, %o0 3161/* 3162 * We got here because of a fault during copyin 3163 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 3164 */ 3165.copyin_err: 3166 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3167 tst %o4 3168 bz,pt %ncc, 2f ! if not, return error 3169 nop 3170 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with 3171 jmp %g2 ! original arguments 3172 restore %g0, 0, %g0 ! dispose of copy window 31732: 3174 ret 3175 restore %g0, -1, %o0 ! return error value 3176 3177 3178 SET_SIZE(copyin_more) 3179 3180#endif /* lint */ 3181 3182#ifdef lint 3183 3184/*ARGSUSED*/ 3185int 3186xcopyin(const void *uaddr, void *kaddr, size_t count) 3187{ return (0); } 3188 3189#else /* lint */ 3190 3191 ENTRY(xcopyin) 3192 3193 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3194 bleu,pt %ncc, .xcopyin_small ! go to larger cases 3195 xor %o0, %o1, %o3 ! are src, dst alignable? 3196 btst 7, %o3 ! 3197 bz,pt %ncc, .xcopyin_8 ! check for longword alignment 3198 nop 3199 btst 1, %o3 ! 3200 bz,pt %ncc, .xcopyin_2 ! check for half-word 3201 nop 3202 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3203 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3204 tst %o3 3205 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3206 cmp %o2, %o3 ! if length <= limit 3207 bleu,pt %ncc, .xcopyin_small ! go to small copy 3208 nop 3209 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3210 nop 3211.xcopyin_2: 3212 btst 3, %o3 ! 3213 bz,pt %ncc, .xcopyin_4 ! check for word alignment 3214 nop 3215 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3216 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3217 tst %o3 3218 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3219 cmp %o2, %o3 ! if length <= limit 3220 bleu,pt %ncc, .xcopyin_small ! go to small copy 3221 nop 3222 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3223 nop 3224.xcopyin_4: 3225 ! already checked longword, must be word aligned 3226 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3227 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3228 tst %o3 3229 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3230 cmp %o2, %o3 ! if length <= limit 3231 bleu,pt %ncc, .xcopyin_small ! go to small copy 3232 nop 3233 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3234 nop 3235.xcopyin_8: 3236 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3237 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3238 tst %o3 3239 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3240 cmp %o2, %o3 ! if length <= limit 3241 bleu,pt %ncc, .xcopyin_small ! go to small copy 3242 nop 3243 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3244 nop 3245 3246.xcopyin_small: 3247 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value 3248 or %o5, %lo(.sm_xcopyin_err), %o5 3249 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul 3250 membar #Sync ! sync error barrier 3251 ba,pt %ncc, .sm_do_copyin ! common code 3252 stn %o5, [THREAD_REG + T_LOFAULT] 3253 3254.xcopyin_more: 3255 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3256 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value 3257 ba,pt %ncc, .do_copyin 3258 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 3259 3260/* 3261 * We got here because of fault during xcopyin 3262 * Errno value is in ERRNO 3263 */ 3264.xcopyin_err: 3265 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3266 tst %o4 3267 bz,pt %ncc, 2f ! if not, return error 3268 nop 3269 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with 3270 jmp %g2 ! original arguments 3271 restore %g0, 0, %g0 ! dispose of copy window 32722: 3273 ret 3274 restore ERRNO, 0, %o0 ! return errno value 3275 3276.sm_xcopyin_err: 3277 3278 membar #Sync 3279 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3280 mov SM_SAVE_SRC, %o0 3281 mov SM_SAVE_DST, %o1 3282 mov SM_SAVE_COUNT, %o2 3283 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 3284 tst %o3 3285 bz,pt %ncc, 3f ! if not, return error 3286 nop 3287 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with 3288 jmp %o5 ! original arguments 3289 nop 32903: 3291 retl 3292 or %g1, 0, %o0 ! return errno value 3293 3294 SET_SIZE(xcopyin) 3295 3296#endif /* lint */ 3297 3298#ifdef lint 3299 3300/*ARGSUSED*/ 3301int 3302xcopyin_little(const void *uaddr, void *kaddr, size_t count) 3303{ return (0); } 3304 3305#else /* lint */ 3306 3307 ENTRY(xcopyin_little) 3308 sethi %hi(.xcopyio_err), %o5 3309 or %o5, %lo(.xcopyio_err), %o5 3310 ldn [THREAD_REG + T_LOFAULT], %o4 3311 membar #Sync ! sync error barrier 3312 stn %o5, [THREAD_REG + T_LOFAULT] 3313 mov %o4, %o5 3314 3315 subcc %g0, %o2, %o3 3316 add %o0, %o2, %o0 3317 bz,pn %ncc, 2f ! check for zero bytes 3318 sub %o2, 1, %o4 3319 add %o0, %o4, %o0 ! start w/last byte 3320 add %o1, %o2, %o1 3321 lduba [%o0 + %o3]ASI_AIUSL, %o4 3322 33231: stb %o4, [%o1 + %o3] 3324 inccc %o3 3325 sub %o0, 2, %o0 ! get next byte 3326 bcc,a,pt %ncc, 1b 3327 lduba [%o0 + %o3]ASI_AIUSL, %o4 3328 33292: 3330 membar #Sync ! sync error barrier 3331 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3332 retl 3333 mov %g0, %o0 ! return (0) 3334 3335.xcopyio_err: 3336 membar #Sync ! sync error barrier 3337 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3338 retl 3339 mov %g1, %o0 3340 3341 SET_SIZE(xcopyin_little) 3342 3343#endif /* lint */ 3344 3345 3346/* 3347 * Copy a block of storage - must not overlap (from + len <= to). 3348 * No fault handler installed (to be called under on_fault()) 3349 */ 3350#if defined(lint) 3351 3352/* ARGSUSED */ 3353void 3354copyin_noerr(const void *ufrom, void *kto, size_t count) 3355{} 3356 3357#else /* lint */ 3358 ENTRY(copyin_noerr) 3359 3360 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3361 bleu,pt %ncc, .copyin_ne_small ! go to larger cases 3362 xor %o0, %o1, %o3 ! are src, dst alignable? 3363 btst 7, %o3 ! 3364 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment 3365 nop 3366 btst 1, %o3 ! 3367 bz,pt %ncc, .copyin_ne_2 ! check for half-word 3368 nop 3369 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3370 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3371 tst %o3 3372 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3373 cmp %o2, %o3 ! if length <= limit 3374 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3375 nop 3376 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3377 nop 3378.copyin_ne_2: 3379 btst 3, %o3 ! 3380 bz,pt %ncc, .copyin_ne_4 ! check for word alignment 3381 nop 3382 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3383 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3384 tst %o3 3385 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3386 cmp %o2, %o3 ! if length <= limit 3387 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3388 nop 3389 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3390 nop 3391.copyin_ne_4: 3392 ! already checked longword, must be word aligned 3393 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3394 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3395 tst %o3 3396 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3397 cmp %o2, %o3 ! if length <= limit 3398 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3399 nop 3400 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3401 nop 3402.copyin_ne_8: 3403 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3404 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3405 tst %o3 3406 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3407 cmp %o2, %o3 ! if length <= limit 3408 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3409 nop 3410 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3411 nop 3412 3413.copyin_ne_small: 3414 ldn [THREAD_REG + T_LOFAULT], %o4 3415 tst %o4 3416 bz,pn %ncc, .sm_do_copyin 3417 nop 3418 sethi %hi(.sm_copyio_noerr), %o5 3419 or %o5, %lo(.sm_copyio_noerr), %o5 3420 membar #Sync ! sync error barrier 3421 ba,pt %ncc, .sm_do_copyin 3422 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3423 3424.copyin_noerr_more: 3425 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3426 sethi %hi(.copyio_noerr), REAL_LOFAULT 3427 ba,pt %ncc, .do_copyin 3428 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3429 3430.copyio_noerr: 3431 jmp %l6 3432 restore %g0,0,%g0 3433 3434.sm_copyio_noerr: 3435 membar #Sync 3436 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault 3437 jmp %o4 3438 nop 3439 3440 SET_SIZE(copyin_noerr) 3441#endif /* lint */ 3442 3443/* 3444 * Copy a block of storage - must not overlap (from + len <= to). 3445 * No fault handler installed (to be called under on_fault()) 3446 */ 3447 3448#if defined(lint) 3449 3450/* ARGSUSED */ 3451void 3452copyout_noerr(const void *kfrom, void *uto, size_t count) 3453{} 3454 3455#else /* lint */ 3456 ENTRY(copyout_noerr) 3457 3458 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3459 bleu,pt %ncc, .copyout_ne_small ! go to larger cases 3460 xor %o0, %o1, %o3 ! are src, dst alignable? 3461 btst 7, %o3 ! 3462 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment 3463 nop 3464 btst 1, %o3 ! 3465 bz,pt %ncc, .copyout_ne_2 ! check for half-word 3466 nop 3467 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3468 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3469 tst %o3 3470 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3471 cmp %o2, %o3 ! if length <= limit 3472 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3473 nop 3474 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3475 nop 3476.copyout_ne_2: 3477 btst 3, %o3 ! 3478 bz,pt %ncc, .copyout_ne_4 ! check for word alignment 3479 nop 3480 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3481 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3482 tst %o3 3483 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3484 cmp %o2, %o3 ! if length <= limit 3485 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3486 nop 3487 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3488 nop 3489.copyout_ne_4: 3490 ! already checked longword, must be word aligned 3491 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3492 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3493 tst %o3 3494 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3495 cmp %o2, %o3 ! if length <= limit 3496 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3497 nop 3498 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3499 nop 3500.copyout_ne_8: 3501 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3502 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3503 tst %o3 3504 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3505 cmp %o2, %o3 ! if length <= limit 3506 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3507 nop 3508 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3509 nop 3510 3511.copyout_ne_small: 3512 ldn [THREAD_REG + T_LOFAULT], %o4 3513 tst %o4 3514 bz,pn %ncc, .sm_do_copyout 3515 nop 3516 sethi %hi(.sm_copyio_noerr), %o5 3517 or %o5, %lo(.sm_copyio_noerr), %o5 3518 membar #Sync ! sync error barrier 3519 ba,pt %ncc, .sm_do_copyout 3520 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3521 3522.copyout_noerr_more: 3523 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3524 sethi %hi(.copyio_noerr), REAL_LOFAULT 3525 ba,pt %ncc, .do_copyout 3526 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3527 3528 SET_SIZE(copyout_noerr) 3529#endif /* lint */ 3530 3531 3532/* 3533 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 3534 * longer than 256 bytes in length using spitfire's block stores. If 3535 * the criteria for using this routine are not met then it calls bzero 3536 * and returns 1. Otherwise 0 is returned indicating success. 3537 * Caller is responsible for ensuring use_hw_bzero is true and that 3538 * kpreempt_disable() has been called. 3539 */ 3540#ifdef lint 3541/*ARGSUSED*/ 3542int 3543hwblkclr(void *addr, size_t len) 3544{ 3545 return(0); 3546} 3547#else /* lint */ 3548 ! %i0 - start address 3549 ! %i1 - length of region (multiple of 64) 3550 ! %l0 - saved fprs 3551 ! %l1 - pointer to saved %d0 block 3552 ! %l2 - saved curthread->t_lwp 3553 3554 ENTRY(hwblkclr) 3555 ! get another window w/space for one aligned block of saved fpregs 3556 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp 3557 3558 ! Must be block-aligned 3559 andcc %i0, (VIS_BLOCKSIZE-1), %g0 3560 bnz,pn %ncc, 1f 3561 nop 3562 3563 ! ... and must be 256 bytes or more 3564 cmp %i1, 256 3565 blu,pn %ncc, 1f 3566 nop 3567 3568 ! ... and length must be a multiple of VIS_BLOCKSIZE 3569 andcc %i1, (VIS_BLOCKSIZE-1), %g0 3570 bz,pn %ncc, 2f 3571 nop 3572 35731: ! punt, call bzero but notify the caller that bzero was used 3574 mov %i0, %o0 3575 call bzero 3576 mov %i1, %o1 3577 ret 3578 restore %g0, 1, %o0 ! return (1) - did not use block operations 3579 35802: rd %fprs, %l0 ! check for unused fp 3581 btst FPRS_FEF, %l0 3582 bz,pt %icc, 1f 3583 nop 3584 3585 ! save in-use fpregs on stack 3586 membar #Sync 3587 add %fp, STACK_BIAS - 65, %l1 3588 and %l1, -VIS_BLOCKSIZE, %l1 3589 stda %d0, [%l1]ASI_BLK_P 3590 35911: membar #StoreStore|#StoreLoad|#LoadStore 3592 wr %g0, FPRS_FEF, %fprs 3593 wr %g0, ASI_BLK_P, %asi 3594 3595 ! Clear block 3596 fzero %d0 3597 fzero %d2 3598 fzero %d4 3599 fzero %d6 3600 fzero %d8 3601 fzero %d10 3602 fzero %d12 3603 fzero %d14 3604 3605 mov 256, %i3 3606 ba,pt %ncc, .pz_doblock 3607 nop 3608 3609.pz_blkstart: 3610 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here 3611 stda %d0, [%i0 + 128]%asi 3612 stda %d0, [%i0 + 64]%asi 3613 stda %d0, [%i0]%asi 3614.pz_zinst: 3615 add %i0, %i3, %i0 3616 sub %i1, %i3, %i1 3617.pz_doblock: 3618 cmp %i1, 256 3619 bgeu,a %ncc, .pz_blkstart 3620 stda %d0, [%i0 + 192]%asi 3621 3622 cmp %i1, 64 3623 blu %ncc, .pz_finish 3624 3625 andn %i1, (64-1), %i3 3626 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words 3627 set .pz_zinst, %i4 3628 sub %i4, %i2, %i4 3629 jmp %i4 3630 nop 3631 3632.pz_finish: 3633 membar #Sync 3634 btst FPRS_FEF, %l0 3635 bz,a .pz_finished 3636 wr %l0, 0, %fprs ! restore fprs 3637 3638 ! restore fpregs from stack 3639 ldda [%l1]ASI_BLK_P, %d0 3640 membar #Sync 3641 wr %l0, 0, %fprs ! restore fprs 3642 3643.pz_finished: 3644 ret 3645 restore %g0, 0, %o0 ! return (bzero or not) 3646 3647 SET_SIZE(hwblkclr) 3648#endif /* lint */ 3649 3650#ifdef lint 3651/*ARGSUSED*/ 3652void 3653hw_pa_bcopy32(uint64_t src, uint64_t dst) 3654{} 3655#else /*!lint */ 3656 /* 3657 * Copy 32 bytes of data from src (%o0) to dst (%o1) 3658 * using physical addresses. 3659 */ 3660 ENTRY_NP(hw_pa_bcopy32) 3661 rdpr %pstate, %g1 3662 andn %g1, PSTATE_IE, %g2 3663 wrpr %g0, %g2, %pstate 3664 3665 rdpr %pstate, %g0 3666 ldxa [%o0]ASI_MEM, %o2 3667 add %o0, 8, %o0 3668 ldxa [%o0]ASI_MEM, %o3 3669 add %o0, 8, %o0 3670 ldxa [%o0]ASI_MEM, %o4 3671 add %o0, 8, %o0 3672 ldxa [%o0]ASI_MEM, %o5 3673 membar #Sync 3674 3675 stxa %o2, [%o1]ASI_MEM 3676 add %o1, 8, %o1 3677 stxa %o3, [%o1]ASI_MEM 3678 add %o1, 8, %o1 3679 stxa %o4, [%o1]ASI_MEM 3680 add %o1, 8, %o1 3681 stxa %o5, [%o1]ASI_MEM 3682 3683 retl 3684 wrpr %g0, %g1, %pstate 3685 3686 SET_SIZE(hw_pa_bcopy32) 3687 3688#endif /* lint */ 3689 3690#if defined(lint) 3691 3692int use_hw_bcopy = 1; 3693int use_hw_bzero = 1; 3694uint_t hw_copy_limit_1 = 0; 3695uint_t hw_copy_limit_2 = 0; 3696uint_t hw_copy_limit_4 = 0; 3697uint_t hw_copy_limit_8 = 0; 3698 3699#else /* !lint */ 3700 3701 DGDEF(use_hw_bcopy) 3702 .word 1 3703 DGDEF(use_hw_bzero) 3704 .word 1 3705 DGDEF(hw_copy_limit_1) 3706 .word 0 3707 DGDEF(hw_copy_limit_2) 3708 .word 0 3709 DGDEF(hw_copy_limit_4) 3710 .word 0 3711 DGDEF(hw_copy_limit_8) 3712 .word 0 3713 3714 .align 64 3715 .section ".text" 3716#endif /* !lint */ 3717