/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>

#if !defined(lint)
#include "assym.h"
#endif	/* lint */

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 *	! Determine whether to use the FP register version
 *	! or the leaf routine version depending on size
 *	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 *	! dst addresses can be aligned to long word, word,
 *	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 * if (length <= VIS_COPY_THRESHOLD)	! start with a quick test
 *	go to small_copy;		! to speed short copies
 *
 * if (src,dst long word alignable) {
 *	if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_8)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 * if (src,dst not alignable) {
 *	if (hw_copy_limit_1 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_1)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 * if (src,dst halfword alignable) {
 *	if (hw_copy_limit_2 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_2)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 * if (src,dst word alignable) {
 *	if (hw_copy_limit_4 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_4)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler;	! diffs for each entry point
 *
 *	if (count <= 3)			! fast path for tiny copies
 *		go to sm_left;		! special finish up code
 *	else
 *		if (count > CHKSIZE)	! medium sized copies
 *			go to sm_med	! tuned by alignment
 *		if (src&dst not both word aligned) {
 *	sm_movebytes:
 *			move byte by byte in 4-way unrolled loop
 *			fall into sm_left;
 *	sm_left:
 *			move 0-3 bytes byte at a time as needed.
 *			restore error handler and exit.
 *
 *		} else {	! src&dst are word aligned
 *			check for at least 8 bytes left,
 *			move word at a time, unrolled by 2
 *			when fewer than 8 bytes left,
 *	sm_half:	move half word at a time while 2 or more bytes left
 *	sm_byte:	move final byte if necessary
 *	sm_exit:
 *			restore error handler and exit.
 *		}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if (src&dst unalignable)
 *		go to sm_movebytes
 *	if (src&dst halfword alignable)
 *		go to sm_movehalf
 *	if (src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 *	%l6 = curthread->t_lofault;
 *	if (%l6 != NULL) {
 *		membar #Sync
 *		curthread->t_lofault = .copyerr;
 *		caller_error_handler = TRUE	! %l6 |= 2
 *	}
 *
 *	! for FPU testing we must not migrate cpus
 *	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 *		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 *	old_fprs = %fprs;
 *	old_gsr = %gsr;
 *	if (%fprs.fef) {
 *		%fprs.fef = 1;
 *		save current fpregs on stack using blockstore
 *	} else {
 *		%fprs.fef = 1;
 *	}
 *
 *
 *	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 *	%gsr = old_gsr;
 *	if (old_fprs & FPRS_FEF)
 *		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 *	%fprs = old_fprs;
 *	membar #Sync
 *	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 *	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 *	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent returning with a corrupted fp state, we will panic.
 */
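
/*
 * For reference, the entry dispatch above condenses to the following C
 * sketch.  The helper name is hypothetical; the real code is the
 * hand-scheduled assembly in kcopy/bcopy/copyout/copyin below.
 *
 *	extern uint_t hw_copy_limit_1, hw_copy_limit_2,
 *	    hw_copy_limit_4, hw_copy_limit_8;
 *
 *	static int
 *	use_fpblk(uintptr_t src, uintptr_t dst, size_t len)
 *	{
 *		uint_t lim;
 *
 *		if (len <= VIS_COPY_THRESHOLD)
 *			return (0);		// small_copy
 *		switch ((src ^ dst) & 0x7) {
 *		case 0:				// long word alignable
 *			lim = hw_copy_limit_8;
 *			break;
 *		case 2: case 6:			// halfword alignable
 *			lim = hw_copy_limit_2;
 *			break;
 *		case 4:				// word alignable
 *			lim = hw_copy_limit_4;
 *			break;
 *		default:			// odd xor: byte moves only
 *			lim = hw_copy_limit_1;
 *			break;
 *		}
 *		if (lim == 0 || len <= lim)	// hw_copy disabled, or short
 *			return (0);		// small_copy
 *		return (1);			// FPBLK_copy
 *	}
 */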

/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 * byte aligned:  108 clocks slower for non-FPBLK
 * half aligned:   44 clocks slower for non-FPBLK
 * word aligned:   12 clocks slower for non-FPBLK
 * long aligned:    4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer-visible performance regression if the
 * non-FPBLK code is used for larger copies.  Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 * hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 */
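
/*
 * Worked example of the alignment classification: only the relative
 * alignment matters, which is why the code tests (src ^ dst) rather
 * than the individual addresses.  For src = 0x10001 and dst = 0x20005,
 *
 *	(0x10001 ^ 0x20005) & 7 = 0x30004 & 7 = 4
 *
 * so the pair is "word alignable": after 3 initial byte moves src is
 * 0x10004 and dst is 0x20008, and both stay word aligned from then on
 * (hw_copy_limit_4 governs this copy).  Any odd xor value means no
 * amount of initial byte movement can align both operands.
 */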

/*
 * Default values as of May 2005 are:
 * hw_copy_limit_1 =  256
 * hw_copy_limit_2 =  512
 * hw_copy_limit_4 = 1024
 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 * 4 of 8 will not be alignable.
 * 2 of 8 will be half word alignable.
 * 1 of 8 will be word alignable.
 * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases.  The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 */

/*
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 */
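
/*
 * As a C sketch of the same idea (GCC-style __builtin_prefetch and a
 * hypothetical copy_line() helper standing in for the unrolled body;
 * distances in 64-byte cache lines, per the discussion above):
 *
 *	for (i = 0; i < nlines; i++) {
 *		__builtin_prefetch(src + (i + 8) * 64);	// primary, max reach
 *		__builtin_prefetch(src + (i + 5) * 64);	// recovers drops
 *		copy_line(dst + i * 64, src + i * 64);
 *	}
 *
 * The Olympus-C loops below use the same two-prefetch structure with
 * reaches tuned via OLYMPUS_C_PREFETCH/OLYMPUS_C_2ND_PREFETCH.
 */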

/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the
 *	  handling uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */

/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references.  Change it here, change it
 * there.
 */
#define	VIS_COPY_THRESHOLD	256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define	SHORTCOPY	3
#define	CHKSIZE		39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3
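
/*
 * Handler addresses are at least 4-byte aligned, so the two low bits
 * of the saved t_lofault value are free to carry the flags above.
 * A C sketch of the encoding the assembly performs on %l6/%o4:
 *
 *	uintptr_t l6 = curthread->t_lofault;	// save caller's handler
 *	l6 |= TRAMP_FLAG;			// trampoline on error
 *	l6 |= FPUSED_FLAG;			// FP state saved on stack
 *	...
 *	if (l6 & FPUSED_FLAG)			// restore FP state?
 *		...
 *	curthread->t_lofault = l6 & ~(uintptr_t)MASK_FLAGS;  // strip flags
 */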

/*
 * Number of outstanding prefetches.
 * first prefetch moves data from L2 to L1 (n_reads)
 * second prefetch moves data from memory to L2 (one_read)
 */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
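
/*
 * Worked example of the layout arithmetic above (VIS_BLOCKSIZE == 64):
 *
 *	HWCOPYFRAMESIZE     = 64 * 3 + 2 * 8 = 208
 *	SAVED_FPREGS_OFFSET = 192
 *	SAVED_FPREGS_ADJUST = 127
 *
 * The two-quadrant save needs only 2 * 64 = 128 bytes, but
 * %fp + STACK_BIAS - SAVED_FPREGS_OFFSET is not necessarily 64-byte
 * aligned.  The BST/BLD macros below therefore compute (in C terms)
 *
 *	save = (fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -64;
 *
 * which always yields a block-aligned 128-byte window inside the
 * reserved 3 * 64 = 192-byte area, whatever the frame's alignment.
 */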

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
#if !defined(lint)
#define	BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define	BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
#endif

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass
 * registers to use - we just use any outputs we want.
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_nomigrate();
 * } else {
 *	kpreempt_disable();
 * }
 *
 * FP_ALLOWMIGRATE:
 *
 * if (curthread->t_lwp) {
 *	thread_allowmigrate();
 * } else {
 *	kpreempt_enable();
 * }
 */
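
/*
 * The macros below open-code the kpreempt_disable()/kpreempt_enable()
 * fast paths instead of calling them.  In C terms (a sketch; field and
 * function names as used by the assembly via assym.h):
 *
 *	if (curthread->t_lwp != NULL) {
 *		thread_nomigrate();		// FP_NOMIGRATE
 *	} else {
 *		curthread->t_preempt++;		// disable preemption
 *	}
 *	...
 *	if (curthread->t_lwp != NULL) {
 *		thread_allowmigrate();		// FP_ALLOWMIGRATE
 *	} else if (--curthread->t_preempt == 0 && CPU->cpu_kprunrun) {
 *		kpreempt(pil);			// service pending preemption
 *	}					// (the asm passes current %pil)
 */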

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	nop							;\
	call	kpreempt					;\
	rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return (0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy cases
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy
 * or bcopy.  Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs			! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs			! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy.  There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	mov	%g0, %o0			!

	SET_SIZE(kcopy)
#endif	/* lint */
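
/*
 * In effect, kcopy() is bcopy() run under a kernel fault handler.  A
 * rough C analogue using the generic on_fault(9F) mechanism (sketch
 * only - the real routine installs .copyerr/.sm_copyerr directly and
 * returns the errno the trap code delivers in %g1, rather than a
 * fixed EFAULT):
 *
 *	int
 *	kcopy_sketch(const void *from, void *to, size_t count)
 *	{
 *		label_t ljb;
 *
 *		if (on_fault(&ljb)) {
 *			no_fault();
 *			return (EFAULT);
 *		}
 *		bcopy(from, to, count);
 *		no_fault();
 *		return (0);
 *	}
 */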

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 *
 * Copy a page of memory.
 * Assumes double word alignment and a count >= 256.
 */
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy cases
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	stb	%o3, [%o1 + 2]
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	nop
.bc_sm_byte:
	ldub	[%o0], %o3
	stb	%o3, [%o1]
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align	16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	dec	%o2
.bc_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	SET_SIZE(bcopy)
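
/*
 * For reference, the .bc_med_long path above corresponds to this C
 * sketch (assumes src/dst are mutually long-word alignable and count
 * is at least CHKSIZE, as established by the dispatch code; the casts
 * stand in for the aligned ldx/stx the assembly performs):
 *
 *	uint8_t *s = src, *d = dst;
 *
 *	while ((uintptr_t)s & 3) {		// byte-align to a word
 *		*d++ = *s++;
 *		count--;
 *	}
 *	if ((uintptr_t)s & 7) {			// then to a long word
 *		*(uint32_t *)d = *(uint32_t *)s;
 *		s += 4; d += 4; count -= 4;
 *	}
 *	while (count >= 32) {			// 4-way unrolled main loop
 *		((uint64_t *)d)[0] = ((uint64_t *)s)[0];
 *		((uint64_t *)d)[1] = ((uint64_t *)s)[1];
 *		((uint64_t *)d)[2] = ((uint64_t *)s)[2];
 *		((uint64_t *)d)[3] = ((uint64_t *)s)[3];
 *		s += 32; d += 32; count -= 32;
 *	}
 *	while (count >= 8) {			// 1-way cleanup loop
 *		*(uint64_t *)d = *(uint64_t *)s;
 *		s += 8; d += 8; count -= 8;
 *	}
 *	// 0-7 bytes remain; finished by the sm_half/sm_byte tails
 */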

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(bcopy_more)
.bcopy_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path.  If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
 * Also, the dispatch code has already verified that HW (FPBLK) copy
 * is enabled for this alignment and length.
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f10, %f12, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync			! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)

#endif	/* lint */
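
/*
 * When src and dst are not mutually 8-byte alignable, the inner loop
 * above loads aligned 8-byte doubles from below REALSRC and uses
 * alignaddr/faligndata to shift each adjacent pair into an aligned
 * double for the 64-byte block store.  The effect, sketched in C for
 * the big-endian case (alignaddr latches the byte offset in %gsr;
 * faligndata extracts 8 bytes from a 16-byte concatenation):
 *
 *	off  = (uintptr_t)realsrc & 7;		// alignaddr REALSRC, %g0, %g0
 *	s    = (const uint64_t *)((uintptr_t)realsrc & ~7UL);
 *	prev = s[0];
 *	for (i = 0; i < n; i++) {		// one faligndata per double
 *		next   = s[i + 1];
 *		dst[i] = off ?
 *		    (prev << (off * 8)) | (next >> ((8 - off) * 8)) :
 *		    prev;
 *		prev   = next;
 *	}
 */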

/*
 * Block copy with possibly overlapped operands.
 */

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop

	SET_SIZE(ovbcopy)

#endif	/* lint */
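
/*
 * ovbcopy()'s dispatch, as a C sketch (from/to as byte pointers): if
 * the regions do not in fact overlap it punts to the fast bcopy();
 * otherwise it picks the direction that reads every source byte
 * before that byte can be overwritten:
 *
 *	if (count == 0)
 *		return;
 *	if ((size_t)(from > to ? from - to : to - from) >= count) {
 *		bcopy(from, to, count);		// no real overlap
 *	} else if (from > to) {
 *		while (count--)			// forward, byte at a time
 *			*to++ = *from++;
 *	} else {
 *		while (count--)			// backward, byte at a time
 *			to[count] = from[count];
 *	}
 */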

/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else	/* lint */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f0, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f2, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f4, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f6, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f8, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f10, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	ba,pt	%ncc, 2f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
2:
	ldd	[SRC + 0x08], %f2
	fmovd	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fmovd	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fmovd	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fmovd	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fmovd	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fmovd	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fmovd	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fmovd	%f10, %f42
	sub	CNT, VIS_BLOCKSIZE, CNT
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	SRC, VIS_BLOCKSIZE, SRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 2b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)
#endif	/* lint */

/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and xcopyout()
 * which return the errno that we've faithfully computed.  This
 * allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned.  Future enhancement to make them handle 8k pages
 * efficiently is left as an exercise...
 */

/*
 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
 *
 * General theory of operation:
 *
 * The only difference between copy{in,out} and
 * xcopy{in,out} is in the error handling routine they invoke
 * when a memory access error occurs.  xcopyOP returns the errno
 * while copyOP returns -1 (see above).  copy{in,out}_noerr set
 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
 * if they are called with a fault handler already in place.  That flag
 * causes the default handlers to trampoline to the previous handler
 * upon an error.
 *
 * None of the copyops routines grab a window until it's decided that
 * we need to do a HW block copy operation.  This saves a window
 * spill/fill when we're called during socket ops.  The typical IO
 * path won't cause spill/fill traps.
 *
 * This code uses a set of 4 limits for the maximum size that will
 * be copied given a particular input/output address alignment.
 * If the value for a particular limit is zero, the copy will be performed
 * by the plain copy loops rather than FPBLK.
 *
 * See the description of bcopy above for more details of the
 * data copying algorithm and the default limits.
 *
 */
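
/*
 * The practical difference for callers, sketched in C (kbuf/ubuf/len
 * are illustrative):
 *
 *	if (copyout(kbuf, ubuf, len) != 0)	// DDI/DKI: -1 on any fault
 *		return (EFAULT);		// caller picks the errno
 *
 *	if ((err = xcopyout(kbuf, ubuf, len)) != 0)
 *		return (err);			// errno computed by trap code
 */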
restore old t_lofault 1763 FP_ALLOWMIGRATE(5, 6) 1764 1765 mov SAVE_SRC, %i0 1766 mov SAVE_DST, %i1 1767 jmp REAL_LOFAULT 1768 mov SAVE_COUNT, %i2 1769 1770 SET_SIZE(copyio_fault) 1771 1772 1773#endif 1774 1775#if defined(lint) 1776 1777/*ARGSUSED*/ 1778int 1779copyout(const void *kaddr, void *uaddr, size_t count) 1780{ return (0); } 1781 1782#else /* lint */ 1783 1784 ENTRY(copyout) 1785 1786 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 1787 bleu,pt %ncc, .copyout_small ! go to larger cases 1788 xor %o0, %o1, %o3 ! are src, dst alignable? 1789 btst 7, %o3 ! 1790 bz,pt %ncc, .copyout_8 ! check for longword alignment 1791 nop 1792 btst 1, %o3 ! 1793 bz,pt %ncc, .copyout_2 ! check for half-word 1794 nop 1795 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 1796 ld [%o3 + %lo(hw_copy_limit_1)], %o3 1797 tst %o3 1798 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1799 cmp %o2, %o3 ! if length <= limit 1800 bleu,pt %ncc, .copyout_small ! go to small copy 1801 nop 1802 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1803 nop 1804.copyout_2: 1805 btst 3, %o3 ! 1806 bz,pt %ncc, .copyout_4 ! check for word alignment 1807 nop 1808 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 1809 ld [%o3 + %lo(hw_copy_limit_2)], %o3 1810 tst %o3 1811 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1812 cmp %o2, %o3 ! if length <= limit 1813 bleu,pt %ncc, .copyout_small ! go to small copy 1814 nop 1815 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1816 nop 1817.copyout_4: 1818 ! already checked longword, must be word aligned 1819 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 1820 ld [%o3 + %lo(hw_copy_limit_4)], %o3 1821 tst %o3 1822 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1823 cmp %o2, %o3 ! if length <= limit 1824 bleu,pt %ncc, .copyout_small ! go to small copy 1825 nop 1826 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1827 nop 1828.copyout_8: 1829 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 1830 ld [%o3 + %lo(hw_copy_limit_8)], %o3 1831 tst %o3 1832 bz,pn %icc, .copyout_small ! if zero, disable HW copy 1833 cmp %o2, %o3 ! if length <= limit 1834 bleu,pt %ncc, .copyout_small ! go to small copy 1835 nop 1836 ba,pt %ncc, .copyout_more ! otherwise go to large copy 1837 nop 1838 1839 .align 16 1840 nop ! instruction alignment 1841 ! see discussion at start of file 1842.copyout_small: 1843 sethi %hi(.sm_copyout_err), %o5 ! .sm_copyout_err is lofault 1844 or %o5, %lo(.sm_copyout_err), %o5 1845 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 1846 membar #Sync ! sync error barrier 1847 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 1848.sm_do_copyout: 1849 mov %o0, SM_SAVE_SRC 1850 mov %o1, SM_SAVE_DST 1851 cmp %o2, SHORTCOPY ! check for really short case 1852 bleu,pt %ncc, .co_sm_left ! 1853 mov %o2, SM_SAVE_COUNT 1854 cmp %o2, CHKSIZE ! check for medium length cases 1855 bgu,pn %ncc, .co_med ! 1856 or %o0, %o1, %o3 ! prepare alignment check 1857 andcc %o3, 0x3, %g0 ! test for alignment 1858 bz,pt %ncc, .co_sm_word ! branch to word aligned case 1859.co_sm_movebytes: 1860 sub %o2, 3, %o2 ! adjust count to allow cc zero test 1861.co_sm_notalign4: 1862 ldub [%o0], %o3 ! read byte 1863 subcc %o2, 4, %o2 ! reduce count by 4 1864 stba %o3, [%o1]ASI_USER ! write byte 1865 inc %o1 ! advance DST by 1 1866 ldub [%o0 + 1], %o3 ! repeat for a total of 4 bytes 1867 add %o0, 4, %o0 ! advance SRC by 4 1868 stba %o3, [%o1]ASI_USER 1869 inc %o1 ! advance DST by 1 1870 ldub [%o0 - 2], %o3 1871 stba %o3, [%o1]ASI_USER 1872 inc %o1 ! 
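/*
 * A rough C rendering of this 4-way unrolled byte loop (illustrative
 * only; store_user_byte() is a hypothetical stand-in for the
 * stba ...ASI_USER instructions).  The count is biased by -3 so a
 * single subcc/bgt pair both decrements and tests for completion:
 *
 *	count -= 3;
 *	do {
 *		count -= 4;
 *		store_user_byte(dst++, *src++);
 *		store_user_byte(dst++, *src++);
 *		store_user_byte(dst++, *src++);
 *		store_user_byte(dst++, *src++);
 *	} while (count > 0);
 *	count += 3;		! 0-3 bytes remain for .co_sm_left
 */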
advance DST by 1 1873 ldub [%o0 - 1], %o3 1874 stba %o3, [%o1]ASI_USER 1875 bgt,pt %ncc, .co_sm_notalign4 ! loop til 3 or fewer bytes remain 1876 inc %o1 ! advance DST by 1 1877 add %o2, 3, %o2 ! restore count 1878.co_sm_left: 1879 tst %o2 1880 bz,pt %ncc, .co_sm_exit ! check for zero length 1881 nop 1882 ldub [%o0], %o3 ! load one byte 1883 deccc %o2 ! reduce count for cc test 1884 bz,pt %ncc, .co_sm_exit 1885 stba %o3,[%o1]ASI_USER ! store one byte 1886 ldub [%o0 + 1], %o3 ! load second byte 1887 deccc %o2 1888 inc %o1 1889 bz,pt %ncc, .co_sm_exit 1890 stba %o3,[%o1]ASI_USER ! store second byte 1891 ldub [%o0 + 2], %o3 ! load third byte 1892 inc %o1 1893 stba %o3,[%o1]ASI_USER ! store third byte 1894 membar #Sync ! sync error barrier 1895 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1896 retl 1897 mov %g0, %o0 ! return 0 1898 .align 16 1899.co_sm_words: 1900 lduw [%o0], %o3 ! read word 1901.co_sm_wordx: 1902 subcc %o2, 8, %o2 ! update count 1903 stwa %o3, [%o1]ASI_USER ! write word 1904 add %o0, 8, %o0 ! update SRC 1905 lduw [%o0 - 4], %o3 ! read word 1906 add %o1, 4, %o1 ! update DST 1907 stwa %o3, [%o1]ASI_USER ! write word 1908 bgt,pt %ncc, .co_sm_words ! loop til done 1909 add %o1, 4, %o1 ! update DST 1910 addcc %o2, 7, %o2 ! restore count 1911 bz,pt %ncc, .co_sm_exit 1912 nop 1913 deccc %o2 1914 bz,pt %ncc, .co_sm_byte 1915.co_sm_half: 1916 subcc %o2, 2, %o2 ! reduce count by 2 1917 lduh [%o0], %o3 ! read half word 1918 add %o0, 2, %o0 ! advance SRC by 2 1919 stha %o3, [%o1]ASI_USER ! write half word 1920 bgt,pt %ncc, .co_sm_half ! loop til done 1921 add %o1, 2, %o1 ! advance DST by 2 1922 addcc %o2, 1, %o2 ! restore count 1923 bz,pt %ncc, .co_sm_exit 1924 nop 1925.co_sm_byte: 1926 ldub [%o0], %o3 1927 stba %o3, [%o1]ASI_USER 1928 membar #Sync ! sync error barrier 1929 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1930 retl 1931 mov %g0, %o0 ! return 0 1932 .align 16 1933.co_sm_word: 1934 subcc %o2, 4, %o2 ! update count 1935 bgt,pt %ncc, .co_sm_wordx 1936 lduw [%o0], %o3 ! read word 1937 addcc %o2, 3, %o2 ! restore count 1938 bz,pt %ncc, .co_sm_exit 1939 stwa %o3, [%o1]ASI_USER ! write word 1940 deccc %o2 ! reduce count for cc test 1941 ldub [%o0 + 4], %o3 ! load one byte 1942 add %o1, 4, %o1 1943 bz,pt %ncc, .co_sm_exit 1944 stba %o3, [%o1]ASI_USER ! store one byte 1945 ldub [%o0 + 5], %o3 ! load second byte 1946 deccc %o2 1947 inc %o1 1948 bz,pt %ncc, .co_sm_exit 1949 stba %o3, [%o1]ASI_USER ! store second byte 1950 ldub [%o0 + 6], %o3 ! load third byte 1951 inc %o1 1952 stba %o3, [%o1]ASI_USER ! store third byte 1953.co_sm_exit: 1954 membar #Sync ! sync error barrier 1955 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1956 retl 1957 mov %g0, %o0 ! return 0 1958 1959 .align 16 1960.co_med: 1961 xor %o0, %o1, %o3 ! setup alignment check 1962 btst 1, %o3 1963 bnz,pt %ncc, .co_sm_movebytes ! unaligned 1964 nop 1965 btst 3, %o3 1966 bnz,pt %ncc, .co_med_half ! halfword aligned 1967 nop 1968 btst 7, %o3 1969 bnz,pt %ncc, .co_med_word ! word aligned 1970 nop 1971.co_med_long: 1972 btst 3, %o0 ! check for 1973 bz,pt %ncc, .co_med_long1 ! word alignment 1974 nop 1975.co_med_long0: 1976 ldub [%o0], %o3 ! load one byte 1977 inc %o0 1978 stba %o3,[%o1]ASI_USER ! store byte 1979 inc %o1 1980 btst 3, %o0 1981 bnz,pt %ncc, .co_med_long0 1982 dec %o2 1983.co_med_long1: ! word aligned 1984 btst 7, %o0 ! check for long word 1985 bz,pt %ncc, .co_med_long2 1986 nop 1987 lduw [%o0], %o3 ! load word 1988 add %o0, 4, %o0 ! 
advance SRC by 4 1989 stwa %o3, [%o1]ASI_USER ! store word 1990 add %o1, 4, %o1 ! advance DST by 4 1991 sub %o2, 4, %o2 ! reduce count by 4 1992! 1993! Now long word aligned and have at least 32 bytes to move 1994! 1995.co_med_long2: 1996 sub %o2, 31, %o2 ! adjust count to allow cc zero test 1997 sub %o1, 8, %o1 ! adjust pointer to allow store in 1998 ! branch delay slot instead of add 1999.co_med_lmove: 2000 add %o1, 8, %o1 ! advance DST by 8 2001 ldx [%o0], %o3 ! read long word 2002 subcc %o2, 32, %o2 ! reduce count by 32 2003 stxa %o3, [%o1]ASI_USER ! write long word 2004 add %o1, 8, %o1 ! advance DST by 8 2005 ldx [%o0 + 8], %o3 ! repeat for a total for 4 long words 2006 add %o0, 32, %o0 ! advance SRC by 32 2007 stxa %o3, [%o1]ASI_USER 2008 ldx [%o0 - 16], %o3 2009 add %o1, 8, %o1 ! advance DST by 8 2010 stxa %o3, [%o1]ASI_USER 2011 ldx [%o0 - 8], %o3 2012 add %o1, 8, %o1 ! advance DST by 8 2013 bgt,pt %ncc, .co_med_lmove ! loop til 31 or fewer bytes left 2014 stxa %o3, [%o1]ASI_USER 2015 add %o1, 8, %o1 ! advance DST by 8 2016 addcc %o2, 24, %o2 ! restore count to long word offset 2017 ble,pt %ncc, .co_med_lextra ! check for more long words to move 2018 nop 2019.co_med_lword: 2020 ldx [%o0], %o3 ! read long word 2021 subcc %o2, 8, %o2 ! reduce count by 8 2022 stxa %o3, [%o1]ASI_USER ! write long word 2023 add %o0, 8, %o0 ! advance SRC by 8 2024 bgt,pt %ncc, .co_med_lword ! loop til 7 or fewer bytes left 2025 add %o1, 8, %o1 ! advance DST by 8 2026.co_med_lextra: 2027 addcc %o2, 7, %o2 ! restore rest of count 2028 bz,pt %ncc, .co_sm_exit ! if zero, then done 2029 deccc %o2 2030 bz,pt %ncc, .co_sm_byte 2031 nop 2032 ba,pt %ncc, .co_sm_half 2033 nop 2034 2035 .align 16 2036 nop ! instruction alignment 2037 ! see discussion at start of file 2038.co_med_word: 2039 btst 3, %o0 ! check for 2040 bz,pt %ncc, .co_med_word1 ! word alignment 2041 nop 2042.co_med_word0: 2043 ldub [%o0], %o3 ! load one byte 2044 inc %o0 2045 stba %o3,[%o1]ASI_USER ! store byte 2046 inc %o1 2047 btst 3, %o0 2048 bnz,pt %ncc, .co_med_word0 2049 dec %o2 2050! 2051! Now word aligned and have at least 36 bytes to move 2052! 2053.co_med_word1: 2054 sub %o2, 15, %o2 ! adjust count to allow cc zero test 2055.co_med_wmove: 2056 lduw [%o0], %o3 ! read word 2057 subcc %o2, 16, %o2 ! reduce count by 16 2058 stwa %o3, [%o1]ASI_USER ! write word 2059 add %o1, 4, %o1 ! advance DST by 4 2060 lduw [%o0 + 4], %o3 ! repeat for a total for 4 words 2061 add %o0, 16, %o0 ! advance SRC by 16 2062 stwa %o3, [%o1]ASI_USER 2063 add %o1, 4, %o1 ! advance DST by 4 2064 lduw [%o0 - 8], %o3 2065 stwa %o3, [%o1]ASI_USER 2066 add %o1, 4, %o1 ! advance DST by 4 2067 lduw [%o0 - 4], %o3 2068 stwa %o3, [%o1]ASI_USER 2069 bgt,pt %ncc, .co_med_wmove ! loop til 15 or fewer bytes left 2070 add %o1, 4, %o1 ! advance DST by 4 2071 addcc %o2, 12, %o2 ! restore count to word offset 2072 ble,pt %ncc, .co_med_wextra ! check for more words to move 2073 nop 2074.co_med_word2: 2075 lduw [%o0], %o3 ! read word 2076 subcc %o2, 4, %o2 ! reduce count by 4 2077 stwa %o3, [%o1]ASI_USER ! write word 2078 add %o0, 4, %o0 ! advance SRC by 4 2079 bgt,pt %ncc, .co_med_word2 ! loop til 3 or fewer bytes left 2080 add %o1, 4, %o1 ! advance DST by 4 2081.co_med_wextra: 2082 addcc %o2, 3, %o2 ! restore rest of count 2083 bz,pt %ncc, .co_sm_exit ! if zero, then done 2084 deccc %o2 2085 bz,pt %ncc, .co_sm_byte 2086 nop 2087 ba,pt %ncc, .co_sm_half 2088 nop 2089 2090 .align 16 2091 nop ! instruction alignment 2092 nop ! 
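/*
 * The same biasing idea drives the long-word loops above; roughly,
 * in C (illustrative only; copy8_user() is a hypothetical stand-in
 * for one ldx/stxa pair):
 *
 *	count -= 31;			! bias for the 32-byte chunks
 *	do {
 *		count -= 32;
 *		copy8_user(); copy8_user(); copy8_user(); copy8_user();
 *	} while (count > 0);
 *	count += 24;			! rebias for single long words
 *	while (count > 0) {		! .co_med_lword
 *		count -= 8;
 *		copy8_user();
 *	}
 *	count += 7;			! 0-7 bytes left for the sm_ tails
 */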
see discussion at start of file 2093 nop 2094.co_med_half: 2095 btst 1, %o0 ! check for 2096 bz,pt %ncc, .co_med_half1 ! half word alignment 2097 nop 2098 ldub [%o0], %o3 ! load one byte 2099 inc %o0 2100 stba %o3,[%o1]ASI_USER ! store byte 2101 inc %o1 2102 dec %o2 2103! 2104! Now half word aligned and have at least 38 bytes to move 2105! 2106.co_med_half1: 2107 sub %o2, 7, %o2 ! adjust count to allow cc zero test 2108.co_med_hmove: 2109 lduh [%o0], %o3 ! read half word 2110 subcc %o2, 8, %o2 ! reduce count by 8 2111 stha %o3, [%o1]ASI_USER ! write half word 2112 add %o1, 2, %o1 ! advance DST by 2 2113 lduh [%o0 + 2], %o3 ! repeat for a total for 4 halfwords 2114 add %o0, 8, %o0 ! advance SRC by 8 2115 stha %o3, [%o1]ASI_USER 2116 add %o1, 2, %o1 ! advance DST by 2 2117 lduh [%o0 - 4], %o3 2118 stha %o3, [%o1]ASI_USER 2119 add %o1, 2, %o1 ! advance DST by 2 2120 lduh [%o0 - 2], %o3 2121 stha %o3, [%o1]ASI_USER 2122 bgt,pt %ncc, .co_med_hmove ! loop til 7 or fewer bytes left 2123 add %o1, 2, %o1 ! advance DST by 2 2124 addcc %o2, 7, %o2 ! restore count 2125 bz,pt %ncc, .co_sm_exit 2126 deccc %o2 2127 bz,pt %ncc, .co_sm_byte 2128 nop 2129 ba,pt %ncc, .co_sm_half 2130 nop 2131 2132/* 2133 * We got here because of a fault during short copyout. 2134 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2135 */ 2136.sm_copyout_err: 2137 membar #Sync 2138 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2139 mov SM_SAVE_SRC, %o0 2140 mov SM_SAVE_DST, %o1 2141 mov SM_SAVE_COUNT, %o2 2142 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2143 tst %o3 2144 bz,pt %ncc, 3f ! if not, return error 2145 nop 2146 ldn [%o3 + CP_COPYOUT], %o5 ! if handler, invoke it with 2147 jmp %o5 ! original arguments 2148 nop 21493: 2150 retl 2151 or %g0, -1, %o0 ! return error value 2152 2153 SET_SIZE(copyout) 2154 2155/* 2156 * The _more entry points are not intended to be used directly by 2157 * any caller from outside this file. They are provided to allow 2158 * profiling and dtrace of the portions of the copy code that uses 2159 * the floating point registers. 2160 * This entry is particularly important as DTRACE (at least as of 2161 * 4/2004) does not support leaf functions. 2162 */ 2163 2164 ENTRY(copyout_more) 2165.copyout_more: 2166 prefetch [%o0], #n_reads 2167 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2168 set .copyout_err, REAL_LOFAULT 2169 2170/* 2171 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes 2172 */ 2173.do_copyout: 2174 set copyio_fault, %l7 ! .copyio_fault is lofault val 2175 2176 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 2177 membar #Sync ! sync error barrier 2178 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 2179 2180 mov %i0, SAVE_SRC 2181 mov %i1, SAVE_DST 2182 mov %i2, SAVE_COUNT 2183 2184 FP_NOMIGRATE(6, 7) 2185 2186 rd %fprs, %o2 ! check for unused fp 2187 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 2188 btst FPRS_FEF, %o2 2189 bz,a,pt %icc, .do_blockcopyout 2190 wr %g0, FPRS_FEF, %fprs 2191 2192 BST_FPQ2Q4_TOSTACK(%o2) 2193 2194.do_blockcopyout: 2195 rd %gsr, %o2 2196 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 2197 or %l6, FPUSED_FLAG, %l6 2198 2199 andcc DST, VIS_BLOCKSIZE - 1, TMP 2200 mov ASI_USER, %asi 2201 bz,pt %ncc, 2f 2202 neg TMP 2203 add TMP, VIS_BLOCKSIZE, TMP 2204 2205 ! TMP = bytes required to align DST on FP_BLOCK boundary 2206 ! Using SRC as a tmp here 2207 cmp TMP, 3 2208 bleu,pt %ncc, 1f 2209 sub CNT,TMP,CNT ! adjust main count 2210 sub TMP, 3, TMP ! 
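/*
 * A rough sketch of the recovery chain now in place (illustrative
 * only).  On a fault, copyio_fault restores the FP state and the
 * saved arguments, then tails into REAL_LOFAULT (.copyout_err for
 * this entry), which defers to any installed t_copyops handler:
 *
 *	restore %gsr/%fprs (and the FP registers, if they were live);
 *	t_lofault = saved handler;
 *	if (curthread->t_copyops != NULL)
 *		return (curthread->t_copyops->cp_copyout(kaddr, uaddr,
 *		    count));		! original arguments
 *	return (-1);			! DDI/DKI contract (see above)
 */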
adjust for end of loop test 2211.co_blkalign: 2212 ldub [REALSRC], SRC ! move 4 bytes per loop iteration 2213 stba SRC, [DST]%asi 2214 subcc TMP, 4, TMP 2215 ldub [REALSRC + 1], SRC 2216 add REALSRC, 4, REALSRC 2217 stba SRC, [DST + 1]%asi 2218 ldub [REALSRC - 2], SRC 2219 add DST, 4, DST 2220 stba SRC, [DST - 2]%asi 2221 ldub [REALSRC - 1], SRC 2222 bgu,pt %ncc, .co_blkalign 2223 stba SRC, [DST - 1]%asi 2224 2225 addcc TMP, 3, TMP ! restore count adjustment 2226 bz,pt %ncc, 2f ! no bytes left? 2227 nop 22281: ldub [REALSRC], SRC 2229 inc REALSRC 2230 inc DST 2231 deccc TMP 2232 bgu %ncc, 1b 2233 stba SRC, [DST - 1]%asi 2234 22352: 2236 membar #StoreLoad 2237 andn REALSRC, 0x7, SRC 2238 2239 ! SRC - 8-byte aligned 2240 ! DST - 64-byte aligned 2241 ldd [SRC], %f16 2242 prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads 2243 alignaddr REALSRC, %g0, %g0 2244 ldd [SRC + 0x08], %f18 2245 prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads 2246 faligndata %f16, %f18, %f48 2247 ldd [SRC + 0x10], %f20 2248 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 2249 faligndata %f18, %f20, %f50 2250 ldd [SRC + 0x18], %f22 2251 prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read 2252 faligndata %f20, %f22, %f52 2253 ldd [SRC + 0x20], %f24 2254 prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read 2255 faligndata %f22, %f24, %f54 2256 ldd [SRC + 0x28], %f26 2257 prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read 2258 faligndata %f24, %f26, %f56 2259 ldd [SRC + 0x30], %f28 2260 prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read 2261 faligndata %f26, %f28, %f58 2262 ldd [SRC + 0x38], %f30 2263 ldd [SRC + VIS_BLOCKSIZE], %f16 2264 sub CNT, VIS_BLOCKSIZE, CNT 2265 add SRC, VIS_BLOCKSIZE, SRC 2266 prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read 2267 add REALSRC, VIS_BLOCKSIZE, REALSRC 2268 ba,pt %ncc, 1f 2269 prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read 2270 .align 32 22711: 2272 ldd [SRC + 0x08], %f18 2273 faligndata %f28, %f30, %f60 2274 ldd [SRC + 0x10], %f20 2275 faligndata %f30, %f16, %f62 2276 stda %f48, [DST]ASI_BLK_AIUS 2277 ldd [SRC + 0x18], %f22 2278 faligndata %f16, %f18, %f48 2279 ldd [SRC + 0x20], %f24 2280 faligndata %f18, %f20, %f50 2281 ldd [SRC + 0x28], %f26 2282 faligndata %f20, %f22, %f52 2283 ldd [SRC + 0x30], %f28 2284 faligndata %f22, %f24, %f54 2285 sub CNT, VIS_BLOCKSIZE, CNT 2286 ldd [SRC + 0x38], %f30 2287 faligndata %f24, %f26, %f56 2288 add DST, VIS_BLOCKSIZE, DST 2289 ldd [SRC + VIS_BLOCKSIZE], %f16 2290 faligndata %f26, %f28, %f58 2291 add REALSRC, VIS_BLOCKSIZE, REALSRC 2292 prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads 2293 add SRC, VIS_BLOCKSIZE, SRC 2294 prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read 2295 cmp CNT, VIS_BLOCKSIZE + 8 2296 bgu,pt %ncc, 1b 2297 prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read 2298 2299 ! 
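/*
 * The steady-state loop above, as rough pseudo-code (illustrative
 * only).  Each pass loads the next 64-byte block, realigns the
 * previous one into %f48-%f62 with faligndata, and stores a full
 * block to user space with one stda:
 *
 *	prime %f16-%f30 with the first block;
 *	set the GSR alignment from REALSRC;	! alignaddr
 *	do {
 *		load the next 8 doublewords;	! ldd [SRC + 8..64]
 *		%f48-%f62 = faligndata(prev, next);
 *		stda %f48, [DST]ASI_BLK_AIUS;	! 64-byte user store
 *		CNT -= 64; SRC += 64; DST += 64; REALSRC += 64;
 *		prefetch ahead;		! #n_reads near, #one_read far
 *	} while (CNT > VIS_BLOCKSIZE + 8);
 */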
only if REALSRC & 0x7 is 0 2300 cmp CNT, VIS_BLOCKSIZE 2301 bne %ncc, 3f 2302 andcc REALSRC, 0x7, %g0 2303 bz,pt %ncc, 2f 2304 nop 23053: 2306 faligndata %f28, %f30, %f60 2307 faligndata %f30, %f16, %f62 2308 stda %f48, [DST]ASI_BLK_AIUS 2309 add DST, VIS_BLOCKSIZE, DST 2310 ba,pt %ncc, 3f 2311 nop 23122: 2313 ldd [SRC + 0x08], %f18 2314 fsrc1 %f28, %f60 2315 ldd [SRC + 0x10], %f20 2316 fsrc1 %f30, %f62 2317 stda %f48, [DST]ASI_BLK_AIUS 2318 ldd [SRC + 0x18], %f22 2319 fsrc1 %f16, %f48 2320 ldd [SRC + 0x20], %f24 2321 fsrc1 %f18, %f50 2322 ldd [SRC + 0x28], %f26 2323 fsrc1 %f20, %f52 2324 ldd [SRC + 0x30], %f28 2325 fsrc1 %f22, %f54 2326 ldd [SRC + 0x38], %f30 2327 fsrc1 %f24, %f56 2328 sub CNT, VIS_BLOCKSIZE, CNT 2329 add DST, VIS_BLOCKSIZE, DST 2330 add SRC, VIS_BLOCKSIZE, SRC 2331 add REALSRC, VIS_BLOCKSIZE, REALSRC 2332 fsrc1 %f26, %f58 2333 fsrc1 %f28, %f60 2334 fsrc1 %f30, %f62 2335 stda %f48, [DST]ASI_BLK_AIUS 2336 add DST, VIS_BLOCKSIZE, DST 2337 ba,a,pt %ncc, 4f 2338 nop 2339 23403: tst CNT 2341 bz,a %ncc, 4f 2342 nop 2343 23445: ldub [REALSRC], TMP 2345 inc REALSRC 2346 inc DST 2347 deccc CNT 2348 bgu %ncc, 5b 2349 stba TMP, [DST - 1]%asi 23504: 2351 2352.copyout_exit: 2353 membar #Sync 2354 2355 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 2356 wr %o2, 0, %gsr ! restore gsr 2357 2358 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 2359 btst FPRS_FEF, %o3 2360 bz,pt %icc, 4f 2361 nop 2362 2363 BLD_FPQ2Q4_FROMSTACK(%o2) 2364 2365 ba,pt %ncc, 1f 2366 wr %o3, 0, %fprs ! restore fprs 2367 23684: 2369 FZEROQ2Q4 2370 wr %o3, 0, %fprs ! restore fprs 2371 23721: 2373 membar #Sync 2374 andn %l6, FPUSED_FLAG, %l6 2375 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2376 FP_ALLOWMIGRATE(5, 6) 2377 ret 2378 restore %g0, 0, %o0 2379 2380/* 2381 * We got here because of a fault during copyout. 2382 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 2383 */ 2384.copyout_err: 2385 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2386 tst %o4 2387 bz,pt %ncc, 2f ! if not, return error 2388 nop 2389 ldn [%o4 + CP_COPYOUT], %g2 ! if handler, invoke it with 2390 jmp %g2 ! original arguments 2391 restore %g0, 0, %g0 ! dispose of copy window 23922: 2393 ret 2394 restore %g0, -1, %o0 ! return error value 2395 2396 2397 SET_SIZE(copyout_more) 2398 2399#endif /* lint */ 2400 2401 2402#ifdef lint 2403 2404/*ARGSUSED*/ 2405int 2406xcopyout(const void *kaddr, void *uaddr, size_t count) 2407{ return (0); } 2408 2409#else /* lint */ 2410 2411 ENTRY(xcopyout) 2412 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2413 bleu,pt %ncc, .xcopyout_small ! go to larger cases 2414 xor %o0, %o1, %o3 ! are src, dst alignable? 2415 btst 7, %o3 ! 2416 bz,pt %ncc, .xcopyout_8 ! 2417 nop 2418 btst 1, %o3 ! 2419 bz,pt %ncc, .xcopyout_2 ! check for half-word 2420 nop 2421 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2422 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2423 tst %o3 2424 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2425 cmp %o2, %o3 ! if length <= limit 2426 bleu,pt %ncc, .xcopyout_small ! go to small copy 2427 nop 2428 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2429 nop 2430.xcopyout_2: 2431 btst 3, %o3 ! 2432 bz,pt %ncc, .xcopyout_4 ! check for word alignment 2433 nop 2434 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2435 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2436 tst %o3 2437 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2438 cmp %o2, %o3 ! if length <= limit 2439 bleu,pt %ncc, .xcopyout_small ! 
go to small copy 2440 nop 2441 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2442 nop 2443.xcopyout_4: 2444 ! already checked longword, must be word aligned 2445 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2446 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2447 tst %o3 2448 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2449 cmp %o2, %o3 ! if length <= limit 2450 bleu,pt %ncc, .xcopyout_small ! go to small copy 2451 nop 2452 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2453 nop 2454.xcopyout_8: 2455 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2456 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2457 tst %o3 2458 bz,pn %icc, .xcopyout_small ! if zero, disable HW copy 2459 cmp %o2, %o3 ! if length <= limit 2460 bleu,pt %ncc, .xcopyout_small ! go to small copy 2461 nop 2462 ba,pt %ncc, .xcopyout_more ! otherwise go to large copy 2463 nop 2464 2465.xcopyout_small: 2466 sethi %hi(.sm_xcopyout_err), %o5 ! .sm_xcopyout_err is lofault 2467 or %o5, %lo(.sm_xcopyout_err), %o5 2468 ldn [THREAD_REG + T_LOFAULT], %o4 ! save existing handler 2469 membar #Sync ! sync error barrier 2470 ba,pt %ncc, .sm_do_copyout ! common code 2471 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 2472 2473.xcopyout_more: 2474 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2475 sethi %hi(.xcopyout_err), REAL_LOFAULT 2476 ba,pt %ncc, .do_copyout ! common code 2477 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 2478 2479/* 2480 * We got here because of fault during xcopyout 2481 * Errno value is in ERRNO 2482 */ 2483.xcopyout_err: 2484 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 2485 tst %o4 2486 bz,pt %ncc, 2f ! if not, return error 2487 nop 2488 ldn [%o4 + CP_XCOPYOUT], %g2 ! if handler, invoke it with 2489 jmp %g2 ! original arguments 2490 restore %g0, 0, %g0 ! dispose of copy window 24912: 2492 ret 2493 restore ERRNO, 0, %o0 ! return errno value 2494 2495.sm_xcopyout_err: 2496 2497 membar #Sync 2498 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2499 mov SM_SAVE_SRC, %o0 2500 mov SM_SAVE_DST, %o1 2501 mov SM_SAVE_COUNT, %o2 2502 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2503 tst %o3 2504 bz,pt %ncc, 3f ! if not, return error 2505 nop 2506 ldn [%o3 + CP_XCOPYOUT], %o5 ! if handler, invoke it with 2507 jmp %o5 ! original arguments 2508 nop 25093: 2510 retl 2511 or %g1, 0, %o0 ! return errno value 2512 2513 SET_SIZE(xcopyout) 2514 2515#endif /* lint */ 2516 2517#ifdef lint 2518 2519/*ARGSUSED*/ 2520int 2521xcopyout_little(const void *kaddr, void *uaddr, size_t count) 2522{ return (0); } 2523 2524#else /* lint */ 2525 2526 ENTRY(xcopyout_little) 2527 sethi %hi(.xcopyio_err), %o5 2528 or %o5, %lo(.xcopyio_err), %o5 2529 ldn [THREAD_REG + T_LOFAULT], %o4 2530 membar #Sync ! sync error barrier 2531 stn %o5, [THREAD_REG + T_LOFAULT] 2532 mov %o4, %o5 2533 2534 subcc %g0, %o2, %o3 2535 add %o0, %o2, %o0 2536 bz,pn %ncc, 2f ! check for zero bytes 2537 sub %o2, 1, %o4 2538 add %o0, %o4, %o0 ! start w/last byte 2539 add %o1, %o2, %o1 2540 ldub [%o0 + %o3], %o4 2541 25421: stba %o4, [%o1 + %o3]ASI_AIUSL 2543 inccc %o3 2544 sub %o0, 2, %o0 ! get next byte 2545 bcc,a,pt %ncc, 1b 2546 ldub [%o0 + %o3], %o4 2547 25482: 2549 membar #Sync ! sync error barrier 2550 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2551 retl 2552 mov %g0, %o0 ! 
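/*
 * xcopyout_little above copies with the byte order reversed (the
 * little-endian view used by do_unaligned), walking both buffers
 * with one negative index so that the carry out of "inccc" ends the
 * loop.  Roughly, in C (illustrative only; a zero count is tested
 * for and returns immediately):
 *
 *	ssize_t idx = -(ssize_t)count;		! %o3
 *	src = kaddr + 2 * count - 1;		! biased %o0
 *	dst = uaddr + count;			! biased %o1
 *	do {
 *		dst[idx] = src[idx];	! stba ...ASI_AIUSL
 *		idx++;
 *		src -= 2;		! net effect: source steps backward
 *	} while (idx != 0);
 */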
return (0) 2553 2554 SET_SIZE(xcopyout_little) 2555 2556#endif /* lint */ 2557 2558/* 2559 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 2560 */ 2561 2562#if defined(lint) 2563 2564/*ARGSUSED*/ 2565int 2566copyin(const void *uaddr, void *kaddr, size_t count) 2567{ return (0); } 2568 2569#else /* lint */ 2570 2571 ENTRY(copyin) 2572 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 2573 bleu,pt %ncc, .copyin_small ! go to larger cases 2574 xor %o0, %o1, %o3 ! are src, dst alignable? 2575 btst 7, %o3 ! 2576 bz,pt %ncc, .copyin_8 ! check for longword alignment 2577 nop 2578 btst 1, %o3 ! 2579 bz,pt %ncc, .copyin_2 ! check for half-word 2580 nop 2581 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 2582 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2583 tst %o3 2584 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2585 cmp %o2, %o3 ! if length <= limit 2586 bleu,pt %ncc, .copyin_small ! go to small copy 2587 nop 2588 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2589 nop 2590.copyin_2: 2591 btst 3, %o3 ! 2592 bz,pt %ncc, .copyin_4 ! check for word alignment 2593 nop 2594 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 2595 ld [%o3 + %lo(hw_copy_limit_2)], %o3 2596 tst %o3 2597 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2598 cmp %o2, %o3 ! if length <= limit 2599 bleu,pt %ncc, .copyin_small ! go to small copy 2600 nop 2601 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2602 nop 2603.copyin_4: 2604 ! already checked longword, must be word aligned 2605 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 2606 ld [%o3 + %lo(hw_copy_limit_4)], %o3 2607 tst %o3 2608 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2609 cmp %o2, %o3 ! if length <= limit 2610 bleu,pt %ncc, .copyin_small ! go to small copy 2611 nop 2612 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2613 nop 2614.copyin_8: 2615 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 2616 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2617 tst %o3 2618 bz,pn %icc, .copyin_small ! if zero, disable HW copy 2619 cmp %o2, %o3 ! if length <= limit 2620 bleu,pt %ncc, .copyin_small ! go to small copy 2621 nop 2622 ba,pt %ncc, .copyin_more ! otherwise go to large copy 2623 nop 2624 2625 .align 16 2626 nop ! instruction alignment 2627 ! see discussion at start of file 2628.copyin_small: 2629 sethi %hi(.sm_copyin_err), %o5 ! .sm_copyin_err is lofault 2630 or %o5, %lo(.sm_copyin_err), %o5 2631 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofault, no tramp 2632 membar #Sync ! sync error barrier 2633 stn %o5, [THREAD_REG + T_LOFAULT] 2634.sm_do_copyin: 2635 mov %o0, SM_SAVE_SRC 2636 mov %o1, SM_SAVE_DST 2637 cmp %o2, SHORTCOPY ! check for really short case 2638 bleu,pt %ncc, .ci_sm_left ! 2639 mov %o2, SM_SAVE_COUNT 2640 cmp %o2, CHKSIZE ! check for medium length cases 2641 bgu,pn %ncc, .ci_med ! 2642 or %o0, %o1, %o3 ! prepare alignment check 2643 andcc %o3, 0x3, %g0 ! test for alignment 2644 bz,pt %ncc, .ci_sm_word ! branch to word aligned case 2645.ci_sm_movebytes: 2646 sub %o2, 3, %o2 ! adjust count to allow cc zero test 2647.ci_sm_notalign4: 2648 lduba [%o0]ASI_USER, %o3 ! read byte 2649 subcc %o2, 4, %o2 ! reduce count by 4 2650 stb %o3, [%o1] ! write byte 2651 add %o0, 1, %o0 ! advance SRC by 1 2652 lduba [%o0]ASI_USER, %o3 ! repeat for a total of 4 bytes 2653 add %o0, 1, %o0 ! advance SRC by 1 2654 stb %o3, [%o1 + 1] 2655 add %o1, 4, %o1 ! advance DST by 4 2656 lduba [%o0]ASI_USER, %o3 2657 add %o0, 1, %o0 ! 
advance SRC by 1 2658 stb %o3, [%o1 - 2] 2659 lduba [%o0]ASI_USER, %o3 2660 add %o0, 1, %o0 ! advance SRC by 1 2661 bgt,pt %ncc, .ci_sm_notalign4 ! loop til 3 or fewer bytes remain 2662 stb %o3, [%o1 - 1] 2663 add %o2, 3, %o2 ! restore count 2664.ci_sm_left: 2665 tst %o2 2666 bz,pt %ncc, .ci_sm_exit 2667 nop 2668 lduba [%o0]ASI_USER, %o3 ! load one byte 2669 deccc %o2 ! reduce count for cc test 2670 bz,pt %ncc, .ci_sm_exit 2671 stb %o3,[%o1] ! store one byte 2672 inc %o0 2673 lduba [%o0]ASI_USER, %o3 ! load second byte 2674 deccc %o2 2675 bz,pt %ncc, .ci_sm_exit 2676 stb %o3,[%o1 + 1] ! store second byte 2677 inc %o0 2678 lduba [%o0]ASI_USER, %o3 ! load third byte 2679 stb %o3,[%o1 + 2] ! store third byte 2680 membar #Sync ! sync error barrier 2681 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2682 retl 2683 mov %g0, %o0 ! return 0 2684 .align 16 2685.ci_sm_words: 2686 lduwa [%o0]ASI_USER, %o3 ! read word 2687.ci_sm_wordx: 2688 subcc %o2, 8, %o2 ! update count 2689 stw %o3, [%o1] ! write word 2690 add %o0, 4, %o0 ! update SRC 2691 add %o1, 8, %o1 ! update DST 2692 lduwa [%o0]ASI_USER, %o3 ! read word 2693 add %o0, 4, %o0 ! update SRC 2694 bgt,pt %ncc, .ci_sm_words ! loop til done 2695 stw %o3, [%o1 - 4] ! write word 2696 addcc %o2, 7, %o2 ! restore count 2697 bz,pt %ncc, .ci_sm_exit 2698 nop 2699 deccc %o2 2700 bz,pt %ncc, .ci_sm_byte 2701.ci_sm_half: 2702 subcc %o2, 2, %o2 ! reduce count by 2 2703 lduha [%o0]ASI_USER, %o3 ! read half word 2704 add %o0, 2, %o0 ! advance SRC by 2 2705 add %o1, 2, %o1 ! advance DST by 2 2706 bgt,pt %ncc, .ci_sm_half ! loop til done 2707 sth %o3, [%o1 - 2] ! write half word 2708 addcc %o2, 1, %o2 ! restore count 2709 bz,pt %ncc, .ci_sm_exit 2710 nop 2711.ci_sm_byte: 2712 lduba [%o0]ASI_USER, %o3 2713 stb %o3, [%o1] 2714 membar #Sync ! sync error barrier 2715 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2716 retl 2717 mov %g0, %o0 ! return 0 2718 .align 16 2719.ci_sm_word: 2720 subcc %o2, 4, %o2 ! update count 2721 bgt,pt %ncc, .ci_sm_wordx 2722 lduwa [%o0]ASI_USER, %o3 ! read word 2723 addcc %o2, 3, %o2 ! restore count 2724 bz,pt %ncc, .ci_sm_exit 2725 stw %o3, [%o1] ! write word 2726 deccc %o2 ! reduce count for cc test 2727 add %o0, 4, %o0 2728 lduba [%o0]ASI_USER, %o3 ! load one byte 2729 bz,pt %ncc, .ci_sm_exit 2730 stb %o3, [%o1 + 4] ! store one byte 2731 inc %o0 2732 lduba [%o0]ASI_USER, %o3 ! load second byte 2733 deccc %o2 2734 bz,pt %ncc, .ci_sm_exit 2735 stb %o3, [%o1 + 5] ! store second byte 2736 inc %o0 2737 lduba [%o0]ASI_USER, %o3 ! load third byte 2738 stb %o3, [%o1 + 6] ! store third byte 2739.ci_sm_exit: 2740 membar #Sync ! sync error barrier 2741 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2742 retl 2743 mov %g0, %o0 ! return 0 2744 2745 .align 16 2746.ci_med: 2747 xor %o0, %o1, %o3 ! setup alignment check 2748 btst 1, %o3 2749 bnz,pt %ncc, .ci_sm_movebytes ! unaligned 2750 nop 2751 btst 3, %o3 2752 bnz,pt %ncc, .ci_med_half ! halfword aligned 2753 nop 2754 btst 7, %o3 2755 bnz,pt %ncc, .ci_med_word ! word aligned 2756 nop 2757.ci_med_long: 2758 btst 3, %o0 ! check for 2759 bz,pt %ncc, .ci_med_long1 ! word alignment 2760 nop 2761.ci_med_long0: 2762 lduba [%o0]ASI_USER, %o3 ! load one byte 2763 inc %o0 2764 stb %o3,[%o1] ! store byte 2765 inc %o1 2766 btst 3, %o0 2767 bnz,pt %ncc, .ci_med_long0 2768 dec %o2 2769.ci_med_long1: ! word aligned 2770 btst 7, %o0 ! check for long word 2771 bz,pt %ncc, .ci_med_long2 2772 nop 2773 lduwa [%o0]ASI_USER, %o3 ! load word 2774 add %o0, 4, %o0 ! 
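/*
 * The copyin small and medium loops above are exact mirrors of the
 * copyout versions: the user ASI moves from the store side to the
 * load side, so each lduba/lduwa/ldxa ...ASI_USER pairs with a plain
 * stb/stw/stx.  As a minimal sketch (load_user_byte() is a
 * hypothetical stand-in; illustrative only):
 *
 *	while (count-- > 0)
 *		*kdst++ = load_user_byte(usrc++);
 */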
advance SRC by 4 2775 stw %o3, [%o1] ! store word 2776 add %o1, 4, %o1 ! advance DST by 4 2777 sub %o2, 4, %o2 ! reduce count by 4 2778! 2779! Now long word aligned and have at least 32 bytes to move 2780! 2781.ci_med_long2: 2782 sub %o2, 31, %o2 ! adjust count to allow cc zero test 2783.ci_med_lmove: 2784 ldxa [%o0]ASI_USER, %o3 ! read long word 2785 subcc %o2, 32, %o2 ! reduce count by 32 2786 stx %o3, [%o1] ! write long word 2787 add %o0, 8, %o0 ! advance SRC by 8 2788 ldxa [%o0]ASI_USER, %o3 ! repeat for a total for 4 long words 2789 add %o0, 8, %o0 ! advance SRC by 8 2790 stx %o3, [%o1 + 8] 2791 add %o1, 32, %o1 ! advance DST by 32 2792 ldxa [%o0]ASI_USER, %o3 2793 add %o0, 8, %o0 ! advance SRC by 8 2794 stx %o3, [%o1 - 16] 2795 ldxa [%o0]ASI_USER, %o3 2796 add %o0, 8, %o0 ! advance SRC by 8 2797 bgt,pt %ncc, .ci_med_lmove ! loop til 31 or fewer bytes left 2798 stx %o3, [%o1 - 8] 2799 addcc %o2, 24, %o2 ! restore count to long word offset 2800 ble,pt %ncc, .ci_med_lextra ! check for more long words to move 2801 nop 2802.ci_med_lword: 2803 ldxa [%o0]ASI_USER, %o3 ! read long word 2804 subcc %o2, 8, %o2 ! reduce count by 8 2805 stx %o3, [%o1] ! write long word 2806 add %o0, 8, %o0 ! advance SRC by 8 2807 bgt,pt %ncc, .ci_med_lword ! loop til 7 or fewer bytes left 2808 add %o1, 8, %o1 ! advance DST by 8 2809.ci_med_lextra: 2810 addcc %o2, 7, %o2 ! restore rest of count 2811 bz,pt %ncc, .ci_sm_exit ! if zero, then done 2812 deccc %o2 2813 bz,pt %ncc, .ci_sm_byte 2814 nop 2815 ba,pt %ncc, .ci_sm_half 2816 nop 2817 2818 .align 16 2819 nop ! instruction alignment 2820 ! see discussion at start of file 2821.ci_med_word: 2822 btst 3, %o0 ! check for 2823 bz,pt %ncc, .ci_med_word1 ! word alignment 2824 nop 2825.ci_med_word0: 2826 lduba [%o0]ASI_USER, %o3 ! load one byte 2827 inc %o0 2828 stb %o3,[%o1] ! store byte 2829 inc %o1 2830 btst 3, %o0 2831 bnz,pt %ncc, .ci_med_word0 2832 dec %o2 2833! 2834! Now word aligned and have at least 36 bytes to move 2835! 2836.ci_med_word1: 2837 sub %o2, 15, %o2 ! adjust count to allow cc zero test 2838.ci_med_wmove: 2839 lduwa [%o0]ASI_USER, %o3 ! read word 2840 subcc %o2, 16, %o2 ! reduce count by 16 2841 stw %o3, [%o1] ! write word 2842 add %o0, 4, %o0 ! advance SRC by 4 2843 lduwa [%o0]ASI_USER, %o3 ! repeat for a total for 4 words 2844 add %o0, 4, %o0 ! advance SRC by 4 2845 stw %o3, [%o1 + 4] 2846 add %o1, 16, %o1 ! advance DST by 16 2847 lduwa [%o0]ASI_USER, %o3 2848 add %o0, 4, %o0 ! advance SRC by 4 2849 stw %o3, [%o1 - 8] 2850 lduwa [%o0]ASI_USER, %o3 2851 add %o0, 4, %o0 ! advance SRC by 4 2852 bgt,pt %ncc, .ci_med_wmove ! loop til 15 or fewer bytes left 2853 stw %o3, [%o1 - 4] 2854 addcc %o2, 12, %o2 ! restore count to word offset 2855 ble,pt %ncc, .ci_med_wextra ! check for more words to move 2856 nop 2857.ci_med_word2: 2858 lduwa [%o0]ASI_USER, %o3 ! read word 2859 subcc %o2, 4, %o2 ! reduce count by 4 2860 stw %o3, [%o1] ! write word 2861 add %o0, 4, %o0 ! advance SRC by 4 2862 bgt,pt %ncc, .ci_med_word2 ! loop til 3 or fewer bytes left 2863 add %o1, 4, %o1 ! advance DST by 4 2864.ci_med_wextra: 2865 addcc %o2, 3, %o2 ! restore rest of count 2866 bz,pt %ncc, .ci_sm_exit ! if zero, then done 2867 deccc %o2 2868 bz,pt %ncc, .ci_sm_byte 2869 nop 2870 ba,pt %ncc, .ci_sm_half 2871 nop 2872 2873 .align 16 2874 nop ! instruction alignment 2875 ! see discussion at start of file 2876.ci_med_half: 2877 btst 1, %o0 ! check for 2878 bz,pt %ncc, .ci_med_half1 ! half word alignment 2879 nop 2880 lduba [%o0]ASI_USER, %o3 ! 
load one byte 2881 inc %o0 2882 stb %o3,[%o1] ! store byte 2883 inc %o1 2884 dec %o2 2885! 2886! Now half word aligned and have at least 38 bytes to move 2887! 2888.ci_med_half1: 2889 sub %o2, 7, %o2 ! adjust count to allow cc zero test 2890.ci_med_hmove: 2891 lduha [%o0]ASI_USER, %o3 ! read half word 2892 subcc %o2, 8, %o2 ! reduce count by 8 2893 sth %o3, [%o1] ! write half word 2894 add %o0, 2, %o0 ! advance SRC by 2 2895 lduha [%o0]ASI_USER, %o3 ! repeat for a total for 4 halfwords 2896 add %o0, 2, %o0 ! advance SRC by 2 2897 sth %o3, [%o1 + 2] 2898 add %o1, 8, %o1 ! advance DST by 8 2899 lduha [%o0]ASI_USER, %o3 2900 add %o0, 2, %o0 ! advance SRC by 2 2901 sth %o3, [%o1 - 4] 2902 lduha [%o0]ASI_USER, %o3 2903 add %o0, 2, %o0 ! advance SRC by 2 2904 bgt,pt %ncc, .ci_med_hmove ! loop til 7 or fewer bytes left 2905 sth %o3, [%o1 - 2] 2906 addcc %o2, 7, %o2 ! restore count 2907 bz,pt %ncc, .ci_sm_exit 2908 deccc %o2 2909 bz,pt %ncc, .ci_sm_byte 2910 nop 2911 ba,pt %ncc, .ci_sm_half 2912 nop 2913 2914.sm_copyin_err: 2915 membar #Sync 2916 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2917 mov SM_SAVE_SRC, %o0 2918 mov SM_SAVE_DST, %o1 2919 mov SM_SAVE_COUNT, %o2 2920 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 2921 tst %o3 2922 bz,pt %ncc, 3f ! if not, return error 2923 nop 2924 ldn [%o3 + CP_COPYIN], %o5 ! if handler, invoke it with 2925 jmp %o5 ! original arguments 2926 nop 29273: 2928 retl 2929 or %g0, -1, %o0 ! return errno value 2930 2931 SET_SIZE(copyin) 2932 2933 2934/* 2935 * The _more entry points are not intended to be used directly by 2936 * any caller from outside this file. They are provided to allow 2937 * profiling and dtrace of the portions of the copy code that uses 2938 * the floating point registers. 2939 * This entry is particularly important as DTRACE (at least as of 2940 * 4/2004) does not support leaf functions. 2941 */ 2942 2943 ENTRY(copyin_more) 2944.copyin_more: 2945 prefetch [%o0], #n_reads 2946 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 2947 set .copyin_err, REAL_LOFAULT 2948 2949/* 2950 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes 2951 */ 2952.do_copyin: 2953 set copyio_fault, %l7 ! .copyio_fault is lofault val 2954 2955 ldn [THREAD_REG + T_LOFAULT], %l6 ! save existing handler 2956 membar #Sync ! sync error barrier 2957 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 2958 2959 mov %i0, SAVE_SRC 2960 mov %i1, SAVE_DST 2961 mov %i2, SAVE_COUNT 2962 2963 FP_NOMIGRATE(6, 7) 2964 2965 rd %fprs, %o2 ! check for unused fp 2966 st %o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET] ! save orig %fprs 2967 btst FPRS_FEF, %o2 2968 bz,a,pt %icc, .do_blockcopyin 2969 wr %g0, FPRS_FEF, %fprs 2970 2971 BST_FPQ2Q4_TOSTACK(%o2) 2972 2973.do_blockcopyin: 2974 rd %gsr, %o2 2975 stx %o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET] ! save gsr 2976 or %l6, FPUSED_FLAG, %l6 2977 2978 andcc DST, VIS_BLOCKSIZE - 1, TMP 2979 mov ASI_USER, %asi 2980 bz,pt %ncc, 2f 2981 neg TMP 2982 add TMP, VIS_BLOCKSIZE, TMP 2983 2984 ! TMP = bytes required to align DST on FP_BLOCK boundary 2985 ! Using SRC as a tmp here 2986 cmp TMP, 3 2987 bleu,pt %ncc, 1f 2988 sub CNT,TMP,CNT ! adjust main count 2989 sub TMP, 3, TMP ! adjust for end of loop test 2990.ci_blkalign: 2991 lduba [REALSRC]%asi, SRC ! 
move 4 bytes per loop iteration 2992 stb SRC, [DST] 2993 subcc TMP, 4, TMP 2994 lduba [REALSRC + 1]%asi, SRC 2995 add REALSRC, 4, REALSRC 2996 stb SRC, [DST + 1] 2997 lduba [REALSRC - 2]%asi, SRC 2998 add DST, 4, DST 2999 stb SRC, [DST - 2] 3000 lduba [REALSRC - 1]%asi, SRC 3001 bgu,pt %ncc, .ci_blkalign 3002 stb SRC, [DST - 1] 3003 3004 addcc TMP, 3, TMP ! restore count adjustment 3005 bz,pt %ncc, 2f ! no bytes left? 3006 nop 30071: lduba [REALSRC]%asi, SRC 3008 inc REALSRC 3009 inc DST 3010 deccc TMP 3011 bgu %ncc, 1b 3012 stb SRC, [DST - 1] 3013 30142: 3015 membar #StoreLoad 3016 andn REALSRC, 0x7, SRC 3017 3018 ! SRC - 8-byte aligned 3019 ! DST - 64-byte aligned 3020 ldda [SRC]%asi, %f16 3021 prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads 3022 alignaddr REALSRC, %g0, %g0 3023 ldda [SRC + 0x08]%asi, %f18 3024 prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads 3025 faligndata %f16, %f18, %f48 3026 ldda [SRC + 0x10]%asi, %f20 3027 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads 3028 faligndata %f18, %f20, %f50 3029 ldda [SRC + 0x18]%asi, %f22 3030 prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read 3031 faligndata %f20, %f22, %f52 3032 ldda [SRC + 0x20]%asi, %f24 3033 prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read 3034 faligndata %f22, %f24, %f54 3035 ldda [SRC + 0x28]%asi, %f26 3036 prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read 3037 faligndata %f24, %f26, %f56 3038 ldda [SRC + 0x30]%asi, %f28 3039 prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read 3040 faligndata %f26, %f28, %f58 3041 ldda [SRC + 0x38]%asi, %f30 3042 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3043 sub CNT, VIS_BLOCKSIZE, CNT 3044 add SRC, VIS_BLOCKSIZE, SRC 3045 prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read 3046 add REALSRC, VIS_BLOCKSIZE, REALSRC 3047 ba,pt %ncc, 1f 3048 prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read 3049 .align 32 30501: 3051 ldda [SRC + 0x08]%asi, %f18 3052 faligndata %f28, %f30, %f60 3053 ldda [SRC + 0x10]%asi, %f20 3054 faligndata %f30, %f16, %f62 3055 stda %f48, [DST]ASI_BLK_P 3056 ldda [SRC + 0x18]%asi, %f22 3057 faligndata %f16, %f18, %f48 3058 ldda [SRC + 0x20]%asi, %f24 3059 faligndata %f18, %f20, %f50 3060 ldda [SRC + 0x28]%asi, %f26 3061 faligndata %f20, %f22, %f52 3062 ldda [SRC + 0x30]%asi, %f28 3063 faligndata %f22, %f24, %f54 3064 sub CNT, VIS_BLOCKSIZE, CNT 3065 ldda [SRC + 0x38]%asi, %f30 3066 faligndata %f24, %f26, %f56 3067 add DST, VIS_BLOCKSIZE, DST 3068 ldda [SRC + VIS_BLOCKSIZE]%asi, %f16 3069 faligndata %f26, %f28, %f58 3070 add REALSRC, VIS_BLOCKSIZE, REALSRC 3071 prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads 3072 add SRC, VIS_BLOCKSIZE, SRC 3073 prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 3074 cmp CNT, VIS_BLOCKSIZE + 8 3075 bgu,pt %ncc, 1b 3076 prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read 3077 3078 ! 
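/*
 * Note the asymmetry with .do_copyout: here the 8-byte loads come
 * from user space via %asi (ldda [...]%asi) while the 64-byte stda
 * targets kernel memory with ASI_BLK_P.  The loop shape is otherwise
 * identical (rough pseudo-code, illustrative only):
 *
 *	do {
 *		load next block from user space;	! ldda [SRC]%asi
 *		realign into %f48-%f62;			! faligndata
 *		stda %f48, [DST]ASI_BLK_P;		! kernel store
 *		CNT -= 64;
 *	} while (CNT > VIS_BLOCKSIZE + 8);
 */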
only if REALSRC & 0x7 is 0 3079 cmp CNT, VIS_BLOCKSIZE 3080 bne %ncc, 3f 3081 andcc REALSRC, 0x7, %g0 3082 bz,pt %ncc, 2f 3083 nop 30843: 3085 faligndata %f28, %f30, %f60 3086 faligndata %f30, %f16, %f62 3087 stda %f48, [DST]ASI_BLK_P 3088 add DST, VIS_BLOCKSIZE, DST 3089 ba,pt %ncc, 3f 3090 nop 30912: 3092 ldda [SRC + 0x08]%asi, %f18 3093 fsrc1 %f28, %f60 3094 ldda [SRC + 0x10]%asi, %f20 3095 fsrc1 %f30, %f62 3096 stda %f48, [DST]ASI_BLK_P 3097 ldda [SRC + 0x18]%asi, %f22 3098 fsrc1 %f16, %f48 3099 ldda [SRC + 0x20]%asi, %f24 3100 fsrc1 %f18, %f50 3101 ldda [SRC + 0x28]%asi, %f26 3102 fsrc1 %f20, %f52 3103 ldda [SRC + 0x30]%asi, %f28 3104 fsrc1 %f22, %f54 3105 ldda [SRC + 0x38]%asi, %f30 3106 fsrc1 %f24, %f56 3107 sub CNT, VIS_BLOCKSIZE, CNT 3108 add DST, VIS_BLOCKSIZE, DST 3109 add SRC, VIS_BLOCKSIZE, SRC 3110 add REALSRC, VIS_BLOCKSIZE, REALSRC 3111 fsrc1 %f26, %f58 3112 fsrc1 %f28, %f60 3113 fsrc1 %f30, %f62 3114 stda %f48, [DST]ASI_BLK_P 3115 add DST, VIS_BLOCKSIZE, DST 3116 ba,a,pt %ncc, 4f 3117 nop 3118 31193: tst CNT 3120 bz,a %ncc, 4f 3121 nop 3122 31235: lduba [REALSRC]ASI_USER, TMP 3124 inc REALSRC 3125 inc DST 3126 deccc CNT 3127 bgu %ncc, 5b 3128 stb TMP, [DST - 1] 31294: 3130 3131.copyin_exit: 3132 membar #Sync 3133 3134 ldx [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 ! restore gsr 3135 wr %o2, 0, %gsr 3136 3137 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 3138 btst FPRS_FEF, %o3 3139 bz,pt %icc, 4f 3140 nop 3141 3142 BLD_FPQ2Q4_FROMSTACK(%o2) 3143 3144 ba,pt %ncc, 1f 3145 wr %o3, 0, %fprs ! restore fprs 3146 31474: 3148 FZEROQ2Q4 3149 wr %o3, 0, %fprs ! restore fprs 3150 31511: 3152 membar #Sync ! sync error barrier 3153 andn %l6, FPUSED_FLAG, %l6 3154 stn %l6, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3155 FP_ALLOWMIGRATE(5, 6) 3156 ret 3157 restore %g0, 0, %o0 3158/* 3159 * We got here because of a fault during copyin 3160 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh). 3161 */ 3162.copyin_err: 3163 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3164 tst %o4 3165 bz,pt %ncc, 2f ! if not, return error 3166 nop 3167 ldn [%o4 + CP_COPYIN], %g2 ! if handler, invoke it with 3168 jmp %g2 ! original arguments 3169 restore %g0, 0, %g0 ! dispose of copy window 31702: 3171 ret 3172 restore %g0, -1, %o0 ! return error value 3173 3174 3175 SET_SIZE(copyin_more) 3176 3177#endif /* lint */ 3178 3179#ifdef lint 3180 3181/*ARGSUSED*/ 3182int 3183xcopyin(const void *uaddr, void *kaddr, size_t count) 3184{ return (0); } 3185 3186#else /* lint */ 3187 3188 ENTRY(xcopyin) 3189 3190 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3191 bleu,pt %ncc, .xcopyin_small ! go to larger cases 3192 xor %o0, %o1, %o3 ! are src, dst alignable? 3193 btst 7, %o3 ! 3194 bz,pt %ncc, .xcopyin_8 ! check for longword alignment 3195 nop 3196 btst 1, %o3 ! 3197 bz,pt %ncc, .xcopyin_2 ! check for half-word 3198 nop 3199 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3200 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3201 tst %o3 3202 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3203 cmp %o2, %o3 ! if length <= limit 3204 bleu,pt %ncc, .xcopyin_small ! go to small copy 3205 nop 3206 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3207 nop 3208.xcopyin_2: 3209 btst 3, %o3 ! 3210 bz,pt %ncc, .xcopyin_4 ! check for word alignment 3211 nop 3212 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3213 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3214 tst %o3 3215 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3216 cmp %o2, %o3 ! 
if length <= limit 3217 bleu,pt %ncc, .xcopyin_small ! go to small copy 3218 nop 3219 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3220 nop 3221.xcopyin_4: 3222 ! already checked longword, must be word aligned 3223 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3224 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3225 tst %o3 3226 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3227 cmp %o2, %o3 ! if length <= limit 3228 bleu,pt %ncc, .xcopyin_small ! go to small copy 3229 nop 3230 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3231 nop 3232.xcopyin_8: 3233 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3234 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3235 tst %o3 3236 bz,pn %icc, .xcopyin_small ! if zero, disable HW copy 3237 cmp %o2, %o3 ! if length <= limit 3238 bleu,pt %ncc, .xcopyin_small ! go to small copy 3239 nop 3240 ba,pt %ncc, .xcopyin_more ! otherwise go to large copy 3241 nop 3242 3243.xcopyin_small: 3244 sethi %hi(.sm_xcopyin_err), %o5 ! .sm_xcopyin_err is lofault value 3245 or %o5, %lo(.sm_xcopyin_err), %o5 3246 ldn [THREAD_REG + T_LOFAULT], %o4 ! set/save t_lofaul 3247 membar #Sync ! sync error barrier 3248 ba,pt %ncc, .sm_do_copyin ! common code 3249 stn %o5, [THREAD_REG + T_LOFAULT] 3250 3251.xcopyin_more: 3252 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3253 sethi %hi(.xcopyin_err), REAL_LOFAULT ! .xcopyin_err is lofault value 3254 ba,pt %ncc, .do_copyin 3255 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 3256 3257/* 3258 * We got here because of fault during xcopyin 3259 * Errno value is in ERRNO 3260 */ 3261.xcopyin_err: 3262 ldn [THREAD_REG + T_COPYOPS], %o4 ! check for copyop handler 3263 tst %o4 3264 bz,pt %ncc, 2f ! if not, return error 3265 nop 3266 ldn [%o4 + CP_XCOPYIN], %g2 ! if handler, invoke it with 3267 jmp %g2 ! original arguments 3268 restore %g0, 0, %g0 ! dispose of copy window 32692: 3270 ret 3271 restore ERRNO, 0, %o0 ! return errno value 3272 3273.sm_xcopyin_err: 3274 3275 membar #Sync 3276 stn %o4, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3277 mov SM_SAVE_SRC, %o0 3278 mov SM_SAVE_DST, %o1 3279 mov SM_SAVE_COUNT, %o2 3280 ldn [THREAD_REG + T_COPYOPS], %o3 ! check for copyop handler 3281 tst %o3 3282 bz,pt %ncc, 3f ! if not, return error 3283 nop 3284 ldn [%o3 + CP_XCOPYIN], %o5 ! if handler, invoke it with 3285 jmp %o5 ! original arguments 3286 nop 32873: 3288 retl 3289 or %g1, 0, %o0 ! return errno value 3290 3291 SET_SIZE(xcopyin) 3292 3293#endif /* lint */ 3294 3295#ifdef lint 3296 3297/*ARGSUSED*/ 3298int 3299xcopyin_little(const void *uaddr, void *kaddr, size_t count) 3300{ return (0); } 3301 3302#else /* lint */ 3303 3304 ENTRY(xcopyin_little) 3305 sethi %hi(.xcopyio_err), %o5 3306 or %o5, %lo(.xcopyio_err), %o5 3307 ldn [THREAD_REG + T_LOFAULT], %o4 3308 membar #Sync ! sync error barrier 3309 stn %o5, [THREAD_REG + T_LOFAULT] 3310 mov %o4, %o5 3311 3312 subcc %g0, %o2, %o3 3313 add %o0, %o2, %o0 3314 bz,pn %ncc, 2f ! check for zero bytes 3315 sub %o2, 1, %o4 3316 add %o0, %o4, %o0 ! start w/last byte 3317 add %o1, %o2, %o1 3318 lduba [%o0 + %o3]ASI_AIUSL, %o4 3319 33201: stb %o4, [%o1 + %o3] 3321 inccc %o3 3322 sub %o0, 2, %o0 ! get next byte 3323 bcc,a,pt %ncc, 1b 3324 lduba [%o0 + %o3]ASI_AIUSL, %o4 3325 33262: 3327 membar #Sync ! sync error barrier 3328 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3329 retl 3330 mov %g0, %o0 ! return (0) 3331 3332.xcopyio_err: 3333 membar #Sync ! sync error barrier 3334 stn %o5, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault 3335 retl 3336 mov %g1, %o0 3337 3338 SET_SIZE(xcopyin_little) 3339 3340#endif /* lint */ 3341 3342 3343/* 3344 * Copy a block of storage - must not overlap (from + len <= to). 3345 * No fault handler installed (to be called under on_fault()) 3346 */ 3347#if defined(lint) 3348 3349/* ARGSUSED */ 3350void 3351copyin_noerr(const void *ufrom, void *kto, size_t count) 3352{} 3353 3354#else /* lint */ 3355 ENTRY(copyin_noerr) 3356 3357 cmp %o2, VIS_COPY_THRESHOLD ! check for leaf rtn case 3358 bleu,pt %ncc, .copyin_ne_small ! go to larger cases 3359 xor %o0, %o1, %o3 ! are src, dst alignable? 3360 btst 7, %o3 ! 3361 bz,pt %ncc, .copyin_ne_8 ! check for longword alignment 3362 nop 3363 btst 1, %o3 ! 3364 bz,pt %ncc, .copyin_ne_2 ! check for half-word 3365 nop 3366 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3367 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3368 tst %o3 3369 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3370 cmp %o2, %o3 ! if length <= limit 3371 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3372 nop 3373 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3374 nop 3375.copyin_ne_2: 3376 btst 3, %o3 ! 3377 bz,pt %ncc, .copyin_ne_4 ! check for word alignment 3378 nop 3379 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3380 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3381 tst %o3 3382 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3383 cmp %o2, %o3 ! if length <= limit 3384 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3385 nop 3386 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3387 nop 3388.copyin_ne_4: 3389 ! already checked longword, must be word aligned 3390 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3391 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3392 tst %o3 3393 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3394 cmp %o2, %o3 ! if length <= limit 3395 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3396 nop 3397 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3398 nop 3399.copyin_ne_8: 3400 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3401 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3402 tst %o3 3403 bz,pn %icc, .copyin_ne_small ! if zero, disable HW copy 3404 cmp %o2, %o3 ! if length <= limit 3405 bleu,pt %ncc, .copyin_ne_small ! go to small copy 3406 nop 3407 ba,pt %ncc, .copyin_noerr_more ! otherwise go to large copy 3408 nop 3409 3410.copyin_ne_small: 3411 ldn [THREAD_REG + T_LOFAULT], %o4 3412 tst %o4 3413 bz,pn %ncc, .sm_do_copyin 3414 nop 3415 sethi %hi(.sm_copyio_noerr), %o5 3416 or %o5, %lo(.sm_copyio_noerr), %o5 3417 membar #Sync ! sync error barrier 3418 ba,pt %ncc, .sm_do_copyin 3419 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3420 3421.copyin_noerr_more: 3422 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3423 sethi %hi(.copyio_noerr), REAL_LOFAULT 3424 ba,pt %ncc, .do_copyin 3425 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3426 3427.copyio_noerr: 3428 jmp %l6 3429 restore %g0,0,%g0 3430 3431.sm_copyio_noerr: 3432 membar #Sync 3433 stn %o4, [THREAD_REG + T_LOFAULT] ! restore t_lofault 3434 jmp %o4 3435 nop 3436 3437 SET_SIZE(copyin_noerr) 3438#endif /* lint */ 3439 3440/* 3441 * Copy a block of storage - must not overlap (from + len <= to). 3442 * No fault handler installed (to be called under on_fault()) 3443 */ 3444 3445#if defined(lint) 3446 3447/* ARGSUSED */ 3448void 3449copyout_noerr(const void *kfrom, void *uto, size_t count) 3450{} 3451 3452#else /* lint */ 3453 ENTRY(copyout_noerr) 3454 3455 cmp %o2, VIS_COPY_THRESHOLD ! 
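/*
 * As the comments for the _noerr entries note, the caller must
 * already have a handler installed via on_fault().  A rough usage
 * sketch (illustrative only):
 *
 *	label_t ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);	! a fault occurred mid-copy
 *	}
 *	copyout_noerr(kfrom, uto, count);
 *	no_fault();
 */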
check for leaf rtn case 3456 bleu,pt %ncc, .copyout_ne_small ! go to larger cases 3457 xor %o0, %o1, %o3 ! are src, dst alignable? 3458 btst 7, %o3 ! 3459 bz,pt %ncc, .copyout_ne_8 ! check for longword alignment 3460 nop 3461 btst 1, %o3 ! 3462 bz,pt %ncc, .copyout_ne_2 ! check for half-word 3463 nop 3464 sethi %hi(hw_copy_limit_1), %o3 ! Check copy limit 3465 ld [%o3 + %lo(hw_copy_limit_1)], %o3 3466 tst %o3 3467 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3468 cmp %o2, %o3 ! if length <= limit 3469 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3470 nop 3471 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3472 nop 3473.copyout_ne_2: 3474 btst 3, %o3 ! 3475 bz,pt %ncc, .copyout_ne_4 ! check for word alignment 3476 nop 3477 sethi %hi(hw_copy_limit_2), %o3 ! Check copy limit 3478 ld [%o3 + %lo(hw_copy_limit_2)], %o3 3479 tst %o3 3480 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3481 cmp %o2, %o3 ! if length <= limit 3482 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3483 nop 3484 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3485 nop 3486.copyout_ne_4: 3487 ! already checked longword, must be word aligned 3488 sethi %hi(hw_copy_limit_4), %o3 ! Check copy limit 3489 ld [%o3 + %lo(hw_copy_limit_4)], %o3 3490 tst %o3 3491 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3492 cmp %o2, %o3 ! if length <= limit 3493 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3494 nop 3495 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3496 nop 3497.copyout_ne_8: 3498 sethi %hi(hw_copy_limit_8), %o3 ! Check copy limit 3499 ld [%o3 + %lo(hw_copy_limit_8)], %o3 3500 tst %o3 3501 bz,pn %icc, .copyout_ne_small ! if zero, disable HW copy 3502 cmp %o2, %o3 ! if length <= limit 3503 bleu,pt %ncc, .copyout_ne_small ! go to small copy 3504 nop 3505 ba,pt %ncc, .copyout_noerr_more ! otherwise go to large copy 3506 nop 3507 3508.copyout_ne_small: 3509 ldn [THREAD_REG + T_LOFAULT], %o4 3510 tst %o4 3511 bz,pn %ncc, .sm_do_copyout 3512 nop 3513 sethi %hi(.sm_copyio_noerr), %o5 3514 or %o5, %lo(.sm_copyio_noerr), %o5 3515 membar #Sync ! sync error barrier 3516 ba,pt %ncc, .sm_do_copyout 3517 stn %o5, [THREAD_REG + T_LOFAULT] ! set/save t_lofault 3518 3519.copyout_noerr_more: 3520 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3521 sethi %hi(.copyio_noerr), REAL_LOFAULT 3522 ba,pt %ncc, .do_copyout 3523 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3524 3525 SET_SIZE(copyout_noerr) 3526#endif /* lint */ 3527 3528 3529/* 3530 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 3531 * longer than 256 bytes in length using spitfire's block stores. If 3532 * the criteria for using this routine are not met then it calls bzero 3533 * and returns 1. Otherwise 0 is returned indicating success. 3534 * Caller is responsible for ensuring use_hw_bzero is true and that 3535 * kpreempt_disable() has been called. 3536 */ 3537#ifdef lint 3538/*ARGSUSED*/ 3539int 3540hwblkclr(void *addr, size_t len) 3541{ 3542 return(0); 3543} 3544#else /* lint */ 3545 ! %i0 - start address 3546 ! %i1 - length of region (multiple of 64) 3547 ! %l0 - saved fprs 3548 ! %l1 - pointer to saved %d0 block 3549 ! %l2 - saved curthread->t_lwp 3550 3551 ENTRY(hwblkclr) 3552 ! get another window w/space for one aligned block of saved fpregs 3553 save %sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp 3554 3555 ! Must be block-aligned 3556 andcc %i0, (VIS_BLOCKSIZE-1), %g0 3557 bnz,pn %ncc, 1f 3558 nop 3559 3560 ! ... 
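/*
 * hwblkclr's entry criteria, restated as a rough C sketch around
 * these checks (illustrative only; IS_P2ALIGNED is the usual kernel
 * alignment macro):
 *
 *	if (!IS_P2ALIGNED(addr, VIS_BLOCKSIZE) || len < 256 ||
 *	    !IS_P2ALIGNED(len, VIS_BLOCKSIZE)) {
 *		bzero(addr, len);
 *		return (1);		! caller knows bzero was used
 *	}
 *	... block-store loop ...
 *	return (0);
 */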
and must be 256 bytes or more 3561 cmp %i1, 256 3562 blu,pn %ncc, 1f 3563 nop 3564 3565 ! ... and length must be a multiple of VIS_BLOCKSIZE 3566 andcc %i1, (VIS_BLOCKSIZE-1), %g0 3567 bz,pn %ncc, 2f 3568 nop 3569 35701: ! punt, call bzero but notify the caller that bzero was used 3571 mov %i0, %o0 3572 call bzero 3573 mov %i1, %o1 3574 ret 3575 restore %g0, 1, %o0 ! return (1) - did not use block operations 3576 35772: rd %fprs, %l0 ! check for unused fp 3578 btst FPRS_FEF, %l0 3579 bz,pt %icc, 1f 3580 nop 3581 3582 ! save in-use fpregs on stack 3583 membar #Sync 3584 add %fp, STACK_BIAS - 65, %l1 3585 and %l1, -VIS_BLOCKSIZE, %l1 3586 stda %d0, [%l1]ASI_BLK_P 3587 35881: membar #StoreStore|#StoreLoad|#LoadStore 3589 wr %g0, FPRS_FEF, %fprs 3590 wr %g0, ASI_BLK_P, %asi 3591 3592 ! Clear block 3593 fzero %d0 3594 fzero %d2 3595 fzero %d4 3596 fzero %d6 3597 fzero %d8 3598 fzero %d10 3599 fzero %d12 3600 fzero %d14 3601 3602 mov 256, %i3 3603 ba,pt %ncc, .pz_doblock 3604 nop 3605 3606.pz_blkstart: 3607 ! stda %d0, [%i0 + 192]%asi ! in dly slot of branch that got us here 3608 stda %d0, [%i0 + 128]%asi 3609 stda %d0, [%i0 + 64]%asi 3610 stda %d0, [%i0]%asi 3611.pz_zinst: 3612 add %i0, %i3, %i0 3613 sub %i1, %i3, %i1 3614.pz_doblock: 3615 cmp %i1, 256 3616 bgeu,a %ncc, .pz_blkstart 3617 stda %d0, [%i0 + 192]%asi 3618 3619 cmp %i1, 64 3620 blu %ncc, .pz_finish 3621 3622 andn %i1, (64-1), %i3 3623 srl %i3, 4, %i2 ! using blocks, 1 instr / 16 words 3624 set .pz_zinst, %i4 3625 sub %i4, %i2, %i4 3626 jmp %i4 3627 nop 3628 3629.pz_finish: 3630 membar #Sync 3631 btst FPRS_FEF, %l0 3632 bz,a .pz_finished 3633 wr %l0, 0, %fprs ! restore fprs 3634 3635 ! restore fpregs from stack 3636 ldda [%l1]ASI_BLK_P, %d0 3637 membar #Sync 3638 wr %l0, 0, %fprs ! restore fprs 3639 3640.pz_finished: 3641 ret 3642 restore %g0, 0, %o0 ! return (bzero or not) 3643 3644 SET_SIZE(hwblkclr) 3645#endif /* lint */ 3646 3647#ifdef lint 3648/*ARGSUSED*/ 3649void 3650hw_pa_bcopy32(uint64_t src, uint64_t dst) 3651{} 3652#else /*!lint */ 3653 /* 3654 * Copy 32 bytes of data from src (%o0) to dst (%o1) 3655 * using physical addresses. 3656 */ 3657 ENTRY_NP(hw_pa_bcopy32) 3658 rdpr %pstate, %g1 3659 andn %g1, PSTATE_IE, %g2 3660 wrpr %g0, %g2, %pstate 3661 3662 rdpr %pstate, %g0 3663 ldxa [%o0]ASI_MEM, %o2 3664 add %o0, 8, %o0 3665 ldxa [%o0]ASI_MEM, %o3 3666 add %o0, 8, %o0 3667 ldxa [%o0]ASI_MEM, %o4 3668 add %o0, 8, %o0 3669 ldxa [%o0]ASI_MEM, %o5 3670 membar #Sync 3671 3672 stxa %o2, [%o1]ASI_MEM 3673 add %o1, 8, %o1 3674 stxa %o3, [%o1]ASI_MEM 3675 add %o1, 8, %o1 3676 stxa %o4, [%o1]ASI_MEM 3677 add %o1, 8, %o1 3678 stxa %o5, [%o1]ASI_MEM 3679 3680 retl 3681 wrpr %g0, %g1, %pstate 3682 3683 SET_SIZE(hw_pa_bcopy32) 3684 3685#endif /* lint */ 3686 3687#if defined(lint) 3688 3689int use_hw_bcopy = 1; 3690int use_hw_bzero = 1; 3691uint_t hw_copy_limit_1 = 0; 3692uint_t hw_copy_limit_2 = 0; 3693uint_t hw_copy_limit_4 = 0; 3694uint_t hw_copy_limit_8 = 0; 3695 3696#else /* !lint */ 3697 3698 DGDEF(use_hw_bcopy) 3699 .word 1 3700 DGDEF(use_hw_bzero) 3701 .word 1 3702 DGDEF(hw_copy_limit_1) 3703 .word 0 3704 DGDEF(hw_copy_limit_2) 3705 .word 0 3706 DGDEF(hw_copy_limit_4) 3707 .word 0 3708 DGDEF(hw_copy_limit_8) 3709 .word 0 3710 3711 .align 64 3712 .section ".text" 3713#endif /* !lint */ 3714
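/*
 * A note on the tunables above: with the hw_copy_limit_* values left
 * at zero, every copy takes the plain (non-FPBLK) path; CPU/platform
 * initialization code is expected to size them.  Purely as a
 * hypothetical example, a platform might set
 *
 *	hw_copy_limit_1 = 256;		! byte-aligned src/dst
 *	hw_copy_limit_8 = 1024;		! long-word-aligned src/dst
 *
 * so that only copies longer than these limits use the FP block-copy
 * path for the corresponding alignment class.
 */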