/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>

#if !defined(lint)
#include "assym.h"
#endif	/* lint */

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 *	! Determine whether to use the FP register version or the
 *	! leaf routine version depending on the size of the copy and
 *	! on the flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 *	! dst addresses can be aligned to long word, word,
 *	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For the FP version, %l6 holds the previous error handling and
 *	! a flag: TRAMP_FLAG (low bits).
 *	! For the leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 *	if (length <= VIS_COPY_THRESHOLD)	! start with a quick test
 *		go to small_copy;		! to speed short copies
 *
 *	if (src,dst long word alignable) {
 *		if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_8)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst not alignable) {
 *		if (hw_copy_limit_1 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_1)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst halfword alignable) {
 *		if (hw_copy_limit_2 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_2)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *	if (src,dst word alignable) {
 *		if (hw_copy_limit_4 == 0)	! hw_copy disabled
 *			go to small_copy;
 *		if (length <= hw_copy_limit_4)
 *			go to small_copy;
 *		go to FPBLK_copy;
 *	}
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler;	! differs for each entry point
 *
 *	if (count <= 3)			! fast path for tiny copies
 *		go to sm_left;		! special finish up code
 *	else
 *		if (count > CHKSIZE)	! medium sized copies
 *			go to sm_med;	! tuned by alignment
 *	if (src&dst not both word aligned) {
 * sm_movebytes:
 *		move byte by byte in 4-way unrolled loop
 *		fall into sm_left;
 * sm_left:
 *		move 0-3 bytes byte at a time as needed.
 *		restore error handler and exit.
 *
 *	} else {	! src&dst are word aligned
 *		check for at least 8 bytes left,
 *		move word at a time, unrolled by 2
 *		when fewer than 8 bytes left,
 * sm_half:	move half word at a time while 2 or more bytes left
 * sm_byte:	move final byte if necessary
 * sm_exit:
 *		restore error handler and exit.
 *	}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if (src&dst unalignable)
 *		go to sm_movebytes
 *	if (src&dst halfword alignable)
 *		go to sm_movehalf
 *	if (src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 *	%l6 = curthread->t_lofault;
 *	if (%l6 != NULL) {
 *		membar #Sync
 *		curthread->t_lofault = .copyerr;
 *		caller_error_handler = TRUE	! %l6 |= 2
 *	}
 *
 *	! for FPU testing we must not migrate cpus
 *	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 *		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 *	old_fprs = %fprs;
 *	old_gsr = %gsr;
 *	if (%fprs.fef) {
 *		%fprs.fef = 1;
 *		save current fpregs on stack using blockstore
 *	} else {
 *		%fprs.fef = 1;
 *	}
 *
 *
 *	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 *	%gsr = old_gsr;
 *	if (old_fprs & FPRS_FEF)
 *		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 *	%fprs = old_fprs;
 *	membar #Sync
 *	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 *	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 *	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent returning with corrupted fp state, we panic.
 */

/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 *	byte aligned:	108 clocks slower for non-FPBLK
 *	half aligned:	 44 clocks slower for non-FPBLK
 *	word aligned:	 12 clocks slower for non-FPBLK
 *	long aligned:	  4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies.  Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 *	hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 *	hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 *	hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 *	hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Default values as of May 2005 are:
 *	hw_copy_limit_1 = 256
 *	hw_copy_limit_2 = 512
 *	hw_copy_limit_4 = 1024
 *	hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
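 * As an illustration only (not part of the kernel build), the
 * alignment/limit dispatch described above could be written in C
 * roughly as follows.  The use_fpblk() helper name is hypothetical;
 * hw_copy_limit_* and VIS_COPY_THRESHOLD are the real tunables:
 *
 *	extern uint_t hw_copy_limit_1, hw_copy_limit_2;
 *	extern uint_t hw_copy_limit_4, hw_copy_limit_8;
 *
 *	static int
 *	use_fpblk(uintptr_t src, uintptr_t dst, size_t len)
 *	{
 *		uintptr_t x = src ^ dst;	! differing low address bits
 *		uint_t limit;
 *
 *		if (len <= VIS_COPY_THRESHOLD)
 *			return (0);		! always use the leaf copy
 *		if ((x & 7) == 0)
 *			limit = hw_copy_limit_8;
 *		else if (x & 1)
 *			limit = hw_copy_limit_1;
 *		else if (x & 2)
 *			limit = hw_copy_limit_2;
 *		else
 *			limit = hw_copy_limit_4;
 *		return (limit != 0 && len > limit);
 *	}
 *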
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 *	4 of 8 will not be alignable.
 *	2 of 8 will be half word alignable.
 *	1 of 8 will be word alignable.
 *	1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We further subdivide the non-FPBLK case into copies of CHKSIZE
 * bytes or less vs longer copies.  For the really short case, we
 * don't attempt to align src and dst.  We try to minimize special
 * case tests in the shortest loops as each test adds a significant
 * percentage to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases.  The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */
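/*
 * Illustrative sketch only (not part of this file): the dual-prefetch
 * pattern described above, expressed as a C loop.  PF_FAR, PF_NEAR and
 * copy_one_line() are hypothetical stand-ins; the real inner loops below
 * issue prefetch instructions with #n_reads/#one_read hints at reaches
 * governed by OLYMPUS_C_PREFETCH and OLYMPUS_C_2ND_PREFETCH:
 *
 *	#define	LINE	64	! VIS_BLOCKSIZE
 *	#define	PF_FAR	8	! primary reach, in cache lines
 *	#define	PF_NEAR	5	! backup reach, catches dropped prefetches
 *
 *	for (i = 0; i < nlines; i++) {
 *		__builtin_prefetch(src + (i + PF_FAR) * LINE, 0, 0);
 *		__builtin_prefetch(src + (i + PF_NEAR) * LINE, 0, 0);
 *		copy_one_line(dst + i * LINE, src + i * LINE);
 *	}
 */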
/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the
 *	  handling uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or in an error that really only
 * affects copy data being interpreted as losing kernel state.
 *
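 * As an illustration only, the install/clear protocol with its error
 * barriers can be sketched as follows (set_t_lofault() is a hypothetical
 * helper; the real code stores to curthread->t_lofault directly):
 *
 *	membar #Sync;			! flush deferred kernel errors
 *	set_t_lofault(.copyerr);	! faults now trampoline
 *	... do copy ...
 *	membar #Sync;			! flush errors into the copy window
 *	set_t_lofault(saved_handler);	! faults now panic again
 *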
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */

/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references.  Change it here, change it
 * there.
 */
#define	VIS_COPY_THRESHOLD	256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define	SHORTCOPY	3
#define	CHKSIZE		39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3

/*
 * Number of outstanding prefetches.
 * The first prefetch moves data from L2 to L1 (n_reads);
 * the second prefetch moves data from memory to L2 (one_read).
 */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
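
/*
 * Illustrative sketch only: why three blocks are reserved for a
 * two-block save area.  The save macros below compute
 *
 *	tmp = (%fp + STACK_BIAS - SAVED_FPREGS_ADJUST) & -VIS_BLOCKSIZE
 *
 * i.e. they back up just under two blocks from the frame top and then
 * round down to a block boundary.  Whatever the alignment of %fp, tmp
 * and tmp + VIS_BLOCKSIZE both land inside the reserved
 * 3 * VIS_BLOCKSIZE region, giving two block-aligned save slots.
 * In C-like form (addresses only, names as above):
 *
 *	uintptr_t top = fp + STACK_BIAS;
 *	uintptr_t tmp = (top - SAVED_FPREGS_ADJUST) & ~(uintptr_t)63;
 *	! tmp >= top - 3 * VIS_BLOCKSIZE and tmp + 2 * 64 <= top
 */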

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
#if !defined(lint)
#define	BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define	BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
#endif

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass
 * registers to use - we just use any outputs we want.
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_nomigrate();
 *	} else {
 *		kpreempt_disable();
 *	}
 *
 * FP_ALLOWMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_allowmigrate();
 *	} else {
 *		kpreempt_enable();
 *	}
 */

#define	FP_NOMIGRATE(label1, label2)			\
	ldn	[THREAD_REG + T_LWP], %o0		;\
	brz,a,pn %o0, label1/**/f			;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1		;\
	call	thread_nomigrate			;\
	nop						;\
	ba	label2/**/f				;\
	nop						;\
label1:							;\
	inc	%o1					;\
	stb	%o1, [THREAD_REG + T_PREEMPT]		;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)			\
	ldn	[THREAD_REG + T_LWP], %o0		;\
	brz,a,pn %o0, label1/**/f			;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1		;\
	call	thread_allowmigrate			;\
	nop						;\
	ba	label2/**/f				;\
	nop						;\
label1:							;\
	dec	%o1					;\
	brnz,pn	%o1, label2/**/f			;\
	stb	%o1, [THREAD_REG + T_PREEMPT]		;\
	ldn	[THREAD_REG + T_CPU], %o0		;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0		;\
	brz,pt	%o0, label2/**/f			;\
	nop						;\
	call	kpreempt				;\
	rdpr	%pil, %o0				;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok.
 */

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return (0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy
 * or bcopy.  Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs			! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs			! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy.  There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	mov	%g0, %o0			!

	SET_SIZE(kcopy)
#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 *
 * Copy a page of memory.
 * Assumes double word alignment and a count >= 256.
 */
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	stb	%o3, [%o1 + 2]
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	nop
.bc_sm_byte:
	ldub	[%o0], %o3
	stb	%o3, [%o1]
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	brz,pt	%o4, .bc_sm_done
	nop
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	mov	%g0, %o0		! return 0

	.align	16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	dec	%o2
.bc_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	SET_SIZE(bcopy)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(bcopy_more)
.bcopy_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path.  If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
 * Also, use of FP registers has been tested to be enabled.
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2			! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f10, %f12, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync			! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)

#endif	/* lint */

/*
 * Block copy with possibly overlapped operands.
 */

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop

	SET_SIZE(ovbcopy)

#endif	/* lint */
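
/*
 * Illustrative sketch only: the overlap handling above, in C.  If the
 * regions are disjoint (size <= |from - to|) plain bcopy is used;
 * otherwise the copy direction is chosen so that source bytes are read
 * before they are overwritten (ovbcopy_sketch() is a hypothetical name):
 *
 *	void
 *	ovbcopy_sketch(const char *from, char *to, size_t count)
 *	{
 *		size_t diff = (from > to) ? from - to : to - from;
 *
 *		if (count == 0)
 *			return;
 *		if (count <= diff) {
 *			bcopy(from, to, count);	! no overlap
 *		} else if (from > to) {
 *			while (count--)		! forward, low to high
 *				*to++ = *from++;
 *		} else {
 *			while (count--)		! backward, high to low
 *				to[count] = from[count];
 *		}
 *	}
 */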

/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else	/* lint */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f0, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f2, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f4, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f6, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f8, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f10, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	ba,pt	%ncc, 2f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
2:
	ldd	[SRC + 0x08], %f2
	fmovd	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fmovd	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fmovd	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fmovd	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fmovd	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fmovd	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fmovd	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fmovd	%f10, %f42
	sub	CNT, VIS_BLOCKSIZE, CNT
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	SRC, VIS_BLOCKSIZE, SRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 2b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)
#endif	/* lint */


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and
 * xcopyout() - which return the errno that we've faithfully computed.
 * This allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
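 *
 * As an illustration only, the difference in error semantics can be
 * sketched in C (do_copyout() is a hypothetical stand-in for the
 * shared copy path, which returns 0 or an errno):
 *
 *	int
 *	copyout(const void *kaddr, void *uaddr, size_t count)
 *	{
 *		return (do_copyout(kaddr, uaddr, count) == 0 ? 0 : -1);
 *	}
 *
 *	int
 *	xcopyout(const void *kaddr, void *uaddr, size_t count)
 *	{
 *		return (do_copyout(kaddr, uaddr, count));	! 0 or errno
 *	}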
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned.  Future enhancement to make them handle 8k pages
 * efficiently is left as an exercise...
 */

/*
 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
 *
 * General theory of operation:
 *
 * The only difference between copy{in,out} and
 * xcopy{in,out} is in the error handling routine they invoke
 * when a memory access error occurs.  xcopyOP returns the errno
 * while copyOP returns -1 (see above).  copy{in,out}_noerr set
 * a special flag (by oring the TRAMP_FLAG into the fault handler address)
 * if they are called with a fault handler already in place.  That flag
 * causes the default handlers to trampoline to the previous handler
 * upon an error.
 *
 * None of the copyops routines grab a window until it's decided that
 * we need to do a HW block copy operation.  This saves a window
 * spill/fill when we're called during socket ops.  The typical IO
 * path won't cause spill/fill traps.
 *
 * This code uses a set of 4 limits for the maximum size that will
 * be copied given a particular input/output address alignment.
 * If the value for a particular limit is zero, the copy will be performed
 * by the plain copy loops rather than FPBLK.
 *
 * See the description of bcopy above for more details of the
 * data copying algorithm and the default limits.
 *
 */

/*
 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
 */

#if defined(lint)


#else	/* lint */
/*
 * We save the arguments in the following registers in case of a fault:
 *	kaddr - %l1
 *	uaddr - %l2
 *	count - %l3
 */
#define	SAVE_SRC	%l1
#define	SAVE_DST	%l2
#define	SAVE_COUNT	%l3

#define	SM_SAVE_SRC	%g4
#define	SM_SAVE_DST	%g5
#define	SM_SAVE_COUNT	%o5
#define	ERRNO		%l5


#define	REAL_LOFAULT	%l4
/*
 * Generic copyio fault handler.  This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
 * properly, the value of the 'real' lofault handler should be in
 * REAL_LOFAULT.  This allows us to share common code for all the flavors
 * of the copy operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 */
	ENTRY(copyio_fault)
	membar	#Sync
	mov	%g1, ERRNO			! save errno in ERRNO
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr			! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs			! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs			! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ENTRY(copyio_fault)
	membar	#Sync
	mov	%g1, ERRNO			! save errno in ERRNO
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	mov	SAVE_SRC, %i0
	mov	SAVE_DST, %i1
	jmp	REAL_LOFAULT
	mov	SAVE_COUNT, %i2

	SET_SIZE(copyio_fault)


#endif

#if defined(lint)

/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(copyout)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .copyout_small		! go to small copy path
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3
	bz,pt	%ncc, .copyout_8		! check for longword alignment
	nop
	btst	1, %o3
	bz,pt	%ncc, .copyout_2		! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	nop
.copyout_2:
	btst	3, %o3
	bz,pt	%ncc, .copyout_4		! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	nop
.copyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	nop
.copyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .copyout_small		! go to small copy
	nop
	ba,pt	%ncc, .copyout_more		! otherwise go to large copy
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.copyout_small:
	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
	or	%o5, %lo(.sm_copyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
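	!
	! In the small-copy code below, loads from the kernel source use
	! ordinary instructions, while stores to the user destination use
	! the ASI_USER address space; a bad user address therefore faults
	! and is caught by the t_lofault handler installed above.
	!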
.sm_do_copyout:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .co_sm_left
	mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .co_med
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
.co_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.co_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stba	%o3, [%o1]ASI_USER	! write byte
	inc	%o1			! advance DST by 1
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 2], %o3
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 1], %o3
	stba	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
	inc	%o1			! advance DST by 1
	add	%o2, 3, %o2		! restore count
.co_sm_left:
	tst	%o2
	bz,pt	%ncc, .co_sm_exit	! check for zero length
	nop
	ldub	[%o0], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store one byte
	ldub	[%o0 + 1], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store second byte
	ldub	[%o0 + 2], %o3		! load third byte
	inc	%o1
	stba	%o3, [%o1]ASI_USER	! store third byte
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.co_sm_words:
	lduw	[%o0], %o3		! read word
.co_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 4, %o1		! update DST
	stwa	%o3, [%o1]ASI_USER	! write word
	bgt,pt	%ncc, .co_sm_words	! loop til done
	add	%o1, 4, %o1		! update DST
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
.co_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduh	[%o0], %o3		! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	stha	%o3, [%o1]ASI_USER	! write half word
	bgt,pt	%ncc, .co_sm_half	! loop til done
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
.co_sm_byte:
	ldub	[%o0], %o3
	stba	%o3, [%o1]ASI_USER
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.co_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .co_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	stwa	%o3, [%o1]ASI_USER	! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	add	%o1, 4, %o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	inc	%o1
	stba	%o3, [%o1]ASI_USER	! store third byte
.co_sm_exit:
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align	16
.co_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .co_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .co_med_word	! word aligned
	nop
.co_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_long1	! word alignment
	nop
.co_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3, [%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_long0
	dec	%o2
.co_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .co_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stwa	%o3, [%o1]ASI_USER	! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
	sub	%o1, 8, %o1		! adjust pointer to allow store in
					! branch delay slot instead of add
.co_med_lmove:
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 16], %o3
	add	%o1, 8, %o1		! advance DST by 8
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 8], %o3
	add	%o1, 8, %o1		! advance DST by 8
	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
	stxa	%o3, [%o1]ASI_USER
	add	%o1, 8, %o1		! advance DST by 8
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
	nop
.co_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.co_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.co_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_word1	! word alignment
	nop
.co_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3, [%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.co_med_wmove:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 8], %o3
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 4], %o3
	stwa	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .co_med_wextra	! check for more words to move
	nop
.co_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.co_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align	16
	nop				! instruction alignment
	nop				! see discussion at start of file
	nop
.co_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .co_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3, [%o1]ASI_USER	! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.co_med_hmove:
	lduh	[%o0], %o3		! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	stha	%o3, [%o1]ASI_USER	! write half word
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 4], %o3
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 2], %o3
	stha	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

/*
 * We got here because of a fault during short copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyout_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0		! return error value

	SET_SIZE(copyout)
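/*
 * The fault paths above reduce to the following C sketch (illustrative
 * only, not compiled): forward to the per-thread copyops vector if one
 * is installed, otherwise return -1 as copyout(9F) requires.
 *
 *	if (curthread->t_copyops != NULL)
 *		return (curthread->t_copyops->cp_copyout(kaddr, uaddr, count));
 *	return (-1);
 */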
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTrace (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(copyout_more)
.copyout_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyout_err, REAL_LOFAULT

/*
 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyout:
	set	copyio_fault, %l7		! copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyout
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)

.do_blockcopyout:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.co_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stba	SRC, [DST]%asi
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stba	SRC, [DST + 1]%asi
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stba	SRC, [DST - 2]%asi
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .co_blkalign
	stba	SRC, [DST - 1]%asi

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stba	SRC, [DST - 1]%asi

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
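	!
	! From here on, the source is read at its rounded-down 8-byte-
	! aligned address (SRC) and shifted into place with faligndata,
	! using the GSR alignment that alignaddr derives below from the
	! original (possibly misaligned) REALSRC address.
	!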
	ldd	[SRC], %f16
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f18
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x10], %f20
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x18], %f22
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x20], %f24
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f22, %f24, %f54
	ldd	[SRC + 0x28], %f26
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f24, %f26, %f56
	ldd	[SRC + 0x30], %f28
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f26, %f28, %f58
	ldd	[SRC + 0x38], %f30
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
1:
	ldd	[SRC + 0x08], %f18
	faligndata %f28, %f30, %f60
	ldd	[SRC + 0x10], %f20
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x20], %f24
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x28], %f26
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x30], %f28
	faligndata %f22, %f24, %f54
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f30
	faligndata %f24, %f26, %f56
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	faligndata %f26, %f28, %f58
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! Take the fsrc1 path for the final block only if exactly one
	! block remains and REALSRC & 0x7 is 0 (no realignment needed);
	! otherwise finish with faligndata and a trailing byte loop.
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f28, %f30, %f60
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f18
	fsrc1	%f28, %f60
	ldd	[SRC + 0x10], %f20
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	fsrc1	%f16, %f48
	ldd	[SRC + 0x20], %f24
	fsrc1	%f18, %f50
	ldd	[SRC + 0x28], %f26
	fsrc1	%f20, %f52
	ldd	[SRC + 0x30], %f28
	fsrc1	%f22, %f54
	ldd	[SRC + 0x38], %f30
	fsrc1	%f24, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f26, %f58
	fsrc1	%f28, %f60
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, 4f
	nop

3:	tst	CNT
	bz,a	%ncc, 4f
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stba	TMP, [DST - 1]%asi
4:

.copyout_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	membar	#Sync
	andn	%l6, FPUSED_FLAG, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

/*
 * We got here because of a fault during copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	%g0, -1, %o0			! return error value


	SET_SIZE(copyout_more)

#endif	/* lint */


#ifdef	lint

/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyout)
	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .xcopyout_small	! go to small copy path
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3
	bz,pt	%ncc, .xcopyout_8	! check for longword alignment
	nop
	btst	1, %o3
	bz,pt	%ncc, .xcopyout_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_2:
	btst	3, %o3
	bz,pt	%ncc, .xcopyout_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop

.xcopyout_small:
	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
	or	%o5, %lo(.sm_xcopyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyout		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.xcopyout_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.xcopyout_err), REAL_LOFAULT
	ba,pt	%ncc, .do_copyout		! common code
	or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT

/*
 * We got here because of a fault during xcopyout.
 * Errno value is in ERRNO.
 */
.xcopyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	ERRNO, 0, %o0			! return errno value

.sm_xcopyout_err:

	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g1, 0, %o0		! return errno value

	SET_SIZE(xcopyout)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyout_little)
	sethi	%hi(.xcopyio_err), %o5
	or	%o5, %lo(.xcopyio_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
	mov	%o4, %o5
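	!
	! The loop below walks the source backward and the destination
	! forward: kaddr[count - 1] is stored first, at uaddr[0].  The
	! bytes are thus written in reversed order through the little-
	! endian user ASI, swapping the byte order of the datum for
	! do_unaligned.
	!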
	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	ldub	[%o0 + %o3], %o4

1:	stba	%o4, [%o1 + %o3]ASI_AIUSL
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	ldub	[%o0 + %o3], %o4

2:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

	SET_SIZE(xcopyout_little)

#endif	/* lint */

/*
 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
 */

#if defined(lint)

/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(copyin)
	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyin_small	! go to small copy path
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3
	bz,pt	%ncc, .copyin_8		! check for longword alignment
	nop
	btst	1, %o3
	bz,pt	%ncc, .copyin_2		! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_2:
	btst	3, %o3
	bz,pt	%ncc, .copyin_4		! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.copyin_small:
	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
	or	%o5, %lo(.sm_copyin_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
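	!
	! Small copyin is the mirror image of small copyout: loads from
	! the user source use ASI_USER, while stores to the kernel
	! destination are ordinary instructions.
	!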
.sm_do_copyin:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .ci_sm_left
	mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .ci_med
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
.ci_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.ci_sm_notalign4:
	lduba	[%o0]ASI_USER, %o3	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o1]		! write byte
	add	%o0, 1, %o0		! advance SRC by 1
	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
	add	%o0, 1, %o0		! advance SRC by 1
	stb	%o3, [%o1 + 1]
	add	%o1, 4, %o1		! advance DST by 4
	lduba	[%o0]ASI_USER, %o3
	add	%o0, 1, %o0		! advance SRC by 1
	stb	%o3, [%o1 - 2]
	lduba	[%o0]ASI_USER, %o3
	add	%o0, 1, %o0		! advance SRC by 1
	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.ci_sm_left:
	tst	%o2
	bz,pt	%ncc, .ci_sm_exit
	nop
	lduba	[%o0]ASI_USER, %o3	! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1]		! store one byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 1]		! store second byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load third byte
	stb	%o3, [%o1 + 2]		! store third byte
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_words:
	lduwa	[%o0]ASI_USER, %o3	! read word
.ci_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! update SRC
	add	%o1, 8, %o1		! update DST
	lduwa	[%o0]ASI_USER, %o3	! read word
	add	%o0, 4, %o0		! update SRC
	bgt,pt	%ncc, .ci_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
.ci_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduha	[%o0]ASI_USER, %o3	! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .ci_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
.ci_sm_byte:
	lduba	[%o0]ASI_USER, %o3
	stb	%o3, [%o1]
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .ci_sm_wordx
	lduwa	[%o0]ASI_USER, %o3	! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	add	%o0, 4, %o0
	lduba	[%o0]ASI_USER, %o3	! load one byte
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.ci_sm_exit:
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align	16
.ci_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .ci_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .ci_med_word	! word aligned
	nop
.ci_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_long1	! word alignment
	nop
.ci_med_long0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_long0
	dec	%o2
.ci_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .ci_med_long2
	nop
	lduwa	[%o0]ASI_USER, %o3	! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.ci_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.ci_med_lmove:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 + 8]
	add	%o1, 32, %o1		! advance DST by 32
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 - 16]
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
	nop
.ci_med_lword:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.ci_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_word1	! word alignment
	nop
.ci_med_word0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.ci_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.ci_med_wmove:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 + 4]
	add	%o1, 16, %o1		! advance DST by 16
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 - 8]
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
	nop
.ci_med_word2:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.ci_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .ci_med_half1	! half word alignment
	nop
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.ci_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.ci_med_hmove:
	lduha	[%o0]ASI_USER, %o3	! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	sth	%o3, [%o1]		! write half word
	add	%o0, 2, %o0		! advance SRC by 2
	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 + 2]
	add	%o1, 8, %o1		! advance DST by 8
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 - 4]
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

/*
 * We got here because of a fault during short copyin.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyin_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0		! return error value (-1)

	SET_SIZE(copyin)
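/*
 * As with copyout, the copyin fault paths reduce to this C sketch
 * (illustrative only, not compiled):
 *
 *	if (curthread->t_copyops != NULL)
 *		return (curthread->t_copyops->cp_copyin(uaddr, kaddr, count));
 *	return (-1);
 */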

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTrace (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(copyin_more)
.copyin_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyin_err, REAL_LOFAULT

/*
 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyin:
	set	copyio_fault, %l7		! copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyin
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)

.do_blockcopyin:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.ci_blkalign:
	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	lduba	[REALSRC + 1]%asi, SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	lduba	[REALSRC - 2]%asi, SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	lduba	[REALSRC - 1]%asi, SRC
	bgu,pt	%ncc, .ci_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	lduba	[REALSRC]%asi, SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	ldda	[SRC]%asi, %f16
	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
	alignaddr REALSRC, %g0, %g0
	ldda	[SRC + 0x08]%asi, %f18
	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
	faligndata %f16, %f18, %f48
	ldda	[SRC + 0x10]%asi, %f20
	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
	faligndata %f18, %f20, %f50
	ldda	[SRC + 0x18]%asi, %f22
	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f20, %f22, %f52
	ldda	[SRC + 0x20]%asi, %f24
	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f22, %f24, %f54
	ldda	[SRC + 0x28]%asi, %f26
	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f24, %f26, %f56
	ldda	[SRC + 0x30]%asi, %f28
	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f26, %f28, %f58
	ldda	[SRC + 0x38]%asi, %f30
	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
	.align	32
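	!
	! Note the ASI usage in the loop below is the mirror image of
	! copyout: the source is read from user space through %asi
	! (ASI_USER) with ldda/prefetcha, while the destination blocks
	! are stored to kernel memory with ASI_BLK_P.
	!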
1:
	ldda	[SRC + 0x08]%asi, %f18
	faligndata %f28, %f30, %f60
	ldda	[SRC + 0x10]%asi, %f20
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_P
	ldda	[SRC + 0x18]%asi, %f22
	faligndata %f16, %f18, %f48
	ldda	[SRC + 0x20]%asi, %f24
	faligndata %f18, %f20, %f50
	ldda	[SRC + 0x28]%asi, %f26
	faligndata %f20, %f22, %f52
	ldda	[SRC + 0x30]%asi, %f28
	faligndata %f22, %f24, %f54
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldda	[SRC + 0x38]%asi, %f30
	faligndata %f24, %f26, %f56
	add	DST, VIS_BLOCKSIZE, DST
	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
	faligndata %f26, %f28, %f58
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read

	! Take the fsrc1 path for the final block only if exactly one
	! block remains and REALSRC & 0x7 is 0 (no realignment needed);
	! otherwise finish with faligndata and a trailing byte loop.
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f28, %f30, %f60
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldda	[SRC + 0x08]%asi, %f18
	fsrc1	%f28, %f60
	ldda	[SRC + 0x10]%asi, %f20
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_P
	ldda	[SRC + 0x18]%asi, %f22
	fsrc1	%f16, %f48
	ldda	[SRC + 0x20]%asi, %f24
	fsrc1	%f18, %f50
	ldda	[SRC + 0x28]%asi, %f26
	fsrc1	%f20, %f52
	ldda	[SRC + 0x30]%asi, %f28
	fsrc1	%f22, %f54
	ldda	[SRC + 0x38]%asi, %f30
	fsrc1	%f24, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f26, %f58
	fsrc1	%f28, %f60
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, 4f
	nop

3:	tst	CNT
	bz,a	%ncc, 4f
	nop

5:	lduba	[REALSRC]ASI_USER, TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
4:

.copyin_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	membar	#Sync				! sync error barrier
	andn	%l6, FPUSED_FLAG, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0
/*
 * We got here because of a fault during copyin.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	%g0, -1, %o0			! return error value


	SET_SIZE(copyin_more)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyin)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .xcopyin_small	! go to small copy path
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3
	bz,pt	%ncc, .xcopyin_8	! check for longword alignment
	nop
	btst	1, %o3
	bz,pt	%ncc, .xcopyin_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop
.xcopyin_2:
	btst	3, %o3
	bz,pt	%ncc, .xcopyin_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop
.xcopyin_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop
.xcopyin_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop

.xcopyin_small:
	sethi	%hi(.sm_xcopyin_err), %o5	! .sm_xcopyin_err is lofault value
	or	%o5, %lo(.sm_xcopyin_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyin		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]

.xcopyin_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
	ba,pt	%ncc, .do_copyin
	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT

/*
 * We got here because of a fault during xcopyin.
 * Errno value is in ERRNO.
 */
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	ERRNO, 0, %o0			! return errno value

.sm_xcopyin_err:

	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g1, 0, %o0		! return errno value

	SET_SIZE(xcopyin)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyin_little)
	sethi	%hi(.xcopyio_err), %o5
	or	%o5, %lo(.xcopyio_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
	mov	%o4, %o5

	! As in xcopyout_little, the bytes are copied in reversed order,
	! here through the little-endian user ASI on the load side.
	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0 + %o3]ASI_AIUSL, %o4

1:	stb	%o4, [%o1 + %o3]
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	lduba	[%o0 + %o3]ASI_AIUSL, %o4

2:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

.xcopyio_err:
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0

	SET_SIZE(xcopyin_little)

#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
#if defined(lint)

/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}

#else	/* lint */
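/*
 * The _noerr variants do not report errors themselves; the caller is
 * expected to be running under on_fault(9F).  For small copies they
 * leave t_lofault alone when it is NULL, or swap in a trampoline
 * (.sm_copyio_noerr below) that restores the saved handler and jumps
 * to it; the FP block path does the equivalent via .copyio_noerr.
 */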
	ENTRY(copyin_noerr)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy path
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3
	bz,pt	%ncc, .copyin_ne_8	! check for longword alignment
	nop
	btst	1, %o3
	bz,pt	%ncc, .copyin_ne_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop
.copyin_ne_2:
	btst	3, %o3
	bz,pt	%ncc, .copyin_ne_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop
.copyin_ne_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop
.copyin_ne_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop

.copyin_ne_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4
	tst	%o4
	bz,pn	%ncc, .sm_do_copyin
	nop
	sethi	%hi(.sm_copyio_noerr), %o5
	or	%o5, %lo(.sm_copyio_noerr), %o5
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyin
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault

.copyin_noerr_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	ba,pt	%ncc, .do_copyin
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

.copyio_noerr:
	jmp	%l6
	restore	%g0, 0, %g0

.sm_copyio_noerr:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
	jmp	%o4
	nop

	SET_SIZE(copyin_noerr)
#endif	/* lint */

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

#if defined(lint)

/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}

#else	/* lint */
	ENTRY(copyout_noerr)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy path
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3
	bz,pt	%ncc, .copyout_ne_8	! check for longword alignment
	nop
	btst	1, %o3
	bz,pt	%ncc, .copyout_ne_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop
.copyout_ne_2:
	btst	3, %o3
	bz,pt	%ncc, .copyout_ne_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop
.copyout_ne_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop
.copyout_ne_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop

.copyout_ne_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4
	tst	%o4
	bz,pn	%ncc, .sm_do_copyout
	nop
	sethi	%hi(.sm_copyio_noerr), %o5
	or	%o5, %lo(.sm_copyio_noerr), %o5
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyout
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault

.copyout_noerr_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	ba,pt	%ncc, .do_copyout
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

	SET_SIZE(copyout_noerr)
#endif	/* lint */


/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * at least 256 bytes long, using Spitfire-style block stores.  If the
 * criteria for using this routine are not met then it calls bzero
 * and returns 1.  Otherwise 0 is returned indicating success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
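/*
 * A C sketch of the gating logic (illustrative only, not compiled):
 *
 *	if (((uintptr_t)addr & (VIS_BLOCKSIZE - 1)) != 0 ||
 *	    len < 256 || (len & (VIS_BLOCKSIZE - 1)) != 0) {
 *		bzero(addr, len);
 *		return (1);		// fell back to bzero
 *	}
 *	... zero the region with FP block stores ...
 *	return (0);
 */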
#ifdef	lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
	return (0);
}
#else	/* lint */
	! %i0 - start address
	! %i1 - length of region (multiple of 64)
	! %l0 - saved fprs
	! %l1 - pointer to saved %d0 block
	! %l2 - saved curthread->t_lwp

	ENTRY(hwblkclr)
	! get another window w/space for one aligned block of saved fpregs
	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp

	! Must be block-aligned
	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
	bnz,pn	%ncc, 1f
	nop

	! ... and must be 256 bytes or more
	cmp	%i1, 256
	blu,pn	%ncc, 1f
	nop

	! ... and length must be a multiple of VIS_BLOCKSIZE
	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
	bz,pn	%ncc, 2f
	nop

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

2:	rd	%fprs, %l0	! check for unused fp
	btst	FPRS_FEF, %l0
	bz,pt	%icc, 1f
	nop

	! save in-use fpregs on stack
	membar	#Sync
	add	%fp, STACK_BIAS - 65, %l1
	and	%l1, -VIS_BLOCKSIZE, %l1
	stda	%d0, [%l1]ASI_BLK_P

1:	membar	#StoreStore|#StoreLoad|#LoadStore
	wr	%g0, FPRS_FEF, %fprs
	wr	%g0, ASI_BLK_P, %asi

	! Clear block
	fzero	%d0
	fzero	%d2
	fzero	%d4
	fzero	%d6
	fzero	%d8
	fzero	%d10
	fzero	%d12
	fzero	%d14

	mov	256, %i3
	ba,pt	%ncc, .pz_doblock
	nop

.pz_blkstart:
      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
	stda	%d0, [%i0 + 128]%asi
	stda	%d0, [%i0 + 64]%asi
	stda	%d0, [%i0]%asi
.pz_zinst:
	add	%i0, %i3, %i0
	sub	%i1, %i3, %i1
.pz_doblock:
	cmp	%i1, 256
	bgeu,a	%ncc, .pz_blkstart
	stda	%d0, [%i0 + 192]%asi

	cmp	%i1, 64
	blu	%ncc, .pz_finish

	! Finish the residual 64-byte blocks by jumping backward into the
	! unrolled store sequence above: one stda covers 64 bytes (16
	! words), so the entry point is (remaining blocks) * 4
	! instruction bytes before .pz_zinst.
	andn	%i1, (64-1), %i3
	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
	set	.pz_zinst, %i4
	sub	%i4, %i2, %i4
	jmp	%i4
	nop

.pz_finish:
	membar	#Sync
	btst	FPRS_FEF, %l0
	bz,a	.pz_finished
	wr	%l0, 0, %fprs		! restore fprs

	! restore fpregs from stack
	ldda	[%l1]ASI_BLK_P, %d0
	membar	#Sync
	wr	%l0, 0, %fprs		! restore fprs

.pz_finished:
	ret
	restore	%g0, 0, %o0		! return (bzero or not)

	SET_SIZE(hwblkclr)
#endif	/* lint */

#ifdef	lint
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else	/* !lint */
	/*
	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
	 * using physical addresses.
	 */
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate	! disable interrupts

	rdpr	%pstate, %g0		! presumably lets the wrpr settle
	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	membar	#Sync

	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	retl
	wrpr	%g0, %g1, %pstate	! restore interrupt state

	SET_SIZE(hw_pa_bcopy32)

#endif	/* lint */

#if defined(lint)

int use_hw_bcopy = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0;
uint_t hw_copy_limit_2 = 0;
uint_t hw_copy_limit_4 = 0;
uint_t hw_copy_limit_8 = 0;

#else	/* !lint */

	! Tunables.  A hw_copy_limit of zero keeps the corresponding FP
	! block path disabled (see the theory-of-operation comment
	! above), so these defaults leave it off until tuned elsewhere.
	DGDEF(use_hw_bcopy)
	.word	1
	DGDEF(use_hw_bzero)
	.word	1
	DGDEF(hw_copy_limit_1)
	.word	0
	DGDEF(hw_copy_limit_2)
	.word	0
	DGDEF(hw_copy_limit_4)
	.word	0
	DGDEF(hw_copy_limit_8)
	.word	0

	.align	64
	.section ".text"
#endif	/* !lint */