/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>

#if !defined(lint)
#include "assym.h"
#endif	/* lint */

/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/copyin/copyout routines.
 *
 * On entry:
 *
 *	! Determine whether to use the FP register version
 *	! or the leaf routine version depending on size
 *	! of copy and flags.  Set up error handling accordingly.
 *	! The transition point depends on whether the src and
 *	! dst addresses can be aligned to long word, word,
 *	! half word, or byte boundaries.
 *	!
 *	! WARNING: <Register usage convention>
 *	! For FP version, %l6 holds previous error handling and
 *	! a flag: TRAMP_FLAG (low bits)
 *	! for leaf routine version, %o4 holds those values.
 *	! So either %l6 or %o4 is reserved and not available for
 *	! any other use.
 *
 * if (length <= VIS_COPY_THRESHOLD)	! start with a quick test
 *	go to small_copy;		! to speed short copies
 *
 * if (src,dst long word alignable) {
 *	if (hw_copy_limit_8 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_8)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 * if (src,dst not alignable) {
 *	if (hw_copy_limit_1 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_1)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 * if (src,dst halfword alignable) {
 *	if (hw_copy_limit_2 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_2)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 * if (src,dst word alignable) {
 *	if (hw_copy_limit_4 == 0)	! hw_copy disabled
 *		go to small_copy;
 *	if (length <= hw_copy_limit_4)
 *		go to small_copy;
 *	go to FPBLK_copy;
 * }
 *
 * small_copy:
 *	Setup_leaf_rtn_error_handler;	! diffs for each entry point
 *
 *	if (count <= 3)			! fast path for tiny copies
 *		go to sm_left;		! special finish up code
 *	else
 *		if (count > CHKSIZE)	! medium sized copies
 *			go to sm_med	! tuned by alignment
 *		if (src&dst not both word aligned) {
 *	sm_movebytes:
 *			move byte by byte in 4-way unrolled loop
 *			fall into sm_left;
 *	sm_left:
 *			move 0-3 bytes byte at a time as needed.
 *			restore error handler and exit.
 *
 *		} else {	! src&dst are word aligned
 *			check for at least 8 bytes left,
 *			move word at a time, unrolled by 2
 *			when fewer than 8 bytes left,
 *	sm_half:	move half word at a time while 2 or more bytes left
 *	sm_byte:	move final byte if necessary
 *	sm_exit:
 *			restore error handler and exit.
 *		}
 *
 * ! Medium length cases with at least CHKSIZE bytes available
 * ! method: line up src and dst as best possible, then
 * ! move data in 4-way unrolled loops.
 *
 * sm_med:
 *	if (src&dst unalignable)
 *		go to sm_movebytes
 *	if (src&dst halfword alignable)
 *		go to sm_movehalf
 *	if (src&dst word alignable)
 *		go to sm_moveword
 * ! fall into long word movement
 *	move bytes until src is word aligned
 *	if not long word aligned, move a word
 *	move long words in 4-way unrolled loop until < 32 bytes left
 *	move long words in 1-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_moveword:
 *	move bytes until src is word aligned
 *	move words in 4-way unrolled loop until < 16 bytes left
 *	move words in 1-way unrolled loop until < 4 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 * sm_movehalf:
 *	move a byte if needed to align src on halfword
 *	move halfwords in 4-way unrolled loop until < 8 bytes left
 *	if zero bytes left, goto sm_exit
 *	if one byte left, go to sm_byte
 *	else go to sm_half
 *
 *
 * FPBLK_copy:
 *	%l6 = curthread->t_lofault;
 *	if (%l6 != NULL) {
 *		membar #Sync
 *		curthread->t_lofault = .copyerr;
 *		caller_error_handler = TRUE	! %l6 |= 2
 *	}
 *
 *	! for FPU testing we must not migrate cpus
 *	if (curthread->t_lwp == NULL) {
 *		! Kernel threads do not have pcb's in which to store
 *		! the floating point state, so disallow preemption during
 *		! the copy.  This also prevents cpu migration.
 *		kpreempt_disable(curthread);
 *	} else {
 *		thread_nomigrate();
 *	}
 *
 *	old_fprs = %fprs;
 *	old_gsr = %gsr;
 *	if (%fprs.fef) {
 *		%fprs.fef = 1;
 *		save current fpregs on stack using blockstore
 *	} else {
 *		%fprs.fef = 1;
 *	}
 *
 *
 *	do_blockcopy_here;
 *
 * In lofault handler:
 *	curthread->t_lofault = .copyerr2;
 *	Continue on with the normal exit handler
 *
 * On normal exit:
 *	%gsr = old_gsr;
 *	if (old_fprs & FPRS_FEF)
 *		restore fpregs from stack using blockload
 *	else
 *		zero fpregs
 *	%fprs = old_fprs;
 *	membar #Sync
 *	curthread->t_lofault = (%l6 & ~3);
 *	! following test omitted from copyin/copyout as they
 *	! will always have a current thread
 *	if (curthread->t_lwp == NULL)
 *		kpreempt_enable(curthread);
 *	else
 *		thread_allowmigrate();
 *	return (0)
 *
 * In second lofault handler (.copyerr2):
 *	We've tried to restore fp state from the stack and failed.  To
 *	prevent us from returning with a corrupted fp state, we will panic.
 */

/*
 * Comments about optimization choices
 *
 * The initial optimization decision in this code is to determine
 * whether to use the FP registers for a copy or not.  If we don't
 * use the FP registers, we can execute the copy as a leaf routine,
 * saving a register save and restore.  Also, less elaborate setup
 * is required, allowing short copies to be completed more quickly.
 * For longer copies, especially unaligned ones (where the src and
 * dst do not align to allow simple ldx,stx operation), the FP
 * registers allow much faster copy operations.
 *
 * The estimated extra cost of the FP path will vary depending on
 * src/dst alignment, dst offset from the next 64 byte FPblock store
 * boundary, remaining src data after the last full dst cache line is
 * moved, whether the FP registers need to be saved, and some other
 * minor issues.  The average additional overhead is estimated to be
 * 400 clocks.  Since each non-repeated/predicted tst and branch costs
 * around 10 clocks, elaborate calculation would slow down all
 * longer copies and only benefit a small portion of medium sized
 * copies.  Rather than incur such cost, we chose fixed transition
 * points for each of the alignment choices.
 *
 * For the inner loop, here is a comparison of the per cache line
 * costs for each alignment when src&dst are in cache:
 *
 * byte aligned:  108 clocks slower for non-FPBLK
 * half aligned:   44 clocks slower for non-FPBLK
 * word aligned:   12 clocks slower for non-FPBLK
 * long aligned:    4 clocks >>faster<< for non-FPBLK
 *
 * The long aligned loop runs faster because it does no prefetching.
 * That wins if the data is not in cache or there is too little
 * data to gain much benefit from prefetching.  But when there
 * is more data and that data is not in cache, failing to prefetch
 * can run much slower.  In addition, there is a 2 Kbyte store queue
 * which will cause the non-FPBLK inner loop to slow for larger copies.
 * The exact tradeoff is strongly load and application dependent, with
 * increasing risk of a customer visible performance regression if the
 * non-FPBLK code is used for larger copies.  Studies of synthetic in-cache
 * vs out-of-cache copy tests in user space suggest 1024 bytes as a safe
 * upper limit for the non-FPBLK code.  To minimize performance regression
 * risk while still gaining the primary benefits of the improvements to
 * the non-FPBLK code, we set an upper bound of 1024 bytes for the various
 * hw_copy_limit_*.  Later experimental studies using different values
 * of hw_copy_limit_* can be used to make further adjustments if
 * appropriate.
 *
 * hw_copy_limit_1 = src and dst are byte aligned but not halfword aligned
 * hw_copy_limit_2 = src and dst are halfword aligned but not word aligned
 * hw_copy_limit_4 = src and dst are word aligned but not longword aligned
 * hw_copy_limit_8 = src and dst are longword aligned
 *
 * To say that src and dst are word aligned means that after
 * some initial alignment activity of moving 0 to 3 bytes,
 * both the src and dst will be on word boundaries so that
 * word loads and stores may be used.
 *
 * Default values as of May, 2005 are:
 * hw_copy_limit_1 =  256
 * hw_copy_limit_2 =  512
 * hw_copy_limit_4 = 1024
 * hw_copy_limit_8 = 1024 (or 1536 on some systems)
 *
 *
 * If hw_copy_limit_? is set to zero, then use of FPBLK copy is
 * disabled for that alignment choice.
 * If hw_copy_limit_? is set to a value between 1 and VIS_COPY_THRESHOLD (256)
 * the value of VIS_COPY_THRESHOLD is used.
 * It is not envisioned that hw_copy_limit_? will be changed in the field.
 * It is provided to allow for disabling FPBLK copies and to allow
 * easy testing of alternate values on future HW implementations
 * that might have different cache sizes, clock rates or instruction
 * timing rules.
 *
 * Our first test for FPBLK copies vs non-FPBLK copies checks a minimum
 * threshold to speed up all shorter copies (less than 256).  That
 * saves an alignment test, memory reference, and enabling test
 * for all short copies, or an estimated 24 clocks.
 *
 * The order in which these limits are checked does matter since each
 * non-predicted tst and branch costs around 10 clocks.
 * If src and dst are randomly selected addresses,
 * 4 of 8 will not be alignable.
 * 2 of 8 will be half word alignable.
 * 1 of 8 will be word alignable.
 * 1 of 8 will be long word alignable.
 * But, tests on running kernels show that src and dst to copy code
 * are typically not on random alignments.  Structure copies and
 * copies of larger data sizes are often on long word boundaries.
 * So we test the long word alignment case first, then
 * the byte alignment, then halfword, then word alignment.
 *
 * Several times, tests for length are made to split the code
 * into subcases.  These tests often allow later tests to be
 * avoided.  For example, within the non-FPBLK copy, we first
 * check for tiny copies of 3 bytes or less.  That allows us
 * to use a 4-way unrolled loop for the general byte copy case
 * without a test on loop entry.
 * We subdivide the non-FPBLK case further into CHKSIZE bytes and less
 * vs longer cases.  For the really short case, we don't attempt
 * to align src and dst.  We try to minimize special case tests in
 * the shortest loops as each test adds a significant percentage
 * to the total time.
 *
 * For the medium sized cases, we allow ourselves to adjust the
 * src and dst alignment and provide special cases for each of
 * the four adjusted alignment cases.  The CHKSIZE that was used
 * to decide between short and medium size was chosen to be 39
 * as that allows for the worst case of 7 bytes of alignment
 * shift and 4 times 8 bytes for the first long word unrolling.
 * That knowledge saves an initial test for length on entry into
 * the medium cases.  If the general loop unrolling factor were
 * to be increased, this number would also need to be adjusted.
 *
 * For all cases in the non-FPBLK code where it is known that at
 * least 4 chunks of data are available for movement, the
 * loop is unrolled by four.  This 4-way loop runs in 8 clocks
 * or 2 clocks per data element.
 *
 * Instruction alignment is forced by use of .align 16 directives
 * and nops which are not executed in the code.  This
 * combination of operations shifts the alignment of following
 * loops to ensure that loops are aligned so that their instructions
 * fall within the minimum number of 4 instruction fetch groups.
 * If instructions are inserted or removed between the .align
 * instruction and the unrolled loops, then the alignment needs
 * to be readjusted.  Misaligned loops can add a clock per loop
 * iteration to the loop timing.
 *
 * In a few cases, code is duplicated to avoid a branch.  Since
 * a non-predicted tst and branch takes 10 clocks, this savings
 * is judged an appropriate time-space tradeoff.
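 *
 * As a reader's aid, the transition-point dispatch described above can
 * be sketched in C.  This is an illustrative sketch only (the names
 * mirror the hw_copy_limit_* variables and the labels used in this
 * file); the authoritative version is the hand-scheduled assembly at
 * each entry point below:
 *
 *	x = src ^ dst;			! differing low bits limit alignment
 *	if (len <= VIS_COPY_THRESHOLD)
 *		limit = 0;		! always use the leaf routine
 *	else if ((x & 7) == 0)
 *		limit = hw_copy_limit_8;
 *	else if ((x & 1) != 0)
 *		limit = hw_copy_limit_1;
 *	else if ((x & 3) != 0)
 *		limit = hw_copy_limit_2;
 *	else
 *		limit = hw_copy_limit_4;
 *	if (limit == 0 || len <= limit)
 *		small_copy(src, dst, len);	! non-FPBLK leaf path
 *	else
 *		FPBLK_copy(src, dst, len);	! FP/VIS block path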
 *
 * Within the FPBLK-code, the prefetch method in the inner
 * loop needs to be explained as it is not standard.  Two
 * prefetches are issued for each cache line instead of one.
 * The primary one is at the maximum reach of 8 cache lines.
 * Most of the time, that maximum prefetch reach gives the
 * cache line more time to reach the processor for systems with
 * higher processor clocks.  But, sometimes memory interference
 * can cause that prefetch to be dropped.  Putting a second
 * prefetch at a reach of 5 cache lines catches the drops
 * three iterations later and shows a measured improvement
 * in performance over any similar loop with a single prefetch.
 * The prefetches are placed in the loop so they overlap with
 * non-memory instructions, so that there is no extra cost
 * when the data is already in-cache.
 *
 */

/*
 * Notes on preserving existing fp state and on membars.
 *
 * When a copyOP decides to use fp we may have to preserve existing
 * floating point state.  It is not the caller's state that we need to
 * preserve - the rest of the kernel does not use fp and, anyway, fp
 * registers are volatile across a call.  Some examples:
 *
 *	- userland has fp state and is interrupted (device interrupt
 *	  or trap) and within the interrupt/trap handling we use
 *	  bcopy()
 *	- another (higher level) interrupt or trap handler uses bcopy
 *	  while a bcopy from an earlier interrupt is still active
 *	- an asynchronous error trap occurs while fp state exists (in
 *	  userland or in kernel copy) and the tl0 component of the
 *	  handling uses bcopy
 *	- a user process with fp state incurs a copy-on-write fault and
 *	  hwblkpagecopy always uses fp
 *
 * We therefore need a per-call place in which to preserve fp state -
 * using our stack is ideal (and since fp copy cannot be leaf optimized
 * because of calls it makes, this is no hardship).
 *
 * When we have finished fp copy (with its repeated block stores)
 * we must membar #Sync so that our block stores may complete before
 * we either restore the original fp state into the fp registers or
 * return to a caller which may initiate other fp operations that could
 * modify the fp regs we used before the block stores complete.
 *
 * Synchronous faults (eg, unresolvable DMMU miss) that occur while
 * t_lofault is not NULL will not panic but will instead trampoline
 * to the registered lofault handler.  There is no need for any
 * membars for these - eg, our store to t_lofault will always be visible to
 * ourselves and it is our cpu which will take any trap.
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) that occur
 * while t_lofault is not NULL will also not panic.  Since we're copying
 * to or from userland the extent of the damage is known - the destination
 * buffer is incomplete.  So trap handlers will trampoline to the lofault
 * handler in this case, which should take some form of error action to
 * avoid using the incomplete buffer.  The trap handler also flags the
 * fault so that later return-from-trap handling (for the trap that brought
 * this thread into the kernel in the first place) can notify the process
 * and reboot the system (or restart the service with Greenline/Contracts).
 *
 * Asynchronous faults (eg, uncorrectable ECC error from memory) can
 * result in deferred error traps - the trap is taken sometime after
 * the event and the trap PC may not be the PC of the faulting access.
 * Delivery of such pending traps can be forced by a membar #Sync, acting
 * as an "error barrier" in this role.  To accurately apply the user/kernel
 * separation described in the preceding paragraph we must force delivery
 * of deferred traps affecting kernel state before we install a lofault
 * handler (if we interpose a new lofault handler on an existing one there
 * is no need to repeat this), and we must force delivery of deferred
 * errors affecting the lofault-protected region before we clear t_lofault.
 * Failure to do so results in lost kernel state being interpreted as
 * affecting a copyin/copyout only, or in an error that really only
 * affects copy data being interpreted as losing kernel state.
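 *
 * A minimal sketch of the resulting error-barrier discipline, in the
 * style of the bcopy pseudo-code at the top of this file (illustrative
 * only):
 *
 *	membar #Sync			! force delivery of deferred traps
 *					! affecting kernel state, unless we
 *					! interpose on an existing handler
 *	curthread->t_lofault = .copyerr;
 *	... copy loop, including FP block stores ...
 *	membar #Sync			! drain block stores and force any
 *					! deferred error in the protected
 *					! region to deliver now
 *	curthread->t_lofault = saved_handler;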
 *
 * Since the copy operations may preserve and later restore floating
 * point state that does not belong to the caller (see examples above),
 * we must be careful in how we do this in order to prevent corruption
 * of another program.
 *
 * To make sure that floating point state is always saved and restored
 * correctly, the following "big rules" must be followed when the floating
 * point registers will be used:
 *
 * 1. %l6 always holds the caller's lofault handler.  Also in this register,
 *    Bit 1 (FPUSED_FLAG) indicates that the floating point registers are in
 *    use.  Bit 2 (TRAMP_FLAG) indicates that the call was to bcopy, and a
 *    lofault handler was set coming in.
 *
 * 2. The FPUSED flag indicates that all FP state has been successfully stored
 *    on the stack.  It should not be set until this save has been completed.
 *
 * 3. The FPUSED flag should not be cleared on exit until all FP state has
 *    been restored from the stack.  If an error occurs while restoring
 *    data from the stack, the error handler can check this flag to see if
 *    a restore is necessary.
 *
 * 4. Code run under the new lofault handler must be kept to a minimum.  In
 *    particular, any calls to FP_ALLOWMIGRATE, which could result in a call
 *    to kpreempt(), should not be made until after the lofault handler has
 *    been restored.
 */

/*
 * VIS_COPY_THRESHOLD indicates the minimum number of bytes needed
 * to "break even" using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry.  Check that code carefully before
 * reducing VIS_COPY_THRESHOLD below 256.
 */
/*
 * This shadows sys/machsystm.h which can't be included due to the lack of
 * _ASM guards in include files it references.  Change it here, change it
 * there.
 */
#define	VIS_COPY_THRESHOLD	256

/*
 * TEST for very short copies
 * Be aware that the maximum unroll for the short unaligned case
 * is SHORTCOPY+1
 */
#define	SHORTCOPY	3
#define	CHKSIZE		39

/*
 * Indicates that we're to trampoline to the error handler.
 * Entry points bcopy, copyin_noerr, and copyout_noerr use this flag.
 * kcopy, copyout, xcopyout, copyin, and xcopyin do not set this flag.
 */
#define	FPUSED_FLAG	1
#define	TRAMP_FLAG	2
#define	MASK_FLAGS	3
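
/*
 * A minimal sketch (C-like, illustrative only) of how the saved lofault
 * handler and these flags share one register, per "big rule" 1 above.
 * The low bits are available because handler addresses are word aligned:
 *
 *	l6 = curthread->t_lofault;	! caller's handler
 *	l6 |= TRAMP_FLAG;		! bcopy-style entry, handler was set
 *	l6 |= FPUSED_FLAG;		! set only after the FP save completes
 *	...
 *	handler = l6 & ~MASK_FLAGS;	! strip flags before restoring
 */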

/*
 * Number of outstanding prefetches.
 * first prefetch moves data from L2 to L1 (n_reads)
 * second prefetch moves data from memory to L2 (one_read)
 */
#define	OLYMPUS_C_PREFETCH	24
#define	OLYMPUS_C_2ND_PREFETCH	12

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use two quadrants of fp registers; to assure a
 * block-aligned two block buffer in which to save we must reserve
 * three blocks on stack.  Not all functions preserve %fprs on stack
 * or need to preserve %gsr but we use HWCOPYFRAMESIZE for all.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 2 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 3 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (2 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 3)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 2) - 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
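
/*
 * For reference, with VIS_BLOCKSIZE = 64 these work out to:
 * HWCOPYFRAMESIZE = (64 * 3) + 16 = 208 bytes, SAVED_FPREGS_OFFSET = 192,
 * SAVED_FPRS_OFFSET = 200 and SAVED_GSR_OFFSET = 208, each used as a
 * negative offset from %fp + STACK_BIAS.
 */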

/*
 * Common macros used by the various versions of the block copy
 * routines in this file.
 */

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 *
 * Copy functions use either quadrants 1 and 3 or 2 and 4.
 *
 * FZEROQ1Q3: Zero quadrants 1 and 3, ie %f0 - %f15 and %f32 - %f47
 * FZEROQ2Q4: Zero quadrants 2 and 4, ie %f16 - %f31 and %f48 - %f63
 *
 * The instructions below are quicker than repeated fzero instructions
 * since they can dispatch down two fp pipelines.
 */
#define	FZEROQ1Q3			\
	fzero	%f0			;\
	fmovd	%f0, %f2		;\
	fmovd	%f0, %f4		;\
	fmovd	%f0, %f6		;\
	fmovd	%f0, %f8		;\
	fmovd	%f0, %f10		;\
	fmovd	%f0, %f12		;\
	fmovd	%f0, %f14		;\
	fmovd	%f0, %f32		;\
	fmovd	%f0, %f34		;\
	fmovd	%f0, %f36		;\
	fmovd	%f0, %f38		;\
	fmovd	%f0, %f40		;\
	fmovd	%f0, %f42		;\
	fmovd	%f0, %f44		;\
	fmovd	%f0, %f46

#define	FZEROQ2Q4			\
	fzero	%f16			;\
	fmovd	%f16, %f18		;\
	fmovd	%f16, %f20		;\
	fmovd	%f16, %f22		;\
	fmovd	%f16, %f24		;\
	fmovd	%f16, %f26		;\
	fmovd	%f16, %f28		;\
	fmovd	%f16, %f30		;\
	fmovd	%f16, %f48		;\
	fmovd	%f16, %f50		;\
	fmovd	%f16, %f52		;\
	fmovd	%f16, %f54		;\
	fmovd	%f16, %f56		;\
	fmovd	%f16, %f58		;\
	fmovd	%f16, %f60		;\
	fmovd	%f16, %f62

/*
 * Macros to save and restore quadrants 1 and 3 or 2 and 4 to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP
 * and find fp already in use and copy size still large enough to justify
 * the additional overhead of this save and restore.
 *
 * A membar #Sync is needed before save to sync fp ops initiated before
 * the call to the copy function (by whoever has fp in use); for example
 * an earlier block load to the quadrant we are about to save may still be
 * "in flight".  A membar #Sync is required at the end of the save to
 * sync our block store (the copy code is about to begin ldd's to the
 * first quadrant).
 *
 * Similarly: a membar #Sync before restore allows the block stores of
 * the copy operation to complete before we fill the quadrants with their
 * original data, and a membar #Sync after restore lets the block loads
 * of the restore complete before we return to whoever has the fp regs
 * in use.  To avoid repeated membar #Sync we make it the responsibility
 * of the copy code to membar #Sync immediately after copy is complete
 * and before using the BLD_*_FROMSTACK macro.
 */
#if !defined(lint)
#define	BST_FPQ1Q3_TOSTACK(tmp1)				\
	/* membar #Sync	*/					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f0, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f32, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ1Q3_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f0				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f32				;\
	membar	#Sync

#define	BST_FPQ2Q4_TOSTACK(tmp1)				\
	/* membar #Sync */					;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	stda	%f16, [tmp1]ASI_BLK_P				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	stda	%f48, [tmp1]ASI_BLK_P				;\
	membar	#Sync

#define	BLD_FPQ2Q4_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */	;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1	;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */	;\
	ldda	[tmp1]ASI_BLK_P, %f16				;\
	add	tmp1, VIS_BLOCKSIZE, tmp1			;\
	ldda	[tmp1]ASI_BLK_P, %f48				;\
	membar	#Sync
#endif

/*
 * FP_NOMIGRATE and FP_ALLOWMIGRATE.  Prevent migration (or, stronger,
 * prevent preemption if there is no t_lwp to save FP state to on context
 * switch) before commencing a FP copy, and reallow it on completion or
 * in error trampoline paths when we were using FP copy.
 *
 * Both macros may call other functions, so be aware that all outputs are
 * forfeit after using these macros.  For this reason we do not pass
 * registers to use - we just use any outputs we want.
 *
 * Pseudo code:
 *
 * FP_NOMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_nomigrate();
 *	} else {
 *		kpreempt_disable();
 *	}
 *
 * FP_ALLOWMIGRATE:
 *
 *	if (curthread->t_lwp) {
 *		thread_allowmigrate();
 *	} else {
 *		kpreempt_enable();
 *	}
 */

#define	FP_NOMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_nomigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	inc	%o1						;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
label2:

#define	FP_ALLOWMIGRATE(label1, label2)				\
	ldn	[THREAD_REG + T_LWP], %o0			;\
	brz,a,pn %o0, label1/**/f				;\
	ldsb	[THREAD_REG + T_PREEMPT], %o1			;\
	call	thread_allowmigrate				;\
	nop							;\
	ba	label2/**/f					;\
	nop							;\
label1:								;\
	dec	%o1						;\
	brnz,pn	%o1, label2/**/f				;\
	stb	%o1, [THREAD_REG + T_PREEMPT]			;\
	ldn	[THREAD_REG + T_CPU], %o0			;\
	ldub	[%o0 + CPU_KPRUNRUN], %o0			;\
	brz,pt	%o0, label2/**/f				;\
	nop							;\
	call	kpreempt					;\
	rdpr	%pil, %o0					;\
label2:

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return (0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .kcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .kcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .kcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop
.kcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .kcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .kcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .kcopy_more		! otherwise go to large copy
	nop

.kcopy_small:
	sethi	%hi(.sm_copyerr), %o5		! sm_copyerr is lofault value
	or	%o5, %lo(.sm_copyerr), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copy		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault


/*
 * We got here because of a fault during bcopy_more, called from kcopy
 * or bcopy.  Errno value is in %g1.  bcopy_more uses fp quadrants 1 and 3.
 */
.copyerr:
	set	.copyerr2, %l0
	membar	#Sync				! sync error barrier
	stn	%l0, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	and	%l6, TRAMP_FLAG, %l0		! copy trampoline flag to %l0

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs

	!
	! Need to cater for the different expectations of kcopy
	! and bcopy.  kcopy will *always* set a t_lofault handler.
	! If it fires, we're expected to just return the error code
	! and *not* to invoke any existing error handler.  As far as
	! bcopy is concerned, we only set t_lofault if there was an
	! existing lofault handler.  In that case we're expected to
	! invoke the previously existing handler after resetting the
	! t_lofault value.
	!
1:
	andn	%l6, MASK_FLAGS, %l6		! turn trampoline flag off
	membar	#Sync				! sync error barrier
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	btst	TRAMP_FLAG, %l0
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0

3:
	!
	! We're here via bcopy.  There *must* have been an error handler
	! in place otherwise we would have died a nasty death already.
	!
	jmp	%l6				! goto real handler
	restore	%g0, 0, %o0			! dispose of copy window

/*
 * We got here because of a fault in .copyerr.  We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop

/*
 * We got here because of a fault during a small kcopy or bcopy.
 * No floating point registers are used by the small copies.
 * Errno value is in %g1.
 */
.sm_copyerr:
1:
	btst	TRAMP_FLAG, %o4
	membar	#Sync
	andn	%o4, TRAMP_FLAG, %o4
	bnz,pn	%ncc, 3f
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	jmp	%o4				! goto real handler
	mov	%g0, %o0			!

	SET_SIZE(kcopy)
#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * Registers: l6 - saved t_lofault
 * (for short copies, o4 - saved t_lofault)
 *
 * Copy a page of memory.
 * Assumes double word alignment and a count >= 256.
 */
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(bcopy)

	cmp	%o2, VIS_COPY_THRESHOLD		! check for leaf rtn case
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	xor	%o0, %o1, %o3			! are src, dst alignable?
	btst	7, %o3				!
	bz,pt	%ncc, .bcopy_8			! check for longword alignment
	nop
	btst	1, %o3				!
	bz,pt	%ncc, .bcopy_2			! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_2:
	btst	3, %o3				!
	bz,pt	%ncc, .bcopy_4			! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop
.bcopy_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .bcopy_small		! if zero, disable HW copy
	cmp	%o2, %o3			! if length <= limit
	bleu,pt	%ncc, .bcopy_small		! go to small copy
	nop
	ba,pt	%ncc, .bcopy_more		! otherwise go to large copy
	nop

	.align	16
.bcopy_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save t_lofault
	tst	%o4
	bz,pt	%icc, .sm_do_copy
	nop
	sethi	%hi(.sm_copyerr), %o5
	or	%o5, %lo(.sm_copyerr), %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! install new vector
	or	%o4, TRAMP_FLAG, %o4		! error should trampoline
.sm_do_copy:
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .bc_sm_left	!
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .bc_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .bc_sm_word	! branch to word aligned case
.bc_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.bc_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	stb	%o3, [%o1]		! write byte
	subcc	%o2, 4, %o2		! reduce count by 4
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stb	%o3, [%o1 + 1]
	ldub	[%o0 - 2], %o3
	add	%o1, 4, %o1		! advance DST by 4
	stb	%o3, [%o1 - 2]
	ldub	[%o0 - 1], %o3
	bgt,pt	%ncc, .bc_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.bc_sm_left:
	tst	%o2
	bz,pt	%ncc, .bc_sm_exit	! check for zero length
	deccc	%o2			! reduce count for cc test
	ldub	[%o0], %o3		! move one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]
	ldub	[%o0 + 1], %o3		! move another byte
	deccc	%o2			! check for more
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 1]
	ldub	[%o0 + 2], %o3		! move final byte
	ba,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 2]
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.bc_sm_words:
	lduw	[%o0], %o3		! read word
.bc_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 8, %o1		! update DST
	bgt,pt	%ncc, .bc_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
.bc_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	add	%o0, 2, %o0		! advance SRC by 2
	lduh	[%o0 - 2], %o3		! read half word
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .bc_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	nop
.bc_sm_byte:
	ldub	[%o0], %o3
	ba,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1]

.bc_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .bc_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .bc_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.bc_sm_exit:
	ldn	[THREAD_REG + T_LOFAULT], %o3
	brz,pt	%o3, .bc_sm_done
	nop
	membar	#Sync			! sync error barrier
	andn	%o4, TRAMP_FLAG, %o4
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	mov	%g0, %o0		! return 0

	.align	16
.bc_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .bc_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .bc_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .bc_med_word	! word aligned
	nop
.bc_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_long1	! word alignment
	nop
.bc_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_long0
	dec	%o2
.bc_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .bc_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
!  Now long word aligned and have at least 32 bytes to move
!
.bc_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.bc_med_lmove:
	ldx	[%o0], %o3		! read long word
	stx	%o3, [%o1]		! write long word
	subcc	%o2, 32, %o2		! reduce count by 32
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stx	%o3, [%o1 + 8]
	ldx	[%o0 - 16], %o3
	add	%o1, 32, %o1		! advance DST by 32
	stx	%o3, [%o1 - 16]
	ldx	[%o0 - 8], %o3
	bgt,pt	%ncc, .bc_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .bc_med_lextra	! check for more long words to move
	nop
.bc_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .bc_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.bc_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .bc_med_word1	! word alignment
	nop
.bc_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .bc_med_word0
	dec	%o2
!
!  Now word aligned and have at least 36 bytes to move
!
.bc_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.bc_med_wmove:
	lduw	[%o0], %o3		! read word
	stw	%o3, [%o1]		! write word
	subcc	%o2, 16, %o2		! reduce count by 16
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stw	%o3, [%o1 + 4]
	lduw	[%o0 - 8], %o3
	add	%o1, 16, %o1		! advance DST by 16
	stw	%o3, [%o1 - 8]
	lduw	[%o0 - 4], %o3
	bgt,pt	%ncc, .bc_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .bc_med_wextra	! check for more words to move
	nop
.bc_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .bc_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.bc_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .bc_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	.align	16
.bc_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .bc_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	dec	%o2
!
!  Now half word aligned and have at least 38 bytes to move
!
.bc_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.bc_med_hmove:
	lduh	[%o0], %o3		! read half word
	sth	%o3, [%o1]		! write half word
	subcc	%o2, 8, %o2		! reduce count by 8
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	sth	%o3, [%o1 + 2]
	lduh	[%o0 - 4], %o3
	add	%o1, 8, %o1		! advance DST by 8
	sth	%o3, [%o1 - 4]
	lduh	[%o0 - 2], %o3
	bgt,pt	%ncc, .bc_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .bc_sm_exit
	deccc	%o2
	bz,pt	%ncc, .bc_sm_byte
	nop
	ba,pt	%ncc, .bc_sm_half
	nop

	SET_SIZE(bcopy)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(bcopy_more)
.bcopy_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %l6	! save t_lofault
	tst	%l6
	bz,pt	%ncc, .do_copy
	nop
	sethi	%hi(.copyerr), %o2
	or	%o2, %lo(.copyerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector
	!
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path.  If TRAMP_FLAG is set
	! and the saved lofault was zero, we won't reset lofault on
	! returning.
	!
	or	%l6, TRAMP_FLAG, %l6

/*
 * Copies that reach here are larger than VIS_COPY_THRESHOLD bytes.
 * Also, use of FP registers has been tested to be enabled.
 */
.do_copy:
	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%o2)

.do_blockcopy:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

#define	REALSRC	%i0
#define	DST	%i1
#define	CNT	%i2
#define	SRC	%i3
#define	TMP	%i5

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a temp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.bc_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .bc_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
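	!
	! Descriptive note (added for the reader): the loop that follows is
	! software pipelined.  Each iteration loads the next 64 bytes while
	! faligndata merges the previously loaded doublewords into
	! %f32-%f46, and stda writes the completed block.  See the prefetch
	! discussion at the top of this file for why two prefetch reaches
	! are used.
	!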
	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f6, %f8, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f8, %f10, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f10, %f12, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
1:
	ldd	[SRC + 0x08], %f2
	faligndata %f12, %f14, %f44
	ldd	[SRC + 0x10], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	faligndata %f0, %f2, %f32
	ldd	[SRC + 0x20], %f8
	faligndata %f2, %f4, %f34
	ldd	[SRC + 0x28], %f10
	faligndata %f4, %f6, %f36
	ldd	[SRC + 0x30], %f12
	faligndata %f6, %f8, %f38
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f14
	faligndata %f8, %f10, %f40
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	faligndata %f10, %f12, %f42
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, .bcb_exit
	nop

3:	tst	CNT
	bz,a,pt	%ncc, .bcb_exit
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
.bcb_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ1Q3_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs
4:
	FZEROQ1Q3
	wr	%o3, 0, %fprs		! restore fprs
2:
	membar	#Sync			! sync error barrier
	andn	%l6, MASK_FLAGS, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

	SET_SIZE(bcopy_more)

#endif	/* lint */

/*
 * Block copy with possibly overlapped operands.
 */
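/*
 * Illustrative C-level sketch of the dispatch below (for the reader
 * only): overlap is only a problem when the regions are closer together
 * than the length, and the copy direction must then be chosen so the
 * source is read before it is overwritten.
 *
 *	if (count == 0)
 *		return;
 *	if (abs(from - to) >= count)
 *		bcopy(from, to, count);		! no harmful overlap
 *	else if (from > to)
 *		copy forward, byte at a time;
 *	else
 *		copy backward, byte at a time;
 */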

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				!   no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop

	SET_SIZE(ovbcopy)

#endif	/* lint */


/*
 * hwblkpagecopy()
 *
 * Copies exactly one page.  This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.  Preventing preemption also prevents cpu migration.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else	/* lint */
	ENTRY(hwblkpagecopy)
	! get another window w/space for three aligned blocks of saved fpregs
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)
	! %l0 - saved fprs
	! %l1 - pointer to saved fpregs

	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,a,pt	%icc, 1f
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ1Q3_TOSTACK(%l1)

1:	set	PAGESIZE, CNT
	mov	REALSRC, SRC

	ldd	[SRC], %f0
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	ldd	[SRC + 0x08], %f2
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f0, %f32
	ldd	[SRC + 0x10], %f4
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	fmovd	%f2, %f34
	ldd	[SRC + 0x18], %f6
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f4, %f36
	ldd	[SRC + 0x20], %f8
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f6, %f38
	ldd	[SRC + 0x28], %f10
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f8, %f40
	ldd	[SRC + 0x30], %f12
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	fmovd	%f10, %f42
	ldd	[SRC + 0x38], %f14
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	ba,pt	%ncc, 2f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
2:
	ldd	[SRC + 0x08], %f2
	fmovd	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fmovd	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fmovd	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fmovd	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fmovd	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fmovd	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fmovd	%f8, %f40
	ldd	[SRC + VIS_BLOCKSIZE], %f0
	fmovd	%f10, %f42
	sub	CNT, VIS_BLOCKSIZE, CNT
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	DST, VIS_BLOCKSIZE, DST
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	add	SRC, VIS_BLOCKSIZE, SRC
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 2b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read

	! trailing block
	ldd	[SRC + 0x08], %f2
	fsrc1	%f12, %f44
	ldd	[SRC + 0x10], %f4
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P
	ldd	[SRC + 0x18], %f6
	fsrc1	%f0, %f32
	ldd	[SRC + 0x20], %f8
	fsrc1	%f2, %f34
	ldd	[SRC + 0x28], %f10
	fsrc1	%f4, %f36
	ldd	[SRC + 0x30], %f12
	fsrc1	%f6, %f38
	ldd	[SRC + 0x38], %f14
	fsrc1	%f8, %f40
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [DST]ASI_BLK_P

	membar	#Sync

	btst	FPRS_FEF, %l0
	bz,pt	%icc, 2f
	nop

	BLD_FPQ1Q3_FROMSTACK(%l3)
	ba	3f
	nop

2:	FZEROQ1Q3

3:	wr	%l0, 0, %fprs		! restore fprs
	ret
	restore	%g0, 0, %o0

	SET_SIZE(hwblkpagecopy)
#endif	/* lint */


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults.
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there's two extremely similar routines - xcopyin() and xcopyout()
 * which return the errno that we've faithfully computed.  This
 * allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
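 *
 * An illustrative (hypothetical) caller-side sketch of the difference,
 * not code from this file:
 *
 *	if (copyout(kaddr, uaddr, n) != 0)	! DDI/DKI: -1 on fault
 *		return (EFAULT);		! caller picks the errno
 *	error = xcopyout(kaddr, uaddr, n);	! returns the actual errno
 *	if (error != 0)
 *		return (error);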
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned.  Future enhancement to make them handle 8k pages
 * efficiently is left as an exercise...
 */

/*
 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
 *
 * General theory of operation:
 *
 * The only difference between copy{in,out} and
 * xcopy{in,out} is in the error handling routine they invoke
 * when a memory access error occurs.  xcopyOP returns the errno
 * while copyOP returns -1 (see above).  copy{in,out}_noerr set
 * a special flag (by or'ing the TRAMP_FLAG into the fault handler address)
 * if they are called with a fault handler already in place.  That flag
 * causes the default handlers to trampoline to the previous handler
 * upon an error.
 *
 * None of the copyops routines grab a window until it's decided that
 * we need to do a HW block copy operation.  This saves a window
 * spill/fill when we're called during socket ops.  The typical IO
 * path won't cause spill/fill traps.
 *
 * This code uses a set of 4 limits for the maximum size that will
 * be copied given a particular input/output address alignment.
 * If the value for a particular limit is zero, the copy will be performed
 * by the plain copy loops rather than FPBLK.
 *
 * See the description of bcopy above for more details of the
 * data copying algorithm and the default limits.
 *
 */

/*
 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
 */

#if defined(lint)


#else	/* lint */
/*
 * We save the arguments in the following registers in case of a fault:
 *	kaddr - %l1
 *	uaddr - %l2
 *	count - %l3
 */
#define	SAVE_SRC	%l1
#define	SAVE_DST	%l2
#define	SAVE_COUNT	%l3

#define	SM_SAVE_SRC		%g4
#define	SM_SAVE_DST		%g5
#define	SM_SAVE_COUNT		%o5
#define	ERRNO		%l5


#define	REAL_LOFAULT	%l4
/*
 * Generic copyio fault handler.  This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout.  In order for this to function
 * properly, the value of the 'real' lofault handler should be in
 * REAL_LOFAULT.  This allows us to share common code for all the flavors
 * of the copy operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT.  So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 */
	ENTRY(copyio_fault)
	membar	#Sync
	mov	%g1, ERRNO			! save errno in ERRNO
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr			! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs			! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs			! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ENTRY(copyio_fault)
	membar	#Sync
	mov	%g1, ERRNO			! save errno in ERRNO
	btst	FPUSED_FLAG, %l6
	bz	%ncc, 1f
	nop

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	andn	%l6, FPUSED_FLAG, %l6
	membar	#Sync
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)

	mov	SAVE_SRC, %i0
	mov	SAVE_DST, %i1
	jmp	REAL_LOFAULT
	mov	SAVE_COUNT, %i2

	SET_SIZE(copyio_fault)


#endif

#if defined(lint)

/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(copyout)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyout_small	! go to small copy
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .copyout_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .copyout_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_2:
	btst	3, %o3			!
	bz,pt	%ncc, .copyout_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop
.copyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_more	! otherwise go to large copy
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.copyout_small:
	sethi	%hi(.sm_copyout_err), %o5	! .sm_copyout_err is lofault
	or	%o5, %lo(.sm_copyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault
.sm_do_copyout:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .co_sm_left	!
	mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .co_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .co_sm_word	! branch to word aligned case
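/*
 * The unrolled byte loop below, sketched in C (illustrative only; the
 * real loop also biases the count by 3 so the condition codes from
 * subcc can terminate it, and stores with ASI_USER to reach user space):
 *
 *	while (count >= 4) {		! .co_sm_notalign4, 4-way unrolled
 *		*udst++ = *ksrc++;	! each store is an stba ...ASI_USER
 *		*udst++ = *ksrc++;
 *		*udst++ = *ksrc++;
 *		*udst++ = *ksrc++;
 *		count -= 4;
 *	}
 *	! the final 0-3 bytes are finished one at a time in .co_sm_left
 */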
.co_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.co_sm_notalign4:
	ldub	[%o0], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stba	%o3, [%o1]ASI_USER	! write byte
	inc	%o1			! advance DST by 1
	ldub	[%o0 + 1], %o3		! repeat for a total of 4 bytes
	add	%o0, 4, %o0		! advance SRC by 4
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 2], %o3
	stba	%o3, [%o1]ASI_USER
	inc	%o1			! advance DST by 1
	ldub	[%o0 - 1], %o3
	stba	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_sm_notalign4	! loop til 3 or fewer bytes remain
	inc	%o1			! advance DST by 1
	add	%o2, 3, %o2		! restore count
.co_sm_left:
	tst	%o2
	bz,pt	%ncc, .co_sm_exit	! check for zero length
	nop
	ldub	[%o0], %o3		! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store one byte
	ldub	[%o0 + 1], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store second byte
	ldub	[%o0 + 2], %o3		! load third byte
	inc	%o1
	stba	%o3, [%o1]ASI_USER	! store third byte
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.co_sm_words:
	lduw	[%o0], %o3		! read word
.co_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 8, %o0		! update SRC
	lduw	[%o0 - 4], %o3		! read word
	add	%o1, 4, %o1		! update DST
	stwa	%o3, [%o1]ASI_USER	! write word
	bgt,pt	%ncc, .co_sm_words	! loop til done
	add	%o1, 4, %o1		! update DST
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
.co_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduh	[%o0], %o3		! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	stha	%o3, [%o1]ASI_USER	! write half word
	bgt,pt	%ncc, .co_sm_half	! loop til done
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	nop
.co_sm_byte:
	ldub	[%o0], %o3
	stba	%o3, [%o1]ASI_USER
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.co_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .co_sm_wordx
	lduw	[%o0], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	stwa	%o3, [%o1]ASI_USER	! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o0 + 4], %o3		! load one byte
	add	%o1, 4, %o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store one byte
	ldub	[%o0 + 5], %o3		! load second byte
	deccc	%o2
	inc	%o1
	bz,pt	%ncc, .co_sm_exit
	stba	%o3, [%o1]ASI_USER	! store second byte
	ldub	[%o0 + 6], %o3		! load third byte
	inc	%o1
	stba	%o3, [%o1]ASI_USER	! store third byte
.co_sm_exit:
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align	16
.co_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .co_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .co_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .co_med_word	! word aligned
	nop
.co_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_long1	! word alignment
	nop
.co_med_long0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3, [%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_long0
	dec	%o2
.co_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .co_med_long2
	nop
	lduw	[%o0], %o3		! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stwa	%o3, [%o1]ASI_USER	! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
! Now long word aligned and have at least 32 bytes to move
!
.co_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
	sub	%o1, 8, %o1		! adjust pointer to allow store in
					! branch delay slot instead of add
.co_med_lmove:
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o1, 8, %o1		! advance DST by 8
	ldx	[%o0 + 8], %o3		! repeat for a total of 4 long words
	add	%o0, 32, %o0		! advance SRC by 32
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 16], %o3
	add	%o1, 8, %o1		! advance DST by 8
	stxa	%o3, [%o1]ASI_USER
	ldx	[%o0 - 8], %o3
	add	%o1, 8, %o1		! advance DST by 8
	bgt,pt	%ncc, .co_med_lmove	! loop til 31 or fewer bytes left
	stxa	%o3, [%o1]ASI_USER
	add	%o1, 8, %o1		! advance DST by 8
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .co_med_lextra	! check for more long words to move
	nop
.co_med_lword:
	ldx	[%o0], %o3		! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stxa	%o3, [%o1]ASI_USER	! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .co_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.co_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop
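
/*
 * The medium-length long word path above, sketched in C (illustrative
 * only; pointer and count bookkeeping is simplified and the 31/24/7
 * count biases that let subcc drive the branches are omitted):
 *
 *	while ((uintptr_t)src & 3)	! .co_med_long0
 *		*dst++ = *src++;	! byte moves to word alignment
 *	if ((uintptr_t)src & 7)		! .co_med_long1
 *		move one word;		! now long word aligned
 *	while (count >= 32)		! .co_med_lmove, 4-way unrolled
 *		move 4 long words;
 *	while (count >= 8)		! .co_med_lword
 *		move 1 long word;
 *	finish 0-7 bytes via .co_sm_half / .co_sm_byte;
 */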
	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.co_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .co_med_word1	! word alignment
	nop
.co_med_word0:
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3, [%o1]ASI_USER	! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .co_med_word0
	dec	%o2
!
! Now word aligned and have at least 36 bytes to move
!
.co_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.co_med_wmove:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 + 4], %o3		! repeat for a total of 4 words
	add	%o0, 16, %o0		! advance SRC by 16
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 8], %o3
	stwa	%o3, [%o1]ASI_USER
	add	%o1, 4, %o1		! advance DST by 4
	lduw	[%o0 - 4], %o3
	stwa	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_wmove	! loop til 15 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .co_med_wextra	! check for more words to move
	nop
.co_med_word2:
	lduw	[%o0], %o3		! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stwa	%o3, [%o1]ASI_USER	! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .co_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.co_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .co_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

	.align	16
	nop				! instruction alignment
	nop				! see discussion at start of file
	nop
.co_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .co_med_half1	! half word alignment
	nop
	ldub	[%o0], %o3		! load one byte
	inc	%o0
	stba	%o3, [%o1]ASI_USER	! store byte
	inc	%o1
	dec	%o2
!
! Now half word aligned and have at least 38 bytes to move
!
.co_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.co_med_hmove:
	lduh	[%o0], %o3		! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	stha	%o3, [%o1]ASI_USER	! write half word
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 + 2], %o3		! repeat for a total of 4 halfwords
	add	%o0, 8, %o0		! advance SRC by 8
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 4], %o3
	stha	%o3, [%o1]ASI_USER
	add	%o1, 2, %o1		! advance DST by 2
	lduh	[%o0 - 2], %o3
	stha	%o3, [%o1]ASI_USER
	bgt,pt	%ncc, .co_med_hmove	! loop til 7 or fewer bytes left
	add	%o1, 2, %o1		! advance DST by 2
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .co_sm_exit
	deccc	%o2
	bz,pt	%ncc, .co_sm_byte
	nop
	ba,pt	%ncc, .co_sm_half
	nop

/*
 * We got here because of a fault during short copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyout_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_COPYOUT], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0			! return error value

	SET_SIZE(copyout)

/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(copyout_more)
.copyout_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyout_err, REAL_LOFAULT

/*
 * Copy outs that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyout:
	set	copyio_fault, %l7	! copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyout
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)

.do_blockcopyout:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6
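
/*
 * Before the block loop, DST must be brought up to a VIS_BLOCKSIZE
 * (64-byte) boundary.  In C terms (an illustrative sketch only):
 *
 *	if ((pad = dst & (VIS_BLOCKSIZE - 1)) != 0) {
 *		pad = VIS_BLOCKSIZE - pad;	! bytes to next boundary,
 *		count -= pad;			! the neg/add below
 *		copy pad bytes one at a time	! four per iteration when
 *		    with stores via %asi;	! pad > 3 (.co_blkalign)
 *	}
 *
 * The source is then re-read 8-byte aligned and fed through
 * faligndata, so only DST alignment matters here.
 */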
	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.co_blkalign:
	ldub	[REALSRC], SRC		! move 4 bytes per loop iteration
	stba	SRC, [DST]%asi
	subcc	TMP, 4, TMP
	ldub	[REALSRC + 1], SRC
	add	REALSRC, 4, REALSRC
	stba	SRC, [DST + 1]%asi
	ldub	[REALSRC - 2], SRC
	add	DST, 4, DST
	stba	SRC, [DST - 2]%asi
	ldub	[REALSRC - 1], SRC
	bgu,pt	%ncc, .co_blkalign
	stba	SRC, [DST - 1]%asi

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	ldub	[REALSRC], SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stba	SRC, [DST - 1]%asi

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	ldd	[SRC], %f16
	prefetch [SRC + (1 * VIS_BLOCKSIZE)], #n_reads
	alignaddr REALSRC, %g0, %g0
	ldd	[SRC + 0x08], %f18
	prefetch [SRC + (2 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x10], %f20
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x18], %f22
	prefetch [SRC + (4 * VIS_BLOCKSIZE)], #one_read
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x20], %f24
	prefetch [SRC + (8 * VIS_BLOCKSIZE)], #one_read
	faligndata %f22, %f24, %f54
	ldd	[SRC + 0x28], %f26
	prefetch [SRC + (12 * VIS_BLOCKSIZE)], #one_read
	faligndata %f24, %f26, %f56
	ldd	[SRC + 0x30], %f28
	prefetch [SRC + (16 * VIS_BLOCKSIZE)], #one_read
	faligndata %f26, %f28, %f58
	ldd	[SRC + 0x38], %f30
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + (19 * VIS_BLOCKSIZE)], #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetch [SRC + (23 * VIS_BLOCKSIZE)], #one_read
	.align	32
1:
	ldd	[SRC + 0x08], %f18
	faligndata %f28, %f30, %f60
	ldd	[SRC + 0x10], %f20
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	faligndata %f16, %f18, %f48
	ldd	[SRC + 0x20], %f24
	faligndata %f18, %f20, %f50
	ldd	[SRC + 0x28], %f26
	faligndata %f20, %f22, %f52
	ldd	[SRC + 0x30], %f28
	faligndata %f22, %f24, %f54
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldd	[SRC + 0x38], %f30
	faligndata %f24, %f26, %f56
	add	DST, VIS_BLOCKSIZE, DST
	ldd	[SRC + VIS_BLOCKSIZE], %f16
	faligndata %f26, %f28, %f58
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetch [SRC + (3 * VIS_BLOCKSIZE)], #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetch [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)], #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetch [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)], #one_read
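
/*
 * Loop exit handling, restated as pseudo-code (an illustrative reading
 * of the tests below, not exact code):
 *
 *	if (count == VIS_BLOCKSIZE && (realsrc & 7) == 0) {
 *		finish the pipelined block and the final block with
 *		fsrc1 moves and two block stores;		! 2:
 *	} else {
 *		finish the pipelined block with faligndata and one
 *		block store;					! 3:
 *		copy the remaining count bytes one at a time;	! 5:
 *	}
 *
 * The fsrc1 path is safe only when the source is 8-byte aligned,
 * hence the REALSRC & 0x7 test.
 */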
	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f28, %f30, %f60
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldd	[SRC + 0x08], %f18
	fsrc1	%f28, %f60
	ldd	[SRC + 0x10], %f20
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	ldd	[SRC + 0x18], %f22
	fsrc1	%f16, %f48
	ldd	[SRC + 0x20], %f24
	fsrc1	%f18, %f50
	ldd	[SRC + 0x28], %f26
	fsrc1	%f20, %f52
	ldd	[SRC + 0x30], %f28
	fsrc1	%f22, %f54
	ldd	[SRC + 0x38], %f30
	fsrc1	%f24, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f26, %f58
	fsrc1	%f28, %f60
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_AIUS
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, 4f
	nop

3:	tst	CNT
	bz,a	%ncc, 4f
	nop

5:	ldub	[REALSRC], TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stba	TMP, [DST - 1]%asi
4:

.copyout_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	membar	#Sync
	andn	%l6, FPUSED_FLAG, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0

/*
 * We got here because of a fault during copyout.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_COPYOUT], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	%g0, -1, %o0			! return error value


	SET_SIZE(copyout_more)

#endif	/* lint */


#ifdef	lint

/*ARGSUSED*/
int
xcopyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyout)
	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .xcopyout_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .xcopyout_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_2:
	btst	3, %o3			!
	bz,pt	%ncc, .xcopyout_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop
.xcopyout_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyout_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyout_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyout_more	! otherwise go to large copy
	nop

.xcopyout_small:
	sethi	%hi(.sm_xcopyout_err), %o5	! .sm_xcopyout_err is lofault
	or	%o5, %lo(.sm_xcopyout_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! save existing handler
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyout		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set t_lofault

.xcopyout_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.xcopyout_err), REAL_LOFAULT
	ba,pt	%ncc, .do_copyout		! common code
	or	REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT

/*
 * We got here because of a fault during xcopyout.
 * Errno value is in ERRNO.
 */
.xcopyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_XCOPYOUT], %g2	! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	ERRNO, 0, %o0			! return errno value

.sm_xcopyout_err:

	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_XCOPYOUT], %o5	! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g1, 0, %o0			! return errno value

	SET_SIZE(xcopyout)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyout_little(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyout_little)
	sethi	%hi(.xcopyio_err), %o5
	or	%o5, %lo(.xcopyio_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
	mov	%o4, %o5

	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	ldub	[%o0 + %o3], %o4

1:	stba	%o4, [%o1 + %o3]ASI_AIUSL
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	ldub	[%o0 + %o3], %o4

2:
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

	SET_SIZE(xcopyout_little)

#endif	/* lint */
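
/*
 * xcopyout_little above copies through the little-endian user ASI.
 * An illustrative C rendering of its indexing trick (the negative
 * offset in %o3 counts up to zero, so no separate loop counter or
 * compare is needed; simplified sketch, not exact code):
 *
 *	off = -count;			! subcc %g0, %o2, %o3
 *	src += 2 * count - 1;		! so src[off] is the last byte
 *	dst += count;			! so dst[off] is the first byte
 *	while (off != 0) {
 *		dst[off] = src[off];	! stba ...ASI_AIUSL, little-endian
 *		off++;
 *		src -= 2;		! net source step is -1
 *	}
 */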
/*
 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little)
 */

#if defined(lint)

/*ARGSUSED*/
int
copyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(copyin)
	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyin_small	! go to small copy
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .copyin_8		! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .copyin_2		! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_2:
	btst	3, %o3			!
	bz,pt	%ncc, .copyin_4		! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop
.copyin_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_more	! otherwise go to large copy
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.copyin_small:
	sethi	%hi(.sm_copyin_err), %o5	! .sm_copyin_err is lofault
	or	%o5, %lo(.sm_copyin_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault, no tramp
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
.sm_do_copyin:
	mov	%o0, SM_SAVE_SRC
	mov	%o1, SM_SAVE_DST
	cmp	%o2, SHORTCOPY		! check for really short case
	bleu,pt	%ncc, .ci_sm_left	!
	mov	%o2, SM_SAVE_COUNT
	cmp	%o2, CHKSIZE		! check for medium length cases
	bgu,pn	%ncc, .ci_med		!
	or	%o0, %o1, %o3		! prepare alignment check
	andcc	%o3, 0x3, %g0		! test for alignment
	bz,pt	%ncc, .ci_sm_word	! branch to word aligned case
.ci_sm_movebytes:
	sub	%o2, 3, %o2		! adjust count to allow cc zero test
.ci_sm_notalign4:
	lduba	[%o0]ASI_USER, %o3	! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o1]		! write byte
	add	%o0, 1, %o0		! advance SRC by 1
	lduba	[%o0]ASI_USER, %o3	! repeat for a total of 4 bytes
	add	%o0, 1, %o0		! advance SRC by 1
	stb	%o3, [%o1 + 1]
	add	%o1, 4, %o1		! advance DST by 4
	lduba	[%o0]ASI_USER, %o3
	add	%o0, 1, %o0		! advance SRC by 1
	stb	%o3, [%o1 - 2]
	lduba	[%o0]ASI_USER, %o3
	add	%o0, 1, %o0		! advance SRC by 1
	bgt,pt	%ncc, .ci_sm_notalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o1 - 1]
	add	%o2, 3, %o2		! restore count
.ci_sm_left:
	tst	%o2
	bz,pt	%ncc, .ci_sm_exit
	nop
	lduba	[%o0]ASI_USER, %o3	! load one byte
	deccc	%o2			! reduce count for cc test
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1]		! store one byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 1]		! store second byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load third byte
	stb	%o3, [%o1 + 2]		! store third byte
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_words:
	lduwa	[%o0]ASI_USER, %o3	! read word
.ci_sm_wordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! update SRC
	add	%o1, 8, %o1		! update DST
	lduwa	[%o0]ASI_USER, %o3	! read word
	add	%o0, 4, %o0		! update SRC
	bgt,pt	%ncc, .ci_sm_words	! loop til done
	stw	%o3, [%o1 - 4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
.ci_sm_half:
	subcc	%o2, 2, %o2		! reduce count by 2
	lduha	[%o0]ASI_USER, %o3	! read half word
	add	%o0, 2, %o0		! advance SRC by 2
	add	%o1, 2, %o1		! advance DST by 2
	bgt,pt	%ncc, .ci_sm_half	! loop til done
	sth	%o3, [%o1 - 2]		! write half word
	addcc	%o2, 1, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	nop
.ci_sm_byte:
	lduba	[%o0]ASI_USER, %o3
	stb	%o3, [%o1]
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0
	.align	16
.ci_sm_word:
	subcc	%o2, 4, %o2		! update count
	bgt,pt	%ncc, .ci_sm_wordx
	lduwa	[%o0]ASI_USER, %o3	! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	stw	%o3, [%o1]		! write word
	deccc	%o2			! reduce count for cc test
	add	%o0, 4, %o0
	lduba	[%o0]ASI_USER, %o3	! load one byte
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 4]		! store one byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .ci_sm_exit
	stb	%o3, [%o1 + 5]		! store second byte
	inc	%o0
	lduba	[%o0]ASI_USER, %o3	! load third byte
	stb	%o3, [%o1 + 6]		! store third byte
.ci_sm_exit:
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return 0

	.align	16
.ci_med:
	xor	%o0, %o1, %o3		! setup alignment check
	btst	1, %o3
	bnz,pt	%ncc, .ci_sm_movebytes	! unaligned
	nop
	btst	3, %o3
	bnz,pt	%ncc, .ci_med_half	! halfword aligned
	nop
	btst	7, %o3
	bnz,pt	%ncc, .ci_med_word	! word aligned
	nop
.ci_med_long:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_long1	! word alignment
	nop
.ci_med_long0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_long0
	dec	%o2
.ci_med_long1:				! word aligned
	btst	7, %o0			! check for long word
	bz,pt	%ncc, .ci_med_long2
	nop
	lduwa	[%o0]ASI_USER, %o3	! load word
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1]		! store word
	add	%o1, 4, %o1		! advance DST by 4
	sub	%o2, 4, %o2		! reduce count by 4
!
! Now long word aligned and have at least 32 bytes to move
!
.ci_med_long2:
	sub	%o2, 31, %o2		! adjust count to allow cc zero test
.ci_med_lmove:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 32, %o2		! reduce count by 32
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	ldxa	[%o0]ASI_USER, %o3	! repeat for a total of 4 long words
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 + 8]
	add	%o1, 32, %o1		! advance DST by 32
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	stx	%o3, [%o1 - 16]
	ldxa	[%o0]ASI_USER, %o3
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lmove	! loop til 31 or fewer bytes left
	stx	%o3, [%o1 - 8]
	addcc	%o2, 24, %o2		! restore count to long word offset
	ble,pt	%ncc, .ci_med_lextra	! check for more long words to move
	nop
.ci_med_lword:
	ldxa	[%o0]ASI_USER, %o3	! read long word
	subcc	%o2, 8, %o2		! reduce count by 8
	stx	%o3, [%o1]		! write long word
	add	%o0, 8, %o0		! advance SRC by 8
	bgt,pt	%ncc, .ci_med_lword	! loop til 7 or fewer bytes left
	add	%o1, 8, %o1		! advance DST by 8
.ci_med_lextra:
	addcc	%o2, 7, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_word:
	btst	3, %o0			! check for
	bz,pt	%ncc, .ci_med_word1	! word alignment
	nop
.ci_med_word0:
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	btst	3, %o0
	bnz,pt	%ncc, .ci_med_word0
	dec	%o2
!
! Now word aligned and have at least 36 bytes to move
!
.ci_med_word1:
	sub	%o2, 15, %o2		! adjust count to allow cc zero test
.ci_med_wmove:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 16, %o2		! reduce count by 16
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	lduwa	[%o0]ASI_USER, %o3	! repeat for a total of 4 words
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 + 4]
	add	%o1, 16, %o1		! advance DST by 16
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	stw	%o3, [%o1 - 8]
	lduwa	[%o0]ASI_USER, %o3
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_wmove	! loop til 15 or fewer bytes left
	stw	%o3, [%o1 - 4]
	addcc	%o2, 12, %o2		! restore count to word offset
	ble,pt	%ncc, .ci_med_wextra	! check for more words to move
	nop
.ci_med_word2:
	lduwa	[%o0]ASI_USER, %o3	! read word
	subcc	%o2, 4, %o2		! reduce count by 4
	stw	%o3, [%o1]		! write word
	add	%o0, 4, %o0		! advance SRC by 4
	bgt,pt	%ncc, .ci_med_word2	! loop til 3 or fewer bytes left
	add	%o1, 4, %o1		! advance DST by 4
.ci_med_wextra:
	addcc	%o2, 3, %o2		! restore rest of count
	bz,pt	%ncc, .ci_sm_exit	! if zero, then done
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

	.align	16
	nop				! instruction alignment
					! see discussion at start of file
.ci_med_half:
	btst	1, %o0			! check for
	bz,pt	%ncc, .ci_med_half1	! half word alignment
	nop
	lduba	[%o0]ASI_USER, %o3	! load one byte
	inc	%o0
	stb	%o3, [%o1]		! store byte
	inc	%o1
	dec	%o2
!
! Now half word aligned and have at least 38 bytes to move
!
.ci_med_half1:
	sub	%o2, 7, %o2		! adjust count to allow cc zero test
.ci_med_hmove:
	lduha	[%o0]ASI_USER, %o3	! read half word
	subcc	%o2, 8, %o2		! reduce count by 8
	sth	%o3, [%o1]		! write half word
	add	%o0, 2, %o0		! advance SRC by 2
	lduha	[%o0]ASI_USER, %o3	! repeat for a total of 4 halfwords
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 + 2]
	add	%o1, 8, %o1		! advance DST by 8
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	sth	%o3, [%o1 - 4]
	lduha	[%o0]ASI_USER, %o3
	add	%o0, 2, %o0		! advance SRC by 2
	bgt,pt	%ncc, .ci_med_hmove	! loop til 7 or fewer bytes left
	sth	%o3, [%o1 - 2]
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .ci_sm_exit
	deccc	%o2
	bz,pt	%ncc, .ci_sm_byte
	nop
	ba,pt	%ncc, .ci_sm_half
	nop

/*
 * We got here because of a fault during short copyin.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.sm_copyin_err:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_COPYIN], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g0, -1, %o0			! return errno value

	SET_SIZE(copyin)


/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file.  They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 * This entry is particularly important as DTRACE (at least as of
 * 4/2004) does not support leaf functions.
 */

	ENTRY(copyin_more)
.copyin_more:
	prefetch [%o0], #n_reads
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	set	.copyin_err, REAL_LOFAULT

/*
 * Copy ins that reach here are larger than VIS_COPY_THRESHOLD bytes
 */
.do_copyin:
	set	copyio_fault, %l7	! copyio_fault is lofault val

	ldn	[THREAD_REG + T_LOFAULT], %l6	! save existing handler
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

	mov	%i0, SAVE_SRC
	mov	%i1, SAVE_DST
	mov	%i2, SAVE_COUNT

	FP_NOMIGRATE(6, 7)

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyin
	wr	%g0, FPRS_FEF, %fprs

	BST_FPQ2Q4_TOSTACK(%o2)

.do_blockcopyin:
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%l6, FPUSED_FLAG, %l6

	andcc	DST, VIS_BLOCKSIZE - 1, TMP
	mov	ASI_USER, %asi
	bz,pt	%ncc, 2f
	neg	TMP
	add	TMP, VIS_BLOCKSIZE, TMP

	! TMP = bytes required to align DST on FP_BLOCK boundary
	! Using SRC as a tmp here
	cmp	TMP, 3
	bleu,pt	%ncc, 1f
	sub	CNT, TMP, CNT		! adjust main count
	sub	TMP, 3, TMP		! adjust for end of loop test
.ci_blkalign:
	lduba	[REALSRC]%asi, SRC	! move 4 bytes per loop iteration
	stb	SRC, [DST]
	subcc	TMP, 4, TMP
	lduba	[REALSRC + 1]%asi, SRC
	add	REALSRC, 4, REALSRC
	stb	SRC, [DST + 1]
	lduba	[REALSRC - 2]%asi, SRC
	add	DST, 4, DST
	stb	SRC, [DST - 2]
	lduba	[REALSRC - 1]%asi, SRC
	bgu,pt	%ncc, .ci_blkalign
	stb	SRC, [DST - 1]

	addcc	TMP, 3, TMP		! restore count adjustment
	bz,pt	%ncc, 2f		! no bytes left?
	nop
1:	lduba	[REALSRC]%asi, SRC
	inc	REALSRC
	inc	DST
	deccc	TMP
	bgu	%ncc, 1b
	stb	SRC, [DST - 1]

2:
	membar	#StoreLoad
	andn	REALSRC, 0x7, SRC

	! SRC - 8-byte aligned
	! DST - 64-byte aligned
	ldda	[SRC]%asi, %f16
	prefetcha [SRC + (1 * VIS_BLOCKSIZE)]%asi, #n_reads
	alignaddr REALSRC, %g0, %g0
	ldda	[SRC + 0x08]%asi, %f18
	prefetcha [SRC + (2 * VIS_BLOCKSIZE)]%asi, #n_reads
	faligndata %f16, %f18, %f48
	ldda	[SRC + 0x10]%asi, %f20
	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
	faligndata %f18, %f20, %f50
	ldda	[SRC + 0x18]%asi, %f22
	prefetcha [SRC + (4 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f20, %f22, %f52
	ldda	[SRC + 0x20]%asi, %f24
	prefetcha [SRC + (8 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f22, %f24, %f54
	ldda	[SRC + 0x28]%asi, %f26
	prefetcha [SRC + (12 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f24, %f26, %f56
	ldda	[SRC + 0x30]%asi, %f28
	prefetcha [SRC + (16 * VIS_BLOCKSIZE)]%asi, #one_read
	faligndata %f26, %f28, %f58
	ldda	[SRC + 0x38]%asi, %f30
	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetcha [SRC + (19 * VIS_BLOCKSIZE)]%asi, #one_read
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	ba,pt	%ncc, 1f
	prefetcha [SRC + (23 * VIS_BLOCKSIZE)]%asi, #one_read
	.align	32
1:
	ldda	[SRC + 0x08]%asi, %f18
	faligndata %f28, %f30, %f60
	ldda	[SRC + 0x10]%asi, %f20
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_P
	ldda	[SRC + 0x18]%asi, %f22
	faligndata %f16, %f18, %f48
	ldda	[SRC + 0x20]%asi, %f24
	faligndata %f18, %f20, %f50
	ldda	[SRC + 0x28]%asi, %f26
	faligndata %f20, %f22, %f52
	ldda	[SRC + 0x30]%asi, %f28
	faligndata %f22, %f24, %f54
	sub	CNT, VIS_BLOCKSIZE, CNT
	ldda	[SRC + 0x38]%asi, %f30
	faligndata %f24, %f26, %f56
	add	DST, VIS_BLOCKSIZE, DST
	ldda	[SRC + VIS_BLOCKSIZE]%asi, %f16
	faligndata %f26, %f28, %f58
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	prefetcha [SRC + (3 * VIS_BLOCKSIZE)]%asi, #n_reads
	add	SRC, VIS_BLOCKSIZE, SRC
	prefetcha [SRC + ((OLYMPUS_C_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
	cmp	CNT, VIS_BLOCKSIZE + 8
	bgu,pt	%ncc, 1b
	prefetcha [SRC + ((OLYMPUS_C_2ND_PREFETCH) * VIS_BLOCKSIZE)]%asi, #one_read
	! only if REALSRC & 0x7 is 0
	cmp	CNT, VIS_BLOCKSIZE
	bne	%ncc, 3f
	andcc	REALSRC, 0x7, %g0
	bz,pt	%ncc, 2f
	nop
3:
	faligndata %f28, %f30, %f60
	faligndata %f30, %f16, %f62
	stda	%f48, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,pt	%ncc, 3f
	nop
2:
	ldda	[SRC + 0x08]%asi, %f18
	fsrc1	%f28, %f60
	ldda	[SRC + 0x10]%asi, %f20
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_P
	ldda	[SRC + 0x18]%asi, %f22
	fsrc1	%f16, %f48
	ldda	[SRC + 0x20]%asi, %f24
	fsrc1	%f18, %f50
	ldda	[SRC + 0x28]%asi, %f26
	fsrc1	%f20, %f52
	ldda	[SRC + 0x30]%asi, %f28
	fsrc1	%f22, %f54
	ldda	[SRC + 0x38]%asi, %f30
	fsrc1	%f24, %f56
	sub	CNT, VIS_BLOCKSIZE, CNT
	add	DST, VIS_BLOCKSIZE, DST
	add	SRC, VIS_BLOCKSIZE, SRC
	add	REALSRC, VIS_BLOCKSIZE, REALSRC
	fsrc1	%f26, %f58
	fsrc1	%f28, %f60
	fsrc1	%f30, %f62
	stda	%f48, [DST]ASI_BLK_P
	add	DST, VIS_BLOCKSIZE, DST
	ba,a,pt	%ncc, 4f
	nop

3:	tst	CNT
	bz,a	%ncc, 4f
	nop

5:	lduba	[REALSRC]ASI_USER, TMP
	inc	REALSRC
	inc	DST
	deccc	CNT
	bgu	%ncc, 5b
	stb	TMP, [DST - 1]
4:

.copyin_exit:
	membar	#Sync

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	BLD_FPQ2Q4_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZEROQ2Q4
	wr	%o3, 0, %fprs		! restore fprs

1:
	membar	#Sync			! sync error barrier
	andn	%l6, FPUSED_FLAG, %l6
	stn	%l6, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	FP_ALLOWMIGRATE(5, 6)
	ret
	restore	%g0, 0, %o0
/*
 * We got here because of a fault during copyin.
 * Errno value is in ERRNO, but DDI/DKI says return -1 (sigh).
 */
.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_COPYIN], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	%g0, -1, %o0			! return error value


	SET_SIZE(copyin_more)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyin(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyin)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .xcopyin_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .xcopyin_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop
.xcopyin_2:
	btst	3, %o3			!
	bz,pt	%ncc, .xcopyin_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop
.xcopyin_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop
.xcopyin_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .xcopyin_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .xcopyin_small	! go to small copy
	nop
	ba,pt	%ncc, .xcopyin_more	! otherwise go to large copy
	nop

.xcopyin_small:
	sethi	%hi(.sm_xcopyin_err), %o5	! .sm_xcopyin_err is lofault value
	or	%o5, %lo(.sm_xcopyin_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4	! set/save t_lofault
	membar	#Sync				! sync error barrier
	ba,pt	%ncc, .sm_do_copyin		! common code
	stn	%o5, [THREAD_REG + T_LOFAULT]

.xcopyin_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.xcopyin_err), REAL_LOFAULT	! .xcopyin_err is lofault value
	ba,pt	%ncc, .do_copyin
	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT

/*
 * We got here because of a fault during xcopyin.
 * Errno value is in ERRNO.
 */
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4	! check for copyop handler
	tst	%o4
	bz,pt	%ncc, 2f			! if not, return error
	nop
	ldn	[%o4 + CP_XCOPYIN], %g2		! if handler, invoke it with
	jmp	%g2				! original arguments
	restore	%g0, 0, %g0			! dispose of copy window
2:
	ret
	restore	ERRNO, 0, %o0			! return errno value

.sm_xcopyin_err:

	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	mov	SM_SAVE_SRC, %o0
	mov	SM_SAVE_DST, %o1
	mov	SM_SAVE_COUNT, %o2
	ldn	[THREAD_REG + T_COPYOPS], %o3	! check for copyop handler
	tst	%o3
	bz,pt	%ncc, 3f			! if not, return error
	nop
	ldn	[%o3 + CP_XCOPYIN], %o5		! if handler, invoke it with
	jmp	%o5				! original arguments
	nop
3:
	retl
	or	%g1, 0, %o0			! return errno value

	SET_SIZE(xcopyin)

#endif	/* lint */

#ifdef	lint

/*ARGSUSED*/
int
xcopyin_little(const void *uaddr, void *kaddr, size_t count)
{ return (0); }

#else	/* lint */

	ENTRY(xcopyin_little)
	sethi	%hi(.xcopyio_err), %o5
	or	%o5, %lo(.xcopyio_err), %o5
	ldn	[THREAD_REG + T_LOFAULT], %o4
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]
	mov	%o4, %o5

	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0 + %o3]ASI_AIUSL, %o4

1:	stb	%o4, [%o1 + %o3]
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	lduba	[%o0 + %o3]ASI_AIUSL, %o4

2:
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

.xcopyio_err:
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0

	SET_SIZE(xcopyin_little)

#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
#if defined(lint)

/* ARGSUSED */
void
copyin_noerr(const void *ufrom, void *kto, size_t count)
{}

#else	/* lint */
	ENTRY(copyin_noerr)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .copyin_ne_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .copyin_ne_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop
.copyin_ne_2:
	btst	3, %o3			!
	bz,pt	%ncc, .copyin_ne_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop
.copyin_ne_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop
.copyin_ne_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyin_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyin_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyin_noerr_more	! otherwise go to large copy
	nop

.copyin_ne_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4
	tst	%o4
	bz,pn	%ncc, .sm_do_copyin
	nop
	sethi	%hi(.sm_copyio_noerr), %o5
	or	%o5, %lo(.sm_copyio_noerr), %o5
	membar	#Sync			! sync error barrier
	ba,pt	%ncc, .sm_do_copyin
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault

.copyin_noerr_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	ba,pt	%ncc, .do_copyin
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

.copyio_noerr:
	jmp	%l6
	restore	%g0, 0, %g0

.sm_copyio_noerr:
	membar	#Sync
	stn	%o4, [THREAD_REG + T_LOFAULT]	! restore t_lofault
	jmp	%o4
	nop

	SET_SIZE(copyin_noerr)
#endif	/* lint */

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */
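
/*
 * The _noerr variants rely on the caller having installed a fault
 * handler with on_fault(9F).  An illustrative (hypothetical) caller,
 * not code from this file; only on_fault/no_fault/copyout_noerr are
 * real interfaces:
 *
 *	label_t	ljb;
 *
 *	if (on_fault(&ljb)) {
 *		no_fault();
 *		return (EFAULT);	! a user address faulted
 *	}
 *	copyout_noerr(kbuf, ubuf, len);
 *	no_fault();
 */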
#if defined(lint)

/* ARGSUSED */
void
copyout_noerr(const void *kfrom, void *uto, size_t count)
{}

#else	/* lint */
	ENTRY(copyout_noerr)

	cmp	%o2, VIS_COPY_THRESHOLD	! check for leaf rtn case
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	xor	%o0, %o1, %o3		! are src, dst alignable?
	btst	7, %o3			!
	bz,pt	%ncc, .copyout_ne_8	! check for longword alignment
	nop
	btst	1, %o3			!
	bz,pt	%ncc, .copyout_ne_2	! check for half-word
	nop
	sethi	%hi(hw_copy_limit_1), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop
.copyout_ne_2:
	btst	3, %o3			!
	bz,pt	%ncc, .copyout_ne_4	! check for word alignment
	nop
	sethi	%hi(hw_copy_limit_2), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop
.copyout_ne_4:
	! already checked longword, must be word aligned
	sethi	%hi(hw_copy_limit_4), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop
.copyout_ne_8:
	sethi	%hi(hw_copy_limit_8), %o3	! Check copy limit
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	tst	%o3
	bz,pn	%icc, .copyout_ne_small	! if zero, disable HW copy
	cmp	%o2, %o3		! if length <= limit
	bleu,pt	%ncc, .copyout_ne_small	! go to small copy
	nop
	ba,pt	%ncc, .copyout_noerr_more	! otherwise go to large copy
	nop

.copyout_ne_small:
	ldn	[THREAD_REG + T_LOFAULT], %o4
	tst	%o4
	bz,pn	%ncc, .sm_do_copyout
	nop
	sethi	%hi(.sm_copyio_noerr), %o5
	or	%o5, %lo(.sm_copyio_noerr), %o5
	membar	#Sync			! sync error barrier
	ba,pt	%ncc, .sm_do_copyout
	stn	%o5, [THREAD_REG + T_LOFAULT]	! set/save t_lofault

.copyout_noerr_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	ba,pt	%ncc, .do_copyout
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT

	SET_SIZE(copyout_noerr)
#endif	/* lint */


/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions that are
 * longer than 256 bytes in length using spitfire's block stores.  If
 * the criteria for using this routine are not met then it calls bzero
 * and returns 1.  Otherwise 0 is returned indicating success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
#ifdef	lint
/*ARGSUSED*/
int
hwblkclr(void *addr, size_t len)
{
	return (0);
}
#else	/* lint */
	! %i0 - start address
	! %i1 - length of region (multiple of 64)
	! %l0 - saved fprs
	! %l1 - pointer to saved %d0 block
	! %l2 - saved curthread->t_lwp
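
	/*
	 * The entry checks below, restated in C (an illustrative sketch
	 * only, not the actual implementation):
	 *
	 *	if ((addr & (VIS_BLOCKSIZE - 1)) != 0 ||  ! not block aligned
	 *	    len < 256 ||			  ! too short to win
	 *	    (len & (VIS_BLOCKSIZE - 1)) != 0) {	  ! not block multiple
	 *		bzero(addr, len);
	 *		return (1);	! tell caller we punted to bzero
	 *	}
	 *	block-store zeros over the region;
	 *	return (0);		! block operations were used
	 */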
	ENTRY(hwblkclr)
	! get another window w/space for one aligned block of saved fpregs
	save	%sp, -SA(MINFRAME + 2*VIS_BLOCKSIZE), %sp

	! Must be block-aligned
	andcc	%i0, (VIS_BLOCKSIZE-1), %g0
	bnz,pn	%ncc, 1f
	nop

	! ... and must be 256 bytes or more
	cmp	%i1, 256
	blu,pn	%ncc, 1f
	nop

	! ... and length must be a multiple of VIS_BLOCKSIZE
	andcc	%i1, (VIS_BLOCKSIZE-1), %g0
	bz,pn	%ncc, 2f
	nop

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

2:	rd	%fprs, %l0		! check for unused fp
	btst	FPRS_FEF, %l0
	bz,pt	%icc, 1f
	nop

	! save in-use fpregs on stack
	membar	#Sync
	add	%fp, STACK_BIAS - 65, %l1
	and	%l1, -VIS_BLOCKSIZE, %l1
	stda	%d0, [%l1]ASI_BLK_P

1:	membar	#StoreStore|#StoreLoad|#LoadStore
	wr	%g0, FPRS_FEF, %fprs
	wr	%g0, ASI_BLK_P, %asi

	! Clear block
	fzero	%d0
	fzero	%d2
	fzero	%d4
	fzero	%d6
	fzero	%d8
	fzero	%d10
	fzero	%d12
	fzero	%d14

	mov	256, %i3
	ba,pt	%ncc, .pz_doblock
	nop

.pz_blkstart:
      ! stda	%d0, [%i0 + 192]%asi  ! in dly slot of branch that got us here
	stda	%d0, [%i0 + 128]%asi
	stda	%d0, [%i0 + 64]%asi
	stda	%d0, [%i0]%asi
.pz_zinst:
	add	%i0, %i3, %i0
	sub	%i1, %i3, %i1
.pz_doblock:
	cmp	%i1, 256
	bgeu,a	%ncc, .pz_blkstart
	stda	%d0, [%i0 + 192]%asi

	cmp	%i1, 64
	blu	%ncc, .pz_finish

	andn	%i1, (64-1), %i3
	srl	%i3, 4, %i2		! using blocks, 1 instr / 16 words
	set	.pz_zinst, %i4
	sub	%i4, %i2, %i4
	jmp	%i4
	nop

.pz_finish:
	membar	#Sync
	btst	FPRS_FEF, %l0
	bz,a	.pz_finished
	wr	%l0, 0, %fprs		! restore fprs

	! restore fpregs from stack
	ldda	[%l1]ASI_BLK_P, %d0
	membar	#Sync
	wr	%l0, 0, %fprs		! restore fprs

.pz_finished:
	ret
	restore	%g0, 0, %o0	! return (0) - block operations were used

	SET_SIZE(hwblkclr)
#endif	/* lint */

#ifdef	lint
/*ARGSUSED*/
void
hw_pa_bcopy32(uint64_t src, uint64_t dst)
{}
#else	/*!lint */
	/*
	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
	 * using physical addresses.
	 */
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate

	rdpr	%pstate, %g0
	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	membar	#Sync

	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	retl
	wrpr	%g0, %g1, %pstate

	SET_SIZE(hw_pa_bcopy32)

#endif	/* lint */

#if defined(lint)

int use_hw_bcopy = 1;
int use_hw_bzero = 1;
uint_t hw_copy_limit_1 = 0;
uint_t hw_copy_limit_2 = 0;
uint_t hw_copy_limit_4 = 0;
uint_t hw_copy_limit_8 = 0;

#else	/* !lint */

	DGDEF(use_hw_bcopy)
	.word	1
	DGDEF(use_hw_bzero)
	.word	1
	DGDEF(hw_copy_limit_1)
	.word	0
	DGDEF(hw_copy_limit_2)
	.word	0
	DGDEF(hw_copy_limit_4)
	.word	0
	DGDEF(hw_copy_limit_8)
	.word	0

	.align	64
	.section ".text"
#endif	/* !lint */
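
/*
 * The hw_copy_limit_N variables above default to zero, which keeps all
 * copies on the plain leaf loops; lengths at or below a non-zero limit
 * stay on the leaf loops, while longer lengths take the FP block path.
 * A hypothetical C-level view of platform startup tuning (illustrative
 * only; the function name and values are not from this file):
 *
 *	extern uint_t hw_copy_limit_1, hw_copy_limit_8;
 *
 *	void
 *	cpu_enable_hw_copy(void)
 *	{
 *		hw_copy_limit_1 = 0x100;  ! byte-alignable copies > 256
 *		hw_copy_limit_8 = 0x400;  ! 8-byte-alignable copies > 1K
 *	}
 */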