1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 26#include <sys/param.h> 27#include <sys/errno.h> 28#include <sys/asm_linkage.h> 29#include <sys/vtrace.h> 30#include <sys/machthread.h> 31#include <sys/clock.h> 32#include <sys/asi.h> 33#include <sys/fsr.h> 34#include <sys/privregs.h> 35#include <sys/machasi.h> 36#include <sys/niagaraasi.h> 37 38#if !defined(lint) 39#include "assym.h" 40#endif /* lint */ 41 42 43/* 44 * Pseudo-code to aid in understanding the control flow of the 45 * bcopy/kcopy routine. 46 * 47 * ! WARNING : <Register usage convention> 48 * ! In kcopy() the %o5, holds previous error handler and a flag 49 * ! LOFAULT_SET (low bits). The %o5 is null in bcopy(). 50 * ! The %o5 is not available for any other use. 51 * 52 * On entry: 53 * ! Determine whether to use the FP register version or the 54 * ! the leaf routine version depending on the size of the copy. 55 * ! Set up error handling accordingly. 56 * ! The transition point depends on FP_COPY 57 * ! For both versions %o5 is reserved 58 * 59 * kcopy(): 60 * if(length > FP_COPY) 61 * go to regular_kcopy 62 * 63 * ! Setup_leaf_rtn_error_handler 64 * %o5 = curthread->t_lofault; ! save existing handler in %o5 65 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag 66 * curthread->t_lofault = .sm_copyerr; 67 * goto small_bcopy(); 68 * 69 * regular_kcopy: 70 * save_registers() 71 * %o5 = curthread->t_lofault; ! save existing handler in %o5 72 * %o5 |= LOFAULT_SET; ! ORed with LOFAULT_SET flag 73 * curthread->t_lofault = .copyerr; 74 * goto do_copy(); 75 * 76 * bcopy(): 77 * if(length > FP_COPY) 78 * go to regular_bcopy 79 * 80 * ! Setup_leaf_rtn_error_handler 81 * %o5 = curthread->t_lofault; ! save existing handler in %o5 82 * curthread->t_lofault = .sm_copyerr; 83 * goto small_bcopy(); 84 * 85 * regular_bcopy: 86 * %o5 = curthread->t_lofault; ! save existing handler in %o5 87 * curthread->t_lofault = .copyerr; 88 * goto do_copy(); 89 * 90 * small_bcopy: 91 * ! handle copies smaller than FP_COPY 92 * restore t_lofault handler 93 * exit 94 * 95 * do_copy: 96 * ! handle copies larger than FP_COPY 97 * save fp_regs 98 * blockcopy; 99 * restore fp_regs 100 * restore t_lofault handler if came from kcopy(); 101 * 102 * 103 * In leaf lofault handler: 104 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault 105 * return (errno) 106 * 107 * In lofault handler: 108 * curthread->t_lofault = (%o5 & ~LOFAULT_SET); ! restore old t_lofault 109 * restore fp_regs 110 * return (errno) 111 * 112 * 113 * 114 * For all of bcopy/copyin/copyout the copy logic is specialized according 115 * to how the src and dst is aligned and how much data needs to be moved. 116 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL)) 117 * 118 * N2/RF Flow : 119 * 120 * if (count < FP_COPY) { (584 bytes) 121 * set small fault handler (no register window save/restore) 122 * if count < SHORTCOPY (7 bytes) 123 * copy bytes; go to short_exit 124 * else 125 * determine dst alignment, move minimum bytes/halfwords to 126 * get dst aligned on long word boundary 127 * if( src is on long word boundary ) { 128 * medlong: src/dst aligned on 8 bytes 129 * copy with ldx/stx in 4-way unrolled loop; 130 * copy final 0-31 bytes; go to short_exit 131 * } else { src/dst not aligned on 8 bytes 132 * if src is word aligned, ld/st words in 32-byte chunks 133 * if src is half word aligned, ld half, ld word, ld half; pack 134 * into long word, store long words in 32-byte chunks 135 * if src is byte aligned, ld byte,half,word parts; pack into long 136 * word, store long words in 32-byte chunks 137 * move final 0-31 bytes according to src alignment; go to short_exit 138 * short_exit: 139 * restore trap handler if needed, retl 140 * else { More than FP_COPY bytes 141 * set fault handler 142 * disable kernel preemption 143 * save registers, save FP registers if in use 144 * move bytes to align destination register on long word boundary 145 * if(src is on long word boundary) { src/dst aligned on 8 bytes 146 * align dst on 64 byte boundary; use 8-way test for each of 8 possible 147 * src alignments relative to a 64 byte boundary to select the 148 * 16-way unrolled loop (128 bytes) to use for 149 * block load, fmovd, block-init-store, block-store, fmovd operations 150 * then go to remain_stuff. 151 * remain_stuff: move remaining bytes. go to long_exit 152 * } else { 153 * setup alignaddr for faligndata instructions 154 * align dst on 64 byte boundary; use 8-way test for each of 8 possible 155 * src alignments to nearest long word relative to 64 byte boundary to 156 * select the 8-way unrolled loop (64 bytes) to use for 157 * block load, falign, fmovd, block-store loop 158 * (only use block-init-store when src/dst on 8 byte boundaries.) 159 * goto unalign_done. 160 * unalign_done: 161 * move remaining bytes for unaligned cases. go to long_exit 162 * long_exit: 163 * restore %gsr, FP regs (either from stack or set to zero), 164 * restore trap handler, check for kernel preemption request, 165 * handle if needed, ret. 166 * } 167 * 168 * Other platforms include hw_bcopy_limit_[1248] to control the exact 169 * point where the FP register code is used. On those platforms, the 170 * FP register code did not leave data in L2 cache, potentially affecting 171 * performance more than the gain/loss from the algorithm difference. 172 * For N2/RF, block store places data in the L2 cache, so use or non-use 173 * of the FP registers has no effect on L2 cache behavior. 174 * The cost for testing hw_bcopy_limit_* according to different 175 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits 176 * were not used. That cost was judged too high relative to the benefits, 177 * so the hw_bcopy_limit option is omitted from this code. 178 */ 179 180/* 181 * Less then or equal this number of bytes we will always copy byte-for-byte 182 */ 183#define SMALL_LIMIT 7 184 185/* 186 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault 187 * handler was set 188 */ 189#define LOFAULT_SET 2 190 191/* 192 * This define is to align data for the unaligned source cases. 193 * The data1, data2 and data3 is merged into data1 and data2. 194 * The data3 is preserved for next merge. 195 */ 196#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \ 197 sllx data1, lshift, data1 ;\ 198 srlx data2, rshift, tmp ;\ 199 or data1, tmp, data1 ;\ 200 sllx data2, lshift, data2 ;\ 201 srlx data3, rshift, tmp ;\ 202 or data2, tmp, data2 203/* 204 * This macro is to align the data. Basically it merges 205 * data1 and data2 to form double word. 206 */ 207#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \ 208 sllx data1, lshift, data1 ;\ 209 srlx data2, rshift, tmp ;\ 210 or data1, tmp, data1 211 212#if !defined(NIAGARA_IMPL) 213/* 214 * Flags set in the lower bits of the t_lofault address: 215 * FPUSED_FLAG: The FP registers were in use and must be restored 216 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls 217 * COPY_FLAGS: Both of the above 218 * 219 * Other flags: 220 * KPREEMPT_FLAG: kpreempt needs to be called 221 */ 222#define FPUSED_FLAG 1 223#define LOFAULT_SET 2 224#define COPY_FLAGS (FPUSED_FLAG | LOFAULT_SET) 225#define KPREEMPT_FLAG 4 226 227#define ALIGN_OFF_1_7 \ 228 faligndata %d0, %d2, %d48 ;\ 229 faligndata %d2, %d4, %d50 ;\ 230 faligndata %d4, %d6, %d52 ;\ 231 faligndata %d6, %d8, %d54 ;\ 232 faligndata %d8, %d10, %d56 ;\ 233 faligndata %d10, %d12, %d58 ;\ 234 faligndata %d12, %d14, %d60 ;\ 235 faligndata %d14, %d16, %d62 236 237#define ALIGN_OFF_8_15 \ 238 faligndata %d2, %d4, %d48 ;\ 239 faligndata %d4, %d6, %d50 ;\ 240 faligndata %d6, %d8, %d52 ;\ 241 faligndata %d8, %d10, %d54 ;\ 242 faligndata %d10, %d12, %d56 ;\ 243 faligndata %d12, %d14, %d58 ;\ 244 faligndata %d14, %d16, %d60 ;\ 245 faligndata %d16, %d18, %d62 246 247#define ALIGN_OFF_16_23 \ 248 faligndata %d4, %d6, %d48 ;\ 249 faligndata %d6, %d8, %d50 ;\ 250 faligndata %d8, %d10, %d52 ;\ 251 faligndata %d10, %d12, %d54 ;\ 252 faligndata %d12, %d14, %d56 ;\ 253 faligndata %d14, %d16, %d58 ;\ 254 faligndata %d16, %d18, %d60 ;\ 255 faligndata %d18, %d20, %d62 256 257#define ALIGN_OFF_24_31 \ 258 faligndata %d6, %d8, %d48 ;\ 259 faligndata %d8, %d10, %d50 ;\ 260 faligndata %d10, %d12, %d52 ;\ 261 faligndata %d12, %d14, %d54 ;\ 262 faligndata %d14, %d16, %d56 ;\ 263 faligndata %d16, %d18, %d58 ;\ 264 faligndata %d18, %d20, %d60 ;\ 265 faligndata %d20, %d22, %d62 266 267#define ALIGN_OFF_32_39 \ 268 faligndata %d8, %d10, %d48 ;\ 269 faligndata %d10, %d12, %d50 ;\ 270 faligndata %d12, %d14, %d52 ;\ 271 faligndata %d14, %d16, %d54 ;\ 272 faligndata %d16, %d18, %d56 ;\ 273 faligndata %d18, %d20, %d58 ;\ 274 faligndata %d20, %d22, %d60 ;\ 275 faligndata %d22, %d24, %d62 276 277#define ALIGN_OFF_40_47 \ 278 faligndata %d10, %d12, %d48 ;\ 279 faligndata %d12, %d14, %d50 ;\ 280 faligndata %d14, %d16, %d52 ;\ 281 faligndata %d16, %d18, %d54 ;\ 282 faligndata %d18, %d20, %d56 ;\ 283 faligndata %d20, %d22, %d58 ;\ 284 faligndata %d22, %d24, %d60 ;\ 285 faligndata %d24, %d26, %d62 286 287#define ALIGN_OFF_48_55 \ 288 faligndata %d12, %d14, %d48 ;\ 289 faligndata %d14, %d16, %d50 ;\ 290 faligndata %d16, %d18, %d52 ;\ 291 faligndata %d18, %d20, %d54 ;\ 292 faligndata %d20, %d22, %d56 ;\ 293 faligndata %d22, %d24, %d58 ;\ 294 faligndata %d24, %d26, %d60 ;\ 295 faligndata %d26, %d28, %d62 296 297#define ALIGN_OFF_56_63 \ 298 faligndata %d14, %d16, %d48 ;\ 299 faligndata %d16, %d18, %d50 ;\ 300 faligndata %d18, %d20, %d52 ;\ 301 faligndata %d20, %d22, %d54 ;\ 302 faligndata %d22, %d24, %d56 ;\ 303 faligndata %d24, %d26, %d58 ;\ 304 faligndata %d26, %d28, %d60 ;\ 305 faligndata %d28, %d30, %d62 306 307/* 308 * FP_COPY indicates the minimum number of bytes needed 309 * to justify using FP/VIS-accelerated memory operations. 310 * The FPBLK code assumes a minimum number of bytes are available 311 * to be moved on entry. Check that code carefully before 312 * reducing FP_COPY below 256. 313 */ 314#define FP_COPY 584 315#define SHORTCOPY 7 316#define ASI_STBI_P ASI_BLK_INIT_ST_QUAD_LDD_P 317#define ASI_STBI_AIUS ASI_BLK_INIT_QUAD_LDD_AIUS 318#define CACHE_LINE 64 319#define VIS_BLOCKSIZE 64 320 321/* 322 * Size of stack frame in order to accomodate a 64-byte aligned 323 * floating-point register save area and 2 64-bit temp locations. 324 * All copy functions use three quadrants of fp registers; to assure a 325 * block-aligned three block buffer in which to save we must reserve 326 * four blocks on stack. 327 * 328 * _______________________________________ <-- %fp + STACK_BIAS 329 * | We may need to preserve 3 quadrants | 330 * | of fp regs, but since we do so with | 331 * | BST/BLD we need room in which to | 332 * | align to VIS_BLOCKSIZE bytes. So | 333 * | this area is 4 * VIS_BLOCKSIZE. | <-- - SAVED_FPREGS_OFFSET 334 * |-------------------------------------| 335 * | 8 bytes to save %fprs | <-- - SAVED_FPRS_OFFSET 336 * |-------------------------------------| 337 * | 8 bytes to save %gsr | <-- - SAVED_GSR_OFFSET 338 * --------------------------------------- 339 */ 340#define HWCOPYFRAMESIZE ((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8)) 341#define SAVED_FPREGS_OFFSET (VIS_BLOCKSIZE * 4) 342#define SAVED_FPREGS_ADJUST ((VIS_BLOCKSIZE * 3) + 1) 343#define SAVED_FPRS_OFFSET (SAVED_FPREGS_OFFSET + 8) 344#define SAVED_GSR_OFFSET (SAVED_FPRS_OFFSET + 8) 345 346/* 347 * In FP copies if we do not have preserved data to restore over 348 * the fp regs we used then we must zero those regs to avoid 349 * exposing portions of the data to later threads (data security). 350 */ 351#define FZERO \ 352 fzero %f0 ;\ 353 fzero %f2 ;\ 354 faddd %f0, %f2, %f4 ;\ 355 fmuld %f0, %f2, %f6 ;\ 356 faddd %f0, %f2, %f8 ;\ 357 fmuld %f0, %f2, %f10 ;\ 358 faddd %f0, %f2, %f12 ;\ 359 fmuld %f0, %f2, %f14 ;\ 360 faddd %f0, %f2, %f16 ;\ 361 fmuld %f0, %f2, %f18 ;\ 362 faddd %f0, %f2, %f20 ;\ 363 fmuld %f0, %f2, %f22 ;\ 364 faddd %f0, %f2, %f24 ;\ 365 fmuld %f0, %f2, %f26 ;\ 366 faddd %f0, %f2, %f28 ;\ 367 fmuld %f0, %f2, %f30 ;\ 368 faddd %f0, %f2, %f48 ;\ 369 fmuld %f0, %f2, %f50 ;\ 370 faddd %f0, %f2, %f52 ;\ 371 fmuld %f0, %f2, %f54 ;\ 372 faddd %f0, %f2, %f56 ;\ 373 fmuld %f0, %f2, %f58 ;\ 374 faddd %f0, %f2, %f60 ;\ 375 fmuld %f0, %f2, %f62 376 377#if !defined(lint) 378 379/* 380 * Macros to save and restore fp registers to/from the stack. 381 * Used to save and restore in-use fp registers when we want to use FP. 382 */ 383#define BST_FP_TOSTACK(tmp1) \ 384 /* membar #Sync */ ;\ 385 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 386 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 387 stda %f0, [tmp1]ASI_BLK_P ;\ 388 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 389 stda %f16, [tmp1]ASI_BLK_P ;\ 390 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 391 stda %f48, [tmp1]ASI_BLK_P ;\ 392 membar #Sync 393 394#define BLD_FP_FROMSTACK(tmp1) \ 395 /* membar #Sync - provided at copy completion */ ;\ 396 add %fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\ 397 and tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\ 398 ldda [tmp1]ASI_BLK_P, %f0 ;\ 399 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 400 ldda [tmp1]ASI_BLK_P, %f16 ;\ 401 add tmp1, VIS_BLOCKSIZE, tmp1 ;\ 402 ldda [tmp1]ASI_BLK_P, %f48 ;\ 403 membar #Sync 404#endif /* NIAGARA_IMPL */ 405 406#endif /* lint */ 407/* 408 * Copy a block of storage, returning an error code if `from' or 409 * `to' takes a kernel pagefault which cannot be resolved. 410 * Returns errno value on pagefault error, 0 if all ok 411 */ 412 413#if defined(lint) 414 415/* ARGSUSED */ 416int 417kcopy(const void *from, void *to, size_t count) 418{ return(0); } 419 420#else /* lint */ 421 422 .seg ".text" 423 .align 4 424 425 ENTRY(kcopy) 426#if !defined(NIAGARA_IMPL) 427 cmp %o2, FP_COPY ! check for small copy/leaf case 428 bgt,pt %ncc, .kcopy_more ! 429 nop 430.kcopy_small: ! setup error handler 431 sethi %hi(.sm_copyerr), %o4 432 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value 433 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler 434 ! Note that we carefully do *not* flag the setting of 435 ! t_lofault. 436 membar #Sync ! sync error barrier 437 b .sm_do_copy ! common code 438 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault 439 440 441.kcopy_more: 442 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 443 sethi %hi(.copyerr), %l7 ! copyerr is lofault value 444 or %l7, %lo(.copyerr), %l7 445 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler 446 ! Note that we carefully do *not* flag the setting of 447 ! t_lofault. 448 membar #Sync ! sync error barrier 449 b .do_copy ! common code 450 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 451 452/* 453 * We got here because of a fault during a small kcopy or bcopy. 454 * if a fault handler existed when bcopy was called. 455 * No floating point registers are used by the small copies. 456 * Small copies are from a leaf routine 457 * Errno value is in %g1. 458 */ 459.sm_copyerr: 460 ! The kcopy will always set a t_lofault handler. If it fires, 461 ! we're expected to just return the error code and not to 462 ! invoke any existing error handler. As far as bcopy is concerned, 463 ! we only set t_lofault if there was an existing lofault handler. 464 ! In that case we're expected to invoke the previously existing 465 ! handler after resetting the t_lofault value. 466 btst LOFAULT_SET, %o5 467 membar #Sync ! sync error barrier 468 andn %o5, LOFAULT_SET, %o5 ! clear fault flag 469 bnz,pn %ncc, 3f 470 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 471 retl 472 mov %g1, %o0 4733: 474 ! We're here via bcopy. There must have been an error handler 475 ! in place otherwise we would have died a nasty death already. 476 jmp %o5 ! goto real handler 477 mov %g0, %o0 478/* 479 * end of .sm_copyerr 480 */ 481 482/* 483 * We got here because of a fault during kcopy or bcopy if a fault 484 * handler existed when bcopy was called. 485 * stack and fp registers need to be restored 486 * Errno value is in %g1. 487 */ 488.copyerr: 489 sethi %hi(.copyerr2), %l1 490 or %l1, %lo(.copyerr2), %l1 491 membar #Sync ! sync error barrier 492 stn %l1, [THREAD_REG + T_LOFAULT] ! set t_lofault 493 btst FPUSED_FLAG, %o5 494 bz,pt %xcc, 1f 495 and %o5, LOFAULT_SET, %l1 ! copy flag to %l1 496 497 membar #Sync ! sync error barrier 498 wr %l5, 0, %gsr 499 btst FPRS_FEF, %g5 500 bz,pt %icc, 4f 501 nop 502 ! restore fpregs from stack 503 BLD_FP_FROMSTACK(%o2) 504 ba,pt %ncc, 2f 505 wr %g5, 0, %fprs ! restore fprs 5064: 507 FZERO 508 wr %g5, 0, %fprs ! restore fprs 5092: 510 ldn [THREAD_REG + T_LWP], %o2 511 brnz,pt %o2, 1f 512 nop 513 514 ldsb [THREAD_REG + T_PREEMPT], %l0 515 deccc %l0 516 bnz,pn %ncc, 1f 517 stb %l0, [THREAD_REG + T_PREEMPT] 518 519 ! Check for a kernel preemption request 520 ldn [THREAD_REG + T_CPU], %l0 521 ldub [%l0 + CPU_KPRUNRUN], %l0 522 brnz,a,pt %l0, 1f ! Need to call kpreempt? 523 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag 524 525 ! The kcopy will always set a t_lofault handler. If it fires, 526 ! we're expected to just return the error code and not to 527 ! invoke any existing error handler. As far as bcopy is concerned, 528 ! we only set t_lofault if there was an existing lofault handler. 529 ! In that case we're expected to invoke the previously existing 530 ! handler after resetting the t_lofault value. 5311: 532 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address 533 membar #Sync ! sync error barrier 534 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 535 536 ! call kpreempt if necessary 537 btst KPREEMPT_FLAG, %l1 538 bz,pt %icc, 2f 539 nop 540 call kpreempt 541 rdpr %pil, %o0 ! pass %pil 5422: 543 btst LOFAULT_SET, %l1 544 bnz,pn %ncc, 3f 545 nop 546 ret 547 restore %g1, 0, %o0 5483: 549 ! We're here via bcopy. There must have been an error handler 550 ! in place otherwise we would have died a nasty death already. 551 jmp %o5 ! goto real handler 552 restore %g0, 0, %o0 ! dispose of copy window 553 554/* 555 * We got here because of a fault in .copyerr. We can't safely restore fp 556 * state, so we panic. 557 */ 558fp_panic_msg: 559 .asciz "Unable to restore fp state after copy operation" 560 561 .align 4 562.copyerr2: 563 set fp_panic_msg, %o0 564 call panic 565 nop 566/* 567 * end of .copyerr 568 */ 569 570#else /* NIAGARA_IMPL */ 571 save %sp, -SA(MINFRAME), %sp 572 set .copyerr, %l7 ! copyerr is lofault value 573 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler 574 or %o5, LOFAULT_SET, %o5 575 membar #Sync ! sync error barrier 576 b .do_copy ! common code 577 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 578 579/* 580 * We got here because of a fault during kcopy. 581 * Errno value is in %g1. 582 */ 583.copyerr: 584 ! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET 585 ! into %o5 to indicate it has set t_lofault handler. Need to clear 586 ! LOFAULT_SET flag before restoring the error handler. 587 andn %o5, LOFAULT_SET, %o5 588 membar #Sync ! sync error barrier 589 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 590 ret 591 restore %g1, 0, %o0 592#endif /* NIAGARA_IMPL */ 593 594 SET_SIZE(kcopy) 595#endif /* lint */ 596 597 598/* 599 * Copy a block of storage - must not overlap (from + len <= to). 600 */ 601#if defined(lint) 602 603/* ARGSUSED */ 604void 605bcopy(const void *from, void *to, size_t count) 606{} 607 608#else /* lint */ 609 610 ENTRY(bcopy) 611#if !defined(NIAGARA_IMPL) 612 cmp %o2, FP_COPY ! check for small copy/leaf case 613 bgt,pt %ncc, .bcopy_more ! 614 nop 615.bcopy_small: ! setup error handler 616 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler 617 tst %o5 618 bz,pt %icc, .sm_do_copy 619 sethi %hi(.sm_copyerr), %o4 620 or %o4, %lo(.sm_copyerr), %o4 ! .sm_copyerr is lofault value 621 membar #Sync ! sync error barrier 622 stn %o4, [THREAD_REG + T_LOFAULT] ! set t_lofault 623 or %o5, LOFAULT_SET, %o5 ! Error should trampoline 624.sm_do_copy: 625 mov %o0, %g1 ! save %o0 626 cmp %o2, SHORTCOPY ! make sure there is enough to align 627 ble,pt %ncc, .bc_smallest 628 andcc %o1, 0x7, %o3 ! is dest long aligned 629 bnz,pn %ncc, .bc_align 630 andcc %o1, 1, %o3 ! is dest byte aligned 631 632! Destination is long word aligned 633.bc_al_src: 634 andcc %o0, 7, %o3 635 brnz,pt %o3, .bc_src_dst_unal8 636 nop 637/* 638 * Special case for handling when src and dest are both long word aligned 639 * and total data to move is less than FP_COPY bytes 640 * Also handles finish up for large block moves, so may be less than 32 bytes 641 */ 642.bc_medlong: 643 subcc %o2, 31, %o2 ! adjust length to allow cc test 644 ble,pt %ncc, .bc_medl31 645 nop 646.bc_medl32: 647 ldx [%o0], %o4 ! move 32 bytes 648 subcc %o2, 32, %o2 ! decrement length count by 32 649 stx %o4, [%o1] 650 ldx [%o0+8], %o4 651 stx %o4, [%o1+8] 652 ldx [%o0+16], %o4 653 add %o0, 32, %o0 ! increase src ptr by 32 654 stx %o4, [%o1+16] 655 ldx [%o0-8], %o4 656 add %o1, 32, %o1 ! increase dst ptr by 32 657 bgu,pt %ncc, .bc_medl32 ! repeat if at least 32 bytes left 658 stx %o4, [%o1-8] 659.bc_medl31: 660 addcc %o2, 24, %o2 ! adjust count to be off by 7 661 ble,pt %ncc, .bc_medl7 ! skip if 7 or fewer bytes left 662 nop 663.bc_medl8: 664 ldx [%o0], %o4 ! move 8 bytes 665 add %o0, 8, %o0 ! increase src ptr by 8 666 subcc %o2, 8, %o2 ! decrease count by 8 667 add %o1, 8, %o1 ! increase dst ptr by 8 668 bgu,pt %ncc, .bc_medl8 669 stx %o4, [%o1-8] 670.bc_medl7: 671 addcc %o2, 7, %o2 ! finish adjustment of remaining count 672 bnz,pt %ncc, .bc_small4 ! do final bytes if not finished 673 674.bc_smallx: ! finish up and exit 675 tst %o5 676 bz,pt %ncc, .bc_sm_done 677 andn %o5, COPY_FLAGS, %o5 ! remove flags from lofault address 678 membar #Sync ! sync error barrier 679 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 680.bc_sm_done: 681 retl 682 mov %g0, %o0 683 684.bc_small4: 685 cmp %o2, 4 686 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left 687 nop ! 688 ld [%o0], %o4 ! move 4 bytes 689 add %o0, 4, %o0 ! increase src ptr by 4 690 add %o1, 4, %o1 ! increase dst ptr by 4 691 subcc %o2, 4, %o2 ! decrease count by 4 692 bz,pt %ncc, .bc_smallx 693 stw %o4, [%o1-4] 694 695.bc_small3x: ! Exactly 1, 2, or 3 bytes remain 696 subcc %o2, 1, %o2 ! reduce count for cc test 697 ldub [%o0], %o4 ! load one byte 698 bz,pt %ncc, .bc_smallx 699 stb %o4, [%o1] ! store one byte 700 ldub [%o0+1], %o4 ! load second byte 701 subcc %o2, 1, %o2 702 bz,pt %ncc, .bc_smallx 703 stb %o4, [%o1+1] ! store second byte 704 ldub [%o0+2], %o4 ! load third byte 705 ba .bc_smallx 706 stb %o4, [%o1+2] ! store third byte 707 708.bc_smallest: ! 7 or fewer bytes remain 709 tst %o2 710 bz,pt %ncc, .bc_smallx 711 cmp %o2, 4 712 blt,pt %ncc, .bc_small3x 713 nop 714 ldub [%o0], %o4 ! read byte 715 subcc %o2, 4, %o2 ! reduce count by 4 716 stb %o4, [%o1] ! write byte 717 ldub [%o0+1], %o4 ! repeat for total of 4 bytes 718 add %o0, 4, %o0 ! advance src by 4 719 stb %o4, [%o1+1] 720 ldub [%o0-2], %o4 721 add %o1, 4, %o1 ! advance dst by 4 722 stb %o4, [%o1-2] 723 ldub [%o0-1], %o4 724 bnz,pt %ncc, .bc_small3x 725 stb %o4, [%o1-1] 726 ba .bc_smallx 727 nop 728 729/* 730 * Align destination to long word boundary 731 */ 732.bc_align: ! byte align test in prior branch delay 733 bnz,pt %ncc, .bc_al_d1 734.bc_al_d1f: ! dest is now half word aligned 735 andcc %o1, 2, %o3 736 bnz,pt %ncc, .bc_al_d2 737.bc_al_d2f: ! dest is now word aligned 738 andcc %o1, 4, %o3 ! is dest longword aligned? 739 bz,pt %ncc, .bc_al_src 740 nop 741.bc_al_d4: ! dest is word aligned; src is unknown 742 ldub [%o0], %o4 ! move a word (src align unknown) 743 ldub [%o0+1], %o3 744 sll %o4, 24, %o4 ! position 745 sll %o3, 16, %o3 ! position 746 or %o4, %o3, %o3 ! merge 747 ldub [%o0+2], %o4 748 sll %o4, 8, %o4 ! position 749 or %o4, %o3, %o3 ! merge 750 ldub [%o0+3], %o4 751 or %o4, %o3, %o4 ! merge 752 stw %o4,[%o1] ! store four bytes 753 add %o0, 4, %o0 ! adjust src by 4 754 add %o1, 4, %o1 ! adjust dest by 4 755 sub %o2, 4, %o2 ! adjust count by 4 756 andcc %o0, 7, %o3 ! check for src long word alignment 757 brz,pt %o3, .bc_medlong 758.bc_src_dst_unal8: 759 ! dst is 8-byte aligned, src is not 760 ! Size is less than FP_COPY 761 ! Following code is to select for alignment 762 andcc %o0, 0x3, %o3 ! test word alignment 763 bz,pt %ncc, .bc_medword 764 nop 765 andcc %o0, 0x1, %o3 ! test halfword alignment 766 bnz,pt %ncc, .bc_med_byte ! go to byte move if not halfword 767 andcc %o0, 0x2, %o3 ! test which byte alignment 768 ba .bc_medhalf 769 nop 770.bc_al_d1: ! align dest to half word 771 ldub [%o0], %o4 ! move a byte 772 add %o0, 1, %o0 773 stb %o4, [%o1] 774 add %o1, 1, %o1 775 andcc %o1, 2, %o3 776 bz,pt %ncc, .bc_al_d2f 777 sub %o2, 1, %o2 778.bc_al_d2: ! align dest to word 779 ldub [%o0], %o4 ! move a half-word (src align unknown) 780 ldub [%o0+1], %o3 781 sll %o4, 8, %o4 ! position 782 or %o4, %o3, %o4 ! merge 783 sth %o4, [%o1] 784 add %o0, 2, %o0 785 add %o1, 2, %o1 786 andcc %o1, 4, %o3 ! is dest longword aligned? 787 bz,pt %ncc, .bc_al_src 788 sub %o2, 2, %o2 789 ba .bc_al_d4 790 nop 791/* 792 * Handle all cases where src and dest are aligned on word 793 * boundaries. Use unrolled loops for better performance. 794 * This option wins over standard large data move when 795 * source and destination is in cache for medium 796 * to short data moves. 797 */ 798.bc_medword: 799 subcc %o2, 31, %o2 ! adjust length to allow cc test 800 ble,pt %ncc, .bc_medw31 801 nop 802.bc_medw32: 803 ld [%o0], %o4 ! move a block of 32 bytes 804 stw %o4, [%o1] 805 ld [%o0+4], %o4 806 stw %o4, [%o1+4] 807 ld [%o0+8], %o4 808 stw %o4, [%o1+8] 809 ld [%o0+12], %o4 810 stw %o4, [%o1+12] 811 ld [%o0+16], %o4 812 stw %o4, [%o1+16] 813 ld [%o0+20], %o4 814 subcc %o2, 32, %o2 ! decrement length count 815 stw %o4, [%o1+20] 816 ld [%o0+24], %o4 817 add %o0, 32, %o0 ! increase src ptr by 32 818 stw %o4, [%o1+24] 819 ld [%o0-4], %o4 820 add %o1, 32, %o1 ! increase dst ptr by 32 821 bgu,pt %ncc, .bc_medw32 ! repeat if at least 32 bytes left 822 stw %o4, [%o1-4] 823.bc_medw31: 824 addcc %o2, 24, %o2 ! adjust count to be off by 7 825 ble,pt %ncc, .bc_medw7 ! skip if 7 or fewer bytes left 826 nop ! 827.bc_medw15: 828 ld [%o0], %o4 ! move a block of 8 bytes 829 subcc %o2, 8, %o2 ! decrement length count 830 stw %o4, [%o1] 831 add %o0, 8, %o0 ! increase src ptr by 8 832 ld [%o0-4], %o4 833 add %o1, 8, %o1 ! increase dst ptr by 8 834 bgu,pt %ncc, .bc_medw15 835 stw %o4, [%o1-4] 836.bc_medw7: 837 addcc %o2, 7, %o2 ! finish adjustment of remaining count 838 bz,pt %ncc, .bc_smallx ! exit if finished 839 cmp %o2, 4 840 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left 841 nop ! 842 ld [%o0], %o4 ! move 4 bytes 843 add %o0, 4, %o0 ! increase src ptr by 4 844 add %o1, 4, %o1 ! increase dst ptr by 4 845 subcc %o2, 4, %o2 ! decrease count by 4 846 bnz .bc_small3x 847 stw %o4, [%o1-4] 848 ba .bc_smallx 849 nop 850 851.bc_medhalf: 852 subcc %o2, 31, %o2 ! adjust length to allow cc test 853 ble,pt %ncc, .bc_medh31 854 nop 855.bc_medh32: ! load and store block of 32 bytes 856 subcc %o2, 32, %o2 ! decrement length count 857 858 lduh [%o0], %o4 ! move 32 bytes 859 lduw [%o0+2], %o3 860 sllx %o4, 48, %o4 861 sllx %o3, 16, %o3 862 or %o4, %o3, %o3 863 lduh [%o0+6], %o4 864 or %o4, %o3, %o4 865 stx %o4, [%o1] 866 867 lduh [%o0+8], %o4 868 lduw [%o0+10], %o3 869 sllx %o4, 48, %o4 870 sllx %o3, 16, %o3 871 or %o4, %o3, %o3 872 lduh [%o0+14], %o4 873 or %o4, %o3, %o4 874 stx %o4, [%o1+8] 875 876 lduh [%o0+16], %o4 877 lduw [%o0+18], %o3 878 sllx %o4, 48, %o4 879 sllx %o3, 16, %o3 880 or %o4, %o3, %o3 881 lduh [%o0+22], %o4 882 or %o4, %o3, %o4 883 stx %o4, [%o1+16] 884 885 add %o0, 32, %o0 ! increase src ptr by 32 886 add %o1, 32, %o1 ! increase dst ptr by 32 887 888 lduh [%o0-8], %o4 889 lduw [%o0-6], %o3 890 sllx %o4, 48, %o4 891 sllx %o3, 16, %o3 892 or %o4, %o3, %o3 893 lduh [%o0-2], %o4 894 or %o3, %o4, %o4 895 bgu,pt %ncc, .bc_medh32 ! repeat if at least 32 bytes left 896 stx %o4, [%o1-8] 897 898.bc_medh31: 899 addcc %o2, 24, %o2 ! adjust count to be off by 7 900 ble,pt %ncc, .bc_medh7 ! skip if 7 or fewer bytes left 901 nop ! 902.bc_medh15: 903 lduh [%o0], %o4 ! move 16 bytes 904 subcc %o2, 8, %o2 ! decrement length count 905 lduw [%o0+2], %o3 906 sllx %o4, 48, %o4 907 sllx %o3, 16, %o3 908 or %o4, %o3, %o3 909 add %o1, 8, %o1 ! increase dst ptr by 8 910 lduh [%o0+6], %o4 911 add %o0, 8, %o0 ! increase src ptr by 8 912 or %o4, %o3, %o4 913 bgu,pt %ncc, .bc_medh15 914 stx %o4, [%o1-8] 915.bc_medh7: 916 addcc %o2, 7, %o2 ! finish adjustment of remaining count 917 bz,pt %ncc, .bc_smallx ! exit if finished 918 cmp %o2, 4 919 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left 920 nop ! 921 lduh [%o0], %o4 922 sll %o4, 16, %o4 923 lduh [%o0+2], %o3 924 or %o3, %o4, %o4 925 subcc %o2, 4, %o2 926 add %o0, 4, %o0 927 add %o1, 4, %o1 928 bnz .bc_small3x 929 stw %o4, [%o1-4] 930 ba .bc_smallx 931 nop 932 933 .align 16 934.bc_med_byte: 935 bnz,pt %ncc, .bc_medbh32a ! go to correct byte move 936 subcc %o2, 31, %o2 ! adjust length to allow cc test 937 ble,pt %ncc, .bc_medb31 938 nop 939.bc_medb32: ! Alignment 1 or 5 940 subcc %o2, 32, %o2 ! decrement length count 941 942 ldub [%o0], %o4 ! load and store a block of 32 bytes 943 sllx %o4, 56, %o3 944 lduh [%o0+1], %o4 945 sllx %o4, 40, %o4 946 or %o4, %o3, %o3 947 lduw [%o0+3], %o4 948 sllx %o4, 8, %o4 949 or %o4, %o3, %o3 950 ldub [%o0+7], %o4 951 or %o4, %o3, %o4 952 stx %o4, [%o1] 953 954 ldub [%o0+8], %o4 955 sllx %o4, 56, %o3 956 lduh [%o0+9], %o4 957 sllx %o4, 40, %o4 958 or %o4, %o3, %o3 959 lduw [%o0+11], %o4 960 sllx %o4, 8, %o4 961 or %o4, %o3, %o3 962 ldub [%o0+15], %o4 963 or %o4, %o3, %o4 964 stx %o4, [%o1+8] 965 966 ldub [%o0+16], %o4 967 sllx %o4, 56, %o3 968 lduh [%o0+17], %o4 969 sllx %o4, 40, %o4 970 or %o4, %o3, %o3 971 lduw [%o0+19], %o4 972 sllx %o4, 8, %o4 973 or %o4, %o3, %o3 974 ldub [%o0+23], %o4 975 or %o4, %o3, %o4 976 stx %o4, [%o1+16] 977 978 add %o0, 32, %o0 ! increase src ptr by 32 979 add %o1, 32, %o1 ! increase dst ptr by 32 980 981 ldub [%o0-8], %o4 982 sllx %o4, 56, %o3 983 lduh [%o0-7], %o4 984 sllx %o4, 40, %o4 985 or %o4, %o3, %o3 986 lduw [%o0-5], %o4 987 sllx %o4, 8, %o4 988 or %o4, %o3, %o3 989 ldub [%o0-1], %o4 990 or %o4, %o3, %o4 991 bgu,pt %ncc, .bc_medb32 ! repeat if at least 32 bytes left 992 stx %o4, [%o1-8] 993 994.bc_medb31: ! 31 or fewer bytes remaining 995 addcc %o2, 24, %o2 ! adjust count to be off by 7 996 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left 997 nop ! 998.bc_medb15: 999 1000 ldub [%o0], %o4 ! load and store a block of 8 bytes 1001 subcc %o2, 8, %o2 ! decrement length count 1002 sllx %o4, 56, %o3 1003 lduh [%o0+1], %o4 1004 sllx %o4, 40, %o4 1005 or %o4, %o3, %o3 1006 lduw [%o0+3], %o4 1007 add %o1, 8, %o1 ! increase dst ptr by 16 1008 sllx %o4, 8, %o4 1009 or %o4, %o3, %o3 1010 ldub [%o0+7], %o4 1011 add %o0, 8, %o0 ! increase src ptr by 16 1012 or %o4, %o3, %o4 1013 bgu,pt %ncc, .bc_medb15 1014 stx %o4, [%o1-8] 1015.bc_medb7: 1016 addcc %o2, 7, %o2 ! finish adjustment of remaining count 1017 bz,pt %ncc, .bc_smallx ! exit if finished 1018 cmp %o2, 4 1019 blt,pt %ncc, .bc_small3x ! skip if less than 4 bytes left 1020 nop ! 1021 ldub [%o0], %o4 ! move 4 bytes 1022 sll %o4, 24, %o3 1023 lduh [%o0+1], %o4 1024 sll %o4, 8, %o4 1025 or %o4, %o3, %o3 1026 ldub [%o0+3], %o4 1027 or %o4, %o3, %o4 1028 subcc %o2, 4, %o2 1029 add %o0, 4, %o0 1030 add %o1, 4, %o1 1031 bnz .bc_small3x 1032 stw %o4, [%o1-4] 1033 ba .bc_smallx 1034 nop 1035 1036 .align 16 1037.bc_medbh32a: ! Alignment 3 or 7 1038 ble,pt %ncc, .bc_medbh31 1039 nop 1040.bc_medbh32: ! Alignment 3 or 7 1041 subcc %o2, 32, %o2 ! decrement length count 1042 1043 ldub [%o0], %o4 ! load and store a block of 32 bytes 1044 sllx %o4, 56, %o3 1045 lduw [%o0+1], %o4 1046 sllx %o4, 24, %o4 1047 or %o4, %o3, %o3 1048 lduh [%o0+5], %o4 1049 sllx %o4, 8, %o4 1050 or %o4, %o3, %o3 1051 ldub [%o0+7], %o4 1052 or %o4, %o3, %o4 1053 stx %o4, [%o1] 1054 1055 ldub [%o0+8], %o4 1056 sllx %o4, 56, %o3 1057 lduw [%o0+9], %o4 1058 sllx %o4, 24, %o4 1059 or %o4, %o3, %o3 1060 lduh [%o0+13], %o4 1061 sllx %o4, 8, %o4 1062 or %o4, %o3, %o3 1063 ldub [%o0+15], %o4 1064 or %o4, %o3, %o4 1065 stx %o4, [%o1+8] 1066 1067 ldub [%o0+16], %o4 1068 sllx %o4, 56, %o3 1069 lduw [%o0+17], %o4 1070 sllx %o4, 24, %o4 1071 or %o4, %o3, %o3 1072 lduh [%o0+21], %o4 1073 sllx %o4, 8, %o4 1074 or %o4, %o3, %o3 1075 ldub [%o0+23], %o4 1076 or %o4, %o3, %o4 1077 stx %o4, [%o1+16] 1078 1079 add %o0, 32, %o0 ! increase src ptr by 32 1080 add %o1, 32, %o1 ! increase dst ptr by 32 1081 1082 ldub [%o0-8], %o4 1083 sllx %o4, 56, %o3 1084 lduw [%o0-7], %o4 1085 sllx %o4, 24, %o4 1086 or %o4, %o3, %o3 1087 lduh [%o0-3], %o4 1088 sllx %o4, 8, %o4 1089 or %o4, %o3, %o3 1090 ldub [%o0-1], %o4 1091 or %o4, %o3, %o4 1092 bgu,pt %ncc, .bc_medbh32 ! repeat if at least 32 bytes left 1093 stx %o4, [%o1-8] 1094 1095.bc_medbh31: 1096 addcc %o2, 24, %o2 ! adjust count to be off by 7 1097 ble,pt %ncc, .bc_medb7 ! skip if 7 or fewer bytes left 1098 nop ! 1099.bc_medbh15: 1100 ldub [%o0], %o4 ! load and store a block of 8 bytes 1101 sllx %o4, 56, %o3 1102 lduw [%o0+1], %o4 1103 sllx %o4, 24, %o4 1104 or %o4, %o3, %o3 1105 lduh [%o0+5], %o4 1106 sllx %o4, 8, %o4 1107 or %o4, %o3, %o3 1108 ldub [%o0+7], %o4 1109 or %o4, %o3, %o4 1110 stx %o4, [%o1] 1111 subcc %o2, 8, %o2 ! decrement length count 1112 add %o1, 8, %o1 ! increase dst ptr by 8 1113 add %o0, 8, %o0 ! increase src ptr by 8 1114 bgu,pt %ncc, .bc_medbh15 1115 stx %o4, [%o1-8] 1116 ba .bc_medb7 1117 nop 1118 1119 SET_SIZE(bcopy) 1120/* 1121 * The _more entry points are not intended to be used directly by 1122 * any caller from outside this file. They are provided to allow 1123 * profiling and dtrace of the portions of the copy code that uses 1124 * the floating point registers. 1125*/ 1126 ENTRY(bcopy_more) 1127.bcopy_more: 1128 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 1129 ldn [THREAD_REG + T_LOFAULT], %o5 ! save existing handler 1130 brz,pt %o5, .do_copy 1131 nop 1132 sethi %hi(.copyerr), %l7 ! copyerr is lofault value 1133 or %l7, %lo(.copyerr), %l7 1134 membar #Sync ! sync error barrier 1135 stn %l7, [THREAD_REG + T_LOFAULT] ! set t_lofault 1136 ! We've already captured whether t_lofault was zero on entry. 1137 ! We need to mark ourselves as being from bcopy since both 1138 ! kcopy and bcopy use the same code path. If LOFAULT_SET is 1139 ! set and the saved lofault was zero, we won't reset lofault on 1140 ! returning. 1141 or %o5, LOFAULT_SET, %o5 1142.do_copy: 1143 ldn [THREAD_REG + T_LWP], %o3 1144 brnz,pt %o3, 1f 1145 nop 1146/* 1147 * kpreempt_disable(); 1148 */ 1149 ldsb [THREAD_REG +T_PREEMPT], %o3 1150 inc %o3 1151 stb %o3, [THREAD_REG + T_PREEMPT] 11521: 1153/* 1154 * Following code is for large copies. We know there is at 1155 * least FP_COPY bytes available. FP regs are used, so 1156 * we save registers and fp regs before starting 1157 */ 1158 rd %fprs, %g5 ! check for unused fp 1159 or %o5,FPUSED_FLAG,%o5 1160 ! if fprs.fef == 0, set it. 1161 ! Setting it when already set costs more than checking 1162 andcc %g5, FPRS_FEF, %g5 ! test FEF, fprs.du = fprs.dl = 0 1163 bz,pt %ncc, .bc_fp_unused 1164 prefetch [%i0 + (1 * CACHE_LINE)], #one_read 1165 BST_FP_TOSTACK(%o3) 1166 ba .bc_fp_ready 1167.bc_fp_unused: 1168 andcc %i1, 1, %o3 ! is dest byte aligned 1169 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 1170.bc_fp_ready: 1171 rd %gsr, %l5 ! save %gsr value 1172 bnz,pt %ncc, .bc_big_d1 1173.bc_big_d1f: ! dest is now half word aligned 1174 andcc %i1, 2, %o3 1175 bnz,pt %ncc, .bc_big_d2 1176.bc_big_d2f: ! dest is now word aligned 1177 andcc %i1, 4, %o3 1178 bnz,pt %ncc, .bc_big_d4 1179.bc_big_d4f: ! dest is now long word aligned 1180 andcc %i0, 7, %o3 ! is src long word aligned 1181 brnz,pt %o3, .bc_big_unal8 1182 prefetch [%i0 + (2 * CACHE_LINE)], #one_read 1183 1184 ! Src and dst are long word aligned 1185 ! align dst to 64 byte boundary 1186 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 1187 brz,pn %o3, .bc_al_to_64 1188 nop 1189 sub %o3, 64, %o3 ! %o3 has negative bytes to move 1190 add %i2, %o3, %i2 ! adjust remaining count 1191 andcc %o3, 8, %o4 ! odd long words to move? 1192 brz,pt %o4, .bc_al_to_16 1193 nop 1194 add %o3, 8, %o3 1195 ldx [%i0], %o4 1196 add %i0, 8, %i0 ! increment src ptr 1197 add %i1, 8, %i1 ! increment dst ptr 1198 stx %o4, [%i1-8] 1199! Dest is aligned on 16 bytes, src 8 byte aligned 1200.bc_al_to_16: 1201 andcc %o3, 0x30, %o4 ! pair of long words to move? 1202 brz,pt %o4, .bc_al_to_64 1203 nop 1204.bc_al_mv_16: 1205 add %o3, 16, %o3 1206 ldx [%i0], %o4 1207 stx %o4, [%i1] 1208 ldx [%i0+8], %o4 1209 add %i0, 16, %i0 ! increment src ptr 1210 stx %o4, [%i1+8] 1211 andcc %o3, 48, %o4 1212 brnz,pt %o4, .bc_al_mv_16 1213 add %i1, 16, %i1 ! increment dst ptr 1214! Dest is aligned on 64 bytes, src 8 byte aligned 1215.bc_al_to_64: 1216 ! Determine source alignment 1217 ! to correct 8 byte offset 1218 andcc %i0, 32, %o3 1219 brnz,pn %o3, .bc_aln_1 1220 andcc %i0, 16, %o3 1221 brnz,pn %o3, .bc_aln_01 1222 andcc %i0, 8, %o3 1223 brz,pn %o3, .bc_aln_000 1224 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 1225 ba .bc_aln_001 1226 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1227 1228.bc_aln_01: 1229 brnz,pn %o3, .bc_aln_011 1230 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 1231 ba .bc_aln_010 1232 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1233.bc_aln_1: 1234 andcc %i0, 16, %o3 1235 brnz,pn %o3, .bc_aln_11 1236 andcc %i0, 8, %o3 1237 brnz,pn %o3, .bc_aln_101 1238 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 1239 ba .bc_aln_100 1240 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1241.bc_aln_11: 1242 brz,pn %o3, .bc_aln_110 1243 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 1244 1245.bc_aln_111: 1246! Alignment off by 8 bytes 1247 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1248 ldd [%i0], %d0 1249 add %i0, 8, %i0 1250 sub %i2, 8, %i2 1251 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1252 and %i2, 0x7f, %i2 ! residue bytes in %i2 1253 sub %i1, %i0, %i1 1254.bc_aln_111_loop: 1255 ldda [%i0]ASI_BLK_P,%d16 ! block load 1256 subcc %o3, 64, %o3 1257 fmovd %d16, %d2 1258 fmovd %d18, %d4 1259 fmovd %d20, %d6 1260 fmovd %d22, %d8 1261 fmovd %d24, %d10 1262 fmovd %d26, %d12 1263 fmovd %d28, %d14 1264 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1265 stda %d0,[%i0+%i1]ASI_BLK_P 1266 add %i0, 64, %i0 1267 fmovd %d30, %d0 1268 bgt,pt %ncc, .bc_aln_111_loop 1269 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1270 add %i1, %i0, %i1 1271 1272 std %d0, [%i1] 1273 ba .bc_remain_stuff 1274 add %i1, 8, %i1 1275 ! END OF aln_111 1276 1277.bc_aln_110: 1278! Alignment off by 16 bytes 1279 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1280 ldd [%i0], %d0 1281 ldd [%i0+8], %d2 1282 add %i0, 16, %i0 1283 sub %i2, 16, %i2 1284 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1285 and %i2, 0x7f, %i2 ! residue bytes in %i2 1286 sub %i1, %i0, %i1 1287.bc_aln_110_loop: 1288 ldda [%i0]ASI_BLK_P,%d16 ! block load 1289 subcc %o3, 64, %o3 1290 fmovd %d16, %d4 1291 fmovd %d18, %d6 1292 fmovd %d20, %d8 1293 fmovd %d22, %d10 1294 fmovd %d24, %d12 1295 fmovd %d26, %d14 1296 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1297 stda %d0,[%i0+%i1]ASI_BLK_P 1298 add %i0, 64, %i0 1299 fmovd %d28, %d0 1300 fmovd %d30, %d2 1301 bgt,pt %ncc, .bc_aln_110_loop 1302 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1303 add %i1, %i0, %i1 1304 1305 std %d0, [%i1] 1306 std %d2, [%i1+8] 1307 ba .bc_remain_stuff 1308 add %i1, 16, %i1 1309 ! END OF aln_110 1310 1311.bc_aln_101: 1312! Alignment off by 24 bytes 1313 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1314 ldd [%i0], %d0 1315 ldd [%i0+8], %d2 1316 ldd [%i0+16], %d4 1317 add %i0, 24, %i0 1318 sub %i2, 24, %i2 1319 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1320 and %i2, 0x7f, %i2 ! residue bytes in %i2 1321 sub %i1, %i0, %i1 1322.bc_aln_101_loop: 1323 ldda [%i0]ASI_BLK_P,%d16 ! block load 1324 subcc %o3, 64, %o3 1325 fmovd %d16, %d6 1326 fmovd %d18, %d8 1327 fmovd %d20, %d10 1328 fmovd %d22, %d12 1329 fmovd %d24, %d14 1330 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1331 stda %d0,[%i0+%i1]ASI_BLK_P 1332 add %i0, 64, %i0 1333 fmovd %d26, %d0 1334 fmovd %d28, %d2 1335 fmovd %d30, %d4 1336 bgt,pt %ncc, .bc_aln_101_loop 1337 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1338 add %i1, %i0, %i1 1339 1340 std %d0, [%i1] 1341 std %d2, [%i1+8] 1342 std %d4, [%i1+16] 1343 ba .bc_remain_stuff 1344 add %i1, 24, %i1 1345 ! END OF aln_101 1346 1347.bc_aln_100: 1348! Alignment off by 32 bytes 1349 ldd [%i0], %d0 1350 ldd [%i0+8], %d2 1351 ldd [%i0+16],%d4 1352 ldd [%i0+24],%d6 1353 add %i0, 32, %i0 1354 sub %i2, 32, %i2 1355 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1356 and %i2, 0x7f, %i2 ! residue bytes in %i2 1357 sub %i1, %i0, %i1 1358.bc_aln_100_loop: 1359 ldda [%i0]ASI_BLK_P,%d16 ! block load 1360 subcc %o3, 64, %o3 1361 fmovd %d16, %d8 1362 fmovd %d18, %d10 1363 fmovd %d20, %d12 1364 fmovd %d22, %d14 1365 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1366 stda %d0,[%i0+%i1]ASI_BLK_P 1367 add %i0, 64, %i0 1368 fmovd %d24, %d0 1369 fmovd %d26, %d2 1370 fmovd %d28, %d4 1371 fmovd %d30, %d6 1372 bgt,pt %ncc, .bc_aln_100_loop 1373 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1374 add %i1, %i0, %i1 1375 1376 std %d0, [%i1] 1377 std %d2, [%i1+8] 1378 std %d4, [%i1+16] 1379 std %d6, [%i1+24] 1380 ba .bc_remain_stuff 1381 add %i1, 32, %i1 1382 ! END OF aln_100 1383 1384.bc_aln_011: 1385! Alignment off by 40 bytes 1386 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1387 ldd [%i0], %d0 1388 ldd [%i0+8], %d2 1389 ldd [%i0+16], %d4 1390 ldd [%i0+24], %d6 1391 ldd [%i0+32], %d8 1392 add %i0, 40, %i0 1393 sub %i2, 40, %i2 1394 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1395 and %i2, 0x7f, %i2 ! residue bytes in %i2 1396 sub %i1, %i0, %i1 1397.bc_aln_011_loop: 1398 ldda [%i0]ASI_BLK_P,%d16 ! block load 1399 subcc %o3, 64, %o3 1400 fmovd %d16, %d10 1401 fmovd %d18, %d12 1402 fmovd %d20, %d14 1403 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1404 stda %d0,[%i0+%i1]ASI_BLK_P 1405 add %i0, 64, %i0 1406 fmovd %d22, %d0 1407 fmovd %d24, %d2 1408 fmovd %d26, %d4 1409 fmovd %d28, %d6 1410 fmovd %d30, %d8 1411 bgt,pt %ncc, .bc_aln_011_loop 1412 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1413 add %i1, %i0, %i1 1414 1415 std %d0, [%i1] 1416 std %d2, [%i1+8] 1417 std %d4, [%i1+16] 1418 std %d6, [%i1+24] 1419 std %d8, [%i1+32] 1420 ba .bc_remain_stuff 1421 add %i1, 40, %i1 1422 ! END OF aln_011 1423 1424.bc_aln_010: 1425! Alignment off by 48 bytes 1426 ldd [%i0], %d0 1427 ldd [%i0+8], %d2 1428 ldd [%i0+16], %d4 1429 ldd [%i0+24], %d6 1430 ldd [%i0+32], %d8 1431 ldd [%i0+40], %d10 1432 add %i0, 48, %i0 1433 sub %i2, 48, %i2 1434 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1435 and %i2, 0x7f, %i2 ! residue bytes in %i2 1436 sub %i1, %i0, %i1 1437.bc_aln_010_loop: 1438 ldda [%i0]ASI_BLK_P,%d16 ! block load 1439 subcc %o3, 64, %o3 1440 fmovd %d16, %d12 1441 fmovd %d18, %d14 1442 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1443 stda %d0,[%i0+%i1]ASI_BLK_P 1444 add %i0, 64, %i0 1445 fmovd %d20, %d0 1446 fmovd %d22, %d2 1447 fmovd %d24, %d4 1448 fmovd %d26, %d6 1449 fmovd %d28, %d8 1450 fmovd %d30, %d10 1451 bgt,pt %ncc, .bc_aln_010_loop 1452 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1453 add %i1, %i0, %i1 1454 1455 std %d0, [%i1] 1456 std %d2, [%i1+8] 1457 std %d4, [%i1+16] 1458 std %d6, [%i1+24] 1459 std %d8, [%i1+32] 1460 std %d10, [%i1+40] 1461 ba .bc_remain_stuff 1462 add %i1, 48, %i1 1463 ! END OF aln_010 1464 1465.bc_aln_001: 1466! Alignment off by 56 bytes 1467 ldd [%i0], %d0 1468 ldd [%i0+8], %d2 1469 ldd [%i0+16], %d4 1470 ldd [%i0+24], %d6 1471 ldd [%i0+32], %d8 1472 ldd [%i0+40], %d10 1473 ldd [%i0+48], %d12 1474 add %i0, 56, %i0 1475 sub %i2, 56, %i2 1476 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1477 and %i2, 0x7f, %i2 ! residue bytes in %i2 1478 sub %i1, %i0, %i1 1479.bc_aln_001_loop: 1480 ldda [%i0]ASI_BLK_P,%d16 ! block load 1481 subcc %o3, 64, %o3 1482 fmovd %d16, %d14 1483 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1484 stda %d0,[%i0+%i1]ASI_BLK_P 1485 add %i0, 64, %i0 1486 fmovd %d18, %d0 1487 fmovd %d20, %d2 1488 fmovd %d22, %d4 1489 fmovd %d24, %d6 1490 fmovd %d26, %d8 1491 fmovd %d28, %d10 1492 fmovd %d30, %d12 1493 bgt,pt %ncc, .bc_aln_001_loop 1494 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1495 add %i1, %i0, %i1 1496 1497 std %d0, [%i1] 1498 std %d2, [%i1+8] 1499 std %d4, [%i1+16] 1500 std %d6, [%i1+24] 1501 std %d8, [%i1+32] 1502 std %d10, [%i1+40] 1503 std %d12, [%i1+48] 1504 ba .bc_remain_stuff 1505 add %i1, 56, %i1 1506 ! END OF aln_001 1507 1508.bc_aln_000: 1509 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1510 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 1511 and %i2, 0x7f, %i2 ! residue bytes in %i2 1512 sub %i1, %i0, %i1 1513.bc_aln_000_loop: 1514 ldda [%i0]ASI_BLK_P,%d0 1515 subcc %o3, 64, %o3 1516 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 1517 stda %d0,[%i0+%i1]ASI_BLK_P 1518 add %i0, 64, %i0 1519 bgt,pt %ncc, .bc_aln_000_loop 1520 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1521 add %i1, %i0, %i1 1522 1523 ! END OF aln_000 1524 1525.bc_remain_stuff: 1526 subcc %i2, 31, %i2 ! adjust length to allow cc test 1527 ble,pt %ncc, .bc_aln_31 1528 nop 1529.bc_aln_32: 1530 ldx [%i0], %o4 ! move 32 bytes 1531 subcc %i2, 32, %i2 ! decrement length count by 32 1532 stx %o4, [%i1] 1533 ldx [%i0+8], %o4 1534 stx %o4, [%i1+8] 1535 ldx [%i0+16], %o4 1536 add %i0, 32, %i0 ! increase src ptr by 32 1537 stx %o4, [%i1+16] 1538 ldx [%i0-8], %o4 1539 add %i1, 32, %i1 ! increase dst ptr by 32 1540 bgu,pt %ncc, .bc_aln_32 ! repeat if at least 32 bytes left 1541 stx %o4, [%i1-8] 1542.bc_aln_31: 1543 addcc %i2, 24, %i2 ! adjust count to be off by 7 1544 ble,pt %ncc, .bc_aln_7 ! skip if 7 or fewer bytes left 1545 nop ! 1546.bc_aln_15: 1547 ldx [%i0], %o4 ! move 8 bytes 1548 add %i0, 8, %i0 ! increase src ptr by 8 1549 subcc %i2, 8, %i2 ! decrease count by 8 1550 add %i1, 8, %i1 ! increase dst ptr by 8 1551 bgu,pt %ncc, .bc_aln_15 1552 stx %o4, [%i1-8] ! 1553.bc_aln_7: 1554 addcc %i2, 7, %i2 ! finish adjustment of remaining count 1555 bz,pt %ncc, .bc_exit ! exit if finished 1556 cmp %i2, 4 1557 blt,pt %ncc, .bc_unaln3x ! skip if less than 4 bytes left 1558 nop ! 1559 ld [%i0], %o4 ! move 4 bytes 1560 add %i0, 4, %i0 ! increase src ptr by 4 1561 add %i1, 4, %i1 ! increase dst ptr by 4 1562 subcc %i2, 4, %i2 ! decrease count by 4 1563 bnz .bc_unaln3x 1564 stw %o4, [%i1-4] 1565 ba .bc_exit 1566 nop 1567 1568 ! destination alignment code 1569.bc_big_d1: 1570 ldub [%i0], %o4 ! move a byte 1571 add %i0, 1, %i0 1572 stb %o4, [%i1] 1573 add %i1, 1, %i1 1574 andcc %i1, 2, %o3 1575 bz,pt %ncc, .bc_big_d2f 1576 sub %i2, 1, %i2 1577.bc_big_d2: 1578 ldub [%i0], %o4 ! move a half-word (src align unknown) 1579 ldub [%i0+1], %o3 1580 add %i0, 2, %i0 1581 sll %o4, 8, %o4 ! position 1582 or %o4, %o3, %o4 ! merge 1583 sth %o4, [%i1] 1584 add %i1, 2, %i1 1585 andcc %i1, 4, %o3 1586 bz,pt %ncc, .bc_big_d4f 1587 sub %i2, 2, %i2 1588.bc_big_d4: 1589 ldub [%i0], %o4 ! move a word (src align unknown) 1590 ldub [%i0+1], %o3 1591 sll %o4, 24, %o4 ! position 1592 sll %o3, 16, %o3 ! position 1593 or %o4, %o3, %o3 ! merge 1594 ldub [%i0+2], %o4 1595 sll %o4, 8, %o4 ! position 1596 or %o4, %o3, %o3 ! merge 1597 ldub [%i0+3], %o4 1598 or %o4, %o3, %o4 ! merge 1599 stw %o4,[%i1] ! store four bytes 1600 add %i0, 4, %i0 ! adjust src by 4 1601 add %i1, 4, %i1 ! adjust dest by 4 1602 ba .bc_big_d4f 1603 sub %i2, 4, %i2 ! adjust count by 4 1604 1605 1606 ! Dst is on 8 byte boundary; src is not; 1607.bc_big_unal8: 1608 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned? 1609 bz %ncc, .bc_unalnsrc 1610 sub %o3, 64, %o3 ! %o3 will be multiple of 8 1611 neg %o3 ! bytes until dest is 64 byte aligned 1612 sub %i2, %o3, %i2 ! update cnt with bytes to be moved 1613 ! Move bytes according to source alignment 1614 andcc %i0, 0x1, %o4 1615 bnz %ncc, .bc_unalnbyte ! check for byte alignment 1616 nop 1617 andcc %i0, 2, %o4 ! check for half word alignment 1618 bnz %ncc, .bc_unalnhalf 1619 nop 1620 ! Src is word aligned, move bytes until dest 64 byte aligned 1621.bc_unalnword: 1622 ld [%i0], %o4 ! load 4 bytes 1623 stw %o4, [%i1] ! and store 4 bytes 1624 ld [%i0+4], %o4 ! load 4 bytes 1625 add %i0, 8, %i0 ! increase src ptr by 8 1626 stw %o4, [%i1+4] ! and store 4 bytes 1627 subcc %o3, 8, %o3 ! decrease count by 8 1628 bnz %ncc, .bc_unalnword 1629 add %i1, 8, %i1 ! increase dst ptr by 8 1630 ba .bc_unalnsrc 1631 nop 1632 1633 ! Src is half-word aligned, move bytes until dest 64 byte aligned 1634.bc_unalnhalf: 1635 lduh [%i0], %o4 ! load 2 bytes 1636 sllx %o4, 32, %i3 ! shift left 1637 lduw [%i0+2], %o4 1638 or %o4, %i3, %i3 1639 sllx %i3, 16, %i3 1640 lduh [%i0+6], %o4 1641 or %o4, %i3, %i3 1642 stx %i3, [%i1] 1643 add %i0, 8, %i0 1644 subcc %o3, 8, %o3 1645 bnz %ncc, .bc_unalnhalf 1646 add %i1, 8, %i1 1647 ba .bc_unalnsrc 1648 nop 1649 1650 ! Src is Byte aligned, move bytes until dest 64 byte aligned 1651.bc_unalnbyte: 1652 sub %i1, %i0, %i1 ! share pointer advance 1653.bc_unalnbyte_loop: 1654 ldub [%i0], %o4 1655 sllx %o4, 56, %i3 1656 lduh [%i0+1], %o4 1657 sllx %o4, 40, %o4 1658 or %o4, %i3, %i3 1659 lduh [%i0+3], %o4 1660 sllx %o4, 24, %o4 1661 or %o4, %i3, %i3 1662 lduh [%i0+5], %o4 1663 sllx %o4, 8, %o4 1664 or %o4, %i3, %i3 1665 ldub [%i0+7], %o4 1666 or %o4, %i3, %i3 1667 stx %i3, [%i1+%i0] 1668 subcc %o3, 8, %o3 1669 bnz %ncc, .bc_unalnbyte_loop 1670 add %i0, 8, %i0 1671 add %i1,%i0, %i1 ! restore pointer 1672 1673 ! Destination is now block (64 byte aligned), src is not 8 byte aligned 1674.bc_unalnsrc: 1675 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size 1676 and %i2, 0x3f, %i2 ! residue bytes in %i2 1677 add %i2, 64, %i2 ! Insure we don't load beyond 1678 sub %i3, 64, %i3 ! end of source buffer 1679 1680 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address 1681 prefetch [%o4 + (3 * CACHE_LINE)], #one_read 1682 alignaddr %i0, %g0, %g0 ! generate %gsr 1683 add %i0, %i3, %i0 ! advance %i0 to after blocks 1684 ! 1685 ! Determine source alignment to correct 8 byte offset 1686 andcc %i0, 0x20, %o3 1687 brnz,pn %o3, .bc_unaln_1 1688 andcc %i0, 0x10, %o3 1689 brnz,pn %o3, .bc_unaln_01 1690 andcc %i0, 0x08, %o3 1691 brz,a %o3, .bc_unaln_000 1692 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1693 ba .bc_unaln_001 1694 nop 1695.bc_unaln_01: 1696 brnz,a %o3, .bc_unaln_011 1697 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1698 ba .bc_unaln_010 1699 nop 1700.bc_unaln_1: 1701 brnz,pn %o3, .bc_unaln_11 1702 andcc %i0, 0x08, %o3 1703 brnz,a %o3, .bc_unaln_101 1704 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1705 ba .bc_unaln_100 1706 nop 1707.bc_unaln_11: 1708 brz,pn %o3, .bc_unaln_110 1709 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 1710 1711.bc_unaln_111: 1712 ldd [%o4+56], %d14 1713.bc_unaln_111_loop: 1714 add %o4, 64, %o4 1715 ldda [%o4]ASI_BLK_P, %d16 1716 faligndata %d14, %d16, %d48 1717 faligndata %d16, %d18, %d50 1718 faligndata %d18, %d20, %d52 1719 faligndata %d20, %d22, %d54 1720 faligndata %d22, %d24, %d56 1721 faligndata %d24, %d26, %d58 1722 faligndata %d26, %d28, %d60 1723 faligndata %d28, %d30, %d62 1724 fmovd %d30, %d14 1725 stda %d48, [%i1]ASI_BLK_P 1726 subcc %i3, 64, %i3 1727 add %i1, 64, %i1 1728 bgu,pt %ncc, .bc_unaln_111_loop 1729 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1730 ba .bc_unaln_done 1731 nop 1732 1733.bc_unaln_110: 1734 ldd [%o4+48], %d12 1735 ldd [%o4+56], %d14 1736.bc_unaln_110_loop: 1737 add %o4, 64, %o4 1738 ldda [%o4]ASI_BLK_P, %d16 1739 faligndata %d12, %d14, %d48 1740 faligndata %d14, %d16, %d50 1741 faligndata %d16, %d18, %d52 1742 faligndata %d18, %d20, %d54 1743 faligndata %d20, %d22, %d56 1744 faligndata %d22, %d24, %d58 1745 faligndata %d24, %d26, %d60 1746 faligndata %d26, %d28, %d62 1747 fmovd %d28, %d12 1748 fmovd %d30, %d14 1749 stda %d48, [%i1]ASI_BLK_P 1750 subcc %i3, 64, %i3 1751 add %i1, 64, %i1 1752 bgu,pt %ncc, .bc_unaln_110_loop 1753 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1754 ba .bc_unaln_done 1755 nop 1756 1757.bc_unaln_101: 1758 ldd [%o4+40], %d10 1759 ldd [%o4+48], %d12 1760 ldd [%o4+56], %d14 1761.bc_unaln_101_loop: 1762 add %o4, 64, %o4 1763 ldda [%o4]ASI_BLK_P, %d16 1764 faligndata %d10, %d12, %d48 1765 faligndata %d12, %d14, %d50 1766 faligndata %d14, %d16, %d52 1767 faligndata %d16, %d18, %d54 1768 faligndata %d18, %d20, %d56 1769 faligndata %d20, %d22, %d58 1770 faligndata %d22, %d24, %d60 1771 faligndata %d24, %d26, %d62 1772 fmovd %d26, %d10 1773 fmovd %d28, %d12 1774 fmovd %d30, %d14 1775 stda %d48, [%i1]ASI_BLK_P 1776 subcc %i3, 64, %i3 1777 add %i1, 64, %i1 1778 bgu,pt %ncc, .bc_unaln_101_loop 1779 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1780 ba .bc_unaln_done 1781 nop 1782 1783.bc_unaln_100: 1784 ldd [%o4+32], %d8 1785 ldd [%o4+40], %d10 1786 ldd [%o4+48], %d12 1787 ldd [%o4+56], %d14 1788.bc_unaln_100_loop: 1789 add %o4, 64, %o4 1790 ldda [%o4]ASI_BLK_P, %d16 1791 faligndata %d8, %d10, %d48 1792 faligndata %d10, %d12, %d50 1793 faligndata %d12, %d14, %d52 1794 faligndata %d14, %d16, %d54 1795 faligndata %d16, %d18, %d56 1796 faligndata %d18, %d20, %d58 1797 faligndata %d20, %d22, %d60 1798 faligndata %d22, %d24, %d62 1799 fmovd %d24, %d8 1800 fmovd %d26, %d10 1801 fmovd %d28, %d12 1802 fmovd %d30, %d14 1803 stda %d48, [%i1]ASI_BLK_P 1804 subcc %i3, 64, %i3 1805 add %i1, 64, %i1 1806 bgu,pt %ncc, .bc_unaln_100_loop 1807 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1808 ba .bc_unaln_done 1809 nop 1810 1811.bc_unaln_011: 1812 ldd [%o4+24], %d6 1813 ldd [%o4+32], %d8 1814 ldd [%o4+40], %d10 1815 ldd [%o4+48], %d12 1816 ldd [%o4+56], %d14 1817.bc_unaln_011_loop: 1818 add %o4, 64, %o4 1819 ldda [%o4]ASI_BLK_P, %d16 1820 faligndata %d6, %d8, %d48 1821 faligndata %d8, %d10, %d50 1822 faligndata %d10, %d12, %d52 1823 faligndata %d12, %d14, %d54 1824 faligndata %d14, %d16, %d56 1825 faligndata %d16, %d18, %d58 1826 faligndata %d18, %d20, %d60 1827 faligndata %d20, %d22, %d62 1828 fmovd %d22, %d6 1829 fmovd %d24, %d8 1830 fmovd %d26, %d10 1831 fmovd %d28, %d12 1832 fmovd %d30, %d14 1833 stda %d48, [%i1]ASI_BLK_P 1834 subcc %i3, 64, %i3 1835 add %i1, 64, %i1 1836 bgu,pt %ncc, .bc_unaln_011_loop 1837 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1838 ba .bc_unaln_done 1839 nop 1840 1841.bc_unaln_010: 1842 ldd [%o4+16], %d4 1843 ldd [%o4+24], %d6 1844 ldd [%o4+32], %d8 1845 ldd [%o4+40], %d10 1846 ldd [%o4+48], %d12 1847 ldd [%o4+56], %d14 1848.bc_unaln_010_loop: 1849 add %o4, 64, %o4 1850 ldda [%o4]ASI_BLK_P, %d16 1851 faligndata %d4, %d6, %d48 1852 faligndata %d6, %d8, %d50 1853 faligndata %d8, %d10, %d52 1854 faligndata %d10, %d12, %d54 1855 faligndata %d12, %d14, %d56 1856 faligndata %d14, %d16, %d58 1857 faligndata %d16, %d18, %d60 1858 faligndata %d18, %d20, %d62 1859 fmovd %d20, %d4 1860 fmovd %d22, %d6 1861 fmovd %d24, %d8 1862 fmovd %d26, %d10 1863 fmovd %d28, %d12 1864 fmovd %d30, %d14 1865 stda %d48, [%i1]ASI_BLK_P 1866 subcc %i3, 64, %i3 1867 add %i1, 64, %i1 1868 bgu,pt %ncc, .bc_unaln_010_loop 1869 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1870 ba .bc_unaln_done 1871 nop 1872 1873.bc_unaln_001: 1874 ldd [%o4+8], %d2 1875 ldd [%o4+16], %d4 1876 ldd [%o4+24], %d6 1877 ldd [%o4+32], %d8 1878 ldd [%o4+40], %d10 1879 ldd [%o4+48], %d12 1880 ldd [%o4+56], %d14 1881.bc_unaln_001_loop: 1882 add %o4, 64, %o4 1883 ldda [%o4]ASI_BLK_P, %d16 1884 faligndata %d2, %d4, %d48 1885 faligndata %d4, %d6, %d50 1886 faligndata %d6, %d8, %d52 1887 faligndata %d8, %d10, %d54 1888 faligndata %d10, %d12, %d56 1889 faligndata %d12, %d14, %d58 1890 faligndata %d14, %d16, %d60 1891 faligndata %d16, %d18, %d62 1892 fmovd %d18, %d2 1893 fmovd %d20, %d4 1894 fmovd %d22, %d6 1895 fmovd %d24, %d8 1896 fmovd %d26, %d10 1897 fmovd %d28, %d12 1898 fmovd %d30, %d14 1899 stda %d48, [%i1]ASI_BLK_P 1900 subcc %i3, 64, %i3 1901 add %i1, 64, %i1 1902 bgu,pt %ncc, .bc_unaln_001_loop 1903 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1904 ba .bc_unaln_done 1905 nop 1906 1907.bc_unaln_000: 1908 ldda [%o4]ASI_BLK_P, %d0 1909.bc_unaln_000_loop: 1910 add %o4, 64, %o4 1911 ldda [%o4]ASI_BLK_P, %d16 1912 faligndata %d0, %d2, %d48 1913 faligndata %d2, %d4, %d50 1914 faligndata %d4, %d6, %d52 1915 faligndata %d6, %d8, %d54 1916 faligndata %d8, %d10, %d56 1917 faligndata %d10, %d12, %d58 1918 faligndata %d12, %d14, %d60 1919 faligndata %d14, %d16, %d62 1920 fmovd %d16, %d0 1921 fmovd %d18, %d2 1922 fmovd %d20, %d4 1923 fmovd %d22, %d6 1924 fmovd %d24, %d8 1925 fmovd %d26, %d10 1926 fmovd %d28, %d12 1927 fmovd %d30, %d14 1928 stda %d48, [%i1]ASI_BLK_P 1929 subcc %i3, 64, %i3 1930 add %i1, 64, %i1 1931 bgu,pt %ncc, .bc_unaln_000_loop 1932 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 1933 1934.bc_unaln_done: 1935 ! Handle trailing bytes, 64 to 127 1936 ! Dest long word aligned, Src not long word aligned 1937 cmp %i2, 15 1938 bleu %ncc, .bc_unaln_short 1939 1940 andn %i2, 0x7, %i3 ! %i3 is multiple of 8 1941 and %i2, 0x7, %i2 ! residue bytes in %i2 1942 add %i2, 8, %i2 1943 sub %i3, 8, %i3 ! insure we don't load past end of src 1944 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address 1945 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8 1946 ldd [%o4], %d0 ! fetch partial word 1947.bc_unaln_by8: 1948 ldd [%o4+8], %d2 1949 add %o4, 8, %o4 1950 faligndata %d0, %d2, %d16 1951 subcc %i3, 8, %i3 1952 std %d16, [%i1] 1953 fmovd %d2, %d0 1954 bgu,pt %ncc, .bc_unaln_by8 1955 add %i1, 8, %i1 1956 1957.bc_unaln_short: 1958 cmp %i2, 8 1959 blt,pt %ncc, .bc_unalnfin 1960 nop 1961 ldub [%i0], %o4 1962 sll %o4, 24, %o3 1963 ldub [%i0+1], %o4 1964 sll %o4, 16, %o4 1965 or %o4, %o3, %o3 1966 ldub [%i0+2], %o4 1967 sll %o4, 8, %o4 1968 or %o4, %o3, %o3 1969 ldub [%i0+3], %o4 1970 or %o4, %o3, %o3 1971 stw %o3, [%i1] 1972 ldub [%i0+4], %o4 1973 sll %o4, 24, %o3 1974 ldub [%i0+5], %o4 1975 sll %o4, 16, %o4 1976 or %o4, %o3, %o3 1977 ldub [%i0+6], %o4 1978 sll %o4, 8, %o4 1979 or %o4, %o3, %o3 1980 ldub [%i0+7], %o4 1981 or %o4, %o3, %o3 1982 stw %o3, [%i1+4] 1983 add %i0, 8, %i0 1984 add %i1, 8, %i1 1985 sub %i2, 8, %i2 1986.bc_unalnfin: 1987 cmp %i2, 4 1988 blt,pt %ncc, .bc_unalnz 1989 tst %i2 1990 ldub [%i0], %o3 ! read byte 1991 subcc %i2, 4, %i2 ! reduce count by 4 1992 sll %o3, 24, %o3 ! position 1993 ldub [%i0+1], %o4 1994 sll %o4, 16, %o4 ! position 1995 or %o4, %o3, %o3 ! merge 1996 ldub [%i0+2], %o4 1997 sll %o4, 8, %o4 ! position 1998 or %o4, %o3, %o3 ! merge 1999 add %i1, 4, %i1 ! advance dst by 4 2000 ldub [%i0+3], %o4 2001 add %i0, 4, %i0 ! advance src by 4 2002 or %o4, %o3, %o4 ! merge 2003 bnz,pt %ncc, .bc_unaln3x 2004 stw %o4, [%i1-4] 2005 ba .bc_exit 2006 nop 2007.bc_unalnz: 2008 bz,pt %ncc, .bc_exit 2009.bc_unaln3x: ! Exactly 1, 2, or 3 bytes remain 2010 subcc %i2, 1, %i2 ! reduce count for cc test 2011 ldub [%i0], %o4 ! load one byte 2012 bz,pt %ncc, .bc_exit 2013 stb %o4, [%i1] ! store one byte 2014 ldub [%i0+1], %o4 ! load second byte 2015 subcc %i2, 1, %i2 2016 bz,pt %ncc, .bc_exit 2017 stb %o4, [%i1+1] ! store second byte 2018 ldub [%i0+2], %o4 ! load third byte 2019 stb %o4, [%i1+2] ! store third byte 2020.bc_exit: 2021 wr %l5, %g0, %gsr ! restore %gsr 2022 brnz %g5, .bc_fp_restore 2023 and %o5, COPY_FLAGS, %l1 ! save flags in %l1 2024 FZERO 2025 wr %g5, %g0, %fprs 2026 ba,pt %ncc, .bc_ex2 2027 nop 2028.bc_fp_restore: 2029 BLD_FP_FROMSTACK(%o4) 2030.bc_ex2: 2031 ldn [THREAD_REG + T_LWP], %o2 2032 brnz,pt %o2, 1f 2033 nop 2034 2035 ldsb [THREAD_REG + T_PREEMPT], %l0 2036 deccc %l0 2037 bnz,pn %ncc, 1f 2038 stb %l0, [THREAD_REG + T_PREEMPT] 2039 2040 ! Check for a kernel preemption request 2041 ldn [THREAD_REG + T_CPU], %l0 2042 ldub [%l0 + CPU_KPRUNRUN], %l0 2043 brnz,a,pt %l0, 1f ! Need to call kpreempt? 2044 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag 20451: 2046 btst LOFAULT_SET, %l1 2047 bz,pn %icc, 3f 2048 andncc %o5, COPY_FLAGS, %o5 2049 ! Here via bcopy. Check to see if the handler was NULL. 2050 ! If so, just return quietly. Otherwise, reset the 2051 ! handler and return. 2052 bz,pn %ncc, 2f 2053 nop 2054 membar #Sync 2055 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 20562: 2057 btst KPREEMPT_FLAG, %l1 2058 bz,pt %icc, 3f 2059 nop 2060 call kpreempt 2061 rdpr %pil, %o0 ! pass %pil 20623: 2063 ret 2064 restore %g0, 0, %o0 2065 2066 SET_SIZE(bcopy_more) 2067 2068 2069#else /* NIAGARA_IMPL */ 2070 save %sp, -SA(MINFRAME), %sp 2071 clr %o5 ! flag LOFAULT_SET is not set for bcopy 2072.do_copy: 2073 cmp %i2, 12 ! for small counts 2074 blu %ncc, .bytecp ! just copy bytes 2075 .empty 2076 2077 cmp %i2, 128 ! for less than 128 bytes 2078 blu,pn %ncc, .bcb_punt ! no block st/quad ld 2079 nop 2080 2081 set use_hw_bcopy, %o2 2082 ld [%o2], %o2 2083 brz,pn %o2, .bcb_punt 2084 nop 2085 2086 subcc %i1, %i0, %i3 2087 bneg,a,pn %ncc, 1f 2088 neg %i3 20891: 2090 /* 2091 * Compare against 256 since we should be checking block addresses 2092 * and (dest & ~63) - (src & ~63) can be 3 blocks even if 2093 * src = dest + (64 * 3) + 63. 2094 */ 2095 cmp %i3, 256 2096 blu,pn %ncc, .bcb_punt 2097 nop 2098 2099 /* 2100 * Copy that reach here have at least 2 blocks of data to copy. 2101 */ 2102.do_blockcopy: 2103 ! Swap src/dst since the code below is memcpy code 2104 ! and memcpy/bcopy have different calling sequences 2105 mov %i1, %i5 2106 mov %i0, %i1 2107 mov %i5, %i0 2108 2109 ! Block (64 bytes) align the destination. 2110 andcc %i0, 0x3f, %i3 ! is dst aligned on a 64 bytes 2111 bz %xcc, .chksrc ! dst is already double aligned 2112 sub %i3, 0x40, %i3 2113 neg %i3 ! bytes till dst 64 bytes aligned 2114 sub %i2, %i3, %i2 ! update i2 with new count 2115 2116 ! Based on source and destination alignment do 2117 ! either 8 bytes, 4 bytes, 2 bytes or byte copy. 2118 2119 ! Is dst & src 8B aligned 2120 or %i0, %i1, %o2 2121 andcc %o2, 0x7, %g0 2122 bz %ncc, .alewdcp 2123 nop 2124 2125 ! Is dst & src 4B aligned 2126 andcc %o2, 0x3, %g0 2127 bz %ncc, .alwdcp 2128 nop 2129 2130 ! Is dst & src 2B aligned 2131 andcc %o2, 0x1, %g0 2132 bz %ncc, .alhlfwdcp 2133 nop 2134 2135 ! 1B aligned 21361: ldub [%i1], %o2 2137 stb %o2, [%i0] 2138 inc %i1 2139 deccc %i3 2140 bgu,pt %ncc, 1b 2141 inc %i0 2142 2143 ba .chksrc 2144 nop 2145 2146 ! dst & src 4B aligned 2147.alwdcp: 2148 ld [%i1], %o2 2149 st %o2, [%i0] 2150 add %i1, 0x4, %i1 2151 subcc %i3, 0x4, %i3 2152 bgu,pt %ncc, .alwdcp 2153 add %i0, 0x4, %i0 2154 2155 ba .chksrc 2156 nop 2157 2158 ! dst & src 2B aligned 2159.alhlfwdcp: 2160 lduh [%i1], %o2 2161 stuh %o2, [%i0] 2162 add %i1, 0x2, %i1 2163 subcc %i3, 0x2, %i3 2164 bgu,pt %ncc, .alhlfwdcp 2165 add %i0, 0x2, %i0 2166 2167 ba .chksrc 2168 nop 2169 2170 ! dst & src 8B aligned 2171.alewdcp: 2172 ldx [%i1], %o2 2173 stx %o2, [%i0] 2174 add %i1, 0x8, %i1 2175 subcc %i3, 0x8, %i3 2176 bgu,pt %ncc, .alewdcp 2177 add %i0, 0x8, %i0 2178 2179 ! Now Destination is block (64 bytes) aligned 2180.chksrc: 2181 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 2182 sub %i2, %i3, %i2 ! Residue bytes in %i2 2183 2184 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2185 2186 andcc %i1, 0xf, %o2 ! is src quadword aligned 2187 bz,pn %xcc, .blkcpy ! src offset in %o2 2188 nop 2189 cmp %o2, 0x8 2190 bg .cpy_upper_double 2191 nop 2192 bl .cpy_lower_double 2193 nop 2194 2195 ! Falls through when source offset is equal to 8 i.e. 2196 ! source is double word aligned. 2197 ! In this case no shift/merge of data is required 2198 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2199 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2200 prefetch [%l0+0x0], #one_read 2201 ldda [%i1+0x0]%asi, %l2 2202loop0: 2203 ldda [%i1+0x10]%asi, %l4 2204 prefetch [%l0+0x40], #one_read 2205 2206 stxa %l3, [%i0+0x0]%asi 2207 stxa %l4, [%i0+0x8]%asi 2208 2209 ldda [%i1+0x20]%asi, %l2 2210 stxa %l5, [%i0+0x10]%asi 2211 stxa %l2, [%i0+0x18]%asi 2212 2213 ldda [%i1+0x30]%asi, %l4 2214 stxa %l3, [%i0+0x20]%asi 2215 stxa %l4, [%i0+0x28]%asi 2216 2217 ldda [%i1+0x40]%asi, %l2 2218 stxa %l5, [%i0+0x30]%asi 2219 stxa %l2, [%i0+0x38]%asi 2220 2221 add %l0, 0x40, %l0 2222 add %i1, 0x40, %i1 2223 subcc %i3, 0x40, %i3 2224 bgu,pt %xcc, loop0 2225 add %i0, 0x40, %i0 2226 ba .blkdone 2227 add %i1, %o2, %i1 ! increment the source by src offset 2228 ! the src offset was stored in %o2 2229 2230.cpy_lower_double: 2231 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2232 sll %o2, 3, %o0 ! %o0 left shift 2233 mov 0x40, %o1 2234 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2235 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2236 prefetch [%l0+0x0], #one_read 2237 ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has 2238 ! complete data 2239loop1: 2240 ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read. 2241 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 2242 ! into %l2 and %l3 2243 prefetch [%l0+0x40], #one_read 2244 stxa %l2, [%i0+0x0]%asi 2245 stxa %l3, [%i0+0x8]%asi 2246 2247 ldda [%i1+0x20]%asi, %l2 2248 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 2249 stxa %l4, [%i0+0x10]%asi ! %l4 from previous read 2250 stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5 2251 2252 ! Repeat the same for next 32 bytes. 2253 2254 ldda [%i1+0x30]%asi, %l4 2255 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 2256 stxa %l2, [%i0+0x20]%asi 2257 stxa %l3, [%i0+0x28]%asi 2258 2259 ldda [%i1+0x40]%asi, %l2 2260 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 2261 stxa %l4, [%i0+0x30]%asi 2262 stxa %l5, [%i0+0x38]%asi 2263 2264 add %l0, 0x40, %l0 2265 add %i1, 0x40, %i1 2266 subcc %i3, 0x40, %i3 2267 bgu,pt %xcc, loop1 2268 add %i0, 0x40, %i0 2269 ba .blkdone 2270 add %i1, %o2, %i1 ! increment the source by src offset 2271 ! the src offset was stored in %o2 2272 2273.cpy_upper_double: 2274 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2275 mov 0x8, %o0 2276 sub %o2, %o0, %o0 2277 sll %o0, 3, %o0 ! %o0 left shift 2278 mov 0x40, %o1 2279 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2280 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2281 prefetch [%l0+0x0], #one_read 2282 ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and 2283 ! no data in %l2 2284loop2: 2285 ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has 2286 ! partial 2287 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 2288 ! into %l3 and %l4 2289 prefetch [%l0+0x40], #one_read 2290 stxa %l3, [%i0+0x0]%asi 2291 stxa %l4, [%i0+0x8]%asi 2292 2293 ldda [%i1+0x20]%asi, %l2 2294 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 2295 stxa %l5, [%i0+0x10]%asi ! %l5 from previous read 2296 stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2 2297 2298 ! Repeat the same for next 32 bytes. 2299 2300 ldda [%i1+0x30]%asi, %l4 2301 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 2302 stxa %l3, [%i0+0x20]%asi 2303 stxa %l4, [%i0+0x28]%asi 2304 2305 ldda [%i1+0x40]%asi, %l2 2306 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 2307 stxa %l5, [%i0+0x30]%asi 2308 stxa %l2, [%i0+0x38]%asi 2309 2310 add %l0, 0x40, %l0 2311 add %i1, 0x40, %i1 2312 subcc %i3, 0x40, %i3 2313 bgu,pt %xcc, loop2 2314 add %i0, 0x40, %i0 2315 ba .blkdone 2316 add %i1, %o2, %i1 ! increment the source by src offset 2317 ! the src offset was stored in %o2 2318 2319 2320 ! Both Source and Destination are block aligned. 2321 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2322.blkcpy: 2323 prefetch [%i1+0x0], #one_read 23241: 2325 ldda [%i1+0x0]%asi, %l0 2326 ldda [%i1+0x10]%asi, %l2 2327 prefetch [%i1+0x40], #one_read 2328 2329 stxa %l0, [%i0+0x0]%asi 2330 ldda [%i1+0x20]%asi, %l4 2331 ldda [%i1+0x30]%asi, %l6 2332 2333 stxa %l1, [%i0+0x8]%asi 2334 stxa %l2, [%i0+0x10]%asi 2335 stxa %l3, [%i0+0x18]%asi 2336 stxa %l4, [%i0+0x20]%asi 2337 stxa %l5, [%i0+0x28]%asi 2338 stxa %l6, [%i0+0x30]%asi 2339 stxa %l7, [%i0+0x38]%asi 2340 2341 add %i1, 0x40, %i1 2342 subcc %i3, 0x40, %i3 2343 bgu,pt %xcc, 1b 2344 add %i0, 0x40, %i0 2345 2346.blkdone: 2347 membar #Sync 2348 2349 brz,pt %i2, .blkexit 2350 nop 2351 2352 ! Handle trailing bytes 2353 cmp %i2, 0x8 2354 blu,pt %ncc, .residue 2355 nop 2356 2357 ! Can we do some 8B ops 2358 or %i1, %i0, %o2 2359 andcc %o2, 0x7, %g0 2360 bnz %ncc, .last4 2361 nop 2362 2363 ! Do 8byte ops as long as possible 2364.last8: 2365 ldx [%i1], %o2 2366 stx %o2, [%i0] 2367 add %i1, 0x8, %i1 2368 sub %i2, 0x8, %i2 2369 cmp %i2, 0x8 2370 bgu,pt %ncc, .last8 2371 add %i0, 0x8, %i0 2372 2373 brz,pt %i2, .blkexit 2374 nop 2375 2376 ba .residue 2377 nop 2378 2379.last4: 2380 ! Can we do 4B ops 2381 andcc %o2, 0x3, %g0 2382 bnz %ncc, .last2 2383 nop 23841: 2385 ld [%i1], %o2 2386 st %o2, [%i0] 2387 add %i1, 0x4, %i1 2388 sub %i2, 0x4, %i2 2389 cmp %i2, 0x4 2390 bgu,pt %ncc, 1b 2391 add %i0, 0x4, %i0 2392 2393 brz,pt %i2, .blkexit 2394 nop 2395 2396 ba .residue 2397 nop 2398 2399.last2: 2400 ! Can we do 2B ops 2401 andcc %o2, 0x1, %g0 2402 bnz %ncc, .residue 2403 nop 2404 24051: 2406 lduh [%i1], %o2 2407 stuh %o2, [%i0] 2408 add %i1, 0x2, %i1 2409 sub %i2, 0x2, %i2 2410 cmp %i2, 0x2 2411 bgu,pt %ncc, 1b 2412 add %i0, 0x2, %i0 2413 2414 brz,pt %i2, .blkexit 2415 nop 2416 2417.residue: 2418 ldub [%i1], %o2 2419 stb %o2, [%i0] 2420 inc %i1 2421 deccc %i2 2422 bgu,pt %ncc, .residue 2423 inc %i0 2424 2425.blkexit: 2426 2427 membar #Sync ! sync error barrier 2428 ! Restore t_lofault handler, if came here from kcopy(). 2429 tst %o5 2430 bz %ncc, 1f 2431 andn %o5, LOFAULT_SET, %o5 2432 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 24331: 2434 ret 2435 restore %g0, 0, %o0 2436 2437 2438.bcb_punt: 2439 ! 2440 ! use aligned transfers where possible 2441 ! 2442 xor %i0, %i1, %o4 ! xor from and to address 2443 btst 7, %o4 ! if lower three bits zero 2444 bz .aldoubcp ! can align on double boundary 2445 .empty ! assembler complaints about label 2446 2447 xor %i0, %i1, %o4 ! xor from and to address 2448 btst 3, %o4 ! if lower two bits zero 2449 bz .alwordcp ! can align on word boundary 2450 btst 3, %i0 ! delay slot, from address unaligned? 2451 ! 2452 ! use aligned reads and writes where possible 2453 ! this differs from wordcp in that it copes 2454 ! with odd alignment between source and destnation 2455 ! using word reads and writes with the proper shifts 2456 ! in between to align transfers to and from memory 2457 ! i0 - src address, i1 - dest address, i2 - count 2458 ! i3, i4 - tmps for used generating complete word 2459 ! i5 (word to write) 2460 ! l0 size in bits of upper part of source word (US) 2461 ! l1 size in bits of lower part of source word (LS = 32 - US) 2462 ! l2 size in bits of upper part of destination word (UD) 2463 ! l3 size in bits of lower part of destination word (LD = 32 - UD) 2464 ! l4 number of bytes leftover after aligned transfers complete 2465 ! l5 the number 32 2466 ! 2467 mov 32, %l5 ! load an oft-needed constant 2468 bz .align_dst_only 2469 btst 3, %i1 ! is destnation address aligned? 2470 clr %i4 ! clear registers used in either case 2471 bz .align_src_only 2472 clr %l0 2473 ! 2474 ! both source and destination addresses are unaligned 2475 ! 24761: ! align source 2477 ldub [%i0], %i3 ! read a byte from source address 2478 add %i0, 1, %i0 ! increment source address 2479 or %i4, %i3, %i4 ! or in with previous bytes (if any) 2480 btst 3, %i0 ! is source aligned? 2481 add %l0, 8, %l0 ! increment size of upper source (US) 2482 bnz,a 1b 2483 sll %i4, 8, %i4 ! make room for next byte 2484 2485 sub %l5, %l0, %l1 ! generate shift left count (LS) 2486 sll %i4, %l1, %i4 ! prepare to get rest 2487 ld [%i0], %i3 ! read a word 2488 add %i0, 4, %i0 ! increment source address 2489 srl %i3, %l0, %i5 ! upper src bits into lower dst bits 2490 or %i4, %i5, %i5 ! merge 2491 mov 24, %l3 ! align destination 24921: 2493 srl %i5, %l3, %i4 ! prepare to write a single byte 2494 stb %i4, [%i1] ! write a byte 2495 add %i1, 1, %i1 ! increment destination address 2496 sub %i2, 1, %i2 ! decrement count 2497 btst 3, %i1 ! is destination aligned? 2498 bnz,a 1b 2499 sub %l3, 8, %l3 ! delay slot, decrement shift count (LD) 2500 sub %l5, %l3, %l2 ! generate shift left count (UD) 2501 sll %i5, %l2, %i5 ! move leftover into upper bytes 2502 cmp %l2, %l0 ! cmp # reqd to fill dst w old src left 2503 bgu %ncc, .more_needed ! need more to fill than we have 2504 nop 2505 2506 sll %i3, %l1, %i3 ! clear upper used byte(s) 2507 srl %i3, %l1, %i3 2508 ! get the odd bytes between alignments 2509 sub %l0, %l2, %l0 ! regenerate shift count 2510 sub %l5, %l0, %l1 ! generate new shift left count (LS) 2511 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 2512 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 2513 srl %i3, %l0, %i4 2514 or %i5, %i4, %i5 2515 st %i5, [%i1] ! write a word 2516 subcc %i2, 4, %i2 ! decrement count 2517 bz %ncc, .unalign_out 2518 add %i1, 4, %i1 ! increment destination address 2519 2520 b 2f 2521 sll %i3, %l1, %i5 ! get leftover into upper bits 2522.more_needed: 2523 sll %i3, %l0, %i3 ! save remaining byte(s) 2524 srl %i3, %l0, %i3 2525 sub %l2, %l0, %l1 ! regenerate shift count 2526 sub %l5, %l1, %l0 ! generate new shift left count 2527 sll %i3, %l1, %i4 ! move to fill empty space 2528 b 3f 2529 or %i5, %i4, %i5 ! merge to complete word 2530 ! 2531 ! the source address is aligned and destination is not 2532 ! 2533.align_dst_only: 2534 ld [%i0], %i4 ! read a word 2535 add %i0, 4, %i0 ! increment source address 2536 mov 24, %l0 ! initial shift alignment count 25371: 2538 srl %i4, %l0, %i3 ! prepare to write a single byte 2539 stb %i3, [%i1] ! write a byte 2540 add %i1, 1, %i1 ! increment destination address 2541 sub %i2, 1, %i2 ! decrement count 2542 btst 3, %i1 ! is destination aligned? 2543 bnz,a 1b 2544 sub %l0, 8, %l0 ! delay slot, decrement shift count 2545.xfer: 2546 sub %l5, %l0, %l1 ! generate shift left count 2547 sll %i4, %l1, %i5 ! get leftover 25483: 2549 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 2550 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 25512: 2552 ld [%i0], %i3 ! read a source word 2553 add %i0, 4, %i0 ! increment source address 2554 srl %i3, %l0, %i4 ! upper src bits into lower dst bits 2555 or %i5, %i4, %i5 ! merge with upper dest bits (leftover) 2556 st %i5, [%i1] ! write a destination word 2557 subcc %i2, 4, %i2 ! decrement count 2558 bz %ncc, .unalign_out ! check if done 2559 add %i1, 4, %i1 ! increment destination address 2560 b 2b ! loop 2561 sll %i3, %l1, %i5 ! get leftover 2562.unalign_out: 2563 tst %l4 ! any bytes leftover? 2564 bz %ncc, .cpdone 2565 .empty ! allow next instruction in delay slot 25661: 2567 sub %l0, 8, %l0 ! decrement shift 2568 srl %i3, %l0, %i4 ! upper src byte into lower dst byte 2569 stb %i4, [%i1] ! write a byte 2570 subcc %l4, 1, %l4 ! decrement count 2571 bz %ncc, .cpdone ! done? 2572 add %i1, 1, %i1 ! increment destination 2573 tst %l0 ! any more previously read bytes 2574 bnz %ncc, 1b ! we have leftover bytes 2575 mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants 2576 b .dbytecp ! let dbytecp do the rest 2577 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 2578 ! 2579 ! the destination address is aligned and the source is not 2580 ! 2581.align_src_only: 2582 ldub [%i0], %i3 ! read a byte from source address 2583 add %i0, 1, %i0 ! increment source address 2584 or %i4, %i3, %i4 ! or in with previous bytes (if any) 2585 btst 3, %i0 ! is source aligned? 2586 add %l0, 8, %l0 ! increment shift count (US) 2587 bnz,a .align_src_only 2588 sll %i4, 8, %i4 ! make room for next byte 2589 b,a .xfer 2590 ! 2591 ! if from address unaligned for double-word moves, 2592 ! move bytes till it is, if count is < 56 it could take 2593 ! longer to align the thing than to do the transfer 2594 ! in word size chunks right away 2595 ! 2596.aldoubcp: 2597 cmp %i2, 56 ! if count < 56, use wordcp, it takes 2598 blu,a %ncc, .alwordcp ! longer to align doubles than words 2599 mov 3, %o0 ! mask for word alignment 2600 call .alignit ! copy bytes until aligned 2601 mov 7, %o0 ! mask for double alignment 2602 ! 2603 ! source and destination are now double-word aligned 2604 ! i3 has aligned count returned by alignit 2605 ! 2606 and %i2, 7, %i2 ! unaligned leftover count 2607 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 26085: 2609 ldx [%i0+%i1], %o4 ! read from address 2610 stx %o4, [%i1] ! write at destination address 2611 subcc %i3, 8, %i3 ! dec count 2612 bgu %ncc, 5b 2613 add %i1, 8, %i1 ! delay slot, inc to address 2614 cmp %i2, 4 ! see if we can copy a word 2615 blu %ncc, .dbytecp ! if 3 or less bytes use bytecp 2616 .empty 2617 ! 2618 ! for leftover bytes we fall into wordcp, if needed 2619 ! 2620.wordcp: 2621 and %i2, 3, %i2 ! unaligned leftover count 26225: 2623 ld [%i0+%i1], %o4 ! read from address 2624 st %o4, [%i1] ! write at destination address 2625 subcc %i3, 4, %i3 ! dec count 2626 bgu %ncc, 5b 2627 add %i1, 4, %i1 ! delay slot, inc to address 2628 b,a .dbytecp 2629 2630 ! we come here to align copies on word boundaries 2631.alwordcp: 2632 call .alignit ! go word-align it 2633 mov 3, %o0 ! bits that must be zero to be aligned 2634 b .wordcp 2635 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 2636 2637 ! 2638 ! byte copy, works with any alignment 2639 ! 2640.bytecp: 2641 b .dbytecp 2642 sub %i0, %i1, %i0 ! i0 gets difference of src and dst 2643 2644 ! 2645 ! differenced byte copy, works with any alignment 2646 ! assumes dest in %i1 and (source - dest) in %i0 2647 ! 26481: 2649 stb %o4, [%i1] ! write to address 2650 inc %i1 ! inc to address 2651.dbytecp: 2652 deccc %i2 ! dec count 2653 bgeu,a %ncc, 1b ! loop till done 2654 ldub [%i0+%i1], %o4 ! read from address 2655.cpdone: 2656 2657 membar #Sync ! sync error barrier 2658 ! Restore t_lofault handler, if came here from kcopy(). 2659 tst %o5 2660 bz %ncc, 1f 2661 andn %o5, LOFAULT_SET, %o5 2662 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 26631: 2664 ret 2665 restore %g0, 0, %o0 ! return (0) 2666 2667/* 2668 * Common code used to align transfers on word and doubleword 2669 * boundaries. Aligns source and destination and returns a count 2670 * of aligned bytes to transfer in %i3 2671 */ 26721: 2673 inc %i0 ! inc from 2674 stb %o4, [%i1] ! write a byte 2675 inc %i1 ! inc to 2676 dec %i2 ! dec count 2677.alignit: 2678 btst %o0, %i0 ! %o0 is bit mask to check for alignment 2679 bnz,a 1b 2680 ldub [%i0], %o4 ! read next byte 2681 2682 retl 2683 andn %i2, %o0, %i3 ! return size of aligned bytes 2684 2685 SET_SIZE(bcopy) 2686 2687#endif /* NIAGARA_IMPL */ 2688 2689#endif /* lint */ 2690 2691/* 2692 * Block copy with possibly overlapped operands. 2693 */ 2694 2695#if defined(lint) 2696 2697/*ARGSUSED*/ 2698void 2699ovbcopy(const void *from, void *to, size_t count) 2700{} 2701 2702#else /* lint */ 2703 2704 ENTRY(ovbcopy) 2705 tst %o2 ! check count 2706 bgu,a %ncc, 1f ! nothing to do or bad arguments 2707 subcc %o0, %o1, %o3 ! difference of from and to address 2708 2709 retl ! return 2710 nop 27111: 2712 bneg,a %ncc, 2f 2713 neg %o3 ! if < 0, make it positive 27142: cmp %o2, %o3 ! cmp size and abs(from - to) 2715 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, 2716 .empty ! no overlap 2717 cmp %o0, %o1 ! compare from and to addresses 2718 blu %ncc, .ov_bkwd ! if from < to, copy backwards 2719 nop 2720 ! 2721 ! Copy forwards. 2722 ! 2723.ov_fwd: 2724 ldub [%o0], %o3 ! read from address 2725 inc %o0 ! inc from address 2726 stb %o3, [%o1] ! write to address 2727 deccc %o2 ! dec count 2728 bgu %ncc, .ov_fwd ! loop till done 2729 inc %o1 ! inc to address 2730 2731 retl ! return 2732 nop 2733 ! 2734 ! Copy backwards. 2735 ! 2736.ov_bkwd: 2737 deccc %o2 ! dec count 2738 ldub [%o0 + %o2], %o3 ! get byte at end of src 2739 bgu %ncc, .ov_bkwd ! loop till done 2740 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst 2741 2742 retl ! return 2743 nop 2744 SET_SIZE(ovbcopy) 2745 2746#endif /* lint */ 2747 2748/* 2749 * hwblkpagecopy() 2750 * 2751 * Copies exactly one page. This routine assumes the caller (ppcopy) 2752 * has already disabled kernel preemption and has checked 2753 * use_hw_bcopy. 2754 */ 2755#ifdef lint 2756/*ARGSUSED*/ 2757void 2758hwblkpagecopy(const void *src, void *dst) 2759{ } 2760#else /* lint */ 2761 ENTRY(hwblkpagecopy) 2762 save %sp, -SA(MINFRAME), %sp 2763 2764 ! %i0 - source address (arg) 2765 ! %i1 - destination address (arg) 2766 ! %i2 - length of region (not arg) 2767 2768 set PAGESIZE, %i2 2769 2770 /* 2771 * Copying exactly one page and PAGESIZE is in mutliple of 0x80. 2772 */ 2773 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2774 prefetch [%i0+0x0], #one_read 2775 prefetch [%i0+0x40], #one_read 27761: 2777 prefetch [%i0+0x80], #one_read 2778 prefetch [%i0+0xc0], #one_read 2779 ldda [%i0+0x0]%asi, %l0 2780 ldda [%i0+0x10]%asi, %l2 2781 ldda [%i0+0x20]%asi, %l4 2782 ldda [%i0+0x30]%asi, %l6 2783 stxa %l0, [%i1+0x0]%asi 2784 stxa %l1, [%i1+0x8]%asi 2785 stxa %l2, [%i1+0x10]%asi 2786 stxa %l3, [%i1+0x18]%asi 2787 stxa %l4, [%i1+0x20]%asi 2788 stxa %l5, [%i1+0x28]%asi 2789 stxa %l6, [%i1+0x30]%asi 2790 stxa %l7, [%i1+0x38]%asi 2791 ldda [%i0+0x40]%asi, %l0 2792 ldda [%i0+0x50]%asi, %l2 2793 ldda [%i0+0x60]%asi, %l4 2794 ldda [%i0+0x70]%asi, %l6 2795 stxa %l0, [%i1+0x40]%asi 2796 stxa %l1, [%i1+0x48]%asi 2797 stxa %l2, [%i1+0x50]%asi 2798 stxa %l3, [%i1+0x58]%asi 2799 stxa %l4, [%i1+0x60]%asi 2800 stxa %l5, [%i1+0x68]%asi 2801 stxa %l6, [%i1+0x70]%asi 2802 stxa %l7, [%i1+0x78]%asi 2803 2804 add %i0, 0x80, %i0 2805 subcc %i2, 0x80, %i2 2806 bgu,pt %xcc, 1b 2807 add %i1, 0x80, %i1 2808 2809 membar #Sync 2810 ret 2811 restore %g0, 0, %o0 2812 SET_SIZE(hwblkpagecopy) 2813#endif /* lint */ 2814 2815 2816/* 2817 * Transfer data to and from user space - 2818 * Note that these routines can cause faults 2819 * It is assumed that the kernel has nothing at 2820 * less than KERNELBASE in the virtual address space. 2821 * 2822 * Note that copyin(9F) and copyout(9F) are part of the 2823 * DDI/DKI which specifies that they return '-1' on "errors." 2824 * 2825 * Sigh. 2826 * 2827 * So there's two extremely similar routines - xcopyin() and xcopyout() 2828 * which return the errno that we've faithfully computed. This 2829 * allows other callers (e.g. uiomove(9F)) to work correctly. 2830 * Given that these are used pretty heavily, we expand the calling 2831 * sequences inline for all flavours (rather than making wrappers). 2832 * 2833 * There are also stub routines for xcopyout_little and xcopyin_little, 2834 * which currently are intended to handle requests of <= 16 bytes from 2835 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 2836 * is left as an exercise... 2837 */ 2838 2839/* 2840 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 2841 * 2842 * General theory of operation: 2843 * 2844 * None of the copyops routines grab a window until it's decided that 2845 * we need to do a HW block copy operation. This saves a window 2846 * spill/fill when we're called during socket ops. The typical IO 2847 * path won't cause spill/fill traps. 2848 * 2849 * This code uses a set of 4 limits for the maximum size that will 2850 * be copied given a particular input/output address alignment. 2851 * the default limits are: 2852 * 2853 * single byte aligned - 256 (hw_copy_limit_1) 2854 * two byte aligned - 512 (hw_copy_limit_2) 2855 * four byte aligned - 1024 (hw_copy_limit_4) 2856 * eight byte aligned - 1024 (hw_copy_limit_8) 2857 * 2858 * If the value for a particular limit is zero, the copy will be done 2859 * via the copy loops rather than block store/quad load instructions. 2860 * 2861 * Flow: 2862 * 2863 * If count == zero return zero. 2864 * 2865 * Store the previous lo_fault handler into %g6. 2866 * Place our secondary lofault handler into %g5. 2867 * Place the address of our nowindow fault handler into %o3. 2868 * Place the address of the windowed fault handler into %o4. 2869 * --> We'll use this handler if we end up grabbing a window 2870 * --> before we use block initializing store and quad load ASIs 2871 * 2872 * If count is less than or equal to SMALL_LIMIT (7) we 2873 * always do a byte for byte copy. 2874 * 2875 * If count is > SMALL_LIMIT, we check the alignment of the input 2876 * and output pointers. Based on the alignment we check count 2877 * against a limit based on detected alignment. If we exceed the 2878 * alignment value we copy via block initializing store and quad 2879 * load instructions. 2880 * 2881 * If we don't exceed one of the limits, we store -count in %o3, 2882 * we store the number of chunks (8, 4, 2 or 1 byte) operated 2883 * on in our basic copy loop in %o2. Following this we branch 2884 * to the appropriate copy loop and copy that many chunks. 2885 * Since we've been adding the chunk size to %o3 each time through 2886 * as well as decrementing %o2, we can tell if any data is 2887 * is left to be copied by examining %o3. If that is zero, we're 2888 * done and can go home. If not, we figure out what the largest 2889 * chunk size left to be copied is and branch to that copy loop 2890 * unless there's only one byte left. We load that as we're 2891 * branching to code that stores it just before we return. 2892 * 2893 * Fault handlers are invoked if we reference memory that has no 2894 * current mapping. All forms share the same copyio_fault handler. 2895 * This routine handles fixing up the stack and general housecleaning. 2896 * Each copy operation has a simple fault handler that is then called 2897 * to do the work specific to the invidual operation. The handler 2898 * for copyOP and xcopyOP are found at the end of individual function. 2899 * The handlers for xcopyOP_little are found at the end of xcopyin_little. 2900 * The handlers for copyOP_noerr are found at the end of copyin_noerr. 2901 */ 2902 2903/* 2904 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 2905 */ 2906 2907#if defined(lint) 2908 2909/*ARGSUSED*/ 2910int 2911copyout(const void *kaddr, void *uaddr, size_t count) 2912{ return (0); } 2913 2914#else /* lint */ 2915 2916/* 2917 * We save the arguments in the following registers in case of a fault: 2918 * kaddr - %g2 2919 * uaddr - %g3 2920 * count - %g4 2921 */ 2922#define SAVE_SRC %g2 2923#define SAVE_DST %g3 2924#define SAVE_COUNT %g4 2925 2926#define REAL_LOFAULT %g5 2927#define SAVED_LOFAULT %g6 2928 2929/* 2930 * Generic copyio fault handler. This is the first line of defense when a 2931 * fault occurs in (x)copyin/(x)copyout. In order for this to function 2932 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 2933 * This allows us to share common code for all the flavors of the copy 2934 * operations, including the _noerr versions. 2935 * 2936 * Note that this function will restore the original input parameters before 2937 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 2938 * member of the t_copyop structure, if needed. 2939 */ 2940 ENTRY(copyio_fault) 2941#if !defined(NIAGARA_IMPL) 2942 btst FPUSED_FLAG, SAVED_LOFAULT 2943 bz 1f 2944 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 2945 2946 wr %l5, 0, %gsr ! restore gsr 2947 2948 btst FPRS_FEF, %g1 2949 bz %icc, 4f 2950 nop 2951 2952 ! restore fpregs from stack 2953 BLD_FP_FROMSTACK(%o2) 2954 2955 ba,pt %ncc, 1f 2956 nop 29574: 2958 FZERO ! zero all of the fpregs 2959 wr %g1, %g0, %fprs ! restore fprs 29601: 2961 restore 2962 mov SAVE_SRC, %o0 2963 mov SAVE_DST, %o1 2964 jmp REAL_LOFAULT 2965 mov SAVE_COUNT, %o2 2966 2967#else /* NIAGARA_IMPL */ 2968 membar #Sync 2969 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2970 restore 2971 mov SAVE_SRC, %o0 2972 mov SAVE_DST, %o1 2973 jmp REAL_LOFAULT 2974 mov SAVE_COUNT, %o2 2975 2976#endif /* NIAGARA_IMPL */ 2977 2978 SET_SIZE(copyio_fault) 2979 2980 ENTRY(copyio_fault_nowindow) 2981 membar #Sync 2982 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2983 2984 mov SAVE_SRC, %o0 2985 mov SAVE_DST, %o1 2986 jmp REAL_LOFAULT 2987 mov SAVE_COUNT, %o2 2988 SET_SIZE(copyio_fault_nowindow) 2989 2990 ENTRY(copyout) 2991 sethi %hi(.copyout_err), REAL_LOFAULT 2992 or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT 2993 2994#if !defined(NIAGARA_IMPL) 2995.do_copyout: 2996 tst %o2 ! check for zero count; quick exit 2997 bz,pt %ncc, .co_smallqx 2998 mov %o0, SAVE_SRC 2999 mov %o1, SAVE_DST 3000 mov %o2, SAVE_COUNT 3001 cmp %o2, FP_COPY ! check for small copy/leaf case 3002 bgt,pt %ncc, .co_copy_more 3003 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 3004/* 3005 * Small copy out code 3006 * 3007 */ 3008 sethi %hi(copyio_fault_nowindow), %o3 3009 or %o3, %lo(copyio_fault_nowindow), %o3 3010 membar #Sync 3011 stn %o3, [THREAD_REG + T_LOFAULT] 3012 3013 mov ASI_USER, %asi 3014 cmp %o2, SHORTCOPY ! make sure there is enough to align 3015 ble,pt %ncc, .co_smallest 3016 andcc %o1, 0x7, %o3 ! is dest long word aligned 3017 bnz,pn %ncc, .co_align 3018 andcc %o1, 1, %o3 ! is dest byte aligned 3019 3020! Destination is long word aligned 3021! 8 cases for src alignment; load parts, store long words 3022.co_al_src: 3023 andcc %o0, 7, %o3 3024 brnz,pt %o3, .co_src_dst_unal8 3025 nop 3026/* 3027 * Special case for handling when src and dest are both long word aligned 3028 * and total data to move is less than FP_COPY bytes 3029 * Also handles finish up for large block moves, so may be less than 32 bytes 3030 */ 3031.co_medlong: 3032 subcc %o2, 31, %o2 ! adjust length to allow cc test 3033 ble,pt %ncc, .co_medl31 3034 nop 3035.co_medl32: 3036 ldx [%o0], %o4 ! move 32 bytes 3037 subcc %o2, 32, %o2 ! decrement length count by 32 3038 stxa %o4, [%o1]%asi 3039 ldx [%o0+8], %o4 3040 stxa %o4, [%o1+8]%asi 3041 ldx [%o0+16], %o4 3042 add %o0, 32, %o0 ! increase src ptr by 32 3043 stxa %o4, [%o1+16]%asi 3044 ldx [%o0-8], %o4 3045 add %o1, 32, %o1 ! increase dst ptr by 32 3046 bgu,pt %ncc, .co_medl32 ! repeat if at least 32 bytes left 3047 stxa %o4, [%o1-8]%asi 3048.co_medl31: 3049 addcc %o2, 24, %o2 ! adjust count to be off by 7 3050 ble,pt %ncc, .co_medl7 ! skip if 7 or fewer bytes left 3051 nop 3052.co_medl8: 3053 ldx [%o0], %o4 ! move 8 bytes 3054 add %o0, 8, %o0 ! increase src ptr by 8 3055 subcc %o2, 8, %o2 ! decrease count by 8 3056 add %o1, 8, %o1 ! increase dst ptr by 8 3057 bgu,pt %ncc, .co_medl8 3058 stxa %o4, [%o1-8]%asi 3059.co_medl7: 3060 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3061 bnz,pt %ncc, .co_small4 ! do final bytes if not finished 3062 3063.co_smallx: ! finish up and exit 3064 membar #Sync 3065 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3066.co_smallqx: 3067 retl 3068 mov %g0, %o0 3069 3070.co_small4: 3071 cmp %o2, 4 3072 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3073 nop ! 3074 ld [%o0], %o4 ! move 4 bytes 3075 add %o0, 4, %o0 ! increase src ptr by 4 3076 add %o1, 4, %o1 ! increase dst ptr by 4 3077 subcc %o2, 4, %o2 ! decrease count by 4 3078 bz,pt %ncc, .co_smallx 3079 stwa %o4, [%o1-4]%asi 3080 3081.co_small3x: ! Exactly 1, 2, or 3 bytes remain 3082 subcc %o2, 1, %o2 ! reduce count for cc test 3083 ldub [%o0], %o4 ! load one byte 3084 bz,pt %ncc, .co_smallx 3085 stba %o4, [%o1]%asi ! store one byte 3086 ldub [%o0+1], %o4 ! load second byte 3087 subcc %o2, 1, %o2 3088 bz,pt %ncc, .co_smallx 3089 stba %o4, [%o1+1]%asi ! store second byte 3090 ldub [%o0+2], %o4 ! load third byte 3091 ba .co_smallx 3092 stba %o4, [%o1+2]%asi ! store third byte 3093 3094.co_smallest: ! 7 or fewer bytes remain 3095 cmp %o2, 4 3096 blt,pt %ncc, .co_small3x 3097 nop 3098 ldub [%o0], %o4 ! read byte 3099 subcc %o2, 4, %o2 ! reduce count by 4 3100 stba %o4, [%o1]%asi ! write byte 3101 ldub [%o0+1], %o4 ! repeat for total of 4 bytes 3102 add %o0, 4, %o0 ! advance src by 4 3103 stba %o4, [%o1+1]%asi 3104 ldub [%o0-2], %o4 3105 add %o1, 4, %o1 ! advance dst by 4 3106 stba %o4, [%o1-2]%asi 3107 ldub [%o0-1], %o4 3108 bnz,pt %ncc, .co_small3x 3109 stba %o4, [%o1-1]%asi 3110 membar #Sync 3111 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3112 retl 3113 mov %g0, %o0 3114 3115.co_align: ! byte align test in prior branch delay 3116 bnz,pt %ncc, .co_al_d1 3117.co_al_d1f: ! dest is now half word aligned 3118 andcc %o1, 2, %o3 3119 bnz,pt %ncc, .co_al_d2 3120.co_al_d2f: ! dest is now word aligned 3121 andcc %o1, 4, %o3 ! is dest longword aligned? 3122 bz,pt %ncc, .co_al_src 3123 nop 3124.co_al_d4: ! dest is word aligned; src is unknown 3125 ldub [%o0], %o4 ! move a word (src align unknown) 3126 ldub [%o0+1], %o3 3127 sll %o4, 24, %o4 ! position 3128 sll %o3, 16, %o3 ! position 3129 or %o4, %o3, %o3 ! merge 3130 ldub [%o0+2], %o4 3131 sll %o4, 8, %o4 ! position 3132 or %o4, %o3, %o3 ! merge 3133 ldub [%o0+3], %o4 3134 or %o4, %o3, %o4 ! merge 3135 stwa %o4,[%o1]%asi ! store four bytes 3136 add %o0, 4, %o0 ! adjust src by 4 3137 add %o1, 4, %o1 ! adjust dest by 4 3138 sub %o2, 4, %o2 ! adjust count by 4 3139 andcc %o0, 7, %o3 ! check for src long word alignment 3140 brz,pt %o3, .co_medlong 3141.co_src_dst_unal8: 3142 ! dst is 8-byte aligned, src is not 3143 ! Size is less than FP_COPY 3144 ! Following code is to select for alignment 3145 andcc %o0, 0x3, %o3 ! test word alignment 3146 bz,pt %ncc, .co_medword 3147 nop 3148 andcc %o0, 0x1, %o3 ! test halfword alignment 3149 bnz,pt %ncc, .co_med_byte ! go to byte move if not halfword 3150 andcc %o0, 0x2, %o3 ! test which byte alignment 3151 ba .co_medhalf 3152 nop 3153.co_al_d1: ! align dest to half word 3154 ldub [%o0], %o4 ! move a byte 3155 add %o0, 1, %o0 3156 stba %o4, [%o1]%asi 3157 add %o1, 1, %o1 3158 andcc %o1, 2, %o3 3159 bz,pt %ncc, .co_al_d2f 3160 sub %o2, 1, %o2 3161.co_al_d2: ! align dest to word 3162 ldub [%o0], %o4 ! move a half-word (src align unknown) 3163 ldub [%o0+1], %o3 3164 sll %o4, 8, %o4 ! position 3165 or %o4, %o3, %o4 ! merge 3166 stha %o4, [%o1]%asi 3167 add %o0, 2, %o0 3168 add %o1, 2, %o1 3169 andcc %o1, 4, %o3 ! is dest longword aligned? 3170 bz,pt %ncc, .co_al_src 3171 sub %o2, 2, %o2 3172 ba .co_al_d4 3173 nop 3174/* 3175 * Handle all cases where src and dest are aligned on word 3176 * boundaries. Use unrolled loops for better performance. 3177 * This option wins over standard large data move when 3178 * source and destination is in cache for medium 3179 * to short data moves. 3180 */ 3181.co_medword: 3182 subcc %o2, 31, %o2 ! adjust length to allow cc test 3183 ble,pt %ncc, .co_medw31 3184 nop 3185.co_medw32: 3186 ld [%o0], %o4 ! move a block of 32 bytes 3187 stwa %o4, [%o1]%asi 3188 ld [%o0+4], %o4 3189 stwa %o4, [%o1+4]%asi 3190 ld [%o0+8], %o4 3191 stwa %o4, [%o1+8]%asi 3192 ld [%o0+12], %o4 3193 stwa %o4, [%o1+12]%asi 3194 ld [%o0+16], %o4 3195 stwa %o4, [%o1+16]%asi 3196 ld [%o0+20], %o4 3197 subcc %o2, 32, %o2 ! decrement length count 3198 stwa %o4, [%o1+20]%asi 3199 ld [%o0+24], %o4 3200 add %o0, 32, %o0 ! increase src ptr by 32 3201 stwa %o4, [%o1+24]%asi 3202 ld [%o0-4], %o4 3203 add %o1, 32, %o1 ! increase dst ptr by 32 3204 bgu,pt %ncc, .co_medw32 ! repeat if at least 32 bytes left 3205 stwa %o4, [%o1-4]%asi 3206.co_medw31: 3207 addcc %o2, 24, %o2 ! adjust count to be off by 7 3208 ble,pt %ncc, .co_medw7 ! skip if 7 or fewer bytes left 3209 nop ! 3210.co_medw15: 3211 ld [%o0], %o4 ! move a block of 8 bytes 3212 subcc %o2, 8, %o2 ! decrement length count 3213 stwa %o4, [%o1]%asi 3214 add %o0, 8, %o0 ! increase src ptr by 8 3215 ld [%o0-4], %o4 3216 add %o1, 8, %o1 ! increase dst ptr by 8 3217 bgu,pt %ncc, .co_medw15 3218 stwa %o4, [%o1-4]%asi 3219.co_medw7: 3220 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3221 bz,pt %ncc, .co_smallx ! exit if finished 3222 cmp %o2, 4 3223 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3224 nop ! 3225 ld [%o0], %o4 ! move 4 bytes 3226 add %o0, 4, %o0 ! increase src ptr by 4 3227 add %o1, 4, %o1 ! increase dst ptr by 4 3228 subcc %o2, 4, %o2 ! decrease count by 4 3229 bnz .co_small3x 3230 stwa %o4, [%o1-4]%asi 3231 membar #Sync 3232 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3233 retl 3234 mov %g0, %o0 3235 3236.co_medhalf: 3237 subcc %o2, 31, %o2 ! adjust length to allow cc test 3238 ble,pt %ncc, .co_medh31 3239 nop 3240.co_medh32: ! load and store block of 32 bytes 3241 3242 lduh [%o0], %o4 ! move 32 bytes 3243 subcc %o2, 32, %o2 ! decrement length count 3244 lduw [%o0+2], %o3 3245 sllx %o4, 48, %o4 3246 sllx %o3, 16, %o3 3247 or %o4, %o3, %o3 3248 lduh [%o0+6], %o4 3249 or %o4, %o3, %o4 3250 stxa %o4, [%o1]%asi 3251 3252 lduh [%o0+8], %o4 3253 lduw [%o0+10], %o3 3254 sllx %o4, 48, %o4 3255 sllx %o3, 16, %o3 3256 or %o4, %o3, %o3 3257 lduh [%o0+14], %o4 3258 or %o4, %o3, %o4 3259 stxa %o4, [%o1+8]%asi 3260 3261 lduh [%o0+16], %o4 3262 lduw [%o0+18], %o3 3263 sllx %o4, 48, %o4 3264 sllx %o3, 16, %o3 3265 or %o4, %o3, %o3 3266 lduh [%o0+22], %o4 3267 or %o4, %o3, %o4 3268 stxa %o4, [%o1+16]%asi 3269 3270 add %o0, 32, %o0 ! increase src ptr by 32 3271 add %o1, 32, %o1 ! increase dst ptr by 32 3272 3273 lduh [%o0-8], %o4 3274 lduw [%o0-6], %o3 3275 sllx %o4, 48, %o4 3276 sllx %o3, 16, %o3 3277 or %o4, %o3, %o3 3278 lduh [%o0-2], %o4 3279 or %o3, %o4, %o4 3280 bgu,pt %ncc, .co_medh32 ! repeat if at least 32 bytes left 3281 stxa %o4, [%o1-8]%asi 3282 3283.co_medh31: 3284 addcc %o2, 24, %o2 ! adjust count to be off by 7 3285 ble,pt %ncc, .co_medh7 ! skip if 7 or fewer bytes left 3286 nop ! 3287.co_medh15: 3288 lduh [%o0], %o4 ! move 16 bytes 3289 subcc %o2, 8, %o2 ! decrement length count 3290 lduw [%o0+2], %o3 3291 sllx %o4, 48, %o4 3292 sllx %o3, 16, %o3 3293 or %o4, %o3, %o3 3294 add %o1, 8, %o1 ! increase dst ptr by 8 3295 lduh [%o0+6], %o4 3296 add %o0, 8, %o0 ! increase src ptr by 8 3297 or %o4, %o3, %o4 3298 bgu,pt %ncc, .co_medh15 3299 stxa %o4, [%o1-8]%asi 3300.co_medh7: 3301 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3302 bz,pt %ncc, .co_smallx ! exit if finished 3303 cmp %o2, 4 3304 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3305 nop ! 3306 lduh [%o0], %o4 3307 sll %o4, 16, %o4 3308 lduh [%o0+2], %o3 3309 or %o3, %o4, %o4 3310 subcc %o2, 4, %o2 3311 add %o0, 4, %o0 3312 add %o1, 4, %o1 3313 bnz .co_small3x 3314 stwa %o4, [%o1-4]%asi 3315 membar #Sync 3316 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3317 retl 3318 mov %g0, %o0 3319 3320 .align 16 3321.co_med_byte: 3322 bnz,pt %ncc, .co_medbh32a ! go to correct byte move 3323 subcc %o2, 31, %o2 ! adjust length to allow cc test 3324 ble,pt %ncc, .co_medb31 3325 nop 3326.co_medb32: ! Alignment 1 or 5 3327 subcc %o2, 32, %o2 ! decrement length count 3328 3329 ldub [%o0], %o4 ! load and store a block of 32 bytes 3330 sllx %o4, 56, %o3 3331 lduh [%o0+1], %o4 3332 sllx %o4, 40, %o4 3333 or %o4, %o3, %o3 3334 lduw [%o0+3], %o4 3335 sllx %o4, 8, %o4 3336 or %o4, %o3, %o3 3337 ldub [%o0+7], %o4 3338 or %o4, %o3, %o4 3339 stxa %o4, [%o1]%asi 3340 3341 ldub [%o0+8], %o4 3342 sllx %o4, 56, %o3 3343 lduh [%o0+9], %o4 3344 sllx %o4, 40, %o4 3345 or %o4, %o3, %o3 3346 lduw [%o0+11], %o4 3347 sllx %o4, 8, %o4 3348 or %o4, %o3, %o3 3349 ldub [%o0+15], %o4 3350 or %o4, %o3, %o4 3351 stxa %o4, [%o1+8]%asi 3352 3353 ldub [%o0+16], %o4 3354 sllx %o4, 56, %o3 3355 lduh [%o0+17], %o4 3356 sllx %o4, 40, %o4 3357 or %o4, %o3, %o3 3358 lduw [%o0+19], %o4 3359 sllx %o4, 8, %o4 3360 or %o4, %o3, %o3 3361 ldub [%o0+23], %o4 3362 or %o4, %o3, %o4 3363 stxa %o4, [%o1+16]%asi 3364 3365 add %o0, 32, %o0 ! increase src ptr by 32 3366 add %o1, 32, %o1 ! increase dst ptr by 32 3367 3368 ldub [%o0-8], %o4 3369 sllx %o4, 56, %o3 3370 lduh [%o0-7], %o4 3371 sllx %o4, 40, %o4 3372 or %o4, %o3, %o3 3373 lduw [%o0-5], %o4 3374 sllx %o4, 8, %o4 3375 or %o4, %o3, %o3 3376 ldub [%o0-1], %o4 3377 or %o4, %o3, %o4 3378 bgu,pt %ncc, .co_medb32 ! repeat if at least 32 bytes left 3379 stxa %o4, [%o1-8]%asi 3380 3381.co_medb31: ! 31 or fewer bytes remaining 3382 addcc %o2, 24, %o2 ! adjust count to be off by 7 3383 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left 3384 nop ! 3385.co_medb15: 3386 3387 ldub [%o0], %o4 ! load and store a block of 8 bytes 3388 subcc %o2, 8, %o2 ! decrement length count 3389 sllx %o4, 56, %o3 3390 lduh [%o0+1], %o4 3391 sllx %o4, 40, %o4 3392 or %o4, %o3, %o3 3393 lduw [%o0+3], %o4 3394 add %o1, 8, %o1 ! increase dst ptr by 16 3395 sllx %o4, 8, %o4 3396 or %o4, %o3, %o3 3397 ldub [%o0+7], %o4 3398 add %o0, 8, %o0 ! increase src ptr by 16 3399 or %o4, %o3, %o4 3400 bgu,pt %ncc, .co_medb15 3401 stxa %o4, [%o1-8]%asi 3402.co_medb7: 3403 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3404 bz,pt %ncc, .co_smallx ! exit if finished 3405 cmp %o2, 4 3406 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3407 nop ! 3408 ldub [%o0], %o4 ! move 4 bytes 3409 sll %o4, 24, %o3 3410 lduh [%o0+1], %o4 3411 sll %o4, 8, %o4 3412 or %o4, %o3, %o3 3413 ldub [%o0+3], %o4 3414 or %o4, %o3, %o4 3415 subcc %o2, 4, %o2 3416 add %o0, 4, %o0 3417 add %o1, 4, %o1 3418 bnz .co_small3x 3419 stwa %o4, [%o1-4]%asi 3420 membar #Sync 3421 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3422 retl 3423 mov %g0, %o0 3424 3425 .align 16 3426.co_medbh32a: 3427 ble,pt %ncc, .co_medbh31 3428 nop 3429.co_medbh32: ! Alignment 3 or 7 3430 subcc %o2, 32, %o2 ! decrement length count 3431 3432 ldub [%o0], %o4 ! load and store a block of 32 bytes 3433 sllx %o4, 56, %o3 3434 lduw [%o0+1], %o4 3435 sllx %o4, 24, %o4 3436 or %o4, %o3, %o3 3437 lduh [%o0+5], %o4 3438 sllx %o4, 8, %o4 3439 or %o4, %o3, %o3 3440 ldub [%o0+7], %o4 3441 or %o4, %o3, %o4 3442 stxa %o4, [%o1]%asi 3443 3444 ldub [%o0+8], %o4 3445 sllx %o4, 56, %o3 3446 lduw [%o0+9], %o4 3447 sllx %o4, 24, %o4 3448 or %o4, %o3, %o3 3449 lduh [%o0+13], %o4 3450 sllx %o4, 8, %o4 3451 or %o4, %o3, %o3 3452 ldub [%o0+15], %o4 3453 or %o4, %o3, %o4 3454 stxa %o4, [%o1+8]%asi 3455 3456 ldub [%o0+16], %o4 3457 sllx %o4, 56, %o3 3458 lduw [%o0+17], %o4 3459 sllx %o4, 24, %o4 3460 or %o4, %o3, %o3 3461 lduh [%o0+21], %o4 3462 sllx %o4, 8, %o4 3463 or %o4, %o3, %o3 3464 ldub [%o0+23], %o4 3465 or %o4, %o3, %o4 3466 stxa %o4, [%o1+16]%asi 3467 3468 add %o0, 32, %o0 ! increase src ptr by 32 3469 add %o1, 32, %o1 ! increase dst ptr by 32 3470 3471 ldub [%o0-8], %o4 3472 sllx %o4, 56, %o3 3473 lduw [%o0-7], %o4 3474 sllx %o4, 24, %o4 3475 or %o4, %o3, %o3 3476 lduh [%o0-3], %o4 3477 sllx %o4, 8, %o4 3478 or %o4, %o3, %o3 3479 ldub [%o0-1], %o4 3480 or %o4, %o3, %o4 3481 bgu,pt %ncc, .co_medbh32 ! repeat if at least 32 bytes left 3482 stxa %o4, [%o1-8]%asi 3483 3484.co_medbh31: 3485 addcc %o2, 24, %o2 ! adjust count to be off by 7 3486 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left 3487 nop ! 3488.co_medbh15: 3489 ldub [%o0], %o4 ! load and store a block of 8 bytes 3490 sllx %o4, 56, %o3 3491 lduw [%o0+1], %o4 3492 sllx %o4, 24, %o4 3493 or %o4, %o3, %o3 3494 lduh [%o0+5], %o4 3495 sllx %o4, 8, %o4 3496 or %o4, %o3, %o3 3497 ldub [%o0+7], %o4 3498 or %o4, %o3, %o4 3499 stxa %o4, [%o1]%asi 3500 subcc %o2, 8, %o2 ! decrement length count 3501 add %o1, 8, %o1 ! increase dst ptr by 8 3502 add %o0, 8, %o0 ! increase src ptr by 8 3503 bgu,pt %ncc, .co_medbh15 3504 stxa %o4, [%o1-8]%asi 3505 ba .co_medb7 3506 nop 3507/* 3508 * End of small copy (no window) code 3509 */ 3510 3511/* 3512 * Long copy code 3513 */ 3514.co_copy_more: 3515 sethi %hi(copyio_fault), %o3 3516 or %o3, %lo(copyio_fault), %o3 3517 membar #Sync 3518 stn %o3, [THREAD_REG + T_LOFAULT] 3519 3520/* 3521 * Following code is for large copies. We know there is at 3522 * least FP_COPY bytes available. FP regs are used, so 3523 * we save registers and fp regs before starting 3524 */ 3525 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3526 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 3527 rd %fprs, %g1 ! check for unused fp 3528 ! if fprs.fef == 0, set it. 3529 ! Setting it when already set costs more than checking 3530 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0 3531 bz,pt %ncc, .co_fp_unused 3532 mov ASI_USER, %asi 3533 BST_FP_TOSTACK(%o3) 3534 ba .co_fp_ready 3535.co_fp_unused: 3536 prefetch [%i0 + (1 * CACHE_LINE)], #one_read 3537 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 3538.co_fp_ready: 3539 rd %gsr, %l5 ! save %gsr value 3540 andcc %i1, 1, %o3 ! is dest byte aligned 3541 bnz,pt %ncc, .co_big_d1 3542.co_big_d1f: ! dest is now half word aligned 3543 andcc %i1, 2, %o3 3544 bnz,pt %ncc, .co_big_d2 3545.co_big_d2f: ! dest is now word aligned 3546 andcc %i1, 4, %o3 ! is dest longword aligned 3547 bnz,pt %ncc, .co_big_d4 3548.co_big_d4f: ! dest is now long word aligned 3549 andcc %i0, 7, %o3 ! is src long word aligned 3550 brnz,pt %o3, .co_big_unal8 3551 prefetch [%i0 + (2 * CACHE_LINE)], #one_read 3552 ! Src and dst are long word aligned 3553 ! align dst to 64 byte boundary 3554 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 3555 brz,pn %o3, .co_al_to_64 3556 nop 3557 sub %o3, 64, %o3 ! %o3 has negative bytes to move 3558 add %i2, %o3, %i2 ! adjust remaining count 3559 andcc %o3, 8, %o4 ! odd long words to move? 3560 brz,pt %o4, .co_al_to_16 3561 nop 3562 add %o3, 8, %o3 3563 ldx [%i0], %o4 3564 add %i0, 8, %i0 ! increment src ptr 3565 stxa %o4, [%i1]ASI_USER 3566 add %i1, 8, %i1 ! increment dst ptr 3567! Dest is aligned on 16 bytes, src 8 byte aligned 3568.co_al_to_16: 3569 andcc %o3, 0x30, %o4 ! move to move? 3570 brz,pt %o4, .co_al_to_64 3571 nop 3572.co_al_mv_16: 3573 add %o3, 16, %o3 3574 ldx [%i0], %o4 3575 stxa %o4, [%i1]ASI_USER 3576 add %i0, 16, %i0 ! increment src ptr 3577 ldx [%i0-8], %o4 3578 add %i1, 8, %i1 ! increment dst ptr 3579 stxa %o4, [%i1]ASI_USER 3580 andcc %o3, 0x30, %o4 3581 brnz,pt %o4, .co_al_mv_16 3582 add %i1, 8, %i1 ! increment dst ptr 3583! Dest is aligned on 64 bytes, src 8 byte aligned 3584.co_al_to_64: 3585 ! Determine source alignment 3586 ! to correct 8 byte offset 3587 andcc %i0, 32, %o3 3588 brnz,pn %o3, .co_aln_1 3589 andcc %i0, 16, %o3 3590 brnz,pn %o3, .co_aln_01 3591 andcc %i0, 8, %o3 3592 brz,pn %o3, .co_aln_000 3593 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3594 ba .co_aln_001 3595 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3596.co_aln_01: 3597 brnz,pn %o3, .co_aln_011 3598 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3599 ba .co_aln_010 3600 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3601.co_aln_1: 3602 andcc %i0, 16, %o3 3603 brnz,pn %o3, .co_aln_11 3604 andcc %i0, 8, %o3 3605 brnz,pn %o3, .co_aln_101 3606 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3607 ba .co_aln_100 3608 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3609.co_aln_11: 3610 brz,pn %o3, .co_aln_110 3611 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3612 3613.co_aln_111: 3614! Alignment off by 8 bytes 3615 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3616 ldd [%i0], %d0 3617 add %i0, 8, %i0 3618 sub %i2, 8, %i2 3619 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3620 and %i2, 0x7f, %i2 ! residue bytes in %i2 3621 sub %i1, %i0, %i1 3622.co_aln_111_loop: 3623 ldda [%i0]ASI_BLK_P,%d16 ! block load 3624 subcc %o3, 64, %o3 3625 fmovd %d16, %d2 3626 fmovd %d18, %d4 3627 fmovd %d20, %d6 3628 fmovd %d22, %d8 3629 fmovd %d24, %d10 3630 fmovd %d26, %d12 3631 fmovd %d28, %d14 3632 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3633 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3634 add %i0, 64, %i0 3635 fmovd %d30, %d0 3636 bgt,pt %ncc, .co_aln_111_loop 3637 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3638 add %i1, %i0, %i1 3639 3640 stda %d0, [%i1]ASI_USER 3641 ba .co_remain_stuff 3642 add %i1, 8, %i1 3643 ! END OF aln_111 3644 3645.co_aln_110: 3646! Alignment off by 16 bytes 3647 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3648 ldd [%i0], %d0 3649 ldd [%i0+8], %d2 3650 add %i0, 16, %i0 3651 sub %i2, 16, %i2 3652 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3653 and %i2, 0x7f, %i2 ! residue bytes in %i2 3654 sub %i1, %i0, %i1 3655.co_aln_110_loop: 3656 ldda [%i0]ASI_BLK_P,%d16 ! block load 3657 subcc %o3, 64, %o3 3658 fmovd %d16, %d4 3659 fmovd %d18, %d6 3660 fmovd %d20, %d8 3661 fmovd %d22, %d10 3662 fmovd %d24, %d12 3663 fmovd %d26, %d14 3664 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3665 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3666 add %i0, 64, %i0 3667 fmovd %d28, %d0 3668 fmovd %d30, %d2 3669 bgt,pt %ncc, .co_aln_110_loop 3670 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3671 add %i1, %i0, %i1 3672 3673 stda %d0, [%i1]%asi 3674 stda %d2, [%i1+8]%asi 3675 ba .co_remain_stuff 3676 add %i1, 16, %i1 3677 ! END OF aln_110 3678 3679.co_aln_101: 3680! Alignment off by 24 bytes 3681 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3682 ldd [%i0], %d0 3683 ldd [%i0+8], %d2 3684 ldd [%i0+16], %d4 3685 add %i0, 24, %i0 3686 sub %i2, 24, %i2 3687 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3688 and %i2, 0x7f, %i2 ! residue bytes in %i2 3689 sub %i1, %i0, %i1 3690.co_aln_101_loop: 3691 ldda [%i0]ASI_BLK_P,%d16 ! block load 3692 subcc %o3, 64, %o3 3693 fmovd %d16, %d6 3694 fmovd %d18, %d8 3695 fmovd %d20, %d10 3696 fmovd %d22, %d12 3697 fmovd %d24, %d14 3698 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3699 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3700 add %i0, 64, %i0 3701 fmovd %d26, %d0 3702 fmovd %d28, %d2 3703 fmovd %d30, %d4 3704 bgt,pt %ncc, .co_aln_101_loop 3705 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3706 add %i1, %i0, %i1 3707 3708 stda %d0, [%i1]%asi 3709 stda %d2, [%i1+8]%asi 3710 stda %d4, [%i1+16]%asi 3711 ba .co_remain_stuff 3712 add %i1, 24, %i1 3713 ! END OF aln_101 3714 3715.co_aln_100: 3716! Alignment off by 32 bytes 3717 ldd [%i0], %d0 3718 ldd [%i0+8], %d2 3719 ldd [%i0+16],%d4 3720 ldd [%i0+24],%d6 3721 add %i0, 32, %i0 3722 sub %i2, 32, %i2 3723 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3724 and %i2, 0x7f, %i2 ! residue bytes in %i2 3725 sub %i1, %i0, %i1 3726.co_aln_100_loop: 3727 ldda [%i0]ASI_BLK_P,%d16 ! block load 3728 subcc %o3, 64, %o3 3729 fmovd %d16, %d8 3730 fmovd %d18, %d10 3731 fmovd %d20, %d12 3732 fmovd %d22, %d14 3733 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3734 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3735 add %i0, 64, %i0 3736 fmovd %d24, %d0 3737 fmovd %d26, %d2 3738 fmovd %d28, %d4 3739 fmovd %d30, %d6 3740 bgt,pt %ncc, .co_aln_100_loop 3741 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3742 add %i1, %i0, %i1 3743 3744 stda %d0, [%i1]%asi 3745 stda %d2, [%i1+8]%asi 3746 stda %d4, [%i1+16]%asi 3747 stda %d6, [%i1+24]%asi 3748 ba .co_remain_stuff 3749 add %i1, 32, %i1 3750 ! END OF aln_100 3751 3752.co_aln_011: 3753! Alignment off by 40 bytes 3754 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3755 ldd [%i0], %d0 3756 ldd [%i0+8], %d2 3757 ldd [%i0+16], %d4 3758 ldd [%i0+24], %d6 3759 ldd [%i0+32], %d8 3760 add %i0, 40, %i0 3761 sub %i2, 40, %i2 3762 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3763 and %i2, 0x7f, %i2 ! residue bytes in %i2 3764 sub %i1, %i0, %i1 3765.co_aln_011_loop: 3766 ldda [%i0]ASI_BLK_P,%d16 ! block load 3767 subcc %o3, 64, %o3 3768 fmovd %d16, %d10 3769 fmovd %d18, %d12 3770 fmovd %d20, %d14 3771 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3772 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3773 add %i0, 64, %i0 3774 fmovd %d22, %d0 3775 fmovd %d24, %d2 3776 fmovd %d26, %d4 3777 fmovd %d28, %d6 3778 fmovd %d30, %d8 3779 bgt,pt %ncc, .co_aln_011_loop 3780 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3781 add %i1, %i0, %i1 3782 3783 stda %d0, [%i1]%asi 3784 stda %d2, [%i1+8]%asi 3785 stda %d4, [%i1+16]%asi 3786 stda %d6, [%i1+24]%asi 3787 stda %d8, [%i1+32]%asi 3788 ba .co_remain_stuff 3789 add %i1, 40, %i1 3790 ! END OF aln_011 3791 3792.co_aln_010: 3793! Alignment off by 48 bytes 3794 ldd [%i0], %d0 3795 ldd [%i0+8], %d2 3796 ldd [%i0+16], %d4 3797 ldd [%i0+24], %d6 3798 ldd [%i0+32], %d8 3799 ldd [%i0+40], %d10 3800 add %i0, 48, %i0 3801 sub %i2, 48, %i2 3802 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3803 and %i2, 0x7f, %i2 ! residue bytes in %i2 3804 sub %i1, %i0, %i1 3805.co_aln_010_loop: 3806 ldda [%i0]ASI_BLK_P,%d16 ! block load 3807 subcc %o3, 64, %o3 3808 fmovd %d16, %d12 3809 fmovd %d18, %d14 3810 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3811 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3812 add %i0, 64, %i0 3813 fmovd %d20, %d0 3814 fmovd %d22, %d2 3815 fmovd %d24, %d4 3816 fmovd %d26, %d6 3817 fmovd %d28, %d8 3818 fmovd %d30, %d10 3819 bgt,pt %ncc, .co_aln_010_loop 3820 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3821 add %i1, %i0, %i1 3822 3823 stda %d0, [%i1]%asi 3824 stda %d2, [%i1+8]%asi 3825 stda %d4, [%i1+16]%asi 3826 stda %d6, [%i1+24]%asi 3827 stda %d8, [%i1+32]%asi 3828 stda %d10, [%i1+40]%asi 3829 ba .co_remain_stuff 3830 add %i1, 48, %i1 3831 ! END OF aln_010 3832 3833.co_aln_001: 3834! Alignment off by 56 bytes 3835 ldd [%i0], %d0 3836 ldd [%i0+8], %d2 3837 ldd [%i0+16], %d4 3838 ldd [%i0+24], %d6 3839 ldd [%i0+32], %d8 3840 ldd [%i0+40], %d10 3841 ldd [%i0+48], %d12 3842 add %i0, 56, %i0 3843 sub %i2, 56, %i2 3844 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3845 and %i2, 0x7f, %i2 ! residue bytes in %i2 3846 sub %i1, %i0, %i1 3847.co_aln_001_loop: 3848 ldda [%i0]ASI_BLK_P,%d16 ! block load 3849 subcc %o3, 64, %o3 3850 fmovd %d16, %d14 3851 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3852 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3853 add %i0, 64, %i0 3854 fmovd %d18, %d0 3855 fmovd %d20, %d2 3856 fmovd %d22, %d4 3857 fmovd %d24, %d6 3858 fmovd %d26, %d8 3859 fmovd %d28, %d10 3860 fmovd %d30, %d12 3861 bgt,pt %ncc, .co_aln_001_loop 3862 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3863 add %i1, %i0, %i1 3864 3865 stda %d0, [%i1]%asi 3866 stda %d2, [%i1+8]%asi 3867 stda %d4, [%i1+16]%asi 3868 stda %d6, [%i1+24]%asi 3869 stda %d8, [%i1+32]%asi 3870 stda %d10, [%i1+40]%asi 3871 stda %d12, [%i1+48]%asi 3872 ba .co_remain_stuff 3873 add %i1, 56, %i1 3874 ! END OF aln_001 3875 3876.co_aln_000: 3877 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3878 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3879 and %i2, 0x7f, %i2 ! residue bytes in %i2 3880 sub %i1, %i0, %i1 3881.co_aln_000_loop: 3882 ldda [%i0]ASI_BLK_P,%d0 3883 subcc %o3, 64, %o3 3884 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3885 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3886 add %i0, 64, %i0 3887 bgt,pt %ncc, .co_aln_000_loop 3888 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3889 add %i1, %i0, %i1 3890 3891 ! END OF aln_000 3892 3893.co_remain_stuff: 3894 subcc %i2, 31, %i2 ! adjust length to allow cc test 3895 ble,pt %ncc, .co_aln_31 3896 nop 3897.co_aln_32: 3898 ldx [%i0], %o4 ! move 32 bytes 3899 subcc %i2, 32, %i2 ! decrement length count by 32 3900 stxa %o4, [%i1]%asi 3901 ldx [%i0+8], %o4 3902 stxa %o4, [%i1+8]%asi 3903 ldx [%i0+16], %o4 3904 add %i0, 32, %i0 ! increase src ptr by 32 3905 stxa %o4, [%i1+16]%asi 3906 ldx [%i0-8], %o4 3907 add %i1, 32, %i1 ! increase dst ptr by 32 3908 bgu,pt %ncc, .co_aln_32 ! repeat if at least 32 bytes left 3909 stxa %o4, [%i1-8]%asi 3910.co_aln_31: 3911 addcc %i2, 24, %i2 ! adjust count to be off by 7 3912 ble,pt %ncc, .co_aln_7 ! skip if 7 or fewer bytes left 3913 nop ! 3914.co_aln_15: 3915 ldx [%i0], %o4 ! move 8 bytes 3916 add %i0, 8, %i0 ! increase src ptr by 8 3917 subcc %i2, 8, %i2 ! decrease count by 8 3918 add %i1, 8, %i1 ! increase dst ptr by 8 3919 bgu,pt %ncc, .co_aln_15 3920 stxa %o4, [%i1-8]%asi 3921.co_aln_7: 3922 addcc %i2, 7, %i2 ! finish adjustment of remaining count 3923 bz,pt %ncc, .co_exit ! exit if finished 3924 cmp %i2, 4 3925 blt,pt %ncc, .co_unaln3x ! skip if less than 4 bytes left 3926 nop ! 3927 ld [%i0], %o4 ! move 4 bytes 3928 add %i0, 4, %i0 ! increase src ptr by 4 3929 add %i1, 4, %i1 ! increase dst ptr by 4 3930 subcc %i2, 4, %i2 ! decrease count by 4 3931 bnz .co_unaln3x 3932 stwa %o4, [%i1-4]%asi 3933 ba .co_exit 3934 nop 3935 3936 ! destination alignment code 3937.co_big_d1: 3938 ldub [%i0], %o4 ! move a byte 3939 add %i0, 1, %i0 3940 stba %o4, [%i1]ASI_USER 3941 add %i1, 1, %i1 3942 andcc %i1, 2, %o3 3943 bz,pt %ncc, .co_big_d2f 3944 sub %i2, 1, %i2 3945.co_big_d2: 3946 ldub [%i0], %o4 ! move a half-word (src align unknown) 3947 ldub [%i0+1], %o3 3948 add %i0, 2, %i0 3949 sll %o4, 8, %o4 ! position 3950 or %o4, %o3, %o4 ! merge 3951 stha %o4, [%i1]ASI_USER 3952 add %i1, 2, %i1 3953 andcc %i1, 4, %o3 ! is dest longword aligned 3954 bz,pt %ncc, .co_big_d4f 3955 sub %i2, 2, %i2 3956.co_big_d4: ! dest is at least word aligned 3957 nop 3958 ldub [%i0], %o4 ! move a word (src align unknown) 3959 ldub [%i0+1], %o3 3960 sll %o4, 24, %o4 ! position 3961 sll %o3, 16, %o3 ! position 3962 or %o4, %o3, %o3 ! merge 3963 ldub [%i0+2], %o4 3964 sll %o4, 8, %o4 ! position 3965 or %o4, %o3, %o3 ! merge 3966 ldub [%i0+3], %o4 3967 or %o4, %o3, %o4 ! merge 3968 stwa %o4,[%i1]ASI_USER ! store four bytes 3969 add %i0, 4, %i0 ! adjust src by 4 3970 add %i1, 4, %i1 ! adjust dest by 4 3971 ba .co_big_d4f 3972 sub %i2, 4, %i2 ! adjust count by 4 3973 3974 3975 ! Dst is on 8 byte boundary; src is not; 3976.co_big_unal8: 3977 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned? 3978 bz %ncc, .co_unalnsrc 3979 sub %o3, 64, %o3 ! %o3 will be multiple of 8 3980 neg %o3 ! bytes until dest is 64 byte aligned 3981 sub %i2, %o3, %i2 ! update cnt with bytes to be moved 3982 ! Move bytes according to source alignment 3983 andcc %i0, 0x1, %o4 3984 bnz %ncc, .co_unalnbyte ! check for byte alignment 3985 nop 3986 andcc %i0, 2, %o4 ! check for half word alignment 3987 bnz %ncc, .co_unalnhalf 3988 nop 3989 ! Src is word aligned, move bytes until dest 64 byte aligned 3990.co_unalnword: 3991 ld [%i0], %o4 ! load 4 bytes 3992 stwa %o4, [%i1]%asi ! and store 4 bytes 3993 ld [%i0+4], %o4 ! load 4 bytes 3994 add %i0, 8, %i0 ! increase src ptr by 8 3995 stwa %o4, [%i1+4]%asi ! and store 4 bytes 3996 subcc %o3, 8, %o3 ! decrease count by 8 3997 bnz %ncc, .co_unalnword 3998 add %i1, 8, %i1 ! increase dst ptr by 8 3999 ba .co_unalnsrc 4000 nop 4001 4002 ! Src is half-word aligned, move bytes until dest 64 byte aligned 4003.co_unalnhalf: 4004 lduh [%i0], %o4 ! load 2 bytes 4005 sllx %o4, 32, %i3 ! shift left 4006 lduw [%i0+2], %o4 4007 or %o4, %i3, %i3 4008 sllx %i3, 16, %i3 4009 lduh [%i0+6], %o4 4010 or %o4, %i3, %i3 4011 stxa %i3, [%i1]ASI_USER 4012 add %i0, 8, %i0 4013 subcc %o3, 8, %o3 4014 bnz %ncc, .co_unalnhalf 4015 add %i1, 8, %i1 4016 ba .co_unalnsrc 4017 nop 4018 4019 ! Src is Byte aligned, move bytes until dest 64 byte aligned 4020.co_unalnbyte: 4021 sub %i1, %i0, %i1 ! share pointer advance 4022.co_unalnbyte_loop: 4023 ldub [%i0], %o4 4024 sllx %o4, 56, %i3 4025 lduh [%i0+1], %o4 4026 sllx %o4, 40, %o4 4027 or %o4, %i3, %i3 4028 lduh [%i0+3], %o4 4029 sllx %o4, 24, %o4 4030 or %o4, %i3, %i3 4031 lduh [%i0+5], %o4 4032 sllx %o4, 8, %o4 4033 or %o4, %i3, %i3 4034 ldub [%i0+7], %o4 4035 or %o4, %i3, %i3 4036 stxa %i3, [%i1+%i0]ASI_USER 4037 subcc %o3, 8, %o3 4038 bnz %ncc, .co_unalnbyte_loop 4039 add %i0, 8, %i0 4040 add %i1,%i0, %i1 ! restore pointer 4041 4042 ! Destination is now block (64 byte aligned), src is not 8 byte aligned 4043.co_unalnsrc: 4044 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size 4045 and %i2, 0x3f, %i2 ! residue bytes in %i2 4046 add %i2, 64, %i2 ! Insure we don't load beyond 4047 sub %i3, 64, %i3 ! end of source buffer 4048 4049 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address 4050 prefetch [%o4 + (3 * CACHE_LINE)], #one_read 4051 alignaddr %i0, %g0, %g0 ! generate %gsr 4052 add %i0, %i3, %i0 ! advance %i0 to after blocks 4053 ! 4054 ! Determine source alignment to correct 8 byte offset 4055 andcc %i0, 0x20, %o3 4056 brnz,pn %o3, .co_unaln_1 4057 andcc %i0, 0x10, %o3 4058 brnz,pn %o3, .co_unaln_01 4059 andcc %i0, 0x08, %o3 4060 brz,a %o3, .co_unaln_000 4061 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4062 ba .co_unaln_001 4063 nop 4064.co_unaln_01: 4065 brnz,a %o3, .co_unaln_011 4066 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4067 ba .co_unaln_010 4068 nop 4069.co_unaln_1: 4070 brnz,pn %o3, .co_unaln_11 4071 andcc %i0, 0x08, %o3 4072 brnz,a %o3, .co_unaln_101 4073 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4074 ba .co_unaln_100 4075 nop 4076.co_unaln_11: 4077 brz,pn %o3, .co_unaln_110 4078 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 4079 4080.co_unaln_111: 4081 ldd [%o4+56], %d14 4082.co_unaln_111_loop: 4083 add %o4, 64, %o4 4084 ldda [%o4]ASI_BLK_P, %d16 4085 faligndata %d14, %d16, %d48 4086 faligndata %d16, %d18, %d50 4087 faligndata %d18, %d20, %d52 4088 faligndata %d20, %d22, %d54 4089 faligndata %d22, %d24, %d56 4090 faligndata %d24, %d26, %d58 4091 faligndata %d26, %d28, %d60 4092 faligndata %d28, %d30, %d62 4093 fmovd %d30, %d14 4094 stda %d48, [%i1]ASI_BLK_AIUS 4095 subcc %i3, 64, %i3 4096 add %i1, 64, %i1 4097 bgu,pt %ncc, .co_unaln_111_loop 4098 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4099 ba .co_unaln_done 4100 nop 4101 4102.co_unaln_110: 4103 ldd [%o4+48], %d12 4104 ldd [%o4+56], %d14 4105.co_unaln_110_loop: 4106 add %o4, 64, %o4 4107 ldda [%o4]ASI_BLK_P, %d16 4108 faligndata %d12, %d14, %d48 4109 faligndata %d14, %d16, %d50 4110 faligndata %d16, %d18, %d52 4111 faligndata %d18, %d20, %d54 4112 faligndata %d20, %d22, %d56 4113 faligndata %d22, %d24, %d58 4114 faligndata %d24, %d26, %d60 4115 faligndata %d26, %d28, %d62 4116 fmovd %d28, %d12 4117 fmovd %d30, %d14 4118 stda %d48, [%i1]ASI_BLK_AIUS 4119 subcc %i3, 64, %i3 4120 add %i1, 64, %i1 4121 bgu,pt %ncc, .co_unaln_110_loop 4122 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4123 ba .co_unaln_done 4124 nop 4125 4126.co_unaln_101: 4127 ldd [%o4+40], %d10 4128 ldd [%o4+48], %d12 4129 ldd [%o4+56], %d14 4130.co_unaln_101_loop: 4131 add %o4, 64, %o4 4132 ldda [%o4]ASI_BLK_P, %d16 4133 faligndata %d10, %d12, %d48 4134 faligndata %d12, %d14, %d50 4135 faligndata %d14, %d16, %d52 4136 faligndata %d16, %d18, %d54 4137 faligndata %d18, %d20, %d56 4138 faligndata %d20, %d22, %d58 4139 faligndata %d22, %d24, %d60 4140 faligndata %d24, %d26, %d62 4141 fmovd %d26, %d10 4142 fmovd %d28, %d12 4143 fmovd %d30, %d14 4144 stda %d48, [%i1]ASI_BLK_AIUS 4145 subcc %i3, 64, %i3 4146 add %i1, 64, %i1 4147 bgu,pt %ncc, .co_unaln_101_loop 4148 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4149 ba .co_unaln_done 4150 nop 4151 4152.co_unaln_100: 4153 ldd [%o4+32], %d8 4154 ldd [%o4+40], %d10 4155 ldd [%o4+48], %d12 4156 ldd [%o4+56], %d14 4157.co_unaln_100_loop: 4158 add %o4, 64, %o4 4159 ldda [%o4]ASI_BLK_P, %d16 4160 faligndata %d8, %d10, %d48 4161 faligndata %d10, %d12, %d50 4162 faligndata %d12, %d14, %d52 4163 faligndata %d14, %d16, %d54 4164 faligndata %d16, %d18, %d56 4165 faligndata %d18, %d20, %d58 4166 faligndata %d20, %d22, %d60 4167 faligndata %d22, %d24, %d62 4168 fmovd %d24, %d8 4169 fmovd %d26, %d10 4170 fmovd %d28, %d12 4171 fmovd %d30, %d14 4172 stda %d48, [%i1]ASI_BLK_AIUS 4173 subcc %i3, 64, %i3 4174 add %i1, 64, %i1 4175 bgu,pt %ncc, .co_unaln_100_loop 4176 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4177 ba .co_unaln_done 4178 nop 4179 4180.co_unaln_011: 4181 ldd [%o4+24], %d6 4182 ldd [%o4+32], %d8 4183 ldd [%o4+40], %d10 4184 ldd [%o4+48], %d12 4185 ldd [%o4+56], %d14 4186.co_unaln_011_loop: 4187 add %o4, 64, %o4 4188 ldda [%o4]ASI_BLK_P, %d16 4189 faligndata %d6, %d8, %d48 4190 faligndata %d8, %d10, %d50 4191 faligndata %d10, %d12, %d52 4192 faligndata %d12, %d14, %d54 4193 faligndata %d14, %d16, %d56 4194 faligndata %d16, %d18, %d58 4195 faligndata %d18, %d20, %d60 4196 faligndata %d20, %d22, %d62 4197 fmovd %d22, %d6 4198 fmovd %d24, %d8 4199 fmovd %d26, %d10 4200 fmovd %d28, %d12 4201 fmovd %d30, %d14 4202 stda %d48, [%i1]ASI_BLK_AIUS 4203 subcc %i3, 64, %i3 4204 add %i1, 64, %i1 4205 bgu,pt %ncc, .co_unaln_011_loop 4206 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4207 ba .co_unaln_done 4208 nop 4209 4210.co_unaln_010: 4211 ldd [%o4+16], %d4 4212 ldd [%o4+24], %d6 4213 ldd [%o4+32], %d8 4214 ldd [%o4+40], %d10 4215 ldd [%o4+48], %d12 4216 ldd [%o4+56], %d14 4217.co_unaln_010_loop: 4218 add %o4, 64, %o4 4219 ldda [%o4]ASI_BLK_P, %d16 4220 faligndata %d4, %d6, %d48 4221 faligndata %d6, %d8, %d50 4222 faligndata %d8, %d10, %d52 4223 faligndata %d10, %d12, %d54 4224 faligndata %d12, %d14, %d56 4225 faligndata %d14, %d16, %d58 4226 faligndata %d16, %d18, %d60 4227 faligndata %d18, %d20, %d62 4228 fmovd %d20, %d4 4229 fmovd %d22, %d6 4230 fmovd %d24, %d8 4231 fmovd %d26, %d10 4232 fmovd %d28, %d12 4233 fmovd %d30, %d14 4234 stda %d48, [%i1]ASI_BLK_AIUS 4235 subcc %i3, 64, %i3 4236 add %i1, 64, %i1 4237 bgu,pt %ncc, .co_unaln_010_loop 4238 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4239 ba .co_unaln_done 4240 nop 4241 4242.co_unaln_001: 4243 ldd [%o4+8], %d2 4244 ldd [%o4+16], %d4 4245 ldd [%o4+24], %d6 4246 ldd [%o4+32], %d8 4247 ldd [%o4+40], %d10 4248 ldd [%o4+48], %d12 4249 ldd [%o4+56], %d14 4250.co_unaln_001_loop: 4251 add %o4, 64, %o4 4252 ldda [%o4]ASI_BLK_P, %d16 4253 faligndata %d2, %d4, %d48 4254 faligndata %d4, %d6, %d50 4255 faligndata %d6, %d8, %d52 4256 faligndata %d8, %d10, %d54 4257 faligndata %d10, %d12, %d56 4258 faligndata %d12, %d14, %d58 4259 faligndata %d14, %d16, %d60 4260 faligndata %d16, %d18, %d62 4261 fmovd %d18, %d2 4262 fmovd %d20, %d4 4263 fmovd %d22, %d6 4264 fmovd %d24, %d8 4265 fmovd %d26, %d10 4266 fmovd %d28, %d12 4267 fmovd %d30, %d14 4268 stda %d48, [%i1]ASI_BLK_AIUS 4269 subcc %i3, 64, %i3 4270 add %i1, 64, %i1 4271 bgu,pt %ncc, .co_unaln_001_loop 4272 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4273 ba .co_unaln_done 4274 nop 4275 4276.co_unaln_000: 4277 ldda [%o4]ASI_BLK_P, %d0 4278.co_unaln_000_loop: 4279 add %o4, 64, %o4 4280 ldda [%o4]ASI_BLK_P, %d16 4281 faligndata %d0, %d2, %d48 4282 faligndata %d2, %d4, %d50 4283 faligndata %d4, %d6, %d52 4284 faligndata %d6, %d8, %d54 4285 faligndata %d8, %d10, %d56 4286 faligndata %d10, %d12, %d58 4287 faligndata %d12, %d14, %d60 4288 faligndata %d14, %d16, %d62 4289 fmovd %d16, %d0 4290 fmovd %d18, %d2 4291 fmovd %d20, %d4 4292 fmovd %d22, %d6 4293 fmovd %d24, %d8 4294 fmovd %d26, %d10 4295 fmovd %d28, %d12 4296 fmovd %d30, %d14 4297 stda %d48, [%i1]ASI_BLK_AIUS 4298 subcc %i3, 64, %i3 4299 add %i1, 64, %i1 4300 bgu,pt %ncc, .co_unaln_000_loop 4301 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4302 4303.co_unaln_done: 4304 ! Handle trailing bytes, 64 to 127 4305 ! Dest long word aligned, Src not long word aligned 4306 cmp %i2, 15 4307 bleu %ncc, .co_unaln_short 4308 4309 andn %i2, 0x7, %i3 ! %i3 is multiple of 8 4310 and %i2, 0x7, %i2 ! residue bytes in %i2 4311 add %i2, 8, %i2 4312 sub %i3, 8, %i3 ! insure we don't load past end of src 4313 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address 4314 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8 4315 ldd [%o4], %d0 ! fetch partial word 4316.co_unaln_by8: 4317 ldd [%o4+8], %d2 4318 add %o4, 8, %o4 4319 faligndata %d0, %d2, %d16 4320 subcc %i3, 8, %i3 4321 stda %d16, [%i1]%asi 4322 fmovd %d2, %d0 4323 bgu,pt %ncc, .co_unaln_by8 4324 add %i1, 8, %i1 4325 4326.co_unaln_short: 4327 cmp %i2, 8 4328 blt,pt %ncc, .co_unalnfin 4329 nop 4330 ldub [%i0], %o4 4331 sll %o4, 24, %o3 4332 ldub [%i0+1], %o4 4333 sll %o4, 16, %o4 4334 or %o4, %o3, %o3 4335 ldub [%i0+2], %o4 4336 sll %o4, 8, %o4 4337 or %o4, %o3, %o3 4338 ldub [%i0+3], %o4 4339 or %o4, %o3, %o3 4340 stwa %o3, [%i1]%asi 4341 ldub [%i0+4], %o4 4342 sll %o4, 24, %o3 4343 ldub [%i0+5], %o4 4344 sll %o4, 16, %o4 4345 or %o4, %o3, %o3 4346 ldub [%i0+6], %o4 4347 sll %o4, 8, %o4 4348 or %o4, %o3, %o3 4349 ldub [%i0+7], %o4 4350 or %o4, %o3, %o3 4351 stwa %o3, [%i1+4]%asi 4352 add %i0, 8, %i0 4353 add %i1, 8, %i1 4354 sub %i2, 8, %i2 4355.co_unalnfin: 4356 cmp %i2, 4 4357 blt,pt %ncc, .co_unalnz 4358 tst %i2 4359 ldub [%i0], %o3 ! read byte 4360 subcc %i2, 4, %i2 ! reduce count by 4 4361 sll %o3, 24, %o3 ! position 4362 ldub [%i0+1], %o4 4363 sll %o4, 16, %o4 ! position 4364 or %o4, %o3, %o3 ! merge 4365 ldub [%i0+2], %o4 4366 sll %o4, 8, %o4 ! position 4367 or %o4, %o3, %o3 ! merge 4368 add %i1, 4, %i1 ! advance dst by 4 4369 ldub [%i0+3], %o4 4370 add %i0, 4, %i0 ! advance src by 4 4371 or %o4, %o3, %o4 ! merge 4372 bnz,pt %ncc, .co_unaln3x 4373 stwa %o4, [%i1-4]%asi 4374 ba .co_exit 4375 nop 4376.co_unalnz: 4377 bz,pt %ncc, .co_exit 4378 wr %l5, %g0, %gsr ! restore %gsr 4379.co_unaln3x: ! Exactly 1, 2, or 3 bytes remain 4380 subcc %i2, 1, %i2 ! reduce count for cc test 4381 ldub [%i0], %o4 ! load one byte 4382 bz,pt %ncc, .co_exit 4383 stba %o4, [%i1]%asi ! store one byte 4384 ldub [%i0+1], %o4 ! load second byte 4385 subcc %i2, 1, %i2 4386 bz,pt %ncc, .co_exit 4387 stba %o4, [%i1+1]%asi ! store second byte 4388 ldub [%i0+2], %o4 ! load third byte 4389 stba %o4, [%i1+2]%asi ! store third byte 4390.co_exit: 4391 brnz %g1, .co_fp_restore 4392 nop 4393 FZERO 4394 wr %g1, %g0, %fprs 4395 ba,pt %ncc, .co_ex2 4396 membar #Sync 4397.co_fp_restore: 4398 BLD_FP_FROMSTACK(%o4) 4399.co_ex2: 4400 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 4401 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 4402 ret 4403 restore %g0, 0, %o0 4404 4405.copyout_err: 4406 ldn [THREAD_REG + T_COPYOPS], %o4 4407 brz %o4, 2f 4408 nop 4409 ldn [%o4 + CP_COPYOUT], %g2 4410 jmp %g2 4411 nop 44122: 4413 retl 4414 mov -1, %o0 4415 4416#else /* NIAGARA_IMPL */ 4417.do_copyout: 4418 ! 4419 ! Check the length and bail if zero. 4420 ! 4421 tst %o2 4422 bnz,pt %ncc, 1f 4423 nop 4424 retl 4425 clr %o0 44261: 4427 sethi %hi(copyio_fault), %o4 4428 or %o4, %lo(copyio_fault), %o4 4429 sethi %hi(copyio_fault_nowindow), %o3 4430 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 4431 or %o3, %lo(copyio_fault_nowindow), %o3 4432 membar #Sync 4433 stn %o3, [THREAD_REG + T_LOFAULT] 4434 4435 mov %o0, SAVE_SRC 4436 mov %o1, SAVE_DST 4437 mov %o2, SAVE_COUNT 4438 4439 ! 4440 ! Check to see if we're more than SMALL_LIMIT (7 bytes). 4441 ! Run in leaf mode, using the %o regs as our input regs. 4442 ! 4443 subcc %o2, SMALL_LIMIT, %o3 4444 bgu,a,pt %ncc, .dco_ns 4445 or %o0, %o1, %o3 4446 ! 4447 ! What was previously ".small_copyout" 4448 ! Do full differenced copy. 4449 ! 4450.dcobcp: 4451 sub %g0, %o2, %o3 ! negate count 4452 add %o0, %o2, %o0 ! make %o0 point at the end 4453 add %o1, %o2, %o1 ! make %o1 point at the end 4454 ba,pt %ncc, .dcocl 4455 ldub [%o0 + %o3], %o4 ! load first byte 4456 ! 4457 ! %o0 and %o2 point at the end and remain pointing at the end 4458 ! of their buffers. We pull things out by adding %o3 (which is 4459 ! the negation of the length) to the buffer end which gives us 4460 ! the curent location in the buffers. By incrementing %o3 we walk 4461 ! through both buffers without having to bump each buffer's 4462 ! pointer. A very fast 4 instruction loop. 4463 ! 4464 .align 16 4465.dcocl: 4466 stba %o4, [%o1 + %o3]ASI_USER 4467 inccc %o3 4468 bl,a,pt %ncc, .dcocl 4469 ldub [%o0 + %o3], %o4 4470 ! 4471 ! We're done. Go home. 4472 ! 4473 membar #Sync 4474 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 4475 retl 4476 clr %o0 4477 ! 4478 ! Try aligned copies from here. 4479 ! 4480.dco_ns: 4481 ! %o0 = kernel addr (to be copied from) 4482 ! %o1 = user addr (to be copied to) 4483 ! %o2 = length 4484 ! %o3 = %o1 | %o2 (used for alignment checking) 4485 ! %o4 is alternate lo_fault 4486 ! %o5 is original lo_fault 4487 ! 4488 ! See if we're single byte aligned. If we are, check the 4489 ! limit for single byte copies. If we're smaller or equal, 4490 ! bounce to the byte for byte copy loop. Otherwise do it in 4491 ! HW (if enabled). 4492 ! 4493 btst 1, %o3 4494 bz,pt %icc, .dcoh8 4495 btst 7, %o3 4496 ! 4497 ! Single byte aligned. Do we do it via HW or via 4498 ! byte for byte? Do a quick no memory reference 4499 ! check to pick up small copies. 4500 ! 4501 sethi %hi(hw_copy_limit_1), %o3 4502 ! 4503 ! Big enough that we need to check the HW limit for 4504 ! this size copy. 4505 ! 4506 ld [%o3 + %lo(hw_copy_limit_1)], %o3 4507 ! 4508 ! Is HW copy on? If not, do everything byte for byte. 4509 ! 4510 tst %o3 4511 bz,pn %icc, .dcobcp 4512 subcc %o3, %o2, %o3 4513 ! 4514 ! If we're less than or equal to the single byte copy limit, 4515 ! bop to the copy loop. 4516 ! 4517 bge,pt %ncc, .dcobcp 4518 nop 4519 ! 4520 ! We're big enough and copy is on. Do it with HW. 4521 ! 4522 ba,pt %ncc, .big_copyout 4523 nop 4524.dcoh8: 4525 ! 4526 ! 8 byte aligned? 4527 ! 4528 bnz,a %ncc, .dcoh4 4529 btst 3, %o3 4530 ! 4531 ! See if we're in the "small range". 4532 ! If so, go off and do the copy. 4533 ! If not, load the hard limit. %o3 is 4534 ! available for reuse. 4535 ! 4536 sethi %hi(hw_copy_limit_8), %o3 4537 ld [%o3 + %lo(hw_copy_limit_8)], %o3 4538 ! 4539 ! If it's zero, there's no HW bcopy. 4540 ! Bop off to the aligned copy. 4541 ! 4542 tst %o3 4543 bz,pn %icc, .dcos8 4544 subcc %o3, %o2, %o3 4545 ! 4546 ! We're negative if our size is larger than hw_copy_limit_8. 4547 ! 4548 bge,pt %ncc, .dcos8 4549 nop 4550 ! 4551 ! HW assist is on and we're large enough. Do it. 4552 ! 4553 ba,pt %ncc, .big_copyout 4554 nop 4555.dcos8: 4556 ! 4557 ! Housekeeping for copy loops. Uses same idea as in the byte for 4558 ! byte copy loop above. 4559 ! 4560 add %o0, %o2, %o0 4561 add %o1, %o2, %o1 4562 sub %g0, %o2, %o3 4563 ba,pt %ncc, .dodebc 4564 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy 4565 ! 4566 ! 4 byte aligned? 4567 ! 4568.dcoh4: 4569 bnz,pn %ncc, .dcoh2 4570 ! 4571 ! See if we're in the "small range". 4572 ! If so, go off an do the copy. 4573 ! If not, load the hard limit. %o3 is 4574 ! available for reuse. 4575 ! 4576 sethi %hi(hw_copy_limit_4), %o3 4577 ld [%o3 + %lo(hw_copy_limit_4)], %o3 4578 ! 4579 ! If it's zero, there's no HW bcopy. 4580 ! Bop off to the aligned copy. 4581 ! 4582 tst %o3 4583 bz,pn %icc, .dcos4 4584 subcc %o3, %o2, %o3 4585 ! 4586 ! We're negative if our size is larger than hw_copy_limit_4. 4587 ! 4588 bge,pt %ncc, .dcos4 4589 nop 4590 ! 4591 ! HW assist is on and we're large enough. Do it. 4592 ! 4593 ba,pt %ncc, .big_copyout 4594 nop 4595.dcos4: 4596 add %o0, %o2, %o0 4597 add %o1, %o2, %o1 4598 sub %g0, %o2, %o3 4599 ba,pt %ncc, .dodfbc 4600 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy 4601 ! 4602 ! We must be 2 byte aligned. Off we go. 4603 ! The check for small copies was done in the 4604 ! delay at .dcoh4 4605 ! 4606.dcoh2: 4607 ble %ncc, .dcos2 4608 sethi %hi(hw_copy_limit_2), %o3 4609 ld [%o3 + %lo(hw_copy_limit_2)], %o3 4610 tst %o3 4611 bz,pn %icc, .dcos2 4612 subcc %o3, %o2, %o3 4613 bge,pt %ncc, .dcos2 4614 nop 4615 ! 4616 ! HW is on and we're big enough. Do it. 4617 ! 4618 ba,pt %ncc, .big_copyout 4619 nop 4620.dcos2: 4621 add %o0, %o2, %o0 4622 add %o1, %o2, %o1 4623 sub %g0, %o2, %o3 4624 ba,pt %ncc, .dodtbc 4625 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy 4626.small_copyout: 4627 ! 4628 ! Why are we doing this AGAIN? There are certain conditions in 4629 ! big_copyout that will cause us to forego the HW assisted copies 4630 ! and bounce back to a non-HW assisted copy. This dispatches those 4631 ! copies. Note that we branch around this in the main line code. 4632 ! 4633 ! We make no check for limits or HW enablement here. We've 4634 ! already been told that we're a poster child so just go off 4635 ! and do it. 4636 ! 4637 or %o0, %o1, %o3 4638 btst 1, %o3 4639 bnz %icc, .dcobcp ! Most likely 4640 btst 7, %o3 4641 bz %icc, .dcos8 4642 btst 3, %o3 4643 bz %icc, .dcos4 4644 nop 4645 ba,pt %ncc, .dcos2 4646 nop 4647 .align 32 4648.dodebc: 4649 ldx [%o0 + %o3], %o4 4650 deccc %o2 4651 stxa %o4, [%o1 + %o3]ASI_USER 4652 bg,pt %ncc, .dodebc 4653 addcc %o3, 8, %o3 4654 ! 4655 ! End of copy loop. Check to see if we're done. Most 4656 ! eight byte aligned copies end here. 4657 ! 4658 bz,pt %ncc, .dcofh 4659 nop 4660 ! 4661 ! Something is left - do it byte for byte. 4662 ! 4663 ba,pt %ncc, .dcocl 4664 ldub [%o0 + %o3], %o4 ! load next byte 4665 ! 4666 ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy. 4667 ! 4668 .align 32 4669.dodfbc: 4670 lduw [%o0 + %o3], %o4 4671 deccc %o2 4672 sta %o4, [%o1 + %o3]ASI_USER 4673 bg,pt %ncc, .dodfbc 4674 addcc %o3, 4, %o3 4675 ! 4676 ! End of copy loop. Check to see if we're done. Most 4677 ! four byte aligned copies end here. 4678 ! 4679 bz,pt %ncc, .dcofh 4680 nop 4681 ! 4682 ! Something is left. Do it byte for byte. 4683 ! 4684 ba,pt %ncc, .dcocl 4685 ldub [%o0 + %o3], %o4 ! load next byte 4686 ! 4687 ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to 4688 ! copy. 4689 ! 4690 .align 32 4691.dodtbc: 4692 lduh [%o0 + %o3], %o4 4693 deccc %o2 4694 stha %o4, [%o1 + %o3]ASI_USER 4695 bg,pt %ncc, .dodtbc 4696 addcc %o3, 2, %o3 4697 ! 4698 ! End of copy loop. Anything left? 4699 ! 4700 bz,pt %ncc, .dcofh 4701 nop 4702 ! 4703 ! Deal with the last byte 4704 ! 4705 ldub [%o0 + %o3], %o4 4706 stba %o4, [%o1 + %o3]ASI_USER 4707.dcofh: 4708 membar #Sync 4709 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 4710 retl 4711 clr %o0 4712 4713.big_copyout: 4714 ! We're going to go off and do a block copy. 4715 ! Switch fault handlers and grab a window. We 4716 ! don't do a membar #Sync since we've done only 4717 ! kernel data to this point. 4718 stn %o4, [THREAD_REG + T_LOFAULT] 4719 4720 ! Copy out that reach here are larger than 256 bytes. The 4721 ! hw_copy_limit_1 is set to 256. Never set this limit less 4722 ! 128 bytes. 4723 save %sp, -SA(MINFRAME), %sp 4724.do_block_copyout: 4725 4726 ! Swap src/dst since the code below is memcpy code 4727 ! and memcpy/bcopy have different calling sequences 4728 mov %i1, %i5 4729 mov %i0, %i1 4730 mov %i5, %i0 4731 4732 ! Block (64 bytes) align the destination. 4733 andcc %i0, 0x3f, %i3 ! is dst block aligned 4734 bz %ncc, copyout_blalign ! dst already block aligned 4735 sub %i3, 0x40, %i3 4736 neg %i3 ! bytes till dst 64 bytes aligned 4737 sub %i2, %i3, %i2 ! update i2 with new count 4738 4739 ! Based on source and destination alignment do 4740 ! either 8 bytes, 4 bytes, 2 bytes or byte copy. 4741 4742 ! Is dst & src 8B aligned 4743 or %i0, %i1, %o2 4744 andcc %o2, 0x7, %g0 4745 bz %ncc, .co_alewdcp 4746 nop 4747 4748 ! Is dst & src 4B aligned 4749 andcc %o2, 0x3, %g0 4750 bz %ncc, .co_alwdcp 4751 nop 4752 4753 ! Is dst & src 2B aligned 4754 andcc %o2, 0x1, %g0 4755 bz %ncc, .co_alhlfwdcp 4756 nop 4757 4758 ! 1B aligned 47591: ldub [%i1], %o2 4760 stba %o2, [%i0]ASI_USER 4761 inc %i1 4762 deccc %i3 4763 bgu,pt %ncc, 1b 4764 inc %i0 4765 4766 ba copyout_blalign 4767 nop 4768 4769 ! dst & src 4B aligned 4770.co_alwdcp: 4771 ld [%i1], %o2 4772 sta %o2, [%i0]ASI_USER 4773 add %i1, 0x4, %i1 4774 subcc %i3, 0x4, %i3 4775 bgu,pt %ncc, .co_alwdcp 4776 add %i0, 0x4, %i0 4777 4778 ba copyout_blalign 4779 nop 4780 4781 ! dst & src 2B aligned 4782.co_alhlfwdcp: 4783 lduh [%i1], %o2 4784 stuha %o2, [%i0]ASI_USER 4785 add %i1, 0x2, %i1 4786 subcc %i3, 0x2, %i3 4787 bgu,pt %ncc, .co_alhlfwdcp 4788 add %i0, 0x2, %i0 4789 4790 ba copyout_blalign 4791 nop 4792 4793 ! dst & src 8B aligned 4794.co_alewdcp: 4795 ldx [%i1], %o2 4796 stxa %o2, [%i0]ASI_USER 4797 add %i1, 0x8, %i1 4798 subcc %i3, 0x8, %i3 4799 bgu,pt %ncc, .co_alewdcp 4800 add %i0, 0x8, %i0 4801 4802 ! Now Destination is block (64 bytes) aligned 4803copyout_blalign: 4804 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 4805 sub %i2, %i3, %i2 ! Residue bytes in %i2 4806 4807 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi 4808 4809 andcc %i1, 0xf, %o2 ! is src quadword aligned 4810 bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits) 4811 nop 4812 cmp %o2, 0x8 4813 bg .co_upper_double 4814 nop 4815 bl .co_lower_double 4816 nop 4817 4818 ! Falls through when source offset is equal to 8 i.e. 4819 ! source is double word aligned. 4820 ! In this case no shift/merge of data is required 4821 4822 sub %i1, %o2, %i1 ! align the src at 16 bytes. 4823 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 4824 prefetch [%l0+0x0], #one_read 4825 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4826.co_loop0: 4827 add %i1, 0x10, %i1 4828 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4829 prefetch [%l0+0x40], #one_read 4830 4831 stxa %l3, [%i0+0x0]%asi 4832 stxa %l4, [%i0+0x8]%asi 4833 4834 add %i1, 0x10, %i1 4835 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4836 4837 stxa %l5, [%i0+0x10]%asi 4838 stxa %l2, [%i0+0x18]%asi 4839 4840 add %i1, 0x10, %i1 4841 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4842 4843 stxa %l3, [%i0+0x20]%asi 4844 stxa %l4, [%i0+0x28]%asi 4845 4846 add %i1, 0x10, %i1 4847 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4848 4849 stxa %l5, [%i0+0x30]%asi 4850 stxa %l2, [%i0+0x38]%asi 4851 4852 add %l0, 0x40, %l0 4853 subcc %i3, 0x40, %i3 4854 bgu,pt %xcc, .co_loop0 4855 add %i0, 0x40, %i0 4856 ba .co_blkdone 4857 add %i1, %o2, %i1 ! increment the source by src offset 4858 ! the src offset was stored in %o2 4859 4860.co_lower_double: 4861 4862 sub %i1, %o2, %i1 ! align the src at 16 bytes. 4863 sll %o2, 3, %o0 ! %o0 left shift 4864 mov 0x40, %o1 4865 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 4866 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 4867 prefetch [%l0+0x0], #one_read 4868 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has 4869 ! complete data 4870.co_loop1: 4871 add %i1, 0x10, %i1 4872 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data 4873 ! for this read. 4874 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 4875 ! into %l2 and %l3 4876 prefetch [%l0+0x40], #one_read 4877 4878 stxa %l2, [%i0+0x0]%asi 4879 stxa %l3, [%i0+0x8]%asi 4880 4881 add %i1, 0x10, %i1 4882 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4883 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 4884 ! %l4 from previous read 4885 ! into %l4 and %l5 4886 stxa %l4, [%i0+0x10]%asi 4887 stxa %l5, [%i0+0x18]%asi 4888 4889 ! Repeat the same for next 32 bytes. 4890 4891 add %i1, 0x10, %i1 4892 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4893 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 4894 4895 stxa %l2, [%i0+0x20]%asi 4896 stxa %l3, [%i0+0x28]%asi 4897 4898 add %i1, 0x10, %i1 4899 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4900 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 4901 4902 stxa %l4, [%i0+0x30]%asi 4903 stxa %l5, [%i0+0x38]%asi 4904 4905 add %l0, 0x40, %l0 4906 subcc %i3, 0x40, %i3 4907 bgu,pt %xcc, .co_loop1 4908 add %i0, 0x40, %i0 4909 ba .co_blkdone 4910 add %i1, %o2, %i1 ! increment the source by src offset 4911 ! the src offset was stored in %o2 4912 4913.co_upper_double: 4914 4915 sub %i1, %o2, %i1 ! align the src at 16 bytes. 4916 sub %o2, 0x8, %o0 4917 sll %o0, 3, %o0 ! %o0 left shift 4918 mov 0x40, %o1 4919 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 4920 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 4921 prefetch [%l0+0x0], #one_read 4922 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3 4923 ! for this read and 4924 ! no data in %l2 4925.co_loop2: 4926 add %i1, 0x10, %i1 4927 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data 4928 ! and %l5 has partial 4929 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 4930 ! into %l3 and %l4 4931 prefetch [%l0+0x40], #one_read 4932 4933 stxa %l3, [%i0+0x0]%asi 4934 stxa %l4, [%i0+0x8]%asi 4935 4936 add %i1, 0x10, %i1 4937 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4938 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 4939 ! %l5 from previous read 4940 ! into %l5 and %l2 4941 4942 stxa %l5, [%i0+0x10]%asi 4943 stxa %l2, [%i0+0x18]%asi 4944 4945 ! Repeat the same for next 32 bytes. 4946 4947 add %i1, 0x10, %i1 4948 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4949 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 4950 4951 stxa %l3, [%i0+0x20]%asi 4952 stxa %l4, [%i0+0x28]%asi 4953 4954 add %i1, 0x10, %i1 4955 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4956 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 4957 4958 stxa %l5, [%i0+0x30]%asi 4959 stxa %l2, [%i0+0x38]%asi 4960 4961 add %l0, 0x40, %l0 4962 subcc %i3, 0x40, %i3 4963 bgu,pt %xcc, .co_loop2 4964 add %i0, 0x40, %i0 4965 ba .co_blkdone 4966 add %i1, %o2, %i1 ! increment the source by src offset 4967 ! the src offset was stored in %o2 4968 4969 4970 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 4971.co_blkcpy: 4972 4973 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 4974 prefetch [%o0+0x0], #one_read 49751: 4976 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0 4977 add %i1, 0x10, %i1 4978 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4979 add %i1, 0x10, %i1 4980 4981 prefetch [%o0+0x40], #one_read 4982 4983 stxa %l0, [%i0+0x0]%asi 4984 4985 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4986 add %i1, 0x10, %i1 4987 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6 4988 add %i1, 0x10, %i1 4989 4990 stxa %l1, [%i0+0x8]%asi 4991 stxa %l2, [%i0+0x10]%asi 4992 stxa %l3, [%i0+0x18]%asi 4993 stxa %l4, [%i0+0x20]%asi 4994 stxa %l5, [%i0+0x28]%asi 4995 stxa %l6, [%i0+0x30]%asi 4996 stxa %l7, [%i0+0x38]%asi 4997 4998 add %o0, 0x40, %o0 4999 subcc %i3, 0x40, %i3 5000 bgu,pt %xcc, 1b 5001 add %i0, 0x40, %i0 5002 5003.co_blkdone: 5004 membar #Sync 5005 5006 brz,pt %i2, .copyout_exit 5007 nop 5008 5009 ! Handle trailing bytes 5010 cmp %i2, 0x8 5011 blu,pt %ncc, .co_residue 5012 nop 5013 5014 ! Can we do some 8B ops 5015 or %i1, %i0, %o2 5016 andcc %o2, 0x7, %g0 5017 bnz %ncc, .co_last4 5018 nop 5019 5020 ! Do 8byte ops as long as possible 5021.co_last8: 5022 ldx [%i1], %o2 5023 stxa %o2, [%i0]ASI_USER 5024 add %i1, 0x8, %i1 5025 sub %i2, 0x8, %i2 5026 cmp %i2, 0x8 5027 bgu,pt %ncc, .co_last8 5028 add %i0, 0x8, %i0 5029 5030 brz,pt %i2, .copyout_exit 5031 nop 5032 5033 ba .co_residue 5034 nop 5035 5036.co_last4: 5037 ! Can we do 4B ops 5038 andcc %o2, 0x3, %g0 5039 bnz %ncc, .co_last2 5040 nop 50411: 5042 ld [%i1], %o2 5043 sta %o2, [%i0]ASI_USER 5044 add %i1, 0x4, %i1 5045 sub %i2, 0x4, %i2 5046 cmp %i2, 0x4 5047 bgu,pt %ncc, 1b 5048 add %i0, 0x4, %i0 5049 5050 brz,pt %i2, .copyout_exit 5051 nop 5052 5053 ba .co_residue 5054 nop 5055 5056.co_last2: 5057 ! Can we do 2B ops 5058 andcc %o2, 0x1, %g0 5059 bnz %ncc, .co_residue 5060 nop 5061 50621: 5063 lduh [%i1], %o2 5064 stuha %o2, [%i0]ASI_USER 5065 add %i1, 0x2, %i1 5066 sub %i2, 0x2, %i2 5067 cmp %i2, 0x2 5068 bgu,pt %ncc, 1b 5069 add %i0, 0x2, %i0 5070 5071 brz,pt %i2, .copyout_exit 5072 nop 5073 5074 ! Copy the residue as byte copy 5075.co_residue: 5076 ldub [%i1], %i4 5077 stba %i4, [%i0]ASI_USER 5078 inc %i1 5079 deccc %i2 5080 bgu,pt %xcc, .co_residue 5081 inc %i0 5082 5083.copyout_exit: 5084 membar #Sync 5085 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 5086 ret 5087 restore %g0, 0, %o0 5088 5089.copyout_err: 5090 ldn [THREAD_REG + T_COPYOPS], %o4 5091 brz %o4, 2f 5092 nop 5093 ldn [%o4 + CP_COPYOUT], %g2 5094 jmp %g2 5095 nop 50962: 5097 retl 5098 mov -1, %o0 5099#endif /* NIAGARA_IMPL */ 5100 SET_SIZE(copyout) 5101 5102#endif /* lint */ 5103 5104 5105#ifdef lint 5106 5107/*ARGSUSED*/ 5108int 5109xcopyout(const void *kaddr, void *uaddr, size_t count) 5110{ return (0); } 5111 5112#else /* lint */ 5113 5114 ENTRY(xcopyout) 5115 sethi %hi(.xcopyout_err), REAL_LOFAULT 5116 b .do_copyout 5117 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 5118.xcopyout_err: 5119 ldn [THREAD_REG + T_COPYOPS], %o4 5120 brz %o4, 2f 5121 nop 5122 ldn [%o4 + CP_XCOPYOUT], %g2 5123 jmp %g2 5124 nop 51252: 5126 retl 5127 mov %g1, %o0 5128 SET_SIZE(xcopyout) 5129 5130#endif /* lint */ 5131 5132#ifdef lint 5133 5134/*ARGSUSED*/ 5135int 5136xcopyout_little(const void *kaddr, void *uaddr, size_t count) 5137{ return (0); } 5138 5139#else /* lint */ 5140 5141 ENTRY(xcopyout_little) 5142 sethi %hi(.little_err), %o4 5143 ldn [THREAD_REG + T_LOFAULT], %o5 5144 or %o4, %lo(.little_err), %o4 5145 membar #Sync ! sync error barrier 5146 stn %o4, [THREAD_REG + T_LOFAULT] 5147 5148 subcc %g0, %o2, %o3 5149 add %o0, %o2, %o0 5150 bz,pn %ncc, 2f ! check for zero bytes 5151 sub %o2, 1, %o4 5152 add %o0, %o4, %o0 ! start w/last byte 5153 add %o1, %o2, %o1 5154 ldub [%o0+%o3], %o4 5155 51561: stba %o4, [%o1+%o3]ASI_AIUSL 5157 inccc %o3 5158 sub %o0, 2, %o0 ! get next byte 5159 bcc,a,pt %ncc, 1b 5160 ldub [%o0+%o3], %o4 5161 51622: membar #Sync ! sync error barrier 5163 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 5164 retl 5165 mov %g0, %o0 ! return (0) 5166 SET_SIZE(xcopyout_little) 5167 5168#endif /* lint */ 5169 5170/* 5171 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 5172 */ 5173 5174#if defined(lint) 5175 5176/*ARGSUSED*/ 5177int 5178copyin(const void *uaddr, void *kaddr, size_t count) 5179{ return (0); } 5180 5181#else /* lint */ 5182 5183 ENTRY(copyin) 5184 sethi %hi(.copyin_err), REAL_LOFAULT 5185 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT 5186 5187#if !defined(NIAGARA_IMPL) 5188.do_copyin: 5189 tst %o2 ! check for zero count; quick exit 5190 bz,pt %ncc, .ci_smallqx 5191 mov %o0, SAVE_SRC 5192 mov %o1, SAVE_DST 5193 mov %o2, SAVE_COUNT 5194 cmp %o2, FP_COPY ! check for small copy/leaf case 5195 bgt,pt %ncc, .ci_copy_more 5196 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 5197/* 5198 * Small copy in code 5199 * 5200 */ 5201 sethi %hi(copyio_fault_nowindow), %o3 5202 or %o3, %lo(copyio_fault_nowindow), %o3 5203 membar #Sync 5204 stn %o3, [THREAD_REG + T_LOFAULT] 5205 5206 mov ASI_USER, %asi 5207 cmp %o2, SHORTCOPY ! make sure there is enough to align 5208 ble,pt %ncc, .ci_smallest 5209 andcc %o1, 0x7, %o3 ! is dest long word aligned 5210 bnz,pn %ncc, .ci_align 5211 andcc %o1, 1, %o3 ! is dest byte aligned 5212 5213! Destination is long word aligned 5214.ci_al_src: 5215 andcc %o0, 7, %o3 5216 brnz,pt %o3, .ci_src_dst_unal8 5217 nop 5218/* 5219 * Special case for handling when src and dest are both long word aligned 5220 * and total data to move is less than FP_COPY bytes 5221 * Also handles finish up for large block moves, so may be less than 32 bytes 5222 */ 5223.ci_medlong: 5224 subcc %o2, 31, %o2 ! adjust length to allow cc test 5225 ble,pt %ncc, .ci_medl31 5226 nop 5227.ci_medl32: 5228 ldxa [%o0]%asi, %o4 ! move 32 bytes 5229 subcc %o2, 32, %o2 ! decrement length count by 32 5230 stx %o4, [%o1] 5231 ldxa [%o0+8]%asi, %o4 5232 stx %o4, [%o1+8] 5233 ldxa [%o0+16]%asi, %o4 5234 add %o0, 32, %o0 ! increase src ptr by 32 5235 stx %o4, [%o1+16] 5236 ldxa [%o0-8]%asi, %o4 5237 add %o1, 32, %o1 ! increase dst ptr by 32 5238 bgu,pt %ncc, .ci_medl32 ! repeat if at least 32 bytes left 5239 stx %o4, [%o1-8] 5240.ci_medl31: 5241 addcc %o2, 24, %o2 ! adjust count to be off by 7 5242 ble,pt %ncc, .ci_medl7 ! skip if 7 or fewer bytes left 5243 nop 5244.ci_medl8: 5245 ldxa [%o0]%asi, %o4 ! move 8 bytes 5246 add %o0, 8, %o0 ! increase src ptr by 8 5247 subcc %o2, 8, %o2 ! decrease count by 8 5248 add %o1, 8, %o1 ! increase dst ptr by 8 5249 bgu,pt %ncc, .ci_medl8 5250 stx %o4, [%o1-8] 5251.ci_medl7: 5252 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5253 bnz,pt %ncc, .ci_small4 ! do final bytes if not finished 5254 nop 5255.ci_smallx: ! finish up and exit 5256 membar #Sync 5257 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5258.ci_smallqx: 5259 retl 5260 mov %g0, %o0 5261 5262.ci_small4: 5263 cmp %o2, 4 5264 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5265 nop ! 5266 lda [%o0]%asi, %o4 ! move 4 bytes 5267 add %o0, 4, %o0 ! increase src ptr by 4 5268 add %o1, 4, %o1 ! increase dst ptr by 4 5269 subcc %o2, 4, %o2 ! decrease count by 4 5270 bz %ncc, .ci_smallx 5271 stw %o4, [%o1-4] 5272 5273.ci_small3x: ! Exactly 1, 2, or 3 bytes remain 5274 subcc %o2, 1, %o2 ! reduce count for cc test 5275 lduba [%o0]%asi, %o4 ! load one byte 5276 bz,pt %ncc, .ci_smallx 5277 stb %o4, [%o1] ! store one byte 5278 lduba [%o0+1]%asi, %o4 ! load second byte 5279 subcc %o2, 1, %o2 5280 bz,pt %ncc, .ci_smallx 5281 stb %o4, [%o1+1] ! store second byte 5282 lduba [%o0+2]%asi, %o4 ! load third byte 5283 ba .ci_smallx 5284 stb %o4, [%o1+2] ! store third byte 5285 5286.ci_smallest: ! 7 or fewer bytes remain 5287 cmp %o2, 4 5288 blt,pt %ncc, .ci_small3x 5289 nop 5290 lduba [%o0]%asi, %o4 ! read byte 5291 subcc %o2, 4, %o2 ! reduce count by 4 5292 stb %o4, [%o1] ! write byte 5293 lduba [%o0+1]%asi, %o4 ! repeat for total of 4 bytes 5294 add %o0, 4, %o0 ! advance src by 4 5295 stb %o4, [%o1+1] 5296 lduba [%o0-2]%asi, %o4 5297 add %o1, 4, %o1 ! advance dst by 4 5298 stb %o4, [%o1-2] 5299 lduba [%o0-1]%asi, %o4 5300 bnz,pt %ncc, .ci_small3x 5301 stb %o4, [%o1-1] 5302 membar #Sync 5303 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5304 retl 5305 mov %g0, %o0 5306 5307.ci_align: 5308 bnz,pt %ncc, .ci_al_d1 5309.ci_al_d1f: ! dest is now half word aligned 5310 andcc %o1, 2, %o3 ! is dest word aligned 5311 bnz,pt %ncc, .ci_al_d2 5312.ci_al_d2f: ! dest is now word aligned 5313 andcc %o1, 4, %o3 ! is dest longword aligned? 5314 bz,pt %ncc, .ci_al_src 5315 nop 5316.ci_al_d4: ! dest is word aligned; src is unknown 5317 lduba [%o0]%asi, %o4 ! move a word (src align unknown) 5318 lduba [%o0+1]%asi, %o3 5319 sll %o4, 24, %o4 ! position 5320 sll %o3, 16, %o3 ! position 5321 or %o4, %o3, %o3 ! merge 5322 lduba [%o0+2]%asi, %o4 5323 sll %o4, 8, %o4 ! position 5324 or %o4, %o3, %o3 ! merge 5325 lduba [%o0+3]%asi, %o4 5326 or %o4, %o3, %o4 ! merge 5327 stw %o4,[%o1] ! store four bytes 5328 add %o0, 4, %o0 ! adjust src by 4 5329 add %o1, 4, %o1 ! adjust dest by 4 5330 sub %o2, 4, %o2 ! adjust count by 4 5331 andcc %o0, 7, %o3 ! check for src long word alignment 5332 brz,pt %o3, .ci_medlong 5333.ci_src_dst_unal8: 5334 ! dst is 8-byte aligned, src is not 5335 ! Size is less than FP_COPY 5336 ! Following code is to select for alignment 5337 andcc %o0, 0x3, %o3 ! test word alignment 5338 bz,pt %ncc, .ci_medword 5339 nop 5340 andcc %o0, 0x1, %o3 ! test halfword alignment 5341 bnz,pt %ncc, .ci_med_byte ! go to byte move if not halfword 5342 andcc %o0, 0x2, %o3 ! test which byte alignment 5343 ba .ci_medhalf 5344 nop 5345.ci_al_d1: ! align dest to half word 5346 lduba [%o0]%asi, %o4 ! move a byte 5347 add %o0, 1, %o0 5348 stb %o4, [%o1] 5349 add %o1, 1, %o1 5350 andcc %o1, 2, %o3 ! is dest word aligned 5351 bz,pt %ncc, .ci_al_d2f 5352 sub %o2, 1, %o2 5353.ci_al_d2: ! align dest to word 5354 lduba [%o0]%asi, %o4 ! move a half-word (src align unknown) 5355 lduba [%o0+1]%asi, %o3 5356 sll %o4, 8, %o4 ! position 5357 or %o4, %o3, %o4 ! merge 5358 sth %o4, [%o1] 5359 add %o0, 2, %o0 5360 add %o1, 2, %o1 5361 andcc %o1, 4, %o3 ! is dest longword aligned? 5362 bz,pt %ncc, .ci_al_src 5363 sub %o2, 2, %o2 5364 ba .ci_al_d4 5365 nop 5366/* 5367 * Handle all cases where src and dest are aligned on word 5368 * boundaries. Use unrolled loops for better performance. 5369 * This option wins over standard large data move when 5370 * source and destination is in cache for medium 5371 * to short data moves. 5372 */ 5373.ci_medword: 5374 subcc %o2, 31, %o2 ! adjust length to allow cc test 5375 ble,pt %ncc, .ci_medw31 5376 nop 5377.ci_medw32: 5378 lda [%o0]%asi, %o4 ! move a block of 32 bytes 5379 stw %o4, [%o1] 5380 lda [%o0+4]%asi, %o4 5381 stw %o4, [%o1+4] 5382 lda [%o0+8]%asi, %o4 5383 stw %o4, [%o1+8] 5384 lda [%o0+12]%asi, %o4 5385 stw %o4, [%o1+12] 5386 lda [%o0+16]%asi, %o4 5387 stw %o4, [%o1+16] 5388 lda [%o0+20]%asi, %o4 5389 subcc %o2, 32, %o2 ! decrement length count 5390 stw %o4, [%o1+20] 5391 lda [%o0+24]%asi, %o4 5392 add %o0, 32, %o0 ! increase src ptr by 32 5393 stw %o4, [%o1+24] 5394 lda [%o0-4]%asi, %o4 5395 add %o1, 32, %o1 ! increase dst ptr by 32 5396 bgu,pt %ncc, .ci_medw32 ! repeat if at least 32 bytes left 5397 stw %o4, [%o1-4] 5398.ci_medw31: 5399 addcc %o2, 24, %o2 ! adjust count to be off by 7 5400 ble,pt %ncc, .ci_medw7 ! skip if 7 or fewer bytes left 5401 nop ! 5402.ci_medw15: 5403 lda [%o0]%asi, %o4 ! move a block of 8 bytes 5404 subcc %o2, 8, %o2 ! decrement length count 5405 stw %o4, [%o1] 5406 add %o0, 8, %o0 ! increase src ptr by 8 5407 lda [%o0-4]%asi, %o4 5408 add %o1, 8, %o1 ! increase dst ptr by 8 5409 bgu,pt %ncc, .ci_medw15 5410 stw %o4, [%o1-4] 5411.ci_medw7: 5412 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5413 bz,pt %ncc, .ci_smallx ! exit if finished 5414 cmp %o2, 4 5415 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5416 nop ! 5417 lda [%o0]%asi, %o4 ! move 4 bytes 5418 add %o0, 4, %o0 ! increase src ptr by 4 5419 add %o1, 4, %o1 ! increase dst ptr by 4 5420 subcc %o2, 4, %o2 ! decrease count by 4 5421 bnz .ci_small3x 5422 stw %o4, [%o1-4] 5423 membar #Sync 5424 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5425 retl 5426 mov %g0, %o0 5427 5428.ci_medhalf: 5429 subcc %o2, 31, %o2 ! adjust length to allow cc test 5430 ble,pt %ncc, .ci_medh31 5431 nop 5432.ci_medh32: ! load and store block of 32 bytes 5433 subcc %o2, 32, %o2 ! decrement length count 5434 5435 lduha [%o0]%asi, %o4 ! move 32 bytes 5436 lduwa [%o0+2]%asi, %o3 5437 sllx %o4, 48, %o4 5438 sllx %o3, 16, %o3 5439 or %o4, %o3, %o3 5440 lduha [%o0+6]%asi, %o4 5441 or %o4, %o3, %o4 5442 stx %o4, [%o1] 5443 5444 lduha [%o0+8]%asi, %o4 5445 lduwa [%o0+10]%asi, %o3 5446 sllx %o4, 48, %o4 5447 sllx %o3, 16, %o3 5448 or %o4, %o3, %o3 5449 lduha [%o0+14]%asi, %o4 5450 or %o4, %o3, %o4 5451 stx %o4, [%o1+8] 5452 5453 lduha [%o0+16]%asi, %o4 5454 lduwa [%o0+18]%asi, %o3 5455 sllx %o4, 48, %o4 5456 sllx %o3, 16, %o3 5457 or %o4, %o3, %o3 5458 lduha [%o0+22]%asi, %o4 5459 or %o4, %o3, %o4 5460 stx %o4, [%o1+16] 5461 5462 add %o0, 32, %o0 ! increase src ptr by 32 5463 add %o1, 32, %o1 ! increase dst ptr by 32 5464 5465 lduha [%o0-8]%asi, %o4 5466 lduwa [%o0-6]%asi, %o3 5467 sllx %o4, 48, %o4 5468 sllx %o3, 16, %o3 5469 or %o4, %o3, %o3 5470 lduha [%o0-2]%asi, %o4 5471 or %o3, %o4, %o4 5472 bgu,pt %ncc, .ci_medh32 ! repeat if at least 32 bytes left 5473 stx %o4, [%o1-8] 5474 5475.ci_medh31: 5476 addcc %o2, 24, %o2 ! adjust count to be off by 7 5477 ble,pt %ncc, .ci_medh7 ! skip if 7 or fewer bytes left 5478 nop ! 5479.ci_medh15: 5480 lduha [%o0]%asi, %o4 ! move 16 bytes 5481 subcc %o2, 8, %o2 ! decrement length count 5482 lduwa [%o0+2]%asi, %o3 5483 sllx %o4, 48, %o4 5484 sllx %o3, 16, %o3 5485 or %o4, %o3, %o3 5486 add %o1, 8, %o1 ! increase dst ptr by 8 5487 lduha [%o0+6]%asi, %o4 5488 add %o0, 8, %o0 ! increase src ptr by 8 5489 or %o4, %o3, %o4 5490 bgu,pt %ncc, .ci_medh15 5491 stx %o4, [%o1-8] 5492.ci_medh7: 5493 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5494 bz,pt %ncc, .ci_smallx ! exit if finished 5495 cmp %o2, 4 5496 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5497 nop ! 5498 lduha [%o0]%asi, %o4 5499 sll %o4, 16, %o4 5500 lduha [%o0+2]%asi, %o3 5501 or %o3, %o4, %o4 5502 subcc %o2, 4, %o2 5503 add %o0, 4, %o0 5504 add %o1, 4, %o1 5505 bnz .ci_small3x 5506 stw %o4, [%o1-4] 5507 membar #Sync 5508 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5509 retl 5510 mov %g0, %o0 5511 5512 .align 16 5513.ci_med_byte: 5514 bnz,pt %ncc, .ci_medbh32a ! go to correct byte move 5515 subcc %o2, 31, %o2 ! adjust length to allow cc test 5516 ble,pt %ncc, .ci_medb31 5517 nop 5518.ci_medb32: ! Alignment 1 or 5 5519 subcc %o2, 32, %o2 ! decrement length count 5520 5521 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes 5522 sllx %o4, 56, %o3 5523 lduha [%o0+1]%asi, %o4 5524 sllx %o4, 40, %o4 5525 or %o4, %o3, %o3 5526 lduwa [%o0+3]%asi, %o4 5527 sllx %o4, 8, %o4 5528 or %o4, %o3, %o3 5529 lduba [%o0+7]%asi, %o4 5530 or %o4, %o3, %o4 5531 stx %o4, [%o1] 5532 5533 lduba [%o0+8]%asi, %o4 5534 sllx %o4, 56, %o3 5535 lduha [%o0+9]%asi, %o4 5536 sllx %o4, 40, %o4 5537 or %o4, %o3, %o3 5538 lduwa [%o0+11]%asi, %o4 5539 sllx %o4, 8, %o4 5540 or %o4, %o3, %o3 5541 lduba [%o0+15]%asi, %o4 5542 or %o4, %o3, %o4 5543 stx %o4, [%o1+8] 5544 5545 lduba [%o0+16]%asi, %o4 5546 sllx %o4, 56, %o3 5547 lduha [%o0+17]%asi, %o4 5548 sllx %o4, 40, %o4 5549 or %o4, %o3, %o3 5550 lduwa [%o0+19]%asi, %o4 5551 sllx %o4, 8, %o4 5552 or %o4, %o3, %o3 5553 lduba [%o0+23]%asi, %o4 5554 or %o4, %o3, %o4 5555 stx %o4, [%o1+16] 5556 5557 add %o0, 32, %o0 ! increase src ptr by 32 5558 add %o1, 32, %o1 ! increase dst ptr by 32 5559 5560 lduba [%o0-8]%asi, %o4 5561 sllx %o4, 56, %o3 5562 lduha [%o0-7]%asi, %o4 5563 sllx %o4, 40, %o4 5564 or %o4, %o3, %o3 5565 lduwa [%o0-5]%asi, %o4 5566 sllx %o4, 8, %o4 5567 or %o4, %o3, %o3 5568 lduba [%o0-1]%asi, %o4 5569 or %o4, %o3, %o4 5570 bgu,pt %ncc, .ci_medb32 ! repeat if at least 32 bytes left 5571 stx %o4, [%o1-8] 5572 5573.ci_medb31: ! 31 or fewer bytes remaining 5574 addcc %o2, 24, %o2 ! adjust count to be off by 7 5575 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left 5576 nop ! 5577.ci_medb15: 5578 5579 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes 5580 subcc %o2, 8, %o2 ! decrement length count 5581 sllx %o4, 56, %o3 5582 lduha [%o0+1]%asi, %o4 5583 sllx %o4, 40, %o4 5584 or %o4, %o3, %o3 5585 lduwa [%o0+3]%asi, %o4 5586 add %o1, 8, %o1 ! increase dst ptr by 16 5587 sllx %o4, 8, %o4 5588 or %o4, %o3, %o3 5589 lduba [%o0+7]%asi, %o4 5590 add %o0, 8, %o0 ! increase src ptr by 16 5591 or %o4, %o3, %o4 5592 bgu,pt %ncc, .ci_medb15 5593 stx %o4, [%o1-8] 5594.ci_medb7: 5595 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5596 bz,pt %ncc, .ci_smallx ! exit if finished 5597 cmp %o2, 4 5598 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5599 nop ! 5600 lduba [%o0]%asi, %o4 ! move 4 bytes 5601 sll %o4, 24, %o3 5602 lduha [%o0+1]%asi, %o4 5603 sll %o4, 8, %o4 5604 or %o4, %o3, %o3 5605 lduba [%o0+3]%asi, %o4 5606 or %o4, %o3, %o4 5607 subcc %o2, 4, %o2 5608 add %o0, 4, %o0 5609 add %o1, 4, %o1 5610 bnz .ci_small3x 5611 stw %o4, [%o1-4] 5612 membar #Sync 5613 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5614 retl 5615 mov %g0, %o0 5616 5617 .align 16 5618.ci_medbh32a: ! Alignment 3 or 7 5619 ble,pt %ncc, .ci_medbh31 5620 nop 5621.ci_medbh32: ! Alignment 3 or 7 5622 subcc %o2, 32, %o2 ! decrement length count 5623 5624 lduba [%o0]%asi, %o4 ! load and store a block of 32 bytes 5625 sllx %o4, 56, %o3 5626 lduwa [%o0+1]%asi, %o4 5627 sllx %o4, 24, %o4 5628 or %o4, %o3, %o3 5629 lduha [%o0+5]%asi, %o4 5630 sllx %o4, 8, %o4 5631 or %o4, %o3, %o3 5632 lduba [%o0+7]%asi, %o4 5633 or %o4, %o3, %o4 5634 stx %o4, [%o1] 5635 5636 lduba [%o0+8]%asi, %o4 5637 sllx %o4, 56, %o3 5638 lduwa [%o0+9]%asi, %o4 5639 sllx %o4, 24, %o4 5640 or %o4, %o3, %o3 5641 lduha [%o0+13]%asi, %o4 5642 sllx %o4, 8, %o4 5643 or %o4, %o3, %o3 5644 lduba [%o0+15]%asi, %o4 5645 or %o4, %o3, %o4 5646 stx %o4, [%o1+8] 5647 5648 lduba [%o0+16]%asi, %o4 5649 sllx %o4, 56, %o3 5650 lduwa [%o0+17]%asi, %o4 5651 sllx %o4, 24, %o4 5652 or %o4, %o3, %o3 5653 lduha [%o0+21]%asi, %o4 5654 sllx %o4, 8, %o4 5655 or %o4, %o3, %o3 5656 lduba [%o0+23]%asi, %o4 5657 or %o4, %o3, %o4 5658 stx %o4, [%o1+16] 5659 5660 add %o0, 32, %o0 ! increase src ptr by 32 5661 add %o1, 32, %o1 ! increase dst ptr by 32 5662 5663 lduba [%o0-8]%asi, %o4 5664 sllx %o4, 56, %o3 5665 lduwa [%o0-7]%asi, %o4 5666 sllx %o4, 24, %o4 5667 or %o4, %o3, %o3 5668 lduha [%o0-3]%asi, %o4 5669 sllx %o4, 8, %o4 5670 or %o4, %o3, %o3 5671 lduba [%o0-1]%asi, %o4 5672 or %o4, %o3, %o4 5673 bgu,pt %ncc, .ci_medbh32 ! repeat if at least 32 bytes left 5674 stx %o4, [%o1-8] 5675 5676.ci_medbh31: 5677 addcc %o2, 24, %o2 ! adjust count to be off by 7 5678 ble,pt %ncc, .ci_medb7 ! skip if 7 or fewer bytes left 5679 nop ! 5680.ci_medbh15: 5681 lduba [%o0]%asi, %o4 ! load and store a block of 8 bytes 5682 sllx %o4, 56, %o3 5683 lduwa [%o0+1]%asi, %o4 5684 sllx %o4, 24, %o4 5685 or %o4, %o3, %o3 5686 lduha [%o0+5]%asi, %o4 5687 sllx %o4, 8, %o4 5688 or %o4, %o3, %o3 5689 lduba [%o0+7]%asi, %o4 5690 or %o4, %o3, %o4 5691 stx %o4, [%o1] 5692 subcc %o2, 8, %o2 ! decrement length count 5693 add %o1, 8, %o1 ! increase dst ptr by 8 5694 add %o0, 8, %o0 ! increase src ptr by 8 5695 bgu,pt %ncc, .ci_medbh15 5696 stx %o4, [%o1-8] 5697 ba .ci_medb7 5698 nop 5699 5700/* 5701 * End of small copy in code (no window) 5702 * 5703 */ 5704 5705/* 5706 * Long copy in code (using register window and fp regs) 5707 * 5708 */ 5709 5710.ci_copy_more: 5711 sethi %hi(copyio_fault), %o3 5712 or %o3, %lo(copyio_fault), %o3 5713 membar #Sync 5714 stn %o3, [THREAD_REG + T_LOFAULT] 5715/* 5716 * Following code is for large copies. We know there is at 5717 * least FP_COPY bytes available. FP regs are used, so 5718 * we save registers and fp regs before starting 5719 */ 5720 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 5721 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 5722 rd %fprs, %g1 ! check for unused fp 5723 ! if fprs.fef == 0, set it. 5724 ! Setting it when already set costs more than checking 5725 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0 5726 bz,pt %ncc, .ci_fp_unused 5727 mov ASI_USER, %asi 5728 BST_FP_TOSTACK(%o3) 5729 ba .ci_fp_ready 5730.ci_fp_unused: 5731 prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read 5732 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 5733.ci_fp_ready: 5734 rd %gsr, %l5 ! save %gsr value 5735 andcc %i1, 1, %o3 ! is dest byte aligned 5736 bnz,pt %ncc, .ci_big_d1 5737.ci_big_d1f: ! dest is now half word aligned 5738 andcc %i1, 2, %o3 5739 bnz,pt %ncc, .ci_big_d2 5740.ci_big_d2f: ! dest is now word aligned 5741 andcc %i1, 4, %o3 5742 bnz,pt %ncc, .ci_big_d4 5743.ci_big_d4f: ! dest is long word aligned 5744 andcc %i0, 7, %o3 ! is src long word aligned 5745 brnz,pt %o3, .ci_big_unal8 5746 prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read 5747 ! Src and dst are long word aligned 5748 ! align dst to 64 byte boundary 5749 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 5750 brz,pn %o3, .ci_al_to_64 5751 nop 5752 sub %o3, 64, %o3 ! %o3 has negative bytes to move 5753 add %i2, %o3, %i2 ! adjust remaining count 5754 andcc %o3, 8, %o4 ! odd long words to move? 5755 brz,pt %o4, .ci_al_to_16 5756 nop 5757 add %o3, 8, %o3 5758 ldxa [%i0]%asi, %o4 5759 add %i0, 8, %i0 ! increment src ptr 5760 add %i1, 8, %i1 ! increment dst ptr 5761 stx %o4, [%i1-8] 5762! Dest is aligned on 16 bytes, src 8 byte aligned 5763.ci_al_to_16: 5764 andcc %o3, 0x30, %o4 ! pair of long words to move? 5765 brz,pt %o4, .ci_al_to_64 5766 nop 5767.ci_al_mv_16: 5768 add %o3, 16, %o3 5769 ldxa [%i0]%asi, %o4 5770 stx %o4, [%i1] 5771 add %i0, 16, %i0 ! increment src ptr 5772 ldxa [%i0-8]%asi, %o4 5773 stx %o4, [%i1+8] 5774 andcc %o3, 0x30, %o4 5775 brnz,pt %o4, .ci_al_mv_16 5776 add %i1, 16, %i1 ! increment dst ptr 5777! Dest is aligned on 64 bytes, src 8 byte aligned 5778.ci_al_to_64: 5779 ! Determine source alignment 5780 ! to correct 8 byte offset 5781 andcc %i0, 32, %o3 5782 brnz,pn %o3, .ci_aln_1 5783 andcc %i0, 16, %o3 5784 brnz,pn %o3, .ci_aln_01 5785 andcc %i0, 8, %o3 5786 brz,pn %o3, .ci_aln_000 5787 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5788 ba .ci_aln_001 5789 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5790.ci_aln_01: 5791 brnz,pn %o3, .ci_aln_011 5792 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5793 ba .ci_aln_010 5794 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5795.ci_aln_1: 5796 andcc %i0, 16, %o3 5797 brnz,pn %o3, .ci_aln_11 5798 andcc %i0, 8, %o3 5799 brnz,pn %o3, .ci_aln_101 5800 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5801 ba .ci_aln_100 5802 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5803.ci_aln_11: 5804 brz,pn %o3, .ci_aln_110 5805 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5806 5807.ci_aln_111: 5808! Alignment off by 8 bytes 5809 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5810 ldda [%i0]%asi, %d0 5811 add %i0, 8, %i0 5812 sub %i2, 8, %i2 5813 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5814 and %i2, 0x7f, %i2 ! residue bytes in %i2 5815 sub %i1, %i0, %i1 5816.ci_aln_111_loop: 5817 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5818 subcc %o3, 64, %o3 5819 fmovd %d16, %d2 5820 fmovd %d18, %d4 5821 fmovd %d20, %d6 5822 fmovd %d22, %d8 5823 fmovd %d24, %d10 5824 fmovd %d26, %d12 5825 fmovd %d28, %d14 5826 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5827 stda %d0,[%i0+%i1]ASI_BLK_P 5828 add %i0, 64, %i0 5829 fmovd %d30, %d0 5830 bgt,pt %ncc, .ci_aln_111_loop 5831 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5832 add %i1, %i0, %i1 5833 5834 std %d0, [%i1] 5835 ba .ci_remain_stuff 5836 add %i1, 8, %i1 5837 ! END OF aln_111 5838 5839.ci_aln_110: 5840! Alignment off by 16 bytes 5841 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5842 ldda [%i0]%asi, %d0 5843 ldda [%i0+8]%asi, %d2 5844 add %i0, 16, %i0 5845 sub %i2, 16, %i2 5846 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5847 and %i2, 0x7f, %i2 ! residue bytes in %i2 5848 sub %i1, %i0, %i1 5849.ci_aln_110_loop: 5850 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5851 subcc %o3, 64, %o3 5852 fmovd %d16, %d4 5853 fmovd %d18, %d6 5854 fmovd %d20, %d8 5855 fmovd %d22, %d10 5856 fmovd %d24, %d12 5857 fmovd %d26, %d14 5858 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5859 stda %d0,[%i0+%i1]ASI_BLK_P 5860 add %i0, 64, %i0 5861 fmovd %d28, %d0 5862 fmovd %d30, %d2 5863 bgt,pt %ncc, .ci_aln_110_loop 5864 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5865 add %i1, %i0, %i1 5866 5867 std %d0, [%i1] 5868 std %d2, [%i1+8] 5869 ba .ci_remain_stuff 5870 add %i1, 16, %i1 5871 ! END OF aln_110 5872 5873.ci_aln_101: 5874! Alignment off by 24 bytes 5875 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5876 ldda [%i0]%asi, %d0 5877 ldda [%i0+8]%asi, %d2 5878 ldda [%i0+16]%asi, %d4 5879 add %i0, 24, %i0 5880 sub %i2, 24, %i2 5881 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5882 and %i2, 0x7f, %i2 ! residue bytes in %i2 5883 sub %i1, %i0, %i1 5884.ci_aln_101_loop: 5885 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5886 subcc %o3, 64, %o3 5887 fmovd %d16, %d6 5888 fmovd %d18, %d8 5889 fmovd %d20, %d10 5890 fmovd %d22, %d12 5891 fmovd %d24, %d14 5892 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5893 stda %d0,[%i0+%i1]ASI_BLK_P 5894 add %i0, 64, %i0 5895 fmovd %d26, %d0 5896 fmovd %d28, %d2 5897 fmovd %d30, %d4 5898 bgt,pt %ncc, .ci_aln_101_loop 5899 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5900 add %i1, %i0, %i1 5901 5902 std %d0, [%i1] 5903 std %d2, [%i1+8] 5904 std %d4, [%i1+16] 5905 ba .ci_remain_stuff 5906 add %i1, 24, %i1 5907 ! END OF aln_101 5908 5909.ci_aln_100: 5910! Alignment off by 32 bytes 5911 ldda [%i0]%asi, %d0 5912 ldda [%i0+8]%asi, %d2 5913 ldda [%i0+16]%asi,%d4 5914 ldda [%i0+24]%asi,%d6 5915 add %i0, 32, %i0 5916 sub %i2, 32, %i2 5917 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5918 and %i2, 0x7f, %i2 ! residue bytes in %i2 5919 sub %i1, %i0, %i1 5920.ci_aln_100_loop: 5921 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5922 subcc %o3, 64, %o3 5923 fmovd %d16, %d8 5924 fmovd %d18, %d10 5925 fmovd %d20, %d12 5926 fmovd %d22, %d14 5927 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5928 stda %d0,[%i0+%i1]ASI_BLK_P 5929 add %i0, 64, %i0 5930 fmovd %d24, %d0 5931 fmovd %d26, %d2 5932 fmovd %d28, %d4 5933 fmovd %d30, %d6 5934 bgt,pt %ncc, .ci_aln_100_loop 5935 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5936 add %i1, %i0, %i1 5937 5938 std %d0, [%i1] 5939 std %d2, [%i1+8] 5940 std %d4, [%i1+16] 5941 std %d6, [%i1+24] 5942 ba .ci_remain_stuff 5943 add %i1, 32, %i1 5944 ! END OF aln_100 5945 5946.ci_aln_011: 5947! Alignment off by 40 bytes 5948 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5949 ldda [%i0]%asi, %d0 5950 ldda [%i0+8]%asi, %d2 5951 ldda [%i0+16]%asi, %d4 5952 ldda [%i0+24]%asi, %d6 5953 ldda [%i0+32]%asi, %d8 5954 add %i0, 40, %i0 5955 sub %i2, 40, %i2 5956 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5957 and %i2, 0x7f, %i2 ! residue bytes in %i2 5958 sub %i1, %i0, %i1 5959.ci_aln_011_loop: 5960 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5961 subcc %o3, 64, %o3 5962 fmovd %d16, %d10 5963 fmovd %d18, %d12 5964 fmovd %d20, %d14 5965 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5966 stda %d0,[%i0+%i1]ASI_BLK_P 5967 add %i0, 64, %i0 5968 fmovd %d22, %d0 5969 fmovd %d24, %d2 5970 fmovd %d26, %d4 5971 fmovd %d28, %d6 5972 fmovd %d30, %d8 5973 bgt,pt %ncc, .ci_aln_011_loop 5974 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5975 add %i1, %i0, %i1 5976 5977 std %d0, [%i1] 5978 std %d2, [%i1+8] 5979 std %d4, [%i1+16] 5980 std %d6, [%i1+24] 5981 std %d8, [%i1+32] 5982 ba .ci_remain_stuff 5983 add %i1, 40, %i1 5984 ! END OF aln_011 5985 5986.ci_aln_010: 5987! Alignment off by 48 bytes 5988 ldda [%i0]%asi, %d0 5989 ldda [%i0+8]%asi, %d2 5990 ldda [%i0+16]%asi, %d4 5991 ldda [%i0+24]%asi, %d6 5992 ldda [%i0+32]%asi, %d8 5993 ldda [%i0+40]%asi, %d10 5994 add %i0, 48, %i0 5995 sub %i2, 48, %i2 5996 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5997 and %i2, 0x7f, %i2 ! residue bytes in %i2 5998 sub %i1, %i0, %i1 5999.ci_aln_010_loop: 6000 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 6001 subcc %o3, 64, %o3 6002 fmovd %d16, %d12 6003 fmovd %d18, %d14 6004 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 6005 stda %d0,[%i0+%i1]ASI_BLK_P 6006 add %i0, 64, %i0 6007 fmovd %d20, %d0 6008 fmovd %d22, %d2 6009 fmovd %d24, %d4 6010 fmovd %d26, %d6 6011 fmovd %d28, %d8 6012 fmovd %d30, %d10 6013 bgt,pt %ncc, .ci_aln_010_loop 6014 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 6015 add %i1, %i0, %i1 6016 6017 std %d0, [%i1] 6018 std %d2, [%i1+8] 6019 std %d4, [%i1+16] 6020 std %d6, [%i1+24] 6021 std %d8, [%i1+32] 6022 std %d10, [%i1+40] 6023 ba .ci_remain_stuff 6024 add %i1, 48, %i1 6025 ! END OF aln_010 6026 6027.ci_aln_001: 6028! Alignment off by 56 bytes 6029 ldda [%i0]%asi, %d0 6030 ldda [%i0+8]%asi, %d2 6031 ldda [%i0+16]%asi, %d4 6032 ldda [%i0+24]%asi, %d6 6033 ldda [%i0+32]%asi, %d8 6034 ldda [%i0+40]%asi, %d10 6035 ldda [%i0+48]%asi, %d12 6036 add %i0, 56, %i0 6037 sub %i2, 56, %i2 6038 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 6039 and %i2, 0x7f, %i2 ! residue bytes in %i2 6040 sub %i1, %i0, %i1 6041.ci_aln_001_loop: 6042 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 6043 subcc %o3, 64, %o3 6044 fmovd %d16, %d14 6045 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 6046 stda %d0,[%i0+%i1]ASI_BLK_P 6047 add %i0, 64, %i0 6048 fmovd %d18, %d0 6049 fmovd %d20, %d2 6050 fmovd %d22, %d4 6051 fmovd %d24, %d6 6052 fmovd %d26, %d8 6053 fmovd %d28, %d10 6054 fmovd %d30, %d12 6055 bgt,pt %ncc, .ci_aln_001_loop 6056 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 6057 add %i1, %i0, %i1 6058 6059 std %d0, [%i1] 6060 std %d2, [%i1+8] 6061 std %d4, [%i1+16] 6062 std %d6, [%i1+24] 6063 std %d8, [%i1+32] 6064 std %d10, [%i1+40] 6065 std %d12, [%i1+48] 6066 ba .ci_remain_stuff 6067 add %i1, 56, %i1 6068 ! END OF aln_001 6069 6070.ci_aln_000: 6071 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 6072 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 6073 and %i2, 0x7f, %i2 ! residue bytes in %i2 6074 sub %i1, %i0, %i1 6075.ci_aln_000_loop: 6076 ldda [%i0]ASI_BLK_AIUS,%d0 6077 subcc %o3, 64, %o3 6078 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 6079 stda %d0,[%i0+%i1]ASI_BLK_P 6080 add %i0, 64, %i0 6081 bgt,pt %ncc, .ci_aln_000_loop 6082 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 6083 add %i1, %i0, %i1 6084 6085 ! END OF aln_000 6086 6087.ci_remain_stuff: 6088 subcc %i2, 31, %i2 ! adjust length to allow cc test 6089 ble,pt %ncc, .ci_aln_31 6090 nop 6091.ci_aln_32: 6092 ldxa [%i0]%asi, %o4 ! move 32 bytes 6093 subcc %i2, 32, %i2 ! decrement length count by 32 6094 stx %o4, [%i1] 6095 ldxa [%i0+8]%asi, %o4 6096 stx %o4, [%i1+8] 6097 ldxa [%i0+16]%asi, %o4 6098 add %i0, 32, %i0 ! increase src ptr by 32 6099 stx %o4, [%i1+16] 6100 ldxa [%i0-8]%asi, %o4 6101 add %i1, 32, %i1 ! increase dst ptr by 32 6102 bgu,pt %ncc, .ci_aln_32 ! repeat if at least 32 bytes left 6103 stx %o4, [%i1-8] 6104.ci_aln_31: 6105 addcc %i2, 24, %i2 ! adjust count to be off by 7 6106 ble,pt %ncc, .ci_aln_7 ! skip if 7 or fewer bytes left 6107 nop ! 6108.ci_aln_15: 6109 ldxa [%i0]%asi, %o4 ! move 8 bytes 6110 add %i0, 8, %i0 ! increase src ptr by 8 6111 subcc %i2, 8, %i2 ! decrease count by 8 6112 add %i1, 8, %i1 ! increase dst ptr by 8 6113 bgu,pt %ncc, .ci_aln_15 6114 stx %o4, [%i1-8] ! 6115.ci_aln_7: 6116 addcc %i2, 7, %i2 ! finish adjustment of remaining count 6117 bz,pt %ncc, .ci_exit ! exit if finished 6118 cmp %i2, 4 6119 blt,pt %ncc, .ci_unaln3x ! skip if less than 4 bytes left 6120 nop ! 6121 lda [%i0]%asi, %o4 ! move 4 bytes 6122 add %i0, 4, %i0 ! increase src ptr by 4 6123 add %i1, 4, %i1 ! increase dst ptr by 4 6124 subcc %i2, 4, %i2 ! decrease count by 4 6125 bnz .ci_unaln3x 6126 stw %o4, [%i1-4] 6127 ba .ci_exit 6128 nop 6129 6130 ! destination alignment code 6131.ci_big_d1: 6132 lduba [%i0]%asi, %o4 ! move a byte 6133 add %i0, 1, %i0 6134 stb %o4, [%i1] 6135 add %i1, 1, %i1 6136 andcc %i1, 2, %o3 6137 bz,pt %ncc, .ci_big_d2f 6138 sub %i2, 1, %i2 6139.ci_big_d2: ! dest is now at least half word aligned 6140 lduba [%i0]%asi, %o4 ! move a half-word (src align unknown) 6141 lduba [%i0+1]%asi, %o3 6142 add %i0, 2, %i0 6143 sll %o4, 8, %o4 ! position 6144 or %o4, %o3, %o4 ! merge 6145 sth %o4, [%i1] 6146 add %i1, 2, %i1 6147 andcc %i1, 4, %o3 6148 bz,pt %ncc, .ci_big_d4f 6149 sub %i2, 2, %i2 6150.ci_big_d4: ! dest is at least word aligned 6151 nop 6152 lduba [%i0]%asi, %o4 ! move a word (src align unknown) 6153 lduba [%i0+1]%asi, %o3 6154 sll %o4, 24, %o4 ! position 6155 sll %o3, 16, %o3 ! position 6156 or %o4, %o3, %o3 ! merge 6157 lduba [%i0+2]%asi, %o4 6158 sll %o4, 8, %o4 ! position 6159 or %o4, %o3, %o3 ! merge 6160 lduba [%i0+3]%asi, %o4 6161 or %o4, %o3, %o4 ! merge 6162 stw %o4,[%i1] ! store four bytes 6163 add %i0, 4, %i0 ! adjust src by 4 6164 add %i1, 4, %i1 ! adjust dest by 4 6165 ba .ci_big_d4f 6166 sub %i2, 4, %i2 ! adjust count by 4 6167 6168 6169 ! Dst is on 8 byte boundary; src is not; 6170.ci_big_unal8: 6171 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned? 6172 bz %ncc, .ci_unalnsrc 6173 sub %o3, 64, %o3 ! %o3 will be multiple of 8 6174 neg %o3 ! bytes until dest is 64 byte aligned 6175 sub %i2, %o3, %i2 ! update cnt with bytes to be moved 6176 ! Move bytes according to source alignment 6177 andcc %i0, 0x1, %o4 6178 bnz %ncc, .ci_unalnbyte ! check for byte alignment 6179 nop 6180 andcc %i0, 2, %o4 ! check for half word alignment 6181 bnz %ncc, .ci_unalnhalf 6182 nop 6183 ! Src is word aligned, move bytes until dest 64 byte aligned 6184.ci_unalnword: 6185 lda [%i0]%asi, %o4 ! load 4 bytes 6186 stw %o4, [%i1] ! and store 4 bytes 6187 lda [%i0+4]%asi, %o4 ! load 4 bytes 6188 add %i0, 8, %i0 ! increase src ptr by 8 6189 stw %o4, [%i1+4] ! and store 4 bytes 6190 subcc %o3, 8, %o3 ! decrease count by 8 6191 bnz %ncc, .ci_unalnword 6192 add %i1, 8, %i1 ! increase dst ptr by 8 6193 ba .ci_unalnsrc 6194 nop 6195 6196 ! Src is half-word aligned, move bytes until dest 64 byte aligned 6197.ci_unalnhalf: 6198 lduha [%i0]%asi, %o4 ! load 2 bytes 6199 sllx %o4, 32, %i3 ! shift left 6200 lduwa [%i0+2]%asi, %o4 6201 or %o4, %i3, %i3 6202 sllx %i3, 16, %i3 6203 lduha [%i0+6]%asi, %o4 6204 or %o4, %i3, %i3 6205 stx %i3, [%i1] 6206 add %i0, 8, %i0 6207 subcc %o3, 8, %o3 6208 bnz %ncc, .ci_unalnhalf 6209 add %i1, 8, %i1 6210 ba .ci_unalnsrc 6211 nop 6212 6213 ! Src is Byte aligned, move bytes until dest 64 byte aligned 6214.ci_unalnbyte: 6215 sub %i1, %i0, %i1 ! share pointer advance 6216.ci_unalnbyte_loop: 6217 lduba [%i0]%asi, %o4 6218 sllx %o4, 56, %i3 6219 lduha [%i0+1]%asi, %o4 6220 sllx %o4, 40, %o4 6221 or %o4, %i3, %i3 6222 lduha [%i0+3]%asi, %o4 6223 sllx %o4, 24, %o4 6224 or %o4, %i3, %i3 6225 lduha [%i0+5]%asi, %o4 6226 sllx %o4, 8, %o4 6227 or %o4, %i3, %i3 6228 lduba [%i0+7]%asi, %o4 6229 or %o4, %i3, %i3 6230 stx %i3, [%i1+%i0] 6231 subcc %o3, 8, %o3 6232 bnz %ncc, .ci_unalnbyte_loop 6233 add %i0, 8, %i0 6234 add %i1,%i0, %i1 ! restore pointer 6235 6236 ! Destination is now block (64 byte aligned), src is not 8 byte aligned 6237.ci_unalnsrc: 6238 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size 6239 and %i2, 0x3f, %i2 ! residue bytes in %i2 6240 add %i2, 64, %i2 ! Insure we don't load beyond 6241 sub %i3, 64, %i3 ! end of source buffer 6242 6243 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address 6244 prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read 6245 alignaddr %i0, %g0, %g0 ! generate %gsr 6246 add %i0, %i3, %i0 ! advance %i0 to after blocks 6247 ! 6248 ! Determine source alignment to correct 8 byte offset 6249 andcc %i0, 0x20, %o3 6250 brnz,pn %o3, .ci_unaln_1 6251 andcc %i0, 0x10, %o3 6252 brnz,pn %o3, .ci_unaln_01 6253 andcc %i0, 0x08, %o3 6254 brz,a %o3, .ci_unaln_000 6255 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6256 ba .ci_unaln_001 6257 nop 6258.ci_unaln_01: 6259 brnz,a %o3, .ci_unaln_011 6260 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6261 ba .ci_unaln_010 6262 nop 6263.ci_unaln_1: 6264 brnz,pn %o3, .ci_unaln_11 6265 andcc %i0, 0x08, %o3 6266 brnz,a %o3, .ci_unaln_101 6267 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6268 ba .ci_unaln_100 6269 nop 6270.ci_unaln_11: 6271 brz,pn %o3, .ci_unaln_110 6272 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 6273 6274.ci_unaln_111: 6275 ldda [%o4+56]%asi, %d14 6276.ci_unaln_111_loop: 6277 add %o4, 64, %o4 6278 ldda [%o4]ASI_BLK_AIUS, %d16 6279 faligndata %d14, %d16, %d48 6280 faligndata %d16, %d18, %d50 6281 faligndata %d18, %d20, %d52 6282 faligndata %d20, %d22, %d54 6283 faligndata %d22, %d24, %d56 6284 faligndata %d24, %d26, %d58 6285 faligndata %d26, %d28, %d60 6286 faligndata %d28, %d30, %d62 6287 fmovd %d30, %d14 6288 stda %d48, [%i1]ASI_BLK_P 6289 subcc %i3, 64, %i3 6290 add %i1, 64, %i1 6291 bgu,pt %ncc, .ci_unaln_111_loop 6292 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6293 ba .ci_unaln_done 6294 nop 6295 6296.ci_unaln_110: 6297 ldda [%o4+48]%asi, %d12 6298 ldda [%o4+56]%asi, %d14 6299.ci_unaln_110_loop: 6300 add %o4, 64, %o4 6301 ldda [%o4]ASI_BLK_AIUS, %d16 6302 faligndata %d12, %d14, %d48 6303 faligndata %d14, %d16, %d50 6304 faligndata %d16, %d18, %d52 6305 faligndata %d18, %d20, %d54 6306 faligndata %d20, %d22, %d56 6307 faligndata %d22, %d24, %d58 6308 faligndata %d24, %d26, %d60 6309 faligndata %d26, %d28, %d62 6310 fmovd %d28, %d12 6311 fmovd %d30, %d14 6312 stda %d48, [%i1]ASI_BLK_P 6313 subcc %i3, 64, %i3 6314 add %i1, 64, %i1 6315 bgu,pt %ncc, .ci_unaln_110_loop 6316 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6317 ba .ci_unaln_done 6318 nop 6319 6320.ci_unaln_101: 6321 ldda [%o4+40]%asi, %d10 6322 ldda [%o4+48]%asi, %d12 6323 ldda [%o4+56]%asi, %d14 6324.ci_unaln_101_loop: 6325 add %o4, 64, %o4 6326 ldda [%o4]ASI_BLK_AIUS, %d16 6327 faligndata %d10, %d12, %d48 6328 faligndata %d12, %d14, %d50 6329 faligndata %d14, %d16, %d52 6330 faligndata %d16, %d18, %d54 6331 faligndata %d18, %d20, %d56 6332 faligndata %d20, %d22, %d58 6333 faligndata %d22, %d24, %d60 6334 faligndata %d24, %d26, %d62 6335 fmovd %d26, %d10 6336 fmovd %d28, %d12 6337 fmovd %d30, %d14 6338 stda %d48, [%i1]ASI_BLK_P 6339 subcc %i3, 64, %i3 6340 add %i1, 64, %i1 6341 bgu,pt %ncc, .ci_unaln_101_loop 6342 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6343 ba .ci_unaln_done 6344 nop 6345 6346.ci_unaln_100: 6347 ldda [%o4+32]%asi, %d8 6348 ldda [%o4+40]%asi, %d10 6349 ldda [%o4+48]%asi, %d12 6350 ldda [%o4+56]%asi, %d14 6351.ci_unaln_100_loop: 6352 add %o4, 64, %o4 6353 ldda [%o4]ASI_BLK_AIUS, %d16 6354 faligndata %d8, %d10, %d48 6355 faligndata %d10, %d12, %d50 6356 faligndata %d12, %d14, %d52 6357 faligndata %d14, %d16, %d54 6358 faligndata %d16, %d18, %d56 6359 faligndata %d18, %d20, %d58 6360 faligndata %d20, %d22, %d60 6361 faligndata %d22, %d24, %d62 6362 fmovd %d24, %d8 6363 fmovd %d26, %d10 6364 fmovd %d28, %d12 6365 fmovd %d30, %d14 6366 stda %d48, [%i1]ASI_BLK_P 6367 subcc %i3, 64, %i3 6368 add %i1, 64, %i1 6369 bgu,pt %ncc, .ci_unaln_100_loop 6370 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6371 ba .ci_unaln_done 6372 nop 6373 6374.ci_unaln_011: 6375 ldda [%o4+24]%asi, %d6 6376 ldda [%o4+32]%asi, %d8 6377 ldda [%o4+40]%asi, %d10 6378 ldda [%o4+48]%asi, %d12 6379 ldda [%o4+56]%asi, %d14 6380.ci_unaln_011_loop: 6381 add %o4, 64, %o4 6382 ldda [%o4]ASI_BLK_AIUS, %d16 6383 faligndata %d6, %d8, %d48 6384 faligndata %d8, %d10, %d50 6385 faligndata %d10, %d12, %d52 6386 faligndata %d12, %d14, %d54 6387 faligndata %d14, %d16, %d56 6388 faligndata %d16, %d18, %d58 6389 faligndata %d18, %d20, %d60 6390 faligndata %d20, %d22, %d62 6391 fmovd %d22, %d6 6392 fmovd %d24, %d8 6393 fmovd %d26, %d10 6394 fmovd %d28, %d12 6395 fmovd %d30, %d14 6396 stda %d48, [%i1]ASI_BLK_P 6397 subcc %i3, 64, %i3 6398 add %i1, 64, %i1 6399 bgu,pt %ncc, .ci_unaln_011_loop 6400 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6401 ba .ci_unaln_done 6402 nop 6403 6404.ci_unaln_010: 6405 ldda [%o4+16]%asi, %d4 6406 ldda [%o4+24]%asi, %d6 6407 ldda [%o4+32]%asi, %d8 6408 ldda [%o4+40]%asi, %d10 6409 ldda [%o4+48]%asi, %d12 6410 ldda [%o4+56]%asi, %d14 6411.ci_unaln_010_loop: 6412 add %o4, 64, %o4 6413 ldda [%o4]ASI_BLK_AIUS, %d16 6414 faligndata %d4, %d6, %d48 6415 faligndata %d6, %d8, %d50 6416 faligndata %d8, %d10, %d52 6417 faligndata %d10, %d12, %d54 6418 faligndata %d12, %d14, %d56 6419 faligndata %d14, %d16, %d58 6420 faligndata %d16, %d18, %d60 6421 faligndata %d18, %d20, %d62 6422 fmovd %d20, %d4 6423 fmovd %d22, %d6 6424 fmovd %d24, %d8 6425 fmovd %d26, %d10 6426 fmovd %d28, %d12 6427 fmovd %d30, %d14 6428 stda %d48, [%i1]ASI_BLK_P 6429 subcc %i3, 64, %i3 6430 add %i1, 64, %i1 6431 bgu,pt %ncc, .ci_unaln_010_loop 6432 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6433 ba .ci_unaln_done 6434 nop 6435 6436.ci_unaln_001: 6437 ldda [%o4+8]%asi, %d2 6438 ldda [%o4+16]%asi, %d4 6439 ldda [%o4+24]%asi, %d6 6440 ldda [%o4+32]%asi, %d8 6441 ldda [%o4+40]%asi, %d10 6442 ldda [%o4+48]%asi, %d12 6443 ldda [%o4+56]%asi, %d14 6444.ci_unaln_001_loop: 6445 add %o4, 64, %o4 6446 ldda [%o4]ASI_BLK_AIUS, %d16 6447 faligndata %d2, %d4, %d48 6448 faligndata %d4, %d6, %d50 6449 faligndata %d6, %d8, %d52 6450 faligndata %d8, %d10, %d54 6451 faligndata %d10, %d12, %d56 6452 faligndata %d12, %d14, %d58 6453 faligndata %d14, %d16, %d60 6454 faligndata %d16, %d18, %d62 6455 fmovd %d18, %d2 6456 fmovd %d20, %d4 6457 fmovd %d22, %d6 6458 fmovd %d24, %d8 6459 fmovd %d26, %d10 6460 fmovd %d28, %d12 6461 fmovd %d30, %d14 6462 stda %d48, [%i1]ASI_BLK_P 6463 subcc %i3, 64, %i3 6464 add %i1, 64, %i1 6465 bgu,pt %ncc, .ci_unaln_001_loop 6466 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6467 ba .ci_unaln_done 6468 nop 6469 6470.ci_unaln_000: 6471 ldda [%o4]ASI_BLK_AIUS, %d0 6472.ci_unaln_000_loop: 6473 add %o4, 64, %o4 6474 ldda [%o4]ASI_BLK_AIUS, %d16 6475 faligndata %d0, %d2, %d48 6476 faligndata %d2, %d4, %d50 6477 faligndata %d4, %d6, %d52 6478 faligndata %d6, %d8, %d54 6479 faligndata %d8, %d10, %d56 6480 faligndata %d10, %d12, %d58 6481 faligndata %d12, %d14, %d60 6482 faligndata %d14, %d16, %d62 6483 fmovd %d16, %d0 6484 fmovd %d18, %d2 6485 fmovd %d20, %d4 6486 fmovd %d22, %d6 6487 fmovd %d24, %d8 6488 fmovd %d26, %d10 6489 fmovd %d28, %d12 6490 fmovd %d30, %d14 6491 stda %d48, [%i1]ASI_BLK_P 6492 subcc %i3, 64, %i3 6493 add %i1, 64, %i1 6494 bgu,pt %ncc, .ci_unaln_000_loop 6495 prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read 6496 6497.ci_unaln_done: 6498 ! Handle trailing bytes, 64 to 127 6499 ! Dest long word aligned, Src not long word aligned 6500 cmp %i2, 15 6501 bleu %ncc, .ci_unaln_short 6502 6503 andn %i2, 0x7, %i3 ! %i3 is multiple of 8 6504 and %i2, 0x7, %i2 ! residue bytes in %i2 6505 add %i2, 8, %i2 6506 sub %i3, 8, %i3 ! insure we don't load past end of src 6507 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address 6508 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8 6509 ldda [%o4]%asi, %d0 ! fetch partial word 6510.ci_unaln_by8: 6511 ldda [%o4+8]%asi, %d2 6512 add %o4, 8, %o4 6513 faligndata %d0, %d2, %d16 6514 subcc %i3, 8, %i3 6515 std %d16, [%i1] 6516 fmovd %d2, %d0 6517 bgu,pt %ncc, .ci_unaln_by8 6518 add %i1, 8, %i1 6519 6520.ci_unaln_short: 6521 cmp %i2, 8 6522 blt,pt %ncc, .ci_unalnfin 6523 nop 6524 lduba [%i0]%asi, %o4 6525 sll %o4, 24, %o3 6526 lduba [%i0+1]%asi, %o4 6527 sll %o4, 16, %o4 6528 or %o4, %o3, %o3 6529 lduba [%i0+2]%asi, %o4 6530 sll %o4, 8, %o4 6531 or %o4, %o3, %o3 6532 lduba [%i0+3]%asi, %o4 6533 or %o4, %o3, %o3 6534 stw %o3, [%i1] 6535 lduba [%i0+4]%asi, %o4 6536 sll %o4, 24, %o3 6537 lduba [%i0+5]%asi, %o4 6538 sll %o4, 16, %o4 6539 or %o4, %o3, %o3 6540 lduba [%i0+6]%asi, %o4 6541 sll %o4, 8, %o4 6542 or %o4, %o3, %o3 6543 lduba [%i0+7]%asi, %o4 6544 or %o4, %o3, %o3 6545 stw %o3, [%i1+4] 6546 add %i0, 8, %i0 6547 add %i1, 8, %i1 6548 sub %i2, 8, %i2 6549.ci_unalnfin: 6550 cmp %i2, 4 6551 blt,pt %ncc, .ci_unalnz 6552 tst %i2 6553 lduba [%i0]%asi, %o3 ! read byte 6554 subcc %i2, 4, %i2 ! reduce count by 4 6555 sll %o3, 24, %o3 ! position 6556 lduba [%i0+1]%asi, %o4 6557 sll %o4, 16, %o4 ! position 6558 or %o4, %o3, %o3 ! merge 6559 lduba [%i0+2]%asi, %o4 6560 sll %o4, 8, %o4 ! position 6561 or %o4, %o3, %o3 ! merge 6562 add %i1, 4, %i1 ! advance dst by 4 6563 lduba [%i0+3]%asi, %o4 6564 add %i0, 4, %i0 ! advance src by 4 6565 or %o4, %o3, %o4 ! merge 6566 bnz,pt %ncc, .ci_unaln3x 6567 stw %o4, [%i1-4] 6568 ba .ci_exit 6569 nop 6570.ci_unalnz: 6571 bz,pt %ncc, .ci_exit 6572 wr %l5, %g0, %gsr ! restore %gsr 6573.ci_unaln3x: ! Exactly 1, 2, or 3 bytes remain 6574 subcc %i2, 1, %i2 ! reduce count for cc test 6575 lduba [%i0]%asi, %o4 ! load one byte 6576 bz,pt %ncc, .ci_exit 6577 stb %o4, [%i1] ! store one byte 6578 lduba [%i0+1]%asi, %o4 ! load second byte 6579 subcc %i2, 1, %i2 6580 bz,pt %ncc, .ci_exit 6581 stb %o4, [%i1+1] ! store second byte 6582 lduba [%i0+2]%asi, %o4 ! load third byte 6583 stb %o4, [%i1+2] ! store third byte 6584.ci_exit: 6585 brnz %g1, .ci_fp_restore 6586 nop 6587 FZERO 6588 wr %g1, %g0, %fprs 6589 ba,pt %ncc, .ci_ex2 6590 membar #Sync 6591.ci_fp_restore: 6592 BLD_FP_FROMSTACK(%o4) 6593.ci_ex2: 6594 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 6595 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 6596 ret 6597 restore %g0, 0, %o0 6598 6599.copyin_err: 6600 ldn [THREAD_REG + T_COPYOPS], %o4 6601 brz %o4, 2f 6602 nop 6603 ldn [%o4 + CP_COPYIN], %g2 6604 jmp %g2 6605 nop 66062: 6607 retl 6608 mov -1, %o0 6609 6610#else /* NIAGARA_IMPL */ 6611.do_copyin: 6612 ! 6613 ! Check the length and bail if zero. 6614 ! 6615 tst %o2 6616 bnz,pt %ncc, 1f 6617 nop 6618 retl 6619 clr %o0 66201: 6621 sethi %hi(copyio_fault), %o4 6622 or %o4, %lo(copyio_fault), %o4 6623 sethi %hi(copyio_fault_nowindow), %o3 6624 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 6625 or %o3, %lo(copyio_fault_nowindow), %o3 6626 membar #Sync 6627 stn %o3, [THREAD_REG + T_LOFAULT] 6628 6629 mov %o0, SAVE_SRC 6630 mov %o1, SAVE_DST 6631 mov %o2, SAVE_COUNT 6632 6633 ! 6634 ! Check to see if we're more than SMALL_LIMIT. 6635 ! 6636 subcc %o2, SMALL_LIMIT, %o3 6637 bgu,a,pt %ncc, .dci_ns 6638 or %o0, %o1, %o3 6639 ! 6640 ! What was previously ".small_copyin" 6641 ! 6642.dcibcp: 6643 sub %g0, %o2, %o3 ! setup for copy loop 6644 add %o0, %o2, %o0 6645 add %o1, %o2, %o1 6646 ba,pt %ncc, .dcicl 6647 lduba [%o0 + %o3]ASI_USER, %o4 6648 ! 6649 ! %o0 and %o1 point at the end and remain pointing at the end 6650 ! of their buffers. We pull things out by adding %o3 (which is 6651 ! the negation of the length) to the buffer end which gives us 6652 ! the curent location in the buffers. By incrementing %o3 we walk 6653 ! through both buffers without having to bump each buffer's 6654 ! pointer. A very fast 4 instruction loop. 6655 ! 6656 .align 16 6657.dcicl: 6658 stb %o4, [%o1 + %o3] 6659 inccc %o3 6660 bl,a,pt %ncc, .dcicl 6661 lduba [%o0 + %o3]ASI_USER, %o4 6662 ! 6663 ! We're done. Go home. 6664 ! 6665 membar #Sync 6666 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 6667 retl 6668 clr %o0 6669 ! 6670 ! Try aligned copies from here. 6671 ! 6672.dci_ns: 6673 ! 6674 ! See if we're single byte aligned. If we are, check the 6675 ! limit for single byte copies. If we're smaller, or equal, 6676 ! bounce to the byte for byte copy loop. Otherwise do it in 6677 ! HW (if enabled). 6678 ! 6679 btst 1, %o3 6680 bz,a,pt %icc, .dcih8 6681 btst 7, %o3 6682 ! 6683 ! We're single byte aligned. 6684 ! 6685 sethi %hi(hw_copy_limit_1), %o3 6686 ld [%o3 + %lo(hw_copy_limit_1)], %o3 6687 ! 6688 ! Is HW copy on? If not do everything byte for byte. 6689 ! 6690 tst %o3 6691 bz,pn %icc, .dcibcp 6692 subcc %o3, %o2, %o3 6693 ! 6694 ! Are we bigger than the HW limit? If not 6695 ! go to byte for byte. 6696 ! 6697 bge,pt %ncc, .dcibcp 6698 nop 6699 ! 6700 ! We're big enough and copy is on. Do it with HW. 6701 ! 6702 ba,pt %ncc, .big_copyin 6703 nop 6704.dcih8: 6705 ! 6706 ! 8 byte aligned? 6707 ! 6708 bnz,a %ncc, .dcih4 6709 btst 3, %o3 6710 ! 6711 ! We're eight byte aligned. 6712 ! 6713 sethi %hi(hw_copy_limit_8), %o3 6714 ld [%o3 + %lo(hw_copy_limit_8)], %o3 6715 ! 6716 ! Is HW assist on? If not, do it with the aligned copy. 6717 ! 6718 tst %o3 6719 bz,pn %icc, .dcis8 6720 subcc %o3, %o2, %o3 6721 bge %ncc, .dcis8 6722 nop 6723 ba,pt %ncc, .big_copyin 6724 nop 6725.dcis8: 6726 ! 6727 ! Housekeeping for copy loops. Uses same idea as in the byte for 6728 ! byte copy loop above. 6729 ! 6730 add %o0, %o2, %o0 6731 add %o1, %o2, %o1 6732 sub %g0, %o2, %o3 6733 ba,pt %ncc, .didebc 6734 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy 6735 ! 6736 ! 4 byte aligned? 6737 ! 6738.dcih4: 6739 bnz %ncc, .dcih2 6740 sethi %hi(hw_copy_limit_4), %o3 6741 ld [%o3 + %lo(hw_copy_limit_4)], %o3 6742 ! 6743 ! Is HW assist on? If not, do it with the aligned copy. 6744 ! 6745 tst %o3 6746 bz,pn %icc, .dcis4 6747 subcc %o3, %o2, %o3 6748 ! 6749 ! We're negative if our size is less than or equal to hw_copy_limit_4. 6750 ! 6751 bge %ncc, .dcis4 6752 nop 6753 ba,pt %ncc, .big_copyin 6754 nop 6755.dcis4: 6756 ! 6757 ! Housekeeping for copy loops. Uses same idea as in the byte 6758 ! for byte copy loop above. 6759 ! 6760 add %o0, %o2, %o0 6761 add %o1, %o2, %o1 6762 sub %g0, %o2, %o3 6763 ba,pt %ncc, .didfbc 6764 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy 6765.dcih2: 6766 ! 6767 ! We're two byte aligned. Check for "smallness" 6768 ! done in delay at .dcih4 6769 ! 6770 bleu,pt %ncc, .dcis2 6771 sethi %hi(hw_copy_limit_2), %o3 6772 ld [%o3 + %lo(hw_copy_limit_2)], %o3 6773 ! 6774 ! Is HW assist on? If not, do it with the aligned copy. 6775 ! 6776 tst %o3 6777 bz,pn %icc, .dcis2 6778 subcc %o3, %o2, %o3 6779 ! 6780 ! Are we larger than the HW limit? 6781 ! 6782 bge %ncc, .dcis2 6783 nop 6784 ! 6785 ! HW assist is on and we're large enough to use it. 6786 ! 6787 ba,pt %ncc, .big_copyin 6788 nop 6789 ! 6790 ! Housekeeping for copy loops. Uses same idea as in the byte 6791 ! for byte copy loop above. 6792 ! 6793.dcis2: 6794 add %o0, %o2, %o0 6795 add %o1, %o2, %o1 6796 sub %g0, %o2, %o3 6797 ba,pt %ncc, .didtbc 6798 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy 6799 ! 6800.small_copyin: 6801 ! 6802 ! Why are we doing this AGAIN? There are certain conditions in 6803 ! big copyin that will cause us to forgo the HW assisted copys 6804 ! and bounce back to a non-hw assisted copy. This dispatches 6805 ! those copies. Note that we branch around this in the main line 6806 ! code. 6807 ! 6808 ! We make no check for limits or HW enablement here. We've 6809 ! already been told that we're a poster child so just go off 6810 ! and do it. 6811 ! 6812 or %o0, %o1, %o3 6813 btst 1, %o3 6814 bnz %icc, .dcibcp ! Most likely 6815 btst 7, %o3 6816 bz %icc, .dcis8 6817 btst 3, %o3 6818 bz %icc, .dcis4 6819 nop 6820 ba,pt %ncc, .dcis2 6821 nop 6822 ! 6823 ! Eight byte aligned copies. A steal from the original .small_copyin 6824 ! with modifications. %o2 is number of 8 byte chunks to copy. When 6825 ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more 6826 ! to copy. 6827 ! 6828 .align 32 6829.didebc: 6830 ldxa [%o0 + %o3]ASI_USER, %o4 6831 deccc %o2 6832 stx %o4, [%o1 + %o3] 6833 bg,pt %ncc, .didebc 6834 addcc %o3, 8, %o3 6835 ! 6836 ! End of copy loop. Most 8 byte aligned copies end here. 6837 ! 6838 bz,pt %ncc, .dcifh 6839 nop 6840 ! 6841 ! Something is left. Do it byte for byte. 6842 ! 6843 ba,pt %ncc, .dcicl 6844 lduba [%o0 + %o3]ASI_USER, %o4 6845 ! 6846 ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy. 6847 ! 6848 .align 32 6849.didfbc: 6850 lduwa [%o0 + %o3]ASI_USER, %o4 6851 deccc %o2 6852 st %o4, [%o1 + %o3] 6853 bg,pt %ncc, .didfbc 6854 addcc %o3, 4, %o3 6855 ! 6856 ! End of copy loop. Most 4 byte aligned copies end here. 6857 ! 6858 bz,pt %ncc, .dcifh 6859 nop 6860 ! 6861 ! Something is left. Do it byte for byte. 6862 ! 6863 ba,pt %ncc, .dcicl 6864 lduba [%o0 + %o3]ASI_USER, %o4 6865 ! 6866 ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to 6867 ! copy. 6868 ! 6869 .align 32 6870.didtbc: 6871 lduha [%o0 + %o3]ASI_USER, %o4 6872 deccc %o2 6873 sth %o4, [%o1 + %o3] 6874 bg,pt %ncc, .didtbc 6875 addcc %o3, 2, %o3 6876 ! 6877 ! End of copy loop. Most 2 byte aligned copies end here. 6878 ! 6879 bz,pt %ncc, .dcifh 6880 nop 6881 ! 6882 ! Deal with the last byte 6883 ! 6884 lduba [%o0 + %o3]ASI_USER, %o4 6885 stb %o4, [%o1 + %o3] 6886.dcifh: 6887 membar #Sync 6888 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 6889 retl 6890 clr %o0 6891 6892.big_copyin: 6893 ! We're going off to do a block copy. 6894 ! Switch fault hendlers and grab a window. We 6895 ! don't do a membar #Sync since we've done only 6896 ! kernel data to this point. 6897 stn %o4, [THREAD_REG + T_LOFAULT] 6898 6899 ! Copy in that reach here are larger than 256 bytes. The 6900 ! hw_copy_limit_1 is set to 256. Never set this limit less 6901 ! 128 bytes. 6902 save %sp, -SA(MINFRAME), %sp 6903.do_blockcopyin: 6904 6905 ! Swap src/dst since the code below is memcpy code 6906 ! and memcpy/bcopy have different calling sequences 6907 mov %i1, %i5 6908 mov %i0, %i1 6909 mov %i5, %i0 6910 6911 ! Block (64 bytes) align the destination. 6912 andcc %i0, 0x3f, %i3 ! is dst block aligned 6913 bz %ncc, copyin_blalign ! dst already block aligned 6914 sub %i3, 0x40, %i3 6915 neg %i3 ! bytes till dst 64 bytes aligned 6916 sub %i2, %i3, %i2 ! update i2 with new count 6917 6918 ! Based on source and destination alignment do 6919 ! either 8 bytes, 4 bytes, 2 bytes or byte copy. 6920 6921 ! Is dst & src 8B aligned 6922 or %i0, %i1, %o2 6923 andcc %o2, 0x7, %g0 6924 bz %ncc, .ci_alewdcp 6925 nop 6926 6927 ! Is dst & src 4B aligned 6928 andcc %o2, 0x3, %g0 6929 bz %ncc, .ci_alwdcp 6930 nop 6931 6932 ! Is dst & src 2B aligned 6933 andcc %o2, 0x1, %g0 6934 bz %ncc, .ci_alhlfwdcp 6935 nop 6936 6937 ! 1B aligned 69381: lduba [%i1]ASI_USER, %o2 6939 stb %o2, [%i0] 6940 inc %i1 6941 deccc %i3 6942 bgu,pt %ncc, 1b 6943 inc %i0 6944 6945 ba copyin_blalign 6946 nop 6947 6948 ! dst & src 4B aligned 6949.ci_alwdcp: 6950 lda [%i1]ASI_USER, %o2 6951 st %o2, [%i0] 6952 add %i1, 0x4, %i1 6953 subcc %i3, 0x4, %i3 6954 bgu,pt %ncc, .ci_alwdcp 6955 add %i0, 0x4, %i0 6956 6957 ba copyin_blalign 6958 nop 6959 6960 ! dst & src 2B aligned 6961.ci_alhlfwdcp: 6962 lduha [%i1]ASI_USER, %o2 6963 stuh %o2, [%i0] 6964 add %i1, 0x2, %i1 6965 subcc %i3, 0x2, %i3 6966 bgu,pt %ncc, .ci_alhlfwdcp 6967 add %i0, 0x2, %i0 6968 6969 ba copyin_blalign 6970 nop 6971 6972 ! dst & src 8B aligned 6973.ci_alewdcp: 6974 ldxa [%i1]ASI_USER, %o2 6975 stx %o2, [%i0] 6976 add %i1, 0x8, %i1 6977 subcc %i3, 0x8, %i3 6978 bgu,pt %ncc, .ci_alewdcp 6979 add %i0, 0x8, %i0 6980 6981copyin_blalign: 6982 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 6983 sub %i2, %i3, %i2 ! Residue bytes in %i2 6984 6985 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 6986 6987 andcc %i1, 0xf, %o2 ! is src quadword aligned 6988 bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits) 6989 nop 6990 cmp %o2, 0x8 6991 bg .ci_upper_double 6992 nop 6993 bl .ci_lower_double 6994 nop 6995 6996 ! Falls through when source offset is equal to 8 i.e. 6997 ! source is double word aligned. 6998 ! In this case no shift/merge of data is required 6999 7000 sub %i1, %o2, %i1 ! align the src at 16 bytes. 7001 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 7002 prefetcha [%l0]ASI_USER, #one_read 7003 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7004 add %l0, 0x40, %l0 7005.ci_loop0: 7006 add %i1, 0x10, %i1 7007 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 7008 7009 prefetcha [%l0]ASI_USER, #one_read 7010 7011 stxa %l3, [%i0+0x0]%asi 7012 stxa %l4, [%i0+0x8]%asi 7013 7014 add %i1, 0x10, %i1 7015 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7016 7017 stxa %l5, [%i0+0x10]%asi 7018 stxa %l2, [%i0+0x18]%asi 7019 7020 add %i1, 0x10, %i1 7021 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 7022 7023 stxa %l3, [%i0+0x20]%asi 7024 stxa %l4, [%i0+0x28]%asi 7025 7026 add %i1, 0x10, %i1 7027 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7028 7029 stxa %l5, [%i0+0x30]%asi 7030 stxa %l2, [%i0+0x38]%asi 7031 7032 add %l0, 0x40, %l0 7033 subcc %i3, 0x40, %i3 7034 bgu,pt %xcc, .ci_loop0 7035 add %i0, 0x40, %i0 7036 ba .ci_blkdone 7037 add %i1, %o2, %i1 ! increment the source by src offset 7038 ! the src offset was stored in %o2 7039 7040.ci_lower_double: 7041 7042 sub %i1, %o2, %i1 ! align the src at 16 bytes. 7043 sll %o2, 3, %o0 ! %o0 left shift 7044 mov 0x40, %o1 7045 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 7046 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 7047 prefetcha [%l0]ASI_USER, #one_read 7048 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2 7049 ! and %l3 has complete 7050 ! data 7051 add %l0, 0x40, %l0 7052.ci_loop1: 7053 add %i1, 0x10, %i1 7054 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data 7055 ! for this read. 7056 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 7057 ! into %l2 and %l3 7058 7059 prefetcha [%l0]ASI_USER, #one_read 7060 7061 stxa %l2, [%i0+0x0]%asi 7062 stxa %l3, [%i0+0x8]%asi 7063 7064 add %i1, 0x10, %i1 7065 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7066 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 7067 ! %l4 from previous read 7068 ! into %l4 and %l5 7069 stxa %l4, [%i0+0x10]%asi 7070 stxa %l5, [%i0+0x18]%asi 7071 7072 ! Repeat the same for next 32 bytes. 7073 7074 add %i1, 0x10, %i1 7075 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 7076 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 7077 7078 stxa %l2, [%i0+0x20]%asi 7079 stxa %l3, [%i0+0x28]%asi 7080 7081 add %i1, 0x10, %i1 7082 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7083 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 7084 7085 stxa %l4, [%i0+0x30]%asi 7086 stxa %l5, [%i0+0x38]%asi 7087 7088 add %l0, 0x40, %l0 7089 subcc %i3, 0x40, %i3 7090 bgu,pt %xcc, .ci_loop1 7091 add %i0, 0x40, %i0 7092 ba .ci_blkdone 7093 add %i1, %o2, %i1 ! increment the source by src offset 7094 ! the src offset was stored in %o2 7095 7096.ci_upper_double: 7097 7098 sub %i1, %o2, %i1 ! align the src at 16 bytes. 7099 sub %o2, 0x8, %o0 7100 sll %o0, 3, %o0 ! %o0 left shift 7101 mov 0x40, %o1 7102 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 7103 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 7104 prefetcha [%l0]ASI_USER, #one_read 7105 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3 7106 ! for this read and 7107 ! no data in %l2 7108 add %l0, 0x40, %l0 7109.ci_loop2: 7110 add %i1, 0x10, %i1 7111 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data 7112 ! and %l5 has partial 7113 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 7114 ! into %l3 and %l4 7115 prefetcha [%l0]ASI_USER, #one_read 7116 7117 stxa %l3, [%i0+0x0]%asi 7118 stxa %l4, [%i0+0x8]%asi 7119 7120 add %i1, 0x10, %i1 7121 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7122 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 7123 ! %l5 from previous read 7124 ! into %l5 and %l2 7125 7126 stxa %l5, [%i0+0x10]%asi 7127 stxa %l2, [%i0+0x18]%asi 7128 7129 ! Repeat the same for next 32 bytes. 7130 7131 add %i1, 0x10, %i1 7132 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 7133 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 7134 7135 stxa %l3, [%i0+0x20]%asi 7136 stxa %l4, [%i0+0x28]%asi 7137 7138 add %i1, 0x10, %i1 7139 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7140 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 7141 7142 stxa %l5, [%i0+0x30]%asi 7143 stxa %l2, [%i0+0x38]%asi 7144 7145 add %l0, 0x40, %l0 7146 subcc %i3, 0x40, %i3 7147 bgu,pt %xcc, .ci_loop2 7148 add %i0, 0x40, %i0 7149 ba .ci_blkdone 7150 add %i1, %o2, %i1 ! increment the source by src offset 7151 ! the src offset was stored in %o2 7152 7153 7154 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 7155.ci_blkcpy: 7156 7157 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 7158 prefetcha [%o0]ASI_USER, #one_read 7159 add %o0, 0x40, %o0 71601: 7161 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0 7162 add %i1, 0x10, %i1 7163 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 7164 add %i1, 0x10, %i1 7165 7166 prefetcha [%o0]ASI_USER, #one_read 7167 7168 stxa %l0, [%i0+0x0]%asi 7169 7170 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 7171 add %i1, 0x10, %i1 7172 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6 7173 add %i1, 0x10, %i1 7174 7175 stxa %l1, [%i0+0x8]%asi 7176 stxa %l2, [%i0+0x10]%asi 7177 stxa %l3, [%i0+0x18]%asi 7178 stxa %l4, [%i0+0x20]%asi 7179 stxa %l5, [%i0+0x28]%asi 7180 stxa %l6, [%i0+0x30]%asi 7181 stxa %l7, [%i0+0x38]%asi 7182 7183 add %o0, 0x40, %o0 7184 subcc %i3, 0x40, %i3 7185 bgu,pt %xcc, 1b 7186 add %i0, 0x40, %i0 7187 7188.ci_blkdone: 7189 membar #Sync 7190 7191 brz,pt %i2, .copyin_exit 7192 nop 7193 7194 ! Handle trailing bytes 7195 cmp %i2, 0x8 7196 blu,pt %ncc, .ci_residue 7197 nop 7198 7199 ! Can we do some 8B ops 7200 or %i1, %i0, %o2 7201 andcc %o2, 0x7, %g0 7202 bnz %ncc, .ci_last4 7203 nop 7204 7205 ! Do 8byte ops as long as possible 7206.ci_last8: 7207 ldxa [%i1]ASI_USER, %o2 7208 stx %o2, [%i0] 7209 add %i1, 0x8, %i1 7210 sub %i2, 0x8, %i2 7211 cmp %i2, 0x8 7212 bgu,pt %ncc, .ci_last8 7213 add %i0, 0x8, %i0 7214 7215 brz,pt %i2, .copyin_exit 7216 nop 7217 7218 ba .ci_residue 7219 nop 7220 7221.ci_last4: 7222 ! Can we do 4B ops 7223 andcc %o2, 0x3, %g0 7224 bnz %ncc, .ci_last2 7225 nop 72261: 7227 lda [%i1]ASI_USER, %o2 7228 st %o2, [%i0] 7229 add %i1, 0x4, %i1 7230 sub %i2, 0x4, %i2 7231 cmp %i2, 0x4 7232 bgu,pt %ncc, 1b 7233 add %i0, 0x4, %i0 7234 7235 brz,pt %i2, .copyin_exit 7236 nop 7237 7238 ba .ci_residue 7239 nop 7240 7241.ci_last2: 7242 ! Can we do 2B ops 7243 andcc %o2, 0x1, %g0 7244 bnz %ncc, .ci_residue 7245 nop 7246 72471: 7248 lduha [%i1]ASI_USER, %o2 7249 stuh %o2, [%i0] 7250 add %i1, 0x2, %i1 7251 sub %i2, 0x2, %i2 7252 cmp %i2, 0x2 7253 bgu,pt %ncc, 1b 7254 add %i0, 0x2, %i0 7255 7256 brz,pt %i2, .copyin_exit 7257 nop 7258 7259 ! Copy the residue as byte copy 7260.ci_residue: 7261 lduba [%i1]ASI_USER, %i4 7262 stb %i4, [%i0] 7263 inc %i1 7264 deccc %i2 7265 bgu,pt %xcc, .ci_residue 7266 inc %i0 7267 7268.copyin_exit: 7269 membar #Sync 7270 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 7271 ret 7272 restore %g0, 0, %o0 7273.copyin_err: 7274 ldn [THREAD_REG + T_COPYOPS], %o4 7275 brz %o4, 2f 7276 nop 7277 ldn [%o4 + CP_COPYIN], %g2 7278 jmp %g2 7279 nop 72802: 7281 retl 7282 mov -1, %o0 7283#endif /* NIAGARA_IMPL */ 7284 SET_SIZE(copyin) 7285 7286#endif /* lint */ 7287 7288#ifdef lint 7289 7290/*ARGSUSED*/ 7291int 7292xcopyin(const void *uaddr, void *kaddr, size_t count) 7293{ return (0); } 7294 7295#else /* lint */ 7296 7297 ENTRY(xcopyin) 7298 sethi %hi(.xcopyin_err), REAL_LOFAULT 7299 b .do_copyin 7300 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 7301.xcopyin_err: 7302 ldn [THREAD_REG + T_COPYOPS], %o4 7303 brz %o4, 2f 7304 nop 7305 ldn [%o4 + CP_XCOPYIN], %g2 7306 jmp %g2 7307 nop 73082: 7309 retl 7310 mov %g1, %o0 7311 SET_SIZE(xcopyin) 7312 7313#endif /* lint */ 7314 7315#ifdef lint 7316 7317/*ARGSUSED*/ 7318int 7319xcopyin_little(const void *uaddr, void *kaddr, size_t count) 7320{ return (0); } 7321 7322#else /* lint */ 7323 7324 ENTRY(xcopyin_little) 7325 sethi %hi(.little_err), %o4 7326 ldn [THREAD_REG + T_LOFAULT], %o5 7327 or %o4, %lo(.little_err), %o4 7328 membar #Sync ! sync error barrier 7329 stn %o4, [THREAD_REG + T_LOFAULT] 7330 7331 subcc %g0, %o2, %o3 7332 add %o0, %o2, %o0 7333 bz,pn %ncc, 2f ! check for zero bytes 7334 sub %o2, 1, %o4 7335 add %o0, %o4, %o0 ! start w/last byte 7336 add %o1, %o2, %o1 7337 lduba [%o0+%o3]ASI_AIUSL, %o4 7338 73391: stb %o4, [%o1+%o3] 7340 inccc %o3 7341 sub %o0, 2, %o0 ! get next byte 7342 bcc,a,pt %ncc, 1b 7343 lduba [%o0+%o3]ASI_AIUSL, %o4 7344 73452: membar #Sync ! sync error barrier 7346 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 7347 retl 7348 mov %g0, %o0 ! return (0) 7349 7350.little_err: 7351 membar #Sync ! sync error barrier 7352 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 7353 retl 7354 mov %g1, %o0 7355 SET_SIZE(xcopyin_little) 7356 7357#endif /* lint */ 7358 7359 7360/* 7361 * Copy a block of storage - must not overlap (from + len <= to). 7362 * No fault handler installed (to be called under on_fault()) 7363 */ 7364#if defined(lint) 7365 7366/* ARGSUSED */ 7367void 7368copyin_noerr(const void *ufrom, void *kto, size_t count) 7369{} 7370 7371#else /* lint */ 7372 7373 ENTRY(copyin_noerr) 7374 sethi %hi(.copyio_noerr), REAL_LOFAULT 7375 b .do_copyin 7376 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 7377.copyio_noerr: 7378 jmp SAVED_LOFAULT 7379 nop 7380 SET_SIZE(copyin_noerr) 7381 7382#endif /* lint */ 7383 7384/* 7385 * Copy a block of storage - must not overlap (from + len <= to). 7386 * No fault handler installed (to be called under on_fault()) 7387 */ 7388 7389#if defined(lint) 7390 7391/* ARGSUSED */ 7392void 7393copyout_noerr(const void *kfrom, void *uto, size_t count) 7394{} 7395 7396#else /* lint */ 7397 7398 ENTRY(copyout_noerr) 7399 sethi %hi(.copyio_noerr), REAL_LOFAULT 7400 b .do_copyout 7401 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 7402 SET_SIZE(copyout_noerr) 7403 7404#endif /* lint */ 7405 7406#if defined(lint) 7407 7408int use_hw_bcopy = 1; 7409int use_hw_bzero = 1; 7410uint_t hw_copy_limit_1 = 0x100; 7411uint_t hw_copy_limit_2 = 0x200; 7412uint_t hw_copy_limit_4 = 0x400; 7413uint_t hw_copy_limit_8 = 0x400; 7414 7415#else /* !lint */ 7416 7417 .align 4 7418 DGDEF(use_hw_bcopy) 7419 .word 1 7420 DGDEF(use_hw_bzero) 7421 .word 1 7422 DGDEF(hw_copy_limit_1) 7423 .word 0x100 7424 DGDEF(hw_copy_limit_2) 7425 .word 0x200 7426 DGDEF(hw_copy_limit_4) 7427 .word 0x400 7428 DGDEF(hw_copy_limit_8) 7429 .word 0x400 7430 7431 .align 64 7432 .section ".text" 7433#endif /* !lint */ 7434 7435/* 7436 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 7437 * longer than 256 bytes in length using Niagara's block stores/quad store. 7438 * If the criteria for using this routine are not met then it calls bzero 7439 * and returns 1. Otherwise 0 is returned indicating success. 7440 * Caller is responsible for ensuring use_hw_bzero is true and that 7441 * kpreempt_disable() has been called. 7442 */ 7443#ifdef lint 7444/*ARGSUSED*/ 7445int 7446hwblkclr(void *addr, size_t len) 7447{ 7448 return(0); 7449} 7450#else /* lint */ 7451 ! %i0 - start address 7452 ! %i1 - length of region (multiple of 64) 7453 7454 ENTRY(hwblkclr) 7455 save %sp, -SA(MINFRAME), %sp 7456 7457 ! Must be block-aligned 7458 andcc %i0, 0x3f, %g0 7459 bnz,pn %ncc, 1f 7460 nop 7461 7462 ! ... and must be 256 bytes or more 7463 cmp %i1, 0x100 7464 blu,pn %ncc, 1f 7465 nop 7466 7467 ! ... and length must be a multiple of 64 7468 andcc %i1, 0x3f, %g0 7469 bz,pn %ncc, .pz_doblock 7470 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 7471 74721: ! punt, call bzero but notify the caller that bzero was used 7473 mov %i0, %o0 7474 call bzero 7475 mov %i1, %o1 7476 ret 7477 restore %g0, 1, %o0 ! return (1) - did not use block operations 7478 7479 ! Already verified that there are at least 256 bytes to set 7480.pz_doblock: 7481 stxa %g0, [%i0+0x0]%asi 7482 stxa %g0, [%i0+0x40]%asi 7483 stxa %g0, [%i0+0x80]%asi 7484 stxa %g0, [%i0+0xc0]%asi 7485 7486 stxa %g0, [%i0+0x8]%asi 7487 stxa %g0, [%i0+0x10]%asi 7488 stxa %g0, [%i0+0x18]%asi 7489 stxa %g0, [%i0+0x20]%asi 7490 stxa %g0, [%i0+0x28]%asi 7491 stxa %g0, [%i0+0x30]%asi 7492 stxa %g0, [%i0+0x38]%asi 7493 7494 stxa %g0, [%i0+0x48]%asi 7495 stxa %g0, [%i0+0x50]%asi 7496 stxa %g0, [%i0+0x58]%asi 7497 stxa %g0, [%i0+0x60]%asi 7498 stxa %g0, [%i0+0x68]%asi 7499 stxa %g0, [%i0+0x70]%asi 7500 stxa %g0, [%i0+0x78]%asi 7501 7502 stxa %g0, [%i0+0x88]%asi 7503 stxa %g0, [%i0+0x90]%asi 7504 stxa %g0, [%i0+0x98]%asi 7505 stxa %g0, [%i0+0xa0]%asi 7506 stxa %g0, [%i0+0xa8]%asi 7507 stxa %g0, [%i0+0xb0]%asi 7508 stxa %g0, [%i0+0xb8]%asi 7509 7510 stxa %g0, [%i0+0xc8]%asi 7511 stxa %g0, [%i0+0xd0]%asi 7512 stxa %g0, [%i0+0xd8]%asi 7513 stxa %g0, [%i0+0xe0]%asi 7514 stxa %g0, [%i0+0xe8]%asi 7515 stxa %g0, [%i0+0xf0]%asi 7516 stxa %g0, [%i0+0xf8]%asi 7517 7518 sub %i1, 0x100, %i1 7519 cmp %i1, 0x100 7520 bgu,pt %ncc, .pz_doblock 7521 add %i0, 0x100, %i0 7522 75232: 7524 ! Check if more than 64 bytes to set 7525 cmp %i1,0x40 7526 blu %ncc, .pz_finish 7527 nop 7528 75293: 7530 stxa %g0, [%i0+0x0]%asi 7531 stxa %g0, [%i0+0x8]%asi 7532 stxa %g0, [%i0+0x10]%asi 7533 stxa %g0, [%i0+0x18]%asi 7534 stxa %g0, [%i0+0x20]%asi 7535 stxa %g0, [%i0+0x28]%asi 7536 stxa %g0, [%i0+0x30]%asi 7537 stxa %g0, [%i0+0x38]%asi 7538 7539 subcc %i1, 0x40, %i1 7540 bgu,pt %ncc, 3b 7541 add %i0, 0x40, %i0 7542 7543.pz_finish: 7544 membar #Sync 7545 ret 7546 restore %g0, 0, %o0 ! return (bzero or not) 7547 SET_SIZE(hwblkclr) 7548#endif /* lint */ 7549 7550#ifdef lint 7551/* Copy 32 bytes of data from src to dst using physical addresses */ 7552/*ARGSUSED*/ 7553void 7554hw_pa_bcopy32(uint64_t src, uint64_t dst) 7555{} 7556#else /*!lint */ 7557 7558 /* 7559 * Copy 32 bytes of data from src (%o0) to dst (%o1) 7560 * using physical addresses. 7561 */ 7562 ENTRY_NP(hw_pa_bcopy32) 7563 rdpr %pstate, %g1 7564 andn %g1, PSTATE_IE, %g2 7565 wrpr %g0, %g2, %pstate 7566 7567 ldxa [%o0]ASI_MEM, %o2 7568 add %o0, 8, %o0 7569 ldxa [%o0]ASI_MEM, %o3 7570 add %o0, 8, %o0 7571 ldxa [%o0]ASI_MEM, %o4 7572 add %o0, 8, %o0 7573 ldxa [%o0]ASI_MEM, %o5 7574 stxa %o2, [%o1]ASI_MEM 7575 add %o1, 8, %o1 7576 stxa %o3, [%o1]ASI_MEM 7577 add %o1, 8, %o1 7578 stxa %o4, [%o1]ASI_MEM 7579 add %o1, 8, %o1 7580 stxa %o5, [%o1]ASI_MEM 7581 7582 membar #Sync 7583 retl 7584 wrpr %g0, %g1, %pstate 7585 SET_SIZE(hw_pa_bcopy32) 7586#endif /* lint */ 7587 7588/* 7589 * Zero a block of storage. 7590 * 7591 * uzero is used by the kernel to zero a block in user address space. 7592 */ 7593 7594/* 7595 * Control flow of the bzero/kzero/uzero routine. 7596 * 7597 * For fewer than 7 bytes stores, bytes will be zeroed. 7598 * 7599 * For less than 15 bytes stores, align the address on 4 byte boundary. 7600 * Then store as many 4-byte chunks, followed by trailing bytes. 7601 * 7602 * For sizes greater than 15 bytes, align the address on 8 byte boundary. 7603 * if (count > 128) { 7604 * store as many 8-bytes chunks to block align the address 7605 * store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR 7606 * store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero) 7607 * } 7608 * Store as many 8-byte chunks, followed by trailing bytes. 7609 */ 7610 7611#if defined(lint) 7612 7613/* ARGSUSED */ 7614int 7615kzero(void *addr, size_t count) 7616{ return(0); } 7617 7618/* ARGSUSED */ 7619void 7620uzero(void *addr, size_t count) 7621{} 7622 7623#else /* lint */ 7624 7625 ENTRY(uzero) 7626 ! 7627 ! Set a new lo_fault handler only if we came in with one 7628 ! already specified. 7629 ! 7630 wr %g0, ASI_USER, %asi 7631 ldn [THREAD_REG + T_LOFAULT], %o5 7632 tst %o5 7633 bz,pt %ncc, .do_zero 7634 sethi %hi(.zeroerr), %o2 7635 or %o2, %lo(.zeroerr), %o2 7636 membar #Sync 7637 ba,pt %ncc, .do_zero 7638 stn %o2, [THREAD_REG + T_LOFAULT] 7639 7640 ENTRY(kzero) 7641 ! 7642 ! Always set a lo_fault handler 7643 ! 7644 wr %g0, ASI_P, %asi 7645 ldn [THREAD_REG + T_LOFAULT], %o5 7646 sethi %hi(.zeroerr), %o2 7647 or %o5, LOFAULT_SET, %o5 7648 or %o2, %lo(.zeroerr), %o2 7649 membar #Sync 7650 ba,pt %ncc, .do_zero 7651 stn %o2, [THREAD_REG + T_LOFAULT] 7652 7653/* 7654 * We got here because of a fault during kzero or if 7655 * uzero or bzero was called with t_lofault non-zero. 7656 * Otherwise we've already run screaming from the room. 7657 * Errno value is in %g1. Note that we're here iff 7658 * we did set t_lofault. 7659 */ 7660.zeroerr: 7661 ! 7662 ! Undo asi register setting. Just set it to be the 7663 ! kernel default without checking. 7664 ! 7665 wr %g0, ASI_P, %asi 7666 7667 ! 7668 ! We did set t_lofault. It may well have been zero coming in. 7669 ! 76701: 7671 tst %o5 7672 membar #Sync 7673 bne,pn %ncc, 3f 7674 andncc %o5, LOFAULT_SET, %o5 76752: 7676 ! 7677 ! Old handler was zero. Just return the error. 7678 ! 7679 retl ! return 7680 mov %g1, %o0 ! error code from %g1 76813: 7682 ! 7683 ! We're here because %o5 was non-zero. It was non-zero 7684 ! because either LOFAULT_SET was present, a previous fault 7685 ! handler was present or both. In all cases we need to reset 7686 ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET 7687 ! before we either simply return the error or we invoke the 7688 ! previously specified handler. 7689 ! 7690 be %ncc, 2b 7691 stn %o5, [THREAD_REG + T_LOFAULT] 7692 jmp %o5 ! goto real handler 7693 nop 7694 SET_SIZE(kzero) 7695 SET_SIZE(uzero) 7696 7697#endif /* lint */ 7698 7699/* 7700 * Zero a block of storage. 7701 */ 7702 7703#if defined(lint) 7704 7705/* ARGSUSED */ 7706void 7707bzero(void *addr, size_t count) 7708{} 7709 7710#else /* lint */ 7711 7712 ENTRY(bzero) 7713 wr %g0, ASI_P, %asi 7714 7715 ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector 7716 tst %o5 7717 bz,pt %ncc, .do_zero 7718 sethi %hi(.zeroerr), %o2 7719 or %o2, %lo(.zeroerr), %o2 7720 membar #Sync ! sync error barrier 7721 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector 7722 7723.do_zero: 7724 cmp %o1, 7 7725 blu,pn %ncc, .byteclr 7726 nop 7727 7728 cmp %o1, 15 7729 blu,pn %ncc, .wdalign 7730 nop 7731 7732 andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound 7733 bz,pt %ncc, .blkalign ! already double aligned 7734 sub %o3, 8, %o3 ! -(bytes till double aligned) 7735 add %o1, %o3, %o1 ! update o1 with new count 7736 77371: 7738 stba %g0, [%o0]%asi 7739 inccc %o3 7740 bl,pt %ncc, 1b 7741 inc %o0 7742 7743 ! Now address is double aligned 7744.blkalign: 7745 cmp %o1, 0x80 ! check if there are 128 bytes to set 7746 blu,pn %ncc, .bzero_small 7747 mov %o1, %o3 7748 7749 sethi %hi(use_hw_bzero), %o2 7750 ld [%o2 + %lo(use_hw_bzero)], %o2 7751 tst %o2 7752 bz %ncc, .bzero_small 7753 mov %o1, %o3 7754 7755 rd %asi, %o3 7756 wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 7757 cmp %o3, ASI_P 7758 bne,a %ncc, .algnblk 7759 wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi 7760 7761.algnblk: 7762 andcc %o0, 0x3f, %o3 ! is block aligned? 7763 bz,pt %ncc, .bzero_blk 7764 sub %o3, 0x40, %o3 ! -(bytes till block aligned) 7765 add %o1, %o3, %o1 ! o1 is the remainder 7766 7767 ! Clear -(%o3) bytes till block aligned 77681: 7769 stxa %g0, [%o0]%asi 7770 addcc %o3, 8, %o3 7771 bl,pt %ncc, 1b 7772 add %o0, 8, %o0 7773 7774.bzero_blk: 7775 and %o1, 0x3f, %o3 ! calc bytes left after blk clear 7776 andn %o1, 0x3f, %o4 ! calc size of blocks in bytes 7777 7778 cmp %o4, 0x100 ! 256 bytes or more 7779 blu,pn %ncc, 3f 7780 nop 7781 77822: 7783 stxa %g0, [%o0+0x0]%asi 7784 stxa %g0, [%o0+0x40]%asi 7785 stxa %g0, [%o0+0x80]%asi 7786 stxa %g0, [%o0+0xc0]%asi 7787 7788 stxa %g0, [%o0+0x8]%asi 7789 stxa %g0, [%o0+0x10]%asi 7790 stxa %g0, [%o0+0x18]%asi 7791 stxa %g0, [%o0+0x20]%asi 7792 stxa %g0, [%o0+0x28]%asi 7793 stxa %g0, [%o0+0x30]%asi 7794 stxa %g0, [%o0+0x38]%asi 7795 7796 stxa %g0, [%o0+0x48]%asi 7797 stxa %g0, [%o0+0x50]%asi 7798 stxa %g0, [%o0+0x58]%asi 7799 stxa %g0, [%o0+0x60]%asi 7800 stxa %g0, [%o0+0x68]%asi 7801 stxa %g0, [%o0+0x70]%asi 7802 stxa %g0, [%o0+0x78]%asi 7803 7804 stxa %g0, [%o0+0x88]%asi 7805 stxa %g0, [%o0+0x90]%asi 7806 stxa %g0, [%o0+0x98]%asi 7807 stxa %g0, [%o0+0xa0]%asi 7808 stxa %g0, [%o0+0xa8]%asi 7809 stxa %g0, [%o0+0xb0]%asi 7810 stxa %g0, [%o0+0xb8]%asi 7811 7812 stxa %g0, [%o0+0xc8]%asi 7813 stxa %g0, [%o0+0xd0]%asi 7814 stxa %g0, [%o0+0xd8]%asi 7815 stxa %g0, [%o0+0xe0]%asi 7816 stxa %g0, [%o0+0xe8]%asi 7817 stxa %g0, [%o0+0xf0]%asi 7818 stxa %g0, [%o0+0xf8]%asi 7819 7820 sub %o4, 0x100, %o4 7821 cmp %o4, 0x100 7822 bgu,pt %ncc, 2b 7823 add %o0, 0x100, %o0 7824 78253: 7826 ! ... check if 64 bytes to set 7827 cmp %o4, 0x40 7828 blu %ncc, .bzero_blk_done 7829 nop 7830 78314: 7832 stxa %g0, [%o0+0x0]%asi 7833 stxa %g0, [%o0+0x8]%asi 7834 stxa %g0, [%o0+0x10]%asi 7835 stxa %g0, [%o0+0x18]%asi 7836 stxa %g0, [%o0+0x20]%asi 7837 stxa %g0, [%o0+0x28]%asi 7838 stxa %g0, [%o0+0x30]%asi 7839 stxa %g0, [%o0+0x38]%asi 7840 7841 subcc %o4, 0x40, %o4 7842 bgu,pt %ncc, 3b 7843 add %o0, 0x40, %o0 7844 7845.bzero_blk_done: 7846 membar #Sync 7847 ! 7848 ! Undo asi register setting. 7849 ! 7850 rd %asi, %o4 7851 wr %g0, ASI_P, %asi 7852 cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P 7853 bne,a %ncc, .bzero_small 7854 wr %g0, ASI_USER, %asi 7855 7856.bzero_small: 7857 ! Set the remaining doubles 7858 subcc %o3, 8, %o3 ! Can we store any doubles? 7859 blu,pn %ncc, .byteclr 7860 and %o1, 7, %o1 ! calc bytes left after doubles 7861 7862.dbclr: 7863 stxa %g0, [%o0]%asi ! Clear the doubles 7864 subcc %o3, 8, %o3 7865 bgeu,pt %ncc, .dbclr 7866 add %o0, 8, %o0 7867 7868 ba .byteclr 7869 nop 7870 7871.wdalign: 7872 andcc %o0, 3, %o3 ! is add aligned on a word boundary 7873 bz,pn %ncc, .wdclr 7874 andn %o1, 3, %o3 ! create word sized count in %o3 7875 7876 dec %o1 ! decrement count 7877 stba %g0, [%o0]%asi ! clear a byte 7878 ba .wdalign 7879 inc %o0 ! next byte 7880 7881.wdclr: 7882 sta %g0, [%o0]%asi ! 4-byte clearing loop 7883 subcc %o3, 4, %o3 7884 bnz,pt %ncc, .wdclr 7885 inc 4, %o0 7886 7887 and %o1, 3, %o1 ! leftover count, if any 7888 7889.byteclr: 7890 ! Set the leftover bytes 7891 brz %o1, .bzero_exit 7892 nop 7893 78947: 7895 deccc %o1 ! byte clearing loop 7896 stba %g0, [%o0]%asi 7897 bgu,pt %ncc, 7b 7898 inc %o0 7899 7900.bzero_exit: 7901 ! 7902 ! We're just concerned with whether t_lofault was set 7903 ! when we came in. We end up here from either kzero() 7904 ! or bzero(). kzero() *always* sets a lofault handler. 7905 ! It ors LOFAULT_SET into %o5 to indicate it has done 7906 ! this even if the value of %o5 is otherwise zero. 7907 ! bzero() sets a lofault handler *only* if one was 7908 ! previously set. Accordingly we need to examine 7909 ! %o5 and if it is non-zero be sure to clear LOFAULT_SET 7910 ! before resetting the error handler. 7911 ! 7912 tst %o5 7913 bz %ncc, 1f 7914 andn %o5, LOFAULT_SET, %o5 7915 membar #Sync ! sync error barrier 7916 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 79171: 7918 retl 7919 clr %o0 ! return (0) 7920 7921 SET_SIZE(bzero) 7922#endif /* lint */ 7923