/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */


#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#include "assym.h"


/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 * ! WARNING : <Register usage convention>
 * ! In kcopy(), %o5 holds the previous error handler and a flag
 * ! LOFAULT_SET (low bits). %o5 is null in bcopy().
 * ! %o5 is not available for any other use.
 *
 * On entry:
 * ! Determine whether to use the FP register version or
 * ! the leaf routine version depending on the size of the copy.
 * ! Set up error handling accordingly.
 * ! The transition point depends on FP_COPY.
 * ! For both versions %o5 is reserved.
 *
 * kcopy():
 *	if (length > FP_COPY)
 *		go to regular_kcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;	! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;		! OR in LOFAULT_SET flag
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_kcopy:
 *	save_registers()
 *	%o5 = curthread->t_lofault;	! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;		! OR in LOFAULT_SET flag
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * bcopy():
 *	if (length > FP_COPY)
 *		go to regular_bcopy
 *
 *	! Setup_leaf_rtn_error_handler
 *	%o5 = curthread->t_lofault;	! save existing handler in %o5
 *	curthread->t_lofault = .sm_copyerr;
 *	goto small_bcopy();
 *
 * regular_bcopy:
 *	%o5 = curthread->t_lofault;	! save existing handler in %o5
 *	curthread->t_lofault = .copyerr;
 *	goto do_copy();
 *
 * small_bcopy:
 *	! handle copies smaller than FP_COPY
 *	restore t_lofault handler
 *	exit
 *
 * do_copy:
 *	! handle copies larger than FP_COPY
 *	save fp_regs
 *	blockcopy;
 *	restore fp_regs
 *	restore t_lofault handler if came from kcopy();
 *
 *
 * In leaf lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	return (errno)
 *
 * In lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	restore fp_regs
 *	return (errno)
 *
 *
 *
 * For all of bcopy/copyin/copyout the copy logic is specialized according
 * to how the src and dst are aligned and how much data needs to be moved.
 * The following comments apply to the N2/RF code (#if !defined(NIAGARA_IMPL)).
 *
 * N2/RF Flow :
 *
 * if (count < FP_COPY) {			(584 bytes)
 *	set small fault handler (no register window save/restore)
 *	if count < SHORTCOPY		(7 bytes)
 *		copy bytes; go to short_exit
 *	else
 *		determine dst alignment, move minimum bytes/halfwords to
 *		get dst aligned on long word boundary
 *		if ( src is on long word boundary ) {
 * medlong:					src/dst aligned on 8 bytes
 *			copy with ldx/stx in 4-way unrolled loop;
 *			copy final 0-31 bytes; go to short_exit
 *		} else {			src/dst not aligned on 8 bytes
 *			if src is word aligned, ld/st words in 32-byte chunks
 *			if src is half word aligned, ld half, ld word, ld half;
 *			pack into long word, store long words in 32-byte chunks
 *			if src is byte aligned, ld byte, half, word parts; pack
 *			into long word, store long words in 32-byte chunks
 *			move final 0-31 bytes according to src alignment;
 *			go to short_exit
 * short_exit:
 *	restore trap handler if needed, retl
 * } else {					More than FP_COPY bytes
 *	set fault handler
 *	disable kernel preemption
 *	save registers, save FP registers if in use
 *	move bytes to align destination register on long word boundary
 *	if (src is on long word boundary) {	src/dst aligned on 8 bytes
 *		align dst on 64 byte boundary; use 8-way test for each of 8
 *		possible src alignments relative to a 64 byte boundary to
 *		select the 16-way unrolled loop (128 bytes) to use for
 *		block load, fmovd, block-init-store, block-store, fmovd
 *		operations, then go to remain_stuff.
 * remain_stuff:
 *		move remaining bytes. go to long_exit
 *	} else {
 *		setup alignaddr for faligndata instructions
 *		align dst on 64 byte boundary; use 8-way test for each of 8
 *		possible src alignments to nearest long word relative to a
 *		64 byte boundary to select the 8-way unrolled loop (64 bytes)
 *		to use for the block load, falign, fmovd, block-store loop
 *		(only use block-init-store when src/dst on 8 byte boundaries.)
 *		goto unalign_done.
 * unalign_done:
 *		move remaining bytes for unaligned cases. go to long_exit
 * long_exit:
 *		restore %gsr, FP regs (either from stack or set to zero),
 *		restore trap handler, check for kernel preemption request,
 *		handle if needed, ret.
 * }
 *
 * Other platforms include hw_bcopy_limit_[1248] to control the exact
 * point where the FP register code is used. On those platforms, the
 * FP register code did not leave data in L2 cache, potentially affecting
 * performance more than the gain/loss from the algorithm difference.
 * For N2/RF, block store places data in the L2 cache, so use or non-use
 * of the FP registers has no effect on L2 cache behavior.
 * The cost of testing hw_bcopy_limit_* according to different
 * alignments exceeds 50 cycles for all cases, even when hw_bcopy_limits
 * were not used. That cost was judged too high relative to the benefits,
 * so the hw_bcopy_limit option is omitted from this code.
 */

/*
 * Less than or equal to this number of bytes we will always copy
 * byte-for-byte.
 */
#define	SMALL_LIMIT	7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
 * handler was set
 */
#define	LOFAULT_SET 2

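/*
 * The flag values used here are chosen so they can ride in the low bits
 * of the saved t_lofault address held in %o5: the fault handlers are
 * code addresses aligned to at least 4 bytes (see the .align directives
 * below), so their low bits are known to be zero. A minimal C sketch of
 * the tagging scheme (illustrative only; tag_handler and untag_handler
 * are hypothetical names, not functions in this file):
 *
 *	uintptr_t
 *	tag_handler(void *old_lofault)
 *	{
 *		// keep the old handler and the "we set t_lofault" flag
 *		// together in one register-sized value
 *		return ((uintptr_t)old_lofault | LOFAULT_SET);
 *	}
 *
 *	void *
 *	untag_handler(uintptr_t tagged)
 *	{
 *		// recover the original handler address
 *		return ((void *)(tagged & ~(uintptr_t)LOFAULT_SET));
 *	}
 */
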
/*
 * This define is to align data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2.
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1	;\
	sllx	data2, lshift, data2	;\
	srlx	data3, rshift, tmp	;\
	or	data2, tmp, data2
/*
 * This macro is to align the data. Basically it merges
 * data1 and data2 to form a double word.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1

#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * LOFAULT_SET: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define	FPUSED_FLAG	1
#define	LOFAULT_SET	2
#define	COPY_FLAGS	(FPUSED_FLAG | LOFAULT_SET)
#define	KPREEMPT_FLAG	4

#define	ALIGN_OFF_1_7			\
	faligndata %d0, %d2, %d48	;\
	faligndata %d2, %d4, %d50	;\
	faligndata %d4, %d6, %d52	;\
	faligndata %d6, %d8, %d54	;\
	faligndata %d8, %d10, %d56	;\
	faligndata %d10, %d12, %d58	;\
	faligndata %d12, %d14, %d60	;\
	faligndata %d14, %d16, %d62

#define	ALIGN_OFF_8_15			\
	faligndata %d2, %d4, %d48	;\
	faligndata %d4, %d6, %d50	;\
	faligndata %d6, %d8, %d52	;\
	faligndata %d8, %d10, %d54	;\
	faligndata %d10, %d12, %d56	;\
	faligndata %d12, %d14, %d58	;\
	faligndata %d14, %d16, %d60	;\
	faligndata %d16, %d18, %d62

#define	ALIGN_OFF_16_23			\
	faligndata %d4, %d6, %d48	;\
	faligndata %d6, %d8, %d50	;\
	faligndata %d8, %d10, %d52	;\
	faligndata %d10, %d12, %d54	;\
	faligndata %d12, %d14, %d56	;\
	faligndata %d14, %d16, %d58	;\
	faligndata %d16, %d18, %d60	;\
	faligndata %d18, %d20, %d62

#define	ALIGN_OFF_24_31			\
	faligndata %d6, %d8, %d48	;\
	faligndata %d8, %d10, %d50	;\
	faligndata %d10, %d12, %d52	;\
	faligndata %d12, %d14, %d54	;\
	faligndata %d14, %d16, %d56	;\
	faligndata %d16, %d18, %d58	;\
	faligndata %d18, %d20, %d60	;\
	faligndata %d20, %d22, %d62

#define	ALIGN_OFF_32_39			\
	faligndata %d8, %d10, %d48	;\
	faligndata %d10, %d12, %d50	;\
	faligndata %d12, %d14, %d52	;\
	faligndata %d14, %d16, %d54	;\
	faligndata %d16, %d18, %d56	;\
	faligndata %d18, %d20, %d58	;\
	faligndata %d20, %d22, %d60	;\
	faligndata %d22, %d24, %d62

#define	ALIGN_OFF_40_47			\
	faligndata %d10, %d12, %d48	;\
	faligndata %d12, %d14, %d50	;\
	faligndata %d14, %d16, %d52	;\
	faligndata %d16, %d18, %d54	;\
	faligndata %d18, %d20, %d56	;\
	faligndata %d20, %d22, %d58	;\
	faligndata %d22, %d24, %d60	;\
	faligndata %d24, %d26, %d62

#define	ALIGN_OFF_48_55			\
	faligndata %d12, %d14, %d48	;\
	faligndata %d14, %d16, %d50	;\
	faligndata %d16, %d18, %d52	;\
	faligndata %d18, %d20, %d54	;\
	faligndata %d20, %d22, %d56	;\
	faligndata %d22, %d24, %d58	;\
	faligndata %d24, %d26, %d60	;\
	faligndata %d26, %d28, %d62

#define	ALIGN_OFF_56_63			\
	faligndata %d14, %d16, %d48	;\
	faligndata %d16, %d18, %d50	;\
	faligndata %d18, %d20, %d52	;\
	faligndata %d20, %d22, %d54	;\
	faligndata %d22, %d24, %d56	;\
	faligndata %d24, %d26, %d58	;\
	faligndata %d26, %d28, %d60	;\
	faligndata %d28, %d30, %d62
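
/*
 * Both ALIGN_DATA above and the ALIGN_OFF_* faligndata groups implement
 * the same idea: reassemble aligned 8-byte words from a misaligned
 * source stream by shifting adjacent double words and merging them.
 * A rough C model of one merge step, for a source that is `off' bytes
 * past an 8-byte boundary (sketch only; SPARC is big-endian, and
 * merge_step is a hypothetical name, not part of this file):
 *
 *	uint64_t
 *	merge_step(uint64_t hi, uint64_t lo, unsigned off)  // 1 <= off <= 7
 *	{
 *		// drop the first off bytes of hi; the result is hi's
 *		// remaining (8 - off) bytes followed by the first off
 *		// bytes of lo. This is what faligndata does with
 *		// GSR.align == off, and what ALIGN_DATA does with
 *		// lshift == 8 * off and rshift == 64 - 8 * off.
 *		return ((hi << (8 * off)) | (lo >> (64 - 8 * off)));
 *	}
 */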

/*
 * FP_COPY indicates the minimum number of bytes needed
 * to justify using FP/VIS-accelerated memory operations.
 * The FPBLK code assumes a minimum number of bytes are available
 * to be moved on entry. Check that code carefully before
 * reducing FP_COPY below 256.
 */
#define	FP_COPY			584
#define	SHORTCOPY		7
#define	ASI_STBI_P		ASI_BLK_INIT_ST_QUAD_LDD_P
#define	ASI_STBI_AIUS		ASI_BLK_INIT_QUAD_LDD_AIUS
#define	CACHE_LINE		64
#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to assure a
 * block-aligned three block buffer in which to save we must reserve
 * four blocks on the stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
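
/*
 * For concreteness, with VIS_BLOCKSIZE == 64 the constants above work
 * out to:
 *
 *	HWCOPYFRAMESIZE     = 64 * 4 + 16 = 272 bytes
 *	SAVED_FPREGS_OFFSET = 256
 *	SAVED_FPREGS_ADJUST = 193
 *	SAVED_FPRS_OFFSET   = 264
 *	SAVED_GSR_OFFSET    = 272
 *
 * The save/restore macros below compute (%fp + STACK_BIAS - 193) and
 * round it down to a 64-byte boundary. The rounded address is at most
 * 193 and at least 256 bytes below the top of the area, so the three
 * 64-byte block stores (3 * 64 = 192 bytes, for %d0-%d14, %d16-%d30
 * and %d48-%d62) always fit inside the reserved 4 * 64 = 256 bytes.
 */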

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define	FZERO				\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f16		;\
	fmuld	%f0, %f2, %f18		;\
	faddd	%f0, %f2, %f20		;\
	fmuld	%f0, %f2, %f22		;\
	faddd	%f0, %f2, %f24		;\
	fmuld	%f0, %f2, %f26		;\
	faddd	%f0, %f2, %f28		;\
	fmuld	%f0, %f2, %f30		;\
	faddd	%f0, %f2, %f48		;\
	fmuld	%f0, %f2, %f50		;\
	faddd	%f0, %f2, %f52		;\
	fmuld	%f0, %f2, %f54		;\
	faddd	%f0, %f2, %f56		;\
	fmuld	%f0, %f2, %f58		;\
	faddd	%f0, %f2, %f60		;\
	fmuld	%f0, %f2, %f62

/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define	BST_FP_TOSTACK(tmp1)				\
	/* membar #Sync	*/				;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
	stda	%f0, [tmp1]ASI_BLK_P			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	stda	%f16, [tmp1]ASI_BLK_P			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	stda	%f48, [tmp1]ASI_BLK_P			;\
	membar	#Sync

#define	BLD_FP_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */ ;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
	ldda	[tmp1]ASI_BLK_P, %f0			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	ldda	[tmp1]ASI_BLK_P, %f16			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	ldda	[tmp1]ASI_BLK_P, %f48			;\
	membar	#Sync

#endif /* !NIAGARA_IMPL */

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */

	.seg	".text"
	.align	4

	ENTRY(kcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .kcopy_more		!
	nop
.kcopy_small:				! setup error handler
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.sm_do_copy			! common code
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault


.kcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during a small kcopy or bcopy
 * (for bcopy, only if a fault handler existed when it was called).
 * No floating point registers are used by the small copies.
 * Small copies are done from a leaf routine.
 * Errno value is in %g1.
 */
.sm_copyerr:
	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
	btst	LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	andn	%o5, LOFAULT_SET, %o5		! clear fault flag
	bnz,pn	%ncc, 3f
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	mov	%g0, %o0
/*
 * end of .sm_copyerr
 */

/*
 * We got here because of a fault during kcopy, or during bcopy if a
 * fault handler existed when bcopy was called.
 * Stack and fp registers need to be restored.
 * Errno value is in %g1.
 */
.copyerr:
	sethi	%hi(.copyerr2), %l1
	or	%l1, %lo(.copyerr2), %l1
	membar	#Sync				! sync error barrier
	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %o5
	bz,pt	%xcc, 1f
	and	%o5, LOFAULT_SET, %l1		! copy flag to %l1

	membar	#Sync				! sync error barrier
	wr	%l5, 0, %gsr
	btst	FPRS_FEF, %g5
	bz,pt	%icc, 4f
	nop
	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)
	ba,pt	%ncc, 2f
	wr	%g5, 0, %fprs		! restore fprs
4:
	FZERO
	wr	%g5, 0, %fprs		! restore fprs
2:
	ldn	[THREAD_REG + T_LWP], %o2
	brnz,pt	%o2, 1f
	nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	brnz,a,pt	%l0, 1f			! Need to call kpreempt?
	or	%l1, KPREEMPT_FLAG, %l1		! If so, set the flag

	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
1:
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	nop
	call	kpreempt
	rdpr	%pil, %o0		! pass %pil
2:
	btst	LOFAULT_SET, %l1
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0
3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5				! goto real handler
	restore	%g0, 0, %o0		! dispose of copy window
/*
 * We got here because of a fault in .copyerr. We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop
/*
 * end of .copyerr
 */

#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
	set	.copyerr, %l7			! copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	or	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
	! The kcopy() *always* sets a t_lofault handler and it ORs LOFAULT_SET
	! into %o5 to indicate it has set t_lofault handler. Need to clear
	! LOFAULT_SET flag before restoring the error handler.
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g1, 0, %o0
#endif	/* NIAGARA_IMPL */

	SET_SIZE(kcopy)


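/*
 * Note on kernel preemption: the FP copy path brackets its use of the
 * floating point registers with the equivalent of
 * kpreempt_disable()/kpreempt_enable(). A C sketch of what the
 * assembly in .do_copy and the exit/error paths does (illustrative
 * only, not a function in this file):
 *
 *	if (curthread->t_lwp == NULL)
 *		curthread->t_preempt++;		// kpreempt_disable()
 *	// ... block copy using the FP registers ...
 *	if (curthread->t_lwp == NULL &&
 *	    --curthread->t_preempt == 0 &&
 *	    CPU->cpu_kprunrun)			// preemption requested?
 *		kpreempt(pil);			// %pil is passed in %o0
 */
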
/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */

	ENTRY(bcopy)
#if !defined(NIAGARA_IMPL)
	cmp	%o2, FP_COPY			! check for small copy/leaf case
	bgt,pt	%ncc, .bcopy_more		!
	nop
.bcopy_small:				! setup error handler
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	tst	%o5
	bz,pt	%icc, .sm_do_copy
	sethi	%hi(.sm_copyerr), %o4
	or	%o4, %lo(.sm_copyerr), %o4	! .sm_copyerr is lofault value
	membar	#Sync				! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]	! set t_lofault
	or	%o5, LOFAULT_SET, %o5		! Error should trampoline
.sm_do_copy:
	mov	%o0, %g1		! save %o0
	cmp	%o2, SHORTCOPY		! make sure there is enough to align
	ble,pt	%ncc, .bc_smallest
	andcc	%o1, 0x7, %o3		! is dest long word aligned
	bnz,pn	%ncc, .bc_align
	andcc	%o1, 1, %o3		! is dest byte aligned

! Destination is long word aligned
.bc_al_src:
	andcc	%o0, 7, %o3
	brnz,pt	%o3, .bc_src_dst_unal8
	nop
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than FP_COPY bytes.
 * Also handles finish up for large block moves, so may be less than 32 bytes.
 */
.bc_medlong:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medl31
	nop
.bc_medl32:
	ldx	[%o0], %o4		! move 32 bytes
	subcc	%o2, 32, %o2		! decrement length count by 32
	stx	%o4, [%o1]
	ldx	[%o0+8], %o4
	stx	%o4, [%o1+8]
	ldx	[%o0+16], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stx	%o4, [%o1+16]
	ldx	[%o0-8], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medl32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]
.bc_medl31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medl7		! skip if 7 or fewer bytes left
	nop
.bc_medl8:
	ldx	[%o0], %o4		! move 8 bytes
	add	%o0, 8, %o0		! increase src ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medl8
	stx	%o4, [%o1-8]
.bc_medl7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bnz,pt	%ncc, .bc_small4	! do final bytes if not finished

.bc_smallx:				! finish up and exit
	tst	%o5
	bz,pt	%ncc, .bc_sm_done
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
.bc_sm_done:
	retl
	mov	%g0, %o0

.bc_small4:
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bz,pt	%ncc, .bc_smallx
	stw	%o4, [%o1-4]

.bc_small3x:				! Exactly 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2		! reduce count for cc test
	ldub	[%o0], %o4		! load one byte
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1]		! store one byte
	ldub	[%o0+1], %o4		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .bc_smallx
	stb	%o4, [%o1+1]		! store second byte
	ldub	[%o0+2], %o4		! load third byte
	ba	.bc_smallx
	stb	%o4, [%o1+2]		! store third byte

.bc_smallest:				! 7 or fewer bytes remain
	tst	%o2
	bz,pt	%ncc, .bc_smallx
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x
	nop
	ldub	[%o0], %o4		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o4, [%o1]		! write byte
	ldub	[%o0+1], %o4		! repeat for total of 4 bytes
	add	%o0, 4, %o0		! advance src by 4
	stb	%o4, [%o1+1]
	ldub	[%o0-2], %o4
	add	%o1, 4, %o1		! advance dst by 4
	stb	%o4, [%o1-2]
	ldub	[%o0-1], %o4
	bnz,pt	%ncc, .bc_small3x
	stb	%o4, [%o1-1]
	ba	.bc_smallx
	nop

/*
 * Align destination to long word boundary
 */
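/*
 * Worked example of the alignment cascade below: for dst == 0x1003 the
 * byte test (bit 0) fires, so one byte is moved, leaving dst == 0x1004;
 * the half word test (bit 1) then fails, and the word test (bit 2)
 * fires, moving four more bytes to reach dst == 0x1008, which is long
 * word aligned. At most 1 + 2 + 4 = 7 bytes are peeled off, which is
 * why counts of SHORTCOPY (7) bytes or less take the byte-copy path
 * instead.
 */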
.bc_align:				! byte align test in prior branch delay
	bnz,pt	%ncc, .bc_al_d1
.bc_al_d1f:				! dest is now half word aligned
	andcc	%o1, 2, %o3
	bnz,pt	%ncc, .bc_al_d2
.bc_al_d2f:				! dest is now word aligned
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	nop
.bc_al_d4:				! dest is word aligned; src is unknown
	ldub	[%o0], %o4		! move a word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4, [%o1]		! store four bytes
	add	%o0, 4, %o0		! adjust src by 4
	add	%o1, 4, %o1		! adjust dest by 4
	sub	%o2, 4, %o2		! adjust count by 4
	andcc	%o0, 7, %o3		! check for src long word alignment
	brz,pt	%o3, .bc_medlong
.bc_src_dst_unal8:
	! dst is 8-byte aligned, src is not
	! Size is less than FP_COPY
	! Following code is to select for alignment
	andcc	%o0, 0x3, %o3		! test word alignment
	bz,pt	%ncc, .bc_medword
	nop
	andcc	%o0, 0x1, %o3		! test halfword alignment
	bnz,pt	%ncc, .bc_med_byte	! go to byte move if not halfword
	andcc	%o0, 0x2, %o3		! test which byte alignment
	ba	.bc_medhalf
	nop
.bc_al_d1:				! align dest to half word
	ldub	[%o0], %o4		! move a byte
	add	%o0, 1, %o0
	stb	%o4, [%o1]
	add	%o1, 1, %o1
	andcc	%o1, 2, %o3
	bz,pt	%ncc, .bc_al_d2f
	sub	%o2, 1, %o2
.bc_al_d2:				! align dest to word
	ldub	[%o0], %o4		! move a half-word (src align unknown)
	ldub	[%o0+1], %o3
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%o1]
	add	%o0, 2, %o0
	add	%o1, 2, %o1
	andcc	%o1, 4, %o3		! is dest longword aligned?
	bz,pt	%ncc, .bc_al_src
	sub	%o2, 2, %o2
	ba	.bc_al_d4
	nop
/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache for medium
 * to short data moves.
 */
.bc_medword:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medw31
	nop
.bc_medw32:
	ld	[%o0], %o4		! move a block of 32 bytes
	stw	%o4, [%o1]
	ld	[%o0+4], %o4
	stw	%o4, [%o1+4]
	ld	[%o0+8], %o4
	stw	%o4, [%o1+8]
	ld	[%o0+12], %o4
	stw	%o4, [%o1+12]
	ld	[%o0+16], %o4
	stw	%o4, [%o1+16]
	ld	[%o0+20], %o4
	subcc	%o2, 32, %o2		! decrement length count
	stw	%o4, [%o1+20]
	ld	[%o0+24], %o4
	add	%o0, 32, %o0		! increase src ptr by 32
	stw	%o4, [%o1+24]
	ld	[%o0-4], %o4
	add	%o1, 32, %o1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_medw32	! repeat if at least 32 bytes left
	stw	%o4, [%o1-4]
.bc_medw31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medw7		! skip if 7 or fewer bytes left
	nop				!
.bc_medw15:
	ld	[%o0], %o4		! move a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	stw	%o4, [%o1]
	add	%o0, 8, %o0		! increase src ptr by 8
	ld	[%o0-4], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_medw15
	stw	%o4, [%o1-4]
.bc_medw7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ld	[%o0], %o4		! move 4 bytes
	add	%o0, 4, %o0		! increase src ptr by 4
	add	%o1, 4, %o1		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

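/*
 * The half word path below builds each aligned 8-byte store from a
 * 2-byte, a 4-byte and a 2-byte load. Since SPARC is big-endian, the
 * equivalent C expression for one double word is (sketch only):
 *
 *	// src is half word but not word aligned: lduh/lduw/lduh,
 *	// then shift and or
 *	dst64 = ((uint64_t)h0 << 48) | ((uint64_t)w << 16) | h1;
 *
 * where h0 = *(uint16_t *)src, w = *(uint32_t *)(src + 2) and
 * h1 = *(uint16_t *)(src + 6).
 */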
.bc_medhalf:
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medh31
	nop
.bc_medh32:				! load and store block of 32 bytes
	subcc	%o2, 32, %o2		! decrement length count

	lduh	[%o0], %o4		! move 32 bytes
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+6], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduh	[%o0+8], %o4
	lduw	[%o0+10], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+14], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduh	[%o0+16], %o4
	lduw	[%o0+18], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0+22], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduh	[%o0-8], %o4
	lduw	[%o0-6], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	lduh	[%o0-2], %o4
	or	%o3, %o4, %o4
	bgu,pt	%ncc, .bc_medh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medh7		! skip if 7 or fewer bytes left
	nop				!
.bc_medh15:
	lduh	[%o0], %o4		! move 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	lduw	[%o0+2], %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	add	%o1, 8, %o1		! increase dst ptr by 8
	lduh	[%o0+6], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medh15
	stx	%o4, [%o1-8]
.bc_medh7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	lduh	[%o0], %o4
	sll	%o4, 16, %o4
	lduh	[%o0+2], %o3
	or	%o3, %o4, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

	.align 16
.bc_med_byte:
	bnz,pt	%ncc, .bc_medbh32a	! go to correct byte move
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_medb31
	nop
.bc_medb32:				! Alignment 1 or 5
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+9], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+11], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0+17], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+19], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduh	[%o0-7], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0-5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medb31:				! 31 or fewer bytes remaining
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medb15:

	ldub	[%o0], %o4		! load and store a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	sllx	%o4, 56, %o3
	lduh	[%o0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduw	[%o0+3], %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medb15
	stx	%o4, [%o1-8]
.bc_medb7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .bc_small3x	! skip if less than 4 bytes left
	nop				!
	ldub	[%o0], %o4		! move 4 bytes
	sll	%o4, 24, %o3
	lduh	[%o0+1], %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+3], %o4
	or	%o4, %o3, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.bc_small3x
	stw	%o4, [%o1-4]
	ba	.bc_smallx
	nop

	.align 16
.bc_medbh32a:				! Alignment 3 or 7
	ble,pt	%ncc, .bc_medbh31
	nop
.bc_medbh32:				! Alignment 3 or 7
	subcc	%o2, 32, %o2		! decrement length count

	ldub	[%o0], %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	ldub	[%o0+8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+9], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+13], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+15], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	ldub	[%o0+16], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0+17], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+21], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+23], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	ldub	[%o0-8], %o4
	sllx	%o4, 56, %o3
	lduw	[%o0-7], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0-3], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0-1], %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .bc_medbh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.bc_medbh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_medb7		! skip if 7 or fewer bytes left
	nop				!
.bc_medbh15:
	ldub	[%o0], %o4		! load and store a block of 8 bytes
	sllx	%o4, 56, %o3
	lduw	[%o0+1], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduh	[%o0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	ldub	[%o0+7], %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]
	subcc	%o2, 8, %o2		! decrement length count
	add	%o1, 8, %o1		! increase dst ptr by 8
	add	%o0, 8, %o0		! increase src ptr by 8
	bgu,pt	%ncc, .bc_medbh15
	stx	%o4, [%o1-8]
	ba	.bc_medb7
	nop

	SET_SIZE(bcopy)
/*
 * The _more entry points are not intended to be used directly by
 * any caller from outside this file. They are provided to allow
 * profiling and dtrace of the portions of the copy code that use
 * the floating point registers.
 */
	ENTRY(bcopy_more)
.bcopy_more:
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	brz,pt	%o5, .do_copy
	nop
	sethi	%hi(.copyerr), %l7	! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	membar	#Sync			! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If LOFAULT_SET is
	! set and the saved lofault was zero, we won't reset lofault on
	! returning.
	or	%o5, LOFAULT_SET, %o5
.do_copy:
	ldn	[THREAD_REG + T_LWP], %o3
	brnz,pt	%o3, 1f
	nop
/*
 * kpreempt_disable();
 */
	ldsb	[THREAD_REG + T_PREEMPT], %o3
	inc	%o3
	stb	%o3, [THREAD_REG + T_PREEMPT]
1:
/*
 * Following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting.
 */
	rd	%fprs, %g5		! check for unused fp
	or	%o5, FPUSED_FLAG, %o5
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,pt	%ncc, .bc_fp_unused
	prefetch [%i0 + (1 * CACHE_LINE)], #one_read
	BST_FP_TOSTACK(%o3)
	ba	.bc_fp_ready
.bc_fp_unused:
	andcc	%i1, 1, %o3		! is dest byte aligned
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
.bc_fp_ready:
	rd	%gsr, %l5		! save %gsr value
	bnz,pt	%ncc, .bc_big_d1
.bc_big_d1f:				! dest is now half word aligned
	andcc	%i1, 2, %o3
	bnz,pt	%ncc, .bc_big_d2
.bc_big_d2f:				! dest is now word aligned
	andcc	%i1, 4, %o3
	bnz,pt	%ncc, .bc_big_d4
.bc_big_d4f:				! dest is now long word aligned
	andcc	%i0, 7, %o3		! is src long word aligned
	brnz,pt	%o3, .bc_big_unal8
	prefetch [%i0 + (2 * CACHE_LINE)], #one_read

	! Src and dst are long word aligned
	! align dst to 64 byte boundary
	andcc	%i1, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .bc_al_to_64
	nop
	sub	%o3, 64, %o3		! %o3 has negative bytes to move
	add	%i2, %o3, %i2		! adjust remaining count
	andcc	%o3, 8, %o4		! odd long words to move?
	brz,pt	%o4, .bc_al_to_16
	nop
	add	%o3, 8, %o3
	ldx	[%i0], %o4
	add	%i0, 8, %i0		! increment src ptr
	add	%i1, 8, %i1		! increment dst ptr
	stx	%o4, [%i1-8]
! Dest is aligned on 16 bytes, src 8 byte aligned
.bc_al_to_16:
	andcc	%o3, 0x30, %o4		! pair of long words to move?
	brz,pt	%o4, .bc_al_to_64
	nop
.bc_al_mv_16:
	add	%o3, 16, %o3
	ldx	[%i0], %o4
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	add	%i0, 16, %i0		! increment src ptr
	stx	%o4, [%i1+8]
	andcc	%o3, 48, %o4
	brnz,pt	%o4, .bc_al_mv_16
	add	%i1, 16, %i1		! increment dst ptr
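
	! The block-copy loops below use two tricks worth noting:
	! 1) %i1 is temporarily rewritten to hold (dst - src), so a single
	!    induction variable %i0 walks the source while [%i0 + %i1]
	!    addresses the destination; %i1 is restored by adding %i0
	!    back after each loop.
	! 2) Each loop is software pipelined: the first 8-56 bytes are
	!    preloaded into %d0-%d14, each iteration block-loads 64 new
	!    bytes into %d16-%d30, stores a full 64-byte block from
	!    %d0-%d14, and uses fmovd to carry the leftover double words
	!    into %d0-%d14 for the next iteration.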
! Dest is aligned on 64 bytes, src 8 byte aligned
.bc_al_to_64:
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%i0, 32, %o3
	brnz,pn	%o3, .bc_aln_1
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_01
	andcc	%i0, 8, %o3
	brz,pn	%o3, .bc_aln_000
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_001
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_aln_01:
	brnz,pn	%o3, .bc_aln_011
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_010
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_1:
	andcc	%i0, 16, %o3
	brnz,pn	%o3, .bc_aln_11
	andcc	%i0, 8, %o3
	brnz,pn	%o3, .bc_aln_101
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read
	ba	.bc_aln_100
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
.bc_aln_11:
	brz,pn	%o3, .bc_aln_110
	prefetch [%i0 + (3 * CACHE_LINE)], #one_read

.bc_aln_111:
! Alignment off by 8 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	add	%i0, 8, %i0
	sub	%i2, 8, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_111_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d30, %d0
	bgt,pt	%ncc, .bc_aln_111_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	ba	.bc_remain_stuff
	add	%i1, 8, %i1
	! END OF aln_111

.bc_aln_110:
! Alignment off by 16 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	add	%i0, 16, %i0
	sub	%i2, 16, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_110_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .bc_aln_110_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	ba	.bc_remain_stuff
	add	%i1, 16, %i1
	! END OF aln_110

.bc_aln_101:
! Alignment off by 24 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	add	%i0, 24, %i0
	sub	%i2, 24, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_101_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .bc_aln_101_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	ba	.bc_remain_stuff
	add	%i1, 24, %i1
	! END OF aln_101

.bc_aln_100:
! Alignment off by 32 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	add	%i0, 32, %i0
	sub	%i2, 32, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_100_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .bc_aln_100_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	ba	.bc_remain_stuff
	add	%i1, 32, %i1
	! END OF aln_100

.bc_aln_011:
! Alignment off by 40 bytes
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	add	%i0, 40, %i0
	sub	%i2, 40, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_011_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .bc_aln_011_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	ba	.bc_remain_stuff
	add	%i1, 40, %i1
	! END OF aln_011

.bc_aln_010:
! Alignment off by 48 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	add	%i0, 48, %i0
	sub	%i2, 48, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_010_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .bc_aln_010_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	ba	.bc_remain_stuff
	add	%i1, 48, %i1
	! END OF aln_010

.bc_aln_001:
! Alignment off by 56 bytes
	ldd	[%i0], %d0
	ldd	[%i0+8], %d2
	ldd	[%i0+16], %d4
	ldd	[%i0+24], %d6
	ldd	[%i0+32], %d8
	ldd	[%i0+40], %d10
	ldd	[%i0+48], %d12
	add	%i0, 56, %i0
	sub	%i2, 56, %i2
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_001_loop:
	ldda	[%i0]ASI_BLK_P, %d16	! block load
	subcc	%o3, 64, %o3
	fmovd	%d16, %d14
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .bc_aln_001_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	std	%d0, [%i1]
	std	%d2, [%i1+8]
	std	%d4, [%i1+16]
	std	%d6, [%i1+24]
	std	%d8, [%i1+32]
	std	%d10, [%i1+40]
	std	%d12, [%i1+48]
	ba	.bc_remain_stuff
	add	%i1, 56, %i1
	! END OF aln_001

.bc_aln_000:
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	andn	%i2, 0x7f, %o3		! %o3 is multiple of 2*block size
	and	%i2, 0x7f, %i2		! residue bytes in %i2
	sub	%i1, %i0, %i1
.bc_aln_000_loop:
	ldda	[%i0]ASI_BLK_P, %d0
	subcc	%o3, 64, %o3
	stxa	%g0, [%i0+%i1]ASI_STBI_P	! block initializing store
	stda	%d0, [%i0+%i1]ASI_BLK_P
	add	%i0, 64, %i0
	bgt,pt	%ncc, .bc_aln_000_loop
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read
	add	%i1, %i0, %i1

	! END OF aln_000

.bc_remain_stuff:
	subcc	%i2, 31, %i2		! adjust length to allow cc test
	ble,pt	%ncc, .bc_aln_31
	nop
.bc_aln_32:
	ldx	[%i0], %o4		! move 32 bytes
	subcc	%i2, 32, %i2		! decrement length count by 32
	stx	%o4, [%i1]
	ldx	[%i0+8], %o4
	stx	%o4, [%i1+8]
	ldx	[%i0+16], %o4
	add	%i0, 32, %i0		! increase src ptr by 32
	stx	%o4, [%i1+16]
	ldx	[%i0-8], %o4
	add	%i1, 32, %i1		! increase dst ptr by 32
	bgu,pt	%ncc, .bc_aln_32	! repeat if at least 32 bytes left
	stx	%o4, [%i1-8]
.bc_aln_31:
	addcc	%i2, 24, %i2		! adjust count to be off by 7
	ble,pt	%ncc, .bc_aln_7		! skip if 7 or fewer bytes left
	nop				!
.bc_aln_15:
	ldx	[%i0], %o4		! move 8 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	subcc	%i2, 8, %i2		! decrease count by 8
	add	%i1, 8, %i1		! increase dst ptr by 8
	bgu,pt	%ncc, .bc_aln_15
	stx	%o4, [%i1-8]		!
.bc_aln_7:
	addcc	%i2, 7, %i2		! finish adjustment of remaining count
	bz,pt	%ncc, .bc_exit		! exit if finished
	cmp	%i2, 4
	blt,pt	%ncc, .bc_unaln3x	! skip if less than 4 bytes left
	nop				!
	ld	[%i0], %o4		! move 4 bytes
	add	%i0, 4, %i0		! increase src ptr by 4
	add	%i1, 4, %i1		! increase dst ptr by 4
	subcc	%i2, 4, %i2		! decrease count by 4
	bnz	.bc_unaln3x
	stw	%o4, [%i1-4]
	ba	.bc_exit
	nop

	! destination alignment code
.bc_big_d1:
	ldub	[%i0], %o4		! move a byte
	add	%i0, 1, %i0
	stb	%o4, [%i1]
	add	%i1, 1, %i1
	andcc	%i1, 2, %o3
	bz,pt	%ncc, .bc_big_d2f
	sub	%i2, 1, %i2
.bc_big_d2:
	ldub	[%i0], %o4		! move a half-word (src align unknown)
	ldub	[%i0+1], %o3
	add	%i0, 2, %i0
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o4		! merge
	sth	%o4, [%i1]
	add	%i1, 2, %i1
	andcc	%i1, 4, %o3
	bz,pt	%ncc, .bc_big_d4f
	sub	%i2, 2, %i2
.bc_big_d4:
	ldub	[%i0], %o4		! move a word (src align unknown)
	ldub	[%i0+1], %o3
	sll	%o4, 24, %o4		! position
	sll	%o3, 16, %o3		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+2], %o4
	sll	%o4, 8, %o4		! position
	or	%o4, %o3, %o3		! merge
	ldub	[%i0+3], %o4
	or	%o4, %o3, %o4		! merge
	stw	%o4, [%i1]		! store four bytes
	add	%i0, 4, %i0		! adjust src by 4
	add	%i1, 4, %i1		! adjust dest by 4
	ba	.bc_big_d4f
	sub	%i2, 4, %i2		! adjust count by 4


	! Dst is on 8 byte boundary; src is not;
.bc_big_unal8:
	andcc	%i1, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%ncc, .bc_unalnsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%i2, %o3, %i2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%i0, 0x1, %o4
	bnz	%ncc, .bc_unalnbyte	! check for byte alignment
	nop
	andcc	%i0, 2, %o4		! check for half word alignment
	bnz	%ncc, .bc_unalnhalf
	nop
	! Src is word aligned, move bytes until dest 64 byte aligned
.bc_unalnword:
	ld	[%i0], %o4		! load 4 bytes
	stw	%o4, [%i1]		! and store 4 bytes
	ld	[%i0+4], %o4		! load 4 bytes
	add	%i0, 8, %i0		! increase src ptr by 8
	stw	%o4, [%i1+4]		! and store 4 bytes
	subcc	%o3, 8, %o3		! decrease count by 8
	bnz	%ncc, .bc_unalnword
	add	%i1, 8, %i1		! increase dst ptr by 8
	ba	.bc_unalnsrc
	nop

	! Src is half-word aligned, move bytes until dest 64 byte aligned
.bc_unalnhalf:
	lduh	[%i0], %o4		! load 2 bytes
	sllx	%o4, 32, %i3		! shift left
	lduw	[%i0+2], %o4
	or	%o4, %i3, %i3
	sllx	%i3, 16, %i3
	lduh	[%i0+6], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1]
	add	%i0, 8, %i0
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnhalf
	add	%i1, 8, %i1
	ba	.bc_unalnsrc
	nop

	! Src is byte aligned, move bytes until dest 64 byte aligned
.bc_unalnbyte:
	sub	%i1, %i0, %i1		! share pointer advance
.bc_unalnbyte_loop:
	ldub	[%i0], %o4
	sllx	%o4, 56, %i3
	lduh	[%i0+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+3], %o4
	sllx	%o4, 24, %o4
	or	%o4, %i3, %i3
	lduh	[%i0+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %i3, %i3
	ldub	[%i0+7], %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1+%i0]
	subcc	%o3, 8, %o3
	bnz	%ncc, .bc_unalnbyte_loop
	add	%i0, 8, %i0
	add	%i1, %i0, %i1		! restore pointer

	! Destination is now block (64 byte) aligned, src is not 8 byte aligned
.bc_unalnsrc:
	andn	%i2, 0x3f, %i3		! %i3 is multiple of block size
	and	%i2, 0x3f, %i2		! residue bytes in %i2
	add	%i2, 64, %i2		! Ensure we don't load beyond
	sub	%i3, 64, %i3		! end of source buffer

	andn	%i0, 0x3f, %o4		! %o4 has block aligned src address
	prefetch [%o4 + (3 * CACHE_LINE)], #one_read
	alignaddr %i0, %g0, %g0		! generate %gsr
	add	%i0, %i3, %i0		! advance %i0 to after blocks
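
	! A C model of the faligndata loops that follow (sketch only):
	! alignaddr %i0, %g0, %g0 records the source misalignment
	! (src & 7) in GSR.align, and each faligndata(a, b) then yields
	! bytes off .. off+7 of the 16-byte value a:b, i.e. in big-endian
	! terms (a << (8 * off)) | (b >> (64 - 8 * off)) for off in 1-7.
	! The 8-way dispatch below only decides how many double words of
	! the first source block must be preloaded so each iteration can
	! emit a whole 64-byte output block.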
	!
	! Determine source alignment to correct 8 byte offset
	andcc	%i0, 0x20, %o3
	brnz,pn	%o3, .bc_unaln_1
	andcc	%i0, 0x10, %o3
	brnz,pn	%o3, .bc_unaln_01
	andcc	%i0, 0x08, %o3
	brz,a	%o3, .bc_unaln_000
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_001
	nop
.bc_unaln_01:
	brnz,a	%o3, .bc_unaln_011
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_010
	nop
.bc_unaln_1:
	brnz,pn	%o3, .bc_unaln_11
	andcc	%i0, 0x08, %o3
	brnz,a	%o3, .bc_unaln_101
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_100
	nop
.bc_unaln_11:
	brz,pn	%o3, .bc_unaln_110
	prefetch [%i0 + (4 * CACHE_LINE)], #one_read

.bc_unaln_111:
	ldd	[%o4+56], %d14
.bc_unaln_111_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d14, %d16, %d48
	faligndata %d16, %d18, %d50
	faligndata %d18, %d20, %d52
	faligndata %d20, %d22, %d54
	faligndata %d22, %d24, %d56
	faligndata %d24, %d26, %d58
	faligndata %d26, %d28, %d60
	faligndata %d28, %d30, %d62
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_111_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_110:
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_110_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d12, %d14, %d48
	faligndata %d14, %d16, %d50
	faligndata %d16, %d18, %d52
	faligndata %d18, %d20, %d54
	faligndata %d20, %d22, %d56
	faligndata %d22, %d24, %d58
	faligndata %d24, %d26, %d60
	faligndata %d26, %d28, %d62
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_110_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_101:
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_101_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d10, %d12, %d48
	faligndata %d12, %d14, %d50
	faligndata %d14, %d16, %d52
	faligndata %d16, %d18, %d54
	faligndata %d18, %d20, %d56
	faligndata %d20, %d22, %d58
	faligndata %d22, %d24, %d60
	faligndata %d24, %d26, %d62
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_101_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_100:
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_100_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d8, %d10, %d48
	faligndata %d10, %d12, %d50
	faligndata %d12, %d14, %d52
	faligndata %d14, %d16, %d54
	faligndata %d16, %d18, %d56
	faligndata %d18, %d20, %d58
	faligndata %d20, %d22, %d60
	faligndata %d22, %d24, %d62
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_100_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_011:
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_011_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d6, %d8, %d48
	faligndata %d8, %d10, %d50
	faligndata %d10, %d12, %d52
	faligndata %d12, %d14, %d54
	faligndata %d14, %d16, %d56
	faligndata %d16, %d18, %d58
	faligndata %d18, %d20, %d60
	faligndata %d20, %d22, %d62
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_011_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_010:
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_010_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d4, %d6, %d48
	faligndata %d6, %d8, %d50
	faligndata %d8, %d10, %d52
	faligndata %d10, %d12, %d54
	faligndata %d12, %d14, %d56
	faligndata %d14, %d16, %d58
	faligndata %d16, %d18, %d60
	faligndata %d18, %d20, %d62
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_010_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_001:
	ldd	[%o4+8], %d2
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.bc_unaln_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_001_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read
	ba	.bc_unaln_done
	nop

.bc_unaln_000:
	ldda	[%o4]ASI_BLK_P, %d0
.bc_unaln_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .bc_unaln_000_loop
	prefetch [%o4 + (4 * CACHE_LINE)], #one_read

.bc_unaln_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, src not long word aligned
	cmp	%i2, 15
	bleu	%ncc, .bc_unaln_short

	andn	%i2, 0x7, %i3		! %i3 is multiple of 8
	and	%i2, 0x7, %i2		! residue bytes in %i2
	add	%i2, 8, %i2
	sub	%i3, 8, %i3		! ensure we don't load past end of src
	andn	%i0, 0x7, %o4		! %o4 has long word aligned src address
	add	%i0, %i3, %i0		! advance %i0 to after multiple of 8
advance %i0 to after multiple of 8 1924 ldd [%o4], %d0 ! fetch partial word 1925.bc_unaln_by8: 1926 ldd [%o4+8], %d2 1927 add %o4, 8, %o4 1928 faligndata %d0, %d2, %d16 1929 subcc %i3, 8, %i3 1930 std %d16, [%i1] 1931 fmovd %d2, %d0 1932 bgu,pt %ncc, .bc_unaln_by8 1933 add %i1, 8, %i1 1934 1935.bc_unaln_short: 1936 cmp %i2, 8 1937 blt,pt %ncc, .bc_unalnfin 1938 nop 1939 ldub [%i0], %o4 1940 sll %o4, 24, %o3 1941 ldub [%i0+1], %o4 1942 sll %o4, 16, %o4 1943 or %o4, %o3, %o3 1944 ldub [%i0+2], %o4 1945 sll %o4, 8, %o4 1946 or %o4, %o3, %o3 1947 ldub [%i0+3], %o4 1948 or %o4, %o3, %o3 1949 stw %o3, [%i1] 1950 ldub [%i0+4], %o4 1951 sll %o4, 24, %o3 1952 ldub [%i0+5], %o4 1953 sll %o4, 16, %o4 1954 or %o4, %o3, %o3 1955 ldub [%i0+6], %o4 1956 sll %o4, 8, %o4 1957 or %o4, %o3, %o3 1958 ldub [%i0+7], %o4 1959 or %o4, %o3, %o3 1960 stw %o3, [%i1+4] 1961 add %i0, 8, %i0 1962 add %i1, 8, %i1 1963 sub %i2, 8, %i2 1964.bc_unalnfin: 1965 cmp %i2, 4 1966 blt,pt %ncc, .bc_unalnz 1967 tst %i2 1968 ldub [%i0], %o3 ! read byte 1969 subcc %i2, 4, %i2 ! reduce count by 4 1970 sll %o3, 24, %o3 ! position 1971 ldub [%i0+1], %o4 1972 sll %o4, 16, %o4 ! position 1973 or %o4, %o3, %o3 ! merge 1974 ldub [%i0+2], %o4 1975 sll %o4, 8, %o4 ! position 1976 or %o4, %o3, %o3 ! merge 1977 add %i1, 4, %i1 ! advance dst by 4 1978 ldub [%i0+3], %o4 1979 add %i0, 4, %i0 ! advance src by 4 1980 or %o4, %o3, %o4 ! merge 1981 bnz,pt %ncc, .bc_unaln3x 1982 stw %o4, [%i1-4] 1983 ba .bc_exit 1984 nop 1985.bc_unalnz: 1986 bz,pt %ncc, .bc_exit 1987.bc_unaln3x: ! Exactly 1, 2, or 3 bytes remain 1988 subcc %i2, 1, %i2 ! reduce count for cc test 1989 ldub [%i0], %o4 ! load one byte 1990 bz,pt %ncc, .bc_exit 1991 stb %o4, [%i1] ! store one byte 1992 ldub [%i0+1], %o4 ! load second byte 1993 subcc %i2, 1, %i2 1994 bz,pt %ncc, .bc_exit 1995 stb %o4, [%i1+1] ! store second byte 1996 ldub [%i0+2], %o4 ! load third byte 1997 stb %o4, [%i1+2] ! store third byte 1998.bc_exit: 1999 wr %l5, %g0, %gsr ! restore %gsr 2000 brnz %g5, .bc_fp_restore 2001 and %o5, COPY_FLAGS, %l1 ! save flags in %l1 2002 FZERO 2003 wr %g5, %g0, %fprs 2004 ba,pt %ncc, .bc_ex2 2005 nop 2006.bc_fp_restore: 2007 BLD_FP_FROMSTACK(%o4) 2008.bc_ex2: 2009 ldn [THREAD_REG + T_LWP], %o2 2010 brnz,pt %o2, 1f 2011 nop 2012 2013 ldsb [THREAD_REG + T_PREEMPT], %l0 2014 deccc %l0 2015 bnz,pn %ncc, 1f 2016 stb %l0, [THREAD_REG + T_PREEMPT] 2017 2018 ! Check for a kernel preemption request 2019 ldn [THREAD_REG + T_CPU], %l0 2020 ldub [%l0 + CPU_KPRUNRUN], %l0 2021 brnz,a,pt %l0, 1f ! Need to call kpreempt? 2022 or %l1, KPREEMPT_FLAG, %l1 ! If so, set the flag 20231: 2024 btst LOFAULT_SET, %l1 2025 bz,pn %icc, 3f 2026 andncc %o5, COPY_FLAGS, %o5 2027 ! Here via bcopy. Check to see if the handler was NULL. 2028 ! If so, just return quietly. Otherwise, reset the 2029 ! handler and return. 2030 bz,pn %ncc, 2f 2031 nop 2032 membar #Sync 2033 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 20342: 2035 btst KPREEMPT_FLAG, %l1 2036 bz,pt %icc, 3f 2037 nop 2038 call kpreempt 2039 rdpr %pil, %o0 ! pass %pil 20403: 2041 ret 2042 restore %g0, 0, %o0 2043 2044 SET_SIZE(bcopy_more) 2045 2046 2047#else /* NIAGARA_IMPL */ 2048 save %sp, -SA(MINFRAME), %sp 2049 clr %o5 ! flag LOFAULT_SET is not set for bcopy 2050.do_copy: 2051 cmp %i2, 12 ! for small counts 2052 blu %ncc, .bytecp ! just copy bytes 2053 .empty 2054 2055 cmp %i2, 128 ! for less than 128 bytes 2056 blu,pn %ncc, .bcb_punt ! 
no block st/quad ld 2057 nop 2058 2059 set use_hw_bcopy, %o2 2060 ld [%o2], %o2 2061 brz,pn %o2, .bcb_punt 2062 nop 2063 2064 subcc %i1, %i0, %i3 2065 bneg,a,pn %ncc, 1f 2066 neg %i3 20671: 2068 /* 2069 * Compare against 256 since we should be checking block addresses 2070 * and (dest & ~63) - (src & ~63) can be 3 blocks even if 2071 * src = dest + (64 * 3) + 63. 2072 */ 2073 cmp %i3, 256 2074 blu,pn %ncc, .bcb_punt 2075 nop 2076 2077 /* 2078 * Copies that reach here have at least 2 blocks of data to copy. 2079 */ 2080.do_blockcopy: 2081 ! Swap src/dst since the code below is memcpy code 2082 ! and memcpy/bcopy have different calling sequences 2083 mov %i1, %i5 2084 mov %i0, %i1 2085 mov %i5, %i0 2086 2087 ! Block (64 bytes) align the destination. 2088 andcc %i0, 0x3f, %i3 ! is dst aligned on a 64 byte boundary 2089 bz %xcc, .chksrc ! dst is already block aligned 2090 sub %i3, 0x40, %i3 2091 neg %i3 ! bytes till dst 64 bytes aligned 2092 sub %i2, %i3, %i2 ! update i2 with new count 2093 2094 ! Based on source and destination alignment, do 2095 ! either an 8 byte, 4 byte, 2 byte or byte copy. 2096 2097 ! Is dst & src 8B aligned 2098 or %i0, %i1, %o2 2099 andcc %o2, 0x7, %g0 2100 bz %ncc, .alewdcp 2101 nop 2102 2103 ! Is dst & src 4B aligned 2104 andcc %o2, 0x3, %g0 2105 bz %ncc, .alwdcp 2106 nop 2107 2108 ! Is dst & src 2B aligned 2109 andcc %o2, 0x1, %g0 2110 bz %ncc, .alhlfwdcp 2111 nop 2112 2113 ! 1B aligned 21141: ldub [%i1], %o2 2115 stb %o2, [%i0] 2116 inc %i1 2117 deccc %i3 2118 bgu,pt %ncc, 1b 2119 inc %i0 2120 2121 ba .chksrc 2122 nop 2123 2124 ! dst & src 4B aligned 2125.alwdcp: 2126 ld [%i1], %o2 2127 st %o2, [%i0] 2128 add %i1, 0x4, %i1 2129 subcc %i3, 0x4, %i3 2130 bgu,pt %ncc, .alwdcp 2131 add %i0, 0x4, %i0 2132 2133 ba .chksrc 2134 nop 2135 2136 ! dst & src 2B aligned 2137.alhlfwdcp: 2138 lduh [%i1], %o2 2139 stuh %o2, [%i0] 2140 add %i1, 0x2, %i1 2141 subcc %i3, 0x2, %i3 2142 bgu,pt %ncc, .alhlfwdcp 2143 add %i0, 0x2, %i0 2144 2145 ba .chksrc 2146 nop 2147 2148 ! dst & src 8B aligned 2149.alewdcp: 2150 ldx [%i1], %o2 2151 stx %o2, [%i0] 2152 add %i1, 0x8, %i1 2153 subcc %i3, 0x8, %i3 2154 bgu,pt %ncc, .alewdcp 2155 add %i0, 0x8, %i0 2156 2157 ! Now Destination is block (64 bytes) aligned 2158.chksrc: 2159 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 2160 sub %i2, %i3, %i2 ! Residue bytes in %i2 2161 2162 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2163 2164 andcc %i1, 0xf, %o2 ! is src quadword aligned 2165 bz,pn %xcc, .blkcpy ! src offset in %o2 2166 nop 2167 cmp %o2, 0x8 2168 bg .cpy_upper_double 2169 nop 2170 bl .cpy_lower_double 2171 nop 2172 2173 ! Falls through when source offset is equal to 8 i.e. 2174 ! source is double word aligned. 2175 ! In this case no shift/merge of data is required 2176 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2177 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2178 prefetch [%l0+0x0], #one_read 2179 ldda [%i1+0x0]%asi, %l2 2180loop0: 2181 ldda [%i1+0x10]%asi, %l4 2182 prefetch [%l0+0x40], #one_read 2183 2184 stxa %l3, [%i0+0x0]%asi 2185 stxa %l4, [%i0+0x8]%asi 2186 2187 ldda [%i1+0x20]%asi, %l2 2188 stxa %l5, [%i0+0x10]%asi 2189 stxa %l2, [%i0+0x18]%asi 2190 2191 ldda [%i1+0x30]%asi, %l4 2192 stxa %l3, [%i0+0x20]%asi 2193 stxa %l4, [%i0+0x28]%asi 2194 2195 ldda [%i1+0x40]%asi, %l2 2196 stxa %l5, [%i0+0x30]%asi 2197 stxa %l2, [%i0+0x38]%asi 2198 2199 add %l0, 0x40, %l0 2200 add %i1, 0x40, %i1 2201 subcc %i3, 0x40, %i3 2202 bgu,pt %xcc, loop0 2203 add %i0, 0x40, %i0 2204 ba .blkdone 2205 add %i1, %o2, %i1 !
increment the source by src offset 2206 ! the src offset was stored in %o2 2207 2208.cpy_lower_double: 2209 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2210 sll %o2, 3, %o0 ! %o0 left shift 2211 mov 0x40, %o1 2212 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2213 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2214 prefetch [%l0+0x0], #one_read 2215 ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has 2216 ! complete data 2217loop1: 2218 ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read. 2219 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 2220 ! into %l2 and %l3 2221 prefetch [%l0+0x40], #one_read 2222 stxa %l2, [%i0+0x0]%asi 2223 stxa %l3, [%i0+0x8]%asi 2224 2225 ldda [%i1+0x20]%asi, %l2 2226 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 2227 stxa %l4, [%i0+0x10]%asi ! %l4 from previous read 2228 stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5 2229 2230 ! Repeat the same for next 32 bytes. 2231 2232 ldda [%i1+0x30]%asi, %l4 2233 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 2234 stxa %l2, [%i0+0x20]%asi 2235 stxa %l3, [%i0+0x28]%asi 2236 2237 ldda [%i1+0x40]%asi, %l2 2238 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 2239 stxa %l4, [%i0+0x30]%asi 2240 stxa %l5, [%i0+0x38]%asi 2241 2242 add %l0, 0x40, %l0 2243 add %i1, 0x40, %i1 2244 subcc %i3, 0x40, %i3 2245 bgu,pt %xcc, loop1 2246 add %i0, 0x40, %i0 2247 ba .blkdone 2248 add %i1, %o2, %i1 ! increment the source by src offset 2249 ! the src offset was stored in %o2 2250 2251.cpy_upper_double: 2252 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2253 mov 0x8, %o0 2254 sub %o2, %o0, %o0 2255 sll %o0, 3, %o0 ! %o0 left shift 2256 mov 0x40, %o1 2257 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2258 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2259 prefetch [%l0+0x0], #one_read 2260 ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and 2261 ! no data in %l2 2262loop2: 2263 ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has 2264 ! partial 2265 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 2266 ! into %l3 and %l4 2267 prefetch [%l0+0x40], #one_read 2268 stxa %l3, [%i0+0x0]%asi 2269 stxa %l4, [%i0+0x8]%asi 2270 2271 ldda [%i1+0x20]%asi, %l2 2272 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 2273 stxa %l5, [%i0+0x10]%asi ! %l5 from previous read 2274 stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2 2275 2276 ! Repeat the same for next 32 bytes. 2277 2278 ldda [%i1+0x30]%asi, %l4 2279 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 2280 stxa %l3, [%i0+0x20]%asi 2281 stxa %l4, [%i0+0x28]%asi 2282 2283 ldda [%i1+0x40]%asi, %l2 2284 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 2285 stxa %l5, [%i0+0x30]%asi 2286 stxa %l2, [%i0+0x38]%asi 2287 2288 add %l0, 0x40, %l0 2289 add %i1, 0x40, %i1 2290 subcc %i3, 0x40, %i3 2291 bgu,pt %xcc, loop2 2292 add %i0, 0x40, %i0 2293 ba .blkdone 2294 add %i1, %o2, %i1 ! increment the source by src offset 2295 ! the src offset was stored in %o2 2296 2297 2298 ! Both Source and Destination are block aligned. 2299 ! 
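/*
 * Illustrative sketch in C (hypothetical names, not part of the build)
 * of the shift/merge that the ALIGN_DATA loops above perform when the
 * source is not quadword aligned. With lshift = (src offset) * 8 and
 * big-endian byte order, each pair of aligned 8 byte loads yields one
 * aligned 8 byte result:
 *
 *	uint64_t
 *	align_data(uint64_t d1, uint64_t d2, uint_t lshift)
 *	{
 *		return ((d1 << lshift) | (d2 >> (64 - lshift)));
 *	}
 *
 * Each loop keeps the trailing partial register live, so every new
 * 16 byte quad load completes two more aligned stores (data1, data2
 * and data3 merged into data1 and data2).
 */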
! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2300.blkcpy: 2301 prefetch [%i1+0x0], #one_read 23021: 2303 ldda [%i1+0x0]%asi, %l0 2304 ldda [%i1+0x10]%asi, %l2 2305 prefetch [%i1+0x40], #one_read 2306 2307 stxa %l0, [%i0+0x0]%asi 2308 ldda [%i1+0x20]%asi, %l4 2309 ldda [%i1+0x30]%asi, %l6 2310 2311 stxa %l1, [%i0+0x8]%asi 2312 stxa %l2, [%i0+0x10]%asi 2313 stxa %l3, [%i0+0x18]%asi 2314 stxa %l4, [%i0+0x20]%asi 2315 stxa %l5, [%i0+0x28]%asi 2316 stxa %l6, [%i0+0x30]%asi 2317 stxa %l7, [%i0+0x38]%asi 2318 2319 add %i1, 0x40, %i1 2320 subcc %i3, 0x40, %i3 2321 bgu,pt %xcc, 1b 2322 add %i0, 0x40, %i0 2323 2324.blkdone: 2325 membar #Sync 2326 2327 brz,pt %i2, .blkexit 2328 nop 2329 2330 ! Handle trailing bytes 2331 cmp %i2, 0x8 2332 blu,pt %ncc, .residue 2333 nop 2334 2335 ! Can we do some 8B ops 2336 or %i1, %i0, %o2 2337 andcc %o2, 0x7, %g0 2338 bnz %ncc, .last4 2339 nop 2340 2341 ! Do 8byte ops as long as possible 2342.last8: 2343 ldx [%i1], %o2 2344 stx %o2, [%i0] 2345 add %i1, 0x8, %i1 2346 sub %i2, 0x8, %i2 2347 cmp %i2, 0x8 2348 bgu,pt %ncc, .last8 2349 add %i0, 0x8, %i0 2350 2351 brz,pt %i2, .blkexit 2352 nop 2353 2354 ba .residue 2355 nop 2356 2357.last4: 2358 ! Can we do 4B ops 2359 andcc %o2, 0x3, %g0 2360 bnz %ncc, .last2 2361 nop 23621: 2363 ld [%i1], %o2 2364 st %o2, [%i0] 2365 add %i1, 0x4, %i1 2366 sub %i2, 0x4, %i2 2367 cmp %i2, 0x4 2368 bgu,pt %ncc, 1b 2369 add %i0, 0x4, %i0 2370 2371 brz,pt %i2, .blkexit 2372 nop 2373 2374 ba .residue 2375 nop 2376 2377.last2: 2378 ! Can we do 2B ops 2379 andcc %o2, 0x1, %g0 2380 bnz %ncc, .residue 2381 nop 2382 23831: 2384 lduh [%i1], %o2 2385 stuh %o2, [%i0] 2386 add %i1, 0x2, %i1 2387 sub %i2, 0x2, %i2 2388 cmp %i2, 0x2 2389 bgu,pt %ncc, 1b 2390 add %i0, 0x2, %i0 2391 2392 brz,pt %i2, .blkexit 2393 nop 2394 2395.residue: 2396 ldub [%i1], %o2 2397 stb %o2, [%i0] 2398 inc %i1 2399 deccc %i2 2400 bgu,pt %ncc, .residue 2401 inc %i0 2402 2403.blkexit: 2404 2405 membar #Sync ! sync error barrier 2406 ! Restore t_lofault handler, if came here from kcopy(). 2407 tst %o5 2408 bz %ncc, 1f 2409 andn %o5, LOFAULT_SET, %o5 2410 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 24111: 2412 ret 2413 restore %g0, 0, %o0 2414 2415 2416.bcb_punt: 2417 ! 2418 ! use aligned transfers where possible 2419 ! 2420 xor %i0, %i1, %o4 ! xor from and to address 2421 btst 7, %o4 ! if lower three bits zero 2422 bz .aldoubcp ! can align on double boundary 2423 .empty ! assembler complains about label 2424 2425 xor %i0, %i1, %o4 ! xor from and to address 2426 btst 3, %o4 ! if lower two bits zero 2427 bz .alwordcp ! can align on word boundary 2428 btst 3, %i0 ! delay slot, from address unaligned? 2429 ! 2430 ! use aligned reads and writes where possible 2431 ! this differs from wordcp in that it copes 2432 ! with odd alignment between source and destination 2433 ! using word reads and writes with the proper shifts 2434 ! in between to align transfers to and from memory 2435 ! i0 - src address, i1 - dest address, i2 - count 2436 ! i3, i4 - tmps used for generating complete word 2437 ! i5 (word to write) 2438 ! l0 size in bits of upper part of source word (US) 2439 ! l1 size in bits of lower part of source word (LS = 32 - US) 2440 ! l2 size in bits of upper part of destination word (UD) 2441 ! l3 size in bits of lower part of destination word (LD = 32 - UD) 2442 ! l4 number of bytes leftover after aligned transfers complete 2443 ! l5 the number 32 2444 ! 2445 mov 32, %l5 ! load an oft-needed constant 2446 bz .align_dst_only 2447 btst 3, %i1 ! is destination address aligned?
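/*
 * Illustrative sketch in C (hypothetical names, not part of the build)
 * of the word-sized shift/merge loop below. Once the destination is
 * word aligned, US (%l0) bits of each aligned source word complete the
 * current destination word and the remaining LS = 32 - US bits are
 * carried into the next one (big-endian; US is a multiple of 8):
 *
 *	uint32_t carry = ...;		! leftover bits, pre-shifted
 *	while (count >= 4) {
 *		uint32_t w = *src++;	! aligned source word
 *		*dst++ = carry | (w >> us);
 *		carry = w << (32 - us);	! i.e. w << ls
 *		count -= 4;
 *	}
 */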
2448 clr %i4 ! clear registers used in either case 2449 bz .align_src_only 2450 clr %l0 2451 ! 2452 ! both source and destination addresses are unaligned 2453 ! 24541: ! align source 2455 ldub [%i0], %i3 ! read a byte from source address 2456 add %i0, 1, %i0 ! increment source address 2457 or %i4, %i3, %i4 ! or in with previous bytes (if any) 2458 btst 3, %i0 ! is source aligned? 2459 add %l0, 8, %l0 ! increment size of upper source (US) 2460 bnz,a 1b 2461 sll %i4, 8, %i4 ! make room for next byte 2462 2463 sub %l5, %l0, %l1 ! generate shift left count (LS) 2464 sll %i4, %l1, %i4 ! prepare to get rest 2465 ld [%i0], %i3 ! read a word 2466 add %i0, 4, %i0 ! increment source address 2467 srl %i3, %l0, %i5 ! upper src bits into lower dst bits 2468 or %i4, %i5, %i5 ! merge 2469 mov 24, %l3 ! align destination 24701: 2471 srl %i5, %l3, %i4 ! prepare to write a single byte 2472 stb %i4, [%i1] ! write a byte 2473 add %i1, 1, %i1 ! increment destination address 2474 sub %i2, 1, %i2 ! decrement count 2475 btst 3, %i1 ! is destination aligned? 2476 bnz,a 1b 2477 sub %l3, 8, %l3 ! delay slot, decrement shift count (LD) 2478 sub %l5, %l3, %l2 ! generate shift left count (UD) 2479 sll %i5, %l2, %i5 ! move leftover into upper bytes 2480 cmp %l2, %l0 ! cmp # reqd to fill dst w old src left 2481 bgu %ncc, .more_needed ! need more to fill than we have 2482 nop 2483 2484 sll %i3, %l1, %i3 ! clear upper used byte(s) 2485 srl %i3, %l1, %i3 2486 ! get the odd bytes between alignments 2487 sub %l0, %l2, %l0 ! regenerate shift count 2488 sub %l5, %l0, %l1 ! generate new shift left count (LS) 2489 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 2490 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 2491 srl %i3, %l0, %i4 2492 or %i5, %i4, %i5 2493 st %i5, [%i1] ! write a word 2494 subcc %i2, 4, %i2 ! decrement count 2495 bz %ncc, .unalign_out 2496 add %i1, 4, %i1 ! increment destination address 2497 2498 b 2f 2499 sll %i3, %l1, %i5 ! get leftover into upper bits 2500.more_needed: 2501 sll %i3, %l0, %i3 ! save remaining byte(s) 2502 srl %i3, %l0, %i3 2503 sub %l2, %l0, %l1 ! regenerate shift count 2504 sub %l5, %l1, %l0 ! generate new shift left count 2505 sll %i3, %l1, %i4 ! move to fill empty space 2506 b 3f 2507 or %i5, %i4, %i5 ! merge to complete word 2508 ! 2509 ! the source address is aligned and destination is not 2510 ! 2511.align_dst_only: 2512 ld [%i0], %i4 ! read a word 2513 add %i0, 4, %i0 ! increment source address 2514 mov 24, %l0 ! initial shift alignment count 25151: 2516 srl %i4, %l0, %i3 ! prepare to write a single byte 2517 stb %i3, [%i1] ! write a byte 2518 add %i1, 1, %i1 ! increment destination address 2519 sub %i2, 1, %i2 ! decrement count 2520 btst 3, %i1 ! is destination aligned? 2521 bnz,a 1b 2522 sub %l0, 8, %l0 ! delay slot, decrement shift count 2523.xfer: 2524 sub %l5, %l0, %l1 ! generate shift left count 2525 sll %i4, %l1, %i5 ! get leftover 25263: 2527 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 2528 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 25292: 2530 ld [%i0], %i3 ! read a source word 2531 add %i0, 4, %i0 ! increment source address 2532 srl %i3, %l0, %i4 ! upper src bits into lower dst bits 2533 or %i5, %i4, %i5 ! merge with upper dest bits (leftover) 2534 st %i5, [%i1] ! write a destination word 2535 subcc %i2, 4, %i2 ! decrement count 2536 bz %ncc, .unalign_out ! check if done 2537 add %i1, 4, %i1 ! increment destination address 2538 b 2b ! loop 2539 sll %i3, %l1, %i5 !
get leftover 2540.unalign_out: 2541 tst %l4 ! any bytes leftover? 2542 bz %ncc, .cpdone 2543 .empty ! allow next instruction in delay slot 25441: 2545 sub %l0, 8, %l0 ! decrement shift 2546 srl %i3, %l0, %i4 ! upper src byte into lower dst byte 2547 stb %i4, [%i1] ! write a byte 2548 subcc %l4, 1, %l4 ! decrement count 2549 bz %ncc, .cpdone ! done? 2550 add %i1, 1, %i1 ! increment destination 2551 tst %l0 ! any more previously read bytes 2552 bnz %ncc, 1b ! we have leftover bytes 2553 mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants 2554 b .dbytecp ! let dbytecp do the rest 2555 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 2556 ! 2557 ! the destination address is aligned and the source is not 2558 ! 2559.align_src_only: 2560 ldub [%i0], %i3 ! read a byte from source address 2561 add %i0, 1, %i0 ! increment source address 2562 or %i4, %i3, %i4 ! or in with previous bytes (if any) 2563 btst 3, %i0 ! is source aligned? 2564 add %l0, 8, %l0 ! increment shift count (US) 2565 bnz,a .align_src_only 2566 sll %i4, 8, %i4 ! make room for next byte 2567 b,a .xfer 2568 ! 2569 ! if from address unaligned for double-word moves, 2570 ! move bytes till it is, if count is < 56 it could take 2571 ! longer to align the thing than to do the transfer 2572 ! in word size chunks right away 2573 ! 2574.aldoubcp: 2575 cmp %i2, 56 ! if count < 56, use wordcp, it takes 2576 blu,a %ncc, .alwordcp ! longer to align doubles than words 2577 mov 3, %o0 ! mask for word alignment 2578 call .alignit ! copy bytes until aligned 2579 mov 7, %o0 ! mask for double alignment 2580 ! 2581 ! source and destination are now double-word aligned 2582 ! i3 has aligned count returned by alignit 2583 ! 2584 and %i2, 7, %i2 ! unaligned leftover count 2585 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 25865: 2587 ldx [%i0+%i1], %o4 ! read from address 2588 stx %o4, [%i1] ! write at destination address 2589 subcc %i3, 8, %i3 ! dec count 2590 bgu %ncc, 5b 2591 add %i1, 8, %i1 ! delay slot, inc to address 2592 cmp %i2, 4 ! see if we can copy a word 2593 blu %ncc, .dbytecp ! if 3 or less bytes use bytecp 2594 .empty 2595 ! 2596 ! for leftover bytes we fall into wordcp, if needed 2597 ! 2598.wordcp: 2599 and %i2, 3, %i2 ! unaligned leftover count 26005: 2601 ld [%i0+%i1], %o4 ! read from address 2602 st %o4, [%i1] ! write at destination address 2603 subcc %i3, 4, %i3 ! dec count 2604 bgu %ncc, 5b 2605 add %i1, 4, %i1 ! delay slot, inc to address 2606 b,a .dbytecp 2607 2608 ! we come here to align copies on word boundaries 2609.alwordcp: 2610 call .alignit ! go word-align it 2611 mov 3, %o0 ! bits that must be zero to be aligned 2612 b .wordcp 2613 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 2614 2615 ! 2616 ! byte copy, works with any alignment 2617 ! 2618.bytecp: 2619 b .dbytecp 2620 sub %i0, %i1, %i0 ! i0 gets difference of src and dst 2621 2622 ! 2623 ! differenced byte copy, works with any alignment 2624 ! assumes dest in %i1 and (source - dest) in %i0 2625 ! 26261: 2627 stb %o4, [%i1] ! write to address 2628 inc %i1 ! inc to address 2629.dbytecp: 2630 deccc %i2 ! dec count 2631 bgeu,a %ncc, 1b ! loop till done 2632 ldub [%i0+%i1], %o4 ! read from address 2633.cpdone: 2634 2635 membar #Sync ! sync error barrier 2636 ! Restore t_lofault handler, if came here from kcopy(). 2637 tst %o5 2638 bz %ncc, 1f 2639 andn %o5, LOFAULT_SET, %o5 2640 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 26411: 2642 ret 2643 restore %g0, 0, %o0 ! 
return (0) 2644 2645/* 2646 * Common code used to align transfers on word and doubleword 2647 * boundaries. Aligns source and destination and returns a count 2648 * of aligned bytes to transfer in %i3 2649 */ 26501: 2651 inc %i0 ! inc from 2652 stb %o4, [%i1] ! write a byte 2653 inc %i1 ! inc to 2654 dec %i2 ! dec count 2655.alignit: 2656 btst %o0, %i0 ! %o0 is bit mask to check for alignment 2657 bnz,a 1b 2658 ldub [%i0], %o4 ! read next byte 2659 2660 retl 2661 andn %i2, %o0, %i3 ! return size of aligned bytes 2662 2663 SET_SIZE(bcopy) 2664 2665#endif /* NIAGARA_IMPL */ 2666 2667/* 2668 * Block copy with possibly overlapped operands. 2669 */ 2670 2671 ENTRY(ovbcopy) 2672 tst %o2 ! check count 2673 bgu,a %ncc, 1f ! nothing to do or bad arguments 2674 subcc %o0, %o1, %o3 ! difference of from and to address 2675 2676 retl ! return 2677 nop 26781: 2679 bneg,a %ncc, 2f 2680 neg %o3 ! if < 0, make it positive 26812: cmp %o2, %o3 ! cmp size and abs(from - to) 2682 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, 2683 .empty ! no overlap 2684 cmp %o0, %o1 ! compare from and to addresses 2685 blu %ncc, .ov_bkwd ! if from < to, copy backwards 2686 nop 2687 ! 2688 ! Copy forwards. 2689 ! 2690.ov_fwd: 2691 ldub [%o0], %o3 ! read from address 2692 inc %o0 ! inc from address 2693 stb %o3, [%o1] ! write to address 2694 deccc %o2 ! dec count 2695 bgu %ncc, .ov_fwd ! loop till done 2696 inc %o1 ! inc to address 2697 2698 retl ! return 2699 nop 2700 ! 2701 ! Copy backwards. 2702 ! 2703.ov_bkwd: 2704 deccc %o2 ! dec count 2705 ldub [%o0 + %o2], %o3 ! get byte at end of src 2706 bgu %ncc, .ov_bkwd ! loop till done 2707 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst 2708 2709 retl ! return 2710 nop 2711 SET_SIZE(ovbcopy) 2712 2713/* 2714 * hwblkpagecopy() 2715 * 2716 * Copies exactly one page. This routine assumes the caller (ppcopy) 2717 * has already disabled kernel preemption and has checked 2718 * use_hw_bcopy. 2719 */ 2720 ENTRY(hwblkpagecopy) 2721 save %sp, -SA(MINFRAME), %sp 2722 2723 ! %i0 - source address (arg) 2724 ! %i1 - destination address (arg) 2725 ! %i2 - length of region (not arg) 2726 2727 set PAGESIZE, %i2 2728 2729 /* 2730 * Copying exactly one page, and PAGESIZE is a multiple of 0x80.
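 *
 * As a rough C sketch (illustrative only; ldda/stxa through the
 * ASI_BLK_INIT_ST_QUAD_LDD_P ASI have no direct C equivalent), the
 * loop below moves the page 0x80 bytes per iteration, staging 16
 * bytes at a time through register pairs %l0-%l7:
 *
 *	for (off = 0; off < PAGESIZE; off += 0x80)
 *		bcopy(src + off, dst + off, 0x80);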
2731 */ 2732 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2733 prefetch [%i0+0x0], #one_read 2734 prefetch [%i0+0x40], #one_read 27351: 2736 prefetch [%i0+0x80], #one_read 2737 prefetch [%i0+0xc0], #one_read 2738 ldda [%i0+0x0]%asi, %l0 2739 ldda [%i0+0x10]%asi, %l2 2740 ldda [%i0+0x20]%asi, %l4 2741 ldda [%i0+0x30]%asi, %l6 2742 stxa %l0, [%i1+0x0]%asi 2743 stxa %l1, [%i1+0x8]%asi 2744 stxa %l2, [%i1+0x10]%asi 2745 stxa %l3, [%i1+0x18]%asi 2746 stxa %l4, [%i1+0x20]%asi 2747 stxa %l5, [%i1+0x28]%asi 2748 stxa %l6, [%i1+0x30]%asi 2749 stxa %l7, [%i1+0x38]%asi 2750 ldda [%i0+0x40]%asi, %l0 2751 ldda [%i0+0x50]%asi, %l2 2752 ldda [%i0+0x60]%asi, %l4 2753 ldda [%i0+0x70]%asi, %l6 2754 stxa %l0, [%i1+0x40]%asi 2755 stxa %l1, [%i1+0x48]%asi 2756 stxa %l2, [%i1+0x50]%asi 2757 stxa %l3, [%i1+0x58]%asi 2758 stxa %l4, [%i1+0x60]%asi 2759 stxa %l5, [%i1+0x68]%asi 2760 stxa %l6, [%i1+0x70]%asi 2761 stxa %l7, [%i1+0x78]%asi 2762 2763 add %i0, 0x80, %i0 2764 subcc %i2, 0x80, %i2 2765 bgu,pt %xcc, 1b 2766 add %i1, 0x80, %i1 2767 2768 membar #Sync 2769 ret 2770 restore %g0, 0, %o0 2771 SET_SIZE(hwblkpagecopy) 2772 2773 2774/* 2775 * Transfer data to and from user space - 2776 * Note that these routines can cause faults. 2777 * It is assumed that the kernel has nothing at 2778 * less than KERNELBASE in the virtual address space. 2779 * 2780 * Note that copyin(9F) and copyout(9F) are part of the 2781 * DDI/DKI which specifies that they return '-1' on "errors." 2782 * 2783 * Sigh. 2784 * 2785 * So there are two extremely similar routines - xcopyin() and xcopyout() 2786 * which return the errno that we've faithfully computed. This 2787 * allows other callers (e.g. uiomove(9F)) to work correctly. 2788 * Given that these are used pretty heavily, we expand the calling 2789 * sequences inline for all flavours (rather than making wrappers). 2790 * 2791 * There are also stub routines for xcopyout_little and xcopyin_little, 2792 * which currently are intended to handle requests of <= 16 bytes from 2793 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 2794 * is left as an exercise... 2795 */ 2796 2797/* 2798 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 2799 * 2800 * General theory of operation: 2801 * 2802 * None of the copyops routines grab a window until it's decided that 2803 * we need to do a HW block copy operation. This saves a window 2804 * spill/fill when we're called during socket ops. The typical IO 2805 * path won't cause spill/fill traps. 2806 * 2807 * This code uses a set of 4 limits for the maximum size that will 2808 * be copied given a particular input/output address alignment. 2809 * The default limits are: 2810 * 2811 * single byte aligned - 256 (hw_copy_limit_1) 2812 * two byte aligned - 512 (hw_copy_limit_2) 2813 * four byte aligned - 1024 (hw_copy_limit_4) 2814 * eight byte aligned - 1024 (hw_copy_limit_8) 2815 * 2816 * If the value for a particular limit is zero, the copy will be done 2817 * via the copy loops rather than block store/quad load instructions. 2818 * 2819 * Flow: 2820 * 2821 * If count == zero, return zero. 2822 * 2823 * Store the previous lofault handler into %g6. 2824 * Place our secondary lofault handler into %g5. 2825 * Place the address of our nowindow fault handler into %o3. 2826 * Place the address of the windowed fault handler into %o4. 2827 * --> We'll use this handler if we end up grabbing a window 2828 * --> before we use block initializing store and quad load ASIs
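 *
 * As a sketch of the alignment/limit decision described above
 * (hypothetical C, not the literal register flow):
 *
 *	a = (kaddr | uaddr);
 *	if (a & 1)		lim = hw_copy_limit_1;
 *	else if (a & 2)		lim = hw_copy_limit_2;
 *	else if (a & 4)		lim = hw_copy_limit_4;
 *	else			lim = hw_copy_limit_8;
 *	if (lim != 0 && count > lim)
 *		! use block initializing store and quad load ASIs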
2829 * 2830 * If count is less than or equal to SMALL_LIMIT (7) we 2831 * always do a byte for byte copy. 2832 * 2833 * If count is > SMALL_LIMIT, we check the alignment of the input 2834 * and output pointers. Based on the alignment we check count 2835 * against a limit based on detected alignment. If we exceed the 2836 * limit, we copy via block initializing store and quad 2837 * load instructions. 2838 * 2839 * If we don't exceed one of the limits, we store -count in %o3, 2840 * we store the number of chunks (8, 4, 2 or 1 byte) operated 2841 * on in our basic copy loop in %o2. Following this we branch 2842 * to the appropriate copy loop and copy that many chunks. 2843 * Since we've been adding the chunk size to %o3 each time through 2844 * as well as decrementing %o2, we can tell if any data is 2845 * left to be copied by examining %o3. If that is zero, we're 2846 * done and can go home. If not, we figure out what the largest 2847 * chunk size left to be copied is and branch to that copy loop 2848 * unless there's only one byte left. We load that as we're 2849 * branching to code that stores it just before we return. 2850 * 2851 * Fault handlers are invoked if we reference memory that has no 2852 * current mapping. All forms share the same copyio_fault handler. 2853 * This routine handles fixing up the stack and general housecleaning. 2854 * Each copy operation has a simple fault handler that is then called 2855 * to do the work specific to the individual operation. The handlers 2856 * for copyOP and xcopyOP are found at the end of each individual function. 2857 * The handlers for xcopyOP_little are found at the end of xcopyin_little. 2858 * The handlers for copyOP_noerr are found at the end of copyin_noerr. 2859 */ 2860 2861/* 2862 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 2863 */ 2864 2865/* 2866 * We save the arguments in the following registers in case of a fault: 2867 * kaddr - %g2 2868 * uaddr - %g3 2869 * count - %g4 2870 */ 2871#define SAVE_SRC %g2 2872#define SAVE_DST %g3 2873#define SAVE_COUNT %g4 2874 2875#define REAL_LOFAULT %g5 2876#define SAVED_LOFAULT %g6 2877 2878/* 2879 * Generic copyio fault handler. This is the first line of defense when a 2880 * fault occurs in (x)copyin/(x)copyout. In order for this to function 2881 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 2882 * This allows us to share common code for all the flavors of the copy 2883 * operations, including the _noerr versions. 2884 * 2885 * Note that this function will restore the original input parameters before 2886 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 2887 * member of the t_copyop structure, if needed. 2888 */ 2889 ENTRY(copyio_fault) 2890#if !defined(NIAGARA_IMPL) 2891 btst FPUSED_FLAG, SAVED_LOFAULT 2892 bz 1f 2893 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 2894 2895 wr %l5, 0, %gsr ! restore gsr 2896 2897 btst FPRS_FEF, %g1 2898 bz %icc, 4f 2899 nop 2900 2901 ! restore fpregs from stack 2902 BLD_FP_FROMSTACK(%o2) 2903 2904 ba,pt %ncc, 1f 2905 nop 29064: 2907 FZERO ! zero all of the fpregs 2908 wr %g1, %g0, %fprs ! restore fprs 29091: 2910 restore 2911 mov SAVE_SRC, %o0 2912 mov SAVE_DST, %o1 2913 jmp REAL_LOFAULT 2914 mov SAVE_COUNT, %o2 2915 2916#else /* NIAGARA_IMPL */ 2917 membar #Sync 2918 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] !
restore old t_lofault 2919 restore 2920 mov SAVE_SRC, %o0 2921 mov SAVE_DST, %o1 2922 jmp REAL_LOFAULT 2923 mov SAVE_COUNT, %o2 2924 2925#endif /* NIAGARA_IMPL */ 2926 2927 SET_SIZE(copyio_fault) 2928 2929 ENTRY(copyio_fault_nowindow) 2930 membar #Sync 2931 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2932 2933 mov SAVE_SRC, %o0 2934 mov SAVE_DST, %o1 2935 jmp REAL_LOFAULT 2936 mov SAVE_COUNT, %o2 2937 SET_SIZE(copyio_fault_nowindow) 2938 2939 ENTRY(copyout) 2940 sethi %hi(.copyout_err), REAL_LOFAULT 2941 or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT 2942 2943#if !defined(NIAGARA_IMPL) 2944.do_copyout: 2945 tst %o2 ! check for zero count; quick exit 2946 bz,pt %ncc, .co_smallqx 2947 mov %o0, SAVE_SRC 2948 mov %o1, SAVE_DST 2949 mov %o2, SAVE_COUNT 2950 cmp %o2, FP_COPY ! check for small copy/leaf case 2951 bgt,pt %ncc, .co_copy_more 2952 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 2953/* 2954 * Small copy out code 2955 * 2956 */ 2957 sethi %hi(copyio_fault_nowindow), %o3 2958 or %o3, %lo(copyio_fault_nowindow), %o3 2959 membar #Sync 2960 stn %o3, [THREAD_REG + T_LOFAULT] 2961 2962 mov ASI_USER, %asi 2963 cmp %o2, SHORTCOPY ! make sure there is enough to align 2964 ble,pt %ncc, .co_smallest 2965 andcc %o1, 0x7, %o3 ! is dest long word aligned 2966 bnz,pn %ncc, .co_align 2967 andcc %o1, 1, %o3 ! is dest byte aligned 2968 2969! Destination is long word aligned 2970! 8 cases for src alignment; load parts, store long words 2971.co_al_src: 2972 andcc %o0, 7, %o3 2973 brnz,pt %o3, .co_src_dst_unal8 2974 nop 2975/* 2976 * Special case for handling when src and dest are both long word aligned 2977 * and total data to move is less than FP_COPY bytes 2978 * Also handles finish up for large block moves, so may be less than 32 bytes 2979 */ 2980.co_medlong: 2981 subcc %o2, 31, %o2 ! adjust length to allow cc test 2982 ble,pt %ncc, .co_medl31 2983 nop 2984.co_medl32: 2985 ldx [%o0], %o4 ! move 32 bytes 2986 subcc %o2, 32, %o2 ! decrement length count by 32 2987 stxa %o4, [%o1]%asi 2988 ldx [%o0+8], %o4 2989 stxa %o4, [%o1+8]%asi 2990 ldx [%o0+16], %o4 2991 add %o0, 32, %o0 ! increase src ptr by 32 2992 stxa %o4, [%o1+16]%asi 2993 ldx [%o0-8], %o4 2994 add %o1, 32, %o1 ! increase dst ptr by 32 2995 bgu,pt %ncc, .co_medl32 ! repeat if at least 32 bytes left 2996 stxa %o4, [%o1-8]%asi 2997.co_medl31: 2998 addcc %o2, 24, %o2 ! adjust count to be off by 7 2999 ble,pt %ncc, .co_medl7 ! skip if 7 or fewer bytes left 3000 nop 3001.co_medl8: 3002 ldx [%o0], %o4 ! move 8 bytes 3003 add %o0, 8, %o0 ! increase src ptr by 8 3004 subcc %o2, 8, %o2 ! decrease count by 8 3005 add %o1, 8, %o1 ! increase dst ptr by 8 3006 bgu,pt %ncc, .co_medl8 3007 stxa %o4, [%o1-8]%asi 3008.co_medl7: 3009 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3010 bnz,pt %ncc, .co_small4 ! do final bytes if not finished 3011 3012.co_smallx: ! finish up and exit 3013 membar #Sync 3014 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3015.co_smallqx: 3016 retl 3017 mov %g0, %o0 3018 3019.co_small4: 3020 cmp %o2, 4 3021 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3022 nop ! 3023 ld [%o0], %o4 ! move 4 bytes 3024 add %o0, 4, %o0 ! increase src ptr by 4 3025 add %o1, 4, %o1 ! increase dst ptr by 4 3026 subcc %o2, 4, %o2 ! decrease count by 4 3027 bz,pt %ncc, .co_smallx 3028 stwa %o4, [%o1-4]%asi 3029 3030.co_small3x: ! Exactly 1, 2, or 3 bytes remain 3031 subcc %o2, 1, %o2 ! reduce count for cc test 3032 ldub [%o0], %o4 ! load one byte 3033 bz,pt %ncc, .co_smallx 3034 stba %o4, [%o1]%asi ! 
store one byte 3035 ldub [%o0+1], %o4 ! load second byte 3036 subcc %o2, 1, %o2 3037 bz,pt %ncc, .co_smallx 3038 stba %o4, [%o1+1]%asi ! store second byte 3039 ldub [%o0+2], %o4 ! load third byte 3040 ba .co_smallx 3041 stba %o4, [%o1+2]%asi ! store third byte 3042 3043.co_smallest: ! 7 or fewer bytes remain 3044 cmp %o2, 4 3045 blt,pt %ncc, .co_small3x 3046 nop 3047 ldub [%o0], %o4 ! read byte 3048 subcc %o2, 4, %o2 ! reduce count by 4 3049 stba %o4, [%o1]%asi ! write byte 3050 ldub [%o0+1], %o4 ! repeat for total of 4 bytes 3051 add %o0, 4, %o0 ! advance src by 4 3052 stba %o4, [%o1+1]%asi 3053 ldub [%o0-2], %o4 3054 add %o1, 4, %o1 ! advance dst by 4 3055 stba %o4, [%o1-2]%asi 3056 ldub [%o0-1], %o4 3057 bnz,pt %ncc, .co_small3x 3058 stba %o4, [%o1-1]%asi 3059 membar #Sync 3060 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3061 retl 3062 mov %g0, %o0 3063 3064.co_align: ! byte align test in prior branch delay 3065 bnz,pt %ncc, .co_al_d1 3066.co_al_d1f: ! dest is now half word aligned 3067 andcc %o1, 2, %o3 3068 bnz,pt %ncc, .co_al_d2 3069.co_al_d2f: ! dest is now word aligned 3070 andcc %o1, 4, %o3 ! is dest longword aligned? 3071 bz,pt %ncc, .co_al_src 3072 nop 3073.co_al_d4: ! dest is word aligned; src is unknown 3074 ldub [%o0], %o4 ! move a word (src align unknown) 3075 ldub [%o0+1], %o3 3076 sll %o4, 24, %o4 ! position 3077 sll %o3, 16, %o3 ! position 3078 or %o4, %o3, %o3 ! merge 3079 ldub [%o0+2], %o4 3080 sll %o4, 8, %o4 ! position 3081 or %o4, %o3, %o3 ! merge 3082 ldub [%o0+3], %o4 3083 or %o4, %o3, %o4 ! merge 3084 stwa %o4,[%o1]%asi ! store four bytes 3085 add %o0, 4, %o0 ! adjust src by 4 3086 add %o1, 4, %o1 ! adjust dest by 4 3087 sub %o2, 4, %o2 ! adjust count by 4 3088 andcc %o0, 7, %o3 ! check for src long word alignment 3089 brz,pt %o3, .co_medlong 3090.co_src_dst_unal8: 3091 ! dst is 8-byte aligned, src is not 3092 ! Size is less than FP_COPY 3093 ! Following code is to select for alignment 3094 andcc %o0, 0x3, %o3 ! test word alignment 3095 bz,pt %ncc, .co_medword 3096 nop 3097 andcc %o0, 0x1, %o3 ! test halfword alignment 3098 bnz,pt %ncc, .co_med_byte ! go to byte move if not halfword 3099 andcc %o0, 0x2, %o3 ! test which byte alignment 3100 ba .co_medhalf 3101 nop 3102.co_al_d1: ! align dest to half word 3103 ldub [%o0], %o4 ! move a byte 3104 add %o0, 1, %o0 3105 stba %o4, [%o1]%asi 3106 add %o1, 1, %o1 3107 andcc %o1, 2, %o3 3108 bz,pt %ncc, .co_al_d2f 3109 sub %o2, 1, %o2 3110.co_al_d2: ! align dest to word 3111 ldub [%o0], %o4 ! move a half-word (src align unknown) 3112 ldub [%o0+1], %o3 3113 sll %o4, 8, %o4 ! position 3114 or %o4, %o3, %o4 ! merge 3115 stha %o4, [%o1]%asi 3116 add %o0, 2, %o0 3117 add %o1, 2, %o1 3118 andcc %o1, 4, %o3 ! is dest longword aligned? 3119 bz,pt %ncc, .co_al_src 3120 sub %o2, 2, %o2 3121 ba .co_al_d4 3122 nop 3123/* 3124 * Handle all cases where src and dest are aligned on word 3125 * boundaries. Use unrolled loops for better performance. 3126 * This option wins over standard large data move when 3127 * source and destination is in cache for medium 3128 * to short data moves. 3129 */ 3130.co_medword: 3131 subcc %o2, 31, %o2 ! adjust length to allow cc test 3132 ble,pt %ncc, .co_medw31 3133 nop 3134.co_medw32: 3135 ld [%o0], %o4 ! move a block of 32 bytes 3136 stwa %o4, [%o1]%asi 3137 ld [%o0+4], %o4 3138 stwa %o4, [%o1+4]%asi 3139 ld [%o0+8], %o4 3140 stwa %o4, [%o1+8]%asi 3141 ld [%o0+12], %o4 3142 stwa %o4, [%o1+12]%asi 3143 ld [%o0+16], %o4 3144 stwa %o4, [%o1+16]%asi 3145 ld [%o0+20], %o4 3146 subcc %o2, 32, %o2 ! 
decrement length count 3147 stwa %o4, [%o1+20]%asi 3148 ld [%o0+24], %o4 3149 add %o0, 32, %o0 ! increase src ptr by 32 3150 stwa %o4, [%o1+24]%asi 3151 ld [%o0-4], %o4 3152 add %o1, 32, %o1 ! increase dst ptr by 32 3153 bgu,pt %ncc, .co_medw32 ! repeat if at least 32 bytes left 3154 stwa %o4, [%o1-4]%asi 3155.co_medw31: 3156 addcc %o2, 24, %o2 ! adjust count to be off by 7 3157 ble,pt %ncc, .co_medw7 ! skip if 7 or fewer bytes left 3158 nop ! 3159.co_medw15: 3160 ld [%o0], %o4 ! move a block of 8 bytes 3161 subcc %o2, 8, %o2 ! decrement length count 3162 stwa %o4, [%o1]%asi 3163 add %o0, 8, %o0 ! increase src ptr by 8 3164 ld [%o0-4], %o4 3165 add %o1, 8, %o1 ! increase dst ptr by 8 3166 bgu,pt %ncc, .co_medw15 3167 stwa %o4, [%o1-4]%asi 3168.co_medw7: 3169 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3170 bz,pt %ncc, .co_smallx ! exit if finished 3171 cmp %o2, 4 3172 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3173 nop ! 3174 ld [%o0], %o4 ! move 4 bytes 3175 add %o0, 4, %o0 ! increase src ptr by 4 3176 add %o1, 4, %o1 ! increase dst ptr by 4 3177 subcc %o2, 4, %o2 ! decrease count by 4 3178 bnz .co_small3x 3179 stwa %o4, [%o1-4]%asi 3180 membar #Sync 3181 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3182 retl 3183 mov %g0, %o0 3184 3185.co_medhalf: 3186 subcc %o2, 31, %o2 ! adjust length to allow cc test 3187 ble,pt %ncc, .co_medh31 3188 nop 3189.co_medh32: ! load and store block of 32 bytes 3190 3191 lduh [%o0], %o4 ! move 32 bytes 3192 subcc %o2, 32, %o2 ! decrement length count 3193 lduw [%o0+2], %o3 3194 sllx %o4, 48, %o4 3195 sllx %o3, 16, %o3 3196 or %o4, %o3, %o3 3197 lduh [%o0+6], %o4 3198 or %o4, %o3, %o4 3199 stxa %o4, [%o1]%asi 3200 3201 lduh [%o0+8], %o4 3202 lduw [%o0+10], %o3 3203 sllx %o4, 48, %o4 3204 sllx %o3, 16, %o3 3205 or %o4, %o3, %o3 3206 lduh [%o0+14], %o4 3207 or %o4, %o3, %o4 3208 stxa %o4, [%o1+8]%asi 3209 3210 lduh [%o0+16], %o4 3211 lduw [%o0+18], %o3 3212 sllx %o4, 48, %o4 3213 sllx %o3, 16, %o3 3214 or %o4, %o3, %o3 3215 lduh [%o0+22], %o4 3216 or %o4, %o3, %o4 3217 stxa %o4, [%o1+16]%asi 3218 3219 add %o0, 32, %o0 ! increase src ptr by 32 3220 add %o1, 32, %o1 ! increase dst ptr by 32 3221 3222 lduh [%o0-8], %o4 3223 lduw [%o0-6], %o3 3224 sllx %o4, 48, %o4 3225 sllx %o3, 16, %o3 3226 or %o4, %o3, %o3 3227 lduh [%o0-2], %o4 3228 or %o3, %o4, %o4 3229 bgu,pt %ncc, .co_medh32 ! repeat if at least 32 bytes left 3230 stxa %o4, [%o1-8]%asi 3231 3232.co_medh31: 3233 addcc %o2, 24, %o2 ! adjust count to be off by 7 3234 ble,pt %ncc, .co_medh7 ! skip if 7 or fewer bytes left 3235 nop ! 3236.co_medh15: 3237 lduh [%o0], %o4 ! move 8 bytes 3238 subcc %o2, 8, %o2 ! decrement length count 3239 lduw [%o0+2], %o3 3240 sllx %o4, 48, %o4 3241 sllx %o3, 16, %o3 3242 or %o4, %o3, %o3 3243 add %o1, 8, %o1 ! increase dst ptr by 8 3244 lduh [%o0+6], %o4 3245 add %o0, 8, %o0 ! increase src ptr by 8 3246 or %o4, %o3, %o4 3247 bgu,pt %ncc, .co_medh15 3248 stxa %o4, [%o1-8]%asi 3249.co_medh7: 3250 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3251 bz,pt %ncc, .co_smallx ! exit if finished 3252 cmp %o2, 4 3253 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3254 nop ! 3255 lduh [%o0], %o4 3256 sll %o4, 16, %o4 3257 lduh [%o0+2], %o3 3258 or %o3, %o4, %o4 3259 subcc %o2, 4, %o2 3260 add %o0, 4, %o0 3261 add %o1, 4, %o1 3262 bnz .co_small3x 3263 stwa %o4, [%o1-4]%asi 3264 membar #Sync 3265 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3266 retl 3267 mov %g0, %o0 3268 3269 .align 16 3270.co_med_byte: 3271 bnz,pt %ncc, .co_medbh32a ! go to correct byte move
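/*
 * Illustrative sketch in C (hypothetical helper names, not part of the
 * build) of the merge performed by .co_medb32/.co_medb15 below when the
 * source is only byte aligned: one byte, a halfword, a word and a final
 * byte are packed into a single 64-bit value so the store side can
 * issue full 8 byte stores (big-endian order):
 *
 *	uint64_t v;
 *	v  = (uint64_t)src[0] << 56;			! 1 byte
 *	v |= (uint64_t)be16_load(src + 1) << 40;	! 2 bytes
 *	v |= (uint64_t)be32_load(src + 3) << 8;		! 4 bytes
 *	v |= (uint64_t)src[7];				! 1 byte
 *	be64_store(dst, v);
 *	src += 8; dst += 8; count -= 8;
 */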
3272 subcc %o2, 31, %o2 ! adjust length to allow cc test 3273 ble,pt %ncc, .co_medb31 3274 nop 3275.co_medb32: ! Alignment 1 or 5 3276 subcc %o2, 32, %o2 ! decrement length count 3277 3278 ldub [%o0], %o4 ! load and store a block of 32 bytes 3279 sllx %o4, 56, %o3 3280 lduh [%o0+1], %o4 3281 sllx %o4, 40, %o4 3282 or %o4, %o3, %o3 3283 lduw [%o0+3], %o4 3284 sllx %o4, 8, %o4 3285 or %o4, %o3, %o3 3286 ldub [%o0+7], %o4 3287 or %o4, %o3, %o4 3288 stxa %o4, [%o1]%asi 3289 3290 ldub [%o0+8], %o4 3291 sllx %o4, 56, %o3 3292 lduh [%o0+9], %o4 3293 sllx %o4, 40, %o4 3294 or %o4, %o3, %o3 3295 lduw [%o0+11], %o4 3296 sllx %o4, 8, %o4 3297 or %o4, %o3, %o3 3298 ldub [%o0+15], %o4 3299 or %o4, %o3, %o4 3300 stxa %o4, [%o1+8]%asi 3301 3302 ldub [%o0+16], %o4 3303 sllx %o4, 56, %o3 3304 lduh [%o0+17], %o4 3305 sllx %o4, 40, %o4 3306 or %o4, %o3, %o3 3307 lduw [%o0+19], %o4 3308 sllx %o4, 8, %o4 3309 or %o4, %o3, %o3 3310 ldub [%o0+23], %o4 3311 or %o4, %o3, %o4 3312 stxa %o4, [%o1+16]%asi 3313 3314 add %o0, 32, %o0 ! increase src ptr by 32 3315 add %o1, 32, %o1 ! increase dst ptr by 32 3316 3317 ldub [%o0-8], %o4 3318 sllx %o4, 56, %o3 3319 lduh [%o0-7], %o4 3320 sllx %o4, 40, %o4 3321 or %o4, %o3, %o3 3322 lduw [%o0-5], %o4 3323 sllx %o4, 8, %o4 3324 or %o4, %o3, %o3 3325 ldub [%o0-1], %o4 3326 or %o4, %o3, %o4 3327 bgu,pt %ncc, .co_medb32 ! repeat if at least 32 bytes left 3328 stxa %o4, [%o1-8]%asi 3329 3330.co_medb31: ! 31 or fewer bytes remaining 3331 addcc %o2, 24, %o2 ! adjust count to be off by 7 3332 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left 3333 nop ! 3334.co_medb15: 3335 3336 ldub [%o0], %o4 ! load and store a block of 8 bytes 3337 subcc %o2, 8, %o2 ! decrement length count 3338 sllx %o4, 56, %o3 3339 lduh [%o0+1], %o4 3340 sllx %o4, 40, %o4 3341 or %o4, %o3, %o3 3342 lduw [%o0+3], %o4 3343 add %o1, 8, %o1 ! increase dst ptr by 8 3344 sllx %o4, 8, %o4 3345 or %o4, %o3, %o3 3346 ldub [%o0+7], %o4 3347 add %o0, 8, %o0 ! increase src ptr by 8 3348 or %o4, %o3, %o4 3349 bgu,pt %ncc, .co_medb15 3350 stxa %o4, [%o1-8]%asi 3351.co_medb7: 3352 addcc %o2, 7, %o2 ! finish adjustment of remaining count 3353 bz,pt %ncc, .co_smallx ! exit if finished 3354 cmp %o2, 4 3355 blt,pt %ncc, .co_small3x ! skip if less than 4 bytes left 3356 nop ! 3357 ldub [%o0], %o4 ! move 4 bytes 3358 sll %o4, 24, %o3 3359 lduh [%o0+1], %o4 3360 sll %o4, 8, %o4 3361 or %o4, %o3, %o3 3362 ldub [%o0+3], %o4 3363 or %o4, %o3, %o4 3364 subcc %o2, 4, %o2 3365 add %o0, 4, %o0 3366 add %o1, 4, %o1 3367 bnz .co_small3x 3368 stwa %o4, [%o1-4]%asi 3369 membar #Sync 3370 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 3371 retl 3372 mov %g0, %o0 3373 3374 .align 16 3375.co_medbh32a: 3376 ble,pt %ncc, .co_medbh31 3377 nop 3378.co_medbh32: ! Alignment 3 or 7 3379 subcc %o2, 32, %o2 ! decrement length count 3380 3381 ldub [%o0], %o4 !
load and store a block of 32 bytes 3382 sllx %o4, 56, %o3 3383 lduw [%o0+1], %o4 3384 sllx %o4, 24, %o4 3385 or %o4, %o3, %o3 3386 lduh [%o0+5], %o4 3387 sllx %o4, 8, %o4 3388 or %o4, %o3, %o3 3389 ldub [%o0+7], %o4 3390 or %o4, %o3, %o4 3391 stxa %o4, [%o1]%asi 3392 3393 ldub [%o0+8], %o4 3394 sllx %o4, 56, %o3 3395 lduw [%o0+9], %o4 3396 sllx %o4, 24, %o4 3397 or %o4, %o3, %o3 3398 lduh [%o0+13], %o4 3399 sllx %o4, 8, %o4 3400 or %o4, %o3, %o3 3401 ldub [%o0+15], %o4 3402 or %o4, %o3, %o4 3403 stxa %o4, [%o1+8]%asi 3404 3405 ldub [%o0+16], %o4 3406 sllx %o4, 56, %o3 3407 lduw [%o0+17], %o4 3408 sllx %o4, 24, %o4 3409 or %o4, %o3, %o3 3410 lduh [%o0+21], %o4 3411 sllx %o4, 8, %o4 3412 or %o4, %o3, %o3 3413 ldub [%o0+23], %o4 3414 or %o4, %o3, %o4 3415 stxa %o4, [%o1+16]%asi 3416 3417 add %o0, 32, %o0 ! increase src ptr by 32 3418 add %o1, 32, %o1 ! increase dst ptr by 32 3419 3420 ldub [%o0-8], %o4 3421 sllx %o4, 56, %o3 3422 lduw [%o0-7], %o4 3423 sllx %o4, 24, %o4 3424 or %o4, %o3, %o3 3425 lduh [%o0-3], %o4 3426 sllx %o4, 8, %o4 3427 or %o4, %o3, %o3 3428 ldub [%o0-1], %o4 3429 or %o4, %o3, %o4 3430 bgu,pt %ncc, .co_medbh32 ! repeat if at least 32 bytes left 3431 stxa %o4, [%o1-8]%asi 3432 3433.co_medbh31: 3434 addcc %o2, 24, %o2 ! adjust count to be off by 7 3435 ble,pt %ncc, .co_medb7 ! skip if 7 or fewer bytes left 3436 nop ! 3437.co_medbh15: 3438 ldub [%o0], %o4 ! load and store a block of 8 bytes 3439 sllx %o4, 56, %o3 3440 lduw [%o0+1], %o4 3441 sllx %o4, 24, %o4 3442 or %o4, %o3, %o3 3443 lduh [%o0+5], %o4 3444 sllx %o4, 8, %o4 3445 or %o4, %o3, %o3 3446 ldub [%o0+7], %o4 3447 or %o4, %o3, %o4 3448 stxa %o4, [%o1]%asi 3449 subcc %o2, 8, %o2 ! decrement length count 3450 add %o1, 8, %o1 ! increase dst ptr by 8 3451 add %o0, 8, %o0 ! increase src ptr by 8 3452 bgu,pt %ncc, .co_medbh15 3453 stxa %o4, [%o1-8]%asi 3454 ba .co_medb7 3455 nop 3456/* 3457 * End of small copy (no window) code 3458 */ 3459 3460/* 3461 * Long copy code 3462 */ 3463.co_copy_more: 3464 sethi %hi(copyio_fault), %o3 3465 or %o3, %lo(copyio_fault), %o3 3466 membar #Sync 3467 stn %o3, [THREAD_REG + T_LOFAULT] 3468 3469/* 3470 * Following code is for large copies. We know there is at 3471 * least FP_COPY bytes available. FP regs are used, so 3472 * we save registers and fp regs before starting 3473 */ 3474 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 3475 or SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 3476 rd %fprs, %g1 ! check for unused fp 3477 ! if fprs.fef == 0, set it. 3478 ! Setting it when already set costs more than checking 3479 andcc %g1, FPRS_FEF, %g1 ! test FEF, fprs.du = fprs.dl = 0 3480 bz,pt %ncc, .co_fp_unused 3481 mov ASI_USER, %asi 3482 BST_FP_TOSTACK(%o3) 3483 ba .co_fp_ready 3484.co_fp_unused: 3485 prefetch [%i0 + (1 * CACHE_LINE)], #one_read 3486 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 3487.co_fp_ready: 3488 rd %gsr, %l5 ! save %gsr value 3489 andcc %i1, 1, %o3 ! is dest byte aligned 3490 bnz,pt %ncc, .co_big_d1 3491.co_big_d1f: ! dest is now half word aligned 3492 andcc %i1, 2, %o3 3493 bnz,pt %ncc, .co_big_d2 3494.co_big_d2f: ! dest is now word aligned 3495 andcc %i1, 4, %o3 ! is dest longword aligned 3496 bnz,pt %ncc, .co_big_d4 3497.co_big_d4f: ! dest is now long word aligned 3498 andcc %i0, 7, %o3 ! is src long word aligned 3499 brnz,pt %o3, .co_big_unal8 3500 prefetch [%i0 + (2 * CACHE_LINE)], #one_read 3501 ! Src and dst are long word aligned 3502 ! align dst to 64 byte boundary 3503 andcc %i1, 0x3f, %o3 ! 
%o3 == 0 means dst is 64 byte aligned 3504 brz,pn %o3, .co_al_to_64 3505 nop 3506 sub %o3, 64, %o3 ! %o3 has negative bytes to move 3507 add %i2, %o3, %i2 ! adjust remaining count 3508 andcc %o3, 8, %o4 ! odd long words to move? 3509 brz,pt %o4, .co_al_to_16 3510 nop 3511 add %o3, 8, %o3 3512 ldx [%i0], %o4 3513 add %i0, 8, %i0 ! increment src ptr 3514 stxa %o4, [%i1]ASI_USER 3515 add %i1, 8, %i1 ! increment dst ptr 3516! Dest is aligned on 16 bytes, src 8 byte aligned 3517.co_al_to_16: 3518 andcc %o3, 0x30, %o4 ! more to move? 3519 brz,pt %o4, .co_al_to_64 3520 nop 3521.co_al_mv_16: 3522 add %o3, 16, %o3 3523 ldx [%i0], %o4 3524 stxa %o4, [%i1]ASI_USER 3525 add %i0, 16, %i0 ! increment src ptr 3526 ldx [%i0-8], %o4 3527 add %i1, 8, %i1 ! increment dst ptr 3528 stxa %o4, [%i1]ASI_USER 3529 andcc %o3, 0x30, %o4 3530 brnz,pt %o4, .co_al_mv_16 3531 add %i1, 8, %i1 ! increment dst ptr 3532! Dest is aligned on 64 bytes, src 8 byte aligned 3533.co_al_to_64: 3534 ! Determine source alignment 3535 ! to correct 8 byte offset 3536 andcc %i0, 32, %o3 3537 brnz,pn %o3, .co_aln_1 3538 andcc %i0, 16, %o3 3539 brnz,pn %o3, .co_aln_01 3540 andcc %i0, 8, %o3 3541 brz,pn %o3, .co_aln_000 3542 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3543 ba .co_aln_001 3544 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3545.co_aln_01: 3546 brnz,pn %o3, .co_aln_011 3547 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3548 ba .co_aln_010 3549 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3550.co_aln_1: 3551 andcc %i0, 16, %o3 3552 brnz,pn %o3, .co_aln_11 3553 andcc %i0, 8, %o3 3554 brnz,pn %o3, .co_aln_101 3555 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3556 ba .co_aln_100 3557 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3558.co_aln_11: 3559 brz,pn %o3, .co_aln_110 3560 prefetch [%i0 + (3 * CACHE_LINE)], #one_read 3561 3562.co_aln_111: 3563! Alignment off by 8 bytes 3564 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3565 ldd [%i0], %d0 3566 add %i0, 8, %i0 3567 sub %i2, 8, %i2 3568 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3569 and %i2, 0x7f, %i2 ! residue bytes in %i2 3570 sub %i1, %i0, %i1 3571.co_aln_111_loop: 3572 ldda [%i0]ASI_BLK_P,%d16 ! block load 3573 subcc %o3, 64, %o3 3574 fmovd %d16, %d2 3575 fmovd %d18, %d4 3576 fmovd %d20, %d6 3577 fmovd %d22, %d8 3578 fmovd %d24, %d10 3579 fmovd %d26, %d12 3580 fmovd %d28, %d14 3581 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3582 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3583 add %i0, 64, %i0 3584 fmovd %d30, %d0 3585 bgt,pt %ncc, .co_aln_111_loop 3586 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3587 add %i1, %i0, %i1 3588 3589 stda %d0, [%i1]ASI_USER 3590 ba .co_remain_stuff 3591 add %i1, 8, %i1 3592 ! END OF aln_111 3593 3594.co_aln_110: 3595! Alignment off by 16 bytes 3596 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3597 ldd [%i0], %d0 3598 ldd [%i0+8], %d2 3599 add %i0, 16, %i0 3600 sub %i2, 16, %i2 3601 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3602 and %i2, 0x7f, %i2 ! residue bytes in %i2 3603 sub %i1, %i0, %i1 3604.co_aln_110_loop: 3605 ldda [%i0]ASI_BLK_P,%d16 ! block load 3606 subcc %o3, 64, %o3 3607 fmovd %d16, %d4 3608 fmovd %d18, %d6 3609 fmovd %d20, %d8 3610 fmovd %d22, %d10 3611 fmovd %d24, %d12 3612 fmovd %d26, %d14 3613 stxa %g0,[%i0+%i1]ASI_STBI_AIUS !
block initializing store 3614 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3615 add %i0, 64, %i0 3616 fmovd %d28, %d0 3617 fmovd %d30, %d2 3618 bgt,pt %ncc, .co_aln_110_loop 3619 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3620 add %i1, %i0, %i1 3621 3622 stda %d0, [%i1]%asi 3623 stda %d2, [%i1+8]%asi 3624 ba .co_remain_stuff 3625 add %i1, 16, %i1 3626 ! END OF aln_110 3627 3628.co_aln_101: 3629! Alignment off by 24 bytes 3630 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3631 ldd [%i0], %d0 3632 ldd [%i0+8], %d2 3633 ldd [%i0+16], %d4 3634 add %i0, 24, %i0 3635 sub %i2, 24, %i2 3636 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3637 and %i2, 0x7f, %i2 ! residue bytes in %i2 3638 sub %i1, %i0, %i1 3639.co_aln_101_loop: 3640 ldda [%i0]ASI_BLK_P,%d16 ! block load 3641 subcc %o3, 64, %o3 3642 fmovd %d16, %d6 3643 fmovd %d18, %d8 3644 fmovd %d20, %d10 3645 fmovd %d22, %d12 3646 fmovd %d24, %d14 3647 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3648 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3649 add %i0, 64, %i0 3650 fmovd %d26, %d0 3651 fmovd %d28, %d2 3652 fmovd %d30, %d4 3653 bgt,pt %ncc, .co_aln_101_loop 3654 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3655 add %i1, %i0, %i1 3656 3657 stda %d0, [%i1]%asi 3658 stda %d2, [%i1+8]%asi 3659 stda %d4, [%i1+16]%asi 3660 ba .co_remain_stuff 3661 add %i1, 24, %i1 3662 ! END OF aln_101 3663 3664.co_aln_100: 3665! Alignment off by 32 bytes 3666 ldd [%i0], %d0 3667 ldd [%i0+8], %d2 3668 ldd [%i0+16],%d4 3669 ldd [%i0+24],%d6 3670 add %i0, 32, %i0 3671 sub %i2, 32, %i2 3672 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3673 and %i2, 0x7f, %i2 ! residue bytes in %i2 3674 sub %i1, %i0, %i1 3675.co_aln_100_loop: 3676 ldda [%i0]ASI_BLK_P,%d16 ! block load 3677 subcc %o3, 64, %o3 3678 fmovd %d16, %d8 3679 fmovd %d18, %d10 3680 fmovd %d20, %d12 3681 fmovd %d22, %d14 3682 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3683 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3684 add %i0, 64, %i0 3685 fmovd %d24, %d0 3686 fmovd %d26, %d2 3687 fmovd %d28, %d4 3688 fmovd %d30, %d6 3689 bgt,pt %ncc, .co_aln_100_loop 3690 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3691 add %i1, %i0, %i1 3692 3693 stda %d0, [%i1]%asi 3694 stda %d2, [%i1+8]%asi 3695 stda %d4, [%i1+16]%asi 3696 stda %d6, [%i1+24]%asi 3697 ba .co_remain_stuff 3698 add %i1, 32, %i1 3699 ! END OF aln_100 3700 3701.co_aln_011: 3702! Alignment off by 40 bytes 3703 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3704 ldd [%i0], %d0 3705 ldd [%i0+8], %d2 3706 ldd [%i0+16], %d4 3707 ldd [%i0+24], %d6 3708 ldd [%i0+32], %d8 3709 add %i0, 40, %i0 3710 sub %i2, 40, %i2 3711 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3712 and %i2, 0x7f, %i2 ! residue bytes in %i2 3713 sub %i1, %i0, %i1 3714.co_aln_011_loop: 3715 ldda [%i0]ASI_BLK_P,%d16 ! block load 3716 subcc %o3, 64, %o3 3717 fmovd %d16, %d10 3718 fmovd %d18, %d12 3719 fmovd %d20, %d14 3720 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3721 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3722 add %i0, 64, %i0 3723 fmovd %d22, %d0 3724 fmovd %d24, %d2 3725 fmovd %d26, %d4 3726 fmovd %d28, %d6 3727 fmovd %d30, %d8 3728 bgt,pt %ncc, .co_aln_011_loop 3729 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3730 add %i1, %i0, %i1 3731 3732 stda %d0, [%i1]%asi 3733 stda %d2, [%i1+8]%asi 3734 stda %d4, [%i1+16]%asi 3735 stda %d6, [%i1+24]%asi 3736 stda %d8, [%i1+32]%asi 3737 ba .co_remain_stuff 3738 add %i1, 40, %i1 3739 ! END OF aln_011 3740 3741.co_aln_010: 3742! 
Alignment off by 48 bytes 3743 ldd [%i0], %d0 3744 ldd [%i0+8], %d2 3745 ldd [%i0+16], %d4 3746 ldd [%i0+24], %d6 3747 ldd [%i0+32], %d8 3748 ldd [%i0+40], %d10 3749 add %i0, 48, %i0 3750 sub %i2, 48, %i2 3751 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3752 and %i2, 0x7f, %i2 ! residue bytes in %i2 3753 sub %i1, %i0, %i1 3754.co_aln_010_loop: 3755 ldda [%i0]ASI_BLK_P,%d16 ! block load 3756 subcc %o3, 64, %o3 3757 fmovd %d16, %d12 3758 fmovd %d18, %d14 3759 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3760 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3761 add %i0, 64, %i0 3762 fmovd %d20, %d0 3763 fmovd %d22, %d2 3764 fmovd %d24, %d4 3765 fmovd %d26, %d6 3766 fmovd %d28, %d8 3767 fmovd %d30, %d10 3768 bgt,pt %ncc, .co_aln_010_loop 3769 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3770 add %i1, %i0, %i1 3771 3772 stda %d0, [%i1]%asi 3773 stda %d2, [%i1+8]%asi 3774 stda %d4, [%i1+16]%asi 3775 stda %d6, [%i1+24]%asi 3776 stda %d8, [%i1+32]%asi 3777 stda %d10, [%i1+40]%asi 3778 ba .co_remain_stuff 3779 add %i1, 48, %i1 3780 ! END OF aln_010 3781 3782.co_aln_001: 3783! Alignment off by 56 bytes 3784 ldd [%i0], %d0 3785 ldd [%i0+8], %d2 3786 ldd [%i0+16], %d4 3787 ldd [%i0+24], %d6 3788 ldd [%i0+32], %d8 3789 ldd [%i0+40], %d10 3790 ldd [%i0+48], %d12 3791 add %i0, 56, %i0 3792 sub %i2, 56, %i2 3793 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3794 and %i2, 0x7f, %i2 ! residue bytes in %i2 3795 sub %i1, %i0, %i1 3796.co_aln_001_loop: 3797 ldda [%i0]ASI_BLK_P,%d16 ! block load 3798 subcc %o3, 64, %o3 3799 fmovd %d16, %d14 3800 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3801 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3802 add %i0, 64, %i0 3803 fmovd %d18, %d0 3804 fmovd %d20, %d2 3805 fmovd %d22, %d4 3806 fmovd %d24, %d6 3807 fmovd %d26, %d8 3808 fmovd %d28, %d10 3809 fmovd %d30, %d12 3810 bgt,pt %ncc, .co_aln_001_loop 3811 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3812 add %i1, %i0, %i1 3813 3814 stda %d0, [%i1]%asi 3815 stda %d2, [%i1+8]%asi 3816 stda %d4, [%i1+16]%asi 3817 stda %d6, [%i1+24]%asi 3818 stda %d8, [%i1+32]%asi 3819 stda %d10, [%i1+40]%asi 3820 stda %d12, [%i1+48]%asi 3821 ba .co_remain_stuff 3822 add %i1, 56, %i1 3823 ! END OF aln_001 3824 3825.co_aln_000: 3826 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3827 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 3828 and %i2, 0x7f, %i2 ! residue bytes in %i2 3829 sub %i1, %i0, %i1 3830.co_aln_000_loop: 3831 ldda [%i0]ASI_BLK_P,%d0 3832 subcc %o3, 64, %o3 3833 stxa %g0,[%i0+%i1]ASI_STBI_AIUS ! block initializing store 3834 stda %d0,[%i0+%i1]ASI_BLK_AIUS 3835 add %i0, 64, %i0 3836 bgt,pt %ncc, .co_aln_000_loop 3837 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 3838 add %i1, %i0, %i1 3839 3840 ! END OF aln_000 3841 3842.co_remain_stuff: 3843 subcc %i2, 31, %i2 ! adjust length to allow cc test 3844 ble,pt %ncc, .co_aln_31 3845 nop 3846.co_aln_32: 3847 ldx [%i0], %o4 ! move 32 bytes 3848 subcc %i2, 32, %i2 ! decrement length count by 32 3849 stxa %o4, [%i1]%asi 3850 ldx [%i0+8], %o4 3851 stxa %o4, [%i1+8]%asi 3852 ldx [%i0+16], %o4 3853 add %i0, 32, %i0 ! increase src ptr by 32 3854 stxa %o4, [%i1+16]%asi 3855 ldx [%i0-8], %o4 3856 add %i1, 32, %i1 ! increase dst ptr by 32 3857 bgu,pt %ncc, .co_aln_32 ! repeat if at least 32 bytes left 3858 stxa %o4, [%i1-8]%asi 3859.co_aln_31: 3860 addcc %i2, 24, %i2 ! adjust count to be off by 7 3861 ble,pt %ncc, .co_aln_7 ! skip if 7 or fewer bytes left 3862 nop ! 3863.co_aln_15: 3864 ldx [%i0], %o4 ! move 8 bytes 3865 add %i0, 8, %i0 ! 
increase src ptr by 8 3866 subcc %i2, 8, %i2 ! decrease count by 8 3867 add %i1, 8, %i1 ! increase dst ptr by 8 3868 bgu,pt %ncc, .co_aln_15 3869 stxa %o4, [%i1-8]%asi 3870.co_aln_7: 3871 addcc %i2, 7, %i2 ! finish adjustment of remaining count 3872 bz,pt %ncc, .co_exit ! exit if finished 3873 cmp %i2, 4 3874 blt,pt %ncc, .co_unaln3x ! skip if less than 4 bytes left 3875 nop ! 3876 ld [%i0], %o4 ! move 4 bytes 3877 add %i0, 4, %i0 ! increase src ptr by 4 3878 add %i1, 4, %i1 ! increase dst ptr by 4 3879 subcc %i2, 4, %i2 ! decrease count by 4 3880 bnz .co_unaln3x 3881 stwa %o4, [%i1-4]%asi 3882 ba .co_exit 3883 nop 3884 3885 ! destination alignment code 3886.co_big_d1: 3887 ldub [%i0], %o4 ! move a byte 3888 add %i0, 1, %i0 3889 stba %o4, [%i1]ASI_USER 3890 add %i1, 1, %i1 3891 andcc %i1, 2, %o3 3892 bz,pt %ncc, .co_big_d2f 3893 sub %i2, 1, %i2 3894.co_big_d2: 3895 ldub [%i0], %o4 ! move a half-word (src align unknown) 3896 ldub [%i0+1], %o3 3897 add %i0, 2, %i0 3898 sll %o4, 8, %o4 ! position 3899 or %o4, %o3, %o4 ! merge 3900 stha %o4, [%i1]ASI_USER 3901 add %i1, 2, %i1 3902 andcc %i1, 4, %o3 ! is dest longword aligned 3903 bz,pt %ncc, .co_big_d4f 3904 sub %i2, 2, %i2 3905.co_big_d4: ! dest is at least word aligned 3906 nop 3907 ldub [%i0], %o4 ! move a word (src align unknown) 3908 ldub [%i0+1], %o3 3909 sll %o4, 24, %o4 ! position 3910 sll %o3, 16, %o3 ! position 3911 or %o4, %o3, %o3 ! merge 3912 ldub [%i0+2], %o4 3913 sll %o4, 8, %o4 ! position 3914 or %o4, %o3, %o3 ! merge 3915 ldub [%i0+3], %o4 3916 or %o4, %o3, %o4 ! merge 3917 stwa %o4,[%i1]ASI_USER ! store four bytes 3918 add %i0, 4, %i0 ! adjust src by 4 3919 add %i1, 4, %i1 ! adjust dest by 4 3920 ba .co_big_d4f 3921 sub %i2, 4, %i2 ! adjust count by 4 3922 3923 3924 ! Dst is on 8 byte boundary; src is not; 3925.co_big_unal8: 3926 andcc %i1, 0x3f, %o3 ! is dst 64-byte block aligned? 3927 bz %ncc, .co_unalnsrc 3928 sub %o3, 64, %o3 ! %o3 will be multiple of 8 3929 neg %o3 ! bytes until dest is 64 byte aligned 3930 sub %i2, %o3, %i2 ! update cnt with bytes to be moved 3931 ! Move bytes according to source alignment 3932 andcc %i0, 0x1, %o4 3933 bnz %ncc, .co_unalnbyte ! check for byte alignment 3934 nop 3935 andcc %i0, 2, %o4 ! check for half word alignment 3936 bnz %ncc, .co_unalnhalf 3937 nop 3938 ! Src is word aligned, move bytes until dest 64 byte aligned 3939.co_unalnword: 3940 ld [%i0], %o4 ! load 4 bytes 3941 stwa %o4, [%i1]%asi ! and store 4 bytes 3942 ld [%i0+4], %o4 ! load 4 bytes 3943 add %i0, 8, %i0 ! increase src ptr by 8 3944 stwa %o4, [%i1+4]%asi ! and store 4 bytes 3945 subcc %o3, 8, %o3 ! decrease count by 8 3946 bnz %ncc, .co_unalnword 3947 add %i1, 8, %i1 ! increase dst ptr by 8 3948 ba .co_unalnsrc 3949 nop 3950 3951 ! Src is half-word aligned, move bytes until dest 64 byte aligned 3952.co_unalnhalf: 3953 lduh [%i0], %o4 ! load 2 bytes 3954 sllx %o4, 32, %i3 ! shift left 3955 lduw [%i0+2], %o4 3956 or %o4, %i3, %i3 3957 sllx %i3, 16, %i3 3958 lduh [%i0+6], %o4 3959 or %o4, %i3, %i3 3960 stxa %i3, [%i1]ASI_USER 3961 add %i0, 8, %i0 3962 subcc %o3, 8, %o3 3963 bnz %ncc, .co_unalnhalf 3964 add %i1, 8, %i1 3965 ba .co_unalnsrc 3966 nop 3967 3968 ! Src is Byte aligned, move bytes until dest 64 byte aligned 3969.co_unalnbyte: 3970 sub %i1, %i0, %i1 ! 
share pointer advance 3971.co_unalnbyte_loop: 3972 ldub [%i0], %o4 3973 sllx %o4, 56, %i3 3974 lduh [%i0+1], %o4 3975 sllx %o4, 40, %o4 3976 or %o4, %i3, %i3 3977 lduh [%i0+3], %o4 3978 sllx %o4, 24, %o4 3979 or %o4, %i3, %i3 3980 lduh [%i0+5], %o4 3981 sllx %o4, 8, %o4 3982 or %o4, %i3, %i3 3983 ldub [%i0+7], %o4 3984 or %o4, %i3, %i3 3985 stxa %i3, [%i1+%i0]ASI_USER 3986 subcc %o3, 8, %o3 3987 bnz %ncc, .co_unalnbyte_loop 3988 add %i0, 8, %i0 3989 add %i1,%i0, %i1 ! restore pointer 3990 3991 ! Destination is now block (64 byte aligned), src is not 8 byte aligned 3992.co_unalnsrc: 3993 andn %i2, 0x3f, %i3 ! %i3 is multiple of block size 3994 and %i2, 0x3f, %i2 ! residue bytes in %i2 3995 add %i2, 64, %i2 ! Insure we don't load beyond 3996 sub %i3, 64, %i3 ! end of source buffer 3997 3998 andn %i0, 0x3f, %o4 ! %o4 has block aligned src address 3999 prefetch [%o4 + (3 * CACHE_LINE)], #one_read 4000 alignaddr %i0, %g0, %g0 ! generate %gsr 4001 add %i0, %i3, %i0 ! advance %i0 to after blocks 4002 ! 4003 ! Determine source alignment to correct 8 byte offset 4004 andcc %i0, 0x20, %o3 4005 brnz,pn %o3, .co_unaln_1 4006 andcc %i0, 0x10, %o3 4007 brnz,pn %o3, .co_unaln_01 4008 andcc %i0, 0x08, %o3 4009 brz,a %o3, .co_unaln_000 4010 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4011 ba .co_unaln_001 4012 nop 4013.co_unaln_01: 4014 brnz,a %o3, .co_unaln_011 4015 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4016 ba .co_unaln_010 4017 nop 4018.co_unaln_1: 4019 brnz,pn %o3, .co_unaln_11 4020 andcc %i0, 0x08, %o3 4021 brnz,a %o3, .co_unaln_101 4022 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4023 ba .co_unaln_100 4024 nop 4025.co_unaln_11: 4026 brz,pn %o3, .co_unaln_110 4027 prefetch [%i0 + (4 * CACHE_LINE)], #one_read 4028 4029.co_unaln_111: 4030 ldd [%o4+56], %d14 4031.co_unaln_111_loop: 4032 add %o4, 64, %o4 4033 ldda [%o4]ASI_BLK_P, %d16 4034 faligndata %d14, %d16, %d48 4035 faligndata %d16, %d18, %d50 4036 faligndata %d18, %d20, %d52 4037 faligndata %d20, %d22, %d54 4038 faligndata %d22, %d24, %d56 4039 faligndata %d24, %d26, %d58 4040 faligndata %d26, %d28, %d60 4041 faligndata %d28, %d30, %d62 4042 fmovd %d30, %d14 4043 stda %d48, [%i1]ASI_BLK_AIUS 4044 subcc %i3, 64, %i3 4045 add %i1, 64, %i1 4046 bgu,pt %ncc, .co_unaln_111_loop 4047 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4048 ba .co_unaln_done 4049 nop 4050 4051.co_unaln_110: 4052 ldd [%o4+48], %d12 4053 ldd [%o4+56], %d14 4054.co_unaln_110_loop: 4055 add %o4, 64, %o4 4056 ldda [%o4]ASI_BLK_P, %d16 4057 faligndata %d12, %d14, %d48 4058 faligndata %d14, %d16, %d50 4059 faligndata %d16, %d18, %d52 4060 faligndata %d18, %d20, %d54 4061 faligndata %d20, %d22, %d56 4062 faligndata %d22, %d24, %d58 4063 faligndata %d24, %d26, %d60 4064 faligndata %d26, %d28, %d62 4065 fmovd %d28, %d12 4066 fmovd %d30, %d14 4067 stda %d48, [%i1]ASI_BLK_AIUS 4068 subcc %i3, 64, %i3 4069 add %i1, 64, %i1 4070 bgu,pt %ncc, .co_unaln_110_loop 4071 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4072 ba .co_unaln_done 4073 nop 4074 4075.co_unaln_101: 4076 ldd [%o4+40], %d10 4077 ldd [%o4+48], %d12 4078 ldd [%o4+56], %d14 4079.co_unaln_101_loop: 4080 add %o4, 64, %o4 4081 ldda [%o4]ASI_BLK_P, %d16 4082 faligndata %d10, %d12, %d48 4083 faligndata %d12, %d14, %d50 4084 faligndata %d14, %d16, %d52 4085 faligndata %d16, %d18, %d54 4086 faligndata %d18, %d20, %d56 4087 faligndata %d20, %d22, %d58 4088 faligndata %d22, %d24, %d60 4089 faligndata %d24, %d26, %d62 4090 fmovd %d26, %d10 4091 fmovd %d28, %d12 4092 fmovd %d30, %d14 4093 stda %d48, [%i1]ASI_BLK_AIUS 4094 subcc %i3, 
64, %i3 4095 add %i1, 64, %i1 4096 bgu,pt %ncc, .co_unaln_101_loop 4097 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4098 ba .co_unaln_done 4099 nop 4100 4101.co_unaln_100: 4102 ldd [%o4+32], %d8 4103 ldd [%o4+40], %d10 4104 ldd [%o4+48], %d12 4105 ldd [%o4+56], %d14 4106.co_unaln_100_loop: 4107 add %o4, 64, %o4 4108 ldda [%o4]ASI_BLK_P, %d16 4109 faligndata %d8, %d10, %d48 4110 faligndata %d10, %d12, %d50 4111 faligndata %d12, %d14, %d52 4112 faligndata %d14, %d16, %d54 4113 faligndata %d16, %d18, %d56 4114 faligndata %d18, %d20, %d58 4115 faligndata %d20, %d22, %d60 4116 faligndata %d22, %d24, %d62 4117 fmovd %d24, %d8 4118 fmovd %d26, %d10 4119 fmovd %d28, %d12 4120 fmovd %d30, %d14 4121 stda %d48, [%i1]ASI_BLK_AIUS 4122 subcc %i3, 64, %i3 4123 add %i1, 64, %i1 4124 bgu,pt %ncc, .co_unaln_100_loop 4125 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4126 ba .co_unaln_done 4127 nop 4128 4129.co_unaln_011: 4130 ldd [%o4+24], %d6 4131 ldd [%o4+32], %d8 4132 ldd [%o4+40], %d10 4133 ldd [%o4+48], %d12 4134 ldd [%o4+56], %d14 4135.co_unaln_011_loop: 4136 add %o4, 64, %o4 4137 ldda [%o4]ASI_BLK_P, %d16 4138 faligndata %d6, %d8, %d48 4139 faligndata %d8, %d10, %d50 4140 faligndata %d10, %d12, %d52 4141 faligndata %d12, %d14, %d54 4142 faligndata %d14, %d16, %d56 4143 faligndata %d16, %d18, %d58 4144 faligndata %d18, %d20, %d60 4145 faligndata %d20, %d22, %d62 4146 fmovd %d22, %d6 4147 fmovd %d24, %d8 4148 fmovd %d26, %d10 4149 fmovd %d28, %d12 4150 fmovd %d30, %d14 4151 stda %d48, [%i1]ASI_BLK_AIUS 4152 subcc %i3, 64, %i3 4153 add %i1, 64, %i1 4154 bgu,pt %ncc, .co_unaln_011_loop 4155 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4156 ba .co_unaln_done 4157 nop 4158 4159.co_unaln_010: 4160 ldd [%o4+16], %d4 4161 ldd [%o4+24], %d6 4162 ldd [%o4+32], %d8 4163 ldd [%o4+40], %d10 4164 ldd [%o4+48], %d12 4165 ldd [%o4+56], %d14 4166.co_unaln_010_loop: 4167 add %o4, 64, %o4 4168 ldda [%o4]ASI_BLK_P, %d16 4169 faligndata %d4, %d6, %d48 4170 faligndata %d6, %d8, %d50 4171 faligndata %d8, %d10, %d52 4172 faligndata %d10, %d12, %d54 4173 faligndata %d12, %d14, %d56 4174 faligndata %d14, %d16, %d58 4175 faligndata %d16, %d18, %d60 4176 faligndata %d18, %d20, %d62 4177 fmovd %d20, %d4 4178 fmovd %d22, %d6 4179 fmovd %d24, %d8 4180 fmovd %d26, %d10 4181 fmovd %d28, %d12 4182 fmovd %d30, %d14 4183 stda %d48, [%i1]ASI_BLK_AIUS 4184 subcc %i3, 64, %i3 4185 add %i1, 64, %i1 4186 bgu,pt %ncc, .co_unaln_010_loop 4187 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4188 ba .co_unaln_done 4189 nop 4190 4191.co_unaln_001: 4192 ldd [%o4+8], %d2 4193 ldd [%o4+16], %d4 4194 ldd [%o4+24], %d6 4195 ldd [%o4+32], %d8 4196 ldd [%o4+40], %d10 4197 ldd [%o4+48], %d12 4198 ldd [%o4+56], %d14 4199.co_unaln_001_loop: 4200 add %o4, 64, %o4 4201 ldda [%o4]ASI_BLK_P, %d16 4202 faligndata %d2, %d4, %d48 4203 faligndata %d4, %d6, %d50 4204 faligndata %d6, %d8, %d52 4205 faligndata %d8, %d10, %d54 4206 faligndata %d10, %d12, %d56 4207 faligndata %d12, %d14, %d58 4208 faligndata %d14, %d16, %d60 4209 faligndata %d16, %d18, %d62 4210 fmovd %d18, %d2 4211 fmovd %d20, %d4 4212 fmovd %d22, %d6 4213 fmovd %d24, %d8 4214 fmovd %d26, %d10 4215 fmovd %d28, %d12 4216 fmovd %d30, %d14 4217 stda %d48, [%i1]ASI_BLK_AIUS 4218 subcc %i3, 64, %i3 4219 add %i1, 64, %i1 4220 bgu,pt %ncc, .co_unaln_001_loop 4221 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4222 ba .co_unaln_done 4223 nop 4224 4225.co_unaln_000: 4226 ldda [%o4]ASI_BLK_P, %d0 4227.co_unaln_000_loop: 4228 add %o4, 64, %o4 4229 ldda [%o4]ASI_BLK_P, %d16 4230 faligndata %d0, %d2, %d48 4231 
faligndata %d2, %d4, %d50 4232 faligndata %d4, %d6, %d52 4233 faligndata %d6, %d8, %d54 4234 faligndata %d8, %d10, %d56 4235 faligndata %d10, %d12, %d58 4236 faligndata %d12, %d14, %d60 4237 faligndata %d14, %d16, %d62 4238 fmovd %d16, %d0 4239 fmovd %d18, %d2 4240 fmovd %d20, %d4 4241 fmovd %d22, %d6 4242 fmovd %d24, %d8 4243 fmovd %d26, %d10 4244 fmovd %d28, %d12 4245 fmovd %d30, %d14 4246 stda %d48, [%i1]ASI_BLK_AIUS 4247 subcc %i3, 64, %i3 4248 add %i1, 64, %i1 4249 bgu,pt %ncc, .co_unaln_000_loop 4250 prefetch [%o4 + (4 * CACHE_LINE)], #one_read 4251 4252.co_unaln_done: 4253 ! Handle trailing bytes, 64 to 127 4254 ! Dest long word aligned, Src not long word aligned 4255 cmp %i2, 15 4256 bleu %ncc, .co_unaln_short 4257 4258 andn %i2, 0x7, %i3 ! %i3 is multiple of 8 4259 and %i2, 0x7, %i2 ! residue bytes in %i2 4260 add %i2, 8, %i2 4261 sub %i3, 8, %i3 ! insure we don't load past end of src 4262 andn %i0, 0x7, %o4 ! %o4 has long word aligned src address 4263 add %i0, %i3, %i0 ! advance %i0 to after multiple of 8 4264 ldd [%o4], %d0 ! fetch partial word 4265.co_unaln_by8: 4266 ldd [%o4+8], %d2 4267 add %o4, 8, %o4 4268 faligndata %d0, %d2, %d16 4269 subcc %i3, 8, %i3 4270 stda %d16, [%i1]%asi 4271 fmovd %d2, %d0 4272 bgu,pt %ncc, .co_unaln_by8 4273 add %i1, 8, %i1 4274 4275.co_unaln_short: 4276 cmp %i2, 8 4277 blt,pt %ncc, .co_unalnfin 4278 nop 4279 ldub [%i0], %o4 4280 sll %o4, 24, %o3 4281 ldub [%i0+1], %o4 4282 sll %o4, 16, %o4 4283 or %o4, %o3, %o3 4284 ldub [%i0+2], %o4 4285 sll %o4, 8, %o4 4286 or %o4, %o3, %o3 4287 ldub [%i0+3], %o4 4288 or %o4, %o3, %o3 4289 stwa %o3, [%i1]%asi 4290 ldub [%i0+4], %o4 4291 sll %o4, 24, %o3 4292 ldub [%i0+5], %o4 4293 sll %o4, 16, %o4 4294 or %o4, %o3, %o3 4295 ldub [%i0+6], %o4 4296 sll %o4, 8, %o4 4297 or %o4, %o3, %o3 4298 ldub [%i0+7], %o4 4299 or %o4, %o3, %o3 4300 stwa %o3, [%i1+4]%asi 4301 add %i0, 8, %i0 4302 add %i1, 8, %i1 4303 sub %i2, 8, %i2 4304.co_unalnfin: 4305 cmp %i2, 4 4306 blt,pt %ncc, .co_unalnz 4307 tst %i2 4308 ldub [%i0], %o3 ! read byte 4309 subcc %i2, 4, %i2 ! reduce count by 4 4310 sll %o3, 24, %o3 ! position 4311 ldub [%i0+1], %o4 4312 sll %o4, 16, %o4 ! position 4313 or %o4, %o3, %o3 ! merge 4314 ldub [%i0+2], %o4 4315 sll %o4, 8, %o4 ! position 4316 or %o4, %o3, %o3 ! merge 4317 add %i1, 4, %i1 ! advance dst by 4 4318 ldub [%i0+3], %o4 4319 add %i0, 4, %i0 ! advance src by 4 4320 or %o4, %o3, %o4 ! merge 4321 bnz,pt %ncc, .co_unaln3x 4322 stwa %o4, [%i1-4]%asi 4323 ba .co_exit 4324 nop 4325.co_unalnz: 4326 bz,pt %ncc, .co_exit 4327 wr %l5, %g0, %gsr ! restore %gsr 4328.co_unaln3x: ! Exactly 1, 2, or 3 bytes remain 4329 subcc %i2, 1, %i2 ! reduce count for cc test 4330 ldub [%i0], %o4 ! load one byte 4331 bz,pt %ncc, .co_exit 4332 stba %o4, [%i1]%asi ! store one byte 4333 ldub [%i0+1], %o4 ! load second byte 4334 subcc %i2, 1, %i2 4335 bz,pt %ncc, .co_exit 4336 stba %o4, [%i1+1]%asi ! store second byte 4337 ldub [%i0+2], %o4 ! load third byte 4338 stba %o4, [%i1+2]%asi ! store third byte 4339.co_exit: 4340 brnz %g1, .co_fp_restore 4341 nop 4342 FZERO 4343 wr %g1, %g0, %fprs 4344 ba,pt %ncc, .co_ex2 4345 membar #Sync 4346.co_fp_restore: 4347 BLD_FP_FROMSTACK(%o4) 4348.co_ex2: 4349 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 4350 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault
	ret
	restore	%g0, 0, %o0

.copyout_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYOUT], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0

#else	/* NIAGARA_IMPL */
.do_copyout:
	!
	! Check the length and bail if zero.
	!
	tst	%o2
	bnz,pt	%ncc, 1f
	nop
	retl
	clr	%o0
1:
	sethi	%hi(copyio_fault), %o4
	or	%o4, %lo(copyio_fault), %o4
	sethi	%hi(copyio_fault_nowindow), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault_nowindow), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT (7 bytes).
	! Run in leaf mode, using the %o regs as our input regs.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dco_ns
	or	%o0, %o1, %o3
	!
	! What was previously ".small_copyout"
	! Do full differenced copy.
	!
.dcobcp:
	sub	%g0, %o2, %o3		! negate count
	add	%o0, %o2, %o0		! make %o0 point at the end
	add	%o1, %o2, %o1		! make %o1 point at the end
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load first byte
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcocl:
	stba	%o4, [%o1 + %o3]ASI_USER
	inccc	%o3
	bl,a,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0
	!
	! Try aligned copies from here.
	!
.dco_ns:
	! %o0 = kernel addr (to be copied from)
	! %o1 = user addr (to be copied to)
	! %o2 = length
	! %o3 = %o0 | %o1 (used for alignment checking)
	! %o4 is alternate lo_fault
	! %o5 is original lo_fault
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,pt	%icc, .dcoh8
	btst	7, %o3
	!
	! Single byte aligned. Do we do it via HW or via
	! byte for byte? Do a quick no memory reference
	! check to pick up small copies.
	!
	sethi	%hi(hw_copy_limit_1), %o3
	!
	! Big enough that we need to check the HW limit for
	! this size copy.
	!
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	!
	! Is HW copy on? If not, do everything byte for byte.
	!
	tst	%o3
	bz,pn	%icc, .dcobcp
	subcc	%o3, %o2, %o3
	!
	! If we're less than or equal to the single byte copy limit,
	! bop to the copy loop.
	!
	bge,pt	%ncc, .dcobcp
	nop
	!
	! We're big enough and copy is on. Do it with HW.
	!
	ba,pt	%ncc, .big_copyout
	nop
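/*
 * The hw_copy_limit_1 test above, and the hw_copy_limit_8/_4/_2 tests
 * that follow, all implement the same dispatch policy. A rough C
 * sketch, illustrative only (sw_copy stands for whichever non-HW copy
 * loop matches the alignment):
 *
 *	if (hw_copy_limit_N == 0)	! HW assisted copy disabled
 *		goto sw_copy;
 *	if (len <= hw_copy_limit_N)	! too small to amortize setup
 *		goto sw_copy;
 *	goto big_copyout;		! HW assisted block copy
 *
 * N (1, 2, 4 or 8) is picked from the joint alignment of src and dst,
 * so each copy pays for exactly one limit test.
 */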
.dcoh8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcoh4
	btst	3, %o3
	!
	! See if we're in the "small range".
	! If so, go off and do the copy.
	! If not, load the hard limit. %o3 is
	! available for reuse.
	!
	sethi	%hi(hw_copy_limit_8), %o3
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	!
	! If it's zero, there's no HW bcopy.
	! Bop off to the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcos8
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is larger than hw_copy_limit_8.
	!
	bge,pt	%ncc, .dcos8
	nop
	!
	! HW assist is on and we're large enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodebc
	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcoh4:
	bnz,pn	%ncc, .dcoh2
	!
	! See if we're in the "small range".
	! If so, go off and do the copy.
	! If not, load the hard limit. %o3 is
	! available for reuse.
	!
	sethi	%hi(hw_copy_limit_4), %o3
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	!
	! If it's zero, there's no HW bcopy.
	! Bop off to the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcos4
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is larger than hw_copy_limit_4.
	!
	bge,pt	%ncc, .dcos4
	nop
	!
	! HW assist is on and we're large enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos4:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodfbc
	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
	!
	! We must be 2 byte aligned. Off we go.
	! The check for small copies was done in the
	! delay at .dcoh4
	!
.dcoh2:
	ble	%ncc, .dcos2
	sethi	%hi(hw_copy_limit_2), %o3
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .dcos2
	subcc	%o3, %o2, %o3
	bge,pt	%ncc, .dcos2
	nop
	!
	! HW is on and we're big enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodtbc
	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
.small_copyout:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big_copyout that will cause us to forego the HW assisted copies
	! and bounce back to a non-HW assisted copy. This dispatches those
	! copies. Note that we branch around this in the main line code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcobcp		! Most likely
	btst	7, %o3
	bz	%icc, .dcos8
	btst	3, %o3
	bz	%icc, .dcos4
	nop
	ba,pt	%ncc, .dcos2
	nop
	.align 32
.dodebc:
	ldx	[%o0 + %o3], %o4
	deccc	%o2
	stxa	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! eight byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left - do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
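/*
 * The .dodebc loop above and the .dodfbc/.dodtbc loops below reuse
 * the differenced-pointer trick from .dcobcp, but count chunks and
 * the byte offset separately. A rough C sketch of the 8 byte variant,
 * illustrative only (not the exact register assignment):
 *
 *	long   off    = -(long)len;	! %o3, walks up toward zero
 *	size_t chunks = len >> 3;	! %o2, number of 8 byte moves
 *	src += len;			! both buffers point at the end
 *	dst += len;
 *	while (chunks-- != 0) {
 *		*(uint64_t *)(dst + off) = *(uint64_t *)(src + off);
 *		off += 8;
 *	}
 *	if (off != 0)			! 1-7 bytes remain
 *		goto byte_loop;		! .dcocl finishes with the same
 *					! offset, so no pointer fixup
 */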
	!
	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
	!
	.align 32
.dodfbc:
	lduw	[%o0 + %o3], %o4
	deccc	%o2
	sta	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! four byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
	!
	! Two byte aligned copy loop. %o2 is the number of 2 byte chunks to
	! copy.
	!
	.align 32
.dodtbc:
	lduh	[%o0 + %o3], %o4
	deccc	%o2
	stha	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Anything left?
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Deal with the last byte
	!
	ldub	[%o0 + %o3], %o4
	stba	%o4, [%o1 + %o3]ASI_USER
.dcofh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

.big_copyout:
	! We're going to go off and do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	stn	%o4, [THREAD_REG + T_LOFAULT]

	! Copyouts that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to
	! less than 128 bytes.
	save	%sp, -SA(MINFRAME), %sp
.do_block_copyout:

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst block aligned
	bz	%ncc, copyout_blalign	! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .co_alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .co_alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .co_alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stba	%o2, [%i0]ASI_USER
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	copyout_blalign
	nop

	! dst & src 4B aligned
.co_alwdcp:
	ld	[%i1], %o2
	sta	%o2, [%i0]ASI_USER
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .co_alwdcp
	add	%i0, 0x4, %i0

	ba	copyout_blalign
	nop

	! dst & src 2B aligned
.co_alhlfwdcp:
	lduh	[%i1], %o2
	stuha	%o2, [%i0]ASI_USER
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .co_alhlfwdcp
	add	%i0, 0x2, %i0

	ba	copyout_blalign
	nop

	! dst & src 8B aligned
.co_alewdcp:
	ldx	[%i1], %o2
	stxa	%o2, [%i0]ASI_USER
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .co_alewdcp
	add	%i0, 0x8, %i0

	! Now Destination is block (64 bytes) aligned
copyout_blalign:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! Residue bytes in %i2

	mov	ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

	andcc	%i1, 0xf, %o2		! is src quadword aligned
	bz,pn	%xcc, .co_blkcpy	! src offset in %o2 (last 4-bits)
	nop
	cmp	%o2, 0x8
	bg	.co_upper_double
	nop
	bl	.co_lower_double
	nop

	! Falls through when source offset is equal to 8 i.e.
	! source is double word aligned.
4769 ! In this case no shift/merge of data is required 4770 4771 sub %i1, %o2, %i1 ! align the src at 16 bytes. 4772 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 4773 prefetch [%l0+0x0], #one_read 4774 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4775.co_loop0: 4776 add %i1, 0x10, %i1 4777 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4778 prefetch [%l0+0x40], #one_read 4779 4780 stxa %l3, [%i0+0x0]%asi 4781 stxa %l4, [%i0+0x8]%asi 4782 4783 add %i1, 0x10, %i1 4784 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4785 4786 stxa %l5, [%i0+0x10]%asi 4787 stxa %l2, [%i0+0x18]%asi 4788 4789 add %i1, 0x10, %i1 4790 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4791 4792 stxa %l3, [%i0+0x20]%asi 4793 stxa %l4, [%i0+0x28]%asi 4794 4795 add %i1, 0x10, %i1 4796 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4797 4798 stxa %l5, [%i0+0x30]%asi 4799 stxa %l2, [%i0+0x38]%asi 4800 4801 add %l0, 0x40, %l0 4802 subcc %i3, 0x40, %i3 4803 bgu,pt %xcc, .co_loop0 4804 add %i0, 0x40, %i0 4805 ba .co_blkdone 4806 add %i1, %o2, %i1 ! increment the source by src offset 4807 ! the src offset was stored in %o2 4808 4809.co_lower_double: 4810 4811 sub %i1, %o2, %i1 ! align the src at 16 bytes. 4812 sll %o2, 3, %o0 ! %o0 left shift 4813 mov 0x40, %o1 4814 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 4815 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 4816 prefetch [%l0+0x0], #one_read 4817 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has 4818 ! complete data 4819.co_loop1: 4820 add %i1, 0x10, %i1 4821 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data 4822 ! for this read. 4823 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 4824 ! into %l2 and %l3 4825 prefetch [%l0+0x40], #one_read 4826 4827 stxa %l2, [%i0+0x0]%asi 4828 stxa %l3, [%i0+0x8]%asi 4829 4830 add %i1, 0x10, %i1 4831 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4832 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 4833 ! %l4 from previous read 4834 ! into %l4 and %l5 4835 stxa %l4, [%i0+0x10]%asi 4836 stxa %l5, [%i0+0x18]%asi 4837 4838 ! Repeat the same for next 32 bytes. 4839 4840 add %i1, 0x10, %i1 4841 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4842 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 4843 4844 stxa %l2, [%i0+0x20]%asi 4845 stxa %l3, [%i0+0x28]%asi 4846 4847 add %i1, 0x10, %i1 4848 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4849 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 4850 4851 stxa %l4, [%i0+0x30]%asi 4852 stxa %l5, [%i0+0x38]%asi 4853 4854 add %l0, 0x40, %l0 4855 subcc %i3, 0x40, %i3 4856 bgu,pt %xcc, .co_loop1 4857 add %i0, 0x40, %i0 4858 ba .co_blkdone 4859 add %i1, %o2, %i1 ! increment the source by src offset 4860 ! the src offset was stored in %o2 4861 4862.co_upper_double: 4863 4864 sub %i1, %o2, %i1 ! align the src at 16 bytes. 4865 sub %o2, 0x8, %o0 4866 sll %o0, 3, %o0 ! %o0 left shift 4867 mov 0x40, %o1 4868 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 4869 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 4870 prefetch [%l0+0x0], #one_read 4871 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3 4872 ! for this read and 4873 ! no data in %l2 4874.co_loop2: 4875 add %i1, 0x10, %i1 4876 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data 4877 ! and %l5 has partial 4878 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 4879 ! 
into %l3 and %l4 4880 prefetch [%l0+0x40], #one_read 4881 4882 stxa %l3, [%i0+0x0]%asi 4883 stxa %l4, [%i0+0x8]%asi 4884 4885 add %i1, 0x10, %i1 4886 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4887 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 4888 ! %l5 from previous read 4889 ! into %l5 and %l2 4890 4891 stxa %l5, [%i0+0x10]%asi 4892 stxa %l2, [%i0+0x18]%asi 4893 4894 ! Repeat the same for next 32 bytes. 4895 4896 add %i1, 0x10, %i1 4897 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4898 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 4899 4900 stxa %l3, [%i0+0x20]%asi 4901 stxa %l4, [%i0+0x28]%asi 4902 4903 add %i1, 0x10, %i1 4904 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4905 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 4906 4907 stxa %l5, [%i0+0x30]%asi 4908 stxa %l2, [%i0+0x38]%asi 4909 4910 add %l0, 0x40, %l0 4911 subcc %i3, 0x40, %i3 4912 bgu,pt %xcc, .co_loop2 4913 add %i0, 0x40, %i0 4914 ba .co_blkdone 4915 add %i1, %o2, %i1 ! increment the source by src offset 4916 ! the src offset was stored in %o2 4917 4918 4919 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 4920.co_blkcpy: 4921 4922 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 4923 prefetch [%o0+0x0], #one_read 49241: 4925 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0 4926 add %i1, 0x10, %i1 4927 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 4928 add %i1, 0x10, %i1 4929 4930 prefetch [%o0+0x40], #one_read 4931 4932 stxa %l0, [%i0+0x0]%asi 4933 4934 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 4935 add %i1, 0x10, %i1 4936 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6 4937 add %i1, 0x10, %i1 4938 4939 stxa %l1, [%i0+0x8]%asi 4940 stxa %l2, [%i0+0x10]%asi 4941 stxa %l3, [%i0+0x18]%asi 4942 stxa %l4, [%i0+0x20]%asi 4943 stxa %l5, [%i0+0x28]%asi 4944 stxa %l6, [%i0+0x30]%asi 4945 stxa %l7, [%i0+0x38]%asi 4946 4947 add %o0, 0x40, %o0 4948 subcc %i3, 0x40, %i3 4949 bgu,pt %xcc, 1b 4950 add %i0, 0x40, %i0 4951 4952.co_blkdone: 4953 membar #Sync 4954 4955 brz,pt %i2, .copyout_exit 4956 nop 4957 4958 ! Handle trailing bytes 4959 cmp %i2, 0x8 4960 blu,pt %ncc, .co_residue 4961 nop 4962 4963 ! Can we do some 8B ops 4964 or %i1, %i0, %o2 4965 andcc %o2, 0x7, %g0 4966 bnz %ncc, .co_last4 4967 nop 4968 4969 ! Do 8byte ops as long as possible 4970.co_last8: 4971 ldx [%i1], %o2 4972 stxa %o2, [%i0]ASI_USER 4973 add %i1, 0x8, %i1 4974 sub %i2, 0x8, %i2 4975 cmp %i2, 0x8 4976 bgu,pt %ncc, .co_last8 4977 add %i0, 0x8, %i0 4978 4979 brz,pt %i2, .copyout_exit 4980 nop 4981 4982 ba .co_residue 4983 nop 4984 4985.co_last4: 4986 ! Can we do 4B ops 4987 andcc %o2, 0x3, %g0 4988 bnz %ncc, .co_last2 4989 nop 49901: 4991 ld [%i1], %o2 4992 sta %o2, [%i0]ASI_USER 4993 add %i1, 0x4, %i1 4994 sub %i2, 0x4, %i2 4995 cmp %i2, 0x4 4996 bgu,pt %ncc, 1b 4997 add %i0, 0x4, %i0 4998 4999 brz,pt %i2, .copyout_exit 5000 nop 5001 5002 ba .co_residue 5003 nop 5004 5005.co_last2: 5006 ! Can we do 2B ops 5007 andcc %o2, 0x1, %g0 5008 bnz %ncc, .co_residue 5009 nop 5010 50111: 5012 lduh [%i1], %o2 5013 stuha %o2, [%i0]ASI_USER 5014 add %i1, 0x2, %i1 5015 sub %i2, 0x2, %i2 5016 cmp %i2, 0x2 5017 bgu,pt %ncc, 1b 5018 add %i0, 0x2, %i0 5019 5020 brz,pt %i2, .copyout_exit 5021 nop 5022 5023 ! Copy the residue as byte copy 5024.co_residue: 5025 ldub [%i1], %i4 5026 stba %i4, [%i0]ASI_USER 5027 inc %i1 5028 deccc %i2 5029 bgu,pt %xcc, .co_residue 5030 inc %i0 5031 5032.copyout_exit: 5033 membar #Sync 5034 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! 
restore old t_lofault 5035 ret 5036 restore %g0, 0, %o0 5037 5038.copyout_err: 5039 ldn [THREAD_REG + T_COPYOPS], %o4 5040 brz %o4, 2f 5041 nop 5042 ldn [%o4 + CP_COPYOUT], %g2 5043 jmp %g2 5044 nop 50452: 5046 retl 5047 mov -1, %o0 5048#endif /* NIAGARA_IMPL */ 5049 SET_SIZE(copyout) 5050 5051 5052 ENTRY(xcopyout) 5053 sethi %hi(.xcopyout_err), REAL_LOFAULT 5054 b .do_copyout 5055 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 5056.xcopyout_err: 5057 ldn [THREAD_REG + T_COPYOPS], %o4 5058 brz %o4, 2f 5059 nop 5060 ldn [%o4 + CP_XCOPYOUT], %g2 5061 jmp %g2 5062 nop 50632: 5064 retl 5065 mov %g1, %o0 5066 SET_SIZE(xcopyout) 5067 5068 ENTRY(xcopyout_little) 5069 sethi %hi(.little_err), %o4 5070 ldn [THREAD_REG + T_LOFAULT], %o5 5071 or %o4, %lo(.little_err), %o4 5072 membar #Sync ! sync error barrier 5073 stn %o4, [THREAD_REG + T_LOFAULT] 5074 5075 subcc %g0, %o2, %o3 5076 add %o0, %o2, %o0 5077 bz,pn %ncc, 2f ! check for zero bytes 5078 sub %o2, 1, %o4 5079 add %o0, %o4, %o0 ! start w/last byte 5080 add %o1, %o2, %o1 5081 ldub [%o0+%o3], %o4 5082 50831: stba %o4, [%o1+%o3]ASI_AIUSL 5084 inccc %o3 5085 sub %o0, 2, %o0 ! get next byte 5086 bcc,a,pt %ncc, 1b 5087 ldub [%o0+%o3], %o4 5088 50892: membar #Sync ! sync error barrier 5090 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 5091 retl 5092 mov %g0, %o0 ! return (0) 5093 SET_SIZE(xcopyout_little) 5094 5095/* 5096 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 5097 */ 5098 5099 ENTRY(copyin) 5100 sethi %hi(.copyin_err), REAL_LOFAULT 5101 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT 5102 5103#if !defined(NIAGARA_IMPL) 5104.do_copyin: 5105 tst %o2 ! check for zero count; quick exit 5106 bz,pt %ncc, .ci_smallqx 5107 mov %o0, SAVE_SRC 5108 mov %o1, SAVE_DST 5109 mov %o2, SAVE_COUNT 5110 cmp %o2, FP_COPY ! check for small copy/leaf case 5111 bgt,pt %ncc, .ci_copy_more 5112 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 5113/* 5114 * Small copy in code 5115 * 5116 */ 5117 sethi %hi(copyio_fault_nowindow), %o3 5118 or %o3, %lo(copyio_fault_nowindow), %o3 5119 membar #Sync 5120 stn %o3, [THREAD_REG + T_LOFAULT] 5121 5122 mov ASI_USER, %asi 5123 cmp %o2, SHORTCOPY ! make sure there is enough to align 5124 ble,pt %ncc, .ci_smallest 5125 andcc %o1, 0x7, %o3 ! is dest long word aligned 5126 bnz,pn %ncc, .ci_align 5127 andcc %o1, 1, %o3 ! is dest byte aligned 5128 5129! Destination is long word aligned 5130.ci_al_src: 5131 andcc %o0, 7, %o3 5132 brnz,pt %o3, .ci_src_dst_unal8 5133 nop 5134/* 5135 * Special case for handling when src and dest are both long word aligned 5136 * and total data to move is less than FP_COPY bytes 5137 * Also handles finish up for large block moves, so may be less than 32 bytes 5138 */ 5139.ci_medlong: 5140 subcc %o2, 31, %o2 ! adjust length to allow cc test 5141 ble,pt %ncc, .ci_medl31 5142 nop 5143.ci_medl32: 5144 ldxa [%o0]%asi, %o4 ! move 32 bytes 5145 subcc %o2, 32, %o2 ! decrement length count by 32 5146 stx %o4, [%o1] 5147 ldxa [%o0+8]%asi, %o4 5148 stx %o4, [%o1+8] 5149 ldxa [%o0+16]%asi, %o4 5150 add %o0, 32, %o0 ! increase src ptr by 32 5151 stx %o4, [%o1+16] 5152 ldxa [%o0-8]%asi, %o4 5153 add %o1, 32, %o1 ! increase dst ptr by 32 5154 bgu,pt %ncc, .ci_medl32 ! repeat if at least 32 bytes left 5155 stx %o4, [%o1-8] 5156.ci_medl31: 5157 addcc %o2, 24, %o2 ! adjust count to be off by 7 5158 ble,pt %ncc, .ci_medl7 ! skip if 7 or fewer bytes left 5159 nop 5160.ci_medl8: 5161 ldxa [%o0]%asi, %o4 ! move 8 bytes 5162 add %o0, 8, %o0 ! increase src ptr by 8 5163 subcc %o2, 8, %o2 ! 
decrease count by 8 5164 add %o1, 8, %o1 ! increase dst ptr by 8 5165 bgu,pt %ncc, .ci_medl8 5166 stx %o4, [%o1-8] 5167.ci_medl7: 5168 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5169 bnz,pt %ncc, .ci_small4 ! do final bytes if not finished 5170 nop 5171.ci_smallx: ! finish up and exit 5172 membar #Sync 5173 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5174.ci_smallqx: 5175 retl 5176 mov %g0, %o0 5177 5178.ci_small4: 5179 cmp %o2, 4 5180 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5181 nop ! 5182 lda [%o0]%asi, %o4 ! move 4 bytes 5183 add %o0, 4, %o0 ! increase src ptr by 4 5184 add %o1, 4, %o1 ! increase dst ptr by 4 5185 subcc %o2, 4, %o2 ! decrease count by 4 5186 bz %ncc, .ci_smallx 5187 stw %o4, [%o1-4] 5188 5189.ci_small3x: ! Exactly 1, 2, or 3 bytes remain 5190 subcc %o2, 1, %o2 ! reduce count for cc test 5191 lduba [%o0]%asi, %o4 ! load one byte 5192 bz,pt %ncc, .ci_smallx 5193 stb %o4, [%o1] ! store one byte 5194 lduba [%o0+1]%asi, %o4 ! load second byte 5195 subcc %o2, 1, %o2 5196 bz,pt %ncc, .ci_smallx 5197 stb %o4, [%o1+1] ! store second byte 5198 lduba [%o0+2]%asi, %o4 ! load third byte 5199 ba .ci_smallx 5200 stb %o4, [%o1+2] ! store third byte 5201 5202.ci_smallest: ! 7 or fewer bytes remain 5203 cmp %o2, 4 5204 blt,pt %ncc, .ci_small3x 5205 nop 5206 lduba [%o0]%asi, %o4 ! read byte 5207 subcc %o2, 4, %o2 ! reduce count by 4 5208 stb %o4, [%o1] ! write byte 5209 lduba [%o0+1]%asi, %o4 ! repeat for total of 4 bytes 5210 add %o0, 4, %o0 ! advance src by 4 5211 stb %o4, [%o1+1] 5212 lduba [%o0-2]%asi, %o4 5213 add %o1, 4, %o1 ! advance dst by 4 5214 stb %o4, [%o1-2] 5215 lduba [%o0-1]%asi, %o4 5216 bnz,pt %ncc, .ci_small3x 5217 stb %o4, [%o1-1] 5218 membar #Sync 5219 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5220 retl 5221 mov %g0, %o0 5222 5223.ci_align: 5224 bnz,pt %ncc, .ci_al_d1 5225.ci_al_d1f: ! dest is now half word aligned 5226 andcc %o1, 2, %o3 ! is dest word aligned 5227 bnz,pt %ncc, .ci_al_d2 5228.ci_al_d2f: ! dest is now word aligned 5229 andcc %o1, 4, %o3 ! is dest longword aligned? 5230 bz,pt %ncc, .ci_al_src 5231 nop 5232.ci_al_d4: ! dest is word aligned; src is unknown 5233 lduba [%o0]%asi, %o4 ! move a word (src align unknown) 5234 lduba [%o0+1]%asi, %o3 5235 sll %o4, 24, %o4 ! position 5236 sll %o3, 16, %o3 ! position 5237 or %o4, %o3, %o3 ! merge 5238 lduba [%o0+2]%asi, %o4 5239 sll %o4, 8, %o4 ! position 5240 or %o4, %o3, %o3 ! merge 5241 lduba [%o0+3]%asi, %o4 5242 or %o4, %o3, %o4 ! merge 5243 stw %o4,[%o1] ! store four bytes 5244 add %o0, 4, %o0 ! adjust src by 4 5245 add %o1, 4, %o1 ! adjust dest by 4 5246 sub %o2, 4, %o2 ! adjust count by 4 5247 andcc %o0, 7, %o3 ! check for src long word alignment 5248 brz,pt %o3, .ci_medlong 5249.ci_src_dst_unal8: 5250 ! dst is 8-byte aligned, src is not 5251 ! Size is less than FP_COPY 5252 ! Following code is to select for alignment 5253 andcc %o0, 0x3, %o3 ! test word alignment 5254 bz,pt %ncc, .ci_medword 5255 nop 5256 andcc %o0, 0x1, %o3 ! test halfword alignment 5257 bnz,pt %ncc, .ci_med_byte ! go to byte move if not halfword 5258 andcc %o0, 0x2, %o3 ! test which byte alignment 5259 ba .ci_medhalf 5260 nop 5261.ci_al_d1: ! align dest to half word 5262 lduba [%o0]%asi, %o4 ! move a byte 5263 add %o0, 1, %o0 5264 stb %o4, [%o1] 5265 add %o1, 1, %o1 5266 andcc %o1, 2, %o3 ! is dest word aligned 5267 bz,pt %ncc, .ci_al_d2f 5268 sub %o2, 1, %o2 5269.ci_al_d2: ! align dest to word 5270 lduba [%o0]%asi, %o4 ! 
move a half-word (src align unknown) 5271 lduba [%o0+1]%asi, %o3 5272 sll %o4, 8, %o4 ! position 5273 or %o4, %o3, %o4 ! merge 5274 sth %o4, [%o1] 5275 add %o0, 2, %o0 5276 add %o1, 2, %o1 5277 andcc %o1, 4, %o3 ! is dest longword aligned? 5278 bz,pt %ncc, .ci_al_src 5279 sub %o2, 2, %o2 5280 ba .ci_al_d4 5281 nop 5282/* 5283 * Handle all cases where src and dest are aligned on word 5284 * boundaries. Use unrolled loops for better performance. 5285 * This option wins over standard large data move when 5286 * source and destination is in cache for medium 5287 * to short data moves. 5288 */ 5289.ci_medword: 5290 subcc %o2, 31, %o2 ! adjust length to allow cc test 5291 ble,pt %ncc, .ci_medw31 5292 nop 5293.ci_medw32: 5294 lda [%o0]%asi, %o4 ! move a block of 32 bytes 5295 stw %o4, [%o1] 5296 lda [%o0+4]%asi, %o4 5297 stw %o4, [%o1+4] 5298 lda [%o0+8]%asi, %o4 5299 stw %o4, [%o1+8] 5300 lda [%o0+12]%asi, %o4 5301 stw %o4, [%o1+12] 5302 lda [%o0+16]%asi, %o4 5303 stw %o4, [%o1+16] 5304 lda [%o0+20]%asi, %o4 5305 subcc %o2, 32, %o2 ! decrement length count 5306 stw %o4, [%o1+20] 5307 lda [%o0+24]%asi, %o4 5308 add %o0, 32, %o0 ! increase src ptr by 32 5309 stw %o4, [%o1+24] 5310 lda [%o0-4]%asi, %o4 5311 add %o1, 32, %o1 ! increase dst ptr by 32 5312 bgu,pt %ncc, .ci_medw32 ! repeat if at least 32 bytes left 5313 stw %o4, [%o1-4] 5314.ci_medw31: 5315 addcc %o2, 24, %o2 ! adjust count to be off by 7 5316 ble,pt %ncc, .ci_medw7 ! skip if 7 or fewer bytes left 5317 nop ! 5318.ci_medw15: 5319 lda [%o0]%asi, %o4 ! move a block of 8 bytes 5320 subcc %o2, 8, %o2 ! decrement length count 5321 stw %o4, [%o1] 5322 add %o0, 8, %o0 ! increase src ptr by 8 5323 lda [%o0-4]%asi, %o4 5324 add %o1, 8, %o1 ! increase dst ptr by 8 5325 bgu,pt %ncc, .ci_medw15 5326 stw %o4, [%o1-4] 5327.ci_medw7: 5328 addcc %o2, 7, %o2 ! finish adjustment of remaining count 5329 bz,pt %ncc, .ci_smallx ! exit if finished 5330 cmp %o2, 4 5331 blt,pt %ncc, .ci_small3x ! skip if less than 4 bytes left 5332 nop ! 5333 lda [%o0]%asi, %o4 ! move 4 bytes 5334 add %o0, 4, %o0 ! increase src ptr by 4 5335 add %o1, 4, %o1 ! increase dst ptr by 4 5336 subcc %o2, 4, %o2 ! decrease count by 4 5337 bnz .ci_small3x 5338 stw %o4, [%o1-4] 5339 membar #Sync 5340 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 5341 retl 5342 mov %g0, %o0 5343 5344.ci_medhalf: 5345 subcc %o2, 31, %o2 ! adjust length to allow cc test 5346 ble,pt %ncc, .ci_medh31 5347 nop 5348.ci_medh32: ! load and store block of 32 bytes 5349 subcc %o2, 32, %o2 ! decrement length count 5350 5351 lduha [%o0]%asi, %o4 ! move 32 bytes 5352 lduwa [%o0+2]%asi, %o3 5353 sllx %o4, 48, %o4 5354 sllx %o3, 16, %o3 5355 or %o4, %o3, %o3 5356 lduha [%o0+6]%asi, %o4 5357 or %o4, %o3, %o4 5358 stx %o4, [%o1] 5359 5360 lduha [%o0+8]%asi, %o4 5361 lduwa [%o0+10]%asi, %o3 5362 sllx %o4, 48, %o4 5363 sllx %o3, 16, %o3 5364 or %o4, %o3, %o3 5365 lduha [%o0+14]%asi, %o4 5366 or %o4, %o3, %o4 5367 stx %o4, [%o1+8] 5368 5369 lduha [%o0+16]%asi, %o4 5370 lduwa [%o0+18]%asi, %o3 5371 sllx %o4, 48, %o4 5372 sllx %o3, 16, %o3 5373 or %o4, %o3, %o3 5374 lduha [%o0+22]%asi, %o4 5375 or %o4, %o3, %o4 5376 stx %o4, [%o1+16] 5377 5378 add %o0, 32, %o0 ! increase src ptr by 32 5379 add %o1, 32, %o1 ! increase dst ptr by 32 5380 5381 lduha [%o0-8]%asi, %o4 5382 lduwa [%o0-6]%asi, %o3 5383 sllx %o4, 48, %o4 5384 sllx %o3, 16, %o3 5385 or %o4, %o3, %o3 5386 lduha [%o0-2]%asi, %o4 5387 or %o3, %o4, %o4 5388 bgu,pt %ncc, .ci_medh32 ! 
repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.ci_medh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .ci_medh7		! skip if 7 or fewer bytes left
	nop				!
.ci_medh15:
	lduha	[%o0]%asi, %o4		! move 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	lduwa	[%o0+2]%asi, %o3
	sllx	%o4, 48, %o4
	sllx	%o3, 16, %o3
	or	%o4, %o3, %o3
	add	%o1, 8, %o1		! increase dst ptr by 8
	lduha	[%o0+6]%asi, %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medh15
	stx	%o4, [%o1-8]
.ci_medh7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .ci_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
	nop				!
	lduha	[%o0]%asi, %o4
	sll	%o4, 16, %o4
	lduha	[%o0+2]%asi, %o3
	or	%o3, %o4, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.ci_small3x
	stw	%o4, [%o1-4]
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	mov	%g0, %o0

	.align 16
.ci_med_byte:
	bnz,pt	%ncc, .ci_medbh32a	! go to correct byte move
	subcc	%o2, 31, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .ci_medb31
	nop
.ci_medb32:				! Alignment 1 or 5
	subcc	%o2, 32, %o2		! decrement length count

	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduha	[%o0+1]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+3]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduba	[%o0+8]%asi, %o4
	sllx	%o4, 56, %o3
	lduha	[%o0+9]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+11]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+15]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduba	[%o0+16]%asi, %o4
	sllx	%o4, 56, %o3
	lduha	[%o0+17]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+19]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+23]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduba	[%o0-8]%asi, %o4
	sllx	%o4, 56, %o3
	lduha	[%o0-7]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0-5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0-1]%asi, %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medb32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.ci_medb31:				! 31 or fewer bytes remaining
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
	nop				!
.ci_medb15:

	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
	subcc	%o2, 8, %o2		! decrement length count
	sllx	%o4, 56, %o3
	lduha	[%o0+1]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %o3, %o3
	lduwa	[%o0+3]%asi, %o4
	add	%o1, 8, %o1		! increase dst ptr by 8
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	add	%o0, 8, %o0		! increase src ptr by 8
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medb15
	stx	%o4, [%o1-8]
.ci_medb7:
	addcc	%o2, 7, %o2		! finish adjustment of remaining count
	bz,pt	%ncc, .ci_smallx	! exit if finished
	cmp	%o2, 4
	blt,pt	%ncc, .ci_small3x	! skip if less than 4 bytes left
	nop				!
	lduba	[%o0]%asi, %o4		! move 4 bytes
	sll	%o4, 24, %o3
	lduha	[%o0+1]%asi, %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+3]%asi, %o4
	or	%o4, %o3, %o4
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	add	%o1, 4, %o1
	bnz	.ci_small3x
	stw	%o4, [%o1-4]
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	mov	%g0, %o0
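/*
 * The .ci_medb32/.ci_medb15 loops above and the .ci_medbh32 loop
 * below assemble each 8 byte store from naturally aligned 1/2/4 byte
 * loads, so the source is never accessed across a boundary it does
 * not support. A rough C sketch of the "Alignment 1 or 5" merge,
 * illustrative only (get8 is a hypothetical helper, not a routine in
 * this file):
 *
 *	uint64_t
 *	get8(const uint8_t *s)		! s is at offset 1 or 5 mod 8
 *	{
 *		uint64_t w;
 *		w  = (uint64_t)s[0] << 56;			! byte
 *		w |= (uint64_t)*(uint16_t *)(s + 1) << 40;	! half
 *		w |= (uint64_t)*(uint32_t *)(s + 3) << 8;	! word
 *		w |= (uint64_t)s[7];				! byte
 *		return (w);
 *	}
 *
 * SPARC is big-endian, so the first source byte lands in bits 63:56.
 * The "Alignment 3 or 7" variant below uses the same idea with a
 * byte, word, half-word, byte sequence (offsets 0, 1, 5 and 7 and
 * shifts 56, 24, 8 and 0).
 */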
	.align 16
.ci_medbh32a:				! Alignment 3 or 7
	ble,pt	%ncc, .ci_medbh31
	nop
.ci_medbh32:				! Alignment 3 or 7
	subcc	%o2, 32, %o2		! decrement length count

	lduba	[%o0]%asi, %o4		! load and store a block of 32 bytes
	sllx	%o4, 56, %o3
	lduwa	[%o0+1]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]

	lduba	[%o0+8]%asi, %o4
	sllx	%o4, 56, %o3
	lduwa	[%o0+9]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+13]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+15]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+8]

	lduba	[%o0+16]%asi, %o4
	sllx	%o4, 56, %o3
	lduwa	[%o0+17]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+21]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+23]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1+16]

	add	%o0, 32, %o0		! increase src ptr by 32
	add	%o1, 32, %o1		! increase dst ptr by 32

	lduba	[%o0-8]%asi, %o4
	sllx	%o4, 56, %o3
	lduwa	[%o0-7]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0-3]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0-1]%asi, %o4
	or	%o4, %o3, %o4
	bgu,pt	%ncc, .ci_medbh32	! repeat if at least 32 bytes left
	stx	%o4, [%o1-8]

.ci_medbh31:
	addcc	%o2, 24, %o2		! adjust count to be off by 7
	ble,pt	%ncc, .ci_medb7		! skip if 7 or fewer bytes left
	nop				!
.ci_medbh15:
	lduba	[%o0]%asi, %o4		! load and store a block of 8 bytes
	sllx	%o4, 56, %o3
	lduwa	[%o0+1]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %o3, %o3
	lduha	[%o0+5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%o0+7]%asi, %o4
	or	%o4, %o3, %o4
	stx	%o4, [%o1]
	subcc	%o2, 8, %o2		! decrement length count
	add	%o1, 8, %o1		! increase dst ptr by 8
	add	%o0, 8, %o0		! increase src ptr by 8
	bgu,pt	%ncc, .ci_medbh15
	stx	%o4, [%o1-8]
	ba	.ci_medb7
	nop

/*
 * End of small copy in code (no window)
 */

/*
 * Long copy in code (using register window and fp regs)
 */

.ci_copy_more:
	sethi	%hi(copyio_fault), %o3
	or	%o3, %lo(copyio_fault), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]
/*
 * Following code is for large copies. We know there are at
 * least FP_COPY bytes available. FP regs are used, so
 * we save registers and fp regs before starting
 */
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
	rd	%fprs, %g1		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g1, FPRS_FEF, %g1	!
test FEF, fprs.du = fprs.dl = 0 5642 bz,pt %ncc, .ci_fp_unused 5643 mov ASI_USER, %asi 5644 BST_FP_TOSTACK(%o3) 5645 ba .ci_fp_ready 5646.ci_fp_unused: 5647 prefetcha [%i0 + (1 * CACHE_LINE)]%asi, #one_read 5648 wr %g0, FPRS_FEF, %fprs ! fprs.fef = 1 5649.ci_fp_ready: 5650 rd %gsr, %l5 ! save %gsr value 5651 andcc %i1, 1, %o3 ! is dest byte aligned 5652 bnz,pt %ncc, .ci_big_d1 5653.ci_big_d1f: ! dest is now half word aligned 5654 andcc %i1, 2, %o3 5655 bnz,pt %ncc, .ci_big_d2 5656.ci_big_d2f: ! dest is now word aligned 5657 andcc %i1, 4, %o3 5658 bnz,pt %ncc, .ci_big_d4 5659.ci_big_d4f: ! dest is long word aligned 5660 andcc %i0, 7, %o3 ! is src long word aligned 5661 brnz,pt %o3, .ci_big_unal8 5662 prefetcha [%i0 + (2 * CACHE_LINE)]%asi, #one_read 5663 ! Src and dst are long word aligned 5664 ! align dst to 64 byte boundary 5665 andcc %i1, 0x3f, %o3 ! %o3 == 0 means dst is 64 byte aligned 5666 brz,pn %o3, .ci_al_to_64 5667 nop 5668 sub %o3, 64, %o3 ! %o3 has negative bytes to move 5669 add %i2, %o3, %i2 ! adjust remaining count 5670 andcc %o3, 8, %o4 ! odd long words to move? 5671 brz,pt %o4, .ci_al_to_16 5672 nop 5673 add %o3, 8, %o3 5674 ldxa [%i0]%asi, %o4 5675 add %i0, 8, %i0 ! increment src ptr 5676 add %i1, 8, %i1 ! increment dst ptr 5677 stx %o4, [%i1-8] 5678! Dest is aligned on 16 bytes, src 8 byte aligned 5679.ci_al_to_16: 5680 andcc %o3, 0x30, %o4 ! pair of long words to move? 5681 brz,pt %o4, .ci_al_to_64 5682 nop 5683.ci_al_mv_16: 5684 add %o3, 16, %o3 5685 ldxa [%i0]%asi, %o4 5686 stx %o4, [%i1] 5687 add %i0, 16, %i0 ! increment src ptr 5688 ldxa [%i0-8]%asi, %o4 5689 stx %o4, [%i1+8] 5690 andcc %o3, 0x30, %o4 5691 brnz,pt %o4, .ci_al_mv_16 5692 add %i1, 16, %i1 ! increment dst ptr 5693! Dest is aligned on 64 bytes, src 8 byte aligned 5694.ci_al_to_64: 5695 ! Determine source alignment 5696 ! to correct 8 byte offset 5697 andcc %i0, 32, %o3 5698 brnz,pn %o3, .ci_aln_1 5699 andcc %i0, 16, %o3 5700 brnz,pn %o3, .ci_aln_01 5701 andcc %i0, 8, %o3 5702 brz,pn %o3, .ci_aln_000 5703 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5704 ba .ci_aln_001 5705 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5706.ci_aln_01: 5707 brnz,pn %o3, .ci_aln_011 5708 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5709 ba .ci_aln_010 5710 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5711.ci_aln_1: 5712 andcc %i0, 16, %o3 5713 brnz,pn %o3, .ci_aln_11 5714 andcc %i0, 8, %o3 5715 brnz,pn %o3, .ci_aln_101 5716 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5717 ba .ci_aln_100 5718 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5719.ci_aln_11: 5720 brz,pn %o3, .ci_aln_110 5721 prefetcha [%i0 + (3 * CACHE_LINE)]%asi, #one_read 5722 5723.ci_aln_111: 5724! Alignment off by 8 bytes 5725 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5726 ldda [%i0]%asi, %d0 5727 add %i0, 8, %i0 5728 sub %i2, 8, %i2 5729 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5730 and %i2, 0x7f, %i2 ! residue bytes in %i2 5731 sub %i1, %i0, %i1 5732.ci_aln_111_loop: 5733 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5734 subcc %o3, 64, %o3 5735 fmovd %d16, %d2 5736 fmovd %d18, %d4 5737 fmovd %d20, %d6 5738 fmovd %d22, %d8 5739 fmovd %d24, %d10 5740 fmovd %d26, %d12 5741 fmovd %d28, %d14 5742 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5743 stda %d0,[%i0+%i1]ASI_BLK_P 5744 add %i0, 64, %i0 5745 fmovd %d30, %d0 5746 bgt,pt %ncc, .ci_aln_111_loop 5747 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5748 add %i1, %i0, %i1 5749 5750 std %d0, [%i1] 5751 ba .ci_remain_stuff 5752 add %i1, 8, %i1 5753 ! 
END OF aln_111 5754 5755.ci_aln_110: 5756! Alignment off by 16 bytes 5757 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5758 ldda [%i0]%asi, %d0 5759 ldda [%i0+8]%asi, %d2 5760 add %i0, 16, %i0 5761 sub %i2, 16, %i2 5762 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5763 and %i2, 0x7f, %i2 ! residue bytes in %i2 5764 sub %i1, %i0, %i1 5765.ci_aln_110_loop: 5766 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5767 subcc %o3, 64, %o3 5768 fmovd %d16, %d4 5769 fmovd %d18, %d6 5770 fmovd %d20, %d8 5771 fmovd %d22, %d10 5772 fmovd %d24, %d12 5773 fmovd %d26, %d14 5774 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5775 stda %d0,[%i0+%i1]ASI_BLK_P 5776 add %i0, 64, %i0 5777 fmovd %d28, %d0 5778 fmovd %d30, %d2 5779 bgt,pt %ncc, .ci_aln_110_loop 5780 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5781 add %i1, %i0, %i1 5782 5783 std %d0, [%i1] 5784 std %d2, [%i1+8] 5785 ba .ci_remain_stuff 5786 add %i1, 16, %i1 5787 ! END OF aln_110 5788 5789.ci_aln_101: 5790! Alignment off by 24 bytes 5791 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5792 ldda [%i0]%asi, %d0 5793 ldda [%i0+8]%asi, %d2 5794 ldda [%i0+16]%asi, %d4 5795 add %i0, 24, %i0 5796 sub %i2, 24, %i2 5797 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5798 and %i2, 0x7f, %i2 ! residue bytes in %i2 5799 sub %i1, %i0, %i1 5800.ci_aln_101_loop: 5801 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5802 subcc %o3, 64, %o3 5803 fmovd %d16, %d6 5804 fmovd %d18, %d8 5805 fmovd %d20, %d10 5806 fmovd %d22, %d12 5807 fmovd %d24, %d14 5808 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5809 stda %d0,[%i0+%i1]ASI_BLK_P 5810 add %i0, 64, %i0 5811 fmovd %d26, %d0 5812 fmovd %d28, %d2 5813 fmovd %d30, %d4 5814 bgt,pt %ncc, .ci_aln_101_loop 5815 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5816 add %i1, %i0, %i1 5817 5818 std %d0, [%i1] 5819 std %d2, [%i1+8] 5820 std %d4, [%i1+16] 5821 ba .ci_remain_stuff 5822 add %i1, 24, %i1 5823 ! END OF aln_101 5824 5825.ci_aln_100: 5826! Alignment off by 32 bytes 5827 ldda [%i0]%asi, %d0 5828 ldda [%i0+8]%asi, %d2 5829 ldda [%i0+16]%asi,%d4 5830 ldda [%i0+24]%asi,%d6 5831 add %i0, 32, %i0 5832 sub %i2, 32, %i2 5833 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5834 and %i2, 0x7f, %i2 ! residue bytes in %i2 5835 sub %i1, %i0, %i1 5836.ci_aln_100_loop: 5837 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5838 subcc %o3, 64, %o3 5839 fmovd %d16, %d8 5840 fmovd %d18, %d10 5841 fmovd %d20, %d12 5842 fmovd %d22, %d14 5843 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5844 stda %d0,[%i0+%i1]ASI_BLK_P 5845 add %i0, 64, %i0 5846 fmovd %d24, %d0 5847 fmovd %d26, %d2 5848 fmovd %d28, %d4 5849 fmovd %d30, %d6 5850 bgt,pt %ncc, .ci_aln_100_loop 5851 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5852 add %i1, %i0, %i1 5853 5854 std %d0, [%i1] 5855 std %d2, [%i1+8] 5856 std %d4, [%i1+16] 5857 std %d6, [%i1+24] 5858 ba .ci_remain_stuff 5859 add %i1, 32, %i1 5860 ! END OF aln_100 5861 5862.ci_aln_011: 5863! Alignment off by 40 bytes 5864 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5865 ldda [%i0]%asi, %d0 5866 ldda [%i0+8]%asi, %d2 5867 ldda [%i0+16]%asi, %d4 5868 ldda [%i0+24]%asi, %d6 5869 ldda [%i0+32]%asi, %d8 5870 add %i0, 40, %i0 5871 sub %i2, 40, %i2 5872 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5873 and %i2, 0x7f, %i2 ! residue bytes in %i2 5874 sub %i1, %i0, %i1 5875.ci_aln_011_loop: 5876 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5877 subcc %o3, 64, %o3 5878 fmovd %d16, %d10 5879 fmovd %d18, %d12 5880 fmovd %d20, %d14 5881 stxa %g0,[%i0+%i1]ASI_STBI_P ! 
block initializing store 5882 stda %d0,[%i0+%i1]ASI_BLK_P 5883 add %i0, 64, %i0 5884 fmovd %d22, %d0 5885 fmovd %d24, %d2 5886 fmovd %d26, %d4 5887 fmovd %d28, %d6 5888 fmovd %d30, %d8 5889 bgt,pt %ncc, .ci_aln_011_loop 5890 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5891 add %i1, %i0, %i1 5892 5893 std %d0, [%i1] 5894 std %d2, [%i1+8] 5895 std %d4, [%i1+16] 5896 std %d6, [%i1+24] 5897 std %d8, [%i1+32] 5898 ba .ci_remain_stuff 5899 add %i1, 40, %i1 5900 ! END OF aln_011 5901 5902.ci_aln_010: 5903! Alignment off by 48 bytes 5904 ldda [%i0]%asi, %d0 5905 ldda [%i0+8]%asi, %d2 5906 ldda [%i0+16]%asi, %d4 5907 ldda [%i0+24]%asi, %d6 5908 ldda [%i0+32]%asi, %d8 5909 ldda [%i0+40]%asi, %d10 5910 add %i0, 48, %i0 5911 sub %i2, 48, %i2 5912 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5913 and %i2, 0x7f, %i2 ! residue bytes in %i2 5914 sub %i1, %i0, %i1 5915.ci_aln_010_loop: 5916 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5917 subcc %o3, 64, %o3 5918 fmovd %d16, %d12 5919 fmovd %d18, %d14 5920 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5921 stda %d0,[%i0+%i1]ASI_BLK_P 5922 add %i0, 64, %i0 5923 fmovd %d20, %d0 5924 fmovd %d22, %d2 5925 fmovd %d24, %d4 5926 fmovd %d26, %d6 5927 fmovd %d28, %d8 5928 fmovd %d30, %d10 5929 bgt,pt %ncc, .ci_aln_010_loop 5930 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5931 add %i1, %i0, %i1 5932 5933 std %d0, [%i1] 5934 std %d2, [%i1+8] 5935 std %d4, [%i1+16] 5936 std %d6, [%i1+24] 5937 std %d8, [%i1+32] 5938 std %d10, [%i1+40] 5939 ba .ci_remain_stuff 5940 add %i1, 48, %i1 5941 ! END OF aln_010 5942 5943.ci_aln_001: 5944! Alignment off by 56 bytes 5945 ldda [%i0]%asi, %d0 5946 ldda [%i0+8]%asi, %d2 5947 ldda [%i0+16]%asi, %d4 5948 ldda [%i0+24]%asi, %d6 5949 ldda [%i0+32]%asi, %d8 5950 ldda [%i0+40]%asi, %d10 5951 ldda [%i0+48]%asi, %d12 5952 add %i0, 56, %i0 5953 sub %i2, 56, %i2 5954 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5955 and %i2, 0x7f, %i2 ! residue bytes in %i2 5956 sub %i1, %i0, %i1 5957.ci_aln_001_loop: 5958 ldda [%i0]ASI_BLK_AIUS,%d16 ! block load 5959 subcc %o3, 64, %o3 5960 fmovd %d16, %d14 5961 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5962 stda %d0,[%i0+%i1]ASI_BLK_P 5963 add %i0, 64, %i0 5964 fmovd %d18, %d0 5965 fmovd %d20, %d2 5966 fmovd %d22, %d4 5967 fmovd %d24, %d6 5968 fmovd %d26, %d8 5969 fmovd %d28, %d10 5970 fmovd %d30, %d12 5971 bgt,pt %ncc, .ci_aln_001_loop 5972 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5973 add %i1, %i0, %i1 5974 5975 std %d0, [%i1] 5976 std %d2, [%i1+8] 5977 std %d4, [%i1+16] 5978 std %d6, [%i1+24] 5979 std %d8, [%i1+32] 5980 std %d10, [%i1+40] 5981 std %d12, [%i1+48] 5982 ba .ci_remain_stuff 5983 add %i1, 56, %i1 5984 ! END OF aln_001 5985 5986.ci_aln_000: 5987 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5988 andn %i2, 0x7f, %o3 ! %o3 is multiple of 2*block size 5989 and %i2, 0x7f, %i2 ! residue bytes in %i2 5990 sub %i1, %i0, %i1 5991.ci_aln_000_loop: 5992 ldda [%i0]ASI_BLK_AIUS,%d0 5993 subcc %o3, 64, %o3 5994 stxa %g0,[%i0+%i1]ASI_STBI_P ! block initializing store 5995 stda %d0,[%i0+%i1]ASI_BLK_P 5996 add %i0, 64, %i0 5997 bgt,pt %ncc, .ci_aln_000_loop 5998 prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read 5999 add %i1, %i0, %i1 6000 6001 ! END OF aln_000 6002 6003.ci_remain_stuff: 6004 subcc %i2, 31, %i2 ! adjust length to allow cc test 6005 ble,pt %ncc, .ci_aln_31 6006 nop 6007.ci_aln_32: 6008 ldxa [%i0]%asi, %o4 ! move 32 bytes 6009 subcc %i2, 32, %i2 ! 
.ci_remain_stuff:
	subcc	%i2, 31, %i2	! adjust length to allow cc test
	ble,pt	%ncc, .ci_aln_31
	nop
.ci_aln_32:
	ldxa	[%i0]%asi, %o4	! move 32 bytes
	subcc	%i2, 32, %i2	! decrement length count by 32
	stx	%o4, [%i1]
	ldxa	[%i0+8]%asi, %o4
	stx	%o4, [%i1+8]
	ldxa	[%i0+16]%asi, %o4
	add	%i0, 32, %i0	! increase src ptr by 32
	stx	%o4, [%i1+16]
	ldxa	[%i0-8]%asi, %o4
	add	%i1, 32, %i1	! increase dst ptr by 32
	bgu,pt	%ncc, .ci_aln_32	! repeat if at least 32 bytes left
	stx	%o4, [%i1-8]
.ci_aln_31:
	addcc	%i2, 24, %i2	! adjust count to be off by 7
	ble,pt	%ncc, .ci_aln_7	! skip if 7 or fewer bytes left
	nop
.ci_aln_15:
	ldxa	[%i0]%asi, %o4	! move 8 bytes
	add	%i0, 8, %i0	! increase src ptr by 8
	subcc	%i2, 8, %i2	! decrease count by 8
	add	%i1, 8, %i1	! increase dst ptr by 8
	bgu,pt	%ncc, .ci_aln_15
	stx	%o4, [%i1-8]
.ci_aln_7:
	addcc	%i2, 7, %i2	! finish adjustment of remaining count
	bz,pt	%ncc, .ci_exit	! exit if finished
	cmp	%i2, 4
	blt,pt	%ncc, .ci_unaln3x	! skip if less than 4 bytes left
	nop
	lda	[%i0]%asi, %o4	! move 4 bytes
	add	%i0, 4, %i0	! increase src ptr by 4
	add	%i1, 4, %i1	! increase dst ptr by 4
	subcc	%i2, 4, %i2	! decrease count by 4
	bnz	.ci_unaln3x
	stw	%o4, [%i1-4]
	ba	.ci_exit
	nop

	! destination alignment code
.ci_big_d1:
	lduba	[%i0]%asi, %o4	! move a byte
	add	%i0, 1, %i0
	stb	%o4, [%i1]
	add	%i1, 1, %i1
	andcc	%i1, 2, %o3
	bz,pt	%ncc, .ci_big_d2f
	sub	%i2, 1, %i2
.ci_big_d2:	! dest is now at least half word aligned
	lduba	[%i0]%asi, %o4	! move a half-word (src align unknown)
	lduba	[%i0+1]%asi, %o3
	add	%i0, 2, %i0
	sll	%o4, 8, %o4	! position
	or	%o4, %o3, %o4	! merge
	sth	%o4, [%i1]
	add	%i1, 2, %i1
	andcc	%i1, 4, %o3
	bz,pt	%ncc, .ci_big_d4f
	sub	%i2, 2, %i2
.ci_big_d4:	! dest is at least word aligned
	nop
	lduba	[%i0]%asi, %o4	! move a word (src align unknown)
	lduba	[%i0+1]%asi, %o3
	sll	%o4, 24, %o4	! position
	sll	%o3, 16, %o3	! position
	or	%o4, %o3, %o3	! merge
	lduba	[%i0+2]%asi, %o4
	sll	%o4, 8, %o4	! position
	or	%o4, %o3, %o3	! merge
	lduba	[%i0+3]%asi, %o4
	or	%o4, %o3, %o4	! merge
	stw	%o4, [%i1]	! store four bytes
	add	%i0, 4, %i0	! adjust src by 4
	add	%i1, 4, %i1	! adjust dest by 4
	ba	.ci_big_d4f
	sub	%i2, 4, %i2	! adjust count by 4


	! Dst is on 8 byte boundary; src is not;
.ci_big_unal8:
	andcc	%i1, 0x3f, %o3	! is dst 64-byte block aligned?
	bz	%ncc, .ci_unalnsrc
	sub	%o3, 64, %o3	! %o3 will be multiple of 8
	neg	%o3		! bytes until dest is 64 byte aligned
	sub	%i2, %o3, %i2	! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%i0, 0x1, %o4
	bnz	%ncc, .ci_unalnbyte	! check for byte alignment
	nop
	andcc	%i0, 2, %o4	! check for half word alignment
	bnz	%ncc, .ci_unalnhalf
	nop
	! Src is word aligned, move bytes until dest 64 byte aligned
.ci_unalnword:
	lda	[%i0]%asi, %o4	! load 4 bytes
	stw	%o4, [%i1]	! and store 4 bytes
	lda	[%i0+4]%asi, %o4	! load 4 bytes
	add	%i0, 8, %i0	! increase src ptr by 8
	stw	%o4, [%i1+4]	! and store 4 bytes
	subcc	%o3, 8, %o3	! decrease count by 8
	bnz	%ncc, .ci_unalnword
	add	%i1, 8, %i1	! increase dst ptr by 8
	ba	.ci_unalnsrc
	nop

	! Src is half-word aligned, move bytes until dest 64 byte aligned
.ci_unalnhalf:
	lduha	[%i0]%asi, %o4	! load 2 bytes
	sllx	%o4, 32, %i3	! shift left
	lduwa	[%i0+2]%asi, %o4
	or	%o4, %i3, %i3
	sllx	%i3, 16, %i3
	lduha	[%i0+6]%asi, %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1]
	add	%i0, 8, %i0
	subcc	%o3, 8, %o3
	bnz	%ncc, .ci_unalnhalf
	add	%i1, 8, %i1
	ba	.ci_unalnsrc
	nop
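/*
 * With a half-word-aligned source (.ci_unalnhalf above), each aligned
 * 8-byte store is assembled from a 2/4/2-byte load pattern.  Since the
 * source address is 2 mod 4 here, the middle 4-byte load at offset 2
 * is itself word aligned.  The packing step in C (big-endian, as on
 * SPARC; names are ours):
 *
 *	#include <stdint.h>
 *
 *	static uint64_t
 *	pack_2_4_2(const char *p)	// p is 2 mod 4 aligned
 *	{
 *		const uint16_t *h = (const uint16_t *)p;
 *		uint32_t w = *(const uint32_t *)(p + 2);
 *		uint64_t v;
 *
 *		v = (uint64_t)h[0] << 32;	// bytes 0-1
 *		v |= w;				// bytes 2-5
 *		v <<= 16;
 *		v |= h[3];			// bytes 6-7
 *		return (v);
 *	}
 */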
	! Src is Byte aligned, move bytes until dest 64 byte aligned
.ci_unalnbyte:
	sub	%i1, %i0, %i1	! share pointer advance
.ci_unalnbyte_loop:
	lduba	[%i0]%asi, %o4
	sllx	%o4, 56, %i3
	lduha	[%i0+1]%asi, %o4
	sllx	%o4, 40, %o4
	or	%o4, %i3, %i3
	lduha	[%i0+3]%asi, %o4
	sllx	%o4, 24, %o4
	or	%o4, %i3, %i3
	lduha	[%i0+5]%asi, %o4
	sllx	%o4, 8, %o4
	or	%o4, %i3, %i3
	lduba	[%i0+7]%asi, %o4
	or	%o4, %i3, %i3
	stx	%i3, [%i1+%i0]
	subcc	%o3, 8, %o3
	bnz	%ncc, .ci_unalnbyte_loop
	add	%i0, 8, %i0
	add	%i1, %i0, %i1	! restore pointer

	! Destination is now block (64 byte aligned), src is not 8 byte aligned
.ci_unalnsrc:
	andn	%i2, 0x3f, %i3	! %i3 is multiple of block size
	and	%i2, 0x3f, %i2	! residue bytes in %i2
	add	%i2, 64, %i2	! ensure we don't load beyond
	sub	%i3, 64, %i3	! end of source buffer

	andn	%i0, 0x3f, %o4	! %o4 has block aligned src address
	prefetcha [%o4 + (3 * CACHE_LINE)]%asi, #one_read
	alignaddr %i0, %g0, %g0	! generate %gsr
	add	%i0, %i3, %i0	! advance %i0 to after blocks
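/*
 * The eight .ci_unaln_* loops selected below differ only in how many
 * trailing doublewords of the first source block they must preload.
 * All of them run the same software pipeline: alignaddr has already
 * latched the source byte offset in %gsr, each pass block-loads the
 * next aligned 64 bytes, and faligndata extracts an aligned doubleword
 * from every adjacent register pair.  The extraction itself, in C,
 * with an explicit shift pair standing in for faligndata (names are
 * ours):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	// Copy nblks 64-byte blocks to aligned dst from asrc + off,
 *	// where asrc is 8-byte aligned and 0 < off < 8.
 *	static void
 *	unaln_blocks(const uint64_t *asrc, int off, uint64_t *dst,
 *	    size_t nblks)
 *	{
 *		int ls = off * 8, rs = 64 - ls;		// bit shifts
 *		uint64_t prev = *asrc++;		// carried word
 *		int i;
 *
 *		while (nblks-- != 0) {
 *			for (i = 0; i < 8; i++) {
 *				uint64_t next = asrc[i];
 *				// "faligndata prev, next"
 *				*dst++ = (prev << ls) | (next >> rs);
 *				prev = next;
 *			}
 *			asrc += 8;
 *		}
 *	}
 */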
	!
	! Determine source alignment to correct 8 byte offset
	andcc	%i0, 0x20, %o3
	brnz,pn	%o3, .ci_unaln_1
	andcc	%i0, 0x10, %o3
	brnz,pn	%o3, .ci_unaln_01
	andcc	%i0, 0x08, %o3
	brz,a	%o3, .ci_unaln_000
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_001
	nop
.ci_unaln_01:
	brnz,a	%o3, .ci_unaln_011
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_010
	nop
.ci_unaln_1:
	brnz,pn	%o3, .ci_unaln_11
	andcc	%i0, 0x08, %o3
	brnz,a	%o3, .ci_unaln_101
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_100
	nop
.ci_unaln_11:
	brz,pn	%o3, .ci_unaln_110
	prefetcha [%i0 + (4 * CACHE_LINE)]%asi, #one_read

.ci_unaln_111:
	ldda	[%o4+56]%asi, %d14
.ci_unaln_111_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d14, %d16, %d48
	faligndata %d16, %d18, %d50
	faligndata %d18, %d20, %d52
	faligndata %d20, %d22, %d54
	faligndata %d22, %d24, %d56
	faligndata %d24, %d26, %d58
	faligndata %d26, %d28, %d60
	faligndata %d28, %d30, %d62
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_111_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_110:
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_110_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d12, %d14, %d48
	faligndata %d14, %d16, %d50
	faligndata %d16, %d18, %d52
	faligndata %d18, %d20, %d54
	faligndata %d20, %d22, %d56
	faligndata %d22, %d24, %d58
	faligndata %d24, %d26, %d60
	faligndata %d26, %d28, %d62
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_110_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_101:
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_101_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d10, %d12, %d48
	faligndata %d12, %d14, %d50
	faligndata %d14, %d16, %d52
	faligndata %d16, %d18, %d54
	faligndata %d18, %d20, %d56
	faligndata %d20, %d22, %d58
	faligndata %d22, %d24, %d60
	faligndata %d24, %d26, %d62
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_101_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_100:
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_100_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d8, %d10, %d48
	faligndata %d10, %d12, %d50
	faligndata %d12, %d14, %d52
	faligndata %d14, %d16, %d54
	faligndata %d16, %d18, %d56
	faligndata %d18, %d20, %d58
	faligndata %d20, %d22, %d60
	faligndata %d22, %d24, %d62
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_100_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_011:
	ldda	[%o4+24]%asi, %d6
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_011_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d6, %d8, %d48
	faligndata %d8, %d10, %d50
	faligndata %d10, %d12, %d52
	faligndata %d12, %d14, %d54
	faligndata %d14, %d16, %d56
	faligndata %d16, %d18, %d58
	faligndata %d18, %d20, %d60
	faligndata %d20, %d22, %d62
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_011_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_010:
	ldda	[%o4+16]%asi, %d4
	ldda	[%o4+24]%asi, %d6
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_010_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d4, %d6, %d48
	faligndata %d6, %d8, %d50
	faligndata %d8, %d10, %d52
	faligndata %d10, %d12, %d54
	faligndata %d12, %d14, %d56
	faligndata %d14, %d16, %d58
	faligndata %d16, %d18, %d60
	faligndata %d18, %d20, %d62
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_010_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_001:
	ldda	[%o4+8]%asi, %d2
	ldda	[%o4+16]%asi, %d4
	ldda	[%o4+24]%asi, %d6
	ldda	[%o4+32]%asi, %d8
	ldda	[%o4+40]%asi, %d10
	ldda	[%o4+48]%asi, %d12
	ldda	[%o4+56]%asi, %d14
.ci_unaln_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_001_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read
	ba	.ci_unaln_done
	nop

.ci_unaln_000:
	ldda	[%o4]ASI_BLK_AIUS, %d0
.ci_unaln_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_AIUS, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%i1]ASI_BLK_P
	subcc	%i3, 64, %i3
	add	%i1, 64, %i1
	bgu,pt	%ncc, .ci_unaln_000_loop
	prefetcha [%o4 + (4 * CACHE_LINE)]%asi, #one_read

.ci_unaln_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%i2, 15
	bleu	%ncc, .ci_unaln_short

	andn	%i2, 0x7, %i3	! %i3 is multiple of 8
	and	%i2, 0x7, %i2	! residue bytes in %i2
	add	%i2, 8, %i2
	sub	%i3, 8, %i3	! ensure we don't load past end of src
	andn	%i0, 0x7, %o4	! %o4 has long word aligned src address
	add	%i0, %i3, %i0	! advance %i0 to after multiple of 8
	ldda	[%o4]%asi, %d0	! fetch partial word
.ci_unaln_by8:
	ldda	[%o4+8]%asi, %d2
	add	%o4, 8, %o4
	faligndata %d0, %d2, %d16
	subcc	%i3, 8, %i3
	std	%d16, [%i1]
	fmovd	%d2, %d0
	bgu,pt	%ncc, .ci_unaln_by8
	add	%i1, 8, %i1

.ci_unaln_short:
	cmp	%i2, 8
	blt,pt	%ncc, .ci_unalnfin
	nop
	lduba	[%i0]%asi, %o4
	sll	%o4, 24, %o3
	lduba	[%i0+1]%asi, %o4
	sll	%o4, 16, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+2]%asi, %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+3]%asi, %o4
	or	%o4, %o3, %o3
	stw	%o3, [%i1]
	lduba	[%i0+4]%asi, %o4
	sll	%o4, 24, %o3
	lduba	[%i0+5]%asi, %o4
	sll	%o4, 16, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+6]%asi, %o4
	sll	%o4, 8, %o4
	or	%o4, %o3, %o3
	lduba	[%i0+7]%asi, %o4
	or	%o4, %o3, %o3
	stw	%o3, [%i1+4]
	add	%i0, 8, %i0
	add	%i1, 8, %i1
	sub	%i2, 8, %i2
.ci_unalnfin:
	cmp	%i2, 4
	blt,pt	%ncc, .ci_unalnz
	tst	%i2
	lduba	[%i0]%asi, %o3	! read byte
	subcc	%i2, 4, %i2	! reduce count by 4
	sll	%o3, 24, %o3	! position
	lduba	[%i0+1]%asi, %o4
	sll	%o4, 16, %o4	! position
	or	%o4, %o3, %o3	! merge
	lduba	[%i0+2]%asi, %o4
	sll	%o4, 8, %o4	! position
	or	%o4, %o3, %o3	! merge
	add	%i1, 4, %i1	! advance dst by 4
	lduba	[%i0+3]%asi, %o4
	add	%i0, 4, %i0	! advance src by 4
	or	%o4, %o3, %o4	! merge
	bnz,pt	%ncc, .ci_unaln3x
	stw	%o4, [%i1-4]
	ba	.ci_exit
	nop
.ci_unalnz:
	bz,pt	%ncc, .ci_exit
	wr	%l5, %g0, %gsr	! restore %gsr
.ci_unaln3x:	! Exactly 1, 2, or 3 bytes remain
	subcc	%i2, 1, %i2	! reduce count for cc test
	lduba	[%i0]%asi, %o4	! load one byte
	bz,pt	%ncc, .ci_exit
	stb	%o4, [%i1]	! store one byte
	lduba	[%i0+1]%asi, %o4	! load second byte
	subcc	%i2, 1, %i2
	bz,pt	%ncc, .ci_exit
	stb	%o4, [%i1+1]	! store second byte
	lduba	[%i0+2]%asi, %o4	! load third byte
	stb	%o4, [%i1+2]	! store third byte
.ci_exit:
	brnz	%g1, .ci_fp_restore
	nop
	FZERO
	wr	%g1, %g0, %fprs
	ba,pt	%ncc, .ci_ex2
	membar	#Sync
.ci_fp_restore:
	BLD_FP_FROMSTACK(%o4)
.ci_ex2:
	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g0, 0, %o0

.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0

#else	/* NIAGARA_IMPL */
.do_copyin:
	!
	! Check the length and bail if zero.
	!
	tst	%o2
	bnz,pt	%ncc, 1f
	nop
	retl
	clr	%o0
1:
	sethi	%hi(copyio_fault), %o4
	or	%o4, %lo(copyio_fault), %o4
	sethi	%hi(copyio_fault_nowindow), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault_nowindow), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dci_ns
	or	%o0, %o1, %o3
	!
	! What was previously ".small_copyin"
	!
.dcibcp:
	sub	%g0, %o2, %o3	! setup for copy loop
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcicl:
	stb	%o4, [%o1 + %o3]
	inccc	%o3
	bl,a,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0
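/*
 * Restating the loop above in C: both pointers are parked at the ends
 * of their buffers and %o3 holds the negated length, so one increment
 * advances both streams and doubles as the loop test (hypothetical
 * names; the length is known to be non-zero here):
 *
 *	static void
 *	copy_by_negative_index(const char *src, char *dst, long len)
 *	{
 *		const char *send = src + len;
 *		char *dend = dst + len;
 *		long i = -len;
 *
 *		do {
 *			dend[i] = send[i];
 *		} while (++i < 0);
 *	}
 */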
	!
	! Try aligned copies from here.
	!
.dci_ns:
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller, or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,a,pt	%icc, .dcih8
	btst	7, %o3
	!
	! We're single byte aligned.
	!
	sethi	%hi(hw_copy_limit_1), %o3
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	!
	! Is HW copy on? If not do everything byte for byte.
	!
	tst	%o3
	bz,pn	%icc, .dcibcp
	subcc	%o3, %o2, %o3
	!
	! Are we bigger than the HW limit? If not
	! go to byte for byte.
	!
	bge,pt	%ncc, .dcibcp
	nop
	!
	! We're big enough and copy is on. Do it with HW.
	!
	ba,pt	%ncc, .big_copyin
	nop
.dcih8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcih4
	btst	3, %o3
	!
	! We're eight byte aligned.
	!
	sethi	%hi(hw_copy_limit_8), %o3
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis8
	subcc	%o3, %o2, %o3
	bge	%ncc, .dcis8
	nop
	ba,pt	%ncc, .big_copyin
	nop
.dcis8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didebc
	srl	%o2, 3, %o2	! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcih4:
	bnz	%ncc, .dcih2
	sethi	%hi(hw_copy_limit_4), %o3
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis4
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is less than or equal to hw_copy_limit_4.
	!
	bge	%ncc, .dcis4
	nop
	ba,pt	%ncc, .big_copyin
	nop
.dcis4:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didfbc
	srl	%o2, 2, %o2	! Number of 4 byte chunks to copy
.dcih2:
	!
	! We're two byte aligned. Check for "smallness"
	! done in delay at .dcih4
	!
	bleu,pt	%ncc, .dcis2
	sethi	%hi(hw_copy_limit_2), %o3
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis2
	subcc	%o3, %o2, %o3
	!
	! Are we larger than the HW limit?
	!
	bge	%ncc, .dcis2
	nop
	!
	! HW assist is on and we're large enough to use it.
	!
	ba,pt	%ncc, .big_copyin
	nop
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
.dcis2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didtbc
	srl	%o2, 1, %o2	! Number of 2 byte chunks to copy
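/*
 * The .dci_ns/.dcih8/.dcih4/.dcih2 chain above implements one rule per
 * alignment class: a zero hw_copy_limit_N means hardware assist is
 * off, and the block-copy path is taken only when the byte count is
 * strictly greater than the limit.  The decision in C (names are ours;
 * the limits are the DGDEF tunables defined near the end of this
 * file):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	extern unsigned int hw_copy_limit_1, hw_copy_limit_2;
 *	extern unsigned int hw_copy_limit_4, hw_copy_limit_8;
 *
 *	static int
 *	use_hw_copy(uintptr_t src, uintptr_t dst, size_t count)
 *	{
 *		uintptr_t a = src | dst;
 *		unsigned int limit;
 *
 *		if (a & 1)
 *			limit = hw_copy_limit_1;
 *		else if (a & 7)		// 2- or 4-byte aligned
 *			limit = (a & 3) ? hw_copy_limit_2 :
 *			    hw_copy_limit_4;
 *		else
 *			limit = hw_copy_limit_8;
 *		return (limit != 0 && count > limit);
 *	}
 */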
	!
.small_copyin:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big copyin that will cause us to forgo the HW assisted copies
	! and bounce back to a non-hw assisted copy. This dispatches
	! those copies. Note that we branch around this in the main line
	! code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcibcp	! Most likely
	btst	7, %o3
	bz	%icc, .dcis8
	btst	3, %o3
	bz	%icc, .dcis4
	nop
	ba,pt	%ncc, .dcis2
	nop
	!
	! Eight byte aligned copies. A steal from the original .small_copyin
	! with modifications. %o2 is number of 8 byte chunks to copy. When
	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
	! to copy.
	!
	.align 32
.didebc:
	ldxa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	stx	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Most 8 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
	!
	.align 32
.didfbc:
	lduwa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	st	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Most 4 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
	! copy.
	!
	.align 32
.didtbc:
	lduha	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	sth	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Most 2 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Deal with the last byte
	!
	lduba	[%o0 + %o3]ASI_USER, %o4
	stb	%o4, [%o1 + %o3]
.dcifh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

.big_copyin:
	! We're going off to do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	stn	%o4, [THREAD_REG + T_LOFAULT]

	! Copies that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to less
	! than 128 bytes.
	save	%sp, -SA(MINFRAME), %sp
.do_blockcopyin:

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3	! is dst block aligned
	bz	%ncc, copyin_blalign	! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3		! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2	! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .ci_alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .ci_alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .ci_alhlfwdcp
	nop

	! 1B aligned
1:	lduba	[%i1]ASI_USER, %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	copyin_blalign
	nop

	! dst & src 4B aligned
.ci_alwdcp:
	lda	[%i1]ASI_USER, %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .ci_alwdcp
	add	%i0, 0x4, %i0

	ba	copyin_blalign
	nop

	! dst & src 2B aligned
.ci_alhlfwdcp:
	lduha	[%i1]ASI_USER, %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .ci_alhlfwdcp
	add	%i0, 0x2, %i0

	ba	copyin_blalign
	nop

	! dst & src 8B aligned
.ci_alewdcp:
	ldxa	[%i1]ASI_USER, %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .ci_alewdcp
	add	%i0, 0x8, %i0

copyin_blalign:
	andn	%i2, 0x3f, %i3	! %i3 count is multiple of block size
	sub	%i2, %i3, %i2	! Residue bytes in %i2

	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

	andcc	%i1, 0xf, %o2	! is src quadword aligned
	bz,pn	%xcc, .ci_blkcpy	! src offset in %o2 (last 4-bits)
	nop
	cmp	%o2, 0x8
	bg	.ci_upper_double
	nop
	bl	.ci_lower_double
	nop

	! Falls through when source offset is equal to 8 i.e.
	! source is double word aligned.
	! In this case no shift/merge of data is required

	sub	%i1, %o2, %i1	! align the src at 16 bytes.
	andn	%i1, 0x3f, %l0	! %l0 has block aligned source
	prefetcha [%l0]ASI_USER, #one_read
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	add	%l0, 0x40, %l0
.ci_loop0:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4

	prefetcha [%l0]ASI_USER, #one_read

	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2

	stxa	%l5, [%i0+0x10]%asi
	stxa	%l2, [%i0+0x18]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4

	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2

	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .ci_loop0
	add	%i0, 0x40, %i0
	ba	.ci_blkdone
	add	%i1, %o2, %i1	! increment the source by src offset
				! the src offset was stored in %o2
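/*
 * .ci_lower_double and .ci_upper_double below handle a source that is
 * not quadword aligned: every 16-byte quad load straddles the wanted
 * data, so the ALIGN_DATA macro merges three consecutive doublewords
 * into two aligned ones with the shift pair computed on entry (%o0
 * left, %o1 = 64 - %o0 right).  The merge in C (big-endian; names are
 * ours):
 *
 *	#include <stdint.h>
 *
 *	static void
 *	align_data(uint64_t d1, uint64_t d2, uint64_t d3,
 *	    int ls, int rs,			// ls + rs == 64
 *	    uint64_t *out1, uint64_t *out2)
 *	{
 *		*out1 = (d1 << ls) | (d2 >> rs);
 *		*out2 = (d2 << ls) | (d3 >> rs);
 *	}
 */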
.ci_lower_double:

	sub	%i1, %o2, %i1	! align the src at 16 bytes.
	sll	%o2, 3, %o0	! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1	! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0	! %l0 has block aligned source
	prefetcha [%l0]ASI_USER, #one_read
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l2
							! and %l3 has complete
							! data
	add	%l0, 0x40, %l0
.ci_loop1:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has partial data
							! for this read.
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
							! into %l2 and %l3

	prefetcha [%l0]ASI_USER, #one_read

	stxa	%l2, [%i0+0x0]%asi
	stxa	%l3, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
							! %l4 from previous read
							! into %l4 and %l5
	stxa	%l4, [%i0+0x10]%asi
	stxa	%l5, [%i0+0x18]%asi

	! Repeat the same for next 32 bytes.

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)

	stxa	%l2, [%i0+0x20]%asi
	stxa	%l3, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)

	stxa	%l4, [%i0+0x30]%asi
	stxa	%l5, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .ci_loop1
	add	%i0, 0x40, %i0
	ba	.ci_blkdone
	add	%i1, %o2, %i1	! increment the source by src offset
				! the src offset was stored in %o2

.ci_upper_double:

	sub	%i1, %o2, %i1	! align the src at 16 bytes.
	sub	%o2, 0x8, %o0
	sll	%o0, 3, %o0	! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1	! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0	! %l0 has block aligned source
	prefetcha [%l0]ASI_USER, #one_read
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2	! partial data in %l3
							! for this read and
							! no data in %l2
	add	%l0, 0x40, %l0
.ci_loop2:
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4	! %l4 has complete data
							! and %l5 has partial
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
							! into %l3 and %l4
	prefetcha [%l0]ASI_USER, #one_read

	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
							! %l5 from previous read
							! into %l5 and %l2

	stxa	%l5, [%i0+0x10]%asi
	stxa	%l2, [%i0+0x18]%asi

	! Repeat the same for next 32 bytes.

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)

	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)

	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, .ci_loop2
	add	%i0, 0x40, %i0
	ba	.ci_blkdone
	add	%i1, %o2, %i1	! increment the source by src offset
				! the src offset was stored in %o2


	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.ci_blkcpy:

	andn	%i1, 0x3f, %o0	! %o0 has block aligned source
	prefetcha [%o0]ASI_USER, #one_read
	add	%o0, 0x40, %o0
1:
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2
	add	%i1, 0x10, %i1

	prefetcha [%o0]ASI_USER, #one_read

	stxa	%l0, [%i0+0x0]%asi

	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4
	add	%i1, 0x10, %i1
	ldda	[%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6
	add	%i1, 0x10, %i1

	stxa	%l1, [%i0+0x8]%asi
	stxa	%l2, [%i0+0x10]%asi
	stxa	%l3, [%i0+0x18]%asi
	stxa	%l4, [%i0+0x20]%asi
	stxa	%l5, [%i0+0x28]%asi
	stxa	%l6, [%i0+0x30]%asi
	stxa	%l7, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, 1b
	add	%i0, 0x40, %i0

.ci_blkdone:
	membar	#Sync

	brz,pt	%i2, .copyin_exit
	nop

	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .ci_residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .ci_last4
	nop

	! Do 8 byte ops as long as possible
.ci_last8:
	ldxa	[%i1]ASI_USER, %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgu,pt	%ncc, .ci_last8
	add	%i0, 0x8, %i0

	brz,pt	%i2, .copyin_exit
	nop

	ba	.ci_residue
	nop

.ci_last4:
	! Can we do 4B ops
	andcc	%o2, 0x3, %g0
	bnz	%ncc, .ci_last2
	nop
1:
	lda	[%i1]ASI_USER, %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	sub	%i2, 0x4, %i2
	cmp	%i2, 0x4
	bgu,pt	%ncc, 1b
	add	%i0, 0x4, %i0

	brz,pt	%i2, .copyin_exit
	nop

	ba	.ci_residue
	nop

.ci_last2:
	! Can we do 2B ops
	andcc	%o2, 0x1, %g0
	bnz	%ncc, .ci_residue
	nop

1:
	lduha	[%i1]ASI_USER, %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	sub	%i2, 0x2, %i2
	cmp	%i2, 0x2
	bgu,pt	%ncc, 1b
	add	%i0, 0x2, %i0

	brz,pt	%i2, .copyin_exit
	nop

	! Copy the residue as byte copy
.ci_residue:
	lduba	[%i1]ASI_USER, %i4
	stb	%i4, [%i0]
	inc	%i1
	deccc	%i2
	bgu,pt	%xcc, .ci_residue
	inc	%i0

.copyin_exit:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g0, 0, %o0
.copyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_COPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	-1, %o0
#endif	/* NIAGARA_IMPL */
	SET_SIZE(copyin)

	ENTRY(xcopyin)
	sethi	%hi(.xcopyin_err), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT
.xcopyin_err:
	ldn	[THREAD_REG + T_COPYOPS], %o4
	brz	%o4, 2f
	nop
	ldn	[%o4 + CP_XCOPYIN], %g2
	jmp	%g2
	nop
2:
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin)
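/*
 * xcopyin_little below reads the user buffer through the little-endian
 * user ASI, walking the source from its last byte to its first while
 * filling the destination forward, which reverses the byte order of
 * the whole buffer.  The effect in C (hypothetical names):
 *
 *	#include <stddef.h>
 *
 *	static void
 *	copy_reversed(const unsigned char *src, unsigned char *dst,
 *	    size_t len)
 *	{
 *		size_t i;
 *
 *		for (i = 0; i < len; i++)
 *			dst[i] = src[len - 1 - i];
 *	}
 */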
	ENTRY(xcopyin_little)
	sethi	%hi(.little_err), %o4
	ldn	[THREAD_REG + T_LOFAULT], %o5
	or	%o4, %lo(.little_err), %o4
	membar	#Sync			! sync error barrier
	stn	%o4, [THREAD_REG + T_LOFAULT]

	subcc	%g0, %o2, %o3
	add	%o0, %o2, %o0
	bz,pn	%ncc, 2f		! check for zero bytes
	sub	%o2, 1, %o4
	add	%o0, %o4, %o0		! start w/last byte
	add	%o1, %o2, %o1
	lduba	[%o0+%o3]ASI_AIUSL, %o4

1:	stb	%o4, [%o1+%o3]
	inccc	%o3
	sub	%o0, 2, %o0		! get next byte
	bcc,a,pt %ncc, 1b
	lduba	[%o0+%o3]ASI_AIUSL, %o4

2:	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g0, %o0		! return (0)

.little_err:
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	mov	%g1, %o0
	SET_SIZE(xcopyin_little)


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

	ENTRY(copyin_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyin
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
.copyio_noerr:
	jmp	SAVED_LOFAULT
	nop
	SET_SIZE(copyin_noerr)

/*
 * Copy a block of storage - must not overlap (from + len <= to).
 * No fault handler installed (to be called under on_fault())
 */

	ENTRY(copyout_noerr)
	sethi	%hi(.copyio_noerr), REAL_LOFAULT
	b	.do_copyout
	or	REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT
	SET_SIZE(copyout_noerr)

	.align	4
	DGDEF(use_hw_bcopy)
	.word	1
	DGDEF(use_hw_bzero)
	.word	1
	DGDEF(hw_copy_limit_1)
	.word	0x100
	DGDEF(hw_copy_limit_2)
	.word	0x200
	DGDEF(hw_copy_limit_4)
	.word	0x400
	DGDEF(hw_copy_limit_8)
	.word	0x400

	.align	64
	.section ".text"

/*
 * hwblkclr - clears block-aligned, block-multiple-sized regions of at
 * least 256 bytes using Niagara's block stores/quad store.
 * If the criteria for using this routine are not met then it calls bzero
 * and returns 1. Otherwise 0 is returned indicating success.
 * Caller is responsible for ensuring use_hw_bzero is true and that
 * kpreempt_disable() has been called.
 */
	! %i0 - start address
	! %i1 - length of region (multiple of 64)
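/*
 * The contract above, restated as a C sketch (hypothetical names;
 * bzero is the routine defined later in this file):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	extern void bzero(void *addr, size_t len);
 *
 *	// Returns 0 if the block-store fast path was used, 1 if the
 *	// request was punted to bzero.
 *	static int
 *	hwblkclr_sketch(void *addr, size_t len)
 *	{
 *		if (((uintptr_t)addr & 0x3f) != 0 ||	// not block aligned
 *		    len < 0x100 ||			// under 256 bytes
 *		    (len & 0x3f) != 0) {		// not multiple of 64
 *			bzero(addr, len);
 *			return (1);
 *		}
 *		// ... clear with 64-byte block-initializing stores ...
 *		return (0);
 *	}
 */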
	ENTRY(hwblkclr)
	save	%sp, -SA(MINFRAME), %sp

	! Must be block-aligned
	andcc	%i0, 0x3f, %g0
	bnz,pn	%ncc, 1f
	nop

	! ... and must be 256 bytes or more
	cmp	%i1, 0x100
	blu,pn	%ncc, 1f
	nop

	! ... and length must be a multiple of 64
	andcc	%i1, 0x3f, %g0
	bz,pn	%ncc, .pz_doblock
	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

1:	! punt, call bzero but notify the caller that bzero was used
	mov	%i0, %o0
	call	bzero
	mov	%i1, %o1
	ret
	restore	%g0, 1, %o0	! return (1) - did not use block operations

	! Already verified that there are at least 256 bytes to set
.pz_doblock:
	stxa	%g0, [%i0+0x0]%asi
	stxa	%g0, [%i0+0x40]%asi
	stxa	%g0, [%i0+0x80]%asi
	stxa	%g0, [%i0+0xc0]%asi

	stxa	%g0, [%i0+0x8]%asi
	stxa	%g0, [%i0+0x10]%asi
	stxa	%g0, [%i0+0x18]%asi
	stxa	%g0, [%i0+0x20]%asi
	stxa	%g0, [%i0+0x28]%asi
	stxa	%g0, [%i0+0x30]%asi
	stxa	%g0, [%i0+0x38]%asi

	stxa	%g0, [%i0+0x48]%asi
	stxa	%g0, [%i0+0x50]%asi
	stxa	%g0, [%i0+0x58]%asi
	stxa	%g0, [%i0+0x60]%asi
	stxa	%g0, [%i0+0x68]%asi
	stxa	%g0, [%i0+0x70]%asi
	stxa	%g0, [%i0+0x78]%asi

	stxa	%g0, [%i0+0x88]%asi
	stxa	%g0, [%i0+0x90]%asi
	stxa	%g0, [%i0+0x98]%asi
	stxa	%g0, [%i0+0xa0]%asi
	stxa	%g0, [%i0+0xa8]%asi
	stxa	%g0, [%i0+0xb0]%asi
	stxa	%g0, [%i0+0xb8]%asi

	stxa	%g0, [%i0+0xc8]%asi
	stxa	%g0, [%i0+0xd0]%asi
	stxa	%g0, [%i0+0xd8]%asi
	stxa	%g0, [%i0+0xe0]%asi
	stxa	%g0, [%i0+0xe8]%asi
	stxa	%g0, [%i0+0xf0]%asi
	stxa	%g0, [%i0+0xf8]%asi

	sub	%i1, 0x100, %i1
	cmp	%i1, 0x100
	bgu,pt	%ncc, .pz_doblock
	add	%i0, 0x100, %i0

2:
	! Check if more than 64 bytes to set
	cmp	%i1, 0x40
	blu	%ncc, .pz_finish
	nop

3:
	stxa	%g0, [%i0+0x0]%asi
	stxa	%g0, [%i0+0x8]%asi
	stxa	%g0, [%i0+0x10]%asi
	stxa	%g0, [%i0+0x18]%asi
	stxa	%g0, [%i0+0x20]%asi
	stxa	%g0, [%i0+0x28]%asi
	stxa	%g0, [%i0+0x30]%asi
	stxa	%g0, [%i0+0x38]%asi

	subcc	%i1, 0x40, %i1
	bgu,pt	%ncc, 3b
	add	%i0, 0x40, %i0

.pz_finish:
	membar	#Sync
	ret
	restore	%g0, 0, %o0	! return (bzero or not)
	SET_SIZE(hwblkclr)

	/*
	 * Copy 32 bytes of data from src (%o0) to dst (%o1)
	 * using physical addresses.
	 */
	ENTRY_NP(hw_pa_bcopy32)
	rdpr	%pstate, %g1
	andn	%g1, PSTATE_IE, %g2
	wrpr	%g0, %g2, %pstate

	ldxa	[%o0]ASI_MEM, %o2
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o3
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o4
	add	%o0, 8, %o0
	ldxa	[%o0]ASI_MEM, %o5
	stxa	%o2, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o3, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o4, [%o1]ASI_MEM
	add	%o1, 8, %o1
	stxa	%o5, [%o1]ASI_MEM

	membar	#Sync
	retl
	wrpr	%g0, %g1, %pstate
	SET_SIZE(hw_pa_bcopy32)

/*
 * Zero a block of storage.
 *
 * uzero is used by the kernel to zero a block in user address space.
 */

/*
 * Control flow of the bzero/kzero/uzero routine.
 *
 * For stores of fewer than 7 bytes, the bytes are zeroed one at a time.
 *
 * For stores of fewer than 15 bytes, align the address on a 4 byte
 * boundary, then store as many 4-byte chunks as possible, followed by
 * the trailing bytes.
 *
 * For sizes of 15 bytes or more, align the address on an 8 byte boundary.
 * if (count > 128) {
 *	store as many 8-byte chunks as needed to block align the address
 *	store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR
 *	store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero)
 * }
 * Store as many 8-byte chunks as possible, followed by the trailing bytes.
 */
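/*
 * The same size tiering in C (hypothetical names; the real routines
 * also choose the block-store ASI by destination address space and
 * consult use_hw_bzero before taking the block path):
 *
 *	#include <stdint.h>
 *	#include <stddef.h>
 *
 *	static void
 *	zero_sketch(char *p, size_t n)
 *	{
 *		if (n >= 7 && n < 15) {			// 4-byte chunks
 *			while (((uintptr_t)p & 3) != 0) {
 *				*p++ = 0;
 *				n--;
 *			}
 *			for (; n >= 4; n -= 4, p += 4)
 *				*(uint32_t *)p = 0;
 *		} else if (n >= 15) {			// 8-byte chunks
 *			while (((uintptr_t)p & 7) != 0) {
 *				*p++ = 0;
 *				n--;
 *			}
 *			// for n > 128 the asm additionally aligns to 64
 *			// bytes and uses block-initializing stores here
 *			for (; n >= 8; n -= 8, p += 8)
 *				*(uint64_t *)p = 0;
 *		}
 *		while (n-- != 0)			// trailing bytes
 *			*p++ = 0;
 *	}
 */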
	ENTRY(uzero)
	!
	! Set a new lo_fault handler only if we came in with one
	! already specified.
	!
	wr	%g0, ASI_USER, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

	ENTRY(kzero)
	!
	! Always set a lo_fault handler
	!
	wr	%g0, ASI_P, %asi
	ldn	[THREAD_REG + T_LOFAULT], %o5
	sethi	%hi(.zeroerr), %o2
	or	%o5, LOFAULT_SET, %o5
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync
	ba,pt	%ncc, .do_zero
	stn	%o2, [THREAD_REG + T_LOFAULT]

/*
 * We got here because of a fault during kzero or if
 * uzero or bzero was called with t_lofault non-zero.
 * Otherwise we've already run screaming from the room.
 * Errno value is in %g1. Note that we're here iff
 * we did set t_lofault.
 */
.zeroerr:
	!
	! Undo asi register setting. Just set it to be the
	! kernel default without checking.
	!
	wr	%g0, ASI_P, %asi

	!
	! We did set t_lofault. It may well have been zero coming in.
	!
1:
	tst	%o5
	membar	#Sync
	bne,pn	%ncc, 3f
	andncc	%o5, LOFAULT_SET, %o5
2:
	!
	! Old handler was zero. Just return the error.
	!
	retl			! return
	mov	%g1, %o0	! error code from %g1
3:
	!
	! We're here because %o5 was non-zero. It was non-zero
	! because either LOFAULT_SET was present, a previous fault
	! handler was present or both. In all cases we need to reset
	! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET
	! before we either simply return the error or we invoke the
	! previously specified handler.
	!
	be	%ncc, 2b
	stn	%o5, [THREAD_REG + T_LOFAULT]
	jmp	%o5		! goto real handler
	nop
	SET_SIZE(kzero)
	SET_SIZE(uzero)
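/*
 * The %o5/LOFAULT_SET protocol that .zeroerr above (and .bzero_exit
 * below) implements, in C terms (hypothetical names; LOFAULT_SET is
 * the flag defined earlier in this file):
 *
 *	#include <stdint.h>
 *
 *	extern struct thread { uintptr_t t_lofault; } *curthread;
 *
 *	static int
 *	zeroerr_sketch(uintptr_t o5, int err)
 *	{
 *		uintptr_t old = o5 & ~(uintptr_t)LOFAULT_SET;
 *
 *		if (o5 != 0) {			// we saved something
 *			curthread->t_lofault = old;
 *			if (old != 0)		// chain to prior handler
 *				((void (*)(void))old)();
 *		}
 *		return (err);			// error code from %g1
 *	}
 */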
/*
 * Zero a block of storage.
 */

	ENTRY(bzero)
	wr	%g0, ASI_P, %asi

	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync			! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector

.do_zero:
	cmp	%o1, 7
	blu,pn	%ncc, .byteclr
	nop

	cmp	%o1, 15
	blu,pn	%ncc, .wdalign
	nop

	andcc	%o0, 7, %o3	! is address aligned on an 8 byte boundary
	bz,pt	%ncc, .blkalign	! already double aligned
	sub	%o3, 8, %o3	! -(bytes till double aligned)
	add	%o1, %o3, %o1	! update o1 with new count

1:
	stba	%g0, [%o0]%asi
	inccc	%o3
	bl,pt	%ncc, 1b
	inc	%o0

	! Now address is double aligned
.blkalign:
	cmp	%o1, 0x80	! check if there are 128 bytes to set
	blu,pn	%ncc, .bzero_small
	mov	%o1, %o3

	sethi	%hi(use_hw_bzero), %o2
	ld	[%o2 + %lo(use_hw_bzero)], %o2
	tst	%o2
	bz	%ncc, .bzero_small
	mov	%o1, %o3

	rd	%asi, %o3
	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	cmp	%o3, ASI_P
	bne,a	%ncc, .algnblk
	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

.algnblk:
	andcc	%o0, 0x3f, %o3	! is block aligned?
	bz,pt	%ncc, .bzero_blk
	sub	%o3, 0x40, %o3	! -(bytes till block aligned)
	add	%o1, %o3, %o1	! o1 is the remainder

	! Clear -(%o3) bytes till block aligned
1:
	stxa	%g0, [%o0]%asi
	addcc	%o3, 8, %o3
	bl,pt	%ncc, 1b
	add	%o0, 8, %o0

.bzero_blk:
	and	%o1, 0x3f, %o3	! calc bytes left after blk clear
	andn	%o1, 0x3f, %o4	! calc size of blocks in bytes

	cmp	%o4, 0x100	! 256 bytes or more
	blu,pn	%ncc, 3f
	nop

2:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x40]%asi
	stxa	%g0, [%o0+0x80]%asi
	stxa	%g0, [%o0+0xc0]%asi

	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	stxa	%g0, [%o0+0x48]%asi
	stxa	%g0, [%o0+0x50]%asi
	stxa	%g0, [%o0+0x58]%asi
	stxa	%g0, [%o0+0x60]%asi
	stxa	%g0, [%o0+0x68]%asi
	stxa	%g0, [%o0+0x70]%asi
	stxa	%g0, [%o0+0x78]%asi

	stxa	%g0, [%o0+0x88]%asi
	stxa	%g0, [%o0+0x90]%asi
	stxa	%g0, [%o0+0x98]%asi
	stxa	%g0, [%o0+0xa0]%asi
	stxa	%g0, [%o0+0xa8]%asi
	stxa	%g0, [%o0+0xb0]%asi
	stxa	%g0, [%o0+0xb8]%asi

	stxa	%g0, [%o0+0xc8]%asi
	stxa	%g0, [%o0+0xd0]%asi
	stxa	%g0, [%o0+0xd8]%asi
	stxa	%g0, [%o0+0xe0]%asi
	stxa	%g0, [%o0+0xe8]%asi
	stxa	%g0, [%o0+0xf0]%asi
	stxa	%g0, [%o0+0xf8]%asi

	sub	%o4, 0x100, %o4
	cmp	%o4, 0x100
	bgu,pt	%ncc, 2b
	add	%o0, 0x100, %o0

3:
	! ... check if 64 bytes to set
	cmp	%o4, 0x40
	blu	%ncc, .bzero_blk_done
	nop

4:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	subcc	%o4, 0x40, %o4
	bgu,pt	%ncc, 3b
	add	%o0, 0x40, %o0

.bzero_blk_done:
	membar	#Sync
	!
	! Undo asi register setting.
	!
	rd	%asi, %o4
	wr	%g0, ASI_P, %asi
	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
	bne,a	%ncc, .bzero_small
	wr	%g0, ASI_USER, %asi

.bzero_small:
	! Set the remaining doubles
	subcc	%o3, 8, %o3	! Can we store any doubles?
	blu,pn	%ncc, .byteclr
	and	%o1, 7, %o1	! calc bytes left after doubles

.dbclr:
	stxa	%g0, [%o0]%asi	! Clear the doubles
	subcc	%o3, 8, %o3
	bgeu,pt	%ncc, .dbclr
	add	%o0, 8, %o0

	ba	.byteclr
	nop

.wdalign:
	andcc	%o0, 3, %o3	! is address aligned on a word boundary
	bz,pn	%ncc, .wdclr
	andn	%o1, 3, %o3	! create word sized count in %o3

	dec	%o1		! decrement count
	stba	%g0, [%o0]%asi	! clear a byte
	ba	.wdalign
	inc	%o0		! next byte

.wdclr:
	sta	%g0, [%o0]%asi	! 4-byte clearing loop
	subcc	%o3, 4, %o3
	bnz,pt	%ncc, .wdclr
	inc	4, %o0

	and	%o1, 3, %o1	! leftover count, if any

.byteclr:
	! Set the leftover bytes
	brz	%o1, .bzero_exit
	nop

7:
	deccc	%o1		! byte clearing loop
	stba	%g0, [%o0]%asi
	bgu,pt	%ncc, 7b
	inc	%o0

.bzero_exit:
	!
	! We're just concerned with whether t_lofault was set
	! when we came in. We end up here from either kzero()
	! or bzero(). kzero() *always* sets a lofault handler.
	! It ors LOFAULT_SET into %o5 to indicate it has done
	! this even if the value of %o5 is otherwise zero.
	! bzero() sets a lofault handler *only* if one was
	! previously set. Accordingly we need to examine
	! %o5 and if it is non-zero be sure to clear LOFAULT_SET
	! before resetting the error handler.
	!
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync		! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
	retl
	clr	%o0		! return (0)

	SET_SIZE(bzero)