/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/asm_linkage.h>
#include <sys/vtrace.h>
#include <sys/machthread.h>
#include <sys/clock.h>
#include <sys/asi.h>
#include <sys/fsr.h>
#include <sys/privregs.h>
#include <sys/machasi.h>
#include <sys/niagaraasi.h>

#if !defined(lint)
#include "assym.h"
#endif	/* lint */


/*
 * Pseudo-code to aid in understanding the control flow of the
 * bcopy/kcopy routine.
 *
 *	! WARNING : <Register usage convention>
 *	! In kcopy(), %o5 holds the previous error handler and a flag
 *	! LOFAULT_SET (low bits). %o5 is null in bcopy().
 *	! %o5 is not available for any other use.
 *
 * kcopy():
 *	%o5 = curthread->t_lofault;		! save existing handler in %o5
 *	%o5 |= LOFAULT_SET;			! OR in LOFAULT_SET flag
 *	curthread->t_lofault = .copyerr;
 *	Call bcopy();
 *
 * bcopy():
 *	if (length < 128)
 *		goto regular_copy;
 *
 *	if (!use_hw_bcopy)
 *		goto regular_copy;
 *
 *	blockcopy;
 *	restore t_lofault handler if came from kcopy();
 *
 *	regular_copy;
 *	restore t_lofault handler if came from kcopy();
 *
 * In lofault handler:
 *	curthread->t_lofault = (%o5 & ~LOFAULT_SET);	! restore old t_lofault
 *	return (errno)
 *
 */

/*
 * For counts less than or equal to this number of bytes we will
 * always copy byte-for-byte.
 */
#define	SMALL_LIMIT	7

/*
 * LOFAULT_SET : Flag set by kzero and kcopy to indicate that t_lofault
 * handler was set
 */
#define	LOFAULT_SET 2

/*
 * This define is to align data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2.
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1				;\
	sllx	data2, lshift, data2				;\
	srlx	data3, rshift, tmp				;\
	or	data2, tmp, data2
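/*
 * Worked example (illustrative values, not from the original comments):
 * for a source that is 3 bytes past an 8-byte boundary, lshift = 24 and
 * rshift = 40 (lshift + rshift = 64).  With
 *	data1 = 0xAABBCCDD_EEFF0011 and data2 = 0x22334455_66778899,
 * ALIGN_DATA leaves data1 = 0xDDEEFF00_11223344, i.e. the doubleword
 * that starts 3 bytes into the original data1; data2 and data3 get the
 * same shift/merge so the pipeline can continue on the next pair.
 */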
/*
 * This macro aligns the data by merging data1 and data2 to form
 * a doubleword.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1				;\
	srlx	data2, rshift, tmp				;\
	or	data1, tmp, data1

#if !defined(NIAGARA_IMPL)
/*
 * Flags set in the lower bits of the t_lofault address:
 * FPUSED_FLAG: The FP registers were in use and must be restored
 * BCOPY_FLAG: Set for bcopy calls, cleared for kcopy calls
 * COPY_FLAGS: Both of the above
 *
 * Other flags:
 * KPREEMPT_FLAG: kpreempt needs to be called
 */
#define	FPUSED_FLAG	1
#define	BCOPY_FLAG	2
#define	COPY_FLAGS	(FPUSED_FLAG | BCOPY_FLAG)
#define	KPREEMPT_FLAG	4

#define	ALIGN_OFF_1_7			\
	faligndata %d0, %d2, %d48	;\
	faligndata %d2, %d4, %d50	;\
	faligndata %d4, %d6, %d52	;\
	faligndata %d6, %d8, %d54	;\
	faligndata %d8, %d10, %d56	;\
	faligndata %d10, %d12, %d58	;\
	faligndata %d12, %d14, %d60	;\
	faligndata %d14, %d16, %d62

#define	ALIGN_OFF_8_15			\
	faligndata %d2, %d4, %d48	;\
	faligndata %d4, %d6, %d50	;\
	faligndata %d6, %d8, %d52	;\
	faligndata %d8, %d10, %d54	;\
	faligndata %d10, %d12, %d56	;\
	faligndata %d12, %d14, %d58	;\
	faligndata %d14, %d16, %d60	;\
	faligndata %d16, %d18, %d62

#define	ALIGN_OFF_16_23			\
	faligndata %d4, %d6, %d48	;\
	faligndata %d6, %d8, %d50	;\
	faligndata %d8, %d10, %d52	;\
	faligndata %d10, %d12, %d54	;\
	faligndata %d12, %d14, %d56	;\
	faligndata %d14, %d16, %d58	;\
	faligndata %d16, %d18, %d60	;\
	faligndata %d18, %d20, %d62

#define	ALIGN_OFF_24_31			\
	faligndata %d6, %d8, %d48	;\
	faligndata %d8, %d10, %d50	;\
	faligndata %d10, %d12, %d52	;\
	faligndata %d12, %d14, %d54	;\
	faligndata %d14, %d16, %d56	;\
	faligndata %d16, %d18, %d58	;\
	faligndata %d18, %d20, %d60	;\
	faligndata %d20, %d22, %d62

#define	ALIGN_OFF_32_39			\
	faligndata %d8, %d10, %d48	;\
	faligndata %d10, %d12, %d50	;\
	faligndata %d12, %d14, %d52	;\
	faligndata %d14, %d16, %d54	;\
	faligndata %d16, %d18, %d56	;\
	faligndata %d18, %d20, %d58	;\
	faligndata %d20, %d22, %d60	;\
	faligndata %d22, %d24, %d62

#define	ALIGN_OFF_40_47			\
	faligndata %d10, %d12, %d48	;\
	faligndata %d12, %d14, %d50	;\
	faligndata %d14, %d16, %d52	;\
	faligndata %d16, %d18, %d54	;\
	faligndata %d18, %d20, %d56	;\
	faligndata %d20, %d22, %d58	;\
	faligndata %d22, %d24, %d60	;\
	faligndata %d24, %d26, %d62

#define	ALIGN_OFF_48_55			\
	faligndata %d12, %d14, %d48	;\
	faligndata %d14, %d16, %d50	;\
	faligndata %d16, %d18, %d52	;\
	faligndata %d18, %d20, %d54	;\
	faligndata %d20, %d22, %d56	;\
	faligndata %d22, %d24, %d58	;\
	faligndata %d24, %d26, %d60	;\
	faligndata %d26, %d28, %d62

#define	ALIGN_OFF_56_63			\
	faligndata %d14, %d16, %d48	;\
	faligndata %d16, %d18, %d50	;\
	faligndata %d18, %d20, %d52	;\
	faligndata %d20, %d22, %d54	;\
	faligndata %d22, %d24, %d56	;\
	faligndata %d24, %d26, %d58	;\
	faligndata %d26, %d28, %d60	;\
	faligndata %d28, %d30, %d62

#define	VIS_BLOCKSIZE		64

/*
 * Size of stack frame in order to accommodate a 64-byte aligned
 * floating-point register save area and 2 64-bit temp locations.
 * All copy functions use three quadrants of fp registers; to assure a
 * block-aligned three block buffer in which to save we must reserve
 * four blocks on the stack.
 *
 *    _______________________________________ <-- %fp + STACK_BIAS
 *    | We may need to preserve 3 quadrants |
 *    | of fp regs, but since we do so with |
 *    | BST/BLD we need room in which to    |
 *    | align to VIS_BLOCKSIZE bytes.  So   |
 *    | this area is 4 * VIS_BLOCKSIZE.     | <--  - SAVED_FPREGS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %fprs               | <--  - SAVED_FPRS_OFFSET
 *    |-------------------------------------|
 *    | 8 bytes to save %gsr                | <--  - SAVED_GSR_OFFSET
 *    ---------------------------------------
 */
#define	HWCOPYFRAMESIZE		((VIS_BLOCKSIZE * (3 + 1)) + (2 * 8))
#define	SAVED_FPREGS_OFFSET	(VIS_BLOCKSIZE * 4)
#define	SAVED_FPREGS_ADJUST	((VIS_BLOCKSIZE * 3) + 1)
#define	SAVED_FPRS_OFFSET	(SAVED_FPREGS_OFFSET + 8)
#define	SAVED_GSR_OFFSET	(SAVED_FPRS_OFFSET + 8)
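/*
 * Worked numbers (illustrative): with VIS_BLOCKSIZE = 64 the values
 * above come to HWCOPYFRAMESIZE = 272, SAVED_FPREGS_OFFSET = 256,
 * SAVED_FPREGS_ADJUST = 193, SAVED_FPRS_OFFSET = 264 and
 * SAVED_GSR_OFFSET = 272.  Aligning (%fp + STACK_BIAS - 193) down to a
 * 64-byte boundary always yields a block-aligned 192-byte save area
 * that fits entirely inside the reserved 256-byte region.
 */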

/*
 * In FP copies if we do not have preserved data to restore over
 * the fp regs we used then we must zero those regs to avoid
 * exposing portions of the data to later threads (data security).
 */
#define	FZERO				\
	fzero	%f0			;\
	fzero	%f2			;\
	faddd	%f0, %f2, %f4		;\
	fmuld	%f0, %f2, %f6		;\
	faddd	%f0, %f2, %f8		;\
	fmuld	%f0, %f2, %f10		;\
	faddd	%f0, %f2, %f12		;\
	fmuld	%f0, %f2, %f14		;\
	faddd	%f0, %f2, %f16		;\
	fmuld	%f0, %f2, %f18		;\
	faddd	%f0, %f2, %f20		;\
	fmuld	%f0, %f2, %f22		;\
	faddd	%f0, %f2, %f24		;\
	fmuld	%f0, %f2, %f26		;\
	faddd	%f0, %f2, %f28		;\
	fmuld	%f0, %f2, %f30		;\
	faddd	%f0, %f2, %f48		;\
	fmuld	%f0, %f2, %f50		;\
	faddd	%f0, %f2, %f52		;\
	fmuld	%f0, %f2, %f54		;\
	faddd	%f0, %f2, %f56		;\
	fmuld	%f0, %f2, %f58		;\
	faddd	%f0, %f2, %f60		;\
	fmuld	%f0, %f2, %f62

/*
 * Macros to save and restore fp registers to/from the stack.
 * Used to save and restore in-use fp registers when we want to use FP.
 */
#define	BST_FP_TOSTACK(tmp1)				\
	/* membar #Sync */				;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
	stda	%f0, [tmp1]ASI_BLK_P			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	stda	%f16, [tmp1]ASI_BLK_P			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	stda	%f48, [tmp1]ASI_BLK_P			;\
	membar	#Sync

#define	BLD_FP_FROMSTACK(tmp1)				\
	/* membar #Sync - provided at copy completion */ ;\
	add	%fp, STACK_BIAS - SAVED_FPREGS_ADJUST, tmp1 ;\
	and	tmp1, -VIS_BLOCKSIZE, tmp1 /* block align */ ;\
	ldda	[tmp1]ASI_BLK_P, %f0			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	ldda	[tmp1]ASI_BLK_P, %f16			;\
	add	tmp1, VIS_BLOCKSIZE, tmp1		;\
	ldda	[tmp1]ASI_BLK_P, %f48			;\
	membar	#Sync
#endif	/* NIAGARA_IMPL */
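/*
 * Illustrative sketch (not part of the original source): the flags
 * defined above fit in the low bits of the saved t_lofault pointer
 * because handler addresses are at least 8-byte aligned, e.g.
 *
 *	saved = curthread->t_lofault | BCOPY_FLAG | FPUSED_FLAG;
 *	...
 *	handler = saved & ~COPY_FLAGS;		! recover the real pointer
 */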

/*
 * Copy a block of storage, returning an error code if `from' or
 * `to' takes a kernel pagefault which cannot be resolved.
 * Returns errno value on pagefault error, 0 if all ok
 */
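/*
 * Typical use (an illustrative sketch, not taken from the source):
 *
 *	int err;
 *
 *	if ((err = kcopy(src, dst, len)) != 0)
 *		return (err);		! errno posted by the trap code
 */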

#if defined(lint)

/* ARGSUSED */
int
kcopy(const void *from, void *to, size_t count)
{ return(0); }

#else	/* lint */

	.seg	".text"
	.align	4

	ENTRY(kcopy)

#if !defined(NIAGARA_IMPL)
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	! Note that we carefully do *not* flag the setting of
	! t_lofault.
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during kcopy, or during bcopy if a
 * fault handler existed when bcopy was called.
 * Errno value is in %g1.
 */
.copyerr:
	sethi	%hi(.copyerr2), %l1
	or	%l1, %lo(.copyerr2), %l1
	membar	#Sync				! sync error barrier
	stn	%l1, [THREAD_REG + T_LOFAULT]	! set t_lofault
	btst	FPUSED_FLAG, %o5
	bz,pt	%xcc, 1f
	and	%o5, BCOPY_FLAG, %l1		! copy flag to %l1

	membar	#Sync				! sync error barrier
	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO
	wr	%o3, 0, %fprs		! restore fprs

2:
	ldn	[THREAD_REG + T_LWP], %o2
	brnz,pt	%o2, 1f
	nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	stb	%l0, [THREAD_REG + T_PREEMPT]

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	dec	%l0
	stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	brnz,a,pt %l0, 1f		! Need to call kpreempt?
	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag

	! The kcopy will always set a t_lofault handler. If it fires,
	! we're expected to just return the error code and not to
	! invoke any existing error handler. As far as bcopy is concerned,
	! we only set t_lofault if there was an existing lofault handler.
	! In that case we're expected to invoke the previously existing
	! handler after resetting the t_lofault value.
1:
	andn	%o5, COPY_FLAGS, %o5	! remove flags from lofault address
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	nop
	call	kpreempt
	rdpr	%pil, %o0		! pass %pil
2:
	btst	BCOPY_FLAG, %l1
	bnz,pn	%ncc, 3f
	nop
	ret
	restore	%g1, 0, %o0

3:
	! We're here via bcopy. There must have been an error handler
	! in place otherwise we would have died a nasty death already.
	jmp	%o5			! goto real handler
	restore	%g0, 0, %o0		! dispose of copy window

/*
 * We got here because of a fault in .copyerr. We can't safely restore fp
 * state, so we panic.
 */
fp_panic_msg:
	.asciz	"Unable to restore fp state after copy operation"

	.align	4
.copyerr2:
	set	fp_panic_msg, %o0
	call	panic
	nop
#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
	set	.copyerr, %l7			! copyerr is lofault value
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	or	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	b	.do_copy			! common code
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault

/*
 * We got here because of a fault during kcopy.
 * Errno value is in %g1.
 */
.copyerr:
	! kcopy() *always* sets a t_lofault handler and ORs LOFAULT_SET
	! into %o5 to indicate it has set the t_lofault handler. We need
	! to clear the LOFAULT_SET flag before restoring the error handler.
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync				! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	ret
	restore	%g1, 0, %o0
#endif	/* NIAGARA_IMPL */

	SET_SIZE(kcopy)
#endif	/* lint */


/*
 * Copy a block of storage - must not overlap (from + len <= to).
 */
#if defined(lint)

/* ARGSUSED */
void
bcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(bcopy)

#if !defined(NIAGARA_IMPL)
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp
	ldn	[THREAD_REG + T_LOFAULT], %o5	! save existing handler
	brz,pt	%o5, .do_copy
	nop
	sethi	%hi(.copyerr), %l7		! copyerr is lofault value
	or	%l7, %lo(.copyerr), %l7
	membar	#Sync				! sync error barrier
	stn	%l7, [THREAD_REG + T_LOFAULT]	! set t_lofault
	! We've already captured whether t_lofault was zero on entry.
	! We need to mark ourselves as being from bcopy since both
	! kcopy and bcopy use the same code path. If BCOPY_FLAG is
	! set and the saved lofault was zero, we won't reset lofault on
	! returning.
	or	%o5, BCOPY_FLAG, %o5
#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
	clr	%o5			! flag LOFAULT_SET is not set for bcopy
#endif	/* NIAGARA_IMPL */

.do_copy:
	cmp	%i2, 12			! for small counts
	blu	%ncc, .bytecp		! just copy bytes
	.empty

	cmp	%i2, 128		! for less than 128 bytes
	blu,pn	%ncc, .bcb_punt		! no block st/quad ld
	nop

	set	use_hw_bcopy, %o2
	ld	[%o2], %o2
	brz,pn	%o2, .bcb_punt
	nop

	subcc	%i1, %i0, %i3
	bneg,a,pn %ncc, 1f
	neg	%i3
1:
	/*
	 * Compare against 256 since we should be checking block addresses
	 * and (dest & ~63) - (src & ~63) can be 3 blocks even if
	 * src = dest + (64 * 3) + 63.
	 */
	cmp	%i3, 256
	blu,pn	%ncc, .bcb_punt
	nop

	/*
	 * Copies that reach here have at least 2 blocks of data to copy.
	 */
#if !defined(NIAGARA_IMPL)
	ldn	[THREAD_REG + T_LWP], %o3
	brnz,pt	%o3, 1f
	nop

	! kpreempt_disable();
	ldsb	[THREAD_REG + T_PREEMPT], %o2
	inc	%o2
	stb	%o2, [THREAD_REG + T_PREEMPT]

1:
	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save orig %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopy
	wr	%g0, FPRS_FEF, %fprs

	! save in-use fpregs on stack
	BST_FP_TOSTACK(%o2)
#endif	/* NIAGARA_IMPL */

.do_blockcopy:

#if !defined(NIAGARA_IMPL)
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	or	%o5, FPUSED_FLAG, %o5	! fp regs are in use
#endif	/* NIAGARA_IMPL */

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst aligned on a 64-byte boundary
	bz	%xcc, .chksrc		! dst is already double aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count
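	! Worked example (illustrative, not from the original comments):
	! if dst = 0x10028 then %i3 = 0x28, %i3 - 0x40 = -0x18, and the
	! neg leaves %i3 = 0x18, i.e. 24 bytes must be copied before dst
	! becomes 64-byte aligned; %i2 is reduced by those same 24 bytes.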

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	.chksrc
	nop

	! dst & src 4B aligned
.alwdcp:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .alwdcp
	add	%i0, 0x4, %i0

	ba	.chksrc
	nop

	! dst & src 2B aligned
.alhlfwdcp:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .alhlfwdcp
	add	%i0, 0x2, %i0

	ba	.chksrc
	nop

	! dst & src 8B aligned
.alewdcp:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .alewdcp
	add	%i0, 0x8, %i0

	! Now Destination is block (64 bytes) aligned
.chksrc:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! Residue bytes in %i2

	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi

#if !defined(NIAGARA_IMPL)
	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
	prefetch [%l0+0x0], #one_read
	andcc	%i1, 0x3f, %g0		! is src 64B aligned
	bz,pn	%ncc, .blkcpy
	nop

	! handle misaligned source cases
	alignaddr %i1, %g0, %g0		! generate %gsr

	srl	%i1, 0x3, %l1		! src addr bits 3, 4, 5 are now least
					! significant in %l1
	andcc	%l1, 0x7, %l2		! mask everything except bits 1, 2, 3
	add	%i1, %i3, %i1
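	! Illustrative mapping (added for clarity, values not from the
	! original comments): a source offset of 0x2a has bits 3..5 equal
	! to 5, so the dispatch below reaches off47 (offsets 40-47);
	! %l2 = 0 selects off7, and %l2 = 7 falls through to the loop for
	! offsets 56-63.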

	! switch statement to get to right 8 byte block within
	! 64 byte block
	cmp	%l2, 0x4
	bgeu,a	hlf
	cmp	%l2, 0x6
	cmp	%l2, 0x2
	bgeu,a	sqtr
	nop
	cmp	%l2, 0x1
	be,a	off15
	nop
	ba	off7
	nop
sqtr:
	be,a	off23
	nop
	ba,a	off31
	nop

hlf:
	bgeu,a	fqtr
	nop
	cmp	%l2, 0x5
	be,a	off47
	nop
	ba	off39
	nop
fqtr:
	be,a	off55
	nop

	! Falls through when the source offset is between 56 and 63
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
7:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_56_63
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 7b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! This is the copy case for source offsets between 1 and 7
off7:
	ldda	[%l0]ASI_BLK_P, %d0
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
0:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_1_7
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 0b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! This is the copy case for source offsets between 8 and 15
off15:
	ldd	[%l0+0x8], %d2
	ldd	[%l0+0x10], %d4
	ldd	[%l0+0x18], %d6
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
1:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_8_15
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 1b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! This is the copy case for source offsets between 16 and 23
off23:
	ldd	[%l0+0x10], %d4
	ldd	[%l0+0x18], %d6
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
2:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_16_23
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 2b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! This is the copy case for source offsets between 24 and 31
off31:
	ldd	[%l0+0x18], %d6
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
3:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_24_31
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 3b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! This is the copy case for source offsets between 32 and 39
off39:
	ldd	[%l0+0x20], %d8
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
4:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_32_39
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 4b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! This is the copy case for source offsets between 40 and 47
off47:
	ldd	[%l0+0x28], %d10
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
5:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_40_47
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 5b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync

	! This is the copy case for source offsets between 48 and 55
off55:
	ldd	[%l0+0x30], %d12
	ldd	[%l0+0x38], %d14
	prefetch [%l0+0x40], #one_read
	prefetch [%l0+0x80], #one_read
6:
	add	%l0, 0x40, %l0
	stxa	%g0, [%i0]%asi		! initialize the cache line

	ldda	[%l0]ASI_BLK_P, %d16
	ALIGN_OFF_48_55
	fmovd	%d28, %d12
	fmovd	%d30, %d14

	stda	%d48, [%i0]ASI_BLK_P
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 6b
	prefetch [%l0+0x80], #one_read
	ba	.blkdone
	membar	#Sync
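	! Note on the unrolled loops above (added commentary): each
	! iteration loads the next 64-byte block into %d16-%d30,
	! faligndata merges it with the doublewords carried in %d0-%d14
	! to produce %d48-%d62, and the fmovd instructions then slide the
	! new block down to become the carry-in for the next iteration,
	! forming a two-stage software pipeline.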

	! Both source and destination are block aligned.
.blkcpy:
	prefetch [%i1+0x40], #one_read
	prefetch [%i1+0x80], #one_read
8:
	stxa	%g0, [%i0]%asi		! initialize the cache line
	ldda	[%i1]ASI_BLK_P, %d0
	stda	%d0, [%i0]ASI_BLK_P

	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	add	%i0, 0x40, %i0
	bgu,pt	%ncc, 8b
	prefetch [%i1+0x80], #one_read
	membar	#Sync

.blkdone:
#else	/* NIAGARA_IMPL */
	andcc	%i1, 0xf, %o2		! is src quadword aligned
	bz,pn	%xcc, .blkcpy		! src offset in %o2
	nop
	cmp	%o2, 0x8
	bg	.cpy_upper_double
	nop
	bl	.cpy_lower_double
	nop

	! Falls through when source offset is equal to 8 i.e.
	! source is double word aligned.
	! In this case no shift/merge of data is required
	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %l2
loop0:
	ldda	[%i1+0x10]%asi, %l4
	prefetch [%l0+0x40], #one_read

	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %l2
	stxa	%l5, [%i0+0x10]%asi
	stxa	%l2, [%i0+0x18]%asi

	ldda	[%i1+0x30]%asi, %l4
	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %l2
	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, loop0
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2

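	! Illustrative example (values not from the original comments):
	! a source offset of 3 gives left shift 24 and right shift 40 in
	! .cpy_lower_double, so each 16-byte quad load returns one
	! complete doubleword (%l3) and one partial one (%l2) that the
	! following load completes; offsets greater than 8 instead take
	! the .cpy_upper_double path with the roles of the halves swapped.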
.cpy_lower_double:
	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	sll	%o2, 3, %o0		! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %l2	! partial data in %l2 and %l3 has
					! complete data
loop1:
	ldda	[%i1+0x10]%asi, %l4	! %l4 has partial data for this read.
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)	! merge %l2, %l3 and %l4
							! into %l2 and %l3
	prefetch [%l0+0x40], #one_read
	stxa	%l2, [%i0+0x0]%asi
	stxa	%l3, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)	! merge %l2 with %l5 and
	stxa	%l4, [%i0+0x10]%asi			! %l4 from previous read
	stxa	%l5, [%i0+0x18]%asi			! into %l4 and %l5

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %l4
	ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6)
	stxa	%l2, [%i0+0x20]%asi
	stxa	%l3, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %l2
	ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6)
	stxa	%l4, [%i0+0x30]%asi
	stxa	%l5, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, loop1
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2

.cpy_upper_double:
	sub	%i1, %o2, %i1		! align the src at 16 bytes.
	mov	0x8, %o0
	sub	%o2, %o0, %o0
	sll	%o0, 3, %o0		! %o0 left shift
	mov	0x40, %o1
	sub	%o1, %o0, %o1		! %o1 right shift = (64 - left shift)
	andn	%i1, 0x3f, %l0		! %l0 has block aligned source
	prefetch [%l0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %l2	! partial data in %l3 for this read and
					! no data in %l2
loop2:
	ldda	[%i1+0x10]%asi, %l4	! %l4 has complete data and %l5 has
					! partial
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)	! merge %l3, %l4 and %l5
							! into %l3 and %l4
	prefetch [%l0+0x40], #one_read
	stxa	%l3, [%i0+0x0]%asi
	stxa	%l4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)	! merge %l2 and %l3 with
	stxa	%l5, [%i0+0x10]%asi			! %l5 from previous read
	stxa	%l2, [%i0+0x18]%asi			! into %l5 and %l2

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %l4
	ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6)
	stxa	%l3, [%i0+0x20]%asi
	stxa	%l4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %l2
	ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6)
	stxa	%l5, [%i0+0x30]%asi
	stxa	%l2, [%i0+0x38]%asi

	add	%l0, 0x40, %l0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, loop2
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %o2, %i1		! increment the source by src offset
					! the src offset was stored in %o2


	! Both Source and Destination are block aligned.
	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
	prefetch [%i1+0x0], #one_read
1:
	ldda	[%i1+0x0]%asi, %l0
	ldda	[%i1+0x10]%asi, %l2
	prefetch [%i1+0x40], #one_read

	stxa	%l0, [%i0+0x0]%asi
	ldda	[%i1+0x20]%asi, %l4
	ldda	[%i1+0x30]%asi, %l6

	stxa	%l1, [%i0+0x8]%asi
	stxa	%l2, [%i0+0x10]%asi
	stxa	%l3, [%i0+0x18]%asi
	stxa	%l4, [%i0+0x20]%asi
	stxa	%l5, [%i0+0x28]%asi
	stxa	%l6, [%i0+0x30]%asi
	stxa	%l7, [%i0+0x38]%asi

	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%xcc, 1b
	add	%i0, 0x40, %i0

.blkdone:
	membar	#Sync
#endif	/* NIAGARA_IMPL */

	brz,pt	%i2, .blkexit
	nop

	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .last4
	nop

	! Do 8byte ops as long as possible
.last8:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgu,pt	%ncc, .last8
	add	%i0, 0x8, %i0

	brz,pt	%i2, .blkexit
	nop

	ba	.residue
	nop

.last4:
	! Can we do 4B ops
	andcc	%o2, 0x3, %g0
	bnz	%ncc, .last2
	nop
1:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	sub	%i2, 0x4, %i2
	cmp	%i2, 0x4
	bgu,pt	%ncc, 1b
	add	%i0, 0x4, %i0

	brz,pt	%i2, .blkexit
	nop

	ba	.residue
	nop

.last2:
	! Can we do 2B ops
	andcc	%o2, 0x1, %g0
	bnz	%ncc, .residue
	nop

1:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	sub	%i2, 0x2, %i2
	cmp	%i2, 0x2
	bgu,pt	%ncc, 1b
	add	%i0, 0x2, %i0

	brz,pt	%i2, .blkexit
	nop

.residue:
	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i2
	bgu,pt	%ncc, .residue
	inc	%i0

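	! Worked example (illustrative): a 135-byte copy leaves %i2 = 7
	! after the 64-byte block loops; since 7 < 8 the 8/4/2 byte
	! trailing loops are skipped and .residue above moves the last
	! seven bytes one at a time.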
.blkexit:
#if !defined(NIAGARA_IMPL)
	btst	FPUSED_FLAG, %o5
	bz	%icc, 1f
	and	%o5, COPY_FLAGS, %l1	! Store flags in %l1
					! We can't clear the flags from %o5 yet
					! If there's an error, .copyerr will
					! need them

	ldx	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2	! restore gsr
	wr	%o2, 0, %gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz,pt	%icc, 4f
	nop

	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)

	ba,pt	%ncc, 2f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO
	wr	%o3, 0, %fprs		! restore fprs

2:
	ldn	[THREAD_REG + T_LWP], %o2
	brnz,pt	%o2, 1f
	nop

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	deccc	%l0
	bnz,pn	%ncc, 1f
	stb	%l0, [THREAD_REG + T_PREEMPT]

	! Check for a kernel preemption request
	ldn	[THREAD_REG + T_CPU], %l0
	ldub	[%l0 + CPU_KPRUNRUN], %l0
	brnz,a,pt %l0, 1f		! Need to call kpreempt?
	or	%l1, KPREEMPT_FLAG, %l1	! If so, set the flag

	ldsb	[THREAD_REG + T_PREEMPT], %l0
	dec	%l0
	stb	%l0, [THREAD_REG + T_PREEMPT]
1:
	btst	BCOPY_FLAG, %l1
	bz,pn	%icc, 3f
	andncc	%o5, COPY_FLAGS, %o5

	! Here via bcopy. Check to see if the handler was NULL.
	! If so, just return quietly. Otherwise, reset the
	! handler and go home.
	bnz,pn	%ncc, 3f
	nop

	! Null handler.
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 2f
	nop
	call	kpreempt
	rdpr	%pil, %o0		! pass %pil
2:

	ret
	restore	%g0, 0, %o0

	! Here via kcopy or bcopy with a handler.
	! Reset the fault handler.
3:
	membar	#Sync
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	! call kpreempt if necessary
	btst	KPREEMPT_FLAG, %l1
	bz,pt	%icc, 4f
	nop
	call	kpreempt
	rdpr	%pil, %o0
4:
#else	/* NIAGARA_IMPL */
	membar	#Sync				! sync error barrier
	! Restore t_lofault handler, if came here from kcopy().
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
#endif	/* NIAGARA_IMPL */
	ret
	restore	%g0, 0, %o0

.bcb_punt:
	!
	! use aligned transfers where possible
	!
	xor	%i0, %i1, %o4		! xor from and to address
	btst	7, %o4			! if lower three bits zero
	bz	.aldoubcp		! can align on double boundary
	.empty				! suppress assembler complaint
					! about label in delay slot

	xor	%i0, %i1, %o4		! xor from and to address
	btst	3, %o4			! if lower two bits zero
	bz	.alwordcp		! can align on word boundary
	btst	3, %i0			! delay slot, from address unaligned?
	!
	! use aligned reads and writes where possible
	! this differs from wordcp in that it copes
	! with odd alignment between source and destination
	! using word reads and writes with the proper shifts
	! in between to align transfers to and from memory
	! i0 - src address, i1 - dest address, i2 - count
	! i3, i4 - tmps used for generating a complete word
	! i5 (word to write)
	! l0 size in bits of upper part of source word (US)
	! l1 size in bits of lower part of source word (LS = 32 - US)
	! l2 size in bits of upper part of destination word (UD)
	! l3 size in bits of lower part of destination word (LD = 32 - UD)
	! l4 number of bytes leftover after aligned transfers complete
	! l5 the number 32
	!
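	! Illustrative example (not part of the original commentary): if
	! the source begins 1 byte past a word boundary, the byte loop
	! consumes 3 bytes and leaves US = 24, LS = 8; each aligned word
	! read then contributes one new byte, and every destination word
	! is assembled from the carried bytes of the previous read plus
	! the top byte of the current one.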
	mov	32, %l5			! load an oft-needed constant
	bz	.align_dst_only
	btst	3, %i1			! is destination address aligned?
	clr	%i4			! clear registers used in either case
	bz	.align_src_only
	clr	%l0
	!
	! both source and destination addresses are unaligned
	!
1:					! align source
	ldub	[%i0], %i3		! read a byte from source address
	add	%i0, 1, %i0		! increment source address
	or	%i4, %i3, %i4		! or in with previous bytes (if any)
	btst	3, %i0			! is source aligned?
	add	%l0, 8, %l0		! increment size of upper source (US)
	bnz,a	1b
	sll	%i4, 8, %i4		! make room for next byte

	sub	%l5, %l0, %l1		! generate shift left count (LS)
	sll	%i4, %l1, %i4		! prepare to get rest
	ld	[%i0], %i3		! read a word
	add	%i0, 4, %i0		! increment source address
	srl	%i3, %l0, %i5		! upper src bits into lower dst bits
	or	%i4, %i5, %i5		! merge
	mov	24, %l3			! align destination
1:
	srl	%i5, %l3, %i4		! prepare to write a single byte
	stb	%i4, [%i1]		! write a byte
	add	%i1, 1, %i1		! increment destination address
	sub	%i2, 1, %i2		! decrement count
	btst	3, %i1			! is destination aligned?
	bnz,a	1b
	sub	%l3, 8, %l3		! delay slot, decrement shift count (LD)
	sub	%l5, %l3, %l2		! generate shift left count (UD)
	sll	%i5, %l2, %i5		! move leftover into upper bytes
	cmp	%l2, %l0		! cmp # reqd to fill dst w old src left
	bgu	%ncc, .more_needed	! need more to fill than we have
	nop

	sll	%i3, %l1, %i3		! clear upper used byte(s)
	srl	%i3, %l1, %i3
	! get the odd bytes between alignments
	sub	%l0, %l2, %l0		! regenerate shift count
	sub	%l5, %l0, %l1		! generate new shift left count (LS)
	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
	srl	%i3, %l0, %i4
	or	%i5, %i4, %i5
	st	%i5, [%i1]		! write a word
	subcc	%i2, 4, %i2		! decrement count
	bz	%ncc, .unalign_out
	add	%i1, 4, %i1		! increment destination address

	b	2f
	sll	%i3, %l1, %i5		! get leftover into upper bits
.more_needed:
	sll	%i3, %l0, %i3		! save remaining byte(s)
	srl	%i3, %l0, %i3
	sub	%l2, %l0, %l1		! regenerate shift count
	sub	%l5, %l1, %l0		! generate new shift left count
	sll	%i3, %l1, %i4		! move to fill empty space
	b	3f
	or	%i5, %i4, %i5		! merge to complete word
	!
	! the source address is aligned and destination is not
	!
.align_dst_only:
	ld	[%i0], %i4		! read a word
	add	%i0, 4, %i0		! increment source address
	mov	24, %l0			! initial shift alignment count
1:
	srl	%i4, %l0, %i3		! prepare to write a single byte
	stb	%i3, [%i1]		! write a byte
	add	%i1, 1, %i1		! increment destination address
	sub	%i2, 1, %i2		! decrement count
	btst	3, %i1			! is destination aligned?
	bnz,a	1b
	sub	%l0, 8, %l0		! delay slot, decrement shift count
.xfer:
	sub	%l5, %l0, %l1		! generate shift left count
	sll	%i4, %l1, %i5		! get leftover
3:
	and	%i2, 3, %l4		! must do remaining bytes if count%4 > 0
	andn	%i2, 3, %i2		! # of aligned bytes that can be moved
2:
	ld	[%i0], %i3		! read a source word
	add	%i0, 4, %i0		! increment source address
	srl	%i3, %l0, %i4		! upper src bits into lower dst bits
	or	%i5, %i4, %i5		! merge with upper dest bits (leftover)
	st	%i5, [%i1]		! write a destination word
	subcc	%i2, 4, %i2		! decrement count
	bz	%ncc, .unalign_out	! check if done
	add	%i1, 4, %i1		! increment destination address
	b	2b			! loop
	sll	%i3, %l1, %i5		! get leftover
.unalign_out:
	tst	%l4			! any bytes leftover?
	bz	%ncc, .cpdone
	.empty				! allow next instruction in delay slot
1:
	sub	%l0, 8, %l0		! decrement shift
	srl	%i3, %l0, %i4		! upper src byte into lower dst byte
	stb	%i4, [%i1]		! write a byte
	subcc	%l4, 1, %l4		! decrement count
	bz	%ncc, .cpdone		! done?
	add	%i1, 1, %i1		! increment destination
	tst	%l0			! any more previously read bytes
	bnz	%ncc, 1b		! we have leftover bytes
	mov	%l4, %i2		! delay slot, mv cnt where dbytecp wants
	b	.dbytecp		! let dbytecp do the rest
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
	!
	! the destination address is aligned and the source is not
	!
.align_src_only:
	ldub	[%i0], %i3		! read a byte from source address
	add	%i0, 1, %i0		! increment source address
	or	%i4, %i3, %i4		! or in with previous bytes (if any)
	btst	3, %i0			! is source aligned?
	add	%l0, 8, %l0		! increment shift count (US)
	bnz,a	.align_src_only
	sll	%i4, 8, %i4		! make room for next byte
	b,a	.xfer
	!
	! if from address unaligned for double-word moves,
	! move bytes till it is, if count is < 56 it could take
	! longer to align the thing than to do the transfer
	! in word size chunks right away
	!
.aldoubcp:
	cmp	%i2, 56			! if count < 56, use wordcp, it takes
	blu,a	%ncc, .alwordcp		! longer to align doubles than words
	mov	3, %o0			! mask for word alignment
	call	.alignit		! copy bytes until aligned
	mov	7, %o0			! mask for double alignment
	!
	! source and destination are now double-word aligned
	! i3 has aligned count returned by alignit
	!
	and	%i2, 7, %i2		! unaligned leftover count
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst
5:
	ldx	[%i0+%i1], %o4		! read from address
	stx	%o4, [%i1]		! write at destination address
	subcc	%i3, 8, %i3		! dec count
	bgu	%ncc, 5b
	add	%i1, 8, %i1		! delay slot, inc to address
	cmp	%i2, 4			! see if we can copy a word
	blu	%ncc, .dbytecp		! if 3 or less bytes use bytecp
	.empty
	!
	! for leftover bytes we fall into wordcp, if needed
	!
.wordcp:
	and	%i2, 3, %i2		! unaligned leftover count
5:
	ld	[%i0+%i1], %o4		! read from address
	st	%o4, [%i1]		! write at destination address
	subcc	%i3, 4, %i3		! dec count
	bgu	%ncc, 5b
	add	%i1, 4, %i1		! delay slot, inc to address
	b,a	.dbytecp

	! we come here to align copies on word boundaries
.alwordcp:
	call	.alignit		! go word-align it
	mov	3, %o0			! bits that must be zero to be aligned
	b	.wordcp
	sub	%i0, %i1, %i0		! i0 gets the difference of src and dst

	!
	! byte copy, works with any alignment
	!
.bytecp:
	b	.dbytecp
	sub	%i0, %i1, %i0		! i0 gets difference of src and dst

	!
	! differenced byte copy, works with any alignment
	! assumes dest in %i1 and (source - dest) in %i0
	!
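	! Worked example (illustrative): with src = 0x2000 and dst =
	! 0x3000, %i0 holds -0x1000 and %i1 walks the destination, so
	! each ldub [%i0+%i1] reads 0x2000, 0x2001, ... while only one
	! pointer needs to be incremented per iteration.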
1:
	stb	%o4, [%i1]		! write to address
	inc	%i1			! inc to address
.dbytecp:
	deccc	%i2			! dec count
	bgeu,a	%ncc, 1b		! loop till done
	ldub	[%i0+%i1], %o4		! read from address
.cpdone:
#if !defined(NIAGARA_IMPL)
	! FPUSED_FLAG will not have been set in any path leading to
	! this point. No need to deal with it.
	btst	BCOPY_FLAG, %o5
	bz,pn	%icc, 2f
	andcc	%o5, BCOPY_FLAG, %o5
	! Here via bcopy. Check to see if the handler was NULL.
	! If so, just return quietly. Otherwise, reset the
	! handler and go home.
	bnz,pn	%ncc, 2f
	nop
	!
	! Null handler.
	!
	ret
	restore	%g0, 0, %o0
	! Here via kcopy or bcopy with a handler.
	! Reset the fault handler.
2:
	membar	#Sync
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
#else	/* NIAGARA_IMPL */
	membar	#Sync				! sync error barrier
	! Restore t_lofault handler, if came here from kcopy().
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
#endif	/* NIAGARA_IMPL */
	ret
	restore	%g0, 0, %o0		! return (0)

/*
 * Common code used to align transfers on word and doubleword
 * boundaries. Aligns source and destination and returns a count
 * of aligned bytes to transfer in %i3
 */
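/*
 * Illustrative note (added): .alignit is reached via "call", so the
 * caller's delay-slot "mov 3, %o0" or "mov 7, %o0" executes before the
 * first byte is examined and supplies the alignment mask; retl then
 * returns to the instruction after that delay slot.
 */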
1:
	inc	%i0			! inc from
	stb	%o4, [%i1]		! write a byte
	inc	%i1			! inc to
	dec	%i2			! dec count
.alignit:
	btst	%o0, %i0		! %o0 is bit mask to check for alignment
	bnz,a	1b
	ldub	[%i0], %o4		! read next byte

	retl
	andn	%i2, %o0, %i3		! return size of aligned bytes
	SET_SIZE(bcopy)

#endif	/* lint */

/*
 * Block copy with possibly overlapped operands.
 */

#if defined(lint)

/*ARGSUSED*/
void
ovbcopy(const void *from, void *to, size_t count)
{}

#else	/* lint */

	ENTRY(ovbcopy)
	tst	%o2			! check count
	bgu,a	%ncc, 1f		! nothing to do or bad arguments
	subcc	%o0, %o1, %o3		! difference of from and to address

	retl				! return
	nop
1:
	bneg,a	%ncc, 2f
	neg	%o3			! if < 0, make it positive
2:	cmp	%o2, %o3		! cmp size and abs(from - to)
	bleu	%ncc, bcopy		! if size <= abs(diff): use bcopy,
	.empty				! no overlap
	cmp	%o0, %o1		! compare from and to addresses
	blu	%ncc, .ov_bkwd		! if from < to, copy backwards
	nop
	!
	! Copy forwards.
	!
.ov_fwd:
	ldub	[%o0], %o3		! read from address
	inc	%o0			! inc from address
	stb	%o3, [%o1]		! write to address
	deccc	%o2			! dec count
	bgu	%ncc, .ov_fwd		! loop till done
	inc	%o1			! inc to address

	retl				! return
	nop
	!
	! Copy backwards.
	!
.ov_bkwd:
	deccc	%o2			! dec count
	ldub	[%o0 + %o2], %o3	! get byte at end of src
	bgu	%ncc, .ov_bkwd		! loop till done
	stb	%o3, [%o1 + %o2]	! delay slot, store at end of dst

	retl				! return
	nop
	SET_SIZE(ovbcopy)

#endif	/* lint */

/*
 * hwblkpagecopy()
 *
 * Copies exactly one page. This routine assumes the caller (ppcopy)
 * has already disabled kernel preemption and has checked
 * use_hw_bcopy.
 */
#ifdef lint
/*ARGSUSED*/
void
hwblkpagecopy(const void *src, void *dst)
{ }
#else	/* lint */
	ENTRY(hwblkpagecopy)
	save	%sp, -SA(MINFRAME), %sp

	! %i0 - source address (arg)
	! %i1 - destination address (arg)
	! %i2 - length of region (not arg)

	set	PAGESIZE, %i2

	/*
	 * Copy exactly one page; PAGESIZE is a multiple of 0x80.
	 */
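	/*
	 * Illustrative numbers (assuming an 8K page): the loop below
	 * runs 64 iterations, each moving 128 bytes as eight quad loads
	 * and sixteen 8-byte block-init stores, prefetching two cache
	 * lines ahead of the loads.
	 */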
	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	prefetch [%i0+0x0], #one_read
	prefetch [%i0+0x40], #one_read
1:
	prefetch [%i0+0x80], #one_read
	prefetch [%i0+0xc0], #one_read
	ldda	[%i0+0x0]%asi, %l0
	ldda	[%i0+0x10]%asi, %l2
	ldda	[%i0+0x20]%asi, %l4
	ldda	[%i0+0x30]%asi, %l6
	stxa	%l0, [%i1+0x0]%asi
	stxa	%l1, [%i1+0x8]%asi
	stxa	%l2, [%i1+0x10]%asi
	stxa	%l3, [%i1+0x18]%asi
	stxa	%l4, [%i1+0x20]%asi
	stxa	%l5, [%i1+0x28]%asi
	stxa	%l6, [%i1+0x30]%asi
	stxa	%l7, [%i1+0x38]%asi
	ldda	[%i0+0x40]%asi, %l0
	ldda	[%i0+0x50]%asi, %l2
	ldda	[%i0+0x60]%asi, %l4
	ldda	[%i0+0x70]%asi, %l6
	stxa	%l0, [%i1+0x40]%asi
	stxa	%l1, [%i1+0x48]%asi
	stxa	%l2, [%i1+0x50]%asi
	stxa	%l3, [%i1+0x58]%asi
	stxa	%l4, [%i1+0x60]%asi
	stxa	%l5, [%i1+0x68]%asi
	stxa	%l6, [%i1+0x70]%asi
	stxa	%l7, [%i1+0x78]%asi

	add	%i0, 0x80, %i0
	subcc	%i2, 0x80, %i2
	bgu,pt	%xcc, 1b
	add	%i1, 0x80, %i1

	membar	#Sync
	ret
	restore	%g0, 0, %o0
	SET_SIZE(hwblkpagecopy)
#endif	/* lint */


/*
 * Transfer data to and from user space -
 * Note that these routines can cause faults
 * It is assumed that the kernel has nothing at
 * less than KERNELBASE in the virtual address space.
 *
 * Note that copyin(9F) and copyout(9F) are part of the
 * DDI/DKI which specifies that they return '-1' on "errors."
 *
 * Sigh.
 *
 * So there are two extremely similar routines - xcopyin() and
 * xcopyout() - which return the errno that we've faithfully computed.
 * This allows other callers (e.g. uiomove(9F)) to work correctly.
 * Given that these are used pretty heavily, we expand the calling
 * sequences inline for all flavours (rather than making wrappers).
 *
 * There are also stub routines for xcopyout_little and xcopyin_little,
 * which currently are intended to handle requests of <= 16 bytes from
 * do_unaligned. Future enhancement to make them handle 8k pages efficiently
 * is left as an exercise...
 */

/*
 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr)
 *
 * General theory of operation:
 *
 * None of the copyops routines grab a window until it's decided that
 * we need to do a HW block copy operation. This saves a window
 * spill/fill when we're called during socket ops. The typical IO
 * path won't cause spill/fill traps.
 *
 * This code uses a set of 4 limits for the maximum size that will
 * be copied given a particular input/output address alignment.
 * The default limits are:
 *
 * single byte aligned - 256 (hw_copy_limit_1)
 * two byte aligned - 512 (hw_copy_limit_2)
 * four byte aligned - 1024 (hw_copy_limit_4)
 * eight byte aligned - 1024 (hw_copy_limit_8)
 *
 * If the value for a particular limit is zero, the copy will be done
 * via the copy loops rather than block store/quad load instructions.
 *
 * Flow:
 *
 * If count == zero return zero.
 *
 * Store the previous lo_fault handler into %g6.
 * Place our secondary lofault handler into %g5.
 * Place the address of our nowindow fault handler into %o3.
 * Place the address of the windowed fault handler into %o4.
 * --> We'll use this handler if we end up grabbing a window
 * --> before we use block initializing store and quad load ASIs
 *
 * If count is less than or equal to SMALL_LIMIT (7) we
 * always do a byte for byte copy.
 *
 * If count is > SMALL_LIMIT, we check the alignment of the input
 * and output pointers. Based on the alignment we check count
 * against a limit based on detected alignment. If we exceed the
 * alignment value we copy via block initializing store and quad
 * load instructions.
 *
 * If we don't exceed one of the limits, we store -count in %o3,
 * we store the number of chunks (8, 4, 2 or 1 byte) operated
 * on in our basic copy loop in %o2. Following this we branch
 * to the appropriate copy loop and copy that many chunks.
 * Since we've been adding the chunk size to %o3 each time through
 * as well as decrementing %o2, we can tell if any data is
 * left to be copied by examining %o3. If that is zero, we're
 * done and can go home. If not, we figure out what the largest
 * chunk size left to be copied is and branch to that copy loop
 * unless there's only one byte left. We load that as we're
 * branching to code that stores it just before we return.
 *
 * Fault handlers are invoked if we reference memory that has no
 * current mapping. All forms share the same copyio_fault handler.
 * This routine handles fixing up the stack and general housecleaning.
 * Each copy operation has a simple fault handler that is then called
 * to do the work specific to the individual operation. The handlers
 * for copyOP and xcopyOP are found at the end of each individual
 * function. The handlers for xcopyOP_little are found at the end of
 * xcopyin_little. The handlers for copyOP_noerr are found at the end
 * of copyin_noerr.
 */
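/*
 * Worked example of the dispatch described above (illustrative): a
 * 600-byte request whose addresses are 2-byte (but not 4-byte) aligned
 * is checked against hw_copy_limit_2, default 512; 600 > 512, so it
 * takes the block-copy path, while a 400-byte request at the same
 * alignment would use the 2-byte copy loop.
 */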

/*
 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little).
 */

#if defined(lint)

/*ARGSUSED*/
int
copyout(const void *kaddr, void *uaddr, size_t count)
{ return (0); }

#else	/* lint */

/*
 * We save the arguments in the following registers in case of a fault:
 * kaddr - %g2
 * uaddr - %g3
 * count - %g4
 */
#define	SAVE_SRC	%g2
#define	SAVE_DST	%g3
#define	SAVE_COUNT	%g4

#define	REAL_LOFAULT	%g5
#define	SAVED_LOFAULT	%g6

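/*
 * Note (added commentary): globals are used here precisely so the saved
 * arguments survive the "restore" in copyio_fault below; the handler
 * unwinds the register window first and then rebuilds %o0/%o1/%o2 from
 * SAVE_SRC/SAVE_DST/SAVE_COUNT before jumping to REAL_LOFAULT.
 */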
/*
 * Generic copyio fault handler. This is the first line of defense when a
 * fault occurs in (x)copyin/(x)copyout. In order for this to function
 * properly, the value of the 'real' lofault handler should be in
 * REAL_LOFAULT. This allows us to share common code for all the flavors
 * of the copy operations, including the _noerr versions.
 *
 * Note that this function will restore the original input parameters before
 * calling REAL_LOFAULT. So the real handler can vector to the appropriate
 * member of the t_copyop structure, if needed.
 */
	ENTRY(copyio_fault)
#if !defined(NIAGARA_IMPL)
	btst	FPUSED_FLAG, SAVED_LOFAULT
	bz	1f
	andn	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT

	ld	[%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2
	wr	%o2, 0, %gsr		! restore gsr

	ld	[%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3
	btst	FPRS_FEF, %o3
	bz	%icc, 4f
	nop

	! restore fpregs from stack
	BLD_FP_FROMSTACK(%o2)

	ba,pt	%ncc, 1f
	wr	%o3, 0, %fprs		! restore fprs

4:
	FZERO				! zero all of the fpregs
	wr	%o3, 0, %fprs		! restore fprs

1:
#else	/* NIAGARA_IMPL */
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
#endif	/* NIAGARA_IMPL */

	restore

	mov	SAVE_SRC, %o0
	mov	SAVE_DST, %o1
	jmp	REAL_LOFAULT
	mov	SAVE_COUNT, %o2
	SET_SIZE(copyio_fault)

	ENTRY(copyio_fault_nowindow)
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault

	mov	SAVE_SRC, %o0
	mov	SAVE_DST, %o1
	jmp	REAL_LOFAULT
	mov	SAVE_COUNT, %o2
	SET_SIZE(copyio_fault_nowindow)

	ENTRY(copyout)
	sethi	%hi(.copyout_err), REAL_LOFAULT
	or	REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT

.do_copyout:
	!
	! Check the length and bail if zero.
	!
	tst	%o2
	bnz,pt	%ncc, 1f
	nop
	retl
	clr	%o0
1:
	sethi	%hi(copyio_fault), %o4
	or	%o4, %lo(copyio_fault), %o4
	sethi	%hi(copyio_fault_nowindow), %o3
	ldn	[THREAD_REG + T_LOFAULT], SAVED_LOFAULT
	or	%o3, %lo(copyio_fault_nowindow), %o3
	membar	#Sync
	stn	%o3, [THREAD_REG + T_LOFAULT]

	mov	%o0, SAVE_SRC
	mov	%o1, SAVE_DST
	mov	%o2, SAVE_COUNT

	!
	! Check to see if we're more than SMALL_LIMIT (7 bytes).
	! Run in leaf mode, using the %o regs as our input regs.
	!
	subcc	%o2, SMALL_LIMIT, %o3
	bgu,a,pt %ncc, .dco_ns
	or	%o0, %o1, %o3
	!
	! What was previously ".small_copyout"
	! Do full differenced copy.
	!
.dcobcp:
	sub	%g0, %o2, %o3		! negate count
	add	%o0, %o2, %o0		! make %o0 point at the end
	add	%o1, %o2, %o1		! make %o1 point at the end
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load first byte
	!
	! %o0 and %o1 point at the end and remain pointing at the end
	! of their buffers. We pull things out by adding %o3 (which is
	! the negation of the length) to the buffer end which gives us
	! the current location in the buffers. By incrementing %o3 we walk
	! through both buffers without having to bump each buffer's
	! pointer. A very fast 4 instruction loop.
	!
	.align 16
.dcocl:
	stba	%o4, [%o1 + %o3]ASI_USER
	inccc	%o3
	bl,a,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4
	!
	! We're done. Go home.
	!
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]
	retl
	clr	%o0
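	!
	! Worked example (illustrative): for a 3-byte copy, %o0/%o1 are
	! advanced past their ends and %o3 starts at -3; the loop above
	! stores at end - 3, end - 2 and end - 1, and exits when %o3
	! increments to zero.
	!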
	!
	! Try aligned copies from here.
	!
.dco_ns:
	! %o0 = kernel addr (to be copied from)
	! %o1 = user addr (to be copied to)
	! %o2 = length
	! %o3 = %o0 | %o1 (used for alignment checking)
	! %o4 is alternate lo_fault
	! %o5 is original lo_fault
	!
	! See if we're single byte aligned. If we are, check the
	! limit for single byte copies. If we're smaller or equal,
	! bounce to the byte for byte copy loop. Otherwise do it in
	! HW (if enabled).
	!
	btst	1, %o3
	bz,pt	%icc, .dcoh8
	btst	7, %o3
	!
	! Single byte aligned. Do we do it via HW or via
	! byte for byte? Do a quick no memory reference
	! check to pick up small copies.
	!
	sethi	%hi(hw_copy_limit_1), %o3
	!
	! Big enough that we need to check the HW limit for
	! this size copy.
	!
	ld	[%o3 + %lo(hw_copy_limit_1)], %o3
	!
	! Is HW copy on? If not, do everything byte for byte.
	!
	tst	%o3
	bz,pn	%icc, .dcobcp
	subcc	%o3, %o2, %o3
	!
	! If we're less than or equal to the single byte copy limit,
	! bop to the copy loop.
	!
	bge,pt	%ncc, .dcobcp
	nop
	!
	! We're big enough and copy is on. Do it with HW.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcoh8:
	!
	! 8 byte aligned?
	!
	bnz,a	%ncc, .dcoh4
	btst	3, %o3
	!
	! See if we're in the "small range".
	! If so, go off and do the copy.
	! If not, load the hard limit. %o3 is
	! available for reuse.
	!
	sethi	%hi(hw_copy_limit_8), %o3
	ld	[%o3 + %lo(hw_copy_limit_8)], %o3
	!
	! If it's zero, there's no HW bcopy.
	! Bop off to the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcos8
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is larger than hw_copy_limit_8.
	!
	bge,pt	%ncc, .dcos8
	nop
	!
	! HW assist is on and we're large enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos8:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte for
	! byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodebc
	srl	%o2, 3, %o2		! Number of 8 byte chunks to copy
	!
	! 4 byte aligned?
	!
.dcoh4:
	bnz,pn	%ncc, .dcoh2
	!
	! See if we're in the "small range".
	! If so, go off and do the copy.
	! If not, load the hard limit. %o3 is
	! available for reuse.
	!
	sethi	%hi(hw_copy_limit_4), %o3
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	!
	! If it's zero, there's no HW bcopy.
	! Bop off to the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcos4
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is larger than hw_copy_limit_4.
	!
	bge,pt	%ncc, .dcos4
	nop
	!
	! HW assist is on and we're large enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos4:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodfbc
	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
	!
	! We must be 2 byte aligned. Off we go.
	! The check for small copies was done in the
	! delay at .dcoh4
	!
.dcoh2:
	ble	%ncc, .dcos2
	sethi	%hi(hw_copy_limit_2), %o3
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	tst	%o3
	bz,pn	%icc, .dcos2
	subcc	%o3, %o2, %o3
	bge,pt	%ncc, .dcos2
	nop
	!
	! HW is on and we're big enough. Do it.
	!
	ba,pt	%ncc, .big_copyout
	nop
.dcos2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .dodtbc
	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
.small_copyout:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big_copyout that will cause us to forego the HW assisted copies
	! and bounce back to a non-HW assisted copy. This dispatches those
	! copies. Note that we branch around this in the main line code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcobcp		! Most likely
	btst	7, %o3
	bz	%icc, .dcos8
	btst	3, %o3
	bz	%icc, .dcos4
	nop
	ba,pt	%ncc, .dcos2
	nop
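	!
	! Bookkeeping note (added commentary): in the loops below %o2
	! counts chunks while %o3 climbs from -count toward zero; when a
	! chunk loop exits, the condition codes set by the final addcc on
	! %o3 tell whether a sub-chunk residue remains for the byte loop.
	!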
.small_copyout:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big_copyout that will cause us to forego the HW assisted copies
	! and bounce back to a non-HW assisted copy. This dispatches those
	! copies. Note that we branch around this in the main line code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcobcp		! Most likely
	btst	7, %o3
	bz	%icc, .dcos8
	btst	3, %o3
	bz	%icc, .dcos4
	nop
	ba,pt	%ncc, .dcos2
	nop
	.align 32
.dodebc:
	ldx	[%o0 + %o3], %o4
	deccc	%o2
	stxa	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! eight byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left - do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
	!
	! Four byte copy loop. %o2 is the number of 4 byte chunks to copy.
	!
	.align 32
.dodfbc:
	lduw	[%o0 + %o3], %o4
	deccc	%o2
	sta	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Check to see if we're done. Most
	! four byte aligned copies end here.
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcocl
	ldub	[%o0 + %o3], %o4	! load next byte
	!
	! Two byte aligned copy loop. %o2 is the number of 2 byte chunks to
	! copy.
	!
	.align 32
.dodtbc:
	lduh	[%o0 + %o3], %o4
	deccc	%o2
	stha	%o4, [%o1 + %o3]ASI_USER
	bg,pt	%ncc, .dodtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Anything left?
	!
	bz,pt	%ncc, .dcofh
	nop
	!
	! Deal with the last byte
	!
	ldub	[%o0 + %o3], %o4
	stba	%o4, [%o1 + %o3]ASI_USER
.dcofh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

.big_copyout:
	! We're going to go off and do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	stn	%o4, [THREAD_REG + T_LOFAULT]

	! Copyout requests that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to less
	! than 128 bytes.
#if !defined(NIAGARA_IMPL)
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_block_copyout
	wr	%g0, FPRS_FEF, %fprs

	! save in-use fpregs on stack
	BST_FP_TOSTACK(%o2)
#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
#endif	/* NIAGARA_IMPL */

.do_block_copyout:

#if !defined(NIAGARA_IMPL)
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	! Set the lower bit of the saved t_lofault to indicate that we
	! need to clear the %fprs register on the way out.
	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
#endif	/* NIAGARA_IMPL */

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst block aligned
	bz	%ncc, copyout_blalign	! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .co_alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .co_alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .co_alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stba	%o2, [%i0]ASI_USER
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	copyout_blalign
	nop

	!
dst & src 4B aligned 2182.co_alwdcp: 2183 ld [%i1], %o2 2184 sta %o2, [%i0]ASI_USER 2185 add %i1, 0x4, %i1 2186 subcc %i3, 0x4, %i3 2187 bgu,pt %ncc, .co_alwdcp 2188 add %i0, 0x4, %i0 2189 2190 ba copyout_blalign 2191 nop 2192 2193 ! dst & src 2B aligned 2194.co_alhlfwdcp: 2195 lduh [%i1], %o2 2196 stuha %o2, [%i0]ASI_USER 2197 add %i1, 0x2, %i1 2198 subcc %i3, 0x2, %i3 2199 bgu,pt %ncc, .co_alhlfwdcp 2200 add %i0, 0x2, %i0 2201 2202 ba copyout_blalign 2203 nop 2204 2205 ! dst & src 8B aligned 2206.co_alewdcp: 2207 ldx [%i1], %o2 2208 stxa %o2, [%i0]ASI_USER 2209 add %i1, 0x8, %i1 2210 subcc %i3, 0x8, %i3 2211 bgu,pt %ncc, .co_alewdcp 2212 add %i0, 0x8, %i0 2213 2214 ! Now Destination is block (64 bytes) aligned 2215copyout_blalign: 2216 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 2217 sub %i2, %i3, %i2 ! Residue bytes in %i2 2218 2219 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi 2220 2221#if !defined(NIAGARA_IMPL) 2222 andn %i1, 0x3f, %l0 ! %l0 has block aligned src address 2223 prefetch [%l0+0x0], #one_read 2224 andcc %i1, 0x3f, %g0 ! is src 64B aligned 2225 bz,pn %ncc, .co_blkcpy 2226 nop 2227 2228 ! handle misaligned source cases 2229 alignaddr %i1, %g0, %g0 ! generate %gsr 2230 2231 srl %i1, 0x3, %l1 ! src add bits 3, 4, 5 are now least 2232 ! significant in %l1 2233 andcc %l1, 0x7, %l2 ! mask everything except bits 1, 2, 3 2234 add %i1, %i3, %i1 2235 2236 ! switch statement to get to right 8 byte block within 2237 ! 64 byte block 2238 cmp %l2, 0x4 2239 bgeu,a co_hlf 2240 cmp %l2, 0x6 2241 cmp %l2, 0x2 2242 bgeu,a co_sqtr 2243 nop 2244 cmp %l2, 0x1 2245 be,a co_off15 2246 nop 2247 ba co_off7 2248 nop 2249co_sqtr: 2250 be,a co_off23 2251 nop 2252 ba,a co_off31 2253 nop 2254 2255co_hlf: 2256 bgeu,a co_fqtr 2257 nop 2258 cmp %l2, 0x5 2259 be,a co_off47 2260 nop 2261 ba co_off39 2262 nop 2263co_fqtr: 2264 be,a co_off55 2265 nop 2266 2267 ldd [%l0+0x38], %d14 2268 prefetch [%l0+0x40], #one_read 2269 prefetch [%l0+0x80], #one_read 22707: 2271 add %l0, 0x40, %l0 2272 stxa %g0, [%i0]%asi ! initialize the cache line 2273 2274 ldda [%l0]ASI_BLK_P, %d16 2275 ALIGN_OFF_56_63 2276 fmovd %d30, %d14 2277 2278 stda %d48, [%i0]ASI_BLK_AIUS 2279 subcc %i3, 0x40, %i3 2280 add %i0, 0x40, %i0 2281 bgu,pt %ncc, 7b 2282 prefetch [%l0+0x80], #one_read 2283 ba .co_blkdone 2284 membar #Sync 2285 2286co_off7: 2287 ldda [%l0]ASI_BLK_P, %d0 2288 prefetch [%l0+0x40], #one_read 2289 prefetch [%l0+0x80], #one_read 22900: 2291 add %l0, 0x40, %l0 2292 stxa %g0, [%i0]%asi ! initialize the cache line 2293 2294 ldda [%l0]ASI_BLK_P, %d16 2295 ALIGN_OFF_1_7 2296 fmovd %d16, %d0 2297 fmovd %d18, %d2 2298 fmovd %d20, %d4 2299 fmovd %d22, %d6 2300 fmovd %d24, %d8 2301 fmovd %d26, %d10 2302 fmovd %d28, %d12 2303 fmovd %d30, %d14 2304 2305 stda %d48, [%i0]ASI_BLK_AIUS 2306 subcc %i3, 0x40, %i3 2307 add %i0, 0x40, %i0 2308 bgu,pt %ncc, 0b 2309 prefetch [%l0+0x80], #one_read 2310 ba .co_blkdone 2311 membar #Sync 2312 2313co_off15: 2314 ldd [%l0+0x8], %d2 2315 ldd [%l0+0x10], %d4 2316 ldd [%l0+0x18], %d6 2317 ldd [%l0+0x20], %d8 2318 ldd [%l0+0x28], %d10 2319 ldd [%l0+0x30], %d12 2320 ldd [%l0+0x38], %d14 2321 prefetch [%l0+0x40], #one_read 2322 prefetch [%l0+0x80], #one_read 23231: 2324 add %l0, 0x40, %l0 2325 stxa %g0, [%i0]%asi ! 
initialize the cache line 2326 2327 ldda [%l0]ASI_BLK_P, %d16 2328 ALIGN_OFF_8_15 2329 fmovd %d18, %d2 2330 fmovd %d20, %d4 2331 fmovd %d22, %d6 2332 fmovd %d24, %d8 2333 fmovd %d26, %d10 2334 fmovd %d28, %d12 2335 fmovd %d30, %d14 2336 2337 stda %d48, [%i0]ASI_BLK_AIUS 2338 subcc %i3, 0x40, %i3 2339 add %i0, 0x40, %i0 2340 bgu,pt %ncc, 1b 2341 prefetch [%l0+0x80], #one_read 2342 ba .co_blkdone 2343 membar #Sync 2344 2345co_off23: 2346 ldd [%l0+0x10], %d4 2347 ldd [%l0+0x18], %d6 2348 ldd [%l0+0x20], %d8 2349 ldd [%l0+0x28], %d10 2350 ldd [%l0+0x30], %d12 2351 ldd [%l0+0x38], %d14 2352 prefetch [%l0+0x40], #one_read 2353 prefetch [%l0+0x80], #one_read 23542: 2355 add %l0, 0x40, %l0 2356 stxa %g0, [%i0]%asi ! initialize the cache line 2357 2358 ldda [%l0]ASI_BLK_P, %d16 2359 ALIGN_OFF_16_23 2360 fmovd %d20, %d4 2361 fmovd %d22, %d6 2362 fmovd %d24, %d8 2363 fmovd %d26, %d10 2364 fmovd %d28, %d12 2365 fmovd %d30, %d14 2366 2367 stda %d48, [%i0]ASI_BLK_AIUS 2368 subcc %i3, 0x40, %i3 2369 add %i0, 0x40, %i0 2370 bgu,pt %ncc, 2b 2371 prefetch [%l0+0x80], #one_read 2372 ba .co_blkdone 2373 membar #Sync 2374 2375co_off31: 2376 ldd [%l0+0x18], %d6 2377 ldd [%l0+0x20], %d8 2378 ldd [%l0+0x28], %d10 2379 ldd [%l0+0x30], %d12 2380 ldd [%l0+0x38], %d14 2381 prefetch [%l0+0x40], #one_read 2382 prefetch [%l0+0x80], #one_read 23833: 2384 add %l0, 0x40, %l0 2385 stxa %g0, [%i0]%asi ! initialize the cache line 2386 2387 ldda [%l0]ASI_BLK_P, %d16 2388 ALIGN_OFF_24_31 2389 fmovd %d22, %d6 2390 fmovd %d24, %d8 2391 fmovd %d26, %d10 2392 fmovd %d28, %d12 2393 fmovd %d30, %d14 2394 2395 stda %d48, [%i0]ASI_BLK_AIUS 2396 subcc %i3, 0x40, %i3 2397 add %i0, 0x40, %i0 2398 bgu,pt %ncc, 3b 2399 prefetch [%l0+0x80], #one_read 2400 ba .co_blkdone 2401 membar #Sync 2402 2403co_off39: 2404 ldd [%l0+0x20], %d8 2405 ldd [%l0+0x28], %d10 2406 ldd [%l0+0x30], %d12 2407 ldd [%l0+0x38], %d14 2408 prefetch [%l0+0x40], #one_read 2409 prefetch [%l0+0x80], #one_read 24104: 2411 add %l0, 0x40, %l0 2412 stxa %g0, [%i0]%asi ! initialize the cache line 2413 2414 ldda [%l0]ASI_BLK_P, %d16 2415 ALIGN_OFF_32_39 2416 fmovd %d24, %d8 2417 fmovd %d26, %d10 2418 fmovd %d28, %d12 2419 fmovd %d30, %d14 2420 2421 stda %d48, [%i0]ASI_BLK_AIUS 2422 subcc %i3, 0x40, %i3 2423 add %i0, 0x40, %i0 2424 bgu,pt %ncc, 4b 2425 prefetch [%l0+0x80], #one_read 2426 ba .co_blkdone 2427 membar #Sync 2428 2429co_off47: 2430 ldd [%l0+0x28], %d10 2431 ldd [%l0+0x30], %d12 2432 ldd [%l0+0x38], %d14 2433 prefetch [%l0+0x40], #one_read 2434 prefetch [%l0+0x80], #one_read 24355: 2436 add %l0, 0x40, %l0 2437 stxa %g0, [%i0]%asi ! initialize the cache line 2438 2439 ldda [%l0]ASI_BLK_P, %d16 2440 ALIGN_OFF_40_47 2441 fmovd %d26, %d10 2442 fmovd %d28, %d12 2443 fmovd %d30, %d14 2444 2445 stda %d48, [%i0]ASI_BLK_AIUS 2446 subcc %i3, 0x40, %i3 2447 add %i0, 0x40, %i0 2448 bgu,pt %ncc, 5b 2449 prefetch [%l0+0x80], #one_read 2450 ba .co_blkdone 2451 membar #Sync 2452 2453co_off55: 2454 ldd [%l0+0x30], %d12 2455 ldd [%l0+0x38], %d14 2456 prefetch [%l0+0x40], #one_read 2457 prefetch [%l0+0x80], #one_read 24586: 2459 add %l0, 0x40, %l0 2460 stxa %g0, [%i0]%asi ! 
initialize the cache line 2461 2462 ldda [%l0]ASI_BLK_P, %d16 2463 ALIGN_OFF_48_55 2464 fmovd %d28, %d12 2465 fmovd %d30, %d14 2466 2467 stda %d48, [%i0]ASI_BLK_AIUS 2468 subcc %i3, 0x40, %i3 2469 add %i0, 0x40, %i0 2470 bgu,pt %ncc, 6b 2471 prefetch [%l0+0x80], #one_read 2472 ba .co_blkdone 2473 membar #Sync 2474 2475.co_blkcpy: 2476 prefetch [%i1+0x40], #one_read 2477 prefetch [%i1+0x80], #one_read 24788: 2479 stxa %g0, [%i0]%asi ! initialize the cache line 2480 ldda [%i1]ASI_BLK_P, %d0 2481 stda %d0, [%i0]ASI_BLK_AIUS 2482 2483 add %i1, 0x40, %i1 2484 subcc %i3, 0x40, %i3 2485 add %i0, 0x40, %i0 2486 bgu,pt %ncc, 8b 2487 prefetch [%i1+0x80], #one_read 2488 membar #Sync 2489 2490.co_blkdone: 2491#else /* NIAGARA_IMPL */ 2492 andcc %i1, 0xf, %o2 ! is src quadword aligned 2493 bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits) 2494 nop 2495 cmp %o2, 0x8 2496 bg .co_upper_double 2497 nop 2498 bl .co_lower_double 2499 nop 2500 2501 ! Falls through when source offset is equal to 8 i.e. 2502 ! source is double word aligned. 2503 ! In this case no shift/merge of data is required 2504 2505 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2506 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2507 prefetch [%l0+0x0], #one_read 2508 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2509.co_loop0: 2510 add %i1, 0x10, %i1 2511 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 2512 prefetch [%l0+0x40], #one_read 2513 2514 stxa %l3, [%i0+0x0]%asi 2515 stxa %l4, [%i0+0x8]%asi 2516 2517 add %i1, 0x10, %i1 2518 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2519 2520 stxa %l5, [%i0+0x10]%asi 2521 stxa %l2, [%i0+0x18]%asi 2522 2523 add %i1, 0x10, %i1 2524 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 2525 2526 stxa %l3, [%i0+0x20]%asi 2527 stxa %l4, [%i0+0x28]%asi 2528 2529 add %i1, 0x10, %i1 2530 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2531 2532 stxa %l5, [%i0+0x30]%asi 2533 stxa %l2, [%i0+0x38]%asi 2534 2535 add %l0, 0x40, %l0 2536 subcc %i3, 0x40, %i3 2537 bgu,pt %xcc, .co_loop0 2538 add %i0, 0x40, %i0 2539 ba .co_blkdone 2540 add %i1, %o2, %i1 ! increment the source by src offset 2541 ! the src offset was stored in %o2 2542 2543.co_lower_double: 2544 2545 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2546 sll %o2, 3, %o0 ! %o0 left shift 2547 mov 0x40, %o1 2548 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2549 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2550 prefetch [%l0+0x0], #one_read 2551 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has 2552 ! complete data 2553.co_loop1: 2554 add %i1, 0x10, %i1 2555 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data 2556 ! for this read. 2557 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 2558 ! into %l2 and %l3 2559 prefetch [%l0+0x40], #one_read 2560 2561 stxa %l2, [%i0+0x0]%asi 2562 stxa %l3, [%i0+0x8]%asi 2563 2564 add %i1, 0x10, %i1 2565 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2566 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 2567 ! %l4 from previous read 2568 ! into %l4 and %l5 2569 stxa %l4, [%i0+0x10]%asi 2570 stxa %l5, [%i0+0x18]%asi 2571 2572 ! Repeat the same for next 32 bytes. 
2573 2574 add %i1, 0x10, %i1 2575 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 2576 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 2577 2578 stxa %l2, [%i0+0x20]%asi 2579 stxa %l3, [%i0+0x28]%asi 2580 2581 add %i1, 0x10, %i1 2582 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2583 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 2584 2585 stxa %l4, [%i0+0x30]%asi 2586 stxa %l5, [%i0+0x38]%asi 2587 2588 add %l0, 0x40, %l0 2589 subcc %i3, 0x40, %i3 2590 bgu,pt %xcc, .co_loop1 2591 add %i0, 0x40, %i0 2592 ba .co_blkdone 2593 add %i1, %o2, %i1 ! increment the source by src offset 2594 ! the src offset was stored in %o2 2595 2596.co_upper_double: 2597 2598 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2599 sub %o2, 0x8, %o0 2600 sll %o0, 3, %o0 ! %o0 left shift 2601 mov 0x40, %o1 2602 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2603 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2604 prefetch [%l0+0x0], #one_read 2605 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3 2606 ! for this read and 2607 ! no data in %l2 2608.co_loop2: 2609 add %i1, 0x10, %i1 2610 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data 2611 ! and %l5 has partial 2612 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 2613 ! into %l3 and %l4 2614 prefetch [%l0+0x40], #one_read 2615 2616 stxa %l3, [%i0+0x0]%asi 2617 stxa %l4, [%i0+0x8]%asi 2618 2619 add %i1, 0x10, %i1 2620 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2621 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 2622 ! %l5 from previous read 2623 ! into %l5 and %l2 2624 2625 stxa %l5, [%i0+0x10]%asi 2626 stxa %l2, [%i0+0x18]%asi 2627 2628 ! Repeat the same for next 32 bytes. 2629 2630 add %i1, 0x10, %i1 2631 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 2632 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 2633 2634 stxa %l3, [%i0+0x20]%asi 2635 stxa %l4, [%i0+0x28]%asi 2636 2637 add %i1, 0x10, %i1 2638 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2639 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 2640 2641 stxa %l5, [%i0+0x30]%asi 2642 stxa %l2, [%i0+0x38]%asi 2643 2644 add %l0, 0x40, %l0 2645 subcc %i3, 0x40, %i3 2646 bgu,pt %xcc, .co_loop2 2647 add %i0, 0x40, %i0 2648 ba .co_blkdone 2649 add %i1, %o2, %i1 ! increment the source by src offset 2650 ! the src offset was stored in %o2 2651 2652 2653 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2654.co_blkcpy: 2655 2656 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 2657 prefetch [%o0+0x0], #one_read 26581: 2659 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0 2660 add %i1, 0x10, %i1 2661 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 2662 add %i1, 0x10, %i1 2663 2664 prefetch [%o0+0x40], #one_read 2665 2666 stxa %l0, [%i0+0x0]%asi 2667 2668 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 2669 add %i1, 0x10, %i1 2670 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6 2671 add %i1, 0x10, %i1 2672 2673 stxa %l1, [%i0+0x8]%asi 2674 stxa %l2, [%i0+0x10]%asi 2675 stxa %l3, [%i0+0x18]%asi 2676 stxa %l4, [%i0+0x20]%asi 2677 stxa %l5, [%i0+0x28]%asi 2678 stxa %l6, [%i0+0x30]%asi 2679 stxa %l7, [%i0+0x38]%asi 2680 2681 add %o0, 0x40, %o0 2682 subcc %i3, 0x40, %i3 2683 bgu,pt %xcc, 1b 2684 add %i0, 0x40, %i0 2685 2686.co_blkdone: 2687 membar #Sync 2688#endif /* NIAGARA_IMPL */ 2689 2690 brz,pt %i2, .copyout_exit 2691 nop 2692 2693 ! Handle trailing bytes 2694 cmp %i2, 0x8 2695 blu,pt %ncc, .co_residue 2696 nop 2697 2698 ! Can we do some 8B ops 2699 or %i1, %i0, %o2 2700 andcc %o2, 0x7, %g0 2701 bnz %ncc, .co_last4 2702 nop 2703 2704 ! 
Do 8byte ops as long as possible 2705.co_last8: 2706 ldx [%i1], %o2 2707 stxa %o2, [%i0]ASI_USER 2708 add %i1, 0x8, %i1 2709 sub %i2, 0x8, %i2 2710 cmp %i2, 0x8 2711 bgu,pt %ncc, .co_last8 2712 add %i0, 0x8, %i0 2713 2714 brz,pt %i2, .copyout_exit 2715 nop 2716 2717 ba .co_residue 2718 nop 2719 2720.co_last4: 2721 ! Can we do 4B ops 2722 andcc %o2, 0x3, %g0 2723 bnz %ncc, .co_last2 2724 nop 27251: 2726 ld [%i1], %o2 2727 sta %o2, [%i0]ASI_USER 2728 add %i1, 0x4, %i1 2729 sub %i2, 0x4, %i2 2730 cmp %i2, 0x4 2731 bgu,pt %ncc, 1b 2732 add %i0, 0x4, %i0 2733 2734 brz,pt %i2, .copyout_exit 2735 nop 2736 2737 ba .co_residue 2738 nop 2739 2740.co_last2: 2741 ! Can we do 2B ops 2742 andcc %o2, 0x1, %g0 2743 bnz %ncc, .co_residue 2744 nop 2745 27461: 2747 lduh [%i1], %o2 2748 stuha %o2, [%i0]ASI_USER 2749 add %i1, 0x2, %i1 2750 sub %i2, 0x2, %i2 2751 cmp %i2, 0x2 2752 bgu,pt %ncc, 1b 2753 add %i0, 0x2, %i0 2754 2755 brz,pt %i2, .copyout_exit 2756 nop 2757 2758 ! Copy the residue as byte copy 2759.co_residue: 2760 ldub [%i1], %i4 2761 stba %i4, [%i0]ASI_USER 2762 inc %i1 2763 deccc %i2 2764 bgu,pt %xcc, .co_residue 2765 inc %i0 2766 2767.copyout_exit: 2768#if !defined(NIAGARA_IMPL) 2769 ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 2770 wr %o2, 0, %gsr ! restore gsr 2771 2772 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 2773 btst FPRS_FEF, %o3 2774 bz %icc, 4f 2775 nop 2776 2777 ! restore fpregs from stack 2778 BLD_FP_FROMSTACK(%o2) 2779 2780 ba,pt %ncc, 2f 2781 wr %o3, 0, %fprs ! restore fprs 2782 27834: 2784 FZERO ! zero all of the fpregs 2785 wr %o3, 0, %fprs ! restore fprs 2786 27872: 2788 membar #Sync 2789 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 2790#else /* NIAGARA_IMPL */ 2791 membar #Sync 2792#endif /* NIAGARA_IMPL */ 2793 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2794 ret 2795 restore %g0, 0, %o0 2796 2797.copyout_err: 2798 ldn [THREAD_REG + T_COPYOPS], %o4 2799 brz %o4, 2f 2800 nop 2801 ldn [%o4 + CP_COPYOUT], %g2 2802 jmp %g2 2803 nop 28042: 2805 retl 2806 mov -1, %o0 2807 SET_SIZE(copyout) 2808 2809#endif /* lint */ 2810 2811 2812#ifdef lint 2813 2814/*ARGSUSED*/ 2815int 2816xcopyout(const void *kaddr, void *uaddr, size_t count) 2817{ return (0); } 2818 2819#else /* lint */ 2820 2821 ENTRY(xcopyout) 2822 sethi %hi(.xcopyout_err), REAL_LOFAULT 2823 b .do_copyout 2824 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 2825.xcopyout_err: 2826 ldn [THREAD_REG + T_COPYOPS], %o4 2827 brz %o4, 2f 2828 nop 2829 ldn [%o4 + CP_XCOPYOUT], %g2 2830 jmp %g2 2831 nop 28322: 2833 retl 2834 mov %g1, %o0 2835 SET_SIZE(xcopyout) 2836 2837#endif /* lint */ 2838 2839#ifdef lint 2840 2841/*ARGSUSED*/ 2842int 2843xcopyout_little(const void *kaddr, void *uaddr, size_t count) 2844{ return (0); } 2845 2846#else /* lint */ 2847 2848 ENTRY(xcopyout_little) 2849 sethi %hi(.little_err), %o4 2850 ldn [THREAD_REG + T_LOFAULT], %o5 2851 or %o4, %lo(.little_err), %o4 2852 membar #Sync ! sync error barrier 2853 stn %o4, [THREAD_REG + T_LOFAULT] 2854 2855 subcc %g0, %o2, %o3 2856 add %o0, %o2, %o0 2857 bz,pn %ncc, 2f ! check for zero bytes 2858 sub %o2, 1, %o4 2859 add %o0, %o4, %o0 ! start w/last byte 2860 add %o1, %o2, %o1 2861 ldub [%o0+%o3], %o4 2862 28631: stba %o4, [%o1+%o3]ASI_AIUSL 2864 inccc %o3 2865 sub %o0, 2, %o0 ! get next byte 2866 bcc,a,pt %ncc, 1b 2867 ldub [%o0+%o3], %o4 2868 28692: membar #Sync ! sync error barrier 2870 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2871 retl 2872 mov %g0, %o0 ! 
return (0) 2873 SET_SIZE(xcopyout_little) 2874 2875#endif /* lint */ 2876 2877/* 2878 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 2879 */ 2880 2881#if defined(lint) 2882 2883/*ARGSUSED*/ 2884int 2885copyin(const void *uaddr, void *kaddr, size_t count) 2886{ return (0); } 2887 2888#else /* lint */ 2889 2890 ENTRY(copyin) 2891 sethi %hi(.copyin_err), REAL_LOFAULT 2892 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT 2893 2894.do_copyin: 2895 ! 2896 ! Check the length and bail if zero. 2897 ! 2898 tst %o2 2899 bnz,pt %ncc, 1f 2900 nop 2901 retl 2902 clr %o0 29031: 2904 sethi %hi(copyio_fault), %o4 2905 or %o4, %lo(copyio_fault), %o4 2906 sethi %hi(copyio_fault_nowindow), %o3 2907 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 2908 or %o3, %lo(copyio_fault_nowindow), %o3 2909 membar #Sync 2910 stn %o3, [THREAD_REG + T_LOFAULT] 2911 2912 mov %o0, SAVE_SRC 2913 mov %o1, SAVE_DST 2914 mov %o2, SAVE_COUNT 2915 2916 ! 2917 ! Check to see if we're more than SMALL_LIMIT. 2918 ! 2919 subcc %o2, SMALL_LIMIT, %o3 2920 bgu,a,pt %ncc, .dci_ns 2921 or %o0, %o1, %o3 2922 ! 2923 ! What was previously ".small_copyin" 2924 ! 2925.dcibcp: 2926 sub %g0, %o2, %o3 ! setup for copy loop 2927 add %o0, %o2, %o0 2928 add %o1, %o2, %o1 2929 ba,pt %ncc, .dcicl 2930 lduba [%o0 + %o3]ASI_USER, %o4 2931 ! 2932 ! %o0 and %o1 point at the end and remain pointing at the end 2933 ! of their buffers. We pull things out by adding %o3 (which is 2934 ! the negation of the length) to the buffer end which gives us 2935 ! the curent location in the buffers. By incrementing %o3 we walk 2936 ! through both buffers without having to bump each buffer's 2937 ! pointer. A very fast 4 instruction loop. 2938 ! 2939 .align 16 2940.dcicl: 2941 stb %o4, [%o1 + %o3] 2942 inccc %o3 2943 bl,a,pt %ncc, .dcicl 2944 lduba [%o0 + %o3]ASI_USER, %o4 2945 ! 2946 ! We're done. Go home. 2947 ! 2948 membar #Sync 2949 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 2950 retl 2951 clr %o0 2952 ! 2953 ! Try aligned copies from here. 2954 ! 2955.dci_ns: 2956 ! 2957 ! See if we're single byte aligned. If we are, check the 2958 ! limit for single byte copies. If we're smaller, or equal, 2959 ! bounce to the byte for byte copy loop. Otherwise do it in 2960 ! HW (if enabled). 2961 ! 2962 btst 1, %o3 2963 bz,a,pt %icc, .dcih8 2964 btst 7, %o3 2965 ! 2966 ! We're single byte aligned. 2967 ! 2968 sethi %hi(hw_copy_limit_1), %o3 2969 ld [%o3 + %lo(hw_copy_limit_1)], %o3 2970 ! 2971 ! Is HW copy on? If not do everything byte for byte. 2972 ! 2973 tst %o3 2974 bz,pn %icc, .dcibcp 2975 subcc %o3, %o2, %o3 2976 ! 2977 ! Are we bigger than the HW limit? If not 2978 ! go to byte for byte. 2979 ! 2980 bge,pt %ncc, .dcibcp 2981 nop 2982 ! 2983 ! We're big enough and copy is on. Do it with HW. 2984 ! 2985 ba,pt %ncc, .big_copyin 2986 nop 2987.dcih8: 2988 ! 2989 ! 8 byte aligned? 2990 ! 2991 bnz,a %ncc, .dcih4 2992 btst 3, %o3 2993 ! 2994 ! We're eight byte aligned. 2995 ! 2996 sethi %hi(hw_copy_limit_8), %o3 2997 ld [%o3 + %lo(hw_copy_limit_8)], %o3 2998 ! 2999 ! Is HW assist on? If not, do it with the aligned copy. 3000 ! 3001 tst %o3 3002 bz,pn %icc, .dcis8 3003 subcc %o3, %o2, %o3 3004 bge %ncc, .dcis8 3005 nop 3006 ba,pt %ncc, .big_copyin 3007 nop 3008.dcis8: 3009 ! 3010 ! Housekeeping for copy loops. Uses same idea as in the byte for 3011 ! byte copy loop above. 3012 ! 3013 add %o0, %o2, %o0 3014 add %o1, %o2, %o1 3015 sub %g0, %o2, %o3 3016 ba,pt %ncc, .didebc 3017 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy 3018 ! 3019 ! 4 byte aligned? 3020 ! 
.dcih4:
	bnz	%ncc, .dcih2
	sethi	%hi(hw_copy_limit_4), %o3
	ld	[%o3 + %lo(hw_copy_limit_4)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis4
	subcc	%o3, %o2, %o3
	!
	! We're negative if our size is larger than hw_copy_limit_4.
	!
	bge	%ncc, .dcis4
	nop
	ba,pt	%ncc, .big_copyin
	nop
.dcis4:
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didfbc
	srl	%o2, 2, %o2		! Number of 4 byte chunks to copy
.dcih2:
	!
	! We're two byte aligned. Check for "smallness"
	! done in delay at .dcih4
	!
	bleu,pt	%ncc, .dcis2
	sethi	%hi(hw_copy_limit_2), %o3
	ld	[%o3 + %lo(hw_copy_limit_2)], %o3
	!
	! Is HW assist on? If not, do it with the aligned copy.
	!
	tst	%o3
	bz,pn	%icc, .dcis2
	subcc	%o3, %o2, %o3
	!
	! Are we larger than the HW limit?
	!
	bge	%ncc, .dcis2
	nop
	!
	! HW assist is on and we're large enough to use it.
	!
	ba,pt	%ncc, .big_copyin
	nop
	!
	! Housekeeping for copy loops. Uses same idea as in the byte
	! for byte copy loop above.
	!
.dcis2:
	add	%o0, %o2, %o0
	add	%o1, %o2, %o1
	sub	%g0, %o2, %o3
	ba,pt	%ncc, .didtbc
	srl	%o2, 1, %o2		! Number of 2 byte chunks to copy
	!
.small_copyin:
	!
	! Why are we doing this AGAIN? There are certain conditions in
	! big copyin that will cause us to forgo the HW assisted copies
	! and bounce back to a non-hw assisted copy. This dispatches
	! those copies. Note that we branch around this in the main line
	! code.
	!
	! We make no check for limits or HW enablement here. We've
	! already been told that we're a poster child so just go off
	! and do it.
	!
	or	%o0, %o1, %o3
	btst	1, %o3
	bnz	%icc, .dcibcp		! Most likely
	btst	7, %o3
	bz	%icc, .dcis8
	btst	3, %o3
	bz	%icc, .dcis4
	nop
	ba,pt	%ncc, .dcis2
	nop
	!
	! Eight byte aligned copies. A steal from the original .small_copyin
	! with modifications. %o2 is number of 8 byte chunks to copy. When
	! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more
	! to copy.
	!
	.align 32
.didebc:
	ldxa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	stx	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didebc
	addcc	%o3, 8, %o3
	!
	! End of copy loop. Most 8 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 4 byte copy loop. %o2 is number of 4 byte chunks to copy.
	!
	.align 32
.didfbc:
	lduwa	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	st	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didfbc
	addcc	%o3, 4, %o3
	!
	! End of copy loop. Most 4 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Something is left. Do it byte for byte.
	!
	ba,pt	%ncc, .dcicl
	lduba	[%o0 + %o3]ASI_USER, %o4
	!
	! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to
	! copy.
	!
	.align 32
.didtbc:
	lduha	[%o0 + %o3]ASI_USER, %o4
	deccc	%o2
	sth	%o4, [%o1 + %o3]
	bg,pt	%ncc, .didtbc
	addcc	%o3, 2, %o3
	!
	! End of copy loop. Most 2 byte aligned copies end here.
	!
	bz,pt	%ncc, .dcifh
	nop
	!
	! Deal with the last byte
	!
	lduba	[%o0 + %o3]ASI_USER, %o4
	stb	%o4, [%o1 + %o3]
.dcifh:
	membar	#Sync
	stn	SAVED_LOFAULT, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
	retl
	clr	%o0

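/*
 * Outline of the block copy path that follows (.big_copyin here;
 * .big_copyout above is symmetric). This is a descriptive sketch of
 * the code below, in the style of the pseudo-code at the top of the
 * file, not an exact rendering of the assembly:
 *
 *	curthread->t_lofault = copyio_fault;	! window-based handler
 *	save FP state if needed (non-Niagara1: %fprs, %gsr, fpregs);
 *	copy 1/2/4/8 byte chunks until dst is 64-byte (block) aligned;
 *	if (src is also block aligned)
 *		straight block copy loop;
 *	else
 *		load blocks and merge pairs with faligndata (non-Niagara1)
 *		or with the ALIGN_DATA shift/or macro (Niagara1);
 *	copy the trailing residue 8, 4, 2 then 1 byte at a time;
 */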
.big_copyin:
	! We're going off to do a block copy.
	! Switch fault handlers and grab a window. We
	! don't do a membar #Sync since we've done only
	! kernel data to this point.
	stn	%o4, [THREAD_REG + T_LOFAULT]

	! Copyin requests that reach here are larger than 256 bytes. The
	! hw_copy_limit_1 is set to 256. Never set this limit to less
	! than 128 bytes.
#if !defined(NIAGARA_IMPL)
	save	%sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp

	rd	%fprs, %o2		! check for unused fp
	st	%o2, [%fp + STACK_BIAS - SAVED_FPRS_OFFSET]	! save %fprs
	btst	FPRS_FEF, %o2
	bz,a,pt	%icc, .do_blockcopyin
	wr	%g0, FPRS_FEF, %fprs

	! save in-use fpregs on stack
	BST_FP_TOSTACK(%o2)
#else	/* NIAGARA_IMPL */
	save	%sp, -SA(MINFRAME), %sp
#endif	/* NIAGARA_IMPL */

.do_blockcopyin:

#if !defined(NIAGARA_IMPL)
	rd	%gsr, %o2
	stx	%o2, [%fp + STACK_BIAS - SAVED_GSR_OFFSET]	! save gsr
	! Set the lower bit of the saved t_lofault to indicate that we
	! need to clear the %fprs register on the way out.
	or	SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT
#endif	/* NIAGARA_IMPL */

	! Swap src/dst since the code below is memcpy code
	! and memcpy/bcopy have different calling sequences
	mov	%i1, %i5
	mov	%i0, %i1
	mov	%i5, %i0

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst block aligned
	bz	%ncc, copyin_blalign	! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .ci_alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .ci_alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .ci_alhlfwdcp
	nop

	! 1B aligned
1:	lduba	[%i1]ASI_USER, %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	copyin_blalign
	nop

	! dst & src 4B aligned
.ci_alwdcp:
	lda	[%i1]ASI_USER, %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .ci_alwdcp
	add	%i0, 0x4, %i0

	ba	copyin_blalign
	nop

	! dst & src 2B aligned
.ci_alhlfwdcp:
	lduha	[%i1]ASI_USER, %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .ci_alhlfwdcp
	add	%i0, 0x2, %i0

	ba	copyin_blalign
	nop

	! dst & src 8B aligned
.ci_alewdcp:
	ldxa	[%i1]ASI_USER, %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .ci_alewdcp
	add	%i0, 0x8, %i0

copyin_blalign:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! Residue bytes in %i2

#if !defined(NIAGARA_IMPL)
	mov	ASI_USER, %asi

	andn	%i1, 0x3f, %l0		! %l0 has block aligned src address
	prefetch [%l0+0x0], #one_read
	andcc	%i1, 0x3f, %g0		!
is src 64B aligned 3296 bz,pn %ncc, .ci_blkcpy 3297 nop 3298 3299 ! handle misaligned source cases 3300 alignaddr %i1, %g0, %g0 ! generate %gsr 3301 3302 srl %i1, 0x3, %l1 ! src add bits 3, 4, 5 are now least 3303 ! significant in %l1 3304 andcc %l1, 0x7, %l2 ! mask everything except bits 1, 2, 3 3305 add %i1, %i3, %i1 3306 3307 ! switch statement to get to right 8 byte block within 3308 ! 64 byte block 3309 cmp %l2, 0x4 3310 bgeu,a ci_hlf 3311 cmp %l2, 0x6 3312 cmp %l2, 0x2 3313 bgeu,a ci_sqtr 3314 nop 3315 cmp %l2, 0x1 3316 be,a ci_off15 3317 nop 3318 ba ci_off7 3319 nop 3320ci_sqtr: 3321 be,a ci_off23 3322 nop 3323 ba,a ci_off31 3324 nop 3325 3326ci_hlf: 3327 bgeu,a ci_fqtr 3328 nop 3329 cmp %l2, 0x5 3330 be,a ci_off47 3331 nop 3332 ba ci_off39 3333 nop 3334ci_fqtr: 3335 be,a ci_off55 3336 nop 3337 3338 ldda [%l0+0x38]%asi, %d14 3339 prefetch [%l0+0x40], #one_read 3340 prefetch [%l0+0x80], #one_read 33417: 3342 add %l0, 0x40, %l0 3343 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3344 3345 ldda [%l0]ASI_BLK_AIUS, %d16 3346 ALIGN_OFF_56_63 3347 fmovd %d30, %d14 3348 3349 stda %d48, [%i0]ASI_BLK_P 3350 subcc %i3, 0x40, %i3 3351 add %i0, 0x40, %i0 3352 bgu,pt %ncc, 7b 3353 prefetch [%l0+0x80], #one_read 3354 ba .ci_blkdone 3355 membar #Sync 3356 3357ci_off7: 3358 ldda [%l0]ASI_BLK_AIUS, %d0 3359 prefetch [%l0+0x40], #one_read 3360 prefetch [%l0+0x80], #one_read 33610: 3362 add %l0, 0x40, %l0 3363 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3364 3365 ldda [%l0]ASI_BLK_AIUS, %d16 3366 ALIGN_OFF_1_7 3367 fmovd %d16, %d0 3368 fmovd %d18, %d2 3369 fmovd %d20, %d4 3370 fmovd %d22, %d6 3371 fmovd %d24, %d8 3372 fmovd %d26, %d10 3373 fmovd %d28, %d12 3374 fmovd %d30, %d14 3375 3376 stda %d48, [%i0]ASI_BLK_P 3377 subcc %i3, 0x40, %i3 3378 add %i0, 0x40, %i0 3379 bgu,pt %ncc, 0b 3380 prefetch [%l0+0x80], #one_read 3381 ba .ci_blkdone 3382 membar #Sync 3383 3384ci_off15: 3385 ldda [%l0+0x8]%asi, %d2 3386 ldda [%l0+0x10]%asi, %d4 3387 ldda [%l0+0x18]%asi, %d6 3388 ldda [%l0+0x20]%asi, %d8 3389 ldda [%l0+0x28]%asi, %d10 3390 ldda [%l0+0x30]%asi, %d12 3391 ldda [%l0+0x38]%asi, %d14 3392 prefetch [%l0+0x40], #one_read 3393 prefetch [%l0+0x80], #one_read 33941: 3395 add %l0, 0x40, %l0 3396 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3397 3398 ldda [%l0]ASI_BLK_AIUS, %d16 3399 ALIGN_OFF_8_15 3400 fmovd %d18, %d2 3401 fmovd %d20, %d4 3402 fmovd %d22, %d6 3403 fmovd %d24, %d8 3404 fmovd %d26, %d10 3405 fmovd %d28, %d12 3406 fmovd %d30, %d14 3407 3408 stda %d48, [%i0]ASI_BLK_P 3409 subcc %i3, 0x40, %i3 3410 add %i0, 0x40, %i0 3411 bgu,pt %ncc, 1b 3412 prefetch [%l0+0x80], #one_read 3413 ba .ci_blkdone 3414 membar #Sync 3415 3416ci_off23: 3417 ldda [%l0+0x10]%asi, %d4 3418 ldda [%l0+0x18]%asi, %d6 3419 ldda [%l0+0x20]%asi, %d8 3420 ldda [%l0+0x28]%asi, %d10 3421 ldda [%l0+0x30]%asi, %d12 3422 ldda [%l0+0x38]%asi, %d14 3423 prefetch [%l0+0x40], #one_read 3424 prefetch [%l0+0x80], #one_read 34252: 3426 add %l0, 0x40, %l0 3427 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! 
initialize the cache line 3428 3429 ldda [%l0]ASI_BLK_AIUS, %d16 3430 ALIGN_OFF_16_23 3431 fmovd %d20, %d4 3432 fmovd %d22, %d6 3433 fmovd %d24, %d8 3434 fmovd %d26, %d10 3435 fmovd %d28, %d12 3436 fmovd %d30, %d14 3437 3438 stda %d48, [%i0]ASI_BLK_P 3439 subcc %i3, 0x40, %i3 3440 add %i0, 0x40, %i0 3441 bgu,pt %ncc, 2b 3442 prefetch [%l0+0x80], #one_read 3443 ba .ci_blkdone 3444 membar #Sync 3445 3446ci_off31: 3447 ldda [%l0+0x18]%asi, %d6 3448 ldda [%l0+0x20]%asi, %d8 3449 ldda [%l0+0x28]%asi, %d10 3450 ldda [%l0+0x30]%asi, %d12 3451 ldda [%l0+0x38]%asi, %d14 3452 prefetch [%l0+0x40], #one_read 3453 prefetch [%l0+0x80], #one_read 34543: 3455 add %l0, 0x40, %l0 3456 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3457 3458 ldda [%l0]ASI_BLK_AIUS, %d16 3459 ALIGN_OFF_24_31 3460 fmovd %d22, %d6 3461 fmovd %d24, %d8 3462 fmovd %d26, %d10 3463 fmovd %d28, %d12 3464 fmovd %d30, %d14 3465 3466 stda %d48, [%i0]ASI_BLK_P 3467 subcc %i3, 0x40, %i3 3468 add %i0, 0x40, %i0 3469 bgu,pt %ncc, 3b 3470 prefetch [%l0+0x80], #one_read 3471 ba .ci_blkdone 3472 membar #Sync 3473 3474ci_off39: 3475 ldda [%l0+0x20]%asi, %d8 3476 ldda [%l0+0x28]%asi, %d10 3477 ldda [%l0+0x30]%asi, %d12 3478 ldda [%l0+0x38]%asi, %d14 3479 prefetch [%l0+0x40], #one_read 3480 prefetch [%l0+0x80], #one_read 34814: 3482 add %l0, 0x40, %l0 3483 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3484 3485 ldda [%l0]ASI_BLK_AIUS, %d16 3486 ALIGN_OFF_32_39 3487 fmovd %d24, %d8 3488 fmovd %d26, %d10 3489 fmovd %d28, %d12 3490 fmovd %d30, %d14 3491 3492 stda %d48, [%i0]ASI_BLK_P 3493 subcc %i3, 0x40, %i3 3494 add %i0, 0x40, %i0 3495 bgu,pt %ncc, 4b 3496 prefetch [%l0+0x80], #one_read 3497 ba .ci_blkdone 3498 membar #Sync 3499 3500ci_off47: 3501 ldda [%l0+0x28]%asi, %d10 3502 ldda [%l0+0x30]%asi, %d12 3503 ldda [%l0+0x38]%asi, %d14 3504 prefetch [%l0+0x40], #one_read 3505 prefetch [%l0+0x80], #one_read 35065: 3507 add %l0, 0x40, %l0 3508 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3509 3510 ldda [%l0]ASI_BLK_AIUS, %d16 3511 ALIGN_OFF_40_47 3512 fmovd %d26, %d10 3513 fmovd %d28, %d12 3514 fmovd %d30, %d14 3515 3516 stda %d48, [%i0]ASI_BLK_P 3517 subcc %i3, 0x40, %i3 3518 add %i0, 0x40, %i0 3519 bgu,pt %ncc, 5b 3520 prefetch [%l0+0x80], #one_read 3521 ba .ci_blkdone 3522 membar #Sync 3523 3524ci_off55: 3525 ldda [%l0+0x30]%asi, %d12 3526 ldda [%l0+0x38]%asi, %d14 3527 prefetch [%l0+0x40], #one_read 3528 prefetch [%l0+0x80], #one_read 35296: 3530 add %l0, 0x40, %l0 3531 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3532 3533 ldda [%l0]ASI_BLK_AIUS, %d16 3534 ALIGN_OFF_48_55 3535 fmovd %d28, %d12 3536 fmovd %d30, %d14 3537 3538 stda %d48, [%i0]ASI_BLK_P 3539 subcc %i3, 0x40, %i3 3540 add %i0, 0x40, %i0 3541 bgu,pt %ncc, 6b 3542 prefetch [%l0+0x80], #one_read 3543 ba .ci_blkdone 3544 membar #Sync 3545 3546.ci_blkcpy: 3547 prefetch [%i1+0x40], #one_read 3548 prefetch [%i1+0x80], #one_read 35498: 3550 stxa %g0, [%i0]ASI_BLK_INIT_ST_QUAD_LDD_P ! initialize the cache line 3551 ldda [%i1]ASI_BLK_AIUS, %d0 3552 stda %d0, [%i0]ASI_BLK_P 3553 3554 add %i1, 0x40, %i1 3555 subcc %i3, 0x40, %i3 3556 add %i0, 0x40, %i0 3557 bgu,pt %ncc, 8b 3558 prefetch [%i1+0x80], #one_read 3559 membar #Sync 3560 3561.ci_blkdone: 3562#else /* NIAGARA_IMPL */ 3563 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 3564 3565 andcc %i1, 0xf, %o2 ! is src quadword aligned 3566 bz,pn %xcc, .ci_blkcpy ! 
src offset in %o2 (last 4-bits) 3567 nop 3568 cmp %o2, 0x8 3569 bg .ci_upper_double 3570 nop 3571 bl .ci_lower_double 3572 nop 3573 3574 ! Falls through when source offset is equal to 8 i.e. 3575 ! source is double word aligned. 3576 ! In this case no shift/merge of data is required 3577 3578 sub %i1, %o2, %i1 ! align the src at 16 bytes. 3579 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 3580 prefetch [%l0+0x0], #one_read 3581 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3582.ci_loop0: 3583 add %i1, 0x10, %i1 3584 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 3585 3586 prefetch [%l0+0x40], #one_read 3587 3588 stxa %l3, [%i0+0x0]%asi 3589 stxa %l4, [%i0+0x8]%asi 3590 3591 add %i1, 0x10, %i1 3592 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3593 3594 stxa %l5, [%i0+0x10]%asi 3595 stxa %l2, [%i0+0x18]%asi 3596 3597 add %i1, 0x10, %i1 3598 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 3599 3600 stxa %l3, [%i0+0x20]%asi 3601 stxa %l4, [%i0+0x28]%asi 3602 3603 add %i1, 0x10, %i1 3604 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3605 3606 stxa %l5, [%i0+0x30]%asi 3607 stxa %l2, [%i0+0x38]%asi 3608 3609 add %l0, 0x40, %l0 3610 subcc %i3, 0x40, %i3 3611 bgu,pt %xcc, .ci_loop0 3612 add %i0, 0x40, %i0 3613 ba .ci_blkdone 3614 add %i1, %o2, %i1 ! increment the source by src offset 3615 ! the src offset was stored in %o2 3616 3617.ci_lower_double: 3618 3619 sub %i1, %o2, %i1 ! align the src at 16 bytes. 3620 sll %o2, 3, %o0 ! %o0 left shift 3621 mov 0x40, %o1 3622 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 3623 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 3624 prefetch [%l0+0x0], #one_read 3625 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2 3626 ! and %l3 has complete 3627 ! data 3628.ci_loop1: 3629 add %i1, 0x10, %i1 3630 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data 3631 ! for this read. 3632 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 3633 ! into %l2 and %l3 3634 3635 prefetch [%l0+0x40], #one_read 3636 3637 stxa %l2, [%i0+0x0]%asi 3638 stxa %l3, [%i0+0x8]%asi 3639 3640 add %i1, 0x10, %i1 3641 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3642 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 3643 ! %l4 from previous read 3644 ! into %l4 and %l5 3645 stxa %l4, [%i0+0x10]%asi 3646 stxa %l5, [%i0+0x18]%asi 3647 3648 ! Repeat the same for next 32 bytes. 3649 3650 add %i1, 0x10, %i1 3651 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 3652 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 3653 3654 stxa %l2, [%i0+0x20]%asi 3655 stxa %l3, [%i0+0x28]%asi 3656 3657 add %i1, 0x10, %i1 3658 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3659 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 3660 3661 stxa %l4, [%i0+0x30]%asi 3662 stxa %l5, [%i0+0x38]%asi 3663 3664 add %l0, 0x40, %l0 3665 subcc %i3, 0x40, %i3 3666 bgu,pt %xcc, .ci_loop1 3667 add %i0, 0x40, %i0 3668 ba .ci_blkdone 3669 add %i1, %o2, %i1 ! increment the source by src offset 3670 ! the src offset was stored in %o2 3671 3672.ci_upper_double: 3673 3674 sub %i1, %o2, %i1 ! align the src at 16 bytes. 3675 sub %o2, 0x8, %o0 3676 sll %o0, 3, %o0 ! %o0 left shift 3677 mov 0x40, %o1 3678 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 3679 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 3680 prefetch [%l0+0x0], #one_read 3681 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3 3682 ! for this read and 3683 ! no data in %l2 3684.ci_loop2: 3685 add %i1, 0x10, %i1 3686 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data 3687 ! 
and %l5 has partial 3688 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 3689 ! into %l3 and %l4 3690 prefetch [%l0+0x40], #one_read 3691 3692 stxa %l3, [%i0+0x0]%asi 3693 stxa %l4, [%i0+0x8]%asi 3694 3695 add %i1, 0x10, %i1 3696 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3697 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 3698 ! %l5 from previous read 3699 ! into %l5 and %l2 3700 3701 stxa %l5, [%i0+0x10]%asi 3702 stxa %l2, [%i0+0x18]%asi 3703 3704 ! Repeat the same for next 32 bytes. 3705 3706 add %i1, 0x10, %i1 3707 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 3708 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 3709 3710 stxa %l3, [%i0+0x20]%asi 3711 stxa %l4, [%i0+0x28]%asi 3712 3713 add %i1, 0x10, %i1 3714 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3715 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 3716 3717 stxa %l5, [%i0+0x30]%asi 3718 stxa %l2, [%i0+0x38]%asi 3719 3720 add %l0, 0x40, %l0 3721 subcc %i3, 0x40, %i3 3722 bgu,pt %xcc, .ci_loop2 3723 add %i0, 0x40, %i0 3724 ba .ci_blkdone 3725 add %i1, %o2, %i1 ! increment the source by src offset 3726 ! the src offset was stored in %o2 3727 3728 3729 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 3730.ci_blkcpy: 3731 3732 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 3733 prefetch [%o0+0x0], #one_read 37341: 3735 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0 3736 add %i1, 0x10, %i1 3737 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 3738 add %i1, 0x10, %i1 3739 3740 prefetch [%o0+0x40], #one_read 3741 3742 stxa %l0, [%i0+0x0]%asi 3743 3744 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 3745 add %i1, 0x10, %i1 3746 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6 3747 add %i1, 0x10, %i1 3748 3749 stxa %l1, [%i0+0x8]%asi 3750 stxa %l2, [%i0+0x10]%asi 3751 stxa %l3, [%i0+0x18]%asi 3752 stxa %l4, [%i0+0x20]%asi 3753 stxa %l5, [%i0+0x28]%asi 3754 stxa %l6, [%i0+0x30]%asi 3755 stxa %l7, [%i0+0x38]%asi 3756 3757 add %o0, 0x40, %o0 3758 subcc %i3, 0x40, %i3 3759 bgu,pt %xcc, 1b 3760 add %i0, 0x40, %i0 3761 3762.ci_blkdone: 3763 membar #Sync 3764#endif /* NIAGARA_IMPL */ 3765 3766 brz,pt %i2, .copyin_exit 3767 nop 3768 3769 ! Handle trailing bytes 3770 cmp %i2, 0x8 3771 blu,pt %ncc, .ci_residue 3772 nop 3773 3774 ! Can we do some 8B ops 3775 or %i1, %i0, %o2 3776 andcc %o2, 0x7, %g0 3777 bnz %ncc, .ci_last4 3778 nop 3779 3780 ! Do 8byte ops as long as possible 3781.ci_last8: 3782 ldxa [%i1]ASI_USER, %o2 3783 stx %o2, [%i0] 3784 add %i1, 0x8, %i1 3785 sub %i2, 0x8, %i2 3786 cmp %i2, 0x8 3787 bgu,pt %ncc, .ci_last8 3788 add %i0, 0x8, %i0 3789 3790 brz,pt %i2, .copyin_exit 3791 nop 3792 3793 ba .ci_residue 3794 nop 3795 3796.ci_last4: 3797 ! Can we do 4B ops 3798 andcc %o2, 0x3, %g0 3799 bnz %ncc, .ci_last2 3800 nop 38011: 3802 lda [%i1]ASI_USER, %o2 3803 st %o2, [%i0] 3804 add %i1, 0x4, %i1 3805 sub %i2, 0x4, %i2 3806 cmp %i2, 0x4 3807 bgu,pt %ncc, 1b 3808 add %i0, 0x4, %i0 3809 3810 brz,pt %i2, .copyin_exit 3811 nop 3812 3813 ba .ci_residue 3814 nop 3815 3816.ci_last2: 3817 ! Can we do 2B ops 3818 andcc %o2, 0x1, %g0 3819 bnz %ncc, .ci_residue 3820 nop 3821 38221: 3823 lduha [%i1]ASI_USER, %o2 3824 stuh %o2, [%i0] 3825 add %i1, 0x2, %i1 3826 sub %i2, 0x2, %i2 3827 cmp %i2, 0x2 3828 bgu,pt %ncc, 1b 3829 add %i0, 0x2, %i0 3830 3831 brz,pt %i2, .copyin_exit 3832 nop 3833 3834 ! 
Copy the residue as byte copy 3835.ci_residue: 3836 lduba [%i1]ASI_USER, %i4 3837 stb %i4, [%i0] 3838 inc %i1 3839 deccc %i2 3840 bgu,pt %xcc, .ci_residue 3841 inc %i0 3842 3843.copyin_exit: 3844#if !defined(NIAGARA_IMPL) 3845 ld [%fp + STACK_BIAS - SAVED_GSR_OFFSET], %o2 3846 wr %o2, 0, %gsr ! restore gsr 3847 3848 ld [%fp + STACK_BIAS - SAVED_FPRS_OFFSET], %o3 3849 btst FPRS_FEF, %o3 3850 bz %icc, 4f 3851 nop 3852 3853 ! restore fpregs from stack 3854 BLD_FP_FROMSTACK(%o2) 3855 3856 ba,pt %ncc, 2f 3857 wr %o3, 0, %fprs ! restore fprs 3858 38594: 3860 FZERO ! zero all of the fpregs 3861 wr %o3, 0, %fprs ! restore fprs 3862 38632: 3864 membar #Sync ! sync error barrier 3865 andn SAVED_LOFAULT, FPUSED_FLAG, SAVED_LOFAULT 3866#else /* NIAGARA_IMPL */ 3867 membar #Sync 3868#endif /* NIAGARA_IMPL */ 3869 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3870 ret 3871 restore %g0, 0, %o0 3872.copyin_err: 3873 ldn [THREAD_REG + T_COPYOPS], %o4 3874 brz %o4, 2f 3875 nop 3876 ldn [%o4 + CP_COPYIN], %g2 3877 jmp %g2 3878 nop 38792: 3880 retl 3881 mov -1, %o0 3882 SET_SIZE(copyin) 3883 3884#endif /* lint */ 3885 3886#ifdef lint 3887 3888/*ARGSUSED*/ 3889int 3890xcopyin(const void *uaddr, void *kaddr, size_t count) 3891{ return (0); } 3892 3893#else /* lint */ 3894 3895 ENTRY(xcopyin) 3896 sethi %hi(.xcopyin_err), REAL_LOFAULT 3897 b .do_copyin 3898 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 3899.xcopyin_err: 3900 ldn [THREAD_REG + T_COPYOPS], %o4 3901 brz %o4, 2f 3902 nop 3903 ldn [%o4 + CP_XCOPYIN], %g2 3904 jmp %g2 3905 nop 39062: 3907 retl 3908 mov %g1, %o0 3909 SET_SIZE(xcopyin) 3910 3911#endif /* lint */ 3912 3913#ifdef lint 3914 3915/*ARGSUSED*/ 3916int 3917xcopyin_little(const void *uaddr, void *kaddr, size_t count) 3918{ return (0); } 3919 3920#else /* lint */ 3921 3922 ENTRY(xcopyin_little) 3923 sethi %hi(.little_err), %o4 3924 ldn [THREAD_REG + T_LOFAULT], %o5 3925 or %o4, %lo(.little_err), %o4 3926 membar #Sync ! sync error barrier 3927 stn %o4, [THREAD_REG + T_LOFAULT] 3928 3929 subcc %g0, %o2, %o3 3930 add %o0, %o2, %o0 3931 bz,pn %ncc, 2f ! check for zero bytes 3932 sub %o2, 1, %o4 3933 add %o0, %o4, %o0 ! start w/last byte 3934 add %o1, %o2, %o1 3935 lduba [%o0+%o3]ASI_AIUSL, %o4 3936 39371: stb %o4, [%o1+%o3] 3938 inccc %o3 3939 sub %o0, 2, %o0 ! get next byte 3940 bcc,a,pt %ncc, 1b 3941 lduba [%o0+%o3]ASI_AIUSL, %o4 3942 39432: membar #Sync ! sync error barrier 3944 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3945 retl 3946 mov %g0, %o0 ! return (0) 3947 3948.little_err: 3949 membar #Sync ! sync error barrier 3950 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 3951 retl 3952 mov %g1, %o0 3953 SET_SIZE(xcopyin_little) 3954 3955#endif /* lint */ 3956 3957 3958/* 3959 * Copy a block of storage - must not overlap (from + len <= to). 3960 * No fault handler installed (to be called under on_fault()) 3961 */ 3962#if defined(lint) 3963 3964/* ARGSUSED */ 3965void 3966copyin_noerr(const void *ufrom, void *kto, size_t count) 3967{} 3968 3969#else /* lint */ 3970 3971 ENTRY(copyin_noerr) 3972 sethi %hi(.copyio_noerr), REAL_LOFAULT 3973 b .do_copyin 3974 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 3975.copyio_noerr: 3976 jmp SAVED_LOFAULT 3977 nop 3978 SET_SIZE(copyin_noerr) 3979 3980#endif /* lint */ 3981 3982/* 3983 * Copy a block of storage - must not overlap (from + len <= to). 
3984 * No fault handler installed (to be called under on_fault()) 3985 */ 3986 3987#if defined(lint) 3988 3989/* ARGSUSED */ 3990void 3991copyout_noerr(const void *kfrom, void *uto, size_t count) 3992{} 3993 3994#else /* lint */ 3995 3996 ENTRY(copyout_noerr) 3997 sethi %hi(.copyio_noerr), REAL_LOFAULT 3998 b .do_copyout 3999 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 4000 SET_SIZE(copyout_noerr) 4001 4002#endif /* lint */ 4003 4004#if defined(lint) 4005 4006int use_hw_bcopy = 1; 4007int use_hw_bzero = 1; 4008uint_t hw_copy_limit_1 = 0x100; 4009uint_t hw_copy_limit_2 = 0x200; 4010uint_t hw_copy_limit_4 = 0x400; 4011uint_t hw_copy_limit_8 = 0x400; 4012 4013#else /* !lint */ 4014 4015 .align 4 4016 DGDEF(use_hw_bcopy) 4017 .word 1 4018 DGDEF(use_hw_bzero) 4019 .word 1 4020 DGDEF(hw_copy_limit_1) 4021 .word 0x100 4022 DGDEF(hw_copy_limit_2) 4023 .word 0x200 4024 DGDEF(hw_copy_limit_4) 4025 .word 0x400 4026 DGDEF(hw_copy_limit_8) 4027 .word 0x400 4028 4029 .align 64 4030 .section ".text" 4031#endif /* !lint */ 4032 4033/* 4034 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 4035 * longer than 256 bytes in length using Niagara's block stores/quad store. 4036 * If the criteria for using this routine are not met then it calls bzero 4037 * and returns 1. Otherwise 0 is returned indicating success. 4038 * Caller is responsible for ensuring use_hw_bzero is true and that 4039 * kpreempt_disable() has been called. 4040 */ 4041#ifdef lint 4042/*ARGSUSED*/ 4043int 4044hwblkclr(void *addr, size_t len) 4045{ 4046 return(0); 4047} 4048#else /* lint */ 4049 ! %i0 - start address 4050 ! %i1 - length of region (multiple of 64) 4051 4052 ENTRY(hwblkclr) 4053 save %sp, -SA(MINFRAME), %sp 4054 4055 ! Must be block-aligned 4056 andcc %i0, 0x3f, %g0 4057 bnz,pn %ncc, 1f 4058 nop 4059 4060 ! ... and must be 256 bytes or more 4061 cmp %i1, 0x100 4062 blu,pn %ncc, 1f 4063 nop 4064 4065 ! ... and length must be a multiple of 64 4066 andcc %i1, 0x3f, %g0 4067 bz,pn %ncc, .pz_doblock 4068 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 4069 40701: ! punt, call bzero but notify the caller that bzero was used 4071 mov %i0, %o0 4072 call bzero 4073 mov %i1, %o1 4074 ret 4075 restore %g0, 1, %o0 ! return (1) - did not use block operations 4076 4077 ! Already verified that there are at least 256 bytes to set 4078.pz_doblock: 4079 stxa %g0, [%i0+0x0]%asi 4080 stxa %g0, [%i0+0x40]%asi 4081 stxa %g0, [%i0+0x80]%asi 4082 stxa %g0, [%i0+0xc0]%asi 4083 4084 stxa %g0, [%i0+0x8]%asi 4085 stxa %g0, [%i0+0x10]%asi 4086 stxa %g0, [%i0+0x18]%asi 4087 stxa %g0, [%i0+0x20]%asi 4088 stxa %g0, [%i0+0x28]%asi 4089 stxa %g0, [%i0+0x30]%asi 4090 stxa %g0, [%i0+0x38]%asi 4091 4092 stxa %g0, [%i0+0x48]%asi 4093 stxa %g0, [%i0+0x50]%asi 4094 stxa %g0, [%i0+0x58]%asi 4095 stxa %g0, [%i0+0x60]%asi 4096 stxa %g0, [%i0+0x68]%asi 4097 stxa %g0, [%i0+0x70]%asi 4098 stxa %g0, [%i0+0x78]%asi 4099 4100 stxa %g0, [%i0+0x88]%asi 4101 stxa %g0, [%i0+0x90]%asi 4102 stxa %g0, [%i0+0x98]%asi 4103 stxa %g0, [%i0+0xa0]%asi 4104 stxa %g0, [%i0+0xa8]%asi 4105 stxa %g0, [%i0+0xb0]%asi 4106 stxa %g0, [%i0+0xb8]%asi 4107 4108 stxa %g0, [%i0+0xc8]%asi 4109 stxa %g0, [%i0+0xd0]%asi 4110 stxa %g0, [%i0+0xd8]%asi 4111 stxa %g0, [%i0+0xe0]%asi 4112 stxa %g0, [%i0+0xe8]%asi 4113 stxa %g0, [%i0+0xf0]%asi 4114 stxa %g0, [%i0+0xf8]%asi 4115 4116 sub %i1, 0x100, %i1 4117 cmp %i1, 0x100 4118 bgu,pt %ncc, .pz_doblock 4119 add %i0, 0x100, %i0 4120 41212: 4122 ! 
Check if more than 64 bytes to set 4123 cmp %i1,0x40 4124 blu %ncc, .pz_finish 4125 nop 4126 41273: 4128 stxa %g0, [%i0+0x0]%asi 4129 stxa %g0, [%i0+0x8]%asi 4130 stxa %g0, [%i0+0x10]%asi 4131 stxa %g0, [%i0+0x18]%asi 4132 stxa %g0, [%i0+0x20]%asi 4133 stxa %g0, [%i0+0x28]%asi 4134 stxa %g0, [%i0+0x30]%asi 4135 stxa %g0, [%i0+0x38]%asi 4136 4137 subcc %i1, 0x40, %i1 4138 bgu,pt %ncc, 3b 4139 add %i0, 0x40, %i0 4140 4141.pz_finish: 4142 membar #Sync 4143 ret 4144 restore %g0, 0, %o0 ! return (bzero or not) 4145 SET_SIZE(hwblkclr) 4146#endif /* lint */ 4147 4148#ifdef lint 4149/* Copy 32 bytes of data from src to dst using physical addresses */ 4150/*ARGSUSED*/ 4151void 4152hw_pa_bcopy32(uint64_t src, uint64_t dst) 4153{} 4154#else /*!lint */ 4155 4156 /* 4157 * Copy 32 bytes of data from src (%o0) to dst (%o1) 4158 * using physical addresses. 4159 */ 4160 ENTRY_NP(hw_pa_bcopy32) 4161 rdpr %pstate, %g1 4162 andn %g1, PSTATE_IE, %g2 4163 wrpr %g0, %g2, %pstate 4164 4165 ldxa [%o0]ASI_MEM, %o2 4166 add %o0, 8, %o0 4167 ldxa [%o0]ASI_MEM, %o3 4168 add %o0, 8, %o0 4169 ldxa [%o0]ASI_MEM, %o4 4170 add %o0, 8, %o0 4171 ldxa [%o0]ASI_MEM, %o5 4172 stxa %o2, [%o1]ASI_MEM 4173 add %o1, 8, %o1 4174 stxa %o3, [%o1]ASI_MEM 4175 add %o1, 8, %o1 4176 stxa %o4, [%o1]ASI_MEM 4177 add %o1, 8, %o1 4178 stxa %o5, [%o1]ASI_MEM 4179 4180 membar #Sync 4181 retl 4182 wrpr %g0, %g1, %pstate 4183 SET_SIZE(hw_pa_bcopy32) 4184#endif /* lint */ 4185 4186/* 4187 * Zero a block of storage. 4188 * 4189 * uzero is used by the kernel to zero a block in user address space. 4190 */ 4191 4192/* 4193 * Control flow of the bzero/kzero/uzero routine. 4194 * 4195 * For fewer than 7 bytes stores, bytes will be zeroed. 4196 * 4197 * For less than 15 bytes stores, align the address on 4 byte boundary. 4198 * Then store as many 4-byte chunks, followed by trailing bytes. 4199 * 4200 * For sizes greater than 15 bytes, align the address on 8 byte boundary. 4201 * if (count > 128) { 4202 * store as many 8-bytes chunks to block align the address 4203 * store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR 4204 * store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero) 4205 * } 4206 * Store as many 8-byte chunks, followed by trailing bytes. 4207 */ 4208 4209#if defined(lint) 4210 4211/* ARGSUSED */ 4212int 4213kzero(void *addr, size_t count) 4214{ return(0); } 4215 4216/* ARGSUSED */ 4217void 4218uzero(void *addr, size_t count) 4219{} 4220 4221#else /* lint */ 4222 4223 ENTRY(uzero) 4224 ! 4225 ! Set a new lo_fault handler only if we came in with one 4226 ! already specified. 4227 ! 4228 wr %g0, ASI_USER, %asi 4229 ldn [THREAD_REG + T_LOFAULT], %o5 4230 tst %o5 4231 bz,pt %ncc, .do_zero 4232 sethi %hi(.zeroerr), %o2 4233 or %o2, %lo(.zeroerr), %o2 4234 membar #Sync 4235 ba,pt %ncc, .do_zero 4236 stn %o2, [THREAD_REG + T_LOFAULT] 4237 4238 ENTRY(kzero) 4239 ! 4240 ! Always set a lo_fault handler 4241 ! 4242 wr %g0, ASI_P, %asi 4243 ldn [THREAD_REG + T_LOFAULT], %o5 4244 sethi %hi(.zeroerr), %o2 4245 or %o5, LOFAULT_SET, %o5 4246 or %o2, %lo(.zeroerr), %o2 4247 membar #Sync 4248 ba,pt %ncc, .do_zero 4249 stn %o2, [THREAD_REG + T_LOFAULT] 4250 4251/* 4252 * We got here because of a fault during kzero or if 4253 * uzero or bzero was called with t_lofault non-zero. 4254 * Otherwise we've already run screaming from the room. 4255 * Errno value is in %g1. Note that we're here iff 4256 * we did set t_lofault. 4257 */ 4258.zeroerr: 4259 ! 4260 ! Undo asi register setting. Just set it to be the 4261 ! kernel default without checking. 4262 ! 
4263 wr %g0, ASI_P, %asi 4264 4265 ! 4266 ! We did set t_lofault. It may well have been zero coming in. 4267 ! 42681: 4269 tst %o5 4270 membar #Sync 4271 bne,pn %ncc, 3f 4272 andncc %o5, LOFAULT_SET, %o5 42732: 4274 ! 4275 ! Old handler was zero. Just return the error. 4276 ! 4277 retl ! return 4278 mov %g1, %o0 ! error code from %g1 42793: 4280 ! 4281 ! We're here because %o5 was non-zero. It was non-zero 4282 ! because either LOFAULT_SET was present, a previous fault 4283 ! handler was present or both. In all cases we need to reset 4284 ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET 4285 ! before we either simply return the error or we invoke the 4286 ! previously specified handler. 4287 ! 4288 be %ncc, 2b 4289 stn %o5, [THREAD_REG + T_LOFAULT] 4290 jmp %o5 ! goto real handler 4291 nop 4292 SET_SIZE(kzero) 4293 SET_SIZE(uzero) 4294 4295#endif /* lint */ 4296 4297/* 4298 * Zero a block of storage. 4299 */ 4300 4301#if defined(lint) 4302 4303/* ARGSUSED */ 4304void 4305bzero(void *addr, size_t count) 4306{} 4307 4308#else /* lint */ 4309 4310 ENTRY(bzero) 4311 wr %g0, ASI_P, %asi 4312 4313 ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector 4314 tst %o5 4315 bz,pt %ncc, .do_zero 4316 sethi %hi(.zeroerr), %o2 4317 or %o2, %lo(.zeroerr), %o2 4318 membar #Sync ! sync error barrier 4319 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector 4320 4321.do_zero: 4322 cmp %o1, 7 4323 blu,pn %ncc, .byteclr 4324 nop 4325 4326 cmp %o1, 15 4327 blu,pn %ncc, .wdalign 4328 nop 4329 4330 andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound 4331 bz,pt %ncc, .blkalign ! already double aligned 4332 sub %o3, 8, %o3 ! -(bytes till double aligned) 4333 add %o1, %o3, %o1 ! update o1 with new count 4334 43351: 4336 stba %g0, [%o0]%asi 4337 inccc %o3 4338 bl,pt %ncc, 1b 4339 inc %o0 4340 4341 ! Now address is double aligned 4342.blkalign: 4343 cmp %o1, 0x80 ! check if there are 128 bytes to set 4344 blu,pn %ncc, .bzero_small 4345 mov %o1, %o3 4346 4347 sethi %hi(use_hw_bzero), %o2 4348 ld [%o2 + %lo(use_hw_bzero)], %o2 4349 tst %o2 4350 bz %ncc, .bzero_small 4351 mov %o1, %o3 4352 4353 rd %asi, %o3 4354 wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 4355 cmp %o3, ASI_P 4356 bne,a %ncc, .algnblk 4357 wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi 4358 4359.algnblk: 4360 andcc %o0, 0x3f, %o3 ! is block aligned? 4361 bz,pt %ncc, .bzero_blk 4362 sub %o3, 0x40, %o3 ! -(bytes till block aligned) 4363 add %o1, %o3, %o1 ! o1 is the remainder 4364 4365 ! Clear -(%o3) bytes till block aligned 43661: 4367 stxa %g0, [%o0]%asi 4368 addcc %o3, 8, %o3 4369 bl,pt %ncc, 1b 4370 add %o0, 8, %o0 4371 4372.bzero_blk: 4373 and %o1, 0x3f, %o3 ! calc bytes left after blk clear 4374 andn %o1, 0x3f, %o4 ! calc size of blocks in bytes 4375 4376 cmp %o4, 0x100 ! 
/*
 * Zero a block of storage.
 */

#if defined(lint)

/* ARGSUSED */
void
bzero(void *addr, size_t count)
{}

#else	/* lint */

	ENTRY(bzero)
	wr	%g0, ASI_P, %asi

	ldn	[THREAD_REG + T_LOFAULT], %o5	! save old vector
	tst	%o5
	bz,pt	%ncc, .do_zero
	sethi	%hi(.zeroerr), %o2
	or	%o2, %lo(.zeroerr), %o2
	membar	#Sync				! sync error barrier
	stn	%o2, [THREAD_REG + T_LOFAULT]	! install new vector

.do_zero:
	cmp	%o1, 7
	blu,pn	%ncc, .byteclr
	nop

	cmp	%o1, 15
	blu,pn	%ncc, .wdalign
	nop

	andcc	%o0, 7, %o3		! is addr aligned on an 8-byte bound?
	bz,pt	%ncc, .blkalign		! already double aligned
	sub	%o3, 8, %o3		! -(bytes till double aligned)
	add	%o1, %o3, %o1		! update o1 with new count

1:
	stba	%g0, [%o0]%asi
	inccc	%o3
	bl,pt	%ncc, 1b
	inc	%o0

	! Now the address is double aligned
.blkalign:
	cmp	%o1, 0x80		! check if there are 128 bytes to set
	blu,pn	%ncc, .bzero_small
	mov	%o1, %o3

	sethi	%hi(use_hw_bzero), %o2
	ld	[%o2 + %lo(use_hw_bzero)], %o2
	tst	%o2
	bz	%ncc, .bzero_small
	mov	%o1, %o3

	rd	%asi, %o3
	wr	%g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	cmp	%o3, ASI_P
	bne,a	%ncc, .algnblk
	wr	%g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi

.algnblk:
	andcc	%o0, 0x3f, %o3		! is the address block aligned?
	bz,pt	%ncc, .bzero_blk
	sub	%o3, 0x40, %o3		! -(bytes till block aligned)
	add	%o1, %o3, %o1		! o1 is the remainder

	! Clear -(%o3) bytes till block aligned
1:
	stxa	%g0, [%o0]%asi
	addcc	%o3, 8, %o3
	bl,pt	%ncc, 1b
	add	%o0, 8, %o0

.bzero_blk:
	and	%o1, 0x3f, %o3		! calc bytes left after blk clear
	andn	%o1, 0x3f, %o4		! calc size of blocks in bytes

	cmp	%o4, 0x100		! 256 bytes or more?
	blu,pn	%ncc, 3f
	nop

2:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x40]%asi
	stxa	%g0, [%o0+0x80]%asi
	stxa	%g0, [%o0+0xc0]%asi

	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	stxa	%g0, [%o0+0x48]%asi
	stxa	%g0, [%o0+0x50]%asi
	stxa	%g0, [%o0+0x58]%asi
	stxa	%g0, [%o0+0x60]%asi
	stxa	%g0, [%o0+0x68]%asi
	stxa	%g0, [%o0+0x70]%asi
	stxa	%g0, [%o0+0x78]%asi

	stxa	%g0, [%o0+0x88]%asi
	stxa	%g0, [%o0+0x90]%asi
	stxa	%g0, [%o0+0x98]%asi
	stxa	%g0, [%o0+0xa0]%asi
	stxa	%g0, [%o0+0xa8]%asi
	stxa	%g0, [%o0+0xb0]%asi
	stxa	%g0, [%o0+0xb8]%asi

	stxa	%g0, [%o0+0xc8]%asi
	stxa	%g0, [%o0+0xd0]%asi
	stxa	%g0, [%o0+0xd8]%asi
	stxa	%g0, [%o0+0xe0]%asi
	stxa	%g0, [%o0+0xe8]%asi
	stxa	%g0, [%o0+0xf0]%asi
	stxa	%g0, [%o0+0xf8]%asi

	sub	%o4, 0x100, %o4
	cmp	%o4, 0x100
	bgu,pt	%ncc, 2b
	add	%o0, 0x100, %o0

3:
	! Check if 64 bytes remain to set
	cmp	%o4, 0x40
	blu	%ncc, .bzero_blk_done
	nop

4:
	stxa	%g0, [%o0+0x0]%asi
	stxa	%g0, [%o0+0x8]%asi
	stxa	%g0, [%o0+0x10]%asi
	stxa	%g0, [%o0+0x18]%asi
	stxa	%g0, [%o0+0x20]%asi
	stxa	%g0, [%o0+0x28]%asi
	stxa	%g0, [%o0+0x30]%asi
	stxa	%g0, [%o0+0x38]%asi

	subcc	%o4, 0x40, %o4
	bgu,pt	%ncc, 3b
	add	%o0, 0x40, %o0

.bzero_blk_done:
	membar	#Sync
	!
	! Undo the asi register setting.
	!
	rd	%asi, %o4
	wr	%g0, ASI_P, %asi
	cmp	%o4, ASI_BLK_INIT_ST_QUAD_LDD_P
	bne,a	%ncc, .bzero_small
	wr	%g0, ASI_USER, %asi
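/*
 * At this point only the sub-block tail is left: %o3 holds the byte
 * count remaining after the block clear.  Roughly, in C (sketch only;
 * the real code stores through %asi):
 *
 *	while (left >= 8) {			! .dbclr below
 *		*(uint64_t *)addr = 0;
 *		addr += 8; left -= 8;
 *	}
 *	while (left--)				! .byteclr below
 *		*addr++ = 0;
 */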
.bzero_small:
	! Set the remaining doubles
	subcc	%o3, 8, %o3		! can we store any doubles?
	blu,pn	%ncc, .byteclr
	and	%o1, 7, %o1		! calc bytes left after doubles

.dbclr:
	stxa	%g0, [%o0]%asi		! clear the doubles
	subcc	%o3, 8, %o3
	bgeu,pt	%ncc, .dbclr
	add	%o0, 8, %o0

	ba	.byteclr
	nop

.wdalign:
	andcc	%o0, 3, %o3		! is addr aligned on a word boundary?
	bz,pn	%ncc, .wdclr
	andn	%o1, 3, %o3		! create word sized count in %o3

	dec	%o1			! decrement count
	stba	%g0, [%o0]%asi		! clear a byte
	ba	.wdalign
	inc	%o0			! next byte

.wdclr:
	sta	%g0, [%o0]%asi		! 4-byte clearing loop
	subcc	%o3, 4, %o3
	bnz,pt	%ncc, .wdclr
	inc	4, %o0

	and	%o1, 3, %o1		! leftover count, if any

.byteclr:
	! Set the leftover bytes
	brz	%o1, .bzero_exit
	nop

7:
	deccc	%o1			! byte clearing loop
	stba	%g0, [%o0]%asi
	bgu,pt	%ncc, 7b
	inc	%o0

.bzero_exit:
	!
	! We're just concerned with whether t_lofault was set when we
	! came in.  We end up here from either kzero() or bzero().
	! kzero() *always* sets a lofault handler; it ORs LOFAULT_SET
	! into %o5 to indicate it has done this, even if the value of
	! %o5 is otherwise zero.  bzero() sets a lofault handler *only*
	! if one was previously set.  Accordingly, we need to examine
	! %o5, and if it is non-zero, be sure to clear LOFAULT_SET
	! before resetting the error handler.
	!
	tst	%o5
	bz	%ncc, 1f
	andn	%o5, LOFAULT_SET, %o5
	membar	#Sync			! sync error barrier
	stn	%o5, [THREAD_REG + T_LOFAULT]	! restore old t_lofault
1:
	retl
	clr	%o0			! return (0)

	SET_SIZE(bzero)
#endif	/* lint */
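/*
 * Usage note (illustrative): kzero() is the recoverable variant -- it
 * returns 0 on success or a nonzero error number if an access faults,
 * so a caller can do, e.g.:
 *
 *	if (kzero(addr, len) != 0)
 *		return (EFAULT);	! zeroing faulted; bail out
 *
 * bzero() returns nothing and is meant for addresses known to be
 * valid; uzero() is the same but targets user addresses via ASI_USER.
 */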