1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22/* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27#pragma ident "%Z%%M% %I% %E% SMI" 28 29#include <sys/param.h> 30#include <sys/errno.h> 31#include <sys/asm_linkage.h> 32#include <sys/vtrace.h> 33#include <sys/machthread.h> 34#include <sys/clock.h> 35#include <sys/asi.h> 36#include <sys/fsr.h> 37#include <sys/privregs.h> 38#include <sys/machasi.h> 39#include <sys/niagaraasi.h> 40 41#if !defined(lint) 42#include "assym.h" 43#endif /* lint */ 44 45 46/* 47 * Pseudo-code to aid in understanding the control flow of the 48 * bcopy/kcopy routine. 49 * 50 * On entry to kcopy: 51 * %l7 = curthread->t_lofault; 52 * curthread->t_lofault = .copyerr; 53 * %o5 = %l7; ! save existing handler in %o5 54 * Call bcopy(); 55 * 56 * On entry to bcopy: 57 * 58 * if (length < 128) 59 * goto_regular_copy; 60 * 61 * if (!use_vis) 62 * goto_regular_copy; 63 * 64 * do_blockcopy_here; 65 * 66 * In lofault handler: 67 * curthread->t_lofault = %o5; ! restore old t_lofault 68 * return (errno) 69 * 70 */ 71 72/* 73 * Less then or equal this number of bytes we will always copy byte-for-byte 74 */ 75#define SMALL_LIMIT 7 76 77/* 78 * Size of stack frame in order to accomodate a 64-byte aligned 79 * floating-point register save area and 2 32-bit temp locations. 80 */ 81#define HWCOPYFRAMESIZE ((64 * 5) + (2 * 4)) 82 83/* 84 * LOFAULT_SET : Flag set by kzero to indicate that lo_fault handler was set 85 */ 86#define LOFAULT_SET 2 87 88/* 89 * This define is to align data for the unaligned source cases. 90 * The data1, data2 and data3 is merged into data1 and data2. 91 * The data3 is preserved for next merge. 92 */ 93#define ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp) \ 94 sllx data1, lshift, data1 ;\ 95 srlx data2, rshift, tmp ;\ 96 or data1, tmp, data1 ;\ 97 sllx data2, lshift, data2 ;\ 98 srlx data3, rshift, tmp ;\ 99 or data2, tmp, data2 100/* 101 * This macro is to align the data. Basically it merges 102 * data1 and data2 to form double word. 103 */ 104#define ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp) \ 105 sllx data1, lshift, data1 ;\ 106 srlx data2, rshift, tmp ;\ 107 or data1, tmp, data1 108 109/* 110 * Copy a block of storage, returning an error code if `from' or 111 * `to' takes a kernel pagefault which cannot be resolved. 112 * Returns errno value on pagefault error, 0 if all ok 113 */ 114 115 116 117#if defined(lint) 118 119/* ARGSUSED */ 120int 121kcopy(const void *from, void *to, size_t count) 122{ return(0); } 123 124#else /* lint */ 125 126 .seg ".text" 127 .align 4 128 129 ENTRY(kcopy) 130 131 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 132 set .copyerr, %o5 ! copyerr is lofault value 133 ldn [THREAD_REG + T_LOFAULT], %l7 ! save existing handler 134 membar #Sync ! sync error barrier (see copy.s) 135 stn %o5, [THREAD_REG + T_LOFAULT] ! set t_lofault 136 b .do_copy ! common code 137 mov %l7, %o5 138 139/* 140 * We got here because of a fault during kcopy. 141 * Errno value is in %g1. 142 */ 143.copyerr: 144 membar #Sync ! sync error barrier 145 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 146 ret 147 restore %g1, 0, %o0 148 149 SET_SIZE(kcopy) 150#endif /* lint */ 151 152 153/* 154 * Copy a block of storage - must not overlap (from + len <= to). 155 * 156 * Copy a page of memory. 157 * Assumes double word alignment and a count >= 256. 158 */ 159#if defined(lint) 160 161/* ARGSUSED */ 162void 163bcopy(const void *from, void *to, size_t count) 164{} 165 166#else /* lint */ 167 168 ENTRY(bcopy) 169 170 save %sp, -SA(MINFRAME + HWCOPYFRAMESIZE), %sp 171 172.do_copy: 173 cmp %i2, 12 ! for small counts 174 blu %ncc, .bytecp ! just copy bytes 175 .empty 176 177 cmp %i2, 128 ! for less than 128 bytes 178 blu,pn %ncc, .bcb_punt ! no block st/quad ld 179 nop 180 181 set use_hw_bcopy, %o2 182 ld [%o2], %o2 183 tst %o2 184 bz .bcb_punt 185 nop 186 187 subcc %i1, %i0, %i3 188 bneg,a,pn %ncc, 1f 189 neg %i3 1901: 191 /* 192 * Compare against 256 since we should be checking block addresses 193 * and (dest & ~63) - (src & ~63) can be 3 blocks even if 194 * src = dest + (64 * 3) + 63. 195 */ 196 cmp %i3, 256 197 blu,pn %ncc, .bcb_punt 198 nop 199 200 /* 201 * Copy that reach here have at least 2 blocks of data to copy. 202 */ 203.do_blockcopy: 204 ! Swap src/dst since the code below is memcpy code 205 ! and memcpy/bcopy have different calling sequences 206 mov %i1, %i5 207 mov %i0, %i1 208 mov %i5, %i0 209 210 andcc %i0, 0x3f, %i3 ! is dst aligned on a 64 bytes 211 bz %xcc, .chksrc ! dst is already double aligned 212 sub %i3, 0x40, %i3 213 neg %i3 ! bytes till dst 64 bytes aligned 214 sub %i2, %i3, %i2 ! update i2 with new count 215 2161: ldub [%i1], %i4 217 stb %i4, [%i0] 218 inc %i1 219 deccc %i3 220 bgu %xcc, 1b 221 inc %i0 222 223 ! Now Destination is block (64 bytes) aligned 224.chksrc: 225 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 226 sub %i2, %i3, %i2 ! Residue bytes in %i2 227 228 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 229 230 andcc %i1, 0xf, %o2 ! is src quadword aligned 231 bz,pn %xcc, .blkcpy ! src offset in %o2 232 nop 233 cmp %o2, 0x8 234 bg .cpy_upper_double 235 nop 236 bl .cpy_lower_double 237 nop 238 239 ! Falls through when source offset is equal to 8 i.e. 240 ! source is double word aligned. 241 ! In this case no shift/merge of data is required 242 sub %i1, %o2, %i1 ! align the src at 16 bytes. 243 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 244 prefetch [%l0+0x0], #one_read 245 ldda [%i1+0x0]%asi, %l2 246loop0: 247 ldda [%i1+0x10]%asi, %l4 248 prefetch [%l0+0x40], #one_read 249 250 stxa %l3, [%i0+0x0]%asi 251 stxa %l4, [%i0+0x8]%asi 252 253 ldda [%i1+0x20]%asi, %l2 254 stxa %l5, [%i0+0x10]%asi 255 stxa %l2, [%i0+0x18]%asi 256 257 ldda [%i1+0x30]%asi, %l4 258 stxa %l3, [%i0+0x20]%asi 259 stxa %l4, [%i0+0x28]%asi 260 261 ldda [%i1+0x40]%asi, %l2 262 stxa %l5, [%i0+0x30]%asi 263 stxa %l2, [%i0+0x38]%asi 264 265 add %l0, 0x40, %l0 266 add %i1, 0x40, %i1 267 subcc %i3, 0x40, %i3 268 bgu,pt %xcc, loop0 269 add %i0, 0x40, %i0 270 ba .blkdone 271 add %i1, %o2, %i1 ! increment the source by src offset 272 ! the src offset was stored in %o2 273 274.cpy_lower_double: 275 sub %i1, %o2, %i1 ! align the src at 16 bytes. 276 sll %o2, 3, %o0 ! %o0 left shift 277 mov 0x40, %o1 278 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 279 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 280 prefetch [%l0+0x0], #one_read 281 ldda [%i1+0x0]%asi, %l2 ! partial data in %l2 and %l3 has 282 ! complete data 283loop1: 284 ldda [%i1+0x10]%asi, %l4 ! %l4 has partial data for this read. 285 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 286 ! into %l2 and %l3 287 prefetch [%l0+0x40], #one_read 288 stxa %l2, [%i0+0x0]%asi 289 stxa %l3, [%i0+0x8]%asi 290 291 ldda [%i1+0x20]%asi, %l2 292 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 293 stxa %l4, [%i0+0x10]%asi ! %l4 from previous read 294 stxa %l5, [%i0+0x18]%asi ! into %l4 and %l5 295 296 ! Repeat the same for next 32 bytes. 297 298 ldda [%i1+0x30]%asi, %l4 299 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 300 stxa %l2, [%i0+0x20]%asi 301 stxa %l3, [%i0+0x28]%asi 302 303 ldda [%i1+0x40]%asi, %l2 304 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 305 stxa %l4, [%i0+0x30]%asi 306 stxa %l5, [%i0+0x38]%asi 307 308 add %l0, 0x40, %l0 309 add %i1, 0x40, %i1 310 subcc %i3, 0x40, %i3 311 bgu,pt %xcc, loop1 312 add %i0, 0x40, %i0 313 ba .blkdone 314 add %i1, %o2, %i1 ! increment the source by src offset 315 ! the src offset was stored in %o2 316 317.cpy_upper_double: 318 sub %i1, %o2, %i1 ! align the src at 16 bytes. 319 mov 0x8, %o0 320 sub %o2, %o0, %o0 321 sll %o0, 3, %o0 ! %o0 left shift 322 mov 0x40, %o1 323 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 324 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 325 prefetch [%l0+0x0], #one_read 326 ldda [%i1+0x0]%asi, %l2 ! partial data in %l3 for this read and 327 ! no data in %l2 328loop2: 329 ldda [%i1+0x10]%asi, %l4 ! %l4 has complete data and %l5 has 330 ! partial 331 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 332 ! into %l3 and %l4 333 prefetch [%l0+0x40], #one_read 334 stxa %l3, [%i0+0x0]%asi 335 stxa %l4, [%i0+0x8]%asi 336 337 ldda [%i1+0x20]%asi, %l2 338 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 339 stxa %l5, [%i0+0x10]%asi ! %l5 from previous read 340 stxa %l2, [%i0+0x18]%asi ! into %l5 and %l2 341 342 ! Repeat the same for next 32 bytes. 343 344 ldda [%i1+0x30]%asi, %l4 345 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 346 stxa %l3, [%i0+0x20]%asi 347 stxa %l4, [%i0+0x28]%asi 348 349 ldda [%i1+0x40]%asi, %l2 350 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 351 stxa %l5, [%i0+0x30]%asi 352 stxa %l2, [%i0+0x38]%asi 353 354 add %l0, 0x40, %l0 355 add %i1, 0x40, %i1 356 subcc %i3, 0x40, %i3 357 bgu,pt %xcc, loop2 358 add %i0, 0x40, %i0 359 ba .blkdone 360 add %i1, %o2, %i1 ! increment the source by src offset 361 ! the src offset was stored in %o2 362 363 364 ! Both Source and Destination are block aligned. 365 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 366.blkcpy: 367 prefetch [%i1+0x0], #one_read 3681: 369 ldda [%i1+0x0]%asi, %l0 370 ldda [%i1+0x10]%asi, %l2 371 prefetch [%i1+0x40], #one_read 372 373 stxa %l0, [%i0+0x0]%asi 374 ldda [%i1+0x20]%asi, %l4 375 ldda [%i1+0x30]%asi, %l6 376 377 stxa %l1, [%i0+0x8]%asi 378 stxa %l2, [%i0+0x10]%asi 379 stxa %l3, [%i0+0x18]%asi 380 stxa %l4, [%i0+0x20]%asi 381 stxa %l5, [%i0+0x28]%asi 382 stxa %l6, [%i0+0x30]%asi 383 stxa %l7, [%i0+0x38]%asi 384 385 add %i1, 0x40, %i1 386 subcc %i3, 0x40, %i3 387 bgu,pt %xcc, 1b 388 add %i0, 0x40, %i0 389 390.blkdone: 391 membar #Sync 392 tst %i2 393 bz,pt %xcc, .blkexit 394 nop 395 396.residue: 397 ldub [%i1], %i4 398 stb %i4, [%i0] 399 inc %i1 400 deccc %i2 401 bgu %xcc, .residue 402 inc %i0 403 404.blkexit: 405 ret 406 restore %g0, 0, %o0 407 408.bcb_punt: 409 ! 410 ! use aligned transfers where possible 411 ! 412 xor %i0, %i1, %o4 ! xor from and to address 413 btst 7, %o4 ! if lower three bits zero 414 bz .aldoubcp ! can align on double boundary 415 .empty ! assembler complaints about label 416 417 xor %i0, %i1, %o4 ! xor from and to address 418 btst 3, %o4 ! if lower two bits zero 419 bz .alwordcp ! can align on word boundary 420 btst 3, %i0 ! delay slot, from address unaligned? 421 ! 422 ! use aligned reads and writes where possible 423 ! this differs from wordcp in that it copes 424 ! with odd alignment between source and destnation 425 ! using word reads and writes with the proper shifts 426 ! in between to align transfers to and from memory 427 ! i0 - src address, i1 - dest address, i2 - count 428 ! i3, i4 - tmps for used generating complete word 429 ! i5 (word to write) 430 ! l0 size in bits of upper part of source word (US) 431 ! l1 size in bits of lower part of source word (LS = 32 - US) 432 ! l2 size in bits of upper part of destination word (UD) 433 ! l3 size in bits of lower part of destination word (LD = 32 - UD) 434 ! l4 number of bytes leftover after aligned transfers complete 435 ! l5 the number 32 436 ! 437 mov 32, %l5 ! load an oft-needed constant 438 bz .align_dst_only 439 btst 3, %i1 ! is destnation address aligned? 440 clr %i4 ! clear registers used in either case 441 bz .align_src_only 442 clr %l0 443 ! 444 ! both source and destination addresses are unaligned 445 ! 4461: ! align source 447 ldub [%i0], %i3 ! read a byte from source address 448 add %i0, 1, %i0 ! increment source address 449 or %i4, %i3, %i4 ! or in with previous bytes (if any) 450 btst 3, %i0 ! is source aligned? 451 add %l0, 8, %l0 ! increment size of upper source (US) 452 bnz,a 1b 453 sll %i4, 8, %i4 ! make room for next byte 454 455 sub %l5, %l0, %l1 ! generate shift left count (LS) 456 sll %i4, %l1, %i4 ! prepare to get rest 457 ld [%i0], %i3 ! read a word 458 add %i0, 4, %i0 ! increment source address 459 srl %i3, %l0, %i5 ! upper src bits into lower dst bits 460 or %i4, %i5, %i5 ! merge 461 mov 24, %l3 ! align destination 4621: 463 srl %i5, %l3, %i4 ! prepare to write a single byte 464 stb %i4, [%i1] ! write a byte 465 add %i1, 1, %i1 ! increment destination address 466 sub %i2, 1, %i2 ! decrement count 467 btst 3, %i1 ! is destination aligned? 468 bnz,a 1b 469 sub %l3, 8, %l3 ! delay slot, decrement shift count (LD) 470 sub %l5, %l3, %l2 ! generate shift left count (UD) 471 sll %i5, %l2, %i5 ! move leftover into upper bytes 472 cmp %l2, %l0 ! cmp # reqd to fill dst w old src left 473 bgu %ncc, .more_needed ! need more to fill than we have 474 nop 475 476 sll %i3, %l1, %i3 ! clear upper used byte(s) 477 srl %i3, %l1, %i3 478 ! get the odd bytes between alignments 479 sub %l0, %l2, %l0 ! regenerate shift count 480 sub %l5, %l0, %l1 ! generate new shift left count (LS) 481 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 482 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 483 srl %i3, %l0, %i4 484 or %i5, %i4, %i5 485 st %i5, [%i1] ! write a word 486 subcc %i2, 4, %i2 ! decrement count 487 bz %ncc, .unalign_out 488 add %i1, 4, %i1 ! increment destination address 489 490 b 2f 491 sll %i3, %l1, %i5 ! get leftover into upper bits 492.more_needed: 493 sll %i3, %l0, %i3 ! save remaining byte(s) 494 srl %i3, %l0, %i3 495 sub %l2, %l0, %l1 ! regenerate shift count 496 sub %l5, %l1, %l0 ! generate new shift left count 497 sll %i3, %l1, %i4 ! move to fill empty space 498 b 3f 499 or %i5, %i4, %i5 ! merge to complete word 500 ! 501 ! the source address is aligned and destination is not 502 ! 503.align_dst_only: 504 ld [%i0], %i4 ! read a word 505 add %i0, 4, %i0 ! increment source address 506 mov 24, %l0 ! initial shift alignment count 5071: 508 srl %i4, %l0, %i3 ! prepare to write a single byte 509 stb %i3, [%i1] ! write a byte 510 add %i1, 1, %i1 ! increment destination address 511 sub %i2, 1, %i2 ! decrement count 512 btst 3, %i1 ! is destination aligned? 513 bnz,a 1b 514 sub %l0, 8, %l0 ! delay slot, decrement shift count 515.xfer: 516 sub %l5, %l0, %l1 ! generate shift left count 517 sll %i4, %l1, %i5 ! get leftover 5183: 519 and %i2, 3, %l4 ! must do remaining bytes if count%4 > 0 520 andn %i2, 3, %i2 ! # of aligned bytes that can be moved 5212: 522 ld [%i0], %i3 ! read a source word 523 add %i0, 4, %i0 ! increment source address 524 srl %i3, %l0, %i4 ! upper src bits into lower dst bits 525 or %i5, %i4, %i5 ! merge with upper dest bits (leftover) 526 st %i5, [%i1] ! write a destination word 527 subcc %i2, 4, %i2 ! decrement count 528 bz %ncc, .unalign_out ! check if done 529 add %i1, 4, %i1 ! increment destination address 530 b 2b ! loop 531 sll %i3, %l1, %i5 ! get leftover 532.unalign_out: 533 tst %l4 ! any bytes leftover? 534 bz %ncc, .cpdone 535 .empty ! allow next instruction in delay slot 5361: 537 sub %l0, 8, %l0 ! decrement shift 538 srl %i3, %l0, %i4 ! upper src byte into lower dst byte 539 stb %i4, [%i1] ! write a byte 540 subcc %l4, 1, %l4 ! decrement count 541 bz %ncc, .cpdone ! done? 542 add %i1, 1, %i1 ! increment destination 543 tst %l0 ! any more previously read bytes 544 bnz %ncc, 1b ! we have leftover bytes 545 mov %l4, %i2 ! delay slot, mv cnt where dbytecp wants 546 b .dbytecp ! let dbytecp do the rest 547 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 548 ! 549 ! the destination address is aligned and the source is not 550 ! 551.align_src_only: 552 ldub [%i0], %i3 ! read a byte from source address 553 add %i0, 1, %i0 ! increment source address 554 or %i4, %i3, %i4 ! or in with previous bytes (if any) 555 btst 3, %i0 ! is source aligned? 556 add %l0, 8, %l0 ! increment shift count (US) 557 bnz,a .align_src_only 558 sll %i4, 8, %i4 ! make room for next byte 559 b,a .xfer 560 ! 561 ! if from address unaligned for double-word moves, 562 ! move bytes till it is, if count is < 56 it could take 563 ! longer to align the thing than to do the transfer 564 ! in word size chunks right away 565 ! 566.aldoubcp: 567 cmp %i2, 56 ! if count < 56, use wordcp, it takes 568 blu,a %ncc, .alwordcp ! longer to align doubles than words 569 mov 3, %o0 ! mask for word alignment 570 call .alignit ! copy bytes until aligned 571 mov 7, %o0 ! mask for double alignment 572 ! 573 ! source and destination are now double-word aligned 574 ! i3 has aligned count returned by alignit 575 ! 576 and %i2, 7, %i2 ! unaligned leftover count 577 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 5785: 579 ldx [%i0+%i1], %o4 ! read from address 580 stx %o4, [%i1] ! write at destination address 581 subcc %i3, 8, %i3 ! dec count 582 bgu %ncc, 5b 583 add %i1, 8, %i1 ! delay slot, inc to address 584 cmp %i2, 4 ! see if we can copy a word 585 blu %ncc, .dbytecp ! if 3 or less bytes use bytecp 586 .empty 587 ! 588 ! for leftover bytes we fall into wordcp, if needed 589 ! 590.wordcp: 591 and %i2, 3, %i2 ! unaligned leftover count 5925: 593 ld [%i0+%i1], %o4 ! read from address 594 st %o4, [%i1] ! write at destination address 595 subcc %i3, 4, %i3 ! dec count 596 bgu %ncc, 5b 597 add %i1, 4, %i1 ! delay slot, inc to address 598 b,a .dbytecp 599 600 ! we come here to align copies on word boundaries 601.alwordcp: 602 call .alignit ! go word-align it 603 mov 3, %o0 ! bits that must be zero to be aligned 604 b .wordcp 605 sub %i0, %i1, %i0 ! i0 gets the difference of src and dst 606 607 ! 608 ! byte copy, works with any alignment 609 ! 610.bytecp: 611 b .dbytecp 612 sub %i0, %i1, %i0 ! i0 gets difference of src and dst 613 614 ! 615 ! differenced byte copy, works with any alignment 616 ! assumes dest in %i1 and (source - dest) in %i0 617 ! 6181: 619 stb %o4, [%i1] ! write to address 620 inc %i1 ! inc to address 621.dbytecp: 622 deccc %i2 ! dec count 623 bgeu,a %ncc, 1b ! loop till done 624 ldub [%i0+%i1], %o4 ! read from address 625.cpdone: 626 membar #Sync ! sync error barrier 627 ret 628 restore %g0, 0, %o0 ! return (0) 629 630/* 631 * Common code used to align transfers on word and doubleword 632 * boudaries. Aligns source and destination and returns a count 633 * of aligned bytes to transfer in %i3 634 */ 6351: 636 inc %i0 ! inc from 637 stb %o4, [%i1] ! write a byte 638 inc %i1 ! inc to 639 dec %i2 ! dec count 640.alignit: 641 btst %o0, %i0 ! %o0 is bit mask to check for alignment 642 bnz,a 1b 643 ldub [%i0], %o4 ! read next byte 644 645 retl 646 andn %i2, %o0, %i3 ! return size of aligned bytes 647 SET_SIZE(bcopy) 648 649#endif /* lint */ 650 651/* 652 * Block copy with possibly overlapped operands. 653 */ 654 655#if defined(lint) 656 657/*ARGSUSED*/ 658void 659ovbcopy(const void *from, void *to, size_t count) 660{} 661 662#else /* lint */ 663 664 ENTRY(ovbcopy) 665 tst %o2 ! check count 666 bgu,a %ncc, 1f ! nothing to do or bad arguments 667 subcc %o0, %o1, %o3 ! difference of from and to address 668 669 retl ! return 670 nop 6711: 672 bneg,a %ncc, 2f 673 neg %o3 ! if < 0, make it positive 6742: cmp %o2, %o3 ! cmp size and abs(from - to) 675 bleu %ncc, bcopy ! if size <= abs(diff): use bcopy, 676 .empty ! no overlap 677 cmp %o0, %o1 ! compare from and to addresses 678 blu %ncc, .ov_bkwd ! if from < to, copy backwards 679 nop 680 ! 681 ! Copy forwards. 682 ! 683.ov_fwd: 684 ldub [%o0], %o3 ! read from address 685 inc %o0 ! inc from address 686 stb %o3, [%o1] ! write to address 687 deccc %o2 ! dec count 688 bgu %ncc, .ov_fwd ! loop till done 689 inc %o1 ! inc to address 690 691 retl ! return 692 nop 693 ! 694 ! Copy backwards. 695 ! 696.ov_bkwd: 697 deccc %o2 ! dec count 698 ldub [%o0 + %o2], %o3 ! get byte at end of src 699 bgu %ncc, .ov_bkwd ! loop till done 700 stb %o3, [%o1 + %o2] ! delay slot, store at end of dst 701 702 retl ! return 703 nop 704 SET_SIZE(ovbcopy) 705 706#endif /* lint */ 707 708/* 709 * hwblkpagecopy() 710 * 711 * Copies exactly one page. This routine assumes the caller (ppcopy) 712 * has already disabled kernel preemption and has checked 713 * use_hw_bcopy. 714 */ 715#ifdef lint 716/*ARGSUSED*/ 717void 718hwblkpagecopy(const void *src, void *dst) 719{ } 720#else /* lint */ 721 ENTRY(hwblkpagecopy) 722 save %sp, -SA(MINFRAME + 4*64), %sp 723 724 ! %i0 - source address (arg) 725 ! %i1 - destination address (arg) 726 ! %i2 - length of region (not arg) 727 728 set PAGESIZE, %i2 729 730 /* 731 * Copying exactly one page and PAGESIZE is in mutliple of 0x80. 732 */ 733 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 734 prefetch [%i0+0x0], #one_read 735 prefetch [%i0+0x40], #one_read 7361: 737 prefetch [%i0+0x80], #one_read 738 prefetch [%i0+0xc0], #one_read 739 ldda [%i0+0x0]%asi, %l0 740 ldda [%i0+0x10]%asi, %l2 741 ldda [%i0+0x20]%asi, %l4 742 ldda [%i0+0x30]%asi, %l6 743 stxa %l0, [%i1+0x0]%asi 744 stxa %l1, [%i1+0x8]%asi 745 stxa %l2, [%i1+0x10]%asi 746 stxa %l3, [%i1+0x18]%asi 747 stxa %l4, [%i1+0x20]%asi 748 stxa %l5, [%i1+0x28]%asi 749 stxa %l6, [%i1+0x30]%asi 750 stxa %l7, [%i1+0x38]%asi 751 ldda [%i0+0x40]%asi, %l0 752 ldda [%i0+0x50]%asi, %l2 753 ldda [%i0+0x60]%asi, %l4 754 ldda [%i0+0x70]%asi, %l6 755 stxa %l0, [%i1+0x40]%asi 756 stxa %l1, [%i1+0x48]%asi 757 stxa %l2, [%i1+0x50]%asi 758 stxa %l3, [%i1+0x58]%asi 759 stxa %l4, [%i1+0x60]%asi 760 stxa %l5, [%i1+0x68]%asi 761 stxa %l6, [%i1+0x70]%asi 762 stxa %l7, [%i1+0x78]%asi 763 764 add %i0, 0x80, %i0 765 subcc %i2, 0x80, %i2 766 bgu,pt %xcc, 1b 767 add %i1, 0x80, %i1 768 769 membar #Sync 770 ret 771 restore %g0, 0, %o0 772 SET_SIZE(hwblkpagecopy) 773#endif /* lint */ 774 775 776/* 777 * Transfer data to and from user space - 778 * Note that these routines can cause faults 779 * It is assumed that the kernel has nothing at 780 * less than KERNELBASE in the virtual address space. 781 * 782 * Note that copyin(9F) and copyout(9F) are part of the 783 * DDI/DKI which specifies that they return '-1' on "errors." 784 * 785 * Sigh. 786 * 787 * So there's two extremely similar routines - xcopyin() and xcopyout() 788 * which return the errno that we've faithfully computed. This 789 * allows other callers (e.g. uiomove(9F)) to work correctly. 790 * Given that these are used pretty heavily, we expand the calling 791 * sequences inline for all flavours (rather than making wrappers). 792 * 793 * There are also stub routines for xcopyout_little and xcopyin_little, 794 * which currently are intended to handle requests of <= 16 bytes from 795 * do_unaligned. Future enhancement to make them handle 8k pages efficiently 796 * is left as an exercise... 797 */ 798 799/* 800 * Copy user data to kernel space (copyOP/xcopyOP/copyOP_noerr) 801 * 802 * General theory of operation: 803 * 804 * None of the copyops routines grab a window until it's decided that 805 * we need to do a HW block copy operation. This saves a window 806 * spill/fill when we're called during socket ops. The typical IO 807 * path won't cause spill/fill traps. 808 * 809 * This code uses a set of 4 limits for the maximum size that will 810 * be copied given a particular input/output address alignment. 811 * the default limits are: 812 * 813 * single byte aligned - 256 (hw_copy_limit_1) 814 * two byte aligned - 512 (hw_copy_limit_2) 815 * four byte aligned - 1024 (hw_copy_limit_4) 816 * eight byte aligned - 1024 (hw_copy_limit_8) 817 * 818 * If the value for a particular limit is zero, the copy will be done 819 * via the copy loops rather than block store/quad load instructions. 820 * 821 * Flow: 822 * 823 * If count == zero return zero. 824 * 825 * Store the previous lo_fault handler into %g6. 826 * Place our secondary lofault handler into %g5. 827 * Place the address of our nowindow fault handler into %o3. 828 * Place the address of the windowed fault handler into %o4. 829 * --> We'll use this handler if we end up grabbing a window 830 * --> before we use block initializing store and quad load ASIs 831 * 832 * If count is less than or equal to SMALL_LIMIT (7) we 833 * always do a byte for byte copy. 834 * 835 * If count is > SMALL_LIMIT, we check the alignment of the input 836 * and output pointers. Based on the alignment we check count 837 * against a limit based on detected alignment. If we exceed the 838 * alignment value we copy via block initializing store and quad 839 * load instructions. 840 * 841 * If we don't exceed one of the limits, we store -count in %o3, 842 * we store the number of chunks (8, 4, 2 or 1 byte) operated 843 * on in our basic copy loop in %o2. Following this we branch 844 * to the appropriate copy loop and copy that many chunks. 845 * Since we've been adding the chunk size to %o3 each time through 846 * as well as decrementing %o2, we can tell if any data is 847 * is left to be copied by examining %o3. If that is zero, we're 848 * done and can go home. If not, we figure out what the largest 849 * chunk size left to be copied is and branch to that copy loop 850 * unless there's only one byte left. We load that as we're 851 * branching to code that stores it just before we return. 852 * 853 * Fault handlers are invoked if we reference memory that has no 854 * current mapping. All forms share the same copyio_fault handler. 855 * This routine handles fixing up the stack and general housecleaning. 856 * Each copy operation has a simple fault handler that is then called 857 * to do the work specific to the invidual operation. The handler 858 * for copyOP and xcopyOP are found at the end of individual function. 859 * The handlers for xcopyOP_little are found at the end of xcopyin_little. 860 * The handlers for copyOP_noerr are found at the end of copyin_noerr. 861 */ 862 863/* 864 * Copy kernel data to user space (copyout/xcopyout/xcopyout_little). 865 */ 866 867#if defined(lint) 868 869/*ARGSUSED*/ 870int 871copyout(const void *kaddr, void *uaddr, size_t count) 872{ return (0); } 873 874#else /* lint */ 875 876/* 877 * We save the arguments in the following registers in case of a fault: 878 * kaddr - %g2 879 * uaddr - %g3 880 * count - %g4 881 */ 882#define SAVE_SRC %g2 883#define SAVE_DST %g3 884#define SAVE_COUNT %g4 885 886#define REAL_LOFAULT %g5 887#define SAVED_LOFAULT %g6 888 889/* 890 * Generic copyio fault handler. This is the first line of defense when a 891 * fault occurs in (x)copyin/(x)copyout. In order for this to function 892 * properly, the value of the 'real' lofault handler should be in REAL_LOFAULT. 893 * This allows us to share common code for all the flavors of the copy 894 * operations, including the _noerr versions. 895 * 896 * Note that this function will restore the original input parameters before 897 * calling REAL_LOFAULT. So the real handler can vector to the appropriate 898 * member of the t_copyop structure, if needed. 899 */ 900 ENTRY(copyio_fault) 901 membar #Sync 902 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 903 904 restore 905 906 mov SAVE_SRC, %o0 907 mov SAVE_DST, %o1 908 jmp REAL_LOFAULT 909 mov SAVE_COUNT, %o2 910 SET_SIZE(copyio_fault) 911 912 ENTRY(copyio_fault_nowindow) 913 membar #Sync 914 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 915 916 mov SAVE_SRC, %o0 917 mov SAVE_DST, %o1 918 jmp REAL_LOFAULT 919 mov SAVE_COUNT, %o2 920 SET_SIZE(copyio_fault_nowindow) 921 922 ENTRY(copyout) 923 sethi %hi(.copyout_err), REAL_LOFAULT 924 or REAL_LOFAULT, %lo(.copyout_err), REAL_LOFAULT 925 926.do_copyout: 927 ! 928 ! Check the length and bail if zero. 929 ! 930 tst %o2 931 bnz,pt %ncc, 1f 932 nop 933 retl 934 clr %o0 9351: 936 sethi %hi(copyio_fault), %o4 937 or %o4, %lo(copyio_fault), %o4 938 sethi %hi(copyio_fault_nowindow), %o3 939 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 940 or %o3, %lo(copyio_fault_nowindow), %o3 941 membar #Sync 942 stn %o3, [THREAD_REG + T_LOFAULT] 943 944 mov %o0, SAVE_SRC 945 mov %o1, SAVE_DST 946 mov %o2, SAVE_COUNT 947 948 ! 949 ! Check to see if we're more than SMALL_LIMIT (7 bytes). 950 ! Run in leaf mode, using the %o regs as our input regs. 951 ! 952 subcc %o2, SMALL_LIMIT, %o3 953 bgu,a,pt %ncc, .dco_ns 954 or %o0, %o1, %o3 955 ! 956 ! What was previously ".small_copyout" 957 ! Do full differenced copy. 958 ! 959.dcobcp: 960 sub %g0, %o2, %o3 ! negate count 961 add %o0, %o2, %o0 ! make %o0 point at the end 962 add %o1, %o2, %o1 ! make %o1 point at the end 963 ba,pt %ncc, .dcocl 964 ldub [%o0 + %o3], %o4 ! load first byte 965 ! 966 ! %o0 and %o2 point at the end and remain pointing at the end 967 ! of their buffers. We pull things out by adding %o3 (which is 968 ! the negation of the length) to the buffer end which gives us 969 ! the curent location in the buffers. By incrementing %o3 we walk 970 ! through both buffers without having to bump each buffer's 971 ! pointer. A very fast 4 instruction loop. 972 ! 973 .align 16 974.dcocl: 975 stba %o4, [%o1 + %o3]ASI_USER 976 inccc %o3 977 bl,a,pt %ncc, .dcocl 978 ldub [%o0 + %o3], %o4 979 ! 980 ! We're done. Go home. 981 ! 982 membar #Sync 983 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 984 retl 985 clr %o0 986 ! 987 ! Try aligned copies from here. 988 ! 989.dco_ns: 990 ! %o0 = kernel addr (to be copied from) 991 ! %o1 = user addr (to be copied to) 992 ! %o2 = length 993 ! %o3 = %o1 | %o2 (used for alignment checking) 994 ! %o4 is alternate lo_fault 995 ! %o5 is original lo_fault 996 ! 997 ! See if we're single byte aligned. If we are, check the 998 ! limit for single byte copies. If we're smaller or equal, 999 ! bounce to the byte for byte copy loop. Otherwise do it in 1000 ! HW (if enabled). 1001 ! 1002 btst 1, %o3 1003 bz,pt %icc, .dcoh8 1004 btst 7, %o3 1005 ! 1006 ! Single byte aligned. Do we do it via HW or via 1007 ! byte for byte? Do a quick no memory reference 1008 ! check to pick up small copies. 1009 ! 1010 sethi %hi(hw_copy_limit_1), %o3 1011 ! 1012 ! Big enough that we need to check the HW limit for 1013 ! this size copy. 1014 ! 1015 ld [%o3 + %lo(hw_copy_limit_1)], %o3 1016 ! 1017 ! Is HW copy on? If not, do everything byte for byte. 1018 ! 1019 tst %o3 1020 bz,pn %icc, .dcobcp 1021 subcc %o3, %o2, %o3 1022 ! 1023 ! If we're less than or equal to the single byte copy limit, 1024 ! bop to the copy loop. 1025 ! 1026 bge,pt %ncc, .dcobcp 1027 nop 1028 ! 1029 ! We're big enough and copy is on. Do it with HW. 1030 ! 1031 ba,pt %ncc, .big_copyout 1032 nop 1033.dcoh8: 1034 ! 1035 ! 8 byte aligned? 1036 ! 1037 bnz,a %ncc, .dcoh4 1038 btst 3, %o3 1039 ! 1040 ! See if we're in the "small range". 1041 ! If so, go off and do the copy. 1042 ! If not, load the hard limit. %o3 is 1043 ! available for reuse. 1044 ! 1045 sethi %hi(hw_copy_limit_8), %o3 1046 ld [%o3 + %lo(hw_copy_limit_8)], %o3 1047 ! 1048 ! If it's zero, there's no HW bcopy. 1049 ! Bop off to the aligned copy. 1050 ! 1051 tst %o3 1052 bz,pn %icc, .dcos8 1053 subcc %o3, %o2, %o3 1054 ! 1055 ! We're negative if our size is larger than hw_copy_limit_8. 1056 ! 1057 bge,pt %ncc, .dcos8 1058 nop 1059 ! 1060 ! HW assist is on and we're large enough. Do it. 1061 ! 1062 ba,pt %ncc, .big_copyout 1063 nop 1064.dcos8: 1065 ! 1066 ! Housekeeping for copy loops. Uses same idea as in the byte for 1067 ! byte copy loop above. 1068 ! 1069 add %o0, %o2, %o0 1070 add %o1, %o2, %o1 1071 sub %g0, %o2, %o3 1072 ba,pt %ncc, .dodebc 1073 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy 1074 ! 1075 ! 4 byte aligned? 1076 ! 1077.dcoh4: 1078 bnz,pn %ncc, .dcoh2 1079 ! 1080 ! See if we're in the "small range". 1081 ! If so, go off an do the copy. 1082 ! If not, load the hard limit. %o3 is 1083 ! available for reuse. 1084 ! 1085 sethi %hi(hw_copy_limit_4), %o3 1086 ld [%o3 + %lo(hw_copy_limit_4)], %o3 1087 ! 1088 ! If it's zero, there's no HW bcopy. 1089 ! Bop off to the aligned copy. 1090 ! 1091 tst %o3 1092 bz,pn %icc, .dcos4 1093 subcc %o3, %o2, %o3 1094 ! 1095 ! We're negative if our size is larger than hw_copy_limit_4. 1096 ! 1097 bge,pt %ncc, .dcos4 1098 nop 1099 ! 1100 ! HW assist is on and we're large enough. Do it. 1101 ! 1102 ba,pt %ncc, .big_copyout 1103 nop 1104.dcos4: 1105 add %o0, %o2, %o0 1106 add %o1, %o2, %o1 1107 sub %g0, %o2, %o3 1108 ba,pt %ncc, .dodfbc 1109 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy 1110 ! 1111 ! We must be 2 byte aligned. Off we go. 1112 ! The check for small copies was done in the 1113 ! delay at .dcoh4 1114 ! 1115.dcoh2: 1116 ble %ncc, .dcos2 1117 sethi %hi(hw_copy_limit_2), %o3 1118 ld [%o3 + %lo(hw_copy_limit_2)], %o3 1119 tst %o3 1120 bz,pn %icc, .dcos2 1121 subcc %o3, %o2, %o3 1122 bge,pt %ncc, .dcos2 1123 nop 1124 ! 1125 ! HW is on and we're big enough. Do it. 1126 ! 1127 ba,pt %ncc, .big_copyout 1128 nop 1129.dcos2: 1130 add %o0, %o2, %o0 1131 add %o1, %o2, %o1 1132 sub %g0, %o2, %o3 1133 ba,pt %ncc, .dodtbc 1134 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy 1135.small_copyout: 1136 ! 1137 ! Why are we doing this AGAIN? There are certain conditions in 1138 ! big_copyout that will cause us to forego the HW assisted copies 1139 ! and bounce back to a non-HW assisted copy. This dispatches those 1140 ! copies. Note that we branch around this in the main line code. 1141 ! 1142 ! We make no check for limits or HW enablement here. We've 1143 ! already been told that we're a poster child so just go off 1144 ! and do it. 1145 ! 1146 or %o0, %o1, %o3 1147 btst 1, %o3 1148 bnz %icc, .dcobcp ! Most likely 1149 btst 7, %o3 1150 bz %icc, .dcos8 1151 btst 3, %o3 1152 bz %icc, .dcos4 1153 nop 1154 ba,pt %ncc, .dcos2 1155 nop 1156 .align 32 1157.dodebc: 1158 ldx [%o0 + %o3], %o4 1159 deccc %o2 1160 stxa %o4, [%o1 + %o3]ASI_USER 1161 bg,pt %ncc, .dodebc 1162 addcc %o3, 8, %o3 1163 ! 1164 ! End of copy loop. Check to see if we're done. Most 1165 ! eight byte aligned copies end here. 1166 ! 1167 bz,pt %ncc, .dcofh 1168 nop 1169 ! 1170 ! Something is left - do it byte for byte. 1171 ! 1172 ba,pt %ncc, .dcocl 1173 ldub [%o0 + %o3], %o4 ! load next byte 1174 ! 1175 ! Four byte copy loop. %o2 is the number of 4 byte chunks to copy. 1176 ! 1177 .align 32 1178.dodfbc: 1179 lduw [%o0 + %o3], %o4 1180 deccc %o2 1181 sta %o4, [%o1 + %o3]ASI_USER 1182 bg,pt %ncc, .dodfbc 1183 addcc %o3, 4, %o3 1184 ! 1185 ! End of copy loop. Check to see if we're done. Most 1186 ! four byte aligned copies end here. 1187 ! 1188 bz,pt %ncc, .dcofh 1189 nop 1190 ! 1191 ! Something is left. Do it byte for byte. 1192 ! 1193 ba,pt %ncc, .dcocl 1194 ldub [%o0 + %o3], %o4 ! load next byte 1195 ! 1196 ! two byte aligned copy loop. %o2 is the number of 2 byte chunks to 1197 ! copy. 1198 ! 1199 .align 32 1200.dodtbc: 1201 lduh [%o0 + %o3], %o4 1202 deccc %o2 1203 stha %o4, [%o1 + %o3]ASI_USER 1204 bg,pt %ncc, .dodtbc 1205 addcc %o3, 2, %o3 1206 ! 1207 ! End of copy loop. Anything left? 1208 ! 1209 bz,pt %ncc, .dcofh 1210 nop 1211 ! 1212 ! Deal with the last byte 1213 ! 1214 ldub [%o0 + %o3], %o4 1215 stba %o4, [%o1 + %o3]ASI_USER 1216.dcofh: 1217 membar #Sync 1218 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1219 retl 1220 clr %o0 1221 1222.big_copyout: 1223 ! 1224 ! We're going to go off and do a block copy. 1225 ! Switch fault handlers and grab a window. We 1226 ! don't do a membar #Sync since we've done only 1227 ! kernel data to this point. 1228 ! 1229 stn %o4, [THREAD_REG + T_LOFAULT] 1230 save %sp, -SA(MINFRAME), %sp 1231 1232 ! Copy out that reach here are larger than 256 bytes. The 1233 ! hw_copy_limit_1 is set to 256. Never set this limit less 1234 ! 128 bytes. 1235.do_block_copyout: 1236 1237 ! Swap src/dst since the code below is memcpy code 1238 ! and memcpy/bcopy have different calling sequences 1239 mov %i1, %i5 1240 mov %i0, %i1 1241 mov %i5, %i0 1242 1243 andcc %i0, 7, %i3 ! is dst double aligned 1244 bz %ncc, copyout_blkcpy 1245 sub %i3, 8, %i3 1246 neg %i3 ! bytes till double aligned 1247 sub %i2, %i3, %i2 ! update %i2 with new count 1248 1249 ! Align Destination on double-word boundary 1250 12511: ldub [%i1], %i4 1252 inc %i1 1253 stba %i4, [%i0]ASI_USER 1254 deccc %i3 1255 bgu %ncc, 1b 1256 inc %i0 1257 1258copyout_blkcpy: 1259 andcc %i0, 63, %i3 1260 bz,pn %ncc, copyout_blalign ! now block aligned 1261 sub %i3, 64, %i3 1262 neg %i3 ! bytes till block aligned 1263 sub %i2, %i3, %i2 ! update %i2 with new count 1264 1265 ! Copy %i3 bytes till dst is block (64 byte) aligned. use 1266 ! double word copies. 1267 1268 andcc %i1, 7, %g1 ! is src aligned on a 8 bytes 1269 bz %ncc, .co_dbcopy ! %g1 has source offset (last 3-bits) 1270 sll %g1, 3, %l1 ! left shift 1271 mov 0x40, %l2 1272 sub %l2, %l1, %l2 ! right shift = (64 - left shift) 1273 1274 ! Now use double word copies to align destination. 1275.co_double: 1276 sub %i1, %g1, %i1 ! align the src at 8 bytes. 1277 ldx [%i1], %o2 12782: 1279 ldx [%i1+8], %o4 1280 ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3) 1281 stxa %o2, [%i0]ASI_USER 1282 mov %o4, %o2 1283 add %i1, 0x8, %i1 1284 subcc %i3, 0x8, %i3 1285 bgu,pt %ncc, 2b 1286 add %i0, 0x8, %i0 1287 ba copyout_blalign 1288 add %i1, %g1, %i1 1289 1290 ! Both source and destination are double aligned. 1291 ! No shift and merge of data required in this case. 1292.co_dbcopy: 1293 ldx [%i1], %o2 1294 stxa %o2, [%i0]ASI_USER 1295 add %i1, 0x8, %i1 1296 subcc %i3, 0x8, %i3 1297 bgu,pt %ncc, .co_dbcopy 1298 add %i0, 0x8, %i0 1299 1300copyout_blalign: 1301 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 1302 sub %i2, %i3, %i2 ! Residue bytes in %i2 1303 1304 mov ASI_BLK_INIT_QUAD_LDD_AIUS, %asi 1305 1306 andcc %i1, 0xf, %o2 ! is src quadword aligned 1307 bz,pn %xcc, .co_blkcpy ! src offset in %o2 (last 4-bits) 1308 nop 1309 cmp %o2, 0x8 1310 bg .co_upper_double 1311 nop 1312 bl .co_lower_double 1313 nop 1314 1315 ! Falls through when source offset is equal to 8 i.e. 1316 ! source is double word aligned. 1317 ! In this case no shift/merge of data is required 1318 1319 sub %i1, %o2, %i1 ! align the src at 16 bytes. 1320 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 1321 prefetch [%l0+0x0], #one_read 1322 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1323.co_loop0: 1324 add %i1, 0x10, %i1 1325 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 1326 prefetch [%l0+0x40], #one_read 1327 1328 stxa %l3, [%i0+0x0]%asi 1329 stxa %l4, [%i0+0x8]%asi 1330 1331 add %i1, 0x10, %i1 1332 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1333 1334 stxa %l5, [%i0+0x10]%asi 1335 stxa %l2, [%i0+0x18]%asi 1336 1337 add %i1, 0x10, %i1 1338 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 1339 1340 stxa %l3, [%i0+0x20]%asi 1341 stxa %l4, [%i0+0x28]%asi 1342 1343 add %i1, 0x10, %i1 1344 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1345 1346 stxa %l5, [%i0+0x30]%asi 1347 stxa %l2, [%i0+0x38]%asi 1348 1349 add %l0, 0x40, %l0 1350 subcc %i3, 0x40, %i3 1351 bgu,pt %xcc, .co_loop0 1352 add %i0, 0x40, %i0 1353 ba .co_blkdone 1354 add %i1, %o2, %i1 ! increment the source by src offset 1355 ! the src offset was stored in %o2 1356 1357.co_lower_double: 1358 1359 sub %i1, %o2, %i1 ! align the src at 16 bytes. 1360 sll %o2, 3, %o0 ! %o0 left shift 1361 mov 0x40, %o1 1362 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 1363 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 1364 prefetch [%l0+0x0], #one_read 1365 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l2 and %l3 has 1366 ! complete data 1367.co_loop1: 1368 add %i1, 0x10, %i1 1369 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has partial data 1370 ! for this read. 1371 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 1372 ! into %l2 and %l3 1373 prefetch [%l0+0x40], #one_read 1374 1375 stxa %l2, [%i0+0x0]%asi 1376 stxa %l3, [%i0+0x8]%asi 1377 1378 add %i1, 0x10, %i1 1379 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1380 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 1381 ! %l4 from previous read 1382 ! into %l4 and %l5 1383 stxa %l4, [%i0+0x10]%asi 1384 stxa %l5, [%i0+0x18]%asi 1385 1386 ! Repeat the same for next 32 bytes. 1387 1388 add %i1, 0x10, %i1 1389 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 1390 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 1391 1392 stxa %l2, [%i0+0x20]%asi 1393 stxa %l3, [%i0+0x28]%asi 1394 1395 add %i1, 0x10, %i1 1396 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1397 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 1398 1399 stxa %l4, [%i0+0x30]%asi 1400 stxa %l5, [%i0+0x38]%asi 1401 1402 add %l0, 0x40, %l0 1403 subcc %i3, 0x40, %i3 1404 bgu,pt %xcc, .co_loop1 1405 add %i0, 0x40, %i0 1406 ba .co_blkdone 1407 add %i1, %o2, %i1 ! increment the source by src offset 1408 ! the src offset was stored in %o2 1409 1410.co_upper_double: 1411 1412 sub %i1, %o2, %i1 ! align the src at 16 bytes. 1413 sub %o2, 0x8, %o0 1414 sll %o0, 3, %o0 ! %o0 left shift 1415 mov 0x40, %o1 1416 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 1417 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 1418 prefetch [%l0+0x0], #one_read 1419 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 ! partial data in %l3 1420 ! for this read and 1421 ! no data in %l2 1422.co_loop2: 1423 add %i1, 0x10, %i1 1424 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 ! %l4 has complete data 1425 ! and %l5 has partial 1426 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 1427 ! into %l3 and %l4 1428 prefetch [%l0+0x40], #one_read 1429 1430 stxa %l3, [%i0+0x0]%asi 1431 stxa %l4, [%i0+0x8]%asi 1432 1433 add %i1, 0x10, %i1 1434 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1435 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 1436 ! %l5 from previous read 1437 ! into %l5 and %l2 1438 1439 stxa %l5, [%i0+0x10]%asi 1440 stxa %l2, [%i0+0x18]%asi 1441 1442 ! Repeat the same for next 32 bytes. 1443 1444 add %i1, 0x10, %i1 1445 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 1446 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 1447 1448 stxa %l3, [%i0+0x20]%asi 1449 stxa %l4, [%i0+0x28]%asi 1450 1451 add %i1, 0x10, %i1 1452 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1453 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 1454 1455 stxa %l5, [%i0+0x30]%asi 1456 stxa %l2, [%i0+0x38]%asi 1457 1458 add %l0, 0x40, %l0 1459 subcc %i3, 0x40, %i3 1460 bgu,pt %xcc, .co_loop2 1461 add %i0, 0x40, %i0 1462 ba .co_blkdone 1463 add %i1, %o2, %i1 ! increment the source by src offset 1464 ! the src offset was stored in %o2 1465 1466 1467 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 1468.co_blkcpy: 1469 1470 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 1471 prefetch [%o0+0x0], #one_read 14721: 1473 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l0 1474 add %i1, 0x10, %i1 1475 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l2 1476 add %i1, 0x10, %i1 1477 1478 prefetch [%o0+0x40], #one_read 1479 1480 stxa %l0, [%i0+0x0]%asi 1481 1482 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l4 1483 add %i1, 0x10, %i1 1484 ldda [%i1]ASI_BLK_INIT_ST_QUAD_LDD_P, %l6 1485 add %i1, 0x10, %i1 1486 1487 stxa %l1, [%i0+0x8]%asi 1488 stxa %l2, [%i0+0x10]%asi 1489 stxa %l3, [%i0+0x18]%asi 1490 stxa %l4, [%i0+0x20]%asi 1491 stxa %l5, [%i0+0x28]%asi 1492 stxa %l6, [%i0+0x30]%asi 1493 stxa %l7, [%i0+0x38]%asi 1494 1495 add %o0, 0x40, %o0 1496 subcc %i3, 0x40, %i3 1497 bgu,pt %xcc, 1b 1498 add %i0, 0x40, %i0 1499 1500.co_blkdone: 1501 membar #Sync 1502 1503 ! Copy as much rest of the data as double word copy. 1504.co_dwcp: 1505 cmp %i2, 0x8 ! Not enough bytes to copy as double 1506 blu %ncc, .co_dbdone 1507 nop 1508 1509 andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size 1510 sub %i2, %i3, %i2 ! Residue bytes in %i2 1511 1512 andcc %i1, 7, %g1 ! is src aligned on a 8 bytes 1513 bz %ncc, .co_cpy_db 1514 nop 1515 1516 sll %g1, 3, %l0 ! left shift 1517 mov 0x40, %l1 1518 sub %l1, %l0, %l1 ! right shift = (64 - left shift) 1519 1520.co_cpy_wd: 1521 sub %i1, %g1, %i1 ! align the src at 8 bytes. 1522 ldx [%i1], %o2 15233: 1524 ldx [%i1+8], %o4 1525 ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3) 1526 stxa %o2, [%i0]ASI_USER 1527 mov %o4, %o2 1528 add %i1, 0x8, %i1 1529 subcc %i3, 0x8, %i3 1530 bgu,pt %ncc, 3b 1531 add %i0, 0x8, %i0 1532 ba .co_dbdone 1533 add %i1, %g1, %i1 1534 1535.co_cpy_db: 1536 ldx [%i1], %o2 1537 stxa %o2, [%i0]ASI_USER 1538 add %i1, 0x8, %i1 1539 subcc %i3, 0x8, %i3 1540 bgu,pt %ncc, .co_cpy_db 1541 add %i0, 0x8, %i0 1542 1543.co_dbdone: 1544 tst %i2 1545 bz,pt %xcc, .copyout_exit 1546 nop 1547 1548 ! Copy the residue as byte copy 1549.co_residue: 1550 ldub [%i1], %i4 1551 stba %i4, [%i0]ASI_USER 1552 inc %i1 1553 deccc %i2 1554 bgu %xcc, .co_residue 1555 inc %i0 1556 1557.copyout_exit: 1558 membar #Sync 1559 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1560 ret 1561 restore %g0, 0, %o0 1562 1563.copyout_err: 1564 ldn [THREAD_REG + T_COPYOPS], %o4 1565 brz %o4, 2f 1566 nop 1567 ldn [%o4 + CP_COPYOUT], %g2 1568 jmp %g2 1569 nop 15702: 1571 retl 1572 mov -1, %o0 1573 SET_SIZE(copyout) 1574 1575#endif /* lint */ 1576 1577 1578#ifdef lint 1579 1580/*ARGSUSED*/ 1581int 1582xcopyout(const void *kaddr, void *uaddr, size_t count) 1583{ return (0); } 1584 1585#else /* lint */ 1586 1587 ENTRY(xcopyout) 1588 sethi %hi(.xcopyout_err), REAL_LOFAULT 1589 b .do_copyout 1590 or REAL_LOFAULT, %lo(.xcopyout_err), REAL_LOFAULT 1591.xcopyout_err: 1592 ldn [THREAD_REG + T_COPYOPS], %o4 1593 brz %o4, 2f 1594 nop 1595 ldn [%o4 + CP_XCOPYOUT], %g2 1596 jmp %g2 1597 nop 15982: 1599 retl 1600 mov %g1, %o0 1601 SET_SIZE(xcopyout) 1602 1603#endif /* lint */ 1604 1605#ifdef lint 1606 1607/*ARGSUSED*/ 1608int 1609xcopyout_little(const void *kaddr, void *uaddr, size_t count) 1610{ return (0); } 1611 1612#else /* lint */ 1613 1614 ENTRY(xcopyout_little) 1615 sethi %hi(.little_err), %o4 1616 ldn [THREAD_REG + T_LOFAULT], %o5 1617 or %o4, %lo(.little_err), %o4 1618 membar #Sync ! sync error barrier 1619 stn %o4, [THREAD_REG + T_LOFAULT] 1620 1621 subcc %g0, %o2, %o3 1622 add %o0, %o2, %o0 1623 bz,pn %ncc, 2f ! check for zero bytes 1624 sub %o2, 1, %o4 1625 add %o0, %o4, %o0 ! start w/last byte 1626 add %o1, %o2, %o1 1627 ldub [%o0+%o3], %o4 1628 16291: stba %o4, [%o1+%o3]ASI_AIUSL 1630 inccc %o3 1631 sub %o0, 2, %o0 ! get next byte 1632 bcc,a,pt %ncc, 1b 1633 ldub [%o0+%o3], %o4 1634 16352: membar #Sync ! sync error barrier 1636 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1637 retl 1638 mov %g0, %o0 ! return (0) 1639 SET_SIZE(xcopyout_little) 1640 1641#endif /* lint */ 1642 1643/* 1644 * Copy user data to kernel space (copyin/xcopyin/xcopyin_little) 1645 */ 1646 1647#if defined(lint) 1648 1649/*ARGSUSED*/ 1650int 1651copyin(const void *uaddr, void *kaddr, size_t count) 1652{ return (0); } 1653 1654#else /* lint */ 1655 1656 ENTRY(copyin) 1657 sethi %hi(.copyin_err), REAL_LOFAULT 1658 or REAL_LOFAULT, %lo(.copyin_err), REAL_LOFAULT 1659 1660.do_copyin: 1661 ! 1662 ! Check the length and bail if zero. 1663 ! 1664 tst %o2 1665 bnz,pt %ncc, 1f 1666 nop 1667 retl 1668 clr %o0 16691: 1670 sethi %hi(copyio_fault), %o4 1671 or %o4, %lo(copyio_fault), %o4 1672 sethi %hi(copyio_fault_nowindow), %o3 1673 ldn [THREAD_REG + T_LOFAULT], SAVED_LOFAULT 1674 or %o3, %lo(copyio_fault_nowindow), %o3 1675 membar #Sync 1676 stn %o3, [THREAD_REG + T_LOFAULT] 1677 1678 mov %o0, SAVE_SRC 1679 mov %o1, SAVE_DST 1680 mov %o2, SAVE_COUNT 1681 1682 ! 1683 ! Check to see if we're more than SMALL_LIMIT. 1684 ! 1685 subcc %o2, SMALL_LIMIT, %o3 1686 bgu,a,pt %ncc, .dci_ns 1687 or %o0, %o1, %o3 1688 ! 1689 ! What was previously ".small_copyin" 1690 ! 1691.dcibcp: 1692 sub %g0, %o2, %o3 ! setup for copy loop 1693 add %o0, %o2, %o0 1694 add %o1, %o2, %o1 1695 ba,pt %ncc, .dcicl 1696 lduba [%o0 + %o3]ASI_USER, %o4 1697 ! 1698 ! %o0 and %o1 point at the end and remain pointing at the end 1699 ! of their buffers. We pull things out by adding %o3 (which is 1700 ! the negation of the length) to the buffer end which gives us 1701 ! the curent location in the buffers. By incrementing %o3 we walk 1702 ! through both buffers without having to bump each buffer's 1703 ! pointer. A very fast 4 instruction loop. 1704 ! 1705 .align 16 1706.dcicl: 1707 stb %o4, [%o1 + %o3] 1708 inccc %o3 1709 bl,a,pt %ncc, .dcicl 1710 lduba [%o0 + %o3]ASI_USER, %o4 1711 ! 1712 ! We're done. Go home. 1713 ! 1714 membar #Sync 1715 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] 1716 retl 1717 clr %o0 1718 ! 1719 ! Try aligned copies from here. 1720 ! 1721.dci_ns: 1722 ! 1723 ! See if we're single byte aligned. If we are, check the 1724 ! limit for single byte copies. If we're smaller, or equal, 1725 ! bounce to the byte for byte copy loop. Otherwise do it in 1726 ! HW (if enabled). 1727 ! 1728 btst 1, %o3 1729 bz,a,pt %icc, .dcih8 1730 btst 7, %o3 1731 ! 1732 ! We're single byte aligned. 1733 ! 1734 sethi %hi(hw_copy_limit_1), %o3 1735 ld [%o3 + %lo(hw_copy_limit_1)], %o3 1736 ! 1737 ! Is HW copy on? If not do everything byte for byte. 1738 ! 1739 tst %o3 1740 bz,pn %icc, .dcibcp 1741 subcc %o3, %o2, %o3 1742 ! 1743 ! Are we bigger than the HW limit? If not 1744 ! go to byte for byte. 1745 ! 1746 bge,pt %ncc, .dcibcp 1747 nop 1748 ! 1749 ! We're big enough and copy is on. Do it with HW. 1750 ! 1751 ba,pt %ncc, .big_copyin 1752 nop 1753.dcih8: 1754 ! 1755 ! 8 byte aligned? 1756 ! 1757 bnz,a %ncc, .dcih4 1758 btst 3, %o3 1759 ! 1760 ! We're eight byte aligned. 1761 ! 1762 sethi %hi(hw_copy_limit_8), %o3 1763 ld [%o3 + %lo(hw_copy_limit_8)], %o3 1764 ! 1765 ! Is HW assist on? If not, do it with the aligned copy. 1766 ! 1767 tst %o3 1768 bz,pn %icc, .dcis8 1769 subcc %o3, %o2, %o3 1770 bge %ncc, .dcis8 1771 nop 1772 ba,pt %ncc, .big_copyin 1773 nop 1774.dcis8: 1775 ! 1776 ! Housekeeping for copy loops. Uses same idea as in the byte for 1777 ! byte copy loop above. 1778 ! 1779 add %o0, %o2, %o0 1780 add %o1, %o2, %o1 1781 sub %g0, %o2, %o3 1782 ba,pt %ncc, .didebc 1783 srl %o2, 3, %o2 ! Number of 8 byte chunks to copy 1784 ! 1785 ! 4 byte aligned? 1786 ! 1787.dcih4: 1788 bnz %ncc, .dcih2 1789 sethi %hi(hw_copy_limit_4), %o3 1790 ld [%o3 + %lo(hw_copy_limit_4)], %o3 1791 ! 1792 ! Is HW assist on? If not, do it with the aligned copy. 1793 ! 1794 tst %o3 1795 bz,pn %icc, .dcis4 1796 subcc %o3, %o2, %o3 1797 ! 1798 ! We're negative if our size is less than or equal to hw_copy_limit_4. 1799 ! 1800 bge %ncc, .dcis4 1801 nop 1802 ba,pt %ncc, .big_copyin 1803 nop 1804.dcis4: 1805 ! 1806 ! Housekeeping for copy loops. Uses same idea as in the byte 1807 ! for byte copy loop above. 1808 ! 1809 add %o0, %o2, %o0 1810 add %o1, %o2, %o1 1811 sub %g0, %o2, %o3 1812 ba,pt %ncc, .didfbc 1813 srl %o2, 2, %o2 ! Number of 4 byte chunks to copy 1814.dcih2: 1815 ! 1816 ! We're two byte aligned. Check for "smallness" 1817 ! done in delay at .dcih4 1818 ! 1819 bleu,pt %ncc, .dcis2 1820 sethi %hi(hw_copy_limit_2), %o3 1821 ld [%o3 + %lo(hw_copy_limit_2)], %o3 1822 ! 1823 ! Is HW assist on? If not, do it with the aligned copy. 1824 ! 1825 tst %o3 1826 bz,pn %icc, .dcis2 1827 subcc %o3, %o2, %o3 1828 ! 1829 ! Are we larger than the HW limit? 1830 ! 1831 bge %ncc, .dcis2 1832 nop 1833 ! 1834 ! HW assist is on and we're large enough to use it. 1835 ! 1836 ba,pt %ncc, .big_copyin 1837 nop 1838 ! 1839 ! Housekeeping for copy loops. Uses same idea as in the byte 1840 ! for byte copy loop above. 1841 ! 1842.dcis2: 1843 add %o0, %o2, %o0 1844 add %o1, %o2, %o1 1845 sub %g0, %o2, %o3 1846 ba,pt %ncc, .didtbc 1847 srl %o2, 1, %o2 ! Number of 2 byte chunks to copy 1848 ! 1849.small_copyin: 1850 ! 1851 ! Why are we doing this AGAIN? There are certain conditions in 1852 ! big copyin that will cause us to forgo the HW assisted copys 1853 ! and bounce back to a non-hw assisted copy. This dispatches 1854 ! those copies. Note that we branch around this in the main line 1855 ! code. 1856 ! 1857 ! We make no check for limits or HW enablement here. We've 1858 ! already been told that we're a poster child so just go off 1859 ! and do it. 1860 ! 1861 or %o0, %o1, %o3 1862 btst 1, %o3 1863 bnz %icc, .dcibcp ! Most likely 1864 btst 7, %o3 1865 bz %icc, .dcis8 1866 btst 3, %o3 1867 bz %icc, .dcis4 1868 nop 1869 ba,pt %ncc, .dcis2 1870 nop 1871 ! 1872 ! Eight byte aligned copies. A steal from the original .small_copyin 1873 ! with modifications. %o2 is number of 8 byte chunks to copy. When 1874 ! done, we examine %o3. If this is < 0, we have 1 - 7 bytes more 1875 ! to copy. 1876 ! 1877 .align 32 1878.didebc: 1879 ldxa [%o0 + %o3]ASI_USER, %o4 1880 deccc %o2 1881 stx %o4, [%o1 + %o3] 1882 bg,pt %ncc, .didebc 1883 addcc %o3, 8, %o3 1884 ! 1885 ! End of copy loop. Most 8 byte aligned copies end here. 1886 ! 1887 bz,pt %ncc, .dcifh 1888 nop 1889 ! 1890 ! Something is left. Do it byte for byte. 1891 ! 1892 ba,pt %ncc, .dcicl 1893 lduba [%o0 + %o3]ASI_USER, %o4 1894 ! 1895 ! 4 byte copy loop. %o2 is number of 4 byte chunks to copy. 1896 ! 1897 .align 32 1898.didfbc: 1899 lduwa [%o0 + %o3]ASI_USER, %o4 1900 deccc %o2 1901 st %o4, [%o1 + %o3] 1902 bg,pt %ncc, .didfbc 1903 addcc %o3, 4, %o3 1904 ! 1905 ! End of copy loop. Most 4 byte aligned copies end here. 1906 ! 1907 bz,pt %ncc, .dcifh 1908 nop 1909 ! 1910 ! Something is left. Do it byte for byte. 1911 ! 1912 ba,pt %ncc, .dcicl 1913 lduba [%o0 + %o3]ASI_USER, %o4 1914 ! 1915 ! 2 byte aligned copy loop. %o2 is number of 2 byte chunks to 1916 ! copy. 1917 ! 1918 .align 32 1919.didtbc: 1920 lduha [%o0 + %o3]ASI_USER, %o4 1921 deccc %o2 1922 sth %o4, [%o1 + %o3] 1923 bg,pt %ncc, .didtbc 1924 addcc %o3, 2, %o3 1925 ! 1926 ! End of copy loop. Most 2 byte aligned copies end here. 1927 ! 1928 bz,pt %ncc, .dcifh 1929 nop 1930 ! 1931 ! Deal with the last byte 1932 ! 1933 lduba [%o0 + %o3]ASI_USER, %o4 1934 stb %o4, [%o1 + %o3] 1935.dcifh: 1936 membar #Sync 1937 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 1938 retl 1939 clr %o0 1940 1941.big_copyin: 1942 ! 1943 ! We're going off to do a block copy. 1944 ! Switch fault hendlers and grab a window. We 1945 ! don't do a membar #Sync since we've done only 1946 ! kernel data to this point. 1947 ! 1948 stn %o4, [THREAD_REG + T_LOFAULT] 1949 save %sp, -SA(MINFRAME), %sp 1950 1951 ! Copy in that reach here are larger than 256 bytes. The 1952 ! hw_copy_limit_1 is set to 256. Never set this limit less 1953 ! 128 bytes. 1954.do_blockcopyin: 1955 1956 ! Swap src/dst since the code below is memcpy code 1957 ! and memcpy/bcopy have different calling sequences 1958 mov %i1, %i5 1959 mov %i0, %i1 1960 mov %i5, %i0 1961 1962 andcc %i0, 7, %i3 ! is dst double aligned 1963 bz %ncc, copyin_blkcpy 1964 sub %i3, 8, %i3 1965 neg %i3 ! bytes till double aligned 1966 sub %i2, %i3, %i2 ! update %i2 with new count 1967 1968 ! Align Destination on double-word boundary 1969 19701: lduba [%i1]ASI_USER, %i4 1971 inc %i1 1972 stb %i4, [%i0] 1973 deccc %i3 1974 bgu %ncc, 1b 1975 inc %i0 1976 1977copyin_blkcpy: 1978 andcc %i0, 63, %i3 1979 bz,pn %ncc, copyin_blalign ! now block aligned 1980 sub %i3, 64, %i3 1981 neg %i3 ! bytes till block aligned 1982 sub %i2, %i3, %i2 ! update %i2 with new count 1983 1984 ! Copy %i3 bytes till dst is block (64 byte) aligned. use 1985 ! double word copies. 1986 1987 andcc %i1, 7, %g1 ! is src aligned on a 8 bytes 1988 bz %ncc, .ci_dbcopy ! %g1 has source offset (last 3-bits) 1989 sll %g1, 3, %l1 ! left shift 1990 mov 0x40, %l2 1991 sub %l2, %l1, %l2 ! right shift = (64 - left shift) 1992 1993 ! Now use double word copies to align destination. 1994.ci_double: 1995 sub %i1, %g1, %i1 ! align the src at 8 bytes. 1996 ldxa [%i1]ASI_USER, %o2 19972: 1998 add %i1, 0x8, %i1 1999 ldxa [%i1]ASI_USER, %o4 2000 ALIGN_DATA_EW(%o2, %o4, %l1, %l2, %o3) 2001 stx %o2, [%i0] 2002 mov %o4, %o2 2003 subcc %i3, 0x8, %i3 2004 bgu,pt %ncc, 2b 2005 add %i0, 0x8, %i0 2006 ba copyin_blalign 2007 add %i1, %g1, %i1 2008 2009 ! Both source and destination are double aligned. 2010 ! No shift and merge of data required in this case. 2011.ci_dbcopy: 2012 ldxa [%i1]ASI_USER, %o2 2013 stx %o2, [%i0] 2014 add %i1, 0x8, %i1 2015 subcc %i3, 0x8, %i3 2016 bgu,pt %ncc, .ci_dbcopy 2017 add %i0, 0x8, %i0 2018 2019copyin_blalign: 2020 andn %i2, 0x3f, %i3 ! %i3 count is multiple of block size 2021 sub %i2, %i3, %i2 ! Residue bytes in %i2 2022 2023 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2024 2025 andcc %i1, 0xf, %o2 ! is src quadword aligned 2026 bz,pn %xcc, .ci_blkcpy ! src offset in %o2 (last 4-bits) 2027 nop 2028 cmp %o2, 0x8 2029 bg .ci_upper_double 2030 nop 2031 bl .ci_lower_double 2032 nop 2033 2034 ! Falls through when source offset is equal to 8 i.e. 2035 ! source is double word aligned. 2036 ! In this case no shift/merge of data is required 2037 2038 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2039 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2040 prefetch [%l0+0x0], #one_read 2041 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2042.ci_loop0: 2043 add %i1, 0x10, %i1 2044 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 2045 2046 prefetch [%l0+0x40], #one_read 2047 2048 stxa %l3, [%i0+0x0]%asi 2049 stxa %l4, [%i0+0x8]%asi 2050 2051 add %i1, 0x10, %i1 2052 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2053 2054 stxa %l5, [%i0+0x10]%asi 2055 stxa %l2, [%i0+0x18]%asi 2056 2057 add %i1, 0x10, %i1 2058 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 2059 2060 stxa %l3, [%i0+0x20]%asi 2061 stxa %l4, [%i0+0x28]%asi 2062 2063 add %i1, 0x10, %i1 2064 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2065 2066 stxa %l5, [%i0+0x30]%asi 2067 stxa %l2, [%i0+0x38]%asi 2068 2069 add %l0, 0x40, %l0 2070 subcc %i3, 0x40, %i3 2071 bgu,pt %xcc, .ci_loop0 2072 add %i0, 0x40, %i0 2073 ba .ci_blkdone 2074 add %i1, %o2, %i1 ! increment the source by src offset 2075 ! the src offset was stored in %o2 2076 2077.ci_lower_double: 2078 2079 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2080 sll %o2, 3, %o0 ! %o0 left shift 2081 mov 0x40, %o1 2082 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2083 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2084 prefetch [%l0+0x0], #one_read 2085 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l2 2086 ! and %l3 has complete 2087 ! data 2088.ci_loop1: 2089 add %i1, 0x10, %i1 2090 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has partial data 2091 ! for this read. 2092 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) ! merge %l2, %l3 and %l4 2093 ! into %l2 and %l3 2094 2095 prefetch [%l0+0x40], #one_read 2096 2097 stxa %l2, [%i0+0x0]%asi 2098 stxa %l3, [%i0+0x8]%asi 2099 2100 add %i1, 0x10, %i1 2101 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2102 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) ! merge %l2 with %l5 and 2103 ! %l4 from previous read 2104 ! into %l4 and %l5 2105 stxa %l4, [%i0+0x10]%asi 2106 stxa %l5, [%i0+0x18]%asi 2107 2108 ! Repeat the same for next 32 bytes. 2109 2110 add %i1, 0x10, %i1 2111 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 2112 ALIGN_DATA(%l2, %l3, %l4, %o0, %o1, %l6) 2113 2114 stxa %l2, [%i0+0x20]%asi 2115 stxa %l3, [%i0+0x28]%asi 2116 2117 add %i1, 0x10, %i1 2118 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2119 ALIGN_DATA(%l4, %l5, %l2, %o0, %o1, %l6) 2120 2121 stxa %l4, [%i0+0x30]%asi 2122 stxa %l5, [%i0+0x38]%asi 2123 2124 add %l0, 0x40, %l0 2125 subcc %i3, 0x40, %i3 2126 bgu,pt %xcc, .ci_loop1 2127 add %i0, 0x40, %i0 2128 ba .ci_blkdone 2129 add %i1, %o2, %i1 ! increment the source by src offset 2130 ! the src offset was stored in %o2 2131 2132.ci_upper_double: 2133 2134 sub %i1, %o2, %i1 ! align the src at 16 bytes. 2135 sub %o2, 0x8, %o0 2136 sll %o0, 3, %o0 ! %o0 left shift 2137 mov 0x40, %o1 2138 sub %o1, %o0, %o1 ! %o1 right shift = (64 - left shift) 2139 andn %i1, 0x3f, %l0 ! %l0 has block aligned source 2140 prefetch [%l0+0x0], #one_read 2141 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 ! partial data in %l3 2142 ! for this read and 2143 ! no data in %l2 2144.ci_loop2: 2145 add %i1, 0x10, %i1 2146 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 ! %l4 has complete data 2147 ! and %l5 has partial 2148 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) ! merge %l3, %l4 and %l5 2149 ! into %l3 and %l4 2150 prefetch [%l0+0x40], #one_read 2151 2152 stxa %l3, [%i0+0x0]%asi 2153 stxa %l4, [%i0+0x8]%asi 2154 2155 add %i1, 0x10, %i1 2156 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2157 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) ! merge %l2 and %l3 with 2158 ! %l5 from previous read 2159 ! into %l5 and %l2 2160 2161 stxa %l5, [%i0+0x10]%asi 2162 stxa %l2, [%i0+0x18]%asi 2163 2164 ! Repeat the same for next 32 bytes. 2165 2166 add %i1, 0x10, %i1 2167 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 2168 ALIGN_DATA(%l3, %l4, %l5, %o0, %o1, %l6) 2169 2170 stxa %l3, [%i0+0x20]%asi 2171 stxa %l4, [%i0+0x28]%asi 2172 2173 add %i1, 0x10, %i1 2174 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2175 ALIGN_DATA(%l5, %l2, %l3, %o0, %o1, %l6) 2176 2177 stxa %l5, [%i0+0x30]%asi 2178 stxa %l2, [%i0+0x38]%asi 2179 2180 add %l0, 0x40, %l0 2181 subcc %i3, 0x40, %i3 2182 bgu,pt %xcc, .ci_loop2 2183 add %i0, 0x40, %i0 2184 ba .ci_blkdone 2185 add %i1, %o2, %i1 ! increment the source by src offset 2186 ! the src offset was stored in %o2 2187 2188 2189 ! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P 2190.ci_blkcpy: 2191 2192 andn %i1, 0x3f, %o0 ! %o0 has block aligned source 2193 prefetch [%o0+0x0], #one_read 21941: 2195 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l0 2196 add %i1, 0x10, %i1 2197 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l2 2198 add %i1, 0x10, %i1 2199 2200 prefetch [%o0+0x40], #one_read 2201 2202 stxa %l0, [%i0+0x0]%asi 2203 2204 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l4 2205 add %i1, 0x10, %i1 2206 ldda [%i1]ASI_BLK_INIT_QUAD_LDD_AIUS, %l6 2207 add %i1, 0x10, %i1 2208 2209 stxa %l1, [%i0+0x8]%asi 2210 stxa %l2, [%i0+0x10]%asi 2211 stxa %l3, [%i0+0x18]%asi 2212 stxa %l4, [%i0+0x20]%asi 2213 stxa %l5, [%i0+0x28]%asi 2214 stxa %l6, [%i0+0x30]%asi 2215 stxa %l7, [%i0+0x38]%asi 2216 2217 add %o0, 0x40, %o0 2218 subcc %i3, 0x40, %i3 2219 bgu,pt %xcc, 1b 2220 add %i0, 0x40, %i0 2221 2222.ci_blkdone: 2223 membar #Sync 2224 2225 ! Copy as much rest of the data as double word copy. 2226.ci_dwcp: 2227 cmp %i2, 0x8 ! Not enough bytes to copy as double 2228 blu %ncc, .ci_dbdone 2229 nop 2230 2231 andn %i2, 0x7, %i3 ! %i3 count is multiple of 8 bytes size 2232 sub %i2, %i3, %i2 ! Residue bytes in %i2 2233 2234 andcc %i1, 7, %g1 ! is src aligned on a 8 bytes 2235 bz %ncc, .ci_cpy_db 2236 nop 2237 2238 sll %g1, 3, %l0 ! left shift 2239 mov 0x40, %l1 2240 sub %l1, %l0, %l1 ! right shift = (64 - left shift) 2241 2242.ci_cpy_dbwd: 2243 sub %i1, %g1, %i1 ! align the src at 8 bytes. 2244 ldxa [%i1]ASI_USER, %o2 22453: 2246 add %i1, 0x8, %i1 2247 ldxa [%i1]ASI_USER, %o4 2248 ALIGN_DATA_EW(%o2, %o4, %l0, %l1, %o3) 2249 stx %o2, [%i0] 2250 mov %o4, %o2 2251 subcc %i3, 0x8, %i3 2252 bgu,pt %ncc, 3b 2253 add %i0, 0x8, %i0 2254 ba .ci_dbdone 2255 add %i1, %g1, %i1 2256 2257.ci_cpy_db: 2258 ldxa [%i1]ASI_USER, %o2 2259 stx %o2, [%i0] 2260 add %i1, 0x8, %i1 2261 subcc %i3, 0x8, %i3 2262 bgu,pt %ncc, .ci_cpy_db 2263 add %i0, 0x8, %i0 2264 2265.ci_dbdone: 2266 tst %i2 2267 bz,pt %xcc, .copyin_exit 2268 nop 2269 2270 ! Copy the residue as byte copy 2271.ci_residue: 2272 lduba [%i1]ASI_USER, %i4 2273 stb %i4, [%i0] 2274 inc %i1 2275 deccc %i2 2276 bgu %xcc, .ci_residue 2277 inc %i0 2278 2279.copyin_exit: 2280 membar #Sync 2281 stn SAVED_LOFAULT, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2282 ret 2283 restore %g0, 0, %o0 2284.copyin_err: 2285 ldn [THREAD_REG + T_COPYOPS], %o4 2286 brz %o4, 2f 2287 nop 2288 ldn [%o4 + CP_COPYIN], %g2 2289 jmp %g2 2290 nop 22912: 2292 retl 2293 mov -1, %o0 2294 SET_SIZE(copyin) 2295 2296#endif /* lint */ 2297 2298#ifdef lint 2299 2300/*ARGSUSED*/ 2301int 2302xcopyin(const void *uaddr, void *kaddr, size_t count) 2303{ return (0); } 2304 2305#else /* lint */ 2306 2307 ENTRY(xcopyin) 2308 sethi %hi(.xcopyin_err), REAL_LOFAULT 2309 b .do_copyin 2310 or REAL_LOFAULT, %lo(.xcopyin_err), REAL_LOFAULT 2311.xcopyin_err: 2312 ldn [THREAD_REG + T_COPYOPS], %o4 2313 brz %o4, 2f 2314 nop 2315 ldn [%o4 + CP_XCOPYIN], %g2 2316 jmp %g2 2317 nop 23182: 2319 retl 2320 mov %g1, %o0 2321 SET_SIZE(xcopyin) 2322 2323#endif /* lint */ 2324 2325#ifdef lint 2326 2327/*ARGSUSED*/ 2328int 2329xcopyin_little(const void *uaddr, void *kaddr, size_t count) 2330{ return (0); } 2331 2332#else /* lint */ 2333 2334 ENTRY(xcopyin_little) 2335 sethi %hi(.little_err), %o4 2336 ldn [THREAD_REG + T_LOFAULT], %o5 2337 or %o4, %lo(.little_err), %o4 2338 membar #Sync ! sync error barrier 2339 stn %o4, [THREAD_REG + T_LOFAULT] 2340 2341 subcc %g0, %o2, %o3 2342 add %o0, %o2, %o0 2343 bz,pn %ncc, 2f ! check for zero bytes 2344 sub %o2, 1, %o4 2345 add %o0, %o4, %o0 ! start w/last byte 2346 add %o1, %o2, %o1 2347 lduba [%o0+%o3]ASI_AIUSL, %o4 2348 23491: stb %o4, [%o1+%o3] 2350 inccc %o3 2351 sub %o0, 2, %o0 ! get next byte 2352 bcc,a,pt %ncc, 1b 2353 lduba [%o0+%o3]ASI_AIUSL, %o4 2354 23552: membar #Sync ! sync error barrier 2356 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2357 retl 2358 mov %g0, %o0 ! return (0) 2359 2360.little_err: 2361 membar #Sync ! sync error barrier 2362 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 2363 retl 2364 mov %g1, %o0 2365 SET_SIZE(xcopyin_little) 2366 2367#endif /* lint */ 2368 2369 2370/* 2371 * Copy a block of storage - must not overlap (from + len <= to). 2372 * No fault handler installed (to be called under on_fault()) 2373 */ 2374#if defined(lint) 2375 2376/* ARGSUSED */ 2377void 2378copyin_noerr(const void *ufrom, void *kto, size_t count) 2379{} 2380 2381#else /* lint */ 2382 2383 ENTRY(copyin_noerr) 2384 sethi %hi(.copyio_noerr), REAL_LOFAULT 2385 b .do_copyin 2386 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 2387.copyio_noerr: 2388 jmp SAVED_LOFAULT 2389 nop 2390 SET_SIZE(copyin_noerr) 2391 2392#endif /* lint */ 2393 2394/* 2395 * Copy a block of storage - must not overlap (from + len <= to). 2396 * No fault handler installed (to be called under on_fault()) 2397 */ 2398 2399#if defined(lint) 2400 2401/* ARGSUSED */ 2402void 2403copyout_noerr(const void *kfrom, void *uto, size_t count) 2404{} 2405 2406#else /* lint */ 2407 2408 ENTRY(copyout_noerr) 2409 sethi %hi(.copyio_noerr), REAL_LOFAULT 2410 b .do_copyout 2411 or REAL_LOFAULT, %lo(.copyio_noerr), REAL_LOFAULT 2412 SET_SIZE(copyout_noerr) 2413 2414#endif /* lint */ 2415 2416#if defined(lint) 2417 2418int use_hw_bcopy = 1; 2419int use_hw_bzero = 1; 2420uint_t hw_copy_limit_1 = 0x100; 2421uint_t hw_copy_limit_2 = 0x200; 2422uint_t hw_copy_limit_4 = 0x400; 2423uint_t hw_copy_limit_8 = 0x400; 2424 2425#else /* !lint */ 2426 2427 .align 4 2428 DGDEF(use_hw_bcopy) 2429 .word 1 2430 DGDEF(use_hw_bzero) 2431 .word 1 2432 DGDEF(hw_copy_limit_1) 2433 .word 0x100 2434 DGDEF(hw_copy_limit_2) 2435 .word 0x200 2436 DGDEF(hw_copy_limit_4) 2437 .word 0x400 2438 DGDEF(hw_copy_limit_8) 2439 .word 0x400 2440 2441 .align 64 2442 .section ".text" 2443#endif /* !lint */ 2444 2445/* 2446 * hwblkclr - clears block-aligned, block-multiple-sized regions that are 2447 * longer than 256 bytes in length using Niagara's block stores/quad store. 2448 * If the criteria for using this routine are not met then it calls bzero 2449 * and returns 1. Otherwise 0 is returned indicating success. 2450 * Caller is responsible for ensuring use_hw_bzero is true and that 2451 * kpreempt_disable() has been called. 2452 */ 2453#ifdef lint 2454/*ARGSUSED*/ 2455int 2456hwblkclr(void *addr, size_t len) 2457{ 2458 return(0); 2459} 2460#else /* lint */ 2461 ! %i0 - start address 2462 ! %i1 - length of region (multiple of 64) 2463 2464 ENTRY(hwblkclr) 2465 save %sp, -SA(MINFRAME), %sp 2466 2467 ! Must be block-aligned 2468 andcc %i0, 0x3f, %g0 2469 bnz,pn %ncc, 1f 2470 nop 2471 2472 ! ... and must be 256 bytes or more 2473 cmp %i1, 0x100 2474 blu,pn %ncc, 1f 2475 nop 2476 2477 ! ... and length must be a multiple of 64 2478 andcc %i1, 0x3f, %g0 2479 bz,pn %ncc, .pz_doblock 2480 mov ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2481 24821: ! punt, call bzero but notify the caller that bzero was used 2483 mov %i0, %o0 2484 call bzero 2485 mov %i1, %o1 2486 ret 2487 restore %g0, 1, %o0 ! return (1) - did not use block operations 2488 2489 ! Already verified that there are at least 256 bytes to set 2490.pz_doblock: 2491 stxa %g0, [%i0+0x0]%asi 2492 stxa %g0, [%i0+0x40]%asi 2493 stxa %g0, [%i0+0x80]%asi 2494 stxa %g0, [%i0+0xc0]%asi 2495 2496 stxa %g0, [%i0+0x8]%asi 2497 stxa %g0, [%i0+0x10]%asi 2498 stxa %g0, [%i0+0x18]%asi 2499 stxa %g0, [%i0+0x20]%asi 2500 stxa %g0, [%i0+0x28]%asi 2501 stxa %g0, [%i0+0x30]%asi 2502 stxa %g0, [%i0+0x38]%asi 2503 2504 stxa %g0, [%i0+0x48]%asi 2505 stxa %g0, [%i0+0x50]%asi 2506 stxa %g0, [%i0+0x58]%asi 2507 stxa %g0, [%i0+0x60]%asi 2508 stxa %g0, [%i0+0x68]%asi 2509 stxa %g0, [%i0+0x70]%asi 2510 stxa %g0, [%i0+0x78]%asi 2511 2512 stxa %g0, [%i0+0x88]%asi 2513 stxa %g0, [%i0+0x90]%asi 2514 stxa %g0, [%i0+0x98]%asi 2515 stxa %g0, [%i0+0xa0]%asi 2516 stxa %g0, [%i0+0xa8]%asi 2517 stxa %g0, [%i0+0xb0]%asi 2518 stxa %g0, [%i0+0xb8]%asi 2519 2520 stxa %g0, [%i0+0xc8]%asi 2521 stxa %g0, [%i0+0xd0]%asi 2522 stxa %g0, [%i0+0xd8]%asi 2523 stxa %g0, [%i0+0xe0]%asi 2524 stxa %g0, [%i0+0xe8]%asi 2525 stxa %g0, [%i0+0xf0]%asi 2526 stxa %g0, [%i0+0xf8]%asi 2527 2528 sub %i1, 0x100, %i1 2529 cmp %i1, 0x100 2530 bgu,pt %ncc, .pz_doblock 2531 add %i0, 0x100, %i0 2532 25332: 2534 ! Check if more than 64 bytes to set 2535 cmp %i1,0x40 2536 blu %ncc, .pz_finish 2537 nop 2538 25393: 2540 stxa %g0, [%i0+0x0]%asi 2541 stxa %g0, [%i0+0x8]%asi 2542 stxa %g0, [%i0+0x10]%asi 2543 stxa %g0, [%i0+0x18]%asi 2544 stxa %g0, [%i0+0x20]%asi 2545 stxa %g0, [%i0+0x28]%asi 2546 stxa %g0, [%i0+0x30]%asi 2547 stxa %g0, [%i0+0x38]%asi 2548 2549 subcc %i1, 0x40, %i1 2550 bgu,pt %ncc, 3b 2551 add %i0, 0x40, %i0 2552 2553.pz_finish: 2554 membar #Sync 2555 ret 2556 restore %g0, 0, %o0 ! return (bzero or not) 2557 SET_SIZE(hwblkclr) 2558#endif /* lint */ 2559 2560#ifdef lint 2561/* Copy 32 bytes of data from src to dst using physical addresses */ 2562/*ARGSUSED*/ 2563void 2564hw_pa_bcopy32(uint64_t src, uint64_t dst) 2565{} 2566#else /*!lint */ 2567 2568 /* 2569 * Copy 32 bytes of data from src (%o0) to dst (%o1) 2570 * using physical addresses. 2571 */ 2572 ENTRY_NP(hw_pa_bcopy32) 2573 rdpr %pstate, %g1 2574 andn %g1, PSTATE_IE, %g2 2575 wrpr %g0, %g2, %pstate 2576 2577 ldxa [%o0]ASI_MEM, %o2 2578 add %o0, 8, %o0 2579 ldxa [%o0]ASI_MEM, %o3 2580 add %o0, 8, %o0 2581 ldxa [%o0]ASI_MEM, %o4 2582 add %o0, 8, %o0 2583 ldxa [%o0]ASI_MEM, %o5 2584 stxa %o2, [%o1]ASI_MEM 2585 add %o1, 8, %o1 2586 stxa %o3, [%o1]ASI_MEM 2587 add %o1, 8, %o1 2588 stxa %o4, [%o1]ASI_MEM 2589 add %o1, 8, %o1 2590 stxa %o5, [%o1]ASI_MEM 2591 2592 membar #Sync 2593 retl 2594 wrpr %g0, %g1, %pstate 2595 SET_SIZE(hw_pa_bcopy32) 2596#endif /* lint */ 2597 2598/* 2599 * Zero a block of storage. 2600 * 2601 * uzero is used by the kernel to zero a block in user address space. 2602 */ 2603 2604/* 2605 * Control flow of the bzero/kzero/uzero routine. 2606 * 2607 * For fewer than 7 bytes stores, bytes will be zeroed. 2608 * 2609 * For less than 15 bytes stores, align the address on 4 byte boundary. 2610 * Then store as many 4-byte chunks, followed by trailing bytes. 2611 * 2612 * For sizes greater than 15 bytes, align the address on 8 byte boundary. 2613 * if (count > 128) { 2614 * store as many 8-bytes chunks to block align the address 2615 * store using ASI_BLK_INIT_ST_QUAD_LDD_P (bzero/kzero) OR 2616 * store using ASI_BLK_INIT_QUAD_LDD_AIUS (uzero) 2617 * } 2618 * Store as many 8-byte chunks, followed by trailing bytes. 2619 */ 2620 2621#if defined(lint) 2622 2623/* ARGSUSED */ 2624int 2625kzero(void *addr, size_t count) 2626{ return(0); } 2627 2628/* ARGSUSED */ 2629void 2630uzero(void *addr, size_t count) 2631{} 2632 2633#else /* lint */ 2634 2635 ENTRY(uzero) 2636 ! 2637 ! Set a new lo_fault handler only if we came in with one 2638 ! already specified. 2639 ! 2640 wr %g0, ASI_USER, %asi 2641 ldn [THREAD_REG + T_LOFAULT], %o5 2642 tst %o5 2643 bz,pt %ncc, .do_zero 2644 sethi %hi(.zeroerr), %o2 2645 or %o2, %lo(.zeroerr), %o2 2646 membar #Sync 2647 ba,pt %ncc, .do_zero 2648 stn %o2, [THREAD_REG + T_LOFAULT] 2649 2650 ENTRY(kzero) 2651 ! 2652 ! Always set a lo_fault handler 2653 ! 2654 wr %g0, ASI_P, %asi 2655 ldn [THREAD_REG + T_LOFAULT], %o5 2656 sethi %hi(.zeroerr), %o2 2657 or %o5, LOFAULT_SET, %o5 2658 or %o2, %lo(.zeroerr), %o2 2659 membar #Sync 2660 ba,pt %ncc, .do_zero 2661 stn %o2, [THREAD_REG + T_LOFAULT] 2662 2663/* 2664 * We got here because of a fault during kzero or if 2665 * uzero or bzero was called with t_lofault non-zero. 2666 * Otherwise we've already run screaming from the room. 2667 * Errno value is in %g1. Note that we're here iff 2668 * we did set t_lofault. 2669 */ 2670.zeroerr: 2671 ! 2672 ! Undo asi register setting. Just set it to be the 2673 ! kernel default without checking. 2674 ! 2675 wr %g0, ASI_P, %asi 2676 2677 ! 2678 ! We did set t_lofault. It may well have been zero coming in. 2679 ! 26801: 2681 tst %o5 2682 membar #Sync 2683 bne,pn %ncc, 3f 2684 andncc %o5, LOFAULT_SET, %o5 26852: 2686 ! 2687 ! Old handler was zero. Just return the error. 2688 ! 2689 retl ! return 2690 mov %g1, %o0 ! error code from %g1 26913: 2692 ! 2693 ! We're here because %o5 was non-zero. It was non-zero 2694 ! because either LOFAULT_SET was present, a previous fault 2695 ! handler was present or both. In all cases we need to reset 2696 ! T_LOFAULT to the value of %o5 after clearing LOFAULT_SET 2697 ! before we either simply return the error or we invoke the 2698 ! previously specified handler. 2699 ! 2700 be %ncc, 2b 2701 stn %o5, [THREAD_REG + T_LOFAULT] 2702 jmp %o5 ! goto real handler 2703 nop 2704 SET_SIZE(kzero) 2705 SET_SIZE(uzero) 2706 2707#endif /* lint */ 2708 2709/* 2710 * Zero a block of storage. 2711 */ 2712 2713#if defined(lint) 2714 2715/* ARGSUSED */ 2716void 2717bzero(void *addr, size_t count) 2718{} 2719 2720#else /* lint */ 2721 2722 ENTRY(bzero) 2723 wr %g0, ASI_P, %asi 2724 2725 ldn [THREAD_REG + T_LOFAULT], %o5 ! save old vector 2726 tst %o5 2727 bz,pt %ncc, .do_zero 2728 sethi %hi(.zeroerr), %o2 2729 or %o2, %lo(.zeroerr), %o2 2730 membar #Sync ! sync error barrier 2731 stn %o2, [THREAD_REG + T_LOFAULT] ! install new vector 2732 2733.do_zero: 2734 cmp %o1, 7 2735 blu,pn %ncc, .byteclr 2736 nop 2737 2738 cmp %o1, 15 2739 blu,pn %ncc, .wdalign 2740 nop 2741 2742 andcc %o0, 7, %o3 ! is add aligned on a 8 byte bound 2743 bz,pt %ncc, .blkalign ! already double aligned 2744 sub %o3, 8, %o3 ! -(bytes till double aligned) 2745 add %o1, %o3, %o1 ! update o1 with new count 2746 27471: 2748 stba %g0, [%o0]%asi 2749 inccc %o3 2750 bl,pt %ncc, 1b 2751 inc %o0 2752 2753 ! Now address is double aligned 2754.blkalign: 2755 cmp %o1, 0x80 ! check if there are 128 bytes to set 2756 blu,pn %ncc, .bzero_small 2757 mov %o1, %o3 2758 2759 sethi %hi(use_hw_bzero), %o2 2760 ld [%o2 + %lo(use_hw_bzero)], %o2 2761 tst %o2 2762 bz %ncc, .bzero_small 2763 mov %o1, %o3 2764 2765 rd %asi, %o3 2766 wr %g0, ASI_BLK_INIT_ST_QUAD_LDD_P, %asi 2767 cmp %o3, ASI_P 2768 bne,a %ncc, .algnblk 2769 wr %g0, ASI_BLK_INIT_QUAD_LDD_AIUS, %asi 2770 2771.algnblk: 2772 andcc %o0, 0x3f, %o3 ! is block aligned? 2773 bz,pt %ncc, .bzero_blk 2774 sub %o3, 0x40, %o3 ! -(bytes till block aligned) 2775 add %o1, %o3, %o1 ! o1 is the remainder 2776 2777 ! Clear -(%o3) bytes till block aligned 27781: 2779 stxa %g0, [%o0]%asi 2780 addcc %o3, 8, %o3 2781 bl,pt %ncc, 1b 2782 add %o0, 8, %o0 2783 2784.bzero_blk: 2785 and %o1, 0x3f, %o3 ! calc bytes left after blk clear 2786 andn %o1, 0x3f, %o4 ! calc size of blocks in bytes 2787 2788 cmp %o4, 0x100 ! 256 bytes or more 2789 blu,pn %ncc, 3f 2790 nop 2791 27922: 2793 stxa %g0, [%o0+0x0]%asi 2794 stxa %g0, [%o0+0x40]%asi 2795 stxa %g0, [%o0+0x80]%asi 2796 stxa %g0, [%o0+0xc0]%asi 2797 2798 stxa %g0, [%o0+0x8]%asi 2799 stxa %g0, [%o0+0x10]%asi 2800 stxa %g0, [%o0+0x18]%asi 2801 stxa %g0, [%o0+0x20]%asi 2802 stxa %g0, [%o0+0x28]%asi 2803 stxa %g0, [%o0+0x30]%asi 2804 stxa %g0, [%o0+0x38]%asi 2805 2806 stxa %g0, [%o0+0x48]%asi 2807 stxa %g0, [%o0+0x50]%asi 2808 stxa %g0, [%o0+0x58]%asi 2809 stxa %g0, [%o0+0x60]%asi 2810 stxa %g0, [%o0+0x68]%asi 2811 stxa %g0, [%o0+0x70]%asi 2812 stxa %g0, [%o0+0x78]%asi 2813 2814 stxa %g0, [%o0+0x88]%asi 2815 stxa %g0, [%o0+0x90]%asi 2816 stxa %g0, [%o0+0x98]%asi 2817 stxa %g0, [%o0+0xa0]%asi 2818 stxa %g0, [%o0+0xa8]%asi 2819 stxa %g0, [%o0+0xb0]%asi 2820 stxa %g0, [%o0+0xb8]%asi 2821 2822 stxa %g0, [%o0+0xc8]%asi 2823 stxa %g0, [%o0+0xd0]%asi 2824 stxa %g0, [%o0+0xd8]%asi 2825 stxa %g0, [%o0+0xe0]%asi 2826 stxa %g0, [%o0+0xe8]%asi 2827 stxa %g0, [%o0+0xf0]%asi 2828 stxa %g0, [%o0+0xf8]%asi 2829 2830 sub %o4, 0x100, %o4 2831 cmp %o4, 0x100 2832 bgu,pt %ncc, 2b 2833 add %o0, 0x100, %o0 2834 28353: 2836 ! ... check if 64 bytes to set 2837 cmp %o4, 0x40 2838 blu %ncc, .bzero_blk_done 2839 nop 2840 28414: 2842 stxa %g0, [%o0+0x0]%asi 2843 stxa %g0, [%o0+0x8]%asi 2844 stxa %g0, [%o0+0x10]%asi 2845 stxa %g0, [%o0+0x18]%asi 2846 stxa %g0, [%o0+0x20]%asi 2847 stxa %g0, [%o0+0x28]%asi 2848 stxa %g0, [%o0+0x30]%asi 2849 stxa %g0, [%o0+0x38]%asi 2850 2851 subcc %o4, 0x40, %o4 2852 bgu,pt %ncc, 3b 2853 add %o0, 0x40, %o0 2854 2855.bzero_blk_done: 2856 membar #Sync 2857 ! 2858 ! Undo asi register setting. 2859 ! 2860 rd %asi, %o4 2861 wr %g0, ASI_P, %asi 2862 cmp %o4, ASI_BLK_INIT_ST_QUAD_LDD_P 2863 bne,a %ncc, .bzero_small 2864 wr %g0, ASI_USER, %asi 2865 2866.bzero_small: 2867 ! Set the remaining doubles 2868 subcc %o3, 8, %o3 ! Can we store any doubles? 2869 blu,pn %ncc, .byteclr 2870 and %o1, 7, %o1 ! calc bytes left after doubles 2871 2872.dbclr: 2873 stxa %g0, [%o0]%asi ! Clear the doubles 2874 subcc %o3, 8, %o3 2875 bgeu,pt %ncc, .dbclr 2876 add %o0, 8, %o0 2877 2878 ba .byteclr 2879 nop 2880 2881.wdalign: 2882 andcc %o0, 3, %o3 ! is add aligned on a word boundary 2883 bz,pn %ncc, .wdclr 2884 andn %o1, 3, %o3 ! create word sized count in %o3 2885 2886 dec %o1 ! decrement count 2887 stba %g0, [%o0]%asi ! clear a byte 2888 ba .wdalign 2889 inc %o0 ! next byte 2890 2891.wdclr: 2892 sta %g0, [%o0]%asi ! 4-byte clearing loop 2893 subcc %o3, 4, %o3 2894 bnz,pt %ncc, .wdclr 2895 inc 4, %o0 2896 2897 and %o1, 3, %o1 ! leftover count, if any 2898 2899.byteclr: 2900 ! Set the leftover bytes 2901 brz %o1, .bzero_exit 2902 nop 2903 29047: 2905 deccc %o1 ! byte clearing loop 2906 stba %g0, [%o0]%asi 2907 bgu,pt %ncc, 7b 2908 inc %o0 2909 2910.bzero_exit: 2911 ! 2912 ! We're just concerned with whether t_lofault was set 2913 ! when we came in. We end up here from either kzero() 2914 ! or bzero(). kzero() *always* sets a lofault handler. 2915 ! It ors LOFAULT_SET into %o5 to indicate it has done 2916 ! this even if the value of %o5 is otherwise zero. 2917 ! bzero() sets a lofault handler *only* if one was 2918 ! previously set. Accordingly we need to examine 2919 ! %o5 and if it is non-zero be sure to clear LOFAULT_SET 2920 ! before resetting the error handler. 2921 ! 2922 tst %o5 2923 bz %ncc, 1f 2924 andn %o5, LOFAULT_SET, %o5 2925 membar #Sync ! sync error barrier 2926 stn %o5, [THREAD_REG + T_LOFAULT] ! restore old t_lofault 29271: 2928 retl 2929 clr %o0 ! return (0) 2930 2931 SET_SIZE(bzero) 2932#endif /* lint */ 2933