/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 * Memmove() and bcopy() do.
 *
 * Added entry __align_cpy_1 is generally for use of the compilers.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *			char *s1 = s;
 *			const char *s2 = s0;
 *			do {
 *				*s1++ = *s2++;
 *			} while (--n != 0);
 *		}
 *		return (s);
 *	}
 *
 *
 * N1 Flow :
 *
 * if (count < 17) {
 *	Do the byte copy
 *	Return destination address
 * }
 * if (count < 128) {
 *	Is source aligned on word boundary
 *	If no then align source on word boundary then goto .ald
 *	If yes goto .ald
 *	.ald:
 *	Is destination aligned on word boundary
 *	Depending on destination offset (last 2 bits of destination)
 *	copy data by shifting and merging.
 *	Copy residue bytes as byte copy
 *	Return destination address
 * } else {
 *	Align destination on block boundary
 *	Depending on the source offset (last 4 bits of source address) align
 *	the data and store to destination. Both the load and store are done
 *	using ASI_BLK_INIT_ST_QUAD_LDD_P.
 *	For remaining count copy as much data in 8-byte chunks from source to
 *	destination.
 *	Followed by trailing copy using byte copy.
 *	Return saved destination address
 * }
 *
 *
 * N2 Flow :
 *
 * if (count < 128) {
 *	if count < 3
 *		copy bytes; exit with dst addr
 *	if src & dst aligned on word boundary but not long word boundary,
 *		copy with ldw/stw; branch to finish_up
 *	if src & dst aligned on long word boundary
 *		copy with ldx/stx; branch to finish_up
 *	if src & dst not aligned and length <= 14
 *		copy bytes; exit with dst addr
 *	move enough bytes to get src to word boundary
 *	if dst now on word boundary
 * move_words:
 *		copy words; branch to finish_up
 *	if dst now on half word boundary
 *		load words, shift half words, store words; branch to finish_up
 *	if dst on byte 1
 *		load words, shift 3 bytes, store words; branch to finish_up
 *	if dst on byte 3
 *		load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *		copy bytes; exit with dst addr
 * } else {	More than 128 bytes
 *	move bytes until dst is on long word boundary
 *	if( src is on long word boundary ) {
 *		if (count < 512) {
 * finish_long:	src/dst aligned on 8 bytes
 *			copy with ldx/stx in 8-way unrolled loop;
 *			copy final 0-63 bytes; exit with dst addr
 *		} else {	src/dst aligned; count > 512
 *			align dst on 64 byte boundary; use 8-way test for
 *			each of 8 possible src alignments relative to a 64
 *			byte boundary to select the 16-way unrolled loop to
 *			use for block load, fmovd, block-init-store,
 *			block-store, fmovd operations then go to finish_long.
 *		}
 *	} else {	src/dst not aligned on 8 bytes
 *		if src is word aligned and count < 512
 *			move words in 8-way unrolled loop
 *			move final 0-31 bytes; exit with dst addr
 *		if count < 512
 *			use alignaddr/faligndata combined with ldd/std in
 *			8-way unrolled loop to move data.
 *			go to unalign_done
 *		else
 *			setup alignaddr for faligndata instructions
 *			align dst on 64 byte boundary; use 8-way test for
 *			each of 8 possible src alignments to nearest long
 *			word relative to 64 byte boundary to select the 8-way
 *			unrolled loop to use for block load, falign, fmovd,
 *			block-init-store, block-store loop (only use
 *			block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *			move remaining bytes for unaligned cases. exit with
 *			dst addr.
 *	}
 *
 * Comment on N2 memmove and memcpy common code and block-store-init:
 * In the man page for memmove, it specifies that copying will take place
 * correctly between objects that overlap. For memcpy, behavior is
 * undefined for objects that overlap.
 *
 * In rare cases, some multi-threaded applications may attempt to examine
 * the copy destination buffer during the copy. Using the block-store-init
 * instruction allows those applications to observe zeros in some
 * cache lines of the destination buffer for narrow windows. But
 * block-store-init provides memory throughput advantages for many
 * common applications. To meet both needs, applications which need
 * the destination buffer to retain meaning during the copy should use
 * memmove instead of memcpy. The memmove version duplicates the memcpy
 * algorithms except that the memmove version does not use block-store-init
 * in those cases where memcpy does use block-store-init. Otherwise, when
 * memmove can determine the source and destination do not overlap,
 * memmove shares the memcpy code.
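 *
 * As a rough editor's sketch (not part of the original source), the
 * overlap test at the top of the memmove entry point below behaves like
 * the following C fragment. Here do_forward_copy and do_backward_copy
 * are hypothetical names standing in for the shared forward-copy path
 * (.forcpy) and the descending copy loops that follow the memmove entry:
 *
 *	void *
 *	memmove(void *dst, const void *src, size_t n)
 *	{
 *		if ((uintptr_t)src >= (uintptr_t)dst ||
 *		    (uintptr_t)dst - (uintptr_t)src >= n)
 *			return (do_forward_copy(dst, src, n));
 *		return (do_backward_copy(dst, src, n));
 *	}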
 */

#include <sys/asm_linkage.h>
#include <sys/niagaraasi.h>
#include <sys/asi.h>
#include <sys/trap.h>

/* documented name for primary block initializing store */
#define	ASI_STBI_P	ASI_BLK_INIT_ST_QUAD_LDD_P

#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define	SHORTCOPY	3
#define	SHORTCHECK	14
#define	SHORT_LONG	64	/* max copy for short longword-aligned case */
				/* must be at least 32 */
#define	SMALL_MAX	128
#define	MED_UMAX	512	/* max copy for medium un-aligned case */
#define	MED_WMAX	512	/* max copy for medium word-aligned case */
#define	MED_MAX		512	/* max copy for medium longword-aligned case */

#ifdef NIAGARA2_IMPL
#include <sys/sun4asi.h>

#else	/* NIAGARA2_IMPL */
/*
 * This define aligns data for the unaligned source cases.
 * data1, data2 and data3 are merged into data1 and data2.
 * data3 is preserved for the next merge.
 */
#define	ALIGN_DATA(data1, data2, data3, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1	;\
	sllx	data2, lshift, data2	;\
	srlx	data3, rshift, tmp	;\
	or	data2, tmp, data2
/*
 * Align the data. Merge data1 and data2 into data1.
 */
#define	ALIGN_DATA_EW(data1, data2, lshift, rshift, tmp)	\
	sllx	data1, lshift, data1	;\
	srlx	data2, rshift, tmp	;\
	or	data1, tmp, data1
#endif	/* NIAGARA2_IMPL */


	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	cmp	%o1, %o0	! if from address is >= to, use forward copy
	bgeu,pn	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu,pn	%ncc, .forcpy	! if size is bigger, do overlapped copy
	add	%o1, %o2, %o5	! get to end of source space

	!
	! an overlapped copy that must be done "backwards"
	!
.chksize:
	cmp	%o2, 8		! less than 8 bytes, do byte copy
	blu,pt	%ncc, 2f	! else continue

	! Now size is bigger than 8
.dbalign:
	add	%o0, %o2, %g1	! get to end of dest space
	andcc	%g1, 7, %o3	! %o3 has bytes till dst 8 bytes aligned
	bz,a,pn	%ncc, .dbbck	! if dst is not 8 byte aligned: align it
	andn	%o2, 7, %o3	! %o3 count is multiple of 8 bytes size
	sub	%o2, %o3, %o2	! update o2 with new count

1:	dec	%o5		! decrement source
	ldub	[%o5], %g1	! load one byte
	deccc	%o3		! decrement count
	bgu,pt	%ncc, 1b	! if not done keep copying
	stb	%g1, [%o5+%o4]	! store one byte into dest
	andncc	%o2, 7, %o3	! %o3 count is multiple of 8 bytes size
	bz,pn	%ncc, 2f	! if size < 8, move to byte copy

	! Now Destination is 8 byte aligned
.dbbck:
	andcc	%o5, 7, %o0	! %o0 has src offset
	bz,a,pn	%ncc, .dbcopybc	! if src is aligned, do fast mem move
	sub	%o2, %o3, %o2	! Residue bytes in %o2

.cpy_dbwdbc:			! alignment of src is needed
	sub	%o2, 8, %o2	! set size one loop ahead
	sll	%o0, 3, %g1	! %g1 is left shift
	mov	64, %g5		! init %g5 to be 64
	sub	%g5, %g1, %g5	! %g5 right shift = (64 - left shift)
	sub	%o5, %o0, %o5	! align the src at 8 bytes.
	add	%o4, %o0, %o4	! increase difference between src & dst
	ldx	[%o5], %o1	! load first 8 bytes
	srlx	%o1, %g5, %o1
1:	sub	%o5, 8, %o5	! subtract 8 from src
	ldx	[%o5], %o0	! load 8 bytes
	sllx	%o0, %g1, %o3	! shift loaded 8 bytes left into tmp reg
	or	%o1, %o3, %o3	! align data
	stx	%o3, [%o5+%o4]	! store 8 bytes
	subcc	%o2, 8, %o2	! subtract 8 bytes from size
	bg,pt	%ncc, 1b	! if size > 0 continue
	srlx	%o0, %g5, %o1	! move extra byte for the next use

	srl	%g1, 3, %o0	! restore %o0 value for alignment
	add	%o5, %o0, %o5	! restore src alignment
	sub	%o4, %o0, %o4	! restore difference between src & dest

	ba	2f		! branch to the trailing byte copy
	add	%o2, 8, %o2	! restore size value

.dbcopybc:			! alignment of src is not needed
1:	sub	%o5, 8, %o5	! subtract from src
	ldx	[%o5], %g1	! load 8 bytes
	subcc	%o3, 8, %o3	! subtract from size
	bgu,pt	%ncc, 1b	! if size is bigger than 0 continue
	stx	%g1, [%o5+%o4]	! store 8 bytes to destination

	ba	2f
	nop

.bcbyte:
1:	ldub	[%o5], %g1	! load one byte
	stb	%g1, [%o5+%o4]	! store one byte
2:	deccc	%o2		! decrement size
	bgeu,a,pt %ncc, 1b	! if size is >= 0 continue
	dec	%o5		! decrement from address

.exitbc:			! exit from backward copy
	retl
	add	%o5, %o4, %o0	! restore dest addr

#ifdef NIAGARA2_IMPL
	!
	! Check to see if memmove is large aligned copy
	! If so, use special version of copy that avoids
	! use of block store init
	!
.forcpy:
	cmp	%o2, SMALL_MAX		! check for not small case
	blt,pn	%ncc, .mv_short		! merge with memcpy
	mov	%o0, %g1		! save %o0
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .mv_dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.mv_dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.mv_src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bleu,pt	%ncc, .medlong
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * The following memmove code mimics the memcpy code for large aligned copies,
 * but does not use the ASI_STBI_P (block initializing store) performance
 * optimization. See memmove rationale section in documentation.
 */
.mv_large_align8_copy:			! Src and dst share 8 byte alignment
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .mv_aligned_on_64
	sub	%o3, 64, %o3		! %o3 has negative bytes to move
	add	%o2, %o3, %o2		! adjust remaining count
.mv_align_to_64:
	ldx	[%o1], %o4
	add	%o1, 8, %o1		! increment src ptr
	addcc	%o3, 8, %o3
	stx	%o4, [%o0]
	brnz,pt	%o3, .mv_align_to_64
	add	%o0, 8, %o0		! increment dst ptr

.mv_aligned_on_64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	mov	%asi,%o4		! save %asi
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .mv_align_1
	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_01
	nop
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_000
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_001
	nop
.mv_align_01:
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_011
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_010
	nop
.mv_align_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .mv_align_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .mv_align_101
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.mv_align_100
	nop
.mv_align_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .mv_align_110
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.mv_align_111:
! Alignment off by 8 bytes
	ldd	[%o1], %d0
	add	%o1, 8, %o1
	sub	%o2, 8, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_111_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d0

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d30, %d0
	bgt,pt	%ncc, .mv_align_111_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	ba	.remain_stuff
	add	%o0, 8, %o0
	! END OF mv_align_111

.mv_align_110:
! Alignment off by 16 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	add	%o1, 16, %o1
	sub	%o2, 16, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_110_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d2

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .mv_align_110_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	ba	.remain_stuff
	add	%o0, 16, %o0
	! END OF mv_align_110

.mv_align_101:
! Alignment off by 24 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	add	%o1, 24, %o1
	sub	%o2, 24, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_101_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d4

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .mv_align_101_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	ba	.remain_stuff
	add	%o0, 24, %o0
	! END OF mv_align_101

.mv_align_100:
! Alignment off by 32 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16],%d4
	ldd	[%o1+24],%d6
	add	%o1, 32, %o1
	sub	%o2, 32, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_100_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d6

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .mv_align_100_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	ba	.remain_stuff
	add	%o0, 32, %o0
	! END OF mv_align_100

.mv_align_011:
! Alignment off by 40 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	add	%o1, 40, %o1
	sub	%o2, 40, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_011_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d8

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .mv_align_011_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	ba	.remain_stuff
	add	%o0, 40, %o0
	! END OF mv_align_011

.mv_align_010:
! Alignment off by 48 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	add	%o1, 48, %o1
	sub	%o2, 48, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_010_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d10

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .mv_align_010_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	ba	.remain_stuff
	add	%o0, 48, %o0
	! END OF mv_align_010

.mv_align_001:
! Alignment off by 56 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	ldd	[%o1+48], %d12
	add	%o1, 56, %o1
	sub	%o2, 56, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_001_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d14
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d12

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d14
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .mv_align_001_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	std	%d12, [%o0+48]
	ba	.remain_stuff
	add	%o0, 56, %o0
	! END OF mv_align_001

.mv_align_000:
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.mv_align_000_loop:
	/* ---- copy line 1 of 2. ---- */
	subcc	%o5, 128, %o5
	ldda	[%o1]%asi,%d0
	stda	%d0,[%o0]%asi
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

	/* ---- copy line 2 of 2. ---- */
	add	%o0, 64, %o0
	ldda	[%o1+64]%asi,%d0
	add	%o1, 128, %o1		! increment src
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! increment dst
	bgt,pt	%ncc, .mv_align_000_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.remain_stuff
	nop

	! END OF mv_align_000
#else	/* NIAGARA2_IMPL */
#endif	/* NIAGARA2_IMPL */

	SET_SIZE(memmove)

	ENTRY(memcpy)
	ENTRY(__align_cpy_1)
#ifdef NIAGARA2_IMPL
	cmp	%o2, SMALL_MAX		! check for not small case
	bgeu,pn	%ncc, .medium		! go to larger cases
	mov	%o0, %g1		! save %o0
.mv_short:
	cmp	%o2, SHORTCOPY		! check for really short case
	ble,pt	%ncc, .smallfin
	or	%o0, %o1, %o4		! prepare alignment check
	andcc	%o4, 0x3, %o5		! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	cmp	%o2, SHORTCHECK
	ble,pt	%ncc, .smallrest
	andcc	%o1, 0x3, %o5		! is src word aligned
	bz,pn	%ncc, .aldst
	cmp	%o5, 2			! is src half-word aligned
	be,pt	%ncc, .s2algn
	cmp	%o5, 3			! src is byte aligned
.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
	inc	1, %o1
	stb	%o3, [%o0]		! move a byte to align src
	inc	1, %o0
	bne,pt	%ncc, .s2algn
	dec	%o2
	b	.ald			! now go align dest
	andcc	%o0, 0x3, %o5

.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%o0]		! have to do bytes,
	stb	%o3, [%o0 + 1]		! don't know dst alignment
	inc	2, %o0
	dec	2, %o2

.aldst:	andcc	%o0, 0x3, %o5		! align the destination address
.ald:	bz,pn	%ncc, .w4cp
	cmp	%o5, 2
	be,pn	%ncc, .w2cp
	cmp	%o5, 3
.w3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%o0]
	bne,pt	%ncc, .w1cp
	inc	%o0
	dec	1, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	sll	%o4, 8, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 1b
	inc	4, %o0
	sub	%o1, 3, %o1		! used one byte of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w1cp:	srl	%o4, 8, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	3, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

2:	sll	%o4, 24, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 2b
	inc	4, %o0
	sub	%o1, 1, %o1		! used three bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%o0]
	inc	2, %o0
	dec	2, %o2
	andn	%o2, 3, %o3		! %o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %o0, %o1		! %o1 gets the difference

3:	sll	%o4, 16, %g5		! save residual bytes
	lduw	[%o1+%o0], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		! merge with residual
	or	%o5, %g5, %g5
	st	%g5, [%o0]
	bnz,pt	%ncc, 3b
	inc	4, %o0
	sub	%o1, 2, %o1		! used two bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w4cp:	andn	%o2, 3, %o3		! %o3 is aligned word count
	sub	%o1, %o0, %o1		! %o1 gets the difference

1:	lduw	[%o1+%o0], %o4		! read from address
	deccc	4, %o3			! decrement count
	st	%o4, [%o0]		! write at destination address
	bgu,pt	%ncc, 1b
	inc	4, %o0			! increment to address
	and	%o2, 3, %o2		! number of leftover bytes, if any

	! simple finish up byte copy, works with any alignment
7:
	add	%o1, %o0, %o1		! restore %o1
.smallrest:
	tst	%o2
	bz,pt	%ncc, .smallx
	cmp	%o2, 4
	blt,pt	%ncc, .smallleft3
	nop
	sub	%o2, 3, %o2
.smallnotalign4:
	ldub	[%o1], %o3		! read byte
	subcc	%o2, 4, %o2		! reduce count by 4
	stb	%o3, [%o0]		! write byte
	ldub	[%o1+1], %o3		! repeat for total of 4 bytes
	add	%o1, 4, %o1		! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0		! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallx
.smallleft3:				! 1, 2, or 3 bytes remain
	subcc	%o2, 1, %o2
	ldub	[%o1], %o3		! load one byte
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0]		! store one byte
	ldub	[%o1+1], %o3		! load second byte
	subcc	%o2, 1, %o2
	bz,pt	%ncc, .smallx
	stb	%o3, [%o0+1]		! store second byte
	ldub	[%o1+2], %o3		! load third byte
	stb	%o3, [%o0+2]		! store third byte
.smallx:
	retl
	mov	%g1, %o0		! restore %o0

.smallfin:
	tst	%o2
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.smallwords:
	lduw	[%o1], %o3		! read word
.smallwordx:
	subcc	%o2, 8, %o2		! update count
	stw	%o3, [%o0]		! write word
	add	%o1, 8, %o1		! update SRC
	lduw	[%o1-4], %o3		! read word
	add	%o0, 8, %o0		! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]		! write word
	addcc	%o2, 7, %o2		! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	stw	%o3, [%o0-4]
	retl
	mov	%g1, %o0		! restore %o0

! 8 or more bytes, src and dest start on word boundary
! %o4 contains or %o0, %o1; %o3 contains first four bytes of src
.smalllong:
	andcc	%o4, 0x7, %o5		! test for long alignment
	bnz,pt	%ncc, .smallwordx	! branch to word aligned case
	cmp	%o2, SHORT_LONG-7
	bge,a	%ncc, .medl64		! if we branch
	sub	%o2,56,%o2		! adjust %o2 to -31 off count
	sub	%o1, %o0, %o1		! %o1 gets the difference
.small_long_l:
	ldx	[%o1+%o0], %o3
	subcc	%o2, 8, %o2
	add	%o0, 8, %o0
	bgu,pt	%ncc, .small_long_l	! loop until done
	stx	%o3, [%o0-8]		! write word
	add	%o1, %o0, %o1		! restore %o1
	addcc	%o2, 7, %o2		! restore %o2 to correct count
	bz,pt	%ncc, .smallexit	! check for completion
	cmp	%o2, 4			! check for 4 or more bytes left
	blt,pt	%ncc, .smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
! src and dest start on word boundary
.smallword:
	subcc	%o2, 7, %o2		! adjust count
	bgu,pt	%ncc, .smalllong
	lduw	[%o1], %o3		! read word
	addcc	%o2, 3, %o2		! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]		! write word
	deccc	%o2			! reduce count for cc test
	ldub	[%o1+4], %o3		! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]		! store one byte
	ldub	[%o1+5], %o3		! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]		! store second byte
	ldub	[%o1+6], %o3		! load third byte
	stb	%o3, [%o0+6]		! store third byte
.smallexit:
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.medium:
	neg	%o0, %o5
	andcc	%o5, 7, %o5		! bytes till DST 8 byte aligned
	brz,pt	%o5, .dst_aligned_on_8

	! %o5 has the bytes to be written in partial store.
	sub	%o2, %o5, %o2
	sub	%o1, %o0, %o1		! %o1 gets the difference
7:					! dst aligning loop
	ldub	[%o1+%o0], %o4		! load one byte
	subcc	%o5, 1, %o5
	stb	%o4, [%o0]
	bgu,pt	%ncc, 7b
	add	%o0, 1, %o0		! advance dst
	add	%o1, %o0, %o1		! restore %o1
.dst_aligned_on_8:
	andcc	%o1, 7, %o5
	brnz,pt	%o5, .src_dst_unaligned_on_8
	prefetch [%o1 + (1 * BLOCK_SIZE)], #one_read

.src_dst_aligned_on_8:
	! check if we are copying MED_MAX or more bytes
	cmp	%o2, MED_MAX		! limit to store buffer size
	bgu,pt	%ncc, .large_align8_copy
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read
/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.medlong:
	subcc	%o2, 63, %o2		! adjust length to allow cc test
	ble,pt	%ncc, .medl63		! skip big loop if less than 64 bytes
.medl64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read ! into the l2 cache
	ldx	[%o1], %o4		! load
	subcc	%o2, 64, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 64 bytes
	stx	%o3, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	ldx	[%o1+24], %o3
	stx	%o3, [%o0+24]
	ldx	[%o1+32], %o4		! load
	stx	%o4, [%o0+32]		! and store
	ldx	[%o1+40], %o3		! a block of 64 bytes
	add	%o1, 64, %o1		! increase src ptr by 64
	stx	%o3, [%o0+40]
	ldx	[%o1-16], %o4
	add	%o0, 64, %o0		! increase dst ptr by 64
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl64		! repeat if at least 64 bytes left
	stx	%o3, [%o0-8]
.medl63:
	addcc	%o2, 32, %o2		! adjust remaining count
	ble,pt	%ncc, .medl31		! to skip if 31 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load
	sub	%o2, 32, %o2		! decrement length count
	stx	%o4, [%o0]		! and store
	ldx	[%o1+8], %o3		! a block of 32 bytes
	add	%o1, 32, %o1		! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2		! adjust remaining count
	ble,pt	%ncc, .medl15		! skip if 15 or fewer bytes left
	nop
	ldx	[%o1], %o4		! load and store 16 bytes
	add	%o1, 16, %o1		! increase src ptr by 16
	stx	%o4, [%o0]
	sub	%o2, 16, %o2		! decrease count by 16
	ldx	[%o1-8], %o3
	add	%o0, 16, %o0		! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2		! restore count
	bz,pt	%ncc, .smallexit	! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	tst	%o2
	ldx	[%o1], %o4		! load 8 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	add	%o0, 8, %o0		! increase dst ptr by 8
	subcc	%o2, 8, %o2		! decrease count by 8
	bnz,pt	%ncc, .medw7
	stx	%o4, [%o0-8]		! and store 8 bytes
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.src_dst_unaligned_on_8:
	! DST is 8-byte aligned, src is not
2:
	andcc	%o1, 0x3, %o5		! test word alignment
	bnz,pt	%ncc, .unalignsetup	! branch to skip if not word aligned
	prefetch [%o1 + (2 * BLOCK_SIZE)], #one_read

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over standard large data move when
 * source and destination are in cache for medium
 * to short data moves.
 */
	cmp	%o2, MED_WMAX		! limit to store buffer size
	bge,pt	%ncc, .unalignrejoin	! otherwise rejoin main loop
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read

	subcc	%o2, 31, %o2		! adjust length to allow cc test
					! for end of loop
	ble,pt	%ncc, .medw31		! skip big loop if less than 32
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
.medw32:
	ld	[%o1], %o4		! move a block of 32 bytes
	stw	%o4, [%o0]
	ld	[%o1+4], %o3
	stw	%o3, [%o0+4]
	ld	[%o1+8], %o4
	stw	%o4, [%o0+8]
	ld	[%o1+12], %o3
	stw	%o3, [%o0+12]
	ld	[%o1+16], %o4
	subcc	%o2, 32, %o2		! decrement length count
	stw	%o4, [%o0+16]
	ld	[%o1+20], %o3
	add	%o1, 32, %o1		! increase src ptr by 32
	stw	%o3, [%o0+20]
	ld	[%o1-8], %o4
	add	%o0, 32, %o0		! increase dst ptr by 32
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw32		! repeat if at least 32 bytes left
	stw	%o3, [%o0-4]
.medw31:
	addcc	%o2, 31, %o2		! restore count

	bz,pt	%ncc, .smallexit	! exit if finished
	nop
	cmp	%o2, 16
	blt,pt	%ncc, .medw15
	nop
	ld	[%o1], %o4		! move a block of 16 bytes
	subcc	%o2, 16, %o2		! decrement length count
	stw	%o4, [%o0]
	ld	[%o1+4], %o3
	add	%o1, 16, %o1		! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0		! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	stw	%o3, [%o0-4]
.medw15:
	bz,pt	%ncc, .smallexit	! exit if finished
	cmp	%o2, 8
	blt,pt	%ncc, .medw7		! skip if 7 or fewer bytes left
	tst	%o2
	ld	[%o1], %o4		! load 4 bytes
	subcc	%o2, 8, %o2		! decrease count by 8
	stw	%o4, [%o0]		! and store 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	ld	[%o1-4], %o3		! load 4 bytes
	add	%o0, 8, %o0		! increase dst ptr by 8
	stw	%o3, [%o0-4]		! and store 4 bytes
	bz,pt	%ncc, .smallexit	! exit if finished
.medw7:					! count is ge 1, less than 8
	cmp	%o2, 4			! check for 4 bytes left
	blt,pt	%ncc, .smallleft3	! skip if 3 or fewer bytes left
	nop
	ld	[%o1], %o4		! load 4 bytes
	add	%o1, 4, %o1		! increase src ptr by 4
	add	%o0, 4, %o0		! increase dst ptr by 4
	subcc	%o2, 4, %o2		! decrease count by 4
	bnz	.smallleft3
	stw	%o4, [%o0-4]		! and store 4 bytes
	retl
	mov	%g1, %o0		! restore %o0

	.align 16
.large_align8_copy:			! Src and dst share 8 byte alignment
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	! align dst to 64 byte boundary
	andcc	%o0, 0x3f, %o3		! %o3 == 0 means dst is 64 byte aligned
	brz,pn	%o3, .aligned_to_64
	andcc	%o0, 8, %o3		! odd long words to move?
	brz,pt	%o3, .aligned_to_16
	nop
	ldx	[%o1], %o4
	sub	%o2, 8, %o2
	add	%o1, 8, %o1		! increment src ptr
	add	%o0, 8, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_16:
	andcc	%o0, 16, %o3		! pair of long words to move?
	brz,pt	%o3, .aligned_to_32
	nop
	ldx	[%o1], %o4
	sub	%o2, 16, %o2
	stx	%o4, [%o0]
	add	%o1, 16, %o1		! increment src ptr
	ldx	[%o1-8], %o4
	add	%o0, 16, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_32:
	andcc	%o0, 32, %o3		! four long words to move?
	brz,pt	%o3, .aligned_to_64
	nop
	ldx	[%o1], %o4
	sub	%o2, 32, %o2
	stx	%o4, [%o0]
	ldx	[%o1+8], %o4
	stx	%o4, [%o0+8]
	ldx	[%o1+16], %o4
	stx	%o4, [%o0+16]
	add	%o1, 32, %o1		! increment src ptr
	ldx	[%o1-8], %o4
	add	%o0, 32, %o0		! increment dst ptr
	stx	%o4, [%o0-8]
.aligned_to_64:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
	mov	%asi,%o4		! save %asi
	! Determine source alignment
	! to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .align_1
	mov	ASI_BLK_P, %asi		! setup %asi for block load/store
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .align_01
	nop
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .align_000
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_001
	nop
.align_01:
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .align_011
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_010
	nop
.align_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .align_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,pn	%o3, .align_101
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read
	ba	.align_100
	nop
.align_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .align_110
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

.align_111:
! Alignment off by 8 bytes
	ldd	[%o1], %d0
	add	%o1, 8, %o1
	sub	%o2, 8, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_111_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d0

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d2
	fmovd	%d18, %d4
	fmovd	%d20, %d6
	fmovd	%d22, %d8
	fmovd	%d24, %d10
	fmovd	%d26, %d12
	fmovd	%d28, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d30, %d0
	bgt,pt	%ncc, .align_111_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	ba	.remain_stuff
	add	%o0, 8, %o0
	! END OF align_111

.align_110:
! Alignment off by 16 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	add	%o1, 16, %o1
	sub	%o2, 16, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_110_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d2

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d4
	fmovd	%d18, %d6
	fmovd	%d20, %d8
	fmovd	%d22, %d10
	fmovd	%d24, %d12
	fmovd	%d26, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d28, %d0
	fmovd	%d30, %d2
	bgt,pt	%ncc, .align_110_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	ba	.remain_stuff
	add	%o0, 16, %o0
	! END OF align_110

.align_101:
! Alignment off by 24 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	add	%o1, 24, %o1
	sub	%o2, 24, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_101_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d4

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d6
	fmovd	%d18, %d8
	fmovd	%d20, %d10
	fmovd	%d22, %d12
	fmovd	%d24, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d26, %d0
	fmovd	%d28, %d2
	fmovd	%d30, %d4
	bgt,pt	%ncc, .align_101_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	ba	.remain_stuff
	add	%o0, 24, %o0
	! END OF align_101

.align_100:
! Alignment off by 32 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16],%d4
	ldd	[%o1+24],%d6
	add	%o1, 32, %o1
	sub	%o2, 32, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_100_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */
	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d6

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d8
	fmovd	%d18, %d10
	fmovd	%d20, %d12
	fmovd	%d22, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d24, %d0
	fmovd	%d26, %d2
	fmovd	%d28, %d4
	fmovd	%d30, %d6
	bgt,pt	%ncc, .align_100_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	ba	.remain_stuff
	add	%o0, 32, %o0
	! END OF align_100

.align_011:
! Alignment off by 40 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	add	%o1, 40, %o1
	sub	%o2, 40, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_011_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d8

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d10
	fmovd	%d18, %d12
	fmovd	%d20, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d22, %d0
	fmovd	%d24, %d2
	fmovd	%d26, %d4
	fmovd	%d28, %d6
	fmovd	%d30, %d8
	bgt,pt	%ncc, .align_011_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	ba	.remain_stuff
	add	%o0, 40, %o0
	! END OF align_011

.align_010:
! Alignment off by 48 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	add	%o1, 48, %o1
	sub	%o2, 48, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_010_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d10

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d12
	fmovd	%d18, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d20, %d0
	fmovd	%d22, %d2
	fmovd	%d24, %d4
	fmovd	%d26, %d6
	fmovd	%d28, %d8
	fmovd	%d30, %d10
	bgt,pt	%ncc, .align_010_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	ba	.remain_stuff
	add	%o0, 48, %o0
	! END OF align_010

.align_001:
! Alignment off by 56 bytes
	ldd	[%o1], %d0
	ldd	[%o1+8], %d2
	ldd	[%o1+16], %d4
	ldd	[%o1+24], %d6
	ldd	[%o1+32], %d8
	ldd	[%o1+40], %d10
	ldd	[%o1+48], %d12
	add	%o1, 56, %o1
	sub	%o2, 56, %o2
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_001_loop:
	subcc	%o5, 128, %o5
	/* ---- copy line 1 of 2. ---- */

	ldda	[%o1]%asi,%d16		! block load
	fmovd	%d16, %d14
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read
	fmovd	%d30, %d12

	/* ---- copy line 2 of 2. ---- */
	ldda	[%o1+64]%asi,%d16
	fmovd	%d16, %d14
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! advance dst
	fmovd	%d18, %d0
	fmovd	%d20, %d2
	fmovd	%d22, %d4
	fmovd	%d24, %d6
	fmovd	%d26, %d8
	fmovd	%d28, %d10
	fmovd	%d30, %d12
	bgt,pt	%ncc, .align_001_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	std	%d0, [%o0]
	std	%d2, [%o0+8]
	std	%d4, [%o0+16]
	std	%d6, [%o0+24]
	std	%d8, [%o0+32]
	std	%d10, [%o0+40]
	std	%d12, [%o0+48]
	ba	.remain_stuff
	add	%o0, 56, %o0
	! END OF align_001

.align_000:
	andn	%o2, 0x7f, %o5		! %o5 is multiple of 2*block size
	and	%o2, 0x7f, %o2		! residue bytes in %o2
.align_000_loop:
	/* ---- copy line 1 of 2. ---- */
	subcc	%o5, 128, %o5
	ldda	[%o1]%asi,%d0
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	prefetch [%o1 + (5 * BLOCK_SIZE)], #one_read

	/* ---- copy line 2 of 2. ---- */
	add	%o0, 64, %o0
	ldda	[%o1+64]%asi,%d0
	add	%o1, 128, %o1		! increment src
	stxa	%g0,[%o0]ASI_STBI_P	! block initializing store
	stda	%d0,[%o0]%asi
	add	%o0, 64, %o0		! increment dst
	bgt,pt	%ncc, .align_000_loop
	prefetch [%o1 + (4 * BLOCK_SIZE)], #one_read

	! END OF align_000

.remain_stuff:
	mov	%o4, %asi		! restore %asi
	brnz	%g5, .medlong
	membar	#Sync
	ba	.medlong
	wr	%g5, %g0, %fprs

	.align 16
	! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.unalignsetup:
	prefetch [%o1 + (3 * BLOCK_SIZE)], #one_read
.unalignrejoin:
	rd	%fprs, %g5		! check for unused fp
	! if fprs.fef == 0, set it.
	! Setting it when already set costs more than checking
	andcc	%g5, FPRS_FEF, %g5	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 1f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
1:
	cmp	%o2, MED_UMAX		! check for medium unaligned limit
	bge,pt	%ncc,.unalign_large
	nop
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	cmp	%o2, 8			! Ensure we don't load beyond
	bgt	.unalign_adjust		! end of source buffer
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o2, 64, %o2		! adjust to leave loop
	sub	%o5, 64, %o5		! early if necessary
.unalign_adjust:
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	ldd	[%o4], %d0
.unalign_loop:
	ldd	[%o4+8], %d2
	faligndata %d0, %d2, %d16
	ldd	[%o4+16], %d4
	std	%d16, [%o0]
	faligndata %d2, %d4, %d18
	ldd	[%o4+24], %d6
	std	%d18, [%o0+8]
	faligndata %d4, %d6, %d20
	ldd	[%o4+32], %d8
	std	%d20, [%o0+16]
	faligndata %d6, %d8, %d22
	ldd	[%o4+40], %d10
	std	%d22, [%o0+24]
	faligndata %d8, %d10, %d24
	ldd	[%o4+48], %d12
	std	%d24, [%o0+32]
	faligndata %d10, %d12, %d26
	ldd	[%o4+56], %d14
	std	%d26, [%o0+40]
	faligndata %d12, %d14, %d28
	ldd	[%o4+64], %d0
	std	%d28, [%o0+48]
	faligndata %d14, %d0, %d30
	add	%o4, BLOCK_SIZE, %o4
	std	%d30, [%o0+56]
	add	%o0, BLOCK_SIZE, %o0
	subcc	%o5, BLOCK_SIZE, %o5
	bgu,pt	%ncc, .unalign_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	nop

.unalign_large:
	andcc	%o0, 0x3f, %o3		! is dst 64-byte block aligned?
	bz	%ncc, .unalignsrc
	sub	%o3, 64, %o3		! %o3 will be multiple of 8
	neg	%o3			! bytes until dest is 64 byte aligned
	sub	%o2, %o3, %o2		! update cnt with bytes to be moved
	! Move bytes according to source alignment
	andcc	%o1, 0x1, %o5
	bnz	%ncc, .unalignbyte	! check for byte alignment
	nop
	andcc	%o1, 2, %o5		! check for half word alignment
	bnz	%ncc, .unalignhalf
	nop
	! Src is word aligned
.unalignword:
	ld	[%o1], %o4		! load 4 bytes
	stw	%o4, [%o0]		! and store 4 bytes
	ld	[%o1+4], %o4		! load 4 bytes
	add	%o1, 8, %o1		! increase src ptr by 8
	stw	%o4, [%o0+4]		! and store 4 bytes
	subcc	%o3, 8, %o3		! decrease count by 8
	bnz	%ncc, .unalignword
	add	%o0, 8, %o0		! increase dst ptr by 8
	ba	.unalignsrc
	nop

	! Src is half-word aligned
.unalignhalf:
	lduh	[%o1], %o4		! load 2 bytes
	sllx	%o4, 32, %o5		! shift left
	lduw	[%o1+2], %o4
	or	%o4, %o5, %o5
	sllx	%o5, 16, %o5
	lduh	[%o1+6], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0]
	add	%o1, 8, %o1
	subcc	%o3, 8, %o3
	bnz	%ncc, .unalignhalf
	add	%o0, 8, %o0
	ba	.unalignsrc
	nop

	! Src is Byte aligned
.unalignbyte:
	sub	%o0, %o1, %o0		! share pointer advance
.unalignbyte_loop:
	ldub	[%o1], %o4
	sllx	%o4, 56, %o5
	lduh	[%o1+1], %o4
	sllx	%o4, 40, %o4
	or	%o4, %o5, %o5
	lduh	[%o1+3], %o4
	sllx	%o4, 24, %o4
	or	%o4, %o5, %o5
	lduh	[%o1+5], %o4
	sllx	%o4, 8, %o4
	or	%o4, %o5, %o5
	ldub	[%o1+7], %o4
	or	%o4, %o5, %o5
	stx	%o5, [%o0+%o1]
	subcc	%o3, 8, %o3
	bnz	%ncc, .unalignbyte_loop
	add	%o1, 8, %o1
	add	%o0,%o1, %o0		! restore pointer

	! Destination is now block (64 byte) aligned
.unalignsrc:
	andn	%o2, 0x3f, %o5		! %o5 is multiple of block size
	and	%o2, 0x3f, %o2		! residue bytes in %o2
	add	%o2, 64, %o2		! Ensure we don't load beyond
	sub	%o5, 64, %o5		! end of source buffer

	andn	%o1, 0x3f, %o4		! %o4 has block aligned src address
	prefetch [%o4 + (3 * BLOCK_SIZE)], #one_read
	alignaddr %o1, %g0, %g0		! generate %gsr
	add	%o1, %o5, %o1		! advance %o1 to after blocks
	!
	! Determine source alignment to correct 8 byte offset
	andcc	%o1, 0x20, %o3
	brnz,pn	%o3, .unalign_1
	nop
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .unalign_01
	nop
	andcc	%o1, 0x08, %o3
	brz,a	%o3, .unalign_000
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_001
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_01:
	andcc	%o1, 0x08, %o3
	brnz,a	%o3, .unalign_011
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_010
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_1:
	andcc	%o1, 0x10, %o3
	brnz,pn	%o3, .unalign_11
	nop
	andcc	%o1, 0x08, %o3
	brnz,a	%o3, .unalign_101
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_100
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
.unalign_11:
	andcc	%o1, 0x08, %o3
	brz,pn	%o3, .unalign_110
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read

.unalign_111:
	ldd	[%o4+56], %d14
.unalign_111_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d14, %d16, %d48
	faligndata %d16, %d18, %d50
	faligndata %d18, %d20, %d52
	faligndata %d20, %d22, %d54
	faligndata %d22, %d24, %d56
	faligndata %d24, %d26, %d58
	faligndata %d26, %d28, %d60
	faligndata %d28, %d30, %d62
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_111_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_110:
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_110_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d12, %d14, %d48
	faligndata %d14, %d16, %d50
	faligndata %d16, %d18, %d52
	faligndata %d18, %d20, %d54
	faligndata %d20, %d22, %d56
	faligndata %d22, %d24, %d58
	faligndata %d24, %d26, %d60
	faligndata %d26, %d28, %d62
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_110_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_101:
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_101_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d10, %d12, %d48
	faligndata %d12, %d14, %d50
	faligndata %d14, %d16, %d52
	faligndata %d16, %d18, %d54
	faligndata %d18, %d20, %d56
	faligndata %d20, %d22, %d58
	faligndata %d22, %d24, %d60
	faligndata %d24, %d26, %d62
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_101_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_100:
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_100_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d8, %d10, %d48
	faligndata %d10, %d12, %d50
	faligndata %d12, %d14, %d52
	faligndata %d14, %d16, %d54
	faligndata %d16, %d18, %d56
	faligndata %d18, %d20, %d58
	faligndata %d20, %d22, %d60
	faligndata %d22, %d24, %d62
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_100_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_011:
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_011_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d6, %d8, %d48
	faligndata %d8, %d10, %d50
	faligndata %d10, %d12, %d52
	faligndata %d12, %d14, %d54
	faligndata %d14, %d16, %d56
	faligndata %d16, %d18, %d58
	faligndata %d18, %d20, %d60
	faligndata %d20, %d22, %d62
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_011_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_010:
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_010_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d4, %d6, %d48
	faligndata %d6, %d8, %d50
	faligndata %d8, %d10, %d52
	faligndata %d10, %d12, %d54
	faligndata %d12, %d14, %d56
	faligndata %d14, %d16, %d58
	faligndata %d16, %d18, %d60
	faligndata %d18, %d20, %d62
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_010_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_001:
	ldd	[%o4+8], %d2
	ldd	[%o4+16], %d4
	ldd	[%o4+24], %d6
	ldd	[%o4+32], %d8
	ldd	[%o4+40], %d10
	ldd	[%o4+48], %d12
	ldd	[%o4+56], %d14
.unalign_001_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d2, %d4, %d48
	faligndata %d4, %d6, %d50
	faligndata %d6, %d8, %d52
	faligndata %d8, %d10, %d54
	faligndata %d10, %d12, %d56
	faligndata %d12, %d14, %d58
	faligndata %d14, %d16, %d60
	faligndata %d16, %d18, %d62
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_001_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	ba	.unalign_done
	membar	#Sync

.unalign_000:
	ldda	[%o4]ASI_BLK_P, %d0
.unalign_000_loop:
	add	%o4, 64, %o4
	ldda	[%o4]ASI_BLK_P, %d16
	faligndata %d0, %d2, %d48
	faligndata %d2, %d4, %d50
	faligndata %d4, %d6, %d52
	faligndata %d6, %d8, %d54
	faligndata %d8, %d10, %d56
	faligndata %d10, %d12, %d58
	faligndata %d12, %d14, %d60
	faligndata %d14, %d16, %d62
	fmovd	%d16, %d0
	fmovd	%d18, %d2
	fmovd	%d20, %d4
	fmovd	%d22, %d6
	fmovd	%d24, %d8
	fmovd	%d26, %d10
	fmovd	%d28, %d12
	fmovd	%d30, %d14
	stda	%d48, [%o0]ASI_BLK_P
	subcc	%o5, 64, %o5
	add	%o0, 64, %o0
	bgu,pt	%ncc, .unalign_000_loop
	prefetch [%o4 + (4 * BLOCK_SIZE)], #one_read
	membar	#Sync

.unalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%ncc, .unalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8

.unalign_done:
	! Handle trailing bytes, 64 to 127
	! Dest long word aligned, Src not long word aligned
	cmp	%o2, 15
	bleu	%ncc, .unalign_short

	andn	%o2, 0x7, %o5		! %o5 is multiple of 8
	and	%o2, 0x7, %o2		! residue bytes in %o2
	add	%o2, 8, %o2
	sub	%o5, 8, %o5		! ensure we don't load past end of src
	andn	%o1, 0x7, %o4		! %o4 has long word aligned src address
	add	%o1, %o5, %o1		! advance %o1 to after multiple of 8
	ldd	[%o4], %d0		! fetch partial word
.unalign_by8:
	ldd	[%o4+8], %d2
	add	%o4, 8, %o4
	faligndata %d0, %d2, %d16
	subcc	%o5, 8, %o5
	std	%d16, [%o0]
	fmovd	%d2, %d0
	bgu,pt	%ncc, .unalign_by8
	add	%o0, 8, %o0

.unalign_short:
	brnz	%g5, .smallrest
	nop
	ba	.smallrest
	wr	%g5, %g0, %fprs
#else	/* NIAGARA2_IMPL */
.forcpy:
	mov	%o0, %g5		! save dest address for return val
	cmp	%o2, 17			! for small counts copy bytes
	bleu,pt	%ncc, .dbytecp
	nop

	cmp	%o2, 0x80		! for lengths less than 128 bytes do not
	bleu,pn	%ncc, .no_blkcpy	! copy using ASI_BLK_INIT_ST_QUAD_LDD_P

	/*
	 * Make sure that source and destination buffers are 64 bytes apart.
	 * If they are not, do not use ASI_BLK_INIT_ST_QUAD_LDD_P asi to copy
	 * the data.
	 */
	subcc	%o1, %o0, %o3
	blu	%ncc, .blkalgndst
	cmp	%o3, 0x40		! if src - dst >= 0x40
	bgeu,pt	%ncc, .blkalgndst	! then use ASI_BLK_INIT_ST_QUAD_LDD_P
.no_blkcpy:
	andcc	%o1, 3, %o5		! is src word aligned
	bz,pn	%ncc, .aldst
	cmp	%o5, 2			! is src half-word aligned
	be,pt	%ncc, .s2algn
	cmp	%o5, 3			! src is byte aligned
.s1algn:ldub	[%o1], %o3		! move 1 or 3 bytes to align it
	inc	1, %o1
	stb	%o3, [%g5]		! move a byte to align src
	inc	1, %g5
	bne,pt	%ncc, .s2algn
	dec	%o2
	b	.ald			! now go align dest
	andcc	%g5, 3, %o5

.s2algn:lduh	[%o1], %o3		! know src is 2 byte aligned
	inc	2, %o1
	srl	%o3, 8, %o4
	stb	%o4, [%g5]		! have to do bytes,
	stb	%o3, [%g5 + 1]		! don't know dst alignment
	inc	2, %g5
	dec	2, %o2

.aldst:	andcc	%g5, 3, %o5		! align the destination address
.ald:	bz,pn	%ncc, .w4cp
	cmp	%o5, 2
	bz,pn	%ncc, .w2cp
	cmp	%o5, 3
.w3cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 24, %o5
	stb	%o5, [%g5]
	bne,pt	%ncc, .w1cp
	inc	%g5
	dec	1, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

1:	sll	%o4, 8, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 24, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 1b
	inc	4, %g5
	sub	%o1, 3, %o1		! used one byte of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w1cp:	srl	%o4, 8, %o5
	sth	%o5, [%g5]
	inc	2, %g5
	dec	3, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

2:	sll	%o4, 24, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 8, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 2b
	inc	4, %g5
	sub	%o1, 1, %o1		! used three bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w2cp:	lduw	[%o1], %o4
	inc	4, %o1
	srl	%o4, 16, %o5
	sth	%o5, [%g5]
	inc	2, %g5
	dec	2, %o2
	andn	%o2, 3, %o3		! o3 is aligned word count
	dec	4, %o3			! avoid reading beyond tail of src
	sub	%o1, %g5, %o1		! o1 gets the difference

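/*
 * .w3cp, .w1cp and the loop below (.w2cp) share one technique: the
 * source is word aligned, the destination started 1, 2 or 3 bytes off,
 * and once the leading bytes are stored every output word combines the
 * leftover bytes of the previous source word with the top of the next
 * one.  A rough C model of the 2-byte case that follows (illustrative
 * names; `last' is the word already read into %o4):
 *
 *	uint32_t res = last;
 *	while (wcnt > 0) {
 *		uint32_t w = *src++;		// lduw [%o1+%g5], %o4
 *		*dst++ = (res << 16) | (w >> 16);
 *		res = w;
 *		wcnt -= 4;
 *	}
 */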
3:	sll	%o4, 16, %g1		! save residual bytes
	lduw	[%o1+%g5], %o4
	deccc	4, %o3
	srl	%o4, 16, %o5		! merge with residual
	or	%o5, %g1, %g1
	st	%g1, [%g5]
	bnz,pt	%ncc, 3b
	inc	4, %g5
	sub	%o1, 2, %o1		! used two bytes of last word read
	and	%o2, 3, %o2
	b	7f
	inc	4, %o2

.w4cp:	andn	%o2, 3, %o3		! o3 is aligned word count
	sub	%o1, %g5, %o1		! o1 gets the difference

1:	lduw	[%o1+%g5], %o4		! read from address
	deccc	4, %o3			! decrement count
	st	%o4, [%g5]		! write at destination address
	bgu,pt	%ncc, 1b
	inc	4, %g5			! increment to address
	b	7f
	and	%o2, 3, %o2		! number of leftover bytes, if any

	!
	! differenced byte copy, works with any alignment
	!
.dbytecp:
	b	7f
	sub	%o1, %g5, %o1		! o1 gets the difference

4:	stb	%o4, [%g5]		! write to address
	inc	%g5			! inc to address
7:	deccc	%o2			! decrement count
	bgeu,a,pt %ncc, 4b		! loop till done
	ldub	[%o1+%g5], %o4		! read from address
	retl				! %o0 was preserved
	nop

.blkalgndst:
	save	%sp, -SA(MINFRAME), %sp

	! Block (64 bytes) align the destination.
	andcc	%i0, 0x3f, %i3		! is dst block aligned
	bz	%ncc, .chksrc		! dst already block aligned
	sub	%i3, 0x40, %i3
	neg	%i3			! bytes till dst 64 bytes aligned
	sub	%i2, %i3, %i2		! update i2 with new count

	! Based on source and destination alignment do
	! either 8 bytes, 4 bytes, 2 bytes or byte copy.

	! Is dst & src 8B aligned
	or	%i0, %i1, %o2
	andcc	%o2, 0x7, %g0
	bz	%ncc, .alewdcp
	nop

	! Is dst & src 4B aligned
	andcc	%o2, 0x3, %g0
	bz	%ncc, .alwdcp
	nop

	! Is dst & src 2B aligned
	andcc	%o2, 0x1, %g0
	bz	%ncc, .alhlfwdcp
	nop

	! 1B aligned
1:	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i3
	bgu,pt	%ncc, 1b
	inc	%i0

	ba	.chksrc
	nop

	! dst & src 4B aligned
.alwdcp:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	subcc	%i3, 0x4, %i3
	bgu,pt	%ncc, .alwdcp
	add	%i0, 0x4, %i0

	ba	.chksrc
	nop

	! dst & src 2B aligned
.alhlfwdcp:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	subcc	%i3, 0x2, %i3
	bgu,pt	%ncc, .alhlfwdcp
	add	%i0, 0x2, %i0

	ba	.chksrc
	nop

	! dst & src 8B aligned
.alewdcp:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	subcc	%i3, 0x8, %i3
	bgu,pt	%ncc, .alewdcp
	add	%i0, 0x8, %i0

	! Now destination is block (64 bytes) aligned
.chksrc:
	andn	%i2, 0x3f, %i3		! %i3 count is multiple of block size
	sub	%i2, %i3, %i2		! residue bytes in %i2
	mov	ASI_BLK_INIT_ST_QUAD_LDD_P, %asi
	andcc	%i1, 0xf, %l1		! is src quadword aligned
	bz,pn	%ncc, .blkcpy		! src offset in %l1
	nop
	cmp	%l1, 0x8
	bgu	%ncc, .cpy_upper_double
	nop
	blu	%ncc, .cpy_lower_double
	nop
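
/*
 * The source offset within a 16-byte quadword (%l1) selects one of
 * three strategies: 0 means the quad loads line up exactly (.blkcpy,
 * no merging), 8 means the loads are doubleword aligned with the
 * source so the fall-through loop only re-pairs registers, and any
 * other offset takes the shift-and-merge paths below.  ALIGN_DATA is
 * defined in a shared header; it is used here as though it computes,
 * for ls = 8 * byte-offset and rs = 64 - ls:
 *
 *	a = (a << ls) | (b >> rs);	// ALIGN_DATA(a, b, c, ls, rs, tmp)
 *	b = (b << ls) | (c >> rs);
 */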

	! Falls through when source offset is equal to 8 i.e.
	! source is double word aligned.
	! In this case no shift/merge of data is required.
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2
loop0:
	ldda	[%i1+0x10]%asi, %o4
	prefetch [%o0+0x40], #one_read

	stxa	%o3, [%i0+0x0]%asi
	stxa	%o4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	stxa	%o5, [%i0+0x10]%asi
	stxa	%o2, [%i0+0x18]%asi

	ldda	[%i1+0x30]%asi, %o4
	stxa	%o3, [%i0+0x20]%asi
	stxa	%o4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	stxa	%o5, [%i0+0x30]%asi
	stxa	%o2, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop0
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

.cpy_lower_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	sll	%l1, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! partial data in %o2 and %o3 has
					! complete data
loop1:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has partial data for this read.
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)	! merge %o2, %o3 and %o4
							! into %o2 and %o3
	prefetch [%o0+0x40], #one_read
	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)	! merge %o2 with %o5 and
	stxa	%o4, [%i0+0x10]%asi			! %o4 from previous read
	stxa	%o5, [%i0+0x18]%asi			! into %o4 and %o5

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o2, %o3, %o4, %l2, %l3, %g1)
	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o4, %o5, %o2, %l2, %l3, %g1)
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop1
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset

.cpy_upper_double:
	sub	%i1, %l1, %i1		! align the src at 16 bytes.
	mov	0x8, %l2
	sub	%l1, %l2, %l2
	sll	%l2, 3, %l2		! %l2 left shift
	mov	0x40, %l3
	sub	%l3, %l2, %l3		! %l3 right shift = (64 - left shift)
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
	ldda	[%i1+0x0]%asi, %o2	! partial data in %o3 for this read and
					! no data in %o2
loop2:
	ldda	[%i1+0x10]%asi, %o4	! %o4 has complete data and %o5 has
					! partial
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)	! merge %o3, %o4 and %o5
							! into %o3 and %o4
	prefetch [%o0+0x40], #one_read
	stxa	%o3, [%i0+0x0]%asi
	stxa	%o4, [%i0+0x8]%asi

	ldda	[%i1+0x20]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)	! merge %o2 and %o3 with
	stxa	%o5, [%i0+0x10]%asi			! %o5 from previous read
	stxa	%o2, [%i0+0x18]%asi			! into %o5 and %o2

	! Repeat the same for next 32 bytes.

	ldda	[%i1+0x30]%asi, %o4
	ALIGN_DATA(%o3, %o4, %o5, %l2, %l3, %g1)
	stxa	%o3, [%i0+0x20]%asi
	stxa	%o4, [%i0+0x28]%asi

	ldda	[%i1+0x40]%asi, %o2
	ALIGN_DATA(%o5, %o2, %o3, %l2, %l3, %g1)
	stxa	%o5, [%i0+0x30]%asi
	stxa	%o2, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, loop2
	add	%i0, 0x40, %i0
	ba	.blkdone
	add	%i1, %l1, %i1		! increment the source by src offset
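
/*
 * %asi is still ASI_BLK_INIT_ST_QUAD_LDD_P here: each ldda below
 * fetches an aligned 16-byte quadword into an even/odd register pair,
 * and on Niagara a store through this ASI that targets the start of a
 * 64-byte line initializes the line rather than fetching its old
 * contents, which is what makes this path fast.  One pass moves
 * 64 bytes; roughly, in C (illustrative only):
 *
 *	for (; blks != 0; blks--, src += 8, dst += 8)	// uint64_t *src, *dst
 *		for (int i = 0; i < 8; i++)
 *			dst[i] = src[i];		// 4 ldda + 8 stxa
 */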

	! Do fast copy using ASI_BLK_INIT_ST_QUAD_LDD_P
.blkcpy:
	andn	%i1, 0x3f, %o0		! %o0 has block aligned source
	prefetch [%o0+0x0], #one_read
1:
	prefetch [%o0+0x40], #one_read

	ldda	[%i1+0x0]%asi, %o2
	ldda	[%i1+0x10]%asi, %o4

	stxa	%o2, [%i0+0x0]%asi
	stxa	%o3, [%i0+0x8]%asi
	stxa	%o4, [%i0+0x10]%asi
	stxa	%o5, [%i0+0x18]%asi

	ldda	[%i1+0x20]%asi, %o2
	ldda	[%i1+0x30]%asi, %o4

	stxa	%o2, [%i0+0x20]%asi
	stxa	%o3, [%i0+0x28]%asi
	stxa	%o4, [%i0+0x30]%asi
	stxa	%o5, [%i0+0x38]%asi

	add	%o0, 0x40, %o0
	add	%i1, 0x40, %i1
	subcc	%i3, 0x40, %i3
	bgu,pt	%ncc, 1b
	add	%i0, 0x40, %i0

.blkdone:
	membar	#Sync

	mov	ASI_PNF, %asi		! restore %asi to default
					! ASI_PRIMARY_NOFAULT value
	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

	! Handle trailing bytes
	cmp	%i2, 0x8
	blu,pt	%ncc, .residue
	nop

	! Can we do some 8B ops
	or	%i1, %i0, %o2
	andcc	%o2, 0x7, %g0
	bnz	%ncc, .last4
	nop

	! Do 8 byte ops as long as possible
.last8:
	ldx	[%i1], %o2
	stx	%o2, [%i0]
	add	%i1, 0x8, %i1
	sub	%i2, 0x8, %i2
	cmp	%i2, 0x8
	bgu,pt	%ncc, .last8
	add	%i0, 0x8, %i0

	tst	%i2
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last4:
	! Can we do 4B ops
	andcc	%o2, 0x3, %g0
	bnz	%ncc, .last2
	nop
1:
	ld	[%i1], %o2
	st	%o2, [%i0]
	add	%i1, 0x4, %i1
	sub	%i2, 0x4, %i2
	cmp	%i2, 0x4
	bgu,pt	%ncc, 1b
	add	%i0, 0x4, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

	ba	.residue
	nop

.last2:
	! Can we do 2B ops
	andcc	%o2, 0x1, %g0
	bnz	%ncc, .residue
	nop

1:
	lduh	[%i1], %o2
	stuh	%o2, [%i0]
	add	%i1, 0x2, %i1
	sub	%i2, 0x2, %i2
	cmp	%i2, 0x2
	bgu,pt	%ncc, 1b
	add	%i0, 0x2, %i0

	cmp	%i2, 0
	bz,pt	%ncc, .blkexit
	nop

.residue:
	ldub	[%i1], %o2
	stb	%o2, [%i0]
	inc	%i1
	deccc	%i2
	bgu,pt	%ncc, .residue
	inc	%i0

.blkexit:

	ret
	restore	%g5, %g0, %o0

#endif	/* NIAGARA2_IMPL */
	SET_SIZE(memcpy)
	SET_SIZE(__align_cpy_1)