/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
 */

	.file	"memcpy.s"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *       Memmove() and bcopy() do.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *	void *
 *	memcpy(void *s, const void *s0, size_t n)
 *	{
 *		if (n != 0) {
 *		    char *s1 = s;
 *		    const char *s2 = s0;
 *		    do {
 *			*s1++ = *s2++;
 *		    } while (--n != 0);
 *		}
 *		return (s);
 *	}
 */

/*
 * NOTE(review): SPARC V9 assembly.  Throughout this file:
 *   - every branch has a delay-slot instruction on the following line
 *     (executed regardless of the branch, unless the branch is annulled
 *     with the ",a" suffix and not taken);
 *   - %o0 = dst, %o1 = src, %o2 = len on entry; %g1 holds the saved
 *     return value (original dst) in all paths;
 *   - "!" starts a comment; ",pt"/",pn" are static branch predictions;
 *   - medium/large paths use VIS (faligndata/alignaddr/bshuffle) and
 *     must save/restore %fprs around FP register use.
 */

#include <sys/asm_linkage.h>
#include <sys/sun4asi.h>
#include <sys/trap.h>

#define	ICACHE_LINE_SIZE	64
#define	BLOCK_SIZE	64
#define	FPRS_FEF	0x4

#define	SHORTCOPY	3
#define	SMALL_MAX	39
#define	MEDIUM_MAX	255
#define	MED_WMAX	256	/* max copy for medium word-aligned case */
#define	MED_MAX		256	/* max copy for medium longword-aligned case */

#ifndef BSTORE_SIZE
#define	BSTORE_SIZE	256	/* min copy size for block store */
#endif

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

	ENTRY(memmove)
	cmp	%o1, %o0	! if from address is >= to use forward copy
	bgeu	%ncc, .forcpy	! else use backward if ...
	sub	%o0, %o1, %o4	! get difference of two addresses
	cmp	%o2, %o4	! compare size and difference of addresses
	bleu	%ncc, .forcpy	! if size is bigger, do overlapped copy
	nop

	!
	! an overlapped copy that must be done "backwards"
	!
.ovbc:
	mov	%o0, %g1	! save dest address for return val
	add	%o1, %o2, %o1	! get to end of source space
	add	%o0, %o2, %o0	! get to end of destination space

	cmp	%o2, 24
	bgeu,pn	%ncc, .dbalign
	nop
	cmp	%o2, 4
	blt,pn	%ncc, .byte
	sub	%o2, 3, %o2
.byte4loop:
	ldub	[%o1-1], %o3	! load last byte
	stb	%o3, [%o0-1]	! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3	! load 2nd from last byte
	stb	%o3, [%o0-2]	! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3	! load 3rd from last byte
	stb	%o3, [%o0+1]	! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3	! load 4th from last byte
	bgu,pt	%ncc, .byte4loop
	stb	%o3, [%o0]	! store 4th from last byte
.byte:
	addcc	%o2, 3, %o2
	bz,pt	%ncc, .exit
				! delay slot is the first insn of .byteloop
				! (dec %o1); harmless if the branch is taken
.byteloop:
	dec	%o1		! decrement src address
	ldub	[%o1], %o3	! read a byte
	dec	%o0		! decrement dst address
	deccc	%o2		! decrement count
	bgu,pt	%ncc, .byteloop	! loop until done
	stb	%o3, [%o0]	! write byte
.exit:
	retl
	mov	%g1, %o0

	.align	16
.dbalign:
	andcc	%o0, 7, %o5	! bytes till DST 8 byte aligned
	bz,pt	%ncc, .dbmed
	sub	%o2, %o5, %o2	! update count
.dbalign1:
	dec	%o1		! decrement src address
	ldub	[%o1], %o3	! read a byte
	dec	%o0		! decrement dst address
	deccc	%o5		! decrement count
	bgu,pt	%ncc, .dbalign1	! loop until done
	stb	%o3, [%o0]	! store a byte

! check for src long word alignment
.dbmed:
	andcc	%o1, 7, %g0	! chk src long word alignment
	bnz,pn	%ncc, .dbbck
	nop
!
! Following code is for overlapping copies where src and dest
! are long word aligned
!
	cmp	%o2, 4095
	blt,pn	%ncc, .dbmedl32enter	! go to no prefetch code
	nop
	prefetch [%o1 - (1 * BLOCK_SIZE)], 20	! into the prefetch cache
	sub	%o2, 63, %o2	! adjust length to allow cc test
				! for end of loop
	prefetch [%o1 - (2 * BLOCK_SIZE)], 20	! into the prefetch cache
	rd	%fprs, %o3	! o3 = fprs
	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
	! So set it anyway, without checking.
	prefetch [%o1 - (3 * BLOCK_SIZE)], 20	! into the prefetch cache
	wr	%g0, 0x4, %fprs	! fprs.fef = 1
	prefetch [%o1 - (4 * BLOCK_SIZE)], 20	! into the prefetch cache
.dbmedl64:
	prefetch [%o1 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	ldd	[%o1-8], %d4	! load
	subcc	%o2, 64, %o2	! decrement length count
	std	%d4, [%o0-8]	! and store
	ldd	[%o1-16], %d2	! a block of 64 bytes
	sub	%o1, 64, %o1	! decrease src ptr by 64
	std	%d2, [%o0-16]
	sub	%o0, 64, %o0	! decrease dst ptr by 64
	ldd	[%o1+40], %d4
	std	%d4, [%o0+40]
	ldd	[%o1+32], %d2
	std	%d2, [%o0+32]
	ldd	[%o1+24], %d4
	std	%d4, [%o0+24]
	ldd	[%o1+16], %d2
	std	%d2, [%o0+16]
	ldd	[%o1+8], %d4
	std	%d4, [%o0+8]
	ldd	[%o1], %d2
	bgu,pt	%ncc, .dbmedl64	! repeat if at least 64 bytes left
	std	%d2, [%o0]
	add	%o2, 63, %o2	! restore offset adjustment
	and	%o3, 0x4, %o3	! fprs.du = fprs.dl = 0
	wr	%o3, %g0, %fprs	! fprs = o3 restore fprs
.dbmedl32enter:
	subcc	%o2, 31, %o2	! adjust length to allow cc test
				! for end of loop
	ble,pt	%ncc, .dbmedl31	! skip big loop if less than 32
	nop
.dbmedl32:
	ldx	[%o1-8], %o4	! load
	subcc	%o2, 32, %o2	! decrement length count
	stx	%o4, [%o0-8]	! and store
	ldx	[%o1-16], %o3	! a block of 32 bytes
	sub	%o1, 32, %o1	! decrease src ptr by 32
	stx	%o3, [%o0-16]
	ldx	[%o1+8], %o4
	sub	%o0, 32, %o0	! decrease dst ptr by 32
	stx	%o4, [%o0+8]
	ldx	[%o1], %o3
	bgu,pt	%ncc, .dbmedl32	! repeat if at least 32 bytes left
	stx	%o3, [%o0]
.dbmedl31:
	addcc	%o2, 16, %o2	! adjust remaining count
	ble,pt	%ncc, .dbmedl15	! skip if 15 or fewer bytes left
	nop			!
	ldx	[%o1-8], %o4	! load and store 16 bytes
	sub	%o1, 16, %o1	! decrease src ptr by 16
	stx	%o4, [%o0-8]	!
	sub	%o2, 16, %o2	! decrease count by 16
	ldx	[%o1], %o3	!
	sub	%o0, 16, %o0	! decrease dst ptr by 16
	stx	%o3, [%o0]
.dbmedl15:
	addcc	%o2, 15, %o2	! restore count
	bz,pt	%ncc, .dbexit	! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .dbremain	! skip if 7 or fewer bytes left
	nop
	ldx	[%o1-8], %o4	! load 8 bytes
	sub	%o1, 8, %o1	! decrease src ptr by 8
	stx	%o4, [%o0-8]	! and store 8 bytes
	subcc	%o2, 8, %o2	! decrease count by 8
	bnz	%ncc, .dbremain	! exit if finished
	sub	%o0, 8, %o0	! decrease dst ptr by 8
	retl
	mov	%g1, %o0

!
! Following code is for overlapping copies where src and dest
! are not long word aligned
!
	.align	16
.dbbck:
	rd	%fprs, %o3	! o3 = fprs

	! if fprs.fef == 0, set it. Checking it, requires 2 instructions.
	! So set it anyway, without checking.
	wr	%g0, 0x4, %fprs	! fprs.fef = 1

	alignaddr %o1, %g0, %o5	! align src
	ldd	[%o5], %d0	! get first 8 byte block
	andn	%o2, 7, %o4	! prepare src ptr for finishup code
	cmp	%o2, 32
	blt,pn	%ncc, .dbmv8
	sub	%o1, %o4, %o1	!
	cmp	%o2, 4095	! check for short memmoves
	blt,pn	%ncc, .dbmv32enter	! go to no prefetch code
.dbmv64:
	ldd	[%o5-8], %d2	! load 8 bytes
	ldd	[%o5-16], %d4	! load 8 bytes
	sub	%o5, 64, %o5	!
	ldd	[%o5+40], %d6	! load 8 bytes
	sub	%o0, 64, %o0	!
	ldd	[%o5+32], %d8	! load 8 bytes
	sub	%o2, 64, %o2	! 64 less bytes to copy
	ldd	[%o5+24], %d18	! load 8 bytes
	cmp	%o2, 64		! do we have < 64 bytes remaining
	ldd	[%o5+16], %d28	! load 8 bytes
	ldd	[%o5+8], %d30	! load 8 bytes
	prefetch [%o5 - (5 * BLOCK_SIZE)], 20	! into the prefetch cache
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0	! load 8 bytes
	std	%d10, [%o0+56]	! store the current 8 bytes
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+48]	! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+40]	! store the current 8 bytes
	faligndata %d8, %d6, %d16	! extract 8 bytes out
	std	%d16, [%o0+32]	! store the current 8 bytes
	faligndata %d18, %d8, %d20	! extract 8 bytes out
	std	%d20, [%o0+24]	! store the current 8 bytes
	faligndata %d28, %d18, %d22	! extract 8 bytes out
	std	%d22, [%o0+16]	! store the current 8 bytes
	faligndata %d30, %d28, %d24	! extract 8 bytes out
	std	%d24, [%o0+8]	! store the current 8 bytes
	faligndata %d0, %d30, %d26	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv64
	std	%d26, [%o0]	! store the current 8 bytes

	cmp	%o2, 32
	blt,pn	%ncc, .dbmvx
	nop
.dbmv32:
	ldd	[%o5-8], %d2	! load 8 bytes
.dbmv32enter:
	ldd	[%o5-16], %d4	! load 8 bytes
	sub	%o5, 32, %o5	!
	ldd	[%o5+8], %d6	! load 8 bytes
	sub	%o0, 32, %o0	!
	faligndata %d2, %d0, %d10	! extract 8 bytes out
	ldd	[%o5], %d0	! load 8 bytes
	sub	%o2,32, %o2	! 32 less bytes to copy
	std	%d10, [%o0+24]	! store the current 8 bytes
	cmp	%o2, 32		! do we have < 32 bytes remaining
	faligndata %d4, %d2, %d12	! extract 8 bytes out
	std	%d12, [%o0+16]	! store the current 8 bytes
	faligndata %d6, %d4, %d14	! extract 8 bytes out
	std	%d14, [%o0+8]	! store the current 8 bytes
	faligndata %d0, %d6, %d16	! extract 8 bytes out
	bgeu,pt	%ncc, .dbmv32
	std	%d16, [%o0]	! store the current 8 bytes
.dbmvx:
	cmp	%o2, 8		! do we have < 8 bytes remaining
	blt,pt	%ncc, .dbmvfinish	! if yes, skip to finish up code
	nop
.dbmv8:
	ldd	[%o5-8], %d2
	sub	%o0, 8, %o0	! since we are at the end
				! when we first enter the loop
	sub	%o2, 8, %o2	! 8 less bytes to copy
	sub	%o5, 8, %o5
	cmp	%o2, 8		! do we have < 8 bytes remaining
	faligndata %d2, %d0, %d8	! extract 8 bytes out
	std	%d8, [%o0]	! store the current 8 bytes
	bgeu,pt	%ncc, .dbmv8
	fmovd	%d2, %d0
.dbmvfinish:
	and	%o3, 0x4, %o3	! fprs.du = fprs.dl = 0
	tst	%o2
	bz,pt	%ncc, .dbexit
	wr	%o3, %g0, %fprs	! fprs = o3 restore fprs

.dbremain:
	cmp	%o2, 4
	blt,pn	%ncc, .dbbyte
	nop
	ldub	[%o1-1], %o3	! load last byte
	stb	%o3, [%o0-1]	! store last byte
	sub	%o1, 4, %o1
	ldub	[%o1+2], %o3	! load 2nd from last byte
	stb	%o3, [%o0-2]	! store 2nd from last byte
	sub	%o0, 4, %o0
	ldub	[%o1+1], %o3	! load 3rd from last byte
	stb	%o3, [%o0+1]	! store 3rd from last byte
	subcc	%o2, 4, %o2
	ldub	[%o1], %o3	! load 4th from last byte
	stb	%o3, [%o0]	! store 4th from last byte
	bz,pt	%ncc, .dbexit
				! delay slot is the first insn of .dbbyte
				! (dec %o1); harmless if the branch is taken
.dbbyte:
	dec	%o1		! decrement src address
	ldub	[%o1], %o3	! read a byte
	dec	%o0		! decrement dst address
	deccc	%o2		! decrement count
	bgu,pt	%ncc, .dbbyte	! loop until done
	stb	%o3, [%o0]	! write byte
.dbexit:
	retl
	mov	%g1, %o0
	SET_SIZE(memmove)


	.align ICACHE_LINE_SIZE
	ENTRY(memcpy)
	! adjust instruction alignment
	nop			! Do not remove, these nops affect
	nop			! icache alignment and performance
.forcpy:
	cmp	%o2, SMALL_MAX	! check for not small case
	bgu,pn	%ncc, .medium	! go to larger cases
	mov	%o0, %g1	! save %o0
	cmp	%o2, SHORTCOPY	! check for really short case
	ble,pt	%ncc, .smallleft	!
	or	%o0, %o1, %o3	! prepare alignment check
	andcc	%o3, 0x3, %g0	! test for alignment
	bz,pt	%ncc, .smallword	! branch to word aligned case
	sub	%o2, 3, %o2	! adjust count to allow cc zero test
.smallnotalign4:
	ldub	[%o1], %o3	! read byte
	subcc	%o2, 4, %o2	! reduce count by 4
	stb	%o3, [%o0]	! write byte
	ldub	[%o1+1], %o3	! repeat for a total of 4 bytes
	add	%o1, 4, %o1	! advance SRC by 4
	stb	%o3, [%o0+1]
	ldub	[%o1-2], %o3
	add	%o0, 4, %o0	! advance DST by 4
	stb	%o3, [%o0-2]
	ldub	[%o1-1], %o3
	bgu,pt	%ncc, .smallnotalign4	! loop til 3 or fewer bytes remain
	stb	%o3, [%o0-1]
	add	%o2, 3, %o2	! restore count
.smallleft:
	tst	%o2
	bz,pt	%ncc, .smallexit
	nop
.smallleft3:			! 1, 2, or 3 bytes remain
	ldub	[%o1], %o3	! load one byte
	deccc	%o2		! reduce count for cc test
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0]	! store one byte
	ldub	[%o1+1], %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+1]	! store second byte
	ldub	[%o1+2], %o3	! load third byte
	stb	%o3, [%o0+2]	! store third byte
	retl
	mov	%g1, %o0	! restore %o0

	.align	16
	nop			! affects loop icache alignment
.smallwords:
	lduw	[%o1], %o3	! read word
.smallwordx:
	subcc	%o2, 8, %o2	! update count
	stw	%o3, [%o0]	! write word
	add	%o1, 8, %o1	! update SRC
	lduw	[%o1-4], %o3	! read word
	add	%o0, 8, %o0	! update DST
	bgu,pt	%ncc, .smallwords	! loop until done
	stw	%o3, [%o0-4]	! write word
	addcc	%o2, 7, %o2	! restore count
	bz,pt	%ncc, .smallexit	! check for completion
	nop
	cmp	%o2, 4		! check for 4 or more bytes left
	blt	.smallleft3	! if not, go to finish up
	nop
	lduw	[%o1], %o3
	add	%o1, 4, %o1
	subcc	%o2, 4, %o2
	stw	%o3, [%o0]
	add	%o0, 4, %o0
	bnz,pt	%ncc, .smallleft3
	nop
	retl
	mov	%g1, %o0	! restore %o0

.smallword:
	subcc	%o2, 4, %o2	! update count
	bgu,pt	%ncc, .smallwordx
	lduw	[%o1], %o3	! read word
	addcc	%o2, 3, %o2	! restore count
	bz,pt	%ncc, .smallexit
	stw	%o3, [%o0]	! write word
	deccc	%o2		! reduce count for cc test
	ldub	[%o1+4], %o3	! load one byte
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+4]	! store one byte
	ldub	[%o1+5], %o3	! load second byte
	deccc	%o2
	bz,pt	%ncc, .smallexit
	stb	%o3, [%o0+5]	! store second byte
	ldub	[%o1+6], %o3	! load third byte
	stb	%o3, [%o0+6]	! store third byte
.smallexit:
	retl
	mov	%g1, %o0	! restore %o0
	.align 16
.medium:
	neg	%o0, %o5
	neg	%o1, %o3
	andcc	%o5, 7, %o5	! bytes till DST 8 byte aligned
	and	%o3, 7, %o3	! bytes till SRC 8 byte aligned

	bz	%ncc, 2f
	sub	%o5, %o3, %o3	! -(bytes till SRC aligned after DST aligned)
				! o3={-7, -6, ... 7} o3>0 => SRC overaligned

	sub	%o2, %o5, %o2	! update count

1:
	ldub	[%o1], %o4
	deccc	%o5
	inc	%o1
	stb	%o4, [%o0]
	bgu,pt	%ncc, 1b
	inc	%o0

	! Now DST is 8-byte aligned.  o0, o1, o2 are current.

2:
	andcc	%o1, 0x3, %g0	! test alignment
	bnz,pt	%ncc, .mediumsetup	! branch to skip aligned cases
				! if src, dst not aligned
	prefetch [%o1 + (1 * BLOCK_SIZE)], 20

/*
 * Handle all cases where src and dest are aligned on word
 * or long word boundaries.  Use unrolled loops for better
 * performance.  This option wins over standard large data
 * move when source and destination is in cache for medium
 * to short data moves.
 */
	andcc	%o1, 0x7, %g0	! test word alignment
	bz,pt	%ncc, .medlword	! branch to long word aligned case
	prefetch [%o1 + (2 * BLOCK_SIZE)], 20
	cmp	%o2, MED_WMAX	! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 15, %o2	! adjust length to allow cc test
				! for end of loop
	ble,pt	%ncc, .medw15	! skip big loop if less than 16
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medw16:
	ld	[%o1], %o4	! load
	subcc	%o2, 16, %o2	! decrement length count
	stw	%o4, [%o0]	! and store
	ld	[%o1+4], %o3	! a block of 16 bytes
	add	%o1, 16, %o1	! increase src ptr by 16
	stw	%o3, [%o0+4]
	ld	[%o1-8], %o4
	add	%o0, 16, %o0	! increase dst ptr by 16
	stw	%o4, [%o0-8]
	ld	[%o1-4], %o3
	bgu,pt	%ncc, .medw16	! repeat if at least 16 bytes left
	stw	%o3, [%o0-4]
.medw15:
	addcc	%o2, 15, %o2	! restore count
	bz,pt	%ncc, .medwexit	! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7	! skip if 7 or fewer bytes left
	nop			!
	ld	[%o1], %o4	! load 4 bytes
	subcc	%o2, 8, %o2	! decrease count by 8
	stw	%o4, [%o0]	! and store 4 bytes
	add	%o1, 8, %o1	! increase src ptr by 8
	ld	[%o1-4], %o3	! load 4 bytes
	add	%o0, 8, %o0	! increase dst ptr by 8
	stw	%o3, [%o0-4]	! and store 4 bytes
	bz	%ncc, .medwexit	! exit if finished
	nop
.medw7:				! count is ge 1, less than 8
	cmp	%o2, 3		! check for 4 bytes left
	ble,pt	%ncc, .medw3	! skip if 3 or fewer bytes left
	nop			!
	ld	[%o1], %o4	! load 4 bytes
	sub	%o2, 4, %o2	! decrease count by 4
	add	%o1, 4, %o1	! increase src ptr by 4
	stw	%o4, [%o0]	! and store 4 bytes
	add	%o0, 4, %o0	! increase dst ptr by 4
	tst	%o2		! check for zero bytes left
	bz	%ncc, .medwexit	! exit if finished
	nop
.medw3:				! count is known to be 1, 2, or 3
	deccc	%o2		! reduce count by one
	ldub	[%o1], %o3	! load one byte
	bz,pt	%ncc, .medwexit	! exit if last byte
	stb	%o3, [%o0]	! store one byte
	ldub	[%o1+1], %o3	! load second byte
	deccc	%o2		! reduce count by one
	bz,pt	%ncc, .medwexit	! exit if last byte
	stb	%o3, [%o0+1]	! store second byte
	ldub	[%o1+2], %o3	! load third byte
	stb	%o3, [%o0+2]	! store third byte
.medwexit:
	retl
	mov	%g1, %o0	! restore %o0

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is between SMALL_MAX and MED_MAX bytes
 */

	.align 16
	nop
.medlword:			! long word aligned
				! length > SMALL_MAX
	cmp	%o2, MED_MAX	! limit to store buffer size
	bgu,pt	%ncc, .mediumrejoin	! otherwise rejoin main loop
	nop
	subcc	%o2, 31, %o2	! adjust length to allow cc test
				! for end of loop
	ble,pt	%ncc, .medl31	! skip big loop if less than 32
	prefetch [%o1 + (3 * BLOCK_SIZE)], 20	! into the l2 cache
/*
 * no need to put prefetch in loop as prefetches have
 * already been issued for maximum loop size
 */
.medl32:
	ldx	[%o1], %o4	! load
	subcc	%o2, 32, %o2	! decrement length count
	stx	%o4, [%o0]	! and store
	ldx	[%o1+8], %o3	! a block of 32 bytes
	add	%o1, 32, %o1	! increase src ptr by 32
	stx	%o3, [%o0+8]
	ldx	[%o1-16], %o4
	add	%o0, 32, %o0	! increase dst ptr by 32
	stx	%o4, [%o0-16]
	ldx	[%o1-8], %o3
	bgu,pt	%ncc, .medl32	! repeat if at least 32 bytes left
	stx	%o3, [%o0-8]
.medl31:
	addcc	%o2, 16, %o2	! adjust remaining count
	ble,pt	%ncc, .medl15	! skip if 15 or fewer bytes left
	nop			!
	ldx	[%o1], %o4	! load and store 16 bytes
	add	%o1, 16, %o1	! increase src ptr by 16
	stx	%o4, [%o0]	!
	sub	%o2, 16, %o2	! decrease count by 16
	ldx	[%o1-8], %o3	!
	add	%o0, 16, %o0	! increase dst ptr by 16
	stx	%o3, [%o0-8]
.medl15:
	addcc	%o2, 15, %o2	! restore count
	bz,pt	%ncc, .medwexit	! exit if finished
	nop
	cmp	%o2, 8
	blt,pt	%ncc, .medw7	! skip if 7 or fewer bytes left
	nop
	ldx	[%o1], %o4	! load 8 bytes
	add	%o1, 8, %o1	! increase src ptr by 8
	stx	%o4, [%o0]	! and store 8 bytes
	subcc	%o2, 8, %o2	! decrease count by 8
	bz	%ncc, .medwexit	! exit if finished
	add	%o0, 8, %o0	! increase dst ptr by 8
	ba	.medw7
	nop

	.align 16
	nop
	nop
	nop
.mediumsetup:
	prefetch [%o1 + (2 * BLOCK_SIZE)], 21
.mediumrejoin:
	rd	%fprs, %o4	! check for unused FPU

	add	%o1, 8, %o1	! prepare to round SRC upward

	sethi	%hi(0x1234567f), %o5	! For GSR.MASK
	or	%o5, 0x67f, %o5

	andcc	%o4, FPRS_FEF, %o4	! test FEF, fprs.du = fprs.dl = 0
	bz,a	%ncc, 3f
	wr	%g0, FPRS_FEF, %fprs	! fprs.fef = 1
3:
	cmp	%o2, MEDIUM_MAX
	bmask	%o5, %g0, %g0

	! Compute o5 (number of bytes that need copying using the main loop).
	! First, compute for the medium case.
	! Then, if large case, o5 is replaced by count for block alignment.
	! Be careful not to read past end of SRC
	! Currently, o2 is the actual count remaining
	!            o3 is how much sooner we'll cross the alignment boundary
	!                in SRC compared to in DST
	!
	! Examples:  Let # denote bytes that should not be accessed
	!            Let x denote a byte already copied to align DST
	!            Let . and - denote bytes not yet copied
	!            Let | denote double alignment boundaries
	!
	!            DST:  ######xx|........|--------|..######   o2 = 18
	!                          o0
	!
	!  o3 = -3:  SRC:  ###xx...|.....---|-----..#|########   o5 = 8
	!                          o1
	!
	!  o3 =  0:  SRC:  ######xx|........|--------|..######   o5 = 16-8 = 8
	!                          o1
	!
	!  o3 = +1:  SRC:  #######x|x.......|.-------|-..#####   o5 = 16-8 = 8
	!                          o1

	or	%g0, -8, %o5
	alignaddr %o1, %g0, %o1	! set GSR.ALIGN and align o1

	movrlz	%o3, %g0, %o5	! subtract 8 from o2+o3 only if o3>=0
	add	%o5, %o2, %o5
	add	%o5, %o3, %o5

	bleu	%ncc, 4f
	andn	%o5, 7, %o5	! 8 byte aligned count
	neg	%o0, %o5	! 'large' case
	and	%o5, BLOCK_SIZE-1, %o5	! bytes till DST block aligned
4:
	brgez,a	%o3, .beginmedloop
	ldd	[%o1-8], %d0

	add	%o1, %o3, %o1	! back up o1
5:
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	andcc	%o1, 7, %g0
	bnz	%ncc, 5b
	bshuffle %d0, %d2, %d0	! shifts d0 left 1 byte and or's in d2

.beginmedloop:
	tst	%o5
	bz	%ncc, .endmedloop
	sub	%o2, %o5, %o2	! update count for later

	! Main loop to write out doubles.  Note: o5 & 7 == 0

	ldd	[%o1], %d2
	subcc	%o5, 8, %o5	! update local count
	bz,pn	%ncc, 1f
	add	%o1, 8, %o1	! update SRC

.medloop:
	faligndata %d0, %d2, %d4
	ldd	[%o1], %d0
	subcc	%o5, 8, %o5	! update local count
	add	%o1, 16, %o1	! update SRC
	std	%d4, [%o0]
	bz,pn	%ncc, 2f
	faligndata %d2, %d0, %d6
	ldd	[%o1 - 8], %d2
	subcc	%o5, 8, %o5	! update local count
	std	%d6, [%o0 + 8]
	bnz,pt	%ncc, .medloop
	add	%o0, 16, %o0	! update DST

1:
	faligndata %d0, %d2, %d4
	fmovd	%d2, %d0
	std	%d4, [%o0]
	ba	.endmedloop
	add	%o0, 8, %o0

2:
	std	%d6, [%o0 + 8]
	sub	%o1, 8, %o1
	add	%o0, 16, %o0


.endmedloop:
	! Currently, o1 is pointing to the next double-aligned byte in SRC
	! The 8 bytes starting at [o1-8] are available in d0
	! At least one, and possibly all, of these need to be written.

	cmp	%o2, BLOCK_SIZE
	bgu	%ncc, .large	! otherwise, less than 16 bytes left
				! NOTE(review): stale comment? at most
				! BLOCK_SIZE (64) bytes remain here — confirm

#if 0

	/* This code will use partial stores. */

	mov	%g0, %o5
	and	%o3, 7, %o3	! Number of bytes needed to completely
				! fill %d0 with good (unwritten) data.

	subcc	%o2, 8, %o2	! update count (maybe too much)
	movl	%ncc, %o2, %o5
	addcc	%o3, %o5, %o5	! extra bytes we can stuff into %d0
	sub	%o3, %o5, %o3	! update o3 (# bad bytes in %d0)

	bz	%ncc, 2f
	alignaddr %o3, %g0, %g0	! set GSR.ALIGN

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0	! shifts d0 left 1 byte and or's in d2

2:
	not	%o3
	faligndata %d0, %d0, %d0	! shift bytes to the left
	and	%o3, 7, %o3	! last byte to be stored in [%o0+%o3]
	edge8n	%g0, %o3, %o5
	stda	%d0, [%o0]%o5, ASI_PST8_P
	brlez	%o2, .mediumexit
	add	%o0, %o3, %o0	! update DST to last stored byte
3:
	inc	%o0
	deccc	%o2
	ldub	[%o1], %o3
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o1

#else

	andcc	%o3, 7, %o5	! Number of bytes needed to completely
				! fill %d0 with good (unwritten) data.
	bz	%ncc, 2f
	sub	%o5, 8, %o3	! -(number of good bytes in %d0)
	cmp	%o2, 8
	bl,a	%ncc, 3f	! Not enough bytes to fill %d0
	add	%o1, %o3, %o1	! Back up %o1

1:
	deccc	%o5
	ldda	[%o1]ASI_FL8_P, %d2
	inc	%o1
	bgu	%ncc, 1b
	bshuffle %d0, %d2, %d0	! shifts d0 left 1 byte and or's in d2

2:
	subcc	%o2, 8, %o2
	std	%d0, [%o0]
	bz	%ncc, .mediumexit
	add	%o0, 8, %o0
3:
	ldub	[%o1], %o3
	deccc	%o2
	inc	%o1
	stb	%o3, [%o0]
	bgu	%ncc, 3b
	inc	%o0
#endif

.mediumexit:
	wr	%o4, %g0, %fprs	! fprs = o4 restore fprs
	retl
	mov	%g1, %o0


	.align ICACHE_LINE_SIZE
.large:
	! The following test for BSTORE_SIZE is used to decide whether
	! to store data with a block store or with individual stores.
	! The block store wins when the amount of data is so large
	! that it is causes other application data to be moved out
	! of the L1 or L2 cache.
	! On a Panther, block store can lose more often because block
	! store forces the stored data to be removed from the L3 cache.
	!
	sethi	%hi(BSTORE_SIZE),%o5
	or	%o5,%lo(BSTORE_SIZE),%o5
	cmp	%o2, %o5
	bgu	%ncc, .xlarge

	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I  Not written.  If zero, then SRC is double aligned.
	! %o4 I  Not written.  Holds fprs.
	! %o5 O  The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	prefetch [%o0 + (0 * BLOCK_SIZE)], 22
	prefetch [%o0 + (1 * BLOCK_SIZE)], 22
	prefetch [%o0 + (2 * BLOCK_SIZE)], 22
	ldd	[%o1], %f2
	prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5	! if %o3 >= 0, %o5 = -8
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5	! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	std	%f32, [%o0]
	std	%f34, [%o0+8]
	std	%f36, [%o0+16]
	std	%f38, [%o0+24]
	std	%f40, [%o0+32]
	std	%f42, [%o0+40]
	std	%f44, [%o0+48]
	std	%f46, [%o0+56]
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o0 + (6 * BLOCK_SIZE)], 22
	prefetch [%o0 + (3 * BLOCK_SIZE)], 22
	add	%o0, BLOCK_SIZE, %o0	! update DST
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	cmp	%o2, BLOCK_SIZE + 8
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC
	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f	! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f		! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5	! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5	! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0


	.align 16
	! two nops here causes loop starting at 1f below to be
	! on a cache line boundary, improving performance
	nop
	nop
.xlarge:
	! %o0 I/O DST is 64-byte aligned
	! %o1 I/O 8-byte aligned (and we've set GSR.ALIGN)
	! %d0 I/O already loaded with SRC data from [%o1-8]
	! %o2 I/O count (number of bytes that need to be written)
	! %o3 I  Not written.  If zero, then SRC is double aligned.
	! %o4 I  Not written.  Holds fprs.
	! %o5 O  The number of doubles that remain to be written.

	! Load the rest of the current block
	! Recall that %o1 is further into SRC than %o0 is into DST

	! prefetch [%o1 + (3 * BLOCK_SIZE)], 21
	! executed in delay slot for branch to .xlarge
	prefetch [%o1 + (4 * BLOCK_SIZE)], 21
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	ldd	[%o1], %f2
	prefetch [%o1 + (6 * BLOCK_SIZE)], 21
	ldd	[%o1 + 0x8], %f4
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x10], %f6
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x18], %f8
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x20], %f10
	or	%g0, -8, %o5	! if %o3 >= 0, %o5 = -8
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x28], %f12
	movrlz	%o3, %g0, %o5	! if %o3 < 0, %o5 = 0 (needed later)
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x30], %f14
	faligndata %f10, %f12, %f42
	ldd	[%o1 + 0x38], %f0
	sub	%o2, BLOCK_SIZE, %o2	! update count
	prefetch [%o1 + (7 * BLOCK_SIZE)], 21
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	! This point is 32-byte aligned since 24 instructions appear since
	! the previous alignment directive.


	! Main loop.  Write previous block.  Load rest of current block.
	! Some bytes will be loaded that won't yet be written.
1:
	ldd	[%o1], %f2
	faligndata %f12, %f14, %f44
	ldd	[%o1 + 0x8], %f4
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P
	sub	%o2, BLOCK_SIZE, %o2	! update count
	ldd	[%o1 + 0x10], %f6
	faligndata %f0, %f2, %f32
	ldd	[%o1 + 0x18], %f8
	faligndata %f2, %f4, %f34
	ldd	[%o1 + 0x20], %f10
	faligndata %f4, %f6, %f36
	ldd	[%o1 + 0x28], %f12
	faligndata %f6, %f8, %f38
	ldd	[%o1 + 0x30], %f14
	faligndata %f8, %f10, %f40
	ldd	[%o1 + 0x38], %f0
	faligndata %f10, %f12, %f42
	! offset of 8*BLK+8 bytes works best over range of (src-dst) mod 1K
	prefetch [%o1 + (8 * BLOCK_SIZE) + 8], 21
	add	%o0, BLOCK_SIZE, %o0	! update DST
	cmp	%o2, BLOCK_SIZE + 8
	! second prefetch important to correct for occasional dropped
	! initial prefetches, 5*BLK works best over range of (src-dst) mod 1K
	! strong prefetch prevents drops on Panther, but Jaguar and earlier
	! US-III models treat strong prefetches as weak prefetchs
	! to avoid regressions on customer hardware, we retain the prefetch
	prefetch [%o1 + (5 * BLOCK_SIZE)], 21
	bgu,pt	%ncc, 1b
	add	%o1, BLOCK_SIZE, %o1	! update SRC

	faligndata %f12, %f14, %f44
	faligndata %f14, %f0, %f46
	stda	%f32, [%o0]ASI_BLK_P	! store 64 bytes, bypass cache
	cmp	%o2, BLOCK_SIZE
	bne	%ncc, 2f	! exactly 1 block remaining?
	add	%o0, BLOCK_SIZE, %o0	! update DST
	brz,a	%o3, 3f		! is SRC double aligned?
	ldd	[%o1], %f2

2:
	add	%o5, %o2, %o5	! %o5 was already set to 0 or -8
	add	%o5, %o3, %o5

	membar	#StoreLoad|#StoreStore

	ba	.beginmedloop
	andn	%o5, 7, %o5	! 8 byte aligned count


	! This is when there is exactly 1 block remaining and SRC is aligned
3:
	ldd	[%o1 + 0x8], %f4
	ldd	[%o1 + 0x10], %f6
	fsrc1	%f0, %f32
	ldd	[%o1 + 0x18], %f8
	fsrc1	%f2, %f34
	ldd	[%o1 + 0x20], %f10
	fsrc1	%f4, %f36
	ldd	[%o1 + 0x28], %f12
	fsrc1	%f6, %f38
	ldd	[%o1 + 0x30], %f14
	fsrc1	%f8, %f40
	fsrc1	%f10, %f42
	fsrc1	%f12, %f44
	fsrc1	%f14, %f46
	stda	%f32, [%o0]ASI_BLK_P
	membar	#StoreLoad|#StoreStore
	wr	%o4, 0, %fprs
	retl
	mov	%g1, %o0

	SET_SIZE(memcpy)