/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the
 * following conditions are met:
 *
 * + Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer.
 *
 * + Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 *
 * + Neither the name of Advanced Micro Devices, Inc. nor the
 *   names of its contributors may be used to endorse or
 *   promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL ADVANCED MICRO DEVICES,
 * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is licensee's responsibility to comply with any export
 * regulations applicable in licensee's jurisdiction.
 */
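/*
 * Overview: memmove() checks whether the ranges overlap and, when the
 * destination overlaps the tail of the source, copies backwards
 * (right to left, .CopyLeft); every other case falls through to the
 * ascending memcpy() path (.CopyRight).
 *
 * The ascending path picks a strategy by copy length; the thresholds
 * are visible in the code below:
 *	< 16 bytes	discrete 1-, 2-, 4- and 8-byte moves
 *	< 32 bytes	8-byte loop plus a byte tail
 *	medium		32-byte unrolled loop, then (after aligning the
 *			destination to 8 bytes) rep movsq, capped at
 *			.amd64cache1half (by its name, half the L1 data
 *			cache size, published via cache.h)
 *	large		64-byte loop with software prefetch, capped at
 *			.amd64cache2half (half the L2 cache size)
 *	huge		128 bytes per iteration using movnti non-temporal
 *			stores, completed by an mfence
 */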
	.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "SYS.h"
#include "cache.h"

	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)

#define LABEL(s) .memcpy/**/s

	ENTRY(memmove)		/* (void *s1, void *s2, size_t n) */
	cmpq	%rsi,%rdi	/ if (source addr > dest addr)
	leaq	-1(%rsi,%rdx),%r9
	jle	.CopyRight	/
	cmpq	%r9,%rdi
	jle	.CopyLeft
	jmp	.CopyRight

	ENTRY(memcpy)		/* (void *, const void*, size_t) */

.CopyRight:
LABEL(1try):
	cmp	$16, %rdx
	mov	%rdi, %rax
	jae	LABEL(1after)

	.p2align 4

LABEL(1):			/* 1-byte */
	test	$1, %dl
	jz	LABEL(1a)

	mov	(%rsi), %cl
	mov	%cl, (%rdi)

	dec	%dl
	lea	1 (%rsi), %rsi
	lea	1 (%rdi), %rdi
	jz	LABEL(exit)

	.p2align 4,, 4

LABEL(1a):
	test	$2, %dl
	jz	LABEL(1b)

	mov	(%rsi), %cx
	mov	%cx, (%rdi)

	sub	$2, %dl
	lea	2 (%rsi), %rsi
	lea	2 (%rdi), %rdi
	jz	LABEL(exit)

	.p2align 4,, 4

LABEL(1b):
	test	$4, %dl
	jz	LABEL(1c)

	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)

/*	sub	$4, %dl */
	lea	4 (%rsi), %rsi
	lea	4 (%rdi), %rdi
/*	jz	LABEL(exit) */

	.p2align 4,, 4

LABEL(1c):
	test	$8, %dl
	jz	LABEL(1d)

	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)

/*	sub	$8, %dl */
/*	lea	8 (%rsi), %rsi */
/*	lea	8 (%rdi), %rdi */
/*	jz	LABEL(exit) */

	.p2align 4

LABEL(1d):

LABEL(exit):
	rep
	ret

	.p2align 4

LABEL(1after):
	push	%rax

LABEL(8try):
	cmp	$32, %rdx
	jae	LABEL(8after)

LABEL(8):			/* 8-byte */
	mov	%edx, %ecx
	shr	$3, %ecx
	jz	LABEL(8skip)

	.p2align 4

LABEL(8loop):
	dec	%ecx

	mov	(%rsi), %rax
	mov	%rax, (%rdi)

	lea	8 (%rsi), %rsi
	lea	8 (%rdi), %rdi

	jnz	LABEL(8loop)

LABEL(8skip):
	and	$7, %edx
	pop	%rax
	jnz	LABEL(1)

	rep
	ret

	.p2align 4

LABEL(8after):
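	/*
	 * Cutover for leaving the simple 32-byte loop, per the two
	 * constants just below: 512 bytes for unaligned operands,
	 * 4096 bytes when the source or the destination is already
	 * 8-byte aligned.
	 */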
LABEL(32try):
	mov	$512, %r8d	/* size for unaligned data */
	mov	$4096, %r9d	/* size for aligned data */
	test	$7, %esi	/* check if either source.. */
	cmovz	%r9, %r8
	test	$7, %edi	/* .. or destination is aligned */
	cmovz	%r9, %r8

	cmp	%r8, %rdx
	ja	LABEL(32after)

LABEL(32):			/* 32-byte */
	mov	%edx, %ecx
	shr	$5, %ecx
	jz	LABEL(32skip)

	.p2align 4

LABEL(32loop):
	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	lea	32 (%rsi), %rsi
	lea	32 (%rdi), %rdi

	jz	LABEL(32skip)

	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	lea	32 (%rsi), %rsi
	lea	32 (%rdi), %rdi

	jnz	LABEL(32loop)

	.p2align 4

LABEL(32skip):
	and	$31, %edx
	jnz	LABEL(8)

	pop	%rax
	ret

	.p2align 4

LABEL(32after):

	/* 3DNow: use prefetch */
	prefetchnta	_sref_(.amd64cache1)	/* improves test further ahead on B0 */

LABEL(aligntry):
	mov	%edi, %r8d	/* align by destination */

	and	$7, %r8d
	jz	LABEL(alignafter)	/* not unaligned */

LABEL(align):			/* align */
	lea	-8 (%r8, %rdx), %rdx
	sub	$8, %r8d

	.p2align 4

LABEL(alignloop):
	inc	%r8d

	mov	(%rsi), %al
	mov	%al, (%rdi)

	lea	1 (%rsi), %rsi
	lea	1 (%rdi), %rdi

	jnz	LABEL(alignloop)

	.p2align 4

LABEL(alignafter):
	mov	_sref_(.amd64cache1half), %r11
	cmp	%rdx, %r11
	cmova	%rdx, %r11

LABEL(fast):
	mov	%r11, %rcx
	and	$-8, %r11
	shr	$3, %rcx
/*	jz	LABEL(fastskip) */

	rep			/* good ol' MOVS */
	movsq

LABEL(fastskip):
	sub	%r11, %rdx
	test	$-8, %rdx
	jnz	LABEL(fastafterlater)

	and	$7, %edx
	pop	%rax
	jnz	LABEL(1)

	rep
	ret

	.p2align 4

LABEL(64try):
	mov	_sref_(.amd64cache1half), %r11
	cmp	%rdx, %r11
	cmova	%rdx, %r11

LABEL(64):			/* 64-byte */
	mov	%r11, %rcx
	and	$-64, %r11
	shr	$6, %rcx
	jz	LABEL(64skip)

	.p2align 4

LABEL(64loop):
	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10

	mov	%rax, 32 (%rdi)
	mov	%r8, 40 (%rdi)
	mov	%r9, 48 (%rdi)
	mov	%r10, 56 (%rdi)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jz	LABEL(64skip)

	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10

	mov	%rax, 32 (%rdi)
	mov	%r8, 40 (%rdi)
	mov	%r9, 48 (%rdi)
	mov	%r10, 56 (%rdi)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jnz	LABEL(64loop)

	.p2align 4

LABEL(64skip):
	sub	%r11, %rdx
	test	$-64, %rdx
	jnz	LABEL(64after)

	and	$63, %edx
	jnz	LABEL(32)

	pop	%rax
	ret

	.p2align 4

LABEL(64after):

LABEL(fastafterlater):

LABEL(pretry):
	mov	_sref_(.amd64cache2half), %r8
	cmp	%rdx, %r8
	cmova	%rdx, %r8

LABEL(pre):			/* 64-byte prefetching */
	mov	%r8, %rcx
	and	$-64, %r8
	shr	$6, %rcx
	jz	LABEL(preskip)
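	/*
	 * 64 bytes per iteration, unrolled twice, prefetching the
	 * source (and, in the second half, the destination) 896
	 * bytes ahead.  %rbx and %r12-%r14 are call-saved, hence
	 * the pushes around the loop.
	 */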
	push	%r14
	push	%r13
	push	%r12
	push	%rbx

	.p2align 4

LABEL(preloop):
	dec	%rcx

	mov	(%rsi), %rax
	mov	8 (%rsi), %rbx
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	mov	32 (%rsi), %r11
	mov	40 (%rsi), %r12
	mov	48 (%rsi), %r13
	mov	56 (%rsi), %r14

	prefetchnta 0 + 896 (%rsi)	/* 3DNow: use prefetch */
	prefetchnta 64 + 896 (%rsi)	/* 3DNow: use prefetch */

	mov	%rax, (%rdi)
	mov	%rbx, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)
	mov	%r11, 32 (%rdi)
	mov	%r12, 40 (%rdi)
	mov	%r13, 48 (%rdi)
	mov	%r14, 56 (%rdi)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jz	LABEL(preskipa)

	dec	%rcx

	mov	(%rsi), %rax
	mov	8 (%rsi), %rbx
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	mov	32 (%rsi), %r11
	mov	40 (%rsi), %r12
	mov	48 (%rsi), %r13
	mov	56 (%rsi), %r14

	mov	%rax, (%rdi)
	mov	%rbx, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)
	mov	%r11, 32 (%rdi)
	mov	%r12, 40 (%rdi)
	mov	%r13, 48 (%rdi)
	mov	%r14, 56 (%rdi)

	prefetchnta -64 + 896 (%rdi)	/* 3DNow: use prefetchw */
	prefetchnta 0 + 896 (%rdi)	/* 3DNow: use prefetchw */

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jnz	LABEL(preloop)

LABEL(preskipa):
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14


LABEL(preskip):
	sub	%r8, %rdx
	test	$-64, %rdx
	jnz	LABEL(preafter)

	and	$63, %edx
	jnz	LABEL(32)

	pop	%rax
	ret

	.p2align 4

LABEL(preafter):

LABEL(NTtry):

LABEL(NT):			/* NT 64-byte */
	mov	%rdx, %rcx
	shr	$7, %rcx
	jz	LABEL(NTskip)

	push	%r14
	push	%r13
	push	%r12

	.p2align 4

LABEL(NTloop):
	prefetchnta 768 (%rsi)	/* prefetching NT here is not so good on B0 and C0 MP systems */
	prefetchnta 832 (%rsi)

	dec	%rcx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	mov	32 (%rsi), %r11
	mov	40 (%rsi), %r12
	mov	48 (%rsi), %r13
	mov	56 (%rsi), %r14

	movnti	%rax, (%rdi)
	movnti	%r8, 8 (%rdi)
	movnti	%r9, 16 (%rdi)
	movnti	%r10, 24 (%rdi)
	movnti	%r11, 32 (%rdi)
	movnti	%r12, 40 (%rdi)
	movnti	%r13, 48 (%rdi)
	movnti	%r14, 56 (%rdi)

	mov	64 (%rsi), %rax
	mov	72 (%rsi), %r8
	mov	80 (%rsi), %r9
	mov	88 (%rsi), %r10
	mov	96 (%rsi), %r11
	mov	104 (%rsi), %r12
	mov	112 (%rsi), %r13
	mov	120 (%rsi), %r14

	movnti	%rax, 64 (%rdi)
	movnti	%r8, 72 (%rdi)
	movnti	%r9, 80 (%rdi)
	movnti	%r10, 88 (%rdi)
	movnti	%r11, 96 (%rdi)
	movnti	%r12, 104 (%rdi)
	movnti	%r13, 112 (%rdi)
	movnti	%r14, 120 (%rdi)

	lea	128 (%rsi), %rsi
	lea	128 (%rdi), %rdi

	jnz	LABEL(NTloop)

	mfence

	pop	%r12
	pop	%r13
	pop	%r14

LABEL(NTskip):
	and	$127, %edx
	jnz	LABEL(32)

	pop	%rax
	ret

	SET_SIZE(memcpy)	/* (void *, const void*, size_t) */
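/*
 * Descending copy for overlapping memmove(): the direction flag is
 * set (std) so the string moves run right to left; smovb/smovq are
 * this assembler's spellings of movsb/movsq.  %r9 still holds
 * src + size - 1 from the memmove() entry sequence above.
 */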
.CopyLeft:
	movq	%rdi,%rax		/ set up return value
	movq	$7,%r8			/ heavily used constant
	movq	%rdx,%rcx		/ put len into %rcx for rep
	std				/ reverse direction bit (RtoL)
	cmpq	$24,%rcx		/ if (size < 24)
	ja	.BigCopyLeft		/ {
	movq	%r9,%rsi		/     src = src + size - 1
	leaq	-1(%rcx,%rdi),%rdi	/     dst = dst + size - 1
	rep;	smovb			/     do the byte copy
	cld				/     reset direction flag to LtoR
	ret				/     return(dba);
.BigCopyLeft:				/ } else {
	xchgq	%r9,%rcx
	movq	%rcx,%rsi		/     align source w/byte copy
	leaq	-1(%r9,%rdi),%rdi
	andq	%r8,%rcx
	jz	.SkipAlignLeft
	addq	$1, %rcx		/     we need to ensure that future
	subq	%rcx,%r9		/     copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movq	%r9,%rcx
	subq	%r8,%rsi
	shrq	$3,%rcx			/     do 8 byte copy RtoL
	subq	%r8,%rdi
	rep;	smovq
	andq	%r8,%r9			/     do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movq	%r9,%rcx
	addq	%r8,%rsi		/     rep; smovq instruction will decrement
	addq	%r8,%rdi		/     %rdi, %rsi by eight after each copy
					/     adding 7 will restore pointers to byte
					/     before last quad word copied
					/     which is where they are expected to
					/     be for the single byte copy code
	rep;	smovb
.CleanupReturnLeft:
	cld				/     reset direction flag to LtoR
	ret				/     return(dba);
	SET_SIZE(memmove)