/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2002 Advanced Micro Devices, Inc.
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the
 * following conditions are met:
 *
 * + Redistributions of source code must retain the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer.
 *
 * + Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the
 *   following disclaimer in the documentation and/or other
 *   materials provided with the distribution.
 *
 * + Neither the name of Advanced Micro Devices, Inc. nor the
 *   names of its contributors may be used to endorse or
 *   promote products derived from this software without
 *   specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS AS IS AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL ADVANCED MICRO DEVICES,
 * INC. OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
 * GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * It is licensee's responsibility to comply with any export
 * regulations applicable in licensee's jurisdiction.
 */
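
/*
 * Overview of the forward (left-to-right) copy strategy used by
 * memcpy and by non-overlapping memmove below, in rough terms:
 *
 *	n < 16			1/2/4/8-byte tail moves
 *	n < 32			8-byte loop
 *	n <= 512 (or 4096)	32-byte unrolled loop (the larger
 *				cutoff applies when the source or the
 *				destination is 8-byte aligned)
 *	n <= L1-cache/2		align destination, then rep movsq
 *	n <= L2-cache/2		64-byte loop with prefetchnta
 *	larger			128-byte loop with non-temporal
 *				movnti stores, finished by mfence
 *
 * The cache-size cutoffs are read via _sref_ from the
 * .amd64cache1half and .amd64cache2half variables.  Remainders at
 * each level drop down to a smaller size class.
 */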
	.ident	"%Z%%M%	%I%	%E% SMI"

	.file	"%M%"

#include <sys/asm_linkage.h>

	ANSI_PRAGMA_WEAK(memmove,function)
	ANSI_PRAGMA_WEAK(memcpy,function)

#include "SYS.h"
#include "cache.h"

	ANSI_PRAGMA_WEAK2(_private_memcpy,memcpy,function)

#define LABEL(s) .memcpy/**/s

	ENTRY(memmove)		/* (void *s1, void *s2, size_t n) */
	cmpq	%rsi,%rdi	/ if (source addr > dest addr)
	leaq	-1(%rsi,%rdx),%r9	/ %r9 = src + size - 1
	jle	.CopyRight	/ dest <= src: safe to copy forward
	cmpq	%r9,%rdi	/ if (dest starts within source)
	jle	.CopyLeft	/ overlap: copy backward
	jmp	.CopyRight	/ disjoint: copy forward

	ENTRY(memcpy)	/* (void *, const void*, size_t) */

.CopyRight:
LABEL(1try):
	cmp	$16, %rdx
	mov	%rdi, %rax
	jae	LABEL(1after)

	.p2align 4

LABEL(1):	/* 1-byte */
	test	$1, %dl
	jz	LABEL(1a)

	mov	(%rsi), %cl
	mov	%cl, (%rdi)

	dec	%dl
	lea	1 (%rsi), %rsi
	lea	1 (%rdi), %rdi
	jz	LABEL(exit)

	.p2align 4,, 4

LABEL(1a):
	test	$2, %dl
	jz	LABEL(1b)

	mov	(%rsi), %cx
	mov	%cx, (%rdi)

	sub	$2, %dl
	lea	2 (%rsi), %rsi
	lea	2 (%rdi), %rdi
	jz	LABEL(exit)

	.p2align 4,, 4

LABEL(1b):
	test	$4, %dl
	jz	LABEL(1c)

	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)

/*	sub	$4, %dl */
	lea	4 (%rsi), %rsi
	lea	4 (%rdi), %rdi
/*	jz	LABEL(exit) */

	.p2align 4,, 4

LABEL(1c):
	test	$8, %dl
	jz	LABEL(1d)

	mov	(%rsi), %rcx
	mov	%rcx, (%rdi)

/*	sub	$8, %dl */
/*	lea	8 (%rsi), %rsi */
/*	lea	8 (%rdi), %rdi */
/*	jz	LABEL(exit) */

	.p2align 4

LABEL(1d):

LABEL(exit):
	rep	/* two-byte "rep ret" (AMD branch-predictor idiom) */
	ret

	.p2align 4

LABEL(1after):
	push	%rax

LABEL(8try):
	cmp	$32, %rdx
	jae	LABEL(8after)

LABEL(8):	/* 8-byte */
	mov	%edx, %ecx
	shr	$3, %ecx
	jz	LABEL(8skip)

	.p2align 4

LABEL(8loop):
	dec	%ecx

	mov	(%rsi), %rax
	mov	%rax, (%rdi)

	lea	8 (%rsi), %rsi
	lea	8 (%rdi), %rdi

	jnz	LABEL(8loop)

LABEL(8skip):
	and	$7, %edx
	pop	%rax
	jnz	LABEL(1)

	rep
	ret

	.p2align 4

LABEL(8after):

LABEL(32try):
	mov	$512, %r8d	/* size for unaligned data */
	mov	$4096, %r9d	/* size for aligned data */
	test	$7, %esi	/* check if either source.. */
	cmovz	%r9, %r8
	test	$7, %edi	/* .. or destination is aligned */
	cmovz	%r9, %r8
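
	/*
	 * %r8 now holds the cutoff for the simple 32-byte loop:
	 * 512 bytes when both pointers are unaligned, 4096 when
	 * the source or the destination is 8-byte aligned.  Larger
	 * copies fall through to the cache-sized strategies below.
	 */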
	cmp	%r8, %rdx
	ja	LABEL(32after)

LABEL(32):	/* 32-byte */
	mov	%edx, %ecx
	shr	$5, %ecx
	jz	LABEL(32skip)

	.p2align 4

LABEL(32loop):
	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	lea	32 (%rsi), %rsi
	lea	32 (%rdi), %rdi

	jz	LABEL(32skip)

	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	lea	32 (%rsi), %rsi
	lea	32 (%rdi), %rdi

	jnz	LABEL(32loop)

	.p2align 4

LABEL(32skip):
	and	$31, %edx
	jnz	LABEL(8)

	pop	%rax
	ret

	.p2align 4

LABEL(32after):

	/* 3DNow: use prefetch */
	prefetchnta _sref_(.amd64cache1)	/* improves test further ahead on B0 */

LABEL(aligntry):
	mov	%edi, %r8d	/* align by destination */

	and	$7, %r8d
	jz	LABEL(alignafter)	/* not unaligned */

LABEL(align):	/* align */
	lea	-8 (%r8, %rdx), %rdx
	sub	$8, %r8d

	.p2align 4

LABEL(alignloop):
	inc	%r8d

	mov	(%rsi), %al
	mov	%al, (%rdi)

	lea	1 (%rsi), %rsi
	lea	1 (%rdi), %rdi

	jnz	LABEL(alignloop)

	.p2align 4

LABEL(alignafter):
	mov	_sref_(.amd64cache1half), %r11
	cmp	%rdx, %r11
	cmova	%rdx, %r11

LABEL(fast):
	mov	%r11, %rcx
	and	$-8, %r11
	shr	$3, %rcx
/*	jz	LABEL(fastskip) */

	rep	/* good ol' MOVS */
	movsq

LABEL(fastskip):
	sub	%r11, %rdx
	test	$-8, %rdx
	jnz	LABEL(fastafterlater)

	and	$7, %edx
	pop	%rax
	jnz	LABEL(1)

	rep
	ret

	.p2align 4

	/*
	 * Note: nothing branches to the 64-byte loop below; the
	 * rep movsq fast path above supersedes it, and it appears
	 * to be retained from an earlier revision.
	 */
LABEL(64try):
	mov	_sref_(.amd64cache1half), %r11
	cmp	%rdx, %r11
	cmova	%rdx, %r11

LABEL(64):	/* 64-byte */
	mov	%r11, %rcx
	and	$-64, %r11
	shr	$6, %rcx
	jz	LABEL(64skip)

	.p2align 4

LABEL(64loop):
	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10

	mov	%rax, 32 (%rdi)
	mov	%r8, 40 (%rdi)
	mov	%r9, 48 (%rdi)
	mov	%r10, 56 (%rdi)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jz	LABEL(64skip)

	dec	%ecx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10

	mov	%rax, (%rdi)
	mov	%r8, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)

	mov	32 (%rsi), %rax
	mov	40 (%rsi), %r8
	mov	48 (%rsi), %r9
	mov	56 (%rsi), %r10

	mov	%rax, 32 (%rdi)
	mov	%r8, 40 (%rdi)
	mov	%r9, 48 (%rdi)
	mov	%r10, 56 (%rdi)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jnz	LABEL(64loop)

	.p2align 4

LABEL(64skip):
	sub	%r11, %rdx
	test	$-64, %rdx
	jnz	LABEL(64after)

	and	$63, %edx
	jnz	LABEL(32)

	pop	%rax
	ret

	.p2align 4

LABEL(64after):

LABEL(fastafterlater):
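
	/*
	 * Copies too large for the rep movsq fast path land here.
	 * The loop below moves 64 bytes per iteration through the
	 * integer registers, prefetching the source about 896
	 * bytes ahead, and covers at most half the L2 cache size
	 * per the cutoff computed next; anything beyond that goes
	 * to the non-temporal path.
	 */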
LABEL(pretry):
	mov	_sref_(.amd64cache2half), %r8
	cmp	%rdx, %r8
	cmova	%rdx, %r8

LABEL(pre):	/* 64-byte prefetching */
	mov	%r8, %rcx
	and	$-64, %r8
	shr	$6, %rcx
	jz	LABEL(preskip)

	push	%r14
	push	%r13
	push	%r12
	push	%rbx

	.p2align 4

LABEL(preloop):
	dec	%rcx

	mov	(%rsi), %rax
	mov	8 (%rsi), %rbx
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	mov	32 (%rsi), %r11
	mov	40 (%rsi), %r12
	mov	48 (%rsi), %r13
	mov	56 (%rsi), %r14

	prefetchnta  0 + 896 (%rsi)	/* 3DNow: use prefetch */
	prefetchnta 64 + 896 (%rsi)	/* 3DNow: use prefetch */

	mov	%rax, (%rdi)
	mov	%rbx, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)
	mov	%r11, 32 (%rdi)
	mov	%r12, 40 (%rdi)
	mov	%r13, 48 (%rdi)
	mov	%r14, 56 (%rdi)

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jz	LABEL(preskipa)

	dec	%rcx

	mov	(%rsi), %rax
	mov	8 (%rsi), %rbx
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	mov	32 (%rsi), %r11
	mov	40 (%rsi), %r12
	mov	48 (%rsi), %r13
	mov	56 (%rsi), %r14

	mov	%rax, (%rdi)
	mov	%rbx, 8 (%rdi)
	mov	%r9, 16 (%rdi)
	mov	%r10, 24 (%rdi)
	mov	%r11, 32 (%rdi)
	mov	%r12, 40 (%rdi)
	mov	%r13, 48 (%rdi)
	mov	%r14, 56 (%rdi)

	prefetchnta -64 + 896 (%rdi)	/* 3DNow: use prefetchw */
	prefetchnta  0 + 896 (%rdi)	/* 3DNow: use prefetchw */

	lea	64 (%rsi), %rsi
	lea	64 (%rdi), %rdi

	jnz	LABEL(preloop)

LABEL(preskipa):
	pop	%rbx
	pop	%r12
	pop	%r13
	pop	%r14


LABEL(preskip):
	sub	%r8, %rdx
	test	$-64, %rdx
	jnz	LABEL(preafter)

	and	$63, %edx
	jnz	LABEL(32)

	pop	%rax
	ret

	.p2align 4

LABEL(preafter):

	/*
	 * Huge copies: movnti stores bypass the caches, presumably
	 * to avoid evicting the caller's working set; the mfence
	 * afterwards orders the non-temporal stores before return.
	 */
LABEL(NTtry):

LABEL(NT):	/* NT 64-byte */
	mov	%rdx, %rcx
	shr	$7, %rcx
	jz	LABEL(NTskip)

	push	%r14
	push	%r13
	push	%r12

	.p2align 4

LABEL(NTloop):
	prefetchnta 768 (%rsi)	/* prefetching NT here is not so good on B0 and C0 MP systems */
	prefetchnta 832 (%rsi)

	dec	%rcx

	mov	(%rsi), %rax
	mov	8 (%rsi), %r8
	mov	16 (%rsi), %r9
	mov	24 (%rsi), %r10
	mov	32 (%rsi), %r11
	mov	40 (%rsi), %r12
	mov	48 (%rsi), %r13
	mov	56 (%rsi), %r14

	movnti	%rax, (%rdi)
	movnti	%r8, 8 (%rdi)
	movnti	%r9, 16 (%rdi)
	movnti	%r10, 24 (%rdi)
	movnti	%r11, 32 (%rdi)
	movnti	%r12, 40 (%rdi)
	movnti	%r13, 48 (%rdi)
	movnti	%r14, 56 (%rdi)

	mov	64 (%rsi), %rax
	mov	72 (%rsi), %r8
	mov	80 (%rsi), %r9
	mov	88 (%rsi), %r10
	mov	96 (%rsi), %r11
	mov	104 (%rsi), %r12
	mov	112 (%rsi), %r13
	mov	120 (%rsi), %r14

	movnti	%rax, 64 (%rdi)
	movnti	%r8, 72 (%rdi)
	movnti	%r9, 80 (%rdi)
	movnti	%r10, 88 (%rdi)
	movnti	%r11, 96 (%rdi)
	movnti	%r12, 104 (%rdi)
	movnti	%r13, 112 (%rdi)
	movnti	%r14, 120 (%rdi)

	lea	128 (%rsi), %rsi
	lea	128 (%rdi), %rdi

	jnz	LABEL(NTloop)

	mfence

	pop	%r12
	pop	%r13
	pop	%r14

LABEL(NTskip):
	and	$127, %edx
	jnz	LABEL(32)

	pop	%rax
	ret

	SET_SIZE(memcpy)	/* (void *, const void*, size_t) */
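
/*
 * Backward (right-to-left) copy, used by memmove when the
 * destination starts inside the source region.  Sizes of 24
 * bytes or less are copied bytewise with the direction flag
 * set; larger copies do a bytewise head to align the source,
 * an 8-byte rep smovq body, and a bytewise tail, then clear
 * the direction flag before returning.
 */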
.CopyLeft:
	movq	%rdi,%rax		/ set up return value
	movq	$7,%r8			/ heavily used constant
	movq	%rdx,%rcx		/ put len into %rcx for rep
	std				/ reverse direction bit (RtoL)
	cmpq	$24,%rcx		/ if (size <= 24)
	ja	.BigCopyLeft		/ {
	movq	%r9,%rsi		/ src = src + size - 1
	leaq	-1(%rcx,%rdi),%rdi	/ dst = dst + size - 1
	rep;	smovb			/ do the byte copy
	cld				/ reset direction flag to LtoR
	ret				/ return(dba);
.BigCopyLeft:				/ } else {
	xchgq	%r9,%rcx
	movq	%rcx,%rsi		/ align source w/byte copy
	leaq	-1(%r9,%rdi),%rdi
	andq	%r8,%rcx
	jz	.SkipAlignLeft
	addq	$1, %rcx		/ we need to ensure that future
	subq	%rcx,%r9		/ copy is done on aligned boundary
	rep;	smovb
.SkipAlignLeft:
	movq	%r9,%rcx
	subq	%r8,%rsi
	shrq	$3,%rcx			/ do 8 byte copy RtoL
	subq	%r8,%rdi
	rep;	smovq
	andq	%r8,%r9			/ do 1 byte copy of what's left
	jz	.CleanupReturnLeft
	movq	%r9,%rcx
	addq	%r8,%rsi		/ rep; smovq above decremented
	addq	%r8,%rdi		/ %rdi, %rsi by eight after each
					/ copy; adding 7 restores the
					/ pointers to the byte just below
					/ the last quadword copied, which
					/ is where the single byte copy
					/ code expects them
	rep;	smovb
.CleanupReturnLeft:
	cld				/ reset direction flag to LtoR
	ret				/ return(dba);
	SET_SIZE(memmove)