/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

/*
 * memcpy_power7
 *
 * Register roles as used by the code below:
 *   r3  destination pointer.  The entry value is spilled just below the
 *       stack pointer and reloaded into r3 before every exit, so the
 *       routine hands back the original destination.
 *   r4  source pointer.
 *   r5  number of bytes still to copy.
 *   r0, r6-r12  scratch.
 *   r14-r22     saved/restored around the 128B scalar loop (r22 is saved
 *               and restored although that loop does not touch it).
 *   r14-r16     saved/restored around the 128B VMX loops.
 *
 * Path selection:
 *   len < 16                         -> .Lshort_copy
 *   len > 4096 with CONFIG_ALTIVEC   -> .Lvmx_copy (falls back to
 *                                       .Lnonvmx_copy when
 *                                       enter_vmx_copy returns 0)
 *   otherwise                        -> .Lnonvmx_copy
 *
 * Throughout, "mtocrf 0x01,rX" places the low four bits of rX in CR7 so
 * that the following "bf cr7*4+n" instructions can test the individual
 * size/alignment bits (8, 4, 2, 1 or 128, 64, 32, 16 after a shift).
 *
 * Indexed vector and cache-touch instructions take a literal 0 as their
 * base operand: in that slot the ISA treats RA=0 as the value zero, not
 * as the contents of r0, so writing "0" avoids implying that r0 (which
 * holds live data in several places) takes part in the address.
 */
_GLOBAL(memcpy_power7)

/*
 * LVS builds the permute control vector and VPERM merges two adjacent
 * quadwords with it; the little endian variants use lvsr and swap the
 * two vperm source operands.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	/* Keep the original destination so it can be reloaded on exit */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	/* Keep the original destination so it can be reloaded on exit */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)	/* r6 = bytes needed to reach alignment */

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	/*
	 * The 128B loop needs more registers than the scratch set offers,
	 * so open a stack frame and spill the non-volatile GPRs first.
	 * LR goes to offset 16 of the frame that was current on entry.
	 */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7		/* number of whole 128B blocks */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)	/* bytes left after the 128B blocks */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Reload the destination pointer saved on entry and return it */
15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

	/*
	 * Reached from .Lvmx_copy when enter_vmx_copy returned 0: drop the
	 * frame opened there and redo the copy with the scalar code.
	 */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	/*
	 * Spill source, length and LR, open a frame and call
	 * enter_vmx_copy.  Its result is kept in cr1 (tested after the
	 * prefetch setup); r3-r5 and LR are then reloaded from the frame.
	 */
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_copy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* Base operand is the literal 0; r0 still holds the depth bits */
.machine push
.machine "power4"
	dcbt	0,r6,0b01000
	dcbt	0,r7,0b01010
	dcbtst	0,r9,0b01000
	dcbtst	0,r10,0b01010
	eieio
	dcbt	0,r8,0b01010	/* GO */
.machine pop

	/* enter_vmx_copy returned 0: use the scalar copy instead */
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	/* Index registers for the 16B-strided vector accesses */
	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	vr1,0,r4
	addi	r4,r4,16
	stvx	vr1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	vr1,0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7		/* number of whole 128B blocks */

	/* r14-r16 become index registers; spill them into our frame */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,0,r4
	lvx	vr6,r4,r9
	lvx	vr5,r4,r10
	lvx	vr4,r4,r11
	lvx	vr3,r4,r12
	lvx	vr2,r4,r14
	lvx	vr1,r4,r15
	lvx	vr0,r4,r16
	addi	r4,r4,128
	stvx	vr7,0,r3
	stvx	vr6,r3,r9
	stvx	vr5,r3,r10
	stvx	vr4,r3,r11
	stvx	vr3,r3,r12
	stvx	vr2,r3,r14
	stvx	vr1,r3,r15
	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,0,r4
	lvx	vr2,r4,r9
	lvx	vr1,r4,r10
	lvx	vr0,r4,r11
	addi	r4,r4,64
	stvx	vr3,0,r3
	stvx	vr2,r3,r9
	stvx	vr1,r3,r10
	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,0,r4
	lvx	vr0,r4,r9
	addi	r4,r4,32
	stvx	vr1,0,r3
	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,0,r4
	addi	r4,r4,16
	stvx	vr1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Drop our frame, reload the saved destination into r3 and leave */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	/* Index registers for the 16B-strided vector accesses */
	li	r9,16
	li	r10,32
	li	r11,48

	/*
	 * Prime the pipeline: vr0 always holds the previously loaded
	 * quadword, so r4 runs 16 bytes ahead of the data actually
	 * consumed until it is unwound before the final scalar tail.
	 */
	LVS(vr16,0,r4)		/* Setup permute control vector */
	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	vr1,0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
	stvx	vr8,0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1	/* carry the last quadword forward in vr0 */

5:	bf	cr7*4+2,6f
	lvx	vr1,0,r4
	VPERM(vr8,vr0,vr1,vr16)
	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
	stvx	vr8,0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	vr3,0,r4
	VPERM(vr8,vr0,vr3,vr16)
	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
	stvx	vr8,0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7		/* number of whole 128B blocks */

	/* r14-r16 become index registers; spill them into our frame */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
	lvx	vr7,0,r4
	VPERM(vr8,vr0,vr7,vr16)
	lvx	vr6,r4,r9
	VPERM(vr9,vr7,vr6,vr16)
	lvx	vr5,r4,r10
	VPERM(vr10,vr6,vr5,vr16)
	lvx	vr4,r4,r11
	VPERM(vr11,vr5,vr4,vr16)
	lvx	vr3,r4,r12
	VPERM(vr12,vr4,vr3,vr16)
	lvx	vr2,r4,r14
	VPERM(vr13,vr3,vr2,vr16)
	lvx	vr1,r4,r15
	VPERM(vr14,vr2,vr1,vr16)
	lvx	vr0,r4,r16
	VPERM(vr15,vr1,vr0,vr16)
	addi	r4,r4,128
	stvx	vr8,0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	stvx	vr12,r3,r12
	stvx	vr13,r3,r14
	stvx	vr14,r3,r15
	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	vr3,0,r4
	VPERM(vr8,vr0,vr3,vr16)
	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
	stvx	vr8,0,r3
	stvx	vr9,r3,r9
	stvx	vr10,r3,r10
	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	vr1,0,r4
	VPERM(vr8,vr0,vr1,vr16)
	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
	stvx	vr8,0,r3
	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	vr1,0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
	stvx	vr8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

	/* Drop our frame, reload the saved destination into r3 and leave */
15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */