/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
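	/*
	 * The unrolled loop below uses sixteen GPRs (r0, r6-r12, r14-r21)
	 * per 128B cacheline. r13 is avoided because the 64-bit kernel
	 * reserves it for the PACA pointer, and r2 holds the TOC.
	 */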
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
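	/*
	 * Stream setup below: the first dcbt/dcbtst of each pair carries the
	 * start address (stream ID in the low bits), the second carries the
	 * length in cachelines and the prefetch depth, and the final dcbt
	 * with GO=1, issued after the eieio, starts all nominated streams.
	 */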
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

	/* setup read stream 0 */
	dcbt	0,r6,0b01000	/* addr from */
	dcbt	0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	0,r9,0b01000	/* addr to */
	dcbtst	0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	0,r8,0b01010	/* all streams GO */

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
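	/*
	 * Eight 16B vector loads and eight 16B stores per iteration move one
	 * 128B cacheline; the ctr was loaded above with the number of whole
	 * cachelines left (r5 >> 7).
	 */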
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
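	/*
	 * Unaligned case: each lvx fetches the next naturally aligned 16B of
	 * source and VPERM, driven by the control vector built with LVS,
	 * splices it with the previously fetched chunk, so every stvx below
	 * stores to a 16B (by now 128B) aligned destination. v0 carries the
	 * trailing chunk into the next iteration.
	 */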
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */