/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,3328

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
	bge	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
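	/*
	 * Each iteration of the loop below moves a full cacheline as
	 * sixteen 8B loads followed by sixteen 8B stores (r0, r6-r12,
	 * r14-r21). Faults in this loop take the err2 path, which
	 * restores the non-volatile GPRs saved above before unwinding
	 * the stack frame.
	 */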
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
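	/*
	 * Stream descriptors built below, with the field meanings taken
	 * from the annotations on each instruction:
	 *   r6/r9  - 128B-aligned start address, low bit = stream ID
	 *            (0 for the load stream, 1 for the store stream)
	 *   r7/r10 - length in cachelines (capped at 0x3FF) shifted into
	 *            the unit count field, plus depth=7 and the stream ID
	 *   r8     - GO bit, which starts all programmed streams at once
	 */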
298 */ 299 clrrdi r6,r4,7 300 clrrdi r9,r3,7 301 ori r9,r9,1 /* stream=1 */ 302 303 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ 304 cmpldi r7,0x3FF 305 ble 1f 306 li r7,0x3FF 3071: lis r0,0x0E00 /* depth=7 */ 308 sldi r7,r7,7 309 or r7,r7,r0 310 ori r10,r7,1 /* stream=1 */ 311 312 lis r8,0x8000 /* GO=1 */ 313 clrldi r8,r8,32 314 315.machine push 316.machine "power4" 317 /* setup read stream 0 */ 318 dcbt 0,r6,0b01000 /* addr from */ 319 dcbt 0,r7,0b01010 /* length and depth from */ 320 /* setup write stream 1 */ 321 dcbtst 0,r9,0b01000 /* addr to */ 322 dcbtst 0,r10,0b01010 /* length and depth to */ 323 eieio 324 dcbt 0,r8,0b01010 /* all streams GO */ 325.machine pop 326 327 beq cr1,.Lunwind_stack_nonvmx_copy 328 329 /* 330 * If source and destination are not relatively aligned we use a 331 * slower permute loop. 332 */ 333 xor r6,r4,r3 334 rldicl. r6,r6,0,(64-4) 335 bne .Lvmx_unaligned_copy 336 337 /* Get the destination 16B aligned */ 338 neg r6,r3 339 mtocrf 0x01,r6 340 clrldi r6,r6,(64-4) 341 342 bf cr7*4+3,1f 343err3; lbz r0,0(r4) 344 addi r4,r4,1 345err3; stb r0,0(r3) 346 addi r3,r3,1 347 3481: bf cr7*4+2,2f 349err3; lhz r0,0(r4) 350 addi r4,r4,2 351err3; sth r0,0(r3) 352 addi r3,r3,2 353 3542: bf cr7*4+1,3f 355err3; lwz r0,0(r4) 356 addi r4,r4,4 357err3; stw r0,0(r3) 358 addi r3,r3,4 359 3603: bf cr7*4+0,4f 361err3; ld r0,0(r4) 362 addi r4,r4,8 363err3; std r0,0(r3) 364 addi r3,r3,8 365 3664: sub r5,r5,r6 367 368 /* Get the desination 128B aligned */ 369 neg r6,r3 370 srdi r7,r6,4 371 mtocrf 0x01,r7 372 clrldi r6,r6,(64-7) 373 374 li r9,16 375 li r10,32 376 li r11,48 377 378 bf cr7*4+3,5f 379err3; lvx v1,0,r4 380 addi r4,r4,16 381err3; stvx v1,0,r3 382 addi r3,r3,16 383 3845: bf cr7*4+2,6f 385err3; lvx v1,0,r4 386err3; lvx v0,r4,r9 387 addi r4,r4,32 388err3; stvx v1,0,r3 389err3; stvx v0,r3,r9 390 addi r3,r3,32 391 3926: bf cr7*4+1,7f 393err3; lvx v3,0,r4 394err3; lvx v2,r4,r9 395err3; lvx v1,r4,r10 396err3; lvx v0,r4,r11 397 addi r4,r4,64 398err3; stvx v3,0,r3 399err3; stvx v2,r3,r9 400err3; stvx v1,r3,r10 401err3; stvx v0,r3,r11 402 addi r3,r3,64 403 4047: sub r5,r5,r6 405 srdi r6,r5,7 406 407 std r14,STK_REG(R14)(r1) 408 std r15,STK_REG(R15)(r1) 409 std r16,STK_REG(R16)(r1) 410 411 li r12,64 412 li r14,80 413 li r15,96 414 li r16,112 415 416 mtctr r6 417 418 /* 419 * Now do cacheline sized loads and stores. By this stage the 420 * cacheline stores are also cacheline aligned. 
421 */ 422 .align 5 4238: 424err4; lvx v7,0,r4 425err4; lvx v6,r4,r9 426err4; lvx v5,r4,r10 427err4; lvx v4,r4,r11 428err4; lvx v3,r4,r12 429err4; lvx v2,r4,r14 430err4; lvx v1,r4,r15 431err4; lvx v0,r4,r16 432 addi r4,r4,128 433err4; stvx v7,0,r3 434err4; stvx v6,r3,r9 435err4; stvx v5,r3,r10 436err4; stvx v4,r3,r11 437err4; stvx v3,r3,r12 438err4; stvx v2,r3,r14 439err4; stvx v1,r3,r15 440err4; stvx v0,r3,r16 441 addi r3,r3,128 442 bdnz 8b 443 444 ld r14,STK_REG(R14)(r1) 445 ld r15,STK_REG(R15)(r1) 446 ld r16,STK_REG(R16)(r1) 447 448 /* Up to 127B to go */ 449 clrldi r5,r5,(64-7) 450 srdi r6,r5,4 451 mtocrf 0x01,r6 452 453 bf cr7*4+1,9f 454err3; lvx v3,0,r4 455err3; lvx v2,r4,r9 456err3; lvx v1,r4,r10 457err3; lvx v0,r4,r11 458 addi r4,r4,64 459err3; stvx v3,0,r3 460err3; stvx v2,r3,r9 461err3; stvx v1,r3,r10 462err3; stvx v0,r3,r11 463 addi r3,r3,64 464 4659: bf cr7*4+2,10f 466err3; lvx v1,0,r4 467err3; lvx v0,r4,r9 468 addi r4,r4,32 469err3; stvx v1,0,r3 470err3; stvx v0,r3,r9 471 addi r3,r3,32 472 47310: bf cr7*4+3,11f 474err3; lvx v1,0,r4 475 addi r4,r4,16 476err3; stvx v1,0,r3 477 addi r3,r3,16 478 479 /* Up to 15B to go */ 48011: clrldi r5,r5,(64-4) 481 mtocrf 0x01,r5 482 bf cr7*4+0,12f 483err3; ld r0,0(r4) 484 addi r4,r4,8 485err3; std r0,0(r3) 486 addi r3,r3,8 487 48812: bf cr7*4+1,13f 489err3; lwz r0,0(r4) 490 addi r4,r4,4 491err3; stw r0,0(r3) 492 addi r3,r3,4 493 49413: bf cr7*4+2,14f 495err3; lhz r0,0(r4) 496 addi r4,r4,2 497err3; sth r0,0(r3) 498 addi r3,r3,2 499 50014: bf cr7*4+3,15f 501err3; lbz r0,0(r4) 502err3; stb r0,0(r3) 503 50415: addi r1,r1,STACKFRAMESIZE 505 b exit_vmx_usercopy /* tail call optimise */ 506 507.Lvmx_unaligned_copy: 508 /* Get the destination 16B aligned */ 509 neg r6,r3 510 mtocrf 0x01,r6 511 clrldi r6,r6,(64-4) 512 513 bf cr7*4+3,1f 514err3; lbz r0,0(r4) 515 addi r4,r4,1 516err3; stb r0,0(r3) 517 addi r3,r3,1 518 5191: bf cr7*4+2,2f 520err3; lhz r0,0(r4) 521 addi r4,r4,2 522err3; sth r0,0(r3) 523 addi r3,r3,2 524 5252: bf cr7*4+1,3f 526err3; lwz r0,0(r4) 527 addi r4,r4,4 528err3; stw r0,0(r3) 529 addi r3,r3,4 530 5313: bf cr7*4+0,4f 532err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ 533err3; lwz r7,4(r4) 534 addi r4,r4,8 535err3; stw r0,0(r3) 536err3; stw r7,4(r3) 537 addi r3,r3,8 538 5394: sub r5,r5,r6 540 541 /* Get the desination 128B aligned */ 542 neg r6,r3 543 srdi r7,r6,4 544 mtocrf 0x01,r7 545 clrldi r6,r6,(64-7) 546 547 li r9,16 548 li r10,32 549 li r11,48 550 551 LVS(v16,0,r4) /* Setup permute control vector */ 552err3; lvx v0,0,r4 553 addi r4,r4,16 554 555 bf cr7*4+3,5f 556err3; lvx v1,0,r4 557 VPERM(v8,v0,v1,v16) 558 addi r4,r4,16 559err3; stvx v8,0,r3 560 addi r3,r3,16 561 vor v0,v1,v1 562 5635: bf cr7*4+2,6f 564err3; lvx v1,0,r4 565 VPERM(v8,v0,v1,v16) 566err3; lvx v0,r4,r9 567 VPERM(v9,v1,v0,v16) 568 addi r4,r4,32 569err3; stvx v8,0,r3 570err3; stvx v9,r3,r9 571 addi r3,r3,32 572 5736: bf cr7*4+1,7f 574err3; lvx v3,0,r4 575 VPERM(v8,v0,v3,v16) 576err3; lvx v2,r4,r9 577 VPERM(v9,v3,v2,v16) 578err3; lvx v1,r4,r10 579 VPERM(v10,v2,v1,v16) 580err3; lvx v0,r4,r11 581 VPERM(v11,v1,v0,v16) 582 addi r4,r4,64 583err3; stvx v8,0,r3 584err3; stvx v9,r3,r9 585err3; stvx v10,r3,r10 586err3; stvx v11,r3,r11 587 addi r3,r3,64 588 5897: sub r5,r5,r6 590 srdi r6,r5,7 591 592 std r14,STK_REG(R14)(r1) 593 std r15,STK_REG(R15)(r1) 594 std r16,STK_REG(R16)(r1) 595 596 li r12,64 597 li r14,80 598 li r15,96 599 li r16,112 600 601 mtctr r6 602 603 /* 604 * Now do cacheline sized loads and stores. 
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */