/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	bl	exit_vmx_usercopy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)
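	/*
	 * The loop below copies a full 128B cacheline per iteration using
	 * sixteen doubleword loads and stores.  err2 is used for the
	 * exception table entries in this loop because a fault taken here
	 * must restore the non-volatile registers (r14-r22) and unwind the
	 * stack frame created above (see .Ldo_err2).
	 */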

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	enter_vmx_usercopy
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	lis	r8,0x8000	/* GO=1 */
	clrldi	r8,r8,32

.machine push
.machine "power4"
	/* setup read stream 0 */
	dcbt	r0,r6,0b01000	/* addr from */
	dcbt	r0,r7,0b01010	/* length and depth from */
	/* setup write stream 1 */
	dcbtst	r0,r9,0b01000	/* addr to */
	dcbtst	r0,r10,0b01010	/* length and depth to */
	eieio
	dcbt	r0,r8,0b01010	/* all streams GO */
.machine pop
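	/*
	 * cr1 still holds the result of enter_vmx_usercopy (compared with
	 * zero above).  A zero return means VMX cannot be used in this
	 * context, so unwind the stack frame and fall back to the GPR
	 * copy loop.
	 */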
	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
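	/*
	 * err4 is used inside this loop: on a fault, .Ldo_err4 restores
	 * r14-r16 and falls into .Ldo_err3, which calls exit_vmx_usercopy
	 * before unwinding the stack frame.
	 */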
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(vr16,0,r4)		/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
err3;	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	VPERM(vr8,vr0,vr3,vr16)
err3;	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
err3;	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
err3;	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
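	/*
	 * Each iteration loads eight quadwords from the (unaligned) source
	 * and uses VPERM with the control vector in vr16 (set up by LVS
	 * above) to combine adjacent pairs into aligned quadwords for the
	 * stores.  vr0 carries the last source vector into the next
	 * iteration; the extra 16B of read-ahead is undone at label 11
	 * below ("Unwind the +16 load offset").
	 */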
	.align	5
8:
err4;	lvx	vr7,r0,r4
	VPERM(vr8,vr0,vr7,vr16)
err4;	lvx	vr6,r4,r9
	VPERM(vr9,vr7,vr6,vr16)
err4;	lvx	vr5,r4,r10
	VPERM(vr10,vr6,vr5,vr16)
err4;	lvx	vr4,r4,r11
	VPERM(vr11,vr5,vr4,vr16)
err4;	lvx	vr3,r4,r12
	VPERM(vr12,vr4,vr3,vr16)
err4;	lvx	vr2,r4,r14
	VPERM(vr13,vr3,vr2,vr16)
err4;	lvx	vr1,r4,r15
	VPERM(vr14,vr2,vr1,vr16)
err4;	lvx	vr0,r4,r16
	VPERM(vr15,vr1,vr0,vr16)
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	VPERM(vr8,vr0,vr3,vr16)
err3;	lvx	vr2,r4,r9
	VPERM(vr9,vr3,vr2,vr16)
err3;	lvx	vr1,r4,r10
	VPERM(vr10,vr2,vr1,vr16)
err3;	lvx	vr0,r4,r11
	VPERM(vr11,vr1,vr0,vr16)
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
err3;	lvx	vr0,r4,r9
	VPERM(vr9,vr1,vr0,vr16)
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	VPERM(vr8,vr0,vr1,vr16)
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	exit_vmx_usercopy	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */