/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#define STACKFRAMESIZE	256
#define STK_REG(i)	(112 + ((i)-14)*8)

	.macro err1
100:
	.section __ex_table,"a"
	.align 3
	.llong 100b,.Ldo_err1
	.previous
	.endm

	.macro err2
200:
	.section __ex_table,"a"
	.align 3
	.llong 200b,.Ldo_err2
	.previous
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	.section __ex_table,"a"
	.align 3
	.llong 300b,.Ldo_err3
	.previous
	.endm

	.macro err4
400:
	.section __ex_table,"a"
	.align 3
	.llong 400b,.Ldo_err4
	.previous
	.endm


.Ldo_err4:
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Ldo_err3:
	bl	.exit_vmx_copy
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(r22)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r14,STK_REG(r14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,48(r1)
	ld	r4,56(r1)
	ld	r5,64(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
#ifdef CONFIG_ALTIVEC
	cmpldi	r5,16
	cmpldi	cr1,r5,4096

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
	bgt	cr1,.Lvmx_copy
#else
	cmpldi	r5,16

	std	r3,48(r1)
	std	r4,56(r1)
	std	r5,64(r1)

	blt	.Lshort_copy
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)
	std	r17,STK_REG(r17)(r1)
	std	r18,STK_REG(r18)(r1)
	std	r19,STK_REG(r19)(r1)
	std	r20,STK_REG(r20)(r1)
	std	r21,STK_REG(r21)(r1)
	std	r22,STK_REG(r22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
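	/*
	 * Each iteration of the loop below moves a full 128B cacheline:
	 * sixteen 8B loads into r0,r6-r12,r14-r21 followed by sixteen
	 * 8B stores, with the number of 128B blocks held in the ctr.
	 */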
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)
	ld	r17,STK_REG(r17)(r1)
	ld	r18,STK_REG(r18)(r1)
	ld	r19,STK_REG(r19)(r1)
	ld	r20,STK_REG(r20)(r1)
	ld	r21,STK_REG(r21)(r1)
	ld	r22,STK_REG(r22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

#ifdef CONFIG_ALTIVEC
.Lvmx_copy:
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	.enter_vmx_copy
	cmpwi	r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STACKFRAMESIZE+48(r1)
	ld	r4,STACKFRAMESIZE+56(r1)
	ld	r5,STACKFRAMESIZE+64(r1)
	mtlr	r0

	beq	.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

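	/*
	 * Source and destination share the same alignment within a 16B
	 * quadword, so once the destination is 16B aligned the copy can
	 * use plain lvx/stvx pairs with no permutes.
	 */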
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	vr7,r0,r4
err4;	lvx	vr6,r4,r9
err4;	lvx	vr5,r4,r10
err4;	lvx	vr4,r4,r11
err4;	lvx	vr3,r4,r12
err4;	lvx	vr2,r4,r14
err4;	lvx	vr1,r4,r15
err4;	lvx	vr0,r4,r16
	addi	r4,r4,128
err4;	stvx	vr7,r0,r3
err4;	stvx	vr6,r3,r9
err4;	stvx	vr5,r3,r10
err4;	stvx	vr4,r3,r11
err4;	stvx	vr3,r3,r12
err4;	stvx	vr2,r3,r14
err4;	stvx	vr1,r3,r15
err4;	stvx	vr0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
err3;	lvx	vr2,r4,r9
err3;	lvx	vr1,r4,r10
err3;	lvx	vr0,r4,r11
	addi	r4,r4,64
err3;	stvx	vr3,r0,r3
err3;	stvx	vr2,r3,r9
err3;	stvx	vr1,r3,r10
err3;	stvx	vr0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
err3;	lvx	vr0,r4,r9
	addi	r4,r4,32
err3;	stvx	vr1,r0,r3
err3;	stvx	vr0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	addi	r4,r4,16
err3;	stvx	vr1,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_copy		/* tail call optimise */

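	/*
	 * Source and destination differ in alignment within a 16B
	 * quadword. Each aligned 16B store is built from two adjacent
	 * aligned loads combined with vperm, using the control vector
	 * that lvsl derives from the source address.
	 */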
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	lvsl	vr16,0,r4	/* Setup permute control vector */
err3;	lvx	vr0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16
	vor	vr0,vr1,vr1

5:	bf	cr7*4+2,6f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(r14)(r1)
	std	r15,STK_REG(r15)(r1)
	std	r16,STK_REG(r16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
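	/*
	 * vr0 carries the last quadword loaded by the previous iteration
	 * (or by the preamble above), so each 128B block needs only eight
	 * new loads; every store below is a vperm of two neighbouring
	 * source quadwords.
	 */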
	.align	5
8:
err4;	lvx	vr7,r0,r4
	vperm	vr8,vr0,vr7,vr16
err4;	lvx	vr6,r4,r9
	vperm	vr9,vr7,vr6,vr16
err4;	lvx	vr5,r4,r10
	vperm	vr10,vr6,vr5,vr16
err4;	lvx	vr4,r4,r11
	vperm	vr11,vr5,vr4,vr16
err4;	lvx	vr3,r4,r12
	vperm	vr12,vr4,vr3,vr16
err4;	lvx	vr2,r4,r14
	vperm	vr13,vr3,vr2,vr16
err4;	lvx	vr1,r4,r15
	vperm	vr14,vr2,vr1,vr16
err4;	lvx	vr0,r4,r16
	vperm	vr15,vr1,vr0,vr16
	addi	r4,r4,128
err4;	stvx	vr8,r0,r3
err4;	stvx	vr9,r3,r9
err4;	stvx	vr10,r3,r10
err4;	stvx	vr11,r3,r11
err4;	stvx	vr12,r3,r12
err4;	stvx	vr13,r3,r14
err4;	stvx	vr14,r3,r15
err4;	stvx	vr15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(r14)(r1)
	ld	r15,STK_REG(r15)(r1)
	ld	r16,STK_REG(r16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	vr3,r0,r4
	vperm	vr8,vr0,vr3,vr16
err3;	lvx	vr2,r4,r9
	vperm	vr9,vr3,vr2,vr16
err3;	lvx	vr1,r4,r10
	vperm	vr10,vr2,vr1,vr16
err3;	lvx	vr0,r4,r11
	vperm	vr11,vr1,vr0,vr16
	addi	r4,r4,64
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
err3;	stvx	vr10,r3,r10
err3;	stvx	vr11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
err3;	lvx	vr0,r4,r9
	vperm	vr9,vr1,vr0,vr16
	addi	r4,r4,32
err3;	stvx	vr8,r0,r3
err3;	stvx	vr9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	vr1,r0,r4
	vperm	vr8,vr0,vr1,vr16
	addi	r4,r4,16
err3;	stvx	vr8,r0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	b	.exit_vmx_copy		/* tail call optimise */
#endif /* CONFIG_ALTIVEC */