1/* 2 * This program is free software; you can redistribute it and/or modify 3 * it under the terms of the GNU General Public License as published by 4 * the Free Software Foundation; either version 2 of the License, or 5 * (at your option) any later version. 6 * 7 * This program is distributed in the hope that it will be useful, 8 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 * GNU General Public License for more details. 11 * 12 * You should have received a copy of the GNU General Public License 13 * along with this program; if not, write to the Free Software 14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 15 * 16 * Copyright (C) IBM Corporation, 2011 17 * 18 * Author: Anton Blanchard <anton@au.ibm.com> 19 */ 20#include <asm/ppc_asm.h> 21 22 .macro err1 23100: 24 .section __ex_table,"a" 25 .align 3 26 .llong 100b,.Ldo_err1 27 .previous 28 .endm 29 30 .macro err2 31200: 32 .section __ex_table,"a" 33 .align 3 34 .llong 200b,.Ldo_err2 35 .previous 36 .endm 37 38#ifdef CONFIG_ALTIVEC 39 .macro err3 40300: 41 .section __ex_table,"a" 42 .align 3 43 .llong 300b,.Ldo_err3 44 .previous 45 .endm 46 47 .macro err4 48400: 49 .section __ex_table,"a" 50 .align 3 51 .llong 400b,.Ldo_err4 52 .previous 53 .endm 54 55 56.Ldo_err4: 57 ld r16,STK_REG(R16)(r1) 58 ld r15,STK_REG(R15)(r1) 59 ld r14,STK_REG(R14)(r1) 60.Ldo_err3: 61 bl .exit_vmx_usercopy 62 ld r0,STACKFRAMESIZE+16(r1) 63 mtlr r0 64 b .Lexit 65#endif /* CONFIG_ALTIVEC */ 66 67.Ldo_err2: 68 ld r22,STK_REG(R22)(r1) 69 ld r21,STK_REG(R21)(r1) 70 ld r20,STK_REG(R20)(r1) 71 ld r19,STK_REG(R19)(r1) 72 ld r18,STK_REG(R18)(r1) 73 ld r17,STK_REG(R17)(r1) 74 ld r16,STK_REG(R16)(r1) 75 ld r15,STK_REG(R15)(r1) 76 ld r14,STK_REG(R14)(r1) 77.Lexit: 78 addi r1,r1,STACKFRAMESIZE 79.Ldo_err1: 80 ld r3,48(r1) 81 ld r4,56(r1) 82 ld r5,64(r1) 83 b __copy_tofrom_user_base 84 85 86_GLOBAL(__copy_tofrom_user_power7) 87#ifdef CONFIG_ALTIVEC 88 cmpldi r5,16 89 cmpldi cr1,r5,4096 90 91 std r3,48(r1) 92 std r4,56(r1) 93 std r5,64(r1) 94 95 blt .Lshort_copy 96 bgt cr1,.Lvmx_copy 97#else 98 cmpldi r5,16 99 100 std r3,48(r1) 101 std r4,56(r1) 102 std r5,64(r1) 103 104 blt .Lshort_copy 105#endif 106 107.Lnonvmx_copy: 108 /* Get the source 8B aligned */ 109 neg r6,r4 110 mtocrf 0x01,r6 111 clrldi r6,r6,(64-3) 112 113 bf cr7*4+3,1f 114err1; lbz r0,0(r4) 115 addi r4,r4,1 116err1; stb r0,0(r3) 117 addi r3,r3,1 118 1191: bf cr7*4+2,2f 120err1; lhz r0,0(r4) 121 addi r4,r4,2 122err1; sth r0,0(r3) 123 addi r3,r3,2 124 1252: bf cr7*4+1,3f 126err1; lwz r0,0(r4) 127 addi r4,r4,4 128err1; stw r0,0(r3) 129 addi r3,r3,4 130 1313: sub r5,r5,r6 132 cmpldi r5,128 133 blt 5f 134 135 mflr r0 136 stdu r1,-STACKFRAMESIZE(r1) 137 std r14,STK_REG(R14)(r1) 138 std r15,STK_REG(R15)(r1) 139 std r16,STK_REG(R16)(r1) 140 std r17,STK_REG(R17)(r1) 141 std r18,STK_REG(R18)(r1) 142 std r19,STK_REG(R19)(r1) 143 std r20,STK_REG(R20)(r1) 144 std r21,STK_REG(R21)(r1) 145 std r22,STK_REG(R22)(r1) 146 std r0,STACKFRAMESIZE+16(r1) 147 148 srdi r6,r5,7 149 mtctr r6 150 151 /* Now do cacheline (128B) sized loads and stores. */ 152 .align 5 1534: 154err2; ld r0,0(r4) 155err2; ld r6,8(r4) 156err2; ld r7,16(r4) 157err2; ld r8,24(r4) 158err2; ld r9,32(r4) 159err2; ld r10,40(r4) 160err2; ld r11,48(r4) 161err2; ld r12,56(r4) 162err2; ld r14,64(r4) 163err2; ld r15,72(r4) 164err2; ld r16,80(r4) 165err2; ld r17,88(r4) 166err2; ld r18,96(r4) 167err2; ld r19,104(r4) 168err2; ld r20,112(r4) 169err2; ld r21,120(r4) 170 addi r4,r4,128 171err2; std r0,0(r3) 172err2; std r6,8(r3) 173err2; std r7,16(r3) 174err2; std r8,24(r3) 175err2; std r9,32(r3) 176err2; std r10,40(r3) 177err2; std r11,48(r3) 178err2; std r12,56(r3) 179err2; std r14,64(r3) 180err2; std r15,72(r3) 181err2; std r16,80(r3) 182err2; std r17,88(r3) 183err2; std r18,96(r3) 184err2; std r19,104(r3) 185err2; std r20,112(r3) 186err2; std r21,120(r3) 187 addi r3,r3,128 188 bdnz 4b 189 190 clrldi r5,r5,(64-7) 191 192 ld r14,STK_REG(R14)(r1) 193 ld r15,STK_REG(R15)(r1) 194 ld r16,STK_REG(R16)(r1) 195 ld r17,STK_REG(R17)(r1) 196 ld r18,STK_REG(R18)(r1) 197 ld r19,STK_REG(R19)(r1) 198 ld r20,STK_REG(R20)(r1) 199 ld r21,STK_REG(R21)(r1) 200 ld r22,STK_REG(R22)(r1) 201 addi r1,r1,STACKFRAMESIZE 202 203 /* Up to 127B to go */ 2045: srdi r6,r5,4 205 mtocrf 0x01,r6 206 2076: bf cr7*4+1,7f 208err1; ld r0,0(r4) 209err1; ld r6,8(r4) 210err1; ld r7,16(r4) 211err1; ld r8,24(r4) 212err1; ld r9,32(r4) 213err1; ld r10,40(r4) 214err1; ld r11,48(r4) 215err1; ld r12,56(r4) 216 addi r4,r4,64 217err1; std r0,0(r3) 218err1; std r6,8(r3) 219err1; std r7,16(r3) 220err1; std r8,24(r3) 221err1; std r9,32(r3) 222err1; std r10,40(r3) 223err1; std r11,48(r3) 224err1; std r12,56(r3) 225 addi r3,r3,64 226 227 /* Up to 63B to go */ 2287: bf cr7*4+2,8f 229err1; ld r0,0(r4) 230err1; ld r6,8(r4) 231err1; ld r7,16(r4) 232err1; ld r8,24(r4) 233 addi r4,r4,32 234err1; std r0,0(r3) 235err1; std r6,8(r3) 236err1; std r7,16(r3) 237err1; std r8,24(r3) 238 addi r3,r3,32 239 240 /* Up to 31B to go */ 2418: bf cr7*4+3,9f 242err1; ld r0,0(r4) 243err1; ld r6,8(r4) 244 addi r4,r4,16 245err1; std r0,0(r3) 246err1; std r6,8(r3) 247 addi r3,r3,16 248 2499: clrldi r5,r5,(64-4) 250 251 /* Up to 15B to go */ 252.Lshort_copy: 253 mtocrf 0x01,r5 254 bf cr7*4+0,12f 255err1; lwz r0,0(r4) /* Less chance of a reject with word ops */ 256err1; lwz r6,4(r4) 257 addi r4,r4,8 258err1; stw r0,0(r3) 259err1; stw r6,4(r3) 260 addi r3,r3,8 261 26212: bf cr7*4+1,13f 263err1; lwz r0,0(r4) 264 addi r4,r4,4 265err1; stw r0,0(r3) 266 addi r3,r3,4 267 26813: bf cr7*4+2,14f 269err1; lhz r0,0(r4) 270 addi r4,r4,2 271err1; sth r0,0(r3) 272 addi r3,r3,2 273 27414: bf cr7*4+3,15f 275err1; lbz r0,0(r4) 276err1; stb r0,0(r3) 277 27815: li r3,0 279 blr 280 281.Lunwind_stack_nonvmx_copy: 282 addi r1,r1,STACKFRAMESIZE 283 b .Lnonvmx_copy 284 285#ifdef CONFIG_ALTIVEC 286.Lvmx_copy: 287 mflr r0 288 std r0,16(r1) 289 stdu r1,-STACKFRAMESIZE(r1) 290 bl .enter_vmx_usercopy 291 cmpwi r3,0 292 ld r0,STACKFRAMESIZE+16(r1) 293 ld r3,STACKFRAMESIZE+48(r1) 294 ld r4,STACKFRAMESIZE+56(r1) 295 ld r5,STACKFRAMESIZE+64(r1) 296 mtlr r0 297 298 /* 299 * We prefetch both the source and destination using enhanced touch 300 * instructions. We use a stream ID of 0 for the load side and 301 * 1 for the store side. 302 */ 303 clrrdi r6,r4,7 304 clrrdi r9,r3,7 305 ori r9,r9,1 /* stream=1 */ 306 307 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ 308 cmpldi r7,0x3FF 309 ble 1f 310 li r7,0x3FF 3111: lis r0,0x0E00 /* depth=7 */ 312 sldi r7,r7,7 313 or r7,r7,r0 314 ori r10,r7,1 /* stream=1 */ 315 316 lis r8,0x8000 /* GO=1 */ 317 clrldi r8,r8,32 318 319.machine push 320.machine "power4" 321 dcbt r0,r6,0b01000 322 dcbt r0,r7,0b01010 323 dcbtst r0,r9,0b01000 324 dcbtst r0,r10,0b01010 325 eieio 326 dcbt r0,r8,0b01010 /* GO */ 327.machine pop 328 329 /* 330 * We prefetch both the source and destination using enhanced touch 331 * instructions. We use a stream ID of 0 for the load side and 332 * 1 for the store side. 333 */ 334 clrrdi r6,r4,7 335 clrrdi r9,r3,7 336 ori r9,r9,1 /* stream=1 */ 337 338 srdi r7,r5,7 /* length in cachelines, capped at 0x3FF */ 339 cmpldi cr1,r7,0x3FF 340 ble cr1,1f 341 li r7,0x3FF 3421: lis r0,0x0E00 /* depth=7 */ 343 sldi r7,r7,7 344 or r7,r7,r0 345 ori r10,r7,1 /* stream=1 */ 346 347 lis r8,0x8000 /* GO=1 */ 348 clrldi r8,r8,32 349 350.machine push 351.machine "power4" 352 dcbt r0,r6,0b01000 353 dcbt r0,r7,0b01010 354 dcbtst r0,r9,0b01000 355 dcbtst r0,r10,0b01010 356 eieio 357 dcbt r0,r8,0b01010 /* GO */ 358.machine pop 359 360 beq .Lunwind_stack_nonvmx_copy 361 362 /* 363 * If source and destination are not relatively aligned we use a 364 * slower permute loop. 365 */ 366 xor r6,r4,r3 367 rldicl. r6,r6,0,(64-4) 368 bne .Lvmx_unaligned_copy 369 370 /* Get the destination 16B aligned */ 371 neg r6,r3 372 mtocrf 0x01,r6 373 clrldi r6,r6,(64-4) 374 375 bf cr7*4+3,1f 376err3; lbz r0,0(r4) 377 addi r4,r4,1 378err3; stb r0,0(r3) 379 addi r3,r3,1 380 3811: bf cr7*4+2,2f 382err3; lhz r0,0(r4) 383 addi r4,r4,2 384err3; sth r0,0(r3) 385 addi r3,r3,2 386 3872: bf cr7*4+1,3f 388err3; lwz r0,0(r4) 389 addi r4,r4,4 390err3; stw r0,0(r3) 391 addi r3,r3,4 392 3933: bf cr7*4+0,4f 394err3; ld r0,0(r4) 395 addi r4,r4,8 396err3; std r0,0(r3) 397 addi r3,r3,8 398 3994: sub r5,r5,r6 400 401 /* Get the desination 128B aligned */ 402 neg r6,r3 403 srdi r7,r6,4 404 mtocrf 0x01,r7 405 clrldi r6,r6,(64-7) 406 407 li r9,16 408 li r10,32 409 li r11,48 410 411 bf cr7*4+3,5f 412err3; lvx vr1,r0,r4 413 addi r4,r4,16 414err3; stvx vr1,r0,r3 415 addi r3,r3,16 416 4175: bf cr7*4+2,6f 418err3; lvx vr1,r0,r4 419err3; lvx vr0,r4,r9 420 addi r4,r4,32 421err3; stvx vr1,r0,r3 422err3; stvx vr0,r3,r9 423 addi r3,r3,32 424 4256: bf cr7*4+1,7f 426err3; lvx vr3,r0,r4 427err3; lvx vr2,r4,r9 428err3; lvx vr1,r4,r10 429err3; lvx vr0,r4,r11 430 addi r4,r4,64 431err3; stvx vr3,r0,r3 432err3; stvx vr2,r3,r9 433err3; stvx vr1,r3,r10 434err3; stvx vr0,r3,r11 435 addi r3,r3,64 436 4377: sub r5,r5,r6 438 srdi r6,r5,7 439 440 std r14,STK_REG(R14)(r1) 441 std r15,STK_REG(R15)(r1) 442 std r16,STK_REG(R16)(r1) 443 444 li r12,64 445 li r14,80 446 li r15,96 447 li r16,112 448 449 mtctr r6 450 451 /* 452 * Now do cacheline sized loads and stores. By this stage the 453 * cacheline stores are also cacheline aligned. 454 */ 455 .align 5 4568: 457err4; lvx vr7,r0,r4 458err4; lvx vr6,r4,r9 459err4; lvx vr5,r4,r10 460err4; lvx vr4,r4,r11 461err4; lvx vr3,r4,r12 462err4; lvx vr2,r4,r14 463err4; lvx vr1,r4,r15 464err4; lvx vr0,r4,r16 465 addi r4,r4,128 466err4; stvx vr7,r0,r3 467err4; stvx vr6,r3,r9 468err4; stvx vr5,r3,r10 469err4; stvx vr4,r3,r11 470err4; stvx vr3,r3,r12 471err4; stvx vr2,r3,r14 472err4; stvx vr1,r3,r15 473err4; stvx vr0,r3,r16 474 addi r3,r3,128 475 bdnz 8b 476 477 ld r14,STK_REG(R14)(r1) 478 ld r15,STK_REG(R15)(r1) 479 ld r16,STK_REG(R16)(r1) 480 481 /* Up to 127B to go */ 482 clrldi r5,r5,(64-7) 483 srdi r6,r5,4 484 mtocrf 0x01,r6 485 486 bf cr7*4+1,9f 487err3; lvx vr3,r0,r4 488err3; lvx vr2,r4,r9 489err3; lvx vr1,r4,r10 490err3; lvx vr0,r4,r11 491 addi r4,r4,64 492err3; stvx vr3,r0,r3 493err3; stvx vr2,r3,r9 494err3; stvx vr1,r3,r10 495err3; stvx vr0,r3,r11 496 addi r3,r3,64 497 4989: bf cr7*4+2,10f 499err3; lvx vr1,r0,r4 500err3; lvx vr0,r4,r9 501 addi r4,r4,32 502err3; stvx vr1,r0,r3 503err3; stvx vr0,r3,r9 504 addi r3,r3,32 505 50610: bf cr7*4+3,11f 507err3; lvx vr1,r0,r4 508 addi r4,r4,16 509err3; stvx vr1,r0,r3 510 addi r3,r3,16 511 512 /* Up to 15B to go */ 51311: clrldi r5,r5,(64-4) 514 mtocrf 0x01,r5 515 bf cr7*4+0,12f 516err3; ld r0,0(r4) 517 addi r4,r4,8 518err3; std r0,0(r3) 519 addi r3,r3,8 520 52112: bf cr7*4+1,13f 522err3; lwz r0,0(r4) 523 addi r4,r4,4 524err3; stw r0,0(r3) 525 addi r3,r3,4 526 52713: bf cr7*4+2,14f 528err3; lhz r0,0(r4) 529 addi r4,r4,2 530err3; sth r0,0(r3) 531 addi r3,r3,2 532 53314: bf cr7*4+3,15f 534err3; lbz r0,0(r4) 535err3; stb r0,0(r3) 536 53715: addi r1,r1,STACKFRAMESIZE 538 b .exit_vmx_usercopy /* tail call optimise */ 539 540.Lvmx_unaligned_copy: 541 /* Get the destination 16B aligned */ 542 neg r6,r3 543 mtocrf 0x01,r6 544 clrldi r6,r6,(64-4) 545 546 bf cr7*4+3,1f 547err3; lbz r0,0(r4) 548 addi r4,r4,1 549err3; stb r0,0(r3) 550 addi r3,r3,1 551 5521: bf cr7*4+2,2f 553err3; lhz r0,0(r4) 554 addi r4,r4,2 555err3; sth r0,0(r3) 556 addi r3,r3,2 557 5582: bf cr7*4+1,3f 559err3; lwz r0,0(r4) 560 addi r4,r4,4 561err3; stw r0,0(r3) 562 addi r3,r3,4 563 5643: bf cr7*4+0,4f 565err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ 566err3; lwz r7,4(r4) 567 addi r4,r4,8 568err3; stw r0,0(r3) 569err3; stw r7,4(r3) 570 addi r3,r3,8 571 5724: sub r5,r5,r6 573 574 /* Get the desination 128B aligned */ 575 neg r6,r3 576 srdi r7,r6,4 577 mtocrf 0x01,r7 578 clrldi r6,r6,(64-7) 579 580 li r9,16 581 li r10,32 582 li r11,48 583 584 lvsl vr16,0,r4 /* Setup permute control vector */ 585err3; lvx vr0,0,r4 586 addi r4,r4,16 587 588 bf cr7*4+3,5f 589err3; lvx vr1,r0,r4 590 vperm vr8,vr0,vr1,vr16 591 addi r4,r4,16 592err3; stvx vr8,r0,r3 593 addi r3,r3,16 594 vor vr0,vr1,vr1 595 5965: bf cr7*4+2,6f 597err3; lvx vr1,r0,r4 598 vperm vr8,vr0,vr1,vr16 599err3; lvx vr0,r4,r9 600 vperm vr9,vr1,vr0,vr16 601 addi r4,r4,32 602err3; stvx vr8,r0,r3 603err3; stvx vr9,r3,r9 604 addi r3,r3,32 605 6066: bf cr7*4+1,7f 607err3; lvx vr3,r0,r4 608 vperm vr8,vr0,vr3,vr16 609err3; lvx vr2,r4,r9 610 vperm vr9,vr3,vr2,vr16 611err3; lvx vr1,r4,r10 612 vperm vr10,vr2,vr1,vr16 613err3; lvx vr0,r4,r11 614 vperm vr11,vr1,vr0,vr16 615 addi r4,r4,64 616err3; stvx vr8,r0,r3 617err3; stvx vr9,r3,r9 618err3; stvx vr10,r3,r10 619err3; stvx vr11,r3,r11 620 addi r3,r3,64 621 6227: sub r5,r5,r6 623 srdi r6,r5,7 624 625 std r14,STK_REG(R14)(r1) 626 std r15,STK_REG(R15)(r1) 627 std r16,STK_REG(R16)(r1) 628 629 li r12,64 630 li r14,80 631 li r15,96 632 li r16,112 633 634 mtctr r6 635 636 /* 637 * Now do cacheline sized loads and stores. By this stage the 638 * cacheline stores are also cacheline aligned. 639 */ 640 .align 5 6418: 642err4; lvx vr7,r0,r4 643 vperm vr8,vr0,vr7,vr16 644err4; lvx vr6,r4,r9 645 vperm vr9,vr7,vr6,vr16 646err4; lvx vr5,r4,r10 647 vperm vr10,vr6,vr5,vr16 648err4; lvx vr4,r4,r11 649 vperm vr11,vr5,vr4,vr16 650err4; lvx vr3,r4,r12 651 vperm vr12,vr4,vr3,vr16 652err4; lvx vr2,r4,r14 653 vperm vr13,vr3,vr2,vr16 654err4; lvx vr1,r4,r15 655 vperm vr14,vr2,vr1,vr16 656err4; lvx vr0,r4,r16 657 vperm vr15,vr1,vr0,vr16 658 addi r4,r4,128 659err4; stvx vr8,r0,r3 660err4; stvx vr9,r3,r9 661err4; stvx vr10,r3,r10 662err4; stvx vr11,r3,r11 663err4; stvx vr12,r3,r12 664err4; stvx vr13,r3,r14 665err4; stvx vr14,r3,r15 666err4; stvx vr15,r3,r16 667 addi r3,r3,128 668 bdnz 8b 669 670 ld r14,STK_REG(R14)(r1) 671 ld r15,STK_REG(R15)(r1) 672 ld r16,STK_REG(R16)(r1) 673 674 /* Up to 127B to go */ 675 clrldi r5,r5,(64-7) 676 srdi r6,r5,4 677 mtocrf 0x01,r6 678 679 bf cr7*4+1,9f 680err3; lvx vr3,r0,r4 681 vperm vr8,vr0,vr3,vr16 682err3; lvx vr2,r4,r9 683 vperm vr9,vr3,vr2,vr16 684err3; lvx vr1,r4,r10 685 vperm vr10,vr2,vr1,vr16 686err3; lvx vr0,r4,r11 687 vperm vr11,vr1,vr0,vr16 688 addi r4,r4,64 689err3; stvx vr8,r0,r3 690err3; stvx vr9,r3,r9 691err3; stvx vr10,r3,r10 692err3; stvx vr11,r3,r11 693 addi r3,r3,64 694 6959: bf cr7*4+2,10f 696err3; lvx vr1,r0,r4 697 vperm vr8,vr0,vr1,vr16 698err3; lvx vr0,r4,r9 699 vperm vr9,vr1,vr0,vr16 700 addi r4,r4,32 701err3; stvx vr8,r0,r3 702err3; stvx vr9,r3,r9 703 addi r3,r3,32 704 70510: bf cr7*4+3,11f 706err3; lvx vr1,r0,r4 707 vperm vr8,vr0,vr1,vr16 708 addi r4,r4,16 709err3; stvx vr8,r0,r3 710 addi r3,r3,16 711 712 /* Up to 15B to go */ 71311: clrldi r5,r5,(64-4) 714 addi r4,r4,-16 /* Unwind the +16 load offset */ 715 mtocrf 0x01,r5 716 bf cr7*4+0,12f 717err3; lwz r0,0(r4) /* Less chance of a reject with word ops */ 718err3; lwz r6,4(r4) 719 addi r4,r4,8 720err3; stw r0,0(r3) 721err3; stw r6,4(r3) 722 addi r3,r3,8 723 72412: bf cr7*4+1,13f 725err3; lwz r0,0(r4) 726 addi r4,r4,4 727err3; stw r0,0(r3) 728 addi r3,r3,4 729 73013: bf cr7*4+2,14f 731err3; lhz r0,0(r4) 732 addi r4,r4,2 733err3; sth r0,0(r3) 734 addi r3,r3,2 735 73614: bf cr7*4+3,15f 737err3; lbz r0,0(r4) 738err3; stb r0,0(r3) 739 74015: addi r1,r1,STACKFRAMESIZE 741 b .exit_vmx_usercopy /* tail call optimise */ 742#endif /* CONFiG_ALTIVEC */ 743