/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */

/*
 * POWER7-optimised user copy: r3 = destination, r4 = source, r5 = byte
 * count.  The three arguments are stashed below the stack pointer at
 * entry so the fault paths can recover them and re-run the copy via
 * __copy_tofrom_user_base.  Large copies may use VMX (Altivec); every
 * access to user memory is prefixed with an errN macro that registers
 * an exception-table fixup for that single instruction.
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

/*
 * Endian abstraction for the unaligned permute loop below: big endian
 * uses lvsl and vperm(VRA,VRB); little endian uses lvsr and swaps the
 * two vector source operands, so the loop body reads identically.
 */
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

/*
 * errN; tags the following user access with an exception-table entry
 * routing a fault to the matching .Ldo_errN handler:
 *   err1 - no stack frame of our own has been built yet
 *   err2 - integer cacheline loop, r14-r22 live on our frame
 *   err3 - VMX in use, no extra GPRs saved
 *   err4 - VMX in use and r14-r16 live on our frame
 */
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


/* Fault in the VMX cacheline loop: restore r14-r16, then fall through. */
.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
/* Fault with VMX enabled: tear down VMX state, reload the LR we saved
 * at STACKFRAMESIZE+16, then join the common exit path. */
.Ldo_err3:
	bl	CFUNC(exit_vmx_usercopy)
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	b	.Lexit
#endif /* CONFIG_ALTIVEC */

/* Fault in the integer cacheline loop: restore all saved non-volatiles. */
.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
/*
 * Reload the original dest/src/len saved at entry and hand the whole
 * copy to the base implementation, which takes over fault handling
 * (and computes the caller-visible result) from scratch.
 */
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16			/* tiny copy? (tested with blt below) */
	cmpldi	cr1,r5,3328		/* cr1: large enough to pay for VMX? */

	/* Stash the arguments for the fault paths (see .Ldo_err1). */
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1,.Lvmx_copy		/* only on CPUs with Altivec */
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6			/* cr7 bits select 1B/2B/4B fixups */
	clrldi	r6,r6,(64-3)		/* r6 = bytes needed to align source */

	bf	cr7*4+3,1f
/* 1B / 2B / 4B head copies, driven by the cr7 alignment bits above. */
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6		/* bytes left after the alignment head */
	cmpldi	r5,128
	blt	5f			/* less than one full cacheline */

	/* Build a frame and save the non-volatiles used by the unrolled loop. */
	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)	/* save LR with the frame */

	srdi	r6,r5,7			/* number of full 128B cachelines */
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)		/* residual length mod 128 */

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 bits now select 64/32/16B chunks */

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)		/* residual length mod 16 */

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5			/* cr7 bits select 8/4/2/1B pieces */
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0			/* success: zero bytes left uncopied */
	blr

/* VMX turned out to be unusable after we had built a frame: pop it and
 * fall back to the integer path. */
.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_usercopy)
	cmpwi	cr1,r3,0		/* cr1 = "VMX not granted"; tested below */
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)	/* re-fetch dest/src/len saved at entry */
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7			/* source cacheline base (stream 0) */
	clrrdi	r9,r3,7			/* dest cacheline base */
	ori	r9,r9,1			/* stream=1 */

	srdi	r7,r5,7			/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00		/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1		/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	beq	cr1,.Lunwind_stack_nonvmx_copy	/* VMX not granted: integer path */

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)		/* low 4 bits of (src XOR dest) */
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6			/* cr7 bits select 1/2/4/8B fixups */
	clrldi	r6,r6,(64-4)		/* bytes needed to 16B-align dest */

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6		/* account for the alignment head */

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7			/* cr7 bits select 16/32/64B chunks */
	clrldi	r6,r6,(64-7)		/* bytes needed to 128B-align dest */

	li	r9,16			/* vector load/store offsets */
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* full cachelines remaining */

	/* r14-r16 hold the 80/96/112 offsets through the loop; save them. */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 bits select 64/32/16B tails */

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5			/* cr7 bits select 8/4/2/1B pieces */
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

/* Done: pop the frame; exit_vmx_usercopy provides the return value. */
15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */

/*
 * Source and destination differ modulo 16: load aligned 16B vectors
 * from the source and realign them with vperm before storing.  The
 * source pointer runs 16B ahead of the data consumed (one vector of
 * lookahead lives in v0); the offset is unwound at label 11 below.
 */
.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16			/* vector load/store offsets */
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4		/* prime the lookahead vector */
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1	/* carry the lookahead forward */

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7			/* full cachelines remaining */

	/* r14-r16 hold the 80/96/112 offsets through the loop; save them. */
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
/* Each VPERM merges the previous aligned load with the next one, so
 * every 16B store is built from two adjacent source vectors. */
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6			/* cr7 bits select 64/32/16B tails */

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5			/* cr7 bits select 8/4/2/1B pieces */
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

/* Done: pop the frame; exit_vmx_usercopy provides the return value. */
15:	addi	r1,r1,STACKFRAMESIZE
	b	CFUNC(exit_vmx_usercopy)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */