/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2011
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <linux/export.h>
#include <asm/ppc_asm.h>

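/*
 * Unaligned vector copies load aligned source vectors with lvx and merge
 * adjacent pairs with vperm.  The permute control vector comes from lvsl
 * on big endian and from lvsr (with the vperm inputs swapped) on little
 * endian, so the same copy loops work for either byte order.
 */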
#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

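/*
 * err1-err4 attach an exception table entry to the user access that
 * follows, selecting how much state the fixup has to unwind: err1/err2
 * are used by the integer copy (err2 once the stack frame and r14-r22
 * have been saved) and fall back to __copy_tofrom_user_base, while
 * err3/err4 are used by the VMX copy (err4 once r14-r16 have been
 * saved) and return the number of bytes not copied in r3.
 */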
	.macro err1
100:
	EX_TABLE(100b,.Ldo_err1)
	.endm

	.macro err2
200:
	EX_TABLE(200b,.Ldo_err2)
	.endm

#ifdef CONFIG_ALTIVEC
	.macro err3
300:
	EX_TABLE(300b,.Ldo_err3)
	.endm

	.macro err4
400:
	EX_TABLE(400b,.Ldo_err4)
	.endm


.Ldo_err4:
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Ldo_err3:
	ld	r6,STK_REG(R31)(r1)	/* original destination pointer */
	ld	r5,STK_REG(R29)(r1)	/* original number of bytes */
	subf	r7,r6,r3		/* #bytes copied */
	subf	r3,r7,r5		/* #bytes not copied in r3 */
	ld	r0,STACKFRAMESIZE+16(r1)
	mtlr	r0
	addi	r1,r1,STACKFRAMESIZE
	blr
#endif /* CONFIG_ALTIVEC */

.Ldo_err2:
	ld	r22,STK_REG(R22)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r14,STK_REG(R14)(r1)
.Lexit:
	addi	r1,r1,STACKFRAMESIZE
.Ldo_err1:
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	ld	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	ld	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	b	__copy_tofrom_user_base


_GLOBAL(__copy_tofrom_user_power7)
	cmpldi	r5,16

	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)

	blt	.Lshort_copy


.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
err1;	lbz	r0,0(r4)
	addi	r4,r4,1
err1;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
err2;	ld	r0,0(r4)
err2;	ld	r6,8(r4)
err2;	ld	r7,16(r4)
err2;	ld	r8,24(r4)
err2;	ld	r9,32(r4)
err2;	ld	r10,40(r4)
err2;	ld	r11,48(r4)
err2;	ld	r12,56(r4)
err2;	ld	r14,64(r4)
err2;	ld	r15,72(r4)
err2;	ld	r16,80(r4)
err2;	ld	r17,88(r4)
err2;	ld	r18,96(r4)
err2;	ld	r19,104(r4)
err2;	ld	r20,112(r4)
err2;	ld	r21,120(r4)
	addi	r4,r4,128
err2;	std	r0,0(r3)
err2;	std	r6,8(r3)
err2;	std	r7,16(r3)
err2;	std	r8,24(r3)
err2;	std	r9,32(r3)
err2;	std	r10,40(r3)
err2;	std	r11,48(r3)
err2;	std	r12,56(r3)
err2;	std	r14,64(r3)
err2;	std	r15,72(r3)
err2;	std	r16,80(r3)
err2;	std	r17,88(r3)
err2;	std	r18,96(r3)
err2;	std	r19,104(r3)
err2;	std	r20,112(r3)
err2;	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
err1;	ld	r9,32(r4)
err1;	ld	r10,40(r4)
err1;	ld	r11,48(r4)
err1;	ld	r12,56(r4)
	addi	r4,r4,64
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
err1;	std	r9,32(r3)
err1;	std	r10,40(r3)
err1;	std	r11,48(r3)
err1;	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
err1;	ld	r7,16(r4)
err1;	ld	r8,24(r4)
	addi	r4,r4,32
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
err1;	std	r7,16(r3)
err1;	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
err1;	ld	r0,0(r4)
err1;	ld	r6,8(r4)
	addi	r4,r4,16
err1;	std	r0,0(r3)
err1;	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err1;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err1;	lwz	r6,4(r4)
	addi	r4,r4,8
err1;	stw	r0,0(r3)
err1;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err1;	lwz	r0,0(r4)
	addi	r4,r4,4
err1;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err1;	lhz	r0,0(r4)
	addi	r4,r4,2
err1;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err1;	lbz	r0,0(r4)
err1;	stb	r0,0(r3)

15:	li	r3,0
	blr

#ifdef CONFIG_ALTIVEC
_GLOBAL(__copy_tofrom_user_power7_vmx)
	mflr	r0
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)

	std	r3,STK_REG(R31)(r1)
	std	r5,STK_REG(R29)(r1)
	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
err4;	lvx	v6,r4,r9
err4;	lvx	v5,r4,r10
err4;	lvx	v4,r4,r11
err4;	lvx	v3,r4,r12
err4;	lvx	v2,r4,r14
err4;	lvx	v1,r4,r15
err4;	lvx	v0,r4,r16
	addi	r4,r4,128
err4;	stvx	v7,0,r3
err4;	stvx	v6,r3,r9
err4;	stvx	v5,r3,r10
err4;	stvx	v4,r3,r11
err4;	stvx	v3,r3,r12
err4;	stvx	v2,r3,r14
err4;	stvx	v1,r3,r15
err4;	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
err3;	lvx	v2,r4,r9
err3;	lvx	v1,r4,r10
err3;	lvx	v0,r4,r11
	addi	r4,r4,64
err3;	stvx	v3,0,r3
err3;	stvx	v2,r3,r9
err3;	stvx	v1,r3,r10
err3;	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
err3;	lvx	v0,r4,r9
	addi	r4,r4,32
err3;	stvx	v1,0,r3
err3;	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	addi	r4,r4,16
err3;	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	ld	r0,0(r4)
	addi	r4,r4,8
err3;	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	li	r3,0
	blr

.Lvmx_unaligned_copy:
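	/*
	 * Source and destination are misaligned relative to each other, so
	 * align the destination and copy with aligned lvx/stvx, using VPERM
	 * to shift each pair of adjacent source vectors into place.  The
	 * loop keeps one source vector in hand, which is why r4 runs 16
	 * bytes ahead and is unwound before the final sub-16B tail copy.
	 */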
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
err3;	lbz	r0,0(r4)
	addi	r4,r4,1
err3;	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r7,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
err3;	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
	 */
	.align	5
8:
err4;	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
err4;	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
err4;	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
err4;	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
err4;	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
err4;	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
err4;	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
err4;	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
err4;	stvx	v8,0,r3
err4;	stvx	v9,r3,r9
err4;	stvx	v10,r3,r10
err4;	stvx	v11,r3,r11
err4;	stvx	v12,r3,r12
err4;	stvx	v13,r3,r14
err4;	stvx	v14,r3,r15
err4;	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
err3;	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
err3;	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
err3;	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
err3;	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
err3;	stvx	v10,r3,r10
err3;	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
err3;	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
err3;	stvx	v8,0,r3
err3;	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
err3;	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
err3;	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
err3;	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
err3;	lwz	r6,4(r4)
	addi	r4,r4,8
err3;	stw	r0,0(r3)
err3;	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
err3;	lwz	r0,0(r4)
	addi	r4,r4,4
err3;	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
err3;	lhz	r0,0(r4)
	addi	r4,r4,2
err3;	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
err3;	lbz	r0,0(r4)
err3;	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	li	r3,0
	blr
EXPORT_SYMBOL(__copy_tofrom_user_power7_vmx)
#endif /* CONFIG_ALTIVEC */