/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *
 * Copyright (C) IBM Corporation, 2012
 *
 * Author: Anton Blanchard <anton@au.ibm.com>
 */
#include <asm/ppc_asm.h>

#ifndef SELFTEST_CASE
/* 0 == don't use VMX, 1 == use VMX */
#define SELFTEST_CASE	0
#endif

#ifdef __BIG_ENDIAN__
#define LVS(VRT,RA,RB)		lvsl	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRA,VRB,VRC
#else
#define LVS(VRT,RA,RB)		lvsr	VRT,RA,RB
#define VPERM(VRT,VRA,VRB,VRC)	vperm	VRT,VRB,VRA,VRC
#endif

_GLOBAL(memcpy_power7)
	cmpldi	r5,16
	cmpldi	cr1,r5,4096
	std	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blt	.Lshort_copy

#ifdef CONFIG_ALTIVEC
test_feature = SELFTEST_CASE
BEGIN_FTR_SECTION
	bgt	cr1, .Lvmx_copy
END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
#endif

.Lnonvmx_copy:
	/* Get the source 8B aligned */
	neg	r6,r4
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-3)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	sub	r5,r5,r6
	cmpldi	r5,128
	blt	5f

	mflr	r0
	stdu	r1,-STACKFRAMESIZE(r1)
	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)
	std	r17,STK_REG(R17)(r1)
	std	r18,STK_REG(R18)(r1)
	std	r19,STK_REG(R19)(r1)
	std	r20,STK_REG(R20)(r1)
	std	r21,STK_REG(R21)(r1)
	std	r22,STK_REG(R22)(r1)
	std	r0,STACKFRAMESIZE+16(r1)

	srdi	r6,r5,7
	mtctr	r6

	/* Now do cacheline (128B) sized loads and stores. */
	.align	5
4:
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	ld	r14,64(r4)
	ld	r15,72(r4)
	ld	r16,80(r4)
	ld	r17,88(r4)
	ld	r18,96(r4)
	ld	r19,104(r4)
	ld	r20,112(r4)
	ld	r21,120(r4)
	addi	r4,r4,128
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	std	r14,64(r3)
	std	r15,72(r3)
	std	r16,80(r3)
	std	r17,88(r3)
	std	r18,96(r3)
	std	r19,104(r3)
	std	r20,112(r3)
	std	r21,120(r3)
	addi	r3,r3,128
	bdnz	4b

	clrldi	r5,r5,(64-7)

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)
	ld	r17,STK_REG(R17)(r1)
	ld	r18,STK_REG(R18)(r1)
	ld	r19,STK_REG(R19)(r1)
	ld	r20,STK_REG(R20)(r1)
	ld	r21,STK_REG(R21)(r1)
	ld	r22,STK_REG(R22)(r1)
	addi	r1,r1,STACKFRAMESIZE

	/* Up to 127B to go */
5:	srdi	r6,r5,4
	mtocrf	0x01,r6

6:	bf	cr7*4+1,7f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	ld	r9,32(r4)
	ld	r10,40(r4)
	ld	r11,48(r4)
	ld	r12,56(r4)
	addi	r4,r4,64
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	std	r9,32(r3)
	std	r10,40(r3)
	std	r11,48(r3)
	std	r12,56(r3)
	addi	r3,r3,64

	/* Up to 63B to go */
7:	bf	cr7*4+2,8f
	ld	r0,0(r4)
	ld	r6,8(r4)
	ld	r7,16(r4)
	ld	r8,24(r4)
	addi	r4,r4,32
	std	r0,0(r3)
	std	r6,8(r3)
	std	r7,16(r3)
	std	r8,24(r3)
	addi	r3,r3,32

	/* Up to 31B to go */
8:	bf	cr7*4+3,9f
	ld	r0,0(r4)
	ld	r6,8(r4)
	addi	r4,r4,16
	std	r0,0(r3)
	std	r6,8(r3)
	addi	r3,r3,16

9:	clrldi	r5,r5,(64-4)

	/* Up to 15B to go */
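	/*
	 * The tail copies below are driven by condition register bits:
	 * mtocrf 0x01,r5 moves the low four bits of the remaining length
	 * into cr7, so each "bf cr7*4+n" skips its block unless the
	 * matching size bit (8, 4, 2 or 1 bytes) is set. The alignment
	 * preambles above play the same trick on the negated address.
	 */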
.Lshort_copy:
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	blr

.Lunwind_stack_nonvmx_copy:
	addi	r1,r1,STACKFRAMESIZE
	b	.Lnonvmx_copy

.Lvmx_copy:
#ifdef CONFIG_ALTIVEC
	mflr	r0
	std	r4,-STACKFRAMESIZE+STK_REG(R30)(r1)
	std	r5,-STACKFRAMESIZE+STK_REG(R29)(r1)
	std	r0,16(r1)
	stdu	r1,-STACKFRAMESIZE(r1)
	bl	CFUNC(enter_vmx_ops)
	cmpwi	cr1,r3,0
	ld	r0,STACKFRAMESIZE+16(r1)
	ld	r3,STK_REG(R31)(r1)
	ld	r4,STK_REG(R30)(r1)
	ld	r5,STK_REG(R29)(r1)
	mtlr	r0

	/*
	 * We prefetch both the source and destination using enhanced touch
	 * instructions. We use a stream ID of 0 for the load side and
	 * 1 for the store side.
	 */
	clrrdi	r6,r4,7
	clrrdi	r9,r3,7
	ori	r9,r9,1		/* stream=1 */

	srdi	r7,r5,7		/* length in cachelines, capped at 0x3FF */
	cmpldi	r7,0x3FF
	ble	1f
	li	r7,0x3FF
1:	lis	r0,0x0E00	/* depth=7 */
	sldi	r7,r7,7
	or	r7,r7,r0
	ori	r10,r7,1	/* stream=1 */

	DCBT_SETUP_STREAMS(r6, r7, r9, r10, r8)

	beq	cr1,.Lunwind_stack_nonvmx_copy

	/*
	 * If source and destination are not relatively aligned we use a
	 * slower permute loop.
	 */
	xor	r6,r4,r3
	rldicl.	r6,r6,0,(64-4)
	bne	.Lvmx_unaligned_copy

	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
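	 * Each iteration moves one whole 128B cacheline through eight
	 * VMX registers: ctr was loaded with the cacheline count
	 * (length >> 7), and r9-r12 and r14-r16 hold the 16B offsets
	 * within the line.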
	 */
	.align	5
8:
	lvx	v7,0,r4
	lvx	v6,r4,r9
	lvx	v5,r4,r10
	lvx	v4,r4,r11
	lvx	v3,r4,r12
	lvx	v2,r4,r14
	lvx	v1,r4,r15
	lvx	v0,r4,r16
	addi	r4,r4,128
	stvx	v7,0,r3
	stvx	v6,r3,r9
	stvx	v5,r3,r10
	stvx	v4,r3,r11
	stvx	v3,r3,r12
	stvx	v2,r3,r14
	stvx	v1,r3,r15
	stvx	v0,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	lvx	v2,r4,r9
	lvx	v1,r4,r10
	lvx	v0,r4,r11
	addi	r4,r4,64
	stvx	v3,0,r3
	stvx	v2,r3,r9
	stvx	v1,r3,r10
	stvx	v0,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	lvx	v0,r4,r9
	addi	r4,r4,32
	stvx	v1,0,r3
	stvx	v0,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	addi	r4,r4,16
	stvx	v1,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	ld	r0,0(r4)
	addi	r4,r4,8
	std	r0,0(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)	/* tail call optimise */

.Lvmx_unaligned_copy:
	/* Get the destination 16B aligned */
	neg	r6,r3
	mtocrf	0x01,r6
	clrldi	r6,r6,(64-4)

	bf	cr7*4+3,1f
	lbz	r0,0(r4)
	addi	r4,r4,1
	stb	r0,0(r3)
	addi	r3,r3,1

1:	bf	cr7*4+2,2f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

2:	bf	cr7*4+1,3f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

3:	bf	cr7*4+0,4f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r7,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r7,4(r3)
	addi	r3,r3,8

4:	sub	r5,r5,r6

	/* Get the destination 128B aligned */
	neg	r6,r3
	srdi	r7,r6,4
	mtocrf	0x01,r7
	clrldi	r6,r6,(64-7)

	li	r9,16
	li	r10,32
	li	r11,48

	LVS(v16,0,r4)		/* Setup permute control vector */
	lvx	v0,0,r4
	addi	r4,r4,16

	bf	cr7*4+3,5f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16
	vor	v0,v1,v1

5:	bf	cr7*4+2,6f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

6:	bf	cr7*4+1,7f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

7:	sub	r5,r5,r6
	srdi	r6,r5,7

	std	r14,STK_REG(R14)(r1)
	std	r15,STK_REG(R15)(r1)
	std	r16,STK_REG(R16)(r1)

	li	r12,64
	li	r14,80
	li	r15,96
	li	r16,112

	mtctr	r6

	/*
	 * Now do cacheline sized loads and stores. By this stage the
	 * cacheline stores are also cacheline aligned.
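	 * Here the destination is 16B aligned but the source is not, so
	 * each store is assembled by merging two adjacent source
	 * quadwords with VPERM through the control vector LVS built in
	 * v16. v0 carries the trailing source vector into the next
	 * iteration, so every 16B of output still costs only one new
	 * load.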
	 */
	.align	5
8:
	lvx	v7,0,r4
	VPERM(v8,v0,v7,v16)
	lvx	v6,r4,r9
	VPERM(v9,v7,v6,v16)
	lvx	v5,r4,r10
	VPERM(v10,v6,v5,v16)
	lvx	v4,r4,r11
	VPERM(v11,v5,v4,v16)
	lvx	v3,r4,r12
	VPERM(v12,v4,v3,v16)
	lvx	v2,r4,r14
	VPERM(v13,v3,v2,v16)
	lvx	v1,r4,r15
	VPERM(v14,v2,v1,v16)
	lvx	v0,r4,r16
	VPERM(v15,v1,v0,v16)
	addi	r4,r4,128
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	stvx	v12,r3,r12
	stvx	v13,r3,r14
	stvx	v14,r3,r15
	stvx	v15,r3,r16
	addi	r3,r3,128
	bdnz	8b

	ld	r14,STK_REG(R14)(r1)
	ld	r15,STK_REG(R15)(r1)
	ld	r16,STK_REG(R16)(r1)

	/* Up to 127B to go */
	clrldi	r5,r5,(64-7)
	srdi	r6,r5,4
	mtocrf	0x01,r6

	bf	cr7*4+1,9f
	lvx	v3,0,r4
	VPERM(v8,v0,v3,v16)
	lvx	v2,r4,r9
	VPERM(v9,v3,v2,v16)
	lvx	v1,r4,r10
	VPERM(v10,v2,v1,v16)
	lvx	v0,r4,r11
	VPERM(v11,v1,v0,v16)
	addi	r4,r4,64
	stvx	v8,0,r3
	stvx	v9,r3,r9
	stvx	v10,r3,r10
	stvx	v11,r3,r11
	addi	r3,r3,64

9:	bf	cr7*4+2,10f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	lvx	v0,r4,r9
	VPERM(v9,v1,v0,v16)
	addi	r4,r4,32
	stvx	v8,0,r3
	stvx	v9,r3,r9
	addi	r3,r3,32

10:	bf	cr7*4+3,11f
	lvx	v1,0,r4
	VPERM(v8,v0,v1,v16)
	addi	r4,r4,16
	stvx	v8,0,r3
	addi	r3,r3,16

	/* Up to 15B to go */
11:	clrldi	r5,r5,(64-4)
	addi	r4,r4,-16	/* Unwind the +16 load offset */
	mtocrf	0x01,r5
	bf	cr7*4+0,12f
	lwz	r0,0(r4)	/* Less chance of a reject with word ops */
	lwz	r6,4(r4)
	addi	r4,r4,8
	stw	r0,0(r3)
	stw	r6,4(r3)
	addi	r3,r3,8

12:	bf	cr7*4+1,13f
	lwz	r0,0(r4)
	addi	r4,r4,4
	stw	r0,0(r3)
	addi	r3,r3,4

13:	bf	cr7*4+2,14f
	lhz	r0,0(r4)
	addi	r4,r4,2
	sth	r0,0(r3)
	addi	r3,r3,2

14:	bf	cr7*4+3,15f
	lbz	r0,0(r4)
	stb	r0,0(r3)

15:	addi	r1,r1,STACKFRAMESIZE
	ld	r3,-STACKFRAMESIZE+STK_REG(R31)(r1)
	b	CFUNC(exit_vmx_ops)	/* tail call optimise */
#endif /* CONFIG_ALTIVEC */