/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Core of the accelerated CRC algorithm.
 * In your file, define the constants and CRC_FUNCTION_NAME,
 * then include this file.
 *
 * Calculate the checksum of data that is 16-byte aligned and a multiple of
 * 16 bytes.
 *
 * The first step is to reduce it to 1024 bits. We do this in 8 parallel
 * chunks in order to mask the latency of the vpmsum instructions. If we
 * have more than 32 kB of data to checksum we repeat this step multiple
 * times, passing in the previous 1024 bits.
 *
 * The next step is to reduce the 1024 bits to 64 bits. This step adds
 * 32 bits of 0s to the end - this matches what a CRC does. We just
 * calculate constants that land the data in these 32 bits.
 *
 * We then use fixed point Barrett reduction to compute a mod n over GF(2)
 * for n = CRC using POWER8 instructions. We use x = 32.
 *
 * https://en.wikipedia.org/wiki/Barrett_reduction
 *
 * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM
 */

#include <asm/ppc_asm.h>
#include <asm/ppc-opcode.h>

#define MAX_SIZE	32768

	.text

#if defined(__BIG_ENDIAN__) && defined(REFLECT)
#define BYTESWAP_DATA
#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT)
#define BYTESWAP_DATA
#else
#undef BYTESWAP_DATA
#endif

#define off16		r25
#define off32		r26
#define off48		r27
#define off64		r28
#define off80		r29
#define off96		r30
#define off112		r31

#define const1		v24
#define const2		v25

#define byteswap	v26
#define mask_32bit	v27
#define mask_64bit	v28
#define zeroes		v29

#ifdef BYTESWAP_DATA
#define VPERM(A, B, C, D) vperm A, B, C, D
#else
#define VPERM(A, B, C, D)
#endif

/* unsigned int CRC_FUNCTION_NAME(unsigned int crc, void *p, unsigned long len) */
FUNC_START(CRC_FUNCTION_NAME)
	std	r31,-8(r1)
	std	r30,-16(r1)
	std	r29,-24(r1)
	std	r28,-32(r1)
	std	r27,-40(r1)
	std	r26,-48(r1)
	std	r25,-56(r1)

	li	off16,16
	li	off32,32
	li	off48,48
	li	off64,64
	li	off80,80
	li	off96,96
	li	off112,112
	li	r0,0

	/* Enough room for saving 10 non-volatile VMX registers */
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	stvx	v20,0,r6
	stvx	v21,off16,r6
	stvx	v22,off32,r6
	stvx	v23,off48,r6
	stvx	v24,off64,r6
	stvx	v25,off80,r6
	stvx	v26,off96,r6
	stvx	v27,off112,r6
	stvx	v28,0,r7
	stvx	v29,off16,r7

	mr	r10,r3

	vxor	zeroes,zeroes,zeroes
	vspltisw v0,-1

	vsldoi	mask_32bit,zeroes,v0,4
	vsldoi	mask_64bit,zeroes,v0,8

	/* Get the initial value into v8 */
	vxor	v8,v8,v8
	MTVRD(v8, R3)
#ifdef REFLECT
	vsldoi	v8,zeroes,v8,8	/* shift into bottom 32 bits */
#else
	vsldoi	v8,v8,zeroes,4	/* shift into top 32 bits */
#endif

#ifdef BYTESWAP_DATA
	LOAD_REG_ADDR(r3, .byteswap_constant)
	lvx	byteswap,0,r3
	addi	r3,r3,16
#endif

	cmpdi	r5,256
	blt	.Lshort

	rldicr	r6,r5,0,56

	/* Checksum in blocks of MAX_SIZE */
1:	lis	r7,MAX_SIZE@h
	ori	r7,r7,MAX_SIZE@l
	mr	r9,r7
	cmpd	r6,r7
	bgt	2f
	mr	r7,r6
2:	subf	r6,r7,r6

	/* our main loop does 128 bytes at a time */
	srdi	r7,r7,7

	/*
	 * Work out the offset into the constants table to start at. Each
	 * constant is 16 bytes, and it is used against 128 bytes of input
	 * data - 128 / 16 = 8
	 */
	sldi	r8,r7,4
	srdi	r9,r9,3
	subf	r8,r8,r9
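
	/*
	 * r8 is now the byte offset into the constants table: the table is
	 * sized for a full MAX_SIZE block, so a shorter final block starts
	 * partway into it and finishes at the same place a full block would.
	 */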

	/* We reduce our final 128 bytes in a separate step */
	addi	r7,r7,-1
	mtctr	r7

	LOAD_REG_ADDR(r3, .constants)

	/* Find the start of our constants */
	add	r3,r3,r8

	/* zero v0-v7 which will contain our checksums */
	vxor	v0,v0,v0
	vxor	v1,v1,v1
	vxor	v2,v2,v2
	vxor	v3,v3,v3
	vxor	v4,v4,v4
	vxor	v5,v5,v5
	vxor	v6,v6,v6
	vxor	v7,v7,v7

	lvx	const1,0,r3

	/*
	 * If we are looping back to consume more data we use the values
	 * already in v16-v23.
	 */
	cmpdi	r0,1
	beq	2f

	/* First warm up pass */
	lvx	v16,0,r4
	lvx	v17,off16,r4
	VPERM(v16,v16,v16,byteswap)
	VPERM(v17,v17,v17,byteswap)
	lvx	v18,off32,r4
	lvx	v19,off48,r4
	VPERM(v18,v18,v18,byteswap)
	VPERM(v19,v19,v19,byteswap)
	lvx	v20,off64,r4
	lvx	v21,off80,r4
	VPERM(v20,v20,v20,byteswap)
	VPERM(v21,v21,v21,byteswap)
	lvx	v22,off96,r4
	lvx	v23,off112,r4
	VPERM(v22,v22,v22,byteswap)
	VPERM(v23,v23,v23,byteswap)
	addi	r4,r4,8*16

	/* xor in initial value */
	vxor	v16,v16,v8

2:	bdz	.Lfirst_warm_up_done

	addi	r3,r3,16
	lvx	const2,0,r3

	/* Second warm up pass */
	VPMSUMD(v8,v16,const1)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	VPMSUMD(v9,v17,const1)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	VPMSUMD(v10,v18,const1)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	VPMSUMD(v11,v19,const1)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	ori	r2,r2,0

	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdz	.Lfirst_cool_down
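
	/*
	 * At this point v8-v15 hold the vpmsum products of the previous
	 * 128 bytes and v16-v23 hold the freshly loaded next 128 bytes.
	 */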

	/*
	 * main loop. We modulo schedule it such that it takes three iterations
	 * to complete - first iteration load, second iteration vpmsum, third
	 * iteration xor.
	 */
	.balign	16
4:	lvx	const1,0,r3
	addi	r3,r3,16
	ori	r2,r2,0

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const2)
	lvx	v16,0,r4
	VPERM(v16,v16,v16,byteswap)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const2)
	lvx	v17,off16,r4
	VPERM(v17,v17,v17,byteswap)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const2)
	lvx	v18,off32,r4
	VPERM(v18,v18,v18,byteswap)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const2)
	lvx	v19,off48,r4
	VPERM(v19,v19,v19,byteswap)
	lvx	const2,0,r3
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	lvx	v20,off64,r4
	VPERM(v20,v20,v20,byteswap)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	lvx	v21,off80,r4
	VPERM(v21,v21,v21,byteswap)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	lvx	v22,off96,r4
	VPERM(v22,v22,v22,byteswap)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	lvx	v23,off112,r4
	VPERM(v23,v23,v23,byteswap)

	addi	r4,r4,8*16

	bdnz	4b

.Lfirst_cool_down:
	/* First cool down pass */
	lvx	const1,0,r3
	addi	r3,r3,16

	vxor	v0,v0,v8
	VPMSUMD(v8,v16,const1)
	ori	r2,r2,0

	vxor	v1,v1,v9
	VPMSUMD(v9,v17,const1)
	ori	r2,r2,0

	vxor	v2,v2,v10
	VPMSUMD(v10,v18,const1)
	ori	r2,r2,0

	vxor	v3,v3,v11
	VPMSUMD(v11,v19,const1)
	ori	r2,r2,0

	vxor	v4,v4,v12
	VPMSUMD(v12,v20,const1)
	ori	r2,r2,0

	vxor	v5,v5,v13
	VPMSUMD(v13,v21,const1)
	ori	r2,r2,0

	vxor	v6,v6,v14
	VPMSUMD(v14,v22,const1)
	ori	r2,r2,0

	vxor	v7,v7,v15
	VPMSUMD(v15,v23,const1)
	ori	r2,r2,0

.Lsecond_cool_down:
	/* Second cool down pass */
	vxor	v0,v0,v8
	vxor	v1,v1,v9
	vxor	v2,v2,v10
	vxor	v3,v3,v11
	vxor	v4,v4,v12
	vxor	v5,v5,v13
	vxor	v6,v6,v14
	vxor	v7,v7,v15

#ifdef REFLECT
	/*
	 * vpmsumd produces a 96 bit result in the least significant bits
	 * of the register. Since we are bit reflected we have to shift it
	 * left 32 bits so it occupies the least significant bits in the
	 * bit reflected domain.
	 */
	vsldoi	v0,v0,zeroes,4
	vsldoi	v1,v1,zeroes,4
	vsldoi	v2,v2,zeroes,4
	vsldoi	v3,v3,zeroes,4
	vsldoi	v4,v4,zeroes,4
	vsldoi	v5,v5,zeroes,4
	vsldoi	v6,v6,zeroes,4
	vsldoi	v7,v7,zeroes,4
#endif

	/* xor with last 1024 bits */
	lvx	v8,0,r4
	lvx	v9,off16,r4
	VPERM(v8,v8,v8,byteswap)
	VPERM(v9,v9,v9,byteswap)
	lvx	v10,off32,r4
	lvx	v11,off48,r4
	VPERM(v10,v10,v10,byteswap)
	VPERM(v11,v11,v11,byteswap)
	lvx	v12,off64,r4
	lvx	v13,off80,r4
	VPERM(v12,v12,v12,byteswap)
	VPERM(v13,v13,v13,byteswap)
	lvx	v14,off96,r4
	lvx	v15,off112,r4
	VPERM(v14,v14,v14,byteswap)
	VPERM(v15,v15,v15,byteswap)

	addi	r4,r4,8*16

	vxor	v16,v0,v8
	vxor	v17,v1,v9
	vxor	v18,v2,v10
	vxor	v19,v3,v11
	vxor	v20,v4,v12
	vxor	v21,v5,v13
	vxor	v22,v6,v14
	vxor	v23,v7,v15

	li	r0,1
	cmpdi	r6,0
	addi	r6,r6,128
	bne	1b
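
	/*
	 * All complete 128-byte blocks have now been folded into the
	 * 1024-bit state in v16-v23; at most 127 bytes of input remain.
	 */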

	/* Work out how many bytes we have left */
	andi.	r5,r5,127

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,128
	add	r3,r3,r6

	/* How many 16 byte chunks are in the tail */
	srdi	r7,r5,4
	mtctr	r7

	/*
	 * Reduce the previously calculated 1024 bits to 64 bits, shifting
	 * 32 bits to include the trailing 32 bits of zeros
	 */
	lvx	v0,0,r3
	lvx	v1,off16,r3
	lvx	v2,off32,r3
	lvx	v3,off48,r3
	lvx	v4,off64,r3
	lvx	v5,off80,r3
	lvx	v6,off96,r3
	lvx	v7,off112,r3
	addi	r3,r3,8*16

	VPMSUMW(v0,v16,v0)
	VPMSUMW(v1,v17,v1)
	VPMSUMW(v2,v18,v2)
	VPMSUMW(v3,v19,v3)
	VPMSUMW(v4,v20,v4)
	VPMSUMW(v5,v21,v5)
	VPMSUMW(v6,v22,v6)
	VPMSUMW(v7,v23,v7)

	/* Now reduce the tail (0 - 112 bytes) */
	cmpdi	r7,0
	beq	1f

	lvx	v16,0,r4
	lvx	v17,0,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off16,r4
	lvx	v17,off16,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off32,r4
	lvx	v17,off32,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off48,r4
	lvx	v17,off48,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off64,r4
	lvx	v17,off64,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off80,r4
	lvx	v17,off80,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16
	bdz	1f

	lvx	v16,off96,r4
	lvx	v17,off96,r3
	VPERM(v16,v16,v16,byteswap)
	VPMSUMW(v16,v16,v17)
	vxor	v0,v0,v16

	/* Now xor all the parallel chunks together */
1:	vxor	v0,v0,v1
	vxor	v2,v2,v3
	vxor	v4,v4,v5
	vxor	v6,v6,v7

	vxor	v0,v0,v2
	vxor	v4,v4,v6

	vxor	v0,v0,v4

.Lbarrett_reduction:
	/* Barrett constants */
	LOAD_REG_ADDR(r3, .barrett_constants)

	lvx	const1,0,r3
	lvx	const2,off16,r3

	vsldoi	v1,v0,v0,8
	vxor	v0,v0,v1	/* xor two 64 bit results together */

#ifdef REFLECT
	/* shift left one bit */
	vspltisb v1,1
	vsl	v0,v0,v1
#endif

	vand	v0,v0,mask_64bit
#ifndef REFLECT
	/*
	 * Now for the Barrett reduction algorithm. The idea is to calculate q,
	 * the multiple of our polynomial that we need to subtract. By
	 * doing the computation 2x bits higher (ie 64 bits) and shifting the
	 * result back down 2x bits, we round down to the nearest multiple.
	 */
	VPMSUMD(v1,v0,const1)	/* ma */
	vsldoi	v1,zeroes,v1,8	/* q = floor(ma/(2^64)) */
	VPMSUMD(v1,v1,const2)	/* qn */
	vxor	v0,v0,v1	/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Get the result into r3. We need to shift it left 8 bytes:
	 * V0 [ 0 1 2 X ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,8	/* shift result into top 64 bits */
#else
	/*
	 * The reflected version of Barrett reduction. Instead of bit
	 * reflecting our data (which is expensive to do), we bit reflect our
	 * constants and our algorithm, which means the intermediate data in
	 * our vector registers goes from 0-63 instead of 63-0. We can reflect
	 * the algorithm because we don't carry in mod 2 arithmetic.
	 */
	vand	v1,v0,mask_32bit	/* bottom 32 bits of a */
	VPMSUMD(v1,v1,const1)		/* ma */
	vand	v1,v1,mask_32bit	/* bottom 32 bits of ma */
	VPMSUMD(v1,v1,const2)		/* qn */
	vxor	v0,v0,v1		/* a - qn, subtraction is xor in GF(2) */

	/*
	 * Since we are bit reflected, the result (ie the low 32 bits) is in
	 * the high 32 bits. We just need to shift it left 4 bytes
	 * V0 [ 0 1 X 3 ]
	 * V0 [ 0 X 2 3 ]
	 */
	vsldoi	v0,v0,zeroes,4	/* shift result into top 64 bits */
#endif

	/* Get it into r3 */
	MFVRD(R3, v0)

.Lout:
	subi	r6,r1,56+10*16
	subi	r7,r1,56+2*16

	lvx	v20,0,r6
	lvx	v21,off16,r6
	lvx	v22,off32,r6
	lvx	v23,off48,r6
	lvx	v24,off64,r6
	lvx	v25,off80,r6
	lvx	v26,off96,r6
	lvx	v27,off112,r6
	lvx	v28,0,r7
	lvx	v29,off16,r7

	ld	r31,-8(r1)
	ld	r30,-16(r1)
	ld	r29,-24(r1)
	ld	r28,-32(r1)
	ld	r27,-40(r1)
	ld	r26,-48(r1)
	ld	r25,-56(r1)

	blr

.Lfirst_warm_up_done:
	lvx	const1,0,r3
	addi	r3,r3,16

	VPMSUMD(v8,v16,const1)
	VPMSUMD(v9,v17,const1)
	VPMSUMD(v10,v18,const1)
	VPMSUMD(v11,v19,const1)
	VPMSUMD(v12,v20,const1)
	VPMSUMD(v13,v21,const1)
	VPMSUMD(v14,v22,const1)
	VPMSUMD(v15,v23,const1)

	b	.Lsecond_cool_down

.Lshort:
	cmpdi	r5,0
	beq	.Lzero

	LOAD_REG_ADDR(r3, .short_constants)

	/* Calculate where in the constant table we need to start */
	subfic	r6,r5,256
	add	r3,r3,r6

	/* How many 16 byte chunks? */
	srdi	r7,r5,4
	mtctr	r7

	vxor	v19,v19,v19
	vxor	v20,v20,v20

	lvx	v0,0,r4
	lvx	v16,0,r3
	VPERM(v0,v0,v16,byteswap)
	vxor	v0,v0,v8	/* xor in initial value */
	VPMSUMW(v0,v0,v16)
	bdz	.Lv0

	lvx	v1,off16,r4
	lvx	v17,off16,r3
	VPERM(v1,v1,v17,byteswap)
	VPMSUMW(v1,v1,v17)
	bdz	.Lv1

	lvx	v2,off32,r4
	lvx	v16,off32,r3
	VPERM(v2,v2,v16,byteswap)
	VPMSUMW(v2,v2,v16)
	bdz	.Lv2

	lvx	v3,off48,r4
	lvx	v17,off48,r3
	VPERM(v3,v3,v17,byteswap)
	VPMSUMW(v3,v3,v17)
	bdz	.Lv3

	lvx	v4,off64,r4
	lvx	v16,off64,r3
	VPERM(v4,v4,v16,byteswap)
	VPMSUMW(v4,v4,v16)
	bdz	.Lv4

	lvx	v5,off80,r4
	lvx	v17,off80,r3
	VPERM(v5,v5,v17,byteswap)
	VPMSUMW(v5,v5,v17)
	bdz	.Lv5

	lvx	v6,off96,r4
	lvx	v16,off96,r3
	VPERM(v6,v6,v16,byteswap)
	VPMSUMW(v6,v6,v16)
	bdz	.Lv6

	lvx	v7,off112,r4
	lvx	v17,off112,r3
	VPERM(v7,v7,v17,byteswap)
	VPMSUMW(v7,v7,v17)
	bdz	.Lv7

	addi	r3,r3,128
	addi	r4,r4,128

	lvx	v8,0,r4
	lvx	v16,0,r3
	VPERM(v8,v8,v16,byteswap)
	VPMSUMW(v8,v8,v16)
	bdz	.Lv8

	lvx	v9,off16,r4
	lvx	v17,off16,r3
	VPERM(v9,v9,v17,byteswap)
	VPMSUMW(v9,v9,v17)
	bdz	.Lv9

	lvx	v10,off32,r4
	lvx	v16,off32,r3
	VPERM(v10,v10,v16,byteswap)
	VPMSUMW(v10,v10,v16)
	bdz	.Lv10

	lvx	v11,off48,r4
	lvx	v17,off48,r3
	VPERM(v11,v11,v17,byteswap)
	VPMSUMW(v11,v11,v17)
	bdz	.Lv11

	lvx	v12,off64,r4
	lvx	v16,off64,r3
	VPERM(v12,v12,v16,byteswap)
	VPMSUMW(v12,v12,v16)
	bdz	.Lv12

	lvx	v13,off80,r4
	lvx	v17,off80,r3
	VPERM(v13,v13,v17,byteswap)
	VPMSUMW(v13,v13,v17)
	bdz	.Lv13

	lvx	v14,off96,r4
	lvx	v16,off96,r3
	VPERM(v14,v14,v16,byteswap)
	VPMSUMW(v14,v14,v16)
	bdz	.Lv14

	lvx	v15,off112,r4
	lvx	v17,off112,r3
	VPERM(v15,v15,v17,byteswap)
	VPMSUMW(v15,v15,v17)
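
	/*
	 * The bdz branches above jump into this ladder so that only the
	 * chunks actually processed get xored in, alternating between the
	 * two accumulators v19 and v20.
	 */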

.Lv15:	vxor	v19,v19,v15
.Lv14:	vxor	v20,v20,v14
.Lv13:	vxor	v19,v19,v13
.Lv12:	vxor	v20,v20,v12
.Lv11:	vxor	v19,v19,v11
.Lv10:	vxor	v20,v20,v10
.Lv9:	vxor	v19,v19,v9
.Lv8:	vxor	v20,v20,v8
.Lv7:	vxor	v19,v19,v7
.Lv6:	vxor	v20,v20,v6
.Lv5:	vxor	v19,v19,v5
.Lv4:	vxor	v20,v20,v4
.Lv3:	vxor	v19,v19,v3
.Lv2:	vxor	v20,v20,v2
.Lv1:	vxor	v19,v19,v1
.Lv0:	vxor	v20,v20,v0

	vxor	v0,v19,v20

	b	.Lbarrett_reduction

.Lzero:
	mr	r3,r10
	b	.Lout

FUNC_END(CRC_FUNCTION_NAME)
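
/*
 * For reference, a rough C model of the non-reflected Barrett step performed
 * at .Lbarrett_reduction above. This is illustrative only: the helper names
 * below are made up for this sketch, and the real code does the carry-less
 * multiplies with vpmsumd rather than a bit loop.
 *
 *	#include <stdint.h>
 *
 *	// 64x64 -> 128 bit carry-less multiply, bit-at-a-time reference
 *	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *	{
 *		uint64_t h = 0, l = 0;
 *		int i;
 *
 *		for (i = 0; i < 64; i++) {
 *			if ((b >> i) & 1) {
 *				l ^= a << i;
 *				if (i)
 *					h ^= a >> (64 - i);
 *			}
 *		}
 *		*hi = h;
 *		*lo = l;
 *	}
 *
 *	// a:  64-bit value to reduce (data with 32 bits of zeros appended)
 *	// n:  the CRC polynomial, including the x^32 term
 *	// mu: the Barrett constant floor(x^64 / n), i.e. const1 above
 *	static uint32_t barrett(uint64_t a, uint64_t n, uint64_t mu)
 *	{
 *		uint64_t hi, lo;
 *
 *		clmul64(a, mu, &hi, &lo);	// ma
 *		clmul64(hi, n, &hi, &lo);	// qn, where q = floor(ma / 2^64)
 *		return (uint32_t)(a ^ lo);	// a - qn; subtraction is xor in GF(2)
 *	}
 */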