1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * Hardware-accelerated CRC-32 variants for Linux on z Systems 4 * 5 * Use the z/Architecture Vector Extension Facility to accelerate the 6 * computing of bitreflected CRC-32 checksums for IEEE 802.3 Ethernet 7 * and Castagnoli. 8 * 9 * This CRC-32 implementation algorithm is bitreflected and processes 10 * the least-significant bit first (Little-Endian). 11 * 12 * Copyright IBM Corp. 2015 13 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> 14 */ 15 16 #include <linux/types.h> 17 #include <asm/fpu.h> 18 #include "crc32-vx.h" 19 20 /* Vector register range containing CRC-32 constants */ 21 #define CONST_PERM_LE2BE 9 22 #define CONST_R2R1 10 23 #define CONST_R4R3 11 24 #define CONST_R5 12 25 #define CONST_RU_POLY 13 26 #define CONST_CRC_POLY 14 27 28 /* 29 * The CRC-32 constant block contains reduction constants to fold and 30 * process particular chunks of the input data stream in parallel. 31 * 32 * For the CRC-32 variants, the constants are precomputed according to 33 * these definitions: 34 * 35 * R1 = [(x4*128+32 mod P'(x) << 32)]' << 1 36 * R2 = [(x4*128-32 mod P'(x) << 32)]' << 1 37 * R3 = [(x128+32 mod P'(x) << 32)]' << 1 38 * R4 = [(x128-32 mod P'(x) << 32)]' << 1 39 * R5 = [(x64 mod P'(x) << 32)]' << 1 40 * R6 = [(x32 mod P'(x) << 32)]' << 1 41 * 42 * The bitreflected Barret reduction constant, u', is defined as 43 * the bit reversal of floor(x**64 / P(x)). 44 * 45 * where P(x) is the polynomial in the normal domain and the P'(x) is the 46 * polynomial in the reversed (bitreflected) domain. 47 * 48 * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: 49 * 50 * P(x) = 0x04C11DB7 51 * P'(x) = 0xEDB88320 52 * 53 * CRC-32C (Castagnoli) polynomials: 54 * 55 * P(x) = 0x1EDC6F41 56 * P'(x) = 0x82F63B78 57 */ 58 59 static unsigned long constants_CRC_32_LE[] = { 60 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ 61 0x1c6e41596, 0x154442bd4, /* R2, R1 */ 62 0x0ccaa009e, 0x1751997d0, /* R4, R3 */ 63 0x0, 0x163cd6124, /* R5 */ 64 0x0, 0x1f7011641, /* u' */ 65 0x0, 0x1db710641 /* P'(x) << 1 */ 66 }; 67 68 static unsigned long constants_CRC_32C_LE[] = { 69 0x0f0e0d0c0b0a0908, 0x0706050403020100, /* BE->LE mask */ 70 0x09e4addf8, 0x740eef02, /* R2, R1 */ 71 0x14cd00bd6, 0xf20c0dfe, /* R4, R3 */ 72 0x0, 0x0dd45aab8, /* R5 */ 73 0x0, 0x0dea713f1, /* u' */ 74 0x0, 0x105ec76f0 /* P'(x) << 1 */ 75 }; 76 77 /** 78 * crc32_le_vgfm_generic - Compute CRC-32 (LE variant) with vector registers 79 * @crc: Initial CRC value, typically ~0. 80 * @buf: Input buffer pointer, performance might be improved if the 81 * buffer is on a doubleword boundary. 82 * @size: Size of the buffer, must be 64 bytes or greater. 83 * @constants: CRC-32 constant pool base pointer. 84 * 85 * Register usage: 86 * V0: Initial CRC value and intermediate constants and results. 87 * V1..V4: Data for CRC computation. 88 * V5..V8: Next data chunks that are fetched from the input buffer. 89 * V9: Constant for BE->LE conversion and shift operations 90 * V10..V14: CRC-32 constants. 91 */ 92 static u32 crc32_le_vgfm_generic(u32 crc, unsigned char const *buf, size_t size, unsigned long *constants) 93 { 94 /* Load CRC-32 constants */ 95 fpu_vlm(CONST_PERM_LE2BE, CONST_CRC_POLY, constants); 96 97 /* 98 * Load the initial CRC value. 99 * 100 * The CRC value is loaded into the rightmost word of the 101 * vector register and is later XORed with the LSB portion 102 * of the loaded input data. 103 */ 104 fpu_vzero(0); /* Clear V0 */ 105 fpu_vlvgf(0, crc, 3); /* Load CRC into rightmost word */ 106 107 /* Load a 64-byte data chunk and XOR with CRC */ 108 fpu_vlm(1, 4, buf); 109 fpu_vperm(1, 1, 1, CONST_PERM_LE2BE); 110 fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); 111 fpu_vperm(3, 3, 3, CONST_PERM_LE2BE); 112 fpu_vperm(4, 4, 4, CONST_PERM_LE2BE); 113 114 fpu_vx(1, 0, 1); /* V1 ^= CRC */ 115 buf += 64; 116 size -= 64; 117 118 while (size >= 64) { 119 fpu_vlm(5, 8, buf); 120 fpu_vperm(5, 5, 5, CONST_PERM_LE2BE); 121 fpu_vperm(6, 6, 6, CONST_PERM_LE2BE); 122 fpu_vperm(7, 7, 7, CONST_PERM_LE2BE); 123 fpu_vperm(8, 8, 8, CONST_PERM_LE2BE); 124 /* 125 * Perform a GF(2) multiplication of the doublewords in V1 with 126 * the R1 and R2 reduction constants in V0. The intermediate 127 * result is then folded (accumulated) with the next data chunk 128 * in V5 and stored in V1. Repeat this step for the register 129 * contents in V2, V3, and V4 respectively. 130 */ 131 fpu_vgfmag(1, CONST_R2R1, 1, 5); 132 fpu_vgfmag(2, CONST_R2R1, 2, 6); 133 fpu_vgfmag(3, CONST_R2R1, 3, 7); 134 fpu_vgfmag(4, CONST_R2R1, 4, 8); 135 buf += 64; 136 size -= 64; 137 } 138 139 /* 140 * Fold V1 to V4 into a single 128-bit value in V1. Multiply V1 with R3 141 * and R4 and accumulating the next 128-bit chunk until a single 128-bit 142 * value remains. 143 */ 144 fpu_vgfmag(1, CONST_R4R3, 1, 2); 145 fpu_vgfmag(1, CONST_R4R3, 1, 3); 146 fpu_vgfmag(1, CONST_R4R3, 1, 4); 147 148 while (size >= 16) { 149 fpu_vl(2, buf); 150 fpu_vperm(2, 2, 2, CONST_PERM_LE2BE); 151 fpu_vgfmag(1, CONST_R4R3, 1, 2); 152 buf += 16; 153 size -= 16; 154 } 155 156 /* 157 * Set up a vector register for byte shifts. The shift value must 158 * be loaded in bits 1-4 in byte element 7 of a vector register. 159 * Shift by 8 bytes: 0x40 160 * Shift by 4 bytes: 0x20 161 */ 162 fpu_vleib(9, 0x40, 7); 163 164 /* 165 * Prepare V0 for the next GF(2) multiplication: shift V0 by 8 bytes 166 * to move R4 into the rightmost doubleword and set the leftmost 167 * doubleword to 0x1. 168 */ 169 fpu_vsrlb(0, CONST_R4R3, 9); 170 fpu_vleig(0, 1, 0); 171 172 /* 173 * Compute GF(2) product of V1 and V0. The rightmost doubleword 174 * of V1 is multiplied with R4. The leftmost doubleword of V1 is 175 * multiplied by 0x1 and is then XORed with rightmost product. 176 * Implicitly, the intermediate leftmost product becomes padded 177 */ 178 fpu_vgfmg(1, 0, 1); 179 180 /* 181 * Now do the final 32-bit fold by multiplying the rightmost word 182 * in V1 with R5 and XOR the result with the remaining bits in V1. 183 * 184 * To achieve this by a single VGFMAG, right shift V1 by a word 185 * and store the result in V2 which is then accumulated. Use the 186 * vector unpack instruction to load the rightmost half of the 187 * doubleword into the rightmost doubleword element of V1; the other 188 * half is loaded in the leftmost doubleword. 189 * The vector register with CONST_R5 contains the R5 constant in the 190 * rightmost doubleword and the leftmost doubleword is zero to ignore 191 * the leftmost product of V1. 192 */ 193 fpu_vleib(9, 0x20, 7); /* Shift by words */ 194 fpu_vsrlb(2, 1, 9); /* Store remaining bits in V2 */ 195 fpu_vupllf(1, 1); /* Split rightmost doubleword */ 196 fpu_vgfmag(1, CONST_R5, 1, 2); /* V1 = (V1 * R5) XOR V2 */ 197 198 /* 199 * Apply a Barret reduction to compute the final 32-bit CRC value. 200 * 201 * The input values to the Barret reduction are the degree-63 polynomial 202 * in V1 (R(x)), degree-32 generator polynomial, and the reduction 203 * constant u. The Barret reduction result is the CRC value of R(x) mod 204 * P(x). 205 * 206 * The Barret reduction algorithm is defined as: 207 * 208 * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u 209 * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) 210 * 3. C(x) = R(x) XOR T2(x) mod x^32 211 * 212 * Note: The leftmost doubleword of vector register containing 213 * CONST_RU_POLY is zero and, thus, the intermediate GF(2) product 214 * is zero and does not contribute to the final result. 215 */ 216 217 /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ 218 fpu_vupllf(2, 1); 219 fpu_vgfmg(2, CONST_RU_POLY, 2); 220 221 /* 222 * Compute the GF(2) product of the CRC polynomial with T1(x) in 223 * V2 and XOR the intermediate result, T2(x), with the value in V1. 224 * The final result is stored in word element 2 of V2. 225 */ 226 fpu_vupllf(2, 2); 227 fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); 228 229 return fpu_vlgvf(2, 2); 230 } 231 232 u32 crc32_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) 233 { 234 return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32_LE[0]); 235 } 236 237 u32 crc32c_le_vgfm_16(u32 crc, unsigned char const *buf, size_t size) 238 { 239 return crc32_le_vgfm_generic(crc, buf, size, &constants_CRC_32C_LE[0]); 240 } 241