1 /* SPDX-License-Identifier: GPL-2.0 */ 2 /* 3 * Hardware-accelerated CRC-32 variants for Linux on z Systems 4 * 5 * Use the z/Architecture Vector Extension Facility to accelerate the 6 * computing of CRC-32 checksums. 7 * 8 * This CRC-32 implementation algorithm processes the most-significant 9 * bit first (BE). 10 * 11 * Copyright IBM Corp. 2015 12 * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com> 13 */ 14 15 #include <linux/types.h> 16 #include <asm/fpu.h> 17 #include "crc32-vx.h" 18 19 /* Vector register range containing CRC-32 constants */ 20 #define CONST_R1R2 9 21 #define CONST_R3R4 10 22 #define CONST_R5 11 23 #define CONST_R6 12 24 #define CONST_RU_POLY 13 25 #define CONST_CRC_POLY 14 26 27 /* 28 * The CRC-32 constant block contains reduction constants to fold and 29 * process particular chunks of the input data stream in parallel. 30 * 31 * For the CRC-32 variants, the constants are precomputed according to 32 * these definitions: 33 * 34 * R1 = x4*128+64 mod P(x) 35 * R2 = x4*128 mod P(x) 36 * R3 = x128+64 mod P(x) 37 * R4 = x128 mod P(x) 38 * R5 = x96 mod P(x) 39 * R6 = x64 mod P(x) 40 * 41 * Barret reduction constant, u, is defined as floor(x**64 / P(x)). 42 * 43 * where P(x) is the polynomial in the normal domain and the P'(x) is the 44 * polynomial in the reversed (bitreflected) domain. 45 * 46 * Note that the constant definitions below are extended in order to compute 47 * intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. 48 * The rightmost doubleword can be 0 to prevent contribution to the result or 49 * can be multiplied by 1 to perform an XOR without the need for a separate 50 * VECTOR EXCLUSIVE OR instruction. 51 * 52 * CRC-32 (IEEE 802.3 Ethernet, ...) polynomials: 53 * 54 * P(x) = 0x04C11DB7 55 * P'(x) = 0xEDB88320 56 */ 57 58 static unsigned long constants_CRC_32_BE[] = { 59 0x08833794c, 0x0e6228b11, /* R1, R2 */ 60 0x0c5b9cd4c, 0x0e8a45605, /* R3, R4 */ 61 0x0f200aa66, 1UL << 32, /* R5, x32 */ 62 0x0490d678d, 1, /* R6, 1 */ 63 0x104d101df, 0, /* u */ 64 0x104C11DB7, 0, /* P(x) */ 65 }; 66 67 /** 68 * crc32_be_vgfm_16 - Compute CRC-32 (BE variant) with vector registers 69 * @crc: Initial CRC value, typically ~0. 70 * @buf: Input buffer pointer, performance might be improved if the 71 * buffer is on a doubleword boundary. 72 * @size: Size of the buffer, must be 64 bytes or greater. 73 * 74 * Register usage: 75 * V0: Initial CRC value and intermediate constants and results. 76 * V1..V4: Data for CRC computation. 77 * V5..V8: Next data chunks that are fetched from the input buffer. 78 * V9..V14: CRC-32 constants. 79 */ 80 u32 crc32_be_vgfm_16(u32 crc, unsigned char const *buf, size_t size) 81 { 82 /* Load CRC-32 constants */ 83 fpu_vlm(CONST_R1R2, CONST_CRC_POLY, &constants_CRC_32_BE); 84 fpu_vzero(0); 85 86 /* Load the initial CRC value into the leftmost word of V0. */ 87 fpu_vlvgf(0, crc, 0); 88 89 /* Load a 64-byte data chunk and XOR with CRC */ 90 fpu_vlm(1, 4, buf); 91 fpu_vx(1, 0, 1); 92 buf += 64; 93 size -= 64; 94 95 while (size >= 64) { 96 /* Load the next 64-byte data chunk into V5 to V8 */ 97 fpu_vlm(5, 8, buf); 98 99 /* 100 * Perform a GF(2) multiplication of the doublewords in V1 with 101 * the reduction constants in V0. The intermediate result is 102 * then folded (accumulated) with the next data chunk in V5 and 103 * stored in V1. Repeat this step for the register contents 104 * in V2, V3, and V4 respectively. 105 */ 106 fpu_vgfmag(1, CONST_R1R2, 1, 5); 107 fpu_vgfmag(2, CONST_R1R2, 2, 6); 108 fpu_vgfmag(3, CONST_R1R2, 3, 7); 109 fpu_vgfmag(4, CONST_R1R2, 4, 8); 110 buf += 64; 111 size -= 64; 112 } 113 114 /* Fold V1 to V4 into a single 128-bit value in V1 */ 115 fpu_vgfmag(1, CONST_R3R4, 1, 2); 116 fpu_vgfmag(1, CONST_R3R4, 1, 3); 117 fpu_vgfmag(1, CONST_R3R4, 1, 4); 118 119 while (size >= 16) { 120 fpu_vl(2, buf); 121 fpu_vgfmag(1, CONST_R3R4, 1, 2); 122 buf += 16; 123 size -= 16; 124 } 125 126 /* 127 * The R5 constant is used to fold a 128-bit value into an 96-bit value 128 * that is XORed with the next 96-bit input data chunk. To use a single 129 * VGFMG instruction, multiply the rightmost 64-bit with x^32 (1<<32) to 130 * form an intermediate 96-bit value (with appended zeros) which is then 131 * XORed with the intermediate reduction result. 132 */ 133 fpu_vgfmg(1, CONST_R5, 1); 134 135 /* 136 * Further reduce the remaining 96-bit value to a 64-bit value using a 137 * single VGFMG, the rightmost doubleword is multiplied with 0x1. The 138 * intermediate result is then XORed with the product of the leftmost 139 * doubleword with R6. The result is a 64-bit value and is subject to 140 * the Barret reduction. 141 */ 142 fpu_vgfmg(1, CONST_R6, 1); 143 144 /* 145 * The input values to the Barret reduction are the degree-63 polynomial 146 * in V1 (R(x)), degree-32 generator polynomial, and the reduction 147 * constant u. The Barret reduction result is the CRC value of R(x) mod 148 * P(x). 149 * 150 * The Barret reduction algorithm is defined as: 151 * 152 * 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u 153 * 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) 154 * 3. C(x) = R(x) XOR T2(x) mod x^32 155 * 156 * Note: To compensate the division by x^32, use the vector unpack 157 * instruction to move the leftmost word into the leftmost doubleword 158 * of the vector register. The rightmost doubleword is multiplied 159 * with zero to not contribute to the intermediate results. 160 */ 161 162 /* T1(x) = floor( R(x) / x^32 ) GF2MUL u */ 163 fpu_vupllf(2, 1); 164 fpu_vgfmg(2, CONST_RU_POLY, 2); 165 166 /* 167 * Compute the GF(2) product of the CRC polynomial in VO with T1(x) in 168 * V2 and XOR the intermediate result, T2(x), with the value in V1. 169 * The final result is in the rightmost word of V2. 170 */ 171 fpu_vupllf(2, 2); 172 fpu_vgfmag(2, CONST_CRC_POLY, 2, 1); 173 return fpu_vlgvf(2, 3); 174 } 175