/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is the carry-less multiplication instruction of Intel's CLMUL
 * extension (introduced together with AES-NI); the reference can be found at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors: Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *          Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>


.section .rodata
.align 16
/*
 * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1 0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2 0x1c6e41596LL
 */
.Lconstant_R2R1:
        .octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3 0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4 0x0ccaa009eLL
 */
.Lconstant_R4R3:
        .octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5 0x163cd6124LL
 */
.Lconstant_R5:
        .octa 0x00000000000000000000000163cd6124
.Lconstant_mask32:
        .octa 0x000000000000000000000000FFFFFFFF
/*
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU 0x1F7011641LL
 */
.Lconstant_RUpoly:
        .octa 0x00000001F701164100000001DB710641

#define CONSTANT %xmm0

#ifdef __x86_64__
#define CRC %edi
#define BUF %rsi
#define LEN %rdx
#else
#define CRC %eax
#define BUF %edx
#define LEN %ecx
#endif


.text
/**
 * Calculate crc32
 * CRC - initial crc32 value
 * BUF - buffer pointer, must be 16-byte aligned
 * LEN - buffer size in bytes, must be a multiple of 16 and at least 64
 * returns the resulting crc32 in %eax
 * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
 */

SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16-byte aligned */
        movdqa  (BUF), %xmm1
        movdqa  0x10(BUF), %xmm2
        movdqa  0x20(BUF), %xmm3
        movdqa  0x30(BUF), %xmm4
        movd    CRC, CONSTANT
        pxor    CONSTANT, %xmm1
        sub     $0x40, LEN
        add     $0x40, BUF
        cmp     $0x40, LEN
        jb      .Lless_64

#ifdef __x86_64__
        movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
        movdqa .Lconstant_R2R1, CONSTANT
#endif

.Lloop_64:/* fold a full 64-byte cache line per iteration */
        prefetchnta 0x40(BUF)
        movdqa  %xmm1, %xmm5
        movdqa  %xmm2, %xmm6
        movdqa  %xmm3, %xmm7
#ifdef __x86_64__
        movdqa  %xmm4, %xmm8
#endif
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x00, CONSTANT, %xmm2
        pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
        pclmulqdq $0x00, CONSTANT, %xmm4
#endif
        pclmulqdq $0x11, CONSTANT, %xmm5
        pclmulqdq $0x11, CONSTANT, %xmm6
        pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
        pclmulqdq $0x11, CONSTANT, %xmm8
#endif
        pxor    %xmm5, %xmm1
        pxor    %xmm6, %xmm2
        pxor    %xmm7, %xmm3
#ifdef __x86_64__
        pxor    %xmm8, %xmm4
#else
        /* %xmm8 is not available in 32-bit mode, reuse %xmm5 for the fourth lane */
        movdqa  %xmm4, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm4
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm4
#endif

        pxor    (BUF), %xmm1
        pxor    0x10(BUF), %xmm2
        pxor    0x20(BUF), %xmm3
        pxor    0x30(BUF), %xmm4

        sub     $0x40, LEN
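        /*
         * One fold-by-4 step is done: each of the four 128-bit accumulators
         * was carry-less multiplied by the R2:R1 constant pair (R1 against
         * the low 64-bit halves, R2 against the high halves) and the next
         * 64 bytes of input were XORed in.  Advance the pointer and keep
         * looping while at least one more full 64-byte block remains.
         */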
        add     $0x40, BUF
        cmp     $0x40, LEN
        jge     .Lloop_64
.Lless_64:/* fold the four 128-bit accumulators down to one */
#ifdef __x86_64__
        movdqa  .Lconstant_R4R3(%rip), CONSTANT
#else
        movdqa  .Lconstant_R4R3, CONSTANT
#endif
        prefetchnta (BUF)

        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm2, %xmm1

        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm3, %xmm1

        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    %xmm4, %xmm1

        cmp     $0x10, LEN
        jb      .Lfold_64
.Lloop_16:/* fold the remaining 16-byte blocks into the 128-bit accumulator */
        movdqa  %xmm1, %xmm5
        pclmulqdq $0x00, CONSTANT, %xmm1
        pclmulqdq $0x11, CONSTANT, %xmm5
        pxor    %xmm5, %xmm1
        pxor    (BUF), %xmm1
        sub     $0x10, LEN
        add     $0x10, BUF
        cmp     $0x10, LEN
        jge     .Lloop_16

.Lfold_64:
        /* perform the last 64-bit fold, which also appends 32 zero bits
         * to the input stream */
        pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
        psrldq  $0x08, %xmm1
        pxor    CONSTANT, %xmm1

        /* final 32-bit fold */
        movdqa  %xmm1, %xmm2
#ifdef __x86_64__
        movdqa  .Lconstant_R5(%rip), CONSTANT
        movdqa  .Lconstant_mask32(%rip), %xmm3
#else
        movdqa  .Lconstant_R5, CONSTANT
        movdqa  .Lconstant_mask32, %xmm3
#endif
        psrldq  $0x04, %xmm2
        pand    %xmm3, %xmm1
        pclmulqdq $0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1

        /* Finish up with the bit-reversed Barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
        movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
        movdqa  .Lconstant_RUpoly, CONSTANT
#endif
        movdqa  %xmm1, %xmm2
        pand    %xmm3, %xmm1
        pclmulqdq $0x10, CONSTANT, %xmm1
        pand    %xmm3, %xmm1
        pclmulqdq $0x00, CONSTANT, %xmm1
        pxor    %xmm2, %xmm1
        pextrd  $0x01, %xmm1, %eax /* the final CRC lives in dword 1 */

        RET
SYM_FUNC_END(crc32_pclmul_le_16)
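/*
 * Usage note (illustrative sketch only, not part of the original file): the
 * C glue code around this routine is expected to 16-byte align the buffer,
 * pass in only a length that is a multiple of 16 and at least 64, and hand
 * any head/tail bytes to the generic crc32_le() fallback.  The wrapper name
 * and the 64+15 cut-off below are assumptions made for illustration;
 * kernel_fpu_begin()/kernel_fpu_end() (<asm/fpu/api.h>) and crc32_le()
 * (<linux/crc32.h>) are the usual kernel helpers for guarding XMM usage and
 * for the scalar fallback.
 *
 *	static u32 crc32_pclmul_wrap(u32 crc, const u8 *p, size_t len)
 *	{
 *		size_t prealign, aligned_len, tail;
 *
 *		if (len < 64 + 15)		// too short for the SIMD path
 *			return crc32_le(crc, p, len);
 *
 *		prealign = -(unsigned long)p & 15;	// bytes to reach 16-byte alignment
 *		if (prealign) {
 *			crc = crc32_le(crc, p, prealign);
 *			p += prealign;
 *			len -= prealign;
 *		}
 *		aligned_len = len & ~(size_t)15;	// multiple of 16, still >= 64 here
 *		tail = len & 15;
 *
 *		kernel_fpu_begin();		// PCLMULQDQ clobbers XMM state
 *		crc = crc32_pclmul_le_16(crc, p, aligned_len);
 *		kernel_fpu_end();
 *
 *		if (tail)
 *			crc = crc32_le(crc, p + aligned_len, tail);
 *		return crc;
 *	}
 */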