/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * http://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:   Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *	      Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>


.section .rodata
.align 16
/*
 * Precomputed folding constants for the bit-reflected (little-endian)
 * CRC32 polynomial.  The ' marks bit-reflection; each value is the
 * reduction of a power of x modulo P(x), pre-shifted so pclmulqdq
 * products line up with the reflected data.
 *
 * [(x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4
 * #define CONSTANT_R1 0x154442bd4LL
 *
 * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596
 * #define CONSTANT_R2 0x1c6e41596LL
 */
.Lconstant_R2R1:
	.octa 0x00000001c6e415960000000154442bd4
/*
 * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0
 * #define CONSTANT_R3 0x1751997d0LL
 *
 * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e
 * #define CONSTANT_R4 0x0ccaa009eLL
 */
.Lconstant_R4R3:
	.octa 0x00000000ccaa009e00000001751997d0
/*
 * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124
 * #define CONSTANT_R5 0x163cd6124LL
 */
.Lconstant_R5:
	.octa 0x00000000000000000000000163cd6124
/* Mask that keeps only the low 32 bits of a 128-bit lane. */
.Lconstant_mask32:
	.octa 0x000000000000000000000000FFFFFFFF
/*
 * Barrett reduction pair: low qword = the full (reflected) polynomial,
 * high qword = u', the reflected quotient x**64 / P(x).
 *
 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
 *
 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL
 * #define CONSTANT_RU 0x1F7011641LL
 */
.Lconstant_RUpoly:
	.octa 0x00000001F701164100000001DB710641

/* %xmm0 doubles as scratch for the current folding constant. */
#define CONSTANT %xmm0

/*
 * Argument registers: SysV AMD64 (rdi/rsi/rdx) on 64-bit;
 * on 32-bit the kernel builds this with regparm(3) (eax/edx/ecx).
 */
#ifdef __x86_64__
#define CRC %edi
#define BUF %rsi
#define LEN %rdx
#else
#define CRC %eax
#define BUF %edx
#define LEN %ecx
#endif



.text
/**
 * Calculate crc32
 * CRC - initial crc32
 * BUF - buffer (16 bytes aligned)
 * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63
 * return %eax crc32
 * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len);
 */
SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */
	/*
	 * Load the first 64 bytes into four 128-bit accumulators and XOR
	 * the initial CRC into the low dword of the first one (the CRC is
	 * bit-reflected, so it aligns with the first input bytes).
	 */
	movdqa  (BUF), %xmm1
	movdqa  0x10(BUF), %xmm2
	movdqa  0x20(BUF), %xmm3
	movdqa  0x30(BUF), %xmm4
	movd    CRC, CONSTANT
	pxor    CONSTANT, %xmm1
	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	jb      .Lless_64

	/* R2:R1 shift a 512-bit chunk forward by 512 bits.  32-bit code
	 * has no %rip-relative addressing, hence the absolute form. */
#ifdef __x86_64__
	movdqa .Lconstant_R2R1(%rip), CONSTANT
#else
	movdqa .Lconstant_R2R1, CONSTANT
#endif

.Lloop_64:/* 64 bytes Full cache line folding */
	prefetchnta 0x40(BUF)
	movdqa  %xmm1, %xmm5
	movdqa  %xmm2, %xmm6
	movdqa  %xmm3, %xmm7
#ifdef __x86_64__
	movdqa  %xmm4, %xmm8
#endif
	/* imm 0x00 = low qword * low qword (R1);
	 * imm 0x11 = high qword * high qword (R2). */
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm2
	pclmulqdq $0x00, CONSTANT, %xmm3
#ifdef __x86_64__
	pclmulqdq $0x00, CONSTANT, %xmm4
#endif
	pclmulqdq $0x11, CONSTANT, %xmm5
	pclmulqdq $0x11, CONSTANT, %xmm6
	pclmulqdq $0x11, CONSTANT, %xmm7
#ifdef __x86_64__
	pclmulqdq $0x11, CONSTANT, %xmm8
#endif
	pxor    %xmm5, %xmm1
	pxor    %xmm6, %xmm2
	pxor    %xmm7, %xmm3
#ifdef __x86_64__
	pxor    %xmm8, %xmm4
#else
	/* %xmm8 is not available in 32-bit code: fold %xmm4 in place
	 * via a fifth scratch register instead. */
	movdqa  %xmm4, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm4
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm4
#endif

	/* XOR in the next 64 bytes of input. */
	pxor    (BUF), %xmm1
	pxor    0x10(BUF), %xmm2
	pxor    0x20(BUF), %xmm3
	pxor    0x30(BUF), %xmm4

	sub     $0x40, LEN
	add     $0x40, BUF
	cmp     $0x40, LEN
	jge     .Lloop_64
.Lless_64:/* Folding cache line into 128bit */
	/* R4:R3 shift by 128 bits: chain xmm1..xmm4 into a single
	 * 128-bit accumulator in %xmm1. */
#ifdef __x86_64__
	movdqa .Lconstant_R4R3(%rip), CONSTANT
#else
	movdqa .Lconstant_R4R3, CONSTANT
#endif
	prefetchnta (BUF)

	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm2, %xmm1

	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm3, %xmm1

	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    %xmm4, %xmm1

	cmp     $0x10, LEN
	jb      .Lfold_64
.Lloop_16:/* Folding rest buffer into 128bit */
	movdqa  %xmm1, %xmm5
	pclmulqdq $0x00, CONSTANT, %xmm1
	pclmulqdq $0x11, CONSTANT, %xmm5
	pxor    %xmm5, %xmm1
	pxor    (BUF), %xmm1
	sub     $0x10, LEN
	add     $0x10, BUF
	cmp     $0x10, LEN
	jge     .Lloop_16

.Lfold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */
	psrldq  $0x08, %xmm1
	pxor    CONSTANT, %xmm1

	/* final 32-bit fold: fold the low dword up with R5,
	 * leaving a 64-bit remainder in %xmm1. */
	movdqa  %xmm1, %xmm2
#ifdef __x86_64__
	movdqa  .Lconstant_R5(%rip), CONSTANT
	movdqa  .Lconstant_mask32(%rip), %xmm3
#else
	movdqa  .Lconstant_R5, CONSTANT
	movdqa  .Lconstant_mask32, %xmm3
#endif
	psrldq  $0x04, %xmm2
	pand    %xmm3, %xmm1
	pclmulqdq $0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
#ifdef __x86_64__
	movdqa  .Lconstant_RUpoly(%rip), CONSTANT
#else
	movdqa  .Lconstant_RUpoly, CONSTANT
#endif
	movdqa  %xmm1, %xmm2
	pand    %xmm3, %xmm1
	/* imm 0x10: xmm1.low * CONSTANT.high (RU) — estimate quotient. */
	pclmulqdq $0x10, CONSTANT, %xmm1
	pand    %xmm3, %xmm1
	/* imm 0x00: quotient.low * CONSTANT.low (polynomial). */
	pclmulqdq $0x00, CONSTANT, %xmm1
	pxor    %xmm2, %xmm1
	/* After Barrett, the 32-bit CRC sits in dword 1 (bits 32..63). */
	pextrd  $0x01, %xmm1, %eax

	RET
SYM_FUNC_END(crc32_pclmul_le_16)