/* SPDX-License-Identifier: GPL-2.0-or-later */
//
// Template to generate [V]PCLMULQDQ-based CRC functions for x86
//
// Copyright 2025 Google LLC
//
// Author: Eric Biggers <ebiggers@google.com>

#include <linux/linkage.h>
#include <linux/objtool.h>

// Offsets within the generated constants table
.set OFFSETOF_BSWAP_MASK,			-5*16	// msb-first CRCs only
.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS,	-4*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS,	-3*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS,	-2*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS,	-1*16	// must precede next
.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS,	0*16	// must be 0
.set OFFSETOF_SHUF_TABLE,			1*16
.set OFFSETOF_BARRETT_REDUCTION_CONSTS,		4*16

// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the
// corresponding non-VEX instruction plus any needed moves. The supported
// instruction formats are:
//
//    - Two-arg [src, dst], where the non-VEX format is the same.
//    - Three-arg [src1, src2, dst] where the non-VEX format is
//      [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too.
//
// \insn gives the instruction without a "v" prefix and including any immediate
// argument if needed to make the instruction follow one of the above formats.
// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to
// it first; this is needed when \arg1 is an unaligned mem operand.
.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp
.if AVX_LEVEL == 0
	// VEX not allowed. Emulate it.
	.ifnb \arg3 // Three-arg [src1, src2, dst]
		.ifc "\arg2", "\arg3" // src2 == dst?
			.ifnb \unaligned_mem_tmp
				movdqu	\arg1, \unaligned_mem_tmp
				\insn	\unaligned_mem_tmp, \arg3
			.else
				\insn	\arg1, \arg3
			.endif
		.else // src2 != dst
			.ifc "\arg1", "\arg3"
				.error "Can't have src1 == dst when src2 != dst"
			.endif
			.ifnb \unaligned_mem_tmp
				movdqu	\arg1, \unaligned_mem_tmp
				movdqa	\arg2, \arg3
				\insn	\unaligned_mem_tmp, \arg3
			.else
				movdqa	\arg2, \arg3
				\insn	\arg1, \arg3
			.endif
		.endif
	.else // Two-arg [src, dst]
		.ifnb \unaligned_mem_tmp
			movdqu	\arg1, \unaligned_mem_tmp
			\insn	\unaligned_mem_tmp, \arg2
		.else
			\insn	\arg1, \arg2
		.endif
	.endif
.else
	// VEX is allowed. Emit the desired instruction directly.
	.ifnb \arg3
		v\insn	\arg1, \arg2, \arg3
	.else
		v\insn	\arg1, \arg2
	.endif
.endif
.endm
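
// Expansion sketch (comment only, not assembled; the operands are arbitrary
// examples). With AVX_LEVEL == 0, a three-arg invocation where src2 != dst,
// such as
//
//	_cond_vex "pclmulqdq $0x00,", %xmm7, %xmm0, %xmm1
//
// expands to
//
//	movdqa %xmm0, %xmm1
//	pclmulqdq $0x00, %xmm7, %xmm1
//
// whereas with AVX_LEVEL != 0 it emits the VEX form directly:
//
//	vpclmulqdq $0x00, %xmm7, %xmm0, %xmm1
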
// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector
// register of length VL.
.macro _vbroadcast src, dst
.if VL == 16
	_cond_vex movdqa, \src, \dst
.elseif VL == 32
	vbroadcasti128 \src, \dst
.else
	vbroadcasti32x4 \src, \dst
.endif
.endm

// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC
// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane.
.macro _load_data vl, src, bswap_mask, dst
.if \vl < 64
	_cond_vex movdqu, "\src", \dst
.else
	vmovdqu8 \src, \dst
.endif
.if !LSB_CRC
	_cond_vex pshufb, \bswap_mask, \dst, \dst
.endif
.endm

.macro _prepare_v0 vl, v0, v1, bswap_mask
.if LSB_CRC
	.if \vl < 64
		_cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1
	.else
		vpxorq (BUF), \v0, \v0
	.endif
.else
	_load_data \vl, (BUF), \bswap_mask, \v1
	.if \vl < 64
		_cond_vex pxor, \v1, \v0, \v0
	.else
		vpxorq \v1, \v0, \v0
	.endif
.endif
.endm

// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for
// msb-first order or the physically high qword for lsb-first order
#define LO64_TERMS 0

// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high
// qword for msb-first order or the physically low qword for lsb-first order
#define HI64_TERMS 1

// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given
// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst.
.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst
	_cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \
		  \src1, \src2, \dst
.endm
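
// For example (illustrative only): with LSB_CRC == 1, the invocation
//
//	_pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
//
// used by _fold_vec below evaluates the immediate to
// ((1 ^ 1) << 4) ^ (1 ^ 1) = 0x00, i.e. it selects the physically low qword of
// each operand, which is where the x^64..x^127 terms live in lsb-first order.
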
// Fold \acc into \data and store the result back into \acc. \data can be an
// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no
// byte-reflection is needed; otherwise it must be a vector register. \consts
// is a vector register containing the needed fold constants, and \tmp is a
// temporary vector register. All arguments must be the same length.
.macro _fold_vec acc, data, consts, tmp
	_pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp
	_pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc
.if AVX_LEVEL <= 2
	_cond_vex pxor, \data, \tmp, \tmp
	_cond_vex pxor, \tmp, \acc, \acc
.else
	vpternlogq $0x96, \data, \tmp, \acc
.endif
.endm

// Fold \acc into \data and store the result back into \acc. \data is an
// unaligned mem operand, \consts is a vector register containing the needed
// fold constants, \bswap_mask is a vector register containing the
// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are
// temporary vector registers. All arguments must have length \vl.
.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2
.if AVX_LEVEL == 0 || !LSB_CRC
	_load_data \vl, \data, \bswap_mask, \tmp1
	_fold_vec \acc, \tmp1, \consts, \tmp2
.else
	_fold_vec \acc, \data, \consts, \tmp1
.endif
.endm

// Load the constants for folding across 2**i vectors of length VL at a time
// into all 128-bit lanes of the vector register CONSTS.
.macro _load_vec_folding_consts i
	_vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \
		    CONSTS
.endm
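
// For example (illustrative only): with VL == 64 (so LOG2_VL == 6), passing
// i == 2 gives the offset (4 - 6 - 2)*16 = -4*16 relative to
// OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, i.e.
// OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS: the constants for a fold distance of
// 2**2 = 4 zmm registers = 2048 bits, matching '_load_vec_folding_consts 2'
// before the main loop below.
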
// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store
// the result back into \v0. If the remaining length mod \vl is nonzero, also
// fold \vl data bytes from BUF. For both operations the fold distance is \vl.
// \consts must be a register of length \vl containing the fold constants.
.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2
	_fold_vec \v0, \v1, \consts, \tmp1
	test $\vl, LEN8
	jz .Lfold_vec_final_done\@
	_fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2
	add $\vl, BUF
.Lfold_vec_final_done\@:
.endm

// This macro generates the body of a CRC function with the following prototype:
//
// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts);
//
// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it.
// |buf| is the data to checksum. |len| is the data length in bytes, which must
// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts
// field of the constants struct that was generated for the chosen CRC variant.
//
// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g.
// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If
// the file is compiled in i386 mode, then the maximum supported value is 32.
//
// \lsb_crc is 1 if the CRC processes the least significant bit of each byte
// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0
// if the CRC processes the most significant bit of each byte first, i.e. maps
// bit0 to x^0, bit1 to x^1, ..., bit7 to x^7.
//
// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64.
//
// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or
// 512 for AVX512.
//
// If \vl == 16 && \avx_level == 0, the generated code requires:
// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.)
//
// If \vl == 32 && \avx_level == 2, the generated code requires:
// VPCLMULQDQ && AVX2.
//
// If \vl == 64 && \avx_level == 512, the generated code requires:
// VPCLMULQDQ && AVX512BW && AVX512VL.
//
// Other \vl and \avx_level combinations are either not supported or not useful.
.macro _crc_pclmul n, lsb_crc, vl, avx_level
	.set LSB_CRC, \lsb_crc
	.set VL, \vl
	.set AVX_LEVEL, \avx_level

	// Define aliases for the xmm, ymm, or zmm registers according to VL.
.irp i, 0,1,2,3,4,5,6,7
	.if VL == 16
		.set V\i, %xmm\i
		.set LOG2_VL, 4
	.elseif VL == 32
		.set V\i, %ymm\i
		.set LOG2_VL, 5
	.elseif VL == 64
		.set V\i, %zmm\i
		.set LOG2_VL, 6
	.else
		.error "Unsupported vector length"
	.endif
.endr
	// Define aliases for the function parameters.
	// Note: when crc_t is shorter than u32, zero-extension to 32 bits is
	// guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed
	// when crc_t is shorter than u64.
#ifdef __x86_64__
.if \n <= 32
	.set CRC, %edi
.else
	.set CRC, %rdi
.endif
	.set BUF, %rsi
	.set LEN, %rdx
	.set LEN32, %edx
	.set LEN8, %dl
	.set CONSTS_PTR, %rcx
#else
	// 32-bit support, assuming -mregparm=3 and not including support for
	// CRC-64 (which would use both eax and edx to pass the crc parameter).
	.set CRC, %eax
	.set BUF, %edx
	.set LEN, %ecx
	.set LEN32, %ecx
	.set LEN8, %cl
	.set CONSTS_PTR, %ebx	// Passed on stack
#endif

	// Define aliases for some local variables. V0-V5 are used without
	// aliases (for accumulators, data, temporary values, etc). Staying
	// within the first 8 vector registers keeps the code 32-bit SSE
	// compatible and reduces the size of 64-bit SSE code slightly.
	.set BSWAP_MASK, V6
	.set BSWAP_MASK_YMM, %ymm6
	.set BSWAP_MASK_XMM, %xmm6
	.set CONSTS, V7
	.set CONSTS_YMM, %ymm7
	.set CONSTS_XMM, %xmm7

	// Use ANNOTATE_NOENDBR to suppress an objtool warning, since the
	// functions generated by this macro are called only by static_call.
	ANNOTATE_NOENDBR

#ifdef __i386__
	push CONSTS_PTR
	mov 8(%esp), CONSTS_PTR
#endif

	// Create a 128-bit vector that contains the initial CRC in the end
	// representing the high-order polynomial coefficients, and the rest 0.
	// If the CRC is msb-first, also load the byte-reflection table.
.if \n <= 32
	_cond_vex movd, CRC, %xmm0
.else
	_cond_vex movq, CRC, %xmm0
.endif
.if !LSB_CRC
	_cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0
	_vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK
.endif

	// Load the first vector of data and XOR the initial CRC into the
	// appropriate end of the first 128-bit lane of data. If LEN < VL, then
	// use a short vector and jump ahead to the final reduction. (LEN >= 16
	// is guaranteed here but not necessarily LEN >= VL.)
.if VL >= 32
	cmp $VL, LEN
	jae .Lat_least_1vec\@
	.if VL == 64
	cmp $32, LEN32
	jb .Lless_than_32bytes\@
	_prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM
	add $32, BUF
	jmp .Lreduce_256bits_to_128bits\@
.Lless_than_32bytes\@:
	.endif
	_prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM
	add $16, BUF
	vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	jmp .Lcheck_for_partial_block\@
.Lat_least_1vec\@:
.endif
	_prepare_v0 VL, V0, V1, BSWAP_MASK

	// Handle VL <= LEN < 4*VL.
	cmp $4*VL-1, LEN
	ja .Lat_least_4vecs\@
	add $VL, BUF
	// If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector.
	// If VL==16 then load fold_across_128_bits_consts first, as the final
	// reduction depends on it and it won't be loaded anywhere else.
	cmp $2*VL-1, LEN32
.if VL == 16
	_cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
.endif
	jbe .Lreduce_1vec_to_128bits\@
	// Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to
	// the reduction from 2 vectors.
	_load_data VL, (BUF), BSWAP_MASK, V1
	add $VL, BUF
	jmp .Lreduce_2vecs_to_1\@

.Lat_least_4vecs\@:
	// Load 3 more vectors of data.
	_load_data VL, 1*VL(BUF), BSWAP_MASK, V1
	_load_data VL, 2*VL(BUF), BSWAP_MASK, V2
	_load_data VL, 3*VL(BUF), BSWAP_MASK, V3
	sub $-4*VL, BUF		// Shorter than 'add 4*VL' when VL=32
	add $-4*VL, LEN		// Shorter than 'sub 4*VL' when VL=32

	// Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next
	// 4 vectors of data and write the result back to V0-V3.
	cmp $4*VL-1, LEN	// Shorter than 'cmp 4*VL' when VL=32
	jbe .Lreduce_4vecs_to_2\@
	_load_vec_folding_consts 2
.Lfold_4vecs_loop\@:
	_fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub $-4*VL, BUF
	add $-4*VL, LEN
	cmp $4*VL-1, LEN
	ja .Lfold_4vecs_loop\@
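
	// Encoding note for the 'sub $-4*VL', 'add $-4*VL', and 'cmp $4*VL-1'
	// idioms above: when VL == 32, the immediate 128 is outside the range
	// of a sign-extended 8-bit immediate, but -128 and 127 are within it,
	// so these forms assemble to shorter instructions than the
	// straightforward 'add $4*VL', 'sub $4*VL', and 'cmp $4*VL'.
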
	// Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold
	// two more vectors of data from BUF, if at least that much remains.
.Lreduce_4vecs_to_2\@:
	_load_vec_folding_consts 1
	_fold_vec V0, V2, CONSTS, V4
	_fold_vec V1, V3, CONSTS, V4
	test $2*VL, LEN8
	jz .Lreduce_2vecs_to_1\@
	_fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	_fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5
	sub $-2*VL, BUF

	// Fold V0 into V1 and write the result back to V0. Then fold one more
	// vector of data from BUF, if at least that much remains.
.Lreduce_2vecs_to_1\@:
	_load_vec_folding_consts 0
	_fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5

.Lreduce_1vec_to_128bits\@:
.if VL == 64
	// Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of
	// data from BUF, if at least that much remains.
	vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM
	vextracti64x4 $1, %zmm0, %ymm1
	_fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5
.Lreduce_256bits_to_128bits\@:
.endif
.if VL >= 32
	// Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of
	// data from BUF, if at least that much remains.
	vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM
	vextracti128 $1, %ymm0, %xmm1
	_fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5
.Lcheck_for_partial_block\@:
.endif
	and $15, LEN32
	jz .Lreduce_128bits_to_crc\@

	// 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now
	// A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0
	// and B is the polynomial of the remaining LEN data bytes. To reduce
	// this to 128 bits without needing fold constants for each possible
	// LEN, rearrange this expression into C1*(x^128) + C2, where
	// C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128.
	// Then fold C1 into C2, which is just another fold across 128 bits.
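	//
	// For example (purely illustrative): with LEN == 4,
	// C1 = floor(A / x^96) is the highest 32 coefficients of A and
	// C2 = (A*x^32 mod x^128) + B, so that A*x^32 + B = C1*(x^128) + C2.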

.if !LSB_CRC || AVX_LEVEL == 0
	// Load the last 16 data bytes. Note that originally LEN was >= 16.
	_load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2
.endif // Else will use vpblendvb mem operand later.
.if !LSB_CRC
	neg LEN // Needed for indexing shuf_table
.endif

	// tmp = A*x^(8*LEN) mod x^128
	//	lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1]
	//	     i.e. right-shift by LEN bytes.
	//	msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN]
	//	     i.e. left-shift by LEN bytes.
	_cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3
	_cond_vex pshufb, %xmm3, %xmm0, %xmm1

	// C1 = floor(A / x^(128 - 8*LEN))
	//	lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1]
	//	     i.e. left-shift by 16-LEN bytes.
	//	msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1]
	//	     i.e. right-shift by 16-LEN bytes.
	_cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \
		  %xmm0, %xmm0, unaligned_mem_tmp=%xmm4

	// C2 = tmp + B. This is just a blend of tmp with the last 16 data
	// bytes (reflected if msb-first). The blend mask is the shuffle table
	// that was used to create tmp. 0 selects tmp, and 1 selects the last
	// 16 data bytes.
.if AVX_LEVEL == 0
	movdqa %xmm0, %xmm4
	movdqa %xmm3, %xmm0
	pblendvb %xmm2, %xmm1	// uses %xmm0 as implicit operand
	movdqa %xmm4, %xmm0
.elseif LSB_CRC
	vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1
.else
	vpblendvb %xmm3, %xmm2, %xmm1, %xmm1
.endif

	// Fold C1 into C2 and store the 128-bit result in %xmm0.
	_fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4

.Lreduce_128bits_to_crc\@:
	// Compute the CRC as %xmm0 * x^n mod G. Here %xmm0 means the 128-bit
	// polynomial stored in %xmm0 (using either lsb-first or msb-first bit
	// order according to LSB_CRC), and G is the CRC's generator polynomial.

	// First, multiply %xmm0 by x^n and reduce the result to 64+n bits:
	//
	//	t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) +
	//	      x^n * (%xmm0 mod x^64)
	//
	// Store t0 * x^(64-n) in %xmm0. I.e., actually do:
	//
	//	%xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) +
	//		 x^64 * (%xmm0 mod x^64)
	//
	// The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned
	// to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily
	// select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the
	// msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case
	// (considering the extra factor of x that gets implicitly introduced by
	// each pclmulqdq when using lsb-first order), is identical to the
	// constant that was used earlier for folding the LO64_TERMS across 128
	// bits. Thus it's already available in LO64_TERMS of CONSTS_XMM.
	_pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if LSB_CRC
	_cond_vex psrldq, $8, %xmm0, %xmm0	// x^64 * (%xmm0 mod x^64)
.else
	_cond_vex pslldq, $8, %xmm0, %xmm0	// x^64 * (%xmm0 mod x^64)
.endif
	_cond_vex pxor, %xmm1, %xmm0, %xmm0
	// The HI64_TERMS of %xmm0 now contain floor(t0 / x^n).
	// The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n).

	// First step of Barrett reduction: Compute floor(t0 / G). This is the
	// polynomial by which G needs to be multiplied to cancel out the x^n
	// and higher terms of t0, i.e. to reduce t0 mod G. First do:
	//
	//	t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n)
	//
	// Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in
	// x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest
	// value that makes enough precision be carried through the calculation.
	//
	// The '* x' makes it so the result is floor(t1 / x^64) rather than
	// floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it
	// can be extracted much more easily in the next step. In the lsb-first
	// case the '* x' happens implicitly. In the msb-first case it must be
	// done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the
	// constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and
	// the multiplication by the x^64 term is handled using a pxor. The
	// pxor causes the low 64 terms of t1 to be wrong, but they are unused.
	_cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM
	_pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1
.if !LSB_CRC
	_cond_vex pxor, %xmm0, %xmm1, %xmm1	// += x^64 * floor(t0 / x^n)
.endif
	// The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G).

	// Second step of Barrett reduction: Cancel out the x^n and higher terms
	// of t0 by subtracting the needed multiple of G. This gives the CRC:
	//
	//	crc := t0 - (G * floor(t0 / G))
	//
	// But %xmm0 contains t0 * x^(64-n), so it's more convenient to do:
	//
	//	crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n)
	//
	// Furthermore, since the resulting CRC is n-bit, if mod x^n is
	// explicitly applied to it then the x^n term of G makes no difference
	// in the result and can be omitted. This helps keep the constant
	// multiplier in 64 bits in most cases. This gives the following:
	//
	//	%xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G))
	//	crc := (%xmm0 / x^(64-n)) mod x^n
	//
	// In the lsb-first case, each pclmulqdq implicitly introduces
	// an extra factor of x, so in that case the constant that needs to be
	// passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63.
	// For lsb-first CRCs where n=64, the extra factor of x cannot be as
	// easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to
	// pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC
	// polynomials have nonzero x^n and x^0 terms.) It works out as: the
	// CRC has to be XORed with the physically low qword of %xmm1,
	// representing floor(t0 / G). The most efficient way to do that is to
	// move it to the physically high qword and use a ternlog to combine the
	// two XORs.
.if LSB_CRC && \n == 64
	_cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2
	_pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
    .if AVX_LEVEL <= 2
	_cond_vex pxor, %xmm2, %xmm0, %xmm0
	_cond_vex pxor, %xmm1, %xmm0, %xmm0
    .else
	vpternlogq $0x96, %xmm2, %xmm1, %xmm0
    .endif
	_cond_vex "pextrq $1,", %xmm0, %rax	// (%xmm0 / x^0) mod x^64
.else
	_pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1
	_cond_vex pxor, %xmm1, %xmm0, %xmm0
    .if \n == 8
	_cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax	// (%xmm0 / x^56) mod x^8
    .elseif \n == 16
	_cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax	// (%xmm0 / x^48) mod x^16
    .elseif \n == 32
	_cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax	// (%xmm0 / x^32) mod x^32
    .else // \n == 64 && !LSB_CRC
	_cond_vex movq, %xmm0, %rax	// (%xmm0 / x^0) mod x^64
    .endif
.endif

.if VL > 16
	vzeroupper	// Needed when ymm or zmm registers may have been used.
.endif
#ifdef __i386__
	pop CONSTS_PTR
#endif
	RET
.endm

#ifdef CONFIG_AS_VPCLMULQDQ
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
SYM_FUNC_START(prefix##_pclmul_sse);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
SYM_FUNC_END(prefix##_pclmul_sse);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx2);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=32, avx_level=2;	\
SYM_FUNC_END(prefix##_vpclmul_avx2);					\
									\
SYM_FUNC_START(prefix##_vpclmul_avx512);				\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=64, avx_level=512;	\
SYM_FUNC_END(prefix##_vpclmul_avx512);
#else
#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb)			\
SYM_FUNC_START(prefix##_pclmul_sse);					\
	_crc_pclmul	n=bits, lsb_crc=lsb, vl=16, avx_level=0;	\
SYM_FUNC_END(prefix##_pclmul_sse);
#endif // !CONFIG_AS_VPCLMULQDQ
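
// Usage sketch (illustrative only; the prefix below is an example, and the
// actual instantiations live in the per-CRC-variant source files that include
// this template):
//
//	#include "crc-pclmul-template.S"
//
//	DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1)
//
// This generates crc32_lsb_pclmul_sse(), and also crc32_lsb_vpclmul_avx2() and
// crc32_lsb_vpclmul_avx512() when CONFIG_AS_VPCLMULQDQ is set, each with the
// prototype documented above for _crc_pclmul.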