/*
 * Implement fast CRC32C with PCLMULQDQ instructions. (x86_64)
 *
 * The white papers on CRC32C calculations with PCLMULQDQ instruction can be
 * downloaded from:
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/crc-iscsi-polynomial-crc32-instruction-paper.pdf
 * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf
 *
 * Copyright (C) 2012 Intel Corporation.
 * Copyright 2024 Google LLC
 *
 * Authors:
 *	Wajdi Feghali <wajdi.k.feghali@intel.com>
 *	James Guilford <james.guilford@intel.com>
 *	David Cote <david.m.cote@intel.com>
 *	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/linkage.h>

## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction

# Define threshold below which buffers are considered "small" and routed to
# regular CRC code that does not interleave the CRC instructions.
#define SMALL_SIZE 200

/*
 * u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);
 *
 * Syntax: GNU as, AT&T operand order, run through the C preprocessor.
 *
 * In:	%edi = crc, %rsi = buffer, %rdx = len
 * Out:	%eax = updated CRC value
 *
 * Scratch: %rax, %rcx, %r8, %r9, %xmm0-%xmm2 and the flags.  The argument
 * registers %rdi, %rsi and %rdx are updated in place.  The stack is not
 * touched.
 *
 * Instructions used beyond the base ISA: crc32, pclmulqdq, pmovzxdq.
 *
 * Outline:
 *   1) For len >= SMALL_SIZE, consume 0..7 leading bytes so that the buffer
 *      pointer becomes 8-byte aligned.
 *   2) Pick a block of 3 * N qwords, N = min(128, floor(len / 24)).
 *   3) Run three crc32q chains over the three N-qword lanes of the block.
 *      The lane 0 chain continues from the running CRC; the lane 1 and
 *      lane 2 chains start from zero.
 *   4) Fold the lane 0 and lane 1 results into the lane 2 result using
 *      carry-less multiplies by the K_table entry for N.
 *   5) Repeat from (2) while len >= SMALL_SIZE.
 *   6) Finish the remaining < SMALL_SIZE bytes with plain
 *      crc32q/crc32l/crc32w/crc32b.
 */

# u32 crc32c_x86_3way(u32 crc, const u8 *buffer, size_t len);

.text
SYM_FUNC_START(crc32c_x86_3way)
#define crc0		%edi
#define crc0_q		%rdi
#define bufp		%rsi
#define bufp_d		%esi
#define len		%rdx
#define len_dw		%edx
#define n_misaligned	%ecx /* overlaps chunk_bytes! */
#define n_misaligned_q	%rcx
#define chunk_bytes	%ecx /* overlaps n_misaligned! */
#define chunk_bytes_q	%rcx
#define crc1		%r8
#define crc2		%r9

	cmp	$SMALL_SIZE, len
	jb	.Lsmall

	################################################################
	## 1) ALIGN:
	################################################################
	mov	bufp_d, n_misaligned
	neg	n_misaligned
	and	$7, n_misaligned	# calculate the misalignment amount of
					# the address
	je	.Laligned		# Skip if aligned

	# Process 1 <= n_misaligned <= 7 bytes individually in order to align
	# the remaining data to an 8-byte boundary.
.Ldo_align:
	movq	(bufp), %rax		# one 8-byte load covers all <= 7 bytes;
					# in bounds since len >= SMALL_SIZE here
	add	n_misaligned_q, bufp
	sub	n_misaligned_q, len
.Lalign_loop:
	crc32b	%al, crc0		# compute crc32 of 1-byte
	shr	$8, %rax		# get next byte
	dec	n_misaligned
	jne	.Lalign_loop
.Laligned:

	################################################################
	## 2) PROCESS BLOCK:
	################################################################

	cmp	$128*24, len
	jae	.Lfull_block

.Lpartial_block:
	# Compute floor(len / 24) to get num qwords to process from each lane.
	imul	$2731, len_dw, %eax	# 2731 = ceil(2^16 / 24)
	shr	$16, %eax
	jmp	.Lcrc_3lanes

.Lfull_block:
	# Processing 128 qwords from each lane.
	mov	$128, %eax

	################################################################
	## 3) CRC each of three lanes:
	################################################################

.Lcrc_3lanes:
	xor	crc1,crc1
	xor	crc2,crc2
	mov	%eax, chunk_bytes
	shl	$3, chunk_bytes		# num bytes to process from each lane
	sub	$5, %eax		# 4 for 4x_loop, 1 for special last iter
	jl	.Lcrc_3lanes_4x_done

	# Unroll the loop by a factor of 4 to reduce the overhead of the loop
	# bookkeeping instructions, which can compete with crc32q for the ALUs.
.Lcrc_3lanes_4x_loop:
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
	crc32q	(bufp,chunk_bytes_q,2), crc2
	crc32q	8(bufp), crc0_q
	crc32q	8(bufp,chunk_bytes_q), crc1
	crc32q	8(bufp,chunk_bytes_q,2), crc2
	crc32q	16(bufp), crc0_q
	crc32q	16(bufp,chunk_bytes_q), crc1
	crc32q	16(bufp,chunk_bytes_q,2), crc2
	crc32q	24(bufp), crc0_q
	crc32q	24(bufp,chunk_bytes_q), crc1
	crc32q	24(bufp,chunk_bytes_q,2), crc2
	add	$32, bufp
	sub	$4, %eax
	jge	.Lcrc_3lanes_4x_loop

.Lcrc_3lanes_4x_done:
	add	$4, %eax		# eax = qwords left before the last one
	jz	.Lcrc_3lanes_last_qword

.Lcrc_3lanes_1x_loop:
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
	crc32q	(bufp,chunk_bytes_q,2), crc2
	add	$8, bufp
	dec	%eax
	jnz	.Lcrc_3lanes_1x_loop

.Lcrc_3lanes_last_qword:
	# bufp is deliberately not advanced past this final qword; step 4
	# still addresses the last qword of the third lane relative to it.
	crc32q	(bufp), crc0_q
	crc32q	(bufp,chunk_bytes_q), crc1
# SKIP  crc32q	(bufp,chunk_bytes_q,2), crc2	; Don't do this one yet

	################################################################
	## 4) Combine three results:
	################################################################

	lea	(K_table-8)(%rip), %rax		# first entry is for idx 1
	pmovzxdq (%rax,chunk_bytes_q), %xmm0	# 2 consts: K1:K2
	lea	(chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3
	sub	%rax, len			# len -= chunk_bytes * 3

	movq	crc0_q, %xmm1			# CRC for block 1
	pclmulqdq $0x00, %xmm0, %xmm1		# Multiply by K2

	movq	crc1, %xmm2			# CRC for block 2
	pclmulqdq $0x10, %xmm0, %xmm2		# Multiply by K1

	pxor	%xmm2,%xmm1
	movq	%xmm1, %rax
	xor	(bufp,chunk_bytes_q,2), %rax	# fold in last qword of lane 3
	mov	crc2, crc0_q
	crc32	%rax, crc0_q			# crc0 = combined CRC of the block
	lea	8(bufp,chunk_bytes_q,2), bufp	# bufp = first byte after the block

	################################################################
	## 5) If more blocks remain, goto (2):
	################################################################

	cmp	$128*24, len
	jae	.Lfull_block
	cmp	$SMALL_SIZE, len
	jae	.Lpartial_block

	#######################################################################
	## 6) Process any remainder without interleaving:
	#######################################################################
.Lsmall:
	test	len_dw, len_dw
	jz	.Ldone
	mov	len_dw, %eax
	shr	$3, %eax			# eax = number of whole qwords
	jz	.Ldo_dword
.Ldo_qwords:
	crc32q	(bufp), crc0_q
	add	$8, bufp
	dec	%eax
	jnz	.Ldo_qwords
.Ldo_dword:
	test	$4, len_dw
	jz	.Ldo_word
	crc32l	(bufp), crc0
	add	$4, bufp
.Ldo_word:
	test	$2, len_dw
	jz	.Ldo_byte
	crc32w	(bufp), crc0
	add	$2, bufp
.Ldo_byte:
	test	$1, len_dw
	jz	.Ldone
	crc32b	(bufp), crc0
.Ldone:
	mov	crc0, %eax
	RET
SYM_FUNC_END(crc32c_x86_3way)

.section	.rodata, "a", @progbits
	################################################################
	## PCLMULQDQ tables
	## Table is 128 entries x 2 words (8 bytes) each
	################################################################
	/*
	 * Entry i (1-based) is the pair of 32-bit constants loaded by step 4
	 * when each lane holds i qwords; the code indexes the table as
	 * (K_table - 8) + 8 * i.
	 */
.align 8
K_table:
	.long 0x493c7d27, 0x00000001
	.long 0xba4fc28e, 0x493c7d27
	.long 0xddc0152b, 0xf20c0dfe
	.long 0x9e4addf8, 0xba4fc28e
	.long 0x39d3b296, 0x3da6d0cb
	.long 0x0715ce53, 0xddc0152b
	.long 0x47db8317, 0x1c291d04
	.long 0x0d3b6092, 0x9e4addf8
	.long 0xc96cfdc0, 0x740eef02
	.long 0x878a92a7, 0x39d3b296
	.long 0xdaece73e, 0x083a6eec
	.long 0xab7aff2a, 0x0715ce53
	.long 0x2162d385, 0xc49f4f67
	.long 0x83348832, 0x47db8317
	.long 0x299847d5, 0x2ad91c30
	.long 0xb9e02b86, 0x0d3b6092
	.long 0x18b33a4e, 0x6992cea2
	.long 0xb6dd949b, 0xc96cfdc0
	.long 0x78d9ccb7, 0x7e908048
	.long 0xbac2fd7b, 0x878a92a7
	.long 0xa60ce07b, 0x1b3d8f29
	.long 0xce7f39f4, 0xdaece73e
	.long 0x61d82e56, 0xf1d0f55e
	.long 0xd270f1a2, 0xab7aff2a
	.long 0xc619809d, 0xa87ab8a8
	.long 0x2b3cac5d, 0x2162d385
	.long 0x65863b64, 0x8462d800
	.long 0x1b03397f, 0x83348832
	.long 0xebb883bd, 0x71d111a8
	.long 0xb3e32c28, 0x299847d5
	.long 0x064f7f26, 0xffd852c6
	.long 0xdd7e3b0c, 0xb9e02b86
	.long 0xf285651c, 0xdcb17aa4
	.long 0x10746f3c, 0x18b33a4e
	.long 0xc7a68855, 0xf37c5aee
	.long 0x271d9844, 0xb6dd949b
	.long 0x8e766a0c, 0x6051d5a2
	.long 0x93a5f730, 0x78d9ccb7
	.long 0x6cb08e5c, 0x18b0d4ff
	.long 0x6b749fb2, 0xbac2fd7b
	.long 0x1393e203, 0x21f3d99c
	.long 0xcec3662e, 0xa60ce07b
	.long 0x96c515bb, 0x8f158014
	.long 0xe6fc4e6a, 0xce7f39f4
	.long 0x8227bb8a, 0xa00457f7
	.long 0xb0cd4768, 0x61d82e56
	.long 0x39c7ff35, 0x8d6d2c43
	.long 0xd7a4825c, 0xd270f1a2
	.long 0x0ab3844b, 0x00ac29cf
	.long 0x0167d312, 0xc619809d
	.long 0xf6076544, 0xe9adf796
	.long 0x26f6a60a, 0x2b3cac5d
	.long 0xa741c1bf, 0x96638b34
	.long 0x98d8d9cb, 0x65863b64
	.long 0x49c3cc9c, 0xe0e9f351
	.long 0x68bce87a, 0x1b03397f
	.long 0x57a3d037, 0x9af01f2d
	.long 0x6956fc3b, 0xebb883bd
	.long 0x42d98888, 0x2cff42cf
	.long 0x3771e98f, 0xb3e32c28
	.long 0xb42ae3d9, 0x88f25a3a
	.long 0x2178513a, 0x064f7f26
	.long 0xe0ac139e, 0x4e36f0b0
	.long 0x170076fa, 0xdd7e3b0c
	.long 0x444dd413, 0xbd6f81f8
	.long 0x6f345e45, 0xf285651c
	.long 0x41d17b64, 0x91c9bd4b
	.long 0xff0dba97, 0x10746f3c
	.long 0xa2b73df1, 0x885f087b
	.long 0xf872e54c, 0xc7a68855
	.long 0x1e41e9fc, 0x4c144932
	.long 0x86d8e4d2, 0x271d9844
	.long 0x651bd98b, 0x52148f02
	.long 0x5bb8f1bc, 0x8e766a0c
	.long 0xa90fd27a, 0xa3c6f37a
	.long 0xb3af077a, 0x93a5f730
	.long 0x4984d782, 0xd7c0557f
	.long 0xca6ef3ac, 0x6cb08e5c
	.long 0x234e0b26, 0x63ded06a
	.long 0xdd66cbbb, 0x6b749fb2
	.long 0x4597456a, 0x4d56973c
	.long 0xe9e28eb4, 0x1393e203
	.long 0x7b3ff57a, 0x9669c9df
	.long 0xc9c8b782, 0xcec3662e
	.long 0x3f70cc6f, 0xe417f38a
	.long 0x93e106a4, 0x96c515bb
	.long 0x62ec6c6d, 0x4b9e0f71
	.long 0xd813b325, 0xe6fc4e6a
	.long 0x0df04680, 0xd104b8fc
	.long 0x2342001e, 0x8227bb8a
	.long 0x0a2a8d7e, 0x5b397730
	.long 0x6d9a4957, 0xb0cd4768
	.long 0xe8b6368b, 0xe78eb416
	.long 0xd2c3ed1a, 0x39c7ff35
	.long 0x995a5724, 0x61ff0e01
	.long 0x9ef68d35, 0xd7a4825c
	.long 0x0c139b31, 0x8d96551c
	.long 0xf2271e60, 0x0ab3844b
	.long 0x0b0bf8ca, 0x0bf80dd2
	.long 0x2664fd8b, 0x0167d312
	.long 0xed64812d, 0x8821abed
	.long 0x02ee03b2, 0xf6076544
	.long 0x8604ae0f, 0x6a45d2b2
	.long 0x363bd6b3, 0x26f6a60a
	.long 0x135c83fd, 0xd8d26619
	.long 0x5fabe670, 0xa741c1bf
	.long 0x35ec3279, 0xde87806c
	.long 0x00bcf5f6, 0x98d8d9cb
	.long 0x8ae00689, 0x14338754
	.long 0x17f27698, 0x49c3cc9c
	.long 0x58ca5f00, 0x5bd2011f
	.long 0xaa7c7ad5, 0x68bce87a
	.long 0xb5cfca28, 0xdd07448e
	.long 0xded288f8, 0x57a3d037
	.long 0x59f229bc, 0xdde8f5b9
	.long 0x6d390dec, 0x6956fc3b
	.long 0x37170390, 0xa3e3e02c
	.long 0x6353c1cc, 0x42d98888
	.long 0xc4584f5c, 0xd73c7bea
	.long 0xf48642e9, 0x3771e98f
	.long 0x531377e2, 0x80ff0093
	.long 0xdd35bc8d, 0xb42ae3d9
	.long 0xb25b29f2, 0x8fe4c34d
	.long 0x9a5ede41, 0x2178513a
	.long 0xa563905d, 0xdf99fc11
	.long 0x45cddf4e, 0xe0ac139e
	.long 0xacfa3103, 0x6c23e841
	.long 0xa51b6135, 0x170076fa