//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd
// Copyright (C) 2019-2024 Google LLC
//
// Authors: Ard Biesheuvel <ardb@google.com>
//          Eric Biggers <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Reference paper titled "Fast CRC Computation for Generic
// Polynomials Using PCLMULQDQ Instruction"
// URL: http://www.intel.com/content/dam/www/public/us/en/documents
// /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//
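// The CRC being accelerated here is CRC-T10DIF: a non-reflected 16-bit CRC
// over G(x) = 0x18bb7 with an initial value of zero.  For reference only
// (a hedged sketch; 'crc_t10dif_ref' is a made-up name, not a kernel
// interface), the same function can be written bit-serially in C, assuming
// <stdint.h> and <stddef.h>:
//
//	static uint16_t crc_t10dif_ref(uint16_t crc, const uint8_t *buf,
//				       size_t len)
//	{
//		while (len--) {
//			int i;
//
//			crc ^= (uint16_t)(*buf++) << 8;
//			for (i = 0; i < 8; i++)
//				crc = (crc & 0x8000) ? (crc << 1) ^ 0x8bb7
//						     : crc << 1;
//		}
//		return crc;
//	}
//
// The code below computes the same CRC, but treats the message as a
// polynomial over GF(2) and folds it with carryless multiplications.
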
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	fold_consts_ptr	.req	x5

	fold_consts	.req	v10

	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22

	perm		.req	v27

	.macro		pmull16x64_p64, a16, b64, c64
	pmull2		\c64\().1q, \a16\().2d, \b64\().2d
	pmull		\b64\().1q, \a16\().1d, \b64\().1d
	.endm

	/*
	 * Pairwise long polynomial multiplication of two 16-bit values
	 *
	 *   { w0, w1 }, { y0, y1 }
	 *
	 * by two 64-bit values
	 *
	 *   { x0, x1, x2, x3, x4, x5, x6, x7 }, { z0, z1, z2, z3, z4, z5, z6, z7 }
	 *
	 * where each vector element is a byte, ordered from least to most
	 * significant.
	 *
	 * This can be implemented using 8x8 long polynomial multiplication, by
	 * reorganizing the input so that each pairwise 8x8 multiplication
	 * produces one of the terms from the decomposition below, and
	 * combining the results of each rank and shifting them into place.
	 *
	 * Rank
	 *  0            w0*x0 ^              |        y0*z0 ^
	 *  1       (w0*x1 ^ w1*x0) <<  8 ^   |   (y0*z1 ^ y1*z0) <<  8 ^
	 *  2       (w0*x2 ^ w1*x1) << 16 ^   |   (y0*z2 ^ y1*z1) << 16 ^
	 *  3       (w0*x3 ^ w1*x2) << 24 ^   |   (y0*z3 ^ y1*z2) << 24 ^
	 *  4       (w0*x4 ^ w1*x3) << 32 ^   |   (y0*z4 ^ y1*z3) << 32 ^
	 *  5       (w0*x5 ^ w1*x4) << 40 ^   |   (y0*z5 ^ y1*z4) << 40 ^
	 *  6       (w0*x6 ^ w1*x5) << 48 ^   |   (y0*z6 ^ y1*z5) << 48 ^
	 *  7       (w0*x7 ^ w1*x6) << 56 ^   |   (y0*z7 ^ y1*z6) << 56 ^
	 *  8            w1*x7 << 64          |        y1*z7 << 64
	 *
	 * The inputs can be reorganized into
	 *
	 *   { w0, w0, w0, w0, y0, y0, y0, y0 }, { w1, w1, w1, w1, y1, y1, y1, y1 }
	 *   { x0, x2, x4, x6, z0, z2, z4, z6 }, { x1, x3, x5, x7, z1, z3, z5, z7 }
	 *
	 * and after performing 8x8->16 bit long polynomial multiplication of
	 * each of the halves of the first vector with those of the second one,
	 * we obtain the following four vectors of 16-bit elements:
	 *
	 *   a := { w0*x0, w0*x2, w0*x4, w0*x6 }, { y0*z0, y0*z2, y0*z4, y0*z6 }
	 *   b := { w0*x1, w0*x3, w0*x5, w0*x7 }, { y0*z1, y0*z3, y0*z5, y0*z7 }
	 *   c := { w1*x0, w1*x2, w1*x4, w1*x6 }, { y1*z0, y1*z2, y1*z4, y1*z6 }
	 *   d := { w1*x1, w1*x3, w1*x5, w1*x7 }, { y1*z1, y1*z3, y1*z5, y1*z7 }
	 *
	 * Results b and c can be XORed together, as the vector elements have
	 * matching ranks.  Then, the final XOR (*) can be pulled forward, and
	 * applied between the halves of each of the remaining three vectors,
	 * which are then shifted into place, and combined to produce two
	 * 80-bit results.
	 *
	 * (*) NOTE: the 16x64 bit polynomial multiply below is not equivalent
	 * to the 64x64 bit one above, but XOR'ing the outputs together will
	 * produce the expected result, and this is sufficient in the context
	 * of this algorithm.
	 */
	.macro		pmull16x64_p8, a16, b64, c64
	ext		t7.16b, \b64\().16b, \b64\().16b, #1
	tbl		t5.16b, {\a16\().16b}, perm.16b
	uzp1		t7.16b, \b64\().16b, t7.16b
	bl		__pmull_p8_16x64
	ext		\b64\().16b, t4.16b, t4.16b, #15
	eor		\c64\().16b, t8.16b, t5.16b
	.endm

SYM_FUNC_START_LOCAL(__pmull_p8_16x64)
	ext		t6.16b, t5.16b, t5.16b, #8

	pmull		t3.8h, t7.8b, t5.8b
	pmull		t4.8h, t7.8b, t6.8b
	pmull2		t5.8h, t7.16b, t5.16b
	pmull2		t6.8h, t7.16b, t6.16b

	ext		t8.16b, t3.16b, t3.16b, #8
	eor		t4.16b, t4.16b, t6.16b
	ext		t7.16b, t5.16b, t5.16b, #8
	ext		t6.16b, t4.16b, t4.16b, #8
	eor		t8.8b, t8.8b, t3.8b
	eor		t5.8b, t5.8b, t7.8b
	eor		t4.8b, t4.8b, t6.8b
	ext		t5.16b, t5.16b, t5.16b, #14
	ret
SYM_FUNC_END(__pmull_p8_16x64)

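// A hedged C sketch of the decomposition described above (illustration only;
// clmul8x8() and clmul16x64() are made-up reference helpers, assuming
// <stdint.h> and a compiler that provides __uint128_t).  It computes one
// exact 16x64-bit carryless product from 8x8-bit products combined by rank;
// the pairwise macros above evaluate two such products at once (one per
// 64-bit lane), and per the NOTE only the XOR of their two outputs is
// relied upon:
//
//	static uint16_t clmul8x8(uint8_t a, uint8_t b)
//	{
//		uint16_t r = 0;
//		int i;
//
//		for (i = 0; i < 8; i++)
//			if (b & (1 << i))
//				r ^= (uint16_t)a << i;
//		return r;
//	}
//
//	static __uint128_t clmul16x64(uint16_t w, uint64_t x)
//	{
//		uint8_t w0 = w, w1 = w >> 8;
//		__uint128_t r = 0;
//		int i;
//
//		for (i = 0; i < 8; i++) {
//			uint8_t xi = x >> (8 * i);
//
//			r ^= (__uint128_t)clmul8x8(w0, xi) << (8 * i);
//			r ^= (__uint128_t)clmul8x8(w1, xi) << (8 * i + 8);
//		}
//		return r;
//	}
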

	// Fold reg1, reg2 into the next 32 data bytes, storing the result back
	// into reg1, reg2.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	pmull16x64_\p	fold_consts, \reg1, v8

CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	pmull16x64_\p	fold_consts, \reg2, v9

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

	// Fold src_reg into dst_reg, optionally loading the next fold constants
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	pmull16x64_\p	fold_consts, \src_reg, v8
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm
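
// A hedged C sketch of the folding step these macros implement (reference
// only; it reuses the made-up clmul16x64() helper sketched above).  Folding
// a 128-bit accumulator 'acc' across a distance of D bytes into the 16 data
// bytes at that distance uses the precomputed constants
// k_lo = x^(8*D) mod G(x) and k_hi = x^(8*D+64) mod G(x) (see the
// .Lfold_across_*_bytes_consts table at the end of this file), and keeps the
// running value congruent to acc * x^(8*D) + data modulo G(x):
//
//	static __uint128_t fold_16(__uint128_t acc, __uint128_t data,
//				   uint16_t k_lo, uint16_t k_hi)
//	{
//		return data ^ clmul16x64(k_lo, (uint64_t)acc)
//			    ^ clmul16x64(k_hi, (uint64_t)(acc >> 64));
//	}
//
// fold_16_bytes is one such step; fold_32_bytes performs two of them on
// independent accumulators, interleaved with the data loads and byte swaps.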

	.macro		crc_t10dif_pmull, p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	pmull16x64_\p	fold_consts, v7, v8
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	pmull16x64_\p	fold_consts, v3, v0
	eor		v7.16b, v3.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b
	b		.Lreduce_final_16_bytes_\@

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31

.Lreduce_final_16_bytes_\@:
	.endm
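
// A hedged C sketch of the partial-segment trick used by
// .Lhandle_partial_segment above (reference only; it builds on the made-up
// fold_16()/clmul16x64() helpers sketched earlier).  With 1 <= n <= 15
// trailing bytes, the remaining message equals acc * x^(8*n) + tail.
// Redividing it as 'first' = top n bytes of acc and
// 'second' = (acc << 8*n) | tail gives exactly first * x^128 + second, so a
// single fold across 16 bytes (constants x^128 mod G(x) and x^192 mod G(x))
// finishes the job without needing a constant per possible 'n':
//
//	// 'acc' and 'last16' are in the same byte-swapped order as v7/v0:
//	// earliest message byte most significant.  'last16' holds the last
//	// 16 original data bytes, so its low n bytes are the tail.
//	static __uint128_t partial_fold(__uint128_t acc, __uint128_t last16,
//					unsigned int n,
//					uint16_t k_lo, uint16_t k_hi)
//	{
//		__uint128_t tail = last16 & (((__uint128_t)1 << (8 * n)) - 1);
//		__uint128_t first = acc >> (8 * (16 - n));
//		__uint128_t second = (acc << (8 * n)) | tail;
//
//		return fold_16(first, second, k_lo, k_hi);
//	}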

//
// void crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len,
//			    u8 out[16]);
//
// Assumes len >= 16.  Stores the 16-byte folded value to 'out' instead of
// returning the final CRC; the remaining reduction is left to the caller.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	frame_push	1

	// Compose { 0,0,0,0, 8,8,8,8, 1,1,1,1, 9,9,9,9 }
	movi		perm.4h, #8, lsl #8
	orr		perm.2s, #1, lsl #16
	orr		perm.2s, #1, lsl #24
	zip1		perm.16b, perm.16b, perm.16b
	zip1		perm.16b, perm.16b, perm.16b

	crc_t10dif_pmull p8

CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)
	str		q7, [x3]

	frame_pop
	ret
SYM_FUNC_END(crc_t10dif_pmull_p8)

	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	crc_t10dif_pmull p64

	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	pmull2		v7.1q, v7.2d, fold_consts.2d	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b		// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]		// zero high 32 bits
	pmull		v1.1q, v1.1d, fold_consts.1d	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b		// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]

	// Use Barrett reduction to compute the final CRC value.
	pmull2		v1.1q, v0.2d, fold_consts.2d	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32		// /= x^32
	pmull		v1.1q, v1.1d, fold_consts.1d	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b		// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	ret
SYM_FUNC_END(crc_t10dif_pmull_p64)
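
// A hedged C sketch of the final reduction above (reference only; clmul()
// is a made-up generic carryless multiply, and the routine operates on the
// 48-bit value directly rather than on the x^48-shifted register layout the
// assembly uses).  't' is the 48-bit value congruent to x^16 * M(x) modulo
// G(x) that remains after the folds; Barrett reduction estimates
// q ~= floor(t / G(x)) using mu = floor(x^48 / G(x)), then takes the low 16
// bits of t ^ q*G(x):
//
//	static __uint128_t clmul(uint64_t a, uint64_t b)
//	{
//		__uint128_t r = 0;
//		int i;
//
//		for (i = 0; i < 64; i++)
//			if ((b >> i) & 1)
//				r ^= (__uint128_t)a << i;
//		return r;
//	}
//
//	static uint16_t barrett_reduce(uint64_t t)
//	{
//		const uint64_t g  = 0x18bb7;		// G(x)
//		const uint64_t mu = 0x1f65a57f8;	// floor(x^48 / G(x))
//		uint64_t q;
//
//		q = (uint64_t)(clmul(t >> 16, mu) >> 32);
//		return (uint16_t)(t ^ (uint64_t)clmul(q, g));
//	}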

	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0
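
// A hedged C sketch of how the 'x^N mod G(x)' constants above can be
// regenerated (reference only; xpow_mod_g() is a made-up helper):
//
//	static uint16_t xpow_mod_g(unsigned int n)
//	{
//		uint32_t r = 1;				// x^0
//
//		while (n--) {
//			r <<= 1;			// multiply by x
//			if (r & 0x10000)
//				r ^= 0x18bb7;		// reduce modulo G(x)
//		}
//		return (uint16_t)r;
//	}
//
// For example, xpow_mod_g(8*128) and xpow_mod_g(8*128 + 64) should
// reproduce the 0x6123 and 0x2295 entries of .Lfold_across_128_bytes_consts.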