/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; if not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware-provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2; the reference can be found
 * at:
 * https://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */
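/*
 * For reference (an illustrative sketch, not part of this file's build):
 * the bit-reflected CRC32 that the routines below accelerate can be
 * computed one bit at a time in C with the LE polynomial 0xEDB88320;
 * CRC32C uses the same loop with the Castagnoli polynomial 0x82F63B78.
 *
 *	uint32_t crc32_le_bitwise(uint32_t crc, const uint8_t *p, size_t len)
 *	{
 *		while (len--) {
 *			crc ^= *p++;
 *			for (int i = 0; i < 8; i++)
 *				crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
 *		}
 *		return crc;
 *	}
 */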
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.align		6
	.arch		armv8-a
	.arch_extension	crc
	.fpu		crypto-neon-fp-armv8

.Lcrc32_constants:
	/*
	 * [(x^(4*128+32) mod P(x) << 32)]' << 1 = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x^(4*128-32) mod P(x) << 32)]' << 1 = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.quad		0x0000000154442bd4
	.quad		0x00000001c6e41596

	/*
	 * [(x^(128+32) mod P(x) << 32)]' << 1 = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x^(128-32) mod P(x) << 32)]' << 1 = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.quad		0x00000001751997d0
	.quad		0x00000000ccaa009e

	/*
	 * [(x^64 mod P(x) << 32)]' << 1 = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64)' = u' = (x^64 / P(x))'
	 *                                        = 0x1F7011641LL
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.quad		0x00000001DB710641
	.quad		0x00000001F7011641

.Lcrc32c_constants:
	.quad		0x00000000740eef02
	.quad		0x000000009e4addf8
	.quad		0x00000000f20c0dfe
	.quad		0x000000014cd00bd6
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.quad		0x0000000105ec76f0
	.quad		0x00000000dea713f1
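/*
 * How the fold constants above can be derived (an illustrative C sketch,
 * not part of the build): compute x^n mod P(x) over GF(2), bit-reflect the
 * 32-bit remainder, and shift left by one, matching the
 * "[(x^n mod P(x) << 32)]' << 1" formulas in the comments, where ' denotes
 * bit reflection.
 *
 *	static uint32_t xn_mod_p(unsigned int n, uint32_t poly)
 *	{
 *		uint32_t r = 1;				// x^0
 *
 *		while (n--) {				// multiply by x ...
 *			int carry = r >> 31;
 *
 *			r <<= 1;
 *			if (carry)
 *				r ^= poly;		// ... reduce mod P(x)
 *		}
 *		return r;
 *	}
 *
 *	static uint64_t fold_const(unsigned int n, uint32_t poly)
 *	{
 *		uint32_t v = xn_mod_p(n, poly), r = 0;
 *
 *		for (int i = 0; i < 32; i++)		// bit-reflect
 *			r |= ((v >> i) & 1u) << (31 - i);
 *		return (uint64_t)r << 1;
 *	}
 *
 * e.g. fold_const(4*128+32, 0x04c11db7) yields CONSTANT_R1 = 0x154442bd4.
 * The .Lcrc32c_constants table is built the same way from the CRC32C
 * polynomial 0x1edc6f41.
 */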
	dCONSTANTl	.req	d0
	dCONSTANTh	.req	d1
	qCONSTANT	.req	q0

	BUF		.req	r0
	LEN		.req	r1
	CRC		.req	r2

	qzr		.req	q9

	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return r0 crc32
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *                     size_t len, uint crc32)
	 */
SYM_FUNC_START(crc32_pmull_le)
	adr		r3, .Lcrc32_constants
	b		0f
SYM_FUNC_END(crc32_pmull_le)

SYM_FUNC_START(crc32c_pmull_le)
	adr		r3, .Lcrc32c_constants

0:	bic		LEN, LEN, #15
	vld1.8		{q1-q2}, [BUF, :128]!
	vld1.8		{q3-q4}, [BUF, :128]!
	vmov.i8		qzr, #0
	vmov.i8		qCONSTANT, #0
	vmov.32		dCONSTANTl[0], CRC
	veor.8		d2, d2, dCONSTANTl
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	blt		less_64

	vld1.64		{qCONSTANT}, [r3]
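/*
 * The loop below folds 64 bytes per iteration: each of the four 128-bit
 * lanes is carry-less multiplied by CONSTANT_R1/R2 and XORed with the next
 * 64 bytes of input.  An illustrative C model of one lane's fold step,
 * with a bitwise clmul64() helper standing in for vmull.p64:
 *
 *	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
 *	{
 *		*hi = *lo = 0;
 *		for (int i = 0; i < 64; i++)
 *			if ((b >> i) & 1) {
 *				*lo ^= a << i;
 *				if (i)
 *					*hi ^= a >> (64 - i);
 *			}
 *	}
 *
 *	static void fold_lane(uint64_t acc[2], const uint64_t data[2])
 *	{
 *		uint64_t h1, l1, h2, l2;
 *
 *		clmul64(acc[0], 0x154442bd4, &h1, &l1);	// low  half * R1
 *		clmul64(acc[1], 0x1c6e41596, &h2, &l2);	// high half * R2
 *		acc[0] = l1 ^ l2 ^ data[0];
 *		acc[1] = h1 ^ h2 ^ data[1];
 *	}
 */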
loop_64:	/* 64 bytes Full cache line folding */
	sub		LEN, LEN, #0x40

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q6, d5, dCONSTANTh
	vmull.p64	q7, d7, dCONSTANTh
	vmull.p64	q8, d9, dCONSTANTh

	vmull.p64	q1, d2, dCONSTANTl
	vmull.p64	q2, d4, dCONSTANTl
	vmull.p64	q3, d6, dCONSTANTl
	vmull.p64	q4, d8, dCONSTANTl

	veor.8		q1, q1, q5
	vld1.8		{q5}, [BUF, :128]!
	veor.8		q2, q2, q6
	vld1.8		{q6}, [BUF, :128]!
	veor.8		q3, q3, q7
	vld1.8		{q7}, [BUF, :128]!
	veor.8		q4, q4, q8
	vld1.8		{q8}, [BUF, :128]!

	veor.8		q1, q1, q5
	veor.8		q2, q2, q6
	veor.8		q3, q3, q7
	veor.8		q4, q4, q8

	cmp		LEN, #0x40
	bge		loop_64

less_64:	/* Folding cache line into 128bit */
	vldr		dCONSTANTl, [r3, #16]
	vldr		dCONSTANTh, [r3, #24]

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q3

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q4

	teq		LEN, #0
	beq		fold_64

loop_16:	/* Folding rest buffer into 128bit */
	subs		LEN, LEN, #0x10

	vld1.8		{q2}, [BUF, :128]!
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	bne		loop_16

fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	vmull.p64	q2, d2, dCONSTANTh
	vext.8		q1, q1, qzr, #8
	veor.8		q1, q1, q2

	/* final 32-bit fold */
	vldr		dCONSTANTl, [r3, #32]
	vldr		d6, [r3, #40]
	vmov.i8		d7, #0

	vext.8		q2, q1, qzr, #4
	vand.8		d2, d2, d6
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q2

	/* Finish up with the bit-reversed Barrett reduction 64 ==> 32 bits */
	vldr		dCONSTANTl, [r3, #48]
	vldr		dCONSTANTh, [r3, #56]

	vand.8		q2, q1, q3
	vext.8		q2, qzr, q2, #8
	vmull.p64	q2, d5, dCONSTANTh
	vand.8		q2, q2, q3
	vmull.p64	q2, d4, dCONSTANTl
	veor.8		q1, q1, q2
	vmov		r0, s5

	bx		lr
SYM_FUNC_END(crc32c_pmull_le)
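/*
 * The tail of the routine above implements the bit-reflected Barrett
 * reduction from a 64-bit remainder down to the 32-bit CRC.  An
 * illustrative C model (clmul64() as sketched earlier; the constants are
 * CONSTANT_RU and CRCPOLY_TRUE_LE_FULL from .Lcrc32_constants):
 *
 *	static uint32_t barrett_reduce(uint64_t x)
 *	{
 *		uint64_t hi, lo;
 *
 *		clmul64(x & 0xffffffff, 0x1f7011641, &hi, &lo);	 // * RU
 *		clmul64(lo & 0xffffffff, 0x1db710641, &hi, &lo); // * P'
 *		return (uint32_t)((lo ^ x) >> 32);
 *	}
 */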
	.macro		__crc32, c
	subs		ip, r2, #8
	bmi		.Ltail\c

	tst		r1, #3
	bne		.Lunaligned\c

	teq		ip, #0
.Laligned8\c:
	ldrd		r2, r3, [r1], #8
ARM_BE8(rev		r2, r2		)
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r2
	crc32\c\()w	r0, r0, r3
	bxeq		lr
	subs		ip, ip, #8
	bpl		.Laligned8\c

.Ltail\c:
	tst		ip, #4
	beq		2f
	ldr		r3, [r1], #4
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r3

2:	tst		ip, #2
	beq		1f
	ldrh		r3, [r1], #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

1:	tst		ip, #1
	bxeq		lr
	ldrb		r3, [r1]
	crc32\c\()b	r0, r0, r3
	bx		lr

.Lunaligned\c:
	tst		r1, #1
	beq		2f
	ldrb		r3, [r1], #1
	subs		r2, r2, #1
	crc32\c\()b	r0, r0, r3

	tst		r1, #2
	beq		0f
2:	ldrh		r3, [r1], #2
	subs		r2, r2, #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

0:	subs		ip, r2, #8
	bpl		.Laligned8\c
	b		.Ltail\c
	.endm

	.align		5
SYM_FUNC_START(crc32_armv8_le)
	__crc32
SYM_FUNC_END(crc32_armv8_le)

	.align		5
SYM_FUNC_START(crc32c_armv8_le)
	__crc32	c
SYM_FUNC_END(crc32c_armv8_le)
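/*
 * For illustration only: the __crc32 macro above is roughly equivalent to
 * the following little-endian C, using the ACLE intrinsics from
 * <arm_acle.h> (unaligned-head handling elided; function name
 * hypothetical):
 *
 *	#include <arm_acle.h>
 *	#include <stdint.h>
 *	#include <string.h>
 *
 *	static uint32_t crc32_scalar(uint32_t crc, const uint8_t *p,
 *				     size_t len)
 *	{
 *		while (len >= 8) {		// 8 bytes per iteration
 *			uint32_t lo, hi;
 *
 *			memcpy(&lo, p, 4);
 *			memcpy(&hi, p + 4, 4);
 *			crc = __crc32w(crc, lo);
 *			crc = __crc32w(crc, hi);
 *			p += 8;
 *			len -= 8;
 *		}
 *		if (len & 4) {			// 4-, 2-, 1-byte tails
 *			uint32_t v;
 *
 *			memcpy(&v, p, 4);
 *			crc = __crc32w(crc, v);
 *			p += 4;
 *		}
 *		if (len & 2) {
 *			uint16_t v;
 *
 *			memcpy(&v, p, 2);
 *			crc = __crc32h(crc, v);
 *			p += 2;
 *		}
 *		if (len & 1)
 *			crc = __crc32b(crc, *p);
 *		return crc;
 *	}
 *
 * crc32c_armv8_le is the same with __crc32cw/__crc32ch/__crc32cb.
 */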