/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using the hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is an instruction introduced with Intel SSE4.2; the reference
 * can be found at:
 * https://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.align		6
	.arch		armv8-a
	.arch_extension	crc
	.fpu		crypto-neon-fp-armv8

.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]' << 1	= 0x154442bd4
	 * #define CONSTANT_R1	0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]' << 1	= 0x1c6e41596
	 * #define CONSTANT_R2	0x1c6e41596LL
	 */
	.quad		0x0000000154442bd4
	.quad		0x00000001c6e41596

	/*
	 * [(x128+32 mod P(x) << 32)]' << 1	= 0x1751997d0
	 * #define CONSTANT_R3	0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]' << 1	= 0x0ccaa009e
	 * #define CONSTANT_R4	0x0ccaa009eLL
	 */
	.quad		0x00000001751997d0
	.quad		0x00000000ccaa009e

	/*
	 * [(x64 mod P(x) << 32)]' << 1		= 0x163cd6124
	 * #define CONSTANT_R5	0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 * = 0x1F7011641LL
	 * #define CONSTANT_RU	0x1F7011641LL
	 */
	.quad		0x00000001DB710641
	.quad		0x00000001F7011641

.Lcrc32c_constants:
	.quad		0x00000000740eef02
	.quad		0x000000009e4addf8
	.quad		0x00000000f20c0dfe
	.quad		0x000000014cd00bd6
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.quad		0x0000000105ec76f0
	.quad		0x00000000dea713f1

	dCONSTANTl	.req	d0
	dCONSTANTh	.req	d1
	qCONSTANT	.req	q0

	BUF		.req	r0
	LEN		.req	r1
	CRC		.req	r2

	qzr		.req	q9
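	/*
	 * Reference semantics (a sketch for documentation only; not built):
	 * both the PMULL-based functions below and the CRC-instruction based
	 * __crc32 macro at the end of this file compute the same bit-reflected
	 * CRC as the plain C loop below, using the reflected polynomial
	 * 0xEDB88320 for CRC32 and 0x82F63B78 for CRC32C.  Any initial/final
	 * inversion of the CRC value is left to the caller; it is not done in
	 * this file.  The helper name crc32_le_ref is made up for this comment.
	 *
	 *	u32 crc32_le_ref(u32 crc, const u8 *p, size_t len)
	 *	{
	 *		while (len--) {
	 *			crc ^= *p++;
	 *			for (int i = 0; i < 8; i++)
	 *				crc = (crc >> 1) ^
	 *				      ((crc & 1) ? 0xEDB88320 : 0);
	 *		}
	 *		return crc;
	 *	}
	 */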
	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return: crc32 in r0
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *	               size_t len, uint crc32)
	 */
SYM_FUNC_START(crc32_pmull_le)
	adr		r3, .Lcrc32_constants
	b		0f
SYM_FUNC_END(crc32_pmull_le)

SYM_FUNC_START(crc32c_pmull_le)
	adr		r3, .Lcrc32c_constants

0:	bic		LEN, LEN, #15
	vld1.8		{q1-q2}, [BUF, :128]!
	vld1.8		{q3-q4}, [BUF, :128]!
	vmov.i8		qzr, #0
	vmov.i8		qCONSTANT, #0
	vmov.32		dCONSTANTl[0], CRC
	veor.8		d2, d2, dCONSTANTl
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	blt		less_64

	vld1.64		{qCONSTANT}, [r3]

loop_64:		/* 64 byte full cache line folding */
	sub		LEN, LEN, #0x40

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q6, d5, dCONSTANTh
	vmull.p64	q7, d7, dCONSTANTh
	vmull.p64	q8, d9, dCONSTANTh

	vmull.p64	q1, d2, dCONSTANTl
	vmull.p64	q2, d4, dCONSTANTl
	vmull.p64	q3, d6, dCONSTANTl
	vmull.p64	q4, d8, dCONSTANTl

	veor.8		q1, q1, q5
	vld1.8		{q5}, [BUF, :128]!
	veor.8		q2, q2, q6
	vld1.8		{q6}, [BUF, :128]!
	veor.8		q3, q3, q7
	vld1.8		{q7}, [BUF, :128]!
	veor.8		q4, q4, q8
	vld1.8		{q8}, [BUF, :128]!

	veor.8		q1, q1, q5
	veor.8		q2, q2, q6
	veor.8		q3, q3, q7
	veor.8		q4, q4, q8

	cmp		LEN, #0x40
	bge		loop_64
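	/*
	 * Shape of one fold step above, in C-like pseudocode (clmul64() is a
	 * hypothetical helper standing in for vmull.p64, i.e. a 64x64 -> 128
	 * bit carry-less multiply; acc and next are 128-bit values):
	 *
	 *	acc = clmul64(acc.lo, CONSTANT_R1) ^
	 *	      clmul64(acc.hi, CONSTANT_R2) ^ next;
	 *
	 * loop_64 keeps four such accumulators (q1-q4) in flight so that one
	 * full 64 byte cache line is folded per iteration.
	 */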
less_64:		/* fold the cache line into 128 bits */
	vldr		dCONSTANTl, [r3, #16]
	vldr		dCONSTANTh, [r3, #24]

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q3

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q4

	teq		LEN, #0
	beq		fold_64

loop_16:		/* fold the rest of the buffer into 128 bits */
	subs		LEN, LEN, #0x10

	vld1.8		{q2}, [BUF, :128]!
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	bne		loop_16

fold_64:
	/*
	 * Perform the last 64 bit fold; this also appends 32 zero bits
	 * to the input stream.
	 */
	vmull.p64	q2, d2, dCONSTANTh
	vext.8		q1, q1, qzr, #8
	veor.8		q1, q1, q2

	/* final 32-bit fold */
	vldr		dCONSTANTl, [r3, #32]
	vldr		d6, [r3, #40]
	vmov.i8		d7, #0

	vext.8		q2, q1, qzr, #4
	vand.8		d2, d2, d6
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q2

	/* finish up with the bit-reversed Barrett reduction, 64 ==> 32 bits */
	vldr		dCONSTANTl, [r3, #48]
	vldr		dCONSTANTh, [r3, #56]

	vand.8		q2, q1, q3
	vext.8		q2, qzr, q2, #8
	vmull.p64	q2, d5, dCONSTANTh
	vand.8		q2, q2, q3
	vmull.p64	q2, d4, dCONSTANTl
	veor.8		q1, q1, q2
	vmov		r0, s5

	bx		lr
SYM_FUNC_END(crc32c_pmull_le)

	.macro		__crc32, c
	subs		ip, r2, #8
	bmi		.Ltail\c

	tst		r1, #3
	bne		.Lunaligned\c

	teq		ip, #0
.Laligned8\c:
	ldrd		r2, r3, [r1], #8
ARM_BE8(rev		r2, r2		)
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r2
	crc32\c\()w	r0, r0, r3
	bxeq		lr
	subs		ip, ip, #8
	bpl		.Laligned8\c

.Ltail\c:
	tst		ip, #4
	beq		2f
	ldr		r3, [r1], #4
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r3

2:	tst		ip, #2
	beq		1f
	ldrh		r3, [r1], #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

1:	tst		ip, #1
	bxeq		lr
	ldrb		r3, [r1]
	crc32\c\()b	r0, r0, r3
	bx		lr

.Lunaligned\c:
	tst		r1, #1
	beq		2f
	ldrb		r3, [r1], #1
	subs		r2, r2, #1
	crc32\c\()b	r0, r0, r3

	tst		r1, #2
	beq		0f
2:	ldrh		r3, [r1], #2
	subs		r2, r2, #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

0:	subs		ip, r2, #8
	bpl		.Laligned8\c
	b		.Ltail\c
	.endm

	.align		5
SYM_FUNC_START(crc32_armv8_le)
	__crc32
SYM_FUNC_END(crc32_armv8_le)

	.align		5
SYM_FUNC_START(crc32c_armv8_le)
	__crc32		c
SYM_FUNC_END(crc32c_armv8_le)
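/*
 * Expected C-side declarations (a sketch; the glue code is not part of this
 * file, so the exact prototypes used by callers are an assumption based on
 * the register assignments above):
 *
 *	asmlinkage u32 crc32_pmull_le(const u8 *buf, u32 len, u32 crc);
 *	asmlinkage u32 crc32c_pmull_le(const u8 *buf, u32 len, u32 crc);
 *	asmlinkage u32 crc32_armv8_le(u32 crc, const u8 *buf, u32 len);
 *	asmlinkage u32 crc32c_armv8_le(u32 crc, const u8 *buf, u32 len);
 *
 * The PMULL entry points take (buf, len, crc) in r0-r2 and require len to be
 * a multiple of 16 and greater than 63; the __crc32 based entry points take
 * (crc, buf, len) in r0-r2 and handle arbitrary lengths and alignment, so a
 * caller would typically use them for short buffers and for any tail bytes
 * left over after the PMULL path.
 */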