/*
 * Accelerated CRC32(C) using ARM CRC, NEON and Crypto Extensions instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* GPL HEADER START
 *
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 only,
 * as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License version 2 for more details (a copy is included
 * in the LICENSE file that accompanied this code).
 *
 * You should have received a copy of the GNU General Public License
 * version 2 along with this program; If not, see http://www.gnu.org/licenses
 *
 * Please visit http://www.xyratex.com/contact if you need additional
 * information or have any questions.
 *
 * GPL HEADER END
 */

/*
 * Copyright 2012 Xyratex Technology Limited
 *
 * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32
 * calculation.
 * CRC32 polynomial: 0x04c11db7 (BE) / 0xEDB88320 (LE)
 * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found
 * at:
 * https://www.intel.com/products/processor/manuals/
 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
 * Volume 2B: Instruction Set Reference, N-Z
 *
 * Authors:	Gregory Prestas <Gregory_Prestas@us.xyratex.com>
 *		Alexander Boyko <Alexander_Boyko@xyratex.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

	.text
	.align		6
	.arch		armv8-a
	.arch_extension	crc
	.fpu		crypto-neon-fp-armv8

.Lcrc32_constants:
	/*
	 * [(x4*128+32 mod P(x) << 32)]'  << 1   = 0x154442bd4
	 * #define CONSTANT_R1  0x154442bd4LL
	 *
	 * [(x4*128-32 mod P(x) << 32)]'  << 1   = 0x1c6e41596
	 * #define CONSTANT_R2  0x1c6e41596LL
	 */
	.quad		0x0000000154442bd4
	.quad		0x00000001c6e41596

	/*
	 * [(x128+32 mod P(x) << 32)]'    << 1   = 0x1751997d0
	 * #define CONSTANT_R3  0x1751997d0LL
	 *
	 * [(x128-32 mod P(x) << 32)]'    << 1   = 0x0ccaa009e
	 * #define CONSTANT_R4  0x0ccaa009eLL
	 */
	.quad		0x00000001751997d0
	.quad		0x00000000ccaa009e

	/*
	 * [(x64 mod P(x) << 32)]'        << 1   = 0x163cd6124
	 * #define CONSTANT_R5  0x163cd6124LL
	 */
	.quad		0x0000000163cd6124
	.quad		0x00000000FFFFFFFF

	/*
	 * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL
	 *
	 * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))`
	 *                                        = 0x1F7011641LL
	 * #define CONSTANT_RU  0x1F7011641LL
	 */
	.quad		0x00000001DB710641
	.quad		0x00000001F7011641

.Lcrc32c_constants:
	/*
	 * Same layout as .Lcrc32_constants above, but with the fold and
	 * Barrett reduction constants derived from the CRC32C (Castagnoli)
	 * polynomial 0x1EDC6F41 (0x82F63B78 bit-reflected).
	 */
	.quad		0x00000000740eef02
	.quad		0x000000009e4addf8
	.quad		0x00000000f20c0dfe
	.quad		0x000000014cd00bd6
	.quad		0x00000000dd45aab8
	.quad		0x00000000FFFFFFFF
	.quad		0x0000000105ec76f0
	.quad		0x00000000dea713f1

	dCONSTANTl	.req	d0
	dCONSTANTh	.req	d1
	qCONSTANT	.req	q0

	BUF		.req	r0
	LEN		.req	r1
	CRC		.req	r2

	qzr		.req	q9
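
	/*
	 * For reference, the semantics that both implementations below are
	 * expected to match can be described by a minimal bitwise C model
	 * (illustrative sketch only, not part of the build; the helper name
	 * is made up).  "poly" is the bit-reflected generator: 0xEDB88320
	 * for CRC32, 0x82F63B78 for CRC32C.  No initial/final inversion is
	 * applied here; that convention is left to the caller.
	 *
	 *	static u32 crc32_bitwise(u32 crc, const u8 *p, size_t len,
	 *				 u32 poly)
	 *	{
	 *		int i;
	 *
	 *		while (len--) {
	 *			crc ^= *p++;
	 *			for (i = 0; i < 8; i++)
	 *				crc = (crc >> 1) ^ (poly & -(crc & 1));
	 *		}
	 *		return crc;
	 *	}
	 */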
	/**
	 * Calculate crc32
	 * BUF - buffer
	 * LEN - sizeof buffer (multiple of 16 bytes), LEN should be > 63
	 * CRC - initial crc32
	 * return crc32 (in r0)
	 * uint crc32_pmull_le(unsigned char const *buffer,
	 *		       size_t len, uint crc32)
	 */
SYM_FUNC_START(crc32_pmull_le)
	adr		r3, .Lcrc32_constants
	b		0f
SYM_FUNC_END(crc32_pmull_le)

SYM_FUNC_START(crc32c_pmull_le)
	adr		r3, .Lcrc32c_constants

0:	bic		LEN, LEN, #15
	vld1.8		{q1-q2}, [BUF, :128]!
	vld1.8		{q3-q4}, [BUF, :128]!
	vmov.i8		qzr, #0
	vmov.i8		qCONSTANT, #0
	vmov.32		dCONSTANTl[0], CRC
	veor.8		d2, d2, dCONSTANTl	/* xor initial CRC into first 4 bytes */
	sub		LEN, LEN, #0x40
	cmp		LEN, #0x40
	blt		less_64

	vld1.64		{qCONSTANT}, [r3]

loop_64:		/* 64 bytes Full cache line folding */
	sub		LEN, LEN, #0x40

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q6, d5, dCONSTANTh
	vmull.p64	q7, d7, dCONSTANTh
	vmull.p64	q8, d9, dCONSTANTh

	vmull.p64	q1, d2, dCONSTANTl
	vmull.p64	q2, d4, dCONSTANTl
	vmull.p64	q3, d6, dCONSTANTl
	vmull.p64	q4, d8, dCONSTANTl

	veor.8		q1, q1, q5
	vld1.8		{q5}, [BUF, :128]!
	veor.8		q2, q2, q6
	vld1.8		{q6}, [BUF, :128]!
	veor.8		q3, q3, q7
	vld1.8		{q7}, [BUF, :128]!
	veor.8		q4, q4, q8
	vld1.8		{q8}, [BUF, :128]!

	veor.8		q1, q1, q5
	veor.8		q2, q2, q6
	veor.8		q3, q3, q7
	veor.8		q4, q4, q8

	cmp		LEN, #0x40
	bge		loop_64

less_64:		/* Folding cache line into 128bit */
	vldr		dCONSTANTl, [r3, #16]
	vldr		dCONSTANTh, [r3, #24]

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q3

	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q4

	teq		LEN, #0
	beq		fold_64

loop_16:		/* Folding rest buffer into 128bit */
	subs		LEN, LEN, #0x10

	vld1.8		{q2}, [BUF, :128]!
	vmull.p64	q5, d3, dCONSTANTh
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q5
	veor.8		q1, q1, q2

	bne		loop_16
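
	/*
	 * At this point q1 holds the whole message folded down to a single
	 * 128-bit remainder.  The code below finishes the reduction; as a
	 * rough sketch, written in the non-reflected domain (the code itself
	 * works on the bit-reflected representation using the constant
	 * tables above -- see Intel's "Fast CRC Computation for Generic
	 * Polynomials Using PCLMULQDQ Instruction" white paper for the full
	 * derivation):
	 *
	 *   1. fold_64: reduce 128 bits to 64 bits while appending the
	 *      32 zero bits that terminate the message.
	 *   2. final 32-bit fold: multiply the masked low 32 bits by
	 *      CONSTANT_R5 and xor, leaving a 64-bit value R(x).
	 *   3. Barrett reduction of R(x) to the 32-bit CRC:
	 *
	 *         T1(x) = floor(R(x)/x^32) * u(x)
	 *         T2(x) = floor(T1(x)/x^32) * P(x)
	 *         CRC   = (R(x) + T2(x)) mod x^32
	 *
	 *      with u(x) = floor(x^64/P(x)) (CONSTANT_RU) and P(x) the full
	 *      CRC polynomial (CRCPOLY_TRUE_LE_FULL).
	 */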
fold_64:
	/* perform the last 64 bit fold, also adds 32 zeroes
	 * to the input stream */
	vmull.p64	q2, d2, dCONSTANTh
	vext.8		q1, q1, qzr, #8
	veor.8		q1, q1, q2

	/* final 32-bit fold */
	vldr		dCONSTANTl, [r3, #32]
	vldr		d6, [r3, #40]
	vmov.i8		d7, #0

	vext.8		q2, q1, qzr, #4
	vand.8		d2, d2, d6
	vmull.p64	q1, d2, dCONSTANTl
	veor.8		q1, q1, q2

	/* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */
	vldr		dCONSTANTl, [r3, #48]
	vldr		dCONSTANTh, [r3, #56]

	vand.8		q2, q1, q3
	vext.8		q2, qzr, q2, #8
	vmull.p64	q2, d5, dCONSTANTh
	vand.8		q2, q2, q3
	vmull.p64	q2, d4, dCONSTANTl
	veor.8		q1, q1, q2
	vmov		r0, s5

	bx		lr
SYM_FUNC_END(crc32c_pmull_le)

	/*
	 * Scalar CRC32/CRC32C using the ARMv8 CRC instructions.
	 * r0 = initial crc, r1 = buf, r2 = len; \c is empty for CRC32
	 * and "c" for CRC32C.
	 */
	.macro		__crc32, c
	subs		ip, r2, #8
	bmi		.Ltail\c

	tst		r1, #3
	bne		.Lunaligned\c

	teq		ip, #0
.Laligned8\c:
	ldrd		r2, r3, [r1], #8
ARM_BE8(rev		r2, r2		)
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r2
	crc32\c\()w	r0, r0, r3
	bxeq		lr
	subs		ip, ip, #8
	bpl		.Laligned8\c

.Ltail\c:
	tst		ip, #4
	beq		2f
	ldr		r3, [r1], #4
ARM_BE8(rev		r3, r3		)
	crc32\c\()w	r0, r0, r3

2:	tst		ip, #2
	beq		1f
	ldrh		r3, [r1], #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

1:	tst		ip, #1
	bxeq		lr
	ldrb		r3, [r1]
	crc32\c\()b	r0, r0, r3
	bx		lr

.Lunaligned\c:
	tst		r1, #1
	beq		2f
	ldrb		r3, [r1], #1
	subs		r2, r2, #1
	crc32\c\()b	r0, r0, r3

	tst		r1, #2
	beq		0f
2:	ldrh		r3, [r1], #2
	subs		r2, r2, #2
ARM_BE8(rev16		r3, r3		)
	crc32\c\()h	r0, r0, r3

0:	subs		ip, r2, #8
	bpl		.Laligned8\c
	b		.Ltail\c
	.endm

	.align		5
SYM_TYPED_FUNC_START(crc32_armv8_le)
	__crc32
SYM_FUNC_END(crc32_armv8_le)

	.align		5
SYM_TYPED_FUNC_START(crc32c_armv8_le)
	__crc32	c
SYM_FUNC_END(crc32c_armv8_le)
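
/*
 * For reference, the __crc32 macro above maps closely onto the ACLE CRC
 * intrinsics from <arm_acle.h>.  A rough C sketch (illustrative only, not
 * part of the build; the function name is made up, it ignores the alignment
 * and big-endian handling the assembly performs, and get_unaligned_le32()/
 * get_unaligned_le16() stand in for the word/halfword loads):
 *
 *	u32 crc32_armv8_c_model(u32 crc, const u8 *p, size_t len)
 *	{
 *		while (len >= 4) {
 *			crc = __crc32w(crc, get_unaligned_le32(p));
 *			p += 4;
 *			len -= 4;
 *		}
 *		if (len & 2) {
 *			crc = __crc32h(crc, get_unaligned_le16(p));
 *			p += 2;
 *		}
 *		if (len & 1)
 *			crc = __crc32b(crc, *p);
 *		return crc;
 *	}
 *
 * The CRC32C flavour ("__crc32 c") is identical except that it emits the
 * crc32cw/crc32ch/crc32cb forms of the instructions (ACLE __crc32cw/
 * __crc32ch/__crc32cb).
 */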