// xref: /linux/arch/arm64/crypto/crct10dif-ce-core.S (revision 7ae9fb1b7ecbb5d85d07857943f677fd1a559b18)
//
// Accelerated CRC-T10DIF using arm64 NEON and Crypto Extensions instructions
//
// Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
// Copyright (C) 2019 Google LLC <ebiggers@google.com>
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License version 2 as
// published by the Free Software Foundation.
//

// Derived from the x86 version:
//
// Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
//
// Copyright (c) 2013, Intel Corporation
//
// Authors:
//     Erdinc Ozturk <erdinc.ozturk@intel.com>
//     Vinodh Gopal <vinodh.gopal@intel.com>
//     James Guilford <james.guilford@intel.com>
//     Tim Chen <tim.c.chen@linux.intel.com>
//
// This software is available to you under a choice of one of two
// licenses.  You may choose to be licensed under the terms of the GNU
// General Public License (GPL) Version 2, available from the file
// COPYING in the main directory of this source tree, or the
// OpenIB.org BSD license below:
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
//   notice, this list of conditions and the following disclaimer.
//
// * Redistributions in binary form must reproduce the above copyright
//   notice, this list of conditions and the following disclaimer in the
//   documentation and/or other materials provided with the
//   distribution.
//
// * Neither the name of the Intel Corporation nor the names of its
//   contributors may be used to endorse or promote products derived from
//   this software without specific prior written permission.
//
//
// THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
//       Reference paper titled "Fast CRC Computation for Generic
//	Polynomials Using PCLMULQDQ Instruction"
//       URL: http://www.intel.com/content/dam/www/public/us/en/documents
//  /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
//

// Helpers used below but not defined in this file (SYM_FUNC_*, CPU_LE, adr_l,
// mov_q, frame_push/frame_pop) presumably come from these kernel headers.
#include <linux/linkage.h>
#include <asm/assembler.h>

	.text
	.arch		armv8-a+crypto

	// Function arguments, in the order of the C prototypes further down.
	init_crc	.req	w0
	buf		.req	x1
	len		.req	x2
	// Cursor into the constant table in .rodata.
	fold_consts_ptr	.req	x3

	// The pair of 64-bit fold constants currently in use.
	fold_consts	.req	v10

	// Everything below is referenced only by the p8 (8-bit polynomial
	// multiply) code path.

	// Copy of the first multiplicand, set by __pmull_p8 for __pmull_p8_core.
	ad		.req	v14

	// Bit masks built by __pmull_init_p8.
	k00_16		.req	v15
	k32_48		.req	v16

	// Scratch registers for the partial products in __pmull_p8_core.
	t3		.req	v17
	t4		.req	v18
	t5		.req	v19
	t6		.req	v20
	t7		.req	v21
	t8		.req	v22
	t9		.req	v23

	// TBL index vectors built by __pmull_init_p8.
	perm1		.req	v24
	perm2		.req	v25
	perm3		.req	v26
	perm4		.req	v27

	// Byte-rotated copies of the fold constants, built by __pmull_pre_p8.
	bd1		.req	v28
	bd2		.req	v29
	bd3		.req	v30
	bd4		.req	v31

// One-time setup hook for the p64 variant.  Intentionally empty: it exists so
// that crc_t10dif_pmull can expand its init hook for either variant.
	.macro		__pmull_init_p64
	.endm

// Hook expanded after each reload of the fold constants.  The p64 variant
// multiplies by the constants register directly, so nothing is derived here;
// the 'bd' argument is accepted only to mirror __pmull_pre_p8.
	.macro		__pmull_pre_p64, bd
	.endm

// One-time setup for the p8 variant: builds the two bit masks and the four
// TBL index vectors that __pmull_pre_p8 and __pmull_p8_core rely on.
//
// Writes k00_16, k32_48 and perm1-perm4.  Clobbers x5.
	.macro		__pmull_init_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	//
	// After the EOR below, perm1 holds the byte indices
	// { 1, 2, ..., 7, 0 | 9, 10, ..., 15, 8 }: used as a TBL index it
	// rotates each 64-bit half of a vector by one byte.  perm2, perm3 and
	// perm4 are perm1 with each half rotated by a further 1, 2 and 3
	// bytes (USHR + SLI), giving rotations by 2, 3 and 4 bytes.
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		perm4.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, perm4.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		perm4.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		perm4.2d, perm1.2d, #40
	.endm

// Derive bd1-bd4 from the constants register 'bd': bdN is 'bd' permuted
// through permN, i.e. with each 64-bit half rotated by N bytes (see
// __pmull_init_p8).  Has to be expanded again whenever the constants change,
// since __pmull_p8_core multiplies by bd1-bd4.
	.macro		__pmull_pre_p8, bd
	tbl		bd1.16b, {\bd\().16b}, perm1.16b
	tbl		bd2.16b, {\bd\().16b}, perm2.16b
	tbl		bd3.16b, {\bd\().16b}, perm3.16b
	tbl		bd4.16b, {\bd\().16b}, perm4.16b
	.endm

// Out-of-line helper for the __pmull_p8 macro, reached with 'bl' through one
// of two entry points:
//
//   .L__pmull_p8_core    uses the low 64 bits of the operands (pmull)
//   .L__pmull_p8_core2   uses the high 64 bits of the operands (pmull2)
//
// In:   ad             copy of the first multiplicand A (set by __pmull_p8)
//       fold_consts    second multiplicand B
//       bd1-bd4        B with each half rotated by 1-4 bytes (__pmull_pre_p8)
//       perm1-perm3, k00_16, k32_48
//                      as set up by __pmull_init_p8
// Out:  t4, t6         combined partial products; __pmull_p8 XORs both into
//                      its D = A*B product
// Clobbers t3-t9.
SYM_FUNC_START_LOCAL(__pmull_p8_core)
.L__pmull_p8_core:
	// A1-A3: low half of A rotated by 1-3 bytes.
	ext		t4.8b, ad.8b, ad.8b, #1			// A1
	ext		t5.8b, ad.8b, ad.8b, #2			// A2
	ext		t6.8b, ad.8b, ad.8b, #3			// A3

	pmull		t4.8h, t4.8b, fold_consts.8b		// F = A1*B
	pmull		t8.8h, ad.8b, bd1.8b			// E = A*B1
	pmull		t5.8h, t5.8b, fold_consts.8b		// H = A2*B
	pmull		t7.8h, ad.8b, bd2.8b			// G = A*B2
	pmull		t6.8h, t6.8b, fold_consts.8b		// J = A3*B
	pmull		t9.8h, ad.8b, bd3.8b			// I = A*B3
	pmull		t3.8h, ad.8b, bd4.8b			// K = A*B4
	b		0f

.L__pmull_p8_core2:
	// Same as above for the high halves; TBL rotates both halves at once
	// and pmull2 then picks the upper one.
	tbl		t4.16b, {ad.16b}, perm1.16b		// A1
	tbl		t5.16b, {ad.16b}, perm2.16b		// A2
	tbl		t6.16b, {ad.16b}, perm3.16b		// A3

	pmull2		t4.8h, t4.16b, fold_consts.16b		// F = A1*B
	pmull2		t8.8h, ad.16b, bd1.16b			// E = A*B1
	pmull2		t5.8h, t5.16b, fold_consts.16b		// H = A2*B
	pmull2		t7.8h, ad.16b, bd2.16b			// G = A*B2
	pmull2		t6.8h, t6.16b, fold_consts.16b		// J = A3*B
	pmull2		t9.8h, ad.16b, bd3.16b			// I = A*B3
	pmull2		t3.8h, ad.16b, bd4.16b			// K = A*B4

	// Common tail for both entry points.
0:	eor		t4.16b, t4.16b, t8.16b			// L = E + F
	eor		t5.16b, t5.16b, t7.16b			// M = G + H
	eor		t6.16b, t6.16b, t9.16b			// N = I + J

	uzp1		t8.2d, t4.2d, t5.2d
	uzp2		t4.2d, t4.2d, t5.2d
	uzp1		t7.2d, t6.2d, t3.2d
	uzp2		t6.2d, t6.2d, t3.2d

	// t4 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t8.16b, t8.16b, t4.16b
	and		t4.16b, t4.16b, k32_48.16b

	// t6 = (N) (P4 + P5) << 24
	// t7 = (K) (P6 + P7) << 32
	eor		t7.16b, t7.16b, t6.16b
	and		t6.16b, t6.16b, k00_16.16b

	eor		t8.16b, t8.16b, t4.16b
	eor		t7.16b, t7.16b, t6.16b

	zip2		t5.2d, t8.2d, t4.2d
	zip1		t4.2d, t8.2d, t4.2d
	zip2		t3.2d, t7.2d, t6.2d
	zip1		t6.2d, t7.2d, t6.2d

	// Rotate each term left by 1, 2, 3 and 4 bytes respectively, matching
	// the << 8 ... << 32 weights noted above.
	ext		t4.16b, t4.16b, t4.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t6.16b, t6.16b, t6.16b, #13
	ext		t3.16b, t3.16b, t3.16b, #12

	eor		t4.16b, t4.16b, t5.16b
	eor		t6.16b, t6.16b, t3.16b
	ret
SYM_FUNC_END(__pmull_p8_core)

// __pmull_p8 rq, ad, bd [, i]
//
// p8 counterpart of __pmull_p64, built from 8-bit polynomial multiplies.
//
//   rq   destination vector register
//   ad   first multiplicand
//   bd   second multiplicand; must be spelled 'fold_consts' (anything else
//        is rejected with .err) because __pmull_p8_core reads fold_consts
//        and the bd1-bd4 copies derived from it directly
//   i    blank: use the low 64-bit halves; 2: use the high halves
//
// Copies the first multiplicand into the fixed 'ad' register (v14), calls
// __pmull_p8_core with 'bl' (so the link register is overwritten) and
// clobbers t3-t9.
	.macro		__pmull_p8, rq, ad, bd, i
	.ifnc		\bd, fold_consts
	.err
	.endif
	mov		ad.16b, \ad\().16b
	.ifb		\i
	pmull		\rq\().8h, \ad\().8b, \bd\().8b		// D = A*B
	.else
	pmull2		\rq\().8h, \ad\().16b, \bd\().16b	// D = A*B
	.endif

	bl		.L__pmull_p8_core\i

	// Merge the partial products returned in t4 and t6 into D.
	eor		\rq\().16b, \rq\().16b, t4.16b
	eor		\rq\().16b, \rq\().16b, t6.16b
	.endm

// fold_32_bytes p, reg1, reg2
//
// Fold reg1, reg2 into the next 32 data bytes, storing the result back
// into reg1, reg2.
//
//   p            p8 or p64; selects which __pmull_* macro is expanded
//   reg1, reg2   vector registers holding two 16-byte accumulators
//
// Advances buf by 32.  Clobbers v8, v9, v11 and v12, plus whatever the
// selected multiply macro clobbers.
	.macro		fold_32_bytes, p, reg1, reg2
	ldp		q11, q12, [buf], #0x20

	// v8 = high half of reg1 times the high constant; reg1 = low half of
	// reg1 times the low constant.
	__pmull_\p	v8, \reg1, fold_consts, 2
	__pmull_\p	\reg1, \reg1, fold_consts

	// Little-endian only: rev64 reverses the bytes inside each 64-bit
	// half; the ext #8 further down swaps the halves, which completes a
	// full 16-byte reversal of the new data.
CPU_LE(	rev64		v11.16b, v11.16b		)
CPU_LE(	rev64		v12.16b, v12.16b		)

	__pmull_\p	v9, \reg2, fold_consts, 2
	__pmull_\p	\reg2, \reg2, fold_consts

CPU_LE(	ext		v11.16b, v11.16b, v11.16b, #8	)
CPU_LE(	ext		v12.16b, v12.16b, v12.16b, #8	)

	// Accumulator = both products XOR the new data.
	eor		\reg1\().16b, \reg1\().16b, v8.16b
	eor		\reg2\().16b, \reg2\().16b, v9.16b
	eor		\reg1\().16b, \reg1\().16b, v11.16b
	eor		\reg2\().16b, \reg2\().16b, v12.16b
	.endm

// fold_16_bytes p, src_reg, dst_reg [, load_next_consts]
//
// Fold src_reg into dst_reg, optionally loading the next fold constants
//
//   p                  p8 or p64; selects which __pmull_* macro is expanded
//   src_reg            accumulator to fold; overwritten
//   dst_reg            accumulator that receives the folded value
//   load_next_consts   when non-blank, the next 16 bytes at fold_consts_ptr
//                      are loaded into fold_consts (advancing the pointer)
//                      once both multiplies have been issued
//
// Clobbers v8, plus whatever the selected multiply macro clobbers.
	.macro		fold_16_bytes, p, src_reg, dst_reg, load_next_consts
	__pmull_\p	v8, \src_reg, fold_consts
	__pmull_\p	\src_reg, \src_reg, fold_consts, 2
	.ifnb		\load_next_consts
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	.endif
	eor		\dst_reg\().16b, \dst_reg\().16b, v8.16b
	eor		\dst_reg\().16b, \dst_reg\().16b, \src_reg\().16b
	.endm

// __pmull_p64 rd, rn, rm [, n]
//
// 64x64 -> 128 bit polynomial multiply, rd = rn * rm.
//
//   n   blank: multiply the low 64-bit halves of rn and rm (pmull)
//       non-blank: multiply the high 64-bit halves (pmull2)
	.macro		__pmull_p64, rd, rn, rm, n
	.ifb		\n
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.else
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endif
	.endm

// crc_t10dif_pmull p
//
// Function body shared by crc_t10dif_pmull_p8 and crc_t10dif_pmull_p64.  The
// argument (p8 or p64) selects which set of __pmull_init / __pmull_pre /
// __pmull macros the body expands.
//
// In:   init_crc (w0), buf (x1), len (x2).  The callers document len >= 16.
// Out:  CRC in w0.  The macro ends the function itself: it emits the 'ret',
//       preceded by frame_pop in the p8 case (pairing with the frame_push in
//       crc_t10dif_pmull_p8).
// Uses: x3 (fold_consts_ptr), x4, v0-v12, plus whatever the selected
//       multiply macros use.
//
// Structure: inputs of 256 bytes or more are folded 128 bytes at a time, then
// narrowed to 16 bytes; shorter inputs start directly with 16 bytes.  Both
// paths share the 16-byte loop, the partial-segment step and the final
// reduction.  All labels carry the assembler's per-expansion counter as a
// suffix, so the two expansions of this macro do not clash.
	.macro		crc_t10dif_pmull, p
	__pmull_init_\p

	// For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp		len, #256
	b.lt		.Lless_than_256_bytes_\@

	adr_l		fold_consts_ptr, .Lfold_across_128_bytes_consts

	// Load the first 128 data bytes.  Byte swapping is necessary to make
	// the bit order match the polynomial coefficient order.
	ldp		q0, q1, [buf]
	ldp		q2, q3, [buf, #0x20]
	ldp		q4, q5, [buf, #0x40]
	ldp		q6, q7, [buf, #0x60]
	add		buf, buf, #0x80
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	rev64		v1.16b, v1.16b			)
CPU_LE(	rev64		v2.16b, v2.16b			)
CPU_LE(	rev64		v3.16b, v3.16b			)
CPU_LE(	rev64		v4.16b, v4.16b			)
CPU_LE(	rev64		v5.16b, v5.16b			)
CPU_LE(	rev64		v6.16b, v6.16b			)
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
CPU_LE(	ext		v1.16b, v1.16b, v1.16b, #8	)
CPU_LE(	ext		v2.16b, v2.16b, v2.16b, #8	)
CPU_LE(	ext		v3.16b, v3.16b, v3.16b, #8	)
CPU_LE(	ext		v4.16b, v4.16b, v4.16b, #8	)
CPU_LE(	ext		v5.16b, v5.16b, v5.16b, #8	)
CPU_LE(	ext		v6.16b, v6.16b, v6.16b, #8	)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v8.16b, #0
	mov		v8.h[7], init_crc
	eor		v0.16b, v0.16b, v8.16b

	// Load the constants for folding across 128 bytes.
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Subtract 128 for the 128 data bytes just consumed.  Subtract another
	// 128 to simplify the termination condition of the following loop.
	sub		len, len, #256

	// While >= 128 data bytes remain (not counting v0-v7), fold the 128
	// bytes v0-v7 into them, storing the result back into v0-v7.
.Lfold_128_bytes_loop_\@:
	fold_32_bytes	\p, v0, v1
	fold_32_bytes	\p, v2, v3
	fold_32_bytes	\p, v4, v5
	fold_32_bytes	\p, v6, v7

	subs		len, len, #128
	b.ge		.Lfold_128_bytes_loop_\@

	// Now fold the 112 bytes in v0-v6 into the 16 bytes in v7.

	// Fold across 64 bytes.
	// (The pointer still addresses the 128-byte constants, which were
	// loaded without post-increment, so step over them first.)
	add		fold_consts_ptr, fold_consts_ptr, #16
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts
	fold_16_bytes	\p, v0, v4
	fold_16_bytes	\p, v1, v5
	fold_16_bytes	\p, v2, v6
	fold_16_bytes	\p, v3, v7, 1
	// Fold across 32 bytes.
	fold_16_bytes	\p, v4, v6
	fold_16_bytes	\p, v5, v7, 1
	// Fold across 16 bytes.
	fold_16_bytes	\p, v6, v7

	// Add 128 to get the correct number of data bytes remaining in 0...127
	// (not counting v7), following the previous extra subtraction by 128.
	// Then subtract 16 to simplify the termination condition of the
	// following loop.
	adds		len, len, #(128-16)

	// While >= 16 data bytes remain (not counting v7), fold the 16 bytes v7
	// into them, storing the result back into v7.
	b.lt		.Lfold_16_bytes_loop_done_\@
.Lfold_16_bytes_loop_\@:
	__pmull_\p	v8, v7, fold_consts
	__pmull_\p	v7, v7, fold_consts, 2
	eor		v7.16b, v7.16b, v8.16b
	ldr		q0, [buf], #16
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)
	eor		v7.16b, v7.16b, v0.16b
	subs		len, len, #16
	b.ge		.Lfold_16_bytes_loop_\@

.Lfold_16_bytes_loop_done_\@:
	// Add 16 to get the correct number of data bytes remaining in 0...15
	// (not counting v7), following the previous extra subtraction by 16.
	adds		len, len, #16
	b.eq		.Lreduce_final_16_bytes_\@

.Lhandle_partial_segment_\@:
	// Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first
	// 16 bytes are in v7 and the rest are the remaining data in 'buf'.  To
	// do this without needing a fold constant for each possible 'len',
	// redivide the bytes into a first chunk of 'len' bytes and a second
	// chunk of 16 bytes, then fold the first chunk into the second.

	// v0 = last 16 original data bytes
	add		buf, buf, len
	ldr		q0, [buf, #-16]
CPU_LE(	rev64		v0.16b, v0.16b			)
CPU_LE(	ext		v0.16b, v0.16b, v0.16b, #8	)

	// v1 = high order part of second chunk: v7 left-shifted by 'len' bytes.
	adr_l		x4, .Lbyteshift_table + 16
	sub		x4, x4, len
	ld1		{v2.16b}, [x4]
	tbl		v1.16b, {v7.16b}, v2.16b

	// v3 = first chunk: v7 right-shifted by '16-len' bytes.
	movi		v3.16b, #0x80
	eor		v2.16b, v2.16b, v3.16b
	tbl		v3.16b, {v7.16b}, v2.16b

	// Convert to 8-bit masks: 'len' 0x00 bytes, then '16-len' 0xff bytes.
	sshr		v2.16b, v2.16b, #7

	// v2 = second chunk: 'len' bytes from v0 (low-order bytes),
	// then '16-len' bytes from v1 (high-order bytes).
	bsl		v2.16b, v1.16b, v0.16b

	// Fold the first chunk into the second chunk, storing the result in v7.
	__pmull_\p	v0, v3, fold_consts
	__pmull_\p	v7, v3, fold_consts, 2
	eor		v7.16b, v7.16b, v0.16b
	eor		v7.16b, v7.16b, v2.16b

.Lreduce_final_16_bytes_\@:
	// Reduce the 128-bit value M(x), stored in v7, to the final 16-bit CRC.

	movi		v2.16b, #0		// init zero register

	// Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	// Fold the high 64 bits into the low 64 bits, while also multiplying by
	// x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	// whose low 48 bits are 0.
	ext		v0.16b, v2.16b, v7.16b, #8
	__pmull_\p	v7, v7, fold_consts, 2	// high bits * x^48 * (x^80 mod G(x))
	eor		v0.16b, v0.16b, v7.16b	// + low bits * x^64

	// Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	// value congruent to x^64 * M(x) and whose low 48 bits are 0.
	ext		v1.16b, v0.16b, v2.16b, #12	// extract high 32 bits
	mov		v0.s[3], v2.s[0]	// zero high 32 bits
	__pmull_\p	v1, v1, fold_consts	// high 32 bits * x^48 * (x^48 mod G(x))
	eor		v0.16b, v0.16b, v1.16b	// + low bits

	// Load G(x) and floor(x^48 / G(x)).
	ld1		{fold_consts.2d}, [fold_consts_ptr]
	__pmull_pre_\p	fold_consts

	// Use Barrett reduction to compute the final CRC value.
	__pmull_\p	v1, v0, fold_consts, 2	// high 32 bits * floor(x^48 / G(x))
	ushr		v1.2d, v1.2d, #32	// /= x^32
	__pmull_\p	v1, v1, fold_consts	// *= G(x)
	ushr		v0.2d, v0.2d, #48
	eor		v0.16b, v0.16b, v1.16b	// + low 16 nonzero bits
	// Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of v0.

	umov		w0, v0.h[0]
	.ifc		\p, p8
	frame_pop
	.endif
	ret

.Lless_than_256_bytes_\@:
	// Checksumming a buffer of length 16...255 bytes

	adr_l		fold_consts_ptr, .Lfold_across_16_bytes_consts

	// Load the first 16 data bytes.
	ldr		q7, [buf], #0x10
CPU_LE(	rev64		v7.16b, v7.16b			)
CPU_LE(	ext		v7.16b, v7.16b, v7.16b, #8	)

	// XOR the first 16 data *bits* with the initial CRC value.
	movi		v0.16b, #0
	mov		v0.h[7], init_crc
	eor		v7.16b, v7.16b, v0.16b

	// Load the fold-across-16-bytes constants.
	ld1		{fold_consts.2d}, [fold_consts_ptr], #16
	__pmull_pre_\p	fold_consts

	cmp		len, #16
	b.eq		.Lreduce_final_16_bytes_\@	// len == 16
	subs		len, len, #32
	b.ge		.Lfold_16_bytes_loop_\@		// 32 <= len <= 255
	add		len, len, #16
	b		.Lhandle_partial_segment_\@	// 17 <= len <= 31
	.endm

//
// u16 crc_t10dif_pmull_p8(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// Variant built on the 8-bit polynomial multiply (__pmull_p8).  Those
// multiplies call __pmull_p8_core with 'bl', which overwrites the link
// register, so a frame is pushed on entry (frame_push presumably preserves
// it; its definition is outside this file).  The matching frame_pop and the
// 'ret' are emitted by the crc_t10dif_pmull macro.
//
SYM_FUNC_START(crc_t10dif_pmull_p8)
	frame_push	1
	crc_t10dif_pmull p8
SYM_FUNC_END(crc_t10dif_pmull_p8)

// Entry point below is aligned to 2^5 = 32 bytes.
	.align		5
//
// u16 crc_t10dif_pmull_p64(u16 init_crc, const u8 *buf, size_t len);
//
// Assumes len >= 16.
//
// Variant built on the 64-bit polynomial multiply (__pmull_p64).  It makes no
// calls, so no frame is pushed; the 'ret' is emitted by the crc_t10dif_pmull
// macro.
//
SYM_FUNC_START(crc_t10dif_pmull_p64)
	crc_t10dif_pmull	p64
SYM_FUNC_END(crc_t10dif_pmull_p64)

// Constant table for crc_t10dif_pmull.  Each entry is a pair of .quad values
// that is loaded as one 16-byte vector into fold_consts.  The code steps
// through the pairs sequentially with post-incremented loads, so their order
// must not change.  Only the first pair and the 16-byte pair are referenced
// by label; the commented-out labels just name the remaining pairs.
	.section	".rodata", "a"
	.align		4

// Fold constants precomputed from the polynomial 0x18bb7
// G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	// x^(8*128)	mod G(x)
	.quad		0x0000000000002295	// x^(8*128+64)	mod G(x)
// .Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	// x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	// x^(4*128+64)	mod G(x)
// .Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	// x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	// x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	// x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	// x^(1*128+64)	mod G(x)
// .Lfinal_fold_consts:
	.quad		0x1368000000000000	// x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	// x^48 * (x^80 mod G(x))
// .Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	// G(x)
	.quad		0x00000001f65a57f8	// floor(x^48 / G(x))

// For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 -
// len] is the index vector to shift left by 'len' bytes, and is also {0x80,
// ..., 0x80} XOR the index vector to shift right by '16 - len' bytes.
//
// The partial-segment code in crc_t10dif_pmull loads that vector and feeds it
// to TBL with a single table register.  Index bytes with bit 7 set are out of
// range for such a lookup and yield a zero byte, which is what turns the
// permutation into a shift.
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe , 0x0