########################################################################
# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions
#
# Copyright (c) 2013, Intel Corporation
#
# Authors:
#     Erdinc Ozturk <erdinc.ozturk@intel.com>
#     Vinodh Gopal <vinodh.gopal@intel.com>
#     James Guilford <james.guilford@intel.com>
#     Tim Chen <tim.c.chen@linux.intel.com>
#
# This software is available to you under a choice of one of two
# licenses.  You may choose to be licensed under the terms of the GNU
# General Public License (GPL) Version 2, available from the file
# COPYING in the main directory of this source tree, or the
# OpenIB.org BSD license below:
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright
#   notice, this list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright
#   notice, this list of conditions and the following disclaimer in the
#   documentation and/or other materials provided with the
#   distribution.
#
# * Neither the name of the Intel Corporation nor the names of its
#   contributors may be used to endorse or promote products derived from
#   this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#       Reference paper titled "Fast CRC Computation for Generic
#       Polynomials Using PCLMULQDQ Instruction"
#       URL: http://www.intel.com/content/dam/www/public/us/en/documents
#       /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
#

#include <linux/linkage.h>

.text

#define		init_crc	%edi
#define		buf		%rsi
#define		len		%rdx

#define		FOLD_CONSTS	%xmm10
#define		BSWAP_MASK	%xmm11
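
# The folding below relies on the standard carry-less folding identity: if a
# 128-bit accumulator A = A_hi*x^64 + A_lo holds data that precedes the next
# data block D by N bits in the message, then
#
#	A*x^N + D = A_hi*x^(N+64) + A_lo*x^N + D
#
# which is congruent modulo G(x) to
#
#	A_hi*(x^(N+64) mod G(x)) + A_lo*(x^N mod G(x)) + D
#
# Each 64-bit half of A is carry-less multiplied by the matching precomputed
# constant, and the two <= 127-bit products are XORed with D to form the new
# 128-bit accumulator.
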
# Fold reg1, reg2 into the next 32 data bytes, storing the result back into
# reg1, reg2.
.macro	fold_32_bytes	offset, reg1, reg2
	movdqu	\offset(buf), %xmm9
	movdqu	\offset+16(buf), %xmm12
	pshufb	BSWAP_MASK, %xmm9
	pshufb	BSWAP_MASK, %xmm12
	movdqa	\reg1, %xmm8
	movdqa	\reg2, %xmm13
	pclmulqdq	$0x00, FOLD_CONSTS, \reg1
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm8
	pclmulqdq	$0x00, FOLD_CONSTS, \reg2
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm13
	pxor	%xmm9, \reg1
	xorps	%xmm8, \reg1
	pxor	%xmm12, \reg2
	xorps	%xmm13, \reg2
.endm

# Fold src_reg into dst_reg.
.macro	fold_16_bytes	src_reg, dst_reg
	movdqa	\src_reg, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, \src_reg
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, \dst_reg
	xorps	\src_reg, \dst_reg
.endm

#
# u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len);
#
# Assumes len >= 16.
#
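# A typical caller (a sketch only; the real kernel glue code is elsewhere and
# may differ) would guard the SIMD context and fall back to a generic
# implementation for short buffers:
#
#	if (len >= 16 && crypto_simd_usable()) {
#		kernel_fpu_begin();
#		crc = crc_t10dif_pcl(crc, buf, len);
#		kernel_fpu_end();
#	} else {
#		crc = crc_t10dif_generic(crc, buf, len);
#	}
#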
SYM_FUNC_START(crc_t10dif_pcl)

	movdqa	.Lbswap_mask(%rip), BSWAP_MASK

	# For sizes less than 256 bytes, we can't fold 128 bytes at a time.
	cmp	$256, len
	jl	.Lless_than_256_bytes

	# Load the first 128 data bytes.  Byte swapping is necessary to make the
	# bit order match the polynomial coefficient order.
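	# (CRC-T10DIF is a non-reflected CRC: the first bit of the message is
	# the coefficient of the highest power of x, whereas pclmulqdq treats
	# its operands as least-significant-bit-first polynomials, so the
	# bytes of each 16-byte block must be reversed.)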
	movdqu	16*0(buf), %xmm0
	movdqu	16*1(buf), %xmm1
	movdqu	16*2(buf), %xmm2
	movdqu	16*3(buf), %xmm3
	movdqu	16*4(buf), %xmm4
	movdqu	16*5(buf), %xmm5
	movdqu	16*6(buf), %xmm6
	movdqu	16*7(buf), %xmm7
	add	$128, buf
	pshufb	BSWAP_MASK, %xmm0
	pshufb	BSWAP_MASK, %xmm1
	pshufb	BSWAP_MASK, %xmm2
	pshufb	BSWAP_MASK, %xmm3
	pshufb	BSWAP_MASK, %xmm4
	pshufb	BSWAP_MASK, %xmm5
	pshufb	BSWAP_MASK, %xmm6
	pshufb	BSWAP_MASK, %xmm7

	# XOR the first 16 data *bits* with the initial CRC value.
	pxor	%xmm8, %xmm8
	pinsrw	$7, init_crc, %xmm8
	pxor	%xmm8, %xmm0
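	# (After the byte swap, the first 16 data bits occupy the top word of
	# xmm0, so inserting init_crc into word 7 XORs it with exactly those
	# bits, which is the standard way of seeding a CRC with a nonzero
	# initial value.)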

	movdqa	.Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS

	# Subtract 128 for the 128 data bytes just consumed.  Subtract another
	# 128 to simplify the termination condition of the following loop.
	sub	$256, len

	# While >= 128 data bytes remain (not counting xmm0-7), fold the 128
	# bytes xmm0-7 into them, storing the result back into xmm0-7.
.Lfold_128_bytes_loop:
	fold_32_bytes	0, %xmm0, %xmm1
	fold_32_bytes	32, %xmm2, %xmm3
	fold_32_bytes	64, %xmm4, %xmm5
	fold_32_bytes	96, %xmm6, %xmm7
	add	$128, buf
	sub	$128, len
	jge	.Lfold_128_bytes_loop

	# Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7.

	# Fold across 64 bytes.
	movdqa	.Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm0, %xmm4
	fold_16_bytes	%xmm1, %xmm5
	fold_16_bytes	%xmm2, %xmm6
	fold_16_bytes	%xmm3, %xmm7
	# Fold across 32 bytes.
	movdqa	.Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm4, %xmm6
	fold_16_bytes	%xmm5, %xmm7
	# Fold across 16 bytes.
	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	fold_16_bytes	%xmm6, %xmm7
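
	# xmm7 is now the only accumulator: a 128-bit value congruent, modulo
	# G(x) and up to the power of x implied by its position in the stream,
	# to all data folded so far.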

	# Add 128 to get the correct number of data bytes remaining in 0...127
	# (not counting xmm7), following the previous extra subtraction by 128.
	# Then subtract 16 to simplify the termination condition of the
	# following loop.
	add	$128-16, len

	# While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes
	# xmm7 into them, storing the result back into xmm7.
	jl	.Lfold_16_bytes_loop_done
.Lfold_16_bytes_loop:
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	movdqu	(buf), %xmm0
	pshufb	BSWAP_MASK, %xmm0
	pxor	%xmm0, %xmm7
	add	$16, buf
	sub	$16, len
	jge	.Lfold_16_bytes_loop

.Lfold_16_bytes_loop_done:
	# Add 16 to get the correct number of data bytes remaining in 0...15
	# (not counting xmm7), following the previous extra subtraction by 16.
	add	$16, len
	je	.Lreduce_final_16_bytes

.Lhandle_partial_segment:
	# Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16
	# bytes are in xmm7 and the rest are the remaining data in 'buf'.  To do
	# this without needing a fold constant for each possible 'len', redivide
	# the bytes into a first chunk of 'len' bytes and a second chunk of 16
	# bytes, then fold the first chunk into the second.
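	#
	# Viewing those 16 + len bytes most-significant first:
	#
	#	[          xmm7 (16 bytes)          ][ len tail bytes ]
	#
	# is redivided as
	#
	#	[ len bytes ][          16-byte second chunk          ]
	#
	# so only the single fold-across-16-bytes constant is needed.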

	movdqa	%xmm7, %xmm2

	# xmm1 = last 16 original data bytes
	movdqu	-16(buf, len), %xmm1
	pshufb	BSWAP_MASK, %xmm1

	# xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes.
	lea	.Lbyteshift_table+16(%rip), %rax
	sub	len, %rax
	movdqu	(%rax), %xmm0
	pshufb	%xmm0, %xmm2

	# xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes.
	pxor	.Lmask1(%rip), %xmm0
	pshufb	%xmm0, %xmm7

	# xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes),
	# then '16-len' bytes from xmm2 (high-order bytes).
	pblendvb	%xmm2, %xmm1	# xmm0 is implicit

	# Fold the first chunk into the second chunk, storing the result in xmm7.
	movdqa	%xmm7, %xmm8
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm8
	pxor	%xmm8, %xmm7
	pxor	%xmm1, %xmm7

.Lreduce_final_16_bytes:
	# Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC.

	# Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'.
	movdqa	.Lfinal_fold_consts(%rip), FOLD_CONSTS

	# Fold the high 64 bits into the low 64 bits, while also multiplying by
	# x^64.  This produces a 128-bit value congruent to x^64 * M(x) and
	# whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x))
	pslldq	$8, %xmm0
	pxor	%xmm0, %xmm7			  # + low bits * x^64

	# Fold the high 32 bits into the low 96 bits.  This produces a 96-bit
	# value congruent to x^64 * M(x) and whose low 48 bits are 0.
	movdqa	%xmm7, %xmm0
	pand	.Lmask2(%rip), %xmm0		  # zero high 32 bits
	psrldq	$12, %xmm7			  # extract high 32 bits
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x))
	pxor	%xmm0, %xmm7			  # + low bits

	# Load G(x) and floor(x^48 / G(x)).
	movdqa	.Lbarrett_reduction_consts(%rip), FOLD_CONSTS

	# Use Barrett reduction to compute the final CRC value.
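	# If U denotes the 48 nonzero bits of xmm7, this computes U mod G(x) =
	# U + G(x)*floor(U / G(x)), obtaining the quotient without a division
	# via the precomputed reciprocal: floor(U / G(x)) =
	# floor((U_hi32 * floor(x^48 / G(x))) / x^32), where U_hi32 is the
	# high 32 bits of U.  Only the low 16 bits of the final XOR are
	# meaningful; the higher bits are garbage, discarded by the pextrw.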
	movdqa	%xmm7, %xmm0
	pclmulqdq	$0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x))
	psrlq	$32, %xmm7			  # /= x^32
	pclmulqdq	$0x00, FOLD_CONSTS, %xmm7 # *= G(x)
	psrlq	$48, %xmm0
	pxor	%xmm7, %xmm0			  # + low 16 nonzero bits
	# Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0.

	pextrw	$0, %xmm0, %eax
	RET

.align 16
.Lless_than_256_bytes:
	# Checksumming a buffer of length 16...255 bytes

	# Load the first 16 data bytes.
	movdqu	(buf), %xmm7
	pshufb	BSWAP_MASK, %xmm7
	add	$16, buf

	# XOR the first 16 data *bits* with the initial CRC value.
	pxor	%xmm0, %xmm0
	pinsrw	$7, init_crc, %xmm0
	pxor	%xmm0, %xmm7

	movdqa	.Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS
	cmp	$16, len
	je	.Lreduce_final_16_bytes		# len == 16
	sub	$32, len
	jge	.Lfold_16_bytes_loop		# 32 <= len <= 255
	add	$16, len
	jmp	.Lhandle_partial_segment	# 17 <= len <= 31
SYM_FUNC_END(crc_t10dif_pcl)

.section	.rodata, "a", @progbits
.align 16

# Fold constants precomputed from the polynomial 0x18bb7
# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0
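#
# These constants can be regenerated with simple GF(2) polynomial arithmetic.
# A minimal C sketch follows (illustrative only; these helpers are not part
# of the kernel and the names are made up):
#
#	/* x^n mod G(x), with G = 0x18bb7, MSB-first (non-reflected) */
#	static uint32_t xpow_mod_g(unsigned int n)
#	{
#		uint32_t r = 1;			/* the polynomial "1" */
#
#		while (n--) {
#			r <<= 1;		/* multiply by x */
#			if (r & 0x10000)	/* degree reached 16? */
#				r ^= 0x18bb7;	/* subtract (XOR) G(x) */
#		}
#		return r;
#	}
#
#	/* floor(x^n / G(x)), by GF(2) long division */
#	static uint64_t xpow_div_g(unsigned int n)
#	{
#		uint64_t q = 0;
#		uint32_t r = 1;
#
#		while (n--) {
#			q <<= 1;
#			r <<= 1;
#			if (r & 0x10000) {
#				q |= 1;
#				r ^= 0x18bb7;
#			}
#		}
#		return q;	/* xpow_div_g(48) == 0x1f65a57f8 */
#	}
#
# The .Lfinal_fold_consts entries below are xpow_mod_g(48) and xpow_mod_g(80)
# shifted left by 48 bits to pre-multiply by x^48.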
.Lfold_across_128_bytes_consts:
	.quad		0x0000000000006123	# x^(8*128)	mod G(x)
	.quad		0x0000000000002295	# x^(8*128+64)	mod G(x)
.Lfold_across_64_bytes_consts:
	.quad		0x0000000000001069	# x^(4*128)	mod G(x)
	.quad		0x000000000000dd31	# x^(4*128+64)	mod G(x)
.Lfold_across_32_bytes_consts:
	.quad		0x000000000000857d	# x^(2*128)	mod G(x)
	.quad		0x0000000000007acc	# x^(2*128+64)	mod G(x)
.Lfold_across_16_bytes_consts:
	.quad		0x000000000000a010	# x^(1*128)	mod G(x)
	.quad		0x0000000000001faa	# x^(1*128+64)	mod G(x)
.Lfinal_fold_consts:
	.quad		0x1368000000000000	# x^48 * (x^48 mod G(x))
	.quad		0x2d56000000000000	# x^48 * (x^80 mod G(x))
.Lbarrett_reduction_consts:
	.quad		0x0000000000018bb7	# G(x)
	.quad		0x00000001f65a57f8	# floor(x^48 / G(x))

.section	.rodata.cst16.mask1, "aM", @progbits, 16
.align 16
.Lmask1:
	.octa	0x80808080808080808080808080808080

.section	.rodata.cst16.mask2, "aM", @progbits, 16
.align 16
.Lmask2:
	.octa	0x00000000FFFFFFFFFFFFFFFFFFFFFFFF

.section	.rodata.cst16.bswap_mask, "aM", @progbits, 16
.align 16
.Lbswap_mask:
	.octa	0x000102030405060708090A0B0C0D0E0F

.section	.rodata.cst32.byteshift_table, "aM", @progbits, 32
.align 16
# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len]
# is the index vector to shift left by 'len' bytes, and is also {0x80, ...,
# 0x80} XOR the index vector to shift right by '16 - len' bytes.
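#
# For example, with len = 3 the vector at &byteshift_table[13] is
# {0x8d, 0x8e, 0x8f, 0x0, 0x1, ..., 0xc}.  Used directly with pshufb it sends
# byte i to byte i+3 and zeroes bytes 0-2 (a left shift by 3 bytes); XORed
# with 0x80 in every byte it becomes {0xd, 0xe, 0xf, 0x80, ..., 0x8c}, which
# moves bytes 13-15 down to bytes 0-2 and zeroes the rest (a right shift by
# 13 bytes).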
.Lbyteshift_table:
	.byte		 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87
	.byte		0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
	.byte		 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte		 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0x0