/* Do not modify. This file is auto-generated from bsaes-armv7.pl. */
@ Copyright 2012-2023 The OpenSSL Project Authors. All Rights Reserved.
@
@ Licensed under the Apache License 2.0 (the "License").  You may not use
@ this file except in compliance with the License.  You can obtain a copy
@ in the file LICENSE in the source distribution or at
@ https://www.openssl.org/source/license.html


@ ====================================================================
@ Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
@ project. The module is, however, dual licensed under OpenSSL and
@ CRYPTOGAMS licenses depending on where you obtain it. For further
@ details see http://www.openssl.org/~appro/cryptogams/.
@
@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel
@ of Linaro.
@ ====================================================================

@ Bit-sliced AES for ARM NEON
@
@ February 2012.
@
@ This implementation is a direct adaptation of the bsaes-x86_64 module
@ for ARM NEON, except that this module is endian-neutral [in the sense
@ that it can be compiled for either endianness] courtesy of vld1.8's
@ neutrality. The initial version doesn't implement the OpenSSL
@ interface, only low-level primitives and unsupported entry points,
@ just enough to collect performance results, which for the Cortex-A8
@ core are:
@
@ encrypt	19.5 cycles per byte processed with 128-bit key
@ decrypt	22.1 cycles per byte processed with 128-bit key
@ key conv.	440  cycles per 128-bit key/0.18 of 8x block
@
@ Snapdragon S4 encrypts a byte in 17.6 cycles and decrypts it in 19.7,
@ which is [much] worse than anticipated (for further details see
@ http://www.openssl.org/~appro/Snapdragon-S4.html).
@
@ Cortex-A15 manages 14.2/16.1 cycles [where integer-only code
@ manages 20.0 cycles].
@
@ When comparing to x86_64 results, keep in mind that the NEON unit is
@ [mostly] single-issue and thus can't [fully] benefit from
@ instruction-level parallelism. And when comparing to aes-armv4
@ results, keep in mind the key schedule conversion overhead (see
@ bsaes-x86_64.pl for further details)...
@
@						<appro@openssl.org>

@ April-August 2013
@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard.

@ $output is the last argument if it looks like a file (it has an extension)
@ $flavour is the first argument if it doesn't look like a file
#ifndef __KERNEL__
# include "arm_arch.h"

# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
# define VFP_ABI_FRAME	0x40
#else
# define VFP_ABI_PUSH
# define VFP_ABI_POP
# define VFP_ABI_FRAME	0
# define BSAES_ASM_EXTENDED_KEY
# define XTS_CHAIN_TWEAK
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#ifdef __thumb__
# define adrl adr
#endif

#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.syntax	unified 	@ ARMv7-capable assembler is expected to handle this
#if defined(__thumb2__) && !defined(__APPLE__)
.thumb
#else
.code	32
# undef __thumb2__
#endif

.text

.type	_bsaes_decrypt8,%function
.align	4
_bsaes_decrypt8:
	adr	r6,.
	vldmia	r4!, {q9}		@ round 0 key
#if defined(__thumb2__) || defined(__APPLE__)
	adr	r6,.LM0ISR
#else
	add	r6,r6,#.LM0ISR-_bsaes_decrypt8
#endif

	vldmia	r6!, {q8}		@ .LM0ISR
	veor	q10, q0, q9	@ xor with round0 key
	veor	q11, q1, q9
	vtbl.8	d0, {q10}, d16
	vtbl.8	d1, {q10}, d17
	veor	q12, q2, q9
	vtbl.8	d2, {q11}, d16
	vtbl.8	d3, {q11}, d17
	veor	q13, q3, q9
	vtbl.8	d4, {q12}, d16
	vtbl.8	d5, {q12}, d17
	veor	q14, q4, q9
	vtbl.8	d6, {q13}, d16
	vtbl.8	d7, {q13}, d17
	veor	q15, q5, q9
	vtbl.8	d8, {q14}, d16
	vtbl.8	d9, {q14}, d17
	veor	q10, q6, q9
	vtbl.8	d10, {q15}, d16
	vtbl.8	d11, {q15}, d17
	veor	q11, q7, q9
	vtbl.8	d12, {q10}, d16
	vtbl.8	d13, {q10}, d17
	vtbl.8	d14, {q11}, d16
	vtbl.8	d15, {q11}, d17
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshr.u64	q10, q6, #1
	vshr.u64	q11, q4, #1
	veor	q10, q10, q7
	veor	q11, q11, q5
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q7, q7, q10
	vshl.u64	q10, q10, #1
	veor	q5, q5, q11
	vshl.u64	q11, q11, #1
	veor	q6, q6, q10
	veor	q4, q4, q11
	vshr.u64	q10, q2, #1
	vshr.u64	q11, q0, #1
	veor	q10, q10, q3
	veor	q11, q11, q1
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q3, q3, q10
	vshl.u64	q10, q10, #1
	veor	q1, q1, q11
	vshl.u64	q11, q11, #1
	veor	q2, q2, q10
	veor	q0, q0, q11
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshr.u64	q10, q5, #2
	vshr.u64	q11, q4, #2
	veor	q10, q10, q7
	veor	q11, q11, q6
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q7, q7, q10
	vshl.u64	q10, q10, #2
	veor	q6, q6, q11
	vshl.u64	q11, q11, #2
	veor	q5, q5, q10
	veor	q4, q4, q11
	vshr.u64	q10, q1, #2
	vshr.u64	q11, q0, #2
	veor	q10, q10, q3
	veor	q11, q11, q2
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q3, q3, q10
	vshl.u64	q10, q10, #2
	veor	q2, q2, q11
	vshl.u64	q11, q11, #2
	veor	q1, q1, q10
	veor	q0, q0, q11
	vshr.u64	q10, q3, #4
	vshr.u64	q11, q2, #4
	veor	q10, q10, q7
	veor	q11, q11, q6
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q7, q7, q10
	vshl.u64	q10, q10, #4
	veor	q6, q6, q11
	vshl.u64	q11, q11, #4
	veor	q3, q3, q10
	veor	q2, q2, q11
	vshr.u64	q10, q1, #4
	vshr.u64	q11, q0, #4
	veor	q10, q10, q5
	veor	q11, q11, q4
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q5, q5, q10
	vshl.u64	q10, q10, #4
	veor	q4, q4, q11
	vshl.u64	q11, q11, #4
	veor	q1, q1, q10
	veor	q0, q0, q11
	sub	r5,r5,#1
	b	.Ldec_sbox
.align	4
.Ldec_loop:
	vldmia	r4!, {q8,q9,q10,q11}
	veor	q8, q8, q0
	veor	q9, q9, q1
	vtbl.8	d0, {q8}, d24
	vtbl.8	d1, {q8}, d25
	vldmia	r4!, {q8}
	veor	q10, q10, q2
	vtbl.8	d2, {q9}, d24
	vtbl.8	d3, {q9}, d25
	vldmia	r4!, {q9}
	veor	q11, q11, q3
	vtbl.8	d4, {q10}, d24
	vtbl.8	d5, {q10}, d25
	vldmia	r4!, {q10}
	vtbl.8	d6, {q11}, d24
	vtbl.8	d7, {q11}, d25
	vldmia	r4!, {q11}
	veor	q8, q8, q4
	veor	q9, q9, q5
	vtbl.8	d8, {q8}, d24
	vtbl.8	d9, {q8}, d25
	veor	q10, q10, q6
	vtbl.8	d10, {q9}, d24
	vtbl.8	d11, {q9}, d25
	veor	q11, q11, q7
	vtbl.8	d12, {q10}, d24
	vtbl.8	d13, {q10}, d25
	vtbl.8	d14, {q11}, d24
	vtbl.8	d15, {q11}, d25
.Ldec_sbox:
	veor	q1, q1, q4
	veor	q3, q3, q4

	veor	q4, q4, q7
	veor	q1, q1, q6
	veor	q2, q2, q7
	veor	q6, q6, q4

	veor	q0, q0, q1
	veor	q2, q2, q5
	veor	q7, q7, q6
	veor	q3, q3, q0
	veor	q5, q5, q0
	veor	q1, q1, q3
	veor	q11, q3, q0
	veor	q10, q7, q4
	veor	q9, q1, q6
	veor	q13, q4, q0
	vmov	q8, q10
	veor	q12, q5, q2

	vorr	q10, q10, q9
	veor	q15, q11, q8
	vand	q14, q11, q12
	vorr	q11, q11, q12
	veor	q12, q12, q9
	vand	q8, q8, q9
	veor	q9, q6, q2
	vand	q15, q15, q12
	vand	q13, q13, q9
	veor	q9, q3, q7
	veor	q12, q1, q5
	veor	q11, q11, q13
	veor	q10, q10, q13
	vand	q13, q9, q12
	vorr	q9, q9, q12
	veor	q11, q11, q15
	veor	q8, q8, q13
	veor	q10, q10, q14
	veor	q9, q9, q15
	veor	q8, q8, q14
	vand	q12, q4, q6
	veor	q9, q9, q14
	vand	q13, q0, q2
	vand	q14, q7, q1
	vorr	q15, q3, q5
	veor	q11, q11, q12
	veor	q9, q9, q14
	veor	q8, q8, q15
	veor	q10, q10, q13

	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3

	@ new smaller inversion

	vand	q14, q11, q9
	vmov	q12, q8

	veor	q13, q10, q14
	veor	q15, q8, q14
	veor	q14, q8, q14	@ q14=q15

	vbsl	q13, q9, q8
	vbsl	q15, q11, q10
	veor	q11, q11, q10

	vbsl	q12, q13, q14
	vbsl	q8, q14, q13

	vand	q14, q12, q15
	veor	q9, q9, q8

	veor	q14, q14, q11
	veor	q12, q5, q2
	veor	q8, q1, q6
	veor	q10, q15, q14
	vand	q10, q10, q5
	veor	q5, q5, q1
	vand	q11, q1, q15
	vand	q5, q5, q14
	veor	q1, q11, q10
	veor	q5, q5, q11
	veor	q15, q15, q13
	veor	q14, q14, q9
	veor	q11, q15, q14
	veor	q10, q13, q9
	vand	q11, q11, q12
	vand	q10, q10, q2
	veor	q12, q12, q8
	veor	q2, q2, q6
	vand	q8, q8, q15
	vand	q6, q6, q13
	vand	q12, q12, q14
	vand	q2, q2, q9
	veor	q8, q8, q12
	veor	q2, q2, q6
	veor	q12, q12, q11
	veor	q6, q6, q10
	veor	q5, q5, q12
	veor	q2, q2, q12
	veor	q1, q1, q8
	veor	q6, q6, q8

	veor	q12, q3, q0
	veor	q8, q7, q4
	veor	q11, q15, q14
	veor	q10, q13, q9
	vand	q11, q11, q12
	vand	q10, q10, q0
	veor	q12, q12, q8
	veor	q0, q0, q4
	vand	q8, q8, q15
	vand	q4, q4, q13
	vand	q12, q12, q14
	vand	q0, q0, q9
	veor	q8, q8, q12
	veor	q0, q0, q4
	veor	q12, q12, q11
	veor	q4, q4, q10
	veor	q15, q15, q13
	veor	q14, q14, q9
	veor	q10, q15, q14
	vand	q10, q10, q3
	veor	q3, q3, q7
	vand	q11, q7, q15
	vand	q3, q3, q14
	veor	q7, q11, q10
	veor	q3, q3, q11
	veor	q3, q3, q12
	veor	q0, q0, q12
	veor	q7, q7, q8
	veor	q4, q4, q8
	veor	q1, q1, q7
	veor	q6, q6, q5

	veor	q4, q4, q1
	veor	q2, q2, q7
	veor	q5, q5, q7
	veor	q4, q4, q2
	veor	q7, q7, q0
	veor	q4, q4, q5
	veor	q3, q3, q6
	veor	q6, q6, q1
	veor	q3, q3, q4

	veor	q4, q4, q0
	veor	q7, q7, q3
	subs	r5,r5,#1
	bcc	.Ldec_done
	@ multiplication by 0x05-0x00-0x04-0x00
	vext.8	q8, q0, q0, #8
	vext.8	q14, q3, q3, #8
	vext.8	q15, q5, q5, #8
	veor	q8, q8, q0
	vext.8	q9, q1, q1, #8
	veor	q14, q14, q3
	vext.8	q10, q6, q6, #8
	veor	q15, q15, q5
	vext.8	q11, q4, q4, #8
	veor	q9, q9, q1
	vext.8	q12, q2, q2, #8
	veor	q10, q10, q6
	vext.8	q13, q7, q7, #8
	veor	q11, q11, q4
	veor	q12, q12, q2
	veor	q13, q13, q7

	veor	q0, q0, q14
	veor	q1, q1, q14
	veor	q6, q6, q8
	veor	q2, q2, q10
	veor	q4, q4, q9
	veor	q1, q1, q15
	veor	q6, q6, q15
	veor	q2, q2, q14
	veor	q7, q7, q11
	veor	q4, q4, q14
	veor	q3, q3, q12
	veor	q2, q2, q15
	veor	q7, q7, q15
	veor	q5, q5, q13
	vext.8	q8, q0, q0, #12	@ x0 <<< 32
	vext.8	q9, q1, q1, #12
	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
	vext.8	q10, q6, q6, #12
	veor	q1, q1, q9
	vext.8	q11, q4, q4, #12
	veor	q6, q6, q10
	vext.8	q12, q2, q2, #12
	veor	q4, q4, q11
	vext.8	q13, q7, q7, #12
	veor	q2, q2, q12
	vext.8	q14, q3, q3, #12
	veor	q7, q7, q13
	vext.8	q15, q5, q5, #12
	veor	q3, q3, q14

	veor	q9, q9, q0
	veor	q5, q5, q15
	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
	veor	q10, q10, q1
	veor	q8, q8, q5
	veor	q9, q9, q5
	vext.8	q1, q1, q1, #8
	veor	q13, q13, q2
	veor	q0, q0, q8
	veor	q14, q14, q7
	veor	q1, q1, q9
	vext.8	q8, q2, q2, #8
	veor	q12, q12, q4
	vext.8	q9, q7, q7, #8
	veor	q15, q15, q3
	vext.8	q2, q4, q4, #8
	veor	q11, q11, q6
	vext.8	q7, q5, q5, #8
	veor	q12, q12, q5
	vext.8	q4, q3, q3, #8
	veor	q11, q11, q5
	vext.8	q3, q6, q6, #8
	veor	q5, q9, q13
	veor	q11, q11, q2
	veor	q7, q7, q15
	veor	q6, q4, q14
	veor	q4, q8, q12
	veor	q2, q3, q10
	vmov	q3, q11
	 @ vmov	q5, q9
	vldmia	r6, {q12}		@ .LISR
	ite	eq				@ Thumb2 thing, sanity check in ARM
	addeq	r6,r6,#0x10
	bne	.Ldec_loop
	vldmia	r6, {q12}		@ .LISRM0
	b	.Ldec_loop
.align	4
.Ldec_done:
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshr.u64	q10, q3, #1
	vshr.u64	q11, q2, #1
	veor	q10, q10, q5
	veor	q11, q11, q7
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q5, q5, q10
	vshl.u64	q10, q10, #1
	veor	q7, q7, q11
	vshl.u64	q11, q11, #1
	veor	q3, q3, q10
	veor	q2, q2, q11
	vshr.u64	q10, q6, #1
	vshr.u64	q11, q0, #1
	veor	q10, q10, q4
	veor	q11, q11, q1
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q4, q4, q10
	vshl.u64	q10, q10, #1
	veor	q1, q1, q11
	vshl.u64	q11, q11, #1
	veor	q6, q6, q10
	veor	q0, q0, q11
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshr.u64	q10, q7, #2
	vshr.u64	q11, q2, #2
	veor	q10, q10, q5
	veor	q11, q11, q3
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q5, q5, q10
	vshl.u64	q10, q10, #2
	veor	q3, q3, q11
	vshl.u64	q11, q11, #2
	veor	q7, q7, q10
	veor	q2, q2, q11
	vshr.u64	q10, q1, #2
	vshr.u64	q11, q0, #2
	veor	q10, q10, q4
	veor	q11, q11, q6
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q4, q4, q10
	vshl.u64	q10, q10, #2
	veor	q6, q6, q11
	vshl.u64	q11, q11, #2
	veor	q1, q1, q10
	veor	q0, q0, q11
	vshr.u64	q10, q4, #4
	vshr.u64	q11, q6, #4
	veor	q10, q10, q5
	veor	q11, q11, q3
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q5, q5, q10
	vshl.u64	q10, q10, #4
	veor	q3, q3, q11
	vshl.u64	q11, q11, #4
	veor	q4, q4, q10
	veor	q6, q6, q11
	vshr.u64	q10, q1, #4
	vshr.u64	q11, q0, #4
	veor	q10, q10, q7
	veor	q11, q11, q2
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q7, q7, q10
	vshl.u64	q10, q10, #4
	veor	q2, q2, q11
	vshl.u64	q11, q11, #4
	veor	q1, q1, q10
	veor	q0, q0, q11
	vldmia	r4, {q8}			@ last round key
	veor	q6, q6, q8
	veor	q4, q4, q8
	veor	q2, q2, q8
	veor	q7, q7, q8
	veor	q3, q3, q8
	veor	q5, q5, q8
	veor	q0, q0, q8
	veor	q1, q1, q8
	bx	lr
.size	_bsaes_decrypt8,.-_bsaes_decrypt8

.type	_bsaes_const,%object
.align	6
_bsaes_const:
.LM0ISR:@ InvShiftRows constants
.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad	0x01040b0e0205080f, 0x0306090c00070a0d
.LM0SR:@ ShiftRows constants
.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0:
.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.LREVM0SR:
.quad	0x090d01050c000408, 0x03070b0f060a0e02
.byte	66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	6
.size	_bsaes_const,.-_bsaes_const

.type	_bsaes_encrypt8,%function
.align	4
_bsaes_encrypt8:
	adr	r6,.
	vldmia	r4!, {q9}		@ round 0 key
#if defined(__thumb2__) || defined(__APPLE__)
	adr	r6,.LM0SR
#else
	sub	r6,r6,#_bsaes_encrypt8-.LM0SR
#endif

	vldmia	r6!, {q8}		@ .LM0SR
_bsaes_encrypt8_alt:
	veor	q10, q0, q9	@ xor with round0 key
	veor	q11, q1, q9
	vtbl.8	d0, {q10}, d16
	vtbl.8	d1, {q10}, d17
	veor	q12, q2, q9
	vtbl.8	d2, {q11}, d16
	vtbl.8	d3, {q11}, d17
	veor	q13, q3, q9
	vtbl.8	d4, {q12}, d16
	vtbl.8	d5, {q12}, d17
	veor	q14, q4, q9
	vtbl.8	d6, {q13}, d16
	vtbl.8	d7, {q13}, d17
	veor	q15, q5, q9
	vtbl.8	d8, {q14}, d16
	vtbl.8	d9, {q14}, d17
	veor	q10, q6, q9
	vtbl.8	d10, {q15}, d16
	vtbl.8	d11, {q15}, d17
	veor	q11, q7, q9
	vtbl.8	d12, {q10}, d16
	vtbl.8	d13, {q10}, d17
	vtbl.8	d14, {q11}, d16
	vtbl.8	d15, {q11}, d17
_bsaes_encrypt8_bitslice:
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshr.u64	q10, q6, #1
	vshr.u64	q11, q4, #1
	veor	q10, q10, q7
	veor	q11, q11, q5
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q7, q7, q10
	vshl.u64	q10, q10, #1
	veor	q5, q5, q11
	vshl.u64	q11, q11, #1
	veor	q6, q6, q10
	veor	q4, q4, q11
	vshr.u64	q10, q2, #1
	vshr.u64	q11, q0, #1
	veor	q10, q10, q3
	veor	q11, q11, q1
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q3, q3, q10
	vshl.u64	q10, q10, #1
	veor	q1, q1, q11
	vshl.u64	q11, q11, #1
	veor	q2, q2, q10
	veor	q0, q0, q11
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshr.u64	q10, q5, #2
	vshr.u64	q11, q4, #2
	veor	q10, q10, q7
	veor	q11, q11, q6
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q7, q7, q10
	vshl.u64	q10, q10, #2
	veor	q6, q6, q11
	vshl.u64	q11, q11, #2
	veor	q5, q5, q10
	veor	q4, q4, q11
	vshr.u64	q10, q1, #2
	vshr.u64	q11, q0, #2
	veor	q10, q10, q3
	veor	q11, q11, q2
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q3, q3, q10
	vshl.u64	q10, q10, #2
	veor	q2, q2, q11
	vshl.u64	q11, q11, #2
	veor	q1, q1, q10
	veor	q0, q0, q11
	vshr.u64	q10, q3, #4
	vshr.u64	q11, q2, #4
	veor	q10, q10, q7
	veor	q11, q11, q6
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q7, q7, q10
	vshl.u64	q10, q10, #4
	veor	q6, q6, q11
	vshl.u64	q11, q11, #4
	veor	q3, q3, q10
	veor	q2, q2, q11
	vshr.u64	q10, q1, #4
	vshr.u64	q11, q0, #4
	veor	q10, q10, q5
	veor	q11, q11, q4
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q5, q5, q10
	vshl.u64	q10, q10, #4
	veor	q4, q4, q11
	vshl.u64	q11, q11, #4
	veor	q1, q1, q10
	veor	q0, q0, q11
	sub	r5,r5,#1
	b	.Lenc_sbox
.align	4
.Lenc_loop:
	vldmia	r4!, {q8,q9,q10,q11}
	veor	q8, q8, q0
	veor	q9, q9, q1
	vtbl.8	d0, {q8}, d24
	vtbl.8	d1, {q8}, d25
	vldmia	r4!, {q8}
	veor	q10, q10, q2
	vtbl.8	d2, {q9}, d24
	vtbl.8	d3, {q9}, d25
	vldmia	r4!, {q9}
	veor	q11, q11, q3
	vtbl.8	d4, {q10}, d24
	vtbl.8	d5, {q10}, d25
	vldmia	r4!, {q10}
	vtbl.8	d6, {q11}, d24
	vtbl.8	d7, {q11}, d25
	vldmia	r4!, {q11}
	veor	q8, q8, q4
	veor	q9, q9, q5
	vtbl.8	d8, {q8}, d24
	vtbl.8	d9, {q8}, d25
	veor	q10, q10, q6
	vtbl.8	d10, {q9}, d24
	vtbl.8	d11, {q9}, d25
	veor	q11, q11, q7
	vtbl.8	d12, {q10}, d24
	vtbl.8	d13, {q10}, d25
	vtbl.8	d14, {q11}, d24
	vtbl.8	d15, {q11}, d25
.Lenc_sbox:
	veor	q2, q2, q1
	veor	q5, q5, q6
	veor	q3, q3, q0
	veor	q6, q6, q2
	veor	q5, q5, q0

	veor	q6, q6, q3
	veor	q3, q3, q7
	veor	q7, q7, q5
	veor	q3, q3, q4
	veor	q4, q4, q5

	veor	q2, q2, q7
	veor	q3, q3, q1
	veor	q1, q1, q5
	veor	q11, q7, q4
	veor	q10, q1, q2
	veor	q9, q5, q3
	veor	q13, q2, q4
	vmov	q8, q10
	veor	q12, q6, q0

	vorr	q10, q10, q9
	veor	q15, q11, q8
	vand	q14, q11, q12
	vorr	q11, q11, q12
	veor	q12, q12, q9
	vand	q8, q8, q9
	veor	q9, q3, q0
	vand	q15, q15, q12
	vand	q13, q13, q9
	veor	q9, q7, q1
	veor	q12, q5, q6
	veor	q11, q11, q13
	veor	q10, q10, q13
	vand	q13, q9, q12
	vorr	q9, q9, q12
	veor	q11, q11, q15
	veor	q8, q8, q13
	veor	q10, q10, q14
	veor	q9, q9, q15
	veor	q8, q8, q14
	vand	q12, q2, q3
	veor	q9, q9, q14
	vand	q13, q4, q0
	vand	q14, q1, q5
	vorr	q15, q7, q6
	veor	q11, q11, q12
	veor	q9, q9, q14
	veor	q8, q8, q15
	veor	q10, q10, q13

	@ Inv_GF16 	0, 	1, 	2, 	3, s0, s1, s2, s3

	@ new smaller inversion

	vand	q14, q11, q9
	vmov	q12, q8

	veor	q13, q10, q14
	veor	q15, q8, q14
	veor	q14, q8, q14	@ q14=q15

	vbsl	q13, q9, q8
	vbsl	q15, q11, q10
	veor	q11, q11, q10

	vbsl	q12, q13, q14
	vbsl	q8, q14, q13

	vand	q14, q12, q15
	veor	q9, q9, q8

	veor	q14, q14, q11
	veor	q12, q6, q0
	veor	q8, q5, q3
	veor	q10, q15, q14
	vand	q10, q10, q6
	veor	q6, q6, q5
	vand	q11, q5, q15
	vand	q6, q6, q14
	veor	q5, q11, q10
	veor	q6, q6, q11
	veor	q15, q15, q13
	veor	q14, q14, q9
	veor	q11, q15, q14
	veor	q10, q13, q9
	vand	q11, q11, q12
	vand	q10, q10, q0
	veor	q12, q12, q8
	veor	q0, q0, q3
	vand	q8, q8, q15
	vand	q3, q3, q13
	vand	q12, q12, q14
	vand	q0, q0, q9
	veor	q8, q8, q12
	veor	q0, q0, q3
	veor	q12, q12, q11
	veor	q3, q3, q10
	veor	q6, q6, q12
	veor	q0, q0, q12
	veor	q5, q5, q8
	veor	q3, q3, q8

	veor	q12, q7, q4
	veor	q8, q1, q2
	veor	q11, q15, q14
	veor	q10, q13, q9
	vand	q11, q11, q12
	vand	q10, q10, q4
	veor	q12, q12, q8
	veor	q4, q4, q2
	vand	q8, q8, q15
	vand	q2, q2, q13
	vand	q12, q12, q14
	vand	q4, q4, q9
	veor	q8, q8, q12
	veor	q4, q4, q2
	veor	q12, q12, q11
	veor	q2, q2, q10
	veor	q15, q15, q13
	veor	q14, q14, q9
	veor	q10, q15, q14
	vand	q10, q10, q7
	veor	q7, q7, q1
	vand	q11, q1, q15
	vand	q7, q7, q14
	veor	q1, q11, q10
	veor	q7, q7, q11
	veor	q7, q7, q12
	veor	q4, q4, q12
	veor	q1, q1, q8
	veor	q2, q2, q8
	veor	q7, q7, q0
	veor	q1, q1, q6
	veor	q6, q6, q0
	veor	q4, q4, q7
	veor	q0, q0, q1

	veor	q1, q1, q5
	veor	q5, q5, q2
	veor	q2, q2, q3
	veor	q3, q3, q5
	veor	q4, q4, q5

	veor	q6, q6, q3
	subs	r5,r5,#1
	bcc	.Lenc_done
	vext.8	q8, q0, q0, #12	@ x0 <<< 32
	vext.8	q9, q1, q1, #12
	veor	q0, q0, q8		@ x0 ^ (x0 <<< 32)
	vext.8	q10, q4, q4, #12
	veor	q1, q1, q9
	vext.8	q11, q6, q6, #12
	veor	q4, q4, q10
	vext.8	q12, q3, q3, #12
	veor	q6, q6, q11
	vext.8	q13, q7, q7, #12
	veor	q3, q3, q12
	vext.8	q14, q2, q2, #12
	veor	q7, q7, q13
	vext.8	q15, q5, q5, #12
	veor	q2, q2, q14

	veor	q9, q9, q0
	veor	q5, q5, q15
	vext.8	q0, q0, q0, #8		@ (x0 ^ (x0 <<< 32)) <<< 64
	veor	q10, q10, q1
	veor	q8, q8, q5
	veor	q9, q9, q5
	vext.8	q1, q1, q1, #8
	veor	q13, q13, q3
	veor	q0, q0, q8
	veor	q14, q14, q7
	veor	q1, q1, q9
	vext.8	q8, q3, q3, #8
	veor	q12, q12, q6
	vext.8	q9, q7, q7, #8
	veor	q15, q15, q2
	vext.8	q3, q6, q6, #8
	veor	q11, q11, q4
	vext.8	q7, q5, q5, #8
	veor	q12, q12, q5
	vext.8	q6, q2, q2, #8
	veor	q11, q11, q5
	vext.8	q2, q4, q4, #8
	veor	q5, q9, q13
	veor	q4, q8, q12
	veor	q3, q3, q11
	veor	q7, q7, q15
	veor	q6, q6, q14
	 @ vmov	q4, q8
	veor	q2, q2, q10
	 @ vmov	q5, q9
	vldmia	r6, {q12}		@ .LSR
	ite	eq				@ Thumb2 thing, sanity check in ARM
	addeq	r6,r6,#0x10
	bne	.Lenc_loop
	vldmia	r6, {q12}		@ .LSRM0
	b	.Lenc_loop
.align	4
.Lenc_done:
	vmov.i8	q8,#0x55			@ compose .LBS0
	vmov.i8	q9,#0x33			@ compose .LBS1
	vshr.u64	q10, q2, #1
	vshr.u64	q11, q3, #1
	veor	q10, q10, q5
	veor	q11, q11, q7
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q5, q5, q10
	vshl.u64	q10, q10, #1
	veor	q7, q7, q11
	vshl.u64	q11, q11, #1
	veor	q2, q2, q10
	veor	q3, q3, q11
	vshr.u64	q10, q4, #1
	vshr.u64	q11, q0, #1
	veor	q10, q10, q6
	veor	q11, q11, q1
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q6, q6, q10
	vshl.u64	q10, q10, #1
	veor	q1, q1, q11
	vshl.u64	q11, q11, #1
	veor	q4, q4, q10
	veor	q0, q0, q11
	vmov.i8	q8,#0x0f			@ compose .LBS2
	vshr.u64	q10, q7, #2
	vshr.u64	q11, q3, #2
	veor	q10, q10, q5
	veor	q11, q11, q2
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q5, q5, q10
	vshl.u64	q10, q10, #2
	veor	q2, q2, q11
	vshl.u64	q11, q11, #2
	veor	q7, q7, q10
	veor	q3, q3, q11
	vshr.u64	q10, q1, #2
	vshr.u64	q11, q0, #2
	veor	q10, q10, q6
	veor	q11, q11, q4
	vand	q10, q10, q9
	vand	q11, q11, q9
	veor	q6, q6, q10
	vshl.u64	q10, q10, #2
	veor	q4, q4, q11
	vshl.u64	q11, q11, #2
	veor	q1, q1, q10
	veor	q0, q0, q11
	vshr.u64	q10, q6, #4
	vshr.u64	q11, q4, #4
	veor	q10, q10, q5
	veor	q11, q11, q2
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q5, q5, q10
	vshl.u64	q10, q10, #4
	veor	q2, q2, q11
	vshl.u64	q11, q11, #4
	veor	q6, q6, q10
	veor	q4, q4, q11
	vshr.u64	q10, q1, #4
	vshr.u64	q11, q0, #4
	veor	q10, q10, q7
	veor	q11, q11, q3
	vand	q10, q10, q8
	vand	q11, q11, q8
	veor	q7, q7, q10
	vshl.u64	q10, q10, #4
	veor	q3, q3, q11
	vshl.u64	q11, q11, #4
	veor	q1, q1, q10
	veor	q0, q0, q11
	vldmia	r4, {q8}			@ last round key
	veor	q4, q4, q8
	veor	q6, q6, q8
	veor	q3, q3, q8
	veor	q7, q7, q8
	veor	q2, q2, q8
	veor	q5, q5, q8
	veor	q0, q0, q8
	veor	q1, q1, q8
	bx	lr
.size	_bsaes_encrypt8,.-_bsaes_encrypt8
.type	_bsaes_key_convert,%function
.align	4
_bsaes_key_convert:
	adr	r6,.
	vld1.8	{q7},  [r4]!		@ load round 0 key
#if defined(__thumb2__) || defined(__APPLE__)
	adr	r6,.LM0
#else
	sub	r6,r6,#_bsaes_key_convert-.LM0
#endif
	vld1.8	{q15}, [r4]!		@ load round 1 key

	vmov.i8	q8,  #0x01			@ bit masks
	vmov.i8	q9,  #0x02
	vmov.i8	q10, #0x04
	vmov.i8	q11, #0x08
	vmov.i8	q12, #0x10
	vmov.i8	q13, #0x20
	vldmia	r6, {q14}		@ .LM0

#ifdef __ARMEL__
	vrev32.8	q7,  q7
	vrev32.8	q15, q15
#endif
	sub	r5,r5,#1
	vstmia	r12!, {q7}		@ save round 0 key
	b	.Lkey_loop

.align	4
.Lkey_loop:
	vtbl.8	d14,{q15},d28
	vtbl.8	d15,{q15},d29
	vmov.i8	q6,  #0x40
	vmov.i8	q15, #0x80

	vtst.8	q0, q7, q8
	vtst.8	q1, q7, q9
	vtst.8	q2, q7, q10
	vtst.8	q3, q7, q11
	vtst.8	q4, q7, q12
	vtst.8	q5, q7, q13
	vtst.8	q6, q7, q6
	vtst.8	q7, q7, q15
	vld1.8	{q15}, [r4]!		@ load next round key
	vmvn	q0, q0		@ "pnot"
	vmvn	q1, q1
	vmvn	q5, q5
	vmvn	q6, q6
#ifdef __ARMEL__
	vrev32.8	q15, q15
#endif
	subs	r5,r5,#1
	vstmia	r12!,{q0,q1,q2,q3,q4,q5,q6,q7}		@ write bit-sliced round key
	bne	.Lkey_loop

	vmov.i8	q7,#0x63			@ compose .L63
	@ don't save last round key
	bx	lr
.size	_bsaes_key_convert,.-_bsaes_key_convert



.globl	ossl_bsaes_cbc_encrypt
.type	ossl_bsaes_cbc_encrypt,%function
.align	5
ossl_bsaes_cbc_encrypt:
#ifndef	__KERNEL__
	cmp	r2, #128
#ifndef	__thumb__
	blo	AES_cbc_encrypt
#else
	bhs	.Lcbc_do_bsaes
	b	AES_cbc_encrypt
.Lcbc_do_bsaes:
#endif
#endif

	@ it is up to the caller to make sure we are called with enc == 0

	mov	ip, sp
	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
	VFP_ABI_PUSH
	ldr	r8, [ip]			@ IV is 1st arg on the stack
	mov	r2, r2, lsr#4		@ len in 16 byte blocks
	sub	sp, #0x10			@ scratch space to carry over the IV
	mov	r9, sp				@ save sp

	ldr	r10, [r3, #240]		@ get # of rounds
#ifndef	BSAES_ASM_EXTENDED_KEY
	@ allocate the key schedule on the stack
	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
	add	r12, #96			@ size of bit-sliced key schedule
1107bc3d5698SJohn Baldwin
1108bc3d5698SJohn Baldwin	@ populate the key schedule
1109bc3d5698SJohn Baldwin	mov	r4, r3			@ pass key
1110bc3d5698SJohn Baldwin	mov	r5, r10			@ pass # of rounds
1111bc3d5698SJohn Baldwin	mov	sp, r12				@ sp is sp
1112bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
1113bc3d5698SJohn Baldwin	vldmia	sp, {q6}
1114bc3d5698SJohn Baldwin	vstmia	r12,  {q15}		@ save last round key
1115bc3d5698SJohn Baldwin	veor	q7, q7, q6	@ fix up round 0 key
1116bc3d5698SJohn Baldwin	vstmia	sp, {q7}
1117bc3d5698SJohn Baldwin#else
1118bc3d5698SJohn Baldwin	ldr	r12, [r3, #244]
1119bc3d5698SJohn Baldwin	eors	r12, #1
1120bc3d5698SJohn Baldwin	beq	0f
1121bc3d5698SJohn Baldwin
1122bc3d5698SJohn Baldwin	@ populate the key schedule
1123bc3d5698SJohn Baldwin	str	r12, [r3, #244]
1124bc3d5698SJohn Baldwin	mov	r4, r3			@ pass key
1125bc3d5698SJohn Baldwin	mov	r5, r10			@ pass # of rounds
1126bc3d5698SJohn Baldwin	add	r12, r3, #248			@ pass key schedule
1127bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
1128bc3d5698SJohn Baldwin	add	r4, r3, #248
1129bc3d5698SJohn Baldwin	vldmia	r4, {q6}
1130bc3d5698SJohn Baldwin	vstmia	r12, {q15}			@ save last round key
1131bc3d5698SJohn Baldwin	veor	q7, q7, q6	@ fix up round 0 key
1132bc3d5698SJohn Baldwin	vstmia	r4, {q7}
1133bc3d5698SJohn Baldwin
1134bc3d5698SJohn Baldwin.align	2
1135454c425dSMark Johnston0:
1136bc3d5698SJohn Baldwin#endif
1137bc3d5698SJohn Baldwin
1138bc3d5698SJohn Baldwin	vld1.8	{q15}, [r8]		@ load IV
1139bc3d5698SJohn Baldwin	b	.Lcbc_dec_loop
1140bc3d5698SJohn Baldwin
1141bc3d5698SJohn Baldwin.align	4
1142bc3d5698SJohn Baldwin.Lcbc_dec_loop:
1143bc3d5698SJohn Baldwin	subs	r2, r2, #0x8
1144bc3d5698SJohn Baldwin	bmi	.Lcbc_dec_loop_finish
1145bc3d5698SJohn Baldwin
1146bc3d5698SJohn Baldwin	vld1.8	{q0,q1}, [r0]!	@ load input
1147bc3d5698SJohn Baldwin	vld1.8	{q2,q3}, [r0]!
1148bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1149bc3d5698SJohn Baldwin	mov	r4, sp			@ pass the key
1150bc3d5698SJohn Baldwin#else
1151bc3d5698SJohn Baldwin	add	r4, r3, #248
1152bc3d5698SJohn Baldwin#endif
1153bc3d5698SJohn Baldwin	vld1.8	{q4,q5}, [r0]!
1154bc3d5698SJohn Baldwin	mov	r5, r10
1155bc3d5698SJohn Baldwin	vld1.8	{q6,q7}, [r0]
1156bc3d5698SJohn Baldwin	sub	r0, r0, #0x60
1157bc3d5698SJohn Baldwin	vstmia	r9, {q15}			@ put aside IV
1158bc3d5698SJohn Baldwin
1159bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
1160bc3d5698SJohn Baldwin
1161bc3d5698SJohn Baldwin	vldmia	r9, {q14}			@ reload IV
1162bc3d5698SJohn Baldwin	vld1.8	{q8,q9}, [r0]!	@ reload input
1163bc3d5698SJohn Baldwin	veor	q0, q0, q14	@ ^= IV
1164bc3d5698SJohn Baldwin	vld1.8	{q10,q11}, [r0]!
1165bc3d5698SJohn Baldwin	veor	q1, q1, q8
1166bc3d5698SJohn Baldwin	veor	q6, q6, q9
1167bc3d5698SJohn Baldwin	vld1.8	{q12,q13}, [r0]!
1168bc3d5698SJohn Baldwin	veor	q4, q4, q10
1169bc3d5698SJohn Baldwin	veor	q2, q2, q11
1170bc3d5698SJohn Baldwin	vld1.8	{q14,q15}, [r0]!
1171bc3d5698SJohn Baldwin	veor	q7, q7, q12
1172bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1173bc3d5698SJohn Baldwin	veor	q3, q3, q13
1174bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1175bc3d5698SJohn Baldwin	veor	q5, q5, q14
1176bc3d5698SJohn Baldwin	vst1.8	{q4}, [r1]!
1177bc3d5698SJohn Baldwin	vst1.8	{q2}, [r1]!
1178bc3d5698SJohn Baldwin	vst1.8	{q7}, [r1]!
1179bc3d5698SJohn Baldwin	vst1.8	{q3}, [r1]!
1180bc3d5698SJohn Baldwin	vst1.8	{q5}, [r1]!
1181bc3d5698SJohn Baldwin
1182bc3d5698SJohn Baldwin	b	.Lcbc_dec_loop
1183bc3d5698SJohn Baldwin
1184bc3d5698SJohn Baldwin.Lcbc_dec_loop_finish:
1185bc3d5698SJohn Baldwin	adds	r2, r2, #8
1186bc3d5698SJohn Baldwin	beq	.Lcbc_dec_done
1187bc3d5698SJohn Baldwin
1188bc3d5698SJohn Baldwin	vld1.8	{q0}, [r0]!		@ load input
1189bc3d5698SJohn Baldwin	cmp	r2, #2
1190bc3d5698SJohn Baldwin	blo	.Lcbc_dec_one
1191bc3d5698SJohn Baldwin	vld1.8	{q1}, [r0]!
1192bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1193bc3d5698SJohn Baldwin	mov	r4, sp			@ pass the key
1194bc3d5698SJohn Baldwin#else
1195bc3d5698SJohn Baldwin	add	r4, r3, #248
1196bc3d5698SJohn Baldwin#endif
1197bc3d5698SJohn Baldwin	mov	r5, r10
1198bc3d5698SJohn Baldwin	vstmia	r9, {q15}			@ put aside IV
1199bc3d5698SJohn Baldwin	beq	.Lcbc_dec_two
1200bc3d5698SJohn Baldwin	vld1.8	{q2}, [r0]!
1201bc3d5698SJohn Baldwin	cmp	r2, #4
1202bc3d5698SJohn Baldwin	blo	.Lcbc_dec_three
1203bc3d5698SJohn Baldwin	vld1.8	{q3}, [r0]!
1204bc3d5698SJohn Baldwin	beq	.Lcbc_dec_four
1205bc3d5698SJohn Baldwin	vld1.8	{q4}, [r0]!
1206bc3d5698SJohn Baldwin	cmp	r2, #6
1207bc3d5698SJohn Baldwin	blo	.Lcbc_dec_five
1208bc3d5698SJohn Baldwin	vld1.8	{q5}, [r0]!
1209bc3d5698SJohn Baldwin	beq	.Lcbc_dec_six
1210bc3d5698SJohn Baldwin	vld1.8	{q6}, [r0]!
1211bc3d5698SJohn Baldwin	sub	r0, r0, #0x70
1212bc3d5698SJohn Baldwin
1213bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
1214bc3d5698SJohn Baldwin
1215bc3d5698SJohn Baldwin	vldmia	r9, {q14}			@ reload IV
1216bc3d5698SJohn Baldwin	vld1.8	{q8,q9}, [r0]!	@ reload input
1217bc3d5698SJohn Baldwin	veor	q0, q0, q14	@ ^= IV
1218bc3d5698SJohn Baldwin	vld1.8	{q10,q11}, [r0]!
1219bc3d5698SJohn Baldwin	veor	q1, q1, q8
1220bc3d5698SJohn Baldwin	veor	q6, q6, q9
1221bc3d5698SJohn Baldwin	vld1.8	{q12,q13}, [r0]!
1222bc3d5698SJohn Baldwin	veor	q4, q4, q10
1223bc3d5698SJohn Baldwin	veor	q2, q2, q11
1224bc3d5698SJohn Baldwin	vld1.8	{q15}, [r0]!
1225bc3d5698SJohn Baldwin	veor	q7, q7, q12
1226bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1227bc3d5698SJohn Baldwin	veor	q3, q3, q13
1228bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1229bc3d5698SJohn Baldwin	vst1.8	{q4}, [r1]!
1230bc3d5698SJohn Baldwin	vst1.8	{q2}, [r1]!
1231bc3d5698SJohn Baldwin	vst1.8	{q7}, [r1]!
1232bc3d5698SJohn Baldwin	vst1.8	{q3}, [r1]!
1233bc3d5698SJohn Baldwin	b	.Lcbc_dec_done
1234bc3d5698SJohn Baldwin.align	4
1235bc3d5698SJohn Baldwin.Lcbc_dec_six:
1236bc3d5698SJohn Baldwin	sub	r0, r0, #0x60
1237bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
1238bc3d5698SJohn Baldwin	vldmia	r9,{q14}			@ reload IV
1239bc3d5698SJohn Baldwin	vld1.8	{q8,q9}, [r0]!	@ reload input
1240bc3d5698SJohn Baldwin	veor	q0, q0, q14	@ ^= IV
1241bc3d5698SJohn Baldwin	vld1.8	{q10,q11}, [r0]!
1242bc3d5698SJohn Baldwin	veor	q1, q1, q8
1243bc3d5698SJohn Baldwin	veor	q6, q6, q9
1244bc3d5698SJohn Baldwin	vld1.8	{q12}, [r0]!
1245bc3d5698SJohn Baldwin	veor	q4, q4, q10
1246bc3d5698SJohn Baldwin	veor	q2, q2, q11
1247bc3d5698SJohn Baldwin	vld1.8	{q15}, [r0]!
1248bc3d5698SJohn Baldwin	veor	q7, q7, q12
1249bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1250bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1251bc3d5698SJohn Baldwin	vst1.8	{q4}, [r1]!
1252bc3d5698SJohn Baldwin	vst1.8	{q2}, [r1]!
1253bc3d5698SJohn Baldwin	vst1.8	{q7}, [r1]!
1254bc3d5698SJohn Baldwin	b	.Lcbc_dec_done
1255bc3d5698SJohn Baldwin.align	4
1256bc3d5698SJohn Baldwin.Lcbc_dec_five:
1257bc3d5698SJohn Baldwin	sub	r0, r0, #0x50
1258bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
1259bc3d5698SJohn Baldwin	vldmia	r9, {q14}			@ reload IV
1260bc3d5698SJohn Baldwin	vld1.8	{q8,q9}, [r0]!	@ reload input
1261bc3d5698SJohn Baldwin	veor	q0, q0, q14	@ ^= IV
1262bc3d5698SJohn Baldwin	vld1.8	{q10,q11}, [r0]!
1263bc3d5698SJohn Baldwin	veor	q1, q1, q8
1264bc3d5698SJohn Baldwin	veor	q6, q6, q9
1265bc3d5698SJohn Baldwin	vld1.8	{q15}, [r0]!
1266bc3d5698SJohn Baldwin	veor	q4, q4, q10
1267bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1268bc3d5698SJohn Baldwin	veor	q2, q2, q11
1269bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1270bc3d5698SJohn Baldwin	vst1.8	{q4}, [r1]!
1271bc3d5698SJohn Baldwin	vst1.8	{q2}, [r1]!
1272bc3d5698SJohn Baldwin	b	.Lcbc_dec_done
1273bc3d5698SJohn Baldwin.align	4
1274bc3d5698SJohn Baldwin.Lcbc_dec_four:
1275bc3d5698SJohn Baldwin	sub	r0, r0, #0x40
1276bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
1277bc3d5698SJohn Baldwin	vldmia	r9, {q14}			@ reload IV
1278bc3d5698SJohn Baldwin	vld1.8	{q8,q9}, [r0]!	@ reload input
1279bc3d5698SJohn Baldwin	veor	q0, q0, q14	@ ^= IV
1280bc3d5698SJohn Baldwin	vld1.8	{q10}, [r0]!
1281bc3d5698SJohn Baldwin	veor	q1, q1, q8
1282bc3d5698SJohn Baldwin	veor	q6, q6, q9
1283bc3d5698SJohn Baldwin	vld1.8	{q15}, [r0]!
1284bc3d5698SJohn Baldwin	veor	q4, q4, q10
1285bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1286bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1287bc3d5698SJohn Baldwin	vst1.8	{q4}, [r1]!
1288bc3d5698SJohn Baldwin	b	.Lcbc_dec_done
1289bc3d5698SJohn Baldwin.align	4
1290bc3d5698SJohn Baldwin.Lcbc_dec_three:
1291bc3d5698SJohn Baldwin	sub	r0, r0, #0x30
1292bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
1293bc3d5698SJohn Baldwin	vldmia	r9, {q14}			@ reload IV
1294bc3d5698SJohn Baldwin	vld1.8	{q8,q9}, [r0]!	@ reload input
1295bc3d5698SJohn Baldwin	veor	q0, q0, q14	@ ^= IV
1296bc3d5698SJohn Baldwin	vld1.8	{q15}, [r0]!
1297bc3d5698SJohn Baldwin	veor	q1, q1, q8
1298bc3d5698SJohn Baldwin	veor	q6, q6, q9
1299bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1300bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1301bc3d5698SJohn Baldwin	b	.Lcbc_dec_done
1302bc3d5698SJohn Baldwin.align	4
1303bc3d5698SJohn Baldwin.Lcbc_dec_two:
1304bc3d5698SJohn Baldwin	sub	r0, r0, #0x20
1305bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
1306bc3d5698SJohn Baldwin	vldmia	r9, {q14}			@ reload IV
1307bc3d5698SJohn Baldwin	vld1.8	{q8}, [r0]!		@ reload input
1308bc3d5698SJohn Baldwin	veor	q0, q0, q14	@ ^= IV
1309bc3d5698SJohn Baldwin	vld1.8	{q15}, [r0]!		@ reload input
1310bc3d5698SJohn Baldwin	veor	q1, q1, q8
1311bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1312bc3d5698SJohn Baldwin	b	.Lcbc_dec_done
1313bc3d5698SJohn Baldwin.align	4
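@ Exactly one block is left; it falls back to the table-based
@ AES_decrypt.  The IV scratch slot at r9 doubles as the output
@ buffer, q4/q5 preserve the IV and the raw ciphertext across the
@ call, and the saved ciphertext then becomes the chaining value
@ (q15) returned to the caller.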
1314bc3d5698SJohn Baldwin.Lcbc_dec_one:
1315bc3d5698SJohn Baldwin	sub	r0, r0, #0x10
1316bc3d5698SJohn Baldwin	mov	r10, r1			@ save original out pointer
1317bc3d5698SJohn Baldwin	mov	r1, r9			@ use the iv scratch space as out buffer
1318bc3d5698SJohn Baldwin	mov	r2, r3
1319bc3d5698SJohn Baldwin	vmov	q4,q15		@ just in case ensure that IV
1320bc3d5698SJohn Baldwin	vmov	q5,q0			@ and input are preserved
1321bc3d5698SJohn Baldwin	bl	AES_decrypt
1322bc3d5698SJohn Baldwin	vld1.8	{q0}, [r9]		@ load result
1323bc3d5698SJohn Baldwin	veor	q0, q0, q4	@ ^= IV
1324bc3d5698SJohn Baldwin	vmov	q15, q5		@ q5 holds input
1325bc3d5698SJohn Baldwin	vst1.8	{q0}, [r10]		@ write output
1326bc3d5698SJohn Baldwin
1327bc3d5698SJohn Baldwin.Lcbc_dec_done:
1328bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1329bc3d5698SJohn Baldwin	vmov.i32	q0, #0
1330bc3d5698SJohn Baldwin	vmov.i32	q1, #0
1331bc3d5698SJohn Baldwin.Lcbc_dec_bzero:	@ wipe key schedule [if any]
1332bc3d5698SJohn Baldwin	vstmia	sp!, {q0,q1}
1333bc3d5698SJohn Baldwin	cmp	sp, r9
1334bc3d5698SJohn Baldwin	bne	.Lcbc_dec_bzero
1335bc3d5698SJohn Baldwin#endif
1336bc3d5698SJohn Baldwin
1337bc3d5698SJohn Baldwin	mov	sp, r9
1338bc3d5698SJohn Baldwin	add	sp, #0x10			@ add sp,r9,#0x10 is no good for thumb
1339bc3d5698SJohn Baldwin	vst1.8	{q15}, [r8]		@ return IV
1340bc3d5698SJohn Baldwin	VFP_ABI_POP
1341bc3d5698SJohn Baldwin	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}
1342*c0855eaaSJohn Baldwin.size	ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
1343bc3d5698SJohn Baldwin
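@ CTR-mode bulk encryption.  Arguments as used below: r0 input, r1
@ output, r2 number of complete 16-byte blocks, r3 expanded AES key,
@ and a pointer to the 16-byte counter block as the first stack
@ argument.  Only the low (big-endian) 32-bit counter word is
@ incremented, hence the ctr32 name.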
1344*c0855eaaSJohn Baldwin.globl	ossl_bsaes_ctr32_encrypt_blocks
1345*c0855eaaSJohn Baldwin.type	ossl_bsaes_ctr32_encrypt_blocks,%function
1346bc3d5698SJohn Baldwin.align	5
1347*c0855eaaSJohn Baldwinossl_bsaes_ctr32_encrypt_blocks:
1348bc3d5698SJohn Baldwin	cmp	r2, #8			@ use plain AES for
1349bc3d5698SJohn Baldwin	blo	.Lctr_enc_short			@ small sizes
1350bc3d5698SJohn Baldwin
1351bc3d5698SJohn Baldwin	mov	ip, sp
1352bc3d5698SJohn Baldwin	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}
1353bc3d5698SJohn Baldwin	VFP_ABI_PUSH
1354bc3d5698SJohn Baldwin	ldr	r8, [ip]			@ ctr is 1st arg on the stack
1355bc3d5698SJohn Baldwin	sub	sp, sp, #0x10			@ scratch space to carry over the ctr
1356bc3d5698SJohn Baldwin	mov	r9, sp				@ save sp
1357bc3d5698SJohn Baldwin
1358bc3d5698SJohn Baldwin	ldr	r10, [r3, #240]		@ get # of rounds
1359bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1360bc3d5698SJohn Baldwin	@ allocate the key schedule on the stack
1361bc3d5698SJohn Baldwin	sub	r12, sp, r10, lsl#7		@ 128 bytes per inner round key
1362bc3d5698SJohn Baldwin	add	r12, #96			@ size of bit-sliced key schedule
1363bc3d5698SJohn Baldwin
1364bc3d5698SJohn Baldwin	@ populate the key schedule
1365bc3d5698SJohn Baldwin	mov	r4, r3			@ pass key
1366bc3d5698SJohn Baldwin	mov	r5, r10			@ pass # of rounds
1367bc3d5698SJohn Baldwin	mov	sp, r12				@ sp is sp
1368bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
1369bc3d5698SJohn Baldwin	veor	q7,q7,q15	@ fix up last round key
1370bc3d5698SJohn Baldwin	vstmia	r12, {q7}			@ save last round key
1371bc3d5698SJohn Baldwin
1372bc3d5698SJohn Baldwin	vld1.8	{q0}, [r8]		@ load counter
1373bc3d5698SJohn Baldwin#ifdef	__APPLE__
1374bc3d5698SJohn Baldwin	mov	r8, #:lower16:(.LREVM0SR-.LM0)
1375bc3d5698SJohn Baldwin	add	r8, r6, r8
1376bc3d5698SJohn Baldwin#else
1377bc3d5698SJohn Baldwin	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
1378bc3d5698SJohn Baldwin#endif
1379bc3d5698SJohn Baldwin	vldmia	sp, {q4}		@ load round0 key
1380bc3d5698SJohn Baldwin#else
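@ BSAES_ASM_EXTENDED_KEY: the bit-sliced key schedule lives inside the
@ key structure at offset 248 instead of on the stack.  The word at
@ offset 244 records whether it has already been generated; it is
@ converted on first use and reused on subsequent calls.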
1381bc3d5698SJohn Baldwin	ldr	r12, [r3, #244]
1382bc3d5698SJohn Baldwin	eors	r12, #1
1383bc3d5698SJohn Baldwin	beq	0f
1384bc3d5698SJohn Baldwin
1385bc3d5698SJohn Baldwin	@ populate the key schedule
1386bc3d5698SJohn Baldwin	str	r12, [r3, #244]
1387bc3d5698SJohn Baldwin	mov	r4, r3			@ pass key
1388bc3d5698SJohn Baldwin	mov	r5, r10			@ pass # of rounds
1389bc3d5698SJohn Baldwin	add	r12, r3, #248			@ pass key schedule
1390bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
1391bc3d5698SJohn Baldwin	veor	q7,q7,q15	@ fix up last round key
1392bc3d5698SJohn Baldwin	vstmia	r12, {q7}			@ save last round key
1393bc3d5698SJohn Baldwin
1394bc3d5698SJohn Baldwin.align	2
1395454c425dSMark Johnston0:	add	r12, r3, #248
1396bc3d5698SJohn Baldwin	vld1.8	{q0}, [r8]		@ load counter
1397e415d255SJung-uk Kim	add	r8, r6, #.LREVM0SR-.LM0	@ borrow r8
1398bc3d5698SJohn Baldwin	vldmia	r12, {q4}			@ load round0 key
1399bc3d5698SJohn Baldwin	sub	sp, #0x10			@ place for adjusted round0 key
1400bc3d5698SJohn Baldwin#endif
1401bc3d5698SJohn Baldwin
1402bc3d5698SJohn Baldwin	vmov.i32	q8,#1		@ compose 1<<96
1403bc3d5698SJohn Baldwin	veor	q9,q9,q9
1404bc3d5698SJohn Baldwin	vrev32.8	q0,q0
1405bc3d5698SJohn Baldwin	vext.8	q8,q9,q8,#4
1406bc3d5698SJohn Baldwin	vrev32.8	q4,q4
1407bc3d5698SJohn Baldwin	vadd.u32	q9,q8,q8	@ compose 2<<96
1408bc3d5698SJohn Baldwin	vstmia	sp, {q4}		@ save adjusted round0 key
1409bc3d5698SJohn Baldwin	b	.Lctr_enc_loop
1410bc3d5698SJohn Baldwin
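@ Main loop: q0 holds the byte-reversed counter block (vrev32.8 above
@ turned the big-endian counter word into host order), so adding the
@ constants 1<<96, 2<<96 and 3<<96 with vadd.u32 yields counters
@ +1..+7 in q1..q7, while q10 carries counter+8 over to the next
@ iteration.  The byte order is flipped back inside _bsaes_encrypt8_alt.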
1411bc3d5698SJohn Baldwin.align	4
1412bc3d5698SJohn Baldwin.Lctr_enc_loop:
1413bc3d5698SJohn Baldwin	vadd.u32	q10, q8, q9	@ compose 3<<96
1414bc3d5698SJohn Baldwin	vadd.u32	q1, q0, q8	@ +1
1415bc3d5698SJohn Baldwin	vadd.u32	q2, q0, q9	@ +2
1416bc3d5698SJohn Baldwin	vadd.u32	q3, q0, q10	@ +3
1417bc3d5698SJohn Baldwin	vadd.u32	q4, q1, q10
1418bc3d5698SJohn Baldwin	vadd.u32	q5, q2, q10
1419bc3d5698SJohn Baldwin	vadd.u32	q6, q3, q10
1420bc3d5698SJohn Baldwin	vadd.u32	q7, q4, q10
1421bc3d5698SJohn Baldwin	vadd.u32	q10, q5, q10	@ next counter
1422bc3d5698SJohn Baldwin
1423bc3d5698SJohn Baldwin	@ Borrow prologue from _bsaes_encrypt8 to use the opportunity
1424bc3d5698SJohn Baldwin	@ to flip byte order in 32-bit counter
1425bc3d5698SJohn Baldwin
1426bc3d5698SJohn Baldwin	vldmia	sp, {q9}		@ load round0 key
1427bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1428bc3d5698SJohn Baldwin	add	r4, sp, #0x10		@ pass next round key
1429bc3d5698SJohn Baldwin#else
1430bc3d5698SJohn Baldwin	add	r4, r3, #264
1431bc3d5698SJohn Baldwin#endif
1432bc3d5698SJohn Baldwin	vldmia	r8, {q8}			@ .LREVM0SR
1433bc3d5698SJohn Baldwin	mov	r5, r10			@ pass rounds
1434bc3d5698SJohn Baldwin	vstmia	r9, {q10}			@ save next counter
1435bc3d5698SJohn Baldwin#ifdef	__APPLE__
1436bc3d5698SJohn Baldwin	mov	r6, #:lower16:(.LREVM0SR-.LSR)
1437bc3d5698SJohn Baldwin	sub	r6, r8, r6
1438bc3d5698SJohn Baldwin#else
1439bc3d5698SJohn Baldwin	sub	r6, r8, #.LREVM0SR-.LSR	@ pass constants
1440bc3d5698SJohn Baldwin#endif
1441bc3d5698SJohn Baldwin
1442bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8_alt
1443bc3d5698SJohn Baldwin
1444bc3d5698SJohn Baldwin	subs	r2, r2, #8
1445bc3d5698SJohn Baldwin	blo	.Lctr_enc_loop_done
1446bc3d5698SJohn Baldwin
1447bc3d5698SJohn Baldwin	vld1.8	{q8,q9}, [r0]!	@ load input
1448bc3d5698SJohn Baldwin	vld1.8	{q10,q11}, [r0]!
1449bc3d5698SJohn Baldwin	veor	q0, q8
1450bc3d5698SJohn Baldwin	veor	q1, q9
1451bc3d5698SJohn Baldwin	vld1.8	{q12,q13}, [r0]!
1452bc3d5698SJohn Baldwin	veor	q4, q10
1453bc3d5698SJohn Baldwin	veor	q6, q11
1454bc3d5698SJohn Baldwin	vld1.8	{q14,q15}, [r0]!
1455bc3d5698SJohn Baldwin	veor	q3, q12
1456bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r1]!	@ write output
1457bc3d5698SJohn Baldwin	veor	q7, q13
1458bc3d5698SJohn Baldwin	veor	q2, q14
1459bc3d5698SJohn Baldwin	vst1.8	{q4}, [r1]!
1460bc3d5698SJohn Baldwin	veor	q5, q15
1461bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1462bc3d5698SJohn Baldwin	vmov.i32	q8, #1			@ compose 1<<96
1463bc3d5698SJohn Baldwin	vst1.8	{q3}, [r1]!
1464bc3d5698SJohn Baldwin	veor	q9, q9, q9
1465bc3d5698SJohn Baldwin	vst1.8	{q7}, [r1]!
1466bc3d5698SJohn Baldwin	vext.8	q8, q9, q8, #4
1467bc3d5698SJohn Baldwin	vst1.8	{q2}, [r1]!
1468bc3d5698SJohn Baldwin	vadd.u32	q9,q8,q8		@ compose 2<<96
1469bc3d5698SJohn Baldwin	vst1.8	{q5}, [r1]!
1470bc3d5698SJohn Baldwin	vldmia	r9, {q0}			@ load counter
1471bc3d5698SJohn Baldwin
1472bc3d5698SJohn Baldwin	bne	.Lctr_enc_loop
1473bc3d5698SJohn Baldwin	b	.Lctr_enc_done
1474bc3d5698SJohn Baldwin
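@ Tail of the bulk path: r2 went negative by the shortfall, so adding
@ 8 back gives the 1..7 remaining blocks.  The keystream blocks just
@ produced are consumed one at a time in the encrypt8 output order
@ q0,q1,q4,q6,q3,q7,q2.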
1475bc3d5698SJohn Baldwin.align	4
1476bc3d5698SJohn Baldwin.Lctr_enc_loop_done:
1477bc3d5698SJohn Baldwin	add	r2, r2, #8
1478bc3d5698SJohn Baldwin	vld1.8	{q8}, [r0]!	@ load input
1479bc3d5698SJohn Baldwin	veor	q0, q8
1480bc3d5698SJohn Baldwin	vst1.8	{q0}, [r1]!	@ write output
1481bc3d5698SJohn Baldwin	cmp	r2, #2
1482bc3d5698SJohn Baldwin	blo	.Lctr_enc_done
1483bc3d5698SJohn Baldwin	vld1.8	{q9}, [r0]!
1484bc3d5698SJohn Baldwin	veor	q1, q9
1485bc3d5698SJohn Baldwin	vst1.8	{q1}, [r1]!
1486bc3d5698SJohn Baldwin	beq	.Lctr_enc_done
1487bc3d5698SJohn Baldwin	vld1.8	{q10}, [r0]!
1488bc3d5698SJohn Baldwin	veor	q4, q10
1489bc3d5698SJohn Baldwin	vst1.8	{q4}, [r1]!
1490bc3d5698SJohn Baldwin	cmp	r2, #4
1491bc3d5698SJohn Baldwin	blo	.Lctr_enc_done
1492bc3d5698SJohn Baldwin	vld1.8	{q11}, [r0]!
1493bc3d5698SJohn Baldwin	veor	q6, q11
1494bc3d5698SJohn Baldwin	vst1.8	{q6}, [r1]!
1495bc3d5698SJohn Baldwin	beq	.Lctr_enc_done
1496bc3d5698SJohn Baldwin	vld1.8	{q12}, [r0]!
1497bc3d5698SJohn Baldwin	veor	q3, q12
1498bc3d5698SJohn Baldwin	vst1.8	{q3}, [r1]!
1499bc3d5698SJohn Baldwin	cmp	r2, #6
1500bc3d5698SJohn Baldwin	blo	.Lctr_enc_done
1501bc3d5698SJohn Baldwin	vld1.8	{q13}, [r0]!
1502bc3d5698SJohn Baldwin	veor	q7, q13
1503bc3d5698SJohn Baldwin	vst1.8	{q7}, [r1]!
1504bc3d5698SJohn Baldwin	beq	.Lctr_enc_done
1505bc3d5698SJohn Baldwin	vld1.8	{q14}, [r0]
1506bc3d5698SJohn Baldwin	veor	q2, q14
1507bc3d5698SJohn Baldwin	vst1.8	{q2}, [r1]!
1508bc3d5698SJohn Baldwin
1509bc3d5698SJohn Baldwin.Lctr_enc_done:
1510bc3d5698SJohn Baldwin	vmov.i32	q0, #0
1511bc3d5698SJohn Baldwin	vmov.i32	q1, #0
1512bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1513bc3d5698SJohn Baldwin.Lctr_enc_bzero:	@ wipe key schedule [if any]
1514bc3d5698SJohn Baldwin	vstmia	sp!, {q0,q1}
1515bc3d5698SJohn Baldwin	cmp	sp, r9
1516bc3d5698SJohn Baldwin	bne	.Lctr_enc_bzero
1517bc3d5698SJohn Baldwin#else
1518bc3d5698SJohn Baldwin	vstmia	sp, {q0,q1}
1519bc3d5698SJohn Baldwin#endif
1520bc3d5698SJohn Baldwin
1521bc3d5698SJohn Baldwin	mov	sp, r9
1522bc3d5698SJohn Baldwin	add	sp, #0x10		@ add sp,r9,#0x10 is no good for thumb
1523bc3d5698SJohn Baldwin	VFP_ABI_POP
1524bc3d5698SJohn Baldwin	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
1525bc3d5698SJohn Baldwin
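@ Short input (fewer than 8 blocks): not worth bit-slicing, so each
@ block is handled with the table-based AES_encrypt.  The counter
@ block is kept at sp+0x10; its low 32-bit word is maintained in r8
@ and stored back in big-endian form before the next iteration.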
1526bc3d5698SJohn Baldwin.align	4
1527bc3d5698SJohn Baldwin.Lctr_enc_short:
1528bc3d5698SJohn Baldwin	ldr	ip, [sp]		@ ctr pointer is passed on stack
1529bc3d5698SJohn Baldwin	stmdb	sp!, {r4,r5,r6,r7,r8, lr}
1530bc3d5698SJohn Baldwin
1531bc3d5698SJohn Baldwin	mov	r4, r0		@ copy arguments
1532bc3d5698SJohn Baldwin	mov	r5, r1
1533bc3d5698SJohn Baldwin	mov	r6, r2
1534bc3d5698SJohn Baldwin	mov	r7, r3
1535bc3d5698SJohn Baldwin	ldr	r8, [ip, #12]		@ load counter .LSW
1536bc3d5698SJohn Baldwin	vld1.8	{q1}, [ip]		@ load whole counter value
1537bc3d5698SJohn Baldwin#ifdef __ARMEL__
1538bc3d5698SJohn Baldwin	rev	r8, r8
1539bc3d5698SJohn Baldwin#endif
1540bc3d5698SJohn Baldwin	sub	sp, sp, #0x10
1541bc3d5698SJohn Baldwin	vst1.8	{q1}, [sp]		@ copy counter value
1542bc3d5698SJohn Baldwin	sub	sp, sp, #0x10
1543bc3d5698SJohn Baldwin
1544bc3d5698SJohn Baldwin.Lctr_enc_short_loop:
1545bc3d5698SJohn Baldwin	add	r0, sp, #0x10		@ input counter value
1546bc3d5698SJohn Baldwin	mov	r1, sp			@ output on the stack
1547bc3d5698SJohn Baldwin	mov	r2, r7			@ key
1548bc3d5698SJohn Baldwin
1549bc3d5698SJohn Baldwin	bl	AES_encrypt
1550bc3d5698SJohn Baldwin
1551bc3d5698SJohn Baldwin	vld1.8	{q0}, [r4]!	@ load input
1552bc3d5698SJohn Baldwin	vld1.8	{q1}, [sp]		@ load encrypted counter
1553bc3d5698SJohn Baldwin	add	r8, r8, #1
1554bc3d5698SJohn Baldwin#ifdef __ARMEL__
1555bc3d5698SJohn Baldwin	rev	r0, r8
1556bc3d5698SJohn Baldwin	str	r0, [sp, #0x1c]		@ next counter value
1557bc3d5698SJohn Baldwin#else
1558bc3d5698SJohn Baldwin	str	r8, [sp, #0x1c]		@ next counter value
1559bc3d5698SJohn Baldwin#endif
1560bc3d5698SJohn Baldwin	veor	q0,q0,q1
1561bc3d5698SJohn Baldwin	vst1.8	{q0}, [r5]!	@ store output
1562bc3d5698SJohn Baldwin	subs	r6, r6, #1
1563bc3d5698SJohn Baldwin	bne	.Lctr_enc_short_loop
1564bc3d5698SJohn Baldwin
1565bc3d5698SJohn Baldwin	vmov.i32	q0, #0
1566bc3d5698SJohn Baldwin	vmov.i32	q1, #0
1567bc3d5698SJohn Baldwin	vstmia	sp!, {q0,q1}
1568bc3d5698SJohn Baldwin
1569bc3d5698SJohn Baldwin	ldmia	sp!, {r4,r5,r6,r7,r8, pc}
1570*c0855eaaSJohn Baldwin.size	ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
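@ XTS-mode encryption.  Arguments as used below: r0 input, r1 output,
@ r2 length in bytes, r3 key1 (data key); on the stack follow key2
@ (tweak key) and a pointer to the 16-byte iv, which is encrypted with
@ key2 to form the initial tweak.  With XTS_CHAIN_TWEAK the caller
@ instead passes a pointer to the running tweak, written back on exit.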
1571*c0855eaaSJohn Baldwin.globl	ossl_bsaes_xts_encrypt
1572*c0855eaaSJohn Baldwin.type	ossl_bsaes_xts_encrypt,%function
1573bc3d5698SJohn Baldwin.align	4
1574*c0855eaaSJohn Baldwinossl_bsaes_xts_encrypt:
1575bc3d5698SJohn Baldwin	mov	ip, sp
1576bc3d5698SJohn Baldwin	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}		@ 0x20
1577bc3d5698SJohn Baldwin	VFP_ABI_PUSH
1578bc3d5698SJohn Baldwin	mov	r6, sp				@ future r3
1579bc3d5698SJohn Baldwin
1580bc3d5698SJohn Baldwin	mov	r7, r0
1581bc3d5698SJohn Baldwin	mov	r8, r1
1582bc3d5698SJohn Baldwin	mov	r9, r2
1583bc3d5698SJohn Baldwin	mov	r10, r3
1584bc3d5698SJohn Baldwin
1585bc3d5698SJohn Baldwin	sub	r0, sp, #0x10			@ 0x10
1586bc3d5698SJohn Baldwin	bic	r0, #0xf			@ align at 16 bytes
1587bc3d5698SJohn Baldwin	mov	sp, r0
1588bc3d5698SJohn Baldwin
1589bc3d5698SJohn Baldwin#ifdef	XTS_CHAIN_TWEAK
1590bc3d5698SJohn Baldwin	ldr	r0, [ip]			@ pointer to input tweak
1591bc3d5698SJohn Baldwin#else
1592bc3d5698SJohn Baldwin	@ generate initial tweak
1593bc3d5698SJohn Baldwin	ldr	r0, [ip, #4]			@ iv[]
1594bc3d5698SJohn Baldwin	mov	r1, sp
1595bc3d5698SJohn Baldwin	ldr	r2, [ip, #0]			@ key2
1596bc3d5698SJohn Baldwin	bl	AES_encrypt
1597bc3d5698SJohn Baldwin	mov	r0,sp				@ pointer to initial tweak
1598bc3d5698SJohn Baldwin#endif
1599bc3d5698SJohn Baldwin
1600bc3d5698SJohn Baldwin	ldr	r1, [r10, #240]		@ get # of rounds
1601bc3d5698SJohn Baldwin	mov	r3, r6
1602bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1603bc3d5698SJohn Baldwin	@ allocate the key schedule on the stack
1604bc3d5698SJohn Baldwin	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
1605bc3d5698SJohn Baldwin	@ add	r12, #96			@ size of bit-sliced key schedule
1606bc3d5698SJohn Baldwin	sub	r12, #48			@ place for tweak[9]
1607bc3d5698SJohn Baldwin
1608bc3d5698SJohn Baldwin	@ populate the key schedule
1609bc3d5698SJohn Baldwin	mov	r4, r10			@ pass key
1610bc3d5698SJohn Baldwin	mov	r5, r1			@ pass # of rounds
1611bc3d5698SJohn Baldwin	mov	sp, r12
1612bc3d5698SJohn Baldwin	add	r12, #0x90			@ pass key schedule
1613bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
1614bc3d5698SJohn Baldwin	veor	q7, q7, q15	@ fix up last round key
1615bc3d5698SJohn Baldwin	vstmia	r12, {q7}			@ save last round key
1616bc3d5698SJohn Baldwin#else
1617bc3d5698SJohn Baldwin	ldr	r12, [r10, #244]
1618bc3d5698SJohn Baldwin	eors	r12, #1
1619bc3d5698SJohn Baldwin	beq	0f
1620bc3d5698SJohn Baldwin
1621bc3d5698SJohn Baldwin	str	r12, [r10, #244]
1622bc3d5698SJohn Baldwin	mov	r4, r10			@ pass key
1623bc3d5698SJohn Baldwin	mov	r5, r1			@ pass # of rounds
1624bc3d5698SJohn Baldwin	add	r12, r10, #248			@ pass key schedule
1625bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
1626bc3d5698SJohn Baldwin	veor	q7, q7, q15	@ fix up last round key
1627bc3d5698SJohn Baldwin	vstmia	r12, {q7}
1628bc3d5698SJohn Baldwin
1629bc3d5698SJohn Baldwin.align	2
1630454c425dSMark Johnston0:	sub	sp, #0x90			@ place for tweak[9]
1631bc3d5698SJohn Baldwin#endif
1632bc3d5698SJohn Baldwin
1633bc3d5698SJohn Baldwin	vld1.8	{q8}, [r0]			@ initial tweak
1634bc3d5698SJohn Baldwin	adr	r2, .Lxts_magic
1635bc3d5698SJohn Baldwin
1636bc3d5698SJohn Baldwin	subs	r9, #0x80
1637bc3d5698SJohn Baldwin	blo	.Lxts_enc_short
1638bc3d5698SJohn Baldwin	b	.Lxts_enc_loop
1639bc3d5698SJohn Baldwin
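@ Each tweak is multiplied by x in GF(2^128) on the fly: vshr.s64
@ broadcasts the top bit of each 64-bit lane into a mask, vand picks
@ the per-lane carry value from .Lxts_magic ({1, 0x87}), vadd.u64
@ doubles both lanes, and vswp+veor feed the carry out of the low lane
@ into bit 0 of the high lane while folding the carry out of bit 127
@ back in as 0x87 (x^128 = x^7+x^2+x+1).  Eight consecutive tweaks are
@ stacked at sp before the eight blocks are encrypted.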
1640bc3d5698SJohn Baldwin.align	4
1641bc3d5698SJohn Baldwin.Lxts_enc_loop:
1642bc3d5698SJohn Baldwin	vldmia	r2, {q5}	@ load XTS magic
1643bc3d5698SJohn Baldwin	vshr.s64	q6, q8, #63
1644bc3d5698SJohn Baldwin	mov	r0, sp
1645bc3d5698SJohn Baldwin	vand	q6, q6, q5
1646bc3d5698SJohn Baldwin	vadd.u64	q9, q8, q8
1647bc3d5698SJohn Baldwin	vst1.64	{q8}, [r0,:128]!
1648bc3d5698SJohn Baldwin	vswp	d13,d12
1649bc3d5698SJohn Baldwin	vshr.s64	q7, q9, #63
1650bc3d5698SJohn Baldwin	veor	q9, q9, q6
1651bc3d5698SJohn Baldwin	vand	q7, q7, q5
1652bc3d5698SJohn Baldwin	vadd.u64	q10, q9, q9
1653bc3d5698SJohn Baldwin	vst1.64	{q9}, [r0,:128]!
1654bc3d5698SJohn Baldwin	vswp	d15,d14
1655bc3d5698SJohn Baldwin	vshr.s64	q6, q10, #63
1656bc3d5698SJohn Baldwin	veor	q10, q10, q7
1657bc3d5698SJohn Baldwin	vand	q6, q6, q5
1658bc3d5698SJohn Baldwin	vld1.8	{q0}, [r7]!
1659bc3d5698SJohn Baldwin	vadd.u64	q11, q10, q10
1660bc3d5698SJohn Baldwin	vst1.64	{q10}, [r0,:128]!
1661bc3d5698SJohn Baldwin	vswp	d13,d12
1662bc3d5698SJohn Baldwin	vshr.s64	q7, q11, #63
1663bc3d5698SJohn Baldwin	veor	q11, q11, q6
1664bc3d5698SJohn Baldwin	vand	q7, q7, q5
1665bc3d5698SJohn Baldwin	vld1.8	{q1}, [r7]!
1666bc3d5698SJohn Baldwin	veor	q0, q0, q8
1667bc3d5698SJohn Baldwin	vadd.u64	q12, q11, q11
1668bc3d5698SJohn Baldwin	vst1.64	{q11}, [r0,:128]!
1669bc3d5698SJohn Baldwin	vswp	d15,d14
1670bc3d5698SJohn Baldwin	vshr.s64	q6, q12, #63
1671bc3d5698SJohn Baldwin	veor	q12, q12, q7
1672bc3d5698SJohn Baldwin	vand	q6, q6, q5
1673bc3d5698SJohn Baldwin	vld1.8	{q2}, [r7]!
1674bc3d5698SJohn Baldwin	veor	q1, q1, q9
1675bc3d5698SJohn Baldwin	vadd.u64	q13, q12, q12
1676bc3d5698SJohn Baldwin	vst1.64	{q12}, [r0,:128]!
1677bc3d5698SJohn Baldwin	vswp	d13,d12
1678bc3d5698SJohn Baldwin	vshr.s64	q7, q13, #63
1679bc3d5698SJohn Baldwin	veor	q13, q13, q6
1680bc3d5698SJohn Baldwin	vand	q7, q7, q5
1681bc3d5698SJohn Baldwin	vld1.8	{q3}, [r7]!
1682bc3d5698SJohn Baldwin	veor	q2, q2, q10
1683bc3d5698SJohn Baldwin	vadd.u64	q14, q13, q13
1684bc3d5698SJohn Baldwin	vst1.64	{q13}, [r0,:128]!
1685bc3d5698SJohn Baldwin	vswp	d15,d14
1686bc3d5698SJohn Baldwin	vshr.s64	q6, q14, #63
1687bc3d5698SJohn Baldwin	veor	q14, q14, q7
1688bc3d5698SJohn Baldwin	vand	q6, q6, q5
1689bc3d5698SJohn Baldwin	vld1.8	{q4}, [r7]!
1690bc3d5698SJohn Baldwin	veor	q3, q3, q11
1691bc3d5698SJohn Baldwin	vadd.u64	q15, q14, q14
1692bc3d5698SJohn Baldwin	vst1.64	{q14}, [r0,:128]!
1693bc3d5698SJohn Baldwin	vswp	d13,d12
1694bc3d5698SJohn Baldwin	vshr.s64	q7, q15, #63
1695bc3d5698SJohn Baldwin	veor	q15, q15, q6
1696bc3d5698SJohn Baldwin	vand	q7, q7, q5
1697bc3d5698SJohn Baldwin	vld1.8	{q5}, [r7]!
1698bc3d5698SJohn Baldwin	veor	q4, q4, q12
1699bc3d5698SJohn Baldwin	vadd.u64	q8, q15, q15
1700bc3d5698SJohn Baldwin	vst1.64	{q15}, [r0,:128]!
1701bc3d5698SJohn Baldwin	vswp	d15,d14
1702bc3d5698SJohn Baldwin	veor	q8, q8, q7
1703bc3d5698SJohn Baldwin	vst1.64	{q8}, [r0,:128]		@ next round tweak
1704bc3d5698SJohn Baldwin
1705bc3d5698SJohn Baldwin	vld1.8	{q6,q7}, [r7]!
1706bc3d5698SJohn Baldwin	veor	q5, q5, q13
1707bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1708bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
1709bc3d5698SJohn Baldwin#else
1710bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
1711bc3d5698SJohn Baldwin#endif
1712bc3d5698SJohn Baldwin	veor	q6, q6, q14
1713bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
1714bc3d5698SJohn Baldwin	veor	q7, q7, q15
1715bc3d5698SJohn Baldwin	mov	r0, sp
1716bc3d5698SJohn Baldwin
1717bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8
1718bc3d5698SJohn Baldwin
1719bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
1720bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
1721bc3d5698SJohn Baldwin	veor	q0, q0, q8
1722bc3d5698SJohn Baldwin	vld1.64	{q12,q13}, [r0,:128]!
1723bc3d5698SJohn Baldwin	veor	q1, q1, q9
1724bc3d5698SJohn Baldwin	veor	q8, q4, q10
1725bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
1726bc3d5698SJohn Baldwin	veor	q9, q6, q11
1727bc3d5698SJohn Baldwin	vld1.64	{q14,q15}, [r0,:128]!
1728bc3d5698SJohn Baldwin	veor	q10, q3, q12
1729bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
1730bc3d5698SJohn Baldwin	veor	q11, q7, q13
1731bc3d5698SJohn Baldwin	veor	q12, q2, q14
1732bc3d5698SJohn Baldwin	vst1.8	{q10,q11}, [r8]!
1733bc3d5698SJohn Baldwin	veor	q13, q5, q15
1734bc3d5698SJohn Baldwin	vst1.8	{q12,q13}, [r8]!
1735bc3d5698SJohn Baldwin
1736bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
1737bc3d5698SJohn Baldwin
1738bc3d5698SJohn Baldwin	subs	r9, #0x80
1739bc3d5698SJohn Baldwin	bpl	.Lxts_enc_loop
1740bc3d5698SJohn Baldwin
1741bc3d5698SJohn Baldwin.Lxts_enc_short:
1742bc3d5698SJohn Baldwin	adds	r9, #0x70
1743bc3d5698SJohn Baldwin	bmi	.Lxts_enc_done
1744bc3d5698SJohn Baldwin
1745bc3d5698SJohn Baldwin	vldmia	r2, {q5}	@ load XTS magic
1746bc3d5698SJohn Baldwin	vshr.s64	q7, q8, #63
1747bc3d5698SJohn Baldwin	mov	r0, sp
1748bc3d5698SJohn Baldwin	vand	q7, q7, q5
1749bc3d5698SJohn Baldwin	vadd.u64	q9, q8, q8
1750bc3d5698SJohn Baldwin	vst1.64	{q8}, [r0,:128]!
1751bc3d5698SJohn Baldwin	vswp	d15,d14
1752bc3d5698SJohn Baldwin	vshr.s64	q6, q9, #63
1753bc3d5698SJohn Baldwin	veor	q9, q9, q7
1754bc3d5698SJohn Baldwin	vand	q6, q6, q5
1755bc3d5698SJohn Baldwin	vadd.u64	q10, q9, q9
1756bc3d5698SJohn Baldwin	vst1.64	{q9}, [r0,:128]!
1757bc3d5698SJohn Baldwin	vswp	d13,d12
1758bc3d5698SJohn Baldwin	vshr.s64	q7, q10, #63
1759bc3d5698SJohn Baldwin	veor	q10, q10, q6
1760bc3d5698SJohn Baldwin	vand	q7, q7, q5
1761bc3d5698SJohn Baldwin	vld1.8	{q0}, [r7]!
1762bc3d5698SJohn Baldwin	subs	r9, #0x10
1763bc3d5698SJohn Baldwin	bmi	.Lxts_enc_1
1764bc3d5698SJohn Baldwin	vadd.u64	q11, q10, q10
1765bc3d5698SJohn Baldwin	vst1.64	{q10}, [r0,:128]!
1766bc3d5698SJohn Baldwin	vswp	d15,d14
1767bc3d5698SJohn Baldwin	vshr.s64	q6, q11, #63
1768bc3d5698SJohn Baldwin	veor	q11, q11, q7
1769bc3d5698SJohn Baldwin	vand	q6, q6, q5
1770bc3d5698SJohn Baldwin	vld1.8	{q1}, [r7]!
1771bc3d5698SJohn Baldwin	subs	r9, #0x10
1772bc3d5698SJohn Baldwin	bmi	.Lxts_enc_2
1773bc3d5698SJohn Baldwin	veor	q0, q0, q8
1774bc3d5698SJohn Baldwin	vadd.u64	q12, q11, q11
1775bc3d5698SJohn Baldwin	vst1.64	{q11}, [r0,:128]!
1776bc3d5698SJohn Baldwin	vswp	d13,d12
1777bc3d5698SJohn Baldwin	vshr.s64	q7, q12, #63
1778bc3d5698SJohn Baldwin	veor	q12, q12, q6
1779bc3d5698SJohn Baldwin	vand	q7, q7, q5
1780bc3d5698SJohn Baldwin	vld1.8	{q2}, [r7]!
1781bc3d5698SJohn Baldwin	subs	r9, #0x10
1782bc3d5698SJohn Baldwin	bmi	.Lxts_enc_3
1783bc3d5698SJohn Baldwin	veor	q1, q1, q9
1784bc3d5698SJohn Baldwin	vadd.u64	q13, q12, q12
1785bc3d5698SJohn Baldwin	vst1.64	{q12}, [r0,:128]!
1786bc3d5698SJohn Baldwin	vswp	d15,d14
1787bc3d5698SJohn Baldwin	vshr.s64	q6, q13, #63
1788bc3d5698SJohn Baldwin	veor	q13, q13, q7
1789bc3d5698SJohn Baldwin	vand	q6, q6, q5
1790bc3d5698SJohn Baldwin	vld1.8	{q3}, [r7]!
1791bc3d5698SJohn Baldwin	subs	r9, #0x10
1792bc3d5698SJohn Baldwin	bmi	.Lxts_enc_4
1793bc3d5698SJohn Baldwin	veor	q2, q2, q10
1794bc3d5698SJohn Baldwin	vadd.u64	q14, q13, q13
1795bc3d5698SJohn Baldwin	vst1.64	{q13}, [r0,:128]!
1796bc3d5698SJohn Baldwin	vswp	d13,d12
1797bc3d5698SJohn Baldwin	vshr.s64	q7, q14, #63
1798bc3d5698SJohn Baldwin	veor	q14, q14, q6
1799bc3d5698SJohn Baldwin	vand	q7, q7, q5
1800bc3d5698SJohn Baldwin	vld1.8	{q4}, [r7]!
1801bc3d5698SJohn Baldwin	subs	r9, #0x10
1802bc3d5698SJohn Baldwin	bmi	.Lxts_enc_5
1803bc3d5698SJohn Baldwin	veor	q3, q3, q11
1804bc3d5698SJohn Baldwin	vadd.u64	q15, q14, q14
1805bc3d5698SJohn Baldwin	vst1.64	{q14}, [r0,:128]!
1806bc3d5698SJohn Baldwin	vswp	d15,d14
1807bc3d5698SJohn Baldwin	vshr.s64	q6, q15, #63
1808bc3d5698SJohn Baldwin	veor	q15, q15, q7
1809bc3d5698SJohn Baldwin	vand	q6, q6, q5
1810bc3d5698SJohn Baldwin	vld1.8	{q5}, [r7]!
1811bc3d5698SJohn Baldwin	subs	r9, #0x10
1812bc3d5698SJohn Baldwin	bmi	.Lxts_enc_6
1813bc3d5698SJohn Baldwin	veor	q4, q4, q12
1814bc3d5698SJohn Baldwin	sub	r9, #0x10
1815bc3d5698SJohn Baldwin	vst1.64	{q15}, [r0,:128]		@ next round tweak
1816bc3d5698SJohn Baldwin
1817bc3d5698SJohn Baldwin	vld1.8	{q6}, [r7]!
1818bc3d5698SJohn Baldwin	veor	q5, q5, q13
1819bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1820bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
1821bc3d5698SJohn Baldwin#else
1822bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
1823bc3d5698SJohn Baldwin#endif
1824bc3d5698SJohn Baldwin	veor	q6, q6, q14
1825bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
1826bc3d5698SJohn Baldwin	mov	r0, sp
1827bc3d5698SJohn Baldwin
1828bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8
1829bc3d5698SJohn Baldwin
1830bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
1831bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
1832bc3d5698SJohn Baldwin	veor	q0, q0, q8
1833bc3d5698SJohn Baldwin	vld1.64	{q12,q13}, [r0,:128]!
1834bc3d5698SJohn Baldwin	veor	q1, q1, q9
1835bc3d5698SJohn Baldwin	veor	q8, q4, q10
1836bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
1837bc3d5698SJohn Baldwin	veor	q9, q6, q11
1838bc3d5698SJohn Baldwin	vld1.64	{q14}, [r0,:128]!
1839bc3d5698SJohn Baldwin	veor	q10, q3, q12
1840bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
1841bc3d5698SJohn Baldwin	veor	q11, q7, q13
1842bc3d5698SJohn Baldwin	veor	q12, q2, q14
1843bc3d5698SJohn Baldwin	vst1.8	{q10,q11}, [r8]!
1844bc3d5698SJohn Baldwin	vst1.8	{q12}, [r8]!
1845bc3d5698SJohn Baldwin
1846bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
1847bc3d5698SJohn Baldwin	b	.Lxts_enc_done
1848bc3d5698SJohn Baldwin.align	4
1849bc3d5698SJohn Baldwin.Lxts_enc_6:
1850bc3d5698SJohn Baldwin	veor	q4, q4, q12
1851bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1852bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
1853bc3d5698SJohn Baldwin#else
1854bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
1855bc3d5698SJohn Baldwin#endif
1856bc3d5698SJohn Baldwin	veor	q5, q5, q13
1857bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
1858bc3d5698SJohn Baldwin	mov	r0, sp
1859bc3d5698SJohn Baldwin
1860bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8
1861bc3d5698SJohn Baldwin
1862bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
1863bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
1864bc3d5698SJohn Baldwin	veor	q0, q0, q8
1865bc3d5698SJohn Baldwin	vld1.64	{q12,q13}, [r0,:128]!
1866bc3d5698SJohn Baldwin	veor	q1, q1, q9
1867bc3d5698SJohn Baldwin	veor	q8, q4, q10
1868bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
1869bc3d5698SJohn Baldwin	veor	q9, q6, q11
1870bc3d5698SJohn Baldwin	veor	q10, q3, q12
1871bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
1872bc3d5698SJohn Baldwin	veor	q11, q7, q13
1873bc3d5698SJohn Baldwin	vst1.8	{q10,q11}, [r8]!
1874bc3d5698SJohn Baldwin
1875bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
1876bc3d5698SJohn Baldwin	b	.Lxts_enc_done
1877bc3d5698SJohn Baldwin
1878bc3d5698SJohn Baldwin@ put this in range for both ARM and Thumb mode adr instructions
1879bc3d5698SJohn Baldwin.align	5
1880bc3d5698SJohn Baldwin.Lxts_magic:
1881bc3d5698SJohn Baldwin.quad	1, 0x87
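@ The pair {1, 0x87} above supplies the per-64-bit-lane carry-in
@ values for the tweak-doubling sequences: 1 propagates the carry from
@ the low lane into the high lane, 0x87 reduces the carry out of
@ bit 127 modulo x^128 + x^7 + x^2 + x + 1.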
1882bc3d5698SJohn Baldwin
1883bc3d5698SJohn Baldwin.align	5
1884bc3d5698SJohn Baldwin.Lxts_enc_5:
1885bc3d5698SJohn Baldwin	veor	q3, q3, q11
1886bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1887bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
1888bc3d5698SJohn Baldwin#else
1889bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
1890bc3d5698SJohn Baldwin#endif
1891bc3d5698SJohn Baldwin	veor	q4, q4, q12
1892bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
1893bc3d5698SJohn Baldwin	mov	r0, sp
1894bc3d5698SJohn Baldwin
1895bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8
1896bc3d5698SJohn Baldwin
1897bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
1898bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
1899bc3d5698SJohn Baldwin	veor	q0, q0, q8
1900bc3d5698SJohn Baldwin	vld1.64	{q12}, [r0,:128]!
1901bc3d5698SJohn Baldwin	veor	q1, q1, q9
1902bc3d5698SJohn Baldwin	veor	q8, q4, q10
1903bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
1904bc3d5698SJohn Baldwin	veor	q9, q6, q11
1905bc3d5698SJohn Baldwin	veor	q10, q3, q12
1906bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
1907bc3d5698SJohn Baldwin	vst1.8	{q10}, [r8]!
1908bc3d5698SJohn Baldwin
1909bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
1910bc3d5698SJohn Baldwin	b	.Lxts_enc_done
1911bc3d5698SJohn Baldwin.align	4
1912bc3d5698SJohn Baldwin.Lxts_enc_4:
1913bc3d5698SJohn Baldwin	veor	q2, q2, q10
1914bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1915bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
1916bc3d5698SJohn Baldwin#else
1917bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
1918bc3d5698SJohn Baldwin#endif
1919bc3d5698SJohn Baldwin	veor	q3, q3, q11
1920bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
1921bc3d5698SJohn Baldwin	mov	r0, sp
1922bc3d5698SJohn Baldwin
1923bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8
1924bc3d5698SJohn Baldwin
1925bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
1926bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
1927bc3d5698SJohn Baldwin	veor	q0, q0, q8
1928bc3d5698SJohn Baldwin	veor	q1, q1, q9
1929bc3d5698SJohn Baldwin	veor	q8, q4, q10
1930bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
1931bc3d5698SJohn Baldwin	veor	q9, q6, q11
1932bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
1933bc3d5698SJohn Baldwin
1934bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
1935bc3d5698SJohn Baldwin	b	.Lxts_enc_done
1936bc3d5698SJohn Baldwin.align	4
1937bc3d5698SJohn Baldwin.Lxts_enc_3:
1938bc3d5698SJohn Baldwin	veor	q1, q1, q9
1939bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1940bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
1941bc3d5698SJohn Baldwin#else
1942bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
1943bc3d5698SJohn Baldwin#endif
1944bc3d5698SJohn Baldwin	veor	q2, q2, q10
1945bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
1946bc3d5698SJohn Baldwin	mov	r0, sp
1947bc3d5698SJohn Baldwin
1948bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8
1949bc3d5698SJohn Baldwin
1950bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
1951bc3d5698SJohn Baldwin	vld1.64	{q10}, [r0,:128]!
1952bc3d5698SJohn Baldwin	veor	q0, q0, q8
1953bc3d5698SJohn Baldwin	veor	q1, q1, q9
1954bc3d5698SJohn Baldwin	veor	q8, q4, q10
1955bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
1956bc3d5698SJohn Baldwin	vst1.8	{q8}, [r8]!
1957bc3d5698SJohn Baldwin
1958bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
1959bc3d5698SJohn Baldwin	b	.Lxts_enc_done
1960bc3d5698SJohn Baldwin.align	4
1961bc3d5698SJohn Baldwin.Lxts_enc_2:
1962bc3d5698SJohn Baldwin	veor	q0, q0, q8
1963bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
1964bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
1965bc3d5698SJohn Baldwin#else
1966bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
1967bc3d5698SJohn Baldwin#endif
1968bc3d5698SJohn Baldwin	veor	q1, q1, q9
1969bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
1970bc3d5698SJohn Baldwin	mov	r0, sp
1971bc3d5698SJohn Baldwin
1972bc3d5698SJohn Baldwin	bl	_bsaes_encrypt8
1973bc3d5698SJohn Baldwin
1974bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
1975bc3d5698SJohn Baldwin	veor	q0, q0, q8
1976bc3d5698SJohn Baldwin	veor	q1, q1, q9
1977bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
1978bc3d5698SJohn Baldwin
1979bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
1980bc3d5698SJohn Baldwin	b	.Lxts_enc_done
1981bc3d5698SJohn Baldwin.align	4
1982bc3d5698SJohn Baldwin.Lxts_enc_1:
1983bc3d5698SJohn Baldwin	mov	r0, sp
1984bc3d5698SJohn Baldwin	veor	q0, q0, q8
1985bc3d5698SJohn Baldwin	mov	r1, sp
1986bc3d5698SJohn Baldwin	vst1.8	{q0}, [sp,:128]
1987bc3d5698SJohn Baldwin	mov	r2, r10
1988bc3d5698SJohn Baldwin	mov	r4, r3				@ preserve fp
1989bc3d5698SJohn Baldwin
1990bc3d5698SJohn Baldwin	bl	AES_encrypt
1991bc3d5698SJohn Baldwin
1992bc3d5698SJohn Baldwin	vld1.8	{q0}, [sp,:128]
1993bc3d5698SJohn Baldwin	veor	q0, q0, q8
1994bc3d5698SJohn Baldwin	vst1.8	{q0}, [r8]!
1995bc3d5698SJohn Baldwin	mov	r3, r4
1996bc3d5698SJohn Baldwin
1997bc3d5698SJohn Baldwin	vmov	q8, q9		@ next round tweak
1998bc3d5698SJohn Baldwin
1999bc3d5698SJohn Baldwin.Lxts_enc_done:
2000bc3d5698SJohn Baldwin#ifndef	XTS_CHAIN_TWEAK
2001bc3d5698SJohn Baldwin	adds	r9, #0x10
2002bc3d5698SJohn Baldwin	beq	.Lxts_enc_ret
2003bc3d5698SJohn Baldwin	sub	r6, r8, #0x10
2004bc3d5698SJohn Baldwin
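@ Ciphertext stealing: r9 holds the 1..15 trailing bytes.  Each pass
@ moves one more plaintext byte into the last complete ciphertext
@ block and shifts the displaced ciphertext byte down into the short
@ final block; the patched block is then re-encrypted in place with
@ the final tweak q8.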
2005bc3d5698SJohn Baldwin.Lxts_enc_steal:
2006bc3d5698SJohn Baldwin	ldrb	r0, [r7], #1
2007bc3d5698SJohn Baldwin	ldrb	r1, [r8, #-0x10]
2008bc3d5698SJohn Baldwin	strb	r0, [r8, #-0x10]
2009bc3d5698SJohn Baldwin	strb	r1, [r8], #1
2010bc3d5698SJohn Baldwin
2011bc3d5698SJohn Baldwin	subs	r9, #1
2012bc3d5698SJohn Baldwin	bhi	.Lxts_enc_steal
2013bc3d5698SJohn Baldwin
2014bc3d5698SJohn Baldwin	vld1.8	{q0}, [r6]
2015bc3d5698SJohn Baldwin	mov	r0, sp
2016bc3d5698SJohn Baldwin	veor	q0, q0, q8
2017bc3d5698SJohn Baldwin	mov	r1, sp
2018bc3d5698SJohn Baldwin	vst1.8	{q0}, [sp,:128]
2019bc3d5698SJohn Baldwin	mov	r2, r10
2020bc3d5698SJohn Baldwin	mov	r4, r3			@ preserve fp
2021bc3d5698SJohn Baldwin
2022bc3d5698SJohn Baldwin	bl	AES_encrypt
2023bc3d5698SJohn Baldwin
2024bc3d5698SJohn Baldwin	vld1.8	{q0}, [sp,:128]
2025bc3d5698SJohn Baldwin	veor	q0, q0, q8
2026bc3d5698SJohn Baldwin	vst1.8	{q0}, [r6]
2027bc3d5698SJohn Baldwin	mov	r3, r4
2028bc3d5698SJohn Baldwin#endif
2029bc3d5698SJohn Baldwin
2030bc3d5698SJohn Baldwin.Lxts_enc_ret:
2031bc3d5698SJohn Baldwin	bic	r0, r3, #0xf
2032bc3d5698SJohn Baldwin	vmov.i32	q0, #0
2033bc3d5698SJohn Baldwin	vmov.i32	q1, #0
2034bc3d5698SJohn Baldwin#ifdef	XTS_CHAIN_TWEAK
2035bc3d5698SJohn Baldwin	ldr	r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
2036bc3d5698SJohn Baldwin#endif
2037bc3d5698SJohn Baldwin.Lxts_enc_bzero:	@ wipe key schedule [if any]
2038bc3d5698SJohn Baldwin	vstmia	sp!, {q0,q1}
2039bc3d5698SJohn Baldwin	cmp	sp, r0
2040bc3d5698SJohn Baldwin	bne	.Lxts_enc_bzero
2041bc3d5698SJohn Baldwin
2042bc3d5698SJohn Baldwin	mov	sp, r3
2043bc3d5698SJohn Baldwin#ifdef	XTS_CHAIN_TWEAK
2044bc3d5698SJohn Baldwin	vst1.8	{q8}, [r1]
2045bc3d5698SJohn Baldwin#endif
2046bc3d5698SJohn Baldwin	VFP_ABI_POP
2047bc3d5698SJohn Baldwin	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
2048bc3d5698SJohn Baldwin
2049*c0855eaaSJohn Baldwin.size	ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
2050bc3d5698SJohn Baldwin
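@ XTS-mode decryption; same argument layout as ossl_bsaes_xts_encrypt.
@ When the length is not a multiple of 16, one extra block is held
@ back from the bulk loop (see the subne below) so that the last full
@ block and the stolen tail can be processed with the two final tweaks
@ in the order ciphertext stealing requires.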
2051*c0855eaaSJohn Baldwin.globl	ossl_bsaes_xts_decrypt
2052*c0855eaaSJohn Baldwin.type	ossl_bsaes_xts_decrypt,%function
2053bc3d5698SJohn Baldwin.align	4
2054*c0855eaaSJohn Baldwinossl_bsaes_xts_decrypt:
2055bc3d5698SJohn Baldwin	mov	ip, sp
2056bc3d5698SJohn Baldwin	stmdb	sp!, {r4,r5,r6,r7,r8,r9,r10, lr}		@ 0x20
2057bc3d5698SJohn Baldwin	VFP_ABI_PUSH
2058bc3d5698SJohn Baldwin	mov	r6, sp				@ future r3
2059bc3d5698SJohn Baldwin
2060bc3d5698SJohn Baldwin	mov	r7, r0
2061bc3d5698SJohn Baldwin	mov	r8, r1
2062bc3d5698SJohn Baldwin	mov	r9, r2
2063bc3d5698SJohn Baldwin	mov	r10, r3
2064bc3d5698SJohn Baldwin
2065bc3d5698SJohn Baldwin	sub	r0, sp, #0x10			@ 0x10
2066bc3d5698SJohn Baldwin	bic	r0, #0xf			@ align at 16 bytes
2067bc3d5698SJohn Baldwin	mov	sp, r0
2068bc3d5698SJohn Baldwin
2069bc3d5698SJohn Baldwin#ifdef	XTS_CHAIN_TWEAK
2070bc3d5698SJohn Baldwin	ldr	r0, [ip]			@ pointer to input tweak
2071bc3d5698SJohn Baldwin#else
2072bc3d5698SJohn Baldwin	@ generate initial tweak
2073bc3d5698SJohn Baldwin	ldr	r0, [ip, #4]			@ iv[]
2074bc3d5698SJohn Baldwin	mov	r1, sp
2075bc3d5698SJohn Baldwin	ldr	r2, [ip, #0]			@ key2
2076bc3d5698SJohn Baldwin	bl	AES_encrypt
2077bc3d5698SJohn Baldwin	mov	r0, sp				@ pointer to initial tweak
2078bc3d5698SJohn Baldwin#endif
2079bc3d5698SJohn Baldwin
2080bc3d5698SJohn Baldwin	ldr	r1, [r10, #240]		@ get # of rounds
2081bc3d5698SJohn Baldwin	mov	r3, r6
2082bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2083bc3d5698SJohn Baldwin	@ allocate the key schedule on the stack
2084bc3d5698SJohn Baldwin	sub	r12, sp, r1, lsl#7		@ 128 bytes per inner round key
2085bc3d5698SJohn Baldwin	@ add	r12, #96			@ size of bit-sliced key schedule
2086bc3d5698SJohn Baldwin	sub	r12, #48			@ place for tweak[9]
2087bc3d5698SJohn Baldwin
2088bc3d5698SJohn Baldwin	@ populate the key schedule
2089bc3d5698SJohn Baldwin	mov	r4, r10			@ pass key
2090bc3d5698SJohn Baldwin	mov	r5, r1			@ pass # of rounds
2091bc3d5698SJohn Baldwin	mov	sp, r12
2092bc3d5698SJohn Baldwin	add	r12, #0x90			@ pass key schedule
2093bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
2094bc3d5698SJohn Baldwin	add	r4, sp, #0x90
2095bc3d5698SJohn Baldwin	vldmia	r4, {q6}
2096bc3d5698SJohn Baldwin	vstmia	r12,  {q15}		@ save last round key
2097bc3d5698SJohn Baldwin	veor	q7, q7, q6	@ fix up round 0 key
2098bc3d5698SJohn Baldwin	vstmia	r4, {q7}
2099bc3d5698SJohn Baldwin#else
2100bc3d5698SJohn Baldwin	ldr	r12, [r10, #244]
2101bc3d5698SJohn Baldwin	eors	r12, #1
2102bc3d5698SJohn Baldwin	beq	0f
2103bc3d5698SJohn Baldwin
2104bc3d5698SJohn Baldwin	str	r12, [r10, #244]
2105bc3d5698SJohn Baldwin	mov	r4, r10			@ pass key
2106bc3d5698SJohn Baldwin	mov	r5, r1			@ pass # of rounds
2107bc3d5698SJohn Baldwin	add	r12, r10, #248			@ pass key schedule
2108bc3d5698SJohn Baldwin	bl	_bsaes_key_convert
2109bc3d5698SJohn Baldwin	add	r4, r10, #248
2110bc3d5698SJohn Baldwin	vldmia	r4, {q6}
2111bc3d5698SJohn Baldwin	vstmia	r12,  {q15}		@ save last round key
2112bc3d5698SJohn Baldwin	veor	q7, q7, q6	@ fix up round 0 key
2113bc3d5698SJohn Baldwin	vstmia	r4, {q7}
2114bc3d5698SJohn Baldwin
2115bc3d5698SJohn Baldwin.align	2
2116454c425dSMark Johnston0:	sub	sp, #0x90			@ place for tweak[9]
2117bc3d5698SJohn Baldwin#endif
2118bc3d5698SJohn Baldwin	vld1.8	{q8}, [r0]			@ initial tweak
2119bc3d5698SJohn Baldwin	adr	r2, .Lxts_magic
2120bc3d5698SJohn Baldwin
2121bc3d5698SJohn Baldwin#ifndef	XTS_CHAIN_TWEAK
2122bc3d5698SJohn Baldwin	tst	r9, #0xf			@ if not multiple of 16
2123bc3d5698SJohn Baldwin	it	ne				@ Thumb2 thing, sanity check in ARM
2124bc3d5698SJohn Baldwin	subne	r9, #0x10			@ subtract another 16 bytes
2125bc3d5698SJohn Baldwin#endif
2126bc3d5698SJohn Baldwin	subs	r9, #0x80
2127bc3d5698SJohn Baldwin
2128bc3d5698SJohn Baldwin	blo	.Lxts_dec_short
2129bc3d5698SJohn Baldwin	b	.Lxts_dec_loop
2130bc3d5698SJohn Baldwin
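@ Same tweak-doubling schedule as the encrypt path; only the final
@ xors differ because _bsaes_decrypt8 returns blocks in the order
@ q0,q1,q6,q4,q2,q7,q3,q5 rather than q0,q1,q4,q6,q3,q7,q2,q5.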
2131bc3d5698SJohn Baldwin.align	4
2132bc3d5698SJohn Baldwin.Lxts_dec_loop:
2133bc3d5698SJohn Baldwin	vldmia	r2, {q5}	@ load XTS magic
2134bc3d5698SJohn Baldwin	vshr.s64	q6, q8, #63
2135bc3d5698SJohn Baldwin	mov	r0, sp
2136bc3d5698SJohn Baldwin	vand	q6, q6, q5
2137bc3d5698SJohn Baldwin	vadd.u64	q9, q8, q8
2138bc3d5698SJohn Baldwin	vst1.64	{q8}, [r0,:128]!
2139bc3d5698SJohn Baldwin	vswp	d13,d12
2140bc3d5698SJohn Baldwin	vshr.s64	q7, q9, #63
2141bc3d5698SJohn Baldwin	veor	q9, q9, q6
2142bc3d5698SJohn Baldwin	vand	q7, q7, q5
2143bc3d5698SJohn Baldwin	vadd.u64	q10, q9, q9
2144bc3d5698SJohn Baldwin	vst1.64	{q9}, [r0,:128]!
2145bc3d5698SJohn Baldwin	vswp	d15,d14
2146bc3d5698SJohn Baldwin	vshr.s64	q6, q10, #63
2147bc3d5698SJohn Baldwin	veor	q10, q10, q7
2148bc3d5698SJohn Baldwin	vand	q6, q6, q5
2149bc3d5698SJohn Baldwin	vld1.8	{q0}, [r7]!
2150bc3d5698SJohn Baldwin	vadd.u64	q11, q10, q10
2151bc3d5698SJohn Baldwin	vst1.64	{q10}, [r0,:128]!
2152bc3d5698SJohn Baldwin	vswp	d13,d12
2153bc3d5698SJohn Baldwin	vshr.s64	q7, q11, #63
2154bc3d5698SJohn Baldwin	veor	q11, q11, q6
2155bc3d5698SJohn Baldwin	vand	q7, q7, q5
2156bc3d5698SJohn Baldwin	vld1.8	{q1}, [r7]!
2157bc3d5698SJohn Baldwin	veor	q0, q0, q8
2158bc3d5698SJohn Baldwin	vadd.u64	q12, q11, q11
2159bc3d5698SJohn Baldwin	vst1.64	{q11}, [r0,:128]!
2160bc3d5698SJohn Baldwin	vswp	d15,d14
2161bc3d5698SJohn Baldwin	vshr.s64	q6, q12, #63
2162bc3d5698SJohn Baldwin	veor	q12, q12, q7
2163bc3d5698SJohn Baldwin	vand	q6, q6, q5
2164bc3d5698SJohn Baldwin	vld1.8	{q2}, [r7]!
2165bc3d5698SJohn Baldwin	veor	q1, q1, q9
2166bc3d5698SJohn Baldwin	vadd.u64	q13, q12, q12
2167bc3d5698SJohn Baldwin	vst1.64	{q12}, [r0,:128]!
2168bc3d5698SJohn Baldwin	vswp	d13,d12
2169bc3d5698SJohn Baldwin	vshr.s64	q7, q13, #63
2170bc3d5698SJohn Baldwin	veor	q13, q13, q6
2171bc3d5698SJohn Baldwin	vand	q7, q7, q5
2172bc3d5698SJohn Baldwin	vld1.8	{q3}, [r7]!
2173bc3d5698SJohn Baldwin	veor	q2, q2, q10
2174bc3d5698SJohn Baldwin	vadd.u64	q14, q13, q13
2175bc3d5698SJohn Baldwin	vst1.64	{q13}, [r0,:128]!
2176bc3d5698SJohn Baldwin	vswp	d15,d14
2177bc3d5698SJohn Baldwin	vshr.s64	q6, q14, #63
2178bc3d5698SJohn Baldwin	veor	q14, q14, q7
2179bc3d5698SJohn Baldwin	vand	q6, q6, q5
2180bc3d5698SJohn Baldwin	vld1.8	{q4}, [r7]!
2181bc3d5698SJohn Baldwin	veor	q3, q3, q11
2182bc3d5698SJohn Baldwin	vadd.u64	q15, q14, q14
2183bc3d5698SJohn Baldwin	vst1.64	{q14}, [r0,:128]!
2184bc3d5698SJohn Baldwin	vswp	d13,d12
2185bc3d5698SJohn Baldwin	vshr.s64	q7, q15, #63
2186bc3d5698SJohn Baldwin	veor	q15, q15, q6
2187bc3d5698SJohn Baldwin	vand	q7, q7, q5
2188bc3d5698SJohn Baldwin	vld1.8	{q5}, [r7]!
2189bc3d5698SJohn Baldwin	veor	q4, q4, q12
2190bc3d5698SJohn Baldwin	vadd.u64	q8, q15, q15
2191bc3d5698SJohn Baldwin	vst1.64	{q15}, [r0,:128]!
2192bc3d5698SJohn Baldwin	vswp	d15,d14
2193bc3d5698SJohn Baldwin	veor	q8, q8, q7
2194bc3d5698SJohn Baldwin	vst1.64	{q8}, [r0,:128]		@ next round tweak
2195bc3d5698SJohn Baldwin
2196bc3d5698SJohn Baldwin	vld1.8	{q6,q7}, [r7]!
2197bc3d5698SJohn Baldwin	veor	q5, q5, q13
2198bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2199bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
2200bc3d5698SJohn Baldwin#else
2201bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
2202bc3d5698SJohn Baldwin#endif
2203bc3d5698SJohn Baldwin	veor	q6, q6, q14
2204bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
2205bc3d5698SJohn Baldwin	veor	q7, q7, q15
2206bc3d5698SJohn Baldwin	mov	r0, sp
2207bc3d5698SJohn Baldwin
2208bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
2209bc3d5698SJohn Baldwin
2210bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
2211bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
2212bc3d5698SJohn Baldwin	veor	q0, q0, q8
2213bc3d5698SJohn Baldwin	vld1.64	{q12,q13}, [r0,:128]!
2214bc3d5698SJohn Baldwin	veor	q1, q1, q9
2215bc3d5698SJohn Baldwin	veor	q8, q6, q10
2216bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
2217bc3d5698SJohn Baldwin	veor	q9, q4, q11
2218bc3d5698SJohn Baldwin	vld1.64	{q14,q15}, [r0,:128]!
2219bc3d5698SJohn Baldwin	veor	q10, q2, q12
2220bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
2221bc3d5698SJohn Baldwin	veor	q11, q7, q13
2222bc3d5698SJohn Baldwin	veor	q12, q3, q14
2223bc3d5698SJohn Baldwin	vst1.8	{q10,q11}, [r8]!
2224bc3d5698SJohn Baldwin	veor	q13, q5, q15
2225bc3d5698SJohn Baldwin	vst1.8	{q12,q13}, [r8]!
2226bc3d5698SJohn Baldwin
2227bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
2228bc3d5698SJohn Baldwin
2229bc3d5698SJohn Baldwin	subs	r9, #0x80
2230bc3d5698SJohn Baldwin	bpl	.Lxts_dec_loop
2231bc3d5698SJohn Baldwin
2232bc3d5698SJohn Baldwin.Lxts_dec_short:
2233bc3d5698SJohn Baldwin	adds	r9, #0x70
2234bc3d5698SJohn Baldwin	bmi	.Lxts_dec_done
2235bc3d5698SJohn Baldwin
2236bc3d5698SJohn Baldwin	vldmia	r2, {q5}	@ load XTS magic
2237bc3d5698SJohn Baldwin	vshr.s64	q7, q8, #63
2238bc3d5698SJohn Baldwin	mov	r0, sp
2239bc3d5698SJohn Baldwin	vand	q7, q7, q5
2240bc3d5698SJohn Baldwin	vadd.u64	q9, q8, q8
2241bc3d5698SJohn Baldwin	vst1.64	{q8}, [r0,:128]!
2242bc3d5698SJohn Baldwin	vswp	d15,d14
2243bc3d5698SJohn Baldwin	vshr.s64	q6, q9, #63
2244bc3d5698SJohn Baldwin	veor	q9, q9, q7
2245bc3d5698SJohn Baldwin	vand	q6, q6, q5
2246bc3d5698SJohn Baldwin	vadd.u64	q10, q9, q9
2247bc3d5698SJohn Baldwin	vst1.64	{q9}, [r0,:128]!
2248bc3d5698SJohn Baldwin	vswp	d13,d12
2249bc3d5698SJohn Baldwin	vshr.s64	q7, q10, #63
2250bc3d5698SJohn Baldwin	veor	q10, q10, q6
2251bc3d5698SJohn Baldwin	vand	q7, q7, q5
2252bc3d5698SJohn Baldwin	vld1.8	{q0}, [r7]!
2253bc3d5698SJohn Baldwin	subs	r9, #0x10
2254bc3d5698SJohn Baldwin	bmi	.Lxts_dec_1
2255bc3d5698SJohn Baldwin	vadd.u64	q11, q10, q10
2256bc3d5698SJohn Baldwin	vst1.64	{q10}, [r0,:128]!
2257bc3d5698SJohn Baldwin	vswp	d15,d14
2258bc3d5698SJohn Baldwin	vshr.s64	q6, q11, #63
2259bc3d5698SJohn Baldwin	veor	q11, q11, q7
2260bc3d5698SJohn Baldwin	vand	q6, q6, q5
2261bc3d5698SJohn Baldwin	vld1.8	{q1}, [r7]!
2262bc3d5698SJohn Baldwin	subs	r9, #0x10
2263bc3d5698SJohn Baldwin	bmi	.Lxts_dec_2
2264bc3d5698SJohn Baldwin	veor	q0, q0, q8
2265bc3d5698SJohn Baldwin	vadd.u64	q12, q11, q11
2266bc3d5698SJohn Baldwin	vst1.64	{q11}, [r0,:128]!
2267bc3d5698SJohn Baldwin	vswp	d13,d12
2268bc3d5698SJohn Baldwin	vshr.s64	q7, q12, #63
2269bc3d5698SJohn Baldwin	veor	q12, q12, q6
2270bc3d5698SJohn Baldwin	vand	q7, q7, q5
2271bc3d5698SJohn Baldwin	vld1.8	{q2}, [r7]!
2272bc3d5698SJohn Baldwin	subs	r9, #0x10
2273bc3d5698SJohn Baldwin	bmi	.Lxts_dec_3
2274bc3d5698SJohn Baldwin	veor	q1, q1, q9
2275bc3d5698SJohn Baldwin	vadd.u64	q13, q12, q12
2276bc3d5698SJohn Baldwin	vst1.64	{q12}, [r0,:128]!
2277bc3d5698SJohn Baldwin	vswp	d15,d14
2278bc3d5698SJohn Baldwin	vshr.s64	q6, q13, #63
2279bc3d5698SJohn Baldwin	veor	q13, q13, q7
2280bc3d5698SJohn Baldwin	vand	q6, q6, q5
2281bc3d5698SJohn Baldwin	vld1.8	{q3}, [r7]!
2282bc3d5698SJohn Baldwin	subs	r9, #0x10
2283bc3d5698SJohn Baldwin	bmi	.Lxts_dec_4
2284bc3d5698SJohn Baldwin	veor	q2, q2, q10
2285bc3d5698SJohn Baldwin	vadd.u64	q14, q13, q13
2286bc3d5698SJohn Baldwin	vst1.64	{q13}, [r0,:128]!
2287bc3d5698SJohn Baldwin	vswp	d13,d12
2288bc3d5698SJohn Baldwin	vshr.s64	q7, q14, #63
2289bc3d5698SJohn Baldwin	veor	q14, q14, q6
2290bc3d5698SJohn Baldwin	vand	q7, q7, q5
2291bc3d5698SJohn Baldwin	vld1.8	{q4}, [r7]!
2292bc3d5698SJohn Baldwin	subs	r9, #0x10
2293bc3d5698SJohn Baldwin	bmi	.Lxts_dec_5
2294bc3d5698SJohn Baldwin	veor	q3, q3, q11
2295bc3d5698SJohn Baldwin	vadd.u64	q15, q14, q14
2296bc3d5698SJohn Baldwin	vst1.64	{q14}, [r0,:128]!
2297bc3d5698SJohn Baldwin	vswp	d15,d14
2298bc3d5698SJohn Baldwin	vshr.s64	q6, q15, #63
2299bc3d5698SJohn Baldwin	veor	q15, q15, q7
2300bc3d5698SJohn Baldwin	vand	q6, q6, q5
2301bc3d5698SJohn Baldwin	vld1.8	{q5}, [r7]!
2302bc3d5698SJohn Baldwin	subs	r9, #0x10
2303bc3d5698SJohn Baldwin	bmi	.Lxts_dec_6
2304bc3d5698SJohn Baldwin	veor	q4, q4, q12
2305bc3d5698SJohn Baldwin	sub	r9, #0x10
2306bc3d5698SJohn Baldwin	vst1.64	{q15}, [r0,:128]		@ next round tweak
2307bc3d5698SJohn Baldwin
2308bc3d5698SJohn Baldwin	vld1.8	{q6}, [r7]!
2309bc3d5698SJohn Baldwin	veor	q5, q5, q13
2310bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2311bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
2312bc3d5698SJohn Baldwin#else
2313bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
2314bc3d5698SJohn Baldwin#endif
2315bc3d5698SJohn Baldwin	veor	q6, q6, q14
2316bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
2317bc3d5698SJohn Baldwin	mov	r0, sp
2318bc3d5698SJohn Baldwin
2319bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
2320bc3d5698SJohn Baldwin
2321bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
2322bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
2323bc3d5698SJohn Baldwin	veor	q0, q0, q8
2324bc3d5698SJohn Baldwin	vld1.64	{q12,q13}, [r0,:128]!
2325bc3d5698SJohn Baldwin	veor	q1, q1, q9
2326bc3d5698SJohn Baldwin	veor	q8, q6, q10
2327bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
2328bc3d5698SJohn Baldwin	veor	q9, q4, q11
2329bc3d5698SJohn Baldwin	vld1.64	{q14}, [r0,:128]!
2330bc3d5698SJohn Baldwin	veor	q10, q2, q12
2331bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
2332bc3d5698SJohn Baldwin	veor	q11, q7, q13
2333bc3d5698SJohn Baldwin	veor	q12, q3, q14
2334bc3d5698SJohn Baldwin	vst1.8	{q10,q11}, [r8]!
2335bc3d5698SJohn Baldwin	vst1.8	{q12}, [r8]!
2336bc3d5698SJohn Baldwin
2337bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
2338bc3d5698SJohn Baldwin	b	.Lxts_dec_done
2339bc3d5698SJohn Baldwin.align	4
2340bc3d5698SJohn Baldwin.Lxts_dec_6:
2341bc3d5698SJohn Baldwin	vst1.64	{q14}, [r0,:128]		@ next round tweak
2342bc3d5698SJohn Baldwin
2343bc3d5698SJohn Baldwin	veor	q4, q4, q12
2344bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2345bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
2346bc3d5698SJohn Baldwin#else
2347bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
2348bc3d5698SJohn Baldwin#endif
2349bc3d5698SJohn Baldwin	veor	q5, q5, q13
2350bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
2351bc3d5698SJohn Baldwin	mov	r0, sp
2352bc3d5698SJohn Baldwin
2353bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
2354bc3d5698SJohn Baldwin
2355bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
2356bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
2357bc3d5698SJohn Baldwin	veor	q0, q0, q8
2358bc3d5698SJohn Baldwin	vld1.64	{q12,q13}, [r0,:128]!
2359bc3d5698SJohn Baldwin	veor	q1, q1, q9
2360bc3d5698SJohn Baldwin	veor	q8, q6, q10
2361bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
2362bc3d5698SJohn Baldwin	veor	q9, q4, q11
2363bc3d5698SJohn Baldwin	veor	q10, q2, q12
2364bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
2365bc3d5698SJohn Baldwin	veor	q11, q7, q13
2366bc3d5698SJohn Baldwin	vst1.8	{q10,q11}, [r8]!
2367bc3d5698SJohn Baldwin
2368bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
2369bc3d5698SJohn Baldwin	b	.Lxts_dec_done
2370bc3d5698SJohn Baldwin.align	4
2371bc3d5698SJohn Baldwin.Lxts_dec_5:
2372bc3d5698SJohn Baldwin	veor	q3, q3, q11
2373bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2374bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
2375bc3d5698SJohn Baldwin#else
2376bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
2377bc3d5698SJohn Baldwin#endif
2378bc3d5698SJohn Baldwin	veor	q4, q4, q12
2379bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
2380bc3d5698SJohn Baldwin	mov	r0, sp
2381bc3d5698SJohn Baldwin
2382bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
2383bc3d5698SJohn Baldwin
2384bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
2385bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
2386bc3d5698SJohn Baldwin	veor	q0, q0, q8
2387bc3d5698SJohn Baldwin	vld1.64	{q12}, [r0,:128]!
2388bc3d5698SJohn Baldwin	veor	q1, q1, q9
2389bc3d5698SJohn Baldwin	veor	q8, q6, q10
2390bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
2391bc3d5698SJohn Baldwin	veor	q9, q4, q11
2392bc3d5698SJohn Baldwin	veor	q10, q2, q12
2393bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
2394bc3d5698SJohn Baldwin	vst1.8	{q10}, [r8]!
2395bc3d5698SJohn Baldwin
2396bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
2397bc3d5698SJohn Baldwin	b	.Lxts_dec_done
2398bc3d5698SJohn Baldwin.align	4
2399bc3d5698SJohn Baldwin.Lxts_dec_4:
2400bc3d5698SJohn Baldwin	veor	q2, q2, q10
2401bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2402bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
2403bc3d5698SJohn Baldwin#else
2404bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
2405bc3d5698SJohn Baldwin#endif
2406bc3d5698SJohn Baldwin	veor	q3, q3, q11
2407bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
2408bc3d5698SJohn Baldwin	mov	r0, sp
2409bc3d5698SJohn Baldwin
2410bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
2411bc3d5698SJohn Baldwin
2412bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
2413bc3d5698SJohn Baldwin	vld1.64	{q10,q11}, [r0,:128]!
2414bc3d5698SJohn Baldwin	veor	q0, q0, q8
2415bc3d5698SJohn Baldwin	veor	q1, q1, q9
2416bc3d5698SJohn Baldwin	veor	q8, q6, q10
2417bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
2418bc3d5698SJohn Baldwin	veor	q9, q4, q11
2419bc3d5698SJohn Baldwin	vst1.8	{q8,q9}, [r8]!
2420bc3d5698SJohn Baldwin
2421bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
2422bc3d5698SJohn Baldwin	b	.Lxts_dec_done
2423bc3d5698SJohn Baldwin.align	4
2424bc3d5698SJohn Baldwin.Lxts_dec_3:
2425bc3d5698SJohn Baldwin	veor	q1, q1, q9
2426bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2427bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
2428bc3d5698SJohn Baldwin#else
2429bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
2430bc3d5698SJohn Baldwin#endif
2431bc3d5698SJohn Baldwin	veor	q2, q2, q10
2432bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
2433bc3d5698SJohn Baldwin	mov	r0, sp
2434bc3d5698SJohn Baldwin
2435bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
2436bc3d5698SJohn Baldwin
2437bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
2438bc3d5698SJohn Baldwin	vld1.64	{q10}, [r0,:128]!
2439bc3d5698SJohn Baldwin	veor	q0, q0, q8
2440bc3d5698SJohn Baldwin	veor	q1, q1, q9
2441bc3d5698SJohn Baldwin	veor	q8, q6, q10
2442bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
2443bc3d5698SJohn Baldwin	vst1.8	{q8}, [r8]!
2444bc3d5698SJohn Baldwin
2445bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
2446bc3d5698SJohn Baldwin	b	.Lxts_dec_done
2447bc3d5698SJohn Baldwin.align	4
2448bc3d5698SJohn Baldwin.Lxts_dec_2:
2449bc3d5698SJohn Baldwin	veor	q0, q0, q8
2450bc3d5698SJohn Baldwin#ifndef	BSAES_ASM_EXTENDED_KEY
2451bc3d5698SJohn Baldwin	add	r4, sp, #0x90			@ pass key schedule
2452bc3d5698SJohn Baldwin#else
2453bc3d5698SJohn Baldwin	add	r4, r10, #248			@ pass key schedule
2454bc3d5698SJohn Baldwin#endif
2455bc3d5698SJohn Baldwin	veor	q1, q1, q9
2456bc3d5698SJohn Baldwin	mov	r5, r1			@ pass rounds
2457bc3d5698SJohn Baldwin	mov	r0, sp
2458bc3d5698SJohn Baldwin
2459bc3d5698SJohn Baldwin	bl	_bsaes_decrypt8
2460bc3d5698SJohn Baldwin
2461bc3d5698SJohn Baldwin	vld1.64	{q8,q9}, [r0,:128]!
2462bc3d5698SJohn Baldwin	veor	q0, q0, q8
2463bc3d5698SJohn Baldwin	veor	q1, q1, q9
2464bc3d5698SJohn Baldwin	vst1.8	{q0,q1}, [r8]!
2465bc3d5698SJohn Baldwin
2466bc3d5698SJohn Baldwin	vld1.64	{q8}, [r0,:128]		@ next round tweak
2467bc3d5698SJohn Baldwin	b	.Lxts_dec_done
2468bc3d5698SJohn Baldwin.align	4
2469bc3d5698SJohn Baldwin.Lxts_dec_1:
2470bc3d5698SJohn Baldwin	mov	r0, sp
2471bc3d5698SJohn Baldwin	veor	q0, q0, q8
2472bc3d5698SJohn Baldwin	mov	r1, sp
2473bc3d5698SJohn Baldwin	vst1.8	{q0}, [sp,:128]
2474bc3d5698SJohn Baldwin	mov	r5, r2			@ preserve magic
2475bc3d5698SJohn Baldwin	mov	r2, r10
2476bc3d5698SJohn Baldwin	mov	r4, r3				@ preserve fp
2477bc3d5698SJohn Baldwin
2478bc3d5698SJohn Baldwin	bl	AES_decrypt
2479bc3d5698SJohn Baldwin
2480bc3d5698SJohn Baldwin	vld1.8	{q0}, [sp,:128]
2481bc3d5698SJohn Baldwin	veor	q0, q0, q8
2482bc3d5698SJohn Baldwin	vst1.8	{q0}, [r8]!
2483bc3d5698SJohn Baldwin	mov	r3, r4
2484bc3d5698SJohn Baldwin	mov	r2, r5
2485bc3d5698SJohn Baldwin
2486bc3d5698SJohn Baldwin	vmov	q8, q9		@ next round tweak
2487bc3d5698SJohn Baldwin
2488bc3d5698SJohn Baldwin.Lxts_dec_done:
2489bc3d5698SJohn Baldwin#ifndef	XTS_CHAIN_TWEAK
2490bc3d5698SJohn Baldwin	adds	r9, #0x10
2491bc3d5698SJohn Baldwin	beq	.Lxts_dec_ret
2492bc3d5698SJohn Baldwin
2493bc3d5698SJohn Baldwin	@ calculate one round of extra tweak for the stolen ciphertext
2494bc3d5698SJohn Baldwin	vldmia	r2, {q5}
2495bc3d5698SJohn Baldwin	vshr.s64	q6, q8, #63
2496bc3d5698SJohn Baldwin	vand	q6, q6, q5
2497bc3d5698SJohn Baldwin	vadd.u64	q9, q8, q8
2498bc3d5698SJohn Baldwin	vswp	d13,d12
2499bc3d5698SJohn Baldwin	veor	q9, q9, q6
2500bc3d5698SJohn Baldwin
2501bc3d5698SJohn Baldwin	@ perform the final decryption with the last tweak value
2502bc3d5698SJohn Baldwin	vld1.8	{q0}, [r7]!
2503bc3d5698SJohn Baldwin	mov	r0, sp
2504bc3d5698SJohn Baldwin	veor	q0, q0, q9
2505bc3d5698SJohn Baldwin	mov	r1, sp
2506bc3d5698SJohn Baldwin	vst1.8	{q0}, [sp,:128]
2507bc3d5698SJohn Baldwin	mov	r2, r10
2508bc3d5698SJohn Baldwin	mov	r4, r3			@ preserve fp
2509bc3d5698SJohn Baldwin
2510bc3d5698SJohn Baldwin	bl	AES_decrypt
2511bc3d5698SJohn Baldwin
2512bc3d5698SJohn Baldwin	vld1.8	{q0}, [sp,:128]
2513bc3d5698SJohn Baldwin	veor	q0, q0, q9
2514bc3d5698SJohn Baldwin	vst1.8	{q0}, [r8]
2515bc3d5698SJohn Baldwin
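@ Ciphertext stealing, decrypt side: the block just written at r8 was
@ decrypted with the extra tweak q9.  The loop below swaps the 1..15
@ trailing ciphertext bytes into that block while moving the displaced
@ plaintext bytes out to the short final block; the rebuilt block is
@ then decrypted with the saved tweak q8 and stored back at r6.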
2516bc3d5698SJohn Baldwin	mov	r6, r8
2517bc3d5698SJohn Baldwin.Lxts_dec_steal:
2518bc3d5698SJohn Baldwin	ldrb	r1, [r8]
2519bc3d5698SJohn Baldwin	ldrb	r0, [r7], #1
2520bc3d5698SJohn Baldwin	strb	r1, [r8, #0x10]
2521bc3d5698SJohn Baldwin	strb	r0, [r8], #1
2522bc3d5698SJohn Baldwin
2523bc3d5698SJohn Baldwin	subs	r9, #1
2524bc3d5698SJohn Baldwin	bhi	.Lxts_dec_steal
2525bc3d5698SJohn Baldwin
2526bc3d5698SJohn Baldwin	vld1.8	{q0}, [r6]
2527bc3d5698SJohn Baldwin	mov	r0, sp
2528bc3d5698SJohn Baldwin	veor	q0, q8
2529bc3d5698SJohn Baldwin	mov	r1, sp
2530bc3d5698SJohn Baldwin	vst1.8	{q0}, [sp,:128]
2531bc3d5698SJohn Baldwin	mov	r2, r10
2532bc3d5698SJohn Baldwin
2533bc3d5698SJohn Baldwin	bl	AES_decrypt
2534bc3d5698SJohn Baldwin
2535bc3d5698SJohn Baldwin	vld1.8	{q0}, [sp,:128]
2536bc3d5698SJohn Baldwin	veor	q0, q0, q8
2537bc3d5698SJohn Baldwin	vst1.8	{q0}, [r6]
2538bc3d5698SJohn Baldwin	mov	r3, r4
2539bc3d5698SJohn Baldwin#endif
2540bc3d5698SJohn Baldwin
2541bc3d5698SJohn Baldwin.Lxts_dec_ret:
2542bc3d5698SJohn Baldwin	bic	r0, r3, #0xf
2543bc3d5698SJohn Baldwin	vmov.i32	q0, #0
2544bc3d5698SJohn Baldwin	vmov.i32	q1, #0
2545bc3d5698SJohn Baldwin#ifdef	XTS_CHAIN_TWEAK
2546bc3d5698SJohn Baldwin	ldr	r1, [r3, #0x20+VFP_ABI_FRAME]	@ chain tweak
2547bc3d5698SJohn Baldwin#endif
2548bc3d5698SJohn Baldwin.Lxts_dec_bzero:	@ wipe key schedule [if any]
2549bc3d5698SJohn Baldwin	vstmia	sp!, {q0,q1}
2550bc3d5698SJohn Baldwin	cmp	sp, r0
2551bc3d5698SJohn Baldwin	bne	.Lxts_dec_bzero
2552bc3d5698SJohn Baldwin
2553bc3d5698SJohn Baldwin	mov	sp, r3
2554bc3d5698SJohn Baldwin#ifdef	XTS_CHAIN_TWEAK
2555bc3d5698SJohn Baldwin	vst1.8	{q8}, [r1]
2556bc3d5698SJohn Baldwin#endif
2557bc3d5698SJohn Baldwin	VFP_ABI_POP
2558bc3d5698SJohn Baldwin	ldmia	sp!, {r4,r5,r6,r7,r8,r9,r10, pc}	@ return
2559bc3d5698SJohn Baldwin
2560*c0855eaaSJohn Baldwin.size	ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
2561bc3d5698SJohn Baldwin#endif