/* Do not modify. This file is auto-generated from bsaes-armv8.pl. */
// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did, and there is little to be gained by wrapping it
// up in Perl, so it is presented as pure assembly.


#include "crypto/arm_arch.h"

.text




.type	_bsaes_decrypt8,%function
.align	4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
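// Roughly, the body has three phases: a bitslice transpose of the eight
// input blocks into bit-plane form, the bitsliced inverse cipher rounds
// (.Ldec_sbox/.Ldec_loop), and an inverse transpose plus a final
// round-key XOR (.Ldec_done).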
_bsaes_decrypt8:
	ldr	q8, [x9], #16
	adrp	x11, .LM0ISR
	add	x11, x11, #:lo12:.LM0ISR
	movi	v9.16b, #0x55
	ldr	q10, [x11], #16
	movi	v16.16b, #0x33
	movi	v17.16b, #0x0f
	sub	x10, x10, #1
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v8.16b
	eor	v2.16b, v2.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	tbl	v0.16b, {v0.16b}, v10.16b
	tbl	v1.16b, {v1.16b}, v10.16b
	tbl	v2.16b, {v2.16b}, v10.16b
	tbl	v4.16b, {v4.16b}, v10.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	tbl	v3.16b, {v3.16b}, v10.16b
	tbl	v5.16b, {v5.16b}, v10.16b
	tbl	v6.16b, {v6.16b}, v10.16b
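// The interleaved ushr/eor/and/eor/shl/eor groups below are "swapmove"
// steps, the standard bitslice transpose. A minimal C sketch of one step
// (names illustrative only, not part of this file):
//   t  = ((a >> n) ^ b) & mask;
//   b ^= t;
//   a ^= t << n;
// applied with (n, mask) = (1, 0x55..), (2, 0x33..) and (4, 0x0f..).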
	ushr	v8.2d, v0.2d, #1
	tbl	v7.16b, {v7.16b}, v10.16b
	ushr	v10.2d, v4.2d, #1
	ushr	v18.2d, v2.2d, #1
	eor	v8.16b, v8.16b, v1.16b
	ushr	v19.2d, v6.2d, #1
	eor	v10.16b, v10.16b, v5.16b
	eor	v18.16b, v18.16b, v3.16b
	and	v8.16b, v8.16b, v9.16b
	eor	v19.16b, v19.16b, v7.16b
	and	v10.16b, v10.16b, v9.16b
	and	v18.16b, v18.16b, v9.16b
	eor	v1.16b, v1.16b, v8.16b
	shl	v8.2d, v8.2d, #1
	and	v9.16b, v19.16b, v9.16b
	eor	v5.16b, v5.16b, v10.16b
	shl	v10.2d, v10.2d, #1
	eor	v3.16b, v3.16b, v18.16b
	shl	v18.2d, v18.2d, #1
	eor	v0.16b, v0.16b, v8.16b
	shl	v8.2d, v9.2d, #1
	eor	v7.16b, v7.16b, v9.16b
	eor	v4.16b, v4.16b, v10.16b
	eor	v2.16b, v2.16b, v18.16b
	ushr	v9.2d, v1.2d, #2
	eor	v6.16b, v6.16b, v8.16b
	ushr	v8.2d, v0.2d, #2
	ushr	v10.2d, v5.2d, #2
	ushr	v18.2d, v4.2d, #2
	eor	v9.16b, v9.16b, v3.16b
	eor	v8.16b, v8.16b, v2.16b
	eor	v10.16b, v10.16b, v7.16b
	eor	v18.16b, v18.16b, v6.16b
	and	v9.16b, v9.16b, v16.16b
	and	v8.16b, v8.16b, v16.16b
	and	v10.16b, v10.16b, v16.16b
	and	v16.16b, v18.16b, v16.16b
	eor	v3.16b, v3.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v2.16b, v2.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v7.16b, v7.16b, v10.16b
	shl	v10.2d, v10.2d, #2
	eor	v6.16b, v6.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v1.16b, v1.16b, v9.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v4.16b, v4.16b, v16.16b
	ushr	v8.2d, v3.2d, #4
	ushr	v9.2d, v2.2d, #4
	ushr	v10.2d, v1.2d, #4
	ushr	v16.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v7.16b
	eor	v9.16b, v9.16b, v6.16b
	eor	v10.16b, v10.16b, v5.16b
	eor	v16.16b, v16.16b, v4.16b
	and	v8.16b, v8.16b, v17.16b
	and	v9.16b, v9.16b, v17.16b
	and	v10.16b, v10.16b, v17.16b
	and	v16.16b, v16.16b, v17.16b
	eor	v7.16b, v7.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v6.16b, v6.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v5.16b, v5.16b, v10.16b
	shl	v10.2d, v10.2d, #4
	eor	v4.16b, v4.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v3.16b, v3.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v16.16b
	b	.Ldec_sbox
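// Round loop: each iteration XORs in the eight quadwords of one bitsliced
// round key, then permutes bytes with tbl. v28 holds .LISR for the middle
// rounds and .LISRM0 for the final round (see the loads at the bottom of
// the loop), applying InvShiftRows in the bitsliced byte order.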
.align	4
.Ldec_loop:
	ld1	{v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
	ldp	q8, q9, [x9], #32
	eor	v0.16b, v16.16b, v0.16b
	ldr	q10, [x9], #16
	eor	v1.16b, v17.16b, v1.16b
	ldr	q16, [x9], #16
	eor	v2.16b, v18.16b, v2.16b
	eor	v3.16b, v19.16b, v3.16b
	eor	v4.16b, v8.16b, v4.16b
	eor	v5.16b, v9.16b, v5.16b
	eor	v6.16b, v10.16b, v6.16b
	eor	v7.16b, v16.16b, v7.16b
	tbl	v0.16b, {v0.16b}, v28.16b
	tbl	v1.16b, {v1.16b}, v28.16b
	tbl	v2.16b, {v2.16b}, v28.16b
	tbl	v3.16b, {v3.16b}, v28.16b
	tbl	v4.16b, {v4.16b}, v28.16b
	tbl	v5.16b, {v5.16b}, v28.16b
	tbl	v6.16b, {v6.16b}, v28.16b
	tbl	v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
	eor	v1.16b, v1.16b, v4.16b
	eor	v3.16b, v3.16b, v4.16b
	subs	x10, x10, #1
	eor	v4.16b, v4.16b, v7.16b
	eor	v2.16b, v2.16b, v7.16b
	eor	v1.16b, v1.16b, v6.16b
	eor	v6.16b, v6.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b
	eor	v0.16b, v0.16b, v1.16b
	eor	v7.16b, v7.16b, v6.16b
	eor	v8.16b, v6.16b, v2.16b
	and	v9.16b, v4.16b, v6.16b
	eor	v10.16b, v2.16b, v6.16b
	eor	v3.16b, v3.16b, v0.16b
	eor	v5.16b, v5.16b, v0.16b
	eor	v16.16b, v7.16b, v4.16b
	eor	v17.16b, v4.16b, v0.16b
	and	v18.16b, v0.16b, v2.16b
	eor	v19.16b, v7.16b, v4.16b
	eor	v1.16b, v1.16b, v3.16b
	eor	v20.16b, v3.16b, v0.16b
	eor	v21.16b, v5.16b, v2.16b
	eor	v22.16b, v3.16b, v7.16b
	and	v8.16b, v17.16b, v8.16b
	orr	v17.16b, v3.16b, v5.16b
	eor	v23.16b, v1.16b, v6.16b
	eor	v24.16b, v20.16b, v16.16b
	eor	v25.16b, v1.16b, v5.16b
	orr	v26.16b, v20.16b, v21.16b
	and	v20.16b, v20.16b, v21.16b
	and	v27.16b, v7.16b, v1.16b
	eor	v21.16b, v21.16b, v23.16b
	orr	v28.16b, v16.16b, v23.16b
	orr	v29.16b, v22.16b, v25.16b
	eor	v26.16b, v26.16b, v8.16b
	and	v16.16b, v16.16b, v23.16b
	and	v22.16b, v22.16b, v25.16b
	and	v21.16b, v24.16b, v21.16b
	eor	v8.16b, v28.16b, v8.16b
	eor	v23.16b, v5.16b, v2.16b
	eor	v24.16b, v1.16b, v6.16b
	eor	v16.16b, v16.16b, v22.16b
	eor	v22.16b, v3.16b, v0.16b
	eor	v25.16b, v29.16b, v21.16b
	eor	v21.16b, v26.16b, v21.16b
	eor	v8.16b, v8.16b, v20.16b
	eor	v26.16b, v23.16b, v24.16b
	eor	v16.16b, v16.16b, v20.16b
	eor	v28.16b, v22.16b, v19.16b
	eor	v20.16b, v25.16b, v20.16b
	eor	v9.16b, v21.16b, v9.16b
	eor	v8.16b, v8.16b, v18.16b
	eor	v18.16b, v5.16b, v1.16b
	eor	v21.16b, v16.16b, v17.16b
	eor	v16.16b, v16.16b, v17.16b
	eor	v17.16b, v20.16b, v27.16b
	eor	v20.16b, v3.16b, v7.16b
	eor	v25.16b, v9.16b, v8.16b
	eor	v27.16b, v0.16b, v4.16b
	and	v29.16b, v9.16b, v17.16b
	eor	v30.16b, v8.16b, v29.16b
	eor	v31.16b, v21.16b, v29.16b
	eor	v29.16b, v21.16b, v29.16b
	bsl	v30.16b, v17.16b, v21.16b
	bsl	v31.16b, v9.16b, v8.16b
	bsl	v16.16b, v30.16b, v29.16b
	bsl	v21.16b, v29.16b, v30.16b
	eor	v8.16b, v31.16b, v30.16b
	and	v1.16b, v1.16b, v31.16b
	and	v9.16b, v16.16b, v31.16b
	and	v6.16b, v6.16b, v30.16b
	eor	v16.16b, v17.16b, v21.16b
	and	v4.16b, v4.16b, v30.16b
	eor	v17.16b, v8.16b, v30.16b
	and	v21.16b, v24.16b, v8.16b
	eor	v9.16b, v9.16b, v25.16b
	and	v19.16b, v19.16b, v8.16b
	eor	v24.16b, v30.16b, v16.16b
	eor	v25.16b, v30.16b, v16.16b
	and	v7.16b, v7.16b, v17.16b
	and	v10.16b, v10.16b, v16.16b
	eor	v29.16b, v9.16b, v16.16b
	eor	v30.16b, v31.16b, v9.16b
	and	v0.16b, v24.16b, v0.16b
	and	v9.16b, v18.16b, v9.16b
	and	v2.16b, v25.16b, v2.16b
	eor	v10.16b, v10.16b, v6.16b
	eor	v18.16b, v29.16b, v16.16b
	and	v5.16b, v30.16b, v5.16b
	eor	v24.16b, v8.16b, v29.16b
	and	v25.16b, v26.16b, v29.16b
	and	v26.16b, v28.16b, v29.16b
	eor	v8.16b, v8.16b, v29.16b
	eor	v17.16b, v17.16b, v18.16b
	eor	v5.16b, v1.16b, v5.16b
	and	v23.16b, v24.16b, v23.16b
	eor	v21.16b, v21.16b, v25.16b
	eor	v19.16b, v19.16b, v26.16b
	eor	v0.16b, v4.16b, v0.16b
	and	v3.16b, v17.16b, v3.16b
	eor	v1.16b, v9.16b, v1.16b
	eor	v9.16b, v25.16b, v23.16b
	eor	v5.16b, v5.16b, v21.16b
	eor	v2.16b, v6.16b, v2.16b
	and	v6.16b, v8.16b, v22.16b
	eor	v3.16b, v7.16b, v3.16b
	and	v8.16b, v20.16b, v18.16b
	eor	v10.16b, v10.16b, v9.16b
	eor	v0.16b, v0.16b, v19.16b
	eor	v9.16b, v1.16b, v9.16b
	eor	v1.16b, v2.16b, v21.16b
	eor	v3.16b, v3.16b, v19.16b
	and	v16.16b, v27.16b, v16.16b
	eor	v17.16b, v26.16b, v6.16b
	eor	v6.16b, v8.16b, v7.16b
	eor	v7.16b, v1.16b, v9.16b
	eor	v1.16b, v5.16b, v3.16b
	eor	v2.16b, v10.16b, v3.16b
	eor	v4.16b, v16.16b, v4.16b
	eor	v8.16b, v6.16b, v17.16b
	eor	v5.16b, v9.16b, v3.16b
	eor	v9.16b, v0.16b, v1.16b
	eor	v6.16b, v7.16b, v1.16b
	eor	v0.16b, v4.16b, v17.16b
	eor	v4.16b, v8.16b, v7.16b
	eor	v7.16b, v9.16b, v2.16b
	eor	v8.16b, v3.16b, v0.16b
	eor	v7.16b, v7.16b, v5.16b
	eor	v3.16b, v4.16b, v7.16b
	eor	v4.16b, v7.16b, v0.16b
	eor	v7.16b, v8.16b, v3.16b
	bcc	.Ldec_done
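// Inverse MixColumns. In the bitsliced representation this reduces to XORs
// of row rotations, which the ext #8/#12 pairs below carry out across all
// eight bit planes at once.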
	ext	v8.16b, v0.16b, v0.16b, #8
	ext	v9.16b, v1.16b, v1.16b, #8
	ldr	q28, [x11]                  // load from .LISR in common case (x10 > 0)
	ext	v10.16b, v6.16b, v6.16b, #8
	ext	v16.16b, v3.16b, v3.16b, #8
	ext	v17.16b, v5.16b, v5.16b, #8
	ext	v18.16b, v4.16b, v4.16b, #8
	eor	v8.16b, v8.16b, v0.16b
	eor	v9.16b, v9.16b, v1.16b
	eor	v10.16b, v10.16b, v6.16b
	eor	v16.16b, v16.16b, v3.16b
	eor	v17.16b, v17.16b, v5.16b
	ext	v19.16b, v2.16b, v2.16b, #8
	ext	v20.16b, v7.16b, v7.16b, #8
	eor	v18.16b, v18.16b, v4.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v8.16b, v2.16b, v10.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v2.16b, v19.16b, v2.16b
	eor	v9.16b, v20.16b, v7.16b
	eor	v0.16b, v0.16b, v16.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v6.16b, v6.16b, v17.16b
	eor	v8.16b, v8.16b, v16.16b
	eor	v7.16b, v7.16b, v18.16b
	eor	v4.16b, v4.16b, v16.16b
	eor	v2.16b, v3.16b, v2.16b
	eor	v1.16b, v1.16b, v17.16b
	eor	v3.16b, v5.16b, v9.16b
	eor	v5.16b, v8.16b, v17.16b
	eor	v7.16b, v7.16b, v17.16b
	ext	v8.16b, v0.16b, v0.16b, #12
	ext	v9.16b, v6.16b, v6.16b, #12
	ext	v10.16b, v4.16b, v4.16b, #12
	ext	v16.16b, v1.16b, v1.16b, #12
	ext	v17.16b, v5.16b, v5.16b, #12
	ext	v18.16b, v7.16b, v7.16b, #12
	eor	v0.16b, v0.16b, v8.16b
	eor	v6.16b, v6.16b, v9.16b
	eor	v4.16b, v4.16b, v10.16b
	ext	v19.16b, v2.16b, v2.16b, #12
	ext	v20.16b, v3.16b, v3.16b, #12
	eor	v1.16b, v1.16b, v16.16b
	eor	v5.16b, v5.16b, v17.16b
	eor	v7.16b, v7.16b, v18.16b
	eor	v2.16b, v2.16b, v19.16b
	eor	v16.16b, v16.16b, v0.16b
	eor	v3.16b, v3.16b, v20.16b
	eor	v17.16b, v17.16b, v4.16b
	eor	v10.16b, v10.16b, v6.16b
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v9.16b, v9.16b, v1.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	eor	v8.16b, v8.16b, v3.16b
	eor	v16.16b, v16.16b, v3.16b
	eor	v18.16b, v18.16b, v5.16b
	eor	v19.16b, v19.16b, v7.16b
	ext	v21.16b, v5.16b, v5.16b, #8
	ext	v5.16b, v7.16b, v7.16b, #8
	eor	v7.16b, v20.16b, v2.16b
	ext	v4.16b, v4.16b, v4.16b, #8
	ext	v20.16b, v3.16b, v3.16b, #8
	eor	v17.16b, v17.16b, v3.16b
	ext	v2.16b, v2.16b, v2.16b, #8
	eor	v3.16b, v10.16b, v3.16b
	ext	v10.16b, v6.16b, v6.16b, #8
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v5.16b, v5.16b, v18.16b
	eor	v3.16b, v3.16b, v4.16b
	eor	v7.16b, v20.16b, v7.16b
	eor	v6.16b, v2.16b, v19.16b
	eor	v4.16b, v21.16b, v17.16b
	eor	v2.16b, v10.16b, v9.16b
	bne	.Ldec_loop
	ldr	q28, [x11, #16]!            // load from .LISRM0 on last round (x10 == 0)
	b	.Ldec_loop
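// .Ldec_done reverses the bitslice transpose (the same swapmove steps with
// a different register pairing) and XORs every block with the final round
// key, loaded into q10 from [x9].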
.align	4
.Ldec_done:
	ushr	v8.2d, v0.2d, #1
	movi	v9.16b, #0x55
	ldr	q10, [x9]
	ushr	v16.2d, v2.2d, #1
	movi	v17.16b, #0x33
	ushr	v18.2d, v6.2d, #1
	movi	v19.16b, #0x0f
	eor	v8.16b, v8.16b, v1.16b
	ushr	v20.2d, v3.2d, #1
	eor	v16.16b, v16.16b, v7.16b
	eor	v18.16b, v18.16b, v4.16b
	and	v8.16b, v8.16b, v9.16b
	eor	v20.16b, v20.16b, v5.16b
	and	v16.16b, v16.16b, v9.16b
	and	v18.16b, v18.16b, v9.16b
	shl	v21.2d, v8.2d, #1
	eor	v1.16b, v1.16b, v8.16b
	and	v8.16b, v20.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	shl	v9.2d, v16.2d, #1
	eor	v4.16b, v4.16b, v18.16b
	shl	v16.2d, v18.2d, #1
	eor	v0.16b, v0.16b, v21.16b
	shl	v18.2d, v8.2d, #1
	eor	v5.16b, v5.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	eor	v6.16b, v6.16b, v16.16b
	ushr	v8.2d, v1.2d, #2
	eor	v3.16b, v3.16b, v18.16b
	ushr	v9.2d, v0.2d, #2
	ushr	v16.2d, v7.2d, #2
	ushr	v18.2d, v2.2d, #2
	eor	v8.16b, v8.16b, v4.16b
	eor	v9.16b, v9.16b, v6.16b
	eor	v16.16b, v16.16b, v5.16b
	eor	v18.16b, v18.16b, v3.16b
	and	v8.16b, v8.16b, v17.16b
	and	v9.16b, v9.16b, v17.16b
	and	v16.16b, v16.16b, v17.16b
	and	v17.16b, v18.16b, v17.16b
	eor	v4.16b, v4.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v6.16b, v6.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v5.16b, v5.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v3.16b, v3.16b, v17.16b
	shl	v17.2d, v17.2d, #2
	eor	v1.16b, v1.16b, v8.16b
	eor	v0.16b, v0.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	eor	v2.16b, v2.16b, v17.16b
	ushr	v8.2d, v4.2d, #4
	ushr	v9.2d, v6.2d, #4
	ushr	v16.2d, v1.2d, #4
	ushr	v17.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v5.16b
	eor	v9.16b, v9.16b, v3.16b
	eor	v16.16b, v16.16b, v7.16b
	eor	v17.16b, v17.16b, v2.16b
	and	v8.16b, v8.16b, v19.16b
	and	v9.16b, v9.16b, v19.16b
	and	v16.16b, v16.16b, v19.16b
	and	v17.16b, v17.16b, v19.16b
	eor	v5.16b, v5.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v3.16b, v3.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v7.16b, v7.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v2.16b, v2.16b, v17.16b
	shl	v17.2d, v17.2d, #4
	eor	v4.16b, v4.16b, v8.16b
	eor	v6.16b, v6.16b, v9.16b
	eor	v7.16b, v7.16b, v10.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v0.16b, v0.16b, v17.16b
	eor	v4.16b, v4.16b, v10.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v3.16b, v3.16b, v10.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v10.16b
	ret
.size	_bsaes_decrypt8,.-_bsaes_decrypt8

.section	.rodata
.type	_bsaes_consts,%object
.align	6
_bsaes_consts:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR   used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad	0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR   used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad	0x0304090e00050a0f, 0x01060b0c0207080d
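// Each pair of .quad values above is consumed as one 16-byte index vector
// for tbl, so a single instruction applies the byte permutation to a block.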

.LM0_bigendian:
.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad	0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad	0x090d01050c000408, 0x03070b0f060a0e02

.align	6
.size	_bsaes_consts,.-_bsaes_consts

.previous

.type	_bsaes_encrypt8,%function
.align	4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_encrypt8:
	ldr	q8, [x9], #16
	adrp	x11, .LM0SR
	add	x11, x11, #:lo12:.LM0SR
	ldr	q9, [x11], #16
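// Alternate entry point for callers that have already loaded the round 0
// key into v8 and a round 0 permutation into v9; the CTR path enters here
// with .LREVM0SR in v9 instead of .LM0SR.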
_bsaes_encrypt8_alt:
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v8.16b
	sub	x10, x10, #1
	eor	v2.16b, v2.16b, v8.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v3.16b, v3.16b, v8.16b
	eor	v5.16b, v5.16b, v8.16b
	tbl	v0.16b, {v0.16b}, v9.16b
	tbl	v1.16b, {v1.16b}, v9.16b
	tbl	v2.16b, {v2.16b}, v9.16b
	tbl	v4.16b, {v4.16b}, v9.16b
	eor	v6.16b, v6.16b, v8.16b
	eor	v7.16b, v7.16b, v8.16b
	tbl	v3.16b, {v3.16b}, v9.16b
	tbl	v5.16b, {v5.16b}, v9.16b
	tbl	v6.16b, {v6.16b}, v9.16b
	ushr	v8.2d, v0.2d, #1
	movi	v10.16b, #0x55
	tbl	v7.16b, {v7.16b}, v9.16b
	ushr	v9.2d, v4.2d, #1
	movi	v16.16b, #0x33
	ushr	v17.2d, v2.2d, #1
	eor	v8.16b, v8.16b, v1.16b
	movi	v18.16b, #0x0f
	ushr	v19.2d, v6.2d, #1
	eor	v9.16b, v9.16b, v5.16b
	eor	v17.16b, v17.16b, v3.16b
	and	v8.16b, v8.16b, v10.16b
	eor	v19.16b, v19.16b, v7.16b
	and	v9.16b, v9.16b, v10.16b
	and	v17.16b, v17.16b, v10.16b
	eor	v1.16b, v1.16b, v8.16b
	shl	v8.2d, v8.2d, #1
	and	v10.16b, v19.16b, v10.16b
	eor	v5.16b, v5.16b, v9.16b
	shl	v9.2d, v9.2d, #1
	eor	v3.16b, v3.16b, v17.16b
	shl	v17.2d, v17.2d, #1
	eor	v0.16b, v0.16b, v8.16b
	shl	v8.2d, v10.2d, #1
	eor	v7.16b, v7.16b, v10.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v2.16b, v2.16b, v17.16b
	ushr	v9.2d, v1.2d, #2
	eor	v6.16b, v6.16b, v8.16b
	ushr	v8.2d, v0.2d, #2
	ushr	v10.2d, v5.2d, #2
	ushr	v17.2d, v4.2d, #2
	eor	v9.16b, v9.16b, v3.16b
	eor	v8.16b, v8.16b, v2.16b
	eor	v10.16b, v10.16b, v7.16b
	eor	v17.16b, v17.16b, v6.16b
	and	v9.16b, v9.16b, v16.16b
	and	v8.16b, v8.16b, v16.16b
	and	v10.16b, v10.16b, v16.16b
	and	v16.16b, v17.16b, v16.16b
	eor	v3.16b, v3.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v2.16b, v2.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v7.16b, v7.16b, v10.16b
	shl	v10.2d, v10.2d, #2
	eor	v6.16b, v6.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v1.16b, v1.16b, v9.16b
	eor	v0.16b, v0.16b, v8.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v4.16b, v4.16b, v16.16b
	ushr	v8.2d, v3.2d, #4
	ushr	v9.2d, v2.2d, #4
	ushr	v10.2d, v1.2d, #4
	ushr	v16.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v7.16b
	eor	v9.16b, v9.16b, v6.16b
	eor	v10.16b, v10.16b, v5.16b
	eor	v16.16b, v16.16b, v4.16b
	and	v8.16b, v8.16b, v18.16b
	and	v9.16b, v9.16b, v18.16b
	and	v10.16b, v10.16b, v18.16b
	and	v16.16b, v16.16b, v18.16b
	eor	v7.16b, v7.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v6.16b, v6.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v5.16b, v5.16b, v10.16b
	shl	v10.2d, v10.2d, #4
	eor	v4.16b, v4.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v3.16b, v3.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v16.16b
	b	.Lenc_sbox
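// Round loop: the same shape as .Ldec_loop, but v28 holds the forward
// ShiftRows constants (.LSR for middle rounds, .LSRM0 for the last one).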
.align	4
.Lenc_loop:
	ld1	{v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
	ldp	q8, q9, [x9], #32
	eor	v0.16b, v16.16b, v0.16b
	ldr	q10, [x9], #16
	eor	v1.16b, v17.16b, v1.16b
	ldr	q16, [x9], #16
	eor	v2.16b, v18.16b, v2.16b
	eor	v3.16b, v19.16b, v3.16b
	eor	v4.16b, v8.16b, v4.16b
	eor	v5.16b, v9.16b, v5.16b
	eor	v6.16b, v10.16b, v6.16b
	eor	v7.16b, v16.16b, v7.16b
	tbl	v0.16b, {v0.16b}, v28.16b
	tbl	v1.16b, {v1.16b}, v28.16b
	tbl	v2.16b, {v2.16b}, v28.16b
	tbl	v3.16b, {v3.16b}, v28.16b
	tbl	v4.16b, {v4.16b}, v28.16b
	tbl	v5.16b, {v5.16b}, v28.16b
	tbl	v6.16b, {v6.16b}, v28.16b
	tbl	v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
	eor	v5.16b, v5.16b, v6.16b
	eor	v3.16b, v3.16b, v0.16b
	subs	x10, x10, #1
	eor	v2.16b, v2.16b, v1.16b
	eor	v5.16b, v5.16b, v0.16b
	eor	v8.16b, v3.16b, v7.16b
	eor	v6.16b, v6.16b, v2.16b
	eor	v7.16b, v7.16b, v5.16b
	eor	v8.16b, v8.16b, v4.16b
	eor	v3.16b, v6.16b, v3.16b
	eor	v4.16b, v4.16b, v5.16b
	eor	v6.16b, v1.16b, v5.16b
	eor	v2.16b, v2.16b, v7.16b
	eor	v1.16b, v8.16b, v1.16b
	eor	v8.16b, v7.16b, v4.16b
	eor	v9.16b, v3.16b, v0.16b
	eor	v10.16b, v7.16b, v6.16b
	eor	v16.16b, v5.16b, v3.16b
	eor	v17.16b, v6.16b, v2.16b
	eor	v18.16b, v5.16b, v1.16b
	eor	v19.16b, v2.16b, v4.16b
	eor	v20.16b, v1.16b, v0.16b
	orr	v21.16b, v8.16b, v9.16b
	orr	v22.16b, v10.16b, v16.16b
	eor	v23.16b, v8.16b, v17.16b
	eor	v24.16b, v9.16b, v18.16b
	and	v19.16b, v19.16b, v20.16b
	orr	v20.16b, v17.16b, v18.16b
	and	v8.16b, v8.16b, v9.16b
	and	v9.16b, v17.16b, v18.16b
	and	v17.16b, v23.16b, v24.16b
	and	v10.16b, v10.16b, v16.16b
	eor	v16.16b, v21.16b, v19.16b
	eor	v18.16b, v20.16b, v19.16b
	and	v19.16b, v2.16b, v1.16b
	and	v20.16b, v6.16b, v5.16b
	eor	v21.16b, v22.16b, v17.16b
	eor	v9.16b, v9.16b, v10.16b
	eor	v10.16b, v16.16b, v17.16b
	eor	v16.16b, v18.16b, v8.16b
	and	v17.16b, v4.16b, v0.16b
	orr	v18.16b, v7.16b, v3.16b
	eor	v21.16b, v21.16b, v8.16b
	eor	v8.16b, v9.16b, v8.16b
	eor	v9.16b, v10.16b, v19.16b
	eor	v10.16b, v3.16b, v0.16b
	eor	v16.16b, v16.16b, v17.16b
	eor	v17.16b, v5.16b, v1.16b
	eor	v19.16b, v21.16b, v20.16b
	eor	v20.16b, v8.16b, v18.16b
	eor	v8.16b, v8.16b, v18.16b
	eor	v18.16b, v7.16b, v4.16b
	eor	v21.16b, v9.16b, v16.16b
	eor	v22.16b, v6.16b, v2.16b
	and	v23.16b, v9.16b, v19.16b
	eor	v24.16b, v10.16b, v17.16b
	eor	v25.16b, v0.16b, v1.16b
	eor	v26.16b, v7.16b, v6.16b
	eor	v27.16b, v18.16b, v22.16b
	eor	v28.16b, v3.16b, v5.16b
	eor	v29.16b, v16.16b, v23.16b
	eor	v30.16b, v20.16b, v23.16b
	eor	v23.16b, v20.16b, v23.16b
	eor	v31.16b, v4.16b, v2.16b
	bsl	v29.16b, v19.16b, v20.16b
	bsl	v30.16b, v9.16b, v16.16b
	bsl	v8.16b, v29.16b, v23.16b
	bsl	v20.16b, v23.16b, v29.16b
	eor	v9.16b, v30.16b, v29.16b
	and	v5.16b, v5.16b, v30.16b
	and	v8.16b, v8.16b, v30.16b
	and	v1.16b, v1.16b, v29.16b
	eor	v16.16b, v19.16b, v20.16b
	and	v2.16b, v2.16b, v29.16b
	eor	v19.16b, v9.16b, v29.16b
	and	v17.16b, v17.16b, v9.16b
	eor	v8.16b, v8.16b, v21.16b
	and	v20.16b, v22.16b, v9.16b
	eor	v21.16b, v29.16b, v16.16b
	eor	v22.16b, v29.16b, v16.16b
	and	v23.16b, v25.16b, v16.16b
	and	v6.16b, v6.16b, v19.16b
	eor	v25.16b, v8.16b, v16.16b
	eor	v29.16b, v30.16b, v8.16b
	and	v4.16b, v21.16b, v4.16b
	and	v8.16b, v28.16b, v8.16b
	and	v0.16b, v22.16b, v0.16b
	eor	v21.16b, v23.16b, v1.16b
	eor	v22.16b, v9.16b, v25.16b
	eor	v9.16b, v9.16b, v25.16b
	eor	v23.16b, v25.16b, v16.16b
	and	v3.16b, v29.16b, v3.16b
	and	v24.16b, v24.16b, v25.16b
	and	v25.16b, v27.16b, v25.16b
	and	v10.16b, v22.16b, v10.16b
	and	v9.16b, v9.16b, v18.16b
	eor	v18.16b, v19.16b, v23.16b
	and	v19.16b, v26.16b, v23.16b
	eor	v3.16b, v5.16b, v3.16b
	eor	v17.16b, v17.16b, v24.16b
	eor	v10.16b, v24.16b, v10.16b
	and	v16.16b, v31.16b, v16.16b
	eor	v20.16b, v20.16b, v25.16b
	eor	v9.16b, v25.16b, v9.16b
	eor	v4.16b, v2.16b, v4.16b
	and	v7.16b, v18.16b, v7.16b
	eor	v18.16b, v19.16b, v6.16b
	eor	v5.16b, v8.16b, v5.16b
	eor	v0.16b, v1.16b, v0.16b
	eor	v1.16b, v21.16b, v10.16b
	eor	v8.16b, v3.16b, v17.16b
	eor	v2.16b, v16.16b, v2.16b
	eor	v3.16b, v6.16b, v7.16b
	eor	v6.16b, v18.16b, v9.16b
	eor	v4.16b, v4.16b, v20.16b
	eor	v10.16b, v5.16b, v10.16b
	eor	v0.16b, v0.16b, v17.16b
	eor	v9.16b, v2.16b, v9.16b
	eor	v3.16b, v3.16b, v20.16b
	eor	v7.16b, v6.16b, v1.16b
	eor	v5.16b, v8.16b, v4.16b
	eor	v6.16b, v10.16b, v1.16b
	eor	v2.16b, v4.16b, v0.16b
	eor	v4.16b, v3.16b, v10.16b
	eor	v9.16b, v9.16b, v7.16b
	eor	v3.16b, v0.16b, v5.16b
	eor	v0.16b, v1.16b, v4.16b
	eor	v1.16b, v4.16b, v8.16b
	eor	v4.16b, v9.16b, v5.16b
	eor	v6.16b, v6.16b, v3.16b
	bcc	.Lenc_done
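// Forward MixColumns: as in the decrypt path, XORs of row rotations in the
// bitsliced domain, implemented with the ext #12/#8 pairs below.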
	ext	v8.16b, v0.16b, v0.16b, #12
	ext	v9.16b, v4.16b, v4.16b, #12
	ldr	q28, [x11]
	ext	v10.16b, v6.16b, v6.16b, #12
	ext	v16.16b, v1.16b, v1.16b, #12
	ext	v17.16b, v3.16b, v3.16b, #12
	ext	v18.16b, v7.16b, v7.16b, #12
	eor	v0.16b, v0.16b, v8.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v6.16b, v6.16b, v10.16b
	ext	v19.16b, v2.16b, v2.16b, #12
	ext	v20.16b, v5.16b, v5.16b, #12
	eor	v1.16b, v1.16b, v16.16b
	eor	v3.16b, v3.16b, v17.16b
	eor	v7.16b, v7.16b, v18.16b
	eor	v2.16b, v2.16b, v19.16b
	eor	v16.16b, v16.16b, v0.16b
	eor	v5.16b, v5.16b, v20.16b
	eor	v17.16b, v17.16b, v6.16b
	eor	v10.16b, v10.16b, v4.16b
	ext	v0.16b, v0.16b, v0.16b, #8
	eor	v9.16b, v9.16b, v1.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	eor	v8.16b, v8.16b, v5.16b
	eor	v16.16b, v16.16b, v5.16b
	eor	v18.16b, v18.16b, v3.16b
	eor	v19.16b, v19.16b, v7.16b
	ext	v3.16b, v3.16b, v3.16b, #8
	ext	v7.16b, v7.16b, v7.16b, #8
	eor	v20.16b, v20.16b, v2.16b
	ext	v6.16b, v6.16b, v6.16b, #8
	ext	v21.16b, v5.16b, v5.16b, #8
	eor	v17.16b, v17.16b, v5.16b
	ext	v2.16b, v2.16b, v2.16b, #8
	eor	v10.16b, v10.16b, v5.16b
	ext	v22.16b, v4.16b, v4.16b, #8
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v5.16b, v7.16b, v18.16b
	eor	v4.16b, v3.16b, v17.16b
	eor	v3.16b, v6.16b, v10.16b
	eor	v7.16b, v21.16b, v20.16b
	eor	v6.16b, v2.16b, v19.16b
	eor	v2.16b, v22.16b, v9.16b
	bne	.Lenc_loop
	ldr	q28, [x11, #16]!            // load from .LSRM0 on last round (x10 == 0)
	b	.Lenc_loop
.align	4
.Lenc_done:
	ushr	v8.2d, v0.2d, #1
	movi	v9.16b, #0x55
	ldr	q10, [x9]
	ushr	v16.2d, v3.2d, #1
	movi	v17.16b, #0x33
	ushr	v18.2d, v4.2d, #1
	movi	v19.16b, #0x0f
	eor	v8.16b, v8.16b, v1.16b
	ushr	v20.2d, v2.2d, #1
	eor	v16.16b, v16.16b, v7.16b
	eor	v18.16b, v18.16b, v6.16b
	and	v8.16b, v8.16b, v9.16b
	eor	v20.16b, v20.16b, v5.16b
	and	v16.16b, v16.16b, v9.16b
	and	v18.16b, v18.16b, v9.16b
	shl	v21.2d, v8.2d, #1
	eor	v1.16b, v1.16b, v8.16b
	and	v8.16b, v20.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	shl	v9.2d, v16.2d, #1
	eor	v6.16b, v6.16b, v18.16b
	shl	v16.2d, v18.2d, #1
	eor	v0.16b, v0.16b, v21.16b
	shl	v18.2d, v8.2d, #1
	eor	v5.16b, v5.16b, v8.16b
	eor	v3.16b, v3.16b, v9.16b
	eor	v4.16b, v4.16b, v16.16b
	ushr	v8.2d, v1.2d, #2
	eor	v2.16b, v2.16b, v18.16b
	ushr	v9.2d, v0.2d, #2
	ushr	v16.2d, v7.2d, #2
	ushr	v18.2d, v3.2d, #2
	eor	v8.16b, v8.16b, v6.16b
	eor	v9.16b, v9.16b, v4.16b
	eor	v16.16b, v16.16b, v5.16b
	eor	v18.16b, v18.16b, v2.16b
	and	v8.16b, v8.16b, v17.16b
	and	v9.16b, v9.16b, v17.16b
	and	v16.16b, v16.16b, v17.16b
	and	v17.16b, v18.16b, v17.16b
	eor	v6.16b, v6.16b, v8.16b
	shl	v8.2d, v8.2d, #2
	eor	v4.16b, v4.16b, v9.16b
	shl	v9.2d, v9.2d, #2
	eor	v5.16b, v5.16b, v16.16b
	shl	v16.2d, v16.2d, #2
	eor	v2.16b, v2.16b, v17.16b
	shl	v17.2d, v17.2d, #2
	eor	v1.16b, v1.16b, v8.16b
	eor	v0.16b, v0.16b, v9.16b
	eor	v7.16b, v7.16b, v16.16b
	eor	v3.16b, v3.16b, v17.16b
	ushr	v8.2d, v6.2d, #4
	ushr	v9.2d, v4.2d, #4
	ushr	v16.2d, v1.2d, #4
	ushr	v17.2d, v0.2d, #4
	eor	v8.16b, v8.16b, v5.16b
	eor	v9.16b, v9.16b, v2.16b
	eor	v16.16b, v16.16b, v7.16b
	eor	v17.16b, v17.16b, v3.16b
	and	v8.16b, v8.16b, v19.16b
	and	v9.16b, v9.16b, v19.16b
	and	v16.16b, v16.16b, v19.16b
	and	v17.16b, v17.16b, v19.16b
	eor	v5.16b, v5.16b, v8.16b
	shl	v8.2d, v8.2d, #4
	eor	v2.16b, v2.16b, v9.16b
	shl	v9.2d, v9.2d, #4
	eor	v7.16b, v7.16b, v16.16b
	shl	v16.2d, v16.2d, #4
	eor	v3.16b, v3.16b, v17.16b
	shl	v17.2d, v17.2d, #4
	eor	v6.16b, v6.16b, v8.16b
	eor	v4.16b, v4.16b, v9.16b
	eor	v7.16b, v7.16b, v10.16b
	eor	v1.16b, v1.16b, v16.16b
	eor	v3.16b, v3.16b, v10.16b
	eor	v0.16b, v0.16b, v17.16b
	eor	v6.16b, v6.16b, v10.16b
	eor	v4.16b, v4.16b, v10.16b
	eor	v2.16b, v2.16b, v10.16b
	eor	v5.16b, v5.16b, v10.16b
	eor	v1.16b, v1.16b, v10.16b
	eor	v0.16b, v0.16b, v10.16b
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8

.type	_bsaes_key_convert,%function
.align	4
// On entry:
//   x9 -> input key (big-endian)
//   x10 = number of rounds
//   x17 -> output key (native endianness)
// On exit:
//   x9, x10 corrupted
//   x11 -> .LM0_bigendian
//   x17 -> last quadword of output key
//   other general-purpose registers preserved
//   v2-v6 preserved
//   v7.16b[] = 0x63
//   v8-v14 preserved
//   v15 = last round key (converted to native endianness)
//   other SIMD registers corrupted
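// The loop below bit-slices each round key: cmtst against the eight
// single-bit masks expands bit b of every key byte into a full byte mask,
// yielding eight 16-byte vectors per round key. Conceptually (illustrative
// C, not part of this file):
//   for (b = 0; b < 8; b++)
//       for (i = 0; i < 16; i++)
//           out[b][i] = (key[i] >> b) & 1 ? 0xff : 0x00;
// The prior XOR with 0x63 folds the additive constant of the AES S-box
// affine transform into the stored schedule; callers compensate with the
// "fix up" XORs on the round 0 and last round keys.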
_bsaes_key_convert:
#ifdef __AARCH64EL__
	adrp	x11, .LM0_littleendian
	add	x11, x11, #:lo12:.LM0_littleendian
#else
	adrp	x11, .LM0_bigendian
	add	x11, x11, #:lo12:.LM0_bigendian
#endif
	ldr	q0, [x9], #16               // load round 0 key
	ldr	q1, [x11]                   // .LM0
	ldr	q15, [x9], #16              // load round 1 key

	movi	v7.16b, #0x63               // compose .L63
	movi	v16.16b, #0x01              // bit masks
	movi	v17.16b, #0x02
	movi	v18.16b, #0x04
	movi	v19.16b, #0x08
	movi	v20.16b, #0x10
	movi	v21.16b, #0x20
	movi	v22.16b, #0x40
	movi	v23.16b, #0x80

#ifdef __AARCH64EL__
	rev32	v0.16b, v0.16b
#endif
	sub	x10, x10, #1
	str	q0, [x17], #16              // save round 0 key

.align	4
.Lkey_loop:
	tbl	v0.16b, {v15.16b}, v1.16b
	ldr	q15, [x9], #16              // load next round key

	eor	v0.16b, v0.16b, v7.16b
	cmtst	v24.16b, v0.16b, v16.16b
	cmtst	v25.16b, v0.16b, v17.16b
	cmtst	v26.16b, v0.16b, v18.16b
	cmtst	v27.16b, v0.16b, v19.16b
	cmtst	v28.16b, v0.16b, v20.16b
	cmtst	v29.16b, v0.16b, v21.16b
	cmtst	v30.16b, v0.16b, v22.16b
	cmtst	v31.16b, v0.16b, v23.16b
	sub	x10, x10, #1
	st1	{v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key
	st1	{v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64
	cbnz	x10, .Lkey_loop

        // don't save last round key
#ifdef __AARCH64EL__
	rev32	v15.16b, v15.16b
	adrp	x11, .LM0_bigendian
	add	x11, x11, #:lo12:.LM0_bigendian
#endif
	ret
.size	_bsaes_key_convert,.-_bsaes_key_convert

.globl	ossl_bsaes_cbc_encrypt
.type	ossl_bsaes_cbc_encrypt,%function
.align	4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
//   x3 -> key
//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
//   w5 must be == 0
// On exit:
//   Output plaintext filled in
//   Initialisation vector overwritten with last quadword of ciphertext
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	cmp	x2, #128
	bhs	.Lcbc_do_bsaes
	b	AES_cbc_encrypt
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

	stp	x29, x30, [sp, #-48]!
	stp	d8, d9, [sp, #16]
	stp	d10, d15, [sp, #32]
	lsr	x2, x2, #4                  // len in 16 byte blocks

	ldr	w15, [x3, #240]             // get # of rounds
	mov	x14, sp

        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
	mov	x9, x3                      // pass key
	mov	x10, x15                    // pass # of rounds
	mov	sp, x17                     // sp is sp
	bl	_bsaes_key_convert
	ldr	q6,  [sp]
	str	q15, [x17]                  // save last round key
	eor	v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
	str	q6, [sp]

	ldr	q15, [x4]                   // load IV
	b	.Lcbc_dec_loop
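// Main loop: decrypt eight ciphertext blocks per iteration; each plaintext
// is the decrypted block XORed with the preceding ciphertext block, with
// v15 carrying the chaining value from the IV to the batch's last
// ciphertext block.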

.align	4
.Lcbc_dec_loop:
	subs	x2, x2, #0x8
	bmi	.Lcbc_dec_loop_finish

	ldr	q0, [x0], #16               // load input
	mov	x9, sp                      // pass the key
	ldr	q1, [x0], #16
	mov	x10, x15
	ldr	q2, [x0], #16
	ldr	q3, [x0], #16
	ldr	q4, [x0], #16
	ldr	q5, [x0], #16
	ldr	q6, [x0], #16
	ldr	q7, [x0], #-7*16

	bl	_bsaes_decrypt8

	ldr	q16, [x0], #16              // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	eor	v1.16b, v1.16b, v16.16b
	str	q0, [x1], #16               // write output
	ldr	q0, [x0], #16
	str	q1, [x1], #16
	ldr	q1, [x0], #16
	eor	v1.16b, v4.16b, v1.16b
	ldr	q4, [x0], #16
	eor	v2.16b, v2.16b, v4.16b
	eor	v0.16b, v6.16b, v0.16b
	ldr	q4, [x0], #16
	str	q0, [x1], #16
	str	q1, [x1], #16
	eor	v0.16b, v7.16b, v4.16b
	ldr	q1, [x0], #16
	str	q2, [x1], #16
	ldr	q2, [x0], #16
	ldr	q15, [x0], #16
	str	q0, [x1], #16
	eor	v0.16b, v5.16b, v2.16b
	eor	v1.16b, v3.16b, v1.16b
	str	q1, [x1], #16
	str	q0, [x1], #16

	b	.Lcbc_dec_loop

.Lcbc_dec_loop_finish:
	adds	x2, x2, #8
	beq	.Lcbc_dec_done

	ldr	q0, [x0], #16               // load input
	cmp	x2, #2
	blo	.Lcbc_dec_one
	ldr	q1, [x0], #16
	mov	x9, sp                      // pass the key
	mov	x10, x15
	beq	.Lcbc_dec_two
	ldr	q2, [x0], #16
	cmp	x2, #4
	blo	.Lcbc_dec_three
	ldr	q3, [x0], #16
	beq	.Lcbc_dec_four
	ldr	q4, [x0], #16
	cmp	x2, #6
	blo	.Lcbc_dec_five
	ldr	q5, [x0], #16
	beq	.Lcbc_dec_six
	ldr	q6, [x0], #-6*16

	bl	_bsaes_decrypt8

	ldr	q5, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q8, [x0], #16
	ldr	q9, [x0], #16
	ldr	q10, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q0, [x0], #16
	eor	v1.16b, v1.16b, v5.16b
	ldr	q5, [x0], #16
	eor	v6.16b, v6.16b, v8.16b
	ldr	q15, [x0]
	eor	v4.16b, v4.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	str	q1, [x1], #16
	eor	v0.16b, v7.16b, v0.16b
	str	q6, [x1], #16
	eor	v1.16b, v3.16b, v5.16b
	str	q4, [x1], #16
	str	q2, [x1], #16
	str	q0, [x1], #16
	str	q1, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_six:
	sub	x0, x0, #0x60
	bl	_bsaes_decrypt8
	ldr	q3, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q5, [x0], #16
	ldr	q8, [x0], #16
	ldr	q9, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q0, [x0], #16
	eor	v1.16b, v1.16b, v3.16b
	ldr	q15, [x0]
	eor	v3.16b, v6.16b, v5.16b
	eor	v4.16b, v4.16b, v8.16b
	eor	v2.16b, v2.16b, v9.16b
	str	q1, [x1], #16
	eor	v0.16b, v7.16b, v0.16b
	str	q3, [x1], #16
	str	q4, [x1], #16
	str	q2, [x1], #16
	str	q0, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_five:
	sub	x0, x0, #0x50
	bl	_bsaes_decrypt8
	ldr	q3, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q5, [x0], #16
	ldr	q7, [x0], #16
	ldr	q8, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q15, [x0]
	eor	v0.16b, v1.16b, v3.16b
	eor	v1.16b, v6.16b, v5.16b
	eor	v3.16b, v4.16b, v7.16b
	str	q0, [x1], #16
	eor	v0.16b, v2.16b, v8.16b
	str	q1, [x1], #16
	str	q3, [x1], #16
	str	q0, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_four:
	sub	x0, x0, #0x40
	bl	_bsaes_decrypt8
	ldr	q2, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q3, [x0], #16
	ldr	q5, [x0], #16
	str	q0, [x1], #16               // write output
	ldr	q15, [x0]
	eor	v0.16b, v1.16b, v2.16b
	eor	v1.16b, v6.16b, v3.16b
	eor	v2.16b, v4.16b, v5.16b
	str	q0, [x1], #16
	str	q1, [x1], #16
	str	q2, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_three:
	sub	x0, x0, #0x30
	bl	_bsaes_decrypt8
	ldr	q2, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q3, [x0], #16
	ldr	q15, [x0]
	str	q0, [x1], #16               // write output
	eor	v0.16b, v1.16b, v2.16b
	eor	v1.16b, v6.16b, v3.16b
	str	q0, [x1], #16
	str	q1, [x1]
	b	.Lcbc_dec_done
.align	4
.Lcbc_dec_two:
	sub	x0, x0, #0x20
	bl	_bsaes_decrypt8
	ldr	q2, [x0], #16               // reload input
	eor	v0.16b, v0.16b, v15.16b     // ^= IV
	ldr	q15, [x0]
	str	q0, [x1], #16               // write output
	eor	v0.16b, v1.16b, v2.16b
	str	q0, [x1]
	b	.Lcbc_dec_done
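// One block left: fall back to the scalar AES_decrypt rather than the
// bitsliced path, keeping the chaining value safe in v8 across the call.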
.align	4
.Lcbc_dec_one:
	sub	x0, x0, #0x10
	stp	x1, x4, [sp, #-32]!
	str	x14, [sp, #16]
	mov	v8.16b, v15.16b
	mov	v15.16b, v0.16b
	mov	x2, x3
	bl	AES_decrypt
	ldr	x14, [sp, #16]
	ldp	x1, x4, [sp], #32
	ldr	q0, [x1]                    // load result
	eor	v0.16b, v0.16b, v8.16b      // ^= IV
	str	q0, [x1]                    // write output

.align	4
.Lcbc_dec_done:
	movi	v0.16b, #0
	movi	v1.16b, #0
.Lcbc_dec_bzero:	//	wipe key schedule [if any]
	stp	q0, q1, [sp], #32
	cmp	sp, x14
	bne	.Lcbc_dec_bzero
	str	q15, [x4]                   // return IV
	ldp	d8, d9, [sp, #16]
	ldp	d10, d15, [sp, #32]
	ldp	x29, x30, [sp], #48
	ret
.size	ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl	ossl_bsaes_ctr32_encrypt_blocks
.type	ossl_bsaes_ctr32_encrypt_blocks,%function
.align	4
// On entry:
//   x0 -> input text (whole 16-byte blocks)
//   x1 -> output text (whole 16-byte blocks)
//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
//   x3 -> key
//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
// On exit:
//   Output text filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	cmp	x2, #8                      // use plain AES for
	blo	.Lctr_enc_short             // small sizes

	stp	x29, x30, [sp, #-80]!
	stp	d8, d9, [sp, #16]
	stp	d10, d11, [sp, #32]
	stp	d12, d13, [sp, #48]
	stp	d14, d15, [sp, #64]

	ldr	w15, [x3, #240]             // get # of rounds
	mov	x14, sp

        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
	mov	x9, x3                      // pass key
	mov	x10, x15                    // pass # of rounds
	mov	sp, x17                     // sp is sp
	bl	_bsaes_key_convert
	eor	v7.16b, v7.16b, v15.16b     // fix up last round key
	str	q7, [x17]                   // save last round key

	ldr	q0, [x4]                    // load counter
	add	x13, x11, #.LREVM0SR-.LM0_bigendian
	ldr	q4, [sp]                    // load round0 key

	movi	v8.4s, #1                   // compose 1<<96
	movi	v9.16b, #0
	rev32	v15.16b, v0.16b
	rev32	v0.16b, v0.16b
	ext	v11.16b, v9.16b, v8.16b, #4
	rev32	v4.16b, v4.16b
	add	v12.4s, v11.4s, v11.4s      // compose 2<<96
	str	q4, [sp]                    // save adjusted round0 key
	add	v13.4s, v11.4s, v12.4s      // compose 3<<96
	add	v14.4s, v12.4s, v12.4s      // compose 4<<96
	b	.Lctr_enc_loop
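// After rev32 the big-endian counter word sits in a host-order lane, so the
// vector adds of v11-v14 (1<<96 ... 4<<96) generate all eight per-block
// counters at once; conceptually block_ctr[i] = base_ctr + i (mod 2^32).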

.align	4
.Lctr_enc_loop:
        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
        // to flip byte order in 32-bit counter

	add	v1.4s, v15.4s, v11.4s       // +1
	add	x9, sp, #0x10               // pass next round key
	add	v2.4s, v15.4s, v12.4s       // +2
	ldr	q9, [x13]                   // .LREVM0SR
	ldr	q8, [sp]                    // load round0 key
	add	v3.4s, v15.4s, v13.4s       // +3
	mov	x10, x15                    // pass rounds
	sub	x11, x13, #.LREVM0SR-.LSR   // pass constants
	add	v6.4s, v2.4s, v14.4s
	add	v4.4s, v15.4s, v14.4s       // +4
	add	v7.4s, v3.4s, v14.4s
	add	v15.4s, v4.4s, v14.4s       // next counter
	add	v5.4s, v1.4s, v14.4s

	bl	_bsaes_encrypt8_alt

	subs	x2, x2, #8
	blo	.Lctr_enc_loop_done

	ldr	q16, [x0], #16
	ldr	q17, [x0], #16
	eor	v1.16b, v1.16b, v17.16b
	ldr	q17, [x0], #16
	eor	v0.16b, v0.16b, v16.16b
	eor	v4.16b, v4.16b, v17.16b
	str	q0, [x1], #16
	ldr	q16, [x0], #16
	str	q1, [x1], #16
	mov	v0.16b, v15.16b
	str	q4, [x1], #16
	ldr	q1, [x0], #16
	eor	v4.16b, v6.16b, v16.16b
	eor	v1.16b, v3.16b, v1.16b
	ldr	q3, [x0], #16
	eor	v3.16b, v7.16b, v3.16b
	ldr	q6, [x0], #16
	eor	v2.16b, v2.16b, v6.16b
	ldr	q6, [x0], #16
	eor	v5.16b, v5.16b, v6.16b
	str	q4, [x1], #16
	str	q1, [x1], #16
	str	q3, [x1], #16
	str	q2, [x1], #16
	str	q5, [x1], #16

	bne	.Lctr_enc_loop
	b	.Lctr_enc_done

.align	4
.Lctr_enc_loop_done:
	add	x2, x2, #8
	ldr	q16, [x0], #16              // load input
	eor	v0.16b, v0.16b, v16.16b
	str	q0, [x1], #16               // write output
	cmp	x2, #2
	blo	.Lctr_enc_done
	ldr	q17, [x0], #16
	eor	v1.16b, v1.16b, v17.16b
	str	q1, [x1], #16
	beq	.Lctr_enc_done
	ldr	q18, [x0], #16
	eor	v4.16b, v4.16b, v18.16b
	str	q4, [x1], #16
	cmp	x2, #4
	blo	.Lctr_enc_done
	ldr	q19, [x0], #16
	eor	v6.16b, v6.16b, v19.16b
	str	q6, [x1], #16
	beq	.Lctr_enc_done
	ldr	q20, [x0], #16
	eor	v3.16b, v3.16b, v20.16b
	str	q3, [x1], #16
	cmp	x2, #6
	blo	.Lctr_enc_done
	ldr	q21, [x0], #16
	eor	v7.16b, v7.16b, v21.16b
	str	q7, [x1], #16
	beq	.Lctr_enc_done
	ldr	q22, [x0]
	eor	v2.16b, v2.16b, v22.16b
	str	q2, [x1], #16

.Lctr_enc_done:
	movi	v0.16b, #0
	movi	v1.16b, #0
.Lctr_enc_bzero:	//	wipe key schedule [if any]
	stp	q0, q1, [sp], #32
	cmp	sp, x14
	bne	.Lctr_enc_bzero

	ldp	d8, d9, [sp, #16]
	ldp	d10, d11, [sp, #32]
	ldp	d12, d13, [sp, #48]
	ldp	d14, d15, [sp, #64]
	ldp	x29, x30, [sp], #80
	ret

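// Short path: fewer than eight blocks. Encrypt the counter one block at a
// time with the scalar AES_encrypt and XOR the result into the input.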
.Lctr_enc_short:
	stp	x29, x30, [sp, #-96]!
	stp	x19, x20, [sp, #16]
	stp	x21, x22, [sp, #32]
	str	x23, [sp, #48]

	mov	x19, x0                     // copy arguments
	mov	x20, x1
	mov	x21, x2
	mov	x22, x3
	ldr	w23, [x4, #12]              // load counter .LSW
	ldr	q1, [x4]                    // load whole counter value
#ifdef __AARCH64EL__
	rev	w23, w23
#endif
	str	q1, [sp, #80]               // copy counter value

.Lctr_enc_short_loop:
	add	x0, sp, #80                 // input counter value
	add	x1, sp, #64                 // output on the stack
	mov	x2, x22                     // key

	bl	AES_encrypt

	ldr	q0, [x19], #16              // load input
	ldr	q1, [sp, #64]               // load encrypted counter
	add	x23, x23, #1
#ifdef __AARCH64EL__
	rev	w0, w23
	str	w0, [sp, #80+12]            // next counter value
#else
	str	w23, [sp, #80+12]           // next counter value
#endif
	eor	v0.16b, v0.16b, v1.16b
	str	q0, [x20], #16              // store output
	subs	x21, x21, #1
	bne	.Lctr_enc_short_loop

	movi	v0.16b, #0
	movi	v1.16b, #0
	stp	q0, q1, [sp, #64]

	ldr	x23, [sp, #48]
	ldp	x21, x22, [sp, #32]
	ldp	x19, x20, [sp, #16]
	ldp	x29, x30, [sp], #96
	ret
.size	ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks

.globl	ossl_bsaes_xts_encrypt
.type	ossl_bsaes_xts_encrypt,%function
.align	4
// On entry:
//   x0 -> input plaintext
//   x1 -> output ciphertext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output ciphertext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_encrypt:
	AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_encrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
	stp	x29, x30, [sp, #-192]!
	stp	x19, x20, [sp, #80]
	stp	x21, x22, [sp, #96]
	str	x23, [sp, #112]
	stp	d8, d9, [sp, #128]
	stp	d10, d11, [sp, #144]
	stp	d12, d13, [sp, #160]
	stp	d14, d15, [sp, #176]

	mov	x19, sp
	mov	x20, x0
	mov	x21, x1
	mov	x22, x2
	mov	x23, x3

        // generate initial tweak
	sub	sp, sp, #16
	mov	x0, x5                      // iv[]
	mov	x1, sp
	mov	x2, x4                      // key2
	bl	AES_encrypt
	ldr	q11, [sp], #16

	ldr	w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
	mov	x9, x23                     // pass key
	mov	x10, x1                     // pass # of rounds
	mov	sp, x17
	bl	_bsaes_key_convert
	eor	v15.16b, v15.16b, v7.16b    // fix up last round key
	str	q15, [x17]                  // save last round key

	subs	x22, x22, #0x80
	blo	.Lxts_enc_short
	b	.Lxts_enc_loop
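// Each loop iteration derives tweaks 2..8 from the previous one by
// multiplying by x in GF(2^128) (the sshr/add/cmtst/ext/eor chains driven
// by the .Lxts_magic constants). One doubling, conceptually (illustrative
// C, 64-bit little-endian halves):
//   carry = hi >> 63;
//   hi    = (hi << 1) | (lo >> 63);
//   lo    = (lo << 1) ^ (0x87 & -carry);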

.align	4
.Lxts_enc_loop:
	ldr	q8, .Lxts_magic
	mov	x10, x1                     // pass rounds
	add	x2, x19, #16
	ldr	q0, [x20], #16
	sshr	v1.2d, v11.2d, #63
	mov	x9, sp                      // pass key schedule
	ldr	q6, .Lxts_magic+16
	add	v2.2d, v11.2d, v11.2d
	cmtst	v3.2d, v11.2d, v6.2d
	and	v1.16b, v1.16b, v8.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	ldr	q4, [x20], #16
	eor	v12.16b, v2.16b, v1.16b
	eor	v1.16b, v4.16b, v12.16b
	eor	v0.16b, v0.16b, v11.16b
	cmtst	v2.2d, v12.2d, v6.2d
	add	v4.2d, v12.2d, v12.2d
	add	x0, x19, #16
	ext	v3.16b, v3.16b, v3.16b, #8
	and	v2.16b, v2.16b, v8.16b
	eor	v13.16b, v4.16b, v3.16b
	ldr	q3, [x20], #16
	ext	v4.16b, v2.16b, v2.16b, #8
	eor	v2.16b, v3.16b, v13.16b
	ldr	q3, [x20], #16
	add	v5.2d, v13.2d, v13.2d
	cmtst	v7.2d, v13.2d, v6.2d
	and	v7.16b, v7.16b, v8.16b
	ldr	q9, [x20], #16
	ext	v7.16b, v7.16b, v7.16b, #8
	ldr	q10, [x20], #16
	eor	v14.16b, v5.16b, v4.16b
	ldr	q16, [x20], #16
	add	v4.2d, v14.2d, v14.2d
	eor	v3.16b, v3.16b, v14.16b
	eor	v15.16b, v4.16b, v7.16b
	add	v5.2d, v15.2d, v15.2d
	ldr	q7, [x20], #16
	cmtst	v4.2d, v14.2d, v6.2d
	and	v17.16b, v4.16b, v8.16b
	cmtst	v18.2d, v15.2d, v6.2d
	eor	v4.16b, v9.16b, v15.16b
	ext	v9.16b, v17.16b, v17.16b, #8
	eor	v9.16b, v5.16b, v9.16b
	add	v17.2d, v9.2d, v9.2d
	and	v18.16b, v18.16b, v8.16b
	eor	v5.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	ext	v10.16b, v18.16b, v18.16b, #8
	cmtst	v9.2d, v9.2d, v6.2d
	and	v9.16b, v9.16b, v8.16b
	eor	v10.16b, v17.16b, v10.16b
	cmtst	v17.2d, v10.2d, v6.2d
	eor	v6.16b, v16.16b, v10.16b
	str	q10, [x2], #16
	ext	v9.16b, v9.16b, v9.16b, #8
	add	v10.2d, v10.2d, v10.2d
	eor	v9.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	eor	v7.16b, v7.16b, v9.16b
	add	v9.2d, v9.2d, v9.2d
	and	v8.16b, v17.16b, v8.16b
	ext	v8.16b, v8.16b, v8.16b, #8
	eor	v8.16b, v9.16b, v8.16b
	str	q8, [x2]                    // next round tweak

	bl	_bsaes_encrypt8

	ldr	q8, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q9, [x0], #16
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	ldr	q10, [x0], #16
	eor	v3.16b, v3.16b, v15.16b
	subs	x22, x22, #0x80
	str	q0, [x21], #16
	ldr	q11, [x0]                   // next round tweak
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v8.16b
	eor	v1.16b, v2.16b, v9.16b
	str	q4, [x21], #16
	eor	v2.16b, v5.16b, v10.16b
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q2, [x21], #16
	bpl	.Lxts_enc_loop

.Lxts_enc_short:
	adds	x22, x22, #0x70
	bmi	.Lxts_enc_done

	ldr	q8, .Lxts_magic
	sshr	v1.2d, v11.2d, #63
	add	v2.2d, v11.2d, v11.2d
	ldr	q9, .Lxts_magic+16
	subs	x22, x22, #0x10
	ldr	q0, [x20], #16
	and	v1.16b, v1.16b, v8.16b
	cmtst	v3.2d, v11.2d, v9.2d
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	eor	v12.16b, v2.16b, v1.16b
	ext	v1.16b, v3.16b, v3.16b, #8
	add	v2.2d, v12.2d, v12.2d
	cmtst	v3.2d, v12.2d, v9.2d
	eor	v13.16b, v2.16b, v1.16b
	and	v22.16b, v3.16b, v8.16b
	bmi	.Lxts_enc_1

	ext	v2.16b, v22.16b, v22.16b, #8
	add	v3.2d, v13.2d, v13.2d
	ldr	q1, [x20], #16
	cmtst	v4.2d, v13.2d, v9.2d
	subs	x22, x22, #0x10
	eor	v14.16b, v3.16b, v2.16b
	and	v23.16b, v4.16b, v8.16b
	bmi	.Lxts_enc_2

	ext	v3.16b, v23.16b, v23.16b, #8
	add	v4.2d, v14.2d, v14.2d
	ldr	q2, [x20], #16
	cmtst	v5.2d, v14.2d, v9.2d
	eor	v0.16b, v0.16b, v11.16b
	subs	x22, x22, #0x10
	eor	v15.16b, v4.16b, v3.16b
	and	v24.16b, v5.16b, v8.16b
	bmi	.Lxts_enc_3

	ext	v4.16b, v24.16b, v24.16b, #8
	add	v5.2d, v15.2d, v15.2d
	ldr	q3, [x20], #16
	cmtst	v6.2d, v15.2d, v9.2d
	eor	v1.16b, v1.16b, v12.16b
	subs	x22, x22, #0x10
	eor	v16.16b, v5.16b, v4.16b
	and	v25.16b, v6.16b, v8.16b
	bmi	.Lxts_enc_4

	ext	v5.16b, v25.16b, v25.16b, #8
	add	v6.2d, v16.2d, v16.2d
	add	x0, x19, #16
	cmtst	v7.2d, v16.2d, v9.2d
	ldr	q4, [x20], #16
	eor	v2.16b, v2.16b, v13.16b
	str	q16, [x0], #16
	subs	x22, x22, #0x10
	eor	v17.16b, v6.16b, v5.16b
	and	v26.16b, v7.16b, v8.16b
	bmi	.Lxts_enc_5

	ext	v7.16b, v26.16b, v26.16b, #8
	add	v18.2d, v17.2d, v17.2d
	ldr	q5, [x20], #16
	eor	v3.16b, v3.16b, v14.16b
	str	q17, [x0], #16
	subs	x22, x22, #0x10
	eor	v18.16b, v18.16b, v7.16b
	bmi	.Lxts_enc_6

	ldr	q6, [x20], #16
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	str	q18, [x0]                   // next round tweak
	mov	x9, sp                      // pass key schedule
	mov	x10, x1
	add	x0, x19, #16
	sub	x22, x22, #0x10
	eor	v6.16b, v6.16b, v17.16b

	bl	_bsaes_encrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q17, [x0], #16
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v3.16b, v3.16b, v15.16b
	ldr	q11, [x0]                   // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	eor	v1.16b, v2.16b, v17.16b
	str	q4, [x21], #16
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_enc_done

.align	4
.Lxts_enc_6:
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_encrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	ldr	q11, [x0]                   // next round tweak
	eor	v3.16b, v3.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	str	q4, [x21], #16
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q0, [x21], #16
	b	.Lxts_enc_done

.align	4
.Lxts_enc_5:
	eor	v3.16b, v3.16b, v14.16b
	eor	v4.16b, v4.16b, v15.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_encrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q11, [x0]                   // next round tweak
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	eor	v3.16b, v3.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q4, [x21], #16
	str	q6, [x21], #16
	str	q3, [x21], #16
	b	.Lxts_enc_done

.align	4
.Lxts_enc_4:
	eor	v2.16b, v2.16b, v13.16b
	eor	v3.16b, v3.16b, v14.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_encrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v13.16b
	eor	v6.16b, v6.16b, v14.16b
	mov	v11.16b, v15.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q4, [x21], #16
	str	q6, [x21], #16
	b	.Lxts_enc_done

.align	4
.Lxts_enc_3:
	eor	v1.16b, v1.16b, v12.16b
	eor	v2.16b, v2.16b, v13.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_encrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v4.16b, v4.16b, v13.16b
	mov	v11.16b, v14.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q4, [x21], #16
	b	.Lxts_enc_done

.align	4
.Lxts_enc_2:
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_encrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	v11.16b, v13.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_enc_done

.align	4
.Lxts_enc_1:
	eor	v0.16b, v0.16b, v11.16b
	sub	x0, sp, #16
	sub	x1, sp, #16
	mov	x2, x23
	mov	v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]
	str	q0, [sp, #-16]!

	bl	AES_encrypt

	ldr	q0, [sp], #16
	trn1	v13.2d, v11.2d, v13.2d
	trn1	v11.2d, v12.2d, v14.2d      // next round tweak
	eor	v0.16b, v0.16b, v13.16b
	str	q0, [x21], #16

.Lxts_enc_done:
	adds	x22, x22, #0x10
	beq	.Lxts_enc_ret

	sub	x6, x21, #0x10
        // Penultimate plaintext block produces final ciphertext part-block
        // plus remaining part of final plaintext block. Move ciphertext part
        // to final position and reuse penultimate ciphertext block buffer to
        // construct final plaintext block
.Lxts_enc_steal:
	ldrb	w0, [x20], #1
	ldrb	w1, [x21, #-0x10]
	strb	w0, [x21, #-0x10]
	strb	w1, [x21], #1

	subs	x22, x22, #1
	bhi	.Lxts_enc_steal

        // Finally encrypt the penultimate ciphertext block using the
        // last tweak
	ldr	q0, [x6]
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	x21, x6
	mov	v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers

	bl	AES_encrypt

	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [x21]

.Lxts_enc_ret:

	movi	v0.16b, #0
	movi	v1.16b, #0
.Lxts_enc_bzero:	//	wipe key schedule
	stp	q0, q1, [sp], #32
	cmp	sp, x19
	bne	.Lxts_enc_bzero

	ldp	x19, x20, [sp, #80]
	ldp	x21, x22, [sp, #96]
	ldr	x23, [sp, #112]
	ldp	d8, d9, [sp, #128]
	ldp	d10, d11, [sp, #144]
	ldp	d12, d13, [sp, #160]
	ldp	d14, d15, [sp, #176]
	ldp	x29, x30, [sp], #192
	ret
.size	ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
1880
1881// The assembler doesn't seem capable of de-duplicating these when expressed
1882// using `ldr qd,=` syntax, so assign a symbolic address
1883.align	5
1884.Lxts_magic:
1885.quad	1, 0x87, 0x4000000000000000, 0x4000000000000000
1886
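// These constants drive the per-block tweak update: multiplication by x in
// GF(2^128) modulo x^128 + x^7 + x^2 + x + 1, i.e. a 1-bit left shift with a
// conditional reduction by 0x87. The 0x4000000000000000 words let cmtst
// pre-compute the following carry from bit 62. A scalar C sketch of one
// update on a little-endian 128-bit tweak:
//
//     void xts_mul_x(uint64_t t[2])
//     {
//         uint64_t c = (uint64_t)((int64_t)t[1] >> 63) & 0x87;
//         t[1] = (t[1] << 1) | (t[0] >> 63);   // cross-lane carry
//         t[0] = (t[0] << 1) ^ c;              // fold carry into low byte
//     }
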
.globl	ossl_bsaes_xts_decrypt
.type	ossl_bsaes_xts_decrypt,%function
.align	4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
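//
// As a C prototype, the contract above corresponds to something of this
// shape (an assumed declaration, mirroring the other bit-sliced entry
// points; the authoritative one lives in the OpenSSL headers):
//
//   void ossl_bsaes_xts_decrypt(const unsigned char *in, unsigned char *out,
//                               size_t len, const AES_KEY *key1,
//                               const AES_KEY *key2,
//                               const unsigned char iv[16]);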
ossl_bsaes_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_decrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose
        //                   registers (one 8-byte slot is padding)
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
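        // (Total fixed frame: 16 + 4*16 + 6*8 + 8*8 = 192 bytes, matching
        // the pre-indexed store below.)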
	stp	x29, x30, [sp, #-192]!
	stp	x19, x20, [sp, #80]
	stp	x21, x22, [sp, #96]
	str	x23, [sp, #112]
	stp	d8, d9, [sp, #128]
	stp	d10, d11, [sp, #144]
	stp	d12, d13, [sp, #160]
	stp	d14, d15, [sp, #176]

	mov	x19, sp
	mov	x20, x0
	mov	x21, x1
	mov	x22, x2
	mov	x23, x3

        // generate initial tweak
	sub	sp, sp, #16
	mov	x0, x5                      // iv[]
	mov	x1, sp
	mov	x2, x4                      // key2
	bl	AES_encrypt
	ldr	q11, [sp], #16
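        // In C terms, the call above is AES_encrypt(iv, tweak, key2): per
        // the XTS specification, the initial tweak (now in q11) is the ECB
        // encryption of the 16-byte IV under key2.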

	ldr	w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
	add	x17, sp, #96
	sub	x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
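        // E.g. AES-128 (10 rounds): 10*128 - 96 = 1184 bytes, i.e. 9 inner
        // round keys of 128 bytes each in bit-sliced form, plus 16 bytes
        // each for the round 0 and final round keys.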

        // populate the key schedule
	mov	x9, x23                     // pass key
	mov	x10, x1                     // pass # of rounds
	mov	sp, x17
	bl	_bsaes_key_convert
	ldr	q6,  [sp]
	str	q15, [x17]                  // save last round key
	eor	v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
	str	q6, [sp]
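        // 0x63 is the additive constant of the AES S-box affine transform;
        // folding it into the round 0 key here lets the bit-sliced S-box
        // omit it.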

	sub	x30, x22, #0x10
	tst	x22, #0xf                   // if not multiple of 16
	csel	x22, x30, x22, ne           // subtract another 16 bytes
	subs	x22, x22, #0x80

	blo	.Lxts_dec_short
	b	.Lxts_dec_loop
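        // The length bookkeeping above, in C terms (sketch; len lives in
        // x22):
        //
        //     if (len & 15) len -= 16;  // hold back a full block for stealing
        //     len -= 128;               // loop does 8 blocks while len >= 0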

.align	4
.Lxts_dec_loop:
	ldr	q8, .Lxts_magic
	mov	x10, x1                     // pass rounds
	add	x2, x19, #16
	ldr	q0, [x20], #16
	sshr	v1.2d, v11.2d, #63
	mov	x9, sp                      // pass key schedule
	ldr	q6, .Lxts_magic+16
	add	v2.2d, v11.2d, v11.2d
	cmtst	v3.2d, v11.2d, v6.2d
	and	v1.16b, v1.16b, v8.16b
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	ldr	q4, [x20], #16
	eor	v12.16b, v2.16b, v1.16b
	eor	v1.16b, v4.16b, v12.16b
	eor	v0.16b, v0.16b, v11.16b
	cmtst	v2.2d, v12.2d, v6.2d
	add	v4.2d, v12.2d, v12.2d
	add	x0, x19, #16
	ext	v3.16b, v3.16b, v3.16b, #8
	and	v2.16b, v2.16b, v8.16b
	eor	v13.16b, v4.16b, v3.16b
	ldr	q3, [x20], #16
	ext	v4.16b, v2.16b, v2.16b, #8
	eor	v2.16b, v3.16b, v13.16b
	ldr	q3, [x20], #16
	add	v5.2d, v13.2d, v13.2d
	cmtst	v7.2d, v13.2d, v6.2d
	and	v7.16b, v7.16b, v8.16b
	ldr	q9, [x20], #16
	ext	v7.16b, v7.16b, v7.16b, #8
	ldr	q10, [x20], #16
	eor	v14.16b, v5.16b, v4.16b
	ldr	q16, [x20], #16
	add	v4.2d, v14.2d, v14.2d
	eor	v3.16b, v3.16b, v14.16b
	eor	v15.16b, v4.16b, v7.16b
	add	v5.2d, v15.2d, v15.2d
	ldr	q7, [x20], #16
	cmtst	v4.2d, v14.2d, v6.2d
	and	v17.16b, v4.16b, v8.16b
	cmtst	v18.2d, v15.2d, v6.2d
	eor	v4.16b, v9.16b, v15.16b
	ext	v9.16b, v17.16b, v17.16b, #8
	eor	v9.16b, v5.16b, v9.16b
	add	v17.2d, v9.2d, v9.2d
	and	v18.16b, v18.16b, v8.16b
	eor	v5.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	ext	v10.16b, v18.16b, v18.16b, #8
	cmtst	v9.2d, v9.2d, v6.2d
	and	v9.16b, v9.16b, v8.16b
	eor	v10.16b, v17.16b, v10.16b
	cmtst	v17.2d, v10.2d, v6.2d
	eor	v6.16b, v16.16b, v10.16b
	str	q10, [x2], #16
	ext	v9.16b, v9.16b, v9.16b, #8
	add	v10.2d, v10.2d, v10.2d
	eor	v9.16b, v10.16b, v9.16b
	str	q9, [x2], #16
	eor	v7.16b, v7.16b, v9.16b
	add	v9.2d, v9.2d, v9.2d
	and	v8.16b, v17.16b, v8.16b
	ext	v8.16b, v8.16b, v8.16b, #8
	eor	v8.16b, v9.16b, v8.16b
	str	q8, [x2]                    // next round tweak

	bl	_bsaes_decrypt8

	eor	v6.16b, v6.16b, v13.16b
	eor	v0.16b, v0.16b, v11.16b
	ldr	q8, [x0], #16
	eor	v7.16b, v7.16b, v8.16b
	str	q0, [x21], #16
	eor	v0.16b, v1.16b, v12.16b
	ldr	q1, [x0], #16
	eor	v1.16b, v3.16b, v1.16b
	subs	x22, x22, #0x80
	eor	v2.16b, v2.16b, v15.16b
	eor	v3.16b, v4.16b, v14.16b
	ldr	q4, [x0], #16
	str	q0, [x21], #16
	ldr	q11, [x0]                   // next round tweak
	eor	v0.16b, v5.16b, v4.16b
	str	q6, [x21], #16
	str	q3, [x21], #16
	str	q2, [x21], #16
	str	q7, [x21], #16
	str	q1, [x21], #16
	str	q0, [x21], #16
	bpl	.Lxts_dec_loop
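        // One iteration of the loop above, in C terms (a minimal sketch;
        // tweak[0..7] are eight successive multiplications of q11 by x, the
        // later ones spilled to the tweak storage at x19+16):
        //
        //     for (i = 0; i < 8; i++) blk[i] ^= tweak[i];  // pre-whiten
        //     bsaes_decrypt8(blk);                         // 8 blocks at once
        //     for (i = 0; i < 8; i++) out[i] = blk[i] ^ tweak[i];
        //
        // (the bit-sliced core returns the blocks in a permuted register
        // order, hence the shuffled XORs and stores above).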

.Lxts_dec_short:
	adds	x22, x22, #0x70
	bmi	.Lxts_dec_done

	ldr	q8, .Lxts_magic
	sshr	v1.2d, v11.2d, #63
	add	v2.2d, v11.2d, v11.2d
	ldr	q9, .Lxts_magic+16
	subs	x22, x22, #0x10
	ldr	q0, [x20], #16
	and	v1.16b, v1.16b, v8.16b
	cmtst	v3.2d, v11.2d, v9.2d
	ext	v1.16b, v1.16b, v1.16b, #8
	and	v3.16b, v3.16b, v8.16b
	eor	v12.16b, v2.16b, v1.16b
	ext	v1.16b, v3.16b, v3.16b, #8
	add	v2.2d, v12.2d, v12.2d
	cmtst	v3.2d, v12.2d, v9.2d
	eor	v13.16b, v2.16b, v1.16b
	and	v22.16b, v3.16b, v8.16b
	bmi	.Lxts_dec_1

	ext	v2.16b, v22.16b, v22.16b, #8
	add	v3.2d, v13.2d, v13.2d
	ldr	q1, [x20], #16
	cmtst	v4.2d, v13.2d, v9.2d
	subs	x22, x22, #0x10
	eor	v14.16b, v3.16b, v2.16b
	and	v23.16b, v4.16b, v8.16b
	bmi	.Lxts_dec_2

	ext	v3.16b, v23.16b, v23.16b, #8
	add	v4.2d, v14.2d, v14.2d
	ldr	q2, [x20], #16
	cmtst	v5.2d, v14.2d, v9.2d
	eor	v0.16b, v0.16b, v11.16b
	subs	x22, x22, #0x10
	eor	v15.16b, v4.16b, v3.16b
	and	v24.16b, v5.16b, v8.16b
	bmi	.Lxts_dec_3

	ext	v4.16b, v24.16b, v24.16b, #8
	add	v5.2d, v15.2d, v15.2d
	ldr	q3, [x20], #16
	cmtst	v6.2d, v15.2d, v9.2d
	eor	v1.16b, v1.16b, v12.16b
	subs	x22, x22, #0x10
	eor	v16.16b, v5.16b, v4.16b
	and	v25.16b, v6.16b, v8.16b
	bmi	.Lxts_dec_4

	ext	v5.16b, v25.16b, v25.16b, #8
	add	v6.2d, v16.2d, v16.2d
	add	x0, x19, #16
	cmtst	v7.2d, v16.2d, v9.2d
	ldr	q4, [x20], #16
	eor	v2.16b, v2.16b, v13.16b
	str	q16, [x0], #16
	subs	x22, x22, #0x10
	eor	v17.16b, v6.16b, v5.16b
	and	v26.16b, v7.16b, v8.16b
	bmi	.Lxts_dec_5

	ext	v7.16b, v26.16b, v26.16b, #8
	add	v18.2d, v17.2d, v17.2d
	ldr	q5, [x20], #16
	eor	v3.16b, v3.16b, v14.16b
	str	q17, [x0], #16
	subs	x22, x22, #0x10
	eor	v18.16b, v18.16b, v7.16b
	bmi	.Lxts_dec_6

	ldr	q6, [x20], #16
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	str	q18, [x0]                   // next round tweak
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16
	sub	x22, x22, #0x10
	eor	v6.16b, v6.16b, v17.16b

	bl	_bsaes_decrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q17, [x0], #16
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	eor	v2.16b, v2.16b, v15.16b
	ldr	q11, [x0]                   // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	eor	v1.16b, v3.16b, v17.16b
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_6:
	eor	v4.16b, v4.16b, v15.16b
	eor	v5.16b, v5.16b, v16.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	ldr	q16, [x0], #16
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	ldr	q11, [x0]                   // next round tweak
	eor	v2.16b, v2.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	eor	v0.16b, v7.16b, v16.16b
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	str	q0, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_5:
	eor	v3.16b, v3.16b, v14.16b
	eor	v4.16b, v4.16b, v15.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	ldr	q11, [x0]                   // next round tweak
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	eor	v2.16b, v2.16b, v15.16b
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	str	q4, [x21], #16
	str	q2, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_4:
	eor	v2.16b, v2.16b, v13.16b
	eor	v3.16b, v3.16b, v14.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	eor	v4.16b, v4.16b, v14.16b
	mov	v11.16b, v15.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	str	q4, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_3:
	eor	v1.16b, v1.16b, v12.16b
	eor	v2.16b, v2.16b, v13.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	eor	v6.16b, v6.16b, v13.16b
	mov	v11.16b, v14.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	str	q6, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_2:
	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	x9, sp                      // pass key schedule
	mov	x10, x1                     // pass rounds
	add	x0, x19, #16

	bl	_bsaes_decrypt8

	eor	v0.16b, v0.16b, v11.16b
	eor	v1.16b, v1.16b, v12.16b
	mov	v11.16b, v13.16b            // next round tweak
	str	q0, [x21], #16
	str	q1, [x21], #16
	b	.Lxts_dec_done

.align	4
.Lxts_dec_1:
	eor	v0.16b, v0.16b, v11.16b
	sub	x0, sp, #16
	sub	x1, sp, #16
	mov	x2, x23
	mov	v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]
	str	q0, [sp, #-16]!

	bl	AES_decrypt

	ldr	q0, [sp], #16
	trn1	v13.2d, v11.2d, v13.2d
	trn1	v11.2d, v12.2d, v14.2d      // next round tweak
	eor	v0.16b, v0.16b, v13.16b
	str	q0, [x21], #16

.Lxts_dec_done:
	adds	x22, x22, #0x10
	beq	.Lxts_dec_ret

        // calculate one round of extra tweak for the stolen ciphertext
	ldr	q8, .Lxts_magic
	sshr	v6.2d, v11.2d, #63
	and	v6.16b, v6.16b, v8.16b
	add	v12.2d, v11.2d, v11.2d
	ext	v6.16b, v6.16b, v6.16b, #8
	eor	v12.16b, v12.16b, v6.16b
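        // With ciphertext stealing, decryption consumes tweaks out of order:
        // the last full ciphertext block is decrypted with this extra tweak
        // (v12), after which the block reconstructed by the steal loop is
        // decrypted with the previous tweak still held in v11.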

        // perform the final decryption with the last tweak value
	ldr	q0, [x20], #16
	eor	v0.16b, v0.16b, v12.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov	v14.d[0], v12.d[1]

	bl	AES_decrypt

	trn1	v12.2d, v12.2d, v14.2d
	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v12.16b
	str	q0, [x21]

	mov	x6, x21
        // The decrypted penultimate ciphertext block supplies both the final
        // plaintext part-block and the tail of the final ciphertext block.
        // Move the plaintext part to its final position and reuse the
        // penultimate plaintext block buffer to construct the final
        // ciphertext block.
.Lxts_dec_steal:
	ldrb	w1, [x21]
	ldrb	w0, [x20], #1
	strb	w1, [x21, #0x10]
	strb	w0, [x21], #1

	subs	x22, x22, #1
	bhi	.Lxts_dec_steal
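        // Mirror image of .Lxts_enc_steal: the plaintext bytes just produced
        // move forward one block to their final position, while the leftover
        // ciphertext bytes take their place to reconstruct the final full
        // ciphertext block.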

        // Finally decrypt the reconstructed final ciphertext block (sitting
        // in the penultimate plaintext block buffer) using the penultimate
        // tweak
	ldr	q0, [x6]
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [sp, #-16]!
	mov	x0, sp
	mov	x1, sp
	mov	x2, x23
	mov	x21, x6

	bl	AES_decrypt

	trn1	v11.2d, v11.2d, v13.2d
	ldr	q0, [sp], #16
	eor	v0.16b, v0.16b, v11.16b
	str	q0, [x21]

.Lxts_dec_ret:

	movi	v0.16b, #0
	movi	v1.16b, #0
.Lxts_dec_bzero:	// wipe key schedule
	stp	q0, q1, [sp], #32
	cmp	sp, x19
	bne	.Lxts_dec_bzero

	ldp	x19, x20, [sp, #80]
	ldp	x21, x22, [sp, #96]
	ldr	x23, [sp, #112]
	ldp	d8, d9, [sp, #128]
	ldp	d10, d11, [sp, #144]
	ldp	d12, d13, [sp, #160]
	ldp	d14, d15, [sp, #176]
	ldp	x29, x30, [sp], #192
	ret
.size	ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
