xref: /freebsd/crypto/openssl/crypto/aes/asm/bsaes-armv8.pl (revision e7be843b4a162e68651d3911f0357ed464915629)
1#!/usr/bin/env perl
2# Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
use strict;
use warnings;

# Command-line convention shared by the OpenSSL perlasm scripts:
# an optional "flavour" (e.g. linux64) comes first, an optional output
# file name (recognised by its extension) comes last.  Default both to
# "" so the interpolation below stays warning-clean when they are absent.
my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : "";
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "";
my $xlate;

# Locate the arm-xlate.pl pre-processor either next to this script or
# in the perlasm directory of an OpenSSL source tree.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Pipe our output through arm-xlate.pl, which emits the final assembly,
# and alias STDOUT to the pipe so a plain print() goes there.  The open
# is checked: a silent failure here would produce an empty .S file.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

# The entire assembly source lives after __END__; just copy it through.
my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush
27
# Slurp the whole __END__/__DATA__ section of this file (the hand-written
# AArch64 assembly) and return it as a single string.
sub data
{
    local $/ = undef;       # slurp mode: read DATA in one go
    my $asm = <DATA>;
    return $asm;
}
33
34__END__
35// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
36//
37// Licensed under the OpenSSL license (the "License").  You may not use
38// this file except in compliance with the License.  You can obtain a copy
39// in the file LICENSE in the source distribution or at
40// https://www.openssl.org/source/license.html
41//
42// ====================================================================
43// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
44// project. Rights for redistribution and usage in source and binary
45// forms are granted according to the OpenSSL license.
46// ====================================================================
47//
48// This implementation is a translation of bsaes-armv7 for AArch64.
49// No attempt has been made to carry across the build switches for
50// kernel targets, since the Linux kernel crypto support has moved on
51// from when it was based on OpenSSL.
52
// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did; since there is little to be gained by wrapping
// it up in Perl, it is presented as pure assembly.
57
58
59#include "crypto/arm_arch.h"
60
61.text
62
63.extern AES_cbc_encrypt
64.extern AES_encrypt
65.extern AES_decrypt
66
.type   _bsaes_decrypt8,%function
.align  4
// Bit-sliced AES decryption of eight 16-byte blocks in parallel.
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_decrypt8:
        ldr     q8, [x9], #16
        adrp    x11, .LM0ISR
        add     x11, x11, #:lo12:.LM0ISR
        movi    v9.16b, #0x55
        ldr     q10, [x11], #16
        movi    v16.16b, #0x33
        movi    v17.16b, #0x0f
        sub     x10, x10, #1
// Round 0: XOR the first round key (q8) into all eight blocks, then
// permute each block's bytes through the .LM0ISR table (tbl via v10).
// x11 is left pointing at .LISR; .LISRM0 follows contiguously.
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v10.16b
        tbl     v1.16b, {v1.16b}, v10.16b
        tbl     v2.16b, {v2.16b}, v10.16b
        tbl     v4.16b, {v4.16b}, v10.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v10.16b
        tbl     v5.16b, {v5.16b}, v10.16b
        tbl     v6.16b, {v6.16b}, v10.16b
// Bit-slice transform: three "swapmove" passes exchange bit planes
// between register pairs at distances 1, 2 and 4, using the masks
// 0x55, 0x33 and 0x0f loaded above.
        ushr    v8.2d, v0.2d, #1
        tbl     v7.16b, {v7.16b}, v10.16b
        ushr    v10.2d, v4.2d, #1
        ushr    v18.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        ushr    v19.2d, v6.2d, #1
        eor     v10.16b, v10.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v10.16b, v10.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v9.16b, v19.16b, v9.16b
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #1
        eor     v3.16b, v3.16b, v18.16b
        shl     v18.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v9.2d, #1
        eor     v7.16b, v7.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v18.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v18.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v10.16b, v10.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Ldec_sbox
.align  4
.Ldec_loop:
// One full round: XOR in the next 128-byte round key (eight q regs)
// and route bytes through the permutation held in v28 (.LISR for
// middle rounds, .LISRM0 for the last iteration - see below).
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
// Bit-sliced inverse S-box evaluated on all eight (interleaved) states
// at once; the interleaved subs also decrements the round counter so
// the bcc below can decide whether more rounds remain.
        eor     v1.16b, v1.16b, v4.16b
        eor     v3.16b, v3.16b, v4.16b
        subs    x10, x10, #1
        eor     v4.16b, v4.16b, v7.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v1.16b, v6.16b
        eor     v6.16b, v6.16b, v4.16b
        eor     v2.16b, v2.16b, v5.16b
        eor     v0.16b, v0.16b, v1.16b
        eor     v7.16b, v7.16b, v6.16b
        eor     v8.16b, v6.16b, v2.16b
        and     v9.16b, v4.16b, v6.16b
        eor     v10.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v16.16b, v7.16b, v4.16b
        eor     v17.16b, v4.16b, v0.16b
        and     v18.16b, v0.16b, v2.16b
        eor     v19.16b, v7.16b, v4.16b
        eor     v1.16b, v1.16b, v3.16b
        eor     v20.16b, v3.16b, v0.16b
        eor     v21.16b, v5.16b, v2.16b
        eor     v22.16b, v3.16b, v7.16b
        and     v8.16b, v17.16b, v8.16b
        orr     v17.16b, v3.16b, v5.16b
        eor     v23.16b, v1.16b, v6.16b
        eor     v24.16b, v20.16b, v16.16b
        eor     v25.16b, v1.16b, v5.16b
        orr     v26.16b, v20.16b, v21.16b
        and     v20.16b, v20.16b, v21.16b
        and     v27.16b, v7.16b, v1.16b
        eor     v21.16b, v21.16b, v23.16b
        orr     v28.16b, v16.16b, v23.16b
        orr     v29.16b, v22.16b, v25.16b
        eor     v26.16b, v26.16b, v8.16b
        and     v16.16b, v16.16b, v23.16b
        and     v22.16b, v22.16b, v25.16b
        and     v21.16b, v24.16b, v21.16b
        eor     v8.16b, v28.16b, v8.16b
        eor     v23.16b, v5.16b, v2.16b
        eor     v24.16b, v1.16b, v6.16b
        eor     v16.16b, v16.16b, v22.16b
        eor     v22.16b, v3.16b, v0.16b
        eor     v25.16b, v29.16b, v21.16b
        eor     v21.16b, v26.16b, v21.16b
        eor     v8.16b, v8.16b, v20.16b
        eor     v26.16b, v23.16b, v24.16b
        eor     v16.16b, v16.16b, v20.16b
        eor     v28.16b, v22.16b, v19.16b
        eor     v20.16b, v25.16b, v20.16b
        eor     v9.16b, v21.16b, v9.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v21.16b, v16.16b, v17.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v20.16b, v27.16b
        eor     v20.16b, v3.16b, v7.16b
        eor     v25.16b, v9.16b, v8.16b
        eor     v27.16b, v0.16b, v4.16b
        and     v29.16b, v9.16b, v17.16b
        eor     v30.16b, v8.16b, v29.16b
        eor     v31.16b, v21.16b, v29.16b
        eor     v29.16b, v21.16b, v29.16b
        bsl     v30.16b, v17.16b, v21.16b
        bsl     v31.16b, v9.16b, v8.16b
        bsl     v16.16b, v30.16b, v29.16b
        bsl     v21.16b, v29.16b, v30.16b
        eor     v8.16b, v31.16b, v30.16b
        and     v1.16b, v1.16b, v31.16b
        and     v9.16b, v16.16b, v31.16b
        and     v6.16b, v6.16b, v30.16b
        eor     v16.16b, v17.16b, v21.16b
        and     v4.16b, v4.16b, v30.16b
        eor     v17.16b, v8.16b, v30.16b
        and     v21.16b, v24.16b, v8.16b
        eor     v9.16b, v9.16b, v25.16b
        and     v19.16b, v19.16b, v8.16b
        eor     v24.16b, v30.16b, v16.16b
        eor     v25.16b, v30.16b, v16.16b
        and     v7.16b, v7.16b, v17.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v29.16b, v9.16b, v16.16b
        eor     v30.16b, v31.16b, v9.16b
        and     v0.16b, v24.16b, v0.16b
        and     v9.16b, v18.16b, v9.16b
        and     v2.16b, v25.16b, v2.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v18.16b, v29.16b, v16.16b
        and     v5.16b, v30.16b, v5.16b
        eor     v24.16b, v8.16b, v29.16b
        and     v25.16b, v26.16b, v29.16b
        and     v26.16b, v28.16b, v29.16b
        eor     v8.16b, v8.16b, v29.16b
        eor     v17.16b, v17.16b, v18.16b
        eor     v5.16b, v1.16b, v5.16b
        and     v23.16b, v24.16b, v23.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v19.16b, v19.16b, v26.16b
        eor     v0.16b, v4.16b, v0.16b
        and     v3.16b, v17.16b, v3.16b
        eor     v1.16b, v9.16b, v1.16b
        eor     v9.16b, v25.16b, v23.16b
        eor     v5.16b, v5.16b, v21.16b
        eor     v2.16b, v6.16b, v2.16b
        and     v6.16b, v8.16b, v22.16b
        eor     v3.16b, v7.16b, v3.16b
        and     v8.16b, v20.16b, v18.16b
        eor     v10.16b, v10.16b, v9.16b
        eor     v0.16b, v0.16b, v19.16b
        eor     v9.16b, v1.16b, v9.16b
        eor     v1.16b, v2.16b, v21.16b
        eor     v3.16b, v3.16b, v19.16b
        and     v16.16b, v27.16b, v16.16b
        eor     v17.16b, v26.16b, v6.16b
        eor     v6.16b, v8.16b, v7.16b
        eor     v7.16b, v1.16b, v9.16b
        eor     v1.16b, v5.16b, v3.16b
        eor     v2.16b, v10.16b, v3.16b
        eor     v4.16b, v16.16b, v4.16b
        eor     v8.16b, v6.16b, v17.16b
        eor     v5.16b, v9.16b, v3.16b
        eor     v9.16b, v0.16b, v1.16b
        eor     v6.16b, v7.16b, v1.16b
        eor     v0.16b, v4.16b, v17.16b
        eor     v4.16b, v8.16b, v7.16b
        eor     v7.16b, v9.16b, v2.16b
        eor     v8.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        eor     v4.16b, v7.16b, v0.16b
        eor     v7.16b, v8.16b, v3.16b
        bcc     .Ldec_done
// Bit-sliced inverse linear layer (presumably InvMixColumns - TODO
// confirm against bsaes-armv7): built from ext-rotations and XORs.
        ext     v8.16b, v0.16b, v0.16b, #8
        ext     v9.16b, v1.16b, v1.16b, #8
        ldr     q28, [x11]                  // load from .LISR in common case (x10 > 0)
        ext     v10.16b, v6.16b, v6.16b, #8
        ext     v16.16b, v3.16b, v3.16b, #8
        ext     v17.16b, v5.16b, v5.16b, #8
        ext     v18.16b, v4.16b, v4.16b, #8
        eor     v8.16b, v8.16b, v0.16b
        eor     v9.16b, v9.16b, v1.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v17.16b, v17.16b, v5.16b
        ext     v19.16b, v2.16b, v2.16b, #8
        ext     v20.16b, v7.16b, v7.16b, #8
        eor     v18.16b, v18.16b, v4.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v8.16b, v2.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v19.16b, v2.16b
        eor     v9.16b, v20.16b, v7.16b
        eor     v0.16b, v0.16b, v16.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v6.16b, v6.16b, v17.16b
        eor     v8.16b, v8.16b, v16.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v4.16b, v4.16b, v16.16b
        eor     v2.16b, v3.16b, v2.16b
        eor     v1.16b, v1.16b, v17.16b
        eor     v3.16b, v5.16b, v9.16b
        eor     v5.16b, v8.16b, v17.16b
        eor     v7.16b, v7.16b, v17.16b
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v6.16b, v6.16b, #12
        ext     v10.16b, v4.16b, v4.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v5.16b, v5.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v3.16b, v3.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v17.16b, v17.16b, v4.16b
        eor     v10.16b, v10.16b, v6.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v3.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v18.16b, v18.16b, v5.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v21.16b, v5.16b, v5.16b, #8
        ext     v5.16b, v7.16b, v7.16b, #8
        eor     v7.16b, v20.16b, v2.16b
        ext     v4.16b, v4.16b, v4.16b, #8
        ext     v20.16b, v3.16b, v3.16b, #8
        eor     v17.16b, v17.16b, v3.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v3.16b, v10.16b, v3.16b
        ext     v10.16b, v6.16b, v6.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v18.16b
        eor     v3.16b, v3.16b, v4.16b
        eor     v7.16b, v20.16b, v7.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v4.16b, v21.16b, v17.16b
        eor     v2.16b, v10.16b, v9.16b
        bne     .Ldec_loop
        ldr     q28, [x11, #16]!            // load from .LISRM0 on last round (x10 == 0)
        b       .Ldec_loop
.align  4
.Ldec_done:
// Undo the bit-slice transform (inverse of the three entry passes)
// and XOR the final round key (q10, loaded from [x9]) into all blocks.
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]
        ushr    v16.2d, v2.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v6.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v3.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v4.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v4.16b, v4.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v6.16b, v6.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v3.16b, v3.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v2.2d, #2
        eor     v8.16b, v8.16b, v4.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v4.16b, v4.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v8.2d, v4.2d, #4
        ushr    v9.2d, v6.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v3.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v2.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v4.16b, v4.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size   _bsaes_decrypt8,.-_bsaes_decrypt8
497
.rodata
.type   _bsaes_consts,%object
.align  6
// All entries below are 16-byte index vectors consumed by the tbl
// instruction; code addresses them PC-relatively via adrp/add and
// steps through them with post-indexed loads, hence the contiguity
// requirements noted per group.
_bsaes_consts:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR   used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad   0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad   0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad   0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR   used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad   0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad   0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad   0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad   0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad   0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad   0x090d01050c000408, 0x03070b0f060a0e02

.align  6
.size   _bsaes_consts,.-_bsaes_consts

.previous
540
541.type   _bsaes_encrypt8,%function
542.align  4
543// On entry:
544//   x9 -> key (previously expanded using _bsaes_key_convert)
545//   x10 = number of rounds
546//   v0-v7 input data
547// On exit:
548//   x9-x11 corrupted
549//   other general-purpose registers preserved
550//   v0-v7 output data
551//   v11-v15 preserved
552//   other SIMD registers corrupted
553_bsaes_encrypt8:
554        ldr     q8, [x9], #16
555        adrp    x11, .LM0SR
556        add     x11, x11, #:lo12:.LM0SR
557        ldr     q9, [x11], #16
558_bsaes_encrypt8_alt:
559        eor     v0.16b, v0.16b, v8.16b
560        eor     v1.16b, v1.16b, v8.16b
561        sub     x10, x10, #1
562        eor     v2.16b, v2.16b, v8.16b
563        eor     v4.16b, v4.16b, v8.16b
564        eor     v3.16b, v3.16b, v8.16b
565        eor     v5.16b, v5.16b, v8.16b
566        tbl     v0.16b, {v0.16b}, v9.16b
567        tbl     v1.16b, {v1.16b}, v9.16b
568        tbl     v2.16b, {v2.16b}, v9.16b
569        tbl     v4.16b, {v4.16b}, v9.16b
570        eor     v6.16b, v6.16b, v8.16b
571        eor     v7.16b, v7.16b, v8.16b
572        tbl     v3.16b, {v3.16b}, v9.16b
573        tbl     v5.16b, {v5.16b}, v9.16b
574        tbl     v6.16b, {v6.16b}, v9.16b
575        ushr    v8.2d, v0.2d, #1
576        movi    v10.16b, #0x55
577        tbl     v7.16b, {v7.16b}, v9.16b
578        ushr    v9.2d, v4.2d, #1
579        movi    v16.16b, #0x33
580        ushr    v17.2d, v2.2d, #1
581        eor     v8.16b, v8.16b, v1.16b
582        movi    v18.16b, #0x0f
583        ushr    v19.2d, v6.2d, #1
584        eor     v9.16b, v9.16b, v5.16b
585        eor     v17.16b, v17.16b, v3.16b
586        and     v8.16b, v8.16b, v10.16b
587        eor     v19.16b, v19.16b, v7.16b
588        and     v9.16b, v9.16b, v10.16b
589        and     v17.16b, v17.16b, v10.16b
590        eor     v1.16b, v1.16b, v8.16b
591        shl     v8.2d, v8.2d, #1
592        and     v10.16b, v19.16b, v10.16b
593        eor     v5.16b, v5.16b, v9.16b
594        shl     v9.2d, v9.2d, #1
595        eor     v3.16b, v3.16b, v17.16b
596        shl     v17.2d, v17.2d, #1
597        eor     v0.16b, v0.16b, v8.16b
598        shl     v8.2d, v10.2d, #1
599        eor     v7.16b, v7.16b, v10.16b
600        eor     v4.16b, v4.16b, v9.16b
601        eor     v2.16b, v2.16b, v17.16b
602        ushr    v9.2d, v1.2d, #2
603        eor     v6.16b, v6.16b, v8.16b
604        ushr    v8.2d, v0.2d, #2
605        ushr    v10.2d, v5.2d, #2
606        ushr    v17.2d, v4.2d, #2
607        eor     v9.16b, v9.16b, v3.16b
608        eor     v8.16b, v8.16b, v2.16b
609        eor     v10.16b, v10.16b, v7.16b
610        eor     v17.16b, v17.16b, v6.16b
611        and     v9.16b, v9.16b, v16.16b
612        and     v8.16b, v8.16b, v16.16b
613        and     v10.16b, v10.16b, v16.16b
614        and     v16.16b, v17.16b, v16.16b
615        eor     v3.16b, v3.16b, v9.16b
616        shl     v9.2d, v9.2d, #2
617        eor     v2.16b, v2.16b, v8.16b
618        shl     v8.2d, v8.2d, #2
619        eor     v7.16b, v7.16b, v10.16b
620        shl     v10.2d, v10.2d, #2
621        eor     v6.16b, v6.16b, v16.16b
622        shl     v16.2d, v16.2d, #2
623        eor     v1.16b, v1.16b, v9.16b
624        eor     v0.16b, v0.16b, v8.16b
625        eor     v5.16b, v5.16b, v10.16b
626        eor     v4.16b, v4.16b, v16.16b
627        ushr    v8.2d, v3.2d, #4
628        ushr    v9.2d, v2.2d, #4
629        ushr    v10.2d, v1.2d, #4
630        ushr    v16.2d, v0.2d, #4
631        eor     v8.16b, v8.16b, v7.16b
632        eor     v9.16b, v9.16b, v6.16b
633        eor     v10.16b, v10.16b, v5.16b
634        eor     v16.16b, v16.16b, v4.16b
635        and     v8.16b, v8.16b, v18.16b
636        and     v9.16b, v9.16b, v18.16b
637        and     v10.16b, v10.16b, v18.16b
638        and     v16.16b, v16.16b, v18.16b
639        eor     v7.16b, v7.16b, v8.16b
640        shl     v8.2d, v8.2d, #4
641        eor     v6.16b, v6.16b, v9.16b
642        shl     v9.2d, v9.2d, #4
643        eor     v5.16b, v5.16b, v10.16b
644        shl     v10.2d, v10.2d, #4
645        eor     v4.16b, v4.16b, v16.16b
646        shl     v16.2d, v16.2d, #4
647        eor     v3.16b, v3.16b, v8.16b
648        eor     v2.16b, v2.16b, v9.16b
649        eor     v1.16b, v1.16b, v10.16b
650        eor     v0.16b, v0.16b, v16.16b
651        b       .Lenc_sbox
652.align  4
653.Lenc_loop:
654        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
655        ldp     q8, q9, [x9], #32
656        eor     v0.16b, v16.16b, v0.16b
657        ldr     q10, [x9], #16
658        eor     v1.16b, v17.16b, v1.16b
659        ldr     q16, [x9], #16
660        eor     v2.16b, v18.16b, v2.16b
661        eor     v3.16b, v19.16b, v3.16b
662        eor     v4.16b, v8.16b, v4.16b
663        eor     v5.16b, v9.16b, v5.16b
664        eor     v6.16b, v10.16b, v6.16b
665        eor     v7.16b, v16.16b, v7.16b
666        tbl     v0.16b, {v0.16b}, v28.16b
667        tbl     v1.16b, {v1.16b}, v28.16b
668        tbl     v2.16b, {v2.16b}, v28.16b
669        tbl     v3.16b, {v3.16b}, v28.16b
670        tbl     v4.16b, {v4.16b}, v28.16b
671        tbl     v5.16b, {v5.16b}, v28.16b
672        tbl     v6.16b, {v6.16b}, v28.16b
673        tbl     v7.16b, {v7.16b}, v28.16b
674.Lenc_sbox:
675        eor     v5.16b, v5.16b, v6.16b
676        eor     v3.16b, v3.16b, v0.16b
677        subs    x10, x10, #1
678        eor     v2.16b, v2.16b, v1.16b
679        eor     v5.16b, v5.16b, v0.16b
680        eor     v8.16b, v3.16b, v7.16b
681        eor     v6.16b, v6.16b, v2.16b
682        eor     v7.16b, v7.16b, v5.16b
683        eor     v8.16b, v8.16b, v4.16b
684        eor     v3.16b, v6.16b, v3.16b
685        eor     v4.16b, v4.16b, v5.16b
686        eor     v6.16b, v1.16b, v5.16b
687        eor     v2.16b, v2.16b, v7.16b
688        eor     v1.16b, v8.16b, v1.16b
689        eor     v8.16b, v7.16b, v4.16b
690        eor     v9.16b, v3.16b, v0.16b
691        eor     v10.16b, v7.16b, v6.16b
692        eor     v16.16b, v5.16b, v3.16b
693        eor     v17.16b, v6.16b, v2.16b
694        eor     v18.16b, v5.16b, v1.16b
695        eor     v19.16b, v2.16b, v4.16b
696        eor     v20.16b, v1.16b, v0.16b
697        orr     v21.16b, v8.16b, v9.16b
698        orr     v22.16b, v10.16b, v16.16b
699        eor     v23.16b, v8.16b, v17.16b
700        eor     v24.16b, v9.16b, v18.16b
701        and     v19.16b, v19.16b, v20.16b
702        orr     v20.16b, v17.16b, v18.16b
703        and     v8.16b, v8.16b, v9.16b
704        and     v9.16b, v17.16b, v18.16b
705        and     v17.16b, v23.16b, v24.16b
706        and     v10.16b, v10.16b, v16.16b
707        eor     v16.16b, v21.16b, v19.16b
708        eor     v18.16b, v20.16b, v19.16b
709        and     v19.16b, v2.16b, v1.16b
710        and     v20.16b, v6.16b, v5.16b
711        eor     v21.16b, v22.16b, v17.16b
712        eor     v9.16b, v9.16b, v10.16b
713        eor     v10.16b, v16.16b, v17.16b
714        eor     v16.16b, v18.16b, v8.16b
715        and     v17.16b, v4.16b, v0.16b
716        orr     v18.16b, v7.16b, v3.16b
717        eor     v21.16b, v21.16b, v8.16b
718        eor     v8.16b, v9.16b, v8.16b
719        eor     v9.16b, v10.16b, v19.16b
720        eor     v10.16b, v3.16b, v0.16b
721        eor     v16.16b, v16.16b, v17.16b
722        eor     v17.16b, v5.16b, v1.16b
723        eor     v19.16b, v21.16b, v20.16b
724        eor     v20.16b, v8.16b, v18.16b
725        eor     v8.16b, v8.16b, v18.16b
726        eor     v18.16b, v7.16b, v4.16b
727        eor     v21.16b, v9.16b, v16.16b
728        eor     v22.16b, v6.16b, v2.16b
729        and     v23.16b, v9.16b, v19.16b
730        eor     v24.16b, v10.16b, v17.16b
731        eor     v25.16b, v0.16b, v1.16b
732        eor     v26.16b, v7.16b, v6.16b
733        eor     v27.16b, v18.16b, v22.16b
734        eor     v28.16b, v3.16b, v5.16b
735        eor     v29.16b, v16.16b, v23.16b
736        eor     v30.16b, v20.16b, v23.16b
737        eor     v23.16b, v20.16b, v23.16b
738        eor     v31.16b, v4.16b, v2.16b
739        bsl     v29.16b, v19.16b, v20.16b
740        bsl     v30.16b, v9.16b, v16.16b
741        bsl     v8.16b, v29.16b, v23.16b
742        bsl     v20.16b, v23.16b, v29.16b
743        eor     v9.16b, v30.16b, v29.16b
744        and     v5.16b, v5.16b, v30.16b
745        and     v8.16b, v8.16b, v30.16b
746        and     v1.16b, v1.16b, v29.16b
747        eor     v16.16b, v19.16b, v20.16b
748        and     v2.16b, v2.16b, v29.16b
749        eor     v19.16b, v9.16b, v29.16b
750        and     v17.16b, v17.16b, v9.16b
751        eor     v8.16b, v8.16b, v21.16b
752        and     v20.16b, v22.16b, v9.16b
753        eor     v21.16b, v29.16b, v16.16b
754        eor     v22.16b, v29.16b, v16.16b
755        and     v23.16b, v25.16b, v16.16b
756        and     v6.16b, v6.16b, v19.16b
757        eor     v25.16b, v8.16b, v16.16b
758        eor     v29.16b, v30.16b, v8.16b
759        and     v4.16b, v21.16b, v4.16b
760        and     v8.16b, v28.16b, v8.16b
761        and     v0.16b, v22.16b, v0.16b
762        eor     v21.16b, v23.16b, v1.16b
763        eor     v22.16b, v9.16b, v25.16b
764        eor     v9.16b, v9.16b, v25.16b
765        eor     v23.16b, v25.16b, v16.16b
766        and     v3.16b, v29.16b, v3.16b
767        and     v24.16b, v24.16b, v25.16b
768        and     v25.16b, v27.16b, v25.16b
769        and     v10.16b, v22.16b, v10.16b
770        and     v9.16b, v9.16b, v18.16b
771        eor     v18.16b, v19.16b, v23.16b
772        and     v19.16b, v26.16b, v23.16b
773        eor     v3.16b, v5.16b, v3.16b
774        eor     v17.16b, v17.16b, v24.16b
775        eor     v10.16b, v24.16b, v10.16b
776        and     v16.16b, v31.16b, v16.16b
777        eor     v20.16b, v20.16b, v25.16b
778        eor     v9.16b, v25.16b, v9.16b
779        eor     v4.16b, v2.16b, v4.16b
780        and     v7.16b, v18.16b, v7.16b
781        eor     v18.16b, v19.16b, v6.16b
782        eor     v5.16b, v8.16b, v5.16b
783        eor     v0.16b, v1.16b, v0.16b
784        eor     v1.16b, v21.16b, v10.16b
785        eor     v8.16b, v3.16b, v17.16b
786        eor     v2.16b, v16.16b, v2.16b
787        eor     v3.16b, v6.16b, v7.16b
788        eor     v6.16b, v18.16b, v9.16b
789        eor     v4.16b, v4.16b, v20.16b
790        eor     v10.16b, v5.16b, v10.16b
791        eor     v0.16b, v0.16b, v17.16b
792        eor     v9.16b, v2.16b, v9.16b
793        eor     v3.16b, v3.16b, v20.16b
794        eor     v7.16b, v6.16b, v1.16b
795        eor     v5.16b, v8.16b, v4.16b
796        eor     v6.16b, v10.16b, v1.16b
797        eor     v2.16b, v4.16b, v0.16b
798        eor     v4.16b, v3.16b, v10.16b
799        eor     v9.16b, v9.16b, v7.16b
800        eor     v3.16b, v0.16b, v5.16b
801        eor     v0.16b, v1.16b, v4.16b
802        eor     v1.16b, v4.16b, v8.16b
803        eor     v4.16b, v9.16b, v5.16b
804        eor     v6.16b, v6.16b, v3.16b
805        bcc     .Lenc_done
806        ext     v8.16b, v0.16b, v0.16b, #12
807        ext     v9.16b, v4.16b, v4.16b, #12
808        ldr     q28, [x11]
809        ext     v10.16b, v6.16b, v6.16b, #12
810        ext     v16.16b, v1.16b, v1.16b, #12
811        ext     v17.16b, v3.16b, v3.16b, #12
812        ext     v18.16b, v7.16b, v7.16b, #12
813        eor     v0.16b, v0.16b, v8.16b
814        eor     v4.16b, v4.16b, v9.16b
815        eor     v6.16b, v6.16b, v10.16b
816        ext     v19.16b, v2.16b, v2.16b, #12
817        ext     v20.16b, v5.16b, v5.16b, #12
818        eor     v1.16b, v1.16b, v16.16b
819        eor     v3.16b, v3.16b, v17.16b
820        eor     v7.16b, v7.16b, v18.16b
821        eor     v2.16b, v2.16b, v19.16b
822        eor     v16.16b, v16.16b, v0.16b
823        eor     v5.16b, v5.16b, v20.16b
824        eor     v17.16b, v17.16b, v6.16b
825        eor     v10.16b, v10.16b, v4.16b
826        ext     v0.16b, v0.16b, v0.16b, #8
827        eor     v9.16b, v9.16b, v1.16b
828        ext     v1.16b, v1.16b, v1.16b, #8
829        eor     v8.16b, v8.16b, v5.16b
830        eor     v16.16b, v16.16b, v5.16b
831        eor     v18.16b, v18.16b, v3.16b
832        eor     v19.16b, v19.16b, v7.16b
833        ext     v3.16b, v3.16b, v3.16b, #8
834        ext     v7.16b, v7.16b, v7.16b, #8
835        eor     v20.16b, v20.16b, v2.16b
836        ext     v6.16b, v6.16b, v6.16b, #8
837        ext     v21.16b, v5.16b, v5.16b, #8
838        eor     v17.16b, v17.16b, v5.16b
839        ext     v2.16b, v2.16b, v2.16b, #8
840        eor     v10.16b, v10.16b, v5.16b
841        ext     v22.16b, v4.16b, v4.16b, #8
842        eor     v0.16b, v0.16b, v8.16b
843        eor     v1.16b, v1.16b, v16.16b
844        eor     v5.16b, v7.16b, v18.16b
845        eor     v4.16b, v3.16b, v17.16b
846        eor     v3.16b, v6.16b, v10.16b
847        eor     v7.16b, v21.16b, v20.16b
848        eor     v6.16b, v2.16b, v19.16b
849        eor     v2.16b, v22.16b, v9.16b
850        bne     .Lenc_loop
851        ldr     q28, [x11, #16]!            // load from .LSRM0 on last round (x10 == 0)
852        b       .Lenc_loop
853.align  4
854.Lenc_done:
855        ushr    v8.2d, v0.2d, #1
856        movi    v9.16b, #0x55
857        ldr     q10, [x9]
858        ushr    v16.2d, v3.2d, #1
859        movi    v17.16b, #0x33
860        ushr    v18.2d, v4.2d, #1
861        movi    v19.16b, #0x0f
862        eor     v8.16b, v8.16b, v1.16b
863        ushr    v20.2d, v2.2d, #1
864        eor     v16.16b, v16.16b, v7.16b
865        eor     v18.16b, v18.16b, v6.16b
866        and     v8.16b, v8.16b, v9.16b
867        eor     v20.16b, v20.16b, v5.16b
868        and     v16.16b, v16.16b, v9.16b
869        and     v18.16b, v18.16b, v9.16b
870        shl     v21.2d, v8.2d, #1
871        eor     v1.16b, v1.16b, v8.16b
872        and     v8.16b, v20.16b, v9.16b
873        eor     v7.16b, v7.16b, v16.16b
874        shl     v9.2d, v16.2d, #1
875        eor     v6.16b, v6.16b, v18.16b
876        shl     v16.2d, v18.2d, #1
877        eor     v0.16b, v0.16b, v21.16b
878        shl     v18.2d, v8.2d, #1
879        eor     v5.16b, v5.16b, v8.16b
880        eor     v3.16b, v3.16b, v9.16b
881        eor     v4.16b, v4.16b, v16.16b
882        ushr    v8.2d, v1.2d, #2
883        eor     v2.16b, v2.16b, v18.16b
884        ushr    v9.2d, v0.2d, #2
885        ushr    v16.2d, v7.2d, #2
886        ushr    v18.2d, v3.2d, #2
887        eor     v8.16b, v8.16b, v6.16b
888        eor     v9.16b, v9.16b, v4.16b
889        eor     v16.16b, v16.16b, v5.16b
890        eor     v18.16b, v18.16b, v2.16b
891        and     v8.16b, v8.16b, v17.16b
892        and     v9.16b, v9.16b, v17.16b
893        and     v16.16b, v16.16b, v17.16b
894        and     v17.16b, v18.16b, v17.16b
895        eor     v6.16b, v6.16b, v8.16b
896        shl     v8.2d, v8.2d, #2
897        eor     v4.16b, v4.16b, v9.16b
898        shl     v9.2d, v9.2d, #2
899        eor     v5.16b, v5.16b, v16.16b
900        shl     v16.2d, v16.2d, #2
901        eor     v2.16b, v2.16b, v17.16b
902        shl     v17.2d, v17.2d, #2
903        eor     v1.16b, v1.16b, v8.16b
904        eor     v0.16b, v0.16b, v9.16b
905        eor     v7.16b, v7.16b, v16.16b
906        eor     v3.16b, v3.16b, v17.16b
907        ushr    v8.2d, v6.2d, #4
908        ushr    v9.2d, v4.2d, #4
909        ushr    v16.2d, v1.2d, #4
910        ushr    v17.2d, v0.2d, #4
911        eor     v8.16b, v8.16b, v5.16b
912        eor     v9.16b, v9.16b, v2.16b
913        eor     v16.16b, v16.16b, v7.16b
914        eor     v17.16b, v17.16b, v3.16b
915        and     v8.16b, v8.16b, v19.16b
916        and     v9.16b, v9.16b, v19.16b
917        and     v16.16b, v16.16b, v19.16b
918        and     v17.16b, v17.16b, v19.16b
919        eor     v5.16b, v5.16b, v8.16b
920        shl     v8.2d, v8.2d, #4
921        eor     v2.16b, v2.16b, v9.16b
922        shl     v9.2d, v9.2d, #4
923        eor     v7.16b, v7.16b, v16.16b
924        shl     v16.2d, v16.2d, #4
925        eor     v3.16b, v3.16b, v17.16b
926        shl     v17.2d, v17.2d, #4
927        eor     v6.16b, v6.16b, v8.16b
928        eor     v4.16b, v4.16b, v9.16b
929        eor     v7.16b, v7.16b, v10.16b
930        eor     v1.16b, v1.16b, v16.16b
931        eor     v3.16b, v3.16b, v10.16b
932        eor     v0.16b, v0.16b, v17.16b
933        eor     v6.16b, v6.16b, v10.16b
934        eor     v4.16b, v4.16b, v10.16b
935        eor     v2.16b, v2.16b, v10.16b
936        eor     v5.16b, v5.16b, v10.16b
937        eor     v1.16b, v1.16b, v10.16b
938        eor     v0.16b, v0.16b, v10.16b
939        ret
940.size   _bsaes_encrypt8,.-_bsaes_encrypt8
941
942.type   _bsaes_key_convert,%function
943.align  4
944// On entry:
945//   x9 -> input key (big-endian)
946//   x10 = number of rounds
947//   x17 -> output key (native endianness)
948// On exit:
949//   x9, x10 corrupted
950//   x11 -> .LM0_bigendian
951//   x17 -> last quadword of output key
952//   other general-purpose registers preserved
953//   v2-v6 preserved
954//   v7.16b[] = 0x63
955//   v8-v14 preserved
956//   v15 = last round key (converted to native endianness)
957//   other SIMD registers corrupted
//
// Converts an expanded AES round-key array into the bit-sliced layout used
// by the _bsaes_encrypt8/_bsaes_decrypt8 cores: round 0 is stored as a
// plain 16-byte quadword, every inner round key is expanded to 8 vectors
// (128 bytes, one vector per bit position of the key bytes), and the final
// round key is returned in v15 instead of being stored.
// NOTE(review): the input is presumably an OpenSSL AES_KEY round-key array
// (callers read the round count from offset 240) - confirm against callers.
958_bsaes_key_convert:
959#ifdef __AARCH64EL__
        // Little-endian hosts need a byte-order fix-up of each round key, so
        // the matching permutation table variant is selected here.
960        adrp    x11, .LM0_littleendian
961        add     x11, x11, #:lo12:.LM0_littleendian
962#else
963        adrp    x11, .LM0_bigendian
964        add     x11, x11, #:lo12:.LM0_bigendian
965#endif
966        ldr     q0, [x9], #16               // load round 0 key
967        ldr     q1, [x11]                   // .LM0
968        ldr     q15, [x9], #16              // load round 1 key
969
970        movi    v7.16b, #0x63               // compose .L63
        // One single-bit mask per bit position (0x01..0x80); used by the
        // cmtst sequence below to peel the key apart bit by bit.
971        movi    v16.16b, #0x01              // bit masks
972        movi    v17.16b, #0x02
973        movi    v18.16b, #0x04
974        movi    v19.16b, #0x08
975        movi    v20.16b, #0x10
976        movi    v21.16b, #0x20
977        movi    v22.16b, #0x40
978        movi    v23.16b, #0x80
979
980#ifdef __AARCH64EL__
        // Swap bytes within each 32-bit word so the round 0 key is stored in
        // the byte order the bit-sliced code expects.
981        rev32   v0.16b, v0.16b
982#endif
983        sub     x10, x10, #1
984        str     q0, [x17], #16              // save round 0 key
985
986.align  4
987.Lkey_loop:
        // Permute the round key bytes through the .LM0 table (held in v1)
        // into the ordering used by the bit-sliced rounds.
988        tbl     v0.16b, {v15.16b}, v1.16b
989        ldr     q15, [x9], #16              // load next round key
990
        // XOR every key byte with 0x63, the constant of the SubBytes affine
        // transform.  NOTE(review): presumably this lets the bit-sliced
        // S-box omit that constant - confirm against _bsaes_encrypt8.
991        eor     v0.16b, v0.16b, v7.16b
        // Bit-slice the key: cmtst widens bit n of every key byte into a
        // full 0x00/0xFF byte, producing eight mask vectors v24-v31, one
        // per bit position.
992        cmtst   v24.16b, v0.16b, v16.16b
993        cmtst   v25.16b, v0.16b, v17.16b
994        cmtst   v26.16b, v0.16b, v18.16b
995        cmtst   v27.16b, v0.16b, v19.16b
996        cmtst   v28.16b, v0.16b, v20.16b
997        cmtst   v29.16b, v0.16b, v21.16b
998        cmtst   v30.16b, v0.16b, v22.16b
999        cmtst   v31.16b, v0.16b, v23.16b
1000        sub     x10, x10, #1
1001        st1     {v24.16b-v27.16b}, [x17], #64 // write bit-sliced round key
1002        st1     {v28.16b-v31.16b}, [x17], #64
1003        cbnz    x10, .Lkey_loop
1004
1005        // don't save last round key
1006#ifdef __AARCH64EL__
        // Fix the byte order of the last round key (returned in v15) and
        // leave x11 pointing at .LM0_bigendian, as the exit contract states.
1007        rev32   v15.16b, v15.16b
1008        adrp    x11, .LM0_bigendian
1009        add     x11, x11, #:lo12:.LM0_bigendian
1010#endif
1011        ret
1012.size   _bsaes_key_convert,.-_bsaes_key_convert
1013
1014.globl  ossl_bsaes_cbc_encrypt
1015.type   ossl_bsaes_cbc_encrypt,%function
1016.align  4
1017// On entry:
1018//   x0 -> input ciphertext
1019//   x1 -> output plaintext
1020//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
1021//   x3 -> key
1022//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call)
1023//   w5 must be == 0
1024// On exit:
1025//   Output plaintext filled in
1026//   Initialisation vector overwritten with last quadword of ciphertext
1027//   No output registers, usual AAPCS64 register preservation
//
// CBC decryption only (the enc flag in w5 must be 0).  Buffers shorter than
// 128 bytes are handed to the scalar AES_cbc_encrypt; longer buffers are
// processed 8 blocks at a time with the bit-sliced core, with a 1..7-block
// tail handled by dedicated stubs below.
1028ossl_bsaes_cbc_encrypt:
1029        AARCH64_VALID_CALL_TARGET
        // Fewer than 8 blocks: the bit-slicing set-up cost is not worth it,
        // delegate to the scalar implementation (tail call, frame untouched).
1030        cmp     x2, #128
1031        bhs     .Lcbc_do_bsaes
1032        b       AES_cbc_encrypt
1033.Lcbc_do_bsaes:
1034
1035        // it is up to the caller to make sure we are called with enc == 0
1036
        // Save frame record plus the callee-saved SIMD registers this
        // function uses (v8-v10 scratch, v15 carries the IV/chain value).
1037        stp     x29, x30, [sp, #-48]!
1038        stp     d8, d9, [sp, #16]
1039        stp     d10, d15, [sp, #32]
1040        lsr     x2, x2, #4                  // len in 16 byte blocks
1041
1042        ldr     w15, [x3, #240]             // get # of rounds
1043        mov     x14, sp                     // x14 = original sp; marks the top of the key schedule for the wipe loop
1044
1045        // allocate the key schedule on the stack
1046        add     x17, sp, #96
1047        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes
1048
1049        // populate the key schedule
1050        mov     x9, x3                      // pass key
1051        mov     x10, x15                    // pass # of rounds
1052        mov     sp, x17                     // sp is sp
1053        bl      _bsaes_key_convert
        // _bsaes_key_convert returns the last round key in v15 and 0x63 in
        // v7; store the last round key at the end of the schedule and undo
        // the 0x63 XOR on the round 0 key (round 0 is not bit-sliced).
1054        ldr     q6,  [sp]
1055        str     q15, [x17]                  // save last round key
1056        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
1057        str     q6, [sp]
1058
1059        ldr     q15, [x4]                   // load IV
1060        b       .Lcbc_dec_loop
1061
        // Main loop: decrypt 8 blocks per iteration.  Note the final load
        // below rewinds x0 by 7*16 so the ciphertext can be re-read after
        // _bsaes_decrypt8 for the CBC chaining XORs.
1062.align  4
1063.Lcbc_dec_loop:
1064        subs    x2, x2, #0x8
1065        bmi     .Lcbc_dec_loop_finish
1066
1067        ldr     q0, [x0], #16               // load input
1068        mov     x9, sp                      // pass the key
1069        ldr     q1, [x0], #16
1070        mov     x10, x15
1071        ldr     q2, [x0], #16
1072        ldr     q3, [x0], #16
1073        ldr     q4, [x0], #16
1074        ldr     q5, [x0], #16
1075        ldr     q6, [x0], #16
1076        ldr     q7, [x0], #-7*16            // last block; rewind x0 to block 1 for the reload pass
1077
1078        bl      _bsaes_decrypt8
1079
        // Chaining pass: XOR each decrypted block with the previous
        // ciphertext block (the IV in v15 for block 0).  The store/XOR
        // interleaving below implies _bsaes_decrypt8 returns the decrypted
        // blocks 0..7 in registers v0,v1,v6,v4,v2,v7,v3,v5 (bit-slice
        // output permutation).  The final q15 reload keeps the last
        // ciphertext block as next iteration's chain value.
1080        ldr     q16, [x0], #16              // reload input
1081        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1082        eor     v1.16b, v1.16b, v16.16b
1083        str     q0, [x1], #16               // write output
1084        ldr     q0, [x0], #16
1085        str     q1, [x1], #16
1086        ldr     q1, [x0], #16
1087        eor     v1.16b, v4.16b, v1.16b
1088        ldr     q4, [x0], #16
1089        eor     v2.16b, v2.16b, v4.16b
1090        eor     v0.16b, v6.16b, v0.16b
1091        ldr     q4, [x0], #16
1092        str     q0, [x1], #16
1093        str     q1, [x1], #16
1094        eor     v0.16b, v7.16b, v4.16b
1095        ldr     q1, [x0], #16
1096        str     q2, [x1], #16
1097        ldr     q2, [x0], #16
1098        ldr     q15, [x0], #16              // last ciphertext block = next chain value
1099        str     q0, [x1], #16
1100        eor     v0.16b, v5.16b, v2.16b
1101        eor     v1.16b, v3.16b, v1.16b
1102        str     q1, [x1], #16
1103        str     q0, [x1], #16
1104
1105        b       .Lcbc_dec_loop
1106
        // Tail: x2 was driven negative above; restore the true remaining
        // block count (0..7) and dispatch to the matching stub.  Blocks are
        // loaded as we compare so each stub only has to rewind x0.
1107.Lcbc_dec_loop_finish:
1108        adds    x2, x2, #8
1109        beq     .Lcbc_dec_done
1110
1111        ldr     q0, [x0], #16               // load input
1112        cmp     x2, #2
1113        blo     .Lcbc_dec_one
1114        ldr     q1, [x0], #16
1115        mov     x9, sp                      // pass the key
1116        mov     x10, x15
1117        beq     .Lcbc_dec_two
1118        ldr     q2, [x0], #16
1119        cmp     x2, #4
1120        blo     .Lcbc_dec_three
1121        ldr     q3, [x0], #16
1122        beq     .Lcbc_dec_four
1123        ldr     q4, [x0], #16
1124        cmp     x2, #6
1125        blo     .Lcbc_dec_five
1126        ldr     q5, [x0], #16
1127        beq     .Lcbc_dec_six
1128        ldr     q6, [x0], #-6*16            // 7 blocks; rewind for the reload pass
1129
1130        bl      _bsaes_decrypt8
1131
        // 7-block chaining pass; same register permutation as the main
        // loop, with unused decrypt output ignored.  q15 ends up holding
        // the last ciphertext block for the IV writeback.
1132        ldr     q5, [x0], #16               // reload input
1133        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1134        ldr     q8, [x0], #16
1135        ldr     q9, [x0], #16
1136        ldr     q10, [x0], #16
1137        str     q0, [x1], #16               // write output
1138        ldr     q0, [x0], #16
1139        eor     v1.16b, v1.16b, v5.16b
1140        ldr     q5, [x0], #16
1141        eor     v6.16b, v6.16b, v8.16b
1142        ldr     q15, [x0]
1143        eor     v4.16b, v4.16b, v9.16b
1144        eor     v2.16b, v2.16b, v10.16b
1145        str     q1, [x1], #16
1146        eor     v0.16b, v7.16b, v0.16b
1147        str     q6, [x1], #16
1148        eor     v1.16b, v3.16b, v5.16b
1149        str     q4, [x1], #16
1150        str     q2, [x1], #16
1151        str     q0, [x1], #16
1152        str     q1, [x1]
1153        b       .Lcbc_dec_done
1154.align  4
1155.Lcbc_dec_six:
        // 6 blocks: rewind to the first block, decrypt, then chain/XOR.
1156        sub     x0, x0, #0x60
1157        bl      _bsaes_decrypt8
1158        ldr     q3, [x0], #16               // reload input
1159        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1160        ldr     q5, [x0], #16
1161        ldr     q8, [x0], #16
1162        ldr     q9, [x0], #16
1163        str     q0, [x1], #16               // write output
1164        ldr     q0, [x0], #16
1165        eor     v1.16b, v1.16b, v3.16b
1166        ldr     q15, [x0]
1167        eor     v3.16b, v6.16b, v5.16b
1168        eor     v4.16b, v4.16b, v8.16b
1169        eor     v2.16b, v2.16b, v9.16b
1170        str     q1, [x1], #16
1171        eor     v0.16b, v7.16b, v0.16b
1172        str     q3, [x1], #16
1173        str     q4, [x1], #16
1174        str     q2, [x1], #16
1175        str     q0, [x1]
1176        b       .Lcbc_dec_done
1177.align  4
1178.Lcbc_dec_five:
        // 5 blocks.
1179        sub     x0, x0, #0x50
1180        bl      _bsaes_decrypt8
1181        ldr     q3, [x0], #16               // reload input
1182        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1183        ldr     q5, [x0], #16
1184        ldr     q7, [x0], #16
1185        ldr     q8, [x0], #16
1186        str     q0, [x1], #16               // write output
1187        ldr     q15, [x0]
1188        eor     v0.16b, v1.16b, v3.16b
1189        eor     v1.16b, v6.16b, v5.16b
1190        eor     v3.16b, v4.16b, v7.16b
1191        str     q0, [x1], #16
1192        eor     v0.16b, v2.16b, v8.16b
1193        str     q1, [x1], #16
1194        str     q3, [x1], #16
1195        str     q0, [x1]
1196        b       .Lcbc_dec_done
1197.align  4
1198.Lcbc_dec_four:
        // 4 blocks.
1199        sub     x0, x0, #0x40
1200        bl      _bsaes_decrypt8
1201        ldr     q2, [x0], #16               // reload input
1202        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1203        ldr     q3, [x0], #16
1204        ldr     q5, [x0], #16
1205        str     q0, [x1], #16               // write output
1206        ldr     q15, [x0]
1207        eor     v0.16b, v1.16b, v2.16b
1208        eor     v1.16b, v6.16b, v3.16b
1209        eor     v2.16b, v4.16b, v5.16b
1210        str     q0, [x1], #16
1211        str     q1, [x1], #16
1212        str     q2, [x1]
1213        b       .Lcbc_dec_done
1214.align  4
1215.Lcbc_dec_three:
        // 3 blocks.
1216        sub     x0, x0, #0x30
1217        bl      _bsaes_decrypt8
1218        ldr     q2, [x0], #16               // reload input
1219        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1220        ldr     q3, [x0], #16
1221        ldr     q15, [x0]
1222        str     q0, [x1], #16               // write output
1223        eor     v0.16b, v1.16b, v2.16b
1224        eor     v1.16b, v6.16b, v3.16b
1225        str     q0, [x1], #16
1226        str     q1, [x1]
1227        b       .Lcbc_dec_done
1228.align  4
1229.Lcbc_dec_two:
        // 2 blocks.
1230        sub     x0, x0, #0x20
1231        bl      _bsaes_decrypt8
1232        ldr     q2, [x0], #16               // reload input
1233        eor     v0.16b, v0.16b, v15.16b     // ^= IV
1234        ldr     q15, [x0]
1235        str     q0, [x1], #16               // write output
1236        eor     v0.16b, v1.16b, v2.16b
1237        str     q0, [x1]
1238        b       .Lcbc_dec_done
1239.align  4
1240.Lcbc_dec_one:
        // Single block: cheaper to call the scalar AES_decrypt.  The IV is
        // parked in v8 (callee-saved d8) across the call; the ciphertext
        // block moves to v15 so it becomes the IV returned to the caller.
        // x1/x4/x14 are saved because AES_decrypt may clobber them.
1241        sub     x0, x0, #0x10
1242        stp     x1, x4, [sp, #-32]!
1243        str     x14, [sp, #16]
1244        mov     v8.16b, v15.16b
1245        mov     v15.16b, v0.16b
1246        mov     x2, x3                      // AES_decrypt takes the key in x2
1247        bl      AES_decrypt
1248        ldr     x14, [sp, #16]
1249        ldp     x1, x4, [sp], #32
        // AES_decrypt wrote the raw decryption to [x1]; finish CBC by
        // XORing the saved IV in place.
1250        ldr     q0, [x1]                    // load result
1251        eor     v0.16b, v0.16b, v8.16b      // ^= IV
1252        str     q0, [x1]                    // write output
1253
1254.align  4
1255.Lcbc_dec_done:
        // Zero the stack-resident key schedule, walking sp back up to the
        // frame top recorded in x14, 32 bytes per iteration.
1256        movi    v0.16b, #0
1257        movi    v1.16b, #0
1258.Lcbc_dec_bzero:// wipe key schedule [if any]
1259        stp     q0, q1, [sp], #32
1260        cmp     sp, x14
1261        bne     .Lcbc_dec_bzero
1262        str     q15, [x4]                   // return IV
1263        ldp     d8, d9, [sp, #16]
1264        ldp     d10, d15, [sp, #32]
1265        ldp     x29, x30, [sp], #48
1266        ret
1267.size   ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
1268
1269.globl  ossl_bsaes_ctr32_encrypt_blocks
1270.type   ossl_bsaes_ctr32_encrypt_blocks,%function
1271.align  4
1272// On entry:
1273//   x0 -> input text (whole 16-byte blocks)
1274//   x1 -> output text (whole 16-byte blocks)
1275//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
1276//   x3 -> key
1277//   x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block
1278// On exit:
1279//   Output text filled in
1280//   No output registers, usual AAPCS64 register preservation
//
// CTR mode: generates keystream by encrypting successive counter values 8
// at a time with the bit-sliced core and XORs it into the text.  Fewer
// than 8 blocks fall back to a scalar AES_encrypt loop (.Lctr_enc_short).
1281ossl_bsaes_ctr32_encrypt_blocks:
1282        AARCH64_VALID_CALL_TARGET
1283        cmp     x2, #8                      // use plain AES for
1284        blo     .Lctr_enc_short             // small sizes
1285
        // Frame: save all callee-saved SIMD registers; v8-v15 are used for
        // the counter increments and round-0 key below.
1286        stp     x29, x30, [sp, #-80]!
1287        stp     d8, d9, [sp, #16]
1288        stp     d10, d11, [sp, #32]
1289        stp     d12, d13, [sp, #48]
1290        stp     d14, d15, [sp, #64]
1291
1292        ldr     w15, [x3, #240]             // get # of rounds
1293        mov     x14, sp                     // x14 = original sp; top bound for the key-schedule wipe
1294
1295        // allocate the key schedule on the stack
1296        add     x17, sp, #96
1297        sub     x17, x17, x15, lsl #7       // 128 bytes per inner round key, less 96 bytes
1298
1299        // populate the key schedule
1300        mov     x9, x3                      // pass key
1301        mov     x10, x15                    // pass # of rounds
1302        mov     sp, x17                     // sp is sp
1303        bl      _bsaes_key_convert
        // v7 = 0x63 from _bsaes_key_convert, v15 = last round key.
1304        eor     v7.16b, v7.16b, v15.16b     // fix up last round key
1305        str     q7, [x17]                   // save last round key
1306
1307        ldr     q0, [x4]                    // load counter
1308        add     x13, x11, #.LREVM0SR-.LM0_bigendian
1309        ldr     q4, [sp]                    // load round0 key
1310
        // Counter handling: the big-endian counter is rev32'd into native
        // word order (v15) so +1..+4 can be applied with ordinary 32-bit
        // lane adds; v11..v14 hold the constants 1,2,3,4 in the top lane
        // ("<<96" of the 128-bit value).
1311        movi    v8.4s, #1                   // compose 1<<96
1312        movi    v9.16b, #0
1313        rev32   v15.16b, v0.16b
1314        rev32   v0.16b, v0.16b
1315        ext     v11.16b, v9.16b, v8.16b, #4
1316        rev32   v4.16b, v4.16b
1317        add     v12.4s, v11.4s, v11.4s      // compose 2<<96
1318        str     q4, [sp]                    // save adjusted round0 key
1319        add     v13.4s, v11.4s, v12.4s      // compose 3<<96
1320        add     v14.4s, v12.4s, v12.4s      // compose 4<<96
1321        b       .Lctr_enc_loop
1322
1323.align  4
1324.Lctr_enc_loop:
1325        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
1326        // to flip byte order in 32-bit counter
1327
        // Build 8 consecutive counter values in v0..v7 (v0 = current) and
        // advance v15 by 8 for the next iteration.
1328        add     v1.4s, v15.4s, v11.4s       // +1
1329        add     x9, sp, #0x10               // pass next round key
1330        add     v2.4s, v15.4s, v12.4s       // +2
1331        ldr     q9, [x13]                   // .LREVM0SR
1332        ldr     q8, [sp]                    // load round0 key
1333        add     v3.4s, v15.4s, v13.4s       // +3
1334        mov     x10, x15                    // pass rounds
1335        sub     x11, x13, #.LREVM0SR-.LSR   // pass constants
1336        add     v6.4s, v2.4s, v14.4s
1337        add     v4.4s, v15.4s, v14.4s       // +4
1338        add     v7.4s, v3.4s, v14.4s
1339        add     v15.4s, v4.4s, v14.4s       // next counter
1340        add     v5.4s, v1.4s, v14.4s
1341
1342        bl      _bsaes_encrypt8_alt
1343
        // Note x2 is decremented only after encryption, so a final partial
        // group falls through to .Lctr_enc_loop_done with keystream ready.
1344        subs    x2, x2, #8
1345        blo     .Lctr_enc_loop_done
1346
        // Full group: XOR keystream into 8 input blocks.  The store order
        // below implies the keystream for blocks 0..7 sits in registers
        // v0,v1,v4,v6,v3,v7,v2,v5 (bit-slice output permutation).
1347        ldr     q16, [x0], #16
1348        ldr     q17, [x0], #16
1349        eor     v1.16b, v1.16b, v17.16b
1350        ldr     q17, [x0], #16
1351        eor     v0.16b, v0.16b, v16.16b
1352        eor     v4.16b, v4.16b, v17.16b
1353        str     q0, [x1], #16
1354        ldr     q16, [x0], #16
1355        str     q1, [x1], #16
1356        mov     v0.16b, v15.16b             // keep the next counter in v0 for the loop prologue
1357        str     q4, [x1], #16
1358        ldr     q1, [x0], #16
1359        eor     v4.16b, v6.16b, v16.16b
1360        eor     v1.16b, v3.16b, v1.16b
1361        ldr     q3, [x0], #16
1362        eor     v3.16b, v7.16b, v3.16b
1363        ldr     q6, [x0], #16
1364        eor     v2.16b, v2.16b, v6.16b
1365        ldr     q6, [x0], #16
1366        eor     v5.16b, v5.16b, v6.16b
1367        str     q4, [x1], #16
1368        str     q1, [x1], #16
1369        str     q3, [x1], #16
1370        str     q2, [x1], #16
1371        str     q5, [x1], #16
1372
1373        bne     .Lctr_enc_loop
1374        b       .Lctr_enc_done
1375
        // Tail: 1..7 blocks remain; consume the already-generated keystream
        // registers one block at a time in the same permuted order.
1376.align  4
1377.Lctr_enc_loop_done:
1378        add     x2, x2, #8
1379        ldr     q16, [x0], #16              // load input
1380        eor     v0.16b, v0.16b, v16.16b
1381        str     q0, [x1], #16               // write output
1382        cmp     x2, #2
1383        blo     .Lctr_enc_done
1384        ldr     q17, [x0], #16
1385        eor     v1.16b, v1.16b, v17.16b
1386        str     q1, [x1], #16
1387        beq     .Lctr_enc_done
1388        ldr     q18, [x0], #16
1389        eor     v4.16b, v4.16b, v18.16b
1390        str     q4, [x1], #16
1391        cmp     x2, #4
1392        blo     .Lctr_enc_done
1393        ldr     q19, [x0], #16
1394        eor     v6.16b, v6.16b, v19.16b
1395        str     q6, [x1], #16
1396        beq     .Lctr_enc_done
1397        ldr     q20, [x0], #16
1398        eor     v3.16b, v3.16b, v20.16b
1399        str     q3, [x1], #16
1400        cmp     x2, #6
1401        blo     .Lctr_enc_done
1402        ldr     q21, [x0], #16
1403        eor     v7.16b, v7.16b, v21.16b
1404        str     q7, [x1], #16
1405        beq     .Lctr_enc_done
1406        ldr     q22, [x0]
1407        eor     v2.16b, v2.16b, v22.16b
1408        str     q2, [x1], #16
1409
1410.Lctr_enc_done:
        // Zero the stack-resident key schedule, walking sp back up to the
        // frame top recorded in x14.
1411        movi    v0.16b, #0
1412        movi    v1.16b, #0
1413.Lctr_enc_bzero: // wipe key schedule [if any]
1414        stp     q0, q1, [sp], #32
1415        cmp     sp, x14
1416        bne     .Lctr_enc_bzero
1417
1418        ldp     d8, d9, [sp, #16]
1419        ldp     d10, d11, [sp, #32]
1420        ldp     d12, d13, [sp, #48]
1421        ldp     d14, d15, [sp, #64]
1422        ldp     x29, x30, [sp], #80
1423        ret
1424
        // Short path (< 8 blocks): encrypt the counter block one at a time
        // with scalar AES_encrypt, keeping the counter copy at [sp,#80] and
        // the encrypted keystream block at [sp,#64].  The low 32 bits of
        // the counter are maintained in w23 and written back big-endian.
1425.Lctr_enc_short:
1426        stp     x29, x30, [sp, #-96]!
1427        stp     x19, x20, [sp, #16]
1428        stp     x21, x22, [sp, #32]
1429        str     x23, [sp, #48]
1430
1431        mov     x19, x0                     // copy arguments
1432        mov     x20, x1
1433        mov     x21, x2
1434        mov     x22, x3
1435        ldr     w23, [x4, #12]              // load counter .LSW
1436        ldr     q1, [x4]                    // load whole counter value
1437#ifdef __AARCH64EL__
1438        rev     w23, w23
1439#endif
1440        str     q1, [sp, #80]               // copy counter value
1441
1442.Lctr_enc_short_loop:
1443        add     x0, sp, #80                 // input counter value
1444        add     x1, sp, #64                 // output on the stack
1445        mov     x2, x22                     // key
1446
1447        bl      AES_encrypt
1448
1449        ldr     q0, [x19], #16              // load input
1450        ldr     q1, [sp, #64]               // load encrypted counter
1451        add     x23, x23, #1
1452#ifdef __AARCH64EL__
1453        rev     w0, w23
1454        str     w0, [sp, #80+12]            // next counter value
1455#else
1456        str     w23, [sp, #80+12]           // next counter value
1457#endif
1458        eor     v0.16b, v0.16b, v1.16b
1459        str     q0, [x20], #16              // store output
1460        subs    x21, x21, #1
1461        bne     .Lctr_enc_short_loop
1462
        // Scrub the counter and keystream copies from the stack.
1463        movi    v0.16b, #0
1464        movi    v1.16b, #0
1465        stp     q0, q1, [sp, #64]
1466
1467        ldr     x23, [sp, #48]
1468        ldp     x21, x22, [sp, #32]
1469        ldp     x19, x20, [sp, #16]
1470        ldp     x29, x30, [sp], #96
1471        ret
1472.size   ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
1473
1474.globl  ossl_bsaes_xts_encrypt
1475.type   ossl_bsaes_xts_encrypt,%function
1476.align  4
1477// On entry:
1478//   x0 -> input plaintext
1479//   x1 -> output ciphertext
1480//   x2 -> length of text in bytes (must be at least 16)
1481//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
1482//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
1483//   x5 -> 16-byte initial vector (typically, sector number)
1484// On exit:
1485//   Output ciphertext filled in
1486//   No output registers, usual AAPCS64 register preservation
1487ossl_bsaes_xts_encrypt:
1488        AARCH64_VALID_CALL_TARGET
1489        // Stack layout:
1490        // sp ->
1491        //        nrounds*128-96 bytes: key schedule
1492        // x19 ->
1493        //        16 bytes: frame record
1494        //        4*16 bytes: tweak storage across _bsaes_encrypt8
1495        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
1496        //        8*8 bytes: storage for 8 callee-saved SIMD registers
1497        stp     x29, x30, [sp, #-192]!
1498        stp     x19, x20, [sp, #80]
1499        stp     x21, x22, [sp, #96]
1500        str     x23, [sp, #112]
1501        stp     d8, d9, [sp, #128]
1502        stp     d10, d11, [sp, #144]
1503        stp     d12, d13, [sp, #160]
1504        stp     d14, d15, [sp, #176]
1505
1506        mov     x19, sp
1507        mov     x20, x0
1508        mov     x21, x1
1509        mov     x22, x2
1510        mov     x23, x3
1511
1512        // generate initial tweak
1513        sub     sp, sp, #16
1514        mov     x0, x5                      // iv[]
1515        mov     x1, sp
1516        mov     x2, x4                      // key2
1517        bl      AES_encrypt
1518        ldr     q11, [sp], #16
1519
1520        ldr     w1, [x23, #240]             // get # of rounds
1521        // allocate the key schedule on the stack
1522        add     x17, sp, #96
1523        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes
1524
1525        // populate the key schedule
1526        mov     x9, x23                     // pass key
1527        mov     x10, x1                     // pass # of rounds
1528        mov     sp, x17
1529        bl      _bsaes_key_convert
1530        eor     v15.16b, v15.16b, v7.16b    // fix up last round key
1531        str     q15, [x17]                  // save last round key
1532
1533        subs    x22, x22, #0x80
1534        blo     .Lxts_enc_short
1535        b       .Lxts_enc_loop
1536
1537.align  4
1538.Lxts_enc_loop:
1539        ldr     q8, .Lxts_magic
1540        mov     x10, x1                     // pass rounds
1541        add     x2, x19, #16
1542        ldr     q0, [x20], #16
1543        sshr    v1.2d, v11.2d, #63
1544        mov     x9, sp                      // pass key schedule
1545        ldr     q6, .Lxts_magic+16
1546        add     v2.2d, v11.2d, v11.2d
1547        cmtst   v3.2d, v11.2d, v6.2d
1548        and     v1.16b, v1.16b, v8.16b
1549        ext     v1.16b, v1.16b, v1.16b, #8
1550        and     v3.16b, v3.16b, v8.16b
1551        ldr     q4, [x20], #16
1552        eor     v12.16b, v2.16b, v1.16b
1553        eor     v1.16b, v4.16b, v12.16b
1554        eor     v0.16b, v0.16b, v11.16b
1555        cmtst   v2.2d, v12.2d, v6.2d
1556        add     v4.2d, v12.2d, v12.2d
1557        add     x0, x19, #16
1558        ext     v3.16b, v3.16b, v3.16b, #8
1559        and     v2.16b, v2.16b, v8.16b
1560        eor     v13.16b, v4.16b, v3.16b
1561        ldr     q3, [x20], #16
1562        ext     v4.16b, v2.16b, v2.16b, #8
1563        eor     v2.16b, v3.16b, v13.16b
1564        ldr     q3, [x20], #16
1565        add     v5.2d, v13.2d, v13.2d
1566        cmtst   v7.2d, v13.2d, v6.2d
1567        and     v7.16b, v7.16b, v8.16b
1568        ldr     q9, [x20], #16
1569        ext     v7.16b, v7.16b, v7.16b, #8
1570        ldr     q10, [x20], #16
1571        eor     v14.16b, v5.16b, v4.16b
1572        ldr     q16, [x20], #16
1573        add     v4.2d, v14.2d, v14.2d
1574        eor     v3.16b, v3.16b, v14.16b
1575        eor     v15.16b, v4.16b, v7.16b
1576        add     v5.2d, v15.2d, v15.2d
1577        ldr     q7, [x20], #16
1578        cmtst   v4.2d, v14.2d, v6.2d
1579        and     v17.16b, v4.16b, v8.16b
1580        cmtst   v18.2d, v15.2d, v6.2d
1581        eor     v4.16b, v9.16b, v15.16b
1582        ext     v9.16b, v17.16b, v17.16b, #8
1583        eor     v9.16b, v5.16b, v9.16b
1584        add     v17.2d, v9.2d, v9.2d
1585        and     v18.16b, v18.16b, v8.16b
1586        eor     v5.16b, v10.16b, v9.16b
1587        str     q9, [x2], #16
1588        ext     v10.16b, v18.16b, v18.16b, #8
1589        cmtst   v9.2d, v9.2d, v6.2d
1590        and     v9.16b, v9.16b, v8.16b
1591        eor     v10.16b, v17.16b, v10.16b
1592        cmtst   v17.2d, v10.2d, v6.2d
1593        eor     v6.16b, v16.16b, v10.16b
1594        str     q10, [x2], #16
1595        ext     v9.16b, v9.16b, v9.16b, #8
1596        add     v10.2d, v10.2d, v10.2d
1597        eor     v9.16b, v10.16b, v9.16b
1598        str     q9, [x2], #16
1599        eor     v7.16b, v7.16b, v9.16b
1600        add     v9.2d, v9.2d, v9.2d
1601        and     v8.16b, v17.16b, v8.16b
1602        ext     v8.16b, v8.16b, v8.16b, #8
1603        eor     v8.16b, v9.16b, v8.16b
1604        str     q8, [x2]                    // next round tweak
1605
1606        bl      _bsaes_encrypt8
1607
1608        ldr     q8, [x0], #16
1609        eor     v0.16b, v0.16b, v11.16b
1610        eor     v1.16b, v1.16b, v12.16b
1611        ldr     q9, [x0], #16
1612        eor     v4.16b, v4.16b, v13.16b
1613        eor     v6.16b, v6.16b, v14.16b
1614        ldr     q10, [x0], #16
1615        eor     v3.16b, v3.16b, v15.16b
1616        subs    x22, x22, #0x80
1617        str     q0, [x21], #16
1618        ldr     q11, [x0]                   // next round tweak
1619        str     q1, [x21], #16
1620        eor     v0.16b, v7.16b, v8.16b
1621        eor     v1.16b, v2.16b, v9.16b
1622        str     q4, [x21], #16
1623        eor     v2.16b, v5.16b, v10.16b
1624        str     q6, [x21], #16
1625        str     q3, [x21], #16
1626        str     q0, [x21], #16
1627        str     q1, [x21], #16
1628        str     q2, [x21], #16
1629        bpl     .Lxts_enc_loop
1630
1631.Lxts_enc_short:
1632        adds    x22, x22, #0x70
1633        bmi     .Lxts_enc_done
1634
1635        ldr     q8, .Lxts_magic
1636        sshr    v1.2d, v11.2d, #63
1637        add     v2.2d, v11.2d, v11.2d
1638        ldr     q9, .Lxts_magic+16
1639        subs    x22, x22, #0x10
1640        ldr     q0, [x20], #16
1641        and     v1.16b, v1.16b, v8.16b
1642        cmtst   v3.2d, v11.2d, v9.2d
1643        ext     v1.16b, v1.16b, v1.16b, #8
1644        and     v3.16b, v3.16b, v8.16b
1645        eor     v12.16b, v2.16b, v1.16b
1646        ext     v1.16b, v3.16b, v3.16b, #8
1647        add     v2.2d, v12.2d, v12.2d
1648        cmtst   v3.2d, v12.2d, v9.2d
1649        eor     v13.16b, v2.16b, v1.16b
1650        and     v22.16b, v3.16b, v8.16b
1651        bmi     .Lxts_enc_1
1652
1653        ext     v2.16b, v22.16b, v22.16b, #8
1654        add     v3.2d, v13.2d, v13.2d
1655        ldr     q1, [x20], #16
1656        cmtst   v4.2d, v13.2d, v9.2d
1657        subs    x22, x22, #0x10
1658        eor     v14.16b, v3.16b, v2.16b
1659        and     v23.16b, v4.16b, v8.16b
1660        bmi     .Lxts_enc_2
1661
1662        ext     v3.16b, v23.16b, v23.16b, #8
1663        add     v4.2d, v14.2d, v14.2d
1664        ldr     q2, [x20], #16
1665        cmtst   v5.2d, v14.2d, v9.2d
1666        eor     v0.16b, v0.16b, v11.16b
1667        subs    x22, x22, #0x10
1668        eor     v15.16b, v4.16b, v3.16b
1669        and     v24.16b, v5.16b, v8.16b
1670        bmi     .Lxts_enc_3
1671
1672        ext     v4.16b, v24.16b, v24.16b, #8
1673        add     v5.2d, v15.2d, v15.2d
1674        ldr     q3, [x20], #16
1675        cmtst   v6.2d, v15.2d, v9.2d
1676        eor     v1.16b, v1.16b, v12.16b
1677        subs    x22, x22, #0x10
1678        eor     v16.16b, v5.16b, v4.16b
1679        and     v25.16b, v6.16b, v8.16b
1680        bmi     .Lxts_enc_4
1681
1682        ext     v5.16b, v25.16b, v25.16b, #8
1683        add     v6.2d, v16.2d, v16.2d
1684        add     x0, x19, #16
1685        cmtst   v7.2d, v16.2d, v9.2d
1686        ldr     q4, [x20], #16
1687        eor     v2.16b, v2.16b, v13.16b
1688        str     q16, [x0], #16
1689        subs    x22, x22, #0x10
1690        eor     v17.16b, v6.16b, v5.16b
1691        and     v26.16b, v7.16b, v8.16b
1692        bmi     .Lxts_enc_5
1693
1694        ext     v7.16b, v26.16b, v26.16b, #8
1695        add     v18.2d, v17.2d, v17.2d
1696        ldr     q5, [x20], #16
1697        eor     v3.16b, v3.16b, v14.16b
1698        str     q17, [x0], #16
1699        subs    x22, x22, #0x10
1700        eor     v18.16b, v18.16b, v7.16b
1701        bmi     .Lxts_enc_6
1702
1703        ldr     q6, [x20], #16
1704        eor     v4.16b, v4.16b, v15.16b
1705        eor     v5.16b, v5.16b, v16.16b
1706        str     q18, [x0]                   // next round tweak
1707        mov     x9, sp                      // pass key schedule
1708        mov     x10, x1
1709        add     x0, x19, #16
1710        sub     x22, x22, #0x10
1711        eor     v6.16b, v6.16b, v17.16b
1712
1713        bl      _bsaes_encrypt8
1714
1715        ldr     q16, [x0], #16
1716        eor     v0.16b, v0.16b, v11.16b
1717        eor     v1.16b, v1.16b, v12.16b
1718        ldr     q17, [x0], #16
1719        eor     v4.16b, v4.16b, v13.16b
1720        eor     v6.16b, v6.16b, v14.16b
1721        eor     v3.16b, v3.16b, v15.16b
1722        ldr     q11, [x0]                   // next round tweak
1723        str     q0, [x21], #16
1724        str     q1, [x21], #16
1725        eor     v0.16b, v7.16b, v16.16b
1726        eor     v1.16b, v2.16b, v17.16b
1727        str     q4, [x21], #16
1728        str     q6, [x21], #16
1729        str     q3, [x21], #16
1730        str     q0, [x21], #16
1731        str     q1, [x21], #16
1732        b       .Lxts_enc_done
1733
1734.align  4
1735.Lxts_enc_6:
1736        eor     v4.16b, v4.16b, v15.16b
1737        eor     v5.16b, v5.16b, v16.16b
1738        mov     x9, sp                      // pass key schedule
1739        mov     x10, x1                     // pass rounds
1740        add     x0, x19, #16
1741
1742        bl      _bsaes_encrypt8
1743
1744        ldr     q16, [x0], #16
1745        eor     v0.16b, v0.16b, v11.16b
1746        eor     v1.16b, v1.16b, v12.16b
1747        eor     v4.16b, v4.16b, v13.16b
1748        eor     v6.16b, v6.16b, v14.16b
1749        ldr     q11, [x0]                   // next round tweak
1750        eor     v3.16b, v3.16b, v15.16b
1751        str     q0, [x21], #16
1752        str     q1, [x21], #16
1753        eor     v0.16b, v7.16b, v16.16b
1754        str     q4, [x21], #16
1755        str     q6, [x21], #16
1756        str     q3, [x21], #16
1757        str     q0, [x21], #16
1758        b       .Lxts_enc_done
1759
1760.align  4
1761.Lxts_enc_5:
1762        eor     v3.16b, v3.16b, v14.16b
1763        eor     v4.16b, v4.16b, v15.16b
1764        mov     x9, sp                      // pass key schedule
1765        mov     x10, x1                     // pass rounds
1766        add     x0, x19, #16
1767
1768        bl      _bsaes_encrypt8
1769
1770        eor     v0.16b, v0.16b, v11.16b
1771        eor     v1.16b, v1.16b, v12.16b
1772        ldr     q11, [x0]                   // next round tweak
1773        eor     v4.16b, v4.16b, v13.16b
1774        eor     v6.16b, v6.16b, v14.16b
1775        eor     v3.16b, v3.16b, v15.16b
1776        str     q0, [x21], #16
1777        str     q1, [x21], #16
1778        str     q4, [x21], #16
1779        str     q6, [x21], #16
1780        str     q3, [x21], #16
1781        b       .Lxts_enc_done
1782
1783.align  4
1784.Lxts_enc_4:
1785        eor     v2.16b, v2.16b, v13.16b
1786        eor     v3.16b, v3.16b, v14.16b
1787        mov     x9, sp                      // pass key schedule
1788        mov     x10, x1                     // pass rounds
1789        add     x0, x19, #16
1790
1791        bl      _bsaes_encrypt8
1792
1793        eor     v0.16b, v0.16b, v11.16b
1794        eor     v1.16b, v1.16b, v12.16b
1795        eor     v4.16b, v4.16b, v13.16b
1796        eor     v6.16b, v6.16b, v14.16b
1797        mov     v11.16b, v15.16b            // next round tweak
1798        str     q0, [x21], #16
1799        str     q1, [x21], #16
1800        str     q4, [x21], #16
1801        str     q6, [x21], #16
1802        b       .Lxts_enc_done
1803
1804.align  4
1805.Lxts_enc_3:
1806        eor     v1.16b, v1.16b, v12.16b
1807        eor     v2.16b, v2.16b, v13.16b
1808        mov     x9, sp                      // pass key schedule
1809        mov     x10, x1                     // pass rounds
1810        add     x0, x19, #16
1811
1812        bl      _bsaes_encrypt8
1813
1814        eor     v0.16b, v0.16b, v11.16b
1815        eor     v1.16b, v1.16b, v12.16b
1816        eor     v4.16b, v4.16b, v13.16b
1817        mov     v11.16b, v14.16b            // next round tweak
1818        str     q0, [x21], #16
1819        str     q1, [x21], #16
1820        str     q4, [x21], #16
1821        b       .Lxts_enc_done
1822
1823.align  4
1824.Lxts_enc_2:
1825        eor     v0.16b, v0.16b, v11.16b
1826        eor     v1.16b, v1.16b, v12.16b
1827        mov     x9, sp                      // pass key schedule
1828        mov     x10, x1                     // pass rounds
1829        add     x0, x19, #16
1830
1831        bl      _bsaes_encrypt8
1832
1833        eor     v0.16b, v0.16b, v11.16b
1834        eor     v1.16b, v1.16b, v12.16b
1835        mov     v11.16b, v13.16b            // next round tweak
1836        str     q0, [x21], #16
1837        str     q1, [x21], #16
1838        b       .Lxts_enc_done
1839
1840.align  4
1841.Lxts_enc_1:
1842        eor     v0.16b, v0.16b, v11.16b
1843        sub     x0, sp, #16
1844        sub     x1, sp, #16
1845        mov     x2, x23
1846        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
1847        mov     v14.d[0], v12.d[1]
1848        str     q0, [sp, #-16]!
1849
1850        bl      AES_encrypt
1851
1852        ldr     q0, [sp], #16
1853        trn1    v13.2d, v11.2d, v13.2d
1854        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
1855        eor     v0.16b, v0.16b, v13.16b
1856        str     q0, [x21], #16
1857
1858.Lxts_enc_done:
1859        adds    x22, x22, #0x10
1860        beq     .Lxts_enc_ret
1861
1862        sub     x6, x21, #0x10
1863        // Penultimate plaintext block produces final ciphertext part-block
1864        // plus remaining part of final plaintext block. Move ciphertext part
1865        // to final position and reuse penultimate ciphertext block buffer to
1866        // construct final plaintext block
1867.Lxts_enc_steal:
1868        ldrb    w0, [x20], #1
1869        ldrb    w1, [x21, #-0x10]
1870        strb    w0, [x21, #-0x10]
1871        strb    w1, [x21], #1
1872
1873        subs    x22, x22, #1
1874        bhi     .Lxts_enc_steal
1875
1876        // Finally encrypt the penultimate ciphertext block using the
1877        // last tweak
1878        ldr     q0, [x6]
1879        eor     v0.16b, v0.16b, v11.16b
1880        str     q0, [sp, #-16]!
1881        mov     x0, sp
1882        mov     x1, sp
1883        mov     x2, x23
1884        mov     x21, x6
1885        mov     v13.d[0], v11.d[1]          // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
1886
1887        bl      AES_encrypt
1888
1889        trn1    v11.2d, v11.2d, v13.2d
1890        ldr     q0, [sp], #16
1891        eor     v0.16b, v0.16b, v11.16b
1892        str     q0, [x21]
1893
1894.Lxts_enc_ret:
1895
1896        movi    v0.16b, #0
1897        movi    v1.16b, #0
1898.Lxts_enc_bzero: // wipe key schedule
1899        stp     q0, q1, [sp], #32
1900        cmp     sp, x19
1901        bne     .Lxts_enc_bzero
1902
1903        ldp     x19, x20, [sp, #80]
1904        ldp     x21, x22, [sp, #96]
1905        ldr     x23, [sp, #112]
1906        ldp     d8, d9, [sp, #128]
1907        ldp     d10, d11, [sp, #144]
1908        ldp     d12, d13, [sp, #160]
1909        ldp     d14, d15, [sp, #176]
1910        ldp     x29, x30, [sp], #192
1911        ret
1912.size   ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
1913
// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align  5
.Lxts_magic:
// Constants for the XTS tweak update (doubling in GF(2^128)):
//   lanes 0-1, {1, 0x87}: ANDed with the per-lane sign masks produced by
//   `sshr #63`; after an `ext #8` half-swap, the high lane's sign feeds the
//   reduction constant 0x87 (x^7+x^2+x+1) into the low lane and the low
//   lane's sign carries a 1 into the high lane;
//   lanes 2-3, {1<<62, 1<<62}: `cmtst` mask testing bit 62 of each lane,
//   i.e. what each lane's sign bit will be after ONE doubling, so the carry
//   masks for the following doubling can be computed a step ahead.
.quad   1, 0x87, 0x4000000000000000, 0x4000000000000000
1919
.globl  ossl_bsaes_xts_decrypt
.type   ossl_bsaes_xts_decrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 -> length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
        AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //        nrounds*128-96 bytes: key schedule
        // x19 ->
        //        16 bytes: frame record
        //        4*16 bytes: tweak storage across _bsaes_decrypt8
        //        6*8 bytes: storage for 5 callee-saved general-purpose registers
        //        8*8 bytes: storage for 8 callee-saved SIMD registers
        stp     x29, x30, [sp, #-192]!
        stp     x19, x20, [sp, #80]
        stp     x21, x22, [sp, #96]
        str     x23, [sp, #112]
        stp     d8, d9, [sp, #128]
        stp     d10, d11, [sp, #144]
        stp     d12, d13, [sp, #160]
        stp     d14, d15, [sp, #176]

        mov     x19, sp                     // x19 = frame base; sp moves below for the key schedule
        mov     x20, x0                     // x20 = input pointer (preserved across calls)
        mov     x21, x1                     // x21 = output pointer
        mov     x22, x2                     // x22 = remaining length in bytes
        mov     x23, x3                     // x23 = key1

        // generate initial tweak
        sub     sp, sp, #16                 // temporary 16-byte slot for iv -> tweak
        mov     x0, x5                      // iv[]
        mov     x1, sp
        mov     x2, x4                      // key2
        bl      AES_encrypt
        ldr     q11, [sp], #16              // v11 = initial tweak = Enc_key2(iv)

        ldr     w1, [x23, #240]             // get # of rounds
        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x1, lsl #7        // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x23                     // pass key
        mov     x10, x1                     // pass # of rounds
        mov     sp, x17
        bl      _bsaes_key_convert
        ldr     q6,  [sp]
        str     q15, [x17]                  // save last round key
        eor     v6.16b, v6.16b, v7.16b      // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        // For ciphertext stealing, hold back the last full block: if the
        // length is not a multiple of 16, process one less full block in the
        // bulk code below.  x30 (lr) is free as scratch here - it was saved
        // in the frame record above.
        sub     x30, x22, #0x10
        tst     x22, #0xf                   // if not multiple of 16
        csel    x22, x30, x22, ne           // subtract another 16 bytes
        subs    x22, x22, #0x80             // at least 8 full blocks to process?

        blo     .Lxts_dec_short             // fewer than 8 blocks: one-at-a-time path
        b       .Lxts_dec_loop
1988
// Main loop: decrypt eight blocks per iteration.  v11 is the current tweak
// on entry.  Tweaks for the eight blocks are v11..v15 in registers, with the
// remaining three (plus the next iteration's starting tweak) spilled to the
// tweak area at x19+16, since _bsaes_decrypt8 clobbers most vector
// registers.  Tweak doubling, ciphertext loads and the pre-whitening XORs
// are deliberately interleaved for scheduling; instruction order matters.
.align  4
.Lxts_dec_loop:
        ldr     q8, .Lxts_magic             // v8 = {1, 0x87} carry/reduction masks
        mov     x10, x1                     // pass rounds
        add     x2, x19, #16                // x2 -> tweak spill area
        ldr     q0, [x20], #16
        sshr    v1.2d, v11.2d, #63          // per-lane sign masks of current tweak
        mov     x9, sp                      // pass key schedule
        ldr     q6, .Lxts_magic+16          // v6 = bit-62 test mask (next doubling's signs)
        add     v2.2d, v11.2d, v11.2d       // double both lanes
        cmtst   v3.2d, v11.2d, v6.2d        // pre-compute sign masks of the doubled tweak
        and     v1.16b, v1.16b, v8.16b
        ext     v1.16b, v1.16b, v1.16b, #8  // swap halves: carry up, 0x87 reduction down
        and     v3.16b, v3.16b, v8.16b
        ldr     q4, [x20], #16
        eor     v12.16b, v2.16b, v1.16b     // v12 = tweak for block 1
        eor     v1.16b, v4.16b, v12.16b     // pre-whiten block 1
        eor     v0.16b, v0.16b, v11.16b     // pre-whiten block 0
        cmtst   v2.2d, v12.2d, v6.2d
        add     v4.2d, v12.2d, v12.2d
        add     x0, x19, #16                // x0 -> tweak spill area (for reload after call)
        ext     v3.16b, v3.16b, v3.16b, #8
        and     v2.16b, v2.16b, v8.16b
        eor     v13.16b, v4.16b, v3.16b     // v13 = tweak for block 2
        ldr     q3, [x20], #16
        ext     v4.16b, v2.16b, v2.16b, #8
        eor     v2.16b, v3.16b, v13.16b     // pre-whiten block 2
        ldr     q3, [x20], #16
        add     v5.2d, v13.2d, v13.2d
        cmtst   v7.2d, v13.2d, v6.2d
        and     v7.16b, v7.16b, v8.16b
        ldr     q9, [x20], #16
        ext     v7.16b, v7.16b, v7.16b, #8
        ldr     q10, [x20], #16
        eor     v14.16b, v5.16b, v4.16b     // v14 = tweak for block 3
        ldr     q16, [x20], #16
        add     v4.2d, v14.2d, v14.2d
        eor     v3.16b, v3.16b, v14.16b     // pre-whiten block 3
        eor     v15.16b, v4.16b, v7.16b     // v15 = tweak for block 4
        add     v5.2d, v15.2d, v15.2d
        ldr     q7, [x20], #16
        cmtst   v4.2d, v14.2d, v6.2d
        and     v17.16b, v4.16b, v8.16b
        cmtst   v18.2d, v15.2d, v6.2d
        eor     v4.16b, v9.16b, v15.16b     // pre-whiten block 4
        ext     v9.16b, v17.16b, v17.16b, #8
        eor     v9.16b, v5.16b, v9.16b      // tweak for block 5
        add     v17.2d, v9.2d, v9.2d
        and     v18.16b, v18.16b, v8.16b
        eor     v5.16b, v10.16b, v9.16b     // pre-whiten block 5
        str     q9, [x2], #16               // spill tweak 5
        ext     v10.16b, v18.16b, v18.16b, #8
        cmtst   v9.2d, v9.2d, v6.2d
        and     v9.16b, v9.16b, v8.16b
        eor     v10.16b, v17.16b, v10.16b   // tweak for block 6
        cmtst   v17.2d, v10.2d, v6.2d
        eor     v6.16b, v16.16b, v10.16b    // pre-whiten block 6
        str     q10, [x2], #16              // spill tweak 6
        ext     v9.16b, v9.16b, v9.16b, #8
        add     v10.2d, v10.2d, v10.2d
        eor     v9.16b, v10.16b, v9.16b     // tweak for block 7
        str     q9, [x2], #16               // spill tweak 7
        eor     v7.16b, v7.16b, v9.16b      // pre-whiten block 7
        add     v9.2d, v9.2d, v9.2d
        and     v8.16b, v17.16b, v8.16b
        ext     v8.16b, v8.16b, v8.16b, #8
        eor     v8.16b, v9.16b, v8.16b
        str     q8, [x2]                    // next round tweak

        bl      _bsaes_decrypt8

        // _bsaes_decrypt8 returns the blocks in bit-sliced register order
        // (v0,v1,v6,v4,v2,v7,v3,v5 for blocks 0..7, as paired with the
        // tweaks below).  Post-whiten with the same tweaks - the spilled
        // ones are reloaded from x0 - and store in block order.
        eor     v6.16b, v6.16b, v13.16b
        eor     v0.16b, v0.16b, v11.16b
        ldr     q8, [x0], #16               // reload tweak 5
        eor     v7.16b, v7.16b, v8.16b
        str     q0, [x21], #16
        eor     v0.16b, v1.16b, v12.16b
        ldr     q1, [x0], #16               // reload tweak 6
        eor     v1.16b, v3.16b, v1.16b
        subs    x22, x22, #0x80
        eor     v2.16b, v2.16b, v15.16b
        eor     v3.16b, v4.16b, v14.16b
        ldr     q4, [x0], #16               // reload tweak 7
        str     q0, [x21], #16
        ldr     q11, [x0]                   // next round tweak
        eor     v0.16b, v5.16b, v4.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q2, [x21], #16
        str     q7, [x21], #16
        str     q1, [x21], #16
        str     q0, [x21], #16
        bpl     .Lxts_dec_loop
2082
// Fewer than 8 full blocks remain.  x22 holds remaining-length-minus-0x80;
// restore all but one block's worth, then peel off one block at a time -
// computing its tweak (and the carry mask v22..v26 for the NEXT tweak, only
// consumed on fall-through) and loading its ciphertext - branching to the
// matching tail routine once the count runs out.  Falling through all the
// tests handles the 7-block case inline.
.Lxts_dec_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_dec_done              // less than one full block left

        ldr     q8, .Lxts_magic             // v8 = {1, 0x87} carry/reduction masks
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16          // v9 = bit-62 test mask
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b     // tweak for block 1
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b     // tweak for block 2
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_dec_1                 // exactly 1 block

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b     // tweak for block 3
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_dec_2                 // exactly 2 blocks

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b     // pre-whiten block 0
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b     // tweak for block 4
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_dec_3                 // exactly 3 blocks

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b     // pre-whiten block 1
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b     // tweak for block 5
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_dec_4                 // exactly 4 blocks

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16                // x0 -> tweak spill area
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b     // pre-whiten block 2
        str     q16, [x0], #16              // spill tweak 5
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b     // tweak for block 6
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_dec_5                 // exactly 5 blocks

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b     // pre-whiten block 3
        str     q17, [x0], #16              // spill tweak 6
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b    // next round tweak
        bmi     .Lxts_dec_6                 // exactly 6 blocks

        // 7-block case, handled inline.
        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b     // pre-whiten block 4
        eor     v5.16b, v5.16b, v16.16b     // pre-whiten block 5
        str     q18, [x0]                   // next round tweak
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b     // pre-whiten block 6

        bl      _bsaes_decrypt8

        // Post-whiten and store 7 blocks; spilled tweaks reloaded from x0.
        ldr     q16, [x0], #16              // reload tweak 5
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16              // reload tweak 6
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        ldr     q11, [x0]                   // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v3.16b, v17.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done
2185
// Tail routines: .Lxts_dec_N decrypts exactly N blocks.  On entry the first
// N ciphertext blocks are loaded and (except the last one or two) already
// XORed with their tweaks; each routine finishes the pre-whitening, runs
// the bit-sliced decryptor, post-whitens, stores, and leaves the next tweak
// in v11.  The 2..6 cases still pay for the full 8-wide _bsaes_decrypt8;
// the 1-block case uses the plain AES_decrypt instead.
.align  4
.Lxts_dec_6:
        eor     v4.16b, v4.16b, v15.16b     // pre-whiten block 4
        eor     v5.16b, v5.16b, v16.16b     // pre-whiten block 5
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16                // x0 -> spilled tweaks

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16              // reload tweak 5
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_5:
        eor     v3.16b, v3.16b, v14.16b     // pre-whiten block 3
        eor     v4.16b, v4.16b, v15.16b     // pre-whiten block 4
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16                // x0 -> spilled tweaks

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]                   // next round tweak
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        eor     v2.16b, v2.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        str     q2, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_4:
        eor     v2.16b, v2.16b, v13.16b     // pre-whiten block 2
        eor     v3.16b, v3.16b, v14.16b     // pre-whiten block 3
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        eor     v4.16b, v4.16b, v14.16b
        mov     v11.16b, v15.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_3:
        eor     v1.16b, v1.16b, v12.16b     // pre-whiten block 1
        eor     v2.16b, v2.16b, v13.16b     // pre-whiten block 2
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v13.16b
        mov     v11.16b, v14.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_dec_done

.align  4
.Lxts_dec_2:
        eor     v0.16b, v0.16b, v11.16b     // pre-whiten block 0
        eor     v1.16b, v1.16b, v12.16b     // pre-whiten block 1
        mov     x9, sp                      // pass key schedule
        mov     x10, x1                     // pass rounds
        add     x0, x19, #16

        bl      _bsaes_decrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b            // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_dec_done

// Single block: decrypt in place in a stack slot with the scalar
// AES_decrypt.  AAPCS64 only guarantees the LOW 64 bits of v8-v15 across
// the call, so the tweaks' top halves are parked in the low halves of
// v13/v14 and stitched back with trn1 afterwards.
.align  4
.Lxts_dec_1:
        eor     v0.16b, v0.16b, v11.16b     // pre-whiten with current tweak
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!

        bl      AES_decrypt

        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d      // reassemble full current tweak
        trn1    v11.2d, v12.2d, v14.2d      // next round tweak
        eor     v0.16b, v0.16b, v13.16b     // post-whiten
        str     q0, [x21], #16
2309
// Handle a trailing partial block via ciphertext stealing.  On the decrypt
// side the order is reversed relative to encryption: the last FULL
// ciphertext block is decrypted with the LAST tweak (v12, one doubling past
// v11) and the stolen partial block with the PENULTIMATE tweak v11.
.Lxts_dec_done:
        adds    x22, x22, #0x10
        beq     .Lxts_dec_ret               // length was a multiple of 16 - done

        // calculate one round of extra tweak for the stolen ciphertext
        ldr     q8, .Lxts_magic
        sshr    v6.2d, v11.2d, #63
        and     v6.16b, v6.16b, v8.16b
        add     v12.2d, v11.2d, v11.2d
        ext     v6.16b, v6.16b, v6.16b, #8
        eor     v12.16b, v12.16b, v6.16b    // v12 = last tweak

        // perform the final decryption with the last tweak value
        ldr     q0, [x20], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [sp, #-16]!             // decrypt in place in a stack slot
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     v13.d[0], v11.d[1]          // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]

        bl      AES_decrypt

        trn1    v12.2d, v12.2d, v14.2d      // reassemble full tweaks from saved top halves
        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v12.16b
        str     q0, [x21]                   // plaintext part-block + tail, fixed up below

        mov     x6, x21                     // remember position of penultimate plaintext block
        // Penultimate ciphertext block produces final plaintext part-block
        // plus remaining part of final ciphertext block. Move plaintext part
        // to final position and reuse penultimate plaintext block buffer to
        // construct final ciphertext block
.Lxts_dec_steal:
        ldrb    w1, [x21]                   // byte of decrypted part-block
        ldrb    w0, [x20], #1               // remaining ciphertext byte
        strb    w1, [x21, #0x10]            // move plaintext byte to final position
        strb    w0, [x21], #1               // splice ciphertext byte into the buffer

        subs    x22, x22, #1
        bhi     .Lxts_dec_steal

        // Finally decrypt the penultimate plaintext block using the
        // penultimate tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6
        // (v13.d[0] still holds v11's top half from the save above)

        bl      AES_decrypt

        trn1    v11.2d, v11.2d, v13.2d      // reassemble full tweak again
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_dec_ret:

        // Wipe the key schedule from the stack (from sp up to the frame
        // base at x19), then restore callee-saved registers and return.
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_dec_bzero

        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size   ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
2390