xref: /freebsd/sys/crypto/openssl/aarch64/vpaes-armv8.S (revision 580d00f42fdd94ce43583cc45fe3f1d9fdff47d4)
1/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
2#include "arm_arch.h"
3
4.text
5
//
//  _vpaes_consts
//
//  Lookup tables shared by the routines below.  They sit in .text and are
//  addressed PC-relative with adr.  The layout is load-bearing:
//    - the *_preheat routines fetch runs of adjacent tables with
//      post-incremented ld1 (.Lk_inv through .Lk_sb2, .Lk_dipt through
//      .Lk_dsbe, .Lk_dksd through .Lk_rcon), so those runs must stay
//      contiguous and in this order;
//    - the encrypt cores reach .Lk_mc_backward and .Lk_sr by adding 0x40
//      and 0x80 to a pointer into .Lk_mc_forward, and wrap that pointer by
//      clearing bit 6, which only works because the block starts on a
//      128-byte boundary (.align 7).
//
6.type	_vpaes_consts,%object
7.align	7	// totally strategic alignment
8_vpaes_consts:
// Four 16-byte entries each; indexed by a pointer that advances 16 bytes
// per round in the encrypt cores.
9.Lk_mc_forward:	//	mc_forward
10.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
11.quad	0x080B0A0904070605, 0x000302010C0F0E0D
12.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
13.quad	0x000302010C0F0E0D, 0x080B0A0904070605
14.Lk_mc_backward:	//	mc_backward
15.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
16.quad	0x020100030E0D0C0F, 0x0A09080B06050407
17.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
18.quad	0x0A09080B06050407, 0x020100030E0D0C0F
19.Lk_sr:	//	sr
20.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
21.quad	0x030E09040F0A0500, 0x0B06010C07020D08
22.quad	0x0F060D040B020900, 0x070E050C030A0108
23.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
24
25//
26// "Hot" constants
27//
// .Lk_inv .. .Lk_sb2 are loaded back-to-back by _vpaes_encrypt_preheat.
28.Lk_inv:	//	inv, inva
29.quad	0x0E05060F0D080180, 0x040703090A0B0C02
30.quad	0x01040A060F0B0780, 0x030D0E0C02050809
31.Lk_ipt:	//	input transform (lo, hi)
32.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
33.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
34.Lk_sbo:	//	sbou, sbot
35.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
36.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
37.Lk_sb1:	//	sb1u, sb1t
38.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
39.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
40.Lk_sb2:	//	sb2u, sb2t
41.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
42.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
43
44//
45//  Decryption stuff
46//
// .Lk_dipt .. .Lk_dsbe are loaded back-to-back by _vpaes_decrypt_preheat.
47.Lk_dipt:	//	decryption input transform
48.quad	0x0F505B040B545F00, 0x154A411E114E451A
49.quad	0x86E383E660056500, 0x12771772F491F194
50.Lk_dsbo:	//	decryption sbox final output
51.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
52.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
53.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
54.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
55.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
56.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
57.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
58.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
59.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
60.quad	0xD022649296B44200, 0x602646F6B0F2D404
61.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
62.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
63.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
64.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
65
66//
67//  Key schedule constants
68//
// .Lk_dksd .. .Lk_rcon are loaded back-to-back by _vpaes_key_preheat.
69.Lk_dksd:	//	decryption key schedule: invskew x*D
70.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
71.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
72.Lk_dksb:	//	decryption key schedule: invskew x*B
73.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
74.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
75.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
76.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
77.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
78.Lk_dks9:	//	decryption key schedule: invskew x*9
79.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
80.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
81
82.Lk_rcon:	//	rcon
83.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
84
// .Lk_opt and .Lk_deskew are loaded into v20/v21 by the tail of
// _vpaes_schedule_core (.Lschedule_mangle_last).
85.Lk_opt:	//	output transform
86.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
87.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
88.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
89.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
90.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
91
// NUL-terminated ASCII credit string:
// "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
92.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
93.align	2
94.size	_vpaes_consts,.-_vpaes_consts
95.align	6
//
//  _vpaes_encrypt_preheat
//
//  Loads the constants the encryption cores expect to find in registers:
//     v17     = 0x0f in every byte (low-nibble mask)
//     v18-v19 = .Lk_inv
//     v20-v21 = .Lk_ipt
//     v22-v23 = .Lk_sbo
//     v24-v25 = .Lk_sb1
//     v26-v27 = .Lk_sb2
//
//  The five tables are fetched with one pointer and post-incremented
//  loads, so they must remain adjacent and in this order in _vpaes_consts.
//
//  Clobbers x10 (left pointing at .Lk_sb1).
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	movi	v17.16b, #0x0f
	adr	x10, .Lk_inv
	ld1	{v18.2d-v19.2d}, [x10], #32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x10], #64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
112
113//
114//  _vpaes_encrypt_core
115//
116//  AES-encrypt the 16-byte block held in v7.
117//
118//  Inputs:
119//     v7      = input block (read only)
120//     v17-v27 = constants as loaded by _vpaes_encrypt_preheat
121//     x2      = scheduled keys; the round count is read from [x2,#240]
122//
123//  Output in v0
124//  Clobbers  v1-v5, v16, w8, x9, x10, x11
125//  Does not touch v6 or v8-v15
126//
127//  The trailing comments name the x86 instruction each line was ported from.
128.type	_vpaes_encrypt_core,%function
129.align	4
130_vpaes_encrypt_core:
	// x9 walks the round keys, w8 counts the rounds still to run, and
	// x11 walks .Lk_mc_forward starting at its second entry.
131	mov	x9, x2
132	ldr	w8, [x2,#240]			// pull rounds
133	adr	x11, .Lk_mc_forward+16
	// Input transform: split every byte of v7 into low/high nibbles, look
	// them up in .Lk_ipt (v20/v21) and xor in round key 0 (v16).
134						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
135	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
136	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
137	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
138	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
139						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
140	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
141	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
142	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
143	b	.Lenc_entry
144
145.align	4
146.Lenc_loop:
147	// middle of middle round
	// x10 = .Lk_mc_backward entry paired with the .Lk_mc_forward entry at
	// x11 (the two tables are 0x40 bytes apart).
148	add	x10, x11, #0x40
149	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
150	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
151	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
152	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
153	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
154	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
155	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
156	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
157	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
158	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
159	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
160	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
161	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
162	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
	// Clearing bit 6 wraps x11 from the end of .Lk_mc_forward back to its
	// start; this relies on the 128-byte alignment of _vpaes_consts.
163	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
164	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
165	sub	w8, w8, #1			// nr--
166
167.Lenc_entry:
168	// top of round
	// Produces v2 ("io") and v3 ("jo") from v0 via nibble lookups in
	// .Lk_inv (v18/v19), then fetches the next round key into v16.
169	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
170	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
171	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
172	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
173	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
174	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
175	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
176	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
177	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
178	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
179	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
180	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
181	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
182	cbnz	w8, .Lenc_loop
183
184	// middle of last round
	// x10 = .Lk_sr entry paired with the current .Lk_mc_forward position
	// (the two tables are 0x80 bytes apart).
185	add	x10, x11, #0x80
186						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
187						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
188	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
189	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
190	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
191	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
192	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
193	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
194	ret
195.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
196
//
//  vpaes_encrypt
//
//  Exported single-block entry point.
//
//  Inputs (as used below):
//     x0 = address of the 16-byte input block
//     x1 = address that receives the 16-byte result
//     x2 = scheduled keys (consumed by _vpaes_encrypt_core)
//
//  Builds a standard frame record because it calls out with bl.
//
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp, #-16]!		// push frame record
	mov	x29, sp

	ld1	{v7.16b}, [x0]			// v7 = input block
	bl	_vpaes_encrypt_preheat		// v17-v27 = constants
	bl	_vpaes_encrypt_core		// v0 = result
	st1	{v0.16b}, [x1]

	ldp	x29, x30, [sp], #16		// pop frame record
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt
214
//
//  _vpaes_encrypt_2x
//
//  Two-block form of _vpaes_encrypt_core.  The same instruction sequence
//  is issued twice, interleaved: once on v0-v5 for the first block and
//  once on v8-v13 for the second.  Vectors fetched from memory inside the
//  loop (v1, v4, v16) are shared by both streams.
//
//  Inputs:
//     v14, v15 = input blocks (read only)
//     v17-v27  = constants as loaded by _vpaes_encrypt_preheat
//     x2       = scheduled keys; the round count is read from [x2,#240]
//
//  Outputs: v0 (from v14) and v1 (from v15)
//  Clobbers v2-v5, v8-v13, v16, w8, x9, x10, x11
//  NOTE(review): v8-v13 are callee-saved (low 64 bits) under AAPCS64 and
//  are not saved here; callers presumably preserve them -- confirm.
//
215.type	_vpaes_encrypt_2x,%function
216.align	4
217_vpaes_encrypt_2x:
218	mov	x9, x2
219	ldr	w8, [x2,#240]			// pull rounds
220	adr	x11, .Lk_mc_forward+16
	// Input transform on both blocks (nibble lookups in .Lk_ipt, xor of
	// round key 0).
221						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
222	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
223	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
224	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
225	and	v9.16b,  v15.16b,  v17.16b
226	ushr	v8.16b,  v15.16b,  #4
227	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
228	tbl	v9.16b,  {v20.16b}, v9.16b
229						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
230	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
231	tbl	v10.16b, {v21.16b}, v8.16b
232	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
233	eor	v8.16b,  v9.16b,   v16.16b
234	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
235	eor	v8.16b,  v8.16b,   v10.16b
236	b	.Lenc_2x_entry
237
238.align	4
239.Lenc_2x_loop:
240	// middle of middle round
	// x10 = .Lk_mc_backward entry paired with the .Lk_mc_forward entry at x11.
241	add	x10, x11, #0x40
242	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
243	tbl	v12.16b, {v25.16b}, v10.16b
244	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
245	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
246	tbl	v8.16b,  {v24.16b}, v11.16b
247	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
248	eor	v12.16b, v12.16b, v16.16b
249	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
250	tbl	v13.16b, {v27.16b}, v10.16b
251	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
252	eor	v8.16b,  v8.16b,  v12.16b
253	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
254	tbl	v10.16b, {v26.16b}, v11.16b
255	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
256	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
257	tbl	v11.16b, {v8.16b}, v1.16b
258	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
259	eor	v10.16b, v10.16b, v13.16b
260	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
261	tbl	v8.16b,  {v8.16b}, v4.16b
262	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
263	eor	v11.16b, v11.16b, v10.16b
264	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
265	tbl	v12.16b, {v11.16b},v1.16b
266	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
267	eor	v8.16b,  v8.16b,  v11.16b
	// Clearing bit 6 wraps x11 back to the start of .Lk_mc_forward
	// (relies on the 128-byte alignment of _vpaes_consts).
268	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
269	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
270	eor	v8.16b,  v8.16b,  v12.16b
271	sub	w8, w8, #1			// nr--
272
273.Lenc_2x_entry:
274	// top of round
275	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
276	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
277	and	v9.16b,  v8.16b, v17.16b
278	ushr	v8.16b,  v8.16b, #4
279	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
280	tbl	v13.16b, {v19.16b},v9.16b
281	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
282	eor	v9.16b,  v9.16b,  v8.16b
283	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
284	tbl	v11.16b, {v18.16b},v8.16b
285	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
286	tbl	v12.16b, {v18.16b},v9.16b
287	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
288	eor	v11.16b, v11.16b, v13.16b
289	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
290	eor	v12.16b, v12.16b, v13.16b
291	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
292	tbl	v10.16b, {v18.16b},v11.16b
293	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
294	tbl	v11.16b, {v18.16b},v12.16b
295	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
296	eor	v10.16b, v10.16b, v9.16b
297	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
298	eor	v11.16b, v11.16b, v8.16b
299	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
300	cbnz	w8, .Lenc_2x_loop
301
302	// middle of last round
	// x10 = .Lk_sr entry paired with the current .Lk_mc_forward position.
303	add	x10, x11, #0x80
304						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
305						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
306	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
307	tbl	v12.16b, {v22.16b}, v10.16b
308	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
309	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
310	tbl	v8.16b,  {v23.16b}, v11.16b
311	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
312	eor	v12.16b, v12.16b, v16.16b
313	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
314	eor	v8.16b,  v8.16b,  v12.16b
	// Final permutation; the second result moves from v8 into v1, which
	// overwrites the permutation vector, so this pair must stay in order.
315	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
316	tbl	v1.16b,  {v8.16b},v1.16b
317	ret
318.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
319
//
//  _vpaes_decrypt_preheat
//
//  Loads the constants the decryption cores expect to find in registers:
//     v17     = 0x0f in every byte (low-nibble mask)
//     v18-v19 = .Lk_inv
//     v20-v21 = .Lk_dipt
//     v22-v23 = .Lk_dsbo
//     v24-v25 = .Lk_dsb9
//     v26-v27 = .Lk_dsbd
//     v28-v29 = .Lk_dsbb
//     v30-v31 = .Lk_dsbe
//
//  The six decryption tables are fetched with one pointer and
//  post-incremented loads, so they must remain adjacent and in this order.
//
//  Clobbers x10 and x11.
//
.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	movi	v17.16b, #0x0f
	adr	x10, .Lk_inv
	adr	x11, .Lk_dipt
	ld1	{v18.2d-v19.2d}, [x10], #32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x11], #64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d-v27.2d}, [x11], #64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
332
333//
334//  Decryption core (_vpaes_decrypt_core)
335//
336//  Same register interface as the encryption core:
//     v7      = input block (read only)
//     v17-v31 = constants as loaded by _vpaes_decrypt_preheat
//     x2      = scheduled keys; the round count is read from [x2,#240]
//  Output in v0
//  Clobbers v1-v5, v16, w8, x9, x10, x11
337//
338.type	_vpaes_decrypt_core,%function
339.align	4
340_vpaes_decrypt_core:
341	mov	x9, x2
342	ldr	w8, [x2,#240]			// pull rounds
343
	// x11 = .Lk_sr + (((rounds << 4) ^ 0x30) & 0x30): the .Lk_sr entry
	// used for the final permutation.  x10 is only needed to load v5.
344						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
345	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
346	eor	x11, x11, #0x30			// xor		$0x30,	%r11
347	adr	x10, .Lk_sr
348	and	x11, x11, #0x30			// and		$0x30,	%r11
349	add	x11, x11, x10
350	adr	x10, .Lk_mc_forward+48
351
	// Input transform: nibble lookups in .Lk_dipt (v20/v21) xored with
	// round key 0.  v5 starts as the last .Lk_mc_forward entry.
352	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
353	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
354	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
355	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
356	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
357						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
358	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
359	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
360	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
361	b	.Ldec_entry
362
363.align	4
364.Ldec_loop:
365//
366//  Inverse mix columns
367//
// Four stages, one per table pair (dsb9, dsbd, dsbb, dsbe).  Each stage
// looks up v2/v3 and xors the results into v0; between stages v0 is
// permuted through v5.  The round key (v16) enters in the first stage.
368						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
369						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
370	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
371	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
372	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
373						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
374	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
375						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
376
377	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
378	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
379	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
380	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
381						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
382	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
383						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
384
385	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
386	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
387	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
388	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
389						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
390	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
391						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
392
393	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
394	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
395	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
396	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
	// Rotate the permutation vector by 12 bytes for the next round.
397	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
398	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
399	sub	w8, w8, #1			// sub		$1,%rax			# nr--
400
401.Ldec_entry:
402	// top of round
	// Produces v2 ("io") and v3 ("jo") from v0 via nibble lookups in
	// .Lk_inv (v18/v19), then fetches the next round key into v16.
403	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
404	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
405	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
406	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
407	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
408	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
409	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
410	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
411	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
412	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
413	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
414	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
415	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
416	cbnz	w8, .Ldec_loop
417
418	// middle of last round
	// Final lookup in .Lk_dsbo (v22/v23), xor of the last round key, then
	// the .Lk_sr permutation selected into x11 at entry.
419						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
420	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
421						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
422	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
423	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
424	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
425	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
426	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
427	ret
428.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
429
//
//  vpaes_decrypt
//
//  Exported single-block entry point.
//
//  Inputs (as used below):
//     x0 = address of the 16-byte input block
//     x1 = address that receives the 16-byte result
//     x2 = scheduled keys (consumed by _vpaes_decrypt_core)
//
//  Builds a standard frame record because it calls out with bl.
//
.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp, #-16]!		// push frame record
	mov	x29, sp

	ld1	{v7.16b}, [x0]			// v7 = input block
	bl	_vpaes_decrypt_preheat		// v17-v31 = constants
	bl	_vpaes_decrypt_core		// v0 = result
	st1	{v0.16b}, [x1]

	ldp	x29, x30, [sp], #16		// pop frame record
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt
447
//
//  _vpaes_decrypt_2x
//
//  Two-block form of _vpaes_decrypt_core.  The same instruction sequence
//  is issued twice, interleaved: once on v0-v4 for the first block and
//  once on v8-v12 for the second.  v5 (permutation vector) and v16 (round
//  key) are shared by both streams.
//
//  Inputs:
//     v14, v15 = input blocks (read only)
//     v17-v31  = constants as loaded by _vpaes_decrypt_preheat
//     x2       = scheduled keys; the round count is read from [x2,#240]
//
//  Clobbers v2-v5, v8-v12, v16, w8, x9, x10, x11
//  NOTE(review): v8-v12 are callee-saved (low 64 bits) under AAPCS64 and
//  are not saved here; callers presumably preserve them -- confirm.
//
448// v14-v15 input, v0-v1 output
449.type	_vpaes_decrypt_2x,%function
450.align	4
451_vpaes_decrypt_2x:
452	mov	x9, x2
453	ldr	w8, [x2,#240]			// pull rounds
454
	// x11 = .Lk_sr + (((rounds << 4) ^ 0x30) & 0x30): the .Lk_sr entry
	// used for the final permutation.  x10 is only needed to load v5.
455						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
456	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
457	eor	x11, x11, #0x30			// xor		$0x30,	%r11
458	adr	x10, .Lk_sr
459	and	x11, x11, #0x30			// and		$0x30,	%r11
460	add	x11, x11, x10
461	adr	x10, .Lk_mc_forward+48
462
463	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
464	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
465	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
466	and	v9.16b,  v15.16b, v17.16b
467	ushr	v8.16b,  v15.16b, #4
468	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
469	tbl	v10.16b, {v20.16b},v9.16b
470	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
471						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
472	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
473	tbl	v8.16b,  {v21.16b},v8.16b
474	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
475	eor	v10.16b, v10.16b, v16.16b
476	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
477	eor	v8.16b,  v8.16b,  v10.16b
478	b	.Ldec_2x_entry
479
480.align	4
481.Ldec_2x_loop:
482//
483//  Inverse mix columns
484//
// Four stages, one per table pair (dsb9, dsbd, dsbb, dsbe), each applied
// to both blocks; between stages v0 and v8 are permuted through v5.
485						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
486						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
487	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
488	tbl	v12.16b, {v24.16b}, v10.16b
489	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
490	tbl	v9.16b,  {v25.16b}, v11.16b
491	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
492	eor	v8.16b,  v12.16b, v16.16b
493						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
494	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
495	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
496						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
497
498	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
499	tbl	v12.16b, {v26.16b}, v10.16b
500	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
501	tbl	v8.16b,  {v8.16b},v5.16b
502	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
503	tbl	v9.16b,  {v27.16b}, v11.16b
504	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
505	eor	v8.16b,  v8.16b,  v12.16b
506						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
507	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
508	eor	v8.16b,  v8.16b,  v9.16b
509						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
510
511	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
512	tbl	v12.16b, {v28.16b}, v10.16b
513	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
514	tbl	v8.16b,  {v8.16b},v5.16b
515	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
516	tbl	v9.16b,  {v29.16b}, v11.16b
517	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
518	eor	v8.16b,  v8.16b,  v12.16b
519						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
520	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
521	eor	v8.16b,  v8.16b,  v9.16b
522						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
523
524	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
525	tbl	v12.16b, {v30.16b}, v10.16b
526	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
527	tbl	v8.16b,  {v8.16b},v5.16b
528	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
529	tbl	v9.16b,  {v31.16b}, v11.16b
530	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
531	eor	v8.16b,  v8.16b,  v12.16b
	// Rotate the permutation vector by 12 bytes for the next round.
532	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
533	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
534	eor	v8.16b,  v8.16b,  v9.16b
535	sub	w8, w8, #1			// sub		$1,%rax			# nr--
536
537.Ldec_2x_entry:
538	// top of round
539	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
540	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
541	and	v9.16b,  v8.16b,  v17.16b
542	ushr	v8.16b,  v8.16b,  #4
543	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
544	tbl	v10.16b, {v19.16b},v9.16b
545	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
546	eor	v9.16b,	 v9.16b,  v8.16b
547	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
548	tbl	v11.16b, {v18.16b},v8.16b
549	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
550	tbl	v12.16b, {v18.16b},v9.16b
551	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
552	eor	v11.16b, v11.16b, v10.16b
553	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
554	eor	v12.16b, v12.16b, v10.16b
555	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
556	tbl	v10.16b, {v18.16b},v11.16b
557	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
558	tbl	v11.16b, {v18.16b},v12.16b
559	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
560	eor	v10.16b, v10.16b, v9.16b
561	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
562	eor	v11.16b, v11.16b, v8.16b
563	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
564	cbnz	w8, .Ldec_2x_loop
565
566	// middle of last round
567						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
568	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
569	tbl	v12.16b, {v22.16b}, v10.16b
570						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
571	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
572	tbl	v9.16b,  {v23.16b}, v11.16b
573	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
574	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
575	eor	v12.16b, v12.16b, v16.16b
576	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
577	eor	v8.16b,  v9.16b,  v12.16b
	// Final .Lk_sr permutation; the second result moves from v8 into v1.
578	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
579	tbl	v1.16b,  {v8.16b},v2.16b
580	ret
581.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
582////////////////////////////////////////////////////////
583//                                                    //
584//                  AES key schedule                  //
585//                                                    //
586////////////////////////////////////////////////////////
//
//  _vpaes_key_preheat
//
//  Loads the constants used by the key-schedule routines:
//     v16     = 0x5b in every byte (the x86 original calls this .Lk_s63)
//     v17     = 0x0f in every byte (.Lk_s0F, low-nibble mask)
//     v18-v19 = .Lk_inv
//     v20-v21 = .Lk_ipt
//     v22-v23 = .Lk_sb1
//     v24-v25 = .Lk_dksd
//     v26-v27 = .Lk_dksb
//     v28-v29 = .Lk_dkse
//     v30-v31 = .Lk_dks9
//     v8      = .Lk_rcon
//     v9      = .Lk_mc_forward[0]
//
//  .Lk_inv/.Lk_ipt and .Lk_dksd through .Lk_rcon are read as adjacent
//  runs, so their order in _vpaes_consts must not change.
//
//  Clobbers x10 (left at .Lk_rcon) and x11 (left at .Lk_mc_forward).
//  NOTE(review): v8 and v9 are callee-saved (low 64 bits) under AAPCS64
//  and are overwritten here; callers presumably preserve them -- confirm.
//
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	// byte-splat constants
	movi	v16.16b, #0x5b			// .Lk_s63
	movi	v17.16b, #0x0f			// .Lk_s0F

	// tables shared with the encryption path
	adr	x10, .Lk_inv
	ld1	{v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	adr	x11, .Lk_sb1
	ld1	{v22.2d-v23.2d}, [x11]		// .Lk_sb1

	// key-schedule tables, then rcon which follows them directly
	adr	x10, .Lk_dksd
	ld1	{v24.2d-v27.2d}, [x10], #64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d-v31.2d}, [x10], #64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]			// .Lk_rcon

	adr	x11, .Lk_mc_forward
	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat
604
//
//  _vpaes_schedule_core
//
//  Expands a user key into the round-key schedule.
//
//  Inputs (as used below):
//     x0 = user key bytes
//     w1 = key size in bits; above 192 takes the 256 path, exactly 192
//          takes the 192 path, anything else takes the 128 path
//     x2 = where round keys are written
//     w3 = zero when building an encryption schedule, non-zero for
//          decryption
//     x8 = byte offset into .Lk_sr, converted to a pointer below
//          (set by the caller, not visible here)
//
//  Once the key has been loaded, x0 is reused as the loop counter.
//  Relies on _vpaes_schedule_transform, _vpaes_schedule_mangle,
//  _vpaes_schedule_round and _vpaes_schedule_low_round, which lie outside
//  this block.
//  NOTE(review): x2 is only adjusted in the tail below, so
//  _vpaes_schedule_mangle presumably advances it between keys -- confirm.
//
605.type	_vpaes_schedule_core,%function
606.align	4
607_vpaes_schedule_core:
608	AARCH64_SIGN_LINK_REGISTER
609	stp	x29, x30, [sp,#-16]!
610	add	x29,sp,#0
611
612	bl	_vpaes_key_preheat		// load the tables
613
	// First 16 key bytes; the post-increment leaves x0 = key + 16.
614	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
615
616	// input transform
	// v3 keeps the untransformed key for the decrypting path below;
	// v7 keeps the transformed key.
617	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
618	bl	_vpaes_schedule_transform
619	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
620
	// Turn the caller's offset in x8 into a pointer into .Lk_sr.
621	adr	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
622	add	x8, x8, x10
623	cbnz	w3, .Lschedule_am_decrypting
624
625	// encrypting, output zeroth round key after transform
626	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
627	b	.Lschedule_go
628
629.Lschedule_am_decrypting:
630	// decrypting, output zeroth round key after shiftrows
631	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
632	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
633	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
634	eor	x8, x8, #0x30			// xor	$0x30, %r8
635
636.Lschedule_go:
637	cmp	w1, #192			// cmp	$192,	%esi
638	b.hi	.Lschedule_256
639	b.eq	.Lschedule_192
640	// 128: fall through
641
642//
643//  .schedule_128
644//
645//  128-bit specific part of key schedule.
646//
647//  This schedule is really simple, because all its parts
648//  are accomplished by the subroutines.
649//
650.Lschedule_128:
651	mov	x0, #10			// mov	$10, %esi
652
	// Ten rounds; the last one exits through .Lschedule_mangle_last
	// instead of the regular mangle.
653.Loop_schedule_128:
654	sub	x0, x0, #1			// dec	%esi
655	bl	_vpaes_schedule_round
656	cbz	x0, .Lschedule_mangle_last
657	bl	_vpaes_schedule_mangle		// write output
658	b	.Loop_schedule_128
659
660//
661//  .aes_schedule_192
662//
663//  192-bit specific part of key schedule.
664//
665//  The main body of this schedule is the same as the 128-bit
666//  schedule, but with more smearing.  The long, high side is
667//  stored in %xmm7 as before, and the short, low side is in
668//  the high bits of %xmm6.
669//
670//  This schedule is somewhat nastier, however, because each
671//  round produces 192 bits of key material, or 1.5 round keys.
672//  Therefore, on each cycle we do 2 rounds and produce 3 round
673//  keys.
674//
// (%xmm7 / %xmm6 / %xmm0 in the comments here correspond to v7 / v6 / v0.)
675.align	4
676.Lschedule_192:
	// x0 is key + 16 after the first load, so x0 - 8 addresses key bytes
	// 8..23.
677	sub	x0, x0, #8
678	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
679	bl	_vpaes_schedule_transform	// input transform
680	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
681	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
682	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
683	mov	x0, #4			// mov	$4,	%esi
684
685.Loop_schedule_192:
686	sub	x0, x0, #1			// dec	%esi
687	bl	_vpaes_schedule_round
688	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
689	bl	_vpaes_schedule_mangle		// save key n
690	bl	_vpaes_schedule_192_smear
691	bl	_vpaes_schedule_mangle		// save key n+1
692	bl	_vpaes_schedule_round
693	cbz	x0, .Lschedule_mangle_last
694	bl	_vpaes_schedule_mangle		// save key n+2
695	bl	_vpaes_schedule_192_smear
696	b	.Loop_schedule_192
697
698//
699//  .aes_schedule_256
700//
701//  256-bit specific part of key schedule.
702//
703//  The structure here is very similar to the 128-bit
704//  schedule, but with an additional "low side" in
705//  %xmm6.  The low side's rounds are the same as the
706//  high side's, except no rcon and no rotation.
707//
708.align	4
709.Lschedule_256:
	// x0 is key + 16 after the first load: this reads key bytes 16..31.
710	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
711	bl	_vpaes_schedule_transform	// input transform
712	mov	x0, #7			// mov	$7, %esi
713
714.Loop_schedule_256:
715	sub	x0, x0, #1			// dec	%esi
716	bl	_vpaes_schedule_mangle		// output low result
717	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
718
719	// high round
720	bl	_vpaes_schedule_round
721	cbz	x0, .Lschedule_mangle_last
722	bl	_vpaes_schedule_mangle
723
724	// low round. swap xmm7 and xmm6
	// v5 parks the high side while v7 temporarily holds the low side
	// for _vpaes_schedule_low_round; v7 is restored afterwards.
725	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
726	movi	v4.16b, #0
727	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
728	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
729	bl	_vpaes_schedule_low_round
730	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
731
732	b	.Loop_schedule_256
733
734//
735//  .aes_schedule_mangle_last
736//
737//  Mangler for last round of key schedule
738//  Mangles %xmm0
739//    when encrypting, outputs out(%xmm0) ^ 63
740//    when decrypting, outputs unskew(%xmm0)
741//
742//  Always called right before return... jumps to cleanup and exits
743//
744.align	4
745.Lschedule_mangle_last:
746	// schedule last round key from xmm0
	// x11 selects the table pair reloaded into v20/v21 below:
	// .Lk_deskew when decrypting, .Lk_opt when encrypting.
747	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
748	cbnz	w3, .Lschedule_mangle_last_dec
749
750	// encrypting
751	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
752	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
753	add	x2, x2, #32			// add	$32,	%rdx
754	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
755
756.Lschedule_mangle_last_dec:
	// Net effect on x2: +16 when encrypting (+32 above, -16 here),
	// -16 when decrypting.
757	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
758	sub	x2, x2, #16			// add	$-16,	%rdx
759	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
760	bl	_vpaes_schedule_transform	// output transform
761	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
762
763	// cleanup
	// Zero v0-v7, presumably so that no key-derived data is left behind
	// in the scratch vectors.
764	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
765	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
766	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
767	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
768	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
769	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
770	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
771	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
772	ldp	x29, x30, [sp],#16
773	AARCH64_VALIDATE_LINK_REGISTER
774	ret
775.size	_vpaes_schedule_core,.-_vpaes_schedule_core
776
777//
778//  _vpaes_schedule_192_smear
779//
780//  Key-schedule helper for 192-bit keys: folds the high-side words into
781//  the short low side.  32-bit lanes are listed from lane 3 down to
782//  lane 0, and "+" denotes xor.
783//  In:
784//    v7 (%xmm7) = b  a  x  y     (high side)
785//    v6 (%xmm6) = d  c  0  0     (low side)
786//
787//  Out:
788//    v6 = b+c+d  b+c  0  0
789//    v0 = b+c+d  b+c  b  a
790//    v1 = 0 (used as scratch; zeroed locally, no zero register input)
791.type	_vpaes_schedule_192_smear,%function
792.align	4
793_vpaes_schedule_192_smear:
794	dup	v0.4s, v7.s[3]		// v0 = b b b b
795	movi	v1.16b, #0		// v1 = 0 0 0 0
796	mov	v0.s[0], v7.s[2]	// v0 = b b b a
797	mov	v1.s[3], v6.s[2]	// v1 = c 0 0 0
798	eor	v6.16b, v6.16b, v1.16b	// v6 = c+d c 0 0
799	movi	v1.16b, #0		// v1 back to zero
800	eor	v6.16b, v6.16b, v0.16b	// v6 = b+c+d b+c b a
801	orr	v0.16b, v6.16b, v6.16b	// v0 = copy of v6
802	mov	v6.d[0], v1.d[0]	// zero the low 64 bits of v6
803	ret
804.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
805
806//
807//  _vpaes_schedule_round / _vpaes_schedule_low_round
808//
809//  One main round of the key schedule on v0 (%xmm0) and v7 (%xmm7).
810//
811//  _vpaes_schedule_round: xors byte 15 of v8 (%xmm8, rcon) into byte 0
812//  of v7 and rotates v8 by one byte for the next rcon, broadcasts the
813//  high word of v0 and rotates it by one byte, then falls through.
814//
815//  _vpaes_schedule_low_round: smears the words of v7 (low into second
816//  low, result into third, result into highest), runs subbytes on v0
817//  and xors the two together.  Expects v4 = 0 on entry.
818//
819//  Constants read: v16 (.Lk_s63), v17 (%xmm9), v18 (%xmm10),
820//  v19 (%xmm11), v22 (%xmm12), v23 (%xmm13).
821//
822//  Returns the result in v7 = v0.  Clobbers v1-v4 (and v8 in
823//  _vpaes_schedule_round).  Unlike the x86 note, x11 is not touched.
824.type	_vpaes_schedule_round,%function
825.align	4
826_vpaes_schedule_round:
827	// extract rcon from xmm8 (v8): v1 = byte 15 of v8 in byte 0, rest zero
828	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
829	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
830	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8
831	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
832
833	// rotate: broadcast the high word of v0, then rotate each copy by one byte
834	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
835	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0
836
837	// fall through...
838
839	// low round: same as high round, but no rotation and no rcon.
840_vpaes_schedule_low_round:
841	// smear xmm7 (v7); v4 must be zero here, it supplies the shifted-in bytes
842	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
843	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
844	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4
845
846	// subbytes (interleaved with the last smear xor and the s63 xor on v7)
847	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
848	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
849	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
850	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
851	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
852	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
853	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
854	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
855	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
856	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
857	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
858	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
859	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
860	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
861	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
862	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
863	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output
864
865	// add in smeared stuff: both v0 and v7 receive sbox output ^ smeared v7
866	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
867	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7
868	ret
869.size	_vpaes_schedule_round,.-_vpaes_schedule_round
870
871//
872//  _vpaes_schedule_transform
873//
874//  Applies a nibble-wise linear transform to v0: the low nibble of each
875//  byte indexes the table in v20, the high nibble the table in v21, and
876//  the two lookups are xored together.
877//  Needs v17 = 0x0F in every byte (%xmm9 in the x86 original).
878//  Result in v0; v1 and v2 are overwritten.
879//
880.type	_vpaes_schedule_transform,%function
881.align	4
882_vpaes_schedule_transform:
883	and	v1.16b, v0.16b, v17.16b		// v1 = low nibble of every byte
884	ushr	v0.16b, v0.16b, #4		// v0 = high nibble of every byte
885						// hi table: 16(%r11) on x86, v21 here
886	tbl	v0.16b, {v21.16b}, v0.16b	// v0 = hi[high nibble]
887						// lo table: (%r11) on x86, v20 here
888	tbl	v2.16b, {v20.16b}, v1.16b	// v2 = lo[low nibble]
889	eor	v0.16b, v2.16b, v0.16b		// v0 = lo lookup ^ hi lookup
890	ret
891.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
892
893//
894//  _vpaes_schedule_mangle
895//
896//  Mangle xmm0 (v0) from (basis-transformed) standard version
897//  to our version.
898//
899//  On encrypt (w3 == 0),
900//    xor with 0x63
901//    multiply by circulant 0,1,1,1
902//    apply shiftrows transform
903//
904//  On decrypt (w3 != 0),
905//    xor with 0x63
906//    multiply by "inverse mixcolumns" circulant E,B,D,9
907//    deskew
908//    apply shiftrows transform
909//
910//  AArch64: steps x2 (%rdx) by +16 (encrypt) or -16 (decrypt), then
911//  stores the result at [x2].  x8 (%r8) addresses the final permute;
912//  afterwards 48 is added to it and bit 6 is cleared for the next call.
913//  Preserves v0.  Clobbers v1-v4.  Reads v9, v16, v17 and v24-v31.
914//
915.type	_vpaes_schedule_mangle,%function
916.align	4
917_vpaes_schedule_mangle:
918	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
919						// vmovdqa	.Lk_mc_forward(%rip),%xmm5	(v9 stands in for %xmm5; loaded outside this routine)
920	cbnz	w3, .Lschedule_mangle_dec
921
922	// encrypting
923	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
924	add	x2, x2, #16			// add	$16,	%rdx
925	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4
926	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1
927	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3
928	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
929	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
930	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3
931
932	b	.Lschedule_mangle_both
933.align	4
934.Lschedule_mangle_dec:
935	// inverse mix columns; v24-v31 replace the eight tables x86 reads at 0x00..0x70(%r11)
936						// lea	.Lk_dksd(%rip),%r11
937	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
938	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo
939
940						// vmovdqa	0x00(%r11),	%xmm2
941	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
942						// vmovdqa	0x10(%r11),	%xmm3
943	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
944	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
945	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
946
947						// vmovdqa	0x20(%r11),	%xmm2
948	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
949	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
950						// vmovdqa	0x30(%r11),	%xmm3
951	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
952	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
953	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
954
955						// vmovdqa	0x40(%r11),	%xmm2
956	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
957	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
958						// vmovdqa	0x50(%r11),	%xmm3
959	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
960	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
961
962						// vmovdqa	0x60(%r11),	%xmm2
963	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
964	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
965						// vmovdqa	0x70(%r11),	%xmm4
966	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
967	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
968	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
969	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3
970
971	sub	x2, x2, #16			// add	$-16,	%rdx
972
973.Lschedule_mangle_both:
974	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3	(final permute loaded from [x8])
975	add	x8, x8, #64-16			// add	$-16,	%r8
976	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8
977	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
978	ret
979.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
981
// vpaes_set_encrypt_key: public entry point that builds the encryption
// key schedule.  Takes the key size in bits in w1 and stores nbits/32+5
// at [x2,#240] (AES_KEY->rounds), then runs _vpaes_schedule_core with
// w3 = 0 (encrypt) and x8 = 0x30.  x0 is handed to the core unchanged
// (presumably the user key pointer).  Always returns 0 in x0.
982.globl	vpaes_set_encrypt_key
983.type	vpaes_set_encrypt_key,%function
984.align	4
985vpaes_set_encrypt_key:
986	AARCH64_SIGN_LINK_REGISTER
987	stp	x29, x30, [sp, #-16]!		// push frame record
988	mov	x29, sp
989	stp	d8, d9, [sp, #-16]!		// low halves of v8/v9 are callee-saved
990
991	lsr	w9, w1, #5			// w9 = nbits / 32
992	add	w9, w9, #5			// w9 = nbits / 32 + 5
993	str	w9, [x2, #240]			// AES_KEY->rounds = nbits/32+5
994
995	mov	w3, wzr				// direction flag: 0 = encrypt
996	mov	x8, #48				// 0x30, the x86 original's initial %r8d
997	bl	_vpaes_schedule_core
998	mov	x0, xzr				// return 0
999
1000	ldp	d8, d9, [sp], #16
1001	ldp	x29, x30, [sp], #16
1002	AARCH64_VALIDATE_LINK_REGISTER
1003	ret
1004.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1005
// vpaes_set_decrypt_key: public entry point that builds the decryption
// key schedule.  Takes the key size in bits in w1 and stores nbits/32+5
// at [x2,#240] (AES_KEY->rounds), advances x2 by 16*rounds + 16 so the
// core can write round keys downwards, then runs _vpaes_schedule_core
// with w3 = 1 (decrypt) and x8 = (nbits == 192 ? 0 : 32).  x0 is handed
// to the core unchanged (presumably the user key pointer).
// Returns 0 in x0, matching vpaes_set_encrypt_key.
1006.globl	vpaes_set_decrypt_key
1007.type	vpaes_set_decrypt_key,%function
1008.align	4
1009vpaes_set_decrypt_key:
1010	AARCH64_SIGN_LINK_REGISTER
1011	stp	x29,x30,[sp,#-16]!
1012	add	x29,sp,#0
1013	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1014
1015	lsr	w9, w1, #5		// shr	$5,%eax
1016	add	w9, w9, #5		// $5,%eax
1017	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
1018	lsl	w9, w9, #4		// shl	$4,%eax		# w9 = 16*rounds (write to w9 zero-extends x9)
1019	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx
1020	add	x2, x2, x9		// x2 = key + 16 + 16*rounds
1021
1022	mov	w3, #1		// mov	$1,%ecx		# direction flag: 1 = decrypt
1023	lsr	w8, w1, #1		// shr	$1,%r8d
1024	and	x8, x8, #32		// and	$32,%r8d
1025	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
1026	bl	_vpaes_schedule_core
	eor	x0, x0, x0		// explicit return 0; do not rely on what the core leaves in x0
1027
1028	ldp	d8,d9,[sp],#16
1029	ldp	x29,x30,[sp],#16
1030	AARCH64_VALIDATE_LINK_REGISTER
1031	ret
1032.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
// vpaes_cbc_encrypt: public CBC entry point.
//   x0 = input, x1 = output, x2 = length in bytes, x3 = key schedule
//   (moved into x2 for the core routines), x4 = ivec (read, then updated),
//   w5 = direction; 0 hands off to vpaes_cbc_decrypt.
// Returns immediately when the length is zero.
1033.globl	vpaes_cbc_encrypt
1034.type	vpaes_cbc_encrypt,%function
1035.align	4
1036vpaes_cbc_encrypt:
1037	AARCH64_SIGN_LINK_REGISTER
1038	cbz	x2, .Lcbc_abort		// nothing to do for zero length
1039	cmp	w5, wzr			// direction: zero means decrypt
1040	b.eq	vpaes_cbc_decrypt	// LR is already signed for the callee
1041
1042	stp	x29, x30, [sp, #-16]!	// push frame record
1043	mov	x29, sp
1044
1045	ld1	{v0.16b}, [x4]		// v0 = ivec (chaining value)
1046	mov	x17, x2			// x17 = bytes remaining
1047
1048	mov	x2, x3			// x2 = key schedule for the core
1049	bl	_vpaes_encrypt_preheat
1050	b	.Lcbc_enc_loop
1051
1052.align	4
1053.Lcbc_enc_loop:
1054	ld1	{v7.16b}, [x0], #16	// next plaintext block
1055	eor	v7.16b, v0.16b, v7.16b	// xor with previous ciphertext / ivec
1056	bl	_vpaes_encrypt_core
1057	st1	{v0.16b}, [x1], #16	// emit ciphertext block
1058	subs	x17, x17, #16
1059	b.hi	.Lcbc_enc_loop		// loop while bytes remain
1060
1061	st1	{v0.16b}, [x4]		// last ciphertext block becomes the new ivec
1062
1063	ldp	x29, x30, [sp], #16
1064.Lcbc_abort:
1065	AARCH64_VALIDATE_LINK_REGISTER
1066	ret
1067.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
1068
// vpaes_cbc_decrypt: CBC decryption, reached only by a branch from
// vpaes_cbc_encrypt with the same registers (x0 = input, x1 = output,
// x2 = length in bytes, x3 = key schedule, x4 = ivec).  Decrypts one
// block first when bit 4 of the length is set, then two blocks per
// iteration.  The last ciphertext block is written back as the new ivec.
1069.type	vpaes_cbc_decrypt,%function
1070.align	4
1071vpaes_cbc_decrypt:
1072	// No AARCH64_SIGN_LINK_REGISTER here: the only way in is the branch from
1073	// vpaes_cbc_encrypt, which has already signed the return address.
1074	stp	x29, x30, [sp, #-16]!	// push frame record
1075	mov	x29, sp
1076	stp	d8, d9, [sp, #-16]!	// low halves of v8-v15 are callee-saved
1077	stp	d10, d11, [sp, #-16]!
1078	stp	d12, d13, [sp, #-16]!
1079	stp	d14, d15, [sp, #-16]!
1080
1081	ld1	{v6.16b}, [x4]		// v6 = ivec (chaining value)
1082	mov	x17, x2			// x17 = bytes remaining
1083	mov	x2, x3			// x2 = key schedule for the core
1084	bl	_vpaes_decrypt_preheat
1085	tst	x17, #16		// is there an odd single block?
1086	b.eq	.Lcbc_dec_loop2x
1087
1088	ld1	{v7.16b}, [x0], #16	// single ciphertext block
1089	bl	_vpaes_decrypt_core
1090	eor	v0.16b, v6.16b, v0.16b	// xor with chaining value
1091	mov	v6.16b, v7.16b		// this ciphertext chains into the next block
1092	st1	{v0.16b}, [x1], #16
1093	subs	x17, x17, #16
1094	b.ls	.Lcbc_dec_done
1095
1096.align	4
1097.Lcbc_dec_loop2x:
1098	ld1	{v14.16b,v15.16b}, [x0], #32	// two ciphertext blocks
1099	bl	_vpaes_decrypt_2x
1100	eor	v1.16b, v14.16b, v1.16b	// second block chains on the first ciphertext
1101	eor	v0.16b, v6.16b, v0.16b	// first block chains on the previous value
1102	mov	v6.16b, v15.16b		// carry the second ciphertext forward
1103	st1	{v0.16b,v1.16b}, [x1], #32
1104	subs	x17, x17, #32
1105	b.hi	.Lcbc_dec_loop2x
1106
1107.Lcbc_dec_done:
1108	st1	{v6.16b}, [x4]		// write back the updated ivec
1109
1110	ldp	d14, d15, [sp], #16
1111	ldp	d12, d13, [sp], #16
1112	ldp	d10, d11, [sp], #16
1113	ldp	d8, d9, [sp], #16
1114	ldp	x29, x30, [sp], #16
1115	AARCH64_VALIDATE_LINK_REGISTER
1116	ret
1117.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
// vpaes_ecb_encrypt: public ECB encryption entry point.
//   x0 = input, x1 = output, x2 = length in bytes, x3 = key schedule
//   (moved into x2 for the core routines).
// Encrypts one block first when bit 4 of the length is set, then two
// blocks per iteration.  A zero length returns without touching memory;
// without the guard the 2x loop would process 32 bytes before testing
// the remaining count.
1118.globl	vpaes_ecb_encrypt
1119.type	vpaes_ecb_encrypt,%function
1120.align	4
1121vpaes_ecb_encrypt:
1122	AARCH64_SIGN_LINK_REGISTER
1123	stp	x29,x30,[sp,#-16]!
1124	add	x29,sp,#0
1125	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1126	stp	d10,d11,[sp,#-16]!
1127	stp	d12,d13,[sp,#-16]!
1128	stp	d14,d15,[sp,#-16]!
1129
1130	mov	x17, x2			// x17 = bytes remaining
	cbz	x17, .Lecb_enc_done	// zero length: nothing to encrypt
1131	mov	x2,  x3			// x2 = key schedule for the core
1132	bl	_vpaes_encrypt_preheat
1133	tst	x17, #16		// is there an odd single block?
1134	b.eq	.Lecb_enc_loop
1135
1136	ld1	{v7.16b}, [x0],#16
1137	bl	_vpaes_encrypt_core
1138	st1	{v0.16b}, [x1],#16
1139	subs	x17, x17, #16
1140	b.ls	.Lecb_enc_done
1141
1142.align	4
1143.Lecb_enc_loop:
1144	ld1	{v14.16b,v15.16b}, [x0], #32	// two blocks per iteration
1145	bl	_vpaes_encrypt_2x
1146	st1	{v0.16b,v1.16b}, [x1], #32
1147	subs	x17, x17, #32
1148	b.hi	.Lecb_enc_loop
1149
1150.Lecb_enc_done:
1151	ldp	d14,d15,[sp],#16
1152	ldp	d12,d13,[sp],#16
1153	ldp	d10,d11,[sp],#16
1154	ldp	d8,d9,[sp],#16
1155	ldp	x29,x30,[sp],#16
1156	AARCH64_VALIDATE_LINK_REGISTER
1157	ret
1158.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1159
// vpaes_ecb_decrypt: public ECB decryption entry point.
//   x0 = input, x1 = output, x2 = length in bytes, x3 = key schedule
//   (moved into x2 for the core routines).
// Decrypts one block first when bit 4 of the length is set, then two
// blocks per iteration.  The single-block path must use
// _vpaes_decrypt_core: the constants were set up by
// _vpaes_decrypt_preheat, and the paired path uses _vpaes_decrypt_2x.
// A zero length returns without touching memory.
1160.globl	vpaes_ecb_decrypt
1161.type	vpaes_ecb_decrypt,%function
1162.align	4
1163vpaes_ecb_decrypt:
1164	AARCH64_SIGN_LINK_REGISTER
1165	stp	x29,x30,[sp,#-16]!
1166	add	x29,sp,#0
1167	stp	d8,d9,[sp,#-16]!	// ABI spec says so
1168	stp	d10,d11,[sp,#-16]!
1169	stp	d12,d13,[sp,#-16]!
1170	stp	d14,d15,[sp,#-16]!
1171
1172	mov	x17, x2			// x17 = bytes remaining
	cbz	x17, .Lecb_dec_done	// zero length: nothing to decrypt
1173	mov	x2,  x3			// x2 = key schedule for the core
1174	bl	_vpaes_decrypt_preheat
1175	tst	x17, #16		// is there an odd single block?
1176	b.eq	.Lecb_dec_loop
1177
1178	ld1	{v7.16b}, [x0],#16
1179	bl	_vpaes_decrypt_core	// was _vpaes_encrypt_core: wrong direction for a decrypt
1180	st1	{v0.16b}, [x1],#16
1181	subs	x17, x17, #16
1182	b.ls	.Lecb_dec_done
1183
1184.align	4
1185.Lecb_dec_loop:
1186	ld1	{v14.16b,v15.16b}, [x0], #32	// two blocks per iteration
1187	bl	_vpaes_decrypt_2x
1188	st1	{v0.16b,v1.16b}, [x1], #32
1189	subs	x17, x17, #32
1190	b.hi	.Lecb_dec_loop
1191
1192.Lecb_dec_done:
1193	ldp	d14,d15,[sp],#16
1194	ldp	d12,d13,[sp],#16
1195	ldp	d10,d11,[sp],#16
1196	ldp	d8,d9,[sp],#16
1197	ldp	x29,x30,[sp],#16
1198	AARCH64_VALIDATE_LINK_REGISTER
1199	ret
1200.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
1201