xref: /freebsd/sys/crypto/openssl/aarch64/vpaes-armv8.S (revision 22cf89c938886d14f5796fc49f9f020c23ea8eaf)
1/* Do not modify. This file is auto-generated from vpaes-armv8.pl. */
2.text
3
//
// Lookup tables for the NEON tbl-based ("vector permutation") AES.
// Order and placement are load-bearing: the preheat routines load
// runs of adjacent tables with post-indexed ld1, and the encrypt
// core re-wraps its .Lk_mc_forward pointer with
// "and x11, x11, #~(1<<6)", which relies on the .align 7
// (128-byte) alignment of this object.
//
4.type	_vpaes_consts,%object
5.align	7	// totally strategic alignment
6_vpaes_consts:
7.Lk_mc_forward:	//	mc_forward
8.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
9.quad	0x080B0A0904070605, 0x000302010C0F0E0D
10.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
11.quad	0x000302010C0F0E0D, 0x080B0A0904070605
12.Lk_mc_backward:	//	mc_backward
13.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
14.quad	0x020100030E0D0C0F, 0x0A09080B06050407
15.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
16.quad	0x0A09080B06050407, 0x020100030E0D0C0F
17.Lk_sr:	//	sr
18.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
19.quad	0x030E09040F0A0500, 0x0B06010C07020D08
20.quad	0x0F060D040B020900, 0x070E050C030A0108
21.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
22
23//
24// "Hot" constants
25//
26.Lk_inv:	//	inv, inva
27.quad	0x0E05060F0D080180, 0x040703090A0B0C02
28.quad	0x01040A060F0B0780, 0x030D0E0C02050809
29.Lk_ipt:	//	input transform (lo, hi)
30.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
31.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
32.Lk_sbo:	//	sbou, sbot
33.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
34.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
35.Lk_sb1:	//	sb1u, sb1t
36.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
37.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
38.Lk_sb2:	//	sb2u, sb2t
39.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
40.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
41
42//
43//  Decryption stuff
44//
45.Lk_dipt:	//	decryption input transform
46.quad	0x0F505B040B545F00, 0x154A411E114E451A
47.quad	0x86E383E660056500, 0x12771772F491F194
48.Lk_dsbo:	//	decryption sbox final output
49.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
50.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
51.Lk_dsb9:	//	decryption sbox output *9*u, *9*t
52.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
53.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
54.Lk_dsbd:	//	decryption sbox output *D*u, *D*t
55.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
56.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
57.Lk_dsbb:	//	decryption sbox output *B*u, *B*t
58.quad	0xD022649296B44200, 0x602646F6B0F2D404
59.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
60.Lk_dsbe:	//	decryption sbox output *E*u, *E*t
61.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
62.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
63
64//
65//  Key schedule constants
66//
67.Lk_dksd:	//	decryption key schedule: invskew x*D
68.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
69.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
70.Lk_dksb:	//	decryption key schedule: invskew x*B
71.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
72.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
73.Lk_dkse:	//	decryption key schedule: invskew x*E + 0x63
74.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
75.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
76.Lk_dks9:	//	decryption key schedule: invskew x*9
77.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
78.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
79
80.Lk_rcon:	//	rcon
81.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
82
83.Lk_opt:	//	output transform
84.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
85.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
86.Lk_deskew:	//	deskew tables: inverts the sbox's "skew"
87.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
88.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
89
// ASCII: "Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
90.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
91.align	2
92.size	_vpaes_consts,.-_vpaes_consts
93.align	6
94//
95//  _vpaes_encrypt_preheat (named _aes_preheat in the x86 original)
96//
97//  Fills x10 -> the _vpaes_consts tables (PC-relative adr, so -fPIC
98//  still works) and loads v17-v27 as specified below.
99//
//
//  Loads the encryption constants:
//    v17 = 0x0f nibble mask,  v18-v19 = .Lk_inv,
//    v20-v21 = .Lk_ipt,  v22-v23 = .Lk_sbo,
//    v24-v25 = .Lk_sb1,  v26-v27 = .Lk_sb2.
//  Clobbers x10 only.
//
100.type	_vpaes_encrypt_preheat,%function
101.align	4
102_vpaes_encrypt_preheat:
103	adr	x10, .Lk_inv
104	movi	v17.16b, #0x0f
105	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
106	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
107	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
108	ret
109.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
110
111//
112//  _aes_encrypt_core
113//
114//  AES-encrypt %xmm0.
115//
116//  Inputs:
117//     %xmm0 = input
118//     %xmm9-%xmm15 as in _vpaes_preheat
119//    (%rdx) = scheduled keys
120//
121//  Output in %xmm0
122//  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
123//  Preserves %xmm6 - %xmm8 so you get some local vectors
124//
125//
//
//  AArch64 register usage (the trailing //-comments are the x86
//  SSSE3 instructions this code was mechanically translated from):
//    in:   v7 = one input block, x2 -> key schedule,
//          [x2,#240] = round count, v17-v27 preloaded by
//          _vpaes_encrypt_preheat
//    out:  v0 = encrypted block
//    clobbers v0-v5, v16 and x8-x11; v6-v15 are not written.
//  Note the "and x11, #~(1<<6)" below cycles the .Lk_mc_forward
//  pointer through 4 entries; it depends on _vpaes_consts being
//  128-byte aligned.
//
126.type	_vpaes_encrypt_core,%function
127.align	4
128_vpaes_encrypt_core:
129	mov	x9, x2
130	ldr	w8, [x2,#240]			// pull rounds
131	adr	x11, .Lk_mc_forward+16
132						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
133	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
134	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
135	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
136	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
137						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
138	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
139	eor	v0.16b, v1.16b, v16.16b		// vpxor	%xmm5,	%xmm1,	%xmm0
140	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
141	b	.Lenc_entry
142
143.align	4
144.Lenc_loop:
145	// middle of middle round
146	add	x10, x11, #0x40
147	tbl	v4.16b, {v25.16b}, v2.16b		// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
148	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
149	tbl	v0.16b, {v24.16b}, v3.16b		// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
150	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
151	tbl	v5.16b,	{v27.16b}, v2.16b		// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
152	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
153	tbl	v2.16b, {v26.16b}, v3.16b		// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
154	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
155	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
156	eor	v2.16b, v2.16b, v5.16b		// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
157	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
158	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
159	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
160	eor	v0.16b, v0.16b, v3.16b		// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
161	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
162	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
163	sub	w8, w8, #1			// nr--
164
165.Lenc_entry:
166	// top of round
167	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
168	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
169	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
170	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
171	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
172	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
173	eor	v3.16b, v3.16b, v5.16b		// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
174	eor	v4.16b, v4.16b, v5.16b		// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
175	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
176	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
177	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
178	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
179	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
180	cbnz	w8, .Lenc_loop
181
182	// middle of last round
183	add	x10, x11, #0x80
184						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
185						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
186	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
187	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
188	tbl	v0.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
189	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
190	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
191	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
192	ret
193.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
194
//
//  void vpaes_encrypt(const uint8_t in[16], uint8_t out[16],
//                     const AES_KEY *key)
//    x0 = in, x1 = out, x2 = expanded key schedule.
//  Encrypts one 16-byte block.  The .inst words are paciasp/autiasp
//  (return-address signing) encoded numerically so that pre-PAC
//  assemblers accept them; they execute as NOPs on cores without
//  pointer authentication.
//
195.globl	vpaes_encrypt
196.type	vpaes_encrypt,%function
197.align	4
198vpaes_encrypt:
199.inst	0xd503233f			// paciasp
200	stp	x29,x30,[sp,#-16]!
201	add	x29,sp,#0
202
203	ld1	{v7.16b}, [x0]
204	bl	_vpaes_encrypt_preheat
205	bl	_vpaes_encrypt_core
206	st1	{v0.16b}, [x1]
207
208	ldp	x29,x30,[sp],#16
209.inst	0xd50323bf			// autiasp
210	ret
211.size	vpaes_encrypt,.-vpaes_encrypt
212
//
//  Two-block interleaved variant of _vpaes_encrypt_core.
//    in:   v14 and v15 = the two input blocks, x2 -> key schedule,
//          [x2,#240] = round count, v17-v27 from
//          _vpaes_encrypt_preheat
//    out:  v0 and v1 = the two encrypted blocks
//  The second lane (v8-v13) mirrors the first lane (v0-v5)
//  instruction-for-instruction to hide tbl/eor latency.
//  Callers are not visible in this chunk — presumably a
//  multi-block mode (e.g. CBC/CTR); confirm before relying on the
//  register contract beyond what the code shows.
//
213.type	_vpaes_encrypt_2x,%function
214.align	4
215_vpaes_encrypt_2x:
216	mov	x9, x2
217	ldr	w8, [x2,#240]			// pull rounds
218	adr	x11, .Lk_mc_forward+16
219						// vmovdqa	.Lk_ipt(%rip),	%xmm2	# iptlo
220	ld1	{v16.2d}, [x9], #16		// vmovdqu	(%r9),	%xmm5		# round0 key
221	and	v1.16b,  v14.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
222	ushr	v0.16b,  v14.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0
223	and	v9.16b,  v15.16b,  v17.16b
224	ushr	v8.16b,  v15.16b,  #4
225	tbl	v1.16b,  {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm1
226	tbl	v9.16b,  {v20.16b}, v9.16b
227						// vmovdqa	.Lk_ipt+16(%rip), %xmm3	# ipthi
228	tbl	v2.16b,  {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm3,	%xmm2
229	tbl	v10.16b, {v21.16b}, v8.16b
230	eor	v0.16b,  v1.16b,   v16.16b	// vpxor	%xmm5,	%xmm1,	%xmm0
231	eor	v8.16b,  v9.16b,   v16.16b
232	eor	v0.16b,  v0.16b,   v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
233	eor	v8.16b,  v8.16b,   v10.16b
234	b	.Lenc_2x_entry
235
236.align	4
237.Lenc_2x_loop:
238	// middle of middle round
239	add	x10, x11, #0x40
240	tbl	v4.16b,  {v25.16b}, v2.16b	// vpshufb	%xmm2,	%xmm13,	%xmm4	# 4 = sb1u
241	tbl	v12.16b, {v25.16b}, v10.16b
242	ld1	{v1.2d}, [x11], #16		// vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
243	tbl	v0.16b,  {v24.16b}, v3.16b	// vpshufb	%xmm3,	%xmm12,	%xmm0	# 0 = sb1t
244	tbl	v8.16b,  {v24.16b}, v11.16b
245	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
246	eor	v12.16b, v12.16b, v16.16b
247	tbl	v5.16b,	 {v27.16b}, v2.16b	// vpshufb	%xmm2,	%xmm15,	%xmm5	# 4 = sb2u
248	tbl	v13.16b, {v27.16b}, v10.16b
249	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
250	eor	v8.16b,  v8.16b,  v12.16b
251	tbl	v2.16b,  {v26.16b}, v3.16b	// vpshufb	%xmm3,	%xmm14,	%xmm2	# 2 = sb2t
252	tbl	v10.16b, {v26.16b}, v11.16b
253	ld1	{v4.2d}, [x10]			// vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
254	tbl	v3.16b,  {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm3	# 0 = B
255	tbl	v11.16b, {v8.16b}, v1.16b
256	eor	v2.16b,  v2.16b,  v5.16b	// vpxor	%xmm5,	%xmm2,	%xmm2	# 2 = 2A
257	eor	v10.16b, v10.16b, v13.16b
258	tbl	v0.16b,  {v0.16b}, v4.16b	// vpshufb	%xmm4,	%xmm0,	%xmm0	# 3 = D
259	tbl	v8.16b,  {v8.16b}, v4.16b
260	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 0 = 2A+B
261	eor	v11.16b, v11.16b, v10.16b
262	tbl	v4.16b,  {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm4	# 0 = 2B+C
263	tbl	v12.16b, {v11.16b},v1.16b
264	eor	v0.16b,  v0.16b,  v3.16b	// vpxor	%xmm3,	%xmm0,	%xmm0	# 3 = 2A+B+D
265	eor	v8.16b,  v8.16b,  v11.16b
266	and	x11, x11, #~(1<<6)		// and		$0x30,	%r11		# ... mod 4
267	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0, %xmm0	# 0 = 2A+3B+C+D
268	eor	v8.16b,  v8.16b,  v12.16b
269	sub	w8, w8, #1			// nr--
270
271.Lenc_2x_entry:
272	// top of round
273	and	v1.16b,  v0.16b, v17.16b	// vpand	%xmm0,	%xmm9,	%xmm1   # 0 = k
274	ushr	v0.16b,  v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
275	and	v9.16b,  v8.16b, v17.16b
276	ushr	v8.16b,  v8.16b, #4
277	tbl	v5.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm5	# 2 = a/k
278	tbl	v13.16b, {v19.16b},v9.16b
279	eor	v1.16b,  v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
280	eor	v9.16b,  v9.16b,  v8.16b
281	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3  	# 3 = 1/i
282	tbl	v11.16b, {v18.16b},v8.16b
283	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1, 	%xmm10,	%xmm4  	# 4 = 1/j
284	tbl	v12.16b, {v18.16b},v9.16b
285	eor	v3.16b,  v3.16b,  v5.16b	// vpxor	%xmm5,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
286	eor	v11.16b, v11.16b, v13.16b
287	eor	v4.16b,  v4.16b,  v5.16b	// vpxor	%xmm5,	%xmm4,	%xmm4  	# 4 = jak = 1/j + a/k
288	eor	v12.16b, v12.16b, v13.16b
289	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2  	# 2 = 1/iak
290	tbl	v10.16b, {v18.16b},v11.16b
291	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm3	# 3 = 1/jak
292	tbl	v11.16b, {v18.16b},v12.16b
293	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2  	# 2 = io
294	eor	v10.16b, v10.16b, v9.16b
295	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,	%xmm3,	%xmm3	# 3 = jo
296	eor	v11.16b, v11.16b, v8.16b
297	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm5
298	cbnz	w8, .Lenc_2x_loop
299
300	// middle of last round
301	add	x10, x11, #0x80
302						// vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
303						// vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
304	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
305	tbl	v12.16b, {v22.16b}, v10.16b
306	ld1	{v1.2d}, [x10]			// vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
307	tbl	v0.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm0,	%xmm0	# 0 = sb1t
308	tbl	v8.16b,  {v23.16b}, v11.16b
309	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm5,	%xmm4,	%xmm4	# 4 = sb1u + k
310	eor	v12.16b, v12.16b, v16.16b
311	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0	# 0 = A
312	eor	v8.16b,  v8.16b,  v12.16b
313	tbl	v0.16b,  {v0.16b},v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0
314	tbl	v1.16b,  {v8.16b},v1.16b
315	ret
316.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
317
//
//  Loads the decryption constants:
//    v17 = 0x0f nibble mask,  v18-v19 = .Lk_inv,
//    v20-v21 = .Lk_dipt,  v22-v23 = .Lk_dsbo,
//    v24-v25 = .Lk_dsb9,  v26-v27 = .Lk_dsbd,
//    v28-v29 = .Lk_dsbb,  v30-v31 = .Lk_dsbe.
//  Clobbers x10 and x11 only.
//
318.type	_vpaes_decrypt_preheat,%function
319.align	4
320_vpaes_decrypt_preheat:
321	adr	x10, .Lk_inv
322	movi	v17.16b, #0x0f
323	adr	x11, .Lk_dipt
324	ld1	{v18.2d,v19.2d}, [x10],#32	// .Lk_inv
325	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
326	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
327	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
328	ret
329.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
330
331//
332//  Decryption core
333//
334//  Same API as encryption core.
335//
//
//  AArch64 register usage:
//    in:   v7 = one input block, x2 -> key schedule,
//          [x2,#240] = round count, v17-v31 preloaded by
//          _vpaes_decrypt_preheat
//    out:  v0 = decrypted block
//    clobbers v0-v5, v16 and x8-x11; v6-v15 are not written.
//  x11 is precomputed as .Lk_sr + (((rounds<<4) ^ 0x30) & 0x30):
//  the .Lk_sr row used for the final shiftrows permute depends on
//  the number of rounds.
//
336.type	_vpaes_decrypt_core,%function
337.align	4
338_vpaes_decrypt_core:
339	mov	x9, x2
340	ldr	w8, [x2,#240]			// pull rounds
341
342						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
343	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
344	eor	x11, x11, #0x30			// xor		$0x30,	%r11
345	adr	x10, .Lk_sr
346	and	x11, x11, #0x30			// and		$0x30,	%r11
347	add	x11, x11, x10
348	adr	x10, .Lk_mc_forward+48
349
350	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
351	and	v1.16b, v7.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1
352	ushr	v0.16b, v7.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
353	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
354	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
355						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
356	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
357	eor	v2.16b, v2.16b, v16.16b		// vpxor	%xmm4,	%xmm2,	%xmm2
358	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0
359	b	.Ldec_entry
360
361.align	4
362.Ldec_loop:
363//
364//  Inverse mix columns
365//
366						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
367						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
368	tbl	v4.16b, {v24.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
369	tbl	v1.16b, {v25.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
370	eor	v0.16b, v4.16b, v16.16b		// vpxor	%xmm4,	%xmm0,	%xmm0
371						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
372	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
373						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
374
375	tbl	v4.16b, {v26.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
376	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
377	tbl	v1.16b, {v27.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
378	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
379						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
380	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
381						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
382
383	tbl	v4.16b, {v28.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
384	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
385	tbl	v1.16b, {v29.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
386	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
387						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
388	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
389						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
390
391	tbl	v4.16b, {v30.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
392	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
393	tbl	v1.16b, {v31.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
394	eor	v0.16b, v0.16b, v4.16b		// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
395	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
396	eor	v0.16b, v0.16b, v1.16b		// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
397	sub	w8, w8, #1			// sub		$1,%rax			# nr--
398
399.Ldec_entry:
400	// top of round
401	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
402	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
403	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
404	eor	v1.16b,	v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
405	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
406	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
407	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
408	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
409	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
410	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
411	eor	v2.16b, v2.16b, v1.16b		// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
412	eor	v3.16b, v3.16b, v0.16b		// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
413	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
414	cbnz	w8, .Ldec_loop
415
416	// middle of last round
417						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
418	tbl	v4.16b, {v22.16b}, v2.16b		// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
419						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
420	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
421	tbl	v1.16b, {v23.16b}, v3.16b		// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
422	eor	v4.16b, v4.16b, v16.16b		// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
423	eor	v0.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
424	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
425	ret
426.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
427
//
//  void vpaes_decrypt(const uint8_t in[16], uint8_t out[16],
//                     const AES_KEY *key)
//    x0 = in, x1 = out, x2 = expanded (decryption) key schedule.
//  Decrypts one 16-byte block.  The .inst words are paciasp/autiasp
//  (return-address signing) encoded numerically; NOPs on cores
//  without pointer authentication.
//
428.globl	vpaes_decrypt
429.type	vpaes_decrypt,%function
430.align	4
431vpaes_decrypt:
432.inst	0xd503233f			// paciasp
433	stp	x29,x30,[sp,#-16]!
434	add	x29,sp,#0
435
436	ld1	{v7.16b}, [x0]
437	bl	_vpaes_decrypt_preheat
438	bl	_vpaes_decrypt_core
439	st1	{v0.16b}, [x1]
440
441	ldp	x29,x30,[sp],#16
442.inst	0xd50323bf			// autiasp
443	ret
444.size	vpaes_decrypt,.-vpaes_decrypt
445
//
//  Two-block interleaved variant of _vpaes_decrypt_core.
//  v14-v15 input, v0-v1 output; x2 -> key schedule with
//  [x2,#240] = round count.  The second lane (v8-v13) mirrors the
//  first lane (v0-v5) instruction-for-instruction.
//
446// v14-v15 input, v0-v1 output
447.type	_vpaes_decrypt_2x,%function
448.align	4
449_vpaes_decrypt_2x:
450	mov	x9, x2
451	ldr	w8, [x2,#240]			// pull rounds
452
453						// vmovdqa	.Lk_dipt(%rip), %xmm2	# iptlo
454	lsl	x11, x8, #4			// mov	%rax,	%r11;	shl	$4, %r11
455	eor	x11, x11, #0x30			// xor		$0x30,	%r11
456	adr	x10, .Lk_sr
457	and	x11, x11, #0x30			// and		$0x30,	%r11
458	add	x11, x11, x10
459	adr	x10, .Lk_mc_forward+48
460
461	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm4		# round0 key
462	and	v1.16b,  v14.16b, v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1
463	ushr	v0.16b,  v14.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0
464	and	v9.16b,  v15.16b, v17.16b
465	ushr	v8.16b,  v15.16b, #4
466	tbl	v2.16b,  {v20.16b},v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
467	tbl	v10.16b, {v20.16b},v9.16b
468	ld1	{v5.2d}, [x10]			// vmovdqa	.Lk_mc_forward+48(%rip), %xmm5
469						// vmovdqa	.Lk_dipt+16(%rip), %xmm1 # ipthi
470	tbl	v0.16b,  {v21.16b},v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
471	tbl	v8.16b,  {v21.16b},v8.16b
472	eor	v2.16b,  v2.16b,  v16.16b	// vpxor	%xmm4,	%xmm2,	%xmm2
473	eor	v10.16b, v10.16b, v16.16b
474	eor	v0.16b,  v0.16b,  v2.16b	// vpxor	%xmm2,	%xmm0,	%xmm0
475	eor	v8.16b,  v8.16b,  v10.16b
476	b	.Ldec_2x_entry
477
478.align	4
479.Ldec_2x_loop:
480//
481//  Inverse mix columns
482//
483						// vmovdqa	-0x20(%r10),%xmm4		# 4 : sb9u
484						// vmovdqa	-0x10(%r10),%xmm1		# 0 : sb9t
485	tbl	v4.16b,  {v24.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sb9u
486	tbl	v12.16b, {v24.16b}, v10.16b
487	tbl	v1.16b,  {v25.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sb9t
488	tbl	v9.16b,  {v25.16b}, v11.16b
489	eor	v0.16b,  v4.16b,  v16.16b	// vpxor	%xmm4,	%xmm0,	%xmm0
490	eor	v8.16b,  v12.16b, v16.16b
491						// vmovdqa	0x00(%r10),%xmm4		# 4 : sbdu
492	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
493	eor	v8.16b,  v8.16b,  v9.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
494						// vmovdqa	0x10(%r10),%xmm1		# 0 : sbdt
495
496	tbl	v4.16b,  {v26.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbdu
497	tbl	v12.16b, {v26.16b}, v10.16b
498	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
499	tbl	v8.16b,  {v8.16b},v5.16b
500	tbl	v1.16b,  {v27.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbdt
501	tbl	v9.16b,  {v27.16b}, v11.16b
502	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
503	eor	v8.16b,  v8.16b,  v12.16b
504						// vmovdqa	0x20(%r10),	%xmm4		# 4 : sbbu
505	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
506	eor	v8.16b,  v8.16b,  v9.16b
507						// vmovdqa	0x30(%r10),	%xmm1		# 0 : sbbt
508
509	tbl	v4.16b,  {v28.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbbu
510	tbl	v12.16b, {v28.16b}, v10.16b
511	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
512	tbl	v8.16b,  {v8.16b},v5.16b
513	tbl	v1.16b,  {v29.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbbt
514	tbl	v9.16b,  {v29.16b}, v11.16b
515	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
516	eor	v8.16b,  v8.16b,  v12.16b
517						// vmovdqa	0x40(%r10),	%xmm4		# 4 : sbeu
518	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
519	eor	v8.16b,  v8.16b,  v9.16b
520						// vmovdqa	0x50(%r10),	%xmm1		# 0 : sbet
521
522	tbl	v4.16b,  {v30.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4		# 4 = sbeu
523	tbl	v12.16b, {v30.16b}, v10.16b
524	tbl	v0.16b,  {v0.16b},v5.16b	// vpshufb	%xmm5,	%xmm0,	%xmm0		# MC ch
525	tbl	v8.16b,  {v8.16b},v5.16b
526	tbl	v1.16b,  {v31.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1		# 0 = sbet
527	tbl	v9.16b,  {v31.16b}, v11.16b
528	eor	v0.16b,  v0.16b,  v4.16b	// vpxor	%xmm4,	%xmm0,	%xmm0		# 4 = ch
529	eor	v8.16b,  v8.16b,  v12.16b
530	ext	v5.16b,  v5.16b,  v5.16b, #12	// vpalignr $12,	%xmm5,	%xmm5,	%xmm5
531	eor	v0.16b,  v0.16b,  v1.16b	// vpxor	%xmm1,	%xmm0,	%xmm0		# 0 = ch
532	eor	v8.16b,  v8.16b,  v9.16b
533	sub	w8, w8, #1			// sub		$1,%rax			# nr--
534
535.Ldec_2x_entry:
536	// top of round
537	and	v1.16b,  v0.16b,  v17.16b	// vpand	%xmm9,	%xmm0,	%xmm1	# 0 = k
538	ushr	v0.16b,  v0.16b,  #4		// vpsrlb	$4,	%xmm0,	%xmm0	# 1 = i
539	and	v9.16b,  v8.16b,  v17.16b
540	ushr	v8.16b,  v8.16b,  #4
541	tbl	v2.16b,  {v19.16b},v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2	# 2 = a/k
542	tbl	v10.16b, {v19.16b},v9.16b
543	eor	v1.16b,	 v1.16b,  v0.16b	// vpxor	%xmm0,	%xmm1,	%xmm1	# 0 = j
544	eor	v9.16b,	 v9.16b,  v8.16b
545	tbl	v3.16b,  {v18.16b},v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3	# 3 = 1/i
546	tbl	v11.16b, {v18.16b},v8.16b
547	tbl	v4.16b,  {v18.16b},v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4	# 4 = 1/j
548	tbl	v12.16b, {v18.16b},v9.16b
549	eor	v3.16b,  v3.16b,  v2.16b	// vpxor	%xmm2,	%xmm3,	%xmm3	# 3 = iak = 1/i + a/k
550	eor	v11.16b, v11.16b, v10.16b
551	eor	v4.16b,  v4.16b,  v2.16b	// vpxor	%xmm2, 	%xmm4,	%xmm4	# 4 = jak = 1/j + a/k
552	eor	v12.16b, v12.16b, v10.16b
553	tbl	v2.16b,  {v18.16b},v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm2	# 2 = 1/iak
554	tbl	v10.16b, {v18.16b},v11.16b
555	tbl	v3.16b,  {v18.16b},v4.16b	// vpshufb	%xmm4,  %xmm10,	%xmm3	# 3 = 1/jak
556	tbl	v11.16b, {v18.16b},v12.16b
557	eor	v2.16b,  v2.16b,  v1.16b	// vpxor	%xmm1,	%xmm2,	%xmm2	# 2 = io
558	eor	v10.16b, v10.16b, v9.16b
559	eor	v3.16b,  v3.16b,  v0.16b	// vpxor	%xmm0,  %xmm3,	%xmm3	# 3 = jo
560	eor	v11.16b, v11.16b, v8.16b
561	ld1	{v16.2d}, [x9],#16		// vmovdqu	(%r9),	%xmm0
562	cbnz	w8, .Ldec_2x_loop
563
564	// middle of last round
565						// vmovdqa	0x60(%r10),	%xmm4	# 3 : sbou
566	tbl	v4.16b,  {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm4,	%xmm4	# 4 = sbou
567	tbl	v12.16b, {v22.16b}, v10.16b
568						// vmovdqa	0x70(%r10),	%xmm1	# 0 : sbot
569	tbl	v1.16b,  {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm1,	%xmm1	# 0 = sb1t
570	tbl	v9.16b,  {v23.16b}, v11.16b
571	ld1	{v2.2d}, [x11]			// vmovdqa	-0x160(%r11),	%xmm2	# .Lk_sr-.Lk_dsbd=-0x160
572	eor	v4.16b,  v4.16b,  v16.16b	// vpxor	%xmm0,	%xmm4,	%xmm4	# 4 = sb1u + k
573	eor	v12.16b, v12.16b, v16.16b
574	eor	v0.16b,  v1.16b,  v4.16b	// vpxor	%xmm4,	%xmm1,	%xmm0	# 0 = A
575	eor	v8.16b,  v9.16b,  v12.16b
576	tbl	v0.16b,  {v0.16b},v2.16b	// vpshufb	%xmm2,	%xmm0,	%xmm0
577	tbl	v1.16b,  {v8.16b},v2.16b
578	ret
579.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
580////////////////////////////////////////////////////////
581//                                                    //
582//                  AES key schedule                  //
583//                                                    //
584////////////////////////////////////////////////////////
//
//  Loads the key-schedule constants:
//    v16 = 0x5b (.Lk_s63),  v17 = 0x0f (.Lk_s0F),
//    v18-v19 = .Lk_inv,  v20-v21 = .Lk_ipt,
//    v22-v23 = .Lk_sb1,  v24-v25 = .Lk_dksd,  v26-v27 = .Lk_dksb,
//    v28-v29 = .Lk_dkse, v30-v31 = .Lk_dks9,
//    v8 = .Lk_rcon,      v9 = .Lk_mc_forward[0].
//  Clobbers x10, x11.  NOTE(review): writes v8/v9, which AAPCS64
//  makes callee-saved (low 64 bits) — presumably the exported
//  entry point (outside this view) saves them; confirm there.
//
585.type	_vpaes_key_preheat,%function
586.align	4
587_vpaes_key_preheat:
588	adr	x10, .Lk_inv
589	movi	v16.16b, #0x5b			// .Lk_s63
590	adr	x11, .Lk_sb1
591	movi	v17.16b, #0x0f			// .Lk_s0F
592	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
593	adr	x10, .Lk_dksd
594	ld1	{v22.2d,v23.2d}, [x11]		// .Lk_sb1
595	adr	x11, .Lk_mc_forward
596	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
597	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
598	ld1	{v8.2d}, [x10]			// .Lk_rcon
599	ld1	{v9.2d}, [x11]			// .Lk_mc_forward[0]
600	ret
601.size	_vpaes_key_preheat,.-_vpaes_key_preheat
602
//
//  _vpaes_schedule_core (AArch64 register map)
//
//  In:  x0 -> user key bytes, w1 = key size in bits (128/192/256),
//       x2 -> round-key output area, w3 = direction flag
//       (0 = schedule for encryption, non-zero = for decryption),
//       x8 = byte offset added to .Lk_sr below (set by the caller,
//       which is outside this view — confirm against
//       vpaes_set_{en,de}crypt_key).
//  Calls _vpaes_schedule_transform/_round/_mangle (defined later
//  in this file).  On exit, clears v0-v7 so no key material is
//  left behind in registers.  paciasp here pairs with the autiasp
//  in .Lschedule_mangle_last, the single exit path.
//
603.type	_vpaes_schedule_core,%function
604.align	4
605_vpaes_schedule_core:
606.inst	0xd503233f			// paciasp
607	stp	x29, x30, [sp,#-16]!
608	add	x29,sp,#0
609
610	bl	_vpaes_key_preheat		// load the tables
611
612	ld1	{v0.16b}, [x0],#16		// vmovdqu	(%rdi),	%xmm0		# load key (unaligned)
613
614	// input transform
615	mov	v3.16b, v0.16b			// vmovdqa	%xmm0,	%xmm3
616	bl	_vpaes_schedule_transform
617	mov	v7.16b, v0.16b			// vmovdqa	%xmm0,	%xmm7
618
619	adr	x10, .Lk_sr			// lea	.Lk_sr(%rip),%r10
620	add	x8, x8, x10
621	cbnz	w3, .Lschedule_am_decrypting
622
623	// encrypting, output zeroth round key after transform
624	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)
625	b	.Lschedule_go
626
627.Lschedule_am_decrypting:
628	// decrypting, output zeroth round key after shiftrows
629	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1
630	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb  %xmm1,	%xmm3,	%xmm3
631	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
632	eor	x8, x8, #0x30			// xor	$0x30, %r8
633
634.Lschedule_go:
635	cmp	w1, #192			// cmp	$192,	%esi
636	b.hi	.Lschedule_256
637	b.eq	.Lschedule_192
638	// 128: fall though
639
640//
641//  .schedule_128
642//
643//  128-bit specific part of key schedule.
644//
645//  This schedule is really simple, because all its parts
646//  are accomplished by the subroutines.
647//
648.Lschedule_128:
649	mov	x0, #10			// mov	$10, %esi
650
651.Loop_schedule_128:
652	sub	x0, x0, #1			// dec	%esi
653	bl	_vpaes_schedule_round
654	cbz	x0, .Lschedule_mangle_last
655	bl	_vpaes_schedule_mangle		// write output
656	b	.Loop_schedule_128
657
658//
659//  .aes_schedule_192
660//
661//  192-bit specific part of key schedule.
662//
663//  The main body of this schedule is the same as the 128-bit
664//  schedule, but with more smearing.  The long, high side is
665//  stored in %xmm7 as before, and the short, low side is in
666//  the high bits of %xmm6.
667//
668//  This schedule is somewhat nastier, however, because each
669//  round produces 192 bits of key material, or 1.5 round keys.
670//  Therefore, on each cycle we do 2 rounds and produce 3 round
671//  keys.
672//
673.align	4
674.Lschedule_192:
675	sub	x0, x0, #8
676	ld1	{v0.16b}, [x0]		// vmovdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
677	bl	_vpaes_schedule_transform	// input transform
678	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save short part
679	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4, %xmm4	# clear 4
680	ins	v6.d[0], v4.d[0]		// vmovhlps	%xmm4,	%xmm6,	%xmm6		# clobber low side with zeros
681	mov	x0, #4			// mov	$4,	%esi
682
683.Loop_schedule_192:
684	sub	x0, x0, #1			// dec	%esi
685	bl	_vpaes_schedule_round
686	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr	$8,%xmm6,%xmm0,%xmm0
687	bl	_vpaes_schedule_mangle		// save key n
688	bl	_vpaes_schedule_192_smear
689	bl	_vpaes_schedule_mangle		// save key n+1
690	bl	_vpaes_schedule_round
691	cbz	x0, .Lschedule_mangle_last
692	bl	_vpaes_schedule_mangle		// save key n+2
693	bl	_vpaes_schedule_192_smear
694	b	.Loop_schedule_192
695
696//
697//  .aes_schedule_256
698//
699//  256-bit specific part of key schedule.
700//
701//  The structure here is very similar to the 128-bit
702//  schedule, but with an additional "low side" in
703//  %xmm6.  The low side's rounds are the same as the
704//  high side's, except no rcon and no rotation.
705//
706.align	4
707.Lschedule_256:
708	ld1	{v0.16b}, [x0]		// vmovdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
709	bl	_vpaes_schedule_transform	// input transform
710	mov	x0, #7			// mov	$7, %esi
711
712.Loop_schedule_256:
713	sub	x0, x0, #1			// dec	%esi
714	bl	_vpaes_schedule_mangle		// output low result
715	mov	v6.16b, v0.16b			// vmovdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
716
717	// high round
718	bl	_vpaes_schedule_round
719	cbz	x0, .Lschedule_mangle_last
720	bl	_vpaes_schedule_mangle
721
722	// low round. swap xmm7 and xmm6
723	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
724	movi	v4.16b, #0
725	mov	v5.16b, v7.16b			// vmovdqa	%xmm7,	%xmm5
726	mov	v7.16b, v6.16b			// vmovdqa	%xmm6,	%xmm7
727	bl	_vpaes_schedule_low_round
728	mov	v7.16b, v5.16b			// vmovdqa	%xmm5,	%xmm7
729
730	b	.Loop_schedule_256
731
732//
733//  .aes_schedule_mangle_last
734//
735//  Mangler for last round of key schedule
736//  Mangles %xmm0
737//    when encrypting, outputs out(%xmm0) ^ 63
738//    when decrypting, outputs unskew(%xmm0)
739//
740//  Always called right before return... jumps to cleanup and exits
741//
742.align	4
743.Lschedule_mangle_last:
744	// schedule last round key from xmm0
745	adr	x11, .Lk_deskew			// lea	.Lk_deskew(%rip),%r11	# prepare to deskew
746	cbnz	w3, .Lschedule_mangle_last_dec
747
748	// encrypting
749	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),%xmm1
750	adr	x11, .Lk_opt			// lea	.Lk_opt(%rip),	%r11		# prepare to output transform
751	add	x2, x2, #32			// add	$32,	%rdx
752	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb	%xmm1,	%xmm0,	%xmm0		# output permute
753
754.Lschedule_mangle_last_dec:
755	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
756	sub	x2, x2, #16			// add	$-16,	%rdx
757	eor	v0.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm0
758	bl	_vpaes_schedule_transform	// output transform
759	st1	{v0.2d}, [x2]			// vmovdqu	%xmm0,	(%rdx)		# save last key
760
761	// cleanup
762	eor	v0.16b, v0.16b, v0.16b		// vpxor	%xmm0,	%xmm0,	%xmm0
763	eor	v1.16b, v1.16b, v1.16b		// vpxor	%xmm1,	%xmm1,	%xmm1
764	eor	v2.16b, v2.16b, v2.16b		// vpxor	%xmm2,	%xmm2,	%xmm2
765	eor	v3.16b, v3.16b, v3.16b		// vpxor	%xmm3,	%xmm3,	%xmm3
766	eor	v4.16b, v4.16b, v4.16b		// vpxor	%xmm4,	%xmm4,	%xmm4
767	eor	v5.16b, v5.16b, v5.16b		// vpxor	%xmm5,	%xmm5,	%xmm5
768	eor	v6.16b, v6.16b, v6.16b		// vpxor	%xmm6,	%xmm6,	%xmm6
769	eor	v7.16b, v7.16b, v7.16b		// vpxor	%xmm7,	%xmm7,	%xmm7
770	ldp	x29, x30, [sp],#16
771.inst	0xd50323bf			// autiasp
772	ret
773.size	_vpaes_schedule_core,.-_vpaes_schedule_core
774
775//
776//  .aes_schedule_192_smear
777//
778//  Smear the short, low side in the 192-bit key schedule.
779//
780//  Inputs:
781//    %xmm7: high side, b  a  x  y
782//    %xmm6:  low side, d  c  0  0
783//    %xmm13: 0
784//
785//  Outputs:
786//    %xmm6: b+c+d  b+c  0  0
787//    %xmm0: b+c+d  b+c  b  a
788//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	// NEON mapping of the x86 shuffles (see header above):
	// in:  v7 = high side (b a x y), v6 = low side (d c 0 0)
	// out: v6 = (b+c+d, b+c, 0, 0), v0 = (b+c+d, b+c, b, a); clobbers v1
	movi	v1.16b, #0		// zero first so the single ins below yields "c 0 0 0"
	dup	v0.4s, v7.s[3]		// broadcast top word of v7 (b b b b)
	ins	v1.s[3], v6.s[2]	// vpshufd	$0x80,	%xmm6,	%xmm1	# d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd	$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor	%xmm1,	%xmm6,	%xmm6	# -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor	%xmm1,	%xmm1,	%xmm1	# re-zero v1 for the final ins
	eor	v6.16b, v6.16b, v0.16b	// vpxor	%xmm0,	%xmm6,	%xmm6	# -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa	%xmm6,	%xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps	%xmm1,	%xmm6,	%xmm6	# clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
803
804//
805//  .aes_schedule_round
806//
807//  Runs one main round of the key schedule on %xmm0, %xmm7
808//
809//  Specifically, runs subbytes on the high dword of %xmm0
810//  then rotates it by one byte and xors into the low dword of
811//  %xmm7.
812//
813//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
814//  next rcon.
815//
816//  Smears the dwords of %xmm7 by xoring the low into the
817//  second low, result into third, result into highest.
818//
819//  Returns results in %xmm7 = %xmm0.
820//  Clobbers %xmm1-%xmm4, %r11.
821//
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// One main key-schedule round on v0/v7 (see header above).
	// v8 carries the running rcon.  Per the x86 comments, the schedule
	// preheat leaves constants in: v17 = 0x0F nibble mask (%xmm9),
	// v18/v19 = .Lk_inv (%xmm10/%xmm11), v22/v23 = sb1 (%xmm12/%xmm13),
	// v16 = .Lk_s63.
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor	%xmm4,	%xmm4,	%xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr	$15,	%xmm8,	%xmm4,	%xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr	$15,	%xmm8,	%xmm8,	%xmm8	# rotate rcon for next round
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd	$0xFF,	%xmm0,	%xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr	$1,	%xmm0,	%xmm0,	%xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
	// NB: direct callers must enter with v4 = 0 (the fall-through path
	// zeroes it above; the 256-bit loop does "movi v4.16b, #0" first).
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq	$4,	%xmm7,	%xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor	%xmm1,	%xmm7,	%xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq	$8,	%xmm7,	%xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1		# 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0		# 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor	%xmm4,	%xmm7,	%xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb	%xmm1,	%xmm11,	%xmm2		# 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor	%xmm0,	%xmm1,	%xmm1		# 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb	%xmm0, 	%xmm10,	%xmm3		# 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3		# 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb	%xmm1,	%xmm10,	%xmm4		# 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm7,	%xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb	%xmm3,	%xmm10,	%xmm3		# 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm4		# 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb	%xmm4,	%xmm10,	%xmm2		# 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor	%xmm1,	%xmm3,	%xmm3		# 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor	%xmm0,	%xmm2,	%xmm2		# 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb	%xmm3,	%xmm13,	%xmm4		# 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb	%xmm2,	%xmm12,	%xmm1		# 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor	%xmm4,	%xmm1,	%xmm1		# 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor	%xmm7,	%xmm1,	%xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa	%xmm0,	%xmm7	# same value as v0 (recomputed rather than mov)
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
868
869//
870//  .aes_schedule_transform
871//
872//  Linear-transform %xmm0 according to tables at (%r11)
873//
874//  Requires that %xmm9 = 0x0F0F... as in preheat
875//  Output in %xmm0
876//  Clobbers %xmm1, %xmm2
877//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	// Linear-transform v0 using the lo/hi tables the caller preloaded
	// into v20/v21 (e.g. the ld1 {v20.2d,v21.2d},[x11] before the call
	// in .Lschedule_mangle_last).  v17 = 0x0F nibble mask (x86 %xmm9).
	// Clobbers v1, v2.
	and	v1.16b, v0.16b, v17.16b		// vpand	%xmm9,	%xmm0,	%xmm1	# low nibbles
	ushr	v0.16b, v0.16b, #4		// vpsrlb	$4,	%xmm0,	%xmm0	# high nibbles
						// vmovdqa	(%r11),	%xmm2 	# lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb	%xmm1,	%xmm2,	%xmm2
						// vmovdqa	16(%r11),	%xmm1 # hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb	%xmm0,	%xmm1,	%xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor	%xmm2,	%xmm0,	%xmm0	# combine halves
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
890
891//
892//  .aes_schedule_mangle
893//
894//  Mangle xmm0 from (basis-transformed) standard version
895//  to our version.
896//
897//  On encrypt,
898//    xor with 0x63
899//    multiply by circulant 0,1,1,1
900//    apply shiftrows transform
901//
902//  On decrypt,
903//    xor with 0x63
904//    multiply by "inverse mixcolumns" circulant E,B,D,9
905//    deskew
906//    apply shiftrows transform
907//
908//
909//  Writes out to (%rdx), and increments or decrements it
910//  Keeps track of round number mod 4 in %r8
911//  Preserves xmm0
912//  Clobbers xmm1-xmm5
913//
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	// Mangle v0 into the stored round-key form (see header above).
	// w3 selects direction (0 = encrypt, nonzero = decrypt); x2 is the
	// schedule write pointer (incremented for enc, decremented for dec);
	// x8 indexes the .Lk_sr shiftrows table and is stepped -16 mod 64
	// at the end.  Per the x86 comments, v9 holds .Lk_mc_forward (%xmm5)
	// and v24-v31 the 0x00..0x70(.Lk_dksd) decrypt key-mangle tables,
	// all preloaded by the schedule preheat.
	mov	v4.16b, v0.16b			// vmovdqa	%xmm0,	%xmm4	# save xmm0 for later
						// vmovdqa	.Lk_mc_forward(%rip),%xmm5
	cbnz	w3, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor	.Lk_s63(%rip),	%xmm0,	%xmm4
	add	x2, x2, #16			// add	$16,	%rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm4	# 1st mc_forward rotation
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb	%xmm5,	%xmm4,	%xmm1	# 2nd
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb	%xmm5,	%xmm1,	%xmm3	# 3rd
	eor	v4.16b, v4.16b, v1.16b		// vpxor	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1	# shiftrows permutation
	eor	v3.16b, v3.16b, v4.16b		// vpxor	%xmm4,	%xmm3,	%xmm3

	b	.Lschedule_mangle_both
.align	4
.Lschedule_mangle_dec:
	// inverse mix columns
						// lea	.Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4		// vpsrlb	$4,	%xmm4,	%xmm1	# 1 = hi
	and	v4.16b, v4.16b, v17.16b		// vpand	%xmm9,	%xmm4,	%xmm4	# 4 = lo

						// vmovdqa	0x00(%r11),	%xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
						// vmovdqa	0x10(%r11),	%xmm3
	tbl	v3.16b,	{v25.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x20(%r11),	%xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x30(%r11),	%xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3

						// vmovdqa	0x40(%r11),	%xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
						// vmovdqa	0x50(%r11),	%xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3
	eor	v3.16b, v3.16b, v2.16b		// vpxor	%xmm2,	%xmm3,	%xmm3

						// vmovdqa	0x60(%r11),	%xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb	%xmm4,	%xmm2,	%xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb	%xmm5,	%xmm3,	%xmm3
						// vmovdqa	0x70(%r11),	%xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb	%xmm1,	%xmm4,	%xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa	(%r8,%r10),	%xmm1	# shiftrows permutation
	eor	v2.16b, v2.16b, v3.16b		// vpxor	%xmm3,	%xmm2,	%xmm2
	eor	v3.16b, v4.16b, v2.16b		// vpxor	%xmm2,	%xmm4,	%xmm3

	sub	x2, x2, #16			// add	$-16,	%rdx	# decrypt schedule grows downwards

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb	%xmm1,	%xmm3,	%xmm3	# apply shiftrows permutation
	add	x8, x8, #64-16			// add	$-16,	%r8
	and	x8, x8, #~(1<<6)		// and	$0x30,	%r8	# x8 = (x8-16) mod 64, stays in {0,16,32,48}
	st1	{v3.2d}, [x2]			// vmovdqu	%xmm3,	(%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
979
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
// Expand a user key into an encryption schedule.
// In:  x0 = key bytes (consumed by _vpaes_schedule_core),
//      w1 = key length in bits, x2 = AES_KEY pointer.
// Out: x0 = 0; rounds = bits/32 + 5 stored at x2+240.
vpaes_set_encrypt_key:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;

	mov	w3, #0		// mov	$0,%ecx	# w3=0: encrypt direction (see _vpaes_schedule_mangle)
	mov	x8, #0x30		// mov	$0x30,%r8d	# initial .Lk_sr index
	bl	_vpaes_schedule_core
	eor	x0, x0, x0		// return 0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
1003
.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
// Expand a user key into a decryption schedule.
// In:  x0 = key bytes (consumed by _vpaes_schedule_core),
//      w1 = key length in bits, x2 = AES_KEY pointer.
// The decrypt schedule is written back-to-front (_vpaes_schedule_mangle
// decrements x2 when w3 != 0), so x2 is first advanced past the end.
vpaes_set_decrypt_key:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, w1, #5		// shr	$5,%eax
	add	w9, w9, #5		// $5,%eax
	str	w9, [x2,#240]		// mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl	$4,%eax	# rounds*16 bytes of schedule
	add	x2, x2, #16		// lea	16(%rdx,%rax),%rdx	# point past last round key
	add	x2, x2, x9

	mov	w3, #1		// mov	$1,%ecx	# w3=1: decrypt direction
	lsr	w8, w1, #1		// shr	$1,%r8d
	and	x8, x8, #32		// and	$32,%r8d
	eor	x8, x8, #32		// xor	$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
// CBC mode: in=x0, out=x1, len=x2, key=x3, ivec=x4, enc=w5.
// w5==0 tail-calls vpaes_cbc_decrypt (before the PAC sign, which is
// correct: that routine signs its own LR).  Processes whole 16-byte
// blocks; len is presumably a multiple of 16 — not checked here.
vpaes_cbc_encrypt:
	cbz	x2, .Lcbc_abort		// len == 0: nothing to do
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, x2		// reassign	# x17 = remaining length
	mov	x2,  x3		// reassign	# x2 = key (as _vpaes_encrypt_core expects)

	ld1	{v0.16b}, [x4]	// load ivec	# v0 carries the CBC chain value
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [x0],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16	// save output	# v0 doubles as next ivec
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [x4]	// write ivec
	ldp	x29,x30,[sp],#16
.inst	0xd50323bf		// autiasp
.Lcbc_abort:
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
1066
.type	vpaes_cbc_decrypt,%function
.align	4
// CBC decrypt (reached via vpaes_cbc_encrypt with enc==0, same args).
// Saves d8-d15 per AAPCS64 since the 2x path uses v8-v15 (see the
// "ABI spec says so" notes).  v6 carries the chain value (ivec, then
// the previous ciphertext block).
vpaes_cbc_decrypt:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2		// reassign	# x17 = remaining length
	mov	x2,  x3		// reassign	# x2 = key
	ld1	{v6.16b}, [x4]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16		// odd number of blocks?
	b.eq	.Lcbc_dec_loop2x

	// handle the odd block first so the main loop runs 2-at-a-time
	ld1	{v7.16b}, [x0], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value	# orr x,x = mov
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done		// done if len was exactly 16

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b	// second block chains off first ciphertext
	orr	v6.16b, v15.16b, v15.16b	// next chain value = last ciphertext
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [x4]		// write back ivec

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
// ECB encrypt: in=x0, out=x1, len=x2, key=x3.
// An odd 16-byte block is handled up front via _vpaes_encrypt_core so
// the main loop can run _vpaes_encrypt_2x.  d8-d15 saved per AAPCS64
// for the 2x path (see the "ABI spec says so" notes).
vpaes_ecb_encrypt:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2		// x17 = remaining length
	mov	x2,  x3		// x2 = key (as the cores expect)
	bl	_vpaes_encrypt_preheat
	tst	x17, #16		// odd number of blocks?
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_enc_done		// done if len was exactly 16

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
1156
.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
// ECB decrypt: in=x0, out=x1, len=x2, key=x3.
// Mirrors vpaes_ecb_encrypt: an odd 16-byte block is handled up front,
// then the main loop runs _vpaes_decrypt_2x.  d8-d15 saved per AAPCS64
// for the 2x path (see the "ABI spec says so" notes).
vpaes_ecb_decrypt:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, x2		// x17 = remaining length
	mov	x2,  x3		// x2 = key (as the cores expect)
	bl	_vpaes_decrypt_preheat
	tst	x17, #16		// odd number of blocks?
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [x0],#16
	bl	_vpaes_decrypt_core	// BUGFIX: was _vpaes_encrypt_core — the odd
					// block must be decrypted, matching the
					// _vpaes_decrypt_2x loop and the decrypt
					// tables loaded by _vpaes_decrypt_preheat
	st1	{v0.16b}, [x1],#16
	subs	x17, x17, #16
	b.ls	.Lecb_dec_done		// done if len was exactly 16

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [x0], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
1198