xref: /freebsd/sys/crypto/openssl/aarch64/vpsm4-armv8.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1*4757b351SPierre Pronchery/* Do not modify. This file is auto-generated from vpsm4-armv8.pl. */
2*4757b351SPierre Pronchery// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
3*4757b351SPierre Pronchery//
4*4757b351SPierre Pronchery// Licensed under the Apache License 2.0 (the "License").  You may not use
5*4757b351SPierre Pronchery// this file except in compliance with the License.  You can obtain a copy
6*4757b351SPierre Pronchery// in the file LICENSE in the source distribution or at
7*4757b351SPierre Pronchery// https://www.openssl.org/source/license.html
8*4757b351SPierre Pronchery
9*4757b351SPierre Pronchery//
10*4757b351SPierre Pronchery// This module implements SM4 with ASIMD on aarch64
11*4757b351SPierre Pronchery//
12*4757b351SPierre Pronchery// Feb 2022
13*4757b351SPierre Pronchery//
14*4757b351SPierre Pronchery
15*4757b351SPierre Pronchery// $output is the last argument if it looks like a file (it has an extension)
16*4757b351SPierre Pronchery// $flavour is the first argument if it doesn't look like a file
17*4757b351SPierre Pronchery#include "arm_arch.h"
18*4757b351SPierre Pronchery.arch	armv8-a
19*4757b351SPierre Pronchery.text
20*4757b351SPierre Pronchery
// Read-only constant pool shared by the routines in this file.  The object
// is aligned to 2^7 = 128 bytes (.align 7) and contains, in order:
//   .Lsbox       256-byte substitution table (16 rows of 16 bytes).  The
//                routines load it into v16-v31 in four 64-byte groups.
//   .Lck         32 32-bit constants; _vpsm4_set_key reads one per key
//                schedule iteration.
//   .Lfk         128-bit constant XORed into the user key by _vpsm4_set_key.
//   .Lshuffles   TBL index vector used by _vpsm4_set_key to rotate the four
//                32-bit lanes of the key state by one lane.
//   .Lxts_magic  not referenced in the part of the file reviewed here;
//                presumably used by XTS-mode code further down (TODO confirm).
21*4757b351SPierre Pronchery.section	.rodata
22*4757b351SPierre Pronchery.type	_vpsm4_consts,%object
23*4757b351SPierre Pronchery.align	7
24*4757b351SPierre Pronchery_vpsm4_consts:
// Substitution table: entry i is the output byte for input byte i.
25*4757b351SPierre Pronchery.Lsbox:
26*4757b351SPierre Pronchery.byte	0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
27*4757b351SPierre Pronchery.byte	0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
28*4757b351SPierre Pronchery.byte	0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
29*4757b351SPierre Pronchery.byte	0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
30*4757b351SPierre Pronchery.byte	0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
31*4757b351SPierre Pronchery.byte	0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
32*4757b351SPierre Pronchery.byte	0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
33*4757b351SPierre Pronchery.byte	0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
34*4757b351SPierre Pronchery.byte	0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
35*4757b351SPierre Pronchery.byte	0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
36*4757b351SPierre Pronchery.byte	0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
37*4757b351SPierre Pronchery.byte	0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
38*4757b351SPierre Pronchery.byte	0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
39*4757b351SPierre Pronchery.byte	0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
40*4757b351SPierre Pronchery.byte	0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
41*4757b351SPierre Pronchery.byte	0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
// Key-schedule constants, one 32-bit word per key-schedule iteration (32 total).
42*4757b351SPierre Pronchery.Lck:
43*4757b351SPierre Pronchery.long	0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
44*4757b351SPierre Pronchery.long	0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
45*4757b351SPierre Pronchery.long	0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
46*4757b351SPierre Pronchery.long	0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
47*4757b351SPierre Pronchery.long	0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
48*4757b351SPierre Pronchery.long	0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
49*4757b351SPierre Pronchery.long	0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
50*4757b351SPierre Pronchery.long	0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
// Loaded as two 64-bit elements, so the 32-bit lanes of the register are
// 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc (lane 0 first).
51*4757b351SPierre Pronchery.Lfk:
52*4757b351SPierre Pronchery.quad	0x56aa3350a3b1bac6,0xb27022dc677d9197
// Byte indices 4..15 followed by 0..3: used with TBL, result lane i is
// source lane (i+1) mod 4.
53*4757b351SPierre Pronchery.Lshuffles:
54*4757b351SPierre Pronchery.quad	0x0B0A090807060504,0x030201000F0E0D0C
// NOTE(review): no visible code reads this label; purpose inferred from the
// name only.
55*4757b351SPierre Pronchery.Lxts_magic:
56*4757b351SPierre Pronchery.quad	0x0101010101010187,0x0101010101010101
57*4757b351SPierre Pronchery
58*4757b351SPierre Pronchery.size	_vpsm4_consts,.-_vpsm4_consts
59*4757b351SPierre Pronchery
// Switch back to the section that was active before ".section .rodata".
60*4757b351SPierre Pronchery.previous
61*4757b351SPierre Pronchery
// _vpsm4_set_key: expand a 128-bit user key into 32 round-key words.
//
// In:   x0 = address of the 16-byte user key
//       x1 = address of the 32-word (128-byte) round-key buffer
//       w2 = direction flag: non-zero stores the words in ascending order
//            starting at x1; zero stores them in descending order starting
//            at x1+124, i.e. the same schedule written back to front
// Out:  round keys written to the buffer; v16-v31 are left holding the
//       256-byte .Lsbox table
// Clobbers: x1, x5-x8, x10, v0, v1, v4-v7, v16-v31, condition flags
61*4757b351SPierre Pronchery
62*4757b351SPierre Pronchery.type	_vpsm4_set_key,%function
63*4757b351SPierre Pronchery.align	4
64*4757b351SPierre Pronchery_vpsm4_set_key:
65*4757b351SPierre Pronchery	AARCH64_VALID_CALL_TARGET
	// v5 = user key, four 32-bit lanes
66*4757b351SPierre Pronchery	ld1	{v5.4s},[x0]
	// v16-v31 = the whole 256-byte table, 64 bytes per register group
67*4757b351SPierre Pronchery	adrp	x10,.Lsbox
68*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
69*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
70*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
71*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
72*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
	// Little-endian builds only: byte-swap every 32-bit word of the key.
73*4757b351SPierre Pronchery#ifndef __AARCH64EB__
74*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
75*4757b351SPierre Pronchery#endif
	// v7 = lane-rotation indices, v6 = .Lfk; key state K = key ^ .Lfk
76*4757b351SPierre Pronchery	adrp	x5,.Lshuffles
77*4757b351SPierre Pronchery	add	x5,x5,#:lo12:.Lshuffles
78*4757b351SPierre Pronchery	ld1	{v7.2d},[x5]
79*4757b351SPierre Pronchery	adrp	x5,.Lfk
80*4757b351SPierre Pronchery	add	x5,x5,#:lo12:.Lfk
81*4757b351SPierre Pronchery	ld1	{v6.2d},[x5]
82*4757b351SPierre Pronchery	eor	v5.16b,v5.16b,v6.16b
	// x6 = iteration counter (32), x5 = cursor into .Lck,
	// v0 = 64 in every byte (step between the 64-entry lookup windows)
83*4757b351SPierre Pronchery	mov	x6,#32
84*4757b351SPierre Pronchery	adrp	x5,.Lck
85*4757b351SPierre Pronchery	add	x5,x5,#:lo12:.Lck
86*4757b351SPierre Pronchery	movi	v0.16b,#64
	// Reverse order requested (w2 == 0): start at the last slot, 31*4 = 124.
87*4757b351SPierre Pronchery	cbnz	w2,1f
88*4757b351SPierre Pronchery	add	x1,x1,124
89*4757b351SPierre Pronchery1:
	// w8 = next .Lck word ^ lane1 ^ lane2 ^ lane3 of the key state
90*4757b351SPierre Pronchery	mov	w7,v5.s[1]
91*4757b351SPierre Pronchery	ldr	w8,[x5],#4
92*4757b351SPierre Pronchery	eor	w8,w8,w7
93*4757b351SPierre Pronchery	mov	w7,v5.s[2]
94*4757b351SPierre Pronchery	eor	w8,w8,w7
95*4757b351SPierre Pronchery	mov	w7,v5.s[3]
96*4757b351SPierre Pronchery	eor	w8,w8,w7
	// Byte-wise table substitution.  TBL covers indices 0-63 and yields 0
	// for anything larger; each "sub 64" + TBX then fills in the next
	// 64-entry window while leaving out-of-range bytes unchanged.  Only
	// lane 0 of v4 is written and only lane 0 of v1 is read back.
97*4757b351SPierre Pronchery	// sbox lookup
98*4757b351SPierre Pronchery	mov	v4.s[0],w8
99*4757b351SPierre Pronchery	tbl	v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b
100*4757b351SPierre Pronchery	sub	v4.16b,v4.16b,v0.16b
101*4757b351SPierre Pronchery	tbx	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b
102*4757b351SPierre Pronchery	sub	v4.16b,v4.16b,v0.16b
103*4757b351SPierre Pronchery	tbx	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b
104*4757b351SPierre Pronchery	sub	v4.16b,v4.16b,v0.16b
105*4757b351SPierre Pronchery	tbx	v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b
	// w8 = t ^ ror(t,19) ^ ror(t,9) ^ lane0, where t is the substituted
	// word (ror 19 / ror 9 are rotate-left by 13 / 23)
106*4757b351SPierre Pronchery	mov	w7,v1.s[0]
107*4757b351SPierre Pronchery	eor	w8,w7,w7,ror #19
108*4757b351SPierre Pronchery	eor	w8,w8,w7,ror #9
109*4757b351SPierre Pronchery	mov	w7,v5.s[0]
110*4757b351SPierre Pronchery	eor	w8,w8,w7
111*4757b351SPierre Pronchery	mov	v5.s[0],w8
	// Emit the round key, walking the buffer up (w2 != 0) or down (w2 == 0).
112*4757b351SPierre Pronchery	cbz	w2,2f
113*4757b351SPierre Pronchery	str	w8,[x1],#4
114*4757b351SPierre Pronchery	b	3f
115*4757b351SPierre Pronchery2:
116*4757b351SPierre Pronchery	str	w8,[x1],#-4
117*4757b351SPierre Pronchery3:
	// Rotate lanes: (new, K1, K2, K3) -> (K1, K2, K3, new)
118*4757b351SPierre Pronchery	tbl	v5.16b,{v5.16b},v7.16b
119*4757b351SPierre Pronchery	subs	x6,x6,#1
120*4757b351SPierre Pronchery	b.ne	1b
121*4757b351SPierre Pronchery	ret
122*4757b351SPierre Pronchery.size	_vpsm4_set_key,.-_vpsm4_set_key
// _vpsm4_enc_4blks: run 32 cipher rounds over the state held in v4-v7.
//
// In:   x3      = address of 32 round-key words, consumed in order
//       v4-v7   = state words B0..B3 (naming from the round comments
//                 below), four 32-bit lanes each; presumably one lane per
//                 block, hence "4blks" - the load/transposition is done by
//                 the caller and is not visible here
//       v16-v31 = the 256-byte .Lsbox table, already loaded by the caller
// Out:  v0-v3   = v7,v6,v5,v4 after the last round (word order reversed),
//                 byte-swapped within each 32-bit word unless __AARCH64EB__
// Clobbers: x7, x8, x10, x11, v0-v7, v12-v14, condition flags
// NOTE(review): v12-v14 are modified and not saved here; AAPCS64 makes the
// low halves of v8-v15 callee-saved, so callers presumably preserve them -
// confirm against the call sites.
123*4757b351SPierre Pronchery.type	_vpsm4_enc_4blks,%function
124*4757b351SPierre Pronchery.align	4
125*4757b351SPierre Pronchery_vpsm4_enc_4blks:
126*4757b351SPierre Pronchery	AARCH64_VALID_CALL_TARGET
	// x10 = round-key cursor; w11 = 8 iterations of 4 rounds each
127*4757b351SPierre Pronchery	mov	x10,x3
128*4757b351SPierre Pronchery	mov	w11,#8
129*4757b351SPierre Pronchery10:
	// Fetch two round keys and broadcast each across all four lanes.
130*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
131*4757b351SPierre Pronchery	dup	v12.4s,w7
132*4757b351SPierre Pronchery	dup	v13.4s,w8
133*4757b351SPierre Pronchery
134*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	// v14 keeps B2^B3 for reuse in the next round.
135*4757b351SPierre Pronchery	eor	v14.16b,v6.16b,v7.16b
136*4757b351SPierre Pronchery	eor	v12.16b,v5.16b,v12.16b
137*4757b351SPierre Pronchery	eor	v12.16b,v14.16b,v12.16b
	// Table lookup: four TBLs with the index offset by 0/64/128/192.  TBL
	// returns 0 for out-of-range indices, so for each byte at most one of
	// the four results is non-zero and the adds simply merge them (no
	// carry can cross a byte boundary).
138*4757b351SPierre Pronchery	movi	v0.16b,#64
139*4757b351SPierre Pronchery	movi	v1.16b,#128
140*4757b351SPierre Pronchery	movi	v2.16b,#192
141*4757b351SPierre Pronchery	sub	v0.16b,v12.16b,v0.16b
142*4757b351SPierre Pronchery	sub	v1.16b,v12.16b,v1.16b
143*4757b351SPierre Pronchery	sub	v2.16b,v12.16b,v2.16b
144*4757b351SPierre Pronchery	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
145*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
146*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
147*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
148*4757b351SPierre Pronchery	add	v0.2d,v0.2d,v1.2d
149*4757b351SPierre Pronchery	add	v2.2d,v2.2d,v12.2d
150*4757b351SPierre Pronchery	add	v12.2d,v0.2d,v2.2d
151*4757b351SPierre Pronchery
	// Linear step: each ushr/sli pair is a per-lane rotate-left by n, and
	// the result is x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18) ^ rol(x,24).
152*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-2
153*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,2
154*4757b351SPierre Pronchery	ushr	v2.4s,v12.4s,32-10
155*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v12.16b
156*4757b351SPierre Pronchery	sli	v2.4s,v12.4s,10
157*4757b351SPierre Pronchery	eor	v1.16b,v2.16b,v1.16b
158*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-18
159*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,18
160*4757b351SPierre Pronchery	ushr	v2.4s,v12.4s,32-24
161*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
162*4757b351SPierre Pronchery	sli	v2.4s,v12.4s,24
163*4757b351SPierre Pronchery	eor	v12.16b,v2.16b,v1.16b
164*4757b351SPierre Pronchery	eor	v4.16b,v4.16b,v12.16b
165*4757b351SPierre Pronchery
166*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	// v14 still holds B2^B3; fold in the new B0 and the second round key.
167*4757b351SPierre Pronchery	eor	v14.16b,v14.16b,v4.16b
168*4757b351SPierre Pronchery	eor	v13.16b,v14.16b,v13.16b
169*4757b351SPierre Pronchery	movi	v0.16b,#64
170*4757b351SPierre Pronchery	movi	v1.16b,#128
171*4757b351SPierre Pronchery	movi	v2.16b,#192
172*4757b351SPierre Pronchery	sub	v0.16b,v13.16b,v0.16b
173*4757b351SPierre Pronchery	sub	v1.16b,v13.16b,v1.16b
174*4757b351SPierre Pronchery	sub	v2.16b,v13.16b,v2.16b
175*4757b351SPierre Pronchery	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
176*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
177*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
178*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
179*4757b351SPierre Pronchery	add	v0.2d,v0.2d,v1.2d
180*4757b351SPierre Pronchery	add	v2.2d,v2.2d,v13.2d
181*4757b351SPierre Pronchery	add	v13.2d,v0.2d,v2.2d
182*4757b351SPierre Pronchery
183*4757b351SPierre Pronchery	ushr	v0.4s,v13.4s,32-2
184*4757b351SPierre Pronchery	sli	v0.4s,v13.4s,2
185*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-10
186*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v13.16b
187*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,10
188*4757b351SPierre Pronchery	eor	v1.16b,v2.16b,v1.16b
189*4757b351SPierre Pronchery	ushr	v0.4s,v13.4s,32-18
190*4757b351SPierre Pronchery	sli	v0.4s,v13.4s,18
191*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-24
192*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
193*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,24
194*4757b351SPierre Pronchery	eor	v13.16b,v2.16b,v1.16b
	// The next two round keys are fetched here, before B1 is updated.
195*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
196*4757b351SPierre Pronchery	eor	v5.16b,v5.16b,v13.16b
197*4757b351SPierre Pronchery
198*4757b351SPierre Pronchery	dup	v12.4s,w7
199*4757b351SPierre Pronchery	dup	v13.4s,w8
200*4757b351SPierre Pronchery
201*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	// v14 now keeps B0^B1 for reuse in the following round.
202*4757b351SPierre Pronchery	eor	v14.16b,v4.16b,v5.16b
203*4757b351SPierre Pronchery	eor	v12.16b,v7.16b,v12.16b
204*4757b351SPierre Pronchery	eor	v12.16b,v14.16b,v12.16b
205*4757b351SPierre Pronchery	movi	v0.16b,#64
206*4757b351SPierre Pronchery	movi	v1.16b,#128
207*4757b351SPierre Pronchery	movi	v2.16b,#192
208*4757b351SPierre Pronchery	sub	v0.16b,v12.16b,v0.16b
209*4757b351SPierre Pronchery	sub	v1.16b,v12.16b,v1.16b
210*4757b351SPierre Pronchery	sub	v2.16b,v12.16b,v2.16b
211*4757b351SPierre Pronchery	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
212*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
213*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
214*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
215*4757b351SPierre Pronchery	add	v0.2d,v0.2d,v1.2d
216*4757b351SPierre Pronchery	add	v2.2d,v2.2d,v12.2d
217*4757b351SPierre Pronchery	add	v12.2d,v0.2d,v2.2d
218*4757b351SPierre Pronchery
219*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-2
220*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,2
221*4757b351SPierre Pronchery	ushr	v2.4s,v12.4s,32-10
222*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v12.16b
223*4757b351SPierre Pronchery	sli	v2.4s,v12.4s,10
224*4757b351SPierre Pronchery	eor	v1.16b,v2.16b,v1.16b
225*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-18
226*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,18
227*4757b351SPierre Pronchery	ushr	v2.4s,v12.4s,32-24
228*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
229*4757b351SPierre Pronchery	sli	v2.4s,v12.4s,24
230*4757b351SPierre Pronchery	eor	v12.16b,v2.16b,v1.16b
231*4757b351SPierre Pronchery	eor	v6.16b,v6.16b,v12.16b
232*4757b351SPierre Pronchery
233*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
234*4757b351SPierre Pronchery	eor	v14.16b,v14.16b,v6.16b
235*4757b351SPierre Pronchery	eor	v13.16b,v14.16b,v13.16b
236*4757b351SPierre Pronchery	movi	v0.16b,#64
237*4757b351SPierre Pronchery	movi	v1.16b,#128
238*4757b351SPierre Pronchery	movi	v2.16b,#192
239*4757b351SPierre Pronchery	sub	v0.16b,v13.16b,v0.16b
240*4757b351SPierre Pronchery	sub	v1.16b,v13.16b,v1.16b
241*4757b351SPierre Pronchery	sub	v2.16b,v13.16b,v2.16b
242*4757b351SPierre Pronchery	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
243*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
244*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
245*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
246*4757b351SPierre Pronchery	add	v0.2d,v0.2d,v1.2d
247*4757b351SPierre Pronchery	add	v2.2d,v2.2d,v13.2d
248*4757b351SPierre Pronchery	add	v13.2d,v0.2d,v2.2d
249*4757b351SPierre Pronchery
250*4757b351SPierre Pronchery	ushr	v0.4s,v13.4s,32-2
251*4757b351SPierre Pronchery	sli	v0.4s,v13.4s,2
252*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-10
253*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v13.16b
254*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,10
255*4757b351SPierre Pronchery	eor	v1.16b,v2.16b,v1.16b
256*4757b351SPierre Pronchery	ushr	v0.4s,v13.4s,32-18
257*4757b351SPierre Pronchery	sli	v0.4s,v13.4s,18
258*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-24
259*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
260*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,24
261*4757b351SPierre Pronchery	eor	v13.16b,v2.16b,v1.16b
262*4757b351SPierre Pronchery	eor	v7.16b,v7.16b,v13.16b
263*4757b351SPierre Pronchery	subs	w11,w11,#1
264*4757b351SPierre Pronchery	b.ne	10b
	// Emit the state in reverse word order (v3..v0 = B0..B3), byte-swapping
	// each 32-bit word on little-endian builds and copying otherwise.
265*4757b351SPierre Pronchery#ifndef __AARCH64EB__
266*4757b351SPierre Pronchery	rev32	v3.16b,v4.16b
267*4757b351SPierre Pronchery#else
268*4757b351SPierre Pronchery	mov	v3.16b,v4.16b
269*4757b351SPierre Pronchery#endif
270*4757b351SPierre Pronchery#ifndef __AARCH64EB__
271*4757b351SPierre Pronchery	rev32	v2.16b,v5.16b
272*4757b351SPierre Pronchery#else
273*4757b351SPierre Pronchery	mov	v2.16b,v5.16b
274*4757b351SPierre Pronchery#endif
275*4757b351SPierre Pronchery#ifndef __AARCH64EB__
276*4757b351SPierre Pronchery	rev32	v1.16b,v6.16b
277*4757b351SPierre Pronchery#else
278*4757b351SPierre Pronchery	mov	v1.16b,v6.16b
279*4757b351SPierre Pronchery#endif
280*4757b351SPierre Pronchery#ifndef __AARCH64EB__
281*4757b351SPierre Pronchery	rev32	v0.16b,v7.16b
282*4757b351SPierre Pronchery#else
283*4757b351SPierre Pronchery	mov	v0.16b,v7.16b
284*4757b351SPierre Pronchery#endif
285*4757b351SPierre Pronchery	ret
286*4757b351SPierre Pronchery.size	_vpsm4_enc_4blks,.-_vpsm4_enc_4blks
// _vpsm4_enc_8blks: run 32 cipher rounds over two interleaved register sets.
//
// In:   x3      = address of 32 round-key words, consumed in order
//       v4-v7   = state words B0..B3 of the first set (four lanes each)
//       v8-v11  = state words B0..B3 of the second set (four lanes each);
//                 presumably 4 + 4 blocks, hence "8blks" - the loading is
//                 done by the caller and is not visible here
//       v16-v31 = the 256-byte .Lsbox table, already loaded by the caller
// Out:  v0-v3   = first set in reverse word order (v7,v6,v5,v4)
//       v4-v7   = second set in reverse word order (v11,v10,v9,v8)
//                 both byte-swapped within each 32-bit word unless
//                 __AARCH64EB__
// Clobbers: x7, x8, x10, x11, v0-v15, condition flags
// NOTE(review): v8-v15 are modified and not saved here; AAPCS64 makes
// their low halves callee-saved, so callers presumably preserve them -
// confirm against the call sites.
287*4757b351SPierre Pronchery.type	_vpsm4_enc_8blks,%function
288*4757b351SPierre Pronchery.align	4
289*4757b351SPierre Pronchery_vpsm4_enc_8blks:
290*4757b351SPierre Pronchery	AARCH64_VALID_CALL_TARGET
	// x10 = round-key cursor; w11 = 8 iterations of 4 rounds each
291*4757b351SPierre Pronchery	mov	x10,x3
292*4757b351SPierre Pronchery	mov	w11,#8
293*4757b351SPierre Pronchery10:
294*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
295*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	// v14/v15 keep B2^B3 of the first/second set for reuse next round;
	// v12/v13 become the lookup inputs of the first/second set.
296*4757b351SPierre Pronchery	dup	v12.4s,w7
297*4757b351SPierre Pronchery	eor	v14.16b,v6.16b,v7.16b
298*4757b351SPierre Pronchery	eor	v15.16b,v10.16b,v11.16b
299*4757b351SPierre Pronchery	eor	v0.16b,v5.16b,v12.16b
300*4757b351SPierre Pronchery	eor	v1.16b,v9.16b,v12.16b
301*4757b351SPierre Pronchery	eor	v12.16b,v14.16b,v0.16b
302*4757b351SPierre Pronchery	eor	v13.16b,v15.16b,v1.16b
	// Table lookup, first set: v0/v1/v2 = index-64/-128/-192 by repeated
	// subtraction of v3 (= 64 in every byte).  TBL returns 0 for
	// out-of-range indices, so the adds just merge the four partial results.
303*4757b351SPierre Pronchery	movi	v3.16b,#64
304*4757b351SPierre Pronchery	sub	v0.16b,v12.16b,v3.16b
305*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
306*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
307*4757b351SPierre Pronchery	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
308*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
309*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
310*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
311*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
312*4757b351SPierre Pronchery	add	v12.2d,v2.2d,v12.2d
313*4757b351SPierre Pronchery	add	v12.2d,v1.2d,v12.2d
314*4757b351SPierre Pronchery
	// Same lookup for the second set.
315*4757b351SPierre Pronchery	sub	v0.16b,v13.16b,v3.16b
316*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
317*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
318*4757b351SPierre Pronchery	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
319*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
320*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
321*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
322*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
323*4757b351SPierre Pronchery	add	v13.2d,v2.2d,v13.2d
324*4757b351SPierre Pronchery	add	v13.2d,v1.2d,v13.2d
325*4757b351SPierre Pronchery
	// Linear step for both sets, interleaved: each ushr/sli pair is a
	// per-lane rotate-left; v1 (first set) and v3 (second set) accumulate
	// x ^ rol(x,2) ^ rol(x,10) ^ rol(x,18), and rol(x,24) is folded in last.
	// From here on v3 no longer holds the constant 64.
326*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-2
327*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,2
328*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-2
329*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v12.16b
330*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,2
331*4757b351SPierre Pronchery
332*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-10
333*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v13.16b
334*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,10
335*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-10
336*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
337*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,10
338*4757b351SPierre Pronchery
339*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-18
340*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
341*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,18
342*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-18
343*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
344*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,18
345*4757b351SPierre Pronchery
346*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-24
347*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
348*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,24
349*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-24
350*4757b351SPierre Pronchery	eor	v12.16b,v0.16b,v1.16b
351*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,24
352*4757b351SPierre Pronchery	eor	v13.16b,v2.16b,v3.16b
353*4757b351SPierre Pronchery	eor	v4.16b,v4.16b,v12.16b
354*4757b351SPierre Pronchery	eor	v8.16b,v8.16b,v13.16b
355*4757b351SPierre Pronchery
356*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	// v14/v15 still hold B2^B3; fold in the new B0 and the second round key.
357*4757b351SPierre Pronchery	dup	v13.4s,w8
358*4757b351SPierre Pronchery	eor	v14.16b,v14.16b,v4.16b
359*4757b351SPierre Pronchery	eor	v15.16b,v15.16b,v8.16b
360*4757b351SPierre Pronchery	eor	v12.16b,v14.16b,v13.16b
361*4757b351SPierre Pronchery	eor	v13.16b,v15.16b,v13.16b
362*4757b351SPierre Pronchery	movi	v3.16b,#64
363*4757b351SPierre Pronchery	sub	v0.16b,v12.16b,v3.16b
364*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
365*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
366*4757b351SPierre Pronchery	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
367*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
368*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
369*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
370*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
371*4757b351SPierre Pronchery	add	v12.2d,v2.2d,v12.2d
372*4757b351SPierre Pronchery	add	v12.2d,v1.2d,v12.2d
373*4757b351SPierre Pronchery
374*4757b351SPierre Pronchery	sub	v0.16b,v13.16b,v3.16b
375*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
376*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
377*4757b351SPierre Pronchery	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
378*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
379*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
380*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
381*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
382*4757b351SPierre Pronchery	add	v13.2d,v2.2d,v13.2d
383*4757b351SPierre Pronchery	add	v13.2d,v1.2d,v13.2d
384*4757b351SPierre Pronchery
385*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-2
386*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,2
387*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-2
388*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v12.16b
389*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,2
390*4757b351SPierre Pronchery
391*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-10
392*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v13.16b
393*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,10
394*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-10
395*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
396*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,10
397*4757b351SPierre Pronchery
398*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-18
399*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
400*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,18
401*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-18
402*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
403*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,18
404*4757b351SPierre Pronchery
405*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-24
406*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
407*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,24
408*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-24
409*4757b351SPierre Pronchery	eor	v12.16b,v0.16b,v1.16b
410*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,24
411*4757b351SPierre Pronchery	eor	v13.16b,v2.16b,v3.16b
	// The next two round keys are fetched here, before B1 is updated.
412*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
413*4757b351SPierre Pronchery	eor	v5.16b,v5.16b,v12.16b
414*4757b351SPierre Pronchery	eor	v9.16b,v9.16b,v13.16b
415*4757b351SPierre Pronchery
416*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	// v14/v15 now keep B0^B1 for reuse in the following round.
417*4757b351SPierre Pronchery	dup	v12.4s,w7
418*4757b351SPierre Pronchery	eor	v14.16b,v4.16b,v5.16b
419*4757b351SPierre Pronchery	eor	v15.16b,v8.16b,v9.16b
420*4757b351SPierre Pronchery	eor	v0.16b,v7.16b,v12.16b
421*4757b351SPierre Pronchery	eor	v1.16b,v11.16b,v12.16b
422*4757b351SPierre Pronchery	eor	v12.16b,v14.16b,v0.16b
423*4757b351SPierre Pronchery	eor	v13.16b,v15.16b,v1.16b
424*4757b351SPierre Pronchery	movi	v3.16b,#64
425*4757b351SPierre Pronchery	sub	v0.16b,v12.16b,v3.16b
426*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
427*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
428*4757b351SPierre Pronchery	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
429*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
430*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
431*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
432*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
433*4757b351SPierre Pronchery	add	v12.2d,v2.2d,v12.2d
434*4757b351SPierre Pronchery	add	v12.2d,v1.2d,v12.2d
435*4757b351SPierre Pronchery
436*4757b351SPierre Pronchery	sub	v0.16b,v13.16b,v3.16b
437*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
438*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
439*4757b351SPierre Pronchery	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
440*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
441*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
442*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
443*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
444*4757b351SPierre Pronchery	add	v13.2d,v2.2d,v13.2d
445*4757b351SPierre Pronchery	add	v13.2d,v1.2d,v13.2d
446*4757b351SPierre Pronchery
447*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-2
448*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,2
449*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-2
450*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v12.16b
451*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,2
452*4757b351SPierre Pronchery
453*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-10
454*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v13.16b
455*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,10
456*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-10
457*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
458*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,10
459*4757b351SPierre Pronchery
460*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-18
461*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
462*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,18
463*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-18
464*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
465*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,18
466*4757b351SPierre Pronchery
467*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-24
468*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
469*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,24
470*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-24
471*4757b351SPierre Pronchery	eor	v12.16b,v0.16b,v1.16b
472*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,24
473*4757b351SPierre Pronchery	eor	v13.16b,v2.16b,v3.16b
474*4757b351SPierre Pronchery	eor	v6.16b,v6.16b,v12.16b
475*4757b351SPierre Pronchery	eor	v10.16b,v10.16b,v13.16b
476*4757b351SPierre Pronchery
477*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
478*4757b351SPierre Pronchery	dup	v13.4s,w8
479*4757b351SPierre Pronchery	eor	v14.16b,v14.16b,v6.16b
480*4757b351SPierre Pronchery	eor	v15.16b,v15.16b,v10.16b
481*4757b351SPierre Pronchery	eor	v12.16b,v14.16b,v13.16b
482*4757b351SPierre Pronchery	eor	v13.16b,v15.16b,v13.16b
483*4757b351SPierre Pronchery	movi	v3.16b,#64
484*4757b351SPierre Pronchery	sub	v0.16b,v12.16b,v3.16b
485*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
486*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
487*4757b351SPierre Pronchery	tbl	v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
488*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
489*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
490*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
491*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
492*4757b351SPierre Pronchery	add	v12.2d,v2.2d,v12.2d
493*4757b351SPierre Pronchery	add	v12.2d,v1.2d,v12.2d
494*4757b351SPierre Pronchery
495*4757b351SPierre Pronchery	sub	v0.16b,v13.16b,v3.16b
496*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v3.16b
497*4757b351SPierre Pronchery	sub	v2.16b,v1.16b,v3.16b
498*4757b351SPierre Pronchery	tbl	v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
499*4757b351SPierre Pronchery	tbl	v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
500*4757b351SPierre Pronchery	tbl	v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
501*4757b351SPierre Pronchery	tbl	v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
502*4757b351SPierre Pronchery	add	v1.2d,v0.2d,v1.2d
503*4757b351SPierre Pronchery	add	v13.2d,v2.2d,v13.2d
504*4757b351SPierre Pronchery	add	v13.2d,v1.2d,v13.2d
505*4757b351SPierre Pronchery
506*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-2
507*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,2
508*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-2
509*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v12.16b
510*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,2
511*4757b351SPierre Pronchery
512*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-10
513*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v13.16b
514*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,10
515*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-10
516*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
517*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,10
518*4757b351SPierre Pronchery
519*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-18
520*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
521*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,18
522*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-18
523*4757b351SPierre Pronchery	eor	v1.16b,v0.16b,v1.16b
524*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,18
525*4757b351SPierre Pronchery
526*4757b351SPierre Pronchery	ushr	v0.4s,v12.4s,32-24
527*4757b351SPierre Pronchery	eor	v3.16b,v2.16b,v3.16b
528*4757b351SPierre Pronchery	sli	v0.4s,v12.4s,24
529*4757b351SPierre Pronchery	ushr	v2.4s,v13.4s,32-24
530*4757b351SPierre Pronchery	eor	v12.16b,v0.16b,v1.16b
531*4757b351SPierre Pronchery	sli	v2.4s,v13.4s,24
532*4757b351SPierre Pronchery	eor	v13.16b,v2.16b,v3.16b
533*4757b351SPierre Pronchery	eor	v7.16b,v7.16b,v12.16b
534*4757b351SPierre Pronchery	eor	v11.16b,v11.16b,v13.16b
535*4757b351SPierre Pronchery	subs	w11,w11,#1
536*4757b351SPierre Pronchery	b.ne	10b
	// First set: v3..v0 = B0..B3 (byte-swapped per word on little-endian).
	// This must happen before v4-v7 are overwritten below.
537*4757b351SPierre Pronchery#ifndef __AARCH64EB__
538*4757b351SPierre Pronchery	rev32	v3.16b,v4.16b
539*4757b351SPierre Pronchery#else
540*4757b351SPierre Pronchery	mov	v3.16b,v4.16b
541*4757b351SPierre Pronchery#endif
542*4757b351SPierre Pronchery#ifndef __AARCH64EB__
543*4757b351SPierre Pronchery	rev32	v2.16b,v5.16b
544*4757b351SPierre Pronchery#else
545*4757b351SPierre Pronchery	mov	v2.16b,v5.16b
546*4757b351SPierre Pronchery#endif
547*4757b351SPierre Pronchery#ifndef __AARCH64EB__
548*4757b351SPierre Pronchery	rev32	v1.16b,v6.16b
549*4757b351SPierre Pronchery#else
550*4757b351SPierre Pronchery	mov	v1.16b,v6.16b
551*4757b351SPierre Pronchery#endif
552*4757b351SPierre Pronchery#ifndef __AARCH64EB__
553*4757b351SPierre Pronchery	rev32	v0.16b,v7.16b
554*4757b351SPierre Pronchery#else
555*4757b351SPierre Pronchery	mov	v0.16b,v7.16b
556*4757b351SPierre Pronchery#endif
	// Second set: v7..v4 = B0..B3 taken from v8..v11.
557*4757b351SPierre Pronchery#ifndef __AARCH64EB__
558*4757b351SPierre Pronchery	rev32	v7.16b,v8.16b
559*4757b351SPierre Pronchery#else
560*4757b351SPierre Pronchery	mov	v7.16b,v8.16b
561*4757b351SPierre Pronchery#endif
562*4757b351SPierre Pronchery#ifndef __AARCH64EB__
563*4757b351SPierre Pronchery	rev32	v6.16b,v9.16b
564*4757b351SPierre Pronchery#else
565*4757b351SPierre Pronchery	mov	v6.16b,v9.16b
566*4757b351SPierre Pronchery#endif
567*4757b351SPierre Pronchery#ifndef __AARCH64EB__
568*4757b351SPierre Pronchery	rev32	v5.16b,v10.16b
569*4757b351SPierre Pronchery#else
570*4757b351SPierre Pronchery	mov	v5.16b,v10.16b
571*4757b351SPierre Pronchery#endif
572*4757b351SPierre Pronchery#ifndef __AARCH64EB__
573*4757b351SPierre Pronchery	rev32	v4.16b,v11.16b
574*4757b351SPierre Pronchery#else
575*4757b351SPierre Pronchery	mov	v4.16b,v11.16b
576*4757b351SPierre Pronchery#endif
577*4757b351SPierre Pronchery	ret
578*4757b351SPierre Pronchery.size	_vpsm4_enc_8blks,.-_vpsm4_enc_8blks
// vpsm4_set_encrypt_key: exported entry point that expands a key with the
// direction flag w2 = 1, so _vpsm4_set_key stores the round keys in
// ascending order.
//
// In:   x0 = address of the 16-byte user key   (passed through unchanged)
//       x1 = address of the round-key buffer   (passed through unchanged)
// Presumably declared in C as taking (const unsigned char *key, <key
// schedule> *ks) - confirm against the C header; it is not part of this file.
// Clobbers everything _vpsm4_set_key clobbers, plus w2.
579*4757b351SPierre Pronchery.globl	vpsm4_set_encrypt_key
580*4757b351SPierre Pronchery.type	vpsm4_set_encrypt_key,%function
581*4757b351SPierre Pronchery.align	5
582*4757b351SPierre Proncheryvpsm4_set_encrypt_key:
	// Macro from arm_arch.h; judging by its name it signs the link
	// register (pointer authentication) - definition not in this file.
583*4757b351SPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
	// Save frame pointer and return address; x30 is overwritten by bl.
584*4757b351SPierre Pronchery	stp	x29,x30,[sp,#-16]!
585*4757b351SPierre Pronchery	mov	w2,1
586*4757b351SPierre Pronchery	bl	_vpsm4_set_key
587*4757b351SPierre Pronchery	ldp	x29,x30,[sp],#16
588*4757b351SPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
589*4757b351SPierre Pronchery	ret
590*4757b351SPierre Pronchery.size	vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key
// vpsm4_set_decrypt_key: exported wrapper that calls _vpsm4_set_key with w2 = 0.
// x0 and x1 are not touched here and reach _vpsm4_set_key unchanged.
// NOTE(review): presumably x0 = user key, x1 = round-key output and w2 = 0
// selects the decryption schedule; _vpsm4_set_key is outside this view - confirm.
// Stack: one 16-byte frame holding x29/x30, released before returning.
591*4757b351SPierre Pronchery.globl	vpsm4_set_decrypt_key
592*4757b351SPierre Pronchery.type	vpsm4_set_decrypt_key,%function
593*4757b351SPierre Pronchery.align	5
594*4757b351SPierre Proncheryvpsm4_set_decrypt_key:
595*4757b351SPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
	// Save x29/x30 because the bl below overwrites the link register.
596*4757b351SPierre Pronchery	stp	x29,x30,[sp,#-16]!
	// Third argument of the shared helper: 0 on this entry point.
597*4757b351SPierre Pronchery	mov	w2,0
598*4757b351SPierre Pronchery	bl	_vpsm4_set_key
599*4757b351SPierre Pronchery	ldp	x29,x30,[sp],#16
600*4757b351SPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
601*4757b351SPierre Pronchery	ret
602*4757b351SPierre Pronchery.size	vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key
// vpsm4_encrypt: run the SM4 round function over one 16-byte block.
// In:  x0 = address of the 16-byte input block (read with ld1)
//      x1 = address of the 16-byte output block (written with st1)
//      x2 = address of the round keys; the loop reads 8 x 4 = 32 words
// Writes x3, x6-x15, v0-v4, v16-v31 and the condition flags.
// Leaf routine: no stack use and no calls.
// State words B0..B3 live in w12..w15 for the whole loop.
603*4757b351SPierre Pronchery.globl	vpsm4_encrypt
604*4757b351SPierre Pronchery.type	vpsm4_encrypt,%function
605*4757b351SPierre Pronchery.align	5
606*4757b351SPierre Proncheryvpsm4_encrypt:
607*4757b351SPierre Pronchery	AARCH64_VALID_CALL_TARGET
608*4757b351SPierre Pronchery	ld1	{v4.4s},[x0]
	// Load the 256-byte table at .Lsbox into v16-v31, 64 bytes per ld1.
609*4757b351SPierre Pronchery	adrp	x10,.Lsbox
610*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
611*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
612*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
613*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
614*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
	// Little-endian builds reverse the bytes inside each 32-bit word.
615*4757b351SPierre Pronchery#ifndef __AARCH64EB__
616*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
617*4757b351SPierre Pronchery#endif
	// x10 = running round-key pointer (x3 keeps a copy of the base).
618*4757b351SPierre Pronchery	mov	x3,x2
619*4757b351SPierre Pronchery	mov	x10,x3
	// w11 = loop counter: 8 passes, each performing four rounds.
620*4757b351SPierre Pronchery	mov	w11,#8
621*4757b351SPierre Pronchery	mov	w12,v4.s[0]
622*4757b351SPierre Pronchery	mov	w13,v4.s[1]
623*4757b351SPierre Pronchery	mov	w14,v4.s[2]
624*4757b351SPierre Pronchery	mov	w15,v4.s[3]
625*4757b351SPierre Pronchery10:
	// w7, w8 = round keys for the B0 and B1 rounds.
626*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
627*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
628*4757b351SPierre Pronchery	eor	w6,w14,w15
629*4757b351SPierre Pronchery	eor	w9,w7,w13
630*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution of w6 through the 256-byte table. Each tbl covers
	// 64 table bytes and yields 0 for an out-of-range index, so the index
	// is rebased by 64/128/192 for the upper three quarters and the four
	// partial results are summed (at most one is non-zero per byte).
	// Only lane s[0] of v0-v3 is consumed afterwards.
631*4757b351SPierre Pronchery	movi	v1.16b,#64
632*4757b351SPierre Pronchery	movi	v2.16b,#128
633*4757b351SPierre Pronchery	movi	v3.16b,#192
634*4757b351SPierre Pronchery	mov	v0.s[0],w6
635*4757b351SPierre Pronchery
636*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
637*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
638*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
639*4757b351SPierre Pronchery
640*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
641*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
642*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
643*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
644*4757b351SPierre Pronchery
645*4757b351SPierre Pronchery	mov	w6,v0.s[0]
646*4757b351SPierre Pronchery	mov	w7,v1.s[0]
647*4757b351SPierre Pronchery	mov	w9,v2.s[0]
648*4757b351SPierre Pronchery	add	w7,w6,w7
649*4757b351SPierre Pronchery	mov	w6,v3.s[0]
650*4757b351SPierre Pronchery	add	w7,w7,w9
651*4757b351SPierre Pronchery	add	w7,w7,w6
652*4757b351SPierre Pronchery
	// w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24);
	// ror by 32-k is a left rotate by k.
653*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
654*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
655*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
656*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
657*4757b351SPierre Pronchery	eor	w12,w12,w6
658*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
659*4757b351SPierre Pronchery	eor	w6,w14,w15
660*4757b351SPierre Pronchery	eor	w9,w12,w8
661*4757b351SPierre Pronchery	eor	w6,w6,w9
662*4757b351SPierre Pronchery	movi	v1.16b,#64
663*4757b351SPierre Pronchery	movi	v2.16b,#128
664*4757b351SPierre Pronchery	movi	v3.16b,#192
665*4757b351SPierre Pronchery	mov	v0.s[0],w6
666*4757b351SPierre Pronchery
667*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
668*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
669*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
670*4757b351SPierre Pronchery
671*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
672*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
673*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
674*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
675*4757b351SPierre Pronchery
676*4757b351SPierre Pronchery	mov	w6,v0.s[0]
677*4757b351SPierre Pronchery	mov	w7,v1.s[0]
678*4757b351SPierre Pronchery	mov	w9,v2.s[0]
679*4757b351SPierre Pronchery	add	w7,w6,w7
680*4757b351SPierre Pronchery	mov	w6,v3.s[0]
681*4757b351SPierre Pronchery	add	w7,w7,w9
682*4757b351SPierre Pronchery	add	w7,w7,w6
683*4757b351SPierre Pronchery
684*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
685*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
686*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
687*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// w7, w8 = round keys for the B2 and B3 rounds; w7 is free again here.
688*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
689*4757b351SPierre Pronchery	eor	w13,w13,w6
690*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
691*4757b351SPierre Pronchery	eor	w6,w12,w13
692*4757b351SPierre Pronchery	eor	w9,w7,w15
693*4757b351SPierre Pronchery	eor	w6,w6,w9
694*4757b351SPierre Pronchery	movi	v1.16b,#64
695*4757b351SPierre Pronchery	movi	v2.16b,#128
696*4757b351SPierre Pronchery	movi	v3.16b,#192
697*4757b351SPierre Pronchery	mov	v0.s[0],w6
698*4757b351SPierre Pronchery
699*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
700*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
701*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
702*4757b351SPierre Pronchery
703*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
704*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
705*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
706*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
707*4757b351SPierre Pronchery
708*4757b351SPierre Pronchery	mov	w6,v0.s[0]
709*4757b351SPierre Pronchery	mov	w7,v1.s[0]
710*4757b351SPierre Pronchery	mov	w9,v2.s[0]
711*4757b351SPierre Pronchery	add	w7,w6,w7
712*4757b351SPierre Pronchery	mov	w6,v3.s[0]
713*4757b351SPierre Pronchery	add	w7,w7,w9
714*4757b351SPierre Pronchery	add	w7,w7,w6
715*4757b351SPierre Pronchery
716*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
717*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
718*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
719*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
720*4757b351SPierre Pronchery	eor	w14,w14,w6
721*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
722*4757b351SPierre Pronchery	eor	w6,w12,w13
723*4757b351SPierre Pronchery	eor	w9,w14,w8
724*4757b351SPierre Pronchery	eor	w6,w6,w9
725*4757b351SPierre Pronchery	movi	v1.16b,#64
726*4757b351SPierre Pronchery	movi	v2.16b,#128
727*4757b351SPierre Pronchery	movi	v3.16b,#192
728*4757b351SPierre Pronchery	mov	v0.s[0],w6
729*4757b351SPierre Pronchery
730*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
731*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
732*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
733*4757b351SPierre Pronchery
734*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
735*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
736*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
737*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
738*4757b351SPierre Pronchery
739*4757b351SPierre Pronchery	mov	w6,v0.s[0]
740*4757b351SPierre Pronchery	mov	w7,v1.s[0]
741*4757b351SPierre Pronchery	mov	w9,v2.s[0]
742*4757b351SPierre Pronchery	add	w7,w6,w7
743*4757b351SPierre Pronchery	mov	w6,v3.s[0]
744*4757b351SPierre Pronchery	add	w7,w7,w9
745*4757b351SPierre Pronchery	add	w7,w7,w6
746*4757b351SPierre Pronchery
747*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
748*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
749*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
750*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
751*4757b351SPierre Pronchery	eor	w15,w15,w6
752*4757b351SPierre Pronchery	subs	w11,w11,#1
753*4757b351SPierre Pronchery	b.ne	10b
	// Reassemble the block with the four state words in reverse order.
754*4757b351SPierre Pronchery	mov	v4.s[0],w15
755*4757b351SPierre Pronchery	mov	v4.s[1],w14
756*4757b351SPierre Pronchery	mov	v4.s[2],w13
757*4757b351SPierre Pronchery	mov	v4.s[3],w12
758*4757b351SPierre Pronchery#ifndef __AARCH64EB__
759*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
760*4757b351SPierre Pronchery#endif
761*4757b351SPierre Pronchery	st1	{v4.4s},[x1]
762*4757b351SPierre Pronchery	ret
763*4757b351SPierre Pronchery.size	vpsm4_encrypt,.-vpsm4_encrypt
// vpsm4_decrypt: run the SM4 round function over one 16-byte block.
// The instruction sequence is the same as in vpsm4_encrypt; nothing in this
// routine distinguishes the direction.
// NOTE(review): presumably the caller passes round keys prepared by
// vpsm4_set_decrypt_key so that the rounds undo encryption - confirm against
// _vpsm4_set_key, which is outside this view.
// In:  x0 = address of the 16-byte input block (read with ld1)
//      x1 = address of the 16-byte output block (written with st1)
//      x2 = address of the round keys; the loop reads 8 x 4 = 32 words
// Writes x3, x6-x15, v0-v4, v16-v31 and the condition flags.
// Leaf routine: no stack use and no calls.
// State words B0..B3 live in w12..w15 for the whole loop.
764*4757b351SPierre Pronchery.globl	vpsm4_decrypt
765*4757b351SPierre Pronchery.type	vpsm4_decrypt,%function
766*4757b351SPierre Pronchery.align	5
767*4757b351SPierre Proncheryvpsm4_decrypt:
768*4757b351SPierre Pronchery	AARCH64_VALID_CALL_TARGET
769*4757b351SPierre Pronchery	ld1	{v4.4s},[x0]
	// Load the 256-byte table at .Lsbox into v16-v31, 64 bytes per ld1.
770*4757b351SPierre Pronchery	adrp	x10,.Lsbox
771*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
772*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
773*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
774*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
775*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
	// Little-endian builds reverse the bytes inside each 32-bit word.
776*4757b351SPierre Pronchery#ifndef __AARCH64EB__
777*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
778*4757b351SPierre Pronchery#endif
	// x10 = running round-key pointer (x3 keeps a copy of the base).
779*4757b351SPierre Pronchery	mov	x3,x2
780*4757b351SPierre Pronchery	mov	x10,x3
	// w11 = loop counter: 8 passes, each performing four rounds.
781*4757b351SPierre Pronchery	mov	w11,#8
782*4757b351SPierre Pronchery	mov	w12,v4.s[0]
783*4757b351SPierre Pronchery	mov	w13,v4.s[1]
784*4757b351SPierre Pronchery	mov	w14,v4.s[2]
785*4757b351SPierre Pronchery	mov	w15,v4.s[3]
786*4757b351SPierre Pronchery10:
	// w7, w8 = round keys for the B0 and B1 rounds.
787*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
788*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
789*4757b351SPierre Pronchery	eor	w6,w14,w15
790*4757b351SPierre Pronchery	eor	w9,w7,w13
791*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution of w6 through the 256-byte table. Each tbl covers
	// 64 table bytes and yields 0 for an out-of-range index, so the index
	// is rebased by 64/128/192 for the upper three quarters and the four
	// partial results are summed (at most one is non-zero per byte).
	// Only lane s[0] of v0-v3 is consumed afterwards.
792*4757b351SPierre Pronchery	movi	v1.16b,#64
793*4757b351SPierre Pronchery	movi	v2.16b,#128
794*4757b351SPierre Pronchery	movi	v3.16b,#192
795*4757b351SPierre Pronchery	mov	v0.s[0],w6
796*4757b351SPierre Pronchery
797*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
798*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
799*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
800*4757b351SPierre Pronchery
801*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
802*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
803*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
804*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
805*4757b351SPierre Pronchery
806*4757b351SPierre Pronchery	mov	w6,v0.s[0]
807*4757b351SPierre Pronchery	mov	w7,v1.s[0]
808*4757b351SPierre Pronchery	mov	w9,v2.s[0]
809*4757b351SPierre Pronchery	add	w7,w6,w7
810*4757b351SPierre Pronchery	mov	w6,v3.s[0]
811*4757b351SPierre Pronchery	add	w7,w7,w9
812*4757b351SPierre Pronchery	add	w7,w7,w6
813*4757b351SPierre Pronchery
	// w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24);
	// ror by 32-k is a left rotate by k.
814*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
815*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
816*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
817*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
818*4757b351SPierre Pronchery	eor	w12,w12,w6
819*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
820*4757b351SPierre Pronchery	eor	w6,w14,w15
821*4757b351SPierre Pronchery	eor	w9,w12,w8
822*4757b351SPierre Pronchery	eor	w6,w6,w9
823*4757b351SPierre Pronchery	movi	v1.16b,#64
824*4757b351SPierre Pronchery	movi	v2.16b,#128
825*4757b351SPierre Pronchery	movi	v3.16b,#192
826*4757b351SPierre Pronchery	mov	v0.s[0],w6
827*4757b351SPierre Pronchery
828*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
829*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
830*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
831*4757b351SPierre Pronchery
832*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
833*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
834*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
835*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
836*4757b351SPierre Pronchery
837*4757b351SPierre Pronchery	mov	w6,v0.s[0]
838*4757b351SPierre Pronchery	mov	w7,v1.s[0]
839*4757b351SPierre Pronchery	mov	w9,v2.s[0]
840*4757b351SPierre Pronchery	add	w7,w6,w7
841*4757b351SPierre Pronchery	mov	w6,v3.s[0]
842*4757b351SPierre Pronchery	add	w7,w7,w9
843*4757b351SPierre Pronchery	add	w7,w7,w6
844*4757b351SPierre Pronchery
845*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
846*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
847*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
848*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// w7, w8 = round keys for the B2 and B3 rounds; w7 is free again here.
849*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
850*4757b351SPierre Pronchery	eor	w13,w13,w6
851*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
852*4757b351SPierre Pronchery	eor	w6,w12,w13
853*4757b351SPierre Pronchery	eor	w9,w7,w15
854*4757b351SPierre Pronchery	eor	w6,w6,w9
855*4757b351SPierre Pronchery	movi	v1.16b,#64
856*4757b351SPierre Pronchery	movi	v2.16b,#128
857*4757b351SPierre Pronchery	movi	v3.16b,#192
858*4757b351SPierre Pronchery	mov	v0.s[0],w6
859*4757b351SPierre Pronchery
860*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
861*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
862*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
863*4757b351SPierre Pronchery
864*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
865*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
866*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
867*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
868*4757b351SPierre Pronchery
869*4757b351SPierre Pronchery	mov	w6,v0.s[0]
870*4757b351SPierre Pronchery	mov	w7,v1.s[0]
871*4757b351SPierre Pronchery	mov	w9,v2.s[0]
872*4757b351SPierre Pronchery	add	w7,w6,w7
873*4757b351SPierre Pronchery	mov	w6,v3.s[0]
874*4757b351SPierre Pronchery	add	w7,w7,w9
875*4757b351SPierre Pronchery	add	w7,w7,w6
876*4757b351SPierre Pronchery
877*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
878*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
879*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
880*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
881*4757b351SPierre Pronchery	eor	w14,w14,w6
882*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
883*4757b351SPierre Pronchery	eor	w6,w12,w13
884*4757b351SPierre Pronchery	eor	w9,w14,w8
885*4757b351SPierre Pronchery	eor	w6,w6,w9
886*4757b351SPierre Pronchery	movi	v1.16b,#64
887*4757b351SPierre Pronchery	movi	v2.16b,#128
888*4757b351SPierre Pronchery	movi	v3.16b,#192
889*4757b351SPierre Pronchery	mov	v0.s[0],w6
890*4757b351SPierre Pronchery
891*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
892*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
893*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
894*4757b351SPierre Pronchery
895*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
896*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
897*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
898*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
899*4757b351SPierre Pronchery
900*4757b351SPierre Pronchery	mov	w6,v0.s[0]
901*4757b351SPierre Pronchery	mov	w7,v1.s[0]
902*4757b351SPierre Pronchery	mov	w9,v2.s[0]
903*4757b351SPierre Pronchery	add	w7,w6,w7
904*4757b351SPierre Pronchery	mov	w6,v3.s[0]
905*4757b351SPierre Pronchery	add	w7,w7,w9
906*4757b351SPierre Pronchery	add	w7,w7,w6
907*4757b351SPierre Pronchery
908*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
909*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
910*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
911*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
912*4757b351SPierre Pronchery	eor	w15,w15,w6
913*4757b351SPierre Pronchery	subs	w11,w11,#1
914*4757b351SPierre Pronchery	b.ne	10b
	// Reassemble the block with the four state words in reverse order.
915*4757b351SPierre Pronchery	mov	v4.s[0],w15
916*4757b351SPierre Pronchery	mov	v4.s[1],w14
917*4757b351SPierre Pronchery	mov	v4.s[2],w13
918*4757b351SPierre Pronchery	mov	v4.s[3],w12
919*4757b351SPierre Pronchery#ifndef __AARCH64EB__
920*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
921*4757b351SPierre Pronchery#endif
922*4757b351SPierre Pronchery	st1	{v4.4s},[x1]
923*4757b351SPierre Pronchery	ret
924*4757b351SPierre Pronchery.size	vpsm4_decrypt,.-vpsm4_decrypt
// vpsm4_ecb_encrypt: process a buffer 16 bytes at a time; no block feeds
// into another.
// In:  x0 = input address (advanced as blocks are consumed)
//      x1 = output address (advanced as blocks are produced)
//      x2 = length in bytes; shifted right by 4 to get the block count, so
//           a trailing partial block is ignored
//      x3 = round key address (read directly on the single-block path)
// NOTE(review): x3 is presumably also consumed by _vpsm4_enc_8blks and
// _vpsm4_enc_4blks, whose bodies are not fully visible here - confirm.
// NOTE(review): after the shift only the low 32 bits of the block count are
// examined (cmp/subs on w2) and the branches use signed conditions.
// Stack: 80-byte frame saving d8-d15 and x29/x30.
// Dispatch: groups of 8 blocks, then one group of 4, then a 1-, 2- or
// 3-block tail; label 100 is the common exit.
925*4757b351SPierre Pronchery.globl	vpsm4_ecb_encrypt
926*4757b351SPierre Pronchery.type	vpsm4_ecb_encrypt,%function
927*4757b351SPierre Pronchery.align	5
928*4757b351SPierre Proncheryvpsm4_ecb_encrypt:
929*4757b351SPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
930*4757b351SPierre Pronchery	// convert length into blocks
931*4757b351SPierre Pronchery	lsr	x2,x2,4
	// Frame layout: d8,d9 at 0; d10,d11 at 16; d12,d13 at 32; d14,d15 at 48;
	// x29,x30 at 64.
932*4757b351SPierre Pronchery	stp	d8,d9,[sp,#-80]!
933*4757b351SPierre Pronchery	stp	d10,d11,[sp,#16]
934*4757b351SPierre Pronchery	stp	d12,d13,[sp,#32]
935*4757b351SPierre Pronchery	stp	d14,d15,[sp,#48]
936*4757b351SPierre Pronchery	stp	x29,x30,[sp,#64]
	// Load the 256-byte table at .Lsbox into v16-v31, 64 bytes per ld1.
937*4757b351SPierre Pronchery	adrp	x10,.Lsbox
938*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
939*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
940*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
941*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
942*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
943*4757b351SPierre Pronchery.Lecb_8_blocks_process:
944*4757b351SPierre Pronchery	cmp	w2,#8
945*4757b351SPierre Pronchery	b.lt	.Lecb_4_blocks_process
	// ld4 de-interleaves: each register receives the same word position
	// from four consecutive blocks (v4-v7 for blocks 0-3, v8-v11 for 4-7).
946*4757b351SPierre Pronchery	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
947*4757b351SPierre Pronchery	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	// Little-endian builds reverse the bytes inside each 32-bit word.
948*4757b351SPierre Pronchery#ifndef __AARCH64EB__
949*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
950*4757b351SPierre Pronchery#endif
951*4757b351SPierre Pronchery#ifndef __AARCH64EB__
952*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
953*4757b351SPierre Pronchery#endif
954*4757b351SPierre Pronchery#ifndef __AARCH64EB__
955*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
956*4757b351SPierre Pronchery#endif
957*4757b351SPierre Pronchery#ifndef __AARCH64EB__
958*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
959*4757b351SPierre Pronchery#endif
960*4757b351SPierre Pronchery#ifndef __AARCH64EB__
961*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
962*4757b351SPierre Pronchery#endif
963*4757b351SPierre Pronchery#ifndef __AARCH64EB__
964*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
965*4757b351SPierre Pronchery#endif
966*4757b351SPierre Pronchery#ifndef __AARCH64EB__
967*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
968*4757b351SPierre Pronchery#endif
969*4757b351SPierre Pronchery#ifndef __AARCH64EB__
970*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
971*4757b351SPierre Pronchery#endif
972*4757b351SPierre Pronchery	bl	_vpsm4_enc_8blks
	// The eight results are stored from v0-v3 and v4-v7; st4 re-interleaves
	// the words back into block order.
973*4757b351SPierre Pronchery	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
974*4757b351SPierre Pronchery	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	// Blocks left: loop while any remain, otherwise leave through label 100.
975*4757b351SPierre Pronchery	subs	w2,w2,#8
976*4757b351SPierre Pronchery	b.gt	.Lecb_8_blocks_process
977*4757b351SPierre Pronchery	b	100f
978*4757b351SPierre Pronchery.Lecb_4_blocks_process:
	// Fewer than 8 blocks remain here; take a group of 4 if available.
979*4757b351SPierre Pronchery	cmp	w2,#4
980*4757b351SPierre Pronchery	b.lt	1f
981*4757b351SPierre Pronchery	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
982*4757b351SPierre Pronchery#ifndef __AARCH64EB__
983*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
984*4757b351SPierre Pronchery#endif
985*4757b351SPierre Pronchery#ifndef __AARCH64EB__
986*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
987*4757b351SPierre Pronchery#endif
988*4757b351SPierre Pronchery#ifndef __AARCH64EB__
989*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
990*4757b351SPierre Pronchery#endif
991*4757b351SPierre Pronchery#ifndef __AARCH64EB__
992*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
993*4757b351SPierre Pronchery#endif
994*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
995*4757b351SPierre Pronchery	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
996*4757b351SPierre Pronchery	sub	w2,w2,#4
997*4757b351SPierre Pronchery1:
998*4757b351SPierre Pronchery	// process last block
	// Three-way split on the remaining count: none left exits through
	// label 100, more than one goes to the 2/3-block tail, exactly one
	// falls through to the scalar single-block rounds below.
999*4757b351SPierre Pronchery	cmp	w2,#1
1000*4757b351SPierre Pronchery	b.lt	100f
1001*4757b351SPierre Pronchery	b.gt	1f
1002*4757b351SPierre Pronchery	ld1	{v4.4s},[x0]
1003*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1004*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
1005*4757b351SPierre Pronchery#endif
	// x10 = running round-key pointer; w11 = 8 passes of four rounds;
	// state words B0..B3 live in w12..w15.
1006*4757b351SPierre Pronchery	mov	x10,x3
1007*4757b351SPierre Pronchery	mov	w11,#8
1008*4757b351SPierre Pronchery	mov	w12,v4.s[0]
1009*4757b351SPierre Pronchery	mov	w13,v4.s[1]
1010*4757b351SPierre Pronchery	mov	w14,v4.s[2]
1011*4757b351SPierre Pronchery	mov	w15,v4.s[3]
1012*4757b351SPierre Pronchery10:
1013*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1014*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
1015*4757b351SPierre Pronchery	eor	w6,w14,w15
1016*4757b351SPierre Pronchery	eor	w9,w7,w13
1017*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution of w6 through the 256-byte table. Each tbl covers
	// 64 table bytes and yields 0 for an out-of-range index, so the index
	// is rebased by 64/128/192 for the upper three quarters and the four
	// partial results are summed (at most one is non-zero per byte).
	// Only lane s[0] of v0-v3 is consumed afterwards.
1018*4757b351SPierre Pronchery	movi	v1.16b,#64
1019*4757b351SPierre Pronchery	movi	v2.16b,#128
1020*4757b351SPierre Pronchery	movi	v3.16b,#192
1021*4757b351SPierre Pronchery	mov	v0.s[0],w6
1022*4757b351SPierre Pronchery
1023*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1024*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1025*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1026*4757b351SPierre Pronchery
1027*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1028*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1029*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1030*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1031*4757b351SPierre Pronchery
1032*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1033*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1034*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1035*4757b351SPierre Pronchery	add	w7,w6,w7
1036*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1037*4757b351SPierre Pronchery	add	w7,w7,w9
1038*4757b351SPierre Pronchery	add	w7,w7,w6
1039*4757b351SPierre Pronchery
	// w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24);
	// ror by 32-k is a left rotate by k.
1040*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1041*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1042*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1043*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1044*4757b351SPierre Pronchery	eor	w12,w12,w6
1045*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
1046*4757b351SPierre Pronchery	eor	w6,w14,w15
1047*4757b351SPierre Pronchery	eor	w9,w12,w8
1048*4757b351SPierre Pronchery	eor	w6,w6,w9
1049*4757b351SPierre Pronchery	movi	v1.16b,#64
1050*4757b351SPierre Pronchery	movi	v2.16b,#128
1051*4757b351SPierre Pronchery	movi	v3.16b,#192
1052*4757b351SPierre Pronchery	mov	v0.s[0],w6
1053*4757b351SPierre Pronchery
1054*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1055*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1056*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1057*4757b351SPierre Pronchery
1058*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1059*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1060*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1061*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1062*4757b351SPierre Pronchery
1063*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1064*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1065*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1066*4757b351SPierre Pronchery	add	w7,w6,w7
1067*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1068*4757b351SPierre Pronchery	add	w7,w7,w9
1069*4757b351SPierre Pronchery	add	w7,w7,w6
1070*4757b351SPierre Pronchery
1071*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1072*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1073*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1074*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// w7, w8 = round keys for the B2 and B3 rounds; w7 is free again here.
1075*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1076*4757b351SPierre Pronchery	eor	w13,w13,w6
1077*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
1078*4757b351SPierre Pronchery	eor	w6,w12,w13
1079*4757b351SPierre Pronchery	eor	w9,w7,w15
1080*4757b351SPierre Pronchery	eor	w6,w6,w9
1081*4757b351SPierre Pronchery	movi	v1.16b,#64
1082*4757b351SPierre Pronchery	movi	v2.16b,#128
1083*4757b351SPierre Pronchery	movi	v3.16b,#192
1084*4757b351SPierre Pronchery	mov	v0.s[0],w6
1085*4757b351SPierre Pronchery
1086*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1087*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1088*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1089*4757b351SPierre Pronchery
1090*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1091*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1092*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1093*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1094*4757b351SPierre Pronchery
1095*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1096*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1097*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1098*4757b351SPierre Pronchery	add	w7,w6,w7
1099*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1100*4757b351SPierre Pronchery	add	w7,w7,w9
1101*4757b351SPierre Pronchery	add	w7,w7,w6
1102*4757b351SPierre Pronchery
1103*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1104*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1105*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1106*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1107*4757b351SPierre Pronchery	eor	w14,w14,w6
1108*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
1109*4757b351SPierre Pronchery	eor	w6,w12,w13
1110*4757b351SPierre Pronchery	eor	w9,w14,w8
1111*4757b351SPierre Pronchery	eor	w6,w6,w9
1112*4757b351SPierre Pronchery	movi	v1.16b,#64
1113*4757b351SPierre Pronchery	movi	v2.16b,#128
1114*4757b351SPierre Pronchery	movi	v3.16b,#192
1115*4757b351SPierre Pronchery	mov	v0.s[0],w6
1116*4757b351SPierre Pronchery
1117*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1118*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1119*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1120*4757b351SPierre Pronchery
1121*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1122*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1123*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1124*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1125*4757b351SPierre Pronchery
1126*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1127*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1128*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1129*4757b351SPierre Pronchery	add	w7,w6,w7
1130*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1131*4757b351SPierre Pronchery	add	w7,w7,w9
1132*4757b351SPierre Pronchery	add	w7,w7,w6
1133*4757b351SPierre Pronchery
1134*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1135*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1136*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1137*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1138*4757b351SPierre Pronchery	eor	w15,w15,w6
1139*4757b351SPierre Pronchery	subs	w11,w11,#1
1140*4757b351SPierre Pronchery	b.ne	10b
	// Reassemble the block with the four state words in reverse order.
1141*4757b351SPierre Pronchery	mov	v4.s[0],w15
1142*4757b351SPierre Pronchery	mov	v4.s[1],w14
1143*4757b351SPierre Pronchery	mov	v4.s[2],w13
1144*4757b351SPierre Pronchery	mov	v4.s[3],w12
1145*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1146*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
1147*4757b351SPierre Pronchery#endif
1148*4757b351SPierre Pronchery	st1	{v4.4s},[x1]
1149*4757b351SPierre Pronchery	b	100f
1150*4757b351SPierre Pronchery1:	//	process last 2 blocks
	// Single-lane ld4 places block 0 in lane 0 and block 1 in lane 1 of
	// v4-v7; the other lanes are not loaded on this path.
1151*4757b351SPierre Pronchery	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
1152*4757b351SPierre Pronchery	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
1153*4757b351SPierre Pronchery	cmp	w2,#2
1154*4757b351SPierre Pronchery	b.gt	1f
1155*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1156*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
1157*4757b351SPierre Pronchery#endif
1158*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1159*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
1160*4757b351SPierre Pronchery#endif
1161*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1162*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
1163*4757b351SPierre Pronchery#endif
1164*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1165*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
1166*4757b351SPierre Pronchery#endif
	// The 4-block helper is reused; only lanes 0 and 1 are written out.
1167*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
1168*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
1169*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1]
1170*4757b351SPierre Pronchery	b	100f
1171*4757b351SPierre Pronchery1:	//	process last 3 blocks
	// Lanes 0 and 1 were already loaded above; add the third block in lane 2.
1172*4757b351SPierre Pronchery	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
1173*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1174*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
1175*4757b351SPierre Pronchery#endif
1176*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1177*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
1178*4757b351SPierre Pronchery#endif
1179*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1180*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
1181*4757b351SPierre Pronchery#endif
1182*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1183*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
1184*4757b351SPierre Pronchery#endif
	// The 4-block helper is reused; only lanes 0, 1 and 2 are written out.
1185*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
1186*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
1187*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
1188*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1]
1189*4757b351SPierre Pronchery100:
	// Common exit: restore d8-d15 and x29/x30, then release the 80-byte frame.
1190*4757b351SPierre Pronchery	ldp	d10,d11,[sp,#16]
1191*4757b351SPierre Pronchery	ldp	d12,d13,[sp,#32]
1192*4757b351SPierre Pronchery	ldp	d14,d15,[sp,#48]
1193*4757b351SPierre Pronchery	ldp	x29,x30,[sp,#64]
1194*4757b351SPierre Pronchery	ldp	d8,d9,[sp],#80
1195*4757b351SPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
1196*4757b351SPierre Pronchery	ret
1197*4757b351SPierre Pronchery.size	vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt
1198*4757b351SPierre Pronchery.globl	vpsm4_cbc_encrypt
1199*4757b351SPierre Pronchery.type	vpsm4_cbc_encrypt,%function
1200*4757b351SPierre Pronchery.align	5
1201*4757b351SPierre Proncheryvpsm4_cbc_encrypt:
1202*4757b351SPierre Pronchery	AARCH64_VALID_CALL_TARGET
1203*4757b351SPierre Pronchery	lsr	x2,x2,4
1204*4757b351SPierre Pronchery	adrp	x10,.Lsbox
1205*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
1206*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
1207*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
1208*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
1209*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
1210*4757b351SPierre Pronchery	cbz	w5,.Ldec
1211*4757b351SPierre Pronchery	ld1	{v3.4s},[x4]
1212*4757b351SPierre Pronchery.Lcbc_4_blocks_enc:
1213*4757b351SPierre Pronchery	cmp	w2,#4
1214*4757b351SPierre Pronchery	b.lt	1f
1215*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
1216*4757b351SPierre Pronchery	eor	v4.16b,v4.16b,v3.16b
1217*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1218*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
1219*4757b351SPierre Pronchery#endif
1220*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1221*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
1222*4757b351SPierre Pronchery#endif
1223*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1224*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
1225*4757b351SPierre Pronchery#endif
1226*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1227*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
1228*4757b351SPierre Pronchery#endif
1229*4757b351SPierre Pronchery	mov	x10,x3
1230*4757b351SPierre Pronchery	mov	w11,#8
1231*4757b351SPierre Pronchery	mov	w12,v4.s[0]
1232*4757b351SPierre Pronchery	mov	w13,v4.s[1]
1233*4757b351SPierre Pronchery	mov	w14,v4.s[2]
1234*4757b351SPierre Pronchery	mov	w15,v4.s[3]
1235*4757b351SPierre Pronchery10:
1236*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1237*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
1238*4757b351SPierre Pronchery	eor	w6,w14,w15
1239*4757b351SPierre Pronchery	eor	w9,w7,w13
1240*4757b351SPierre Pronchery	eor	w6,w6,w9
1241*4757b351SPierre Pronchery	movi	v1.16b,#64
1242*4757b351SPierre Pronchery	movi	v2.16b,#128
1243*4757b351SPierre Pronchery	movi	v3.16b,#192
1244*4757b351SPierre Pronchery	mov	v0.s[0],w6
1245*4757b351SPierre Pronchery
1246*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1247*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1248*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1249*4757b351SPierre Pronchery
1250*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1251*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1252*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1253*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1254*4757b351SPierre Pronchery
1255*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1256*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1257*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1258*4757b351SPierre Pronchery	add	w7,w6,w7
1259*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1260*4757b351SPierre Pronchery	add	w7,w7,w9
1261*4757b351SPierre Pronchery	add	w7,w7,w6
1262*4757b351SPierre Pronchery
1263*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1264*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1265*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1266*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1267*4757b351SPierre Pronchery	eor	w12,w12,w6
1268*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
1269*4757b351SPierre Pronchery	eor	w6,w14,w15
1270*4757b351SPierre Pronchery	eor	w9,w12,w8
1271*4757b351SPierre Pronchery	eor	w6,w6,w9
1272*4757b351SPierre Pronchery	movi	v1.16b,#64
1273*4757b351SPierre Pronchery	movi	v2.16b,#128
1274*4757b351SPierre Pronchery	movi	v3.16b,#192
1275*4757b351SPierre Pronchery	mov	v0.s[0],w6
1276*4757b351SPierre Pronchery
1277*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1278*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1279*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1280*4757b351SPierre Pronchery
1281*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1282*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1283*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1284*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1285*4757b351SPierre Pronchery
1286*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1287*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1288*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1289*4757b351SPierre Pronchery	add	w7,w6,w7
1290*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1291*4757b351SPierre Pronchery	add	w7,w7,w9
1292*4757b351SPierre Pronchery	add	w7,w7,w6
1293*4757b351SPierre Pronchery
1294*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1295*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1296*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1297*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1298*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1299*4757b351SPierre Pronchery	eor	w13,w13,w6
1300*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
1301*4757b351SPierre Pronchery	eor	w6,w12,w13
1302*4757b351SPierre Pronchery	eor	w9,w7,w15
1303*4757b351SPierre Pronchery	eor	w6,w6,w9
1304*4757b351SPierre Pronchery	movi	v1.16b,#64
1305*4757b351SPierre Pronchery	movi	v2.16b,#128
1306*4757b351SPierre Pronchery	movi	v3.16b,#192
1307*4757b351SPierre Pronchery	mov	v0.s[0],w6
1308*4757b351SPierre Pronchery
1309*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1310*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1311*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1312*4757b351SPierre Pronchery
1313*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1314*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1315*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1316*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1317*4757b351SPierre Pronchery
1318*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1319*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1320*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1321*4757b351SPierre Pronchery	add	w7,w6,w7
1322*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1323*4757b351SPierre Pronchery	add	w7,w7,w9
1324*4757b351SPierre Pronchery	add	w7,w7,w6
1325*4757b351SPierre Pronchery
1326*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1327*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1328*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1329*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1330*4757b351SPierre Pronchery	eor	w14,w14,w6
1331*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
1332*4757b351SPierre Pronchery	eor	w6,w12,w13
1333*4757b351SPierre Pronchery	eor	w9,w14,w8
1334*4757b351SPierre Pronchery	eor	w6,w6,w9
1335*4757b351SPierre Pronchery	movi	v1.16b,#64
1336*4757b351SPierre Pronchery	movi	v2.16b,#128
1337*4757b351SPierre Pronchery	movi	v3.16b,#192
1338*4757b351SPierre Pronchery	mov	v0.s[0],w6
1339*4757b351SPierre Pronchery
1340*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1341*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1342*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1343*4757b351SPierre Pronchery
1344*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1345*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1346*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1347*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1348*4757b351SPierre Pronchery
1349*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1350*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1351*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1352*4757b351SPierre Pronchery	add	w7,w6,w7
1353*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1354*4757b351SPierre Pronchery	add	w7,w7,w9
1355*4757b351SPierre Pronchery	add	w7,w7,w6
1356*4757b351SPierre Pronchery
1357*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1358*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1359*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1360*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1361*4757b351SPierre Pronchery	eor	w15,w15,w6
1362*4757b351SPierre Pronchery	subs	w11,w11,#1
1363*4757b351SPierre Pronchery	b.ne	10b
1364*4757b351SPierre Pronchery	mov	v4.s[0],w15
1365*4757b351SPierre Pronchery	mov	v4.s[1],w14
1366*4757b351SPierre Pronchery	mov	v4.s[2],w13
1367*4757b351SPierre Pronchery	mov	v4.s[3],w12
1368*4757b351SPierre Pronchery	eor	v5.16b,v5.16b,v4.16b
1369*4757b351SPierre Pronchery	mov	x10,x3
1370*4757b351SPierre Pronchery	mov	w11,#8
1371*4757b351SPierre Pronchery	mov	w12,v5.s[0]
1372*4757b351SPierre Pronchery	mov	w13,v5.s[1]
1373*4757b351SPierre Pronchery	mov	w14,v5.s[2]
1374*4757b351SPierre Pronchery	mov	w15,v5.s[3]
1375*4757b351SPierre Pronchery10:
1376*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1377*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
1378*4757b351SPierre Pronchery	eor	w6,w14,w15
1379*4757b351SPierre Pronchery	eor	w9,w7,w13
1380*4757b351SPierre Pronchery	eor	w6,w6,w9
1381*4757b351SPierre Pronchery	movi	v1.16b,#64
1382*4757b351SPierre Pronchery	movi	v2.16b,#128
1383*4757b351SPierre Pronchery	movi	v3.16b,#192
1384*4757b351SPierre Pronchery	mov	v0.s[0],w6
1385*4757b351SPierre Pronchery
1386*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1387*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1388*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1389*4757b351SPierre Pronchery
1390*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1391*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1392*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1393*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1394*4757b351SPierre Pronchery
1395*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1396*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1397*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1398*4757b351SPierre Pronchery	add	w7,w6,w7
1399*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1400*4757b351SPierre Pronchery	add	w7,w7,w9
1401*4757b351SPierre Pronchery	add	w7,w7,w6
1402*4757b351SPierre Pronchery
1403*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1404*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1405*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1406*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1407*4757b351SPierre Pronchery	eor	w12,w12,w6
1408*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
1409*4757b351SPierre Pronchery	eor	w6,w14,w15
1410*4757b351SPierre Pronchery	eor	w9,w12,w8
1411*4757b351SPierre Pronchery	eor	w6,w6,w9
1412*4757b351SPierre Pronchery	movi	v1.16b,#64
1413*4757b351SPierre Pronchery	movi	v2.16b,#128
1414*4757b351SPierre Pronchery	movi	v3.16b,#192
1415*4757b351SPierre Pronchery	mov	v0.s[0],w6
1416*4757b351SPierre Pronchery
1417*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1418*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1419*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1420*4757b351SPierre Pronchery
1421*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1422*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1423*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1424*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1425*4757b351SPierre Pronchery
1426*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1427*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1428*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1429*4757b351SPierre Pronchery	add	w7,w6,w7
1430*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1431*4757b351SPierre Pronchery	add	w7,w7,w9
1432*4757b351SPierre Pronchery	add	w7,w7,w6
1433*4757b351SPierre Pronchery
1434*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1435*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1436*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1437*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1438*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1439*4757b351SPierre Pronchery	eor	w13,w13,w6
1440*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
1441*4757b351SPierre Pronchery	eor	w6,w12,w13
1442*4757b351SPierre Pronchery	eor	w9,w7,w15
1443*4757b351SPierre Pronchery	eor	w6,w6,w9
1444*4757b351SPierre Pronchery	movi	v1.16b,#64
1445*4757b351SPierre Pronchery	movi	v2.16b,#128
1446*4757b351SPierre Pronchery	movi	v3.16b,#192
1447*4757b351SPierre Pronchery	mov	v0.s[0],w6
1448*4757b351SPierre Pronchery
1449*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1450*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1451*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1452*4757b351SPierre Pronchery
1453*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1454*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1455*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1456*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1457*4757b351SPierre Pronchery
1458*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1459*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1460*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1461*4757b351SPierre Pronchery	add	w7,w6,w7
1462*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1463*4757b351SPierre Pronchery	add	w7,w7,w9
1464*4757b351SPierre Pronchery	add	w7,w7,w6
1465*4757b351SPierre Pronchery
1466*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1467*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1468*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1469*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1470*4757b351SPierre Pronchery	eor	w14,w14,w6
1471*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
1472*4757b351SPierre Pronchery	eor	w6,w12,w13
1473*4757b351SPierre Pronchery	eor	w9,w14,w8
1474*4757b351SPierre Pronchery	eor	w6,w6,w9
1475*4757b351SPierre Pronchery	movi	v1.16b,#64
1476*4757b351SPierre Pronchery	movi	v2.16b,#128
1477*4757b351SPierre Pronchery	movi	v3.16b,#192
1478*4757b351SPierre Pronchery	mov	v0.s[0],w6
1479*4757b351SPierre Pronchery
1480*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1481*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1482*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1483*4757b351SPierre Pronchery
1484*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1485*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1486*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1487*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1488*4757b351SPierre Pronchery
1489*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1490*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1491*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1492*4757b351SPierre Pronchery	add	w7,w6,w7
1493*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1494*4757b351SPierre Pronchery	add	w7,w7,w9
1495*4757b351SPierre Pronchery	add	w7,w7,w6
1496*4757b351SPierre Pronchery
1497*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1498*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1499*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1500*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1501*4757b351SPierre Pronchery	eor	w15,w15,w6
1502*4757b351SPierre Pronchery	subs	w11,w11,#1
1503*4757b351SPierre Pronchery	b.ne	10b
1504*4757b351SPierre Pronchery	mov	v5.s[0],w15
1505*4757b351SPierre Pronchery	mov	v5.s[1],w14
1506*4757b351SPierre Pronchery	mov	v5.s[2],w13
1507*4757b351SPierre Pronchery	mov	v5.s[3],w12
1508*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1509*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
1510*4757b351SPierre Pronchery#endif
1511*4757b351SPierre Pronchery	eor	v6.16b,v6.16b,v5.16b
1512*4757b351SPierre Pronchery	mov	x10,x3
1513*4757b351SPierre Pronchery	mov	w11,#8
1514*4757b351SPierre Pronchery	mov	w12,v6.s[0]
1515*4757b351SPierre Pronchery	mov	w13,v6.s[1]
1516*4757b351SPierre Pronchery	mov	w14,v6.s[2]
1517*4757b351SPierre Pronchery	mov	w15,v6.s[3]
1518*4757b351SPierre Pronchery10:
1519*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1520*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
1521*4757b351SPierre Pronchery	eor	w6,w14,w15
1522*4757b351SPierre Pronchery	eor	w9,w7,w13
1523*4757b351SPierre Pronchery	eor	w6,w6,w9
1524*4757b351SPierre Pronchery	movi	v1.16b,#64
1525*4757b351SPierre Pronchery	movi	v2.16b,#128
1526*4757b351SPierre Pronchery	movi	v3.16b,#192
1527*4757b351SPierre Pronchery	mov	v0.s[0],w6
1528*4757b351SPierre Pronchery
1529*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1530*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1531*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1532*4757b351SPierre Pronchery
1533*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1534*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1535*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1536*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1537*4757b351SPierre Pronchery
1538*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1539*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1540*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1541*4757b351SPierre Pronchery	add	w7,w6,w7
1542*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1543*4757b351SPierre Pronchery	add	w7,w7,w9
1544*4757b351SPierre Pronchery	add	w7,w7,w6
1545*4757b351SPierre Pronchery
1546*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1547*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1548*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1549*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1550*4757b351SPierre Pronchery	eor	w12,w12,w6
1551*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
1552*4757b351SPierre Pronchery	eor	w6,w14,w15
1553*4757b351SPierre Pronchery	eor	w9,w12,w8
1554*4757b351SPierre Pronchery	eor	w6,w6,w9
1555*4757b351SPierre Pronchery	movi	v1.16b,#64
1556*4757b351SPierre Pronchery	movi	v2.16b,#128
1557*4757b351SPierre Pronchery	movi	v3.16b,#192
1558*4757b351SPierre Pronchery	mov	v0.s[0],w6
1559*4757b351SPierre Pronchery
1560*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1561*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1562*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1563*4757b351SPierre Pronchery
1564*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1565*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1566*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1567*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1568*4757b351SPierre Pronchery
1569*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1570*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1571*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1572*4757b351SPierre Pronchery	add	w7,w6,w7
1573*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1574*4757b351SPierre Pronchery	add	w7,w7,w9
1575*4757b351SPierre Pronchery	add	w7,w7,w6
1576*4757b351SPierre Pronchery
1577*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1578*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1579*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1580*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1581*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1582*4757b351SPierre Pronchery	eor	w13,w13,w6
1583*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
1584*4757b351SPierre Pronchery	eor	w6,w12,w13
1585*4757b351SPierre Pronchery	eor	w9,w7,w15
1586*4757b351SPierre Pronchery	eor	w6,w6,w9
1587*4757b351SPierre Pronchery	movi	v1.16b,#64
1588*4757b351SPierre Pronchery	movi	v2.16b,#128
1589*4757b351SPierre Pronchery	movi	v3.16b,#192
1590*4757b351SPierre Pronchery	mov	v0.s[0],w6
1591*4757b351SPierre Pronchery
1592*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1593*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1594*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1595*4757b351SPierre Pronchery
1596*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1597*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1598*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1599*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1600*4757b351SPierre Pronchery
1601*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1602*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1603*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1604*4757b351SPierre Pronchery	add	w7,w6,w7
1605*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1606*4757b351SPierre Pronchery	add	w7,w7,w9
1607*4757b351SPierre Pronchery	add	w7,w7,w6
1608*4757b351SPierre Pronchery
1609*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1610*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1611*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1612*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1613*4757b351SPierre Pronchery	eor	w14,w14,w6
1614*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
1615*4757b351SPierre Pronchery	eor	w6,w12,w13
1616*4757b351SPierre Pronchery	eor	w9,w14,w8
1617*4757b351SPierre Pronchery	eor	w6,w6,w9
1618*4757b351SPierre Pronchery	movi	v1.16b,#64
1619*4757b351SPierre Pronchery	movi	v2.16b,#128
1620*4757b351SPierre Pronchery	movi	v3.16b,#192
1621*4757b351SPierre Pronchery	mov	v0.s[0],w6
1622*4757b351SPierre Pronchery
1623*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1624*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1625*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1626*4757b351SPierre Pronchery
1627*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1628*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1629*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1630*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1631*4757b351SPierre Pronchery
1632*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1633*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1634*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1635*4757b351SPierre Pronchery	add	w7,w6,w7
1636*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1637*4757b351SPierre Pronchery	add	w7,w7,w9
1638*4757b351SPierre Pronchery	add	w7,w7,w6
1639*4757b351SPierre Pronchery
1640*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1641*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1642*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1643*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1644*4757b351SPierre Pronchery	eor	w15,w15,w6
1645*4757b351SPierre Pronchery	subs	w11,w11,#1
1646*4757b351SPierre Pronchery	b.ne	10b
1647*4757b351SPierre Pronchery	mov	v6.s[0],w15
1648*4757b351SPierre Pronchery	mov	v6.s[1],w14
1649*4757b351SPierre Pronchery	mov	v6.s[2],w13
1650*4757b351SPierre Pronchery	mov	v6.s[3],w12
1651*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1652*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
1653*4757b351SPierre Pronchery#endif
1654*4757b351SPierre Pronchery	eor	v7.16b,v7.16b,v6.16b
1655*4757b351SPierre Pronchery	mov	x10,x3
1656*4757b351SPierre Pronchery	mov	w11,#8
1657*4757b351SPierre Pronchery	mov	w12,v7.s[0]
1658*4757b351SPierre Pronchery	mov	w13,v7.s[1]
1659*4757b351SPierre Pronchery	mov	w14,v7.s[2]
1660*4757b351SPierre Pronchery	mov	w15,v7.s[3]
1661*4757b351SPierre Pronchery10:
1662*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1663*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
1664*4757b351SPierre Pronchery	eor	w6,w14,w15
1665*4757b351SPierre Pronchery	eor	w9,w7,w13
1666*4757b351SPierre Pronchery	eor	w6,w6,w9
1667*4757b351SPierre Pronchery	movi	v1.16b,#64
1668*4757b351SPierre Pronchery	movi	v2.16b,#128
1669*4757b351SPierre Pronchery	movi	v3.16b,#192
1670*4757b351SPierre Pronchery	mov	v0.s[0],w6
1671*4757b351SPierre Pronchery
1672*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1673*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1674*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1675*4757b351SPierre Pronchery
1676*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1677*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1678*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1679*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1680*4757b351SPierre Pronchery
1681*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1682*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1683*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1684*4757b351SPierre Pronchery	add	w7,w6,w7
1685*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1686*4757b351SPierre Pronchery	add	w7,w7,w9
1687*4757b351SPierre Pronchery	add	w7,w7,w6
1688*4757b351SPierre Pronchery
1689*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1690*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1691*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1692*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1693*4757b351SPierre Pronchery	eor	w12,w12,w6
1694*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
1695*4757b351SPierre Pronchery	eor	w6,w14,w15
1696*4757b351SPierre Pronchery	eor	w9,w12,w8
1697*4757b351SPierre Pronchery	eor	w6,w6,w9
1698*4757b351SPierre Pronchery	movi	v1.16b,#64
1699*4757b351SPierre Pronchery	movi	v2.16b,#128
1700*4757b351SPierre Pronchery	movi	v3.16b,#192
1701*4757b351SPierre Pronchery	mov	v0.s[0],w6
1702*4757b351SPierre Pronchery
1703*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1704*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1705*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1706*4757b351SPierre Pronchery
1707*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1708*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1709*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1710*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1711*4757b351SPierre Pronchery
1712*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1713*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1714*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1715*4757b351SPierre Pronchery	add	w7,w6,w7
1716*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1717*4757b351SPierre Pronchery	add	w7,w7,w9
1718*4757b351SPierre Pronchery	add	w7,w7,w6
1719*4757b351SPierre Pronchery
1720*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1721*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1722*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1723*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1724*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1725*4757b351SPierre Pronchery	eor	w13,w13,w6
1726*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
1727*4757b351SPierre Pronchery	eor	w6,w12,w13
1728*4757b351SPierre Pronchery	eor	w9,w7,w15
1729*4757b351SPierre Pronchery	eor	w6,w6,w9
1730*4757b351SPierre Pronchery	movi	v1.16b,#64
1731*4757b351SPierre Pronchery	movi	v2.16b,#128
1732*4757b351SPierre Pronchery	movi	v3.16b,#192
1733*4757b351SPierre Pronchery	mov	v0.s[0],w6
1734*4757b351SPierre Pronchery
1735*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1736*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1737*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1738*4757b351SPierre Pronchery
1739*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1740*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1741*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1742*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1743*4757b351SPierre Pronchery
1744*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1745*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1746*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1747*4757b351SPierre Pronchery	add	w7,w6,w7
1748*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1749*4757b351SPierre Pronchery	add	w7,w7,w9
1750*4757b351SPierre Pronchery	add	w7,w7,w6
1751*4757b351SPierre Pronchery
1752*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1753*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1754*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1755*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1756*4757b351SPierre Pronchery	eor	w14,w14,w6
1757*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
1758*4757b351SPierre Pronchery	eor	w6,w12,w13
1759*4757b351SPierre Pronchery	eor	w9,w14,w8
1760*4757b351SPierre Pronchery	eor	w6,w6,w9
1761*4757b351SPierre Pronchery	movi	v1.16b,#64
1762*4757b351SPierre Pronchery	movi	v2.16b,#128
1763*4757b351SPierre Pronchery	movi	v3.16b,#192
1764*4757b351SPierre Pronchery	mov	v0.s[0],w6
1765*4757b351SPierre Pronchery
1766*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1767*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1768*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1769*4757b351SPierre Pronchery
1770*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1771*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1772*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1773*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1774*4757b351SPierre Pronchery
1775*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1776*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1777*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1778*4757b351SPierre Pronchery	add	w7,w6,w7
1779*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1780*4757b351SPierre Pronchery	add	w7,w7,w9
1781*4757b351SPierre Pronchery	add	w7,w7,w6
1782*4757b351SPierre Pronchery
1783*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1784*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1785*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1786*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1787*4757b351SPierre Pronchery	eor	w15,w15,w6
1788*4757b351SPierre Pronchery	subs	w11,w11,#1
1789*4757b351SPierre Pronchery	b.ne	10b
1790*4757b351SPierre Pronchery	mov	v7.s[0],w15
1791*4757b351SPierre Pronchery	mov	v7.s[1],w14
1792*4757b351SPierre Pronchery	mov	v7.s[2],w13
1793*4757b351SPierre Pronchery	mov	v7.s[3],w12
1794*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1795*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
1796*4757b351SPierre Pronchery#endif
1797*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1798*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
1799*4757b351SPierre Pronchery#endif
1800*4757b351SPierre Pronchery	orr	v3.16b,v7.16b,v7.16b
1801*4757b351SPierre Pronchery	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
1802*4757b351SPierre Pronchery	subs	w2,w2,#4
1803*4757b351SPierre Pronchery	b.ne	.Lcbc_4_blocks_enc
1804*4757b351SPierre Pronchery	b	2f
1805*4757b351SPierre Pronchery1:
1806*4757b351SPierre Pronchery	subs	w2,w2,#1
1807*4757b351SPierre Pronchery	b.lt	2f
1808*4757b351SPierre Pronchery	ld1	{v4.4s},[x0],#16
1809*4757b351SPierre Pronchery	eor	v3.16b,v3.16b,v4.16b
1810*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1811*4757b351SPierre Pronchery	rev32	v3.16b,v3.16b
1812*4757b351SPierre Pronchery#endif
1813*4757b351SPierre Pronchery	mov	x10,x3
1814*4757b351SPierre Pronchery	mov	w11,#8
1815*4757b351SPierre Pronchery	mov	w12,v3.s[0]
1816*4757b351SPierre Pronchery	mov	w13,v3.s[1]
1817*4757b351SPierre Pronchery	mov	w14,v3.s[2]
1818*4757b351SPierre Pronchery	mov	w15,v3.s[3]
1819*4757b351SPierre Pronchery10:
1820*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1821*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
1822*4757b351SPierre Pronchery	eor	w6,w14,w15
1823*4757b351SPierre Pronchery	eor	w9,w7,w13
1824*4757b351SPierre Pronchery	eor	w6,w6,w9
1825*4757b351SPierre Pronchery	movi	v1.16b,#64
1826*4757b351SPierre Pronchery	movi	v2.16b,#128
1827*4757b351SPierre Pronchery	movi	v3.16b,#192
1828*4757b351SPierre Pronchery	mov	v0.s[0],w6
1829*4757b351SPierre Pronchery
1830*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1831*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1832*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1833*4757b351SPierre Pronchery
1834*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1835*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1836*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1837*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1838*4757b351SPierre Pronchery
1839*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1840*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1841*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1842*4757b351SPierre Pronchery	add	w7,w6,w7
1843*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1844*4757b351SPierre Pronchery	add	w7,w7,w9
1845*4757b351SPierre Pronchery	add	w7,w7,w6
1846*4757b351SPierre Pronchery
1847*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1848*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1849*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1850*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1851*4757b351SPierre Pronchery	eor	w12,w12,w6
1852*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
1853*4757b351SPierre Pronchery	eor	w6,w14,w15
1854*4757b351SPierre Pronchery	eor	w9,w12,w8
1855*4757b351SPierre Pronchery	eor	w6,w6,w9
1856*4757b351SPierre Pronchery	movi	v1.16b,#64
1857*4757b351SPierre Pronchery	movi	v2.16b,#128
1858*4757b351SPierre Pronchery	movi	v3.16b,#192
1859*4757b351SPierre Pronchery	mov	v0.s[0],w6
1860*4757b351SPierre Pronchery
1861*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1862*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1863*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1864*4757b351SPierre Pronchery
1865*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1866*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1867*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1868*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1869*4757b351SPierre Pronchery
1870*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1871*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1872*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1873*4757b351SPierre Pronchery	add	w7,w6,w7
1874*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1875*4757b351SPierre Pronchery	add	w7,w7,w9
1876*4757b351SPierre Pronchery	add	w7,w7,w6
1877*4757b351SPierre Pronchery
1878*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1879*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1880*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1881*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1882*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
1883*4757b351SPierre Pronchery	eor	w13,w13,w6
1884*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
1885*4757b351SPierre Pronchery	eor	w6,w12,w13
1886*4757b351SPierre Pronchery	eor	w9,w7,w15
1887*4757b351SPierre Pronchery	eor	w6,w6,w9
1888*4757b351SPierre Pronchery	movi	v1.16b,#64
1889*4757b351SPierre Pronchery	movi	v2.16b,#128
1890*4757b351SPierre Pronchery	movi	v3.16b,#192
1891*4757b351SPierre Pronchery	mov	v0.s[0],w6
1892*4757b351SPierre Pronchery
1893*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1894*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1895*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1896*4757b351SPierre Pronchery
1897*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1898*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1899*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1900*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1901*4757b351SPierre Pronchery
1902*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1903*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1904*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1905*4757b351SPierre Pronchery	add	w7,w6,w7
1906*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1907*4757b351SPierre Pronchery	add	w7,w7,w9
1908*4757b351SPierre Pronchery	add	w7,w7,w6
1909*4757b351SPierre Pronchery
1910*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1911*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1912*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1913*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1914*4757b351SPierre Pronchery	eor	w14,w14,w6
1915*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
1916*4757b351SPierre Pronchery	eor	w6,w12,w13
1917*4757b351SPierre Pronchery	eor	w9,w14,w8
1918*4757b351SPierre Pronchery	eor	w6,w6,w9
1919*4757b351SPierre Pronchery	movi	v1.16b,#64
1920*4757b351SPierre Pronchery	movi	v2.16b,#128
1921*4757b351SPierre Pronchery	movi	v3.16b,#192
1922*4757b351SPierre Pronchery	mov	v0.s[0],w6
1923*4757b351SPierre Pronchery
1924*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
1925*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
1926*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
1927*4757b351SPierre Pronchery
1928*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
1929*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
1930*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
1931*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
1932*4757b351SPierre Pronchery
1933*4757b351SPierre Pronchery	mov	w6,v0.s[0]
1934*4757b351SPierre Pronchery	mov	w7,v1.s[0]
1935*4757b351SPierre Pronchery	mov	w9,v2.s[0]
1936*4757b351SPierre Pronchery	add	w7,w6,w7
1937*4757b351SPierre Pronchery	mov	w6,v3.s[0]
1938*4757b351SPierre Pronchery	add	w7,w7,w9
1939*4757b351SPierre Pronchery	add	w7,w7,w6
1940*4757b351SPierre Pronchery
1941*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
1942*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
1943*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
1944*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
1945*4757b351SPierre Pronchery	eor	w15,w15,w6
1946*4757b351SPierre Pronchery	subs	w11,w11,#1
1947*4757b351SPierre Pronchery	b.ne	10b
1948*4757b351SPierre Pronchery	mov	v3.s[0],w15
1949*4757b351SPierre Pronchery	mov	v3.s[1],w14
1950*4757b351SPierre Pronchery	mov	v3.s[2],w13
1951*4757b351SPierre Pronchery	mov	v3.s[3],w12
1952*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1953*4757b351SPierre Pronchery	rev32	v3.16b,v3.16b
1954*4757b351SPierre Pronchery#endif
1955*4757b351SPierre Pronchery	st1	{v3.4s},[x1],#16
1956*4757b351SPierre Pronchery	b	1b
1957*4757b351SPierre Pronchery2:
1958*4757b351SPierre Pronchery	// save back IV
1959*4757b351SPierre Pronchery	st1	{v3.4s},[x4]
1960*4757b351SPierre Pronchery	ret
1961*4757b351SPierre Pronchery
1962*4757b351SPierre Pronchery.Ldec:
1963*4757b351SPierre Pronchery	// decryption mode starts
1964*4757b351SPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
1965*4757b351SPierre Pronchery	stp	d8,d9,[sp,#-80]!
1966*4757b351SPierre Pronchery	stp	d10,d11,[sp,#16]
1967*4757b351SPierre Pronchery	stp	d12,d13,[sp,#32]
1968*4757b351SPierre Pronchery	stp	d14,d15,[sp,#48]
1969*4757b351SPierre Pronchery	stp	x29,x30,[sp,#64]
1970*4757b351SPierre Pronchery.Lcbc_8_blocks_dec:
1971*4757b351SPierre Pronchery	cmp	w2,#8
1972*4757b351SPierre Pronchery	b.lt	1f
1973*4757b351SPierre Pronchery	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
1974*4757b351SPierre Pronchery	add	x10,x0,#64
1975*4757b351SPierre Pronchery	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x10]
1976*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1977*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
1978*4757b351SPierre Pronchery#endif
1979*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1980*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
1981*4757b351SPierre Pronchery#endif
1982*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1983*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
1984*4757b351SPierre Pronchery#endif
1985*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1986*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
1987*4757b351SPierre Pronchery#endif
1988*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1989*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
1990*4757b351SPierre Pronchery#endif
1991*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1992*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
1993*4757b351SPierre Pronchery#endif
1994*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1995*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
1996*4757b351SPierre Pronchery#endif
1997*4757b351SPierre Pronchery#ifndef __AARCH64EB__
1998*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
1999*4757b351SPierre Pronchery#endif
2000*4757b351SPierre Pronchery	bl	_vpsm4_enc_8blks
2001*4757b351SPierre Pronchery	zip1	v8.4s,v0.4s,v1.4s
2002*4757b351SPierre Pronchery	zip2	v9.4s,v0.4s,v1.4s
2003*4757b351SPierre Pronchery	zip1	v10.4s,v2.4s,v3.4s
2004*4757b351SPierre Pronchery	zip2	v11.4s,v2.4s,v3.4s
2005*4757b351SPierre Pronchery	zip1	v0.2d,v8.2d,v10.2d
2006*4757b351SPierre Pronchery	zip2	v1.2d,v8.2d,v10.2d
2007*4757b351SPierre Pronchery	zip1	v2.2d,v9.2d,v11.2d
2008*4757b351SPierre Pronchery	zip2	v3.2d,v9.2d,v11.2d
2009*4757b351SPierre Pronchery	zip1	v8.4s,v4.4s,v5.4s
2010*4757b351SPierre Pronchery	zip2	v9.4s,v4.4s,v5.4s
2011*4757b351SPierre Pronchery	zip1	v10.4s,v6.4s,v7.4s
2012*4757b351SPierre Pronchery	zip2	v11.4s,v6.4s,v7.4s
2013*4757b351SPierre Pronchery	zip1	v4.2d,v8.2d,v10.2d
2014*4757b351SPierre Pronchery	zip2	v5.2d,v8.2d,v10.2d
2015*4757b351SPierre Pronchery	zip1	v6.2d,v9.2d,v11.2d
2016*4757b351SPierre Pronchery	zip2	v7.2d,v9.2d,v11.2d
2017*4757b351SPierre Pronchery	ld1	{v15.4s},[x4]
2018*4757b351SPierre Pronchery	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2019*4757b351SPierre Pronchery	// note: ivec1 and vtmpx[3] both map to v15: it holds the IV loaded from [x4] and is
2020*4757b351SPierre Pronchery	// also the last register written by the next ld1 (v12-v15), so XOR the IV into v0 first
2021*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v15.16b
2022*4757b351SPierre Pronchery	ld1	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2023*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v8.16b
2024*4757b351SPierre Pronchery	eor	v2.16b,v2.16b,v9.16b
2025*4757b351SPierre Pronchery	eor	v3.16b,v3.16b,v10.16b
2026*4757b351SPierre Pronchery	// save back IV: v15 now holds the last of the 8 input blocks just loaded
2027*4757b351SPierre Pronchery	st1	{v15.4s}, [x4]
2028*4757b351SPierre Pronchery	eor	v4.16b,v4.16b,v11.16b
2029*4757b351SPierre Pronchery	eor	v5.16b,v5.16b,v12.16b
2030*4757b351SPierre Pronchery	eor	v6.16b,v6.16b,v13.16b
2031*4757b351SPierre Pronchery	eor	v7.16b,v7.16b,v14.16b
2032*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2033*4757b351SPierre Pronchery	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2034*4757b351SPierre Pronchery	subs	w2,w2,#8
2035*4757b351SPierre Pronchery	b.gt	.Lcbc_8_blocks_dec
2036*4757b351SPierre Pronchery	b.eq	100f
2037*4757b351SPierre Pronchery1:
2038*4757b351SPierre Pronchery	ld1	{v15.4s},[x4]
2039*4757b351SPierre Pronchery.Lcbc_4_blocks_dec:
2040*4757b351SPierre Pronchery	cmp	w2,#4
2041*4757b351SPierre Pronchery	b.lt	1f
2042*4757b351SPierre Pronchery	ld4	{v4.4s,v5.4s,v6.4s,v7.4s},[x0]
2043*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2044*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
2045*4757b351SPierre Pronchery#endif
2046*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2047*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
2048*4757b351SPierre Pronchery#endif
2049*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2050*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
2051*4757b351SPierre Pronchery#endif
2052*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2053*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
2054*4757b351SPierre Pronchery#endif
2055*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
2056*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
2057*4757b351SPierre Pronchery	zip1	v8.4s,v0.4s,v1.4s
2058*4757b351SPierre Pronchery	zip2	v9.4s,v0.4s,v1.4s
2059*4757b351SPierre Pronchery	zip1	v10.4s,v2.4s,v3.4s
2060*4757b351SPierre Pronchery	zip2	v11.4s,v2.4s,v3.4s
2061*4757b351SPierre Pronchery	zip1	v0.2d,v8.2d,v10.2d
2062*4757b351SPierre Pronchery	zip2	v1.2d,v8.2d,v10.2d
2063*4757b351SPierre Pronchery	zip1	v2.2d,v9.2d,v11.2d
2064*4757b351SPierre Pronchery	zip2	v3.2d,v9.2d,v11.2d
2065*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v15.16b
2066*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v4.16b
2067*4757b351SPierre Pronchery	orr	v15.16b,v7.16b,v7.16b
2068*4757b351SPierre Pronchery	eor	v2.16b,v2.16b,v5.16b
2069*4757b351SPierre Pronchery	eor	v3.16b,v3.16b,v6.16b
2070*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2071*4757b351SPierre Pronchery	subs	w2,w2,#4
2072*4757b351SPierre Pronchery	b.gt	.Lcbc_4_blocks_dec
2073*4757b351SPierre Pronchery	// no blocks remain: save back IV (v7, the last input block of this 4-block group)
2074*4757b351SPierre Pronchery	st1	{v7.4s}, [x4]
2075*4757b351SPierre Pronchery	b	100f
2076*4757b351SPierre Pronchery1:	//	last block
2077*4757b351SPierre Pronchery	subs	w2,w2,#1
2078*4757b351SPierre Pronchery	b.lt	100f
2079*4757b351SPierre Pronchery	b.gt	1f
2080*4757b351SPierre Pronchery	ld1	{v4.4s},[x0],#16
2081*4757b351SPierre Pronchery	// save back IV: this input block becomes the next IV; the previous IV stays in v15 for the final XOR
2082*4757b351SPierre Pronchery	st1	{v4.4s}, [x4]
2083*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2084*4757b351SPierre Pronchery	rev32	v8.16b,v4.16b
2085*4757b351SPierre Pronchery#else
2086*4757b351SPierre Pronchery	mov	v8.16b,v4.16b
2087*4757b351SPierre Pronchery#endif
2088*4757b351SPierre Pronchery	mov	x10,x3
2089*4757b351SPierre Pronchery	mov	w11,#8
2090*4757b351SPierre Pronchery	mov	w12,v8.s[0]
2091*4757b351SPierre Pronchery	mov	w13,v8.s[1]
2092*4757b351SPierre Pronchery	mov	w14,v8.s[2]
2093*4757b351SPierre Pronchery	mov	w15,v8.s[3]
2094*4757b351SPierre Pronchery10:
2095*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2096*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2097*4757b351SPierre Pronchery	eor	w6,w14,w15
2098*4757b351SPierre Pronchery	eor	w9,w7,w13
2099*4757b351SPierre Pronchery	eor	w6,w6,w9
2100*4757b351SPierre Pronchery	movi	v1.16b,#64
2101*4757b351SPierre Pronchery	movi	v2.16b,#128
2102*4757b351SPierre Pronchery	movi	v3.16b,#192
2103*4757b351SPierre Pronchery	mov	v0.s[0],w6
2104*4757b351SPierre Pronchery
2105*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2106*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2107*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2108*4757b351SPierre Pronchery
2109*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2110*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2111*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2112*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2113*4757b351SPierre Pronchery
2114*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2115*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2116*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2117*4757b351SPierre Pronchery	add	w7,w6,w7
2118*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2119*4757b351SPierre Pronchery	add	w7,w7,w9
2120*4757b351SPierre Pronchery	add	w7,w7,w6
2121*4757b351SPierre Pronchery
2122*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2123*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2124*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2125*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2126*4757b351SPierre Pronchery	eor	w12,w12,w6
2127*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2128*4757b351SPierre Pronchery	eor	w6,w14,w15
2129*4757b351SPierre Pronchery	eor	w9,w12,w8
2130*4757b351SPierre Pronchery	eor	w6,w6,w9
2131*4757b351SPierre Pronchery	movi	v1.16b,#64
2132*4757b351SPierre Pronchery	movi	v2.16b,#128
2133*4757b351SPierre Pronchery	movi	v3.16b,#192
2134*4757b351SPierre Pronchery	mov	v0.s[0],w6
2135*4757b351SPierre Pronchery
2136*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2137*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2138*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2139*4757b351SPierre Pronchery
2140*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2141*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2142*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2143*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2144*4757b351SPierre Pronchery
2145*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2146*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2147*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2148*4757b351SPierre Pronchery	add	w7,w6,w7
2149*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2150*4757b351SPierre Pronchery	add	w7,w7,w9
2151*4757b351SPierre Pronchery	add	w7,w7,w6
2152*4757b351SPierre Pronchery
2153*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2154*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2155*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2156*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2157*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2158*4757b351SPierre Pronchery	eor	w13,w13,w6
2159*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2160*4757b351SPierre Pronchery	eor	w6,w12,w13
2161*4757b351SPierre Pronchery	eor	w9,w7,w15
2162*4757b351SPierre Pronchery	eor	w6,w6,w9
2163*4757b351SPierre Pronchery	movi	v1.16b,#64
2164*4757b351SPierre Pronchery	movi	v2.16b,#128
2165*4757b351SPierre Pronchery	movi	v3.16b,#192
2166*4757b351SPierre Pronchery	mov	v0.s[0],w6
2167*4757b351SPierre Pronchery
2168*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2169*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2170*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2171*4757b351SPierre Pronchery
2172*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2173*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2174*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2175*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2176*4757b351SPierre Pronchery
2177*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2178*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2179*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2180*4757b351SPierre Pronchery	add	w7,w6,w7
2181*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2182*4757b351SPierre Pronchery	add	w7,w7,w9
2183*4757b351SPierre Pronchery	add	w7,w7,w6
2184*4757b351SPierre Pronchery
2185*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2186*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2187*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2188*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2189*4757b351SPierre Pronchery	eor	w14,w14,w6
2190*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2191*4757b351SPierre Pronchery	eor	w6,w12,w13
2192*4757b351SPierre Pronchery	eor	w9,w14,w8
2193*4757b351SPierre Pronchery	eor	w6,w6,w9
2194*4757b351SPierre Pronchery	movi	v1.16b,#64
2195*4757b351SPierre Pronchery	movi	v2.16b,#128
2196*4757b351SPierre Pronchery	movi	v3.16b,#192
2197*4757b351SPierre Pronchery	mov	v0.s[0],w6
2198*4757b351SPierre Pronchery
2199*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2200*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2201*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2202*4757b351SPierre Pronchery
2203*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2204*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2205*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2206*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2207*4757b351SPierre Pronchery
2208*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2209*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2210*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2211*4757b351SPierre Pronchery	add	w7,w6,w7
2212*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2213*4757b351SPierre Pronchery	add	w7,w7,w9
2214*4757b351SPierre Pronchery	add	w7,w7,w6
2215*4757b351SPierre Pronchery
2216*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2217*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2218*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2219*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2220*4757b351SPierre Pronchery	eor	w15,w15,w6
2221*4757b351SPierre Pronchery	subs	w11,w11,#1
2222*4757b351SPierre Pronchery	b.ne	10b
2223*4757b351SPierre Pronchery	mov	v8.s[0],w15
2224*4757b351SPierre Pronchery	mov	v8.s[1],w14
2225*4757b351SPierre Pronchery	mov	v8.s[2],w13
2226*4757b351SPierre Pronchery	mov	v8.s[3],w12
2227*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2228*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
2229*4757b351SPierre Pronchery#endif
2230*4757b351SPierre Pronchery	eor	v8.16b,v8.16b,v15.16b
2231*4757b351SPierre Pronchery	st1	{v8.4s},[x1],#16
2232*4757b351SPierre Pronchery	b	100f
2233*4757b351SPierre Pronchery1:	//	last two blocks
2234*4757b351SPierre Pronchery	ld4	{v4.s,v5.s,v6.s,v7.s}[0],[x0]
2235*4757b351SPierre Pronchery	add	x10,x0,#16
2236*4757b351SPierre Pronchery	ld4	{v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
2237*4757b351SPierre Pronchery	subs	w2,w2,1
2238*4757b351SPierre Pronchery	b.gt	1f
2239*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2240*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
2241*4757b351SPierre Pronchery#endif
2242*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2243*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
2244*4757b351SPierre Pronchery#endif
2245*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2246*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
2247*4757b351SPierre Pronchery#endif
2248*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2249*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
2250*4757b351SPierre Pronchery#endif
2251*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
2252*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s},[x0],#32
2253*4757b351SPierre Pronchery	zip1	v8.4s,v0.4s,v1.4s
2254*4757b351SPierre Pronchery	zip2	v9.4s,v0.4s,v1.4s
2255*4757b351SPierre Pronchery	zip1	v10.4s,v2.4s,v3.4s
2256*4757b351SPierre Pronchery	zip2	v11.4s,v2.4s,v3.4s
2257*4757b351SPierre Pronchery	zip1	v0.2d,v8.2d,v10.2d
2258*4757b351SPierre Pronchery	zip2	v1.2d,v8.2d,v10.2d
2259*4757b351SPierre Pronchery	zip1	v2.2d,v9.2d,v11.2d
2260*4757b351SPierre Pronchery	zip2	v3.2d,v9.2d,v11.2d
2261*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v15.16b
2262*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v4.16b
2263*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s},[x1],#32
2264*4757b351SPierre Pronchery	// save back IV: v5 is the second (last) of the two input blocks just reloaded
2265*4757b351SPierre Pronchery	st1	{v5.4s}, [x4]
2266*4757b351SPierre Pronchery	b	100f
2267*4757b351SPierre Pronchery1:	//	last 3 blocks
2268*4757b351SPierre Pronchery	ld4	{v4.s,v5.s,v6.s,v7.s}[2],[x10]
2269*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2270*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
2271*4757b351SPierre Pronchery#endif
2272*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2273*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
2274*4757b351SPierre Pronchery#endif
2275*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2276*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
2277*4757b351SPierre Pronchery#endif
2278*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2279*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
2280*4757b351SPierre Pronchery#endif
2281*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
2282*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
2283*4757b351SPierre Pronchery	zip1	v8.4s,v0.4s,v1.4s
2284*4757b351SPierre Pronchery	zip2	v9.4s,v0.4s,v1.4s
2285*4757b351SPierre Pronchery	zip1	v10.4s,v2.4s,v3.4s
2286*4757b351SPierre Pronchery	zip2	v11.4s,v2.4s,v3.4s
2287*4757b351SPierre Pronchery	zip1	v0.2d,v8.2d,v10.2d
2288*4757b351SPierre Pronchery	zip2	v1.2d,v8.2d,v10.2d
2289*4757b351SPierre Pronchery	zip1	v2.2d,v9.2d,v11.2d
2290*4757b351SPierre Pronchery	zip2	v3.2d,v9.2d,v11.2d
2291*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v15.16b
2292*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v4.16b
2293*4757b351SPierre Pronchery	eor	v2.16b,v2.16b,v5.16b
2294*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
2295*4757b351SPierre Pronchery	// save back IV: v6 is the third (last) of the three input blocks just reloaded
2296*4757b351SPierre Pronchery	st1	{v6.4s}, [x4]
2297*4757b351SPierre Pronchery100:
2298*4757b351SPierre Pronchery	ldp	d10,d11,[sp,#16]
2299*4757b351SPierre Pronchery	ldp	d12,d13,[sp,#32]
2300*4757b351SPierre Pronchery	ldp	d14,d15,[sp,#48]
2301*4757b351SPierre Pronchery	ldp	x29,x30,[sp,#64]
2302*4757b351SPierre Pronchery	ldp	d8,d9,[sp],#80
2303*4757b351SPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
2304*4757b351SPierre Pronchery	ret
2305*4757b351SPierre Pronchery.size	vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt
2306*4757b351SPierre Pronchery.globl	vpsm4_ctr32_encrypt_blocks
2307*4757b351SPierre Pronchery.type	vpsm4_ctr32_encrypt_blocks,%function
2308*4757b351SPierre Pronchery.align	5
2309*4757b351SPierre Proncheryvpsm4_ctr32_encrypt_blocks:
2310*4757b351SPierre Pronchery	AARCH64_VALID_CALL_TARGET
2311*4757b351SPierre Pronchery	ld1	{v3.4s},[x4]
2312*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2313*4757b351SPierre Pronchery	rev32	v3.16b,v3.16b
2314*4757b351SPierre Pronchery#endif
2315*4757b351SPierre Pronchery	adrp	x10,.Lsbox
2316*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
2317*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
2318*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
2319*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
2320*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
2321*4757b351SPierre Pronchery	cmp	w2,#1
2322*4757b351SPierre Pronchery	b.ne	1f
2323*4757b351SPierre Pronchery	// fast processing for one single block without
2324*4757b351SPierre Pronchery	// context saving overhead
2325*4757b351SPierre Pronchery	mov	x10,x3
2326*4757b351SPierre Pronchery	mov	w11,#8
2327*4757b351SPierre Pronchery	mov	w12,v3.s[0]
2328*4757b351SPierre Pronchery	mov	w13,v3.s[1]
2329*4757b351SPierre Pronchery	mov	w14,v3.s[2]
2330*4757b351SPierre Pronchery	mov	w15,v3.s[3]
2331*4757b351SPierre Pronchery10:
2332*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2333*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2334*4757b351SPierre Pronchery	eor	w6,w14,w15
2335*4757b351SPierre Pronchery	eor	w9,w7,w13
2336*4757b351SPierre Pronchery	eor	w6,w6,w9
2337*4757b351SPierre Pronchery	movi	v1.16b,#64
2338*4757b351SPierre Pronchery	movi	v2.16b,#128
2339*4757b351SPierre Pronchery	movi	v3.16b,#192
2340*4757b351SPierre Pronchery	mov	v0.s[0],w6
2341*4757b351SPierre Pronchery
2342*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2343*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2344*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2345*4757b351SPierre Pronchery
2346*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2347*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2348*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2349*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2350*4757b351SPierre Pronchery
2351*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2352*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2353*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2354*4757b351SPierre Pronchery	add	w7,w6,w7
2355*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2356*4757b351SPierre Pronchery	add	w7,w7,w9
2357*4757b351SPierre Pronchery	add	w7,w7,w6
2358*4757b351SPierre Pronchery
2359*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2360*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2361*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2362*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2363*4757b351SPierre Pronchery	eor	w12,w12,w6
2364*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2365*4757b351SPierre Pronchery	eor	w6,w14,w15
2366*4757b351SPierre Pronchery	eor	w9,w12,w8
2367*4757b351SPierre Pronchery	eor	w6,w6,w9
2368*4757b351SPierre Pronchery	movi	v1.16b,#64
2369*4757b351SPierre Pronchery	movi	v2.16b,#128
2370*4757b351SPierre Pronchery	movi	v3.16b,#192
2371*4757b351SPierre Pronchery	mov	v0.s[0],w6
2372*4757b351SPierre Pronchery
2373*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2374*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2375*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2376*4757b351SPierre Pronchery
2377*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2378*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2379*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2380*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2381*4757b351SPierre Pronchery
2382*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2383*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2384*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2385*4757b351SPierre Pronchery	add	w7,w6,w7
2386*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2387*4757b351SPierre Pronchery	add	w7,w7,w9
2388*4757b351SPierre Pronchery	add	w7,w7,w6
2389*4757b351SPierre Pronchery
2390*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2391*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2392*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2393*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2394*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2395*4757b351SPierre Pronchery	eor	w13,w13,w6
2396*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2397*4757b351SPierre Pronchery	eor	w6,w12,w13
2398*4757b351SPierre Pronchery	eor	w9,w7,w15
2399*4757b351SPierre Pronchery	eor	w6,w6,w9
2400*4757b351SPierre Pronchery	movi	v1.16b,#64
2401*4757b351SPierre Pronchery	movi	v2.16b,#128
2402*4757b351SPierre Pronchery	movi	v3.16b,#192
2403*4757b351SPierre Pronchery	mov	v0.s[0],w6
2404*4757b351SPierre Pronchery
2405*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2406*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2407*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2408*4757b351SPierre Pronchery
2409*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2410*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2411*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2412*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2413*4757b351SPierre Pronchery
2414*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2415*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2416*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2417*4757b351SPierre Pronchery	add	w7,w6,w7
2418*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2419*4757b351SPierre Pronchery	add	w7,w7,w9
2420*4757b351SPierre Pronchery	add	w7,w7,w6
2421*4757b351SPierre Pronchery
2422*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2423*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2424*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2425*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2426*4757b351SPierre Pronchery	eor	w14,w14,w6
2427*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2428*4757b351SPierre Pronchery	eor	w6,w12,w13
2429*4757b351SPierre Pronchery	eor	w9,w14,w8
2430*4757b351SPierre Pronchery	eor	w6,w6,w9
2431*4757b351SPierre Pronchery	movi	v1.16b,#64
2432*4757b351SPierre Pronchery	movi	v2.16b,#128
2433*4757b351SPierre Pronchery	movi	v3.16b,#192
2434*4757b351SPierre Pronchery	mov	v0.s[0],w6
2435*4757b351SPierre Pronchery
2436*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2437*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2438*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2439*4757b351SPierre Pronchery
2440*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2441*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2442*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2443*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2444*4757b351SPierre Pronchery
2445*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2446*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2447*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2448*4757b351SPierre Pronchery	add	w7,w6,w7
2449*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2450*4757b351SPierre Pronchery	add	w7,w7,w9
2451*4757b351SPierre Pronchery	add	w7,w7,w6
2452*4757b351SPierre Pronchery
2453*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2454*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2455*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2456*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2457*4757b351SPierre Pronchery	eor	w15,w15,w6
2458*4757b351SPierre Pronchery	subs	w11,w11,#1
2459*4757b351SPierre Pronchery	b.ne	10b
2460*4757b351SPierre Pronchery	mov	v3.s[0],w15
2461*4757b351SPierre Pronchery	mov	v3.s[1],w14
2462*4757b351SPierre Pronchery	mov	v3.s[2],w13
2463*4757b351SPierre Pronchery	mov	v3.s[3],w12
2464*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2465*4757b351SPierre Pronchery	rev32	v3.16b,v3.16b
2466*4757b351SPierre Pronchery#endif
2467*4757b351SPierre Pronchery	ld1	{v4.4s},[x0]
2468*4757b351SPierre Pronchery	eor	v4.16b,v4.16b,v3.16b
2469*4757b351SPierre Pronchery	st1	{v4.4s},[x1]
2470*4757b351SPierre Pronchery	ret
2471*4757b351SPierre Pronchery1:
2472*4757b351SPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
2473*4757b351SPierre Pronchery	stp	d8,d9,[sp,#-80]!
2474*4757b351SPierre Pronchery	stp	d10,d11,[sp,#16]
2475*4757b351SPierre Pronchery	stp	d12,d13,[sp,#32]
2476*4757b351SPierre Pronchery	stp	d14,d15,[sp,#48]
2477*4757b351SPierre Pronchery	stp	x29,x30,[sp,#64]
2478*4757b351SPierre Pronchery	mov	w12,v3.s[0]
2479*4757b351SPierre Pronchery	mov	w13,v3.s[1]
2480*4757b351SPierre Pronchery	mov	w14,v3.s[2]
2481*4757b351SPierre Pronchery	mov	w5,v3.s[3]
2482*4757b351SPierre Pronchery.Lctr32_4_blocks_process:
2483*4757b351SPierre Pronchery	cmp	w2,#4
2484*4757b351SPierre Pronchery	b.lt	1f
2485*4757b351SPierre Pronchery	dup	v4.4s,w12
2486*4757b351SPierre Pronchery	dup	v5.4s,w13
2487*4757b351SPierre Pronchery	dup	v6.4s,w14
2488*4757b351SPierre Pronchery	mov	v7.s[0],w5
2489*4757b351SPierre Pronchery	add	w5,w5,#1
2490*4757b351SPierre Pronchery	mov	v7.s[1],w5
2491*4757b351SPierre Pronchery	add	w5,w5,#1
2492*4757b351SPierre Pronchery	mov	v7.s[2],w5
2493*4757b351SPierre Pronchery	add	w5,w5,#1
2494*4757b351SPierre Pronchery	mov	v7.s[3],w5
2495*4757b351SPierre Pronchery	add	w5,w5,#1
2496*4757b351SPierre Pronchery	cmp	w2,#8
2497*4757b351SPierre Pronchery	b.ge	.Lctr32_8_blocks_process
2498*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
2499*4757b351SPierre Pronchery	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2500*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v12.16b
2501*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v13.16b
2502*4757b351SPierre Pronchery	eor	v2.16b,v2.16b,v14.16b
2503*4757b351SPierre Pronchery	eor	v3.16b,v3.16b,v15.16b
2504*4757b351SPierre Pronchery	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2505*4757b351SPierre Pronchery	subs	w2,w2,#4
2506*4757b351SPierre Pronchery	b.ne	.Lctr32_4_blocks_process
2507*4757b351SPierre Pronchery	b	100f
2508*4757b351SPierre Pronchery.Lctr32_8_blocks_process:
2509*4757b351SPierre Pronchery	dup	v8.4s,w12
2510*4757b351SPierre Pronchery	dup	v9.4s,w13
2511*4757b351SPierre Pronchery	dup	v10.4s,w14
2512*4757b351SPierre Pronchery	mov	v11.s[0],w5
2513*4757b351SPierre Pronchery	add	w5,w5,#1
2514*4757b351SPierre Pronchery	mov	v11.s[1],w5
2515*4757b351SPierre Pronchery	add	w5,w5,#1
2516*4757b351SPierre Pronchery	mov	v11.s[2],w5
2517*4757b351SPierre Pronchery	add	w5,w5,#1
2518*4757b351SPierre Pronchery	mov	v11.s[3],w5
2519*4757b351SPierre Pronchery	add	w5,w5,#1
2520*4757b351SPierre Pronchery	bl	_vpsm4_enc_8blks
2521*4757b351SPierre Pronchery	ld4	{v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
2522*4757b351SPierre Pronchery	ld4	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
2523*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v12.16b
2524*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v13.16b
2525*4757b351SPierre Pronchery	eor	v2.16b,v2.16b,v14.16b
2526*4757b351SPierre Pronchery	eor	v3.16b,v3.16b,v15.16b
2527*4757b351SPierre Pronchery	eor	v4.16b,v4.16b,v8.16b
2528*4757b351SPierre Pronchery	eor	v5.16b,v5.16b,v9.16b
2529*4757b351SPierre Pronchery	eor	v6.16b,v6.16b,v10.16b
2530*4757b351SPierre Pronchery	eor	v7.16b,v7.16b,v11.16b
2531*4757b351SPierre Pronchery	st4	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
2532*4757b351SPierre Pronchery	st4	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
2533*4757b351SPierre Pronchery	subs	w2,w2,#8
2534*4757b351SPierre Pronchery	b.ne	.Lctr32_4_blocks_process
2535*4757b351SPierre Pronchery	b	100f
2536*4757b351SPierre Pronchery1:	//	last block processing
2537*4757b351SPierre Pronchery	subs	w2,w2,#1
2538*4757b351SPierre Pronchery	b.lt	100f
2539*4757b351SPierre Pronchery	b.gt	1f
2540*4757b351SPierre Pronchery	mov	v3.s[0],w12
2541*4757b351SPierre Pronchery	mov	v3.s[1],w13
2542*4757b351SPierre Pronchery	mov	v3.s[2],w14
2543*4757b351SPierre Pronchery	mov	v3.s[3],w5
2544*4757b351SPierre Pronchery	mov	x10,x3
2545*4757b351SPierre Pronchery	mov	w11,#8
2546*4757b351SPierre Pronchery	mov	w12,v3.s[0]
2547*4757b351SPierre Pronchery	mov	w13,v3.s[1]
2548*4757b351SPierre Pronchery	mov	w14,v3.s[2]
2549*4757b351SPierre Pronchery	mov	w15,v3.s[3]
2550*4757b351SPierre Pronchery10:
2551*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2552*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2553*4757b351SPierre Pronchery	eor	w6,w14,w15
2554*4757b351SPierre Pronchery	eor	w9,w7,w13
2555*4757b351SPierre Pronchery	eor	w6,w6,w9
2556*4757b351SPierre Pronchery	movi	v1.16b,#64
2557*4757b351SPierre Pronchery	movi	v2.16b,#128
2558*4757b351SPierre Pronchery	movi	v3.16b,#192
2559*4757b351SPierre Pronchery	mov	v0.s[0],w6
2560*4757b351SPierre Pronchery
2561*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2562*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2563*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2564*4757b351SPierre Pronchery
2565*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2566*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2567*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2568*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2569*4757b351SPierre Pronchery
2570*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2571*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2572*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2573*4757b351SPierre Pronchery	add	w7,w6,w7
2574*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2575*4757b351SPierre Pronchery	add	w7,w7,w9
2576*4757b351SPierre Pronchery	add	w7,w7,w6
2577*4757b351SPierre Pronchery
2578*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2579*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2580*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2581*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2582*4757b351SPierre Pronchery	eor	w12,w12,w6
2583*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2584*4757b351SPierre Pronchery	eor	w6,w14,w15
2585*4757b351SPierre Pronchery	eor	w9,w12,w8
2586*4757b351SPierre Pronchery	eor	w6,w6,w9
2587*4757b351SPierre Pronchery	movi	v1.16b,#64
2588*4757b351SPierre Pronchery	movi	v2.16b,#128
2589*4757b351SPierre Pronchery	movi	v3.16b,#192
2590*4757b351SPierre Pronchery	mov	v0.s[0],w6
2591*4757b351SPierre Pronchery
2592*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2593*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2594*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2595*4757b351SPierre Pronchery
2596*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2597*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2598*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2599*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2600*4757b351SPierre Pronchery
2601*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2602*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2603*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2604*4757b351SPierre Pronchery	add	w7,w6,w7
2605*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2606*4757b351SPierre Pronchery	add	w7,w7,w9
2607*4757b351SPierre Pronchery	add	w7,w7,w6
2608*4757b351SPierre Pronchery
2609*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2610*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2611*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2612*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2613*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2614*4757b351SPierre Pronchery	eor	w13,w13,w6
2615*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2616*4757b351SPierre Pronchery	eor	w6,w12,w13
2617*4757b351SPierre Pronchery	eor	w9,w7,w15
2618*4757b351SPierre Pronchery	eor	w6,w6,w9
2619*4757b351SPierre Pronchery	movi	v1.16b,#64
2620*4757b351SPierre Pronchery	movi	v2.16b,#128
2621*4757b351SPierre Pronchery	movi	v3.16b,#192
2622*4757b351SPierre Pronchery	mov	v0.s[0],w6
2623*4757b351SPierre Pronchery
2624*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2625*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2626*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2627*4757b351SPierre Pronchery
2628*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2629*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2630*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2631*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2632*4757b351SPierre Pronchery
2633*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2634*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2635*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2636*4757b351SPierre Pronchery	add	w7,w6,w7
2637*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2638*4757b351SPierre Pronchery	add	w7,w7,w9
2639*4757b351SPierre Pronchery	add	w7,w7,w6
2640*4757b351SPierre Pronchery
2641*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2642*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2643*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2644*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2645*4757b351SPierre Pronchery	eor	w14,w14,w6
2646*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2647*4757b351SPierre Pronchery	eor	w6,w12,w13
2648*4757b351SPierre Pronchery	eor	w9,w14,w8
2649*4757b351SPierre Pronchery	eor	w6,w6,w9
2650*4757b351SPierre Pronchery	movi	v1.16b,#64
2651*4757b351SPierre Pronchery	movi	v2.16b,#128
2652*4757b351SPierre Pronchery	movi	v3.16b,#192
2653*4757b351SPierre Pronchery	mov	v0.s[0],w6
2654*4757b351SPierre Pronchery
2655*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2656*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2657*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2658*4757b351SPierre Pronchery
2659*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2660*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2661*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2662*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2663*4757b351SPierre Pronchery
2664*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2665*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2666*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2667*4757b351SPierre Pronchery	add	w7,w6,w7
2668*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2669*4757b351SPierre Pronchery	add	w7,w7,w9
2670*4757b351SPierre Pronchery	add	w7,w7,w6
2671*4757b351SPierre Pronchery
2672*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2673*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2674*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2675*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2676*4757b351SPierre Pronchery	eor	w15,w15,w6
2677*4757b351SPierre Pronchery	subs	w11,w11,#1
2678*4757b351SPierre Pronchery	b.ne	10b
2679*4757b351SPierre Pronchery	mov	v3.s[0],w15
2680*4757b351SPierre Pronchery	mov	v3.s[1],w14
2681*4757b351SPierre Pronchery	mov	v3.s[2],w13
2682*4757b351SPierre Pronchery	mov	v3.s[3],w12
2683*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2684*4757b351SPierre Pronchery	rev32	v3.16b,v3.16b
2685*4757b351SPierre Pronchery#endif
2686*4757b351SPierre Pronchery	ld1	{v4.4s},[x0]
2687*4757b351SPierre Pronchery	eor	v4.16b,v4.16b,v3.16b
2688*4757b351SPierre Pronchery	st1	{v4.4s},[x1]
2689*4757b351SPierre Pronchery	b	100f
2690*4757b351SPierre Pronchery1:	//	last 2 blocks processing
2691*4757b351SPierre Pronchery	dup	v4.4s,w12
2692*4757b351SPierre Pronchery	dup	v5.4s,w13
2693*4757b351SPierre Pronchery	dup	v6.4s,w14
2694*4757b351SPierre Pronchery	mov	v7.s[0],w5
2695*4757b351SPierre Pronchery	add	w5,w5,#1
2696*4757b351SPierre Pronchery	mov	v7.s[1],w5
2697*4757b351SPierre Pronchery	subs	w2,w2,#1
2698*4757b351SPierre Pronchery	b.ne	1f
2699*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
2700*4757b351SPierre Pronchery	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2701*4757b351SPierre Pronchery	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2702*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v12.16b
2703*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v13.16b
2704*4757b351SPierre Pronchery	eor	v2.16b,v2.16b,v14.16b
2705*4757b351SPierre Pronchery	eor	v3.16b,v3.16b,v15.16b
2706*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2707*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2708*4757b351SPierre Pronchery	b	100f
2709*4757b351SPierre Pronchery1:	//	last 3 blocks processing
2710*4757b351SPierre Pronchery	add	w5,w5,#1
2711*4757b351SPierre Pronchery	mov	v7.s[2],w5
2712*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
2713*4757b351SPierre Pronchery	ld4	{v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
2714*4757b351SPierre Pronchery	ld4	{v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
2715*4757b351SPierre Pronchery	ld4	{v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
2716*4757b351SPierre Pronchery	eor	v0.16b,v0.16b,v12.16b
2717*4757b351SPierre Pronchery	eor	v1.16b,v1.16b,v13.16b
2718*4757b351SPierre Pronchery	eor	v2.16b,v2.16b,v14.16b
2719*4757b351SPierre Pronchery	eor	v3.16b,v3.16b,v15.16b
2720*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
2721*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
2722*4757b351SPierre Pronchery	st4	{v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
2723*4757b351SPierre Pronchery100:
2724*4757b351SPierre Pronchery	ldp	d10,d11,[sp,#16]
2725*4757b351SPierre Pronchery	ldp	d12,d13,[sp,#32]
2726*4757b351SPierre Pronchery	ldp	d14,d15,[sp,#48]
2727*4757b351SPierre Pronchery	ldp	x29,x30,[sp,#64]
2728*4757b351SPierre Pronchery	ldp	d8,d9,[sp],#80
2729*4757b351SPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
2730*4757b351SPierre Pronchery	ret
2731*4757b351SPierre Pronchery.size	vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks
2732*4757b351SPierre Pronchery.globl	vpsm4_xts_encrypt_gb
2733*4757b351SPierre Pronchery.type	vpsm4_xts_encrypt_gb,%function
2734*4757b351SPierre Pronchery.align	5
2735*4757b351SPierre Proncheryvpsm4_xts_encrypt_gb:
2736*4757b351SPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
2737*4757b351SPierre Pronchery	stp	x15, x16, [sp, #-0x10]!
2738*4757b351SPierre Pronchery	stp	x17, x18, [sp, #-0x10]!
2739*4757b351SPierre Pronchery	stp	x19, x20, [sp, #-0x10]!
2740*4757b351SPierre Pronchery	stp	x21, x22, [sp, #-0x10]!
2741*4757b351SPierre Pronchery	stp	x23, x24, [sp, #-0x10]!
2742*4757b351SPierre Pronchery	stp	x25, x26, [sp, #-0x10]!
2743*4757b351SPierre Pronchery	stp	x27, x28, [sp, #-0x10]!
2744*4757b351SPierre Pronchery	stp	x29, x30, [sp, #-0x10]!
2745*4757b351SPierre Pronchery	stp	d8, d9, [sp, #-0x10]!
2746*4757b351SPierre Pronchery	stp	d10, d11, [sp, #-0x10]!
2747*4757b351SPierre Pronchery	stp	d12, d13, [sp, #-0x10]!
2748*4757b351SPierre Pronchery	stp	d14, d15, [sp, #-0x10]!
2749*4757b351SPierre Pronchery	mov	x26,x3
2750*4757b351SPierre Pronchery	mov	x27,x4
2751*4757b351SPierre Pronchery	mov	w28,w6
2752*4757b351SPierre Pronchery	ld1	{v8.4s}, [x5]
2753*4757b351SPierre Pronchery	mov	x3,x27
2754*4757b351SPierre Pronchery	adrp	x10,.Lsbox
2755*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
2756*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
2757*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
2758*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
2759*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
2760*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2761*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
2762*4757b351SPierre Pronchery#endif
2763*4757b351SPierre Pronchery	mov	x10,x3
2764*4757b351SPierre Pronchery	mov	w11,#8
2765*4757b351SPierre Pronchery	mov	w12,v8.s[0]
2766*4757b351SPierre Pronchery	mov	w13,v8.s[1]
2767*4757b351SPierre Pronchery	mov	w14,v8.s[2]
2768*4757b351SPierre Pronchery	mov	w15,v8.s[3]
2769*4757b351SPierre Pronchery10:
2770*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2771*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
2772*4757b351SPierre Pronchery	eor	w6,w14,w15
2773*4757b351SPierre Pronchery	eor	w9,w7,w13
2774*4757b351SPierre Pronchery	eor	w6,w6,w9
2775*4757b351SPierre Pronchery	movi	v1.16b,#64
2776*4757b351SPierre Pronchery	movi	v2.16b,#128
2777*4757b351SPierre Pronchery	movi	v3.16b,#192
2778*4757b351SPierre Pronchery	mov	v0.s[0],w6
2779*4757b351SPierre Pronchery
2780*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2781*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2782*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2783*4757b351SPierre Pronchery
2784*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2785*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2786*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2787*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2788*4757b351SPierre Pronchery
2789*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2790*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2791*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2792*4757b351SPierre Pronchery	add	w7,w6,w7
2793*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2794*4757b351SPierre Pronchery	add	w7,w7,w9
2795*4757b351SPierre Pronchery	add	w7,w7,w6
2796*4757b351SPierre Pronchery
2797*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2798*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2799*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2800*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2801*4757b351SPierre Pronchery	eor	w12,w12,w6
2802*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
2803*4757b351SPierre Pronchery	eor	w6,w14,w15
2804*4757b351SPierre Pronchery	eor	w9,w12,w8
2805*4757b351SPierre Pronchery	eor	w6,w6,w9
2806*4757b351SPierre Pronchery	movi	v1.16b,#64
2807*4757b351SPierre Pronchery	movi	v2.16b,#128
2808*4757b351SPierre Pronchery	movi	v3.16b,#192
2809*4757b351SPierre Pronchery	mov	v0.s[0],w6
2810*4757b351SPierre Pronchery
2811*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2812*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2813*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2814*4757b351SPierre Pronchery
2815*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2816*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2817*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2818*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2819*4757b351SPierre Pronchery
2820*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2821*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2822*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2823*4757b351SPierre Pronchery	add	w7,w6,w7
2824*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2825*4757b351SPierre Pronchery	add	w7,w7,w9
2826*4757b351SPierre Pronchery	add	w7,w7,w6
2827*4757b351SPierre Pronchery
2828*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2829*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2830*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2831*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2832*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
2833*4757b351SPierre Pronchery	eor	w13,w13,w6
2834*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
2835*4757b351SPierre Pronchery	eor	w6,w12,w13
2836*4757b351SPierre Pronchery	eor	w9,w7,w15
2837*4757b351SPierre Pronchery	eor	w6,w6,w9
2838*4757b351SPierre Pronchery	movi	v1.16b,#64
2839*4757b351SPierre Pronchery	movi	v2.16b,#128
2840*4757b351SPierre Pronchery	movi	v3.16b,#192
2841*4757b351SPierre Pronchery	mov	v0.s[0],w6
2842*4757b351SPierre Pronchery
2843*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2844*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2845*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2846*4757b351SPierre Pronchery
2847*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2848*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2849*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2850*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2851*4757b351SPierre Pronchery
2852*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2853*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2854*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2855*4757b351SPierre Pronchery	add	w7,w6,w7
2856*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2857*4757b351SPierre Pronchery	add	w7,w7,w9
2858*4757b351SPierre Pronchery	add	w7,w7,w6
2859*4757b351SPierre Pronchery
2860*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2861*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2862*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2863*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2864*4757b351SPierre Pronchery	eor	w14,w14,w6
2865*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
2866*4757b351SPierre Pronchery	eor	w6,w12,w13
2867*4757b351SPierre Pronchery	eor	w9,w14,w8
2868*4757b351SPierre Pronchery	eor	w6,w6,w9
2869*4757b351SPierre Pronchery	movi	v1.16b,#64
2870*4757b351SPierre Pronchery	movi	v2.16b,#128
2871*4757b351SPierre Pronchery	movi	v3.16b,#192
2872*4757b351SPierre Pronchery	mov	v0.s[0],w6
2873*4757b351SPierre Pronchery
2874*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
2875*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
2876*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
2877*4757b351SPierre Pronchery
2878*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
2879*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
2880*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
2881*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
2882*4757b351SPierre Pronchery
2883*4757b351SPierre Pronchery	mov	w6,v0.s[0]
2884*4757b351SPierre Pronchery	mov	w7,v1.s[0]
2885*4757b351SPierre Pronchery	mov	w9,v2.s[0]
2886*4757b351SPierre Pronchery	add	w7,w6,w7
2887*4757b351SPierre Pronchery	mov	w6,v3.s[0]
2888*4757b351SPierre Pronchery	add	w7,w7,w9
2889*4757b351SPierre Pronchery	add	w7,w7,w6
2890*4757b351SPierre Pronchery
2891*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
2892*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
2893*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
2894*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
2895*4757b351SPierre Pronchery	eor	w15,w15,w6
2896*4757b351SPierre Pronchery	subs	w11,w11,#1
2897*4757b351SPierre Pronchery	b.ne	10b
2898*4757b351SPierre Pronchery	mov	v8.s[0],w15
2899*4757b351SPierre Pronchery	mov	v8.s[1],w14
2900*4757b351SPierre Pronchery	mov	v8.s[2],w13
2901*4757b351SPierre Pronchery	mov	v8.s[3],w12
2902*4757b351SPierre Pronchery#ifndef __AARCH64EB__
2903*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
2904*4757b351SPierre Pronchery#endif
2905*4757b351SPierre Pronchery	mov	x3,x26
2906*4757b351SPierre Pronchery	and	x29,x2,#0x0F
2907*4757b351SPierre Pronchery	// convert length into blocks
2908*4757b351SPierre Pronchery	lsr	x2,x2,4
2909*4757b351SPierre Pronchery	cmp	x2,#1
2910*4757b351SPierre Pronchery	b.lt	.return_gb
2911*4757b351SPierre Pronchery
2912*4757b351SPierre Pronchery	cmp	x29,0
2913*4757b351SPierre Pronchery	// If the encryption/decryption length is a multiple of 16,
2914*4757b351SPierre Pronchery	// all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2915*4757b351SPierre Pronchery	b.eq	.xts_encrypt_blocks_gb
2916*4757b351SPierre Pronchery
2917*4757b351SPierre Pronchery	// If the encryption/decryption length is not a multiple of 16,
2918*4757b351SPierre Pronchery	// the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb;
2919*4757b351SPierre Pronchery	// all other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb
2920*4757b351SPierre Pronchery	subs	x2,x2,#1
2921*4757b351SPierre Pronchery	b.eq	.only_2blks_tweak_gb
2922*4757b351SPierre Pronchery.xts_encrypt_blocks_gb:
2923*4757b351SPierre Pronchery	rbit	v8.16b,v8.16b
2924*4757b351SPierre Pronchery#ifdef __AARCH64EB__
2925*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
2926*4757b351SPierre Pronchery#endif
2927*4757b351SPierre Pronchery	mov	x12,v8.d[0]
2928*4757b351SPierre Pronchery	mov	x13,v8.d[1]
2929*4757b351SPierre Pronchery	mov	w7,0x87
2930*4757b351SPierre Pronchery	extr	x9,x13,x13,#32
2931*4757b351SPierre Pronchery	extr	x15,x13,x12,#63
2932*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
2933*4757b351SPierre Pronchery	eor	x14,x8,x12,lsl#1
2934*4757b351SPierre Pronchery	mov	w7,0x87
2935*4757b351SPierre Pronchery	extr	x9,x15,x15,#32
2936*4757b351SPierre Pronchery	extr	x17,x15,x14,#63
2937*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
2938*4757b351SPierre Pronchery	eor	x16,x8,x14,lsl#1
2939*4757b351SPierre Pronchery	mov	w7,0x87
2940*4757b351SPierre Pronchery	extr	x9,x17,x17,#32
2941*4757b351SPierre Pronchery	extr	x19,x17,x16,#63
2942*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
2943*4757b351SPierre Pronchery	eor	x18,x8,x16,lsl#1
2944*4757b351SPierre Pronchery	mov	w7,0x87
2945*4757b351SPierre Pronchery	extr	x9,x19,x19,#32
2946*4757b351SPierre Pronchery	extr	x21,x19,x18,#63
2947*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
2948*4757b351SPierre Pronchery	eor	x20,x8,x18,lsl#1
2949*4757b351SPierre Pronchery	mov	w7,0x87
2950*4757b351SPierre Pronchery	extr	x9,x21,x21,#32
2951*4757b351SPierre Pronchery	extr	x23,x21,x20,#63
2952*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
2953*4757b351SPierre Pronchery	eor	x22,x8,x20,lsl#1
2954*4757b351SPierre Pronchery	mov	w7,0x87
2955*4757b351SPierre Pronchery	extr	x9,x23,x23,#32
2956*4757b351SPierre Pronchery	extr	x25,x23,x22,#63
2957*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
2958*4757b351SPierre Pronchery	eor	x24,x8,x22,lsl#1
2959*4757b351SPierre Pronchery	mov	w7,0x87
2960*4757b351SPierre Pronchery	extr	x9,x25,x25,#32
2961*4757b351SPierre Pronchery	extr	x27,x25,x24,#63
2962*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
2963*4757b351SPierre Pronchery	eor	x26,x8,x24,lsl#1
2964*4757b351SPierre Pronchery.Lxts_8_blocks_process_gb:
2965*4757b351SPierre Pronchery	cmp	x2,#8
2966*4757b351SPierre Pronchery	b.lt	.Lxts_4_blocks_process_gb
2967*4757b351SPierre Pronchery	mov	v0.d[0],x12
2968*4757b351SPierre Pronchery	mov	v0.d[1],x13
2969*4757b351SPierre Pronchery#ifdef __AARCH64EB__
2970*4757b351SPierre Pronchery	rev32	v0.16b,v0.16b
2971*4757b351SPierre Pronchery#endif
2972*4757b351SPierre Pronchery	mov	v1.d[0],x14
2973*4757b351SPierre Pronchery	mov	v1.d[1],x15
2974*4757b351SPierre Pronchery#ifdef __AARCH64EB__
2975*4757b351SPierre Pronchery	rev32	v1.16b,v1.16b
2976*4757b351SPierre Pronchery#endif
2977*4757b351SPierre Pronchery	mov	v2.d[0],x16
2978*4757b351SPierre Pronchery	mov	v2.d[1],x17
2979*4757b351SPierre Pronchery#ifdef __AARCH64EB__
2980*4757b351SPierre Pronchery	rev32	v2.16b,v2.16b
2981*4757b351SPierre Pronchery#endif
2982*4757b351SPierre Pronchery	mov	v3.d[0],x18
2983*4757b351SPierre Pronchery	mov	v3.d[1],x19
2984*4757b351SPierre Pronchery#ifdef __AARCH64EB__
2985*4757b351SPierre Pronchery	rev32	v3.16b,v3.16b
2986*4757b351SPierre Pronchery#endif
2987*4757b351SPierre Pronchery	mov	v12.d[0],x20
2988*4757b351SPierre Pronchery	mov	v12.d[1],x21
2989*4757b351SPierre Pronchery#ifdef __AARCH64EB__
2990*4757b351SPierre Pronchery	rev32	v12.16b,v12.16b
2991*4757b351SPierre Pronchery#endif
2992*4757b351SPierre Pronchery	mov	v13.d[0],x22
2993*4757b351SPierre Pronchery	mov	v13.d[1],x23
2994*4757b351SPierre Pronchery#ifdef __AARCH64EB__
2995*4757b351SPierre Pronchery	rev32	v13.16b,v13.16b
2996*4757b351SPierre Pronchery#endif
2997*4757b351SPierre Pronchery	mov	v14.d[0],x24
2998*4757b351SPierre Pronchery	mov	v14.d[1],x25
2999*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3000*4757b351SPierre Pronchery	rev32	v14.16b,v14.16b
3001*4757b351SPierre Pronchery#endif
3002*4757b351SPierre Pronchery	mov	v15.d[0],x26
3003*4757b351SPierre Pronchery	mov	v15.d[1],x27
3004*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3005*4757b351SPierre Pronchery	rev32	v15.16b,v15.16b
3006*4757b351SPierre Pronchery#endif
3007*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3008*4757b351SPierre Pronchery	rbit	v0.16b,v0.16b
3009*4757b351SPierre Pronchery	rbit	v1.16b,v1.16b
3010*4757b351SPierre Pronchery	rbit	v2.16b,v2.16b
3011*4757b351SPierre Pronchery	rbit	v3.16b,v3.16b
3012*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v0.16b
3013*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v1.16b
3014*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v2.16b
3015*4757b351SPierre Pronchery	eor	v7.16b, v7.16b, v3.16b
3016*4757b351SPierre Pronchery	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
3017*4757b351SPierre Pronchery	rbit	v12.16b,v12.16b
3018*4757b351SPierre Pronchery	rbit	v13.16b,v13.16b
3019*4757b351SPierre Pronchery	rbit	v14.16b,v14.16b
3020*4757b351SPierre Pronchery	rbit	v15.16b,v15.16b
3021*4757b351SPierre Pronchery	eor	v8.16b, v8.16b, v12.16b
3022*4757b351SPierre Pronchery	eor	v9.16b, v9.16b, v13.16b
3023*4757b351SPierre Pronchery	eor	v10.16b, v10.16b, v14.16b
3024*4757b351SPierre Pronchery	eor	v11.16b, v11.16b, v15.16b
3025*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3026*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3027*4757b351SPierre Pronchery#endif
3028*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3029*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
3030*4757b351SPierre Pronchery#endif
3031*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3032*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
3033*4757b351SPierre Pronchery#endif
3034*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3035*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
3036*4757b351SPierre Pronchery#endif
3037*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3038*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
3039*4757b351SPierre Pronchery#endif
3040*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3041*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
3042*4757b351SPierre Pronchery#endif
3043*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3044*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
3045*4757b351SPierre Pronchery#endif
3046*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3047*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
3048*4757b351SPierre Pronchery#endif
3049*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
3050*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
3051*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
3052*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
3053*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
3054*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
3055*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
3056*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
3057*4757b351SPierre Pronchery	zip1	v0.4s,v8.4s,v9.4s
3058*4757b351SPierre Pronchery	zip2	v1.4s,v8.4s,v9.4s
3059*4757b351SPierre Pronchery	zip1	v2.4s,v10.4s,v11.4s
3060*4757b351SPierre Pronchery	zip2	v3.4s,v10.4s,v11.4s
3061*4757b351SPierre Pronchery	zip1	v8.2d,v0.2d,v2.2d
3062*4757b351SPierre Pronchery	zip2	v9.2d,v0.2d,v2.2d
3063*4757b351SPierre Pronchery	zip1	v10.2d,v1.2d,v3.2d
3064*4757b351SPierre Pronchery	zip2	v11.2d,v1.2d,v3.2d
3065*4757b351SPierre Pronchery	bl	_vpsm4_enc_8blks
3066*4757b351SPierre Pronchery	zip1	v8.4s,v0.4s,v1.4s
3067*4757b351SPierre Pronchery	zip2	v9.4s,v0.4s,v1.4s
3068*4757b351SPierre Pronchery	zip1	v10.4s,v2.4s,v3.4s
3069*4757b351SPierre Pronchery	zip2	v11.4s,v2.4s,v3.4s
3070*4757b351SPierre Pronchery	zip1	v0.2d,v8.2d,v10.2d
3071*4757b351SPierre Pronchery	zip2	v1.2d,v8.2d,v10.2d
3072*4757b351SPierre Pronchery	zip1	v2.2d,v9.2d,v11.2d
3073*4757b351SPierre Pronchery	zip2	v3.2d,v9.2d,v11.2d
3074*4757b351SPierre Pronchery	zip1	v8.4s,v4.4s,v5.4s
3075*4757b351SPierre Pronchery	zip2	v9.4s,v4.4s,v5.4s
3076*4757b351SPierre Pronchery	zip1	v10.4s,v6.4s,v7.4s
3077*4757b351SPierre Pronchery	zip2	v11.4s,v6.4s,v7.4s
3078*4757b351SPierre Pronchery	zip1	v4.2d,v8.2d,v10.2d
3079*4757b351SPierre Pronchery	zip2	v5.2d,v8.2d,v10.2d
3080*4757b351SPierre Pronchery	zip1	v6.2d,v9.2d,v11.2d
3081*4757b351SPierre Pronchery	zip2	v7.2d,v9.2d,v11.2d
3082*4757b351SPierre Pronchery	mov	v12.d[0],x12
3083*4757b351SPierre Pronchery	mov	v12.d[1],x13
3084*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3085*4757b351SPierre Pronchery	rev32	v12.16b,v12.16b
3086*4757b351SPierre Pronchery#endif
3087*4757b351SPierre Pronchery	mov	w7,0x87
3088*4757b351SPierre Pronchery	extr	x9,x27,x27,#32
3089*4757b351SPierre Pronchery	extr	x13,x27,x26,#63
3090*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3091*4757b351SPierre Pronchery	eor	x12,x8,x26,lsl#1
3092*4757b351SPierre Pronchery	mov	v13.d[0],x14
3093*4757b351SPierre Pronchery	mov	v13.d[1],x15
3094*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3095*4757b351SPierre Pronchery	rev32	v13.16b,v13.16b
3096*4757b351SPierre Pronchery#endif
3097*4757b351SPierre Pronchery	mov	w7,0x87
3098*4757b351SPierre Pronchery	extr	x9,x13,x13,#32
3099*4757b351SPierre Pronchery	extr	x15,x13,x12,#63
3100*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3101*4757b351SPierre Pronchery	eor	x14,x8,x12,lsl#1
3102*4757b351SPierre Pronchery	mov	v14.d[0],x16
3103*4757b351SPierre Pronchery	mov	v14.d[1],x17
3104*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3105*4757b351SPierre Pronchery	rev32	v14.16b,v14.16b
3106*4757b351SPierre Pronchery#endif
3107*4757b351SPierre Pronchery	mov	w7,0x87
3108*4757b351SPierre Pronchery	extr	x9,x15,x15,#32
3109*4757b351SPierre Pronchery	extr	x17,x15,x14,#63
3110*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3111*4757b351SPierre Pronchery	eor	x16,x8,x14,lsl#1
3112*4757b351SPierre Pronchery	mov	v15.d[0],x18
3113*4757b351SPierre Pronchery	mov	v15.d[1],x19
3114*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3115*4757b351SPierre Pronchery	rev32	v15.16b,v15.16b
3116*4757b351SPierre Pronchery#endif
3117*4757b351SPierre Pronchery	mov	w7,0x87
3118*4757b351SPierre Pronchery	extr	x9,x17,x17,#32
3119*4757b351SPierre Pronchery	extr	x19,x17,x16,#63
3120*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3121*4757b351SPierre Pronchery	eor	x18,x8,x16,lsl#1
3122*4757b351SPierre Pronchery	mov	v8.d[0],x20
3123*4757b351SPierre Pronchery	mov	v8.d[1],x21
3124*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3125*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
3126*4757b351SPierre Pronchery#endif
3127*4757b351SPierre Pronchery	mov	w7,0x87
3128*4757b351SPierre Pronchery	extr	x9,x19,x19,#32
3129*4757b351SPierre Pronchery	extr	x21,x19,x18,#63
3130*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3131*4757b351SPierre Pronchery	eor	x20,x8,x18,lsl#1
3132*4757b351SPierre Pronchery	mov	v9.d[0],x22
3133*4757b351SPierre Pronchery	mov	v9.d[1],x23
3134*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3135*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
3136*4757b351SPierre Pronchery#endif
3137*4757b351SPierre Pronchery	mov	w7,0x87
3138*4757b351SPierre Pronchery	extr	x9,x21,x21,#32
3139*4757b351SPierre Pronchery	extr	x23,x21,x20,#63
3140*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3141*4757b351SPierre Pronchery	eor	x22,x8,x20,lsl#1
3142*4757b351SPierre Pronchery	mov	v10.d[0],x24
3143*4757b351SPierre Pronchery	mov	v10.d[1],x25
3144*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3145*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
3146*4757b351SPierre Pronchery#endif
3147*4757b351SPierre Pronchery	mov	w7,0x87
3148*4757b351SPierre Pronchery	extr	x9,x23,x23,#32
3149*4757b351SPierre Pronchery	extr	x25,x23,x22,#63
3150*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3151*4757b351SPierre Pronchery	eor	x24,x8,x22,lsl#1
3152*4757b351SPierre Pronchery	mov	v11.d[0],x26
3153*4757b351SPierre Pronchery	mov	v11.d[1],x27
3154*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3155*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
3156*4757b351SPierre Pronchery#endif
3157*4757b351SPierre Pronchery	mov	w7,0x87
3158*4757b351SPierre Pronchery	extr	x9,x25,x25,#32
3159*4757b351SPierre Pronchery	extr	x27,x25,x24,#63
3160*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
3161*4757b351SPierre Pronchery	eor	x26,x8,x24,lsl#1
3162*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v12.16b
3163*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v13.16b
3164*4757b351SPierre Pronchery	eor	v2.16b, v2.16b, v14.16b
3165*4757b351SPierre Pronchery	eor	v3.16b, v3.16b, v15.16b
3166*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
3167*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
3168*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v10.16b
3169*4757b351SPierre Pronchery	eor	v7.16b, v7.16b, v11.16b
3170*4757b351SPierre Pronchery
3171*4757b351SPierre Pronchery	// save the last tweak
3172*4757b351SPierre Pronchery	st1	{v11.4s},[x5]
3173*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3174*4757b351SPierre Pronchery	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
3175*4757b351SPierre Pronchery	subs	x2,x2,#8
3176*4757b351SPierre Pronchery	b.gt	.Lxts_8_blocks_process_gb
3177*4757b351SPierre Pronchery	b	100f
// 4-block step of the XTS loop. The four tweaks kept as 64-bit halves in
// x12..x19 are copied into v8-v11. If fewer than four blocks remain
// (x2 < 4) control jumps to the 1-3 block tail at 1f with v8-v11 still in
// that raw form, since the tail applies rbit itself. Otherwise four blocks
// are read from x0, processed through _vpsm4_enc_4blks and written to x1.
3178*4757b351SPierre Pronchery.Lxts_4_blocks_process_gb:
3179*4757b351SPierre Pronchery	mov	v8.d[0],x12
3180*4757b351SPierre Pronchery	mov	v8.d[1],x13
3181*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3182*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
3183*4757b351SPierre Pronchery#endif
3184*4757b351SPierre Pronchery	mov	v9.d[0],x14
3185*4757b351SPierre Pronchery	mov	v9.d[1],x15
3186*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3187*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
3188*4757b351SPierre Pronchery#endif
3189*4757b351SPierre Pronchery	mov	v10.d[0],x16
3190*4757b351SPierre Pronchery	mov	v10.d[1],x17
3191*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3192*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
3193*4757b351SPierre Pronchery#endif
	// NOTE(review): x18 is a platform-reserved register on some AArch64
	// ABIs; confirm that using it as tweak storage is acceptable for the
	// build target.
3194*4757b351SPierre Pronchery	mov	v11.d[0],x18
3195*4757b351SPierre Pronchery	mov	v11.d[1],x19
3196*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3197*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
3198*4757b351SPierre Pronchery#endif
	// x2 is the remaining block count (it is decremented by 4 below after
	// 64 bytes have been stored).
3199*4757b351SPierre Pronchery	cmp	x2,#4
3200*4757b351SPierre Pronchery	b.lt	1f
	// Load four input blocks, reverse the bit order inside every tweak
	// byte, and XOR each block with its tweak.
3201*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
3202*4757b351SPierre Pronchery	rbit	v8.16b,v8.16b
3203*4757b351SPierre Pronchery	rbit	v9.16b,v9.16b
3204*4757b351SPierre Pronchery	rbit	v10.16b,v10.16b
3205*4757b351SPierre Pronchery	rbit	v11.16b,v11.16b
3206*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
3207*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
3208*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v10.16b
3209*4757b351SPierre Pronchery	eor	v7.16b, v7.16b, v11.16b
	// Little-endian builds byte-swap every 32-bit word before the cipher.
3210*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3211*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3212*4757b351SPierre Pronchery#endif
3213*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3214*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
3215*4757b351SPierre Pronchery#endif
3216*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3217*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
3218*4757b351SPierre Pronchery#endif
3219*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3220*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
3221*4757b351SPierre Pronchery#endif
	// Transpose the 4x4 matrix of 32-bit words: afterwards v4 holds word 0
	// of all four blocks, v5 word 1, v6 word 2 and v7 word 3.
3222*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
3223*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
3224*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
3225*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
3226*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
3227*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
3228*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
3229*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
	// _vpsm4_enc_4blks is defined outside this chunk; the code below reads
	// its result from v0-v3.
3230*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
	// Transpose the result back to one block per register (v0-v3).
3231*4757b351SPierre Pronchery	zip1	v4.4s,v0.4s,v1.4s
3232*4757b351SPierre Pronchery	zip2	v5.4s,v0.4s,v1.4s
3233*4757b351SPierre Pronchery	zip1	v6.4s,v2.4s,v3.4s
3234*4757b351SPierre Pronchery	zip2	v7.4s,v2.4s,v3.4s
3235*4757b351SPierre Pronchery	zip1	v0.2d,v4.2d,v6.2d
3236*4757b351SPierre Pronchery	zip2	v1.2d,v4.2d,v6.2d
3237*4757b351SPierre Pronchery	zip1	v2.2d,v5.2d,v7.2d
3238*4757b351SPierre Pronchery	zip2	v3.2d,v5.2d,v7.2d
	// XOR with the same (bit-reversed) tweaks again and write 64 bytes.
3239*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v8.16b
3240*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v9.16b
3241*4757b351SPierre Pronchery	eor	v2.16b, v2.16b, v10.16b
3242*4757b351SPierre Pronchery	eor	v3.16b, v3.16b, v11.16b
3243*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
3244*4757b351SPierre Pronchery	sub	x2,x2,#4
	// Reload v8-v10 from x20..x25 so the 1-3 block tail that follows finds
	// its tweaks in the same registers as on the direct b.lt path above.
3245*4757b351SPierre Pronchery	mov	v8.d[0],x20
3246*4757b351SPierre Pronchery	mov	v8.d[1],x21
3247*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3248*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
3249*4757b351SPierre Pronchery#endif
3250*4757b351SPierre Pronchery	mov	v9.d[0],x22
3251*4757b351SPierre Pronchery	mov	v9.d[1],x23
3252*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3253*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
3254*4757b351SPierre Pronchery#endif
3255*4757b351SPierre Pronchery	mov	v10.d[0],x24
3256*4757b351SPierre Pronchery	mov	v10.d[1],x25
3257*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3258*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
3259*4757b351SPierre Pronchery#endif
3260*4757b351SPierre Pronchery	// save the last tweak
	// (v11 is stored in its bit-reversed form, i.e. after the rbit above)
3261*4757b351SPierre Pronchery	st1	{v11.4s},[x5]
// Tail dispatch on the remaining block count in x2: none left -> 100f,
// more than one -> the next 1: label. Exactly one block is handled here:
// it is XORed with the bit-reversed tweak v8, run through 32 SM4 rounds in
// general-purpose registers, XORed with the tweak again and stored.
3262*4757b351SPierre Pronchery1:
3263*4757b351SPierre Pronchery	// process last block
3264*4757b351SPierre Pronchery	cmp	x2,#1
3265*4757b351SPierre Pronchery	b.lt	100f
3266*4757b351SPierre Pronchery	b.gt	1f
3267*4757b351SPierre Pronchery	ld1	{v4.4s},[x0],#16
3268*4757b351SPierre Pronchery	rbit	v8.16b,v8.16b
3269*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
3270*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3271*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3272*4757b351SPierre Pronchery#endif
	// x10 walks the round keys starting at x3; w11 counts 8 iterations of
	// four rounds each; w12-w15 are the state words B0-B3.
3273*4757b351SPierre Pronchery	mov	x10,x3
3274*4757b351SPierre Pronchery	mov	w11,#8
3275*4757b351SPierre Pronchery	mov	w12,v4.s[0]
3276*4757b351SPierre Pronchery	mov	w13,v4.s[1]
3277*4757b351SPierre Pronchery	mov	w14,v4.s[2]
3278*4757b351SPierre Pronchery	mov	w15,v4.s[3]
3279*4757b351SPierre Pronchery10:
	// w7/w8 = the next two round keys
3280*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3281*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3282*4757b351SPierre Pronchery	eor	w6,w14,w15
3283*4757b351SPierre Pronchery	eor	w9,w7,w13
3284*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution of w6 (lane s[0] of v0; the other lanes are not
	// read back). The index is also offset by 64/128/192 so that each tbl
	// covers one 64-byte quarter of the table in v16-v31 (presumably the
	// 256-byte S-box, loaded outside this chunk). tbl yields 0 for an
	// out-of-range index, so adding the four results gives the
	// substituted word.
3285*4757b351SPierre Pronchery	movi	v1.16b,#64
3286*4757b351SPierre Pronchery	movi	v2.16b,#128
3287*4757b351SPierre Pronchery	movi	v3.16b,#192
3288*4757b351SPierre Pronchery	mov	v0.s[0],w6
3289*4757b351SPierre Pronchery
3290*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3291*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3292*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3293*4757b351SPierre Pronchery
3294*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3295*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3296*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3297*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3298*4757b351SPierre Pronchery
3299*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3300*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3301*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3302*4757b351SPierre Pronchery	add	w7,w6,w7
3303*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3304*4757b351SPierre Pronchery	add	w7,w7,w9
3305*4757b351SPierre Pronchery	add	w7,w7,w6
3306*4757b351SPierre Pronchery
	// Linear step: w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24)
	// (a rotate right by 32-n is a rotate left by n).
3307*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3308*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3309*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3310*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3311*4757b351SPierre Pronchery	eor	w12,w12,w6
3312*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3313*4757b351SPierre Pronchery	eor	w6,w14,w15
3314*4757b351SPierre Pronchery	eor	w9,w12,w8
3315*4757b351SPierre Pronchery	eor	w6,w6,w9
3316*4757b351SPierre Pronchery	movi	v1.16b,#64
3317*4757b351SPierre Pronchery	movi	v2.16b,#128
3318*4757b351SPierre Pronchery	movi	v3.16b,#192
3319*4757b351SPierre Pronchery	mov	v0.s[0],w6
3320*4757b351SPierre Pronchery
3321*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3322*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3323*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3324*4757b351SPierre Pronchery
3325*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3326*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3327*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3328*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3329*4757b351SPierre Pronchery
3330*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3331*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3332*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3333*4757b351SPierre Pronchery	add	w7,w6,w7
3334*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3335*4757b351SPierre Pronchery	add	w7,w7,w9
3336*4757b351SPierre Pronchery	add	w7,w7,w6
3337*4757b351SPierre Pronchery
3338*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3339*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3340*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3341*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// fetch the round keys for the third and fourth round of this iteration
3342*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3343*4757b351SPierre Pronchery	eor	w13,w13,w6
3344*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3345*4757b351SPierre Pronchery	eor	w6,w12,w13
3346*4757b351SPierre Pronchery	eor	w9,w7,w15
3347*4757b351SPierre Pronchery	eor	w6,w6,w9
3348*4757b351SPierre Pronchery	movi	v1.16b,#64
3349*4757b351SPierre Pronchery	movi	v2.16b,#128
3350*4757b351SPierre Pronchery	movi	v3.16b,#192
3351*4757b351SPierre Pronchery	mov	v0.s[0],w6
3352*4757b351SPierre Pronchery
3353*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3354*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3355*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3356*4757b351SPierre Pronchery
3357*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3358*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3359*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3360*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3361*4757b351SPierre Pronchery
3362*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3363*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3364*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3365*4757b351SPierre Pronchery	add	w7,w6,w7
3366*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3367*4757b351SPierre Pronchery	add	w7,w7,w9
3368*4757b351SPierre Pronchery	add	w7,w7,w6
3369*4757b351SPierre Pronchery
3370*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3371*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3372*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3373*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3374*4757b351SPierre Pronchery	eor	w14,w14,w6
3375*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3376*4757b351SPierre Pronchery	eor	w6,w12,w13
3377*4757b351SPierre Pronchery	eor	w9,w14,w8
3378*4757b351SPierre Pronchery	eor	w6,w6,w9
3379*4757b351SPierre Pronchery	movi	v1.16b,#64
3380*4757b351SPierre Pronchery	movi	v2.16b,#128
3381*4757b351SPierre Pronchery	movi	v3.16b,#192
3382*4757b351SPierre Pronchery	mov	v0.s[0],w6
3383*4757b351SPierre Pronchery
3384*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3385*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3386*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3387*4757b351SPierre Pronchery
3388*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3389*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3390*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3391*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3392*4757b351SPierre Pronchery
3393*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3394*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3395*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3396*4757b351SPierre Pronchery	add	w7,w6,w7
3397*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3398*4757b351SPierre Pronchery	add	w7,w7,w9
3399*4757b351SPierre Pronchery	add	w7,w7,w6
3400*4757b351SPierre Pronchery
3401*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3402*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3403*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3404*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3405*4757b351SPierre Pronchery	eor	w15,w15,w6
3406*4757b351SPierre Pronchery	subs	w11,w11,#1
3407*4757b351SPierre Pronchery	b.ne	10b
	// Write the state back in reverse word order (B3,B2,B1,B0).
3408*4757b351SPierre Pronchery	mov	v4.s[0],w15
3409*4757b351SPierre Pronchery	mov	v4.s[1],w14
3410*4757b351SPierre Pronchery	mov	v4.s[2],w13
3411*4757b351SPierre Pronchery	mov	v4.s[3],w12
3412*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3413*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3414*4757b351SPierre Pronchery#endif
3415*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
3416*4757b351SPierre Pronchery	st1	{v4.4s},[x1],#16
3417*4757b351SPierre Pronchery	// save the last tweak
3418*4757b351SPierre Pronchery	st1	{v8.4s},[x5]
3419*4757b351SPierre Pronchery	b	100f
// Two blocks remain (x2 == 2); three remaining blocks go to the next 1:
// label. The two blocks are pushed through the 4-block routine. v6/v7 are
// not loaded here, so the two extra lanes carry stale data whose results
// are never stored.
3420*4757b351SPierre Pronchery1:	//	process last 2 blocks
3421*4757b351SPierre Pronchery	cmp	x2,#2
3422*4757b351SPierre Pronchery	b.gt	1f
3423*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s},[x0],#32
	// Bit-reverse each tweak byte, then XOR the tweaks into the input.
3424*4757b351SPierre Pronchery	rbit	v8.16b,v8.16b
3425*4757b351SPierre Pronchery	rbit	v9.16b,v9.16b
3426*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
3427*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
3428*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3429*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3430*4757b351SPierre Pronchery#endif
3431*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3432*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
3433*4757b351SPierre Pronchery#endif
	// 4x4 word transpose into the layout consumed by _vpsm4_enc_4blks
	// (defined outside this chunk).
3434*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
3435*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
3436*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
3437*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
3438*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
3439*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
3440*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
3441*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
3442*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
	// Transpose the result (v0-v3) back to one block per register.
3443*4757b351SPierre Pronchery	zip1	v4.4s,v0.4s,v1.4s
3444*4757b351SPierre Pronchery	zip2	v5.4s,v0.4s,v1.4s
3445*4757b351SPierre Pronchery	zip1	v6.4s,v2.4s,v3.4s
3446*4757b351SPierre Pronchery	zip2	v7.4s,v2.4s,v3.4s
3447*4757b351SPierre Pronchery	zip1	v0.2d,v4.2d,v6.2d
3448*4757b351SPierre Pronchery	zip2	v1.2d,v4.2d,v6.2d
3449*4757b351SPierre Pronchery	zip1	v2.2d,v5.2d,v7.2d
3450*4757b351SPierre Pronchery	zip2	v3.2d,v5.2d,v7.2d
	// Only the first two result blocks are tweaked and written out.
3451*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v8.16b
3452*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v9.16b
3453*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s},[x1],#32
3454*4757b351SPierre Pronchery	// save the last tweak
3455*4757b351SPierre Pronchery	st1	{v9.4s},[x5]
3456*4757b351SPierre Pronchery	b	100f
// Three blocks remain. They are pushed through the 4-block routine; v7 is
// not loaded here, so the fourth lane carries stale data whose result is
// never stored. Falls through to the 100: label afterwards.
3457*4757b351SPierre Pronchery1:	//	process last 3 blocks
3458*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
	// Bit-reverse each tweak byte, then XOR the tweaks into the input.
3459*4757b351SPierre Pronchery	rbit	v8.16b,v8.16b
3460*4757b351SPierre Pronchery	rbit	v9.16b,v9.16b
3461*4757b351SPierre Pronchery	rbit	v10.16b,v10.16b
3462*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
3463*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
3464*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v10.16b
3465*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3466*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3467*4757b351SPierre Pronchery#endif
3468*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3469*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
3470*4757b351SPierre Pronchery#endif
3471*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3472*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
3473*4757b351SPierre Pronchery#endif
	// 4x4 word transpose into the layout consumed by _vpsm4_enc_4blks
	// (defined outside this chunk).
3474*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
3475*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
3476*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
3477*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
3478*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
3479*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
3480*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
3481*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
3482*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
	// Transpose the result (v0-v3) back to one block per register.
3483*4757b351SPierre Pronchery	zip1	v4.4s,v0.4s,v1.4s
3484*4757b351SPierre Pronchery	zip2	v5.4s,v0.4s,v1.4s
3485*4757b351SPierre Pronchery	zip1	v6.4s,v2.4s,v3.4s
3486*4757b351SPierre Pronchery	zip2	v7.4s,v2.4s,v3.4s
3487*4757b351SPierre Pronchery	zip1	v0.2d,v4.2d,v6.2d
3488*4757b351SPierre Pronchery	zip2	v1.2d,v4.2d,v6.2d
3489*4757b351SPierre Pronchery	zip1	v2.2d,v5.2d,v7.2d
3490*4757b351SPierre Pronchery	zip2	v3.2d,v5.2d,v7.2d
	// Only the first three result blocks are tweaked and written out.
3491*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v8.16b
3492*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v9.16b
3493*4757b351SPierre Pronchery	eor	v2.16b, v2.16b, v10.16b
3494*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
3495*4757b351SPierre Pronchery	// save the last tweak
3496*4757b351SPierre Pronchery	st1	{v10.4s},[x5]
// Join point of all block-processing paths. x29 is used further down as the
// byte count of the copy loop at .loop_gb (presumably the length of a
// trailing partial block; it is set outside this chunk). When it is zero
// there is nothing more to do.
3497*4757b351SPierre Pronchery100:
3498*4757b351SPierre Pronchery	cmp	x29,0
3499*4757b351SPierre Pronchery	b.eq	.return_gb
3500*4757b351SPierre Pronchery
3501*4757b351SPierre Pronchery// This branch calculates the last two tweaks,
3502*4757b351SPierre Pronchery// while the encryption/decryption length is larger than 32
// v8 = tweak saved at [x5] by the block paths above, v9 = its successor,
// v10 = the successor of v9. The saved tweak is treated as bit-reversed:
// it is passed through rbit before the update and the result is reversed
// again afterwards.
// NOTE(review): the 1-4 block paths store the tweak to [x5] after an rbit,
// while the visible part of the 8-block loop stores v11 without one.
// Confirm both forms are what this code expects.
3503*4757b351SPierre Pronchery.last_2blks_tweak_gb:
3504*4757b351SPierre Pronchery	ld1	{v8.4s},[x5]
3505*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3506*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
3507*4757b351SPierre Pronchery#endif
3508*4757b351SPierre Pronchery	rbit	v2.16b,v8.16b
	// .Lxts_magic is defined outside this chunk; presumably it holds the
	// per-byte carry multipliers (0x87 for byte 0, matching the scalar
	// tweak update in the 8-block loop) - confirm.
3509*4757b351SPierre Pronchery	adrp	x10,.Lxts_magic
3510*4757b351SPierre Pronchery	ldr	q0, [x10, #:lo12:.Lxts_magic]
	// Shift every byte left by one. The top bit of the preceding byte
	// (byte 15 wraps around to byte 0) is scaled per lane by the constant
	// and XORed in as the carry.
3511*4757b351SPierre Pronchery	shl	v9.16b, v2.16b, #1
3512*4757b351SPierre Pronchery	ext	v1.16b, v2.16b, v2.16b,#15
3513*4757b351SPierre Pronchery	ushr	v1.16b, v1.16b, #7
3514*4757b351SPierre Pronchery	mul	v1.16b, v1.16b, v0.16b
3515*4757b351SPierre Pronchery	eor	v9.16b, v9.16b, v1.16b
3516*4757b351SPierre Pronchery	rbit	v9.16b,v9.16b
	// Same update again, v9 -> v10. This rbit undoes the one just above.
	// NOTE(review): v0 still holds .Lxts_magic from the first load, so the
	// reload below looks redundant (generator output).
3517*4757b351SPierre Pronchery	rbit	v2.16b,v9.16b
3518*4757b351SPierre Pronchery	adrp	x10,.Lxts_magic
3519*4757b351SPierre Pronchery	ldr	q0, [x10, #:lo12:.Lxts_magic]
3520*4757b351SPierre Pronchery	shl	v10.16b, v2.16b, #1
3521*4757b351SPierre Pronchery	ext	v1.16b, v2.16b, v2.16b,#15
3522*4757b351SPierre Pronchery	ushr	v1.16b, v1.16b, #7
3523*4757b351SPierre Pronchery	mul	v1.16b, v1.16b, v0.16b
3524*4757b351SPierre Pronchery	eor	v10.16b, v10.16b, v1.16b
3525*4757b351SPierre Pronchery	rbit	v10.16b,v10.16b
3526*4757b351SPierre Pronchery	b	.check_dec_gb
3527*4757b351SPierre Pronchery
3528*4757b351SPierre Pronchery
3529*4757b351SPierre Pronchery// This branch calculates the last two tweaks,
3530*4757b351SPierre Pronchery// while the encryption/decryption length is equal to 32, which needs only two tweaks
// v9 = the tweak already in v8 (set up outside this chunk), v10 = its
// successor, computed with the same shift-and-carry update as in
// .last_2blks_tweak_gb.
3531*4757b351SPierre Pronchery.only_2blks_tweak_gb:
3532*4757b351SPierre Pronchery	mov	v9.16b,v8.16b
3533*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3534*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
3535*4757b351SPierre Pronchery#endif
	// Bit-reverse each byte, update, then reverse back.
3536*4757b351SPierre Pronchery	rbit	v2.16b,v9.16b
	// .Lxts_magic is defined outside this chunk (per-lane carry multipliers).
3537*4757b351SPierre Pronchery	adrp	x10,.Lxts_magic
3538*4757b351SPierre Pronchery	ldr	q0, [x10, #:lo12:.Lxts_magic]
	// Shift each byte left by one and XOR in the scaled top bit of the
	// preceding byte (byte 15 wraps around to byte 0).
3539*4757b351SPierre Pronchery	shl	v10.16b, v2.16b, #1
3540*4757b351SPierre Pronchery	ext	v1.16b, v2.16b, v2.16b,#15
3541*4757b351SPierre Pronchery	ushr	v1.16b, v1.16b, #7
3542*4757b351SPierre Pronchery	mul	v1.16b, v1.16b, v0.16b
3543*4757b351SPierre Pronchery	eor	v10.16b, v10.16b, v1.16b
3544*4757b351SPierre Pronchery	rbit	v10.16b,v10.16b
	// The target label follows immediately; the branch mirrors the one at
	// the end of .last_2blks_tweak_gb.
3545*4757b351SPierre Pronchery	b	.check_dec_gb
3546*4757b351SPierre Pronchery
3547*4757b351SPierre Pronchery
3548*4757b351SPierre Pronchery// Determine whether encryption or decryption is required.
3549*4757b351SPierre Pronchery// The last two tweaks need to be swapped for decryption.
// w28 carries the direction flag (set outside this chunk). v9 is the tweak
// applied to the next block read, v10 the one applied after the byte
// exchange at .loop_gb.
3550*4757b351SPierre Pronchery.check_dec_gb:
3551*4757b351SPierre Pronchery	// encryption:1 decryption:0
3552*4757b351SPierre Pronchery	cmp	w28,1
3553*4757b351SPierre Pronchery	b.eq	.process_last_2blks_gb
	// Decryption: exchange v9 and v10, using v0 as scratch.
3554*4757b351SPierre Pronchery	mov	v0.16B,v9.16b
3555*4757b351SPierre Pronchery	mov	v9.16B,v10.16b
3556*4757b351SPierre Pronchery	mov	v10.16B,v0.16b
3557*4757b351SPierre Pronchery
// First of the two final blocks: one 16-byte block is read from x0, XORed
// with tweak v9, run through 32 SM4 rounds in general-purpose registers,
// XORed with v9 again and stored at x1. v9/v10 are used as they arrive
// (no rbit here); v10 is consumed after the copy loop at .loop_gb.
3558*4757b351SPierre Pronchery.process_last_2blks_gb:
3559*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3560*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
3561*4757b351SPierre Pronchery#endif
3562*4757b351SPierre Pronchery#ifdef __AARCH64EB__
3563*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
3564*4757b351SPierre Pronchery#endif
3565*4757b351SPierre Pronchery	ld1	{v4.4s},[x0],#16
3566*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v9.16b
3567*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3568*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3569*4757b351SPierre Pronchery#endif
	// x10 walks the round keys starting at x3; w11 counts 8 iterations of
	// four rounds each; w12-w15 are the state words B0-B3.
3570*4757b351SPierre Pronchery	mov	x10,x3
3571*4757b351SPierre Pronchery	mov	w11,#8
3572*4757b351SPierre Pronchery	mov	w12,v4.s[0]
3573*4757b351SPierre Pronchery	mov	w13,v4.s[1]
3574*4757b351SPierre Pronchery	mov	w14,v4.s[2]
3575*4757b351SPierre Pronchery	mov	w15,v4.s[3]
3576*4757b351SPierre Pronchery10:
	// w7/w8 = the next two round keys
3577*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3578*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3579*4757b351SPierre Pronchery	eor	w6,w14,w15
3580*4757b351SPierre Pronchery	eor	w9,w7,w13
3581*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution of w6 (lane s[0] of v0; the other lanes are not
	// read back). The index is also offset by 64/128/192 so that each tbl
	// covers one 64-byte quarter of the table in v16-v31 (presumably the
	// 256-byte S-box, loaded outside this chunk). tbl yields 0 for an
	// out-of-range index, so adding the four results gives the
	// substituted word.
3582*4757b351SPierre Pronchery	movi	v1.16b,#64
3583*4757b351SPierre Pronchery	movi	v2.16b,#128
3584*4757b351SPierre Pronchery	movi	v3.16b,#192
3585*4757b351SPierre Pronchery	mov	v0.s[0],w6
3586*4757b351SPierre Pronchery
3587*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3588*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3589*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3590*4757b351SPierre Pronchery
3591*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3592*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3593*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3594*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3595*4757b351SPierre Pronchery
3596*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3597*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3598*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3599*4757b351SPierre Pronchery	add	w7,w6,w7
3600*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3601*4757b351SPierre Pronchery	add	w7,w7,w9
3602*4757b351SPierre Pronchery	add	w7,w7,w6
3603*4757b351SPierre Pronchery
	// Linear step: w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24)
	// (a rotate right by 32-n is a rotate left by n).
3604*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3605*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3606*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3607*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3608*4757b351SPierre Pronchery	eor	w12,w12,w6
3609*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3610*4757b351SPierre Pronchery	eor	w6,w14,w15
3611*4757b351SPierre Pronchery	eor	w9,w12,w8
3612*4757b351SPierre Pronchery	eor	w6,w6,w9
3613*4757b351SPierre Pronchery	movi	v1.16b,#64
3614*4757b351SPierre Pronchery	movi	v2.16b,#128
3615*4757b351SPierre Pronchery	movi	v3.16b,#192
3616*4757b351SPierre Pronchery	mov	v0.s[0],w6
3617*4757b351SPierre Pronchery
3618*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3619*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3620*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3621*4757b351SPierre Pronchery
3622*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3623*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3624*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3625*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3626*4757b351SPierre Pronchery
3627*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3628*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3629*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3630*4757b351SPierre Pronchery	add	w7,w6,w7
3631*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3632*4757b351SPierre Pronchery	add	w7,w7,w9
3633*4757b351SPierre Pronchery	add	w7,w7,w6
3634*4757b351SPierre Pronchery
3635*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3636*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3637*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3638*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// fetch the round keys for the third and fourth round of this iteration
3639*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3640*4757b351SPierre Pronchery	eor	w13,w13,w6
3641*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3642*4757b351SPierre Pronchery	eor	w6,w12,w13
3643*4757b351SPierre Pronchery	eor	w9,w7,w15
3644*4757b351SPierre Pronchery	eor	w6,w6,w9
3645*4757b351SPierre Pronchery	movi	v1.16b,#64
3646*4757b351SPierre Pronchery	movi	v2.16b,#128
3647*4757b351SPierre Pronchery	movi	v3.16b,#192
3648*4757b351SPierre Pronchery	mov	v0.s[0],w6
3649*4757b351SPierre Pronchery
3650*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3651*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3652*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3653*4757b351SPierre Pronchery
3654*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3655*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3656*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3657*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3658*4757b351SPierre Pronchery
3659*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3660*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3661*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3662*4757b351SPierre Pronchery	add	w7,w6,w7
3663*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3664*4757b351SPierre Pronchery	add	w7,w7,w9
3665*4757b351SPierre Pronchery	add	w7,w7,w6
3666*4757b351SPierre Pronchery
3667*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3668*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3669*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3670*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3671*4757b351SPierre Pronchery	eor	w14,w14,w6
3672*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3673*4757b351SPierre Pronchery	eor	w6,w12,w13
3674*4757b351SPierre Pronchery	eor	w9,w14,w8
3675*4757b351SPierre Pronchery	eor	w6,w6,w9
3676*4757b351SPierre Pronchery	movi	v1.16b,#64
3677*4757b351SPierre Pronchery	movi	v2.16b,#128
3678*4757b351SPierre Pronchery	movi	v3.16b,#192
3679*4757b351SPierre Pronchery	mov	v0.s[0],w6
3680*4757b351SPierre Pronchery
3681*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3682*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3683*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3684*4757b351SPierre Pronchery
3685*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3686*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3687*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3688*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3689*4757b351SPierre Pronchery
3690*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3691*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3692*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3693*4757b351SPierre Pronchery	add	w7,w6,w7
3694*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3695*4757b351SPierre Pronchery	add	w7,w7,w9
3696*4757b351SPierre Pronchery	add	w7,w7,w6
3697*4757b351SPierre Pronchery
3698*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3699*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3700*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3701*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3702*4757b351SPierre Pronchery	eor	w15,w15,w6
3703*4757b351SPierre Pronchery	subs	w11,w11,#1
3704*4757b351SPierre Pronchery	b.ne	10b
	// Write the state back in reverse word order (B3,B2,B1,B0).
3705*4757b351SPierre Pronchery	mov	v4.s[0],w15
3706*4757b351SPierre Pronchery	mov	v4.s[1],w14
3707*4757b351SPierre Pronchery	mov	v4.s[2],w13
3708*4757b351SPierre Pronchery	mov	v4.s[3],w12
3709*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3710*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3711*4757b351SPierre Pronchery#endif
3712*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v9.16b
	// x1 advances by 16, so x1 - 16 addresses the block just written.
3713*4757b351SPierre Pronchery	st1	{v4.4s},[x1],#16
3714*4757b351SPierre Pronchery
	// Byte exchange for the partial final block (the pattern of XTS
	// ciphertext stealing). x26 = start of the 16-byte block stored just
	// before this point; x29 = number of trailing bytes. For i = x29-1
	// down to 0:
	//   out_tail[i]   = last_block[i]  (written at x1 + i)
	//   last_block[i] = in_tail[i]     (read from x0 + i)
	// The byte loads and stores leave the flags untouched, so b.gt tests the
	// result of the subs at the top of the loop.
3715*4757b351SPierre Pronchery	sub	x26,x1,16
3716*4757b351SPierre Pronchery.loop_gb:
3717*4757b351SPierre Pronchery	subs	x29,x29,1
3718*4757b351SPierre Pronchery	ldrb	w7,[x26,x29]
3719*4757b351SPierre Pronchery	ldrb	w8,[x0,x29]
3720*4757b351SPierre Pronchery	strb	w8,[x26,x29]
3721*4757b351SPierre Pronchery	strb	w7,[x1,x29]
3722*4757b351SPierre Pronchery	b.gt	.loop_gb
3723*4757b351SPierre Pronchery	ld1	{v4.4s}, [x26]
3724*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v10.16b
3725*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3726*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3727*4757b351SPierre Pronchery#endif
3728*4757b351SPierre Pronchery	mov	x10,x3
3729*4757b351SPierre Pronchery	mov	w11,#8
3730*4757b351SPierre Pronchery	mov	w12,v4.s[0]
3731*4757b351SPierre Pronchery	mov	w13,v4.s[1]
3732*4757b351SPierre Pronchery	mov	w14,v4.s[2]
3733*4757b351SPierre Pronchery	mov	w15,v4.s[3]
3734*4757b351SPierre Pronchery10:
3735*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3736*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3737*4757b351SPierre Pronchery	eor	w6,w14,w15
3738*4757b351SPierre Pronchery	eor	w9,w7,w13
3739*4757b351SPierre Pronchery	eor	w6,w6,w9
3740*4757b351SPierre Pronchery	movi	v1.16b,#64
3741*4757b351SPierre Pronchery	movi	v2.16b,#128
3742*4757b351SPierre Pronchery	movi	v3.16b,#192
3743*4757b351SPierre Pronchery	mov	v0.s[0],w6
3744*4757b351SPierre Pronchery
3745*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3746*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3747*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3748*4757b351SPierre Pronchery
3749*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3750*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3751*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3752*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3753*4757b351SPierre Pronchery
3754*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3755*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3756*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3757*4757b351SPierre Pronchery	add	w7,w6,w7
3758*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3759*4757b351SPierre Pronchery	add	w7,w7,w9
3760*4757b351SPierre Pronchery	add	w7,w7,w6
3761*4757b351SPierre Pronchery
3762*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3763*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3764*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3765*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3766*4757b351SPierre Pronchery	eor	w12,w12,w6
3767*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3768*4757b351SPierre Pronchery	eor	w6,w14,w15
3769*4757b351SPierre Pronchery	eor	w9,w12,w8
3770*4757b351SPierre Pronchery	eor	w6,w6,w9
3771*4757b351SPierre Pronchery	movi	v1.16b,#64
3772*4757b351SPierre Pronchery	movi	v2.16b,#128
3773*4757b351SPierre Pronchery	movi	v3.16b,#192
3774*4757b351SPierre Pronchery	mov	v0.s[0],w6
3775*4757b351SPierre Pronchery
3776*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3777*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3778*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3779*4757b351SPierre Pronchery
3780*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3781*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3782*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3783*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3784*4757b351SPierre Pronchery
3785*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3786*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3787*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3788*4757b351SPierre Pronchery	add	w7,w6,w7
3789*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3790*4757b351SPierre Pronchery	add	w7,w7,w9
3791*4757b351SPierre Pronchery	add	w7,w7,w6
3792*4757b351SPierre Pronchery
3793*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3794*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3795*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3796*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3797*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3798*4757b351SPierre Pronchery	eor	w13,w13,w6
3799*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3800*4757b351SPierre Pronchery	eor	w6,w12,w13
3801*4757b351SPierre Pronchery	eor	w9,w7,w15
3802*4757b351SPierre Pronchery	eor	w6,w6,w9
3803*4757b351SPierre Pronchery	movi	v1.16b,#64
3804*4757b351SPierre Pronchery	movi	v2.16b,#128
3805*4757b351SPierre Pronchery	movi	v3.16b,#192
3806*4757b351SPierre Pronchery	mov	v0.s[0],w6
3807*4757b351SPierre Pronchery
3808*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3809*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3810*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3811*4757b351SPierre Pronchery
3812*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3813*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3814*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3815*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3816*4757b351SPierre Pronchery
3817*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3818*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3819*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3820*4757b351SPierre Pronchery	add	w7,w6,w7
3821*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3822*4757b351SPierre Pronchery	add	w7,w7,w9
3823*4757b351SPierre Pronchery	add	w7,w7,w6
3824*4757b351SPierre Pronchery
3825*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3826*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3827*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3828*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3829*4757b351SPierre Pronchery	eor	w14,w14,w6
3830*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
3831*4757b351SPierre Pronchery	eor	w6,w12,w13
3832*4757b351SPierre Pronchery	eor	w9,w14,w8
3833*4757b351SPierre Pronchery	eor	w6,w6,w9
3834*4757b351SPierre Pronchery	movi	v1.16b,#64
3835*4757b351SPierre Pronchery	movi	v2.16b,#128
3836*4757b351SPierre Pronchery	movi	v3.16b,#192
3837*4757b351SPierre Pronchery	mov	v0.s[0],w6
3838*4757b351SPierre Pronchery
3839*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3840*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3841*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3842*4757b351SPierre Pronchery
3843*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3844*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3845*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3846*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3847*4757b351SPierre Pronchery
3848*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3849*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3850*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3851*4757b351SPierre Pronchery	add	w7,w6,w7
3852*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3853*4757b351SPierre Pronchery	add	w7,w7,w9
3854*4757b351SPierre Pronchery	add	w7,w7,w6
3855*4757b351SPierre Pronchery
3856*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3857*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3858*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3859*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3860*4757b351SPierre Pronchery	eor	w15,w15,w6
3861*4757b351SPierre Pronchery	subs	w11,w11,#1
3862*4757b351SPierre Pronchery	b.ne	10b
3863*4757b351SPierre Pronchery	mov	v4.s[0],w15
3864*4757b351SPierre Pronchery	mov	v4.s[1],w14
3865*4757b351SPierre Pronchery	mov	v4.s[2],w13
3866*4757b351SPierre Pronchery	mov	v4.s[3],w12
3867*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3868*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
3869*4757b351SPierre Pronchery#endif
3870*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v10.16b
3871*4757b351SPierre Pronchery	st1	{v4.4s}, [x26]
3872*4757b351SPierre Pronchery.return_gb:
3873*4757b351SPierre Pronchery	ldp	d14, d15, [sp], #0x10
3874*4757b351SPierre Pronchery	ldp	d12, d13, [sp], #0x10
3875*4757b351SPierre Pronchery	ldp	d10, d11, [sp], #0x10
3876*4757b351SPierre Pronchery	ldp	d8, d9, [sp], #0x10
3877*4757b351SPierre Pronchery	ldp	x29, x30, [sp], #0x10
3878*4757b351SPierre Pronchery	ldp	x27, x28, [sp], #0x10
3879*4757b351SPierre Pronchery	ldp	x25, x26, [sp], #0x10
3880*4757b351SPierre Pronchery	ldp	x23, x24, [sp], #0x10
3881*4757b351SPierre Pronchery	ldp	x21, x22, [sp], #0x10
3882*4757b351SPierre Pronchery	ldp	x19, x20, [sp], #0x10
3883*4757b351SPierre Pronchery	ldp	x17, x18, [sp], #0x10
3884*4757b351SPierre Pronchery	ldp	x15, x16, [sp], #0x10
3885*4757b351SPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
3886*4757b351SPierre Pronchery	ret
3887*4757b351SPierre Pronchery.size	vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb
3888*4757b351SPierre Pronchery.globl	vpsm4_xts_encrypt
3889*4757b351SPierre Pronchery.type	vpsm4_xts_encrypt,%function
3890*4757b351SPierre Pronchery.align	5
3891*4757b351SPierre Proncheryvpsm4_xts_encrypt:
3892*4757b351SPierre Pronchery	AARCH64_SIGN_LINK_REGISTER
3893*4757b351SPierre Pronchery	stp	x15, x16, [sp, #-0x10]!
3894*4757b351SPierre Pronchery	stp	x17, x18, [sp, #-0x10]!
3895*4757b351SPierre Pronchery	stp	x19, x20, [sp, #-0x10]!
3896*4757b351SPierre Pronchery	stp	x21, x22, [sp, #-0x10]!
3897*4757b351SPierre Pronchery	stp	x23, x24, [sp, #-0x10]!
3898*4757b351SPierre Pronchery	stp	x25, x26, [sp, #-0x10]!
3899*4757b351SPierre Pronchery	stp	x27, x28, [sp, #-0x10]!
3900*4757b351SPierre Pronchery	stp	x29, x30, [sp, #-0x10]!
3901*4757b351SPierre Pronchery	stp	d8, d9, [sp, #-0x10]!
3902*4757b351SPierre Pronchery	stp	d10, d11, [sp, #-0x10]!
3903*4757b351SPierre Pronchery	stp	d12, d13, [sp, #-0x10]!
3904*4757b351SPierre Pronchery	stp	d14, d15, [sp, #-0x10]!
3905*4757b351SPierre Pronchery	mov	x26,x3
3906*4757b351SPierre Pronchery	mov	x27,x4
3907*4757b351SPierre Pronchery	mov	w28,w6
3908*4757b351SPierre Pronchery	ld1	{v8.4s}, [x5]
3909*4757b351SPierre Pronchery	mov	x3,x27
3910*4757b351SPierre Pronchery	adrp	x10,.Lsbox
3911*4757b351SPierre Pronchery	add	x10,x10,#:lo12:.Lsbox
3912*4757b351SPierre Pronchery	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
3913*4757b351SPierre Pronchery	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
3914*4757b351SPierre Pronchery	ld1	{v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
3915*4757b351SPierre Pronchery	ld1	{v28.16b,v29.16b,v30.16b,v31.16b},[x10]
3916*4757b351SPierre Pronchery#ifndef __AARCH64EB__
3917*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
3918*4757b351SPierre Pronchery#endif
3919*4757b351SPierre Pronchery	mov	x10,x3
3920*4757b351SPierre Pronchery	mov	w11,#8
3921*4757b351SPierre Pronchery	mov	w12,v8.s[0]
3922*4757b351SPierre Pronchery	mov	w13,v8.s[1]
3923*4757b351SPierre Pronchery	mov	w14,v8.s[2]
3924*4757b351SPierre Pronchery	mov	w15,v8.s[3]
3925*4757b351SPierre Pronchery10:
3926*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3927*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
3928*4757b351SPierre Pronchery	eor	w6,w14,w15
3929*4757b351SPierre Pronchery	eor	w9,w7,w13
3930*4757b351SPierre Pronchery	eor	w6,w6,w9
3931*4757b351SPierre Pronchery	movi	v1.16b,#64
3932*4757b351SPierre Pronchery	movi	v2.16b,#128
3933*4757b351SPierre Pronchery	movi	v3.16b,#192
3934*4757b351SPierre Pronchery	mov	v0.s[0],w6
3935*4757b351SPierre Pronchery
3936*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3937*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3938*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3939*4757b351SPierre Pronchery
3940*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3941*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3942*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3943*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3944*4757b351SPierre Pronchery
3945*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3946*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3947*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3948*4757b351SPierre Pronchery	add	w7,w6,w7
3949*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3950*4757b351SPierre Pronchery	add	w7,w7,w9
3951*4757b351SPierre Pronchery	add	w7,w7,w6
3952*4757b351SPierre Pronchery
3953*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3954*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3955*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3956*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3957*4757b351SPierre Pronchery	eor	w12,w12,w6
3958*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
3959*4757b351SPierre Pronchery	eor	w6,w14,w15
3960*4757b351SPierre Pronchery	eor	w9,w12,w8
3961*4757b351SPierre Pronchery	eor	w6,w6,w9
3962*4757b351SPierre Pronchery	movi	v1.16b,#64
3963*4757b351SPierre Pronchery	movi	v2.16b,#128
3964*4757b351SPierre Pronchery	movi	v3.16b,#192
3965*4757b351SPierre Pronchery	mov	v0.s[0],w6
3966*4757b351SPierre Pronchery
3967*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
3968*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
3969*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
3970*4757b351SPierre Pronchery
3971*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
3972*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
3973*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
3974*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
3975*4757b351SPierre Pronchery
3976*4757b351SPierre Pronchery	mov	w6,v0.s[0]
3977*4757b351SPierre Pronchery	mov	w7,v1.s[0]
3978*4757b351SPierre Pronchery	mov	w9,v2.s[0]
3979*4757b351SPierre Pronchery	add	w7,w6,w7
3980*4757b351SPierre Pronchery	mov	w6,v3.s[0]
3981*4757b351SPierre Pronchery	add	w7,w7,w9
3982*4757b351SPierre Pronchery	add	w7,w7,w6
3983*4757b351SPierre Pronchery
3984*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
3985*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
3986*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
3987*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
3988*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
3989*4757b351SPierre Pronchery	eor	w13,w13,w6
3990*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
3991*4757b351SPierre Pronchery	eor	w6,w12,w13
3992*4757b351SPierre Pronchery	eor	w9,w7,w15
3993*4757b351SPierre Pronchery	eor	w6,w6,w9
3994*4757b351SPierre Pronchery	movi	v1.16b,#64
3995*4757b351SPierre Pronchery	movi	v2.16b,#128
3996*4757b351SPierre Pronchery	movi	v3.16b,#192
3997*4757b351SPierre Pronchery	mov	v0.s[0],w6
3998*4757b351SPierre Pronchery
3999*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4000*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4001*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4002*4757b351SPierre Pronchery
4003*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4004*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4005*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4006*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4007*4757b351SPierre Pronchery
4008*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4009*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4010*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4011*4757b351SPierre Pronchery	add	w7,w6,w7
4012*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4013*4757b351SPierre Pronchery	add	w7,w7,w9
4014*4757b351SPierre Pronchery	add	w7,w7,w6
4015*4757b351SPierre Pronchery
4016*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4017*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4018*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4019*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4020*4757b351SPierre Pronchery	eor	w14,w14,w6
4021*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
4022*4757b351SPierre Pronchery	eor	w6,w12,w13
4023*4757b351SPierre Pronchery	eor	w9,w14,w8
4024*4757b351SPierre Pronchery	eor	w6,w6,w9
4025*4757b351SPierre Pronchery	movi	v1.16b,#64
4026*4757b351SPierre Pronchery	movi	v2.16b,#128
4027*4757b351SPierre Pronchery	movi	v3.16b,#192
4028*4757b351SPierre Pronchery	mov	v0.s[0],w6
4029*4757b351SPierre Pronchery
4030*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4031*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4032*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4033*4757b351SPierre Pronchery
4034*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4035*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4036*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4037*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4038*4757b351SPierre Pronchery
4039*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4040*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4041*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4042*4757b351SPierre Pronchery	add	w7,w6,w7
4043*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4044*4757b351SPierre Pronchery	add	w7,w7,w9
4045*4757b351SPierre Pronchery	add	w7,w7,w6
4046*4757b351SPierre Pronchery
4047*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4048*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4049*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4050*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4051*4757b351SPierre Pronchery	eor	w15,w15,w6
4052*4757b351SPierre Pronchery	subs	w11,w11,#1
4053*4757b351SPierre Pronchery	b.ne	10b
4054*4757b351SPierre Pronchery	mov	v8.s[0],w15
4055*4757b351SPierre Pronchery	mov	v8.s[1],w14
4056*4757b351SPierre Pronchery	mov	v8.s[2],w13
4057*4757b351SPierre Pronchery	mov	v8.s[3],w12
4058*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4059*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
4060*4757b351SPierre Pronchery#endif
4061*4757b351SPierre Pronchery	mov	x3,x26
4062*4757b351SPierre Pronchery	and	x29,x2,#0x0F
4063*4757b351SPierre Pronchery	// convert length into blocks
4064*4757b351SPierre Pronchery	lsr	x2,x2,4
4065*4757b351SPierre Pronchery	cmp	x2,#1
4066*4757b351SPierre Pronchery	b.lt	.return
4067*4757b351SPierre Pronchery
4068*4757b351SPierre Pronchery	cmp	x29,0
4069*4757b351SPierre Pronchery	// If the encryption/decryption length is a multiple of 16,
4070*4757b351SPierre Pronchery	// all blocks are encrypted/decrypted in .xts_encrypt_blocks
4071*4757b351SPierre Pronchery	b.eq	.xts_encrypt_blocks
4072*4757b351SPierre Pronchery
4073*4757b351SPierre Pronchery	// If the encryption/decryption length is not a multiple of 16,
4074*4757b351SPierre Pronchery	// the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak;
4075*4757b351SPierre Pronchery	// the other blocks are encrypted/decrypted in .xts_encrypt_blocks
4076*4757b351SPierre Pronchery	subs	x2,x2,#1
4077*4757b351SPierre Pronchery	b.eq	.only_2blks_tweak
4078*4757b351SPierre Pronchery.xts_encrypt_blocks:
4079*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4080*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
4081*4757b351SPierre Pronchery#endif
4082*4757b351SPierre Pronchery	mov	x12,v8.d[0]
4083*4757b351SPierre Pronchery	mov	x13,v8.d[1]
4084*4757b351SPierre Pronchery	mov	w7,0x87
4085*4757b351SPierre Pronchery	extr	x9,x13,x13,#32
4086*4757b351SPierre Pronchery	extr	x15,x13,x12,#63
4087*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4088*4757b351SPierre Pronchery	eor	x14,x8,x12,lsl#1
4089*4757b351SPierre Pronchery	mov	w7,0x87
4090*4757b351SPierre Pronchery	extr	x9,x15,x15,#32
4091*4757b351SPierre Pronchery	extr	x17,x15,x14,#63
4092*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4093*4757b351SPierre Pronchery	eor	x16,x8,x14,lsl#1
4094*4757b351SPierre Pronchery	mov	w7,0x87
4095*4757b351SPierre Pronchery	extr	x9,x17,x17,#32
4096*4757b351SPierre Pronchery	extr	x19,x17,x16,#63
4097*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4098*4757b351SPierre Pronchery	eor	x18,x8,x16,lsl#1
4099*4757b351SPierre Pronchery	mov	w7,0x87
4100*4757b351SPierre Pronchery	extr	x9,x19,x19,#32
4101*4757b351SPierre Pronchery	extr	x21,x19,x18,#63
4102*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4103*4757b351SPierre Pronchery	eor	x20,x8,x18,lsl#1
4104*4757b351SPierre Pronchery	mov	w7,0x87
4105*4757b351SPierre Pronchery	extr	x9,x21,x21,#32
4106*4757b351SPierre Pronchery	extr	x23,x21,x20,#63
4107*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4108*4757b351SPierre Pronchery	eor	x22,x8,x20,lsl#1
4109*4757b351SPierre Pronchery	mov	w7,0x87
4110*4757b351SPierre Pronchery	extr	x9,x23,x23,#32
4111*4757b351SPierre Pronchery	extr	x25,x23,x22,#63
4112*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4113*4757b351SPierre Pronchery	eor	x24,x8,x22,lsl#1
4114*4757b351SPierre Pronchery	mov	w7,0x87
4115*4757b351SPierre Pronchery	extr	x9,x25,x25,#32
4116*4757b351SPierre Pronchery	extr	x27,x25,x24,#63
4117*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4118*4757b351SPierre Pronchery	eor	x26,x8,x24,lsl#1
4119*4757b351SPierre Pronchery.Lxts_8_blocks_process:
4120*4757b351SPierre Pronchery	cmp	x2,#8
4121*4757b351SPierre Pronchery	b.lt	.Lxts_4_blocks_process
4122*4757b351SPierre Pronchery	mov	v0.d[0],x12
4123*4757b351SPierre Pronchery	mov	v0.d[1],x13
4124*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4125*4757b351SPierre Pronchery	rev32	v0.16b,v0.16b
4126*4757b351SPierre Pronchery#endif
4127*4757b351SPierre Pronchery	mov	v1.d[0],x14
4128*4757b351SPierre Pronchery	mov	v1.d[1],x15
4129*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4130*4757b351SPierre Pronchery	rev32	v1.16b,v1.16b
4131*4757b351SPierre Pronchery#endif
4132*4757b351SPierre Pronchery	mov	v2.d[0],x16
4133*4757b351SPierre Pronchery	mov	v2.d[1],x17
4134*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4135*4757b351SPierre Pronchery	rev32	v2.16b,v2.16b
4136*4757b351SPierre Pronchery#endif
4137*4757b351SPierre Pronchery	mov	v3.d[0],x18
4138*4757b351SPierre Pronchery	mov	v3.d[1],x19
4139*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4140*4757b351SPierre Pronchery	rev32	v3.16b,v3.16b
4141*4757b351SPierre Pronchery#endif
4142*4757b351SPierre Pronchery	mov	v12.d[0],x20
4143*4757b351SPierre Pronchery	mov	v12.d[1],x21
4144*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4145*4757b351SPierre Pronchery	rev32	v12.16b,v12.16b
4146*4757b351SPierre Pronchery#endif
4147*4757b351SPierre Pronchery	mov	v13.d[0],x22
4148*4757b351SPierre Pronchery	mov	v13.d[1],x23
4149*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4150*4757b351SPierre Pronchery	rev32	v13.16b,v13.16b
4151*4757b351SPierre Pronchery#endif
4152*4757b351SPierre Pronchery	mov	v14.d[0],x24
4153*4757b351SPierre Pronchery	mov	v14.d[1],x25
4154*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4155*4757b351SPierre Pronchery	rev32	v14.16b,v14.16b
4156*4757b351SPierre Pronchery#endif
4157*4757b351SPierre Pronchery	mov	v15.d[0],x26
4158*4757b351SPierre Pronchery	mov	v15.d[1],x27
4159*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4160*4757b351SPierre Pronchery	rev32	v15.16b,v15.16b
4161*4757b351SPierre Pronchery#endif
4162*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
4163*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v0.16b
4164*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v1.16b
4165*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v2.16b
4166*4757b351SPierre Pronchery	eor	v7.16b, v7.16b, v3.16b
4167*4757b351SPierre Pronchery	ld1	{v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
4168*4757b351SPierre Pronchery	eor	v8.16b, v8.16b, v12.16b
4169*4757b351SPierre Pronchery	eor	v9.16b, v9.16b, v13.16b
4170*4757b351SPierre Pronchery	eor	v10.16b, v10.16b, v14.16b
4171*4757b351SPierre Pronchery	eor	v11.16b, v11.16b, v15.16b
4172*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4173*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4174*4757b351SPierre Pronchery#endif
4175*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4176*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
4177*4757b351SPierre Pronchery#endif
4178*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4179*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
4180*4757b351SPierre Pronchery#endif
4181*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4182*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
4183*4757b351SPierre Pronchery#endif
4184*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4185*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
4186*4757b351SPierre Pronchery#endif
4187*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4188*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
4189*4757b351SPierre Pronchery#endif
4190*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4191*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
4192*4757b351SPierre Pronchery#endif
4193*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4194*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
4195*4757b351SPierre Pronchery#endif
4196*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
4197*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
4198*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
4199*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
4200*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
4201*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
4202*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
4203*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
4204*4757b351SPierre Pronchery	zip1	v0.4s,v8.4s,v9.4s
4205*4757b351SPierre Pronchery	zip2	v1.4s,v8.4s,v9.4s
4206*4757b351SPierre Pronchery	zip1	v2.4s,v10.4s,v11.4s
4207*4757b351SPierre Pronchery	zip2	v3.4s,v10.4s,v11.4s
4208*4757b351SPierre Pronchery	zip1	v8.2d,v0.2d,v2.2d
4209*4757b351SPierre Pronchery	zip2	v9.2d,v0.2d,v2.2d
4210*4757b351SPierre Pronchery	zip1	v10.2d,v1.2d,v3.2d
4211*4757b351SPierre Pronchery	zip2	v11.2d,v1.2d,v3.2d
4212*4757b351SPierre Pronchery	bl	_vpsm4_enc_8blks
4213*4757b351SPierre Pronchery	zip1	v8.4s,v0.4s,v1.4s
4214*4757b351SPierre Pronchery	zip2	v9.4s,v0.4s,v1.4s
4215*4757b351SPierre Pronchery	zip1	v10.4s,v2.4s,v3.4s
4216*4757b351SPierre Pronchery	zip2	v11.4s,v2.4s,v3.4s
4217*4757b351SPierre Pronchery	zip1	v0.2d,v8.2d,v10.2d
4218*4757b351SPierre Pronchery	zip2	v1.2d,v8.2d,v10.2d
4219*4757b351SPierre Pronchery	zip1	v2.2d,v9.2d,v11.2d
4220*4757b351SPierre Pronchery	zip2	v3.2d,v9.2d,v11.2d
4221*4757b351SPierre Pronchery	zip1	v8.4s,v4.4s,v5.4s
4222*4757b351SPierre Pronchery	zip2	v9.4s,v4.4s,v5.4s
4223*4757b351SPierre Pronchery	zip1	v10.4s,v6.4s,v7.4s
4224*4757b351SPierre Pronchery	zip2	v11.4s,v6.4s,v7.4s
4225*4757b351SPierre Pronchery	zip1	v4.2d,v8.2d,v10.2d
4226*4757b351SPierre Pronchery	zip2	v5.2d,v8.2d,v10.2d
4227*4757b351SPierre Pronchery	zip1	v6.2d,v9.2d,v11.2d
4228*4757b351SPierre Pronchery	zip2	v7.2d,v9.2d,v11.2d
4229*4757b351SPierre Pronchery	mov	v12.d[0],x12
4230*4757b351SPierre Pronchery	mov	v12.d[1],x13
4231*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4232*4757b351SPierre Pronchery	rev32	v12.16b,v12.16b
4233*4757b351SPierre Pronchery#endif
4234*4757b351SPierre Pronchery	mov	w7,0x87
4235*4757b351SPierre Pronchery	extr	x9,x27,x27,#32
4236*4757b351SPierre Pronchery	extr	x13,x27,x26,#63
4237*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4238*4757b351SPierre Pronchery	eor	x12,x8,x26,lsl#1
4239*4757b351SPierre Pronchery	mov	v13.d[0],x14
4240*4757b351SPierre Pronchery	mov	v13.d[1],x15
4241*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4242*4757b351SPierre Pronchery	rev32	v13.16b,v13.16b
4243*4757b351SPierre Pronchery#endif
4244*4757b351SPierre Pronchery	mov	w7,0x87
4245*4757b351SPierre Pronchery	extr	x9,x13,x13,#32
4246*4757b351SPierre Pronchery	extr	x15,x13,x12,#63
4247*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4248*4757b351SPierre Pronchery	eor	x14,x8,x12,lsl#1
4249*4757b351SPierre Pronchery	mov	v14.d[0],x16
4250*4757b351SPierre Pronchery	mov	v14.d[1],x17
4251*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4252*4757b351SPierre Pronchery	rev32	v14.16b,v14.16b
4253*4757b351SPierre Pronchery#endif
4254*4757b351SPierre Pronchery	mov	w7,0x87
4255*4757b351SPierre Pronchery	extr	x9,x15,x15,#32
4256*4757b351SPierre Pronchery	extr	x17,x15,x14,#63
4257*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4258*4757b351SPierre Pronchery	eor	x16,x8,x14,lsl#1
4259*4757b351SPierre Pronchery	mov	v15.d[0],x18
4260*4757b351SPierre Pronchery	mov	v15.d[1],x19
4261*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4262*4757b351SPierre Pronchery	rev32	v15.16b,v15.16b
4263*4757b351SPierre Pronchery#endif
4264*4757b351SPierre Pronchery	mov	w7,0x87
4265*4757b351SPierre Pronchery	extr	x9,x17,x17,#32
4266*4757b351SPierre Pronchery	extr	x19,x17,x16,#63
4267*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4268*4757b351SPierre Pronchery	eor	x18,x8,x16,lsl#1
4269*4757b351SPierre Pronchery	mov	v8.d[0],x20
4270*4757b351SPierre Pronchery	mov	v8.d[1],x21
4271*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4272*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
4273*4757b351SPierre Pronchery#endif
4274*4757b351SPierre Pronchery	mov	w7,0x87
4275*4757b351SPierre Pronchery	extr	x9,x19,x19,#32
4276*4757b351SPierre Pronchery	extr	x21,x19,x18,#63
4277*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4278*4757b351SPierre Pronchery	eor	x20,x8,x18,lsl#1
4279*4757b351SPierre Pronchery	mov	v9.d[0],x22
4280*4757b351SPierre Pronchery	mov	v9.d[1],x23
4281*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4282*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
4283*4757b351SPierre Pronchery#endif
4284*4757b351SPierre Pronchery	mov	w7,0x87
4285*4757b351SPierre Pronchery	extr	x9,x21,x21,#32
4286*4757b351SPierre Pronchery	extr	x23,x21,x20,#63
4287*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4288*4757b351SPierre Pronchery	eor	x22,x8,x20,lsl#1
4289*4757b351SPierre Pronchery	mov	v10.d[0],x24
4290*4757b351SPierre Pronchery	mov	v10.d[1],x25
4291*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4292*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
4293*4757b351SPierre Pronchery#endif
4294*4757b351SPierre Pronchery	mov	w7,0x87
4295*4757b351SPierre Pronchery	extr	x9,x23,x23,#32
4296*4757b351SPierre Pronchery	extr	x25,x23,x22,#63
4297*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4298*4757b351SPierre Pronchery	eor	x24,x8,x22,lsl#1
4299*4757b351SPierre Pronchery	mov	v11.d[0],x26
4300*4757b351SPierre Pronchery	mov	v11.d[1],x27
4301*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4302*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
4303*4757b351SPierre Pronchery#endif
4304*4757b351SPierre Pronchery	mov	w7,0x87
4305*4757b351SPierre Pronchery	extr	x9,x25,x25,#32
4306*4757b351SPierre Pronchery	extr	x27,x25,x24,#63
4307*4757b351SPierre Pronchery	and	w8,w7,w9,asr#31
4308*4757b351SPierre Pronchery	eor	x26,x8,x24,lsl#1
4309*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v12.16b
4310*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v13.16b
4311*4757b351SPierre Pronchery	eor	v2.16b, v2.16b, v14.16b
4312*4757b351SPierre Pronchery	eor	v3.16b, v3.16b, v15.16b
4313*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
4314*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
4315*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v10.16b
4316*4757b351SPierre Pronchery	eor	v7.16b, v7.16b, v11.16b
4317*4757b351SPierre Pronchery
4318*4757b351SPierre Pronchery	// save the last tweak
4319*4757b351SPierre Pronchery	st1	{v11.4s},[x5]
4320*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
4321*4757b351SPierre Pronchery	st1	{v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
4322*4757b351SPierre Pronchery	subs	x2,x2,#8
4323*4757b351SPierre Pronchery	b.gt	.Lxts_8_blocks_process
4324*4757b351SPierre Pronchery	b	100f
4325*4757b351SPierre Pronchery.Lxts_4_blocks_process:
	// Purpose: build four tweaks in v8-v11, process one group of four
	// blocks if at least four remain (x2 = remaining block count), then
	// leave v8-v10 ready for a 1-3 block tail handled at local label 1.
	// x0 = input pointer, x1 = output pointer, x5 = tweak save slot.
	//
	// Each tweak is assembled from a GPR pair (d[0] = low 64 bits,
	// d[1] = high 64 bits): v8 = x13:x12, v9 = x15:x14, v10 = x17:x16,
	// v11 = x19:x18.  On big-endian builds the bytes of every 32-bit
	// word are reversed after assembly.
	// NOTE(review): x18 is the AAPCS64 platform register; confirm that
	// using it as a tweak holder is acceptable on the target platform.
4326*4757b351SPierre Pronchery	mov	v8.d[0],x12
4327*4757b351SPierre Pronchery	mov	v8.d[1],x13
4328*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4329*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
4330*4757b351SPierre Pronchery#endif
4331*4757b351SPierre Pronchery	mov	v9.d[0],x14
4332*4757b351SPierre Pronchery	mov	v9.d[1],x15
4333*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4334*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
4335*4757b351SPierre Pronchery#endif
4336*4757b351SPierre Pronchery	mov	v10.d[0],x16
4337*4757b351SPierre Pronchery	mov	v10.d[1],x17
4338*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4339*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
4340*4757b351SPierre Pronchery#endif
4341*4757b351SPierre Pronchery	mov	v11.d[0],x18
4342*4757b351SPierre Pronchery	mov	v11.d[1],x19
4343*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4344*4757b351SPierre Pronchery	rev32	v11.16b,v11.16b
4345*4757b351SPierre Pronchery#endif
	// Fewer than four blocks left: skip to the tail code.  v8-v10 then
	// still hold the first three tweaks built above.
4346*4757b351SPierre Pronchery	cmp	x2,#4
4347*4757b351SPierre Pronchery	b.lt	1f
	// Load four blocks and XOR each with its tweak.
4348*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
4349*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
4350*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
4351*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v10.16b
4352*4757b351SPierre Pronchery	eor	v7.16b, v7.16b, v11.16b
	// Little-endian builds: reverse the bytes inside every 32-bit word.
4353*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4354*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4355*4757b351SPierre Pronchery#endif
4356*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4357*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
4358*4757b351SPierre Pronchery#endif
4359*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4360*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
4361*4757b351SPierre Pronchery#endif
4362*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4363*4757b351SPierre Pronchery	rev32	v7.16b,v7.16b
4364*4757b351SPierre Pronchery#endif
	// 4x4 transpose of 32-bit words: afterwards v4 holds word 0 of all
	// four blocks, v5 word 1, v6 word 2 and v7 word 3.
4365*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
4366*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
4367*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
4368*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
4369*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
4370*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
4371*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
4372*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
	// Helper is defined outside this excerpt; judging by the register use
	// around the call it consumes v4-v7 and leaves its result in v0-v3.
4373*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
	// Transpose back so that v0-v3 each hold one complete block again.
4374*4757b351SPierre Pronchery	zip1	v4.4s,v0.4s,v1.4s
4375*4757b351SPierre Pronchery	zip2	v5.4s,v0.4s,v1.4s
4376*4757b351SPierre Pronchery	zip1	v6.4s,v2.4s,v3.4s
4377*4757b351SPierre Pronchery	zip2	v7.4s,v2.4s,v3.4s
4378*4757b351SPierre Pronchery	zip1	v0.2d,v4.2d,v6.2d
4379*4757b351SPierre Pronchery	zip2	v1.2d,v4.2d,v6.2d
4380*4757b351SPierre Pronchery	zip1	v2.2d,v5.2d,v7.2d
4381*4757b351SPierre Pronchery	zip2	v3.2d,v5.2d,v7.2d
	// Second tweak XOR, then store the four output blocks.
4382*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v8.16b
4383*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v9.16b
4384*4757b351SPierre Pronchery	eor	v2.16b, v2.16b, v10.16b
4385*4757b351SPierre Pronchery	eor	v3.16b, v3.16b, v11.16b
4386*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
4387*4757b351SPierre Pronchery	sub	x2,x2,#4
	// Reload v8-v10 from x20..x25 for the tail code below (presumably the
	// three tweaks following the four just used; they are computed
	// outside this excerpt).
4388*4757b351SPierre Pronchery	mov	v8.d[0],x20
4389*4757b351SPierre Pronchery	mov	v8.d[1],x21
4390*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4391*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
4392*4757b351SPierre Pronchery#endif
4393*4757b351SPierre Pronchery	mov	v9.d[0],x22
4394*4757b351SPierre Pronchery	mov	v9.d[1],x23
4395*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4396*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
4397*4757b351SPierre Pronchery#endif
4398*4757b351SPierre Pronchery	mov	v10.d[0],x24
4399*4757b351SPierre Pronchery	mov	v10.d[1],x25
4400*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4401*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
4402*4757b351SPierre Pronchery#endif
4403*4757b351SPierre Pronchery	// save the last tweak
4404*4757b351SPierre Pronchery	st1	{v11.4s},[x5]
4405*4757b351SPierre Pronchery1:
	// Tail dispatch on the remaining block count in x2:
	//   x2 < 1  -> nothing left, continue at 100
	//   x2 > 1  -> two or three blocks, handled at the next local label 1
	//   x2 == 1 -> fall through and process a single block with tweak v8
4406*4757b351SPierre Pronchery	// process last block
4407*4757b351SPierre Pronchery	cmp	x2,#1
4408*4757b351SPierre Pronchery	b.lt	100f
4409*4757b351SPierre Pronchery	b.gt	1f
4410*4757b351SPierre Pronchery	ld1	{v4.4s},[x0],#16
4411*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
4412*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4413*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4414*4757b351SPierre Pronchery#endif
	// Scalar cipher on one block: x10 walks the round keys starting at x3,
	// w11 counts 8 iterations, and each iteration performs four rounds
	// (it loads four round-key words).  w12..w15 hold the four state words.
4415*4757b351SPierre Pronchery	mov	x10,x3
4416*4757b351SPierre Pronchery	mov	w11,#8
4417*4757b351SPierre Pronchery	mov	w12,v4.s[0]
4418*4757b351SPierre Pronchery	mov	w13,v4.s[1]
4419*4757b351SPierre Pronchery	mov	w14,v4.s[2]
4420*4757b351SPierre Pronchery	mov	w15,v4.s[3]
4421*4757b351SPierre Pronchery10:
4422*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
4423*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
4424*4757b351SPierre Pronchery	eor	w6,w14,w15
4425*4757b351SPierre Pronchery	eor	w9,w7,w13
4426*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution through a 256-byte table kept in v16-v31
	// (presumably the S-box, loaded outside this excerpt).  The table is
	// split into four 64-byte TBL tables; the index bytes are used as-is
	// and biased by 64, 128 and 192.  TBL returns 0 for an out-of-range
	// index, so at most one of the four lookups is non-zero per byte and
	// adding the four results yields the full 256-entry lookup.
	// Only lane s[0] of v0-v3 is meaningful here.
4427*4757b351SPierre Pronchery	movi	v1.16b,#64
4428*4757b351SPierre Pronchery	movi	v2.16b,#128
4429*4757b351SPierre Pronchery	movi	v3.16b,#192
4430*4757b351SPierre Pronchery	mov	v0.s[0],w6
4431*4757b351SPierre Pronchery
4432*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4433*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4434*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4435*4757b351SPierre Pronchery
4436*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4437*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4438*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4439*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4440*4757b351SPierre Pronchery
4441*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4442*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4443*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4444*4757b351SPierre Pronchery	add	w7,w6,w7
4445*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4446*4757b351SPierre Pronchery	add	w7,w7,w9
4447*4757b351SPierre Pronchery	add	w7,w7,w6
4448*4757b351SPierre Pronchery
	// Linear step: "ror #32-n" is a rotate-left by n, so
	// w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24).
4449*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4450*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4451*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4452*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4453*4757b351SPierre Pronchery	eor	w12,w12,w6
4454*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
4455*4757b351SPierre Pronchery	eor	w6,w14,w15
4456*4757b351SPierre Pronchery	eor	w9,w12,w8
4457*4757b351SPierre Pronchery	eor	w6,w6,w9
4458*4757b351SPierre Pronchery	movi	v1.16b,#64
4459*4757b351SPierre Pronchery	movi	v2.16b,#128
4460*4757b351SPierre Pronchery	movi	v3.16b,#192
4461*4757b351SPierre Pronchery	mov	v0.s[0],w6
4462*4757b351SPierre Pronchery
4463*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4464*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4465*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4466*4757b351SPierre Pronchery
4467*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4468*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4469*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4470*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4471*4757b351SPierre Pronchery
4472*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4473*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4474*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4475*4757b351SPierre Pronchery	add	w7,w6,w7
4476*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4477*4757b351SPierre Pronchery	add	w7,w7,w9
4478*4757b351SPierre Pronchery	add	w7,w7,w6
4479*4757b351SPierre Pronchery
4480*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4481*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4482*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4483*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// Fetch the next two round-key words (w7/w8 are free again here).
4484*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
4485*4757b351SPierre Pronchery	eor	w13,w13,w6
4486*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
4487*4757b351SPierre Pronchery	eor	w6,w12,w13
4488*4757b351SPierre Pronchery	eor	w9,w7,w15
4489*4757b351SPierre Pronchery	eor	w6,w6,w9
4490*4757b351SPierre Pronchery	movi	v1.16b,#64
4491*4757b351SPierre Pronchery	movi	v2.16b,#128
4492*4757b351SPierre Pronchery	movi	v3.16b,#192
4493*4757b351SPierre Pronchery	mov	v0.s[0],w6
4494*4757b351SPierre Pronchery
4495*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4496*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4497*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4498*4757b351SPierre Pronchery
4499*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4500*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4501*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4502*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4503*4757b351SPierre Pronchery
4504*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4505*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4506*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4507*4757b351SPierre Pronchery	add	w7,w6,w7
4508*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4509*4757b351SPierre Pronchery	add	w7,w7,w9
4510*4757b351SPierre Pronchery	add	w7,w7,w6
4511*4757b351SPierre Pronchery
4512*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4513*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4514*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4515*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4516*4757b351SPierre Pronchery	eor	w14,w14,w6
4517*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
4518*4757b351SPierre Pronchery	eor	w6,w12,w13
4519*4757b351SPierre Pronchery	eor	w9,w14,w8
4520*4757b351SPierre Pronchery	eor	w6,w6,w9
4521*4757b351SPierre Pronchery	movi	v1.16b,#64
4522*4757b351SPierre Pronchery	movi	v2.16b,#128
4523*4757b351SPierre Pronchery	movi	v3.16b,#192
4524*4757b351SPierre Pronchery	mov	v0.s[0],w6
4525*4757b351SPierre Pronchery
4526*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4527*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4528*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4529*4757b351SPierre Pronchery
4530*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4531*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4532*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4533*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4534*4757b351SPierre Pronchery
4535*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4536*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4537*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4538*4757b351SPierre Pronchery	add	w7,w6,w7
4539*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4540*4757b351SPierre Pronchery	add	w7,w7,w9
4541*4757b351SPierre Pronchery	add	w7,w7,w6
4542*4757b351SPierre Pronchery
4543*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4544*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4545*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4546*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4547*4757b351SPierre Pronchery	eor	w15,w15,w6
4548*4757b351SPierre Pronchery	subs	w11,w11,#1
4549*4757b351SPierre Pronchery	b.ne	10b
	// The four state words are written back in reverse order.
4550*4757b351SPierre Pronchery	mov	v4.s[0],w15
4551*4757b351SPierre Pronchery	mov	v4.s[1],w14
4552*4757b351SPierre Pronchery	mov	v4.s[2],w13
4553*4757b351SPierre Pronchery	mov	v4.s[3],w12
4554*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4555*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4556*4757b351SPierre Pronchery#endif
	// Second tweak XOR, store the block, and record v8 as the last tweak.
4557*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
4558*4757b351SPierre Pronchery	st1	{v4.4s},[x1],#16
4559*4757b351SPierre Pronchery	// save the last tweak
4560*4757b351SPierre Pronchery	st1	{v8.4s},[x5]
4561*4757b351SPierre Pronchery	b	100f
4562*4757b351SPierre Pronchery1:	//	process last 2 blocks
	// Reached with x2 > 1.  More than two blocks left: go to the
	// three-block code at the next local label 1.
4563*4757b351SPierre Pronchery	cmp	x2,#2
4564*4757b351SPierre Pronchery	b.gt	1f
	// Load two blocks and XOR them with tweaks v8 and v9.
4565*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s},[x0],#32
4566*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
4567*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
4568*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4569*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4570*4757b351SPierre Pronchery#endif
4571*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4572*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
4573*4757b351SPierre Pronchery#endif
	// Same 4x4 word transpose as the four-block path.  v6 and v7 are not
	// loaded on this path, so two of the four lanes carry leftover data;
	// only v0 and v1 of the result are stored below (this relies on the
	// helper treating lanes independently - presumably so, it is defined
	// outside this excerpt).
4574*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
4575*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
4576*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
4577*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
4578*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
4579*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
4580*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
4581*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
4582*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
	// Transpose back: v0 and v1 hold the two processed blocks.
4583*4757b351SPierre Pronchery	zip1	v4.4s,v0.4s,v1.4s
4584*4757b351SPierre Pronchery	zip2	v5.4s,v0.4s,v1.4s
4585*4757b351SPierre Pronchery	zip1	v6.4s,v2.4s,v3.4s
4586*4757b351SPierre Pronchery	zip2	v7.4s,v2.4s,v3.4s
4587*4757b351SPierre Pronchery	zip1	v0.2d,v4.2d,v6.2d
4588*4757b351SPierre Pronchery	zip2	v1.2d,v4.2d,v6.2d
4589*4757b351SPierre Pronchery	zip1	v2.2d,v5.2d,v7.2d
4590*4757b351SPierre Pronchery	zip2	v3.2d,v5.2d,v7.2d
4591*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v8.16b
4592*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v9.16b
4593*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s},[x1],#32
4594*4757b351SPierre Pronchery	// save the last tweak
4595*4757b351SPierre Pronchery	st1	{v9.4s},[x5]
4596*4757b351SPierre Pronchery	b	100f
4597*4757b351SPierre Pronchery1:	//	process last 3 blocks
	// Load three blocks and XOR them with tweaks v8, v9 and v10.
4598*4757b351SPierre Pronchery	ld1	{v4.4s,v5.4s,v6.4s},[x0],#48
4599*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v8.16b
4600*4757b351SPierre Pronchery	eor	v5.16b, v5.16b, v9.16b
4601*4757b351SPierre Pronchery	eor	v6.16b, v6.16b, v10.16b
4602*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4603*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4604*4757b351SPierre Pronchery#endif
4605*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4606*4757b351SPierre Pronchery	rev32	v5.16b,v5.16b
4607*4757b351SPierre Pronchery#endif
4608*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4609*4757b351SPierre Pronchery	rev32	v6.16b,v6.16b
4610*4757b351SPierre Pronchery#endif
	// Same 4x4 word transpose as the four-block path.  v7 is not loaded
	// on this path, so one lane carries leftover data; only v0-v2 of the
	// result are stored below (this relies on the helper treating lanes
	// independently - presumably so, it is defined outside this excerpt).
4611*4757b351SPierre Pronchery	zip1	v0.4s,v4.4s,v5.4s
4612*4757b351SPierre Pronchery	zip2	v1.4s,v4.4s,v5.4s
4613*4757b351SPierre Pronchery	zip1	v2.4s,v6.4s,v7.4s
4614*4757b351SPierre Pronchery	zip2	v3.4s,v6.4s,v7.4s
4615*4757b351SPierre Pronchery	zip1	v4.2d,v0.2d,v2.2d
4616*4757b351SPierre Pronchery	zip2	v5.2d,v0.2d,v2.2d
4617*4757b351SPierre Pronchery	zip1	v6.2d,v1.2d,v3.2d
4618*4757b351SPierre Pronchery	zip2	v7.2d,v1.2d,v3.2d
4619*4757b351SPierre Pronchery	bl	_vpsm4_enc_4blks
	// Transpose back: v0, v1 and v2 hold the three processed blocks.
4620*4757b351SPierre Pronchery	zip1	v4.4s,v0.4s,v1.4s
4621*4757b351SPierre Pronchery	zip2	v5.4s,v0.4s,v1.4s
4622*4757b351SPierre Pronchery	zip1	v6.4s,v2.4s,v3.4s
4623*4757b351SPierre Pronchery	zip2	v7.4s,v2.4s,v3.4s
4624*4757b351SPierre Pronchery	zip1	v0.2d,v4.2d,v6.2d
4625*4757b351SPierre Pronchery	zip2	v1.2d,v4.2d,v6.2d
4626*4757b351SPierre Pronchery	zip1	v2.2d,v5.2d,v7.2d
4627*4757b351SPierre Pronchery	zip2	v3.2d,v5.2d,v7.2d
4628*4757b351SPierre Pronchery	eor	v0.16b, v0.16b, v8.16b
4629*4757b351SPierre Pronchery	eor	v1.16b, v1.16b, v9.16b
4630*4757b351SPierre Pronchery	eor	v2.16b, v2.16b, v10.16b
4631*4757b351SPierre Pronchery	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
4632*4757b351SPierre Pronchery	// save the last tweak
	// (execution then falls through to label 100)
4633*4757b351SPierre Pronchery	st1	{v10.4s},[x5]
4634*4757b351SPierre Pronchery100:
	// All whole blocks are done.  x29 is used further down as the byte
	// count of the byte-swap loop at .loop; when it is zero there is no
	// partial trailing block and the routine returns.
4635*4757b351SPierre Pronchery	cmp	x29,0
4636*4757b351SPierre Pronchery	b.eq	.return
4637*4757b351SPierre Pronchery
4638*4757b351SPierre Pronchery// This branch calculates the last two tweaks
4639*4757b351SPierre Pronchery// when the encryption/decryption length is larger than 32.
4640*4757b351SPierre Pronchery.last_2blks_tweak:
	// Reload the tweak most recently saved at [x5] into v8, then derive
	// v9 from v8 and v10 from v9 with the same five-instruction step.
4641*4757b351SPierre Pronchery	ld1	{v8.4s},[x5]
4642*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4643*4757b351SPierre Pronchery	rev32	v8.16b,v8.16b
4644*4757b351SPierre Pronchery#endif
	// Tweak step (v2 = input, v9 = output):
	//   shl  - shift every byte left by one bit
	//   ext  - rotate the vector by one byte, so lane i sees byte i-1
	//          (lane 0 sees byte 15)
	//   ushr - keep only the top bit of that neighbouring byte (0 or 1)
	//   mul  - scale each carry bit by the per-byte constant from
	//          .Lxts_magic
	//   eor  - fold the scaled carries into the shifted value
	// Presumably .Lxts_magic holds the reduction constant in byte 0 and 1
	// elsewhere, making this a multiply-by-x in GF(2^128); the table is
	// not visible in this excerpt - confirm.
4645*4757b351SPierre Pronchery	mov	v2.16b,v8.16b
4646*4757b351SPierre Pronchery	adrp	x10,.Lxts_magic
4647*4757b351SPierre Pronchery	ldr	q0, [x10, #:lo12:.Lxts_magic]
4648*4757b351SPierre Pronchery	shl	v9.16b, v2.16b, #1
4649*4757b351SPierre Pronchery	ext	v1.16b, v2.16b, v2.16b,#15
4650*4757b351SPierre Pronchery	ushr	v1.16b, v1.16b, #7
4651*4757b351SPierre Pronchery	mul	v1.16b, v1.16b, v0.16b
4652*4757b351SPierre Pronchery	eor	v9.16b, v9.16b, v1.16b
	// Same step again: v10 is derived from v9.
4653*4757b351SPierre Pronchery	mov	v2.16b,v9.16b
4654*4757b351SPierre Pronchery	adrp	x10,.Lxts_magic
4655*4757b351SPierre Pronchery	ldr	q0, [x10, #:lo12:.Lxts_magic]
4656*4757b351SPierre Pronchery	shl	v10.16b, v2.16b, #1
4657*4757b351SPierre Pronchery	ext	v1.16b, v2.16b, v2.16b,#15
4658*4757b351SPierre Pronchery	ushr	v1.16b, v1.16b, #7
4659*4757b351SPierre Pronchery	mul	v1.16b, v1.16b, v0.16b
4660*4757b351SPierre Pronchery	eor	v10.16b, v10.16b, v1.16b
4661*4757b351SPierre Pronchery	b	.check_dec
4662*4757b351SPierre Pronchery
4663*4757b351SPierre Pronchery
4664*4757b351SPierre Pronchery// This branch calculates the last two tweaks
4665*4757b351SPierre Pronchery// when the encryption/decryption length is equal to 32, which needs only two tweaks.
4666*4757b351SPierre Pronchery.only_2blks_tweak:
	// Entered from code outside this excerpt.  v8 is not loaded here; it
	// is used as found, copied into v9, and v10 is derived from v9.
4667*4757b351SPierre Pronchery	mov	v9.16b,v8.16b
4668*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4669*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
4670*4757b351SPierre Pronchery#endif
	// Tweak step: shift each byte left by one, take the top bit of the
	// neighbouring lower byte (byte 15 for lane 0), scale it by the
	// per-byte constants in .Lxts_magic and XOR it in.  Presumably a
	// multiply-by-x in GF(2^128); the table is not visible here - confirm.
4671*4757b351SPierre Pronchery	mov	v2.16b,v9.16b
4672*4757b351SPierre Pronchery	adrp	x10,.Lxts_magic
4673*4757b351SPierre Pronchery	ldr	q0, [x10, #:lo12:.Lxts_magic]
4674*4757b351SPierre Pronchery	shl	v10.16b, v2.16b, #1
4675*4757b351SPierre Pronchery	ext	v1.16b, v2.16b, v2.16b,#15
4676*4757b351SPierre Pronchery	ushr	v1.16b, v1.16b, #7
4677*4757b351SPierre Pronchery	mul	v1.16b, v1.16b, v0.16b
4678*4757b351SPierre Pronchery	eor	v10.16b, v10.16b, v1.16b
	// .check_dec is the next label in the file, so this branch only makes
	// the control flow explicit.
4679*4757b351SPierre Pronchery	b	.check_dec
4680*4757b351SPierre Pronchery
4681*4757b351SPierre Pronchery
4682*4757b351SPierre Pronchery// Determine whether encryption or decryption is required.
4683*4757b351SPierre Pronchery// The last two tweaks need to be swapped for decryption.
4684*4757b351SPierre Pronchery.check_dec:
	// w28 is the direction flag.  When it equals 1 the tweaks in v9/v10
	// are used as they are; otherwise they are exchanged (v0 is scratch)
	// before falling through to .process_last_2blks.
4685*4757b351SPierre Pronchery	// encryption:1 decryption:0
4686*4757b351SPierre Pronchery	cmp	w28,1
4687*4757b351SPierre Pronchery	b.eq	.process_last_2blks
4688*4757b351SPierre Pronchery	mov	v0.16B,v9.16b
4689*4757b351SPierre Pronchery	mov	v9.16B,v10.16b
4690*4757b351SPierre Pronchery	mov	v10.16B,v0.16b
4691*4757b351SPierre Pronchery
4692*4757b351SPierre Pronchery.process_last_2blks:
	// Processes the last whole block with tweak v9 and stores it at x1.
	// v10 is kept for the block rebuilt after the byte-swap loop at .loop.
	// x0 = input pointer, x1 = output pointer, x3 = round keys.
4693*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4694*4757b351SPierre Pronchery	rev32	v9.16b,v9.16b
4695*4757b351SPierre Pronchery#endif
4696*4757b351SPierre Pronchery#ifdef __AARCH64EB__
4697*4757b351SPierre Pronchery	rev32	v10.16b,v10.16b
4698*4757b351SPierre Pronchery#endif
4699*4757b351SPierre Pronchery	ld1	{v4.4s},[x0],#16
4700*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v9.16b
4701*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4702*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4703*4757b351SPierre Pronchery#endif
	// Scalar cipher on one block: x10 walks the round keys, w11 counts 8
	// iterations of four rounds each, w12..w15 hold the four state words.
4704*4757b351SPierre Pronchery	mov	x10,x3
4705*4757b351SPierre Pronchery	mov	w11,#8
4706*4757b351SPierre Pronchery	mov	w12,v4.s[0]
4707*4757b351SPierre Pronchery	mov	w13,v4.s[1]
4708*4757b351SPierre Pronchery	mov	w14,v4.s[2]
4709*4757b351SPierre Pronchery	mov	w15,v4.s[3]
4710*4757b351SPierre Pronchery10:
4711*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
4712*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
4713*4757b351SPierre Pronchery	eor	w6,w14,w15
4714*4757b351SPierre Pronchery	eor	w9,w7,w13
4715*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution via the 256-byte table in v16-v31 (presumably the
	// S-box, loaded outside this excerpt): four 64-byte TBL lookups on
	// indices biased by 0/64/128/192.  TBL yields 0 for out-of-range
	// indices, so adding the four results gives the full lookup.
4716*4757b351SPierre Pronchery	movi	v1.16b,#64
4717*4757b351SPierre Pronchery	movi	v2.16b,#128
4718*4757b351SPierre Pronchery	movi	v3.16b,#192
4719*4757b351SPierre Pronchery	mov	v0.s[0],w6
4720*4757b351SPierre Pronchery
4721*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4722*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4723*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4724*4757b351SPierre Pronchery
4725*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4726*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4727*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4728*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4729*4757b351SPierre Pronchery
4730*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4731*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4732*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4733*4757b351SPierre Pronchery	add	w7,w6,w7
4734*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4735*4757b351SPierre Pronchery	add	w7,w7,w9
4736*4757b351SPierre Pronchery	add	w7,w7,w6
4737*4757b351SPierre Pronchery
	// Linear step: "ror #32-n" is a rotate-left by n, so
	// w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24).
4738*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4739*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4740*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4741*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4742*4757b351SPierre Pronchery	eor	w12,w12,w6
4743*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
4744*4757b351SPierre Pronchery	eor	w6,w14,w15
4745*4757b351SPierre Pronchery	eor	w9,w12,w8
4746*4757b351SPierre Pronchery	eor	w6,w6,w9
4747*4757b351SPierre Pronchery	movi	v1.16b,#64
4748*4757b351SPierre Pronchery	movi	v2.16b,#128
4749*4757b351SPierre Pronchery	movi	v3.16b,#192
4750*4757b351SPierre Pronchery	mov	v0.s[0],w6
4751*4757b351SPierre Pronchery
4752*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4753*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4754*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4755*4757b351SPierre Pronchery
4756*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4757*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4758*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4759*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4760*4757b351SPierre Pronchery
4761*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4762*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4763*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4764*4757b351SPierre Pronchery	add	w7,w6,w7
4765*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4766*4757b351SPierre Pronchery	add	w7,w7,w9
4767*4757b351SPierre Pronchery	add	w7,w7,w6
4768*4757b351SPierre Pronchery
4769*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4770*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4771*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4772*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// Fetch the next two round-key words (w7/w8 are free again here).
4773*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
4774*4757b351SPierre Pronchery	eor	w13,w13,w6
4775*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
4776*4757b351SPierre Pronchery	eor	w6,w12,w13
4777*4757b351SPierre Pronchery	eor	w9,w7,w15
4778*4757b351SPierre Pronchery	eor	w6,w6,w9
4779*4757b351SPierre Pronchery	movi	v1.16b,#64
4780*4757b351SPierre Pronchery	movi	v2.16b,#128
4781*4757b351SPierre Pronchery	movi	v3.16b,#192
4782*4757b351SPierre Pronchery	mov	v0.s[0],w6
4783*4757b351SPierre Pronchery
4784*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4785*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4786*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4787*4757b351SPierre Pronchery
4788*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4789*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4790*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4791*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4792*4757b351SPierre Pronchery
4793*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4794*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4795*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4796*4757b351SPierre Pronchery	add	w7,w6,w7
4797*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4798*4757b351SPierre Pronchery	add	w7,w7,w9
4799*4757b351SPierre Pronchery	add	w7,w7,w6
4800*4757b351SPierre Pronchery
4801*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4802*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4803*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4804*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4805*4757b351SPierre Pronchery	eor	w14,w14,w6
4806*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
4807*4757b351SPierre Pronchery	eor	w6,w12,w13
4808*4757b351SPierre Pronchery	eor	w9,w14,w8
4809*4757b351SPierre Pronchery	eor	w6,w6,w9
4810*4757b351SPierre Pronchery	movi	v1.16b,#64
4811*4757b351SPierre Pronchery	movi	v2.16b,#128
4812*4757b351SPierre Pronchery	movi	v3.16b,#192
4813*4757b351SPierre Pronchery	mov	v0.s[0],w6
4814*4757b351SPierre Pronchery
4815*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4816*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4817*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4818*4757b351SPierre Pronchery
4819*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4820*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4821*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4822*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4823*4757b351SPierre Pronchery
4824*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4825*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4826*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4827*4757b351SPierre Pronchery	add	w7,w6,w7
4828*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4829*4757b351SPierre Pronchery	add	w7,w7,w9
4830*4757b351SPierre Pronchery	add	w7,w7,w6
4831*4757b351SPierre Pronchery
4832*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4833*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4834*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4835*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4836*4757b351SPierre Pronchery	eor	w15,w15,w6
4837*4757b351SPierre Pronchery	subs	w11,w11,#1
4838*4757b351SPierre Pronchery	b.ne	10b
	// The four state words are written back in reverse order.
4839*4757b351SPierre Pronchery	mov	v4.s[0],w15
4840*4757b351SPierre Pronchery	mov	v4.s[1],w14
4841*4757b351SPierre Pronchery	mov	v4.s[2],w13
4842*4757b351SPierre Pronchery	mov	v4.s[3],w12
4843*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4844*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4845*4757b351SPierre Pronchery#endif
	// Second tweak XOR with v9, then store; x1 advances past the block.
4846*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v9.16b
4847*4757b351SPierre Pronchery	st1	{v4.4s},[x1],#16
4848*4757b351SPierre Pronchery
	// x26 = address of the block that was just stored.
4849*4757b351SPierre Pronchery	sub	x26,x1,16
4850*4757b351SPierre Pronchery.loop:
	// Byte-swap loop for the partial trailing block (the XTS
	// ciphertext-stealing step).  x26 points at the block stored just
	// before this label (x1 - 16), x1 at the trailing output bytes, x0 at
	// the trailing input bytes, and x29 is the number of trailing bytes.
	// For i = x29-1 down to 0:
	//   out_tail[i] = prev_block[i];  prev_block[i] = in_tail[i]
	// The loads and stores leave the flags alone, so b.gt tests the
	// result of the subs at the top of the loop.
4851*4757b351SPierre Pronchery	subs	x29,x29,1
4852*4757b351SPierre Pronchery	ldrb	w7,[x26,x29]
4853*4757b351SPierre Pronchery	ldrb	w8,[x0,x29]
4854*4757b351SPierre Pronchery	strb	w8,[x26,x29]
4855*4757b351SPierre Pronchery	strb	w7,[x1,x29]
4856*4757b351SPierre Pronchery	b.gt	.loop
	// Run the cipher over the patched block at x26 with tweak v10 and
	// write the result back in place.
4857*4757b351SPierre Pronchery	ld1	{v4.4s}, [x26]
4858*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v10.16b
4859*4757b351SPierre Pronchery#ifndef __AARCH64EB__
4860*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
4861*4757b351SPierre Pronchery#endif
	// Scalar cipher on one block: x10 walks the round keys starting at x3,
	// w11 counts 8 iterations of four rounds each, w12..w15 hold the four
	// state words.
4862*4757b351SPierre Pronchery	mov	x10,x3
4863*4757b351SPierre Pronchery	mov	w11,#8
4864*4757b351SPierre Pronchery	mov	w12,v4.s[0]
4865*4757b351SPierre Pronchery	mov	w13,v4.s[1]
4866*4757b351SPierre Pronchery	mov	w14,v4.s[2]
4867*4757b351SPierre Pronchery	mov	w15,v4.s[3]
4868*4757b351SPierre Pronchery10:
4869*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
4870*4757b351SPierre Pronchery	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
4871*4757b351SPierre Pronchery	eor	w6,w14,w15
4872*4757b351SPierre Pronchery	eor	w9,w7,w13
4873*4757b351SPierre Pronchery	eor	w6,w6,w9
	// Byte substitution via the 256-byte table in v16-v31 (presumably the
	// S-box, loaded outside this excerpt): four 64-byte TBL lookups on
	// indices biased by 0/64/128/192.  TBL yields 0 for out-of-range
	// indices, so adding the four results gives the full lookup.
4874*4757b351SPierre Pronchery	movi	v1.16b,#64
4875*4757b351SPierre Pronchery	movi	v2.16b,#128
4876*4757b351SPierre Pronchery	movi	v3.16b,#192
4877*4757b351SPierre Pronchery	mov	v0.s[0],w6
4878*4757b351SPierre Pronchery
4879*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4880*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4881*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4882*4757b351SPierre Pronchery
4883*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4884*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4885*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4886*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4887*4757b351SPierre Pronchery
4888*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4889*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4890*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4891*4757b351SPierre Pronchery	add	w7,w6,w7
4892*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4893*4757b351SPierre Pronchery	add	w7,w7,w9
4894*4757b351SPierre Pronchery	add	w7,w7,w6
4895*4757b351SPierre Pronchery
	// Linear step: "ror #32-n" is a rotate-left by n, so
	// w6 = w7 ^ rol(w7,2) ^ rol(w7,10) ^ rol(w7,18) ^ rol(w7,24).
4896*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4897*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4898*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4899*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4900*4757b351SPierre Pronchery	eor	w12,w12,w6
4901*4757b351SPierre Pronchery	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
4902*4757b351SPierre Pronchery	eor	w6,w14,w15
4903*4757b351SPierre Pronchery	eor	w9,w12,w8
4904*4757b351SPierre Pronchery	eor	w6,w6,w9
4905*4757b351SPierre Pronchery	movi	v1.16b,#64
4906*4757b351SPierre Pronchery	movi	v2.16b,#128
4907*4757b351SPierre Pronchery	movi	v3.16b,#192
4908*4757b351SPierre Pronchery	mov	v0.s[0],w6
4909*4757b351SPierre Pronchery
4910*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4911*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4912*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4913*4757b351SPierre Pronchery
4914*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4915*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4916*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4917*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4918*4757b351SPierre Pronchery
4919*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4920*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4921*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4922*4757b351SPierre Pronchery	add	w7,w6,w7
4923*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4924*4757b351SPierre Pronchery	add	w7,w7,w9
4925*4757b351SPierre Pronchery	add	w7,w7,w6
4926*4757b351SPierre Pronchery
4927*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4928*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4929*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4930*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
	// Fetch the next two round-key words (w7/w8 are free again here).
4931*4757b351SPierre Pronchery	ldp	w7,w8,[x10],8
4932*4757b351SPierre Pronchery	eor	w13,w13,w6
4933*4757b351SPierre Pronchery	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
4934*4757b351SPierre Pronchery	eor	w6,w12,w13
4935*4757b351SPierre Pronchery	eor	w9,w7,w15
4936*4757b351SPierre Pronchery	eor	w6,w6,w9
4937*4757b351SPierre Pronchery	movi	v1.16b,#64
4938*4757b351SPierre Pronchery	movi	v2.16b,#128
4939*4757b351SPierre Pronchery	movi	v3.16b,#192
4940*4757b351SPierre Pronchery	mov	v0.s[0],w6
4941*4757b351SPierre Pronchery
4942*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4943*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4944*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4945*4757b351SPierre Pronchery
4946*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4947*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4948*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4949*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4950*4757b351SPierre Pronchery
4951*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4952*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4953*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4954*4757b351SPierre Pronchery	add	w7,w6,w7
4955*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4956*4757b351SPierre Pronchery	add	w7,w7,w9
4957*4757b351SPierre Pronchery	add	w7,w7,w6
4958*4757b351SPierre Pronchery
4959*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4960*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4961*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4962*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4963*4757b351SPierre Pronchery	eor	w14,w14,w6
4964*4757b351SPierre Pronchery	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
4965*4757b351SPierre Pronchery	eor	w6,w12,w13
4966*4757b351SPierre Pronchery	eor	w9,w14,w8
4967*4757b351SPierre Pronchery	eor	w6,w6,w9
4968*4757b351SPierre Pronchery	movi	v1.16b,#64
4969*4757b351SPierre Pronchery	movi	v2.16b,#128
4970*4757b351SPierre Pronchery	movi	v3.16b,#192
4971*4757b351SPierre Pronchery	mov	v0.s[0],w6
4972*4757b351SPierre Pronchery
4973*4757b351SPierre Pronchery	sub	v1.16b,v0.16b,v1.16b
4974*4757b351SPierre Pronchery	sub	v2.16b,v0.16b,v2.16b
4975*4757b351SPierre Pronchery	sub	v3.16b,v0.16b,v3.16b
4976*4757b351SPierre Pronchery
4977*4757b351SPierre Pronchery	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
4978*4757b351SPierre Pronchery	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
4979*4757b351SPierre Pronchery	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
4980*4757b351SPierre Pronchery	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b
4981*4757b351SPierre Pronchery
4982*4757b351SPierre Pronchery	mov	w6,v0.s[0]
4983*4757b351SPierre Pronchery	mov	w7,v1.s[0]
4984*4757b351SPierre Pronchery	mov	w9,v2.s[0]
4985*4757b351SPierre Pronchery	add	w7,w6,w7
4986*4757b351SPierre Pronchery	mov	w6,v3.s[0]
4987*4757b351SPierre Pronchery	add	w7,w7,w9
4988*4757b351SPierre Pronchery	add	w7,w7,w6
4989*4757b351SPierre Pronchery
4990*4757b351SPierre Pronchery	eor	w6,w7,w7,ror #32-2
4991*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-10
4992*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-18
4993*4757b351SPierre Pronchery	eor	w6,w6,w7,ror #32-24
4994*4757b351SPierre Pronchery	eor	w15,w15,w6
4995*4757b351SPierre Pronchery	subs	w11,w11,#1
4996*4757b351SPierre Pronchery	b.ne	10b
	// The four state words are written back in reverse order.
4997*4757b351SPierre Pronchery	mov	v4.s[0],w15
4998*4757b351SPierre Pronchery	mov	v4.s[1],w14
4999*4757b351SPierre Pronchery	mov	v4.s[2],w13
5000*4757b351SPierre Pronchery	mov	v4.s[3],w12
5001*4757b351SPierre Pronchery#ifndef __AARCH64EB__
5002*4757b351SPierre Pronchery	rev32	v4.16b,v4.16b
5003*4757b351SPierre Pronchery#endif
	// Second tweak XOR with v10; the result overwrites the block at x26.
	// Execution then falls through into .return.
5004*4757b351SPierre Pronchery	eor	v4.16b, v4.16b, v10.16b
5005*4757b351SPierre Pronchery	st1	{v4.4s}, [x26]
5006*4757b351SPierre Pronchery.return:
5007*4757b351SPierre Pronchery	ldp	d14, d15, [sp], #0x10
5008*4757b351SPierre Pronchery	ldp	d12, d13, [sp], #0x10
5009*4757b351SPierre Pronchery	ldp	d10, d11, [sp], #0x10
5010*4757b351SPierre Pronchery	ldp	d8, d9, [sp], #0x10
5011*4757b351SPierre Pronchery	ldp	x29, x30, [sp], #0x10
5012*4757b351SPierre Pronchery	ldp	x27, x28, [sp], #0x10
5013*4757b351SPierre Pronchery	ldp	x25, x26, [sp], #0x10
5014*4757b351SPierre Pronchery	ldp	x23, x24, [sp], #0x10
5015*4757b351SPierre Pronchery	ldp	x21, x22, [sp], #0x10
5016*4757b351SPierre Pronchery	ldp	x19, x20, [sp], #0x10
5017*4757b351SPierre Pronchery	ldp	x17, x18, [sp], #0x10
5018*4757b351SPierre Pronchery	ldp	x15, x16, [sp], #0x10
5019*4757b351SPierre Pronchery	AARCH64_VALIDATE_LINK_REGISTER
5020*4757b351SPierre Pronchery	ret
5021*4757b351SPierre Pronchery.size	vpsm4_xts_encrypt,.-vpsm4_xts_encrypt
5022