xref: /freebsd/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha512-armv8.S (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1/*
2 * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     https://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
19 * - modified assembly to fit into OpenZFS
20 */
21
22#if defined(__aarch64__)
23
24.text
25
/*
 * .LK512 - SHA-512 round constant table.
 *
 * Layout: 40 .quad lines of two 64-bit values each (80 constants),
 * followed by one zero quadword, for 81 entries / 648 bytes in total.
 * The table is aligned to 64 bytes (.align 6) and is emitted into .text,
 * where the code reaches it with a PC-relative `adr`.
 *
 * Invariants relied on by zfs_sha512_block_armv7:
 *  - it walks the table with post-incremented 8-byte loads and treats
 *    the zero entry as the end-of-table marker (cbnz on the loaded value);
 *  - it then rewinds its table pointer by a hard-coded 648 bytes.
 * Adding or removing an entry therefore requires changing that rewind.
 *
 * NOTE(review): `adr` only reaches +/-1MB, so the table has to stay
 * close to the code that references it.
 */
26.align	6
27.type	.LK512,%object
28.LK512:
29	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
30	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
31	.quad	0x3956c25bf348b538,0x59f111f1b605d019
32	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
33	.quad	0xd807aa98a3030242,0x12835b0145706fbe
34	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
35	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
36	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
37	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
38	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
39	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
40	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
41	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
42	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
43	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
44	.quad	0x06ca6351e003826f,0x142929670a0e6e70
45	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
46	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
47	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
48	.quad	0x81c2c92e47edaee6,0x92722c851482353b
49	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
50	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
51	.quad	0xd192e819d6ef5218,0xd69906245565a910
52	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
53	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
54	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
55	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
56	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
57	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
58	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
59	.quad	0x90befffa23631e28,0xa4506cebde82bde9
60	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
61	.quad	0xca273eceea26619c,0xd186b8c721c0c207
62	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
63	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
64	.quad	0x113f9804bef90dae,0x1b710b35131c471b
65	.quad	0x28db77f523047d84,0x32caab7b40c72493
66	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
67	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
68	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
69	.quad	0	// terminator: the round loop stops when it loads this zero
70.size	.LK512,.-.LK512
71
/*
 * zfs_sha512_block_armv7 - SHA-512 block function using only
 * general-purpose registers.
 *
 * Inputs, as used by the code below:
 *   x0 = pointer to eight 64-bit state words; they are loaded into
 *        x20-x27, and the updated values are stored back after every block
 *   x1 = input pointer; 128 bytes are consumed per pass through .Loop
 *   x2 = block count; scaled by 128 (lsl #7) and added to x1 to form the
 *        end-of-input pointer
 * NOTE(review): presumably the C prototype is
 *   void f(uint64_t state[8], const void *data, size_t blocks)
 * - confirm against the caller.
 * NOTE(review): .Loop is tested at the bottom (cmp/b.ne after the first
 * block), so a block count of 0 is not handled here; it looks like the
 * caller must guarantee x2 >= 1 - confirm.
 *
 * Frame layout (x29 = frame base, 128 bytes):
 *   [x29,#0]        saved x29,x30
 *   [x29,#16..#95]  saved x19-x28
 *   [x29,#96]       x0 (state pointer)
 *   [x29,#104]      x2 (end-of-input pointer)
 *   [x29,#112]      x1 (input pointer, saved after the first 16 bytes
 *                   of the block have been read)
 * A further 32 bytes below the frame (sp+0..sp+31) hold spilled message
 * words during the rounds.
 *
 * Register roles inside the rounds:
 *   x20..x27  working variables a..h for round 0; the roles rotate by
 *             renaming registers from one unrolled round to the next,
 *             no data is moved
 *   x0-x15    the 16 message words (x0-x2 are reused once their
 *             argument values have been saved to the frame)
 *   x16,x17   scratch for Sigma1/Ch and Sigma0
 *   x19,x28   alternate between "current K[i]" and the carried (b^c)
 *             term used by the Maj computation
 *   x30       pointer into .LK512 (the link register is restored from
 *             the frame before returning)
 */
72.globl	zfs_sha512_block_armv7
73.type	zfs_sha512_block_armv7,%function
74.align	6
75zfs_sha512_block_armv7:
76	hint	#34					// bti c
77	stp	x29,x30,[sp,#-128]!	// allocate the 128-byte frame, save FP/LR
78	add	x29,sp,#0			// x29 = frame base
79
	// Save callee-saved x19-x28 in the frame (sp == x29 here).
80	stp	x19,x20,[sp,#16]
81	stp	x21,x22,[sp,#32]
82	stp	x23,x24,[sp,#48]
83	stp	x25,x26,[sp,#64]
84	stp	x27,x28,[sp,#80]
85	sub	sp,sp,#4*8			// 4 spill slots for message words
86
87	ldp	x20,x21,[x0]				// load context: a,b
88	ldp	x22,x23,[x0,#2*8]		// c,d
89	ldp	x24,x25,[x0,#4*8]		// e,f
90	add	x2,x1,x2,lsl#7	// end of input = x1 + blocks*128
91	ldp	x26,x27,[x0,#6*8]		// g,h
92	adr	x30,.LK512			// x30 = constant table pointer
93	stp	x0,x2,[x29,#96]		// keep state pointer and end pointer; x0/x2 get reused
94
	// One pass through .Loop processes one 128-byte block.
95.Loop:
96	ldp	x3,x4,[x1],#2*8		// X[0],X[1]; x1 advances by 16
97	ldr	x19,[x30],#8			// *K++
98	eor	x28,x21,x22				// magic seed: b^c for the first Maj
99	str	x1,[x29,#112]			// save input pointer (already 16 bytes into the block)
	// On little-endian builds the input words are byte-reversed after loading.
100#ifndef	__AARCH64EB__
101	rev	x3,x3			// 0
102#endif
	// Round 0: a..h = x20..x27.
103	ror	x16,x24,#14			// e ror 14
104	add	x27,x27,x19			// h+=K[i]
105	eor	x6,x24,x24,ror#23		// e ^ (e ror 23); rotated by 18 below -> ror 18 ^ ror 41
106	and	x17,x25,x24			// f & e
107	bic	x19,x26,x24			// g & ~e
108	add	x27,x27,x3			// h+=X[i]
109	orr	x17,x17,x19			// Ch(e,f,g)
110	eor	x19,x20,x21			// a^b, b^c in next round
111	eor	x16,x16,x6,ror#18	// Sigma1(e)
112	ror	x6,x20,#28			// a ror 28
113	add	x27,x27,x17			// h+=Ch(e,f,g)
114	eor	x17,x20,x20,ror#5		// a ^ (a ror 5); rotated by 34 below -> ror 34 ^ ror 39
115	add	x27,x27,x16			// h+=Sigma1(e)
116	and	x28,x28,x19			// (b^c)&=(a^b)
117	add	x23,x23,x27			// d+=h
118	eor	x28,x28,x21			// Maj(a,b,c) = ((b^c)&(a^b))^b
119	eor	x17,x6,x17,ror#34	// Sigma0(a)
120	add	x27,x27,x28			// h+=Maj(a,b,c)
121	ldr	x28,[x30],#8		// *K++, x19 in next round
	// The Sigma0 add is deferred: the same instruction is issued a few
	// lines into the next round instead of here.
122	//add	x27,x27,x17			// h+=Sigma0(a)
123#ifndef	__AARCH64EB__
124	rev	x4,x4			// 1
125#endif
126	ldp	x5,x6,[x1],#2*8
127	add	x27,x27,x17			// h+=Sigma0(a)
128	ror	x16,x23,#14
129	add	x26,x26,x28			// h+=K[i]
130	eor	x7,x23,x23,ror#23
131	and	x17,x24,x23
132	bic	x28,x25,x23
133	add	x26,x26,x4			// h+=X[i]
134	orr	x17,x17,x28			// Ch(e,f,g)
135	eor	x28,x27,x20			// a^b, b^c in next round
136	eor	x16,x16,x7,ror#18	// Sigma1(e)
137	ror	x7,x27,#28
138	add	x26,x26,x17			// h+=Ch(e,f,g)
139	eor	x17,x27,x27,ror#5
140	add	x26,x26,x16			// h+=Sigma1(e)
141	and	x19,x19,x28			// (b^c)&=(a^b)
142	add	x22,x22,x26			// d+=h
143	eor	x19,x19,x20			// Maj(a,b,c)
144	eor	x17,x7,x17,ror#34	// Sigma0(a)
145	add	x26,x26,x19			// h+=Maj(a,b,c)
146	ldr	x19,[x30],#8		// *K++, x28 in next round
147	//add	x26,x26,x17			// h+=Sigma0(a)
148#ifndef	__AARCH64EB__
149	rev	x5,x5			// 2
150#endif
151	add	x26,x26,x17			// h+=Sigma0(a)
152	ror	x16,x22,#14
153	add	x25,x25,x19			// h+=K[i]
154	eor	x8,x22,x22,ror#23
155	and	x17,x23,x22
156	bic	x19,x24,x22
157	add	x25,x25,x5			// h+=X[i]
158	orr	x17,x17,x19			// Ch(e,f,g)
159	eor	x19,x26,x27			// a^b, b^c in next round
160	eor	x16,x16,x8,ror#18	// Sigma1(e)
161	ror	x8,x26,#28
162	add	x25,x25,x17			// h+=Ch(e,f,g)
163	eor	x17,x26,x26,ror#5
164	add	x25,x25,x16			// h+=Sigma1(e)
165	and	x28,x28,x19			// (b^c)&=(a^b)
166	add	x21,x21,x25			// d+=h
167	eor	x28,x28,x27			// Maj(a,b,c)
168	eor	x17,x8,x17,ror#34	// Sigma0(a)
169	add	x25,x25,x28			// h+=Maj(a,b,c)
170	ldr	x28,[x30],#8		// *K++, x19 in next round
171	//add	x25,x25,x17			// h+=Sigma0(a)
172#ifndef	__AARCH64EB__
173	rev	x6,x6			// 3
174#endif
175	ldp	x7,x8,[x1],#2*8
176	add	x25,x25,x17			// h+=Sigma0(a)
177	ror	x16,x21,#14
178	add	x24,x24,x28			// h+=K[i]
179	eor	x9,x21,x21,ror#23
180	and	x17,x22,x21
181	bic	x28,x23,x21
182	add	x24,x24,x6			// h+=X[i]
183	orr	x17,x17,x28			// Ch(e,f,g)
184	eor	x28,x25,x26			// a^b, b^c in next round
185	eor	x16,x16,x9,ror#18	// Sigma1(e)
186	ror	x9,x25,#28
187	add	x24,x24,x17			// h+=Ch(e,f,g)
188	eor	x17,x25,x25,ror#5
189	add	x24,x24,x16			// h+=Sigma1(e)
190	and	x19,x19,x28			// (b^c)&=(a^b)
191	add	x20,x20,x24			// d+=h
192	eor	x19,x19,x26			// Maj(a,b,c)
193	eor	x17,x9,x17,ror#34	// Sigma0(a)
194	add	x24,x24,x19			// h+=Maj(a,b,c)
195	ldr	x19,[x30],#8		// *K++, x28 in next round
196	//add	x24,x24,x17			// h+=Sigma0(a)
197#ifndef	__AARCH64EB__
198	rev	x7,x7			// 4
199#endif
200	add	x24,x24,x17			// h+=Sigma0(a)
201	ror	x16,x20,#14
202	add	x23,x23,x19			// h+=K[i]
203	eor	x10,x20,x20,ror#23
204	and	x17,x21,x20
205	bic	x19,x22,x20
206	add	x23,x23,x7			// h+=X[i]
207	orr	x17,x17,x19			// Ch(e,f,g)
208	eor	x19,x24,x25			// a^b, b^c in next round
209	eor	x16,x16,x10,ror#18	// Sigma1(e)
210	ror	x10,x24,#28
211	add	x23,x23,x17			// h+=Ch(e,f,g)
212	eor	x17,x24,x24,ror#5
213	add	x23,x23,x16			// h+=Sigma1(e)
214	and	x28,x28,x19			// (b^c)&=(a^b)
215	add	x27,x27,x23			// d+=h
216	eor	x28,x28,x25			// Maj(a,b,c)
217	eor	x17,x10,x17,ror#34	// Sigma0(a)
218	add	x23,x23,x28			// h+=Maj(a,b,c)
219	ldr	x28,[x30],#8		// *K++, x19 in next round
220	//add	x23,x23,x17			// h+=Sigma0(a)
221#ifndef	__AARCH64EB__
222	rev	x8,x8			// 5
223#endif
224	ldp	x9,x10,[x1],#2*8
225	add	x23,x23,x17			// h+=Sigma0(a)
226	ror	x16,x27,#14
227	add	x22,x22,x28			// h+=K[i]
228	eor	x11,x27,x27,ror#23
229	and	x17,x20,x27
230	bic	x28,x21,x27
231	add	x22,x22,x8			// h+=X[i]
232	orr	x17,x17,x28			// Ch(e,f,g)
233	eor	x28,x23,x24			// a^b, b^c in next round
234	eor	x16,x16,x11,ror#18	// Sigma1(e)
235	ror	x11,x23,#28
236	add	x22,x22,x17			// h+=Ch(e,f,g)
237	eor	x17,x23,x23,ror#5
238	add	x22,x22,x16			// h+=Sigma1(e)
239	and	x19,x19,x28			// (b^c)&=(a^b)
240	add	x26,x26,x22			// d+=h
241	eor	x19,x19,x24			// Maj(a,b,c)
242	eor	x17,x11,x17,ror#34	// Sigma0(a)
243	add	x22,x22,x19			// h+=Maj(a,b,c)
244	ldr	x19,[x30],#8		// *K++, x28 in next round
245	//add	x22,x22,x17			// h+=Sigma0(a)
246#ifndef	__AARCH64EB__
247	rev	x9,x9			// 6
248#endif
249	add	x22,x22,x17			// h+=Sigma0(a)
250	ror	x16,x26,#14
251	add	x21,x21,x19			// h+=K[i]
252	eor	x12,x26,x26,ror#23
253	and	x17,x27,x26
254	bic	x19,x20,x26
255	add	x21,x21,x9			// h+=X[i]
256	orr	x17,x17,x19			// Ch(e,f,g)
257	eor	x19,x22,x23			// a^b, b^c in next round
258	eor	x16,x16,x12,ror#18	// Sigma1(e)
259	ror	x12,x22,#28
260	add	x21,x21,x17			// h+=Ch(e,f,g)
261	eor	x17,x22,x22,ror#5
262	add	x21,x21,x16			// h+=Sigma1(e)
263	and	x28,x28,x19			// (b^c)&=(a^b)
264	add	x25,x25,x21			// d+=h
265	eor	x28,x28,x23			// Maj(a,b,c)
266	eor	x17,x12,x17,ror#34	// Sigma0(a)
267	add	x21,x21,x28			// h+=Maj(a,b,c)
268	ldr	x28,[x30],#8		// *K++, x19 in next round
269	//add	x21,x21,x17			// h+=Sigma0(a)
270#ifndef	__AARCH64EB__
271	rev	x10,x10			// 7
272#endif
273	ldp	x11,x12,[x1],#2*8
274	add	x21,x21,x17			// h+=Sigma0(a)
275	ror	x16,x25,#14
276	add	x20,x20,x28			// h+=K[i]
277	eor	x13,x25,x25,ror#23
278	and	x17,x26,x25
279	bic	x28,x27,x25
280	add	x20,x20,x10			// h+=X[i]
281	orr	x17,x17,x28			// Ch(e,f,g)
282	eor	x28,x21,x22			// a^b, b^c in next round
283	eor	x16,x16,x13,ror#18	// Sigma1(e)
284	ror	x13,x21,#28
285	add	x20,x20,x17			// h+=Ch(e,f,g)
286	eor	x17,x21,x21,ror#5
287	add	x20,x20,x16			// h+=Sigma1(e)
288	and	x19,x19,x28			// (b^c)&=(a^b)
289	add	x24,x24,x20			// d+=h
290	eor	x19,x19,x22			// Maj(a,b,c)
291	eor	x17,x13,x17,ror#34	// Sigma0(a)
292	add	x20,x20,x19			// h+=Maj(a,b,c)
293	ldr	x19,[x30],#8		// *K++, x28 in next round
294	//add	x20,x20,x17			// h+=Sigma0(a)
295#ifndef	__AARCH64EB__
296	rev	x11,x11			// 8
297#endif
298	add	x20,x20,x17			// h+=Sigma0(a)
299	ror	x16,x24,#14
300	add	x27,x27,x19			// h+=K[i]
301	eor	x14,x24,x24,ror#23
302	and	x17,x25,x24
303	bic	x19,x26,x24
304	add	x27,x27,x11			// h+=X[i]
305	orr	x17,x17,x19			// Ch(e,f,g)
306	eor	x19,x20,x21			// a^b, b^c in next round
307	eor	x16,x16,x14,ror#18	// Sigma1(e)
308	ror	x14,x20,#28
309	add	x27,x27,x17			// h+=Ch(e,f,g)
310	eor	x17,x20,x20,ror#5
311	add	x27,x27,x16			// h+=Sigma1(e)
312	and	x28,x28,x19			// (b^c)&=(a^b)
313	add	x23,x23,x27			// d+=h
314	eor	x28,x28,x21			// Maj(a,b,c)
315	eor	x17,x14,x17,ror#34	// Sigma0(a)
316	add	x27,x27,x28			// h+=Maj(a,b,c)
317	ldr	x28,[x30],#8		// *K++, x19 in next round
318	//add	x27,x27,x17			// h+=Sigma0(a)
319#ifndef	__AARCH64EB__
320	rev	x12,x12			// 9
321#endif
322	ldp	x13,x14,[x1],#2*8
323	add	x27,x27,x17			// h+=Sigma0(a)
324	ror	x16,x23,#14
325	add	x26,x26,x28			// h+=K[i]
326	eor	x15,x23,x23,ror#23
327	and	x17,x24,x23
328	bic	x28,x25,x23
329	add	x26,x26,x12			// h+=X[i]
330	orr	x17,x17,x28			// Ch(e,f,g)
331	eor	x28,x27,x20			// a^b, b^c in next round
332	eor	x16,x16,x15,ror#18	// Sigma1(e)
333	ror	x15,x27,#28
334	add	x26,x26,x17			// h+=Ch(e,f,g)
335	eor	x17,x27,x27,ror#5
336	add	x26,x26,x16			// h+=Sigma1(e)
337	and	x19,x19,x28			// (b^c)&=(a^b)
338	add	x22,x22,x26			// d+=h
339	eor	x19,x19,x20			// Maj(a,b,c)
340	eor	x17,x15,x17,ror#34	// Sigma0(a)
341	add	x26,x26,x19			// h+=Maj(a,b,c)
342	ldr	x19,[x30],#8		// *K++, x28 in next round
343	//add	x26,x26,x17			// h+=Sigma0(a)
344#ifndef	__AARCH64EB__
345	rev	x13,x13			// 10
346#endif
347	add	x26,x26,x17			// h+=Sigma0(a)
348	ror	x16,x22,#14
349	add	x25,x25,x19			// h+=K[i]
350	eor	x0,x22,x22,ror#23
351	and	x17,x23,x22
352	bic	x19,x24,x22
353	add	x25,x25,x13			// h+=X[i]
354	orr	x17,x17,x19			// Ch(e,f,g)
355	eor	x19,x26,x27			// a^b, b^c in next round
356	eor	x16,x16,x0,ror#18	// Sigma1(e)
357	ror	x0,x26,#28
358	add	x25,x25,x17			// h+=Ch(e,f,g)
359	eor	x17,x26,x26,ror#5
360	add	x25,x25,x16			// h+=Sigma1(e)
361	and	x28,x28,x19			// (b^c)&=(a^b)
362	add	x21,x21,x25			// d+=h
363	eor	x28,x28,x27			// Maj(a,b,c)
364	eor	x17,x0,x17,ror#34	// Sigma0(a)
365	add	x25,x25,x28			// h+=Maj(a,b,c)
366	ldr	x28,[x30],#8		// *K++, x19 in next round
367	//add	x25,x25,x17			// h+=Sigma0(a)
368#ifndef	__AARCH64EB__
369	rev	x14,x14			// 11
370#endif
371	ldp	x15,x0,[x1],#2*8
372	add	x25,x25,x17			// h+=Sigma0(a)
373	str	x6,[sp,#24]
374	ror	x16,x21,#14
375	add	x24,x24,x28			// h+=K[i]
376	eor	x6,x21,x21,ror#23
377	and	x17,x22,x21
378	bic	x28,x23,x21
379	add	x24,x24,x14			// h+=X[i]
380	orr	x17,x17,x28			// Ch(e,f,g)
381	eor	x28,x25,x26			// a^b, b^c in next round
382	eor	x16,x16,x6,ror#18	// Sigma1(e)
383	ror	x6,x25,#28
384	add	x24,x24,x17			// h+=Ch(e,f,g)
385	eor	x17,x25,x25,ror#5
386	add	x24,x24,x16			// h+=Sigma1(e)
387	and	x19,x19,x28			// (b^c)&=(a^b)
388	add	x20,x20,x24			// d+=h
389	eor	x19,x19,x26			// Maj(a,b,c)
390	eor	x17,x6,x17,ror#34	// Sigma0(a)
391	add	x24,x24,x19			// h+=Maj(a,b,c)
392	ldr	x19,[x30],#8		// *K++, x28 in next round
393	//add	x24,x24,x17			// h+=Sigma0(a)
394#ifndef	__AARCH64EB__
395	rev	x15,x15			// 12
396#endif
397	add	x24,x24,x17			// h+=Sigma0(a)
398	str	x7,[sp,#0]
399	ror	x16,x20,#14
400	add	x23,x23,x19			// h+=K[i]
401	eor	x7,x20,x20,ror#23
402	and	x17,x21,x20
403	bic	x19,x22,x20
404	add	x23,x23,x15			// h+=X[i]
405	orr	x17,x17,x19			// Ch(e,f,g)
406	eor	x19,x24,x25			// a^b, b^c in next round
407	eor	x16,x16,x7,ror#18	// Sigma1(e)
408	ror	x7,x24,#28
409	add	x23,x23,x17			// h+=Ch(e,f,g)
410	eor	x17,x24,x24,ror#5
411	add	x23,x23,x16			// h+=Sigma1(e)
412	and	x28,x28,x19			// (b^c)&=(a^b)
413	add	x27,x27,x23			// d+=h
414	eor	x28,x28,x25			// Maj(a,b,c)
415	eor	x17,x7,x17,ror#34	// Sigma0(a)
416	add	x23,x23,x28			// h+=Maj(a,b,c)
417	ldr	x28,[x30],#8		// *K++, x19 in next round
418	//add	x23,x23,x17			// h+=Sigma0(a)
419#ifndef	__AARCH64EB__
420	rev	x0,x0			// 13
421#endif
422	ldp	x1,x2,[x1]
423	add	x23,x23,x17			// h+=Sigma0(a)
424	str	x8,[sp,#8]
425	ror	x16,x27,#14
426	add	x22,x22,x28			// h+=K[i]
427	eor	x8,x27,x27,ror#23
428	and	x17,x20,x27
429	bic	x28,x21,x27
430	add	x22,x22,x0			// h+=X[i]
431	orr	x17,x17,x28			// Ch(e,f,g)
432	eor	x28,x23,x24			// a^b, b^c in next round
433	eor	x16,x16,x8,ror#18	// Sigma1(e)
434	ror	x8,x23,#28
435	add	x22,x22,x17			// h+=Ch(e,f,g)
436	eor	x17,x23,x23,ror#5
437	add	x22,x22,x16			// h+=Sigma1(e)
438	and	x19,x19,x28			// (b^c)&=(a^b)
439	add	x26,x26,x22			// d+=h
440	eor	x19,x19,x24			// Maj(a,b,c)
441	eor	x17,x8,x17,ror#34	// Sigma0(a)
442	add	x22,x22,x19			// h+=Maj(a,b,c)
443	ldr	x19,[x30],#8		// *K++, x28 in next round
444	//add	x22,x22,x17			// h+=Sigma0(a)
445#ifndef	__AARCH64EB__
446	rev	x1,x1			// 14
447#endif
448	ldr	x6,[sp,#24]
449	add	x22,x22,x17			// h+=Sigma0(a)
450	str	x9,[sp,#16]
451	ror	x16,x26,#14
452	add	x21,x21,x19			// h+=K[i]
453	eor	x9,x26,x26,ror#23
454	and	x17,x27,x26
455	bic	x19,x20,x26
456	add	x21,x21,x1			// h+=X[i]
457	orr	x17,x17,x19			// Ch(e,f,g)
458	eor	x19,x22,x23			// a^b, b^c in next round
459	eor	x16,x16,x9,ror#18	// Sigma1(e)
460	ror	x9,x22,#28
461	add	x21,x21,x17			// h+=Ch(e,f,g)
462	eor	x17,x22,x22,ror#5
463	add	x21,x21,x16			// h+=Sigma1(e)
464	and	x28,x28,x19			// (b^c)&=(a^b)
465	add	x25,x25,x21			// d+=h
466	eor	x28,x28,x23			// Maj(a,b,c)
467	eor	x17,x9,x17,ror#34	// Sigma0(a)
468	add	x21,x21,x28			// h+=Maj(a,b,c)
469	ldr	x28,[x30],#8		// *K++, x19 in next round
470	//add	x21,x21,x17			// h+=Sigma0(a)
471#ifndef	__AARCH64EB__
472	rev	x2,x2			// 15
473#endif
474	ldr	x7,[sp,#0]
475	add	x21,x21,x17			// h+=Sigma0(a)
476	str	x10,[sp,#24]
477	ror	x16,x25,#14
478	add	x20,x20,x28			// h+=K[i]
479	ror	x9,x4,#1
480	and	x17,x26,x25
481	ror	x8,x1,#19
482	bic	x28,x27,x25
483	ror	x10,x21,#28
484	add	x20,x20,x2			// h+=X[i]
485	eor	x16,x16,x25,ror#18
486	eor	x9,x9,x4,ror#8
487	orr	x17,x17,x28			// Ch(e,f,g)
488	eor	x28,x21,x22			// a^b, b^c in next round
489	eor	x16,x16,x25,ror#41	// Sigma1(e)
490	eor	x10,x10,x21,ror#34
491	add	x20,x20,x17			// h+=Ch(e,f,g)
492	and	x19,x19,x28			// (b^c)&=(a^b)
493	eor	x8,x8,x1,ror#61
494	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
495	add	x20,x20,x16			// h+=Sigma1(e)
496	eor	x19,x19,x22			// Maj(a,b,c)
497	eor	x17,x10,x21,ror#39	// Sigma0(a)
498	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
499	add	x3,x3,x12
500	add	x24,x24,x20			// d+=h
501	add	x20,x20,x19			// h+=Maj(a,b,c)
502	ldr	x19,[x30],#8		// *K++, x28 in next round
503	add	x3,x3,x9
504	add	x20,x20,x17			// h+=Sigma0(a)
505	add	x3,x3,x8
506.Loop_16_xx:
507	ldr	x8,[sp,#8]
508	str	x11,[sp,#0]
509	ror	x16,x24,#14
510	add	x27,x27,x19			// h+=K[i]
511	ror	x10,x5,#1
512	and	x17,x25,x24
513	ror	x9,x2,#19
514	bic	x19,x26,x24
515	ror	x11,x20,#28
516	add	x27,x27,x3			// h+=X[i]
517	eor	x16,x16,x24,ror#18
518	eor	x10,x10,x5,ror#8
519	orr	x17,x17,x19			// Ch(e,f,g)
520	eor	x19,x20,x21			// a^b, b^c in next round
521	eor	x16,x16,x24,ror#41	// Sigma1(e)
522	eor	x11,x11,x20,ror#34
523	add	x27,x27,x17			// h+=Ch(e,f,g)
524	and	x28,x28,x19			// (b^c)&=(a^b)
525	eor	x9,x9,x2,ror#61
526	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
527	add	x27,x27,x16			// h+=Sigma1(e)
528	eor	x28,x28,x21			// Maj(a,b,c)
529	eor	x17,x11,x20,ror#39	// Sigma0(a)
530	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
531	add	x4,x4,x13
532	add	x23,x23,x27			// d+=h
533	add	x27,x27,x28			// h+=Maj(a,b,c)
534	ldr	x28,[x30],#8		// *K++, x19 in next round
535	add	x4,x4,x10
536	add	x27,x27,x17			// h+=Sigma0(a)
537	add	x4,x4,x9
538	ldr	x9,[sp,#16]
539	str	x12,[sp,#8]
540	ror	x16,x23,#14
541	add	x26,x26,x28			// h+=K[i]
542	ror	x11,x6,#1
543	and	x17,x24,x23
544	ror	x10,x3,#19
545	bic	x28,x25,x23
546	ror	x12,x27,#28
547	add	x26,x26,x4			// h+=X[i]
548	eor	x16,x16,x23,ror#18
549	eor	x11,x11,x6,ror#8
550	orr	x17,x17,x28			// Ch(e,f,g)
551	eor	x28,x27,x20			// a^b, b^c in next round
552	eor	x16,x16,x23,ror#41	// Sigma1(e)
553	eor	x12,x12,x27,ror#34
554	add	x26,x26,x17			// h+=Ch(e,f,g)
555	and	x19,x19,x28			// (b^c)&=(a^b)
556	eor	x10,x10,x3,ror#61
557	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
558	add	x26,x26,x16			// h+=Sigma1(e)
559	eor	x19,x19,x20			// Maj(a,b,c)
560	eor	x17,x12,x27,ror#39	// Sigma0(a)
561	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
562	add	x5,x5,x14
563	add	x22,x22,x26			// d+=h
564	add	x26,x26,x19			// h+=Maj(a,b,c)
565	ldr	x19,[x30],#8		// *K++, x28 in next round
566	add	x5,x5,x11
567	add	x26,x26,x17			// h+=Sigma0(a)
568	add	x5,x5,x10
569	ldr	x10,[sp,#24]
570	str	x13,[sp,#16]
571	ror	x16,x22,#14
572	add	x25,x25,x19			// h+=K[i]
573	ror	x12,x7,#1
574	and	x17,x23,x22
575	ror	x11,x4,#19
576	bic	x19,x24,x22
577	ror	x13,x26,#28
578	add	x25,x25,x5			// h+=X[i]
579	eor	x16,x16,x22,ror#18
580	eor	x12,x12,x7,ror#8
581	orr	x17,x17,x19			// Ch(e,f,g)
582	eor	x19,x26,x27			// a^b, b^c in next round
583	eor	x16,x16,x22,ror#41	// Sigma1(e)
584	eor	x13,x13,x26,ror#34
585	add	x25,x25,x17			// h+=Ch(e,f,g)
586	and	x28,x28,x19			// (b^c)&=(a^b)
587	eor	x11,x11,x4,ror#61
588	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
589	add	x25,x25,x16			// h+=Sigma1(e)
590	eor	x28,x28,x27			// Maj(a,b,c)
591	eor	x17,x13,x26,ror#39	// Sigma0(a)
592	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
593	add	x6,x6,x15
594	add	x21,x21,x25			// d+=h
595	add	x25,x25,x28			// h+=Maj(a,b,c)
596	ldr	x28,[x30],#8		// *K++, x19 in next round
597	add	x6,x6,x12
598	add	x25,x25,x17			// h+=Sigma0(a)
599	add	x6,x6,x11
600	ldr	x11,[sp,#0]
601	str	x14,[sp,#24]
602	ror	x16,x21,#14
603	add	x24,x24,x28			// h+=K[i]
604	ror	x13,x8,#1
605	and	x17,x22,x21
606	ror	x12,x5,#19
607	bic	x28,x23,x21
608	ror	x14,x25,#28
609	add	x24,x24,x6			// h+=X[i]
610	eor	x16,x16,x21,ror#18
611	eor	x13,x13,x8,ror#8
612	orr	x17,x17,x28			// Ch(e,f,g)
613	eor	x28,x25,x26			// a^b, b^c in next round
614	eor	x16,x16,x21,ror#41	// Sigma1(e)
615	eor	x14,x14,x25,ror#34
616	add	x24,x24,x17			// h+=Ch(e,f,g)
617	and	x19,x19,x28			// (b^c)&=(a^b)
618	eor	x12,x12,x5,ror#61
619	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
620	add	x24,x24,x16			// h+=Sigma1(e)
621	eor	x19,x19,x26			// Maj(a,b,c)
622	eor	x17,x14,x25,ror#39	// Sigma0(a)
623	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
624	add	x7,x7,x0
625	add	x20,x20,x24			// d+=h
626	add	x24,x24,x19			// h+=Maj(a,b,c)
627	ldr	x19,[x30],#8		// *K++, x28 in next round
628	add	x7,x7,x13
629	add	x24,x24,x17			// h+=Sigma0(a)
630	add	x7,x7,x12
631	ldr	x12,[sp,#8]
632	str	x15,[sp,#0]
633	ror	x16,x20,#14
634	add	x23,x23,x19			// h+=K[i]
635	ror	x14,x9,#1
636	and	x17,x21,x20
637	ror	x13,x6,#19
638	bic	x19,x22,x20
639	ror	x15,x24,#28
640	add	x23,x23,x7			// h+=X[i]
641	eor	x16,x16,x20,ror#18
642	eor	x14,x14,x9,ror#8
643	orr	x17,x17,x19			// Ch(e,f,g)
644	eor	x19,x24,x25			// a^b, b^c in next round
645	eor	x16,x16,x20,ror#41	// Sigma1(e)
646	eor	x15,x15,x24,ror#34
647	add	x23,x23,x17			// h+=Ch(e,f,g)
648	and	x28,x28,x19			// (b^c)&=(a^b)
649	eor	x13,x13,x6,ror#61
650	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
651	add	x23,x23,x16			// h+=Sigma1(e)
652	eor	x28,x28,x25			// Maj(a,b,c)
653	eor	x17,x15,x24,ror#39	// Sigma0(a)
654	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
655	add	x8,x8,x1
656	add	x27,x27,x23			// d+=h
657	add	x23,x23,x28			// h+=Maj(a,b,c)
658	ldr	x28,[x30],#8		// *K++, x19 in next round
659	add	x8,x8,x14
660	add	x23,x23,x17			// h+=Sigma0(a)
661	add	x8,x8,x13
662	ldr	x13,[sp,#16]
663	str	x0,[sp,#8]
664	ror	x16,x27,#14
665	add	x22,x22,x28			// h+=K[i]
666	ror	x15,x10,#1
667	and	x17,x20,x27
668	ror	x14,x7,#19
669	bic	x28,x21,x27
670	ror	x0,x23,#28
671	add	x22,x22,x8			// h+=X[i]
672	eor	x16,x16,x27,ror#18
673	eor	x15,x15,x10,ror#8
674	orr	x17,x17,x28			// Ch(e,f,g)
675	eor	x28,x23,x24			// a^b, b^c in next round
676	eor	x16,x16,x27,ror#41	// Sigma1(e)
677	eor	x0,x0,x23,ror#34
678	add	x22,x22,x17			// h+=Ch(e,f,g)
679	and	x19,x19,x28			// (b^c)&=(a^b)
680	eor	x14,x14,x7,ror#61
681	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
682	add	x22,x22,x16			// h+=Sigma1(e)
683	eor	x19,x19,x24			// Maj(a,b,c)
684	eor	x17,x0,x23,ror#39	// Sigma0(a)
685	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
686	add	x9,x9,x2
687	add	x26,x26,x22			// d+=h
688	add	x22,x22,x19			// h+=Maj(a,b,c)
689	ldr	x19,[x30],#8		// *K++, x28 in next round
690	add	x9,x9,x15
691	add	x22,x22,x17			// h+=Sigma0(a)
692	add	x9,x9,x14
693	ldr	x14,[sp,#24]
694	str	x1,[sp,#16]
695	ror	x16,x26,#14
696	add	x21,x21,x19			// h+=K[i]
697	ror	x0,x11,#1
698	and	x17,x27,x26
699	ror	x15,x8,#19
700	bic	x19,x20,x26
701	ror	x1,x22,#28
702	add	x21,x21,x9			// h+=X[i]
703	eor	x16,x16,x26,ror#18
704	eor	x0,x0,x11,ror#8
705	orr	x17,x17,x19			// Ch(e,f,g)
706	eor	x19,x22,x23			// a^b, b^c in next round
707	eor	x16,x16,x26,ror#41	// Sigma1(e)
708	eor	x1,x1,x22,ror#34
709	add	x21,x21,x17			// h+=Ch(e,f,g)
710	and	x28,x28,x19			// (b^c)&=(a^b)
711	eor	x15,x15,x8,ror#61
712	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
713	add	x21,x21,x16			// h+=Sigma1(e)
714	eor	x28,x28,x23			// Maj(a,b,c)
715	eor	x17,x1,x22,ror#39	// Sigma0(a)
716	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
717	add	x10,x10,x3
718	add	x25,x25,x21			// d+=h
719	add	x21,x21,x28			// h+=Maj(a,b,c)
720	ldr	x28,[x30],#8		// *K++, x19 in next round
721	add	x10,x10,x0
722	add	x21,x21,x17			// h+=Sigma0(a)
723	add	x10,x10,x15
724	ldr	x15,[sp,#0]
725	str	x2,[sp,#24]
726	ror	x16,x25,#14
727	add	x20,x20,x28			// h+=K[i]
728	ror	x1,x12,#1
729	and	x17,x26,x25
730	ror	x0,x9,#19
731	bic	x28,x27,x25
732	ror	x2,x21,#28
733	add	x20,x20,x10			// h+=X[i]
734	eor	x16,x16,x25,ror#18
735	eor	x1,x1,x12,ror#8
736	orr	x17,x17,x28			// Ch(e,f,g)
737	eor	x28,x21,x22			// a^b, b^c in next round
738	eor	x16,x16,x25,ror#41	// Sigma1(e)
739	eor	x2,x2,x21,ror#34
740	add	x20,x20,x17			// h+=Ch(e,f,g)
741	and	x19,x19,x28			// (b^c)&=(a^b)
742	eor	x0,x0,x9,ror#61
743	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
744	add	x20,x20,x16			// h+=Sigma1(e)
745	eor	x19,x19,x22			// Maj(a,b,c)
746	eor	x17,x2,x21,ror#39	// Sigma0(a)
747	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
748	add	x11,x11,x4
749	add	x24,x24,x20			// d+=h
750	add	x20,x20,x19			// h+=Maj(a,b,c)
751	ldr	x19,[x30],#8		// *K++, x28 in next round
752	add	x11,x11,x1
753	add	x20,x20,x17			// h+=Sigma0(a)
754	add	x11,x11,x0
755	ldr	x0,[sp,#8]
756	str	x3,[sp,#0]
757	ror	x16,x24,#14
758	add	x27,x27,x19			// h+=K[i]
759	ror	x2,x13,#1
760	and	x17,x25,x24
761	ror	x1,x10,#19
762	bic	x19,x26,x24
763	ror	x3,x20,#28
764	add	x27,x27,x11			// h+=X[i]
765	eor	x16,x16,x24,ror#18
766	eor	x2,x2,x13,ror#8
767	orr	x17,x17,x19			// Ch(e,f,g)
768	eor	x19,x20,x21			// a^b, b^c in next round
769	eor	x16,x16,x24,ror#41	// Sigma1(e)
770	eor	x3,x3,x20,ror#34
771	add	x27,x27,x17			// h+=Ch(e,f,g)
772	and	x28,x28,x19			// (b^c)&=(a^b)
773	eor	x1,x1,x10,ror#61
774	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
775	add	x27,x27,x16			// h+=Sigma1(e)
776	eor	x28,x28,x21			// Maj(a,b,c)
777	eor	x17,x3,x20,ror#39	// Sigma0(a)
778	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
779	add	x12,x12,x5
780	add	x23,x23,x27			// d+=h
781	add	x27,x27,x28			// h+=Maj(a,b,c)
782	ldr	x28,[x30],#8		// *K++, x19 in next round
783	add	x12,x12,x2
784	add	x27,x27,x17			// h+=Sigma0(a)
785	add	x12,x12,x1
786	ldr	x1,[sp,#16]
787	str	x4,[sp,#8]
788	ror	x16,x23,#14
789	add	x26,x26,x28			// h+=K[i]
790	ror	x3,x14,#1
791	and	x17,x24,x23
792	ror	x2,x11,#19
793	bic	x28,x25,x23
794	ror	x4,x27,#28
795	add	x26,x26,x12			// h+=X[i]
796	eor	x16,x16,x23,ror#18
797	eor	x3,x3,x14,ror#8
798	orr	x17,x17,x28			// Ch(e,f,g)
799	eor	x28,x27,x20			// a^b, b^c in next round
800	eor	x16,x16,x23,ror#41	// Sigma1(e)
801	eor	x4,x4,x27,ror#34
802	add	x26,x26,x17			// h+=Ch(e,f,g)
803	and	x19,x19,x28			// (b^c)&=(a^b)
804	eor	x2,x2,x11,ror#61
805	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
806	add	x26,x26,x16			// h+=Sigma1(e)
807	eor	x19,x19,x20			// Maj(a,b,c)
808	eor	x17,x4,x27,ror#39	// Sigma0(a)
809	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
810	add	x13,x13,x6
811	add	x22,x22,x26			// d+=h
812	add	x26,x26,x19			// h+=Maj(a,b,c)
813	ldr	x19,[x30],#8		// *K++, x28 in next round
814	add	x13,x13,x3
815	add	x26,x26,x17			// h+=Sigma0(a)
816	add	x13,x13,x2
817	ldr	x2,[sp,#24]
818	str	x5,[sp,#16]
819	ror	x16,x22,#14
820	add	x25,x25,x19			// h+=K[i]
821	ror	x4,x15,#1
822	and	x17,x23,x22
823	ror	x3,x12,#19
824	bic	x19,x24,x22
825	ror	x5,x26,#28
826	add	x25,x25,x13			// h+=X[i]
827	eor	x16,x16,x22,ror#18
828	eor	x4,x4,x15,ror#8
829	orr	x17,x17,x19			// Ch(e,f,g)
830	eor	x19,x26,x27			// a^b, b^c in next round
831	eor	x16,x16,x22,ror#41	// Sigma1(e)
832	eor	x5,x5,x26,ror#34
833	add	x25,x25,x17			// h+=Ch(e,f,g)
834	and	x28,x28,x19			// (b^c)&=(a^b)
835	eor	x3,x3,x12,ror#61
836	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
837	add	x25,x25,x16			// h+=Sigma1(e)
838	eor	x28,x28,x27			// Maj(a,b,c)
839	eor	x17,x5,x26,ror#39	// Sigma0(a)
840	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
841	add	x14,x14,x7
842	add	x21,x21,x25			// d+=h
843	add	x25,x25,x28			// h+=Maj(a,b,c)
844	ldr	x28,[x30],#8		// *K++, x19 in next round
845	add	x14,x14,x4
846	add	x25,x25,x17			// h+=Sigma0(a)
847	add	x14,x14,x3
848	ldr	x3,[sp,#0]
849	str	x6,[sp,#24]
850	ror	x16,x21,#14
851	add	x24,x24,x28			// h+=K[i]
852	ror	x5,x0,#1
853	and	x17,x22,x21
854	ror	x4,x13,#19
855	bic	x28,x23,x21
856	ror	x6,x25,#28
857	add	x24,x24,x14			// h+=X[i]
858	eor	x16,x16,x21,ror#18
859	eor	x5,x5,x0,ror#8
860	orr	x17,x17,x28			// Ch(e,f,g)
861	eor	x28,x25,x26			// a^b, b^c in next round
862	eor	x16,x16,x21,ror#41	// Sigma1(e)
863	eor	x6,x6,x25,ror#34
864	add	x24,x24,x17			// h+=Ch(e,f,g)
865	and	x19,x19,x28			// (b^c)&=(a^b)
866	eor	x4,x4,x13,ror#61
867	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
868	add	x24,x24,x16			// h+=Sigma1(e)
869	eor	x19,x19,x26			// Maj(a,b,c)
870	eor	x17,x6,x25,ror#39	// Sigma0(a)
871	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
872	add	x15,x15,x8
873	add	x20,x20,x24			// d+=h
874	add	x24,x24,x19			// h+=Maj(a,b,c)
875	ldr	x19,[x30],#8		// *K++, x28 in next round
876	add	x15,x15,x5
877	add	x24,x24,x17			// h+=Sigma0(a)
878	add	x15,x15,x4
879	ldr	x4,[sp,#8]
880	str	x7,[sp,#0]
881	ror	x16,x20,#14
882	add	x23,x23,x19			// h+=K[i]
883	ror	x6,x1,#1
884	and	x17,x21,x20
885	ror	x5,x14,#19
886	bic	x19,x22,x20
887	ror	x7,x24,#28
888	add	x23,x23,x15			// h+=X[i]
889	eor	x16,x16,x20,ror#18
890	eor	x6,x6,x1,ror#8
891	orr	x17,x17,x19			// Ch(e,f,g)
892	eor	x19,x24,x25			// a^b, b^c in next round
893	eor	x16,x16,x20,ror#41	// Sigma1(e)
894	eor	x7,x7,x24,ror#34
895	add	x23,x23,x17			// h+=Ch(e,f,g)
896	and	x28,x28,x19			// (b^c)&=(a^b)
897	eor	x5,x5,x14,ror#61
898	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
899	add	x23,x23,x16			// h+=Sigma1(e)
900	eor	x28,x28,x25			// Maj(a,b,c)
901	eor	x17,x7,x24,ror#39	// Sigma0(a)
902	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
903	add	x0,x0,x9
904	add	x27,x27,x23			// d+=h
905	add	x23,x23,x28			// h+=Maj(a,b,c)
906	ldr	x28,[x30],#8		// *K++, x19 in next round
907	add	x0,x0,x6
908	add	x23,x23,x17			// h+=Sigma0(a)
909	add	x0,x0,x5
910	ldr	x5,[sp,#16]
911	str	x8,[sp,#8]
912	ror	x16,x27,#14
913	add	x22,x22,x28			// h+=K[i]
914	ror	x7,x2,#1
915	and	x17,x20,x27
916	ror	x6,x15,#19
917	bic	x28,x21,x27
918	ror	x8,x23,#28
919	add	x22,x22,x0			// h+=X[i]
920	eor	x16,x16,x27,ror#18
921	eor	x7,x7,x2,ror#8
922	orr	x17,x17,x28			// Ch(e,f,g)
923	eor	x28,x23,x24			// a^b, b^c in next round
924	eor	x16,x16,x27,ror#41	// Sigma1(e)
925	eor	x8,x8,x23,ror#34
926	add	x22,x22,x17			// h+=Ch(e,f,g)
927	and	x19,x19,x28			// (b^c)&=(a^b)
928	eor	x6,x6,x15,ror#61
929	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
930	add	x22,x22,x16			// h+=Sigma1(e)
931	eor	x19,x19,x24			// Maj(a,b,c)
932	eor	x17,x8,x23,ror#39	// Sigma0(a)
933	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
934	add	x1,x1,x10
935	add	x26,x26,x22			// d+=h
936	add	x22,x22,x19			// h+=Maj(a,b,c)
937	ldr	x19,[x30],#8		// *K++, x28 in next round
938	add	x1,x1,x7
939	add	x22,x22,x17			// h+=Sigma0(a)
940	add	x1,x1,x6
941	ldr	x6,[sp,#24]
942	str	x9,[sp,#16]
943	ror	x16,x26,#14
944	add	x21,x21,x19			// h+=K[i]
945	ror	x8,x3,#1
946	and	x17,x27,x26
947	ror	x7,x0,#19
948	bic	x19,x20,x26
949	ror	x9,x22,#28
950	add	x21,x21,x1			// h+=X[i]
951	eor	x16,x16,x26,ror#18
952	eor	x8,x8,x3,ror#8
953	orr	x17,x17,x19			// Ch(e,f,g)
954	eor	x19,x22,x23			// a^b, b^c in next round
955	eor	x16,x16,x26,ror#41	// Sigma1(e)
956	eor	x9,x9,x22,ror#34
957	add	x21,x21,x17			// h+=Ch(e,f,g)
958	and	x28,x28,x19			// (b^c)&=(a^b)
959	eor	x7,x7,x0,ror#61
960	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
961	add	x21,x21,x16			// h+=Sigma1(e)
962	eor	x28,x28,x23			// Maj(a,b,c)
963	eor	x17,x9,x22,ror#39	// Sigma0(a)
964	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
965	add	x2,x2,x11
966	add	x25,x25,x21			// d+=h
967	add	x21,x21,x28			// h+=Maj(a,b,c)
968	ldr	x28,[x30],#8		// *K++, x19 in next round
969	add	x2,x2,x8
970	add	x21,x21,x17			// h+=Sigma0(a)
971	add	x2,x2,x7
972	ldr	x7,[sp,#0]
973	str	x10,[sp,#24]
974	ror	x16,x25,#14
975	add	x20,x20,x28			// h+=K[i]
976	ror	x9,x4,#1
977	and	x17,x26,x25
978	ror	x8,x1,#19
979	bic	x28,x27,x25
980	ror	x10,x21,#28
981	add	x20,x20,x2			// h+=X[i]
982	eor	x16,x16,x25,ror#18
983	eor	x9,x9,x4,ror#8
984	orr	x17,x17,x28			// Ch(e,f,g)
985	eor	x28,x21,x22			// a^b, b^c in next round
986	eor	x16,x16,x25,ror#41	// Sigma1(e)
987	eor	x10,x10,x21,ror#34
988	add	x20,x20,x17			// h+=Ch(e,f,g)
989	and	x19,x19,x28			// (b^c)&=(a^b)
990	eor	x8,x8,x1,ror#61
991	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
992	add	x20,x20,x16			// h+=Sigma1(e)
993	eor	x19,x19,x22			// Maj(a,b,c)
994	eor	x17,x10,x21,ror#39	// Sigma0(a)
995	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
996	add	x3,x3,x12
997	add	x24,x24,x20			// d+=h
998	add	x20,x20,x19			// h+=Maj(a,b,c)
999	ldr	x19,[x30],#8		// *K++, x28 in next round
1000	add	x3,x3,x9
1001	add	x20,x20,x17			// h+=Sigma0(a)
1002	add	x3,x3,x8
	// x19 holds the table entry loaded at the end of the round above;
	// it is zero only for the terminator that follows the last constant.
1003	cbnz	x19,.Loop_16_xx
1004
	// All rounds of this block are done. Reload the values parked in the frame.
1005	ldp	x0,x2,[x29,#96]		// x0 = state pointer, x2 = end of input
1006	ldr	x1,[x29,#112]			// x1 = input pointer as saved at the top of .Loop
1007	sub	x30,x30,#648		// rewind: 81 table entries * 8 bytes were consumed
1008
	// Add the working variables to the previous state and store the result.
	// The loads and stores are interleaved with the adds.
1009	ldp	x3,x4,[x0]
1010	ldp	x5,x6,[x0,#2*8]
1011	add	x1,x1,#14*8			// advance input pointer: 16 (already consumed) + 112 = 128
1012	ldp	x7,x8,[x0,#4*8]
1013	add	x20,x20,x3
1014	ldp	x9,x10,[x0,#6*8]
1015	add	x21,x21,x4
1016	add	x22,x22,x5
1017	add	x23,x23,x6
1018	stp	x20,x21,[x0]
1019	add	x24,x24,x7
1020	add	x25,x25,x8
1021	stp	x22,x23,[x0,#2*8]
1022	add	x26,x26,x9
1023	add	x27,x27,x10
1024	cmp	x1,x2				// reached end of input? (the stp's below leave the flags intact)
1025	stp	x24,x25,[x0,#4*8]
1026	stp	x26,x27,[x0,#6*8]
1027	b.ne	.Loop				// more blocks: x20-x27 already hold the new state
1028
	// Epilogue: restore callee-saved registers from the frame, drop the
	// 32-byte spill area, then pop the frame. x30 was used as the table
	// pointer, so the return address comes back from the frame here.
1029	ldp	x19,x20,[x29,#16]
1030	add	sp,sp,#4*8
1031	ldp	x21,x22,[x29,#32]
1032	ldp	x23,x24,[x29,#48]
1033	ldp	x25,x26,[x29,#64]
1034	ldp	x27,x28,[x29,#80]
1035	ldp	x29,x30,[sp],#128
1036	ret
1037.size	zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
1038
1039
// zfs_sha512_block_armv8: SHA-512 block function built on the Armv8 SHA512
// instructions (sha512h, sha512h2, sha512su0, sha512su1). They are emitted
// below as raw .inst words; the mnemonic is given in the trailing comment.
//
// In:	x0 = context: eight 64-bit state words (64 bytes), updated in place
//	x1 = input, consumed 128 bytes per loop iteration
//	x2 = number of 128-byte blocks. There is no zero check: a count of 0
//	     wraps to 2^64-1 in the first subs, so callers must pass >= 1.
//
// Register use:
//	v0-v3	working state; v4 is the extra register the state rotates through
//	v5-v7	ext temporaries feeding sha512h / sha512su1
//	v16-v23	16-entry message schedule, two 64-bit words per register
//	v24,v25	K512 pair + schedule pair for the current / following group
//	v26-v29	copy of the state taken at the top of each block
//	x3	cursor into .LK512
//	x4	x1 - 128, used for the last-block rewind
// Only v0-v7 and v16-v29 are written; v8-v15 are not touched.
1040.globl	zfs_sha512_block_armv8
1041.type	zfs_sha512_block_armv8,%function
1042.align	6
1043zfs_sha512_block_armv8:
1044	hint		#34				// bti c
1045.Lv8_entry:
1046	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
// The stp only builds a frame record. Nothing below issues a call, so x30
// stays live and the epilogue reloads x29 alone.
1047	stp		x29,x30,[sp,#-16]!
1048	add		x29,sp,#0
1049
// Fetch the first 128-byte block (x1 ends up 128 bytes further on). Every
// later block is fetched inside the loop, during the last eight groups.
1050	ld1		{v16.16b-v19.16b},[x1],#64	// load input
1051	ld1		{v20.16b-v23.16b},[x1],#64
1052
1053	ld1		{v0.2d-v3.2d},[x0]		// load context
1054	adr		x3,.LK512
1055
// Byte-reverse each 64-bit lane of the message block.
1056	rev64		v16.16b,v16.16b
1057	rev64		v17.16b,v17.16b
1058	rev64		v18.16b,v18.16b
1059	rev64		v19.16b,v19.16b
1060	rev64		v20.16b,v20.16b
1061	rev64		v21.16b,v21.16b
1062	rev64		v22.16b,v22.16b
1063	rev64		v23.16b,v23.16b
// Skip any padding inserted by the .align below.
1064	b		.Loop_hw
1065
1066.align	4
1067.Loop_hw:
// One iteration per 128-byte block; v24 receives the first K512 pair.
// subs leaves Z set on the last block. The csel below then pulls x1 back by
// 128 bytes, so the "load next input" loads in the final groups re-read the
// current block instead of reading beyond it.
1068	ld1		{v24.2d},[x3],#16
1069	subs		x2,x2,#1
1070	sub		x4,x1,#128
// orr Vd,Vn,Vn is a plain register copy: keep the incoming state in v26-v29.
1071	orr		v26.16b,v0.16b,v0.16b			// offload
1072	orr		v27.16b,v1.16b,v1.16b
1073	orr		v28.16b,v2.16b,v2.16b
1074	orr		v29.16b,v3.16b,v3.16b
1075	csel		x1,x1,x4,ne			// conditional rewind
// Rounds 0-15 (eight groups of two rounds). Pattern of every group:
//   v24/v25   = K512 pair + schedule pair, 64-bit halves swapped by ext #8;
//               the K512 pair for the following group is loaded alongside
//   v5, v6    = ext-combined neighbouring state registers for sha512h
//   sha512su0/sha512su1 (indented lines) overwrite the schedule register
//               just consumed; it is read again eight groups later
//   sha512h/sha512h2 advance the state; the roles of v0-v4 rotate with a
//               period of five groups
1076	add		v24.2d,v24.2d,v16.2d
1077	ld1		{v25.2d},[x3],#16
1078	ext		v24.16b,v24.16b,v24.16b,#8
1079	ext		v5.16b,v2.16b,v3.16b,#8
1080	ext		v6.16b,v1.16b,v2.16b,#8
1081	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1082	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1083	 ext		v7.16b,v20.16b,v21.16b,#8
1084	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1085	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1086	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1087	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1088	add		v25.2d,v25.2d,v17.2d
1089	ld1		{v24.2d},[x3],#16
1090	ext		v25.16b,v25.16b,v25.16b,#8
1091	ext		v5.16b,v4.16b,v2.16b,#8
1092	ext		v6.16b,v0.16b,v4.16b,#8
1093	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1094	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1095	 ext		v7.16b,v21.16b,v22.16b,#8
1096	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1097	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1098	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1099	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1100	add		v24.2d,v24.2d,v18.2d
1101	ld1		{v25.2d},[x3],#16
1102	ext		v24.16b,v24.16b,v24.16b,#8
1103	ext		v5.16b,v1.16b,v4.16b,#8
1104	ext		v6.16b,v3.16b,v1.16b,#8
1105	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1106	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1107	 ext		v7.16b,v22.16b,v23.16b,#8
1108	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1109	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1110	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1111	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1112	add		v25.2d,v25.2d,v19.2d
1113	ld1		{v24.2d},[x3],#16
1114	ext		v25.16b,v25.16b,v25.16b,#8
1115	ext		v5.16b,v0.16b,v1.16b,#8
1116	ext		v6.16b,v2.16b,v0.16b,#8
1117	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1118	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1119	 ext		v7.16b,v23.16b,v16.16b,#8
1120	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1121	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1122	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1123	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1124	add		v24.2d,v24.2d,v20.2d
1125	ld1		{v25.2d},[x3],#16
1126	ext		v24.16b,v24.16b,v24.16b,#8
1127	ext		v5.16b,v3.16b,v0.16b,#8
1128	ext		v6.16b,v4.16b,v3.16b,#8
1129	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1130	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1131	 ext		v7.16b,v16.16b,v17.16b,#8
1132	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1133	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1134	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1135	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1136	add		v25.2d,v25.2d,v21.2d
1137	ld1		{v24.2d},[x3],#16
1138	ext		v25.16b,v25.16b,v25.16b,#8
1139	ext		v5.16b,v2.16b,v3.16b,#8
1140	ext		v6.16b,v1.16b,v2.16b,#8
1141	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1142	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1143	 ext		v7.16b,v17.16b,v18.16b,#8
1144	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1145	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1146	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1147	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1148	add		v24.2d,v24.2d,v22.2d
1149	ld1		{v25.2d},[x3],#16
1150	ext		v24.16b,v24.16b,v24.16b,#8
1151	ext		v5.16b,v4.16b,v2.16b,#8
1152	ext		v6.16b,v0.16b,v4.16b,#8
1153	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1154	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1155	 ext		v7.16b,v18.16b,v19.16b,#8
1156	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1157	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1158	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1159	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1160	add		v25.2d,v25.2d,v23.2d
1161	ld1		{v24.2d},[x3],#16
1162	ext		v25.16b,v25.16b,v25.16b,#8
1163	ext		v5.16b,v1.16b,v4.16b,#8
1164	ext		v6.16b,v3.16b,v1.16b,#8
1165	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1166	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1167	 ext		v7.16b,v19.16b,v20.16b,#8
1168	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1169	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1170	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1171	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// Rounds 16-31: second pass over v16-v23, same group pattern.
1172	add		v24.2d,v24.2d,v16.2d
1173	ld1		{v25.2d},[x3],#16
1174	ext		v24.16b,v24.16b,v24.16b,#8
1175	ext		v5.16b,v0.16b,v1.16b,#8
1176	ext		v6.16b,v2.16b,v0.16b,#8
1177	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1178	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1179	 ext		v7.16b,v20.16b,v21.16b,#8
1180	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1181	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1182	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1183	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1184	add		v25.2d,v25.2d,v17.2d
1185	ld1		{v24.2d},[x3],#16
1186	ext		v25.16b,v25.16b,v25.16b,#8
1187	ext		v5.16b,v3.16b,v0.16b,#8
1188	ext		v6.16b,v4.16b,v3.16b,#8
1189	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1190	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1191	 ext		v7.16b,v21.16b,v22.16b,#8
1192	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1193	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1194	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1195	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1196	add		v24.2d,v24.2d,v18.2d
1197	ld1		{v25.2d},[x3],#16
1198	ext		v24.16b,v24.16b,v24.16b,#8
1199	ext		v5.16b,v2.16b,v3.16b,#8
1200	ext		v6.16b,v1.16b,v2.16b,#8
1201	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1202	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1203	 ext		v7.16b,v22.16b,v23.16b,#8
1204	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1205	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1206	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1207	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1208	add		v25.2d,v25.2d,v19.2d
1209	ld1		{v24.2d},[x3],#16
1210	ext		v25.16b,v25.16b,v25.16b,#8
1211	ext		v5.16b,v4.16b,v2.16b,#8
1212	ext		v6.16b,v0.16b,v4.16b,#8
1213	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1214	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1215	 ext		v7.16b,v23.16b,v16.16b,#8
1216	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1217	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1218	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1219	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1220	add		v24.2d,v24.2d,v20.2d
1221	ld1		{v25.2d},[x3],#16
1222	ext		v24.16b,v24.16b,v24.16b,#8
1223	ext		v5.16b,v1.16b,v4.16b,#8
1224	ext		v6.16b,v3.16b,v1.16b,#8
1225	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1226	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1227	 ext		v7.16b,v16.16b,v17.16b,#8
1228	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1229	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1230	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1231	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1232	add		v25.2d,v25.2d,v21.2d
1233	ld1		{v24.2d},[x3],#16
1234	ext		v25.16b,v25.16b,v25.16b,#8
1235	ext		v5.16b,v0.16b,v1.16b,#8
1236	ext		v6.16b,v2.16b,v0.16b,#8
1237	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1238	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1239	 ext		v7.16b,v17.16b,v18.16b,#8
1240	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1241	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1242	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1243	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1244	add		v24.2d,v24.2d,v22.2d
1245	ld1		{v25.2d},[x3],#16
1246	ext		v24.16b,v24.16b,v24.16b,#8
1247	ext		v5.16b,v3.16b,v0.16b,#8
1248	ext		v6.16b,v4.16b,v3.16b,#8
1249	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1250	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1251	 ext		v7.16b,v18.16b,v19.16b,#8
1252	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1253	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1254	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1255	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1256	add		v25.2d,v25.2d,v23.2d
1257	ld1		{v24.2d},[x3],#16
1258	ext		v25.16b,v25.16b,v25.16b,#8
1259	ext		v5.16b,v2.16b,v3.16b,#8
1260	ext		v6.16b,v1.16b,v2.16b,#8
1261	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1262	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1263	 ext		v7.16b,v19.16b,v20.16b,#8
1264	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1265	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1266	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1267	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// Rounds 32-47: third pass over v16-v23, same group pattern.
1268	add		v24.2d,v24.2d,v16.2d
1269	ld1		{v25.2d},[x3],#16
1270	ext		v24.16b,v24.16b,v24.16b,#8
1271	ext		v5.16b,v4.16b,v2.16b,#8
1272	ext		v6.16b,v0.16b,v4.16b,#8
1273	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1274	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1275	 ext		v7.16b,v20.16b,v21.16b,#8
1276	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1277	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1278	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1279	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1280	add		v25.2d,v25.2d,v17.2d
1281	ld1		{v24.2d},[x3],#16
1282	ext		v25.16b,v25.16b,v25.16b,#8
1283	ext		v5.16b,v1.16b,v4.16b,#8
1284	ext		v6.16b,v3.16b,v1.16b,#8
1285	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1286	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1287	 ext		v7.16b,v21.16b,v22.16b,#8
1288	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1289	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1290	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1291	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1292	add		v24.2d,v24.2d,v18.2d
1293	ld1		{v25.2d},[x3],#16
1294	ext		v24.16b,v24.16b,v24.16b,#8
1295	ext		v5.16b,v0.16b,v1.16b,#8
1296	ext		v6.16b,v2.16b,v0.16b,#8
1297	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1298	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1299	 ext		v7.16b,v22.16b,v23.16b,#8
1300	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1301	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1302	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1303	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1304	add		v25.2d,v25.2d,v19.2d
1305	ld1		{v24.2d},[x3],#16
1306	ext		v25.16b,v25.16b,v25.16b,#8
1307	ext		v5.16b,v3.16b,v0.16b,#8
1308	ext		v6.16b,v4.16b,v3.16b,#8
1309	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1310	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1311	 ext		v7.16b,v23.16b,v16.16b,#8
1312	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1313	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1314	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1315	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1316	add		v24.2d,v24.2d,v20.2d
1317	ld1		{v25.2d},[x3],#16
1318	ext		v24.16b,v24.16b,v24.16b,#8
1319	ext		v5.16b,v2.16b,v3.16b,#8
1320	ext		v6.16b,v1.16b,v2.16b,#8
1321	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1322	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1323	 ext		v7.16b,v16.16b,v17.16b,#8
1324	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1325	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1326	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1327	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1328	add		v25.2d,v25.2d,v21.2d
1329	ld1		{v24.2d},[x3],#16
1330	ext		v25.16b,v25.16b,v25.16b,#8
1331	ext		v5.16b,v4.16b,v2.16b,#8
1332	ext		v6.16b,v0.16b,v4.16b,#8
1333	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1334	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1335	 ext		v7.16b,v17.16b,v18.16b,#8
1336	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1337	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1338	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1339	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1340	add		v24.2d,v24.2d,v22.2d
1341	ld1		{v25.2d},[x3],#16
1342	ext		v24.16b,v24.16b,v24.16b,#8
1343	ext		v5.16b,v1.16b,v4.16b,#8
1344	ext		v6.16b,v3.16b,v1.16b,#8
1345	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1346	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1347	 ext		v7.16b,v18.16b,v19.16b,#8
1348	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1349	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1350	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1351	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1352	add		v25.2d,v25.2d,v23.2d
1353	ld1		{v24.2d},[x3],#16
1354	ext		v25.16b,v25.16b,v25.16b,#8
1355	ext		v5.16b,v0.16b,v1.16b,#8
1356	ext		v6.16b,v2.16b,v0.16b,#8
1357	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1358	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1359	 ext		v7.16b,v19.16b,v20.16b,#8
1360	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1361	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1362	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1363	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// Rounds 48-63: fourth pass over v16-v23; the schedule updates in this pass
// produce the values consumed by rounds 64-79.
1364	add		v24.2d,v24.2d,v16.2d
1365	ld1		{v25.2d},[x3],#16
1366	ext		v24.16b,v24.16b,v24.16b,#8
1367	ext		v5.16b,v3.16b,v0.16b,#8
1368	ext		v6.16b,v4.16b,v3.16b,#8
1369	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1370	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1371	 ext		v7.16b,v20.16b,v21.16b,#8
1372	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1373	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1374	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1375	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1376	add		v25.2d,v25.2d,v17.2d
1377	ld1		{v24.2d},[x3],#16
1378	ext		v25.16b,v25.16b,v25.16b,#8
1379	ext		v5.16b,v2.16b,v3.16b,#8
1380	ext		v6.16b,v1.16b,v2.16b,#8
1381	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1382	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1383	 ext		v7.16b,v21.16b,v22.16b,#8
1384	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1385	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1386	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1387	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1388	add		v24.2d,v24.2d,v18.2d
1389	ld1		{v25.2d},[x3],#16
1390	ext		v24.16b,v24.16b,v24.16b,#8
1391	ext		v5.16b,v4.16b,v2.16b,#8
1392	ext		v6.16b,v0.16b,v4.16b,#8
1393	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1394	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1395	 ext		v7.16b,v22.16b,v23.16b,#8
1396	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1397	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1398	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1399	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1400	add		v25.2d,v25.2d,v19.2d
1401	ld1		{v24.2d},[x3],#16
1402	ext		v25.16b,v25.16b,v25.16b,#8
1403	ext		v5.16b,v1.16b,v4.16b,#8
1404	ext		v6.16b,v3.16b,v1.16b,#8
1405	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1406	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1407	 ext		v7.16b,v23.16b,v16.16b,#8
1408	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1409	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1410	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1411	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1412	add		v24.2d,v24.2d,v20.2d
1413	ld1		{v25.2d},[x3],#16
1414	ext		v24.16b,v24.16b,v24.16b,#8
1415	ext		v5.16b,v0.16b,v1.16b,#8
1416	ext		v6.16b,v2.16b,v0.16b,#8
1417	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1418	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1419	 ext		v7.16b,v16.16b,v17.16b,#8
1420	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1421	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1422	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1423	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1424	add		v25.2d,v25.2d,v21.2d
1425	ld1		{v24.2d},[x3],#16
1426	ext		v25.16b,v25.16b,v25.16b,#8
1427	ext		v5.16b,v3.16b,v0.16b,#8
1428	ext		v6.16b,v4.16b,v3.16b,#8
1429	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1430	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1431	 ext		v7.16b,v17.16b,v18.16b,#8
1432	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1433	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1434	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1435	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1436	add		v24.2d,v24.2d,v22.2d
1437	ld1		{v25.2d},[x3],#16
1438	ext		v24.16b,v24.16b,v24.16b,#8
1439	ext		v5.16b,v2.16b,v3.16b,#8
1440	ext		v6.16b,v1.16b,v2.16b,#8
1441	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1442	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1443	 ext		v7.16b,v18.16b,v19.16b,#8
1444	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1445	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1446	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1447	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1448	add		v25.2d,v25.2d,v23.2d
1449	ld1		{v24.2d},[x3],#16
1450	ext		v25.16b,v25.16b,v25.16b,#8
1451	ext		v5.16b,v4.16b,v2.16b,#8
1452	ext		v6.16b,v0.16b,v4.16b,#8
1453	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1454	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1455	 ext		v7.16b,v19.16b,v20.16b,#8
1456	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1457	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1458	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1459	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Rounds 64-79: no further schedule updates. Each group instead reloads the
// schedule register it has just consumed with 16 bytes from x1 and
// byte-swaps it. This is the next block, or the current block again if the
// csel at the top of the loop rewound x1.
1460	ld1		{v25.2d},[x3],#16
1461	add		v24.2d,v24.2d,v16.2d
1462	 ld1		{v16.16b},[x1],#16		// load next input
1463	ext		v24.16b,v24.16b,v24.16b,#8
1464	ext		v5.16b,v1.16b,v4.16b,#8
1465	ext		v6.16b,v3.16b,v1.16b,#8
1466	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1467	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1468	 rev64		v16.16b,v16.16b
1469	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1470	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1471	ld1		{v24.2d},[x3],#16
1472	add		v25.2d,v25.2d,v17.2d
1473	 ld1		{v17.16b},[x1],#16		// load next input
1474	ext		v25.16b,v25.16b,v25.16b,#8
1475	ext		v5.16b,v0.16b,v1.16b,#8
1476	ext		v6.16b,v2.16b,v0.16b,#8
1477	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1478	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1479	 rev64		v17.16b,v17.16b
1480	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1481	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1482	ld1		{v25.2d},[x3],#16
1483	add		v24.2d,v24.2d,v18.2d
1484	 ld1		{v18.16b},[x1],#16		// load next input
1485	ext		v24.16b,v24.16b,v24.16b,#8
1486	ext		v5.16b,v3.16b,v0.16b,#8
1487	ext		v6.16b,v4.16b,v3.16b,#8
1488	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1489	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1490	 rev64		v18.16b,v18.16b
1491	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1492	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1493	ld1		{v24.2d},[x3],#16
1494	add		v25.2d,v25.2d,v19.2d
1495	 ld1		{v19.16b},[x1],#16		// load next input
1496	ext		v25.16b,v25.16b,v25.16b,#8
1497	ext		v5.16b,v2.16b,v3.16b,#8
1498	ext		v6.16b,v1.16b,v2.16b,#8
1499	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1500	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1501	 rev64		v19.16b,v19.16b
1502	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1503	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1504	ld1		{v25.2d},[x3],#16
1505	add		v24.2d,v24.2d,v20.2d
1506	 ld1		{v20.16b},[x1],#16		// load next input
1507	ext		v24.16b,v24.16b,v24.16b,#8
1508	ext		v5.16b,v4.16b,v2.16b,#8
1509	ext		v6.16b,v0.16b,v4.16b,#8
1510	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1511	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1512	 rev64		v20.16b,v20.16b
1513	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1514	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1515	ld1		{v24.2d},[x3],#16
1516	add		v25.2d,v25.2d,v21.2d
1517	 ld1		{v21.16b},[x1],#16		// load next input
1518	ext		v25.16b,v25.16b,v25.16b,#8
1519	ext		v5.16b,v1.16b,v4.16b,#8
1520	ext		v6.16b,v3.16b,v1.16b,#8
1521	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1522	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1523	 rev64		v21.16b,v21.16b
1524	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1525	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1526	ld1		{v25.2d},[x3],#16
1527	add		v24.2d,v24.2d,v22.2d
1528	 ld1		{v22.16b},[x1],#16		// load next input
1529	ext		v24.16b,v24.16b,v24.16b,#8
1530	ext		v5.16b,v0.16b,v1.16b,#8
1531	ext		v6.16b,v2.16b,v0.16b,#8
1532	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1533	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1534	 rev64		v22.16b,v22.16b
1535	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1536	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// All 40 K512 pairs (80*8 bytes) have been fetched by now; point x3 back at
// the start of .LK512 for the next block.
1537	sub		x3,x3,#80*8	// rewind
1538	add		v25.2d,v25.2d,v23.2d
1539	 ld1		{v23.16b},[x1],#16		// load next input
1540	ext		v25.16b,v25.16b,v25.16b,#8
1541	ext		v5.16b,v3.16b,v0.16b,#8
1542	ext		v6.16b,v4.16b,v3.16b,#8
1543	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1544	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1545	 rev64		v23.16b,v23.16b
1546	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1547	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add back the state saved in v26-v29 at the top of this block.
1548	add		v0.2d,v0.2d,v26.2d			// accumulate
1549	add		v1.2d,v1.2d,v27.2d
1550	add		v2.2d,v2.2d,v28.2d
1551	add		v3.2d,v3.2d,v29.2d
1552
// x2 was decremented at the top of the loop; continue while blocks remain.
1553	cbnz		x2,.Loop_hw
1554
1555	st1		{v0.2d-v3.2d},[x0]		// store context
1556
// Pop the 16-byte frame. Only x29 is reloaded; x30 still holds the return
// address because it was never overwritten.
1557	ldr		x29,[sp],#16
1558	ret
1559.size	zfs_sha512_block_armv8,.-zfs_sha512_block_armv8
1560#endif
1561