xref: /freebsd/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha512-armv8.S (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1/*
2 * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     https://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
19 * - modified assembly to fit into OpenZFS
20 */
21
22#if defined(__aarch64__)
23
// ELF note in .note.gnu.property. The layout below is namesz, descsz, type,
// name, then a 16-byte descriptor. The field meanings given here follow the
// GNU property note format of the AArch64 ELF ABI -- NOTE(review): confirm
// against the ABI document before relying on them.
24	.section	.note.gnu.property,"a",@note
25	.p2align	3
26	.word	4			// namesz: "GNU" plus its NUL terminator
27	.word	16			// descsz: the four words after the name
28	.word	5			// note type (presumably NT_GNU_PROPERTY_TYPE_0)
29	.asciz	"GNU"
30	.word	3221225472	// 0xc0000000 (presumably GNU_PROPERTY_AARCH64_FEATURE_1_AND)
31	.word	4			// size of the property value that follows
32	.word	3			// property value (presumably the BTI and PAC feature bits)
33	.word	0			// pads the descriptor to 16 bytes
34.text
35
// .LK512: table of 80 64-bit round constants K[0..79], two per .quad line,
// followed by a single zero quadword. The scalar routine walks the table
// with post-incrementing loads ("*K++") and leaves its inner loop when the
// value it has just loaded is this zero terminator (cbnz on the loaded K).
// The table is 64-byte aligned and lives in .text so that "adr" can reach it.
36.align	6
37.type	.LK512,%object
38.LK512:
39	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
40	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
41	.quad	0x3956c25bf348b538,0x59f111f1b605d019
42	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
43	.quad	0xd807aa98a3030242,0x12835b0145706fbe
44	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
45	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
46	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
47	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
48	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
49	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
50	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
51	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
52	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
53	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
54	.quad	0x06ca6351e003826f,0x142929670a0e6e70
55	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
56	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
57	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
58	.quad	0x81c2c92e47edaee6,0x92722c851482353b
59	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
60	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
61	.quad	0xd192e819d6ef5218,0xd69906245565a910
62	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
63	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
64	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
65	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
66	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
67	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
68	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
69	.quad	0x90befffa23631e28,0xa4506cebde82bde9
70	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
71	.quad	0xca273eceea26619c,0xd186b8c721c0c207
72	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
73	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
74	.quad	0x113f9804bef90dae,0x1b710b35131c471b
75	.quad	0x28db77f523047d84,0x32caab7b40c72493
76	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
77	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
78	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
79	.quad	0	// terminator
80.size	.LK512,.-.LK512
81
/*
 * zfs_sha512_block_armv7 -- SHA-512 compression using only general-purpose
 * AArch64 registers (no vector or crypto-extension instructions).
 *
 * Register inputs, as used by the code below:
 *   x0 = pointer to the eight 64-bit state words (read into x20..x27,
 *        updated in place after every block)
 *   x1 = pointer to the input data
 *   x2 = block count; the end pointer is computed as x1 + (x2 << 7),
 *        i.e. 128 bytes per block
 * NOTE(review): the C prototype is not visible in this file -- confirm the
 * argument types against the caller. The loop is do/while shaped, so a
 * block count of 0 is presumably never passed -- verify.
 *
 * Stack frame (x29 = frame base, 128 bytes):
 *   [x29,#0]        saved x29, x30
 *   [x29,#16..#95]  saved x19..x28
 *   [x29,#96]       x0 (state pointer)
 *   [x29,#104]      end-of-input pointer
 *   [x29,#112]      input cursor (written inside .Loop)
 *   [sp,#0..#31]    four 8-byte spill slots for message-schedule words
 *
 * x30 is reused as the K-table pointer once it has been saved.
 */
82.globl	zfs_sha512_block_armv7
83.type	zfs_sha512_block_armv7,%function
84.align	6
85zfs_sha512_block_armv7:
86	hint	#34					// bti c
87	stp	x29,x30,[sp,#-128]!
88	add	x29,sp,#0
89
90	stp	x19,x20,[sp,#16]
91	stp	x21,x22,[sp,#32]
92	stp	x23,x24,[sp,#48]
93	stp	x25,x26,[sp,#64]
94	stp	x27,x28,[sp,#80]
95	sub	sp,sp,#4*8			// four 8-byte spill slots below the frame
96
97	ldp	x20,x21,[x0]				// load context
98	ldp	x22,x23,[x0,#2*8]
99	ldp	x24,x25,[x0,#4*8]
100	add	x2,x1,x2,lsl#7	// end of input
101	ldp	x26,x27,[x0,#6*8]
102	adr	x30,.LK512			// x30 = K pointer (return address is saved in the frame)
103	stp	x0,x2,[x29,#96]			// keep state pointer and end pointer for the block tail
104
// .Loop: start of one 128-byte block; this part runs rounds 0..15.
//
// Conventions used by every round in this section:
//  - The working variables a..h live in x20..x27. Their roles rotate by one
//    register per round instead of the values being moved; each round
//    marker below names the registers currently acting as h, e and a.
//  - X[i] is loaded from the input two words at a time (ldp) and
//    byte-swapped with "rev" unless __AARCH64EB__ is defined.
//  - x19 and x28 alternate between "K[i] for this round" and the running
//    (a^b)/(b^c) value used to form Maj(a,b,c).
//  - The final "h+=Sigma0(a)" of a round is deferred: it is the first add
//    executed in the next round (the original position is kept as a
//    commented-out instruction).
//  - x16, x17 and one more register per round are scratch. From round 10 on
//    the scratch register is one that otherwise holds state or input
//    (x0, then x6..x10), so x6..x10 are spilled to the [sp] slots, and
//    x6/x7 reloaded, as needed.
// Round 15 additionally starts the message schedule by building the word
// for round 16 in x3.
105.Loop:
// -- round 0: X[0] in x3; h=x27 e=x24 a=x20 --
106	ldp	x3,x4,[x1],#2*8
107	ldr	x19,[x30],#8			// *K++
108	eor	x28,x21,x22				// magic seed
109	str	x1,[x29,#112]
110#ifndef	__AARCH64EB__
111	rev	x3,x3			// 0
112#endif
113	ror	x16,x24,#14
114	add	x27,x27,x19			// h+=K[i]
115	eor	x6,x24,x24,ror#23
116	and	x17,x25,x24
117	bic	x19,x26,x24
118	add	x27,x27,x3			// h+=X[i]
119	orr	x17,x17,x19			// Ch(e,f,g)
120	eor	x19,x20,x21			// a^b, b^c in next round
121	eor	x16,x16,x6,ror#18	// Sigma1(e)
122	ror	x6,x20,#28
123	add	x27,x27,x17			// h+=Ch(e,f,g)
124	eor	x17,x20,x20,ror#5
125	add	x27,x27,x16			// h+=Sigma1(e)
126	and	x28,x28,x19			// (b^c)&=(a^b)
127	add	x23,x23,x27			// d+=h
128	eor	x28,x28,x21			// Maj(a,b,c)
129	eor	x17,x6,x17,ror#34	// Sigma0(a)
130	add	x27,x27,x28			// h+=Maj(a,b,c)
131	ldr	x28,[x30],#8		// *K++, x19 in next round
132	//add	x27,x27,x17			// h+=Sigma0(a)
// -- round 1: X[1] in x4; h=x26 e=x23 a=x27 --
133#ifndef	__AARCH64EB__
134	rev	x4,x4			// 1
135#endif
136	ldp	x5,x6,[x1],#2*8
137	add	x27,x27,x17			// h+=Sigma0(a)
138	ror	x16,x23,#14
139	add	x26,x26,x28			// h+=K[i]
140	eor	x7,x23,x23,ror#23
141	and	x17,x24,x23
142	bic	x28,x25,x23
143	add	x26,x26,x4			// h+=X[i]
144	orr	x17,x17,x28			// Ch(e,f,g)
145	eor	x28,x27,x20			// a^b, b^c in next round
146	eor	x16,x16,x7,ror#18	// Sigma1(e)
147	ror	x7,x27,#28
148	add	x26,x26,x17			// h+=Ch(e,f,g)
149	eor	x17,x27,x27,ror#5
150	add	x26,x26,x16			// h+=Sigma1(e)
151	and	x19,x19,x28			// (b^c)&=(a^b)
152	add	x22,x22,x26			// d+=h
153	eor	x19,x19,x20			// Maj(a,b,c)
154	eor	x17,x7,x17,ror#34	// Sigma0(a)
155	add	x26,x26,x19			// h+=Maj(a,b,c)
156	ldr	x19,[x30],#8		// *K++, x28 in next round
157	//add	x26,x26,x17			// h+=Sigma0(a)
// -- round 2: X[2] in x5; h=x25 e=x22 a=x26 --
158#ifndef	__AARCH64EB__
159	rev	x5,x5			// 2
160#endif
161	add	x26,x26,x17			// h+=Sigma0(a)
162	ror	x16,x22,#14
163	add	x25,x25,x19			// h+=K[i]
164	eor	x8,x22,x22,ror#23
165	and	x17,x23,x22
166	bic	x19,x24,x22
167	add	x25,x25,x5			// h+=X[i]
168	orr	x17,x17,x19			// Ch(e,f,g)
169	eor	x19,x26,x27			// a^b, b^c in next round
170	eor	x16,x16,x8,ror#18	// Sigma1(e)
171	ror	x8,x26,#28
172	add	x25,x25,x17			// h+=Ch(e,f,g)
173	eor	x17,x26,x26,ror#5
174	add	x25,x25,x16			// h+=Sigma1(e)
175	and	x28,x28,x19			// (b^c)&=(a^b)
176	add	x21,x21,x25			// d+=h
177	eor	x28,x28,x27			// Maj(a,b,c)
178	eor	x17,x8,x17,ror#34	// Sigma0(a)
179	add	x25,x25,x28			// h+=Maj(a,b,c)
180	ldr	x28,[x30],#8		// *K++, x19 in next round
181	//add	x25,x25,x17			// h+=Sigma0(a)
// -- round 3: X[3] in x6; h=x24 e=x21 a=x25 --
182#ifndef	__AARCH64EB__
183	rev	x6,x6			// 3
184#endif
185	ldp	x7,x8,[x1],#2*8
186	add	x25,x25,x17			// h+=Sigma0(a)
187	ror	x16,x21,#14
188	add	x24,x24,x28			// h+=K[i]
189	eor	x9,x21,x21,ror#23
190	and	x17,x22,x21
191	bic	x28,x23,x21
192	add	x24,x24,x6			// h+=X[i]
193	orr	x17,x17,x28			// Ch(e,f,g)
194	eor	x28,x25,x26			// a^b, b^c in next round
195	eor	x16,x16,x9,ror#18	// Sigma1(e)
196	ror	x9,x25,#28
197	add	x24,x24,x17			// h+=Ch(e,f,g)
198	eor	x17,x25,x25,ror#5
199	add	x24,x24,x16			// h+=Sigma1(e)
200	and	x19,x19,x28			// (b^c)&=(a^b)
201	add	x20,x20,x24			// d+=h
202	eor	x19,x19,x26			// Maj(a,b,c)
203	eor	x17,x9,x17,ror#34	// Sigma0(a)
204	add	x24,x24,x19			// h+=Maj(a,b,c)
205	ldr	x19,[x30],#8		// *K++, x28 in next round
206	//add	x24,x24,x17			// h+=Sigma0(a)
// -- round 4: X[4] in x7; h=x23 e=x20 a=x24 --
207#ifndef	__AARCH64EB__
208	rev	x7,x7			// 4
209#endif
210	add	x24,x24,x17			// h+=Sigma0(a)
211	ror	x16,x20,#14
212	add	x23,x23,x19			// h+=K[i]
213	eor	x10,x20,x20,ror#23
214	and	x17,x21,x20
215	bic	x19,x22,x20
216	add	x23,x23,x7			// h+=X[i]
217	orr	x17,x17,x19			// Ch(e,f,g)
218	eor	x19,x24,x25			// a^b, b^c in next round
219	eor	x16,x16,x10,ror#18	// Sigma1(e)
220	ror	x10,x24,#28
221	add	x23,x23,x17			// h+=Ch(e,f,g)
222	eor	x17,x24,x24,ror#5
223	add	x23,x23,x16			// h+=Sigma1(e)
224	and	x28,x28,x19			// (b^c)&=(a^b)
225	add	x27,x27,x23			// d+=h
226	eor	x28,x28,x25			// Maj(a,b,c)
227	eor	x17,x10,x17,ror#34	// Sigma0(a)
228	add	x23,x23,x28			// h+=Maj(a,b,c)
229	ldr	x28,[x30],#8		// *K++, x19 in next round
230	//add	x23,x23,x17			// h+=Sigma0(a)
// -- round 5: X[5] in x8; h=x22 e=x27 a=x23 --
231#ifndef	__AARCH64EB__
232	rev	x8,x8			// 5
233#endif
234	ldp	x9,x10,[x1],#2*8
235	add	x23,x23,x17			// h+=Sigma0(a)
236	ror	x16,x27,#14
237	add	x22,x22,x28			// h+=K[i]
238	eor	x11,x27,x27,ror#23
239	and	x17,x20,x27
240	bic	x28,x21,x27
241	add	x22,x22,x8			// h+=X[i]
242	orr	x17,x17,x28			// Ch(e,f,g)
243	eor	x28,x23,x24			// a^b, b^c in next round
244	eor	x16,x16,x11,ror#18	// Sigma1(e)
245	ror	x11,x23,#28
246	add	x22,x22,x17			// h+=Ch(e,f,g)
247	eor	x17,x23,x23,ror#5
248	add	x22,x22,x16			// h+=Sigma1(e)
249	and	x19,x19,x28			// (b^c)&=(a^b)
250	add	x26,x26,x22			// d+=h
251	eor	x19,x19,x24			// Maj(a,b,c)
252	eor	x17,x11,x17,ror#34	// Sigma0(a)
253	add	x22,x22,x19			// h+=Maj(a,b,c)
254	ldr	x19,[x30],#8		// *K++, x28 in next round
255	//add	x22,x22,x17			// h+=Sigma0(a)
// -- round 6: X[6] in x9; h=x21 e=x26 a=x22 --
256#ifndef	__AARCH64EB__
257	rev	x9,x9			// 6
258#endif
259	add	x22,x22,x17			// h+=Sigma0(a)
260	ror	x16,x26,#14
261	add	x21,x21,x19			// h+=K[i]
262	eor	x12,x26,x26,ror#23
263	and	x17,x27,x26
264	bic	x19,x20,x26
265	add	x21,x21,x9			// h+=X[i]
266	orr	x17,x17,x19			// Ch(e,f,g)
267	eor	x19,x22,x23			// a^b, b^c in next round
268	eor	x16,x16,x12,ror#18	// Sigma1(e)
269	ror	x12,x22,#28
270	add	x21,x21,x17			// h+=Ch(e,f,g)
271	eor	x17,x22,x22,ror#5
272	add	x21,x21,x16			// h+=Sigma1(e)
273	and	x28,x28,x19			// (b^c)&=(a^b)
274	add	x25,x25,x21			// d+=h
275	eor	x28,x28,x23			// Maj(a,b,c)
276	eor	x17,x12,x17,ror#34	// Sigma0(a)
277	add	x21,x21,x28			// h+=Maj(a,b,c)
278	ldr	x28,[x30],#8		// *K++, x19 in next round
279	//add	x21,x21,x17			// h+=Sigma0(a)
// -- round 7: X[7] in x10; h=x20 e=x25 a=x21 --
280#ifndef	__AARCH64EB__
281	rev	x10,x10			// 7
282#endif
283	ldp	x11,x12,[x1],#2*8
284	add	x21,x21,x17			// h+=Sigma0(a)
285	ror	x16,x25,#14
286	add	x20,x20,x28			// h+=K[i]
287	eor	x13,x25,x25,ror#23
288	and	x17,x26,x25
289	bic	x28,x27,x25
290	add	x20,x20,x10			// h+=X[i]
291	orr	x17,x17,x28			// Ch(e,f,g)
292	eor	x28,x21,x22			// a^b, b^c in next round
293	eor	x16,x16,x13,ror#18	// Sigma1(e)
294	ror	x13,x21,#28
295	add	x20,x20,x17			// h+=Ch(e,f,g)
296	eor	x17,x21,x21,ror#5
297	add	x20,x20,x16			// h+=Sigma1(e)
298	and	x19,x19,x28			// (b^c)&=(a^b)
299	add	x24,x24,x20			// d+=h
300	eor	x19,x19,x22			// Maj(a,b,c)
301	eor	x17,x13,x17,ror#34	// Sigma0(a)
302	add	x20,x20,x19			// h+=Maj(a,b,c)
303	ldr	x19,[x30],#8		// *K++, x28 in next round
304	//add	x20,x20,x17			// h+=Sigma0(a)
// -- round 8: X[8] in x11; h=x27 e=x24 a=x20 --
305#ifndef	__AARCH64EB__
306	rev	x11,x11			// 8
307#endif
308	add	x20,x20,x17			// h+=Sigma0(a)
309	ror	x16,x24,#14
310	add	x27,x27,x19			// h+=K[i]
311	eor	x14,x24,x24,ror#23
312	and	x17,x25,x24
313	bic	x19,x26,x24
314	add	x27,x27,x11			// h+=X[i]
315	orr	x17,x17,x19			// Ch(e,f,g)
316	eor	x19,x20,x21			// a^b, b^c in next round
317	eor	x16,x16,x14,ror#18	// Sigma1(e)
318	ror	x14,x20,#28
319	add	x27,x27,x17			// h+=Ch(e,f,g)
320	eor	x17,x20,x20,ror#5
321	add	x27,x27,x16			// h+=Sigma1(e)
322	and	x28,x28,x19			// (b^c)&=(a^b)
323	add	x23,x23,x27			// d+=h
324	eor	x28,x28,x21			// Maj(a,b,c)
325	eor	x17,x14,x17,ror#34	// Sigma0(a)
326	add	x27,x27,x28			// h+=Maj(a,b,c)
327	ldr	x28,[x30],#8		// *K++, x19 in next round
328	//add	x27,x27,x17			// h+=Sigma0(a)
// -- round 9: X[9] in x12; h=x26 e=x23 a=x27 --
329#ifndef	__AARCH64EB__
330	rev	x12,x12			// 9
331#endif
332	ldp	x13,x14,[x1],#2*8
333	add	x27,x27,x17			// h+=Sigma0(a)
334	ror	x16,x23,#14
335	add	x26,x26,x28			// h+=K[i]
336	eor	x15,x23,x23,ror#23
337	and	x17,x24,x23
338	bic	x28,x25,x23
339	add	x26,x26,x12			// h+=X[i]
340	orr	x17,x17,x28			// Ch(e,f,g)
341	eor	x28,x27,x20			// a^b, b^c in next round
342	eor	x16,x16,x15,ror#18	// Sigma1(e)
343	ror	x15,x27,#28
344	add	x26,x26,x17			// h+=Ch(e,f,g)
345	eor	x17,x27,x27,ror#5
346	add	x26,x26,x16			// h+=Sigma1(e)
347	and	x19,x19,x28			// (b^c)&=(a^b)
348	add	x22,x22,x26			// d+=h
349	eor	x19,x19,x20			// Maj(a,b,c)
350	eor	x17,x15,x17,ror#34	// Sigma0(a)
351	add	x26,x26,x19			// h+=Maj(a,b,c)
352	ldr	x19,[x30],#8		// *K++, x28 in next round
353	//add	x26,x26,x17			// h+=Sigma0(a)
// -- round 10: X[10] in x13; h=x25 e=x22 a=x26 (x0 used as scratch; the state pointer is kept at [x29,#96]) --
354#ifndef	__AARCH64EB__
355	rev	x13,x13			// 10
356#endif
357	add	x26,x26,x17			// h+=Sigma0(a)
358	ror	x16,x22,#14
359	add	x25,x25,x19			// h+=K[i]
360	eor	x0,x22,x22,ror#23
361	and	x17,x23,x22
362	bic	x19,x24,x22
363	add	x25,x25,x13			// h+=X[i]
364	orr	x17,x17,x19			// Ch(e,f,g)
365	eor	x19,x26,x27			// a^b, b^c in next round
366	eor	x16,x16,x0,ror#18	// Sigma1(e)
367	ror	x0,x26,#28
368	add	x25,x25,x17			// h+=Ch(e,f,g)
369	eor	x17,x26,x26,ror#5
370	add	x25,x25,x16			// h+=Sigma1(e)
371	and	x28,x28,x19			// (b^c)&=(a^b)
372	add	x21,x21,x25			// d+=h
373	eor	x28,x28,x27			// Maj(a,b,c)
374	eor	x17,x0,x17,ror#34	// Sigma0(a)
375	add	x25,x25,x28			// h+=Maj(a,b,c)
376	ldr	x28,[x30],#8		// *K++, x19 in next round
377	//add	x25,x25,x17			// h+=Sigma0(a)
// -- round 11: X[11] in x14; h=x24 e=x21 a=x25 (x6 spilled, then used as scratch) --
378#ifndef	__AARCH64EB__
379	rev	x14,x14			// 11
380#endif
381	ldp	x15,x0,[x1],#2*8
382	add	x25,x25,x17			// h+=Sigma0(a)
383	str	x6,[sp,#24]
384	ror	x16,x21,#14
385	add	x24,x24,x28			// h+=K[i]
386	eor	x6,x21,x21,ror#23
387	and	x17,x22,x21
388	bic	x28,x23,x21
389	add	x24,x24,x14			// h+=X[i]
390	orr	x17,x17,x28			// Ch(e,f,g)
391	eor	x28,x25,x26			// a^b, b^c in next round
392	eor	x16,x16,x6,ror#18	// Sigma1(e)
393	ror	x6,x25,#28
394	add	x24,x24,x17			// h+=Ch(e,f,g)
395	eor	x17,x25,x25,ror#5
396	add	x24,x24,x16			// h+=Sigma1(e)
397	and	x19,x19,x28			// (b^c)&=(a^b)
398	add	x20,x20,x24			// d+=h
399	eor	x19,x19,x26			// Maj(a,b,c)
400	eor	x17,x6,x17,ror#34	// Sigma0(a)
401	add	x24,x24,x19			// h+=Maj(a,b,c)
402	ldr	x19,[x30],#8		// *K++, x28 in next round
403	//add	x24,x24,x17			// h+=Sigma0(a)
// -- round 12: X[12] in x15; h=x23 e=x20 a=x24 (x7 spilled, then used as scratch) --
404#ifndef	__AARCH64EB__
405	rev	x15,x15			// 12
406#endif
407	add	x24,x24,x17			// h+=Sigma0(a)
408	str	x7,[sp,#0]
409	ror	x16,x20,#14
410	add	x23,x23,x19			// h+=K[i]
411	eor	x7,x20,x20,ror#23
412	and	x17,x21,x20
413	bic	x19,x22,x20
414	add	x23,x23,x15			// h+=X[i]
415	orr	x17,x17,x19			// Ch(e,f,g)
416	eor	x19,x24,x25			// a^b, b^c in next round
417	eor	x16,x16,x7,ror#18	// Sigma1(e)
418	ror	x7,x24,#28
419	add	x23,x23,x17			// h+=Ch(e,f,g)
420	eor	x17,x24,x24,ror#5
421	add	x23,x23,x16			// h+=Sigma1(e)
422	and	x28,x28,x19			// (b^c)&=(a^b)
423	add	x27,x27,x23			// d+=h
424	eor	x28,x28,x25			// Maj(a,b,c)
425	eor	x17,x7,x17,ror#34	// Sigma0(a)
426	add	x23,x23,x28			// h+=Maj(a,b,c)
427	ldr	x28,[x30],#8		// *K++, x19 in next round
428	//add	x23,x23,x17			// h+=Sigma0(a)
// -- round 13: X[13] in x0; h=x22 e=x27 a=x23 (last two input words replace x1/x2; x8 spilled) --
429#ifndef	__AARCH64EB__
430	rev	x0,x0			// 13
431#endif
432	ldp	x1,x2,[x1]
433	add	x23,x23,x17			// h+=Sigma0(a)
434	str	x8,[sp,#8]
435	ror	x16,x27,#14
436	add	x22,x22,x28			// h+=K[i]
437	eor	x8,x27,x27,ror#23
438	and	x17,x20,x27
439	bic	x28,x21,x27
440	add	x22,x22,x0			// h+=X[i]
441	orr	x17,x17,x28			// Ch(e,f,g)
442	eor	x28,x23,x24			// a^b, b^c in next round
443	eor	x16,x16,x8,ror#18	// Sigma1(e)
444	ror	x8,x23,#28
445	add	x22,x22,x17			// h+=Ch(e,f,g)
446	eor	x17,x23,x23,ror#5
447	add	x22,x22,x16			// h+=Sigma1(e)
448	and	x19,x19,x28			// (b^c)&=(a^b)
449	add	x26,x26,x22			// d+=h
450	eor	x19,x19,x24			// Maj(a,b,c)
451	eor	x17,x8,x17,ror#34	// Sigma0(a)
452	add	x22,x22,x19			// h+=Maj(a,b,c)
453	ldr	x19,[x30],#8		// *K++, x28 in next round
454	//add	x22,x22,x17			// h+=Sigma0(a)
// -- round 14: X[14] in x1; h=x21 e=x26 a=x22 (x6 reloaded, x9 spilled) --
455#ifndef	__AARCH64EB__
456	rev	x1,x1			// 14
457#endif
458	ldr	x6,[sp,#24]
459	add	x22,x22,x17			// h+=Sigma0(a)
460	str	x9,[sp,#16]
461	ror	x16,x26,#14
462	add	x21,x21,x19			// h+=K[i]
463	eor	x9,x26,x26,ror#23
464	and	x17,x27,x26
465	bic	x19,x20,x26
466	add	x21,x21,x1			// h+=X[i]
467	orr	x17,x17,x19			// Ch(e,f,g)
468	eor	x19,x22,x23			// a^b, b^c in next round
469	eor	x16,x16,x9,ror#18	// Sigma1(e)
470	ror	x9,x22,#28
471	add	x21,x21,x17			// h+=Ch(e,f,g)
472	eor	x17,x22,x22,ror#5
473	add	x21,x21,x16			// h+=Sigma1(e)
474	and	x28,x28,x19			// (b^c)&=(a^b)
475	add	x25,x25,x21			// d+=h
476	eor	x28,x28,x23			// Maj(a,b,c)
477	eor	x17,x9,x17,ror#34	// Sigma0(a)
478	add	x21,x21,x28			// h+=Maj(a,b,c)
479	ldr	x28,[x30],#8		// *K++, x19 in next round
480	//add	x21,x21,x17			// h+=Sigma0(a)
// -- round 15: X[15] in x2; h=x20 e=x25 a=x21. Also builds the word for
// -- round 16 in x3: x3 += x12 + sigma0(x4) + sigma1(x1). From here on the
// -- Sigma0 add is no longer deferred to the following round.
481#ifndef	__AARCH64EB__
482	rev	x2,x2			// 15
483#endif
484	ldr	x7,[sp,#0]
485	add	x21,x21,x17			// h+=Sigma0(a)
486	str	x10,[sp,#24]
487	ror	x16,x25,#14
488	add	x20,x20,x28			// h+=K[i]
489	ror	x9,x4,#1
490	and	x17,x26,x25
491	ror	x8,x1,#19
492	bic	x28,x27,x25
493	ror	x10,x21,#28
494	add	x20,x20,x2			// h+=X[i]
495	eor	x16,x16,x25,ror#18
496	eor	x9,x9,x4,ror#8
497	orr	x17,x17,x28			// Ch(e,f,g)
498	eor	x28,x21,x22			// a^b, b^c in next round
499	eor	x16,x16,x25,ror#41	// Sigma1(e)
500	eor	x10,x10,x21,ror#34
501	add	x20,x20,x17			// h+=Ch(e,f,g)
502	and	x19,x19,x28			// (b^c)&=(a^b)
503	eor	x8,x8,x1,ror#61
504	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
505	add	x20,x20,x16			// h+=Sigma1(e)
506	eor	x19,x19,x22			// Maj(a,b,c)
507	eor	x17,x10,x21,ror#39	// Sigma0(a)
508	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
509	add	x3,x3,x12
510	add	x24,x24,x20			// d+=h
511	add	x20,x20,x19			// h+=Maj(a,b,c)
512	ldr	x19,[x30],#8		// *K++, x28 in next round
513	add	x3,x3,x9
514	add	x20,x20,x17			// h+=Sigma0(a)
515	add	x3,x3,x8
// .Loop_16_xx: rounds 16..79, sixteen rounds per iteration.
//
// The 16 live message-schedule words rotate through x3..x15, x0, x1, x2.
// Each step below
//   - adds the schedule word prepared by the previous step to h, and
//   - prepares the next word in place:
//       next += (word 9 ahead) + sigma0(word 1 ahead) + sigma1(word 14 ahead)
//     with sigma0 = ror1 ^ ror8 ^ lsr7 and sigma1 = ror19 ^ ror61 ^ lsr6.
// Three schedule registers per step are borrowed as scratch (for sigma0,
// sigma1 and Sigma0), so words are parked in the four 8-byte slots at
// [sp,#0..#24]: every step begins by reloading one word and spilling another.
// As in the first 16 rounds, a..h rotate through x20..x27 and x19/x28
// alternate between holding K[i] and the (a^b) value.
// The loop ends when the K value fetched for the next round is the zero
// terminator of the K table (cbnz x19 at the bottom).
516.Loop_16_xx:
// -- step 0 of 16: W in x3, next W built in x4; h=x27 e=x24 a=x20 --
517	ldr	x8,[sp,#8]
518	str	x11,[sp,#0]
519	ror	x16,x24,#14
520	add	x27,x27,x19			// h+=K[i]
521	ror	x10,x5,#1
522	and	x17,x25,x24
523	ror	x9,x2,#19
524	bic	x19,x26,x24
525	ror	x11,x20,#28
526	add	x27,x27,x3			// h+=X[i]
527	eor	x16,x16,x24,ror#18
528	eor	x10,x10,x5,ror#8
529	orr	x17,x17,x19			// Ch(e,f,g)
530	eor	x19,x20,x21			// a^b, b^c in next round
531	eor	x16,x16,x24,ror#41	// Sigma1(e)
532	eor	x11,x11,x20,ror#34
533	add	x27,x27,x17			// h+=Ch(e,f,g)
534	and	x28,x28,x19			// (b^c)&=(a^b)
535	eor	x9,x9,x2,ror#61
536	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
537	add	x27,x27,x16			// h+=Sigma1(e)
538	eor	x28,x28,x21			// Maj(a,b,c)
539	eor	x17,x11,x20,ror#39	// Sigma0(a)
540	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
541	add	x4,x4,x13
542	add	x23,x23,x27			// d+=h
543	add	x27,x27,x28			// h+=Maj(a,b,c)
544	ldr	x28,[x30],#8		// *K++, x19 in next round
545	add	x4,x4,x10
546	add	x27,x27,x17			// h+=Sigma0(a)
547	add	x4,x4,x9
// -- step 1 of 16: W in x4, next W built in x5; h=x26 e=x23 a=x27 --
548	ldr	x9,[sp,#16]
549	str	x12,[sp,#8]
550	ror	x16,x23,#14
551	add	x26,x26,x28			// h+=K[i]
552	ror	x11,x6,#1
553	and	x17,x24,x23
554	ror	x10,x3,#19
555	bic	x28,x25,x23
556	ror	x12,x27,#28
557	add	x26,x26,x4			// h+=X[i]
558	eor	x16,x16,x23,ror#18
559	eor	x11,x11,x6,ror#8
560	orr	x17,x17,x28			// Ch(e,f,g)
561	eor	x28,x27,x20			// a^b, b^c in next round
562	eor	x16,x16,x23,ror#41	// Sigma1(e)
563	eor	x12,x12,x27,ror#34
564	add	x26,x26,x17			// h+=Ch(e,f,g)
565	and	x19,x19,x28			// (b^c)&=(a^b)
566	eor	x10,x10,x3,ror#61
567	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
568	add	x26,x26,x16			// h+=Sigma1(e)
569	eor	x19,x19,x20			// Maj(a,b,c)
570	eor	x17,x12,x27,ror#39	// Sigma0(a)
571	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
572	add	x5,x5,x14
573	add	x22,x22,x26			// d+=h
574	add	x26,x26,x19			// h+=Maj(a,b,c)
575	ldr	x19,[x30],#8		// *K++, x28 in next round
576	add	x5,x5,x11
577	add	x26,x26,x17			// h+=Sigma0(a)
578	add	x5,x5,x10
// -- step 2 of 16: W in x5, next W built in x6; h=x25 e=x22 a=x26 --
579	ldr	x10,[sp,#24]
580	str	x13,[sp,#16]
581	ror	x16,x22,#14
582	add	x25,x25,x19			// h+=K[i]
583	ror	x12,x7,#1
584	and	x17,x23,x22
585	ror	x11,x4,#19
586	bic	x19,x24,x22
587	ror	x13,x26,#28
588	add	x25,x25,x5			// h+=X[i]
589	eor	x16,x16,x22,ror#18
590	eor	x12,x12,x7,ror#8
591	orr	x17,x17,x19			// Ch(e,f,g)
592	eor	x19,x26,x27			// a^b, b^c in next round
593	eor	x16,x16,x22,ror#41	// Sigma1(e)
594	eor	x13,x13,x26,ror#34
595	add	x25,x25,x17			// h+=Ch(e,f,g)
596	and	x28,x28,x19			// (b^c)&=(a^b)
597	eor	x11,x11,x4,ror#61
598	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
599	add	x25,x25,x16			// h+=Sigma1(e)
600	eor	x28,x28,x27			// Maj(a,b,c)
601	eor	x17,x13,x26,ror#39	// Sigma0(a)
602	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
603	add	x6,x6,x15
604	add	x21,x21,x25			// d+=h
605	add	x25,x25,x28			// h+=Maj(a,b,c)
606	ldr	x28,[x30],#8		// *K++, x19 in next round
607	add	x6,x6,x12
608	add	x25,x25,x17			// h+=Sigma0(a)
609	add	x6,x6,x11
// -- step 3 of 16: W in x6, next W built in x7; h=x24 e=x21 a=x25 --
610	ldr	x11,[sp,#0]
611	str	x14,[sp,#24]
612	ror	x16,x21,#14
613	add	x24,x24,x28			// h+=K[i]
614	ror	x13,x8,#1
615	and	x17,x22,x21
616	ror	x12,x5,#19
617	bic	x28,x23,x21
618	ror	x14,x25,#28
619	add	x24,x24,x6			// h+=X[i]
620	eor	x16,x16,x21,ror#18
621	eor	x13,x13,x8,ror#8
622	orr	x17,x17,x28			// Ch(e,f,g)
623	eor	x28,x25,x26			// a^b, b^c in next round
624	eor	x16,x16,x21,ror#41	// Sigma1(e)
625	eor	x14,x14,x25,ror#34
626	add	x24,x24,x17			// h+=Ch(e,f,g)
627	and	x19,x19,x28			// (b^c)&=(a^b)
628	eor	x12,x12,x5,ror#61
629	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
630	add	x24,x24,x16			// h+=Sigma1(e)
631	eor	x19,x19,x26			// Maj(a,b,c)
632	eor	x17,x14,x25,ror#39	// Sigma0(a)
633	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
634	add	x7,x7,x0
635	add	x20,x20,x24			// d+=h
636	add	x24,x24,x19			// h+=Maj(a,b,c)
637	ldr	x19,[x30],#8		// *K++, x28 in next round
638	add	x7,x7,x13
639	add	x24,x24,x17			// h+=Sigma0(a)
640	add	x7,x7,x12
// -- step 4 of 16: W in x7, next W built in x8; h=x23 e=x20 a=x24 --
641	ldr	x12,[sp,#8]
642	str	x15,[sp,#0]
643	ror	x16,x20,#14
644	add	x23,x23,x19			// h+=K[i]
645	ror	x14,x9,#1
646	and	x17,x21,x20
647	ror	x13,x6,#19
648	bic	x19,x22,x20
649	ror	x15,x24,#28
650	add	x23,x23,x7			// h+=X[i]
651	eor	x16,x16,x20,ror#18
652	eor	x14,x14,x9,ror#8
653	orr	x17,x17,x19			// Ch(e,f,g)
654	eor	x19,x24,x25			// a^b, b^c in next round
655	eor	x16,x16,x20,ror#41	// Sigma1(e)
656	eor	x15,x15,x24,ror#34
657	add	x23,x23,x17			// h+=Ch(e,f,g)
658	and	x28,x28,x19			// (b^c)&=(a^b)
659	eor	x13,x13,x6,ror#61
660	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
661	add	x23,x23,x16			// h+=Sigma1(e)
662	eor	x28,x28,x25			// Maj(a,b,c)
663	eor	x17,x15,x24,ror#39	// Sigma0(a)
664	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
665	add	x8,x8,x1
666	add	x27,x27,x23			// d+=h
667	add	x23,x23,x28			// h+=Maj(a,b,c)
668	ldr	x28,[x30],#8		// *K++, x19 in next round
669	add	x8,x8,x14
670	add	x23,x23,x17			// h+=Sigma0(a)
671	add	x8,x8,x13
// -- step 5 of 16: W in x8, next W built in x9; h=x22 e=x27 a=x23 --
672	ldr	x13,[sp,#16]
673	str	x0,[sp,#8]
674	ror	x16,x27,#14
675	add	x22,x22,x28			// h+=K[i]
676	ror	x15,x10,#1
677	and	x17,x20,x27
678	ror	x14,x7,#19
679	bic	x28,x21,x27
680	ror	x0,x23,#28
681	add	x22,x22,x8			// h+=X[i]
682	eor	x16,x16,x27,ror#18
683	eor	x15,x15,x10,ror#8
684	orr	x17,x17,x28			// Ch(e,f,g)
685	eor	x28,x23,x24			// a^b, b^c in next round
686	eor	x16,x16,x27,ror#41	// Sigma1(e)
687	eor	x0,x0,x23,ror#34
688	add	x22,x22,x17			// h+=Ch(e,f,g)
689	and	x19,x19,x28			// (b^c)&=(a^b)
690	eor	x14,x14,x7,ror#61
691	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
692	add	x22,x22,x16			// h+=Sigma1(e)
693	eor	x19,x19,x24			// Maj(a,b,c)
694	eor	x17,x0,x23,ror#39	// Sigma0(a)
695	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
696	add	x9,x9,x2
697	add	x26,x26,x22			// d+=h
698	add	x22,x22,x19			// h+=Maj(a,b,c)
699	ldr	x19,[x30],#8		// *K++, x28 in next round
700	add	x9,x9,x15
701	add	x22,x22,x17			// h+=Sigma0(a)
702	add	x9,x9,x14
// -- step 6 of 16: W in x9, next W built in x10; h=x21 e=x26 a=x22 --
703	ldr	x14,[sp,#24]
704	str	x1,[sp,#16]
705	ror	x16,x26,#14
706	add	x21,x21,x19			// h+=K[i]
707	ror	x0,x11,#1
708	and	x17,x27,x26
709	ror	x15,x8,#19
710	bic	x19,x20,x26
711	ror	x1,x22,#28
712	add	x21,x21,x9			// h+=X[i]
713	eor	x16,x16,x26,ror#18
714	eor	x0,x0,x11,ror#8
715	orr	x17,x17,x19			// Ch(e,f,g)
716	eor	x19,x22,x23			// a^b, b^c in next round
717	eor	x16,x16,x26,ror#41	// Sigma1(e)
718	eor	x1,x1,x22,ror#34
719	add	x21,x21,x17			// h+=Ch(e,f,g)
720	and	x28,x28,x19			// (b^c)&=(a^b)
721	eor	x15,x15,x8,ror#61
722	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
723	add	x21,x21,x16			// h+=Sigma1(e)
724	eor	x28,x28,x23			// Maj(a,b,c)
725	eor	x17,x1,x22,ror#39	// Sigma0(a)
726	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
727	add	x10,x10,x3
728	add	x25,x25,x21			// d+=h
729	add	x21,x21,x28			// h+=Maj(a,b,c)
730	ldr	x28,[x30],#8		// *K++, x19 in next round
731	add	x10,x10,x0
732	add	x21,x21,x17			// h+=Sigma0(a)
733	add	x10,x10,x15
// -- step 7 of 16: W in x10, next W built in x11; h=x20 e=x25 a=x21 --
734	ldr	x15,[sp,#0]
735	str	x2,[sp,#24]
736	ror	x16,x25,#14
737	add	x20,x20,x28			// h+=K[i]
738	ror	x1,x12,#1
739	and	x17,x26,x25
740	ror	x0,x9,#19
741	bic	x28,x27,x25
742	ror	x2,x21,#28
743	add	x20,x20,x10			// h+=X[i]
744	eor	x16,x16,x25,ror#18
745	eor	x1,x1,x12,ror#8
746	orr	x17,x17,x28			// Ch(e,f,g)
747	eor	x28,x21,x22			// a^b, b^c in next round
748	eor	x16,x16,x25,ror#41	// Sigma1(e)
749	eor	x2,x2,x21,ror#34
750	add	x20,x20,x17			// h+=Ch(e,f,g)
751	and	x19,x19,x28			// (b^c)&=(a^b)
752	eor	x0,x0,x9,ror#61
753	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
754	add	x20,x20,x16			// h+=Sigma1(e)
755	eor	x19,x19,x22			// Maj(a,b,c)
756	eor	x17,x2,x21,ror#39	// Sigma0(a)
757	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
758	add	x11,x11,x4
759	add	x24,x24,x20			// d+=h
760	add	x20,x20,x19			// h+=Maj(a,b,c)
761	ldr	x19,[x30],#8		// *K++, x28 in next round
762	add	x11,x11,x1
763	add	x20,x20,x17			// h+=Sigma0(a)
764	add	x11,x11,x0
// -- step 8 of 16: W in x11, next W built in x12; h=x27 e=x24 a=x20 --
765	ldr	x0,[sp,#8]
766	str	x3,[sp,#0]
767	ror	x16,x24,#14
768	add	x27,x27,x19			// h+=K[i]
769	ror	x2,x13,#1
770	and	x17,x25,x24
771	ror	x1,x10,#19
772	bic	x19,x26,x24
773	ror	x3,x20,#28
774	add	x27,x27,x11			// h+=X[i]
775	eor	x16,x16,x24,ror#18
776	eor	x2,x2,x13,ror#8
777	orr	x17,x17,x19			// Ch(e,f,g)
778	eor	x19,x20,x21			// a^b, b^c in next round
779	eor	x16,x16,x24,ror#41	// Sigma1(e)
780	eor	x3,x3,x20,ror#34
781	add	x27,x27,x17			// h+=Ch(e,f,g)
782	and	x28,x28,x19			// (b^c)&=(a^b)
783	eor	x1,x1,x10,ror#61
784	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
785	add	x27,x27,x16			// h+=Sigma1(e)
786	eor	x28,x28,x21			// Maj(a,b,c)
787	eor	x17,x3,x20,ror#39	// Sigma0(a)
788	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
789	add	x12,x12,x5
790	add	x23,x23,x27			// d+=h
791	add	x27,x27,x28			// h+=Maj(a,b,c)
792	ldr	x28,[x30],#8		// *K++, x19 in next round
793	add	x12,x12,x2
794	add	x27,x27,x17			// h+=Sigma0(a)
795	add	x12,x12,x1
// -- step 9 of 16: W in x12, next W built in x13; h=x26 e=x23 a=x27 --
796	ldr	x1,[sp,#16]
797	str	x4,[sp,#8]
798	ror	x16,x23,#14
799	add	x26,x26,x28			// h+=K[i]
800	ror	x3,x14,#1
801	and	x17,x24,x23
802	ror	x2,x11,#19
803	bic	x28,x25,x23
804	ror	x4,x27,#28
805	add	x26,x26,x12			// h+=X[i]
806	eor	x16,x16,x23,ror#18
807	eor	x3,x3,x14,ror#8
808	orr	x17,x17,x28			// Ch(e,f,g)
809	eor	x28,x27,x20			// a^b, b^c in next round
810	eor	x16,x16,x23,ror#41	// Sigma1(e)
811	eor	x4,x4,x27,ror#34
812	add	x26,x26,x17			// h+=Ch(e,f,g)
813	and	x19,x19,x28			// (b^c)&=(a^b)
814	eor	x2,x2,x11,ror#61
815	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
816	add	x26,x26,x16			// h+=Sigma1(e)
817	eor	x19,x19,x20			// Maj(a,b,c)
818	eor	x17,x4,x27,ror#39	// Sigma0(a)
819	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
820	add	x13,x13,x6
821	add	x22,x22,x26			// d+=h
822	add	x26,x26,x19			// h+=Maj(a,b,c)
823	ldr	x19,[x30],#8		// *K++, x28 in next round
824	add	x13,x13,x3
825	add	x26,x26,x17			// h+=Sigma0(a)
826	add	x13,x13,x2
// -- step 10 of 16: W in x13, next W built in x14; h=x25 e=x22 a=x26 --
827	ldr	x2,[sp,#24]
828	str	x5,[sp,#16]
829	ror	x16,x22,#14
830	add	x25,x25,x19			// h+=K[i]
831	ror	x4,x15,#1
832	and	x17,x23,x22
833	ror	x3,x12,#19
834	bic	x19,x24,x22
835	ror	x5,x26,#28
836	add	x25,x25,x13			// h+=X[i]
837	eor	x16,x16,x22,ror#18
838	eor	x4,x4,x15,ror#8
839	orr	x17,x17,x19			// Ch(e,f,g)
840	eor	x19,x26,x27			// a^b, b^c in next round
841	eor	x16,x16,x22,ror#41	// Sigma1(e)
842	eor	x5,x5,x26,ror#34
843	add	x25,x25,x17			// h+=Ch(e,f,g)
844	and	x28,x28,x19			// (b^c)&=(a^b)
845	eor	x3,x3,x12,ror#61
846	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
847	add	x25,x25,x16			// h+=Sigma1(e)
848	eor	x28,x28,x27			// Maj(a,b,c)
849	eor	x17,x5,x26,ror#39	// Sigma0(a)
850	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
851	add	x14,x14,x7
852	add	x21,x21,x25			// d+=h
853	add	x25,x25,x28			// h+=Maj(a,b,c)
854	ldr	x28,[x30],#8		// *K++, x19 in next round
855	add	x14,x14,x4
856	add	x25,x25,x17			// h+=Sigma0(a)
857	add	x14,x14,x3
// -- step 11 of 16: W in x14, next W built in x15; h=x24 e=x21 a=x25 --
858	ldr	x3,[sp,#0]
859	str	x6,[sp,#24]
860	ror	x16,x21,#14
861	add	x24,x24,x28			// h+=K[i]
862	ror	x5,x0,#1
863	and	x17,x22,x21
864	ror	x4,x13,#19
865	bic	x28,x23,x21
866	ror	x6,x25,#28
867	add	x24,x24,x14			// h+=X[i]
868	eor	x16,x16,x21,ror#18
869	eor	x5,x5,x0,ror#8
870	orr	x17,x17,x28			// Ch(e,f,g)
871	eor	x28,x25,x26			// a^b, b^c in next round
872	eor	x16,x16,x21,ror#41	// Sigma1(e)
873	eor	x6,x6,x25,ror#34
874	add	x24,x24,x17			// h+=Ch(e,f,g)
875	and	x19,x19,x28			// (b^c)&=(a^b)
876	eor	x4,x4,x13,ror#61
877	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
878	add	x24,x24,x16			// h+=Sigma1(e)
879	eor	x19,x19,x26			// Maj(a,b,c)
880	eor	x17,x6,x25,ror#39	// Sigma0(a)
881	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
882	add	x15,x15,x8
883	add	x20,x20,x24			// d+=h
884	add	x24,x24,x19			// h+=Maj(a,b,c)
885	ldr	x19,[x30],#8		// *K++, x28 in next round
886	add	x15,x15,x5
887	add	x24,x24,x17			// h+=Sigma0(a)
888	add	x15,x15,x4
// -- step 12 of 16: W in x15, next W built in x0; h=x23 e=x20 a=x24 --
889	ldr	x4,[sp,#8]
890	str	x7,[sp,#0]
891	ror	x16,x20,#14
892	add	x23,x23,x19			// h+=K[i]
893	ror	x6,x1,#1
894	and	x17,x21,x20
895	ror	x5,x14,#19
896	bic	x19,x22,x20
897	ror	x7,x24,#28
898	add	x23,x23,x15			// h+=X[i]
899	eor	x16,x16,x20,ror#18
900	eor	x6,x6,x1,ror#8
901	orr	x17,x17,x19			// Ch(e,f,g)
902	eor	x19,x24,x25			// a^b, b^c in next round
903	eor	x16,x16,x20,ror#41	// Sigma1(e)
904	eor	x7,x7,x24,ror#34
905	add	x23,x23,x17			// h+=Ch(e,f,g)
906	and	x28,x28,x19			// (b^c)&=(a^b)
907	eor	x5,x5,x14,ror#61
908	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
909	add	x23,x23,x16			// h+=Sigma1(e)
910	eor	x28,x28,x25			// Maj(a,b,c)
911	eor	x17,x7,x24,ror#39	// Sigma0(a)
912	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
913	add	x0,x0,x9
914	add	x27,x27,x23			// d+=h
915	add	x23,x23,x28			// h+=Maj(a,b,c)
916	ldr	x28,[x30],#8		// *K++, x19 in next round
917	add	x0,x0,x6
918	add	x23,x23,x17			// h+=Sigma0(a)
919	add	x0,x0,x5
// -- step 13 of 16: W in x0, next W built in x1; h=x22 e=x27 a=x23 --
920	ldr	x5,[sp,#16]
921	str	x8,[sp,#8]
922	ror	x16,x27,#14
923	add	x22,x22,x28			// h+=K[i]
924	ror	x7,x2,#1
925	and	x17,x20,x27
926	ror	x6,x15,#19
927	bic	x28,x21,x27
928	ror	x8,x23,#28
929	add	x22,x22,x0			// h+=X[i]
930	eor	x16,x16,x27,ror#18
931	eor	x7,x7,x2,ror#8
932	orr	x17,x17,x28			// Ch(e,f,g)
933	eor	x28,x23,x24			// a^b, b^c in next round
934	eor	x16,x16,x27,ror#41	// Sigma1(e)
935	eor	x8,x8,x23,ror#34
936	add	x22,x22,x17			// h+=Ch(e,f,g)
937	and	x19,x19,x28			// (b^c)&=(a^b)
938	eor	x6,x6,x15,ror#61
939	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
940	add	x22,x22,x16			// h+=Sigma1(e)
941	eor	x19,x19,x24			// Maj(a,b,c)
942	eor	x17,x8,x23,ror#39	// Sigma0(a)
943	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
944	add	x1,x1,x10
945	add	x26,x26,x22			// d+=h
946	add	x22,x22,x19			// h+=Maj(a,b,c)
947	ldr	x19,[x30],#8		// *K++, x28 in next round
948	add	x1,x1,x7
949	add	x22,x22,x17			// h+=Sigma0(a)
950	add	x1,x1,x6
// -- step 14 of 16: W in x1, next W built in x2; h=x21 e=x26 a=x22 --
951	ldr	x6,[sp,#24]
952	str	x9,[sp,#16]
953	ror	x16,x26,#14
954	add	x21,x21,x19			// h+=K[i]
955	ror	x8,x3,#1
956	and	x17,x27,x26
957	ror	x7,x0,#19
958	bic	x19,x20,x26
959	ror	x9,x22,#28
960	add	x21,x21,x1			// h+=X[i]
961	eor	x16,x16,x26,ror#18
962	eor	x8,x8,x3,ror#8
963	orr	x17,x17,x19			// Ch(e,f,g)
964	eor	x19,x22,x23			// a^b, b^c in next round
965	eor	x16,x16,x26,ror#41	// Sigma1(e)
966	eor	x9,x9,x22,ror#34
967	add	x21,x21,x17			// h+=Ch(e,f,g)
968	and	x28,x28,x19			// (b^c)&=(a^b)
969	eor	x7,x7,x0,ror#61
970	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
971	add	x21,x21,x16			// h+=Sigma1(e)
972	eor	x28,x28,x23			// Maj(a,b,c)
973	eor	x17,x9,x22,ror#39	// Sigma0(a)
974	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
975	add	x2,x2,x11
976	add	x25,x25,x21			// d+=h
977	add	x21,x21,x28			// h+=Maj(a,b,c)
978	ldr	x28,[x30],#8		// *K++, x19 in next round
979	add	x2,x2,x8
980	add	x21,x21,x17			// h+=Sigma0(a)
981	add	x2,x2,x7
// -- step 15 of 16: W in x2, next W built in x3; h=x20 e=x25 a=x21 --
982	ldr	x7,[sp,#0]
983	str	x10,[sp,#24]
984	ror	x16,x25,#14
985	add	x20,x20,x28			// h+=K[i]
986	ror	x9,x4,#1
987	and	x17,x26,x25
988	ror	x8,x1,#19
989	bic	x28,x27,x25
990	ror	x10,x21,#28
991	add	x20,x20,x2			// h+=X[i]
992	eor	x16,x16,x25,ror#18
993	eor	x9,x9,x4,ror#8
994	orr	x17,x17,x28			// Ch(e,f,g)
995	eor	x28,x21,x22			// a^b, b^c in next round
996	eor	x16,x16,x25,ror#41	// Sigma1(e)
997	eor	x10,x10,x21,ror#34
998	add	x20,x20,x17			// h+=Ch(e,f,g)
999	and	x19,x19,x28			// (b^c)&=(a^b)
1000	eor	x8,x8,x1,ror#61
1001	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
1002	add	x20,x20,x16			// h+=Sigma1(e)
1003	eor	x19,x19,x22			// Maj(a,b,c)
1004	eor	x17,x10,x21,ror#39	// Sigma0(a)
1005	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
1006	add	x3,x3,x12
1007	add	x24,x24,x20			// d+=h
1008	add	x20,x20,x19			// h+=Maj(a,b,c)
1009	ldr	x19,[x30],#8		// *K++, x28 in next round
1010	add	x3,x3,x9
1011	add	x20,x20,x17			// h+=Sigma0(a)
1012	add	x3,x3,x8
1013	cbnz	x19,.Loop_16_xx		// x19 == 0 means the K terminator was just fetched
1014
// End of one block: all 80 rounds are done and x20..x27 hold the working
// variables. Restore the pointers that were overwritten during the rounds,
// add the previous state into the working variables, store the result, and
// either start the next block or return.
1015	ldp	x0,x2,[x29,#96]		// x0 = state pointer, x2 = end-of-input pointer
1016	ldr	x1,[x29,#112]		// input cursor saved after the first 16-byte load of this block
1017	sub	x30,x30,#648		// rewind
					// 648 = 81*8: 80 constants plus the terminator were fetched

1018
1019	ldp	x3,x4,[x0]
1020	ldp	x5,x6,[x0,#2*8]
1021	add	x1,x1,#14*8			// advance input pointer
1022	ldp	x7,x8,[x0,#4*8]
1023	add	x20,x20,x3
1024	ldp	x9,x10,[x0,#6*8]
1025	add	x21,x21,x4
1026	add	x22,x22,x5
1027	add	x23,x23,x6
1028	stp	x20,x21,[x0]
1029	add	x24,x24,x7
1030	add	x25,x25,x8
1031	stp	x22,x23,[x0,#2*8]
1032	add	x26,x26,x9
1033	add	x27,x27,x10
1034	cmp	x1,x2				// reached the end of the input?
1035	stp	x24,x25,[x0,#4*8]
1036	stp	x26,x27,[x0,#6*8]
1037	b.ne	.Loop				// stp does not alter the flags set by cmp
1038
// Epilogue: drop the 32-byte spill area, restore x19..x28 through the frame
// pointer, then pop x29/x30 together with the 128-byte frame.
1039	ldp	x19,x20,[x29,#16]
1040	add	sp,sp,#4*8
1041	ldp	x21,x22,[x29,#32]
1042	ldp	x23,x24,[x29,#48]
1043	ldp	x25,x26,[x29,#64]
1044	ldp	x27,x28,[x29,#80]
1045	ldp	x29,x30,[sp],#128
1046	ret
1047.size	zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
1048
1049
// zfs_sha512_block_armv8: SHA-512 block function built on the SHA-512
// instructions sha512h, sha512h2, sha512su0 and sha512su1. They are emitted
// as raw .inst words; the mnemonic is given in the comment after each word.
//
// Register use, as established by the code below:
//   x0       pointer to the 64-byte hash context: loaded into v0-v3 on
//            entry and stored back from v0-v3 on exit
//   x1       input pointer; 128 bytes are consumed per loop iteration
//   x2       loop counter, decremented once per iteration; the loop ends
//            when it reaches zero (presumably the number of 128-byte
//            blocks - confirm against the C prototype)
//            NOTE(review): a value of 0 on entry would wrap in the subs;
//            presumably callers never pass 0 - confirm.
//   x3       walking pointer into .LK512, rewound by 80*8 bytes per block
//   x4       scratch: x1 - 128, used by the conditional rewind
//   v0-v4    working hash state; the register roles rotate between steps
//   v5-v7    ext scratch operands for sha512h / sha512su1
//   v16-v23  message schedule (16 x 64-bit words)
//   v24,v25  K512 constant pairs with the schedule words added in
//   v26-v29  copy of v0-v3 taken at the top of the iteration
//
// Instructions written with one extra leading space are the schedule-update
// (sha512su0 / ext v7 / sha512su1) and next-input (ld1 / rev64) instructions
// that are interleaved with the state-update instructions.
1050.globl	zfs_sha512_block_armv8
1051.type	zfs_sha512_block_armv8,%function
1052.align	6
1053zfs_sha512_block_armv8:
1054	hint		#34				// bti c
1055.Lv8_entry:
1056	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
1057	stp		x29,x30,[sp,#-16]!
1058	add		x29,sp,#0
1059
	// Load the first 128 input bytes into v16-v23; the two post-indexed
	// loads leave x1 pointing just past that block.
1060	ld1		{v16.16b-v19.16b},[x1],#64	// load input
1061	ld1		{v20.16b-v23.16b},[x1],#64
1062
1063	ld1		{v0.2d-v3.2d},[x0]		// load context
1064	adr		x3,.LK512
1065
	// rev64 on .16b reverses the bytes within each 64-bit lane.
1066	rev64		v16.16b,v16.16b
1067	rev64		v17.16b,v17.16b
1068	rev64		v18.16b,v18.16b
1069	rev64		v19.16b,v19.16b
1070	rev64		v20.16b,v20.16b
1071	rev64		v21.16b,v21.16b
1072	rev64		v22.16b,v22.16b
1073	rev64		v23.16b,v23.16b
	// Branch over the alignment padding in front of the loop label.
1074	b		.Loop_hw
1075
1076.align	4
1077.Loop_hw:
	// Top of the per-block loop. v16-v23 hold the byte-swapped block to
	// process and x1 points just past it. The flags set by subs are
	// consumed by the csel below; only a non-flag-setting sub and vector
	// orr instructions sit in between. When the decremented counter is
	// zero (last iteration) x1 is moved back by 128, so the look-ahead
	// loads at the end of the iteration re-read the current block instead
	// of advancing beyond it.
1078	ld1		{v24.2d},[x3],#16
1079	subs		x2,x2,#1
1080	sub		x4,x1,#128
1081	orr		v26.16b,v0.16b,v0.16b			// offload
1082	orr		v27.16b,v1.16b,v1.16b
1083	orr		v28.16b,v2.16b,v2.16b
1084	orr		v29.16b,v3.16b,v3.16b
1085	csel		x1,x1,x4,ne			// conditional rewind
	// Steps 1-32. Each step takes one 16-byte pair of K512 constants
	// (preloaded into v24 or v25, alternating), adds a schedule register
	// to it, swaps its two 64-bit halves with ext #8, feeds it through
	// sha512h / sha512h2, and refreshes the schedule register it just
	// used via sha512su0 / sha512su1. The next constant pair is loaded
	// one step ahead.
1086	add		v24.2d,v24.2d,v16.2d
1087	ld1		{v25.2d},[x3],#16
1088	ext		v24.16b,v24.16b,v24.16b,#8
1089	ext		v5.16b,v2.16b,v3.16b,#8
1090	ext		v6.16b,v1.16b,v2.16b,#8
1091	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1092	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1093	 ext		v7.16b,v20.16b,v21.16b,#8
1094	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1095	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1096	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1097	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1098	add		v25.2d,v25.2d,v17.2d
1099	ld1		{v24.2d},[x3],#16
1100	ext		v25.16b,v25.16b,v25.16b,#8
1101	ext		v5.16b,v4.16b,v2.16b,#8
1102	ext		v6.16b,v0.16b,v4.16b,#8
1103	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1104	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1105	 ext		v7.16b,v21.16b,v22.16b,#8
1106	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1107	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1108	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1109	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1110	add		v24.2d,v24.2d,v18.2d
1111	ld1		{v25.2d},[x3],#16
1112	ext		v24.16b,v24.16b,v24.16b,#8
1113	ext		v5.16b,v1.16b,v4.16b,#8
1114	ext		v6.16b,v3.16b,v1.16b,#8
1115	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1116	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1117	 ext		v7.16b,v22.16b,v23.16b,#8
1118	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1119	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1120	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1121	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1122	add		v25.2d,v25.2d,v19.2d
1123	ld1		{v24.2d},[x3],#16
1124	ext		v25.16b,v25.16b,v25.16b,#8
1125	ext		v5.16b,v0.16b,v1.16b,#8
1126	ext		v6.16b,v2.16b,v0.16b,#8
1127	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1128	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1129	 ext		v7.16b,v23.16b,v16.16b,#8
1130	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1131	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1132	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1133	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1134	add		v24.2d,v24.2d,v20.2d
1135	ld1		{v25.2d},[x3],#16
1136	ext		v24.16b,v24.16b,v24.16b,#8
1137	ext		v5.16b,v3.16b,v0.16b,#8
1138	ext		v6.16b,v4.16b,v3.16b,#8
1139	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1140	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1141	 ext		v7.16b,v16.16b,v17.16b,#8
1142	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1143	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1144	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1145	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1146	add		v25.2d,v25.2d,v21.2d
1147	ld1		{v24.2d},[x3],#16
1148	ext		v25.16b,v25.16b,v25.16b,#8
1149	ext		v5.16b,v2.16b,v3.16b,#8
1150	ext		v6.16b,v1.16b,v2.16b,#8
1151	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1152	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1153	 ext		v7.16b,v17.16b,v18.16b,#8
1154	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1155	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1156	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1157	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1158	add		v24.2d,v24.2d,v22.2d
1159	ld1		{v25.2d},[x3],#16
1160	ext		v24.16b,v24.16b,v24.16b,#8
1161	ext		v5.16b,v4.16b,v2.16b,#8
1162	ext		v6.16b,v0.16b,v4.16b,#8
1163	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1164	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1165	 ext		v7.16b,v18.16b,v19.16b,#8
1166	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1167	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1168	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1169	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1170	add		v25.2d,v25.2d,v23.2d
1171	ld1		{v24.2d},[x3],#16
1172	ext		v25.16b,v25.16b,v25.16b,#8
1173	ext		v5.16b,v1.16b,v4.16b,#8
1174	ext		v6.16b,v3.16b,v1.16b,#8
1175	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1176	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1177	 ext		v7.16b,v19.16b,v20.16b,#8
1178	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1179	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1180	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1181	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1182	add		v24.2d,v24.2d,v16.2d
1183	ld1		{v25.2d},[x3],#16
1184	ext		v24.16b,v24.16b,v24.16b,#8
1185	ext		v5.16b,v0.16b,v1.16b,#8
1186	ext		v6.16b,v2.16b,v0.16b,#8
1187	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1188	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1189	 ext		v7.16b,v20.16b,v21.16b,#8
1190	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1191	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1192	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1193	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1194	add		v25.2d,v25.2d,v17.2d
1195	ld1		{v24.2d},[x3],#16
1196	ext		v25.16b,v25.16b,v25.16b,#8
1197	ext		v5.16b,v3.16b,v0.16b,#8
1198	ext		v6.16b,v4.16b,v3.16b,#8
1199	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1200	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1201	 ext		v7.16b,v21.16b,v22.16b,#8
1202	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1203	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1204	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1205	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1206	add		v24.2d,v24.2d,v18.2d
1207	ld1		{v25.2d},[x3],#16
1208	ext		v24.16b,v24.16b,v24.16b,#8
1209	ext		v5.16b,v2.16b,v3.16b,#8
1210	ext		v6.16b,v1.16b,v2.16b,#8
1211	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1212	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1213	 ext		v7.16b,v22.16b,v23.16b,#8
1214	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1215	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1216	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1217	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1218	add		v25.2d,v25.2d,v19.2d
1219	ld1		{v24.2d},[x3],#16
1220	ext		v25.16b,v25.16b,v25.16b,#8
1221	ext		v5.16b,v4.16b,v2.16b,#8
1222	ext		v6.16b,v0.16b,v4.16b,#8
1223	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1224	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1225	 ext		v7.16b,v23.16b,v16.16b,#8
1226	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1227	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1228	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1229	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1230	add		v24.2d,v24.2d,v20.2d
1231	ld1		{v25.2d},[x3],#16
1232	ext		v24.16b,v24.16b,v24.16b,#8
1233	ext		v5.16b,v1.16b,v4.16b,#8
1234	ext		v6.16b,v3.16b,v1.16b,#8
1235	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1236	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1237	 ext		v7.16b,v16.16b,v17.16b,#8
1238	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1239	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1240	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1241	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1242	add		v25.2d,v25.2d,v21.2d
1243	ld1		{v24.2d},[x3],#16
1244	ext		v25.16b,v25.16b,v25.16b,#8
1245	ext		v5.16b,v0.16b,v1.16b,#8
1246	ext		v6.16b,v2.16b,v0.16b,#8
1247	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1248	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1249	 ext		v7.16b,v17.16b,v18.16b,#8
1250	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1251	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1252	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1253	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1254	add		v24.2d,v24.2d,v22.2d
1255	ld1		{v25.2d},[x3],#16
1256	ext		v24.16b,v24.16b,v24.16b,#8
1257	ext		v5.16b,v3.16b,v0.16b,#8
1258	ext		v6.16b,v4.16b,v3.16b,#8
1259	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1260	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1261	 ext		v7.16b,v18.16b,v19.16b,#8
1262	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1263	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1264	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1265	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1266	add		v25.2d,v25.2d,v23.2d
1267	ld1		{v24.2d},[x3],#16
1268	ext		v25.16b,v25.16b,v25.16b,#8
1269	ext		v5.16b,v2.16b,v3.16b,#8
1270	ext		v6.16b,v1.16b,v2.16b,#8
1271	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1272	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1273	 ext		v7.16b,v19.16b,v20.16b,#8
1274	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1275	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1276	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1277	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1278	add		v24.2d,v24.2d,v16.2d
1279	ld1		{v25.2d},[x3],#16
1280	ext		v24.16b,v24.16b,v24.16b,#8
1281	ext		v5.16b,v4.16b,v2.16b,#8
1282	ext		v6.16b,v0.16b,v4.16b,#8
1283	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1284	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1285	 ext		v7.16b,v20.16b,v21.16b,#8
1286	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1287	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1288	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1289	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1290	add		v25.2d,v25.2d,v17.2d
1291	ld1		{v24.2d},[x3],#16
1292	ext		v25.16b,v25.16b,v25.16b,#8
1293	ext		v5.16b,v1.16b,v4.16b,#8
1294	ext		v6.16b,v3.16b,v1.16b,#8
1295	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1296	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1297	 ext		v7.16b,v21.16b,v22.16b,#8
1298	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1299	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1300	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1301	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1302	add		v24.2d,v24.2d,v18.2d
1303	ld1		{v25.2d},[x3],#16
1304	ext		v24.16b,v24.16b,v24.16b,#8
1305	ext		v5.16b,v0.16b,v1.16b,#8
1306	ext		v6.16b,v2.16b,v0.16b,#8
1307	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1308	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1309	 ext		v7.16b,v22.16b,v23.16b,#8
1310	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1311	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1312	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1313	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1314	add		v25.2d,v25.2d,v19.2d
1315	ld1		{v24.2d},[x3],#16
1316	ext		v25.16b,v25.16b,v25.16b,#8
1317	ext		v5.16b,v3.16b,v0.16b,#8
1318	ext		v6.16b,v4.16b,v3.16b,#8
1319	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1320	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1321	 ext		v7.16b,v23.16b,v16.16b,#8
1322	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1323	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1324	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1325	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1326	add		v24.2d,v24.2d,v20.2d
1327	ld1		{v25.2d},[x3],#16
1328	ext		v24.16b,v24.16b,v24.16b,#8
1329	ext		v5.16b,v2.16b,v3.16b,#8
1330	ext		v6.16b,v1.16b,v2.16b,#8
1331	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1332	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1333	 ext		v7.16b,v16.16b,v17.16b,#8
1334	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1335	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1336	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1337	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1338	add		v25.2d,v25.2d,v21.2d
1339	ld1		{v24.2d},[x3],#16
1340	ext		v25.16b,v25.16b,v25.16b,#8
1341	ext		v5.16b,v4.16b,v2.16b,#8
1342	ext		v6.16b,v0.16b,v4.16b,#8
1343	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1344	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1345	 ext		v7.16b,v17.16b,v18.16b,#8
1346	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1347	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1348	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1349	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1350	add		v24.2d,v24.2d,v22.2d
1351	ld1		{v25.2d},[x3],#16
1352	ext		v24.16b,v24.16b,v24.16b,#8
1353	ext		v5.16b,v1.16b,v4.16b,#8
1354	ext		v6.16b,v3.16b,v1.16b,#8
1355	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1356	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1357	 ext		v7.16b,v18.16b,v19.16b,#8
1358	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1359	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1360	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1361	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1362	add		v25.2d,v25.2d,v23.2d
1363	ld1		{v24.2d},[x3],#16
1364	ext		v25.16b,v25.16b,v25.16b,#8
1365	ext		v5.16b,v0.16b,v1.16b,#8
1366	ext		v6.16b,v2.16b,v0.16b,#8
1367	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1368	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1369	 ext		v7.16b,v19.16b,v20.16b,#8
1370	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1371	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1372	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1373	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1374	add		v24.2d,v24.2d,v16.2d
1375	ld1		{v25.2d},[x3],#16
1376	ext		v24.16b,v24.16b,v24.16b,#8
1377	ext		v5.16b,v3.16b,v0.16b,#8
1378	ext		v6.16b,v4.16b,v3.16b,#8
1379	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1380	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1381	 ext		v7.16b,v20.16b,v21.16b,#8
1382	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1383	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1384	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1385	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1386	add		v25.2d,v25.2d,v17.2d
1387	ld1		{v24.2d},[x3],#16
1388	ext		v25.16b,v25.16b,v25.16b,#8
1389	ext		v5.16b,v2.16b,v3.16b,#8
1390	ext		v6.16b,v1.16b,v2.16b,#8
1391	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1392	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1393	 ext		v7.16b,v21.16b,v22.16b,#8
1394	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1395	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1396	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1397	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1398	add		v24.2d,v24.2d,v18.2d
1399	ld1		{v25.2d},[x3],#16
1400	ext		v24.16b,v24.16b,v24.16b,#8
1401	ext		v5.16b,v4.16b,v2.16b,#8
1402	ext		v6.16b,v0.16b,v4.16b,#8
1403	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1404	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1405	 ext		v7.16b,v22.16b,v23.16b,#8
1406	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1407	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1408	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1409	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1410	add		v25.2d,v25.2d,v19.2d
1411	ld1		{v24.2d},[x3],#16
1412	ext		v25.16b,v25.16b,v25.16b,#8
1413	ext		v5.16b,v1.16b,v4.16b,#8
1414	ext		v6.16b,v3.16b,v1.16b,#8
1415	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1416	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1417	 ext		v7.16b,v23.16b,v16.16b,#8
1418	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1419	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1420	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1421	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1422	add		v24.2d,v24.2d,v20.2d
1423	ld1		{v25.2d},[x3],#16
1424	ext		v24.16b,v24.16b,v24.16b,#8
1425	ext		v5.16b,v0.16b,v1.16b,#8
1426	ext		v6.16b,v2.16b,v0.16b,#8
1427	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1428	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1429	 ext		v7.16b,v16.16b,v17.16b,#8
1430	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1431	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1432	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1433	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1434	add		v25.2d,v25.2d,v21.2d
1435	ld1		{v24.2d},[x3],#16
1436	ext		v25.16b,v25.16b,v25.16b,#8
1437	ext		v5.16b,v3.16b,v0.16b,#8
1438	ext		v6.16b,v4.16b,v3.16b,#8
1439	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1440	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1441	 ext		v7.16b,v17.16b,v18.16b,#8
1442	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1443	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1444	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1445	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1446	add		v24.2d,v24.2d,v22.2d
1447	ld1		{v25.2d},[x3],#16
1448	ext		v24.16b,v24.16b,v24.16b,#8
1449	ext		v5.16b,v2.16b,v3.16b,#8
1450	ext		v6.16b,v1.16b,v2.16b,#8
1451	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1452	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1453	 ext		v7.16b,v18.16b,v19.16b,#8
1454	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1455	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1456	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1457	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1458	add		v25.2d,v25.2d,v23.2d
1459	ld1		{v24.2d},[x3],#16
1460	ext		v25.16b,v25.16b,v25.16b,#8
1461	ext		v5.16b,v4.16b,v2.16b,#8
1462	ext		v6.16b,v0.16b,v4.16b,#8
1463	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1464	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1465	 ext		v7.16b,v19.16b,v20.16b,#8
1466	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1467	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1468	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1469	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// Steps 33-40. The schedule is not extended any further: once a
	// schedule register has been added to its constant pair it is
	// overwritten with the next 16 input bytes from x1 and byte-swapped.
	// After the conditional rewind above, x1 addresses either the following
	// block or, on the last iteration, the current block again.
1470	ld1		{v25.2d},[x3],#16
1471	add		v24.2d,v24.2d,v16.2d
1472	 ld1		{v16.16b},[x1],#16		// load next input
1473	ext		v24.16b,v24.16b,v24.16b,#8
1474	ext		v5.16b,v1.16b,v4.16b,#8
1475	ext		v6.16b,v3.16b,v1.16b,#8
1476	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1477	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1478	 rev64		v16.16b,v16.16b
1479	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1480	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1481	ld1		{v24.2d},[x3],#16
1482	add		v25.2d,v25.2d,v17.2d
1483	 ld1		{v17.16b},[x1],#16		// load next input
1484	ext		v25.16b,v25.16b,v25.16b,#8
1485	ext		v5.16b,v0.16b,v1.16b,#8
1486	ext		v6.16b,v2.16b,v0.16b,#8
1487	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1488	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1489	 rev64		v17.16b,v17.16b
1490	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1491	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1492	ld1		{v25.2d},[x3],#16
1493	add		v24.2d,v24.2d,v18.2d
1494	 ld1		{v18.16b},[x1],#16		// load next input
1495	ext		v24.16b,v24.16b,v24.16b,#8
1496	ext		v5.16b,v3.16b,v0.16b,#8
1497	ext		v6.16b,v4.16b,v3.16b,#8
1498	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1499	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1500	 rev64		v18.16b,v18.16b
1501	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1502	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1503	ld1		{v24.2d},[x3],#16
1504	add		v25.2d,v25.2d,v19.2d
1505	 ld1		{v19.16b},[x1],#16		// load next input
1506	ext		v25.16b,v25.16b,v25.16b,#8
1507	ext		v5.16b,v2.16b,v3.16b,#8
1508	ext		v6.16b,v1.16b,v2.16b,#8
1509	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1510	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1511	 rev64		v19.16b,v19.16b
1512	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1513	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1514	ld1		{v25.2d},[x3],#16
1515	add		v24.2d,v24.2d,v20.2d
1516	 ld1		{v20.16b},[x1],#16		// load next input
1517	ext		v24.16b,v24.16b,v24.16b,#8
1518	ext		v5.16b,v4.16b,v2.16b,#8
1519	ext		v6.16b,v0.16b,v4.16b,#8
1520	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1521	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1522	 rev64		v20.16b,v20.16b
1523	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1524	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1525	ld1		{v24.2d},[x3],#16
1526	add		v25.2d,v25.2d,v21.2d
1527	 ld1		{v21.16b},[x1],#16		// load next input
1528	ext		v25.16b,v25.16b,v25.16b,#8
1529	ext		v5.16b,v1.16b,v4.16b,#8
1530	ext		v6.16b,v3.16b,v1.16b,#8
1531	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1532	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1533	 rev64		v21.16b,v21.16b
1534	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1535	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1536	ld1		{v25.2d},[x3],#16
1537	add		v24.2d,v24.2d,v22.2d
1538	 ld1		{v22.16b},[x1],#16		// load next input
1539	ext		v24.16b,v24.16b,v24.16b,#8
1540	ext		v5.16b,v0.16b,v1.16b,#8
1541	ext		v6.16b,v2.16b,v0.16b,#8
1542	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1543	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1544	 rev64		v22.16b,v22.16b
1545	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1546	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// All 40 constant pairs (40 * 16 = 80*8 bytes) have been loaded by
	// now, so x3 is moved back to the start of .LK512 for the next block.
1547	sub		x3,x3,#80*8	// rewind
1548	add		v25.2d,v25.2d,v23.2d
1549	 ld1		{v23.16b},[x1],#16		// load next input
1550	ext		v25.16b,v25.16b,v25.16b,#8
1551	ext		v5.16b,v3.16b,v0.16b,#8
1552	ext		v6.16b,v4.16b,v3.16b,#8
1553	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1554	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1555	 rev64		v23.16b,v23.16b
1556	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1557	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// Add the state that was copied to v26-v29 at the top of the
	// iteration to the state produced by the 40 steps.
1558	add		v0.2d,v0.2d,v26.2d			// accumulate
1559	add		v1.2d,v1.2d,v27.2d
1560	add		v2.2d,v2.2d,v28.2d
1561	add		v3.2d,v3.2d,v29.2d
1562
	// x2 was decremented at the top of the loop; repeat while it is
	// nonzero.
1563	cbnz		x2,.Loop_hw
1564
1565	st1		{v0.2d-v3.2d},[x0]		// store context
1566
	// Pop the 16-byte frame, reloading x29 only. x30 is not written
	// anywhere in this function, so ret uses the value it had on entry.
1567	ldr		x29,[sp],#16
1568	ret
1569.size	zfs_sha512_block_armv8,.-zfs_sha512_block_armv8
1570#endif
1571