xref: /freebsd/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha512-armv8.S (revision 9f44a47fd07924afc035991af15d84e6585dea4f)
1/*
2 * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     https://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
19 * - modified assembly to fit into OpenZFS
20 */
21
22#if defined(__aarch64__)
23
24.text
25
/*
 * .LK512: the 80 SHA-512 round constants K[0..79], stored as 64-bit
 * words (two per line below), followed by one zero word.
 *
 * Layout matters to the consumers in this file:
 *  - zfs_sha512_block_armv7 walks the table with post-incremented
 *    8-byte loads and treats the zero word as the end-of-table marker
 *    (cbnz on the loaded value); it then rewinds its pointer by
 *    648 bytes = 81 * 8 (80 constants + the terminator).
 *  - zfs_sha512_block_armv8 reads it 16 bytes at a time with ld1.
 * The table is addressed with adr, so it has to stay within adr range
 * of both functions.
 */
26.align	6
27.type	.LK512,%object
28.LK512:
29	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
30	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
31	.quad	0x3956c25bf348b538,0x59f111f1b605d019
32	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
33	.quad	0xd807aa98a3030242,0x12835b0145706fbe
34	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
35	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
36	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
37	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
38	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
39	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
40	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
41	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
42	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
43	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
44	.quad	0x06ca6351e003826f,0x142929670a0e6e70
45	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
46	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
47	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
48	.quad	0x81c2c92e47edaee6,0x92722c851482353b
49	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
50	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
51	.quad	0xd192e819d6ef5218,0xd69906245565a910
52	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
53	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
54	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
55	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
56	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
57	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
58	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
59	.quad	0x90befffa23631e28,0xa4506cebde82bde9
60	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
61	.quad	0xca273eceea26619c,0xd186b8c721c0c207
62	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
63	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
64	.quad	0x113f9804bef90dae,0x1b710b35131c471b
65	.quad	0x28db77f523047d84,0x32caab7b40c72493
66	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
67	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
68	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
69	.quad	0	// terminator: ends the armv7 round loop (cbnz on the loaded K word)
70.size	.LK512,.-.LK512
71
/*
 * zfs_sha512_block_armv7 -- SHA-512 block function written with
 * general-purpose integer instructions only.
 *
 * Arguments, as used by the code:
 *   x0 = pointer to the eight 64-bit state words; they are loaded into
 *        x20..x27 here and written back after every block
 *   x1 = input pointer
 *   x2 = block count; scaled by 128 (lsl #7) and added to x1 to form
 *        the end-of-input pointer
 * NOTE(review): the end-of-input compare only happens after a block has
 * been processed, so a zero block count is not handled here --
 * presumably callers never pass 0; confirm.
 *
 * Frame (128 bytes, addressed through x29):
 *   [x29,#0]        saved x29, x30
 *   [x29,#16..#95]  saved x19..x28
 *   [x29,#96]       x0 (state pointer)
 *   [x29,#104]      end-of-input pointer
 *   [x29,#112]      input pointer, stored inside .Loop
 * Below the frame, 4*8 bytes at [sp,#0..#24] hold spilled message words.
 *
 * Register roles while hashing:
 *   x20..x27        working variables a..h (roles rotate every round)
 *   x30             pointer into .LK512 (link register is saved above)
 *   x19, x28        current K word / Maj helper, swapping each round
 *   x16, x17        Sigma/Ch temporaries
 *   x3..x15,x0..x2  the 16 message words, also reused as scratch
 */
72.globl	zfs_sha512_block_armv7
73.type	zfs_sha512_block_armv7,%function
74.align	6
75zfs_sha512_block_armv7:
76	stp	x29,x30,[sp,#-128]!	// allocate 128-byte frame, save FP/LR
77	add	x29,sp,#0		// x29 = frame base for all later frame accesses
78
79	stp	x19,x20,[sp,#16]	// save x19..x28, all of which are overwritten below
80	stp	x21,x22,[sp,#32]
81	stp	x23,x24,[sp,#48]
82	stp	x25,x26,[sp,#64]
83	stp	x27,x28,[sp,#80]
84	sub	sp,sp,#4*8		// four 8-byte spill slots for message words
85
86	ldp	x20,x21,[x0]				// load context
87	ldp	x22,x23,[x0,#2*8]
88	ldp	x24,x25,[x0,#4*8]
89	add	x2,x1,x2,lsl#7	// end of input
90	ldp	x26,x27,[x0,#6*8]
91	adr	x30,.LK512		// x30 = K pointer
92	stp	x0,x2,[x29,#96]		// keep state pointer and end pointer; x0/x2 get reused
93
/*
 * .Loop: start of one 128-byte block; rounds 0..15.
 *
 * The 16 message words are fetched two at a time with post-incremented
 * ldp into x3..x15, x0, x1, x2 (in that order) and byte-swapped with rev
 * unless __AARCH64EB__ is defined. The digit in the comment after each
 * rev is the round / message word index.
 *
 * The input pointer is stored to [x29,#112] right after the first ldp,
 * i.e. when it already points 16 bytes into the block. The last ldp
 * (words 14/15) has no write-back and overwrites x1, so the pointer is
 * recovered from that slot after the rounds and advanced by 14*8 there.
 *
 * Message-word registers that have not been loaded yet serve as scratch
 * for the Sigma computations. From round 11 on, already loaded words are
 * spilled to [sp,#0..#24] before their register is reused as scratch.
 *
 * Round 15 additionally starts the message schedule: it leaves
 * X[0] + X[9] + sigma0(X[1]) + sigma1(X[14]) in x3 for round 16.
 */
94.Loop:
95	ldp	x3,x4,[x1],#2*8
96	ldr	x19,[x30],#8			// *K++
97	eor	x28,x21,x22				// magic seed
98	str	x1,[x29,#112]		// save input pointer (block start + 16)
99#ifndef	__AARCH64EB__
100	rev	x3,x3			// 0
101#endif
102	ror	x16,x24,#14
103	add	x27,x27,x19			// h+=K[i]
104	eor	x6,x24,x24,ror#23
105	and	x17,x25,x24
106	bic	x19,x26,x24
107	add	x27,x27,x3			// h+=X[i]
108	orr	x17,x17,x19			// Ch(e,f,g)
109	eor	x19,x20,x21			// a^b, b^c in next round
110	eor	x16,x16,x6,ror#18	// Sigma1(e)
111	ror	x6,x20,#28
112	add	x27,x27,x17			// h+=Ch(e,f,g)
113	eor	x17,x20,x20,ror#5
114	add	x27,x27,x16			// h+=Sigma1(e)
115	and	x28,x28,x19			// (b^c)&=(a^b)
116	add	x23,x23,x27			// d+=h
117	eor	x28,x28,x21			// Maj(a,b,c)
118	eor	x17,x6,x17,ror#34	// Sigma0(a)
119	add	x27,x27,x28			// h+=Maj(a,b,c)
120	ldr	x28,[x30],#8		// *K++, x19 in next round
121	//add	x27,x27,x17			// h+=Sigma0(a)
122#ifndef	__AARCH64EB__
123	rev	x4,x4			// 1
124#endif
125	ldp	x5,x6,[x1],#2*8
126	add	x27,x27,x17			// h+=Sigma0(a)
127	ror	x16,x23,#14
128	add	x26,x26,x28			// h+=K[i]
129	eor	x7,x23,x23,ror#23
130	and	x17,x24,x23
131	bic	x28,x25,x23
132	add	x26,x26,x4			// h+=X[i]
133	orr	x17,x17,x28			// Ch(e,f,g)
134	eor	x28,x27,x20			// a^b, b^c in next round
135	eor	x16,x16,x7,ror#18	// Sigma1(e)
136	ror	x7,x27,#28
137	add	x26,x26,x17			// h+=Ch(e,f,g)
138	eor	x17,x27,x27,ror#5
139	add	x26,x26,x16			// h+=Sigma1(e)
140	and	x19,x19,x28			// (b^c)&=(a^b)
141	add	x22,x22,x26			// d+=h
142	eor	x19,x19,x20			// Maj(a,b,c)
143	eor	x17,x7,x17,ror#34	// Sigma0(a)
144	add	x26,x26,x19			// h+=Maj(a,b,c)
145	ldr	x19,[x30],#8		// *K++, x28 in next round
146	//add	x26,x26,x17			// h+=Sigma0(a)
147#ifndef	__AARCH64EB__
148	rev	x5,x5			// 2
149#endif
150	add	x26,x26,x17			// h+=Sigma0(a)
151	ror	x16,x22,#14
152	add	x25,x25,x19			// h+=K[i]
153	eor	x8,x22,x22,ror#23
154	and	x17,x23,x22
155	bic	x19,x24,x22
156	add	x25,x25,x5			// h+=X[i]
157	orr	x17,x17,x19			// Ch(e,f,g)
158	eor	x19,x26,x27			// a^b, b^c in next round
159	eor	x16,x16,x8,ror#18	// Sigma1(e)
160	ror	x8,x26,#28
161	add	x25,x25,x17			// h+=Ch(e,f,g)
162	eor	x17,x26,x26,ror#5
163	add	x25,x25,x16			// h+=Sigma1(e)
164	and	x28,x28,x19			// (b^c)&=(a^b)
165	add	x21,x21,x25			// d+=h
166	eor	x28,x28,x27			// Maj(a,b,c)
167	eor	x17,x8,x17,ror#34	// Sigma0(a)
168	add	x25,x25,x28			// h+=Maj(a,b,c)
169	ldr	x28,[x30],#8		// *K++, x19 in next round
170	//add	x25,x25,x17			// h+=Sigma0(a)
171#ifndef	__AARCH64EB__
172	rev	x6,x6			// 3
173#endif
174	ldp	x7,x8,[x1],#2*8
175	add	x25,x25,x17			// h+=Sigma0(a)
176	ror	x16,x21,#14
177	add	x24,x24,x28			// h+=K[i]
178	eor	x9,x21,x21,ror#23
179	and	x17,x22,x21
180	bic	x28,x23,x21
181	add	x24,x24,x6			// h+=X[i]
182	orr	x17,x17,x28			// Ch(e,f,g)
183	eor	x28,x25,x26			// a^b, b^c in next round
184	eor	x16,x16,x9,ror#18	// Sigma1(e)
185	ror	x9,x25,#28
186	add	x24,x24,x17			// h+=Ch(e,f,g)
187	eor	x17,x25,x25,ror#5
188	add	x24,x24,x16			// h+=Sigma1(e)
189	and	x19,x19,x28			// (b^c)&=(a^b)
190	add	x20,x20,x24			// d+=h
191	eor	x19,x19,x26			// Maj(a,b,c)
192	eor	x17,x9,x17,ror#34	// Sigma0(a)
193	add	x24,x24,x19			// h+=Maj(a,b,c)
194	ldr	x19,[x30],#8		// *K++, x28 in next round
195	//add	x24,x24,x17			// h+=Sigma0(a)
196#ifndef	__AARCH64EB__
197	rev	x7,x7			// 4
198#endif
199	add	x24,x24,x17			// h+=Sigma0(a)
200	ror	x16,x20,#14
201	add	x23,x23,x19			// h+=K[i]
202	eor	x10,x20,x20,ror#23
203	and	x17,x21,x20
204	bic	x19,x22,x20
205	add	x23,x23,x7			// h+=X[i]
206	orr	x17,x17,x19			// Ch(e,f,g)
207	eor	x19,x24,x25			// a^b, b^c in next round
208	eor	x16,x16,x10,ror#18	// Sigma1(e)
209	ror	x10,x24,#28
210	add	x23,x23,x17			// h+=Ch(e,f,g)
211	eor	x17,x24,x24,ror#5
212	add	x23,x23,x16			// h+=Sigma1(e)
213	and	x28,x28,x19			// (b^c)&=(a^b)
214	add	x27,x27,x23			// d+=h
215	eor	x28,x28,x25			// Maj(a,b,c)
216	eor	x17,x10,x17,ror#34	// Sigma0(a)
217	add	x23,x23,x28			// h+=Maj(a,b,c)
218	ldr	x28,[x30],#8		// *K++, x19 in next round
219	//add	x23,x23,x17			// h+=Sigma0(a)
220#ifndef	__AARCH64EB__
221	rev	x8,x8			// 5
222#endif
223	ldp	x9,x10,[x1],#2*8
224	add	x23,x23,x17			// h+=Sigma0(a)
225	ror	x16,x27,#14
226	add	x22,x22,x28			// h+=K[i]
227	eor	x11,x27,x27,ror#23
228	and	x17,x20,x27
229	bic	x28,x21,x27
230	add	x22,x22,x8			// h+=X[i]
231	orr	x17,x17,x28			// Ch(e,f,g)
232	eor	x28,x23,x24			// a^b, b^c in next round
233	eor	x16,x16,x11,ror#18	// Sigma1(e)
234	ror	x11,x23,#28
235	add	x22,x22,x17			// h+=Ch(e,f,g)
236	eor	x17,x23,x23,ror#5
237	add	x22,x22,x16			// h+=Sigma1(e)
238	and	x19,x19,x28			// (b^c)&=(a^b)
239	add	x26,x26,x22			// d+=h
240	eor	x19,x19,x24			// Maj(a,b,c)
241	eor	x17,x11,x17,ror#34	// Sigma0(a)
242	add	x22,x22,x19			// h+=Maj(a,b,c)
243	ldr	x19,[x30],#8		// *K++, x28 in next round
244	//add	x22,x22,x17			// h+=Sigma0(a)
245#ifndef	__AARCH64EB__
246	rev	x9,x9			// 6
247#endif
248	add	x22,x22,x17			// h+=Sigma0(a)
249	ror	x16,x26,#14
250	add	x21,x21,x19			// h+=K[i]
251	eor	x12,x26,x26,ror#23
252	and	x17,x27,x26
253	bic	x19,x20,x26
254	add	x21,x21,x9			// h+=X[i]
255	orr	x17,x17,x19			// Ch(e,f,g)
256	eor	x19,x22,x23			// a^b, b^c in next round
257	eor	x16,x16,x12,ror#18	// Sigma1(e)
258	ror	x12,x22,#28
259	add	x21,x21,x17			// h+=Ch(e,f,g)
260	eor	x17,x22,x22,ror#5
261	add	x21,x21,x16			// h+=Sigma1(e)
262	and	x28,x28,x19			// (b^c)&=(a^b)
263	add	x25,x25,x21			// d+=h
264	eor	x28,x28,x23			// Maj(a,b,c)
265	eor	x17,x12,x17,ror#34	// Sigma0(a)
266	add	x21,x21,x28			// h+=Maj(a,b,c)
267	ldr	x28,[x30],#8		// *K++, x19 in next round
268	//add	x21,x21,x17			// h+=Sigma0(a)
269#ifndef	__AARCH64EB__
270	rev	x10,x10			// 7
271#endif
272	ldp	x11,x12,[x1],#2*8
273	add	x21,x21,x17			// h+=Sigma0(a)
274	ror	x16,x25,#14
275	add	x20,x20,x28			// h+=K[i]
276	eor	x13,x25,x25,ror#23
277	and	x17,x26,x25
278	bic	x28,x27,x25
279	add	x20,x20,x10			// h+=X[i]
280	orr	x17,x17,x28			// Ch(e,f,g)
281	eor	x28,x21,x22			// a^b, b^c in next round
282	eor	x16,x16,x13,ror#18	// Sigma1(e)
283	ror	x13,x21,#28
284	add	x20,x20,x17			// h+=Ch(e,f,g)
285	eor	x17,x21,x21,ror#5
286	add	x20,x20,x16			// h+=Sigma1(e)
287	and	x19,x19,x28			// (b^c)&=(a^b)
288	add	x24,x24,x20			// d+=h
289	eor	x19,x19,x22			// Maj(a,b,c)
290	eor	x17,x13,x17,ror#34	// Sigma0(a)
291	add	x20,x20,x19			// h+=Maj(a,b,c)
292	ldr	x19,[x30],#8		// *K++, x28 in next round
293	//add	x20,x20,x17			// h+=Sigma0(a)
294#ifndef	__AARCH64EB__
295	rev	x11,x11			// 8
296#endif
297	add	x20,x20,x17			// h+=Sigma0(a)
298	ror	x16,x24,#14
299	add	x27,x27,x19			// h+=K[i]
300	eor	x14,x24,x24,ror#23
301	and	x17,x25,x24
302	bic	x19,x26,x24
303	add	x27,x27,x11			// h+=X[i]
304	orr	x17,x17,x19			// Ch(e,f,g)
305	eor	x19,x20,x21			// a^b, b^c in next round
306	eor	x16,x16,x14,ror#18	// Sigma1(e)
307	ror	x14,x20,#28
308	add	x27,x27,x17			// h+=Ch(e,f,g)
309	eor	x17,x20,x20,ror#5
310	add	x27,x27,x16			// h+=Sigma1(e)
311	and	x28,x28,x19			// (b^c)&=(a^b)
312	add	x23,x23,x27			// d+=h
313	eor	x28,x28,x21			// Maj(a,b,c)
314	eor	x17,x14,x17,ror#34	// Sigma0(a)
315	add	x27,x27,x28			// h+=Maj(a,b,c)
316	ldr	x28,[x30],#8		// *K++, x19 in next round
317	//add	x27,x27,x17			// h+=Sigma0(a)
318#ifndef	__AARCH64EB__
319	rev	x12,x12			// 9
320#endif
321	ldp	x13,x14,[x1],#2*8
322	add	x27,x27,x17			// h+=Sigma0(a)
323	ror	x16,x23,#14
324	add	x26,x26,x28			// h+=K[i]
325	eor	x15,x23,x23,ror#23
326	and	x17,x24,x23
327	bic	x28,x25,x23
328	add	x26,x26,x12			// h+=X[i]
329	orr	x17,x17,x28			// Ch(e,f,g)
330	eor	x28,x27,x20			// a^b, b^c in next round
331	eor	x16,x16,x15,ror#18	// Sigma1(e)
332	ror	x15,x27,#28
333	add	x26,x26,x17			// h+=Ch(e,f,g)
334	eor	x17,x27,x27,ror#5
335	add	x26,x26,x16			// h+=Sigma1(e)
336	and	x19,x19,x28			// (b^c)&=(a^b)
337	add	x22,x22,x26			// d+=h
338	eor	x19,x19,x20			// Maj(a,b,c)
339	eor	x17,x15,x17,ror#34	// Sigma0(a)
340	add	x26,x26,x19			// h+=Maj(a,b,c)
341	ldr	x19,[x30],#8		// *K++, x28 in next round
342	//add	x26,x26,x17			// h+=Sigma0(a)
343#ifndef	__AARCH64EB__
344	rev	x13,x13			// 10
345#endif
346	add	x26,x26,x17			// h+=Sigma0(a)
347	ror	x16,x22,#14
348	add	x25,x25,x19			// h+=K[i]
349	eor	x0,x22,x22,ror#23
350	and	x17,x23,x22
351	bic	x19,x24,x22
352	add	x25,x25,x13			// h+=X[i]
353	orr	x17,x17,x19			// Ch(e,f,g)
354	eor	x19,x26,x27			// a^b, b^c in next round
355	eor	x16,x16,x0,ror#18	// Sigma1(e)
356	ror	x0,x26,#28
357	add	x25,x25,x17			// h+=Ch(e,f,g)
358	eor	x17,x26,x26,ror#5
359	add	x25,x25,x16			// h+=Sigma1(e)
360	and	x28,x28,x19			// (b^c)&=(a^b)
361	add	x21,x21,x25			// d+=h
362	eor	x28,x28,x27			// Maj(a,b,c)
363	eor	x17,x0,x17,ror#34	// Sigma0(a)
364	add	x25,x25,x28			// h+=Maj(a,b,c)
365	ldr	x28,[x30],#8		// *K++, x19 in next round
366	//add	x25,x25,x17			// h+=Sigma0(a)
367#ifndef	__AARCH64EB__
368	rev	x14,x14			// 11
369#endif
370	ldp	x15,x0,[x1],#2*8
371	add	x25,x25,x17			// h+=Sigma0(a)
372	str	x6,[sp,#24]		// spill X[3]; x6 is scratch in this round
373	ror	x16,x21,#14
374	add	x24,x24,x28			// h+=K[i]
375	eor	x6,x21,x21,ror#23
376	and	x17,x22,x21
377	bic	x28,x23,x21
378	add	x24,x24,x14			// h+=X[i]
379	orr	x17,x17,x28			// Ch(e,f,g)
380	eor	x28,x25,x26			// a^b, b^c in next round
381	eor	x16,x16,x6,ror#18	// Sigma1(e)
382	ror	x6,x25,#28
383	add	x24,x24,x17			// h+=Ch(e,f,g)
384	eor	x17,x25,x25,ror#5
385	add	x24,x24,x16			// h+=Sigma1(e)
386	and	x19,x19,x28			// (b^c)&=(a^b)
387	add	x20,x20,x24			// d+=h
388	eor	x19,x19,x26			// Maj(a,b,c)
389	eor	x17,x6,x17,ror#34	// Sigma0(a)
390	add	x24,x24,x19			// h+=Maj(a,b,c)
391	ldr	x19,[x30],#8		// *K++, x28 in next round
392	//add	x24,x24,x17			// h+=Sigma0(a)
393#ifndef	__AARCH64EB__
394	rev	x15,x15			// 12
395#endif
396	add	x24,x24,x17			// h+=Sigma0(a)
397	str	x7,[sp,#0]		// spill X[4]; x7 is scratch in this round
398	ror	x16,x20,#14
399	add	x23,x23,x19			// h+=K[i]
400	eor	x7,x20,x20,ror#23
401	and	x17,x21,x20
402	bic	x19,x22,x20
403	add	x23,x23,x15			// h+=X[i]
404	orr	x17,x17,x19			// Ch(e,f,g)
405	eor	x19,x24,x25			// a^b, b^c in next round
406	eor	x16,x16,x7,ror#18	// Sigma1(e)
407	ror	x7,x24,#28
408	add	x23,x23,x17			// h+=Ch(e,f,g)
409	eor	x17,x24,x24,ror#5
410	add	x23,x23,x16			// h+=Sigma1(e)
411	and	x28,x28,x19			// (b^c)&=(a^b)
412	add	x27,x27,x23			// d+=h
413	eor	x28,x28,x25			// Maj(a,b,c)
414	eor	x17,x7,x17,ror#34	// Sigma0(a)
415	add	x23,x23,x28			// h+=Maj(a,b,c)
416	ldr	x28,[x30],#8		// *K++, x19 in next round
417	//add	x23,x23,x17			// h+=Sigma0(a)
418#ifndef	__AARCH64EB__
419	rev	x0,x0			// 13
420#endif
421	ldp	x1,x2,[x1]		// last two words; no write-back, x1 is overwritten
422	add	x23,x23,x17			// h+=Sigma0(a)
423	str	x8,[sp,#8]		// spill X[5]; x8 is scratch in this round
424	ror	x16,x27,#14
425	add	x22,x22,x28			// h+=K[i]
426	eor	x8,x27,x27,ror#23
427	and	x17,x20,x27
428	bic	x28,x21,x27
429	add	x22,x22,x0			// h+=X[i]
430	orr	x17,x17,x28			// Ch(e,f,g)
431	eor	x28,x23,x24			// a^b, b^c in next round
432	eor	x16,x16,x8,ror#18	// Sigma1(e)
433	ror	x8,x23,#28
434	add	x22,x22,x17			// h+=Ch(e,f,g)
435	eor	x17,x23,x23,ror#5
436	add	x22,x22,x16			// h+=Sigma1(e)
437	and	x19,x19,x28			// (b^c)&=(a^b)
438	add	x26,x26,x22			// d+=h
439	eor	x19,x19,x24			// Maj(a,b,c)
440	eor	x17,x8,x17,ror#34	// Sigma0(a)
441	add	x22,x22,x19			// h+=Maj(a,b,c)
442	ldr	x19,[x30],#8		// *K++, x28 in next round
443	//add	x22,x22,x17			// h+=Sigma0(a)
444#ifndef	__AARCH64EB__
445	rev	x1,x1			// 14
446#endif
447	ldr	x6,[sp,#24]		// reload X[3]
448	add	x22,x22,x17			// h+=Sigma0(a)
449	str	x9,[sp,#16]		// spill X[6]; x9 is scratch in this round
450	ror	x16,x26,#14
451	add	x21,x21,x19			// h+=K[i]
452	eor	x9,x26,x26,ror#23
453	and	x17,x27,x26
454	bic	x19,x20,x26
455	add	x21,x21,x1			// h+=X[i]
456	orr	x17,x17,x19			// Ch(e,f,g)
457	eor	x19,x22,x23			// a^b, b^c in next round
458	eor	x16,x16,x9,ror#18	// Sigma1(e)
459	ror	x9,x22,#28
460	add	x21,x21,x17			// h+=Ch(e,f,g)
461	eor	x17,x22,x22,ror#5
462	add	x21,x21,x16			// h+=Sigma1(e)
463	and	x28,x28,x19			// (b^c)&=(a^b)
464	add	x25,x25,x21			// d+=h
465	eor	x28,x28,x23			// Maj(a,b,c)
466	eor	x17,x9,x17,ror#34	// Sigma0(a)
467	add	x21,x21,x28			// h+=Maj(a,b,c)
468	ldr	x28,[x30],#8		// *K++, x19 in next round
469	//add	x21,x21,x17			// h+=Sigma0(a)
470#ifndef	__AARCH64EB__
471	rev	x2,x2			// 15
472#endif
473	ldr	x7,[sp,#0]		// reload X[4]
474	add	x21,x21,x17			// h+=Sigma0(a)
475	str	x10,[sp,#24]		// spill X[7]; x10 is scratch in this round
476	ror	x16,x25,#14
477	add	x20,x20,x28			// h+=K[i]
478	ror	x9,x4,#1
479	and	x17,x26,x25
480	ror	x8,x1,#19
481	bic	x28,x27,x25
482	ror	x10,x21,#28
483	add	x20,x20,x2			// h+=X[i]
484	eor	x16,x16,x25,ror#18
485	eor	x9,x9,x4,ror#8
486	orr	x17,x17,x28			// Ch(e,f,g)
487	eor	x28,x21,x22			// a^b, b^c in next round
488	eor	x16,x16,x25,ror#41	// Sigma1(e)
489	eor	x10,x10,x21,ror#34
490	add	x20,x20,x17			// h+=Ch(e,f,g)
491	and	x19,x19,x28			// (b^c)&=(a^b)
492	eor	x8,x8,x1,ror#61
493	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
494	add	x20,x20,x16			// h+=Sigma1(e)
495	eor	x19,x19,x22			// Maj(a,b,c)
496	eor	x17,x10,x21,ror#39	// Sigma0(a)
497	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
498	add	x3,x3,x12		// X[0] += X[9] (first schedule word for round 16)
499	add	x24,x24,x20			// d+=h
500	add	x20,x20,x19			// h+=Maj(a,b,c)
501	ldr	x19,[x30],#8		// *K++, x28 in next round
502	add	x3,x3,x9		// ... += sigma0(X[1])
503	add	x20,x20,x17			// h+=Sigma0(a)
504	add	x3,x3,x8		// ... += sigma1(X[14])
/*
 * .Loop_16_xx: rounds 16..79, sixteen rounds per pass.
 *
 * Every round consumes the message word prepared by the previous round
 * ("h+=X[i]") and, interleaved with the compression arithmetic, prepares
 * the word for the next round in place:
 *     X[j] += X[j+9] + sigma0(X[j+1]) + sigma1(X[j+14])   (indices mod 16)
 * with sigma0 = ror1 ^ ror8 ^ lsr7 and sigma1 = ror19 ^ ror61 ^ lsr6.
 *
 * Three registers per round are needed as temporaries, so each round
 * reloads one message word from [sp,#0..#24] (first instruction of the
 * round) and spills another one (second instruction); the slots rotate.
 *
 * Entry state: x19 = K word for the first round of the pass, x3 = message
 * word for that round. The pass repeats until the K word fetched for the
 * next round is the zero terminator of .LK512.
 */
505.Loop_16_xx:
506	ldr	x8,[sp,#8]
507	str	x11,[sp,#0]
508	ror	x16,x24,#14
509	add	x27,x27,x19			// h+=K[i]
510	ror	x10,x5,#1
511	and	x17,x25,x24
512	ror	x9,x2,#19
513	bic	x19,x26,x24
514	ror	x11,x20,#28
515	add	x27,x27,x3			// h+=X[i]
516	eor	x16,x16,x24,ror#18
517	eor	x10,x10,x5,ror#8
518	orr	x17,x17,x19			// Ch(e,f,g)
519	eor	x19,x20,x21			// a^b, b^c in next round
520	eor	x16,x16,x24,ror#41	// Sigma1(e)
521	eor	x11,x11,x20,ror#34
522	add	x27,x27,x17			// h+=Ch(e,f,g)
523	and	x28,x28,x19			// (b^c)&=(a^b)
524	eor	x9,x9,x2,ror#61
525	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
526	add	x27,x27,x16			// h+=Sigma1(e)
527	eor	x28,x28,x21			// Maj(a,b,c)
528	eor	x17,x11,x20,ror#39	// Sigma0(a)
529	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
530	add	x4,x4,x13
531	add	x23,x23,x27			// d+=h
532	add	x27,x27,x28			// h+=Maj(a,b,c)
533	ldr	x28,[x30],#8		// *K++, x19 in next round
534	add	x4,x4,x10
535	add	x27,x27,x17			// h+=Sigma0(a)
536	add	x4,x4,x9
537	ldr	x9,[sp,#16]
538	str	x12,[sp,#8]
539	ror	x16,x23,#14
540	add	x26,x26,x28			// h+=K[i]
541	ror	x11,x6,#1
542	and	x17,x24,x23
543	ror	x10,x3,#19
544	bic	x28,x25,x23
545	ror	x12,x27,#28
546	add	x26,x26,x4			// h+=X[i]
547	eor	x16,x16,x23,ror#18
548	eor	x11,x11,x6,ror#8
549	orr	x17,x17,x28			// Ch(e,f,g)
550	eor	x28,x27,x20			// a^b, b^c in next round
551	eor	x16,x16,x23,ror#41	// Sigma1(e)
552	eor	x12,x12,x27,ror#34
553	add	x26,x26,x17			// h+=Ch(e,f,g)
554	and	x19,x19,x28			// (b^c)&=(a^b)
555	eor	x10,x10,x3,ror#61
556	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
557	add	x26,x26,x16			// h+=Sigma1(e)
558	eor	x19,x19,x20			// Maj(a,b,c)
559	eor	x17,x12,x27,ror#39	// Sigma0(a)
560	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
561	add	x5,x5,x14
562	add	x22,x22,x26			// d+=h
563	add	x26,x26,x19			// h+=Maj(a,b,c)
564	ldr	x19,[x30],#8		// *K++, x28 in next round
565	add	x5,x5,x11
566	add	x26,x26,x17			// h+=Sigma0(a)
567	add	x5,x5,x10
568	ldr	x10,[sp,#24]
569	str	x13,[sp,#16]
570	ror	x16,x22,#14
571	add	x25,x25,x19			// h+=K[i]
572	ror	x12,x7,#1
573	and	x17,x23,x22
574	ror	x11,x4,#19
575	bic	x19,x24,x22
576	ror	x13,x26,#28
577	add	x25,x25,x5			// h+=X[i]
578	eor	x16,x16,x22,ror#18
579	eor	x12,x12,x7,ror#8
580	orr	x17,x17,x19			// Ch(e,f,g)
581	eor	x19,x26,x27			// a^b, b^c in next round
582	eor	x16,x16,x22,ror#41	// Sigma1(e)
583	eor	x13,x13,x26,ror#34
584	add	x25,x25,x17			// h+=Ch(e,f,g)
585	and	x28,x28,x19			// (b^c)&=(a^b)
586	eor	x11,x11,x4,ror#61
587	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
588	add	x25,x25,x16			// h+=Sigma1(e)
589	eor	x28,x28,x27			// Maj(a,b,c)
590	eor	x17,x13,x26,ror#39	// Sigma0(a)
591	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
592	add	x6,x6,x15
593	add	x21,x21,x25			// d+=h
594	add	x25,x25,x28			// h+=Maj(a,b,c)
595	ldr	x28,[x30],#8		// *K++, x19 in next round
596	add	x6,x6,x12
597	add	x25,x25,x17			// h+=Sigma0(a)
598	add	x6,x6,x11
599	ldr	x11,[sp,#0]
600	str	x14,[sp,#24]
601	ror	x16,x21,#14
602	add	x24,x24,x28			// h+=K[i]
603	ror	x13,x8,#1
604	and	x17,x22,x21
605	ror	x12,x5,#19
606	bic	x28,x23,x21
607	ror	x14,x25,#28
608	add	x24,x24,x6			// h+=X[i]
609	eor	x16,x16,x21,ror#18
610	eor	x13,x13,x8,ror#8
611	orr	x17,x17,x28			// Ch(e,f,g)
612	eor	x28,x25,x26			// a^b, b^c in next round
613	eor	x16,x16,x21,ror#41	// Sigma1(e)
614	eor	x14,x14,x25,ror#34
615	add	x24,x24,x17			// h+=Ch(e,f,g)
616	and	x19,x19,x28			// (b^c)&=(a^b)
617	eor	x12,x12,x5,ror#61
618	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
619	add	x24,x24,x16			// h+=Sigma1(e)
620	eor	x19,x19,x26			// Maj(a,b,c)
621	eor	x17,x14,x25,ror#39	// Sigma0(a)
622	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
623	add	x7,x7,x0
624	add	x20,x20,x24			// d+=h
625	add	x24,x24,x19			// h+=Maj(a,b,c)
626	ldr	x19,[x30],#8		// *K++, x28 in next round
627	add	x7,x7,x13
628	add	x24,x24,x17			// h+=Sigma0(a)
629	add	x7,x7,x12
630	ldr	x12,[sp,#8]
631	str	x15,[sp,#0]
632	ror	x16,x20,#14
633	add	x23,x23,x19			// h+=K[i]
634	ror	x14,x9,#1
635	and	x17,x21,x20
636	ror	x13,x6,#19
637	bic	x19,x22,x20
638	ror	x15,x24,#28
639	add	x23,x23,x7			// h+=X[i]
640	eor	x16,x16,x20,ror#18
641	eor	x14,x14,x9,ror#8
642	orr	x17,x17,x19			// Ch(e,f,g)
643	eor	x19,x24,x25			// a^b, b^c in next round
644	eor	x16,x16,x20,ror#41	// Sigma1(e)
645	eor	x15,x15,x24,ror#34
646	add	x23,x23,x17			// h+=Ch(e,f,g)
647	and	x28,x28,x19			// (b^c)&=(a^b)
648	eor	x13,x13,x6,ror#61
649	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
650	add	x23,x23,x16			// h+=Sigma1(e)
651	eor	x28,x28,x25			// Maj(a,b,c)
652	eor	x17,x15,x24,ror#39	// Sigma0(a)
653	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
654	add	x8,x8,x1
655	add	x27,x27,x23			// d+=h
656	add	x23,x23,x28			// h+=Maj(a,b,c)
657	ldr	x28,[x30],#8		// *K++, x19 in next round
658	add	x8,x8,x14
659	add	x23,x23,x17			// h+=Sigma0(a)
660	add	x8,x8,x13
661	ldr	x13,[sp,#16]
662	str	x0,[sp,#8]
663	ror	x16,x27,#14
664	add	x22,x22,x28			// h+=K[i]
665	ror	x15,x10,#1
666	and	x17,x20,x27
667	ror	x14,x7,#19
668	bic	x28,x21,x27
669	ror	x0,x23,#28
670	add	x22,x22,x8			// h+=X[i]
671	eor	x16,x16,x27,ror#18
672	eor	x15,x15,x10,ror#8
673	orr	x17,x17,x28			// Ch(e,f,g)
674	eor	x28,x23,x24			// a^b, b^c in next round
675	eor	x16,x16,x27,ror#41	// Sigma1(e)
676	eor	x0,x0,x23,ror#34
677	add	x22,x22,x17			// h+=Ch(e,f,g)
678	and	x19,x19,x28			// (b^c)&=(a^b)
679	eor	x14,x14,x7,ror#61
680	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
681	add	x22,x22,x16			// h+=Sigma1(e)
682	eor	x19,x19,x24			// Maj(a,b,c)
683	eor	x17,x0,x23,ror#39	// Sigma0(a)
684	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
685	add	x9,x9,x2
686	add	x26,x26,x22			// d+=h
687	add	x22,x22,x19			// h+=Maj(a,b,c)
688	ldr	x19,[x30],#8		// *K++, x28 in next round
689	add	x9,x9,x15
690	add	x22,x22,x17			// h+=Sigma0(a)
691	add	x9,x9,x14
692	ldr	x14,[sp,#24]
693	str	x1,[sp,#16]
694	ror	x16,x26,#14
695	add	x21,x21,x19			// h+=K[i]
696	ror	x0,x11,#1
697	and	x17,x27,x26
698	ror	x15,x8,#19
699	bic	x19,x20,x26
700	ror	x1,x22,#28
701	add	x21,x21,x9			// h+=X[i]
702	eor	x16,x16,x26,ror#18
703	eor	x0,x0,x11,ror#8
704	orr	x17,x17,x19			// Ch(e,f,g)
705	eor	x19,x22,x23			// a^b, b^c in next round
706	eor	x16,x16,x26,ror#41	// Sigma1(e)
707	eor	x1,x1,x22,ror#34
708	add	x21,x21,x17			// h+=Ch(e,f,g)
709	and	x28,x28,x19			// (b^c)&=(a^b)
710	eor	x15,x15,x8,ror#61
711	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
712	add	x21,x21,x16			// h+=Sigma1(e)
713	eor	x28,x28,x23			// Maj(a,b,c)
714	eor	x17,x1,x22,ror#39	// Sigma0(a)
715	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
716	add	x10,x10,x3
717	add	x25,x25,x21			// d+=h
718	add	x21,x21,x28			// h+=Maj(a,b,c)
719	ldr	x28,[x30],#8		// *K++, x19 in next round
720	add	x10,x10,x0
721	add	x21,x21,x17			// h+=Sigma0(a)
722	add	x10,x10,x15
723	ldr	x15,[sp,#0]
724	str	x2,[sp,#24]
725	ror	x16,x25,#14
726	add	x20,x20,x28			// h+=K[i]
727	ror	x1,x12,#1
728	and	x17,x26,x25
729	ror	x0,x9,#19
730	bic	x28,x27,x25
731	ror	x2,x21,#28
732	add	x20,x20,x10			// h+=X[i]
733	eor	x16,x16,x25,ror#18
734	eor	x1,x1,x12,ror#8
735	orr	x17,x17,x28			// Ch(e,f,g)
736	eor	x28,x21,x22			// a^b, b^c in next round
737	eor	x16,x16,x25,ror#41	// Sigma1(e)
738	eor	x2,x2,x21,ror#34
739	add	x20,x20,x17			// h+=Ch(e,f,g)
740	and	x19,x19,x28			// (b^c)&=(a^b)
741	eor	x0,x0,x9,ror#61
742	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
743	add	x20,x20,x16			// h+=Sigma1(e)
744	eor	x19,x19,x22			// Maj(a,b,c)
745	eor	x17,x2,x21,ror#39	// Sigma0(a)
746	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
747	add	x11,x11,x4
748	add	x24,x24,x20			// d+=h
749	add	x20,x20,x19			// h+=Maj(a,b,c)
750	ldr	x19,[x30],#8		// *K++, x28 in next round
751	add	x11,x11,x1
752	add	x20,x20,x17			// h+=Sigma0(a)
753	add	x11,x11,x0
754	ldr	x0,[sp,#8]
755	str	x3,[sp,#0]
756	ror	x16,x24,#14
757	add	x27,x27,x19			// h+=K[i]
758	ror	x2,x13,#1
759	and	x17,x25,x24
760	ror	x1,x10,#19
761	bic	x19,x26,x24
762	ror	x3,x20,#28
763	add	x27,x27,x11			// h+=X[i]
764	eor	x16,x16,x24,ror#18
765	eor	x2,x2,x13,ror#8
766	orr	x17,x17,x19			// Ch(e,f,g)
767	eor	x19,x20,x21			// a^b, b^c in next round
768	eor	x16,x16,x24,ror#41	// Sigma1(e)
769	eor	x3,x3,x20,ror#34
770	add	x27,x27,x17			// h+=Ch(e,f,g)
771	and	x28,x28,x19			// (b^c)&=(a^b)
772	eor	x1,x1,x10,ror#61
773	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
774	add	x27,x27,x16			// h+=Sigma1(e)
775	eor	x28,x28,x21			// Maj(a,b,c)
776	eor	x17,x3,x20,ror#39	// Sigma0(a)
777	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
778	add	x12,x12,x5
779	add	x23,x23,x27			// d+=h
780	add	x27,x27,x28			// h+=Maj(a,b,c)
781	ldr	x28,[x30],#8		// *K++, x19 in next round
782	add	x12,x12,x2
783	add	x27,x27,x17			// h+=Sigma0(a)
784	add	x12,x12,x1
785	ldr	x1,[sp,#16]
786	str	x4,[sp,#8]
787	ror	x16,x23,#14
788	add	x26,x26,x28			// h+=K[i]
789	ror	x3,x14,#1
790	and	x17,x24,x23
791	ror	x2,x11,#19
792	bic	x28,x25,x23
793	ror	x4,x27,#28
794	add	x26,x26,x12			// h+=X[i]
795	eor	x16,x16,x23,ror#18
796	eor	x3,x3,x14,ror#8
797	orr	x17,x17,x28			// Ch(e,f,g)
798	eor	x28,x27,x20			// a^b, b^c in next round
799	eor	x16,x16,x23,ror#41	// Sigma1(e)
800	eor	x4,x4,x27,ror#34
801	add	x26,x26,x17			// h+=Ch(e,f,g)
802	and	x19,x19,x28			// (b^c)&=(a^b)
803	eor	x2,x2,x11,ror#61
804	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
805	add	x26,x26,x16			// h+=Sigma1(e)
806	eor	x19,x19,x20			// Maj(a,b,c)
807	eor	x17,x4,x27,ror#39	// Sigma0(a)
808	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
809	add	x13,x13,x6
810	add	x22,x22,x26			// d+=h
811	add	x26,x26,x19			// h+=Maj(a,b,c)
812	ldr	x19,[x30],#8		// *K++, x28 in next round
813	add	x13,x13,x3
814	add	x26,x26,x17			// h+=Sigma0(a)
815	add	x13,x13,x2
816	ldr	x2,[sp,#24]
817	str	x5,[sp,#16]
818	ror	x16,x22,#14
819	add	x25,x25,x19			// h+=K[i]
820	ror	x4,x15,#1
821	and	x17,x23,x22
822	ror	x3,x12,#19
823	bic	x19,x24,x22
824	ror	x5,x26,#28
825	add	x25,x25,x13			// h+=X[i]
826	eor	x16,x16,x22,ror#18
827	eor	x4,x4,x15,ror#8
828	orr	x17,x17,x19			// Ch(e,f,g)
829	eor	x19,x26,x27			// a^b, b^c in next round
830	eor	x16,x16,x22,ror#41	// Sigma1(e)
831	eor	x5,x5,x26,ror#34
832	add	x25,x25,x17			// h+=Ch(e,f,g)
833	and	x28,x28,x19			// (b^c)&=(a^b)
834	eor	x3,x3,x12,ror#61
835	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
836	add	x25,x25,x16			// h+=Sigma1(e)
837	eor	x28,x28,x27			// Maj(a,b,c)
838	eor	x17,x5,x26,ror#39	// Sigma0(a)
839	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
840	add	x14,x14,x7
841	add	x21,x21,x25			// d+=h
842	add	x25,x25,x28			// h+=Maj(a,b,c)
843	ldr	x28,[x30],#8		// *K++, x19 in next round
844	add	x14,x14,x4
845	add	x25,x25,x17			// h+=Sigma0(a)
846	add	x14,x14,x3
847	ldr	x3,[sp,#0]
848	str	x6,[sp,#24]
849	ror	x16,x21,#14
850	add	x24,x24,x28			// h+=K[i]
851	ror	x5,x0,#1
852	and	x17,x22,x21
853	ror	x4,x13,#19
854	bic	x28,x23,x21
855	ror	x6,x25,#28
856	add	x24,x24,x14			// h+=X[i]
857	eor	x16,x16,x21,ror#18
858	eor	x5,x5,x0,ror#8
859	orr	x17,x17,x28			// Ch(e,f,g)
860	eor	x28,x25,x26			// a^b, b^c in next round
861	eor	x16,x16,x21,ror#41	// Sigma1(e)
862	eor	x6,x6,x25,ror#34
863	add	x24,x24,x17			// h+=Ch(e,f,g)
864	and	x19,x19,x28			// (b^c)&=(a^b)
865	eor	x4,x4,x13,ror#61
866	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
867	add	x24,x24,x16			// h+=Sigma1(e)
868	eor	x19,x19,x26			// Maj(a,b,c)
869	eor	x17,x6,x25,ror#39	// Sigma0(a)
870	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
871	add	x15,x15,x8
872	add	x20,x20,x24			// d+=h
873	add	x24,x24,x19			// h+=Maj(a,b,c)
874	ldr	x19,[x30],#8		// *K++, x28 in next round
875	add	x15,x15,x5
876	add	x24,x24,x17			// h+=Sigma0(a)
877	add	x15,x15,x4
878	ldr	x4,[sp,#8]
879	str	x7,[sp,#0]
880	ror	x16,x20,#14
881	add	x23,x23,x19			// h+=K[i]
882	ror	x6,x1,#1
883	and	x17,x21,x20
884	ror	x5,x14,#19
885	bic	x19,x22,x20
886	ror	x7,x24,#28
887	add	x23,x23,x15			// h+=X[i]
888	eor	x16,x16,x20,ror#18
889	eor	x6,x6,x1,ror#8
890	orr	x17,x17,x19			// Ch(e,f,g)
891	eor	x19,x24,x25			// a^b, b^c in next round
892	eor	x16,x16,x20,ror#41	// Sigma1(e)
893	eor	x7,x7,x24,ror#34
894	add	x23,x23,x17			// h+=Ch(e,f,g)
895	and	x28,x28,x19			// (b^c)&=(a^b)
896	eor	x5,x5,x14,ror#61
897	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
898	add	x23,x23,x16			// h+=Sigma1(e)
899	eor	x28,x28,x25			// Maj(a,b,c)
900	eor	x17,x7,x24,ror#39	// Sigma0(a)
901	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
902	add	x0,x0,x9
903	add	x27,x27,x23			// d+=h
904	add	x23,x23,x28			// h+=Maj(a,b,c)
905	ldr	x28,[x30],#8		// *K++, x19 in next round
906	add	x0,x0,x6
907	add	x23,x23,x17			// h+=Sigma0(a)
908	add	x0,x0,x5
909	ldr	x5,[sp,#16]
910	str	x8,[sp,#8]
911	ror	x16,x27,#14
912	add	x22,x22,x28			// h+=K[i]
913	ror	x7,x2,#1
914	and	x17,x20,x27
915	ror	x6,x15,#19
916	bic	x28,x21,x27
917	ror	x8,x23,#28
918	add	x22,x22,x0			// h+=X[i]
919	eor	x16,x16,x27,ror#18
920	eor	x7,x7,x2,ror#8
921	orr	x17,x17,x28			// Ch(e,f,g)
922	eor	x28,x23,x24			// a^b, b^c in next round
923	eor	x16,x16,x27,ror#41	// Sigma1(e)
924	eor	x8,x8,x23,ror#34
925	add	x22,x22,x17			// h+=Ch(e,f,g)
926	and	x19,x19,x28			// (b^c)&=(a^b)
927	eor	x6,x6,x15,ror#61
928	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
929	add	x22,x22,x16			// h+=Sigma1(e)
930	eor	x19,x19,x24			// Maj(a,b,c)
931	eor	x17,x8,x23,ror#39	// Sigma0(a)
932	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
933	add	x1,x1,x10
934	add	x26,x26,x22			// d+=h
935	add	x22,x22,x19			// h+=Maj(a,b,c)
936	ldr	x19,[x30],#8		// *K++, x28 in next round
937	add	x1,x1,x7
938	add	x22,x22,x17			// h+=Sigma0(a)
939	add	x1,x1,x6
940	ldr	x6,[sp,#24]
941	str	x9,[sp,#16]
942	ror	x16,x26,#14
943	add	x21,x21,x19			// h+=K[i]
944	ror	x8,x3,#1
945	and	x17,x27,x26
946	ror	x7,x0,#19
947	bic	x19,x20,x26
948	ror	x9,x22,#28
949	add	x21,x21,x1			// h+=X[i]
950	eor	x16,x16,x26,ror#18
951	eor	x8,x8,x3,ror#8
952	orr	x17,x17,x19			// Ch(e,f,g)
953	eor	x19,x22,x23			// a^b, b^c in next round
954	eor	x16,x16,x26,ror#41	// Sigma1(e)
955	eor	x9,x9,x22,ror#34
956	add	x21,x21,x17			// h+=Ch(e,f,g)
957	and	x28,x28,x19			// (b^c)&=(a^b)
958	eor	x7,x7,x0,ror#61
959	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
960	add	x21,x21,x16			// h+=Sigma1(e)
961	eor	x28,x28,x23			// Maj(a,b,c)
962	eor	x17,x9,x22,ror#39	// Sigma0(a)
963	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
964	add	x2,x2,x11
965	add	x25,x25,x21			// d+=h
966	add	x21,x21,x28			// h+=Maj(a,b,c)
967	ldr	x28,[x30],#8		// *K++, x19 in next round
968	add	x2,x2,x8
969	add	x21,x21,x17			// h+=Sigma0(a)
970	add	x2,x2,x7
971	ldr	x7,[sp,#0]
972	str	x10,[sp,#24]
973	ror	x16,x25,#14
974	add	x20,x20,x28			// h+=K[i]
975	ror	x9,x4,#1
976	and	x17,x26,x25
977	ror	x8,x1,#19
978	bic	x28,x27,x25
979	ror	x10,x21,#28
980	add	x20,x20,x2			// h+=X[i]
981	eor	x16,x16,x25,ror#18
982	eor	x9,x9,x4,ror#8
983	orr	x17,x17,x28			// Ch(e,f,g)
984	eor	x28,x21,x22			// a^b, b^c in next round
985	eor	x16,x16,x25,ror#41	// Sigma1(e)
986	eor	x10,x10,x21,ror#34
987	add	x20,x20,x17			// h+=Ch(e,f,g)
988	and	x19,x19,x28			// (b^c)&=(a^b)
989	eor	x8,x8,x1,ror#61
990	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
991	add	x20,x20,x16			// h+=Sigma1(e)
992	eor	x19,x19,x22			// Maj(a,b,c)
993	eor	x17,x10,x21,ror#39	// Sigma0(a)
994	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
995	add	x3,x3,x12
996	add	x24,x24,x20			// d+=h
997	add	x20,x20,x19			// h+=Maj(a,b,c)
998	ldr	x19,[x30],#8		// *K++, x28 in next round
999	add	x3,x3,x9
1000	add	x20,x20,x17			// h+=Sigma0(a)
1001	add	x3,x3,x8
1002	cbnz	x19,.Loop_16_xx		// zero K word = .LK512 terminator: all rounds done
1003
/*
 * End of one block: fold the working variables x20..x27 back into the
 * state at [x0], then either start the next block or unwind the frame.
 */
1004	ldp	x0,x2,[x29,#96]		// reload state pointer and end-of-input pointer
1005	ldr	x1,[x29,#112]		// input pointer saved in .Loop (block start + 16)
1006	sub	x30,x30,#648		// rewind K pointer: 81 loads * 8 bytes (80 constants + terminator)
1007
1008	ldp	x3,x4,[x0]		// previous state words, added to a..h below
1009	ldp	x5,x6,[x0,#2*8]
1010	add	x1,x1,#14*8			// advance input pointer
1011	ldp	x7,x8,[x0,#4*8]
1012	add	x20,x20,x3
1013	ldp	x9,x10,[x0,#6*8]
1014	add	x21,x21,x4
1015	add	x22,x22,x5
1016	add	x23,x23,x6
1017	stp	x20,x21,[x0]
1018	add	x24,x24,x7
1019	add	x25,x25,x8
1020	stp	x22,x23,[x0,#2*8]
1021	add	x26,x26,x9
1022	add	x27,x27,x10
1023	cmp	x1,x2			// reached end of input?
1024	stp	x24,x25,[x0,#4*8]
1025	stp	x26,x27,[x0,#6*8]
1026	b.ne	.Loop			// no: x20..x27 already hold the updated state
1027
1028	ldp	x19,x20,[x29,#16]	// restore the registers saved in the prologue
1029	add	sp,sp,#4*8		// release the message-word spill slots
1030	ldp	x21,x22,[x29,#32]
1031	ldp	x23,x24,[x29,#48]
1032	ldp	x25,x26,[x29,#64]
1033	ldp	x27,x28,[x29,#80]
1034	ldp	x29,x30,[sp],#128	// restore FP/LR and pop the 128-byte frame
1035	ret
1036.size	zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
1037
1038
/*
 * zfs_sha512_block_armv8: SHA-512 block function built on the AArch64
 * SHA-512 instructions (sha512h, sha512h2, sha512su0, sha512su1). These
 * are emitted as raw .inst words with the mnemonic kept in the trailing
 * comment.
 * NOTE(review): callers presumably verify CPU support for these
 * instructions before dispatching here -- confirm against the caller.
 *
 * Register use, as read from the code below:
 *   x0      - pointer to the 64-byte hash state; loaded into v0-v3 on
 *             entry and stored back from v0-v3 on exit
 *   x1      - input pointer; 128 bytes are consumed per loop iteration
 *   x2      - number of 128-byte blocks; must be non-zero, because the
 *             first block is loaded unconditionally and the count is
 *             decremented before it is tested
 *   x3      - cursor into .LK512, rewound by 80*8 bytes once per block
 *   x4      - scratch (x1 - 128) for the conditional input rewind
 *   v0-v4   - working state; the roles rotate from step to step
 *   v5-v7   - scratch operands assembled with ext
 *   v16-v23 - message schedule (8 vectors of two 64-bit words)
 *   v24,v25 - round constants with the message words added in
 *   v26-v29 - copy of the state taken at the start of the block
 *
 * Each loop iteration runs 40 steps, and every step consumes one 16-byte
 * pair of .LK512 constants. The first 32 steps also update the message
 * schedule (sha512su0/sha512su1); the last 8 steps instead load and
 * byte-swap the input for the next iteration.
 */
1039.globl	zfs_sha512_block_armv8
1040.type	zfs_sha512_block_armv8,%function
1041.align	6
1042zfs_sha512_block_armv8:
1043.Lv8_entry:
1044	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
	// Push a 16-byte frame record (x29, x30) and point x29 at it.
1045	stp		x29,x30,[sp,#-16]!
1046	add		x29,sp,#0
1047
	// Load the first 128-byte block; x1 is left pointing just past it.
1048	ld1		{v16.16b-v19.16b},[x1],#64	// load input
1049	ld1		{v20.16b-v23.16b},[x1],#64
1050
1051	ld1		{v0.2d-v3.2d},[x0]		// load context
1052	adr		x3,.LK512
1053
	// Reverse the byte order inside every 64-bit message word.
1054	rev64		v16.16b,v16.16b
1055	rev64		v17.16b,v17.16b
1056	rev64		v18.16b,v18.16b
1057	rev64		v19.16b,v19.16b
1058	rev64		v20.16b,v20.16b
1059	rev64		v21.16b,v21.16b
1060	rev64		v22.16b,v22.16b
1061	rev64		v23.16b,v23.16b
	// Branch to the loop head, over any padding the .align below inserts.
1062	b		.Loop_hw
1063
1064.align	4
1065.Loop_hw:
	// One iteration per 128-byte block. On entry v16-v23 hold the
	// byte-swapped block and x1 points just past it.
	// subs sets the flags consumed by csel below; nothing in between
	// modifies them. x4 is the start address of the current block.
1066	ld1		{v24.2d},[x3],#16
1067	subs		x2,x2,#1
1068	sub		x4,x1,#128
	// Save the incoming state (orr Vd,Vn,Vn is a register copy) so it can
	// be added back after the last step.
1069	orr		v26.16b,v0.16b,v0.16b			// offload
1070	orr		v27.16b,v1.16b,v1.16b
1071	orr		v28.16b,v2.16b,v2.16b
1072	orr		v29.16b,v3.16b,v3.16b
	// If this was the last block (x2 is now zero), move x1 back to the
	// start of the current block so the "load next input" loads in the
	// final steps re-read this block instead of advancing past it.
1073	csel		x1,x1,x4,ne			// conditional rewind
	// Steps 1-32: each step adds a constant pair to a schedule vector,
	// applies sha512h/sha512h2 to the state, and refreshes one schedule
	// vector with sha512su0/sha512su1. It also fetches the constant pair
	// for the following step.
1074	add		v24.2d,v24.2d,v16.2d
1075	ld1		{v25.2d},[x3],#16
1076	ext		v24.16b,v24.16b,v24.16b,#8
1077	ext		v5.16b,v2.16b,v3.16b,#8
1078	ext		v6.16b,v1.16b,v2.16b,#8
1079	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1080	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1081	 ext		v7.16b,v20.16b,v21.16b,#8
1082	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1083	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1084	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1085	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1086	add		v25.2d,v25.2d,v17.2d
1087	ld1		{v24.2d},[x3],#16
1088	ext		v25.16b,v25.16b,v25.16b,#8
1089	ext		v5.16b,v4.16b,v2.16b,#8
1090	ext		v6.16b,v0.16b,v4.16b,#8
1091	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1092	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1093	 ext		v7.16b,v21.16b,v22.16b,#8
1094	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1095	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1096	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1097	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1098	add		v24.2d,v24.2d,v18.2d
1099	ld1		{v25.2d},[x3],#16
1100	ext		v24.16b,v24.16b,v24.16b,#8
1101	ext		v5.16b,v1.16b,v4.16b,#8
1102	ext		v6.16b,v3.16b,v1.16b,#8
1103	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1104	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1105	 ext		v7.16b,v22.16b,v23.16b,#8
1106	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1107	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1108	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1109	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1110	add		v25.2d,v25.2d,v19.2d
1111	ld1		{v24.2d},[x3],#16
1112	ext		v25.16b,v25.16b,v25.16b,#8
1113	ext		v5.16b,v0.16b,v1.16b,#8
1114	ext		v6.16b,v2.16b,v0.16b,#8
1115	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1116	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1117	 ext		v7.16b,v23.16b,v16.16b,#8
1118	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1119	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1120	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1121	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1122	add		v24.2d,v24.2d,v20.2d
1123	ld1		{v25.2d},[x3],#16
1124	ext		v24.16b,v24.16b,v24.16b,#8
1125	ext		v5.16b,v3.16b,v0.16b,#8
1126	ext		v6.16b,v4.16b,v3.16b,#8
1127	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1128	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1129	 ext		v7.16b,v16.16b,v17.16b,#8
1130	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1131	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1132	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1133	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1134	add		v25.2d,v25.2d,v21.2d
1135	ld1		{v24.2d},[x3],#16
1136	ext		v25.16b,v25.16b,v25.16b,#8
1137	ext		v5.16b,v2.16b,v3.16b,#8
1138	ext		v6.16b,v1.16b,v2.16b,#8
1139	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1140	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1141	 ext		v7.16b,v17.16b,v18.16b,#8
1142	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1143	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1144	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1145	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1146	add		v24.2d,v24.2d,v22.2d
1147	ld1		{v25.2d},[x3],#16
1148	ext		v24.16b,v24.16b,v24.16b,#8
1149	ext		v5.16b,v4.16b,v2.16b,#8
1150	ext		v6.16b,v0.16b,v4.16b,#8
1151	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1152	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1153	 ext		v7.16b,v18.16b,v19.16b,#8
1154	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1155	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1156	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1157	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1158	add		v25.2d,v25.2d,v23.2d
1159	ld1		{v24.2d},[x3],#16
1160	ext		v25.16b,v25.16b,v25.16b,#8
1161	ext		v5.16b,v1.16b,v4.16b,#8
1162	ext		v6.16b,v3.16b,v1.16b,#8
1163	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1164	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1165	 ext		v7.16b,v19.16b,v20.16b,#8
1166	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1167	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1168	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1169	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1170	add		v24.2d,v24.2d,v16.2d
1171	ld1		{v25.2d},[x3],#16
1172	ext		v24.16b,v24.16b,v24.16b,#8
1173	ext		v5.16b,v0.16b,v1.16b,#8
1174	ext		v6.16b,v2.16b,v0.16b,#8
1175	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1176	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1177	 ext		v7.16b,v20.16b,v21.16b,#8
1178	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1179	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1180	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1181	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1182	add		v25.2d,v25.2d,v17.2d
1183	ld1		{v24.2d},[x3],#16
1184	ext		v25.16b,v25.16b,v25.16b,#8
1185	ext		v5.16b,v3.16b,v0.16b,#8
1186	ext		v6.16b,v4.16b,v3.16b,#8
1187	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1188	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1189	 ext		v7.16b,v21.16b,v22.16b,#8
1190	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1191	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1192	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1193	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1194	add		v24.2d,v24.2d,v18.2d
1195	ld1		{v25.2d},[x3],#16
1196	ext		v24.16b,v24.16b,v24.16b,#8
1197	ext		v5.16b,v2.16b,v3.16b,#8
1198	ext		v6.16b,v1.16b,v2.16b,#8
1199	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1200	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1201	 ext		v7.16b,v22.16b,v23.16b,#8
1202	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1203	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1204	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1205	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1206	add		v25.2d,v25.2d,v19.2d
1207	ld1		{v24.2d},[x3],#16
1208	ext		v25.16b,v25.16b,v25.16b,#8
1209	ext		v5.16b,v4.16b,v2.16b,#8
1210	ext		v6.16b,v0.16b,v4.16b,#8
1211	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1212	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1213	 ext		v7.16b,v23.16b,v16.16b,#8
1214	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1215	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1216	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1217	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1218	add		v24.2d,v24.2d,v20.2d
1219	ld1		{v25.2d},[x3],#16
1220	ext		v24.16b,v24.16b,v24.16b,#8
1221	ext		v5.16b,v1.16b,v4.16b,#8
1222	ext		v6.16b,v3.16b,v1.16b,#8
1223	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1224	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1225	 ext		v7.16b,v16.16b,v17.16b,#8
1226	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1227	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1228	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1229	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1230	add		v25.2d,v25.2d,v21.2d
1231	ld1		{v24.2d},[x3],#16
1232	ext		v25.16b,v25.16b,v25.16b,#8
1233	ext		v5.16b,v0.16b,v1.16b,#8
1234	ext		v6.16b,v2.16b,v0.16b,#8
1235	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1236	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1237	 ext		v7.16b,v17.16b,v18.16b,#8
1238	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1239	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1240	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1241	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1242	add		v24.2d,v24.2d,v22.2d
1243	ld1		{v25.2d},[x3],#16
1244	ext		v24.16b,v24.16b,v24.16b,#8
1245	ext		v5.16b,v3.16b,v0.16b,#8
1246	ext		v6.16b,v4.16b,v3.16b,#8
1247	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1248	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1249	 ext		v7.16b,v18.16b,v19.16b,#8
1250	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1251	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1252	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1253	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1254	add		v25.2d,v25.2d,v23.2d
1255	ld1		{v24.2d},[x3],#16
1256	ext		v25.16b,v25.16b,v25.16b,#8
1257	ext		v5.16b,v2.16b,v3.16b,#8
1258	ext		v6.16b,v1.16b,v2.16b,#8
1259	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1260	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1261	 ext		v7.16b,v19.16b,v20.16b,#8
1262	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1263	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1264	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1265	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1266	add		v24.2d,v24.2d,v16.2d
1267	ld1		{v25.2d},[x3],#16
1268	ext		v24.16b,v24.16b,v24.16b,#8
1269	ext		v5.16b,v4.16b,v2.16b,#8
1270	ext		v6.16b,v0.16b,v4.16b,#8
1271	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1272	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1273	 ext		v7.16b,v20.16b,v21.16b,#8
1274	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1275	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1276	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1277	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1278	add		v25.2d,v25.2d,v17.2d
1279	ld1		{v24.2d},[x3],#16
1280	ext		v25.16b,v25.16b,v25.16b,#8
1281	ext		v5.16b,v1.16b,v4.16b,#8
1282	ext		v6.16b,v3.16b,v1.16b,#8
1283	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1284	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1285	 ext		v7.16b,v21.16b,v22.16b,#8
1286	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1287	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1288	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1289	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1290	add		v24.2d,v24.2d,v18.2d
1291	ld1		{v25.2d},[x3],#16
1292	ext		v24.16b,v24.16b,v24.16b,#8
1293	ext		v5.16b,v0.16b,v1.16b,#8
1294	ext		v6.16b,v2.16b,v0.16b,#8
1295	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1296	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1297	 ext		v7.16b,v22.16b,v23.16b,#8
1298	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1299	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1300	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1301	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1302	add		v25.2d,v25.2d,v19.2d
1303	ld1		{v24.2d},[x3],#16
1304	ext		v25.16b,v25.16b,v25.16b,#8
1305	ext		v5.16b,v3.16b,v0.16b,#8
1306	ext		v6.16b,v4.16b,v3.16b,#8
1307	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1308	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1309	 ext		v7.16b,v23.16b,v16.16b,#8
1310	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1311	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1312	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1313	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1314	add		v24.2d,v24.2d,v20.2d
1315	ld1		{v25.2d},[x3],#16
1316	ext		v24.16b,v24.16b,v24.16b,#8
1317	ext		v5.16b,v2.16b,v3.16b,#8
1318	ext		v6.16b,v1.16b,v2.16b,#8
1319	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1320	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1321	 ext		v7.16b,v16.16b,v17.16b,#8
1322	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1323	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1324	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1325	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1326	add		v25.2d,v25.2d,v21.2d
1327	ld1		{v24.2d},[x3],#16
1328	ext		v25.16b,v25.16b,v25.16b,#8
1329	ext		v5.16b,v4.16b,v2.16b,#8
1330	ext		v6.16b,v0.16b,v4.16b,#8
1331	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1332	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1333	 ext		v7.16b,v17.16b,v18.16b,#8
1334	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1335	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1336	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1337	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1338	add		v24.2d,v24.2d,v22.2d
1339	ld1		{v25.2d},[x3],#16
1340	ext		v24.16b,v24.16b,v24.16b,#8
1341	ext		v5.16b,v1.16b,v4.16b,#8
1342	ext		v6.16b,v3.16b,v1.16b,#8
1343	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1344	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1345	 ext		v7.16b,v18.16b,v19.16b,#8
1346	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1347	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1348	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1349	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1350	add		v25.2d,v25.2d,v23.2d
1351	ld1		{v24.2d},[x3],#16
1352	ext		v25.16b,v25.16b,v25.16b,#8
1353	ext		v5.16b,v0.16b,v1.16b,#8
1354	ext		v6.16b,v2.16b,v0.16b,#8
1355	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1356	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1357	 ext		v7.16b,v19.16b,v20.16b,#8
1358	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1359	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1360	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1361	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1362	add		v24.2d,v24.2d,v16.2d
1363	ld1		{v25.2d},[x3],#16
1364	ext		v24.16b,v24.16b,v24.16b,#8
1365	ext		v5.16b,v3.16b,v0.16b,#8
1366	ext		v6.16b,v4.16b,v3.16b,#8
1367	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1368	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1369	 ext		v7.16b,v20.16b,v21.16b,#8
1370	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1371	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1372	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1373	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1374	add		v25.2d,v25.2d,v17.2d
1375	ld1		{v24.2d},[x3],#16
1376	ext		v25.16b,v25.16b,v25.16b,#8
1377	ext		v5.16b,v2.16b,v3.16b,#8
1378	ext		v6.16b,v1.16b,v2.16b,#8
1379	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1380	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1381	 ext		v7.16b,v21.16b,v22.16b,#8
1382	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1383	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1384	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1385	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1386	add		v24.2d,v24.2d,v18.2d
1387	ld1		{v25.2d},[x3],#16
1388	ext		v24.16b,v24.16b,v24.16b,#8
1389	ext		v5.16b,v4.16b,v2.16b,#8
1390	ext		v6.16b,v0.16b,v4.16b,#8
1391	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1392	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1393	 ext		v7.16b,v22.16b,v23.16b,#8
1394	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1395	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1396	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1397	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1398	add		v25.2d,v25.2d,v19.2d
1399	ld1		{v24.2d},[x3],#16
1400	ext		v25.16b,v25.16b,v25.16b,#8
1401	ext		v5.16b,v1.16b,v4.16b,#8
1402	ext		v6.16b,v3.16b,v1.16b,#8
1403	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1404	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1405	 ext		v7.16b,v23.16b,v16.16b,#8
1406	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1407	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1408	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1409	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1410	add		v24.2d,v24.2d,v20.2d
1411	ld1		{v25.2d},[x3],#16
1412	ext		v24.16b,v24.16b,v24.16b,#8
1413	ext		v5.16b,v0.16b,v1.16b,#8
1414	ext		v6.16b,v2.16b,v0.16b,#8
1415	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1416	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1417	 ext		v7.16b,v16.16b,v17.16b,#8
1418	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1419	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1420	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1421	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1422	add		v25.2d,v25.2d,v21.2d
1423	ld1		{v24.2d},[x3],#16
1424	ext		v25.16b,v25.16b,v25.16b,#8
1425	ext		v5.16b,v3.16b,v0.16b,#8
1426	ext		v6.16b,v4.16b,v3.16b,#8
1427	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1428	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1429	 ext		v7.16b,v17.16b,v18.16b,#8
1430	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1431	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1432	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1433	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1434	add		v24.2d,v24.2d,v22.2d
1435	ld1		{v25.2d},[x3],#16
1436	ext		v24.16b,v24.16b,v24.16b,#8
1437	ext		v5.16b,v2.16b,v3.16b,#8
1438	ext		v6.16b,v1.16b,v2.16b,#8
1439	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1440	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1441	 ext		v7.16b,v18.16b,v19.16b,#8
1442	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1443	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1444	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1445	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1446	add		v25.2d,v25.2d,v23.2d
1447	ld1		{v24.2d},[x3],#16
1448	ext		v25.16b,v25.16b,v25.16b,#8
1449	ext		v5.16b,v4.16b,v2.16b,#8
1450	ext		v6.16b,v0.16b,v4.16b,#8
1451	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1452	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1453	 ext		v7.16b,v19.16b,v20.16b,#8
1454	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1455	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1456	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1457	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// Steps 33-40: no further schedule updates. Each schedule vector is
	// consumed one last time and then overwritten with 16 bytes read from
	// x1, which are byte-swapped for use in the next iteration.
1458	ld1		{v25.2d},[x3],#16
1459	add		v24.2d,v24.2d,v16.2d
1460	 ld1		{v16.16b},[x1],#16		// load next input
1461	ext		v24.16b,v24.16b,v24.16b,#8
1462	ext		v5.16b,v1.16b,v4.16b,#8
1463	ext		v6.16b,v3.16b,v1.16b,#8
1464	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1465	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1466	 rev64		v16.16b,v16.16b
1467	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1468	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1469	ld1		{v24.2d},[x3],#16
1470	add		v25.2d,v25.2d,v17.2d
1471	 ld1		{v17.16b},[x1],#16		// load next input
1472	ext		v25.16b,v25.16b,v25.16b,#8
1473	ext		v5.16b,v0.16b,v1.16b,#8
1474	ext		v6.16b,v2.16b,v0.16b,#8
1475	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1476	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1477	 rev64		v17.16b,v17.16b
1478	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1479	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1480	ld1		{v25.2d},[x3],#16
1481	add		v24.2d,v24.2d,v18.2d
1482	 ld1		{v18.16b},[x1],#16		// load next input
1483	ext		v24.16b,v24.16b,v24.16b,#8
1484	ext		v5.16b,v3.16b,v0.16b,#8
1485	ext		v6.16b,v4.16b,v3.16b,#8
1486	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1487	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1488	 rev64		v18.16b,v18.16b
1489	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1490	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1491	ld1		{v24.2d},[x3],#16
1492	add		v25.2d,v25.2d,v19.2d
1493	 ld1		{v19.16b},[x1],#16		// load next input
1494	ext		v25.16b,v25.16b,v25.16b,#8
1495	ext		v5.16b,v2.16b,v3.16b,#8
1496	ext		v6.16b,v1.16b,v2.16b,#8
1497	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1498	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1499	 rev64		v19.16b,v19.16b
1500	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1501	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1502	ld1		{v25.2d},[x3],#16
1503	add		v24.2d,v24.2d,v20.2d
1504	 ld1		{v20.16b},[x1],#16		// load next input
1505	ext		v24.16b,v24.16b,v24.16b,#8
1506	ext		v5.16b,v4.16b,v2.16b,#8
1507	ext		v6.16b,v0.16b,v4.16b,#8
1508	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1509	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1510	 rev64		v20.16b,v20.16b
1511	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1512	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1513	ld1		{v24.2d},[x3],#16
1514	add		v25.2d,v25.2d,v21.2d
1515	 ld1		{v21.16b},[x1],#16		// load next input
1516	ext		v25.16b,v25.16b,v25.16b,#8
1517	ext		v5.16b,v1.16b,v4.16b,#8
1518	ext		v6.16b,v3.16b,v1.16b,#8
1519	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1520	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1521	 rev64		v21.16b,v21.16b
1522	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1523	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1524	ld1		{v25.2d},[x3],#16
1525	add		v24.2d,v24.2d,v22.2d
1526	 ld1		{v22.16b},[x1],#16		// load next input
1527	ext		v24.16b,v24.16b,v24.16b,#8
1528	ext		v5.16b,v0.16b,v1.16b,#8
1529	ext		v6.16b,v2.16b,v0.16b,#8
1530	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1531	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1532	 rev64		v22.16b,v22.16b
1533	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1534	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// All 40 constant pairs (40 * 16 = 80 * 8 bytes) have been fetched;
	// move x3 back to the start of .LK512 for the next block.
1535	sub		x3,x3,#80*8	// rewind
1536	add		v25.2d,v25.2d,v23.2d
1537	 ld1		{v23.16b},[x1],#16		// load next input
1538	ext		v25.16b,v25.16b,v25.16b,#8
1539	ext		v5.16b,v3.16b,v0.16b,#8
1540	ext		v6.16b,v4.16b,v3.16b,#8
1541	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1542	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1543	 rev64		v23.16b,v23.16b
1544	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1545	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// Add the state saved in v26-v29 at the top of the loop to the
	// result of this block.
1546	add		v0.2d,v0.2d,v26.2d			// accumulate
1547	add		v1.2d,v1.2d,v27.2d
1548	add		v2.2d,v2.2d,v28.2d
1549	add		v3.2d,v3.2d,v29.2d
1550
	// x2 was decremented by the subs at the top of the loop; continue
	// while blocks remain.
1551	cbnz		x2,.Loop_hw
1552
1553	st1		{v0.2d-v3.2d},[x0]		// store context
1554
	// Restore x29 and release the 16-byte frame. x30 is not reloaded; it
	// is never written in this function, so it still holds the return
	// address.
1555	ldr		x29,[sp],#16
1556	ret
1557.size	zfs_sha512_block_armv8,.-zfs_sha512_block_armv8
1558#endif
1559