xref: /freebsd/sys/contrib/openzfs/module/icp/asm-aarch64/sha2/sha512-armv8.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1// SPDX-License-Identifier: Apache-2.0
2/*
3 * Copyright 2004-2022 The OpenSSL Project Authors. All Rights Reserved.
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 *     https://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18/*
19 * Portions Copyright (c) 2022 Tino Reichardt <milky-zfs@mcmilk.de>
20 * - modified assembly to fit into OpenZFS
21 */
22
23#if defined(__aarch64__)
24
/*
 * GNU property note for this object file.
 *
 * The two lengths can be checked against the data itself: the name
 * "GNU" plus its NUL is 4 bytes, and the descriptor is the four words
 * after the name (16 bytes).  Both function entry points in this file
 * begin with "hint #34" (bti c).
 *
 * NOTE(review): the symbolic names in the per-line comments below are
 * taken from the generic ELF note / AArch64 GNU property layout, not
 * from anything in this file - confirm them against the AArch64 ELF ABI.
 */
25	.section	.note.gnu.property,"a",@note
26	.p2align	3
27	.word	4		// namesz: sizeof("GNU\0")
28	.word	16		// descsz: the four words after the name
29	.word	5		// note type (presumably NT_GNU_PROPERTY_TYPE_0)
30	.asciz	"GNU"
31	.word	3221225472	// 0xc0000000 (presumably GNU_PROPERTY_AARCH64_FEATURE_1_AND)
32	.word	4		// size in bytes of the property data that follows
33	.word	3		// property data (presumably the BTI and PAC feature bits)
34	.word	0		// padding up to the 8-byte alignment set by .p2align 3
/*
 * .LK512 - table of the 80 SHA-512 round constants K[0..79], two per
 * line, followed by a single zero quad (81 quads, 648 bytes in total).
 *
 * The table is emitted into .text and both block functions in this file
 * take its address PC-relatively with adr.  zfs_sha512_block_armv7 walks
 * it with a post-incrementing pointer and uses the zero quad as its
 * end-of-rounds marker, so none of the real constants may be zero and
 * the terminator must stay last.
 */
35.text
36
37.align	6
38.type	.LK512,%object
39.LK512:
40	.quad	0x428a2f98d728ae22,0x7137449123ef65cd
41	.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
42	.quad	0x3956c25bf348b538,0x59f111f1b605d019
43	.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
44	.quad	0xd807aa98a3030242,0x12835b0145706fbe
45	.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
46	.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
47	.quad	0x9bdc06a725c71235,0xc19bf174cf692694
48	.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
49	.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
50	.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
51	.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
52	.quad	0x983e5152ee66dfab,0xa831c66d2db43210
53	.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
54	.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
55	.quad	0x06ca6351e003826f,0x142929670a0e6e70
56	.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
57	.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
58	.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
59	.quad	0x81c2c92e47edaee6,0x92722c851482353b
60	.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
61	.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
62	.quad	0xd192e819d6ef5218,0xd69906245565a910
63	.quad	0xf40e35855771202a,0x106aa07032bbd1b8
64	.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
65	.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
66	.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
67	.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
68	.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
69	.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
70	.quad	0x90befffa23631e28,0xa4506cebde82bde9
71	.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
72	.quad	0xca273eceea26619c,0xd186b8c721c0c207
73	.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
74	.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
75	.quad	0x113f9804bef90dae,0x1b710b35131c471b
76	.quad	0x28db77f523047d84,0x32caab7b40c72493
77	.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
78	.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
79	.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
80	.quad	0	// terminator
81.size	.LK512,.-.LK512
82
/*
 * zfs_sha512_block_armv7 - SHA-512 block transform written with the
 * general-purpose integer registers only.
 *
 * Arguments, as the code below reads them:
 *   x0 - pointer to the eight 64-bit state words; they are loaded into
 *        x20-x27 on entry and written back after every block
 *   x1 - pointer to the input data
 *   x2 - number of 128-byte blocks (end of input = x1 + (x2 << 7))
 *
 * The block loop compares the input pointer with the end pointer only
 * after a block has been processed, so a count of zero is not handled
 * here.  NOTE(review): callers presumably guarantee x2 >= 1 - confirm.
 *
 * Register use:
 *   x20-x27  working variables a..h (a = x20 ... h = x27 in round 0);
 *            the roles rotate by one register each round instead of the
 *            values being moved
 *   x3-x15,  the sixteen message words X[0..15], in that order; the same
 *   x0-x2    registers double as temporaries, which is why some words
 *            are parked in the [sp] spill slots for a few rounds
 *   x16      Sigma1(e)
 *   x17      Ch(e,f,g), then Sigma0(a)
 *   x19,x28  take turns holding the freshly loaded K[i] and the a^b
 *            value that is reused as b^c by the next round's Maj()
 *   x30      pointer into .LK512; the return address it held on entry is
 *            reloaded from the frame before ret
 *
 * In rounds 0-14 the "h+=Sigma0(a)" add is issued a few instructions
 * after the commented-out copy that marks where it logically belongs.
 *
 * Stack layout (x29 = base of the 128-byte frame):
 *   [x29,#0]    saved x29, x30
 *   [x29,#16]   saved x19-x28 (up to offset 95)
 *   [x29,#96]   state pointer (x0)
 *   [x29,#104]  end-of-input pointer
 *   [x29,#112]  input pointer, saved after the first 16 bytes of the
 *               current block have been read
 *   [sp,#0..31] four spill slots for message words, directly below the
 *               frame
 */
83.globl	zfs_sha512_block_armv7
84.type	zfs_sha512_block_armv7,%function
85.align	6
86zfs_sha512_block_armv7:
87	hint	#34					// bti c
88	stp	x29,x30,[sp,#-128]!
89	add	x29,sp,#0
90
91	stp	x19,x20,[sp,#16]
92	stp	x21,x22,[sp,#32]
93	stp	x23,x24,[sp,#48]
94	stp	x25,x26,[sp,#64]
95	stp	x27,x28,[sp,#80]
	// reserve the four 8-byte message-word spill slots below the frame
96	sub	sp,sp,#4*8
97
98	ldp	x20,x21,[x0]				// load context
99	ldp	x22,x23,[x0,#2*8]
100	ldp	x24,x25,[x0,#4*8]
101	add	x2,x1,x2,lsl#7	// end of input
102	ldp	x26,x27,[x0,#6*8]
103	adr	x30,.LK512
104	stp	x0,x2,[x29,#96]
105
	// Outer loop: one pass per 128-byte block.  Rounds 0-15 take their
	// message word straight from the input; each word is byte-reversed
	// with rev unless __AARCH64EB__ is defined.
106.Loop:
107	ldp	x3,x4,[x1],#2*8
108	ldr	x19,[x30],#8			// *K++
109	eor	x28,x21,x22				// magic seed
110	str	x1,[x29,#112]
111#ifndef	__AARCH64EB__
112	rev	x3,x3			// 0
113#endif
114	ror	x16,x24,#14
115	add	x27,x27,x19			// h+=K[i]
116	eor	x6,x24,x24,ror#23
117	and	x17,x25,x24
118	bic	x19,x26,x24
119	add	x27,x27,x3			// h+=X[i]
120	orr	x17,x17,x19			// Ch(e,f,g)
121	eor	x19,x20,x21			// a^b, b^c in next round
122	eor	x16,x16,x6,ror#18	// Sigma1(e)
123	ror	x6,x20,#28
124	add	x27,x27,x17			// h+=Ch(e,f,g)
125	eor	x17,x20,x20,ror#5
126	add	x27,x27,x16			// h+=Sigma1(e)
127	and	x28,x28,x19			// (b^c)&=(a^b)
128	add	x23,x23,x27			// d+=h
129	eor	x28,x28,x21			// Maj(a,b,c)
130	eor	x17,x6,x17,ror#34	// Sigma0(a)
131	add	x27,x27,x28			// h+=Maj(a,b,c)
132	ldr	x28,[x30],#8		// *K++, x19 in next round
133	//add	x27,x27,x17			// h+=Sigma0(a)
134#ifndef	__AARCH64EB__
135	rev	x4,x4			// 1
136#endif
137	ldp	x5,x6,[x1],#2*8
138	add	x27,x27,x17			// h+=Sigma0(a)
139	ror	x16,x23,#14
140	add	x26,x26,x28			// h+=K[i]
141	eor	x7,x23,x23,ror#23
142	and	x17,x24,x23
143	bic	x28,x25,x23
144	add	x26,x26,x4			// h+=X[i]
145	orr	x17,x17,x28			// Ch(e,f,g)
146	eor	x28,x27,x20			// a^b, b^c in next round
147	eor	x16,x16,x7,ror#18	// Sigma1(e)
148	ror	x7,x27,#28
149	add	x26,x26,x17			// h+=Ch(e,f,g)
150	eor	x17,x27,x27,ror#5
151	add	x26,x26,x16			// h+=Sigma1(e)
152	and	x19,x19,x28			// (b^c)&=(a^b)
153	add	x22,x22,x26			// d+=h
154	eor	x19,x19,x20			// Maj(a,b,c)
155	eor	x17,x7,x17,ror#34	// Sigma0(a)
156	add	x26,x26,x19			// h+=Maj(a,b,c)
157	ldr	x19,[x30],#8		// *K++, x28 in next round
158	//add	x26,x26,x17			// h+=Sigma0(a)
159#ifndef	__AARCH64EB__
160	rev	x5,x5			// 2
161#endif
162	add	x26,x26,x17			// h+=Sigma0(a)
163	ror	x16,x22,#14
164	add	x25,x25,x19			// h+=K[i]
165	eor	x8,x22,x22,ror#23
166	and	x17,x23,x22
167	bic	x19,x24,x22
168	add	x25,x25,x5			// h+=X[i]
169	orr	x17,x17,x19			// Ch(e,f,g)
170	eor	x19,x26,x27			// a^b, b^c in next round
171	eor	x16,x16,x8,ror#18	// Sigma1(e)
172	ror	x8,x26,#28
173	add	x25,x25,x17			// h+=Ch(e,f,g)
174	eor	x17,x26,x26,ror#5
175	add	x25,x25,x16			// h+=Sigma1(e)
176	and	x28,x28,x19			// (b^c)&=(a^b)
177	add	x21,x21,x25			// d+=h
178	eor	x28,x28,x27			// Maj(a,b,c)
179	eor	x17,x8,x17,ror#34	// Sigma0(a)
180	add	x25,x25,x28			// h+=Maj(a,b,c)
181	ldr	x28,[x30],#8		// *K++, x19 in next round
182	//add	x25,x25,x17			// h+=Sigma0(a)
183#ifndef	__AARCH64EB__
184	rev	x6,x6			// 3
185#endif
186	ldp	x7,x8,[x1],#2*8
187	add	x25,x25,x17			// h+=Sigma0(a)
188	ror	x16,x21,#14
189	add	x24,x24,x28			// h+=K[i]
190	eor	x9,x21,x21,ror#23
191	and	x17,x22,x21
192	bic	x28,x23,x21
193	add	x24,x24,x6			// h+=X[i]
194	orr	x17,x17,x28			// Ch(e,f,g)
195	eor	x28,x25,x26			// a^b, b^c in next round
196	eor	x16,x16,x9,ror#18	// Sigma1(e)
197	ror	x9,x25,#28
198	add	x24,x24,x17			// h+=Ch(e,f,g)
199	eor	x17,x25,x25,ror#5
200	add	x24,x24,x16			// h+=Sigma1(e)
201	and	x19,x19,x28			// (b^c)&=(a^b)
202	add	x20,x20,x24			// d+=h
203	eor	x19,x19,x26			// Maj(a,b,c)
204	eor	x17,x9,x17,ror#34	// Sigma0(a)
205	add	x24,x24,x19			// h+=Maj(a,b,c)
206	ldr	x19,[x30],#8		// *K++, x28 in next round
207	//add	x24,x24,x17			// h+=Sigma0(a)
208#ifndef	__AARCH64EB__
209	rev	x7,x7			// 4
210#endif
211	add	x24,x24,x17			// h+=Sigma0(a)
212	ror	x16,x20,#14
213	add	x23,x23,x19			// h+=K[i]
214	eor	x10,x20,x20,ror#23
215	and	x17,x21,x20
216	bic	x19,x22,x20
217	add	x23,x23,x7			// h+=X[i]
218	orr	x17,x17,x19			// Ch(e,f,g)
219	eor	x19,x24,x25			// a^b, b^c in next round
220	eor	x16,x16,x10,ror#18	// Sigma1(e)
221	ror	x10,x24,#28
222	add	x23,x23,x17			// h+=Ch(e,f,g)
223	eor	x17,x24,x24,ror#5
224	add	x23,x23,x16			// h+=Sigma1(e)
225	and	x28,x28,x19			// (b^c)&=(a^b)
226	add	x27,x27,x23			// d+=h
227	eor	x28,x28,x25			// Maj(a,b,c)
228	eor	x17,x10,x17,ror#34	// Sigma0(a)
229	add	x23,x23,x28			// h+=Maj(a,b,c)
230	ldr	x28,[x30],#8		// *K++, x19 in next round
231	//add	x23,x23,x17			// h+=Sigma0(a)
232#ifndef	__AARCH64EB__
233	rev	x8,x8			// 5
234#endif
235	ldp	x9,x10,[x1],#2*8
236	add	x23,x23,x17			// h+=Sigma0(a)
237	ror	x16,x27,#14
238	add	x22,x22,x28			// h+=K[i]
239	eor	x11,x27,x27,ror#23
240	and	x17,x20,x27
241	bic	x28,x21,x27
242	add	x22,x22,x8			// h+=X[i]
243	orr	x17,x17,x28			// Ch(e,f,g)
244	eor	x28,x23,x24			// a^b, b^c in next round
245	eor	x16,x16,x11,ror#18	// Sigma1(e)
246	ror	x11,x23,#28
247	add	x22,x22,x17			// h+=Ch(e,f,g)
248	eor	x17,x23,x23,ror#5
249	add	x22,x22,x16			// h+=Sigma1(e)
250	and	x19,x19,x28			// (b^c)&=(a^b)
251	add	x26,x26,x22			// d+=h
252	eor	x19,x19,x24			// Maj(a,b,c)
253	eor	x17,x11,x17,ror#34	// Sigma0(a)
254	add	x22,x22,x19			// h+=Maj(a,b,c)
255	ldr	x19,[x30],#8		// *K++, x28 in next round
256	//add	x22,x22,x17			// h+=Sigma0(a)
257#ifndef	__AARCH64EB__
258	rev	x9,x9			// 6
259#endif
260	add	x22,x22,x17			// h+=Sigma0(a)
261	ror	x16,x26,#14
262	add	x21,x21,x19			// h+=K[i]
263	eor	x12,x26,x26,ror#23
264	and	x17,x27,x26
265	bic	x19,x20,x26
266	add	x21,x21,x9			// h+=X[i]
267	orr	x17,x17,x19			// Ch(e,f,g)
268	eor	x19,x22,x23			// a^b, b^c in next round
269	eor	x16,x16,x12,ror#18	// Sigma1(e)
270	ror	x12,x22,#28
271	add	x21,x21,x17			// h+=Ch(e,f,g)
272	eor	x17,x22,x22,ror#5
273	add	x21,x21,x16			// h+=Sigma1(e)
274	and	x28,x28,x19			// (b^c)&=(a^b)
275	add	x25,x25,x21			// d+=h
276	eor	x28,x28,x23			// Maj(a,b,c)
277	eor	x17,x12,x17,ror#34	// Sigma0(a)
278	add	x21,x21,x28			// h+=Maj(a,b,c)
279	ldr	x28,[x30],#8		// *K++, x19 in next round
280	//add	x21,x21,x17			// h+=Sigma0(a)
281#ifndef	__AARCH64EB__
282	rev	x10,x10			// 7
283#endif
284	ldp	x11,x12,[x1],#2*8
285	add	x21,x21,x17			// h+=Sigma0(a)
286	ror	x16,x25,#14
287	add	x20,x20,x28			// h+=K[i]
288	eor	x13,x25,x25,ror#23
289	and	x17,x26,x25
290	bic	x28,x27,x25
291	add	x20,x20,x10			// h+=X[i]
292	orr	x17,x17,x28			// Ch(e,f,g)
293	eor	x28,x21,x22			// a^b, b^c in next round
294	eor	x16,x16,x13,ror#18	// Sigma1(e)
295	ror	x13,x21,#28
296	add	x20,x20,x17			// h+=Ch(e,f,g)
297	eor	x17,x21,x21,ror#5
298	add	x20,x20,x16			// h+=Sigma1(e)
299	and	x19,x19,x28			// (b^c)&=(a^b)
300	add	x24,x24,x20			// d+=h
301	eor	x19,x19,x22			// Maj(a,b,c)
302	eor	x17,x13,x17,ror#34	// Sigma0(a)
303	add	x20,x20,x19			// h+=Maj(a,b,c)
304	ldr	x19,[x30],#8		// *K++, x28 in next round
305	//add	x20,x20,x17			// h+=Sigma0(a)
306#ifndef	__AARCH64EB__
307	rev	x11,x11			// 8
308#endif
309	add	x20,x20,x17			// h+=Sigma0(a)
310	ror	x16,x24,#14
311	add	x27,x27,x19			// h+=K[i]
312	eor	x14,x24,x24,ror#23
313	and	x17,x25,x24
314	bic	x19,x26,x24
315	add	x27,x27,x11			// h+=X[i]
316	orr	x17,x17,x19			// Ch(e,f,g)
317	eor	x19,x20,x21			// a^b, b^c in next round
318	eor	x16,x16,x14,ror#18	// Sigma1(e)
319	ror	x14,x20,#28
320	add	x27,x27,x17			// h+=Ch(e,f,g)
321	eor	x17,x20,x20,ror#5
322	add	x27,x27,x16			// h+=Sigma1(e)
323	and	x28,x28,x19			// (b^c)&=(a^b)
324	add	x23,x23,x27			// d+=h
325	eor	x28,x28,x21			// Maj(a,b,c)
326	eor	x17,x14,x17,ror#34	// Sigma0(a)
327	add	x27,x27,x28			// h+=Maj(a,b,c)
328	ldr	x28,[x30],#8		// *K++, x19 in next round
329	//add	x27,x27,x17			// h+=Sigma0(a)
330#ifndef	__AARCH64EB__
331	rev	x12,x12			// 9
332#endif
333	ldp	x13,x14,[x1],#2*8
334	add	x27,x27,x17			// h+=Sigma0(a)
335	ror	x16,x23,#14
336	add	x26,x26,x28			// h+=K[i]
337	eor	x15,x23,x23,ror#23
338	and	x17,x24,x23
339	bic	x28,x25,x23
340	add	x26,x26,x12			// h+=X[i]
341	orr	x17,x17,x28			// Ch(e,f,g)
342	eor	x28,x27,x20			// a^b, b^c in next round
343	eor	x16,x16,x15,ror#18	// Sigma1(e)
344	ror	x15,x27,#28
345	add	x26,x26,x17			// h+=Ch(e,f,g)
346	eor	x17,x27,x27,ror#5
347	add	x26,x26,x16			// h+=Sigma1(e)
348	and	x19,x19,x28			// (b^c)&=(a^b)
349	add	x22,x22,x26			// d+=h
350	eor	x19,x19,x20			// Maj(a,b,c)
351	eor	x17,x15,x17,ror#34	// Sigma0(a)
352	add	x26,x26,x19			// h+=Maj(a,b,c)
353	ldr	x19,[x30],#8		// *K++, x28 in next round
354	//add	x26,x26,x17			// h+=Sigma0(a)
355#ifndef	__AARCH64EB__
356	rev	x13,x13			// 10
357#endif
358	add	x26,x26,x17			// h+=Sigma0(a)
359	ror	x16,x22,#14
360	add	x25,x25,x19			// h+=K[i]
361	eor	x0,x22,x22,ror#23
362	and	x17,x23,x22
363	bic	x19,x24,x22
364	add	x25,x25,x13			// h+=X[i]
365	orr	x17,x17,x19			// Ch(e,f,g)
366	eor	x19,x26,x27			// a^b, b^c in next round
367	eor	x16,x16,x0,ror#18	// Sigma1(e)
368	ror	x0,x26,#28
369	add	x25,x25,x17			// h+=Ch(e,f,g)
370	eor	x17,x26,x26,ror#5
371	add	x25,x25,x16			// h+=Sigma1(e)
372	and	x28,x28,x19			// (b^c)&=(a^b)
373	add	x21,x21,x25			// d+=h
374	eor	x28,x28,x27			// Maj(a,b,c)
375	eor	x17,x0,x17,ror#34	// Sigma0(a)
376	add	x25,x25,x28			// h+=Maj(a,b,c)
377	ldr	x28,[x30],#8		// *K++, x19 in next round
378	//add	x25,x25,x17			// h+=Sigma0(a)
379#ifndef	__AARCH64EB__
380	rev	x14,x14			// 11
381#endif
382	ldp	x15,x0,[x1],#2*8
383	add	x25,x25,x17			// h+=Sigma0(a)
384	str	x6,[sp,#24]
385	ror	x16,x21,#14
386	add	x24,x24,x28			// h+=K[i]
387	eor	x6,x21,x21,ror#23
388	and	x17,x22,x21
389	bic	x28,x23,x21
390	add	x24,x24,x14			// h+=X[i]
391	orr	x17,x17,x28			// Ch(e,f,g)
392	eor	x28,x25,x26			// a^b, b^c in next round
393	eor	x16,x16,x6,ror#18	// Sigma1(e)
394	ror	x6,x25,#28
395	add	x24,x24,x17			// h+=Ch(e,f,g)
396	eor	x17,x25,x25,ror#5
397	add	x24,x24,x16			// h+=Sigma1(e)
398	and	x19,x19,x28			// (b^c)&=(a^b)
399	add	x20,x20,x24			// d+=h
400	eor	x19,x19,x26			// Maj(a,b,c)
401	eor	x17,x6,x17,ror#34	// Sigma0(a)
402	add	x24,x24,x19			// h+=Maj(a,b,c)
403	ldr	x19,[x30],#8		// *K++, x28 in next round
404	//add	x24,x24,x17			// h+=Sigma0(a)
405#ifndef	__AARCH64EB__
406	rev	x15,x15			// 12
407#endif
408	add	x24,x24,x17			// h+=Sigma0(a)
409	str	x7,[sp,#0]
410	ror	x16,x20,#14
411	add	x23,x23,x19			// h+=K[i]
412	eor	x7,x20,x20,ror#23
413	and	x17,x21,x20
414	bic	x19,x22,x20
415	add	x23,x23,x15			// h+=X[i]
416	orr	x17,x17,x19			// Ch(e,f,g)
417	eor	x19,x24,x25			// a^b, b^c in next round
418	eor	x16,x16,x7,ror#18	// Sigma1(e)
419	ror	x7,x24,#28
420	add	x23,x23,x17			// h+=Ch(e,f,g)
421	eor	x17,x24,x24,ror#5
422	add	x23,x23,x16			// h+=Sigma1(e)
423	and	x28,x28,x19			// (b^c)&=(a^b)
424	add	x27,x27,x23			// d+=h
425	eor	x28,x28,x25			// Maj(a,b,c)
426	eor	x17,x7,x17,ror#34	// Sigma0(a)
427	add	x23,x23,x28			// h+=Maj(a,b,c)
428	ldr	x28,[x30],#8		// *K++, x19 in next round
429	//add	x23,x23,x17			// h+=Sigma0(a)
430#ifndef	__AARCH64EB__
431	rev	x0,x0			// 13
432#endif
	// last two input words; this overwrites x1, whose earlier value was
	// saved at [x29,#112] at the top of the block
433	ldp	x1,x2,[x1]
434	add	x23,x23,x17			// h+=Sigma0(a)
435	str	x8,[sp,#8]
436	ror	x16,x27,#14
437	add	x22,x22,x28			// h+=K[i]
438	eor	x8,x27,x27,ror#23
439	and	x17,x20,x27
440	bic	x28,x21,x27
441	add	x22,x22,x0			// h+=X[i]
442	orr	x17,x17,x28			// Ch(e,f,g)
443	eor	x28,x23,x24			// a^b, b^c in next round
444	eor	x16,x16,x8,ror#18	// Sigma1(e)
445	ror	x8,x23,#28
446	add	x22,x22,x17			// h+=Ch(e,f,g)
447	eor	x17,x23,x23,ror#5
448	add	x22,x22,x16			// h+=Sigma1(e)
449	and	x19,x19,x28			// (b^c)&=(a^b)
450	add	x26,x26,x22			// d+=h
451	eor	x19,x19,x24			// Maj(a,b,c)
452	eor	x17,x8,x17,ror#34	// Sigma0(a)
453	add	x22,x22,x19			// h+=Maj(a,b,c)
454	ldr	x19,[x30],#8		// *K++, x28 in next round
455	//add	x22,x22,x17			// h+=Sigma0(a)
456#ifndef	__AARCH64EB__
457	rev	x1,x1			// 14
458#endif
459	ldr	x6,[sp,#24]
460	add	x22,x22,x17			// h+=Sigma0(a)
461	str	x9,[sp,#16]
462	ror	x16,x26,#14
463	add	x21,x21,x19			// h+=K[i]
464	eor	x9,x26,x26,ror#23
465	and	x17,x27,x26
466	bic	x19,x20,x26
467	add	x21,x21,x1			// h+=X[i]
468	orr	x17,x17,x19			// Ch(e,f,g)
469	eor	x19,x22,x23			// a^b, b^c in next round
470	eor	x16,x16,x9,ror#18	// Sigma1(e)
471	ror	x9,x22,#28
472	add	x21,x21,x17			// h+=Ch(e,f,g)
473	eor	x17,x22,x22,ror#5
474	add	x21,x21,x16			// h+=Sigma1(e)
475	and	x28,x28,x19			// (b^c)&=(a^b)
476	add	x25,x25,x21			// d+=h
477	eor	x28,x28,x23			// Maj(a,b,c)
478	eor	x17,x9,x17,ror#34	// Sigma0(a)
479	add	x21,x21,x28			// h+=Maj(a,b,c)
480	ldr	x28,[x30],#8		// *K++, x19 in next round
481	//add	x21,x21,x17			// h+=Sigma0(a)
482#ifndef	__AARCH64EB__
483	rev	x2,x2			// 15
484#endif
	// Round 15 is the first round that also extends the message
	// schedule: x3 (X[0]) gets X[9] (x12), sigma0(X[1]) (x4) and
	// sigma1(X[14]) (x1) added, giving the word consumed by round 16.
485	ldr	x7,[sp,#0]
486	add	x21,x21,x17			// h+=Sigma0(a)
487	str	x10,[sp,#24]
488	ror	x16,x25,#14
489	add	x20,x20,x28			// h+=K[i]
490	ror	x9,x4,#1
491	and	x17,x26,x25
492	ror	x8,x1,#19
493	bic	x28,x27,x25
494	ror	x10,x21,#28
495	add	x20,x20,x2			// h+=X[i]
496	eor	x16,x16,x25,ror#18
497	eor	x9,x9,x4,ror#8
498	orr	x17,x17,x28			// Ch(e,f,g)
499	eor	x28,x21,x22			// a^b, b^c in next round
500	eor	x16,x16,x25,ror#41	// Sigma1(e)
501	eor	x10,x10,x21,ror#34
502	add	x20,x20,x17			// h+=Ch(e,f,g)
503	and	x19,x19,x28			// (b^c)&=(a^b)
504	eor	x8,x8,x1,ror#61
505	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
506	add	x20,x20,x16			// h+=Sigma1(e)
507	eor	x19,x19,x22			// Maj(a,b,c)
508	eor	x17,x10,x21,ror#39	// Sigma0(a)
509	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
510	add	x3,x3,x12
511	add	x24,x24,x20			// d+=h
512	add	x20,x20,x19			// h+=Maj(a,b,c)
513	ldr	x19,[x30],#8		// *K++, x28 in next round
514	add	x3,x3,x9
515	add	x20,x20,x17			// h+=Sigma0(a)
516	add	x3,x3,x8
	// Rounds 16 and up, sixteen rounds per pass.  Every round consumes
	// the schedule word prepared by the round before it and prepares the
	// following one in place.  Each round starts by reloading one word
	// from the spill slots and storing another, since the registers that
	// hold them are reused as temporaries.  The pass repeats until the K
	// word fetched by its final round is the zero terminator of .LK512.
517.Loop_16_xx:
518	ldr	x8,[sp,#8]
519	str	x11,[sp,#0]
520	ror	x16,x24,#14
521	add	x27,x27,x19			// h+=K[i]
522	ror	x10,x5,#1
523	and	x17,x25,x24
524	ror	x9,x2,#19
525	bic	x19,x26,x24
526	ror	x11,x20,#28
527	add	x27,x27,x3			// h+=X[i]
528	eor	x16,x16,x24,ror#18
529	eor	x10,x10,x5,ror#8
530	orr	x17,x17,x19			// Ch(e,f,g)
531	eor	x19,x20,x21			// a^b, b^c in next round
532	eor	x16,x16,x24,ror#41	// Sigma1(e)
533	eor	x11,x11,x20,ror#34
534	add	x27,x27,x17			// h+=Ch(e,f,g)
535	and	x28,x28,x19			// (b^c)&=(a^b)
536	eor	x9,x9,x2,ror#61
537	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
538	add	x27,x27,x16			// h+=Sigma1(e)
539	eor	x28,x28,x21			// Maj(a,b,c)
540	eor	x17,x11,x20,ror#39	// Sigma0(a)
541	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
542	add	x4,x4,x13
543	add	x23,x23,x27			// d+=h
544	add	x27,x27,x28			// h+=Maj(a,b,c)
545	ldr	x28,[x30],#8		// *K++, x19 in next round
546	add	x4,x4,x10
547	add	x27,x27,x17			// h+=Sigma0(a)
548	add	x4,x4,x9
549	ldr	x9,[sp,#16]
550	str	x12,[sp,#8]
551	ror	x16,x23,#14
552	add	x26,x26,x28			// h+=K[i]
553	ror	x11,x6,#1
554	and	x17,x24,x23
555	ror	x10,x3,#19
556	bic	x28,x25,x23
557	ror	x12,x27,#28
558	add	x26,x26,x4			// h+=X[i]
559	eor	x16,x16,x23,ror#18
560	eor	x11,x11,x6,ror#8
561	orr	x17,x17,x28			// Ch(e,f,g)
562	eor	x28,x27,x20			// a^b, b^c in next round
563	eor	x16,x16,x23,ror#41	// Sigma1(e)
564	eor	x12,x12,x27,ror#34
565	add	x26,x26,x17			// h+=Ch(e,f,g)
566	and	x19,x19,x28			// (b^c)&=(a^b)
567	eor	x10,x10,x3,ror#61
568	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
569	add	x26,x26,x16			// h+=Sigma1(e)
570	eor	x19,x19,x20			// Maj(a,b,c)
571	eor	x17,x12,x27,ror#39	// Sigma0(a)
572	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
573	add	x5,x5,x14
574	add	x22,x22,x26			// d+=h
575	add	x26,x26,x19			// h+=Maj(a,b,c)
576	ldr	x19,[x30],#8		// *K++, x28 in next round
577	add	x5,x5,x11
578	add	x26,x26,x17			// h+=Sigma0(a)
579	add	x5,x5,x10
580	ldr	x10,[sp,#24]
581	str	x13,[sp,#16]
582	ror	x16,x22,#14
583	add	x25,x25,x19			// h+=K[i]
584	ror	x12,x7,#1
585	and	x17,x23,x22
586	ror	x11,x4,#19
587	bic	x19,x24,x22
588	ror	x13,x26,#28
589	add	x25,x25,x5			// h+=X[i]
590	eor	x16,x16,x22,ror#18
591	eor	x12,x12,x7,ror#8
592	orr	x17,x17,x19			// Ch(e,f,g)
593	eor	x19,x26,x27			// a^b, b^c in next round
594	eor	x16,x16,x22,ror#41	// Sigma1(e)
595	eor	x13,x13,x26,ror#34
596	add	x25,x25,x17			// h+=Ch(e,f,g)
597	and	x28,x28,x19			// (b^c)&=(a^b)
598	eor	x11,x11,x4,ror#61
599	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
600	add	x25,x25,x16			// h+=Sigma1(e)
601	eor	x28,x28,x27			// Maj(a,b,c)
602	eor	x17,x13,x26,ror#39	// Sigma0(a)
603	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
604	add	x6,x6,x15
605	add	x21,x21,x25			// d+=h
606	add	x25,x25,x28			// h+=Maj(a,b,c)
607	ldr	x28,[x30],#8		// *K++, x19 in next round
608	add	x6,x6,x12
609	add	x25,x25,x17			// h+=Sigma0(a)
610	add	x6,x6,x11
611	ldr	x11,[sp,#0]
612	str	x14,[sp,#24]
613	ror	x16,x21,#14
614	add	x24,x24,x28			// h+=K[i]
615	ror	x13,x8,#1
616	and	x17,x22,x21
617	ror	x12,x5,#19
618	bic	x28,x23,x21
619	ror	x14,x25,#28
620	add	x24,x24,x6			// h+=X[i]
621	eor	x16,x16,x21,ror#18
622	eor	x13,x13,x8,ror#8
623	orr	x17,x17,x28			// Ch(e,f,g)
624	eor	x28,x25,x26			// a^b, b^c in next round
625	eor	x16,x16,x21,ror#41	// Sigma1(e)
626	eor	x14,x14,x25,ror#34
627	add	x24,x24,x17			// h+=Ch(e,f,g)
628	and	x19,x19,x28			// (b^c)&=(a^b)
629	eor	x12,x12,x5,ror#61
630	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
631	add	x24,x24,x16			// h+=Sigma1(e)
632	eor	x19,x19,x26			// Maj(a,b,c)
633	eor	x17,x14,x25,ror#39	// Sigma0(a)
634	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
635	add	x7,x7,x0
636	add	x20,x20,x24			// d+=h
637	add	x24,x24,x19			// h+=Maj(a,b,c)
638	ldr	x19,[x30],#8		// *K++, x28 in next round
639	add	x7,x7,x13
640	add	x24,x24,x17			// h+=Sigma0(a)
641	add	x7,x7,x12
642	ldr	x12,[sp,#8]
643	str	x15,[sp,#0]
644	ror	x16,x20,#14
645	add	x23,x23,x19			// h+=K[i]
646	ror	x14,x9,#1
647	and	x17,x21,x20
648	ror	x13,x6,#19
649	bic	x19,x22,x20
650	ror	x15,x24,#28
651	add	x23,x23,x7			// h+=X[i]
652	eor	x16,x16,x20,ror#18
653	eor	x14,x14,x9,ror#8
654	orr	x17,x17,x19			// Ch(e,f,g)
655	eor	x19,x24,x25			// a^b, b^c in next round
656	eor	x16,x16,x20,ror#41	// Sigma1(e)
657	eor	x15,x15,x24,ror#34
658	add	x23,x23,x17			// h+=Ch(e,f,g)
659	and	x28,x28,x19			// (b^c)&=(a^b)
660	eor	x13,x13,x6,ror#61
661	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
662	add	x23,x23,x16			// h+=Sigma1(e)
663	eor	x28,x28,x25			// Maj(a,b,c)
664	eor	x17,x15,x24,ror#39	// Sigma0(a)
665	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
666	add	x8,x8,x1
667	add	x27,x27,x23			// d+=h
668	add	x23,x23,x28			// h+=Maj(a,b,c)
669	ldr	x28,[x30],#8		// *K++, x19 in next round
670	add	x8,x8,x14
671	add	x23,x23,x17			// h+=Sigma0(a)
672	add	x8,x8,x13
673	ldr	x13,[sp,#16]
674	str	x0,[sp,#8]
675	ror	x16,x27,#14
676	add	x22,x22,x28			// h+=K[i]
677	ror	x15,x10,#1
678	and	x17,x20,x27
679	ror	x14,x7,#19
680	bic	x28,x21,x27
681	ror	x0,x23,#28
682	add	x22,x22,x8			// h+=X[i]
683	eor	x16,x16,x27,ror#18
684	eor	x15,x15,x10,ror#8
685	orr	x17,x17,x28			// Ch(e,f,g)
686	eor	x28,x23,x24			// a^b, b^c in next round
687	eor	x16,x16,x27,ror#41	// Sigma1(e)
688	eor	x0,x0,x23,ror#34
689	add	x22,x22,x17			// h+=Ch(e,f,g)
690	and	x19,x19,x28			// (b^c)&=(a^b)
691	eor	x14,x14,x7,ror#61
692	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
693	add	x22,x22,x16			// h+=Sigma1(e)
694	eor	x19,x19,x24			// Maj(a,b,c)
695	eor	x17,x0,x23,ror#39	// Sigma0(a)
696	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
697	add	x9,x9,x2
698	add	x26,x26,x22			// d+=h
699	add	x22,x22,x19			// h+=Maj(a,b,c)
700	ldr	x19,[x30],#8		// *K++, x28 in next round
701	add	x9,x9,x15
702	add	x22,x22,x17			// h+=Sigma0(a)
703	add	x9,x9,x14
704	ldr	x14,[sp,#24]
705	str	x1,[sp,#16]
706	ror	x16,x26,#14
707	add	x21,x21,x19			// h+=K[i]
708	ror	x0,x11,#1
709	and	x17,x27,x26
710	ror	x15,x8,#19
711	bic	x19,x20,x26
712	ror	x1,x22,#28
713	add	x21,x21,x9			// h+=X[i]
714	eor	x16,x16,x26,ror#18
715	eor	x0,x0,x11,ror#8
716	orr	x17,x17,x19			// Ch(e,f,g)
717	eor	x19,x22,x23			// a^b, b^c in next round
718	eor	x16,x16,x26,ror#41	// Sigma1(e)
719	eor	x1,x1,x22,ror#34
720	add	x21,x21,x17			// h+=Ch(e,f,g)
721	and	x28,x28,x19			// (b^c)&=(a^b)
722	eor	x15,x15,x8,ror#61
723	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
724	add	x21,x21,x16			// h+=Sigma1(e)
725	eor	x28,x28,x23			// Maj(a,b,c)
726	eor	x17,x1,x22,ror#39	// Sigma0(a)
727	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
728	add	x10,x10,x3
729	add	x25,x25,x21			// d+=h
730	add	x21,x21,x28			// h+=Maj(a,b,c)
731	ldr	x28,[x30],#8		// *K++, x19 in next round
732	add	x10,x10,x0
733	add	x21,x21,x17			// h+=Sigma0(a)
734	add	x10,x10,x15
735	ldr	x15,[sp,#0]
736	str	x2,[sp,#24]
737	ror	x16,x25,#14
738	add	x20,x20,x28			// h+=K[i]
739	ror	x1,x12,#1
740	and	x17,x26,x25
741	ror	x0,x9,#19
742	bic	x28,x27,x25
743	ror	x2,x21,#28
744	add	x20,x20,x10			// h+=X[i]
745	eor	x16,x16,x25,ror#18
746	eor	x1,x1,x12,ror#8
747	orr	x17,x17,x28			// Ch(e,f,g)
748	eor	x28,x21,x22			// a^b, b^c in next round
749	eor	x16,x16,x25,ror#41	// Sigma1(e)
750	eor	x2,x2,x21,ror#34
751	add	x20,x20,x17			// h+=Ch(e,f,g)
752	and	x19,x19,x28			// (b^c)&=(a^b)
753	eor	x0,x0,x9,ror#61
754	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
755	add	x20,x20,x16			// h+=Sigma1(e)
756	eor	x19,x19,x22			// Maj(a,b,c)
757	eor	x17,x2,x21,ror#39	// Sigma0(a)
758	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
759	add	x11,x11,x4
760	add	x24,x24,x20			// d+=h
761	add	x20,x20,x19			// h+=Maj(a,b,c)
762	ldr	x19,[x30],#8		// *K++, x28 in next round
763	add	x11,x11,x1
764	add	x20,x20,x17			// h+=Sigma0(a)
765	add	x11,x11,x0
766	ldr	x0,[sp,#8]
767	str	x3,[sp,#0]
768	ror	x16,x24,#14
769	add	x27,x27,x19			// h+=K[i]
770	ror	x2,x13,#1
771	and	x17,x25,x24
772	ror	x1,x10,#19
773	bic	x19,x26,x24
774	ror	x3,x20,#28
775	add	x27,x27,x11			// h+=X[i]
776	eor	x16,x16,x24,ror#18
777	eor	x2,x2,x13,ror#8
778	orr	x17,x17,x19			// Ch(e,f,g)
779	eor	x19,x20,x21			// a^b, b^c in next round
780	eor	x16,x16,x24,ror#41	// Sigma1(e)
781	eor	x3,x3,x20,ror#34
782	add	x27,x27,x17			// h+=Ch(e,f,g)
783	and	x28,x28,x19			// (b^c)&=(a^b)
784	eor	x1,x1,x10,ror#61
785	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
786	add	x27,x27,x16			// h+=Sigma1(e)
787	eor	x28,x28,x21			// Maj(a,b,c)
788	eor	x17,x3,x20,ror#39	// Sigma0(a)
789	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
790	add	x12,x12,x5
791	add	x23,x23,x27			// d+=h
792	add	x27,x27,x28			// h+=Maj(a,b,c)
793	ldr	x28,[x30],#8		// *K++, x19 in next round
794	add	x12,x12,x2
795	add	x27,x27,x17			// h+=Sigma0(a)
796	add	x12,x12,x1
797	ldr	x1,[sp,#16]
798	str	x4,[sp,#8]
799	ror	x16,x23,#14
800	add	x26,x26,x28			// h+=K[i]
801	ror	x3,x14,#1
802	and	x17,x24,x23
803	ror	x2,x11,#19
804	bic	x28,x25,x23
805	ror	x4,x27,#28
806	add	x26,x26,x12			// h+=X[i]
807	eor	x16,x16,x23,ror#18
808	eor	x3,x3,x14,ror#8
809	orr	x17,x17,x28			// Ch(e,f,g)
810	eor	x28,x27,x20			// a^b, b^c in next round
811	eor	x16,x16,x23,ror#41	// Sigma1(e)
812	eor	x4,x4,x27,ror#34
813	add	x26,x26,x17			// h+=Ch(e,f,g)
814	and	x19,x19,x28			// (b^c)&=(a^b)
815	eor	x2,x2,x11,ror#61
816	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
817	add	x26,x26,x16			// h+=Sigma1(e)
818	eor	x19,x19,x20			// Maj(a,b,c)
819	eor	x17,x4,x27,ror#39	// Sigma0(a)
820	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
821	add	x13,x13,x6
822	add	x22,x22,x26			// d+=h
823	add	x26,x26,x19			// h+=Maj(a,b,c)
824	ldr	x19,[x30],#8		// *K++, x28 in next round
825	add	x13,x13,x3
826	add	x26,x26,x17			// h+=Sigma0(a)
827	add	x13,x13,x2
828	ldr	x2,[sp,#24]
829	str	x5,[sp,#16]
830	ror	x16,x22,#14
831	add	x25,x25,x19			// h+=K[i]
832	ror	x4,x15,#1
833	and	x17,x23,x22
834	ror	x3,x12,#19
835	bic	x19,x24,x22
836	ror	x5,x26,#28
837	add	x25,x25,x13			// h+=X[i]
838	eor	x16,x16,x22,ror#18
839	eor	x4,x4,x15,ror#8
840	orr	x17,x17,x19			// Ch(e,f,g)
841	eor	x19,x26,x27			// a^b, b^c in next round
842	eor	x16,x16,x22,ror#41	// Sigma1(e)
843	eor	x5,x5,x26,ror#34
844	add	x25,x25,x17			// h+=Ch(e,f,g)
845	and	x28,x28,x19			// (b^c)&=(a^b)
846	eor	x3,x3,x12,ror#61
847	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
848	add	x25,x25,x16			// h+=Sigma1(e)
849	eor	x28,x28,x27			// Maj(a,b,c)
850	eor	x17,x5,x26,ror#39	// Sigma0(a)
851	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
852	add	x14,x14,x7
853	add	x21,x21,x25			// d+=h
854	add	x25,x25,x28			// h+=Maj(a,b,c)
855	ldr	x28,[x30],#8		// *K++, x19 in next round
856	add	x14,x14,x4
857	add	x25,x25,x17			// h+=Sigma0(a)
858	add	x14,x14,x3
859	ldr	x3,[sp,#0]
860	str	x6,[sp,#24]
861	ror	x16,x21,#14
862	add	x24,x24,x28			// h+=K[i]
863	ror	x5,x0,#1
864	and	x17,x22,x21
865	ror	x4,x13,#19
866	bic	x28,x23,x21
867	ror	x6,x25,#28
868	add	x24,x24,x14			// h+=X[i]
869	eor	x16,x16,x21,ror#18
870	eor	x5,x5,x0,ror#8
871	orr	x17,x17,x28			// Ch(e,f,g)
872	eor	x28,x25,x26			// a^b, b^c in next round
873	eor	x16,x16,x21,ror#41	// Sigma1(e)
874	eor	x6,x6,x25,ror#34
875	add	x24,x24,x17			// h+=Ch(e,f,g)
876	and	x19,x19,x28			// (b^c)&=(a^b)
877	eor	x4,x4,x13,ror#61
878	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
879	add	x24,x24,x16			// h+=Sigma1(e)
880	eor	x19,x19,x26			// Maj(a,b,c)
881	eor	x17,x6,x25,ror#39	// Sigma0(a)
882	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
883	add	x15,x15,x8
884	add	x20,x20,x24			// d+=h
885	add	x24,x24,x19			// h+=Maj(a,b,c)
886	ldr	x19,[x30],#8		// *K++, x28 in next round
887	add	x15,x15,x5
888	add	x24,x24,x17			// h+=Sigma0(a)
889	add	x15,x15,x4
890	ldr	x4,[sp,#8]
891	str	x7,[sp,#0]
892	ror	x16,x20,#14
893	add	x23,x23,x19			// h+=K[i]
894	ror	x6,x1,#1
895	and	x17,x21,x20
896	ror	x5,x14,#19
897	bic	x19,x22,x20
898	ror	x7,x24,#28
899	add	x23,x23,x15			// h+=X[i]
900	eor	x16,x16,x20,ror#18
901	eor	x6,x6,x1,ror#8
902	orr	x17,x17,x19			// Ch(e,f,g)
903	eor	x19,x24,x25			// a^b, b^c in next round
904	eor	x16,x16,x20,ror#41	// Sigma1(e)
905	eor	x7,x7,x24,ror#34
906	add	x23,x23,x17			// h+=Ch(e,f,g)
907	and	x28,x28,x19			// (b^c)&=(a^b)
908	eor	x5,x5,x14,ror#61
909	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
910	add	x23,x23,x16			// h+=Sigma1(e)
911	eor	x28,x28,x25			// Maj(a,b,c)
912	eor	x17,x7,x24,ror#39	// Sigma0(a)
913	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
914	add	x0,x0,x9
915	add	x27,x27,x23			// d+=h
916	add	x23,x23,x28			// h+=Maj(a,b,c)
917	ldr	x28,[x30],#8		// *K++, x19 in next round
918	add	x0,x0,x6
919	add	x23,x23,x17			// h+=Sigma0(a)
920	add	x0,x0,x5
921	ldr	x5,[sp,#16]
922	str	x8,[sp,#8]
923	ror	x16,x27,#14
924	add	x22,x22,x28			// h+=K[i]
925	ror	x7,x2,#1
926	and	x17,x20,x27
927	ror	x6,x15,#19
928	bic	x28,x21,x27
929	ror	x8,x23,#28
930	add	x22,x22,x0			// h+=X[i]
931	eor	x16,x16,x27,ror#18
932	eor	x7,x7,x2,ror#8
933	orr	x17,x17,x28			// Ch(e,f,g)
934	eor	x28,x23,x24			// a^b, b^c in next round
935	eor	x16,x16,x27,ror#41	// Sigma1(e)
936	eor	x8,x8,x23,ror#34
937	add	x22,x22,x17			// h+=Ch(e,f,g)
938	and	x19,x19,x28			// (b^c)&=(a^b)
939	eor	x6,x6,x15,ror#61
940	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
941	add	x22,x22,x16			// h+=Sigma1(e)
942	eor	x19,x19,x24			// Maj(a,b,c)
943	eor	x17,x8,x23,ror#39	// Sigma0(a)
944	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
945	add	x1,x1,x10
946	add	x26,x26,x22			// d+=h
947	add	x22,x22,x19			// h+=Maj(a,b,c)
948	ldr	x19,[x30],#8		// *K++, x28 in next round
949	add	x1,x1,x7
950	add	x22,x22,x17			// h+=Sigma0(a)
951	add	x1,x1,x6
952	ldr	x6,[sp,#24]
953	str	x9,[sp,#16]
954	ror	x16,x26,#14
955	add	x21,x21,x19			// h+=K[i]
956	ror	x8,x3,#1
957	and	x17,x27,x26
958	ror	x7,x0,#19
959	bic	x19,x20,x26
960	ror	x9,x22,#28
961	add	x21,x21,x1			// h+=X[i]
962	eor	x16,x16,x26,ror#18
963	eor	x8,x8,x3,ror#8
964	orr	x17,x17,x19			// Ch(e,f,g)
965	eor	x19,x22,x23			// a^b, b^c in next round
966	eor	x16,x16,x26,ror#41	// Sigma1(e)
967	eor	x9,x9,x22,ror#34
968	add	x21,x21,x17			// h+=Ch(e,f,g)
969	and	x28,x28,x19			// (b^c)&=(a^b)
970	eor	x7,x7,x0,ror#61
971	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
972	add	x21,x21,x16			// h+=Sigma1(e)
973	eor	x28,x28,x23			// Maj(a,b,c)
974	eor	x17,x9,x22,ror#39	// Sigma0(a)
975	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
976	add	x2,x2,x11
977	add	x25,x25,x21			// d+=h
978	add	x21,x21,x28			// h+=Maj(a,b,c)
979	ldr	x28,[x30],#8		// *K++, x19 in next round
980	add	x2,x2,x8
981	add	x21,x21,x17			// h+=Sigma0(a)
982	add	x2,x2,x7
983	ldr	x7,[sp,#0]
984	str	x10,[sp,#24]
985	ror	x16,x25,#14
986	add	x20,x20,x28			// h+=K[i]
987	ror	x9,x4,#1
988	and	x17,x26,x25
989	ror	x8,x1,#19
990	bic	x28,x27,x25
991	ror	x10,x21,#28
992	add	x20,x20,x2			// h+=X[i]
993	eor	x16,x16,x25,ror#18
994	eor	x9,x9,x4,ror#8
995	orr	x17,x17,x28			// Ch(e,f,g)
996	eor	x28,x21,x22			// a^b, b^c in next round
997	eor	x16,x16,x25,ror#41	// Sigma1(e)
998	eor	x10,x10,x21,ror#34
999	add	x20,x20,x17			// h+=Ch(e,f,g)
1000	and	x19,x19,x28			// (b^c)&=(a^b)
1001	eor	x8,x8,x1,ror#61
1002	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
1003	add	x20,x20,x16			// h+=Sigma1(e)
1004	eor	x19,x19,x22			// Maj(a,b,c)
1005	eor	x17,x10,x21,ror#39	// Sigma0(a)
1006	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
1007	add	x3,x3,x12
1008	add	x24,x24,x20			// d+=h
1009	add	x20,x20,x19			// h+=Maj(a,b,c)
1010	ldr	x19,[x30],#8		// *K++, x28 in next round
1011	add	x3,x3,x9
1012	add	x20,x20,x17			// h+=Sigma0(a)
1013	add	x3,x3,x8
	// x19 holds the K word just fetched; zero means the table
	// terminator was reached and all rounds of this block are done
1014	cbnz	x19,.Loop_16_xx
1015
	// Block finished: recover the state pointer, end pointer and input
	// pointer that were parked in the frame.
1016	ldp	x0,x2,[x29,#96]
1017	ldr	x1,[x29,#112]
	// 648 = 81 quads * 8 bytes: x30 has stepped over all 80 constants
	// and the terminator, so this points it back at .LK512
1018	sub	x30,x30,#648		// rewind
1019
1020	ldp	x3,x4,[x0]
1021	ldp	x5,x6,[x0,#2*8]
	// the saved x1 was already 16 bytes into the block, so 14*8 more
	// moves it to the start of the next 128-byte block
1022	add	x1,x1,#14*8			// advance input pointer
1023	ldp	x7,x8,[x0,#4*8]
1024	add	x20,x20,x3
1025	ldp	x9,x10,[x0,#6*8]
1026	add	x21,x21,x4
1027	add	x22,x22,x5
1028	add	x23,x23,x6
1029	stp	x20,x21,[x0]
1030	add	x24,x24,x7
1031	add	x25,x25,x8
1032	stp	x22,x23,[x0,#2*8]
1033	add	x26,x26,x9
1034	add	x27,x27,x10
1035	cmp	x1,x2
1036	stp	x24,x25,[x0,#4*8]
1037	stp	x26,x27,[x0,#6*8]
1038	b.ne	.Loop
1039
	// Epilogue: reload x19-x28 through x29, release the spill slots,
	// then pop x29/x30 together with the 128-byte frame.
1040	ldp	x19,x20,[x29,#16]
1041	add	sp,sp,#4*8
1042	ldp	x21,x22,[x29,#32]
1043	ldp	x23,x24,[x29,#48]
1044	ldp	x25,x26,[x29,#64]
1045	ldp	x27,x28,[x29,#80]
1046	ldp	x29,x30,[sp],#128
1047	ret
1048.size	zfs_sha512_block_armv7,.-zfs_sha512_block_armv7
1049
1050
/*
 * zfs_sha512_block_armv8
 *
 * SHA-512 block function that uses the Armv8 SHA-512 instructions
 * (sha512h, sha512h2, sha512su0, sha512su1). They are emitted as raw
 * .inst words with the mnemonic in a trailing comment, presumably so
 * that the file also assembles with toolchains that do not know these
 * mnemonics (TODO confirm).
 *
 * Inputs, as used by the code below:
 *   x0  address of the 64-byte hash state. It is loaded into v0-v3 at
 *       entry and written back from v0-v3 just before returning.
 *   x1  address of the input. 128 bytes are consumed per block and
 *       each 64-bit word is byte-reversed with rev64 before use.
 *   x2  number of 128-byte blocks. The counter is decremented before
 *       it is tested, so zero on entry is not handled.
 * Presumed C prototype (verify against the caller):
 *   void zfs_sha512_block_armv8(uint64_t state[8], const void *data,
 *       size_t blocks);
 *
 * Register roles inside the function:
 *   x3       cursor into the .LK512 constant table
 *   x4       x1 - 128, used for the conditional rewind of x1
 *   v0-v4    working state (five registers rotating over four values)
 *   v5-v7    temporaries built with ext
 *   v16-v23  16-word message schedule, two words per register
 *   v24,v25  constant pair plus message words for the current group
 *   v26-v29  copy of the state at block start, added back at block end
 * The NZCV flags are overwritten by subs. Only x29 is saved and
 * restored; no register numbered x19-x28 or v8-v15 is touched.
 */
1051.globl	zfs_sha512_block_armv8
1052.type	zfs_sha512_block_armv8,%function
1053.align	6
1054zfs_sha512_block_armv8:
1055	hint		#34				// bti c
1056.Lv8_entry:
1057	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
	// Build a 16-byte frame record and point x29 at it.
1058	stp		x29,x30,[sp,#-16]!
1059	add		x29,sp,#0
1060
	// Fetch the first 128-byte block into v16-v23; x1 ends up pointing at
	// the block that follows it.
1061	ld1		{v16.16b-v19.16b},[x1],#64	// load input
1062	ld1		{v20.16b-v23.16b},[x1],#64
1063
1064	ld1		{v0.2d-v3.2d},[x0]		// load context
1065	adr		x3,.LK512
1066
	// Reverse the byte order inside every 64-bit lane of the message block.
1067	rev64		v16.16b,v16.16b
1068	rev64		v17.16b,v17.16b
1069	rev64		v18.16b,v18.16b
1070	rev64		v19.16b,v19.16b
1071	rev64		v20.16b,v20.16b
1072	rev64		v21.16b,v21.16b
1073	rev64		v22.16b,v22.16b
1074	rev64		v23.16b,v23.16b
1075	b		.Loop_hw
1076
	// One iteration of .Loop_hw processes one 128-byte block. On entry to
	// each iteration v16-v23 already hold the byte-reversed block, v0-v3
	// hold the state and x3 points at the start of .LK512.
1077.align	4
1078.Loop_hw:
1079	ld1		{v24.2d},[x3],#16
	// x2 counts the blocks still to do. When it reaches zero (last block)
	// csel moves x1 back by 128 bytes, so the "load next input" loads at
	// the end of this iteration read the block just processed again
	// instead of the 128 bytes that follow it.
1080	subs		x2,x2,#1
1081	sub		x4,x1,#128
	// orr of a register with itself is a plain copy: keep the state from
	// the start of the block in v26-v29 for the final accumulate.
1082	orr		v26.16b,v0.16b,v0.16b			// offload
1083	orr		v27.16b,v1.16b,v1.16b
1084	orr		v28.16b,v2.16b,v2.16b
1085	orr		v29.16b,v3.16b,v3.16b
1086	csel		x1,x1,x4,ne			// conditional rewind
	// First 32 of 40 instruction groups. Each group handles one 128-bit
	// vector, i.e. two 64-bit message words:
	//   - the message vector is added to the constant pair fetched from
	//     x3 and the two 64-bit halves of the sum are swapped (ext #8
	//     with the same register as both sources);
	//   - v5 and v6 are assembled from neighbouring state vectors with
	//     ext #8 (upper half of the first source, lower half of the
	//     second);
	//   - sha512h and sha512h2 update the state; the quoted comments
	//     name the intermediate sums;
	//   - the lines indented by one extra space (sha512su0, ext into v7,
	//     sha512su1) overwrite the message vector just consumed with the
	//     value that is used 8 groups later.
	// v24 and v25 alternate so that the next constant pair is loaded
	// while the current one is in use. The destination registers cycle
	// through v0-v4 with a period of five groups and the message vectors
	// through v16-v23 with a period of eight groups, so after all 40
	// groups the state is back in v0-v3.
1087	add		v24.2d,v24.2d,v16.2d
1088	ld1		{v25.2d},[x3],#16
1089	ext		v24.16b,v24.16b,v24.16b,#8
1090	ext		v5.16b,v2.16b,v3.16b,#8
1091	ext		v6.16b,v1.16b,v2.16b,#8
1092	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1093	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1094	 ext		v7.16b,v20.16b,v21.16b,#8
1095	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1096	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1097	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1098	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1099	add		v25.2d,v25.2d,v17.2d
1100	ld1		{v24.2d},[x3],#16
1101	ext		v25.16b,v25.16b,v25.16b,#8
1102	ext		v5.16b,v4.16b,v2.16b,#8
1103	ext		v6.16b,v0.16b,v4.16b,#8
1104	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1105	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1106	 ext		v7.16b,v21.16b,v22.16b,#8
1107	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1108	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1109	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1110	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1111	add		v24.2d,v24.2d,v18.2d
1112	ld1		{v25.2d},[x3],#16
1113	ext		v24.16b,v24.16b,v24.16b,#8
1114	ext		v5.16b,v1.16b,v4.16b,#8
1115	ext		v6.16b,v3.16b,v1.16b,#8
1116	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1117	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1118	 ext		v7.16b,v22.16b,v23.16b,#8
1119	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1120	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1121	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1122	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1123	add		v25.2d,v25.2d,v19.2d
1124	ld1		{v24.2d},[x3],#16
1125	ext		v25.16b,v25.16b,v25.16b,#8
1126	ext		v5.16b,v0.16b,v1.16b,#8
1127	ext		v6.16b,v2.16b,v0.16b,#8
1128	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1129	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1130	 ext		v7.16b,v23.16b,v16.16b,#8
1131	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1132	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1133	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1134	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1135	add		v24.2d,v24.2d,v20.2d
1136	ld1		{v25.2d},[x3],#16
1137	ext		v24.16b,v24.16b,v24.16b,#8
1138	ext		v5.16b,v3.16b,v0.16b,#8
1139	ext		v6.16b,v4.16b,v3.16b,#8
1140	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1141	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1142	 ext		v7.16b,v16.16b,v17.16b,#8
1143	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1144	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1145	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1146	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1147	add		v25.2d,v25.2d,v21.2d
1148	ld1		{v24.2d},[x3],#16
1149	ext		v25.16b,v25.16b,v25.16b,#8
1150	ext		v5.16b,v2.16b,v3.16b,#8
1151	ext		v6.16b,v1.16b,v2.16b,#8
1152	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1153	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1154	 ext		v7.16b,v17.16b,v18.16b,#8
1155	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1156	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1157	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1158	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1159	add		v24.2d,v24.2d,v22.2d
1160	ld1		{v25.2d},[x3],#16
1161	ext		v24.16b,v24.16b,v24.16b,#8
1162	ext		v5.16b,v4.16b,v2.16b,#8
1163	ext		v6.16b,v0.16b,v4.16b,#8
1164	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1165	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1166	 ext		v7.16b,v18.16b,v19.16b,#8
1167	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1168	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1169	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1170	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1171	add		v25.2d,v25.2d,v23.2d
1172	ld1		{v24.2d},[x3],#16
1173	ext		v25.16b,v25.16b,v25.16b,#8
1174	ext		v5.16b,v1.16b,v4.16b,#8
1175	ext		v6.16b,v3.16b,v1.16b,#8
1176	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1177	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1178	 ext		v7.16b,v19.16b,v20.16b,#8
1179	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1180	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1181	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1182	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1183	add		v24.2d,v24.2d,v16.2d
1184	ld1		{v25.2d},[x3],#16
1185	ext		v24.16b,v24.16b,v24.16b,#8
1186	ext		v5.16b,v0.16b,v1.16b,#8
1187	ext		v6.16b,v2.16b,v0.16b,#8
1188	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1189	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1190	 ext		v7.16b,v20.16b,v21.16b,#8
1191	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1192	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1193	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1194	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1195	add		v25.2d,v25.2d,v17.2d
1196	ld1		{v24.2d},[x3],#16
1197	ext		v25.16b,v25.16b,v25.16b,#8
1198	ext		v5.16b,v3.16b,v0.16b,#8
1199	ext		v6.16b,v4.16b,v3.16b,#8
1200	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1201	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1202	 ext		v7.16b,v21.16b,v22.16b,#8
1203	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1204	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1205	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1206	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1207	add		v24.2d,v24.2d,v18.2d
1208	ld1		{v25.2d},[x3],#16
1209	ext		v24.16b,v24.16b,v24.16b,#8
1210	ext		v5.16b,v2.16b,v3.16b,#8
1211	ext		v6.16b,v1.16b,v2.16b,#8
1212	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1213	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1214	 ext		v7.16b,v22.16b,v23.16b,#8
1215	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1216	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1217	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1218	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1219	add		v25.2d,v25.2d,v19.2d
1220	ld1		{v24.2d},[x3],#16
1221	ext		v25.16b,v25.16b,v25.16b,#8
1222	ext		v5.16b,v4.16b,v2.16b,#8
1223	ext		v6.16b,v0.16b,v4.16b,#8
1224	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1225	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1226	 ext		v7.16b,v23.16b,v16.16b,#8
1227	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1228	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1229	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1230	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1231	add		v24.2d,v24.2d,v20.2d
1232	ld1		{v25.2d},[x3],#16
1233	ext		v24.16b,v24.16b,v24.16b,#8
1234	ext		v5.16b,v1.16b,v4.16b,#8
1235	ext		v6.16b,v3.16b,v1.16b,#8
1236	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1237	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1238	 ext		v7.16b,v16.16b,v17.16b,#8
1239	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1240	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1241	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1242	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1243	add		v25.2d,v25.2d,v21.2d
1244	ld1		{v24.2d},[x3],#16
1245	ext		v25.16b,v25.16b,v25.16b,#8
1246	ext		v5.16b,v0.16b,v1.16b,#8
1247	ext		v6.16b,v2.16b,v0.16b,#8
1248	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1249	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1250	 ext		v7.16b,v17.16b,v18.16b,#8
1251	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1252	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1253	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1254	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1255	add		v24.2d,v24.2d,v22.2d
1256	ld1		{v25.2d},[x3],#16
1257	ext		v24.16b,v24.16b,v24.16b,#8
1258	ext		v5.16b,v3.16b,v0.16b,#8
1259	ext		v6.16b,v4.16b,v3.16b,#8
1260	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1261	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1262	 ext		v7.16b,v18.16b,v19.16b,#8
1263	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1264	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1265	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1266	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1267	add		v25.2d,v25.2d,v23.2d
1268	ld1		{v24.2d},[x3],#16
1269	ext		v25.16b,v25.16b,v25.16b,#8
1270	ext		v5.16b,v2.16b,v3.16b,#8
1271	ext		v6.16b,v1.16b,v2.16b,#8
1272	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1273	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1274	 ext		v7.16b,v19.16b,v20.16b,#8
1275	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1276	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1277	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1278	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1279	add		v24.2d,v24.2d,v16.2d
1280	ld1		{v25.2d},[x3],#16
1281	ext		v24.16b,v24.16b,v24.16b,#8
1282	ext		v5.16b,v4.16b,v2.16b,#8
1283	ext		v6.16b,v0.16b,v4.16b,#8
1284	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1285	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1286	 ext		v7.16b,v20.16b,v21.16b,#8
1287	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1288	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1289	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1290	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1291	add		v25.2d,v25.2d,v17.2d
1292	ld1		{v24.2d},[x3],#16
1293	ext		v25.16b,v25.16b,v25.16b,#8
1294	ext		v5.16b,v1.16b,v4.16b,#8
1295	ext		v6.16b,v3.16b,v1.16b,#8
1296	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1297	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1298	 ext		v7.16b,v21.16b,v22.16b,#8
1299	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1300	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1301	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1302	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1303	add		v24.2d,v24.2d,v18.2d
1304	ld1		{v25.2d},[x3],#16
1305	ext		v24.16b,v24.16b,v24.16b,#8
1306	ext		v5.16b,v0.16b,v1.16b,#8
1307	ext		v6.16b,v2.16b,v0.16b,#8
1308	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1309	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1310	 ext		v7.16b,v22.16b,v23.16b,#8
1311	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1312	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1313	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1314	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1315	add		v25.2d,v25.2d,v19.2d
1316	ld1		{v24.2d},[x3],#16
1317	ext		v25.16b,v25.16b,v25.16b,#8
1318	ext		v5.16b,v3.16b,v0.16b,#8
1319	ext		v6.16b,v4.16b,v3.16b,#8
1320	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1321	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1322	 ext		v7.16b,v23.16b,v16.16b,#8
1323	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1324	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1325	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1326	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1327	add		v24.2d,v24.2d,v20.2d
1328	ld1		{v25.2d},[x3],#16
1329	ext		v24.16b,v24.16b,v24.16b,#8
1330	ext		v5.16b,v2.16b,v3.16b,#8
1331	ext		v6.16b,v1.16b,v2.16b,#8
1332	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1333	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1334	 ext		v7.16b,v16.16b,v17.16b,#8
1335	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1336	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1337	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1338	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1339	add		v25.2d,v25.2d,v21.2d
1340	ld1		{v24.2d},[x3],#16
1341	ext		v25.16b,v25.16b,v25.16b,#8
1342	ext		v5.16b,v4.16b,v2.16b,#8
1343	ext		v6.16b,v0.16b,v4.16b,#8
1344	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1345	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1346	 ext		v7.16b,v17.16b,v18.16b,#8
1347	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1348	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1349	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1350	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1351	add		v24.2d,v24.2d,v22.2d
1352	ld1		{v25.2d},[x3],#16
1353	ext		v24.16b,v24.16b,v24.16b,#8
1354	ext		v5.16b,v1.16b,v4.16b,#8
1355	ext		v6.16b,v3.16b,v1.16b,#8
1356	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1357	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1358	 ext		v7.16b,v18.16b,v19.16b,#8
1359	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1360	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1361	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1362	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1363	add		v25.2d,v25.2d,v23.2d
1364	ld1		{v24.2d},[x3],#16
1365	ext		v25.16b,v25.16b,v25.16b,#8
1366	ext		v5.16b,v0.16b,v1.16b,#8
1367	ext		v6.16b,v2.16b,v0.16b,#8
1368	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1369	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1370	 ext		v7.16b,v19.16b,v20.16b,#8
1371	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1372	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1373	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1374	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1375	add		v24.2d,v24.2d,v16.2d
1376	ld1		{v25.2d},[x3],#16
1377	ext		v24.16b,v24.16b,v24.16b,#8
1378	ext		v5.16b,v3.16b,v0.16b,#8
1379	ext		v6.16b,v4.16b,v3.16b,#8
1380	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1381	 .inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1382	 ext		v7.16b,v20.16b,v21.16b,#8
1383	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1384	 .inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1385	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1386	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1387	add		v25.2d,v25.2d,v17.2d
1388	ld1		{v24.2d},[x3],#16
1389	ext		v25.16b,v25.16b,v25.16b,#8
1390	ext		v5.16b,v2.16b,v3.16b,#8
1391	ext		v6.16b,v1.16b,v2.16b,#8
1392	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1393	 .inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1394	 ext		v7.16b,v21.16b,v22.16b,#8
1395	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1396	 .inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1397	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1398	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1399	add		v24.2d,v24.2d,v18.2d
1400	ld1		{v25.2d},[x3],#16
1401	ext		v24.16b,v24.16b,v24.16b,#8
1402	ext		v5.16b,v4.16b,v2.16b,#8
1403	ext		v6.16b,v0.16b,v4.16b,#8
1404	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1405	 .inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1406	 ext		v7.16b,v22.16b,v23.16b,#8
1407	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1408	 .inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1409	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1410	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1411	add		v25.2d,v25.2d,v19.2d
1412	ld1		{v24.2d},[x3],#16
1413	ext		v25.16b,v25.16b,v25.16b,#8
1414	ext		v5.16b,v1.16b,v4.16b,#8
1415	ext		v6.16b,v3.16b,v1.16b,#8
1416	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1417	 .inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1418	 ext		v7.16b,v23.16b,v16.16b,#8
1419	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1420	 .inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1421	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1422	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1423	add		v24.2d,v24.2d,v20.2d
1424	ld1		{v25.2d},[x3],#16
1425	ext		v24.16b,v24.16b,v24.16b,#8
1426	ext		v5.16b,v0.16b,v1.16b,#8
1427	ext		v6.16b,v2.16b,v0.16b,#8
1428	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1429	 .inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1430	 ext		v7.16b,v16.16b,v17.16b,#8
1431	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1432	 .inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1433	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1434	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1435	add		v25.2d,v25.2d,v21.2d
1436	ld1		{v24.2d},[x3],#16
1437	ext		v25.16b,v25.16b,v25.16b,#8
1438	ext		v5.16b,v3.16b,v0.16b,#8
1439	ext		v6.16b,v4.16b,v3.16b,#8
1440	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1441	 .inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1442	 ext		v7.16b,v17.16b,v18.16b,#8
1443	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1444	 .inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1445	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1446	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1447	add		v24.2d,v24.2d,v22.2d
1448	ld1		{v25.2d},[x3],#16
1449	ext		v24.16b,v24.16b,v24.16b,#8
1450	ext		v5.16b,v2.16b,v3.16b,#8
1451	ext		v6.16b,v1.16b,v2.16b,#8
1452	add		v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1453	 .inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1454	 ext		v7.16b,v18.16b,v19.16b,#8
1455	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1456	 .inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1457	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1458	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1459	add		v25.2d,v25.2d,v23.2d
1460	ld1		{v24.2d},[x3],#16
1461	ext		v25.16b,v25.16b,v25.16b,#8
1462	ext		v5.16b,v4.16b,v2.16b,#8
1463	ext		v6.16b,v0.16b,v4.16b,#8
1464	add		v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1465	 .inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1466	 ext		v7.16b,v19.16b,v20.16b,#8
1467	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1468	 .inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1469	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1470	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// Last 8 of the 40 groups. The sha512su0/sha512su1 pair is no longer
	// issued; instead each message register is refilled from x1 right
	// after its value has been added into v24/v25, and is byte-reversed
	// for the next iteration. On the last block x1 was rewound above, so
	// these loads stay inside the block that was just processed.
1471	ld1		{v25.2d},[x3],#16
1472	add		v24.2d,v24.2d,v16.2d
1473	 ld1		{v16.16b},[x1],#16		// load next input
1474	ext		v24.16b,v24.16b,v24.16b,#8
1475	ext		v5.16b,v1.16b,v4.16b,#8
1476	ext		v6.16b,v3.16b,v1.16b,#8
1477	add		v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1478	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1479	 rev64		v16.16b,v16.16b
1480	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1481	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1482	ld1		{v24.2d},[x3],#16
1483	add		v25.2d,v25.2d,v17.2d
1484	 ld1		{v17.16b},[x1],#16		// load next input
1485	ext		v25.16b,v25.16b,v25.16b,#8
1486	ext		v5.16b,v0.16b,v1.16b,#8
1487	ext		v6.16b,v2.16b,v0.16b,#8
1488	add		v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1489	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1490	 rev64		v17.16b,v17.16b
1491	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1492	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1493	ld1		{v25.2d},[x3],#16
1494	add		v24.2d,v24.2d,v18.2d
1495	 ld1		{v18.16b},[x1],#16		// load next input
1496	ext		v24.16b,v24.16b,v24.16b,#8
1497	ext		v5.16b,v3.16b,v0.16b,#8
1498	ext		v6.16b,v4.16b,v3.16b,#8
1499	add		v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1500	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1501	 rev64		v18.16b,v18.16b
1502	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1503	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1504	ld1		{v24.2d},[x3],#16
1505	add		v25.2d,v25.2d,v19.2d
1506	 ld1		{v19.16b},[x1],#16		// load next input
1507	ext		v25.16b,v25.16b,v25.16b,#8
1508	ext		v5.16b,v2.16b,v3.16b,#8
1509	ext		v6.16b,v1.16b,v2.16b,#8
1510	add		v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1511	.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1512	 rev64		v19.16b,v19.16b
1513	add		v4.2d,v1.2d,v3.2d		// "D + T1"
1514	.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1515	ld1		{v25.2d},[x3],#16
1516	add		v24.2d,v24.2d,v20.2d
1517	 ld1		{v20.16b},[x1],#16		// load next input
1518	ext		v24.16b,v24.16b,v24.16b,#8
1519	ext		v5.16b,v4.16b,v2.16b,#8
1520	ext		v6.16b,v0.16b,v4.16b,#8
1521	add		v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1522	.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1523	 rev64		v20.16b,v20.16b
1524	add		v1.2d,v0.2d,v2.2d		// "D + T1"
1525	.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1526	ld1		{v24.2d},[x3],#16
1527	add		v25.2d,v25.2d,v21.2d
1528	 ld1		{v21.16b},[x1],#16		// load next input
1529	ext		v25.16b,v25.16b,v25.16b,#8
1530	ext		v5.16b,v1.16b,v4.16b,#8
1531	ext		v6.16b,v3.16b,v1.16b,#8
1532	add		v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1533	.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1534	 rev64		v21.16b,v21.16b
1535	add		v0.2d,v3.2d,v4.2d		// "D + T1"
1536	.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1537	ld1		{v25.2d},[x3],#16
1538	add		v24.2d,v24.2d,v22.2d
1539	 ld1		{v22.16b},[x1],#16		// load next input
1540	ext		v24.16b,v24.16b,v24.16b,#8
1541	ext		v5.16b,v0.16b,v1.16b,#8
1542	ext		v6.16b,v2.16b,v0.16b,#8
1543	add		v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1544	.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1545	 rev64		v22.16b,v22.16b
1546	add		v3.2d,v2.2d,v1.2d		// "D + T1"
1547	.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// The 40 post-incremented 16-byte loads of this iteration advanced x3
	// by 80*8 bytes; move it back to the start of .LK512.
1548	sub		x3,x3,#80*8	// rewind
1549	add		v25.2d,v25.2d,v23.2d
1550	 ld1		{v23.16b},[x1],#16		// load next input
1551	ext		v25.16b,v25.16b,v25.16b,#8
1552	ext		v5.16b,v3.16b,v0.16b,#8
1553	ext		v6.16b,v4.16b,v3.16b,#8
1554	add		v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1555	.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1556	 rev64		v23.16b,v23.16b
1557	add		v2.2d,v4.2d,v0.2d		// "D + T1"
1558	.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// Add the state saved in v26-v29 at the top of the iteration to the
	// result of the 40 groups.
1559	add		v0.2d,v0.2d,v26.2d			// accumulate
1560	add		v1.2d,v1.2d,v27.2d
1561	add		v2.2d,v2.2d,v28.2d
1562	add		v3.2d,v3.2d,v29.2d
1563
	// x2 was decremented by the subs at the top of the loop; continue
	// while blocks remain.
1564	cbnz		x2,.Loop_hw
1565
1566	st1		{v0.2d-v3.2d},[x0]		// store context
1567
	// Pop the 16-byte frame record. Only x29 is reloaded: x30 is not
	// written anywhere in this function, so ret uses its entry value.
1568	ldr		x29,[sp],#16
1569	ret
1570.size	zfs_sha512_block_armv8,.-zfs_sha512_block_armv8
1571#endif
1572