xref: /freebsd/sys/crypto/openssl/aarch64/sha512-armv8.S (revision 4757b351ea9d59d71d4a38b82506d2d16fcd560d)
1/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
2// Copyright 2014-2025 The OpenSSL Project Authors. All Rights Reserved.
3//
4// Licensed under the Apache License 2.0 (the "License").  You may not use
5// this file except in compliance with the License.  You can obtain a copy
6// in the file LICENSE in the source distribution or at
7// https://www.openssl.org/source/license.html
8
9// ====================================================================
10// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11// project. The module is, however, dual licensed under OpenSSL and
12// CRYPTOGAMS licenses depending on where you obtain it. For further
13// details see http://www.openssl.org/~appro/cryptogams/.
14//
15// Permission to use under GPLv2 terms is granted.
16// ====================================================================
17//
18// SHA256/512 for ARMv8.
19//
20// Performance in cycles per processed byte and improvement coefficient
21// over code generated with "default" compiler:
22//
23//		SHA256-hw	SHA256(*)	SHA512
24// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
25// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
26// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
27// Denver	2.01		10.5 (+26%)	6.70 (+8%)
28// X-Gene			20.0 (+100%)	12.8 (+300%(***))
29// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
30// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
31// ThunderX2	2.54		13.2 (+40%)	8.40 (+18%)
32//
33// (*)	Software SHA256 results are of lesser relevance, presented
34//	mostly for informational purposes.
35// (**)	The result is a trade-off: it's possible to improve it by
36//	10% (or by 1 cycle per round), but at the cost of 20% loss
37//	on Cortex-A53 (or by 4 cycles per round).
38// (***)	Super-impressive coefficients over gcc-generated code are
39//	an indication of some compiler "pathology", most notably code
40//	generated with -mgeneral-regs-only is significantly faster
41//	and the gap is only 40-90%.
42//
43// October 2016.
44//
45// Originally it was reckoned that it makes no sense to implement NEON
46// version of SHA256 for 64-bit processors. This is because performance
47// improvement on most wide-spread Cortex-A5x processors was observed
48// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49// observed that 32-bit NEON SHA256 performs significantly better than
50// 64-bit scalar version on *some* of the more recent processors. As a
51// result, a 64-bit NEON version of SHA256 was added to provide the best
52// all-round performance. For example it executes ~30% faster on X-Gene
53// and Mongoose. [For reference, NEON version of SHA512 is bound to
54// deliver much less improvement, likely *negative* on Cortex-A5x.
55// Which is why NEON support is limited to SHA256.]
56
57// $output is the last argument if it looks like a file (it has an extension)
58// $flavour is the first argument if it doesn't look like a file
59#include "arm_arch.h"
60#ifndef	__KERNEL__
61
62.hidden	OPENSSL_armcap_P
63#endif
64
65.text
66
// sha512_block_data_order -- SHA-512 block function written with
// general-purpose registers only.
//
// Register usage visible in this routine:
//   x0      = pointer to the eight 64-bit hash state words (read with
//             ldp below, written back with stp after every block)
//   x1      = input pointer
//   x2      = block count on entry; turned into the end-of-input pointer
//             x1 + (x2 << 7), i.e. 128 bytes per block
//   x20-x27 = working variables a..h (the roles rotate from round to round)
//   x30     = pointer into the .LK512 round-constant table
//
// Stack frame (128 bytes, x29 = frame base):
//   [x29,#0]       x29, x30
//   [x29,#16..#95] x19-x28
//   [x29,#96]      x0 (state pointer)
//   [x29,#104]     end-of-input pointer
//   [x29,#112]     input pointer (stored at the top of .Loop)
// A further 32 bytes below the frame ([sp,#0..#24]) hold spilled X[] words.
67.globl	sha512_block_data_order
68.type	sha512_block_data_order,%function
69.align	6
70sha512_block_data_order:
71	AARCH64_VALID_CALL_TARGET
72#ifndef	__KERNEL__
	// Test the ARMV8_SHA512 bit of OPENSSL_armcap_P and, when it is set,
	// branch to .Lv8_entry (defined outside this part of the file).
	// x16 is only a scratch register here.
73	adrp	x16,OPENSSL_armcap_P
74	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
75	tst	w16,#ARMV8_SHA512
76	b.ne	.Lv8_entry
77#endif
	// NOTE(review): AARCH64_SIGN_LINK_REGISTER comes from arm_arch.h;
	// presumably it signs x30 when pointer authentication is enabled --
	// confirm against the header.
78	AARCH64_SIGN_LINK_REGISTER
79	stp	x29,x30,[sp,#-128]!
80	add	x29,sp,#0
81
	// Save x19-x28, all of which are overwritten below.
82	stp	x19,x20,[sp,#16]
83	stp	x21,x22,[sp,#32]
84	stp	x23,x24,[sp,#48]
85	stp	x25,x26,[sp,#64]
86	stp	x27,x28,[sp,#80]
87	sub	sp,sp,#4*8			// four 8-byte spill slots for X[]
88
89	ldp	x20,x21,[x0]				// load context
90	ldp	x22,x23,[x0,#2*8]
91	ldp	x24,x25,[x0,#4*8]
92	add	x2,x1,x2,lsl#7	// end of input
93	ldp	x26,x27,[x0,#6*8]
	// x30 = address of .LK512 (page address plus low 12 bits).
94	adrp	x30,.LK512
95	add	x30,x30,#:lo12:.LK512
	// x0 and x2 are reused for message words later, so keep copies.
96	stp	x0,x2,[x29,#96]
97
// .Loop -- entered once per 128-byte input block; this part performs
// rounds 0..15.
//
// The sixteen message words are loaded from [x1] in pairs and, unless
// __AARCH64EB__ is defined, byte-swapped with rev.  They end up in
//   X[0..12] = x3..x15,  X[13] = x0,  X[14] = x1,  X[15] = x2
// so x0, x1 and x2 stop being pointers in here; their values are kept at
// [x29,#96], [x29,#112] and [x29,#104] respectively.
//
// Each round works on the renamed a..h (x20..x27 shift one place per round):
//   h += K[i] + X[i] + Ch(e,f,g) + Sigma1(e);  d += h;
//   h += Maj(a,b,c) + Sigma0(a)
// For rounds 0..14:
//   Sigma1(e) = ror(e,14) ^ ror(e ^ ror(e,23), 18)
//   Sigma0(a) = ror(a,28) ^ ror(a ^ ror(a,5), 34)
// Round 15 uses the plain three-rotation forms instead.
//   Maj(a,b,c) = ((b^c) & (a^b)) ^ b, where a^b of one round is reused as
//   b^c of the next; x19 and x28 alternate between that term and the next
//   K value.
// x16 and x17 are scratch.  In rounds 0..14 the add of Sigma0(a) is placed
// a few instructions after its commented-out copy, behind the next word's
// rev/ldp.
//
// Until round 10 the extra temporary is an X register that has not been
// loaded yet.  From round 11 on, one X[] word per round is stored to the
// slots at [sp,#0..#24] so its register can serve as the temporary.
98.Loop:
99	ldp	x3,x4,[x1],#2*8
100	ldr	x19,[x30],#8			// *K++
101	eor	x28,x21,x22				// magic seed
102	str	x1,[x29,#112]			// keep input pointer (already advanced by 16)
103#ifndef	__AARCH64EB__
104	rev	x3,x3			// 0
105#endif
106	ror	x16,x24,#14
107	add	x27,x27,x19			// h+=K[i]
108	eor	x6,x24,x24,ror#23
109	and	x17,x25,x24
110	bic	x19,x26,x24
111	add	x27,x27,x3			// h+=X[i]
112	orr	x17,x17,x19			// Ch(e,f,g)
113	eor	x19,x20,x21			// a^b, b^c in next round
114	eor	x16,x16,x6,ror#18	// Sigma1(e)
115	ror	x6,x20,#28
116	add	x27,x27,x17			// h+=Ch(e,f,g)
117	eor	x17,x20,x20,ror#5
118	add	x27,x27,x16			// h+=Sigma1(e)
119	and	x28,x28,x19			// (b^c)&=(a^b)
120	add	x23,x23,x27			// d+=h
121	eor	x28,x28,x21			// Maj(a,b,c)
122	eor	x17,x6,x17,ror#34	// Sigma0(a)
123	add	x27,x27,x28			// h+=Maj(a,b,c)
124	ldr	x28,[x30],#8		// *K++, x19 in next round
125	//add	x27,x27,x17			// h+=Sigma0(a)
126#ifndef	__AARCH64EB__
127	rev	x4,x4			// 1
128#endif
129	ldp	x5,x6,[x1],#2*8
130	add	x27,x27,x17			// h+=Sigma0(a)
131	ror	x16,x23,#14
132	add	x26,x26,x28			// h+=K[i]
133	eor	x7,x23,x23,ror#23
134	and	x17,x24,x23
135	bic	x28,x25,x23
136	add	x26,x26,x4			// h+=X[i]
137	orr	x17,x17,x28			// Ch(e,f,g)
138	eor	x28,x27,x20			// a^b, b^c in next round
139	eor	x16,x16,x7,ror#18	// Sigma1(e)
140	ror	x7,x27,#28
141	add	x26,x26,x17			// h+=Ch(e,f,g)
142	eor	x17,x27,x27,ror#5
143	add	x26,x26,x16			// h+=Sigma1(e)
144	and	x19,x19,x28			// (b^c)&=(a^b)
145	add	x22,x22,x26			// d+=h
146	eor	x19,x19,x20			// Maj(a,b,c)
147	eor	x17,x7,x17,ror#34	// Sigma0(a)
148	add	x26,x26,x19			// h+=Maj(a,b,c)
149	ldr	x19,[x30],#8		// *K++, x28 in next round
150	//add	x26,x26,x17			// h+=Sigma0(a)
151#ifndef	__AARCH64EB__
152	rev	x5,x5			// 2
153#endif
154	add	x26,x26,x17			// h+=Sigma0(a)
155	ror	x16,x22,#14
156	add	x25,x25,x19			// h+=K[i]
157	eor	x8,x22,x22,ror#23
158	and	x17,x23,x22
159	bic	x19,x24,x22
160	add	x25,x25,x5			// h+=X[i]
161	orr	x17,x17,x19			// Ch(e,f,g)
162	eor	x19,x26,x27			// a^b, b^c in next round
163	eor	x16,x16,x8,ror#18	// Sigma1(e)
164	ror	x8,x26,#28
165	add	x25,x25,x17			// h+=Ch(e,f,g)
166	eor	x17,x26,x26,ror#5
167	add	x25,x25,x16			// h+=Sigma1(e)
168	and	x28,x28,x19			// (b^c)&=(a^b)
169	add	x21,x21,x25			// d+=h
170	eor	x28,x28,x27			// Maj(a,b,c)
171	eor	x17,x8,x17,ror#34	// Sigma0(a)
172	add	x25,x25,x28			// h+=Maj(a,b,c)
173	ldr	x28,[x30],#8		// *K++, x19 in next round
174	//add	x25,x25,x17			// h+=Sigma0(a)
175#ifndef	__AARCH64EB__
176	rev	x6,x6			// 3
177#endif
178	ldp	x7,x8,[x1],#2*8
179	add	x25,x25,x17			// h+=Sigma0(a)
180	ror	x16,x21,#14
181	add	x24,x24,x28			// h+=K[i]
182	eor	x9,x21,x21,ror#23
183	and	x17,x22,x21
184	bic	x28,x23,x21
185	add	x24,x24,x6			// h+=X[i]
186	orr	x17,x17,x28			// Ch(e,f,g)
187	eor	x28,x25,x26			// a^b, b^c in next round
188	eor	x16,x16,x9,ror#18	// Sigma1(e)
189	ror	x9,x25,#28
190	add	x24,x24,x17			// h+=Ch(e,f,g)
191	eor	x17,x25,x25,ror#5
192	add	x24,x24,x16			// h+=Sigma1(e)
193	and	x19,x19,x28			// (b^c)&=(a^b)
194	add	x20,x20,x24			// d+=h
195	eor	x19,x19,x26			// Maj(a,b,c)
196	eor	x17,x9,x17,ror#34	// Sigma0(a)
197	add	x24,x24,x19			// h+=Maj(a,b,c)
198	ldr	x19,[x30],#8		// *K++, x28 in next round
199	//add	x24,x24,x17			// h+=Sigma0(a)
200#ifndef	__AARCH64EB__
201	rev	x7,x7			// 4
202#endif
203	add	x24,x24,x17			// h+=Sigma0(a)
204	ror	x16,x20,#14
205	add	x23,x23,x19			// h+=K[i]
206	eor	x10,x20,x20,ror#23
207	and	x17,x21,x20
208	bic	x19,x22,x20
209	add	x23,x23,x7			// h+=X[i]
210	orr	x17,x17,x19			// Ch(e,f,g)
211	eor	x19,x24,x25			// a^b, b^c in next round
212	eor	x16,x16,x10,ror#18	// Sigma1(e)
213	ror	x10,x24,#28
214	add	x23,x23,x17			// h+=Ch(e,f,g)
215	eor	x17,x24,x24,ror#5
216	add	x23,x23,x16			// h+=Sigma1(e)
217	and	x28,x28,x19			// (b^c)&=(a^b)
218	add	x27,x27,x23			// d+=h
219	eor	x28,x28,x25			// Maj(a,b,c)
220	eor	x17,x10,x17,ror#34	// Sigma0(a)
221	add	x23,x23,x28			// h+=Maj(a,b,c)
222	ldr	x28,[x30],#8		// *K++, x19 in next round
223	//add	x23,x23,x17			// h+=Sigma0(a)
224#ifndef	__AARCH64EB__
225	rev	x8,x8			// 5
226#endif
227	ldp	x9,x10,[x1],#2*8
228	add	x23,x23,x17			// h+=Sigma0(a)
229	ror	x16,x27,#14
230	add	x22,x22,x28			// h+=K[i]
231	eor	x11,x27,x27,ror#23
232	and	x17,x20,x27
233	bic	x28,x21,x27
234	add	x22,x22,x8			// h+=X[i]
235	orr	x17,x17,x28			// Ch(e,f,g)
236	eor	x28,x23,x24			// a^b, b^c in next round
237	eor	x16,x16,x11,ror#18	// Sigma1(e)
238	ror	x11,x23,#28
239	add	x22,x22,x17			// h+=Ch(e,f,g)
240	eor	x17,x23,x23,ror#5
241	add	x22,x22,x16			// h+=Sigma1(e)
242	and	x19,x19,x28			// (b^c)&=(a^b)
243	add	x26,x26,x22			// d+=h
244	eor	x19,x19,x24			// Maj(a,b,c)
245	eor	x17,x11,x17,ror#34	// Sigma0(a)
246	add	x22,x22,x19			// h+=Maj(a,b,c)
247	ldr	x19,[x30],#8		// *K++, x28 in next round
248	//add	x22,x22,x17			// h+=Sigma0(a)
249#ifndef	__AARCH64EB__
250	rev	x9,x9			// 6
251#endif
252	add	x22,x22,x17			// h+=Sigma0(a)
253	ror	x16,x26,#14
254	add	x21,x21,x19			// h+=K[i]
255	eor	x12,x26,x26,ror#23
256	and	x17,x27,x26
257	bic	x19,x20,x26
258	add	x21,x21,x9			// h+=X[i]
259	orr	x17,x17,x19			// Ch(e,f,g)
260	eor	x19,x22,x23			// a^b, b^c in next round
261	eor	x16,x16,x12,ror#18	// Sigma1(e)
262	ror	x12,x22,#28
263	add	x21,x21,x17			// h+=Ch(e,f,g)
264	eor	x17,x22,x22,ror#5
265	add	x21,x21,x16			// h+=Sigma1(e)
266	and	x28,x28,x19			// (b^c)&=(a^b)
267	add	x25,x25,x21			// d+=h
268	eor	x28,x28,x23			// Maj(a,b,c)
269	eor	x17,x12,x17,ror#34	// Sigma0(a)
270	add	x21,x21,x28			// h+=Maj(a,b,c)
271	ldr	x28,[x30],#8		// *K++, x19 in next round
272	//add	x21,x21,x17			// h+=Sigma0(a)
273#ifndef	__AARCH64EB__
274	rev	x10,x10			// 7
275#endif
276	ldp	x11,x12,[x1],#2*8
277	add	x21,x21,x17			// h+=Sigma0(a)
278	ror	x16,x25,#14
279	add	x20,x20,x28			// h+=K[i]
280	eor	x13,x25,x25,ror#23
281	and	x17,x26,x25
282	bic	x28,x27,x25
283	add	x20,x20,x10			// h+=X[i]
284	orr	x17,x17,x28			// Ch(e,f,g)
285	eor	x28,x21,x22			// a^b, b^c in next round
286	eor	x16,x16,x13,ror#18	// Sigma1(e)
287	ror	x13,x21,#28
288	add	x20,x20,x17			// h+=Ch(e,f,g)
289	eor	x17,x21,x21,ror#5
290	add	x20,x20,x16			// h+=Sigma1(e)
291	and	x19,x19,x28			// (b^c)&=(a^b)
292	add	x24,x24,x20			// d+=h
293	eor	x19,x19,x22			// Maj(a,b,c)
294	eor	x17,x13,x17,ror#34	// Sigma0(a)
295	add	x20,x20,x19			// h+=Maj(a,b,c)
296	ldr	x19,[x30],#8		// *K++, x28 in next round
297	//add	x20,x20,x17			// h+=Sigma0(a)
298#ifndef	__AARCH64EB__
299	rev	x11,x11			// 8
300#endif
301	add	x20,x20,x17			// h+=Sigma0(a)
302	ror	x16,x24,#14
303	add	x27,x27,x19			// h+=K[i]
304	eor	x14,x24,x24,ror#23
305	and	x17,x25,x24
306	bic	x19,x26,x24
307	add	x27,x27,x11			// h+=X[i]
308	orr	x17,x17,x19			// Ch(e,f,g)
309	eor	x19,x20,x21			// a^b, b^c in next round
310	eor	x16,x16,x14,ror#18	// Sigma1(e)
311	ror	x14,x20,#28
312	add	x27,x27,x17			// h+=Ch(e,f,g)
313	eor	x17,x20,x20,ror#5
314	add	x27,x27,x16			// h+=Sigma1(e)
315	and	x28,x28,x19			// (b^c)&=(a^b)
316	add	x23,x23,x27			// d+=h
317	eor	x28,x28,x21			// Maj(a,b,c)
318	eor	x17,x14,x17,ror#34	// Sigma0(a)
319	add	x27,x27,x28			// h+=Maj(a,b,c)
320	ldr	x28,[x30],#8		// *K++, x19 in next round
321	//add	x27,x27,x17			// h+=Sigma0(a)
322#ifndef	__AARCH64EB__
323	rev	x12,x12			// 9
324#endif
325	ldp	x13,x14,[x1],#2*8
326	add	x27,x27,x17			// h+=Sigma0(a)
327	ror	x16,x23,#14
328	add	x26,x26,x28			// h+=K[i]
329	eor	x15,x23,x23,ror#23
330	and	x17,x24,x23
331	bic	x28,x25,x23
332	add	x26,x26,x12			// h+=X[i]
333	orr	x17,x17,x28			// Ch(e,f,g)
334	eor	x28,x27,x20			// a^b, b^c in next round
335	eor	x16,x16,x15,ror#18	// Sigma1(e)
336	ror	x15,x27,#28
337	add	x26,x26,x17			// h+=Ch(e,f,g)
338	eor	x17,x27,x27,ror#5
339	add	x26,x26,x16			// h+=Sigma1(e)
340	and	x19,x19,x28			// (b^c)&=(a^b)
341	add	x22,x22,x26			// d+=h
342	eor	x19,x19,x20			// Maj(a,b,c)
343	eor	x17,x15,x17,ror#34	// Sigma0(a)
344	add	x26,x26,x19			// h+=Maj(a,b,c)
345	ldr	x19,[x30],#8		// *K++, x28 in next round
346	//add	x26,x26,x17			// h+=Sigma0(a)
347#ifndef	__AARCH64EB__
348	rev	x13,x13			// 10
349#endif
350	add	x26,x26,x17			// h+=Sigma0(a)
351	ror	x16,x22,#14
352	add	x25,x25,x19			// h+=K[i]
353	eor	x0,x22,x22,ror#23
354	and	x17,x23,x22
355	bic	x19,x24,x22
356	add	x25,x25,x13			// h+=X[i]
357	orr	x17,x17,x19			// Ch(e,f,g)
358	eor	x19,x26,x27			// a^b, b^c in next round
359	eor	x16,x16,x0,ror#18	// Sigma1(e)
360	ror	x0,x26,#28
361	add	x25,x25,x17			// h+=Ch(e,f,g)
362	eor	x17,x26,x26,ror#5
363	add	x25,x25,x16			// h+=Sigma1(e)
364	and	x28,x28,x19			// (b^c)&=(a^b)
365	add	x21,x21,x25			// d+=h
366	eor	x28,x28,x27			// Maj(a,b,c)
367	eor	x17,x0,x17,ror#34	// Sigma0(a)
368	add	x25,x25,x28			// h+=Maj(a,b,c)
369	ldr	x28,[x30],#8		// *K++, x19 in next round
370	//add	x25,x25,x17			// h+=Sigma0(a)
371#ifndef	__AARCH64EB__
372	rev	x14,x14			// 11
373#endif
374	ldp	x15,x0,[x1],#2*8
375	add	x25,x25,x17			// h+=Sigma0(a)
376	str	x6,[sp,#24]			// spill X[3]; x6 is the temporary below
377	ror	x16,x21,#14
378	add	x24,x24,x28			// h+=K[i]
379	eor	x6,x21,x21,ror#23
380	and	x17,x22,x21
381	bic	x28,x23,x21
382	add	x24,x24,x14			// h+=X[i]
383	orr	x17,x17,x28			// Ch(e,f,g)
384	eor	x28,x25,x26			// a^b, b^c in next round
385	eor	x16,x16,x6,ror#18	// Sigma1(e)
386	ror	x6,x25,#28
387	add	x24,x24,x17			// h+=Ch(e,f,g)
388	eor	x17,x25,x25,ror#5
389	add	x24,x24,x16			// h+=Sigma1(e)
390	and	x19,x19,x28			// (b^c)&=(a^b)
391	add	x20,x20,x24			// d+=h
392	eor	x19,x19,x26			// Maj(a,b,c)
393	eor	x17,x6,x17,ror#34	// Sigma0(a)
394	add	x24,x24,x19			// h+=Maj(a,b,c)
395	ldr	x19,[x30],#8		// *K++, x28 in next round
396	//add	x24,x24,x17			// h+=Sigma0(a)
397#ifndef	__AARCH64EB__
398	rev	x15,x15			// 12
399#endif
400	add	x24,x24,x17			// h+=Sigma0(a)
401	str	x7,[sp,#0]			// spill X[4]; x7 is the temporary below
402	ror	x16,x20,#14
403	add	x23,x23,x19			// h+=K[i]
404	eor	x7,x20,x20,ror#23
405	and	x17,x21,x20
406	bic	x19,x22,x20
407	add	x23,x23,x15			// h+=X[i]
408	orr	x17,x17,x19			// Ch(e,f,g)
409	eor	x19,x24,x25			// a^b, b^c in next round
410	eor	x16,x16,x7,ror#18	// Sigma1(e)
411	ror	x7,x24,#28
412	add	x23,x23,x17			// h+=Ch(e,f,g)
413	eor	x17,x24,x24,ror#5
414	add	x23,x23,x16			// h+=Sigma1(e)
415	and	x28,x28,x19			// (b^c)&=(a^b)
416	add	x27,x27,x23			// d+=h
417	eor	x28,x28,x25			// Maj(a,b,c)
418	eor	x17,x7,x17,ror#34	// Sigma0(a)
419	add	x23,x23,x28			// h+=Maj(a,b,c)
420	ldr	x28,[x30],#8		// *K++, x19 in next round
421	//add	x23,x23,x17			// h+=Sigma0(a)
422#ifndef	__AARCH64EB__
423	rev	x0,x0			// 13
424#endif
425	ldp	x1,x2,[x1]			// X[14],X[15]; no writeback, x1 now holds data
426	add	x23,x23,x17			// h+=Sigma0(a)
427	str	x8,[sp,#8]			// spill X[5]; x8 is the temporary below
428	ror	x16,x27,#14
429	add	x22,x22,x28			// h+=K[i]
430	eor	x8,x27,x27,ror#23
431	and	x17,x20,x27
432	bic	x28,x21,x27
433	add	x22,x22,x0			// h+=X[i]
434	orr	x17,x17,x28			// Ch(e,f,g)
435	eor	x28,x23,x24			// a^b, b^c in next round
436	eor	x16,x16,x8,ror#18	// Sigma1(e)
437	ror	x8,x23,#28
438	add	x22,x22,x17			// h+=Ch(e,f,g)
439	eor	x17,x23,x23,ror#5
440	add	x22,x22,x16			// h+=Sigma1(e)
441	and	x19,x19,x28			// (b^c)&=(a^b)
442	add	x26,x26,x22			// d+=h
443	eor	x19,x19,x24			// Maj(a,b,c)
444	eor	x17,x8,x17,ror#34	// Sigma0(a)
445	add	x22,x22,x19			// h+=Maj(a,b,c)
446	ldr	x19,[x30],#8		// *K++, x28 in next round
447	//add	x22,x22,x17			// h+=Sigma0(a)
448#ifndef	__AARCH64EB__
449	rev	x1,x1			// 14
450#endif
451	ldr	x6,[sp,#24]			// reload X[3]
452	add	x22,x22,x17			// h+=Sigma0(a)
453	str	x9,[sp,#16]			// spill X[6]; x9 is the temporary below
454	ror	x16,x26,#14
455	add	x21,x21,x19			// h+=K[i]
456	eor	x9,x26,x26,ror#23
457	and	x17,x27,x26
458	bic	x19,x20,x26
459	add	x21,x21,x1			// h+=X[i]
460	orr	x17,x17,x19			// Ch(e,f,g)
461	eor	x19,x22,x23			// a^b, b^c in next round
462	eor	x16,x16,x9,ror#18	// Sigma1(e)
463	ror	x9,x22,#28
464	add	x21,x21,x17			// h+=Ch(e,f,g)
465	eor	x17,x22,x22,ror#5
466	add	x21,x21,x16			// h+=Sigma1(e)
467	and	x28,x28,x19			// (b^c)&=(a^b)
468	add	x25,x25,x21			// d+=h
469	eor	x28,x28,x23			// Maj(a,b,c)
470	eor	x17,x9,x17,ror#34	// Sigma0(a)
471	add	x21,x21,x28			// h+=Maj(a,b,c)
472	ldr	x28,[x30],#8		// *K++, x19 in next round
473	//add	x21,x21,x17			// h+=Sigma0(a)
	// Round 15 additionally forms the word for round 16 in x3:
	//   x3 = X[0] + sigma0(X[1]) + sigma1(X[14]) + X[9]
	// using x9 for sigma0, x8 for sigma1 and x10 for Sigma0(a).
474#ifndef	__AARCH64EB__
475	rev	x2,x2			// 15
476#endif
477	ldr	x7,[sp,#0]			// reload X[4]
478	add	x21,x21,x17			// h+=Sigma0(a)
479	str	x10,[sp,#24]			// spill X[7]
480	ror	x16,x25,#14
481	add	x20,x20,x28			// h+=K[i]
482	ror	x9,x4,#1
483	and	x17,x26,x25
484	ror	x8,x1,#19
485	bic	x28,x27,x25
486	ror	x10,x21,#28
487	add	x20,x20,x2			// h+=X[i]
488	eor	x16,x16,x25,ror#18
489	eor	x9,x9,x4,ror#8
490	orr	x17,x17,x28			// Ch(e,f,g)
491	eor	x28,x21,x22			// a^b, b^c in next round
492	eor	x16,x16,x25,ror#41	// Sigma1(e)
493	eor	x10,x10,x21,ror#34
494	add	x20,x20,x17			// h+=Ch(e,f,g)
495	and	x19,x19,x28			// (b^c)&=(a^b)
496	eor	x8,x8,x1,ror#61
497	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
498	add	x20,x20,x16			// h+=Sigma1(e)
499	eor	x19,x19,x22			// Maj(a,b,c)
500	eor	x17,x10,x21,ror#39	// Sigma0(a)
501	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
502	add	x3,x3,x12
503	add	x24,x24,x20			// d+=h
504	add	x20,x20,x19			// h+=Maj(a,b,c)
505	ldr	x19,[x30],#8		// *K++, x28 in next round
506	add	x3,x3,x9
507	add	x20,x20,x17			// h+=Sigma0(a)
508	add	x3,x3,x8
// .Loop_16_xx -- the remaining rounds, sixteen per iteration, followed by
// the state update and the function epilogue.
//
// Besides the round function (same a..h renaming as in .Loop), every round
// prepares the message word consumed by the following round:
//   X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]     (indices mod 16)
//   sigma0(x) = ror(x,1)  ^ ror(x,8)  ^ (x >> 7)
//   sigma1(x) = ror(x,19) ^ ror(x,61) ^ (x >> 6)
// Sigma1(e) and Sigma0(a) use the plain three-rotation forms here
// (14/18/41 and 28/34/39).
//
// Three X registers per round are borrowed as temporaries, so a sliding
// window of X[] words lives in the slots at [sp,#0..#24]: each round
// starts by reloading one word and spilling another.
//
// The loop repeats while the K value fetched for the next round (x19 at
// the bottom) is non-zero.
// NOTE(review): this relies on a zero word following the round constants
// in .LK512; the end of that table is outside this view -- confirm.
509.Loop_16_xx:
510	ldr	x8,[sp,#8]
511	str	x11,[sp,#0]
512	ror	x16,x24,#14
513	add	x27,x27,x19			// h+=K[i]
514	ror	x10,x5,#1
515	and	x17,x25,x24
516	ror	x9,x2,#19
517	bic	x19,x26,x24
518	ror	x11,x20,#28
519	add	x27,x27,x3			// h+=X[i]
520	eor	x16,x16,x24,ror#18
521	eor	x10,x10,x5,ror#8
522	orr	x17,x17,x19			// Ch(e,f,g)
523	eor	x19,x20,x21			// a^b, b^c in next round
524	eor	x16,x16,x24,ror#41	// Sigma1(e)
525	eor	x11,x11,x20,ror#34
526	add	x27,x27,x17			// h+=Ch(e,f,g)
527	and	x28,x28,x19			// (b^c)&=(a^b)
528	eor	x9,x9,x2,ror#61
529	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
530	add	x27,x27,x16			// h+=Sigma1(e)
531	eor	x28,x28,x21			// Maj(a,b,c)
532	eor	x17,x11,x20,ror#39	// Sigma0(a)
533	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
534	add	x4,x4,x13
535	add	x23,x23,x27			// d+=h
536	add	x27,x27,x28			// h+=Maj(a,b,c)
537	ldr	x28,[x30],#8		// *K++, x19 in next round
538	add	x4,x4,x10
539	add	x27,x27,x17			// h+=Sigma0(a)
540	add	x4,x4,x9
541	ldr	x9,[sp,#16]
542	str	x12,[sp,#8]
543	ror	x16,x23,#14
544	add	x26,x26,x28			// h+=K[i]
545	ror	x11,x6,#1
546	and	x17,x24,x23
547	ror	x10,x3,#19
548	bic	x28,x25,x23
549	ror	x12,x27,#28
550	add	x26,x26,x4			// h+=X[i]
551	eor	x16,x16,x23,ror#18
552	eor	x11,x11,x6,ror#8
553	orr	x17,x17,x28			// Ch(e,f,g)
554	eor	x28,x27,x20			// a^b, b^c in next round
555	eor	x16,x16,x23,ror#41	// Sigma1(e)
556	eor	x12,x12,x27,ror#34
557	add	x26,x26,x17			// h+=Ch(e,f,g)
558	and	x19,x19,x28			// (b^c)&=(a^b)
559	eor	x10,x10,x3,ror#61
560	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
561	add	x26,x26,x16			// h+=Sigma1(e)
562	eor	x19,x19,x20			// Maj(a,b,c)
563	eor	x17,x12,x27,ror#39	// Sigma0(a)
564	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
565	add	x5,x5,x14
566	add	x22,x22,x26			// d+=h
567	add	x26,x26,x19			// h+=Maj(a,b,c)
568	ldr	x19,[x30],#8		// *K++, x28 in next round
569	add	x5,x5,x11
570	add	x26,x26,x17			// h+=Sigma0(a)
571	add	x5,x5,x10
572	ldr	x10,[sp,#24]
573	str	x13,[sp,#16]
574	ror	x16,x22,#14
575	add	x25,x25,x19			// h+=K[i]
576	ror	x12,x7,#1
577	and	x17,x23,x22
578	ror	x11,x4,#19
579	bic	x19,x24,x22
580	ror	x13,x26,#28
581	add	x25,x25,x5			// h+=X[i]
582	eor	x16,x16,x22,ror#18
583	eor	x12,x12,x7,ror#8
584	orr	x17,x17,x19			// Ch(e,f,g)
585	eor	x19,x26,x27			// a^b, b^c in next round
586	eor	x16,x16,x22,ror#41	// Sigma1(e)
587	eor	x13,x13,x26,ror#34
588	add	x25,x25,x17			// h+=Ch(e,f,g)
589	and	x28,x28,x19			// (b^c)&=(a^b)
590	eor	x11,x11,x4,ror#61
591	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
592	add	x25,x25,x16			// h+=Sigma1(e)
593	eor	x28,x28,x27			// Maj(a,b,c)
594	eor	x17,x13,x26,ror#39	// Sigma0(a)
595	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
596	add	x6,x6,x15
597	add	x21,x21,x25			// d+=h
598	add	x25,x25,x28			// h+=Maj(a,b,c)
599	ldr	x28,[x30],#8		// *K++, x19 in next round
600	add	x6,x6,x12
601	add	x25,x25,x17			// h+=Sigma0(a)
602	add	x6,x6,x11
603	ldr	x11,[sp,#0]
604	str	x14,[sp,#24]
605	ror	x16,x21,#14
606	add	x24,x24,x28			// h+=K[i]
607	ror	x13,x8,#1
608	and	x17,x22,x21
609	ror	x12,x5,#19
610	bic	x28,x23,x21
611	ror	x14,x25,#28
612	add	x24,x24,x6			// h+=X[i]
613	eor	x16,x16,x21,ror#18
614	eor	x13,x13,x8,ror#8
615	orr	x17,x17,x28			// Ch(e,f,g)
616	eor	x28,x25,x26			// a^b, b^c in next round
617	eor	x16,x16,x21,ror#41	// Sigma1(e)
618	eor	x14,x14,x25,ror#34
619	add	x24,x24,x17			// h+=Ch(e,f,g)
620	and	x19,x19,x28			// (b^c)&=(a^b)
621	eor	x12,x12,x5,ror#61
622	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
623	add	x24,x24,x16			// h+=Sigma1(e)
624	eor	x19,x19,x26			// Maj(a,b,c)
625	eor	x17,x14,x25,ror#39	// Sigma0(a)
626	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
627	add	x7,x7,x0
628	add	x20,x20,x24			// d+=h
629	add	x24,x24,x19			// h+=Maj(a,b,c)
630	ldr	x19,[x30],#8		// *K++, x28 in next round
631	add	x7,x7,x13
632	add	x24,x24,x17			// h+=Sigma0(a)
633	add	x7,x7,x12
634	ldr	x12,[sp,#8]
635	str	x15,[sp,#0]
636	ror	x16,x20,#14
637	add	x23,x23,x19			// h+=K[i]
638	ror	x14,x9,#1
639	and	x17,x21,x20
640	ror	x13,x6,#19
641	bic	x19,x22,x20
642	ror	x15,x24,#28
643	add	x23,x23,x7			// h+=X[i]
644	eor	x16,x16,x20,ror#18
645	eor	x14,x14,x9,ror#8
646	orr	x17,x17,x19			// Ch(e,f,g)
647	eor	x19,x24,x25			// a^b, b^c in next round
648	eor	x16,x16,x20,ror#41	// Sigma1(e)
649	eor	x15,x15,x24,ror#34
650	add	x23,x23,x17			// h+=Ch(e,f,g)
651	and	x28,x28,x19			// (b^c)&=(a^b)
652	eor	x13,x13,x6,ror#61
653	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
654	add	x23,x23,x16			// h+=Sigma1(e)
655	eor	x28,x28,x25			// Maj(a,b,c)
656	eor	x17,x15,x24,ror#39	// Sigma0(a)
657	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
658	add	x8,x8,x1
659	add	x27,x27,x23			// d+=h
660	add	x23,x23,x28			// h+=Maj(a,b,c)
661	ldr	x28,[x30],#8		// *K++, x19 in next round
662	add	x8,x8,x14
663	add	x23,x23,x17			// h+=Sigma0(a)
664	add	x8,x8,x13
665	ldr	x13,[sp,#16]
666	str	x0,[sp,#8]
667	ror	x16,x27,#14
668	add	x22,x22,x28			// h+=K[i]
669	ror	x15,x10,#1
670	and	x17,x20,x27
671	ror	x14,x7,#19
672	bic	x28,x21,x27
673	ror	x0,x23,#28
674	add	x22,x22,x8			// h+=X[i]
675	eor	x16,x16,x27,ror#18
676	eor	x15,x15,x10,ror#8
677	orr	x17,x17,x28			// Ch(e,f,g)
678	eor	x28,x23,x24			// a^b, b^c in next round
679	eor	x16,x16,x27,ror#41	// Sigma1(e)
680	eor	x0,x0,x23,ror#34
681	add	x22,x22,x17			// h+=Ch(e,f,g)
682	and	x19,x19,x28			// (b^c)&=(a^b)
683	eor	x14,x14,x7,ror#61
684	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
685	add	x22,x22,x16			// h+=Sigma1(e)
686	eor	x19,x19,x24			// Maj(a,b,c)
687	eor	x17,x0,x23,ror#39	// Sigma0(a)
688	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
689	add	x9,x9,x2
690	add	x26,x26,x22			// d+=h
691	add	x22,x22,x19			// h+=Maj(a,b,c)
692	ldr	x19,[x30],#8		// *K++, x28 in next round
693	add	x9,x9,x15
694	add	x22,x22,x17			// h+=Sigma0(a)
695	add	x9,x9,x14
696	ldr	x14,[sp,#24]
697	str	x1,[sp,#16]
698	ror	x16,x26,#14
699	add	x21,x21,x19			// h+=K[i]
700	ror	x0,x11,#1
701	and	x17,x27,x26
702	ror	x15,x8,#19
703	bic	x19,x20,x26
704	ror	x1,x22,#28
705	add	x21,x21,x9			// h+=X[i]
706	eor	x16,x16,x26,ror#18
707	eor	x0,x0,x11,ror#8
708	orr	x17,x17,x19			// Ch(e,f,g)
709	eor	x19,x22,x23			// a^b, b^c in next round
710	eor	x16,x16,x26,ror#41	// Sigma1(e)
711	eor	x1,x1,x22,ror#34
712	add	x21,x21,x17			// h+=Ch(e,f,g)
713	and	x28,x28,x19			// (b^c)&=(a^b)
714	eor	x15,x15,x8,ror#61
715	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
716	add	x21,x21,x16			// h+=Sigma1(e)
717	eor	x28,x28,x23			// Maj(a,b,c)
718	eor	x17,x1,x22,ror#39	// Sigma0(a)
719	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
720	add	x10,x10,x3
721	add	x25,x25,x21			// d+=h
722	add	x21,x21,x28			// h+=Maj(a,b,c)
723	ldr	x28,[x30],#8		// *K++, x19 in next round
724	add	x10,x10,x0
725	add	x21,x21,x17			// h+=Sigma0(a)
726	add	x10,x10,x15
727	ldr	x15,[sp,#0]
728	str	x2,[sp,#24]
729	ror	x16,x25,#14
730	add	x20,x20,x28			// h+=K[i]
731	ror	x1,x12,#1
732	and	x17,x26,x25
733	ror	x0,x9,#19
734	bic	x28,x27,x25
735	ror	x2,x21,#28
736	add	x20,x20,x10			// h+=X[i]
737	eor	x16,x16,x25,ror#18
738	eor	x1,x1,x12,ror#8
739	orr	x17,x17,x28			// Ch(e,f,g)
740	eor	x28,x21,x22			// a^b, b^c in next round
741	eor	x16,x16,x25,ror#41	// Sigma1(e)
742	eor	x2,x2,x21,ror#34
743	add	x20,x20,x17			// h+=Ch(e,f,g)
744	and	x19,x19,x28			// (b^c)&=(a^b)
745	eor	x0,x0,x9,ror#61
746	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
747	add	x20,x20,x16			// h+=Sigma1(e)
748	eor	x19,x19,x22			// Maj(a,b,c)
749	eor	x17,x2,x21,ror#39	// Sigma0(a)
750	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
751	add	x11,x11,x4
752	add	x24,x24,x20			// d+=h
753	add	x20,x20,x19			// h+=Maj(a,b,c)
754	ldr	x19,[x30],#8		// *K++, x28 in next round
755	add	x11,x11,x1
756	add	x20,x20,x17			// h+=Sigma0(a)
757	add	x11,x11,x0
758	ldr	x0,[sp,#8]
759	str	x3,[sp,#0]
760	ror	x16,x24,#14
761	add	x27,x27,x19			// h+=K[i]
762	ror	x2,x13,#1
763	and	x17,x25,x24
764	ror	x1,x10,#19
765	bic	x19,x26,x24
766	ror	x3,x20,#28
767	add	x27,x27,x11			// h+=X[i]
768	eor	x16,x16,x24,ror#18
769	eor	x2,x2,x13,ror#8
770	orr	x17,x17,x19			// Ch(e,f,g)
771	eor	x19,x20,x21			// a^b, b^c in next round
772	eor	x16,x16,x24,ror#41	// Sigma1(e)
773	eor	x3,x3,x20,ror#34
774	add	x27,x27,x17			// h+=Ch(e,f,g)
775	and	x28,x28,x19			// (b^c)&=(a^b)
776	eor	x1,x1,x10,ror#61
777	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
778	add	x27,x27,x16			// h+=Sigma1(e)
779	eor	x28,x28,x21			// Maj(a,b,c)
780	eor	x17,x3,x20,ror#39	// Sigma0(a)
781	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
782	add	x12,x12,x5
783	add	x23,x23,x27			// d+=h
784	add	x27,x27,x28			// h+=Maj(a,b,c)
785	ldr	x28,[x30],#8		// *K++, x19 in next round
786	add	x12,x12,x2
787	add	x27,x27,x17			// h+=Sigma0(a)
788	add	x12,x12,x1
789	ldr	x1,[sp,#16]
790	str	x4,[sp,#8]
791	ror	x16,x23,#14
792	add	x26,x26,x28			// h+=K[i]
793	ror	x3,x14,#1
794	and	x17,x24,x23
795	ror	x2,x11,#19
796	bic	x28,x25,x23
797	ror	x4,x27,#28
798	add	x26,x26,x12			// h+=X[i]
799	eor	x16,x16,x23,ror#18
800	eor	x3,x3,x14,ror#8
801	orr	x17,x17,x28			// Ch(e,f,g)
802	eor	x28,x27,x20			// a^b, b^c in next round
803	eor	x16,x16,x23,ror#41	// Sigma1(e)
804	eor	x4,x4,x27,ror#34
805	add	x26,x26,x17			// h+=Ch(e,f,g)
806	and	x19,x19,x28			// (b^c)&=(a^b)
807	eor	x2,x2,x11,ror#61
808	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
809	add	x26,x26,x16			// h+=Sigma1(e)
810	eor	x19,x19,x20			// Maj(a,b,c)
811	eor	x17,x4,x27,ror#39	// Sigma0(a)
812	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
813	add	x13,x13,x6
814	add	x22,x22,x26			// d+=h
815	add	x26,x26,x19			// h+=Maj(a,b,c)
816	ldr	x19,[x30],#8		// *K++, x28 in next round
817	add	x13,x13,x3
818	add	x26,x26,x17			// h+=Sigma0(a)
819	add	x13,x13,x2
820	ldr	x2,[sp,#24]
821	str	x5,[sp,#16]
822	ror	x16,x22,#14
823	add	x25,x25,x19			// h+=K[i]
824	ror	x4,x15,#1
825	and	x17,x23,x22
826	ror	x3,x12,#19
827	bic	x19,x24,x22
828	ror	x5,x26,#28
829	add	x25,x25,x13			// h+=X[i]
830	eor	x16,x16,x22,ror#18
831	eor	x4,x4,x15,ror#8
832	orr	x17,x17,x19			// Ch(e,f,g)
833	eor	x19,x26,x27			// a^b, b^c in next round
834	eor	x16,x16,x22,ror#41	// Sigma1(e)
835	eor	x5,x5,x26,ror#34
836	add	x25,x25,x17			// h+=Ch(e,f,g)
837	and	x28,x28,x19			// (b^c)&=(a^b)
838	eor	x3,x3,x12,ror#61
839	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
840	add	x25,x25,x16			// h+=Sigma1(e)
841	eor	x28,x28,x27			// Maj(a,b,c)
842	eor	x17,x5,x26,ror#39	// Sigma0(a)
843	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
844	add	x14,x14,x7
845	add	x21,x21,x25			// d+=h
846	add	x25,x25,x28			// h+=Maj(a,b,c)
847	ldr	x28,[x30],#8		// *K++, x19 in next round
848	add	x14,x14,x4
849	add	x25,x25,x17			// h+=Sigma0(a)
850	add	x14,x14,x3
851	ldr	x3,[sp,#0]
852	str	x6,[sp,#24]
853	ror	x16,x21,#14
854	add	x24,x24,x28			// h+=K[i]
855	ror	x5,x0,#1
856	and	x17,x22,x21
857	ror	x4,x13,#19
858	bic	x28,x23,x21
859	ror	x6,x25,#28
860	add	x24,x24,x14			// h+=X[i]
861	eor	x16,x16,x21,ror#18
862	eor	x5,x5,x0,ror#8
863	orr	x17,x17,x28			// Ch(e,f,g)
864	eor	x28,x25,x26			// a^b, b^c in next round
865	eor	x16,x16,x21,ror#41	// Sigma1(e)
866	eor	x6,x6,x25,ror#34
867	add	x24,x24,x17			// h+=Ch(e,f,g)
868	and	x19,x19,x28			// (b^c)&=(a^b)
869	eor	x4,x4,x13,ror#61
870	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
871	add	x24,x24,x16			// h+=Sigma1(e)
872	eor	x19,x19,x26			// Maj(a,b,c)
873	eor	x17,x6,x25,ror#39	// Sigma0(a)
874	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
875	add	x15,x15,x8
876	add	x20,x20,x24			// d+=h
877	add	x24,x24,x19			// h+=Maj(a,b,c)
878	ldr	x19,[x30],#8		// *K++, x28 in next round
879	add	x15,x15,x5
880	add	x24,x24,x17			// h+=Sigma0(a)
881	add	x15,x15,x4
882	ldr	x4,[sp,#8]
883	str	x7,[sp,#0]
884	ror	x16,x20,#14
885	add	x23,x23,x19			// h+=K[i]
886	ror	x6,x1,#1
887	and	x17,x21,x20
888	ror	x5,x14,#19
889	bic	x19,x22,x20
890	ror	x7,x24,#28
891	add	x23,x23,x15			// h+=X[i]
892	eor	x16,x16,x20,ror#18
893	eor	x6,x6,x1,ror#8
894	orr	x17,x17,x19			// Ch(e,f,g)
895	eor	x19,x24,x25			// a^b, b^c in next round
896	eor	x16,x16,x20,ror#41	// Sigma1(e)
897	eor	x7,x7,x24,ror#34
898	add	x23,x23,x17			// h+=Ch(e,f,g)
899	and	x28,x28,x19			// (b^c)&=(a^b)
900	eor	x5,x5,x14,ror#61
901	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
902	add	x23,x23,x16			// h+=Sigma1(e)
903	eor	x28,x28,x25			// Maj(a,b,c)
904	eor	x17,x7,x24,ror#39	// Sigma0(a)
905	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
906	add	x0,x0,x9
907	add	x27,x27,x23			// d+=h
908	add	x23,x23,x28			// h+=Maj(a,b,c)
909	ldr	x28,[x30],#8		// *K++, x19 in next round
910	add	x0,x0,x6
911	add	x23,x23,x17			// h+=Sigma0(a)
912	add	x0,x0,x5
913	ldr	x5,[sp,#16]
914	str	x8,[sp,#8]
915	ror	x16,x27,#14
916	add	x22,x22,x28			// h+=K[i]
917	ror	x7,x2,#1
918	and	x17,x20,x27
919	ror	x6,x15,#19
920	bic	x28,x21,x27
921	ror	x8,x23,#28
922	add	x22,x22,x0			// h+=X[i]
923	eor	x16,x16,x27,ror#18
924	eor	x7,x7,x2,ror#8
925	orr	x17,x17,x28			// Ch(e,f,g)
926	eor	x28,x23,x24			// a^b, b^c in next round
927	eor	x16,x16,x27,ror#41	// Sigma1(e)
928	eor	x8,x8,x23,ror#34
929	add	x22,x22,x17			// h+=Ch(e,f,g)
930	and	x19,x19,x28			// (b^c)&=(a^b)
931	eor	x6,x6,x15,ror#61
932	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
933	add	x22,x22,x16			// h+=Sigma1(e)
934	eor	x19,x19,x24			// Maj(a,b,c)
935	eor	x17,x8,x23,ror#39	// Sigma0(a)
936	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
937	add	x1,x1,x10
938	add	x26,x26,x22			// d+=h
939	add	x22,x22,x19			// h+=Maj(a,b,c)
940	ldr	x19,[x30],#8		// *K++, x28 in next round
941	add	x1,x1,x7
942	add	x22,x22,x17			// h+=Sigma0(a)
943	add	x1,x1,x6
944	ldr	x6,[sp,#24]
945	str	x9,[sp,#16]
946	ror	x16,x26,#14
947	add	x21,x21,x19			// h+=K[i]
948	ror	x8,x3,#1
949	and	x17,x27,x26
950	ror	x7,x0,#19
951	bic	x19,x20,x26
952	ror	x9,x22,#28
953	add	x21,x21,x1			// h+=X[i]
954	eor	x16,x16,x26,ror#18
955	eor	x8,x8,x3,ror#8
956	orr	x17,x17,x19			// Ch(e,f,g)
957	eor	x19,x22,x23			// a^b, b^c in next round
958	eor	x16,x16,x26,ror#41	// Sigma1(e)
959	eor	x9,x9,x22,ror#34
960	add	x21,x21,x17			// h+=Ch(e,f,g)
961	and	x28,x28,x19			// (b^c)&=(a^b)
962	eor	x7,x7,x0,ror#61
963	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
964	add	x21,x21,x16			// h+=Sigma1(e)
965	eor	x28,x28,x23			// Maj(a,b,c)
966	eor	x17,x9,x22,ror#39	// Sigma0(a)
967	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
968	add	x2,x2,x11
969	add	x25,x25,x21			// d+=h
970	add	x21,x21,x28			// h+=Maj(a,b,c)
971	ldr	x28,[x30],#8		// *K++, x19 in next round
972	add	x2,x2,x8
973	add	x21,x21,x17			// h+=Sigma0(a)
974	add	x2,x2,x7
975	ldr	x7,[sp,#0]
976	str	x10,[sp,#24]
977	ror	x16,x25,#14
978	add	x20,x20,x28			// h+=K[i]
979	ror	x9,x4,#1
980	and	x17,x26,x25
981	ror	x8,x1,#19
982	bic	x28,x27,x25
983	ror	x10,x21,#28
984	add	x20,x20,x2			// h+=X[i]
985	eor	x16,x16,x25,ror#18
986	eor	x9,x9,x4,ror#8
987	orr	x17,x17,x28			// Ch(e,f,g)
988	eor	x28,x21,x22			// a^b, b^c in next round
989	eor	x16,x16,x25,ror#41	// Sigma1(e)
990	eor	x10,x10,x21,ror#34
991	add	x20,x20,x17			// h+=Ch(e,f,g)
992	and	x19,x19,x28			// (b^c)&=(a^b)
993	eor	x8,x8,x1,ror#61
994	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
995	add	x20,x20,x16			// h+=Sigma1(e)
996	eor	x19,x19,x22			// Maj(a,b,c)
997	eor	x17,x10,x21,ror#39	// Sigma0(a)
998	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
999	add	x3,x3,x12
1000	add	x24,x24,x20			// d+=h
1001	add	x20,x20,x19			// h+=Maj(a,b,c)
1002	ldr	x19,[x30],#8		// *K++, x28 in next round
1003	add	x3,x3,x9
1004	add	x20,x20,x17			// h+=Sigma0(a)
1005	add	x3,x3,x8
1006	cbnz	x19,.Loop_16_xx		// another 16 rounds unless the fetched K is zero
1007
	// Rounds finished: recover the state pointer (x0), the end-of-input
	// pointer (x2) and the input pointer stored at the top of .Loop (x1).
1008	ldp	x0,x2,[x29,#96]
1009	ldr	x1,[x29,#112]
	// Every K fetch post-incremented x30 by 8; 648 = 81*8 undoes 81 of them.
	// NOTE(review): 81 fetches per block assumes 80 constants plus the zero
	// word that ended the loop -- confirm against the full .LK512 table.
1010	sub	x30,x30,#648		// rewind
1011
	// x3..x10 = previous state words, added into the working variables
	// x20..x27 and stored back through x0.
1012	ldp	x3,x4,[x0]
1013	ldp	x5,x6,[x0,#2*8]
	// The stored pointer was already 16 bytes into the block, so adding
	// 14*8 = 112 moves it to the start of the next 128-byte block.
1014	add	x1,x1,#14*8			// advance input pointer
1015	ldp	x7,x8,[x0,#4*8]
1016	add	x20,x20,x3
1017	ldp	x9,x10,[x0,#6*8]
1018	add	x21,x21,x4
1019	add	x22,x22,x5
1020	add	x23,x23,x6
1021	stp	x20,x21,[x0]
1022	add	x24,x24,x7
1023	add	x25,x25,x8
1024	stp	x22,x23,[x0,#2*8]
1025	add	x26,x26,x9
1026	add	x27,x27,x10
	// NOTE(review): the end test runs only after a block has been
	// processed, so a block count of zero on entry is not handled here;
	// presumably callers never pass zero -- confirm.
1027	cmp	x1,x2
1028	stp	x24,x25,[x0,#4*8]
1029	stp	x26,x27,[x0,#6*8]
1030	b.ne	.Loop
1031
	// Epilogue: restore x19-x28 from the frame, drop the 32-byte spill
	// area, then pop the 128-byte frame together with x29/x30.
1032	ldp	x19,x20,[x29,#16]
1033	add	sp,sp,#4*8
1034	ldp	x21,x22,[x29,#32]
1035	ldp	x23,x24,[x29,#48]
1036	ldp	x25,x26,[x29,#64]
1037	ldp	x27,x28,[x29,#80]
1038	ldp	x29,x30,[sp],#128
1039	AARCH64_VALIDATE_LINK_REGISTER
1040	ret
1041.size	sha512_block_data_order,.-sha512_block_data_order
1042
// ---------------------------------------------------------------------
// .LK512 -- SHA-512 round constants K[0..79], two per .quad line (40
// lines), followed by a single zero quad.
// sha512_block_armv8 below walks the 80 constants with post-incremented
// 16-byte loads and then rewinds its cursor by 80*8 bytes, so that routine
// never reads the zero entry.
// NOTE(review): the zero quad is labelled "terminator"; the scalar routine
// earlier in this file ends its round loop with cbnz on the value it just
// loaded through x30 and rewinds by 648 (= 81*8) bytes, which presumably
// relies on this entry -- confirm against the part of that routine where
// x30 is set up.
// ---------------------------------------------------------------------
1043.section	.rodata
1044
1045.align	6	// operand is a power of two with GNU as on AArch64: 2^6 = 64-byte alignment
1046.type	.LK512,%object
1047.LK512:
1048.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1049.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1050.quad	0x3956c25bf348b538,0x59f111f1b605d019
1051.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1052.quad	0xd807aa98a3030242,0x12835b0145706fbe
1053.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1054.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1055.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1056.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1057.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1058.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1059.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1060.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1061.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1062.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1063.quad	0x06ca6351e003826f,0x142929670a0e6e70
1064.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1065.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1066.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1067.quad	0x81c2c92e47edaee6,0x92722c851482353b
1068.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1069.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1070.quad	0xd192e819d6ef5218,0xd69906245565a910
1071.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1072.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1073.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1074.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1075.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1076.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1077.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1078.quad	0x90befffa23631e28,0xa4506cebde82bde9
1079.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1080.quad	0xca273eceea26619c,0xd186b8c721c0c207
1081.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1082.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1083.quad	0x113f9804bef90dae,0x1b710b35131c471b
1084.quad	0x28db77f523047d84,0x32caab7b40c72493
1085.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1086.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1087.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1088.quad	0	// terminator
1089.size	.LK512,.-.LK512
// NUL-terminated ASCII identification string, placed after the .size-d
// table: "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1090.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align after the odd-length string. The directive appears twice; the
// second one is redundant (presumably a generator artifact) and harmless.
1091.align	2
1092.align	2
1093
1094.text
1095#ifndef	__KERNEL__
// ---------------------------------------------------------------------
// sha512_block_armv8 -- SHA-512 block transform built on the SHA512
// instructions (sha512h / sha512h2 / sha512su0 / sha512su1). They are
// emitted as raw .inst words with the mnemonic kept in the trailing
// comment. Only assembled when __KERNEL__ is not defined.
//
// Register use, as read from the code below:
//   x0       context pointer: 64 bytes are loaded into v0-v3 on entry and
//            stored back on exit
//   x1       input pointer: 128 bytes are consumed per block
//   x2       block count: decremented once per .Loop_hw iteration, the
//            loop ends when it reaches zero
//   x3       cursor into .LK512, rewound by 80*8 bytes once per block
//   x4       scratch (x1 - 128)
//   v0-v4    working state; five registers are used in rotation so each
//            step can write "D + T1" into the one not holding live state.
//            There are 40 steps, a multiple of 5, so the state is back in
//            v0-v3 when the block is finished.
//   v5-v7    ext temporaries feeding sha512h / sha512su1
//   v16-v23  message schedule window, two 64-bit words per register
//   v24,v25  alternate between steps: K512 pair plus schedule words. The
//            K pair for the next step is loaded while the current one runs.
//   v26-v29  copy of the state at block start, added back at block end
// Callee-saved x19-x28 and v8-v15 are not touched.
//
// Each step consumes one 16-byte K512 pair, i.e. two of the 80 rounds;
// steps are tagged below with the round numbers they cover.
// "ext vN,vN,vN,#8" swaps the two 64-bit halves of vN.
//
// NOTE(review): x2 == 0 is not checked; the first block is loaded and
// processed unconditionally, so callers presumably pass at least one block.
// ---------------------------------------------------------------------
1096.type	sha512_block_armv8,%function
1097.align	6
1098sha512_block_armv8:
1099.Lv8_entry:
1100	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
1101	stp	x29,x30,[sp,#-16]!
1102	add	x29,sp,#0		// x29 = sp (frame pointer)
1103
1104	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1105	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1106
1107	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1108	adrp	x3,.LK512		// x3 = &.LK512 (page address ...
1109	add	x3,x3,#:lo12:.LK512	// ... plus low 12 bits)
1110
	// Reverse the bytes inside every 64-bit lane of the 16 input words
	// (SHA-512 message words are big-endian).
1111	rev64	v16.16b,v16.16b
1112	rev64	v17.16b,v17.16b
1113	rev64	v18.16b,v18.16b
1114	rev64	v19.16b,v19.16b
1115	rev64	v20.16b,v20.16b
1116	rev64	v21.16b,v21.16b
1117	rev64	v22.16b,v22.16b
1118	rev64	v23.16b,v23.16b
1119	b	.Loop_hw		// jump over the alignment padding
1120
1121.align	4
1122.Loop_hw:
	// One iteration per 128-byte block. On entry v16-v23 already hold the
	// byte-reversed words of the block to process and x1 points just past it.
1123	ld1	{v24.2d},[x3],#16
1124	subs	x2,x2,#1		// blocks left after this one; Z set on the last block
1125	sub	x4,x1,#128		// x4 = start of the block held in v16-v23
1126	orr	v26.16b,v0.16b,v0.16b			// offload
1127	orr	v27.16b,v1.16b,v1.16b
1128	orr	v28.16b,v2.16b,v2.16b
1129	orr	v29.16b,v3.16b,v3.16b
	// On the last block (x2 == 0) x1 is stepped back by 128, so the
	// "load next input" loads in rounds 64-79 re-read the current block
	// instead of advancing beyond it (presumably to avoid reading past
	// the end of the caller's buffer); the reloaded data is then unused.
1130	csel	x1,x1,x4,ne			// conditional rewind
1131	add	v24.2d,v24.2d,v16.2d		// rounds 0-1
1132	ld1	{v25.2d},[x3],#16
1133	ext	v24.16b,v24.16b,v24.16b,#8
1134	ext	v5.16b,v2.16b,v3.16b,#8
1135	ext	v6.16b,v1.16b,v2.16b,#8
1136	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1137.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1138	ext	v7.16b,v20.16b,v21.16b,#8
1139.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1140.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1141	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1142.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1143	add	v25.2d,v25.2d,v17.2d		// rounds 2-3
1144	ld1	{v24.2d},[x3],#16
1145	ext	v25.16b,v25.16b,v25.16b,#8
1146	ext	v5.16b,v4.16b,v2.16b,#8
1147	ext	v6.16b,v0.16b,v4.16b,#8
1148	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1149.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1150	ext	v7.16b,v21.16b,v22.16b,#8
1151.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1152.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1153	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1154.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1155	add	v24.2d,v24.2d,v18.2d		// rounds 4-5
1156	ld1	{v25.2d},[x3],#16
1157	ext	v24.16b,v24.16b,v24.16b,#8
1158	ext	v5.16b,v1.16b,v4.16b,#8
1159	ext	v6.16b,v3.16b,v1.16b,#8
1160	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1161.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1162	ext	v7.16b,v22.16b,v23.16b,#8
1163.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1164.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1165	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1166.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1167	add	v25.2d,v25.2d,v19.2d		// rounds 6-7
1168	ld1	{v24.2d},[x3],#16
1169	ext	v25.16b,v25.16b,v25.16b,#8
1170	ext	v5.16b,v0.16b,v1.16b,#8
1171	ext	v6.16b,v2.16b,v0.16b,#8
1172	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1173.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1174	ext	v7.16b,v23.16b,v16.16b,#8
1175.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1176.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1177	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1178.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1179	add	v24.2d,v24.2d,v20.2d		// rounds 8-9
1180	ld1	{v25.2d},[x3],#16
1181	ext	v24.16b,v24.16b,v24.16b,#8
1182	ext	v5.16b,v3.16b,v0.16b,#8
1183	ext	v6.16b,v4.16b,v3.16b,#8
1184	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1185.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1186	ext	v7.16b,v16.16b,v17.16b,#8
1187.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1188.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1189	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1190.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1191	add	v25.2d,v25.2d,v21.2d		// rounds 10-11
1192	ld1	{v24.2d},[x3],#16
1193	ext	v25.16b,v25.16b,v25.16b,#8
1194	ext	v5.16b,v2.16b,v3.16b,#8
1195	ext	v6.16b,v1.16b,v2.16b,#8
1196	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1197.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1198	ext	v7.16b,v17.16b,v18.16b,#8
1199.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1200.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1201	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1202.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1203	add	v24.2d,v24.2d,v22.2d		// rounds 12-13
1204	ld1	{v25.2d},[x3],#16
1205	ext	v24.16b,v24.16b,v24.16b,#8
1206	ext	v5.16b,v4.16b,v2.16b,#8
1207	ext	v6.16b,v0.16b,v4.16b,#8
1208	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1209.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1210	ext	v7.16b,v18.16b,v19.16b,#8
1211.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1212.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1213	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1214.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1215	add	v25.2d,v25.2d,v23.2d		// rounds 14-15
1216	ld1	{v24.2d},[x3],#16
1217	ext	v25.16b,v25.16b,v25.16b,#8
1218	ext	v5.16b,v1.16b,v4.16b,#8
1219	ext	v6.16b,v3.16b,v1.16b,#8
1220	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1221.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1222	ext	v7.16b,v19.16b,v20.16b,#8
1223.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1224.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1225	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1226.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1227	add	v24.2d,v24.2d,v16.2d		// rounds 16-17
1228	ld1	{v25.2d},[x3],#16
1229	ext	v24.16b,v24.16b,v24.16b,#8
1230	ext	v5.16b,v0.16b,v1.16b,#8
1231	ext	v6.16b,v2.16b,v0.16b,#8
1232	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1233.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1234	ext	v7.16b,v20.16b,v21.16b,#8
1235.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1236.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1237	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1238.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1239	add	v25.2d,v25.2d,v17.2d		// rounds 18-19
1240	ld1	{v24.2d},[x3],#16
1241	ext	v25.16b,v25.16b,v25.16b,#8
1242	ext	v5.16b,v3.16b,v0.16b,#8
1243	ext	v6.16b,v4.16b,v3.16b,#8
1244	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1245.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1246	ext	v7.16b,v21.16b,v22.16b,#8
1247.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1248.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1249	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1250.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1251	add	v24.2d,v24.2d,v18.2d		// rounds 20-21
1252	ld1	{v25.2d},[x3],#16
1253	ext	v24.16b,v24.16b,v24.16b,#8
1254	ext	v5.16b,v2.16b,v3.16b,#8
1255	ext	v6.16b,v1.16b,v2.16b,#8
1256	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1257.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1258	ext	v7.16b,v22.16b,v23.16b,#8
1259.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1260.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1261	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1262.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1263	add	v25.2d,v25.2d,v19.2d		// rounds 22-23
1264	ld1	{v24.2d},[x3],#16
1265	ext	v25.16b,v25.16b,v25.16b,#8
1266	ext	v5.16b,v4.16b,v2.16b,#8
1267	ext	v6.16b,v0.16b,v4.16b,#8
1268	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1269.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1270	ext	v7.16b,v23.16b,v16.16b,#8
1271.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1272.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1273	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1274.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1275	add	v24.2d,v24.2d,v20.2d		// rounds 24-25
1276	ld1	{v25.2d},[x3],#16
1277	ext	v24.16b,v24.16b,v24.16b,#8
1278	ext	v5.16b,v1.16b,v4.16b,#8
1279	ext	v6.16b,v3.16b,v1.16b,#8
1280	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1281.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1282	ext	v7.16b,v16.16b,v17.16b,#8
1283.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1284.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1285	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1286.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1287	add	v25.2d,v25.2d,v21.2d		// rounds 26-27
1288	ld1	{v24.2d},[x3],#16
1289	ext	v25.16b,v25.16b,v25.16b,#8
1290	ext	v5.16b,v0.16b,v1.16b,#8
1291	ext	v6.16b,v2.16b,v0.16b,#8
1292	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1293.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1294	ext	v7.16b,v17.16b,v18.16b,#8
1295.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1296.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1297	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1298.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1299	add	v24.2d,v24.2d,v22.2d		// rounds 28-29
1300	ld1	{v25.2d},[x3],#16
1301	ext	v24.16b,v24.16b,v24.16b,#8
1302	ext	v5.16b,v3.16b,v0.16b,#8
1303	ext	v6.16b,v4.16b,v3.16b,#8
1304	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1305.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1306	ext	v7.16b,v18.16b,v19.16b,#8
1307.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1308.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1309	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1310.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1311	add	v25.2d,v25.2d,v23.2d		// rounds 30-31
1312	ld1	{v24.2d},[x3],#16
1313	ext	v25.16b,v25.16b,v25.16b,#8
1314	ext	v5.16b,v2.16b,v3.16b,#8
1315	ext	v6.16b,v1.16b,v2.16b,#8
1316	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1317.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1318	ext	v7.16b,v19.16b,v20.16b,#8
1319.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1320.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1321	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1322.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1323	add	v24.2d,v24.2d,v16.2d		// rounds 32-33
1324	ld1	{v25.2d},[x3],#16
1325	ext	v24.16b,v24.16b,v24.16b,#8
1326	ext	v5.16b,v4.16b,v2.16b,#8
1327	ext	v6.16b,v0.16b,v4.16b,#8
1328	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1329.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1330	ext	v7.16b,v20.16b,v21.16b,#8
1331.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1332.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1333	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1334.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1335	add	v25.2d,v25.2d,v17.2d		// rounds 34-35
1336	ld1	{v24.2d},[x3],#16
1337	ext	v25.16b,v25.16b,v25.16b,#8
1338	ext	v5.16b,v1.16b,v4.16b,#8
1339	ext	v6.16b,v3.16b,v1.16b,#8
1340	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1341.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1342	ext	v7.16b,v21.16b,v22.16b,#8
1343.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1344.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1345	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1346.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1347	add	v24.2d,v24.2d,v18.2d		// rounds 36-37
1348	ld1	{v25.2d},[x3],#16
1349	ext	v24.16b,v24.16b,v24.16b,#8
1350	ext	v5.16b,v0.16b,v1.16b,#8
1351	ext	v6.16b,v2.16b,v0.16b,#8
1352	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1353.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1354	ext	v7.16b,v22.16b,v23.16b,#8
1355.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1356.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1357	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1358.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1359	add	v25.2d,v25.2d,v19.2d		// rounds 38-39
1360	ld1	{v24.2d},[x3],#16
1361	ext	v25.16b,v25.16b,v25.16b,#8
1362	ext	v5.16b,v3.16b,v0.16b,#8
1363	ext	v6.16b,v4.16b,v3.16b,#8
1364	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1365.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1366	ext	v7.16b,v23.16b,v16.16b,#8
1367.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1368.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1369	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1370.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1371	add	v24.2d,v24.2d,v20.2d		// rounds 40-41
1372	ld1	{v25.2d},[x3],#16
1373	ext	v24.16b,v24.16b,v24.16b,#8
1374	ext	v5.16b,v2.16b,v3.16b,#8
1375	ext	v6.16b,v1.16b,v2.16b,#8
1376	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1377.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1378	ext	v7.16b,v16.16b,v17.16b,#8
1379.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1380.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1381	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1382.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1383	add	v25.2d,v25.2d,v21.2d		// rounds 42-43
1384	ld1	{v24.2d},[x3],#16
1385	ext	v25.16b,v25.16b,v25.16b,#8
1386	ext	v5.16b,v4.16b,v2.16b,#8
1387	ext	v6.16b,v0.16b,v4.16b,#8
1388	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1389.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1390	ext	v7.16b,v17.16b,v18.16b,#8
1391.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1392.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1393	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1394.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1395	add	v24.2d,v24.2d,v22.2d		// rounds 44-45
1396	ld1	{v25.2d},[x3],#16
1397	ext	v24.16b,v24.16b,v24.16b,#8
1398	ext	v5.16b,v1.16b,v4.16b,#8
1399	ext	v6.16b,v3.16b,v1.16b,#8
1400	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1401.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1402	ext	v7.16b,v18.16b,v19.16b,#8
1403.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1404.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1405	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1406.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1407	add	v25.2d,v25.2d,v23.2d		// rounds 46-47
1408	ld1	{v24.2d},[x3],#16
1409	ext	v25.16b,v25.16b,v25.16b,#8
1410	ext	v5.16b,v0.16b,v1.16b,#8
1411	ext	v6.16b,v2.16b,v0.16b,#8
1412	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1413.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1414	ext	v7.16b,v19.16b,v20.16b,#8
1415.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1416.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1417	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1418.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1419	add	v24.2d,v24.2d,v16.2d		// rounds 48-49
1420	ld1	{v25.2d},[x3],#16
1421	ext	v24.16b,v24.16b,v24.16b,#8
1422	ext	v5.16b,v3.16b,v0.16b,#8
1423	ext	v6.16b,v4.16b,v3.16b,#8
1424	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1425.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1426	ext	v7.16b,v20.16b,v21.16b,#8
1427.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1428.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1429	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1430.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1431	add	v25.2d,v25.2d,v17.2d		// rounds 50-51
1432	ld1	{v24.2d},[x3],#16
1433	ext	v25.16b,v25.16b,v25.16b,#8
1434	ext	v5.16b,v2.16b,v3.16b,#8
1435	ext	v6.16b,v1.16b,v2.16b,#8
1436	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1437.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1438	ext	v7.16b,v21.16b,v22.16b,#8
1439.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1440.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1441	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1442.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1443	add	v24.2d,v24.2d,v18.2d		// rounds 52-53
1444	ld1	{v25.2d},[x3],#16
1445	ext	v24.16b,v24.16b,v24.16b,#8
1446	ext	v5.16b,v4.16b,v2.16b,#8
1447	ext	v6.16b,v0.16b,v4.16b,#8
1448	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1449.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1450	ext	v7.16b,v22.16b,v23.16b,#8
1451.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1452.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1453	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1454.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1455	add	v25.2d,v25.2d,v19.2d		// rounds 54-55
1456	ld1	{v24.2d},[x3],#16
1457	ext	v25.16b,v25.16b,v25.16b,#8
1458	ext	v5.16b,v1.16b,v4.16b,#8
1459	ext	v6.16b,v3.16b,v1.16b,#8
1460	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1461.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1462	ext	v7.16b,v23.16b,v16.16b,#8
1463.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1464.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1465	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1466.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1467	add	v24.2d,v24.2d,v20.2d		// rounds 56-57
1468	ld1	{v25.2d},[x3],#16
1469	ext	v24.16b,v24.16b,v24.16b,#8
1470	ext	v5.16b,v0.16b,v1.16b,#8
1471	ext	v6.16b,v2.16b,v0.16b,#8
1472	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1473.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1474	ext	v7.16b,v16.16b,v17.16b,#8
1475.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1476.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1477	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1478.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1479	add	v25.2d,v25.2d,v21.2d		// rounds 58-59
1480	ld1	{v24.2d},[x3],#16
1481	ext	v25.16b,v25.16b,v25.16b,#8
1482	ext	v5.16b,v3.16b,v0.16b,#8
1483	ext	v6.16b,v4.16b,v3.16b,#8
1484	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1485.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1486	ext	v7.16b,v17.16b,v18.16b,#8
1487.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1488.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1489	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1490.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1491	add	v24.2d,v24.2d,v22.2d		// rounds 60-61
1492	ld1	{v25.2d},[x3],#16
1493	ext	v24.16b,v24.16b,v24.16b,#8
1494	ext	v5.16b,v2.16b,v3.16b,#8
1495	ext	v6.16b,v1.16b,v2.16b,#8
1496	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1497.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1498	ext	v7.16b,v18.16b,v19.16b,#8
1499.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1500.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1501	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1502.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1503	add	v25.2d,v25.2d,v23.2d		// rounds 62-63
1504	ld1	{v24.2d},[x3],#16
1505	ext	v25.16b,v25.16b,v25.16b,#8
1506	ext	v5.16b,v4.16b,v2.16b,#8
1507	ext	v6.16b,v0.16b,v4.16b,#8
1508	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1509.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1510	ext	v7.16b,v19.16b,v20.16b,#8
1511.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1512.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1513	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1514.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// Rounds 64-79: no further schedule updates (no sha512su0/su1). As each
	// of v16-v23 is consumed it is refilled from x1 and byte-reversed, so
	// the next block is in place when the loop repeats.
1515	ld1	{v25.2d},[x3],#16		// rounds 64-65
1516	add	v24.2d,v24.2d,v16.2d
1517	ld1	{v16.16b},[x1],#16		// load next input
1518	ext	v24.16b,v24.16b,v24.16b,#8
1519	ext	v5.16b,v1.16b,v4.16b,#8
1520	ext	v6.16b,v3.16b,v1.16b,#8
1521	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1522.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1523	rev64	v16.16b,v16.16b
1524	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1525.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1526	ld1	{v24.2d},[x3],#16		// rounds 66-67
1527	add	v25.2d,v25.2d,v17.2d
1528	ld1	{v17.16b},[x1],#16		// load next input
1529	ext	v25.16b,v25.16b,v25.16b,#8
1530	ext	v5.16b,v0.16b,v1.16b,#8
1531	ext	v6.16b,v2.16b,v0.16b,#8
1532	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1533.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1534	rev64	v17.16b,v17.16b
1535	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1536.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1537	ld1	{v25.2d},[x3],#16		// rounds 68-69
1538	add	v24.2d,v24.2d,v18.2d
1539	ld1	{v18.16b},[x1],#16		// load next input
1540	ext	v24.16b,v24.16b,v24.16b,#8
1541	ext	v5.16b,v3.16b,v0.16b,#8
1542	ext	v6.16b,v4.16b,v3.16b,#8
1543	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1544.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1545	rev64	v18.16b,v18.16b
1546	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1547.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1548	ld1	{v24.2d},[x3],#16		// rounds 70-71
1549	add	v25.2d,v25.2d,v19.2d
1550	ld1	{v19.16b},[x1],#16		// load next input
1551	ext	v25.16b,v25.16b,v25.16b,#8
1552	ext	v5.16b,v2.16b,v3.16b,#8
1553	ext	v6.16b,v1.16b,v2.16b,#8
1554	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1555.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1556	rev64	v19.16b,v19.16b
1557	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1558.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1559	ld1	{v25.2d},[x3],#16		// rounds 72-73
1560	add	v24.2d,v24.2d,v20.2d
1561	ld1	{v20.16b},[x1],#16		// load next input
1562	ext	v24.16b,v24.16b,v24.16b,#8
1563	ext	v5.16b,v4.16b,v2.16b,#8
1564	ext	v6.16b,v0.16b,v4.16b,#8
1565	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1566.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1567	rev64	v20.16b,v20.16b
1568	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1569.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1570	ld1	{v24.2d},[x3],#16		// rounds 74-75
1571	add	v25.2d,v25.2d,v21.2d
1572	ld1	{v21.16b},[x1],#16		// load next input
1573	ext	v25.16b,v25.16b,v25.16b,#8
1574	ext	v5.16b,v1.16b,v4.16b,#8
1575	ext	v6.16b,v3.16b,v1.16b,#8
1576	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1577.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1578	rev64	v21.16b,v21.16b
1579	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1580.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1581	ld1	{v25.2d},[x3],#16		// rounds 76-77
1582	add	v24.2d,v24.2d,v22.2d
1583	ld1	{v22.16b},[x1],#16		// load next input
1584	ext	v24.16b,v24.16b,v24.16b,#8
1585	ext	v5.16b,v0.16b,v1.16b,#8
1586	ext	v6.16b,v2.16b,v0.16b,#8
1587	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1588.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1589	rev64	v22.16b,v22.16b
1590	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1591.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// Rounds 78-79: all 80 constants have been fetched, so instead of
	// another K load the cursor goes back to the start of .LK512.
1592	sub	x3,x3,#80*8	// rewind
1593	add	v25.2d,v25.2d,v23.2d
1594	ld1	{v23.16b},[x1],#16		// load next input
1595	ext	v25.16b,v25.16b,v25.16b,#8
1596	ext	v5.16b,v3.16b,v0.16b,#8
1597	ext	v6.16b,v4.16b,v3.16b,#8
1598	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1599.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1600	rev64	v23.16b,v23.16b
1601	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1602.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// Add the state saved in v26-v29 at the top of the loop.
1603	add	v0.2d,v0.2d,v26.2d			// accumulate
1604	add	v1.2d,v1.2d,v27.2d
1605	add	v2.2d,v2.2d,v28.2d
1606	add	v3.2d,v3.2d,v29.2d
1607
1608	cbnz	x2,.Loop_hw		// more blocks to process?
1609
1610	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1611
	// Only x29 is reloaded before the 16-byte frame is dropped; x30 is
	// never written in this routine, so ret uses the incoming value.
1612	ldr	x29,[sp],#16
1613	ret
1614.size	sha512_block_armv8,.-sha512_block_armv8
1615#endif
1616
1616