xref: /freebsd/sys/crypto/openssl/aarch64/sha512-armv8.S (revision a8089ea5aee578e08acab2438e82fc9a9ae50ed8)
1/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
2// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3//
4// Licensed under the Apache License 2.0 (the "License").  You may not use
5// this file except in compliance with the License.  You can obtain a copy
6// in the file LICENSE in the source distribution or at
7// https://www.openssl.org/source/license.html
8
9// ====================================================================
10// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11// project. The module is, however, dual licensed under OpenSSL and
12// CRYPTOGAMS licenses depending on where you obtain it. For further
13// details see http://www.openssl.org/~appro/cryptogams/.
14//
15// Permission to use under GPLv2 terms is granted.
16// ====================================================================
17//
18// SHA256/512 for ARMv8.
19//
20// Performance in cycles per processed byte and improvement coefficient
21// over code generated with "default" compiler:
22//
23//		SHA256-hw	SHA256(*)	SHA512
24// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
25// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
26// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
27// Denver	2.01		10.5 (+26%)	6.70 (+8%)
28// X-Gene			20.0 (+100%)	12.8 (+300%(***))
29// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
30// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
31// ThunderX2	2.54		13.2 (+40%)	8.40 (+18%)
32//
33// (*)	Software SHA256 results are of lesser relevance, presented
34//	mostly for informational purposes.
35// (**)	The result is a trade-off: it's possible to improve it by
36//	10% (or by 1 cycle per round), but at the cost of 20% loss
37//	on Cortex-A53 (or by 4 cycles per round).
38// (***)	Super-impressive coefficients over gcc-generated code are
39//	indication of some compiler "pathology", most notably code
40//	generated with -mgeneral-regs-only is significantly faster
41//	and the gap is only 40-90%.
42//
43// October 2016.
44//
45// Originally it was reckoned that it makes no sense to implement NEON
46// version of SHA256 for 64-bit processors. This is because performance
47// improvement on most wide-spread Cortex-A5x processors was observed
48// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49// observed that 32-bit NEON SHA256 performs significantly better than
50// 64-bit scalar version on *some* of the more recent processors. As a
51// result, a 64-bit NEON version of SHA256 was added to provide the best
52// all-round performance. For example, it executes ~30% faster on X-Gene
53// and Mongoose. [For reference, NEON version of SHA512 is bound to
54// deliver much less improvement, likely *negative* on Cortex-A5x.
55// Which is why NEON support is limited to SHA256.]
56
57// $output is the last argument if it looks like a file (it has an extension)
58// $flavour is the first argument if it doesn't look like a file
59#include "arm_arch.h"
60#ifndef	__KERNEL__
61
62.hidden	OPENSSL_armcap_P
63#endif
64
65.text
66
67.globl	sha512_block_data_order
68.type	sha512_block_data_order,%function
69.align	6
70sha512_block_data_order:
71	AARCH64_VALID_CALL_TARGET
72#ifndef	__KERNEL__
73	adrp	x16,OPENSSL_armcap_P
74	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
75	tst	w16,#ARMV8_SHA512
76	b.ne	.Lv8_entry
77#endif
78	AARCH64_SIGN_LINK_REGISTER
79	stp	x29,x30,[sp,#-128]!
80	add	x29,sp,#0
81
82	stp	x19,x20,[sp,#16]
83	stp	x21,x22,[sp,#32]
84	stp	x23,x24,[sp,#48]
85	stp	x25,x26,[sp,#64]
86	stp	x27,x28,[sp,#80]
87	sub	sp,sp,#4*8
88
89	ldp	x20,x21,[x0]				// load context
90	ldp	x22,x23,[x0,#2*8]
91	ldp	x24,x25,[x0,#4*8]
92	add	x2,x1,x2,lsl#7	// end of input
93	ldp	x26,x27,[x0,#6*8]
94	adr	x30,.LK512
95	stp	x0,x2,[x29,#96]
96
97.Loop:
98	ldp	x3,x4,[x1],#2*8
99	ldr	x19,[x30],#8			// *K++
100	eor	x28,x21,x22				// magic seed
101	str	x1,[x29,#112]
102#ifndef	__AARCH64EB__
103	rev	x3,x3			// 0
104#endif
105	ror	x16,x24,#14
106	add	x27,x27,x19			// h+=K[i]
107	eor	x6,x24,x24,ror#23
108	and	x17,x25,x24
109	bic	x19,x26,x24
110	add	x27,x27,x3			// h+=X[i]
111	orr	x17,x17,x19			// Ch(e,f,g)
112	eor	x19,x20,x21			// a^b, b^c in next round
113	eor	x16,x16,x6,ror#18	// Sigma1(e)
114	ror	x6,x20,#28
115	add	x27,x27,x17			// h+=Ch(e,f,g)
116	eor	x17,x20,x20,ror#5
117	add	x27,x27,x16			// h+=Sigma1(e)
118	and	x28,x28,x19			// (b^c)&=(a^b)
119	add	x23,x23,x27			// d+=h
120	eor	x28,x28,x21			// Maj(a,b,c)
121	eor	x17,x6,x17,ror#34	// Sigma0(a)
122	add	x27,x27,x28			// h+=Maj(a,b,c)
123	ldr	x28,[x30],#8		// *K++, x19 in next round
124	//add	x27,x27,x17			// h+=Sigma0(a)
125#ifndef	__AARCH64EB__
126	rev	x4,x4			// 1
127#endif
128	ldp	x5,x6,[x1],#2*8
129	add	x27,x27,x17			// h+=Sigma0(a)
130	ror	x16,x23,#14
131	add	x26,x26,x28			// h+=K[i]
132	eor	x7,x23,x23,ror#23
133	and	x17,x24,x23
134	bic	x28,x25,x23
135	add	x26,x26,x4			// h+=X[i]
136	orr	x17,x17,x28			// Ch(e,f,g)
137	eor	x28,x27,x20			// a^b, b^c in next round
138	eor	x16,x16,x7,ror#18	// Sigma1(e)
139	ror	x7,x27,#28
140	add	x26,x26,x17			// h+=Ch(e,f,g)
141	eor	x17,x27,x27,ror#5
142	add	x26,x26,x16			// h+=Sigma1(e)
143	and	x19,x19,x28			// (b^c)&=(a^b)
144	add	x22,x22,x26			// d+=h
145	eor	x19,x19,x20			// Maj(a,b,c)
146	eor	x17,x7,x17,ror#34	// Sigma0(a)
147	add	x26,x26,x19			// h+=Maj(a,b,c)
148	ldr	x19,[x30],#8		// *K++, x28 in next round
149	//add	x26,x26,x17			// h+=Sigma0(a)
150#ifndef	__AARCH64EB__
151	rev	x5,x5			// 2
152#endif
153	add	x26,x26,x17			// h+=Sigma0(a)
154	ror	x16,x22,#14
155	add	x25,x25,x19			// h+=K[i]
156	eor	x8,x22,x22,ror#23
157	and	x17,x23,x22
158	bic	x19,x24,x22
159	add	x25,x25,x5			// h+=X[i]
160	orr	x17,x17,x19			// Ch(e,f,g)
161	eor	x19,x26,x27			// a^b, b^c in next round
162	eor	x16,x16,x8,ror#18	// Sigma1(e)
163	ror	x8,x26,#28
164	add	x25,x25,x17			// h+=Ch(e,f,g)
165	eor	x17,x26,x26,ror#5
166	add	x25,x25,x16			// h+=Sigma1(e)
167	and	x28,x28,x19			// (b^c)&=(a^b)
168	add	x21,x21,x25			// d+=h
169	eor	x28,x28,x27			// Maj(a,b,c)
170	eor	x17,x8,x17,ror#34	// Sigma0(a)
171	add	x25,x25,x28			// h+=Maj(a,b,c)
172	ldr	x28,[x30],#8		// *K++, x19 in next round
173	//add	x25,x25,x17			// h+=Sigma0(a)
174#ifndef	__AARCH64EB__
175	rev	x6,x6			// 3
176#endif
177	ldp	x7,x8,[x1],#2*8
178	add	x25,x25,x17			// h+=Sigma0(a)
179	ror	x16,x21,#14
180	add	x24,x24,x28			// h+=K[i]
181	eor	x9,x21,x21,ror#23
182	and	x17,x22,x21
183	bic	x28,x23,x21
184	add	x24,x24,x6			// h+=X[i]
185	orr	x17,x17,x28			// Ch(e,f,g)
186	eor	x28,x25,x26			// a^b, b^c in next round
187	eor	x16,x16,x9,ror#18	// Sigma1(e)
188	ror	x9,x25,#28
189	add	x24,x24,x17			// h+=Ch(e,f,g)
190	eor	x17,x25,x25,ror#5
191	add	x24,x24,x16			// h+=Sigma1(e)
192	and	x19,x19,x28			// (b^c)&=(a^b)
193	add	x20,x20,x24			// d+=h
194	eor	x19,x19,x26			// Maj(a,b,c)
195	eor	x17,x9,x17,ror#34	// Sigma0(a)
196	add	x24,x24,x19			// h+=Maj(a,b,c)
197	ldr	x19,[x30],#8		// *K++, x28 in next round
198	//add	x24,x24,x17			// h+=Sigma0(a)
199#ifndef	__AARCH64EB__
200	rev	x7,x7			// 4
201#endif
202	add	x24,x24,x17			// h+=Sigma0(a)
203	ror	x16,x20,#14
204	add	x23,x23,x19			// h+=K[i]
205	eor	x10,x20,x20,ror#23
206	and	x17,x21,x20
207	bic	x19,x22,x20
208	add	x23,x23,x7			// h+=X[i]
209	orr	x17,x17,x19			// Ch(e,f,g)
210	eor	x19,x24,x25			// a^b, b^c in next round
211	eor	x16,x16,x10,ror#18	// Sigma1(e)
212	ror	x10,x24,#28
213	add	x23,x23,x17			// h+=Ch(e,f,g)
214	eor	x17,x24,x24,ror#5
215	add	x23,x23,x16			// h+=Sigma1(e)
216	and	x28,x28,x19			// (b^c)&=(a^b)
217	add	x27,x27,x23			// d+=h
218	eor	x28,x28,x25			// Maj(a,b,c)
219	eor	x17,x10,x17,ror#34	// Sigma0(a)
220	add	x23,x23,x28			// h+=Maj(a,b,c)
221	ldr	x28,[x30],#8		// *K++, x19 in next round
222	//add	x23,x23,x17			// h+=Sigma0(a)
223#ifndef	__AARCH64EB__
224	rev	x8,x8			// 5
225#endif
226	ldp	x9,x10,[x1],#2*8
227	add	x23,x23,x17			// h+=Sigma0(a)
228	ror	x16,x27,#14
229	add	x22,x22,x28			// h+=K[i]
230	eor	x11,x27,x27,ror#23
231	and	x17,x20,x27
232	bic	x28,x21,x27
233	add	x22,x22,x8			// h+=X[i]
234	orr	x17,x17,x28			// Ch(e,f,g)
235	eor	x28,x23,x24			// a^b, b^c in next round
236	eor	x16,x16,x11,ror#18	// Sigma1(e)
237	ror	x11,x23,#28
238	add	x22,x22,x17			// h+=Ch(e,f,g)
239	eor	x17,x23,x23,ror#5
240	add	x22,x22,x16			// h+=Sigma1(e)
241	and	x19,x19,x28			// (b^c)&=(a^b)
242	add	x26,x26,x22			// d+=h
243	eor	x19,x19,x24			// Maj(a,b,c)
244	eor	x17,x11,x17,ror#34	// Sigma0(a)
245	add	x22,x22,x19			// h+=Maj(a,b,c)
246	ldr	x19,[x30],#8		// *K++, x28 in next round
247	//add	x22,x22,x17			// h+=Sigma0(a)
248#ifndef	__AARCH64EB__
249	rev	x9,x9			// 6
250#endif
251	add	x22,x22,x17			// h+=Sigma0(a)
252	ror	x16,x26,#14
253	add	x21,x21,x19			// h+=K[i]
254	eor	x12,x26,x26,ror#23
255	and	x17,x27,x26
256	bic	x19,x20,x26
257	add	x21,x21,x9			// h+=X[i]
258	orr	x17,x17,x19			// Ch(e,f,g)
259	eor	x19,x22,x23			// a^b, b^c in next round
260	eor	x16,x16,x12,ror#18	// Sigma1(e)
261	ror	x12,x22,#28
262	add	x21,x21,x17			// h+=Ch(e,f,g)
263	eor	x17,x22,x22,ror#5
264	add	x21,x21,x16			// h+=Sigma1(e)
265	and	x28,x28,x19			// (b^c)&=(a^b)
266	add	x25,x25,x21			// d+=h
267	eor	x28,x28,x23			// Maj(a,b,c)
268	eor	x17,x12,x17,ror#34	// Sigma0(a)
269	add	x21,x21,x28			// h+=Maj(a,b,c)
270	ldr	x28,[x30],#8		// *K++, x19 in next round
271	//add	x21,x21,x17			// h+=Sigma0(a)
272#ifndef	__AARCH64EB__
273	rev	x10,x10			// 7
274#endif
275	ldp	x11,x12,[x1],#2*8
276	add	x21,x21,x17			// h+=Sigma0(a)
277	ror	x16,x25,#14
278	add	x20,x20,x28			// h+=K[i]
279	eor	x13,x25,x25,ror#23
280	and	x17,x26,x25
281	bic	x28,x27,x25
282	add	x20,x20,x10			// h+=X[i]
283	orr	x17,x17,x28			// Ch(e,f,g)
284	eor	x28,x21,x22			// a^b, b^c in next round
285	eor	x16,x16,x13,ror#18	// Sigma1(e)
286	ror	x13,x21,#28
287	add	x20,x20,x17			// h+=Ch(e,f,g)
288	eor	x17,x21,x21,ror#5
289	add	x20,x20,x16			// h+=Sigma1(e)
290	and	x19,x19,x28			// (b^c)&=(a^b)
291	add	x24,x24,x20			// d+=h
292	eor	x19,x19,x22			// Maj(a,b,c)
293	eor	x17,x13,x17,ror#34	// Sigma0(a)
294	add	x20,x20,x19			// h+=Maj(a,b,c)
295	ldr	x19,[x30],#8		// *K++, x28 in next round
296	//add	x20,x20,x17			// h+=Sigma0(a)
297#ifndef	__AARCH64EB__
298	rev	x11,x11			// 8
299#endif
300	add	x20,x20,x17			// h+=Sigma0(a)
301	ror	x16,x24,#14
302	add	x27,x27,x19			// h+=K[i]
303	eor	x14,x24,x24,ror#23
304	and	x17,x25,x24
305	bic	x19,x26,x24
306	add	x27,x27,x11			// h+=X[i]
307	orr	x17,x17,x19			// Ch(e,f,g)
308	eor	x19,x20,x21			// a^b, b^c in next round
309	eor	x16,x16,x14,ror#18	// Sigma1(e)
310	ror	x14,x20,#28
311	add	x27,x27,x17			// h+=Ch(e,f,g)
312	eor	x17,x20,x20,ror#5
313	add	x27,x27,x16			// h+=Sigma1(e)
314	and	x28,x28,x19			// (b^c)&=(a^b)
315	add	x23,x23,x27			// d+=h
316	eor	x28,x28,x21			// Maj(a,b,c)
317	eor	x17,x14,x17,ror#34	// Sigma0(a)
318	add	x27,x27,x28			// h+=Maj(a,b,c)
319	ldr	x28,[x30],#8		// *K++, x19 in next round
320	//add	x27,x27,x17			// h+=Sigma0(a)
321#ifndef	__AARCH64EB__
322	rev	x12,x12			// 9
323#endif
324	ldp	x13,x14,[x1],#2*8
325	add	x27,x27,x17			// h+=Sigma0(a)
326	ror	x16,x23,#14
327	add	x26,x26,x28			// h+=K[i]
328	eor	x15,x23,x23,ror#23
329	and	x17,x24,x23
330	bic	x28,x25,x23
331	add	x26,x26,x12			// h+=X[i]
332	orr	x17,x17,x28			// Ch(e,f,g)
333	eor	x28,x27,x20			// a^b, b^c in next round
334	eor	x16,x16,x15,ror#18	// Sigma1(e)
335	ror	x15,x27,#28
336	add	x26,x26,x17			// h+=Ch(e,f,g)
337	eor	x17,x27,x27,ror#5
338	add	x26,x26,x16			// h+=Sigma1(e)
339	and	x19,x19,x28			// (b^c)&=(a^b)
340	add	x22,x22,x26			// d+=h
341	eor	x19,x19,x20			// Maj(a,b,c)
342	eor	x17,x15,x17,ror#34	// Sigma0(a)
343	add	x26,x26,x19			// h+=Maj(a,b,c)
344	ldr	x19,[x30],#8		// *K++, x28 in next round
345	//add	x26,x26,x17			// h+=Sigma0(a)
346#ifndef	__AARCH64EB__
347	rev	x13,x13			// 10
348#endif
349	add	x26,x26,x17			// h+=Sigma0(a)
350	ror	x16,x22,#14
351	add	x25,x25,x19			// h+=K[i]
352	eor	x0,x22,x22,ror#23
353	and	x17,x23,x22
354	bic	x19,x24,x22
355	add	x25,x25,x13			// h+=X[i]
356	orr	x17,x17,x19			// Ch(e,f,g)
357	eor	x19,x26,x27			// a^b, b^c in next round
358	eor	x16,x16,x0,ror#18	// Sigma1(e)
359	ror	x0,x26,#28
360	add	x25,x25,x17			// h+=Ch(e,f,g)
361	eor	x17,x26,x26,ror#5
362	add	x25,x25,x16			// h+=Sigma1(e)
363	and	x28,x28,x19			// (b^c)&=(a^b)
364	add	x21,x21,x25			// d+=h
365	eor	x28,x28,x27			// Maj(a,b,c)
366	eor	x17,x0,x17,ror#34	// Sigma0(a)
367	add	x25,x25,x28			// h+=Maj(a,b,c)
368	ldr	x28,[x30],#8		// *K++, x19 in next round
369	//add	x25,x25,x17			// h+=Sigma0(a)
370#ifndef	__AARCH64EB__
371	rev	x14,x14			// 11
372#endif
373	ldp	x15,x0,[x1],#2*8
374	add	x25,x25,x17			// h+=Sigma0(a)
375	str	x6,[sp,#24]
376	ror	x16,x21,#14
377	add	x24,x24,x28			// h+=K[i]
378	eor	x6,x21,x21,ror#23
379	and	x17,x22,x21
380	bic	x28,x23,x21
381	add	x24,x24,x14			// h+=X[i]
382	orr	x17,x17,x28			// Ch(e,f,g)
383	eor	x28,x25,x26			// a^b, b^c in next round
384	eor	x16,x16,x6,ror#18	// Sigma1(e)
385	ror	x6,x25,#28
386	add	x24,x24,x17			// h+=Ch(e,f,g)
387	eor	x17,x25,x25,ror#5
388	add	x24,x24,x16			// h+=Sigma1(e)
389	and	x19,x19,x28			// (b^c)&=(a^b)
390	add	x20,x20,x24			// d+=h
391	eor	x19,x19,x26			// Maj(a,b,c)
392	eor	x17,x6,x17,ror#34	// Sigma0(a)
393	add	x24,x24,x19			// h+=Maj(a,b,c)
394	ldr	x19,[x30],#8		// *K++, x28 in next round
395	//add	x24,x24,x17			// h+=Sigma0(a)
396#ifndef	__AARCH64EB__
397	rev	x15,x15			// 12
398#endif
399	add	x24,x24,x17			// h+=Sigma0(a)
400	str	x7,[sp,#0]
401	ror	x16,x20,#14
402	add	x23,x23,x19			// h+=K[i]
403	eor	x7,x20,x20,ror#23
404	and	x17,x21,x20
405	bic	x19,x22,x20
406	add	x23,x23,x15			// h+=X[i]
407	orr	x17,x17,x19			// Ch(e,f,g)
408	eor	x19,x24,x25			// a^b, b^c in next round
409	eor	x16,x16,x7,ror#18	// Sigma1(e)
410	ror	x7,x24,#28
411	add	x23,x23,x17			// h+=Ch(e,f,g)
412	eor	x17,x24,x24,ror#5
413	add	x23,x23,x16			// h+=Sigma1(e)
414	and	x28,x28,x19			// (b^c)&=(a^b)
415	add	x27,x27,x23			// d+=h
416	eor	x28,x28,x25			// Maj(a,b,c)
417	eor	x17,x7,x17,ror#34	// Sigma0(a)
418	add	x23,x23,x28			// h+=Maj(a,b,c)
419	ldr	x28,[x30],#8		// *K++, x19 in next round
420	//add	x23,x23,x17			// h+=Sigma0(a)
421#ifndef	__AARCH64EB__
422	rev	x0,x0			// 13
423#endif
424	ldp	x1,x2,[x1]
425	add	x23,x23,x17			// h+=Sigma0(a)
426	str	x8,[sp,#8]
427	ror	x16,x27,#14
428	add	x22,x22,x28			// h+=K[i]
429	eor	x8,x27,x27,ror#23
430	and	x17,x20,x27
431	bic	x28,x21,x27
432	add	x22,x22,x0			// h+=X[i]
433	orr	x17,x17,x28			// Ch(e,f,g)
434	eor	x28,x23,x24			// a^b, b^c in next round
435	eor	x16,x16,x8,ror#18	// Sigma1(e)
436	ror	x8,x23,#28
437	add	x22,x22,x17			// h+=Ch(e,f,g)
438	eor	x17,x23,x23,ror#5
439	add	x22,x22,x16			// h+=Sigma1(e)
440	and	x19,x19,x28			// (b^c)&=(a^b)
441	add	x26,x26,x22			// d+=h
442	eor	x19,x19,x24			// Maj(a,b,c)
443	eor	x17,x8,x17,ror#34	// Sigma0(a)
444	add	x22,x22,x19			// h+=Maj(a,b,c)
445	ldr	x19,[x30],#8		// *K++, x28 in next round
446	//add	x22,x22,x17			// h+=Sigma0(a)
447#ifndef	__AARCH64EB__
448	rev	x1,x1			// 14
449#endif
450	ldr	x6,[sp,#24]
451	add	x22,x22,x17			// h+=Sigma0(a)
452	str	x9,[sp,#16]
453	ror	x16,x26,#14
454	add	x21,x21,x19			// h+=K[i]
455	eor	x9,x26,x26,ror#23
456	and	x17,x27,x26
457	bic	x19,x20,x26
458	add	x21,x21,x1			// h+=X[i]
459	orr	x17,x17,x19			// Ch(e,f,g)
460	eor	x19,x22,x23			// a^b, b^c in next round
461	eor	x16,x16,x9,ror#18	// Sigma1(e)
462	ror	x9,x22,#28
463	add	x21,x21,x17			// h+=Ch(e,f,g)
464	eor	x17,x22,x22,ror#5
465	add	x21,x21,x16			// h+=Sigma1(e)
466	and	x28,x28,x19			// (b^c)&=(a^b)
467	add	x25,x25,x21			// d+=h
468	eor	x28,x28,x23			// Maj(a,b,c)
469	eor	x17,x9,x17,ror#34	// Sigma0(a)
470	add	x21,x21,x28			// h+=Maj(a,b,c)
471	ldr	x28,[x30],#8		// *K++, x19 in next round
472	//add	x21,x21,x17			// h+=Sigma0(a)
473#ifndef	__AARCH64EB__
474	rev	x2,x2			// 15
475#endif
476	ldr	x7,[sp,#0]
477	add	x21,x21,x17			// h+=Sigma0(a)
478	str	x10,[sp,#24]
479	ror	x16,x25,#14
480	add	x20,x20,x28			// h+=K[i]
481	ror	x9,x4,#1
482	and	x17,x26,x25
483	ror	x8,x1,#19
484	bic	x28,x27,x25
485	ror	x10,x21,#28
486	add	x20,x20,x2			// h+=X[i]
487	eor	x16,x16,x25,ror#18
488	eor	x9,x9,x4,ror#8
489	orr	x17,x17,x28			// Ch(e,f,g)
490	eor	x28,x21,x22			// a^b, b^c in next round
491	eor	x16,x16,x25,ror#41	// Sigma1(e)
492	eor	x10,x10,x21,ror#34
493	add	x20,x20,x17			// h+=Ch(e,f,g)
494	and	x19,x19,x28			// (b^c)&=(a^b)
495	eor	x8,x8,x1,ror#61
496	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
497	add	x20,x20,x16			// h+=Sigma1(e)
498	eor	x19,x19,x22			// Maj(a,b,c)
499	eor	x17,x10,x21,ror#39	// Sigma0(a)
500	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
501	add	x3,x3,x12
502	add	x24,x24,x20			// d+=h
503	add	x20,x20,x19			// h+=Maj(a,b,c)
504	ldr	x19,[x30],#8		// *K++, x28 in next round
505	add	x3,x3,x9
506	add	x20,x20,x17			// h+=Sigma0(a)
507	add	x3,x3,x8
508.Loop_16_xx:
509	ldr	x8,[sp,#8]
510	str	x11,[sp,#0]
511	ror	x16,x24,#14
512	add	x27,x27,x19			// h+=K[i]
513	ror	x10,x5,#1
514	and	x17,x25,x24
515	ror	x9,x2,#19
516	bic	x19,x26,x24
517	ror	x11,x20,#28
518	add	x27,x27,x3			// h+=X[i]
519	eor	x16,x16,x24,ror#18
520	eor	x10,x10,x5,ror#8
521	orr	x17,x17,x19			// Ch(e,f,g)
522	eor	x19,x20,x21			// a^b, b^c in next round
523	eor	x16,x16,x24,ror#41	// Sigma1(e)
524	eor	x11,x11,x20,ror#34
525	add	x27,x27,x17			// h+=Ch(e,f,g)
526	and	x28,x28,x19			// (b^c)&=(a^b)
527	eor	x9,x9,x2,ror#61
528	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
529	add	x27,x27,x16			// h+=Sigma1(e)
530	eor	x28,x28,x21			// Maj(a,b,c)
531	eor	x17,x11,x20,ror#39	// Sigma0(a)
532	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
533	add	x4,x4,x13
534	add	x23,x23,x27			// d+=h
535	add	x27,x27,x28			// h+=Maj(a,b,c)
536	ldr	x28,[x30],#8		// *K++, x19 in next round
537	add	x4,x4,x10
538	add	x27,x27,x17			// h+=Sigma0(a)
539	add	x4,x4,x9
540	ldr	x9,[sp,#16]
541	str	x12,[sp,#8]
542	ror	x16,x23,#14
543	add	x26,x26,x28			// h+=K[i]
544	ror	x11,x6,#1
545	and	x17,x24,x23
546	ror	x10,x3,#19
547	bic	x28,x25,x23
548	ror	x12,x27,#28
549	add	x26,x26,x4			// h+=X[i]
550	eor	x16,x16,x23,ror#18
551	eor	x11,x11,x6,ror#8
552	orr	x17,x17,x28			// Ch(e,f,g)
553	eor	x28,x27,x20			// a^b, b^c in next round
554	eor	x16,x16,x23,ror#41	// Sigma1(e)
555	eor	x12,x12,x27,ror#34
556	add	x26,x26,x17			// h+=Ch(e,f,g)
557	and	x19,x19,x28			// (b^c)&=(a^b)
558	eor	x10,x10,x3,ror#61
559	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
560	add	x26,x26,x16			// h+=Sigma1(e)
561	eor	x19,x19,x20			// Maj(a,b,c)
562	eor	x17,x12,x27,ror#39	// Sigma0(a)
563	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
564	add	x5,x5,x14
565	add	x22,x22,x26			// d+=h
566	add	x26,x26,x19			// h+=Maj(a,b,c)
567	ldr	x19,[x30],#8		// *K++, x28 in next round
568	add	x5,x5,x11
569	add	x26,x26,x17			// h+=Sigma0(a)
570	add	x5,x5,x10
571	ldr	x10,[sp,#24]
572	str	x13,[sp,#16]
573	ror	x16,x22,#14
574	add	x25,x25,x19			// h+=K[i]
575	ror	x12,x7,#1
576	and	x17,x23,x22
577	ror	x11,x4,#19
578	bic	x19,x24,x22
579	ror	x13,x26,#28
580	add	x25,x25,x5			// h+=X[i]
581	eor	x16,x16,x22,ror#18
582	eor	x12,x12,x7,ror#8
583	orr	x17,x17,x19			// Ch(e,f,g)
584	eor	x19,x26,x27			// a^b, b^c in next round
585	eor	x16,x16,x22,ror#41	// Sigma1(e)
586	eor	x13,x13,x26,ror#34
587	add	x25,x25,x17			// h+=Ch(e,f,g)
588	and	x28,x28,x19			// (b^c)&=(a^b)
589	eor	x11,x11,x4,ror#61
590	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
591	add	x25,x25,x16			// h+=Sigma1(e)
592	eor	x28,x28,x27			// Maj(a,b,c)
593	eor	x17,x13,x26,ror#39	// Sigma0(a)
594	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
595	add	x6,x6,x15
596	add	x21,x21,x25			// d+=h
597	add	x25,x25,x28			// h+=Maj(a,b,c)
598	ldr	x28,[x30],#8		// *K++, x19 in next round
599	add	x6,x6,x12
600	add	x25,x25,x17			// h+=Sigma0(a)
601	add	x6,x6,x11
602	ldr	x11,[sp,#0]
603	str	x14,[sp,#24]
604	ror	x16,x21,#14
605	add	x24,x24,x28			// h+=K[i]
606	ror	x13,x8,#1
607	and	x17,x22,x21
608	ror	x12,x5,#19
609	bic	x28,x23,x21
610	ror	x14,x25,#28
611	add	x24,x24,x6			// h+=X[i]
612	eor	x16,x16,x21,ror#18
613	eor	x13,x13,x8,ror#8
614	orr	x17,x17,x28			// Ch(e,f,g)
615	eor	x28,x25,x26			// a^b, b^c in next round
616	eor	x16,x16,x21,ror#41	// Sigma1(e)
617	eor	x14,x14,x25,ror#34
618	add	x24,x24,x17			// h+=Ch(e,f,g)
619	and	x19,x19,x28			// (b^c)&=(a^b)
620	eor	x12,x12,x5,ror#61
621	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
622	add	x24,x24,x16			// h+=Sigma1(e)
623	eor	x19,x19,x26			// Maj(a,b,c)
624	eor	x17,x14,x25,ror#39	// Sigma0(a)
625	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
626	add	x7,x7,x0
627	add	x20,x20,x24			// d+=h
628	add	x24,x24,x19			// h+=Maj(a,b,c)
629	ldr	x19,[x30],#8		// *K++, x28 in next round
630	add	x7,x7,x13
631	add	x24,x24,x17			// h+=Sigma0(a)
632	add	x7,x7,x12
633	ldr	x12,[sp,#8]
634	str	x15,[sp,#0]
635	ror	x16,x20,#14
636	add	x23,x23,x19			// h+=K[i]
637	ror	x14,x9,#1
638	and	x17,x21,x20
639	ror	x13,x6,#19
640	bic	x19,x22,x20
641	ror	x15,x24,#28
642	add	x23,x23,x7			// h+=X[i]
643	eor	x16,x16,x20,ror#18
644	eor	x14,x14,x9,ror#8
645	orr	x17,x17,x19			// Ch(e,f,g)
646	eor	x19,x24,x25			// a^b, b^c in next round
647	eor	x16,x16,x20,ror#41	// Sigma1(e)
648	eor	x15,x15,x24,ror#34
649	add	x23,x23,x17			// h+=Ch(e,f,g)
650	and	x28,x28,x19			// (b^c)&=(a^b)
651	eor	x13,x13,x6,ror#61
652	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
653	add	x23,x23,x16			// h+=Sigma1(e)
654	eor	x28,x28,x25			// Maj(a,b,c)
655	eor	x17,x15,x24,ror#39	// Sigma0(a)
656	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
657	add	x8,x8,x1
658	add	x27,x27,x23			// d+=h
659	add	x23,x23,x28			// h+=Maj(a,b,c)
660	ldr	x28,[x30],#8		// *K++, x19 in next round
661	add	x8,x8,x14
662	add	x23,x23,x17			// h+=Sigma0(a)
663	add	x8,x8,x13
664	ldr	x13,[sp,#16]
665	str	x0,[sp,#8]
666	ror	x16,x27,#14
667	add	x22,x22,x28			// h+=K[i]
668	ror	x15,x10,#1
669	and	x17,x20,x27
670	ror	x14,x7,#19
671	bic	x28,x21,x27
672	ror	x0,x23,#28
673	add	x22,x22,x8			// h+=X[i]
674	eor	x16,x16,x27,ror#18
675	eor	x15,x15,x10,ror#8
676	orr	x17,x17,x28			// Ch(e,f,g)
677	eor	x28,x23,x24			// a^b, b^c in next round
678	eor	x16,x16,x27,ror#41	// Sigma1(e)
679	eor	x0,x0,x23,ror#34
680	add	x22,x22,x17			// h+=Ch(e,f,g)
681	and	x19,x19,x28			// (b^c)&=(a^b)
682	eor	x14,x14,x7,ror#61
683	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
684	add	x22,x22,x16			// h+=Sigma1(e)
685	eor	x19,x19,x24			// Maj(a,b,c)
686	eor	x17,x0,x23,ror#39	// Sigma0(a)
687	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
688	add	x9,x9,x2
689	add	x26,x26,x22			// d+=h
690	add	x22,x22,x19			// h+=Maj(a,b,c)
691	ldr	x19,[x30],#8		// *K++, x28 in next round
692	add	x9,x9,x15
693	add	x22,x22,x17			// h+=Sigma0(a)
694	add	x9,x9,x14
695	ldr	x14,[sp,#24]
696	str	x1,[sp,#16]
697	ror	x16,x26,#14
698	add	x21,x21,x19			// h+=K[i]
699	ror	x0,x11,#1
700	and	x17,x27,x26
701	ror	x15,x8,#19
702	bic	x19,x20,x26
703	ror	x1,x22,#28
704	add	x21,x21,x9			// h+=X[i]
705	eor	x16,x16,x26,ror#18
706	eor	x0,x0,x11,ror#8
707	orr	x17,x17,x19			// Ch(e,f,g)
708	eor	x19,x22,x23			// a^b, b^c in next round
709	eor	x16,x16,x26,ror#41	// Sigma1(e)
710	eor	x1,x1,x22,ror#34
711	add	x21,x21,x17			// h+=Ch(e,f,g)
712	and	x28,x28,x19			// (b^c)&=(a^b)
713	eor	x15,x15,x8,ror#61
714	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
715	add	x21,x21,x16			// h+=Sigma1(e)
716	eor	x28,x28,x23			// Maj(a,b,c)
717	eor	x17,x1,x22,ror#39	// Sigma0(a)
718	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
719	add	x10,x10,x3
720	add	x25,x25,x21			// d+=h
721	add	x21,x21,x28			// h+=Maj(a,b,c)
722	ldr	x28,[x30],#8		// *K++, x19 in next round
723	add	x10,x10,x0
724	add	x21,x21,x17			// h+=Sigma0(a)
725	add	x10,x10,x15
726	ldr	x15,[sp,#0]
727	str	x2,[sp,#24]
728	ror	x16,x25,#14
729	add	x20,x20,x28			// h+=K[i]
730	ror	x1,x12,#1
731	and	x17,x26,x25
732	ror	x0,x9,#19
733	bic	x28,x27,x25
734	ror	x2,x21,#28
735	add	x20,x20,x10			// h+=X[i]
736	eor	x16,x16,x25,ror#18
737	eor	x1,x1,x12,ror#8
738	orr	x17,x17,x28			// Ch(e,f,g)
739	eor	x28,x21,x22			// a^b, b^c in next round
740	eor	x16,x16,x25,ror#41	// Sigma1(e)
741	eor	x2,x2,x21,ror#34
742	add	x20,x20,x17			// h+=Ch(e,f,g)
743	and	x19,x19,x28			// (b^c)&=(a^b)
744	eor	x0,x0,x9,ror#61
745	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
746	add	x20,x20,x16			// h+=Sigma1(e)
747	eor	x19,x19,x22			// Maj(a,b,c)
748	eor	x17,x2,x21,ror#39	// Sigma0(a)
749	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
750	add	x11,x11,x4
751	add	x24,x24,x20			// d+=h
752	add	x20,x20,x19			// h+=Maj(a,b,c)
753	ldr	x19,[x30],#8		// *K++, x28 in next round
754	add	x11,x11,x1
755	add	x20,x20,x17			// h+=Sigma0(a)
756	add	x11,x11,x0
757	ldr	x0,[sp,#8]
758	str	x3,[sp,#0]
759	ror	x16,x24,#14
760	add	x27,x27,x19			// h+=K[i]
761	ror	x2,x13,#1
762	and	x17,x25,x24
763	ror	x1,x10,#19
764	bic	x19,x26,x24
765	ror	x3,x20,#28
766	add	x27,x27,x11			// h+=X[i]
767	eor	x16,x16,x24,ror#18
768	eor	x2,x2,x13,ror#8
769	orr	x17,x17,x19			// Ch(e,f,g)
770	eor	x19,x20,x21			// a^b, b^c in next round
771	eor	x16,x16,x24,ror#41	// Sigma1(e)
772	eor	x3,x3,x20,ror#34
773	add	x27,x27,x17			// h+=Ch(e,f,g)
774	and	x28,x28,x19			// (b^c)&=(a^b)
775	eor	x1,x1,x10,ror#61
776	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
777	add	x27,x27,x16			// h+=Sigma1(e)
778	eor	x28,x28,x21			// Maj(a,b,c)
779	eor	x17,x3,x20,ror#39	// Sigma0(a)
780	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
781	add	x12,x12,x5
782	add	x23,x23,x27			// d+=h
783	add	x27,x27,x28			// h+=Maj(a,b,c)
784	ldr	x28,[x30],#8		// *K++, x19 in next round
785	add	x12,x12,x2
786	add	x27,x27,x17			// h+=Sigma0(a)
787	add	x12,x12,x1
788	ldr	x1,[sp,#16]
789	str	x4,[sp,#8]
790	ror	x16,x23,#14
791	add	x26,x26,x28			// h+=K[i]
792	ror	x3,x14,#1
793	and	x17,x24,x23
794	ror	x2,x11,#19
795	bic	x28,x25,x23
796	ror	x4,x27,#28
797	add	x26,x26,x12			// h+=X[i]
798	eor	x16,x16,x23,ror#18
799	eor	x3,x3,x14,ror#8
800	orr	x17,x17,x28			// Ch(e,f,g)
801	eor	x28,x27,x20			// a^b, b^c in next round
802	eor	x16,x16,x23,ror#41	// Sigma1(e)
803	eor	x4,x4,x27,ror#34
804	add	x26,x26,x17			// h+=Ch(e,f,g)
805	and	x19,x19,x28			// (b^c)&=(a^b)
806	eor	x2,x2,x11,ror#61
807	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
808	add	x26,x26,x16			// h+=Sigma1(e)
809	eor	x19,x19,x20			// Maj(a,b,c)
810	eor	x17,x4,x27,ror#39	// Sigma0(a)
811	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
812	add	x13,x13,x6
813	add	x22,x22,x26			// d+=h
814	add	x26,x26,x19			// h+=Maj(a,b,c)
815	ldr	x19,[x30],#8		// *K++, x28 in next round
816	add	x13,x13,x3
817	add	x26,x26,x17			// h+=Sigma0(a)
818	add	x13,x13,x2
819	ldr	x2,[sp,#24]
820	str	x5,[sp,#16]
821	ror	x16,x22,#14
822	add	x25,x25,x19			// h+=K[i]
823	ror	x4,x15,#1
824	and	x17,x23,x22
825	ror	x3,x12,#19
826	bic	x19,x24,x22
827	ror	x5,x26,#28
828	add	x25,x25,x13			// h+=X[i]
829	eor	x16,x16,x22,ror#18
830	eor	x4,x4,x15,ror#8
831	orr	x17,x17,x19			// Ch(e,f,g)
832	eor	x19,x26,x27			// a^b, b^c in next round
833	eor	x16,x16,x22,ror#41	// Sigma1(e)
834	eor	x5,x5,x26,ror#34
835	add	x25,x25,x17			// h+=Ch(e,f,g)
836	and	x28,x28,x19			// (b^c)&=(a^b)
837	eor	x3,x3,x12,ror#61
838	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
839	add	x25,x25,x16			// h+=Sigma1(e)
840	eor	x28,x28,x27			// Maj(a,b,c)
841	eor	x17,x5,x26,ror#39	// Sigma0(a)
842	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
843	add	x14,x14,x7
844	add	x21,x21,x25			// d+=h
845	add	x25,x25,x28			// h+=Maj(a,b,c)
846	ldr	x28,[x30],#8		// *K++, x19 in next round
847	add	x14,x14,x4
848	add	x25,x25,x17			// h+=Sigma0(a)
849	add	x14,x14,x3
850	ldr	x3,[sp,#0]
851	str	x6,[sp,#24]
852	ror	x16,x21,#14
853	add	x24,x24,x28			// h+=K[i]
854	ror	x5,x0,#1
855	and	x17,x22,x21
856	ror	x4,x13,#19
857	bic	x28,x23,x21
858	ror	x6,x25,#28
859	add	x24,x24,x14			// h+=X[i]
860	eor	x16,x16,x21,ror#18
861	eor	x5,x5,x0,ror#8
862	orr	x17,x17,x28			// Ch(e,f,g)
863	eor	x28,x25,x26			// a^b, b^c in next round
864	eor	x16,x16,x21,ror#41	// Sigma1(e)
865	eor	x6,x6,x25,ror#34
866	add	x24,x24,x17			// h+=Ch(e,f,g)
867	and	x19,x19,x28			// (b^c)&=(a^b)
868	eor	x4,x4,x13,ror#61
869	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
870	add	x24,x24,x16			// h+=Sigma1(e)
871	eor	x19,x19,x26			// Maj(a,b,c)
872	eor	x17,x6,x25,ror#39	// Sigma0(a)
873	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
874	add	x15,x15,x8
875	add	x20,x20,x24			// d+=h
876	add	x24,x24,x19			// h+=Maj(a,b,c)
877	ldr	x19,[x30],#8		// *K++, x28 in next round
878	add	x15,x15,x5
879	add	x24,x24,x17			// h+=Sigma0(a)
880	add	x15,x15,x4
881	ldr	x4,[sp,#8]
882	str	x7,[sp,#0]
883	ror	x16,x20,#14
884	add	x23,x23,x19			// h+=K[i]
885	ror	x6,x1,#1
886	and	x17,x21,x20
887	ror	x5,x14,#19
888	bic	x19,x22,x20
889	ror	x7,x24,#28
890	add	x23,x23,x15			// h+=X[i]
891	eor	x16,x16,x20,ror#18
892	eor	x6,x6,x1,ror#8
893	orr	x17,x17,x19			// Ch(e,f,g)
894	eor	x19,x24,x25			// a^b, b^c in next round
895	eor	x16,x16,x20,ror#41	// Sigma1(e)
896	eor	x7,x7,x24,ror#34
897	add	x23,x23,x17			// h+=Ch(e,f,g)
898	and	x28,x28,x19			// (b^c)&=(a^b)
899	eor	x5,x5,x14,ror#61
900	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
901	add	x23,x23,x16			// h+=Sigma1(e)
902	eor	x28,x28,x25			// Maj(a,b,c)
903	eor	x17,x7,x24,ror#39	// Sigma0(a)
904	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
905	add	x0,x0,x9
906	add	x27,x27,x23			// d+=h
907	add	x23,x23,x28			// h+=Maj(a,b,c)
908	ldr	x28,[x30],#8		// *K++, x19 in next round
909	add	x0,x0,x6
910	add	x23,x23,x17			// h+=Sigma0(a)
911	add	x0,x0,x5
912	ldr	x5,[sp,#16]
913	str	x8,[sp,#8]
914	ror	x16,x27,#14
915	add	x22,x22,x28			// h+=K[i]
916	ror	x7,x2,#1
917	and	x17,x20,x27
918	ror	x6,x15,#19
919	bic	x28,x21,x27
920	ror	x8,x23,#28
921	add	x22,x22,x0			// h+=X[i]
922	eor	x16,x16,x27,ror#18
923	eor	x7,x7,x2,ror#8
924	orr	x17,x17,x28			// Ch(e,f,g)
925	eor	x28,x23,x24			// a^b, b^c in next round
926	eor	x16,x16,x27,ror#41	// Sigma1(e)
927	eor	x8,x8,x23,ror#34
928	add	x22,x22,x17			// h+=Ch(e,f,g)
929	and	x19,x19,x28			// (b^c)&=(a^b)
930	eor	x6,x6,x15,ror#61
931	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
932	add	x22,x22,x16			// h+=Sigma1(e)
933	eor	x19,x19,x24			// Maj(a,b,c)
934	eor	x17,x8,x23,ror#39	// Sigma0(a)
935	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
936	add	x1,x1,x10
937	add	x26,x26,x22			// d+=h
938	add	x22,x22,x19			// h+=Maj(a,b,c)
939	ldr	x19,[x30],#8		// *K++, x28 in next round
940	add	x1,x1,x7
941	add	x22,x22,x17			// h+=Sigma0(a)
942	add	x1,x1,x6
943	ldr	x6,[sp,#24]
944	str	x9,[sp,#16]
945	ror	x16,x26,#14
946	add	x21,x21,x19			// h+=K[i]
947	ror	x8,x3,#1
948	and	x17,x27,x26
949	ror	x7,x0,#19
950	bic	x19,x20,x26
951	ror	x9,x22,#28
952	add	x21,x21,x1			// h+=X[i]
953	eor	x16,x16,x26,ror#18
954	eor	x8,x8,x3,ror#8
955	orr	x17,x17,x19			// Ch(e,f,g)
956	eor	x19,x22,x23			// a^b, b^c in next round
957	eor	x16,x16,x26,ror#41	// Sigma1(e)
958	eor	x9,x9,x22,ror#34
959	add	x21,x21,x17			// h+=Ch(e,f,g)
960	and	x28,x28,x19			// (b^c)&=(a^b)
961	eor	x7,x7,x0,ror#61
962	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
963	add	x21,x21,x16			// h+=Sigma1(e)
964	eor	x28,x28,x23			// Maj(a,b,c)
965	eor	x17,x9,x22,ror#39	// Sigma0(a)
966	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
967	add	x2,x2,x11
968	add	x25,x25,x21			// d+=h
969	add	x21,x21,x28			// h+=Maj(a,b,c)
970	ldr	x28,[x30],#8		// *K++, x19 in next round
971	add	x2,x2,x8
972	add	x21,x21,x17			// h+=Sigma0(a)
973	add	x2,x2,x7
974	ldr	x7,[sp,#0]
975	str	x10,[sp,#24]
976	ror	x16,x25,#14
977	add	x20,x20,x28			// h+=K[i]
978	ror	x9,x4,#1
979	and	x17,x26,x25
980	ror	x8,x1,#19
981	bic	x28,x27,x25
982	ror	x10,x21,#28
983	add	x20,x20,x2			// h+=X[i]
984	eor	x16,x16,x25,ror#18
985	eor	x9,x9,x4,ror#8
986	orr	x17,x17,x28			// Ch(e,f,g)
987	eor	x28,x21,x22			// a^b, b^c in next round
988	eor	x16,x16,x25,ror#41	// Sigma1(e)
989	eor	x10,x10,x21,ror#34
990	add	x20,x20,x17			// h+=Ch(e,f,g)
991	and	x19,x19,x28			// (b^c)&=(a^b)
992	eor	x8,x8,x1,ror#61
993	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
994	add	x20,x20,x16			// h+=Sigma1(e)
995	eor	x19,x19,x22			// Maj(a,b,c)
996	eor	x17,x10,x21,ror#39	// Sigma0(a)
997	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
998	add	x3,x3,x12
999	add	x24,x24,x20			// d+=h
1000	add	x20,x20,x19			// h+=Maj(a,b,c)
1001	ldr	x19,[x30],#8		// *K++, x28 in next round
1002	add	x3,x3,x9
1003	add	x20,x20,x17			// h+=Sigma0(a)
1004	add	x3,x3,x8
1005	cbnz	x19,.Loop_16_xx
1006
1007	ldp	x0,x2,[x29,#96]
1008	ldr	x1,[x29,#112]
1009	sub	x30,x30,#648		// rewind
1010
1011	ldp	x3,x4,[x0]
1012	ldp	x5,x6,[x0,#2*8]
1013	add	x1,x1,#14*8			// advance input pointer
1014	ldp	x7,x8,[x0,#4*8]
1015	add	x20,x20,x3
1016	ldp	x9,x10,[x0,#6*8]
1017	add	x21,x21,x4
1018	add	x22,x22,x5
1019	add	x23,x23,x6
1020	stp	x20,x21,[x0]
1021	add	x24,x24,x7
1022	add	x25,x25,x8
1023	stp	x22,x23,[x0,#2*8]
1024	add	x26,x26,x9
1025	add	x27,x27,x10
1026	cmp	x1,x2
1027	stp	x24,x25,[x0,#4*8]
1028	stp	x26,x27,[x0,#6*8]
1029	b.ne	.Loop
1030
1031	ldp	x19,x20,[x29,#16]
1032	add	sp,sp,#4*8
1033	ldp	x21,x22,[x29,#32]
1034	ldp	x23,x24,[x29,#48]
1035	ldp	x25,x26,[x29,#64]
1036	ldp	x27,x28,[x29,#80]
1037	ldp	x29,x30,[sp],#128
1038	AARCH64_VALIDATE_LINK_REGISTER
1039	ret
1040.size	sha512_block_data_order,.-sha512_block_data_order
1041
// .LK512 -- table of the eighty 64-bit K512 round constants (forty .quad
// lines, two constants each), followed by one zero quadword.
//
// How the visible code consumes it:
//   * sha512_block_armv8 takes the table address with "adr x3,.LK512", reads
//     it 16 bytes at a time and rewinds its cursor by exactly 80*8 bytes, so
//     it never reads the terminator.
//   * The scalar round loop earlier in this file keeps looping while the
//     value loaded through its K pointer is non-zero (cbnz ...,.Loop_16_xx)
//     and then rewinds by 648 = 81*8 bytes, i.e. 80 constants plus the zero
//     terminator. NOTE(review): that pointer is set up outside this excerpt;
//     presumably it also points at .LK512 -- confirm.
//
// ".align 6" requests 2^6 = 64-byte alignment for the table.
1042.align	6
1043.type	.LK512,%object
1044.LK512:
1045.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1046.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1047.quad	0x3956c25bf348b538,0x59f111f1b605d019
1048.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1049.quad	0xd807aa98a3030242,0x12835b0145706fbe
1050.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1051.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1052.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1053.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1054.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1055.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1056.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1057.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1058.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1059.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1060.quad	0x06ca6351e003826f,0x142929670a0e6e70
1061.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1062.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1063.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1064.quad	0x81c2c92e47edaee6,0x92722c851482353b
1065.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1066.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1067.quad	0xd192e819d6ef5218,0xd69906245565a910
1068.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1069.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1070.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1071.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1072.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1073.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1074.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1075.quad	0x90befffa23631e28,0xa4506cebde82bde9
1076.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1077.quad	0xca273eceea26619c,0xd186b8c721c0c207
1078.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1079.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1080.quad	0x113f9804bef90dae,0x1b710b35131c471b
1081.quad	0x28db77f523047d84,0x32caab7b40c72493
1082.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1083.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1084.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1085.quad	0	// terminator
1086.size	.LK512,.-.LK512
// NUL-terminated ASCII identification string; the bytes below spell
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
1087.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to 2^2 = 4 bytes after the odd-length string so that the code
// that follows starts on an instruction boundary. The directive appears
// twice; the second one is a no-op (presumably a generator artifact).
1088.align	2
1089.align	2
// sha512_block_armv8 -- SHA-512 block function built on the SHA512 vector
// instructions (sha512h, sha512h2, sha512su0, sha512su1). Those instructions
// are emitted as raw ".inst" words with the mnemonic kept in a trailing
// comment (presumably so that assemblers lacking SHA512 support can still
// build this file -- TODO confirm). Assembled only when __KERNEL__ is not
// defined.
//
// Register usage as established by the code below:
//   x0  pointer to the 64-byte hash state (eight 64-bit words); loaded once
//       on entry and stored once on exit
//   x1  input pointer; 128 bytes are consumed per block
//   x2  number of 128-byte blocks; decremented once per .Loop_hw pass.
//       NOTE(review): the counter is decremented before it is tested, so a
//       zero count on entry would wrap -- presumably callers never pass 0;
//       confirm.
//   x3  cursor into .LK512; x4 scratch (rewound input pointer)
//   v0-v3    working state (roles rotate through v0-v4 inside the loop)
//   v4-v7    temporaries
//   v16-v23  sixteen 64-bit message-schedule words (two per vector)
//   v24,v25  K pair + schedule pair for the current / next two rounds
//   v26-v29  copy of the state at the start of the block
// v8-v15 and x19-x28 are not touched.
//
// No .globl/.hidden directive is visible for this symbol in this excerpt;
// the local .Lv8_entry label suggests it is reached by a branch from
// elsewhere in the file -- TODO confirm against the dispatcher.
1090#ifndef	__KERNEL__
1091.type	sha512_block_armv8,%function
1092.align	6
1093sha512_block_armv8:
1094.Lv8_entry:
1095	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
1096	stp	x29,x30,[sp,#-16]!
1097	add	x29,sp,#0
1098
// Load the first 128-byte block into v16-v23; x1 ends up 128 bytes past it.
1099	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1100	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1101
1102	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1103	adr	x3,.LK512
1104
// Byte-reverse every 64-bit lane of the input block.
1105	rev64	v16.16b,v16.16b
1106	rev64	v17.16b,v17.16b
1107	rev64	v18.16b,v18.16b
1108	rev64	v19.16b,v19.16b
1109	rev64	v20.16b,v20.16b
1110	rev64	v21.16b,v21.16b
1111	rev64	v22.16b,v22.16b
1112	rev64	v23.16b,v23.16b
1113	b	.Loop_hw
1114
// One pass of .Loop_hw processes one 128-byte block (80 rounds).
//
// Loop head:
//   * v24 <- first K pair.
//   * subs decrements the block counter; its flags are consumed by the csel
//     below (nothing in between writes the flags).
//   * x4 = x1 - 128 = start of the block currently held in v16-v23.
//   * v26-v29 keep a copy of the incoming state for the final accumulate.
//   * csel: while blocks remain (ne) x1 is kept; on the last block (counter
//     reached zero) x1 is rewound to x4, so the "load next input" loads in
//     the last eight groups re-read the current block instead of reading
//     beyond it.
1115.align	4
1116.Loop_hw:
1117	ld1	{v24.2d},[x3],#16
1118	subs	x2,x2,#1
1119	sub	x4,x1,#128
1120	orr	v26.16b,v0.16b,v0.16b			// offload
1121	orr	v27.16b,v1.16b,v1.16b
1122	orr	v28.16b,v2.16b,v2.16b
1123	orr	v29.16b,v3.16b,v3.16b
1124	csel	x1,x1,x4,ne			// conditional rewind
// Rounds 0-63: thirty-two groups of twelve instructions, two rounds each.
// Within a group:
//   add + ext #8      K pair + schedule pair, then the two 64-bit halves
//                     are swapped
//   ext v5 / ext v6   operands assembled from neighbouring state vectors
//   sha512h, sha512h2 the two parts of the state update
//   sha512su0, su1    extend the message schedule in place; v16-v23 act as
//                     a rotating window (v7 supplies the straddling pair)
//   ld1               fetch the K pair for the next group (v24 and v25
//                     alternate)
// The state is never shuffled between groups; instead the register roles
// rotate through v0-v4 from one group to the next.
1125	add	v24.2d,v24.2d,v16.2d
1126	ld1	{v25.2d},[x3],#16
1127	ext	v24.16b,v24.16b,v24.16b,#8
1128	ext	v5.16b,v2.16b,v3.16b,#8
1129	ext	v6.16b,v1.16b,v2.16b,#8
1130	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1131.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1132	ext	v7.16b,v20.16b,v21.16b,#8
1133.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1134.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1135	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1136.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1137	add	v25.2d,v25.2d,v17.2d
1138	ld1	{v24.2d},[x3],#16
1139	ext	v25.16b,v25.16b,v25.16b,#8
1140	ext	v5.16b,v4.16b,v2.16b,#8
1141	ext	v6.16b,v0.16b,v4.16b,#8
1142	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1143.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1144	ext	v7.16b,v21.16b,v22.16b,#8
1145.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1146.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1147	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1148.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1149	add	v24.2d,v24.2d,v18.2d
1150	ld1	{v25.2d},[x3],#16
1151	ext	v24.16b,v24.16b,v24.16b,#8
1152	ext	v5.16b,v1.16b,v4.16b,#8
1153	ext	v6.16b,v3.16b,v1.16b,#8
1154	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1155.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1156	ext	v7.16b,v22.16b,v23.16b,#8
1157.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1158.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1159	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1160.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1161	add	v25.2d,v25.2d,v19.2d
1162	ld1	{v24.2d},[x3],#16
1163	ext	v25.16b,v25.16b,v25.16b,#8
1164	ext	v5.16b,v0.16b,v1.16b,#8
1165	ext	v6.16b,v2.16b,v0.16b,#8
1166	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1167.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1168	ext	v7.16b,v23.16b,v16.16b,#8
1169.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1170.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1171	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1172.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1173	add	v24.2d,v24.2d,v20.2d
1174	ld1	{v25.2d},[x3],#16
1175	ext	v24.16b,v24.16b,v24.16b,#8
1176	ext	v5.16b,v3.16b,v0.16b,#8
1177	ext	v6.16b,v4.16b,v3.16b,#8
1178	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1179.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1180	ext	v7.16b,v16.16b,v17.16b,#8
1181.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1182.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1183	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1184.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1185	add	v25.2d,v25.2d,v21.2d
1186	ld1	{v24.2d},[x3],#16
1187	ext	v25.16b,v25.16b,v25.16b,#8
1188	ext	v5.16b,v2.16b,v3.16b,#8
1189	ext	v6.16b,v1.16b,v2.16b,#8
1190	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1191.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1192	ext	v7.16b,v17.16b,v18.16b,#8
1193.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1194.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1195	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1196.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1197	add	v24.2d,v24.2d,v22.2d
1198	ld1	{v25.2d},[x3],#16
1199	ext	v24.16b,v24.16b,v24.16b,#8
1200	ext	v5.16b,v4.16b,v2.16b,#8
1201	ext	v6.16b,v0.16b,v4.16b,#8
1202	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1203.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1204	ext	v7.16b,v18.16b,v19.16b,#8
1205.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1206.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1207	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1208.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1209	add	v25.2d,v25.2d,v23.2d
1210	ld1	{v24.2d},[x3],#16
1211	ext	v25.16b,v25.16b,v25.16b,#8
1212	ext	v5.16b,v1.16b,v4.16b,#8
1213	ext	v6.16b,v3.16b,v1.16b,#8
1214	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1215.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1216	ext	v7.16b,v19.16b,v20.16b,#8
1217.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1218.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1219	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1220.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1221	add	v24.2d,v24.2d,v16.2d
1222	ld1	{v25.2d},[x3],#16
1223	ext	v24.16b,v24.16b,v24.16b,#8
1224	ext	v5.16b,v0.16b,v1.16b,#8
1225	ext	v6.16b,v2.16b,v0.16b,#8
1226	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1227.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1228	ext	v7.16b,v20.16b,v21.16b,#8
1229.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1230.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1231	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1232.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1233	add	v25.2d,v25.2d,v17.2d
1234	ld1	{v24.2d},[x3],#16
1235	ext	v25.16b,v25.16b,v25.16b,#8
1236	ext	v5.16b,v3.16b,v0.16b,#8
1237	ext	v6.16b,v4.16b,v3.16b,#8
1238	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1239.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1240	ext	v7.16b,v21.16b,v22.16b,#8
1241.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1242.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1243	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1244.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1245	add	v24.2d,v24.2d,v18.2d
1246	ld1	{v25.2d},[x3],#16
1247	ext	v24.16b,v24.16b,v24.16b,#8
1248	ext	v5.16b,v2.16b,v3.16b,#8
1249	ext	v6.16b,v1.16b,v2.16b,#8
1250	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1251.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1252	ext	v7.16b,v22.16b,v23.16b,#8
1253.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1254.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1255	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1256.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1257	add	v25.2d,v25.2d,v19.2d
1258	ld1	{v24.2d},[x3],#16
1259	ext	v25.16b,v25.16b,v25.16b,#8
1260	ext	v5.16b,v4.16b,v2.16b,#8
1261	ext	v6.16b,v0.16b,v4.16b,#8
1262	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1263.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1264	ext	v7.16b,v23.16b,v16.16b,#8
1265.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1266.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1267	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1268.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1269	add	v24.2d,v24.2d,v20.2d
1270	ld1	{v25.2d},[x3],#16
1271	ext	v24.16b,v24.16b,v24.16b,#8
1272	ext	v5.16b,v1.16b,v4.16b,#8
1273	ext	v6.16b,v3.16b,v1.16b,#8
1274	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1275.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1276	ext	v7.16b,v16.16b,v17.16b,#8
1277.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1278.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1279	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1280.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1281	add	v25.2d,v25.2d,v21.2d
1282	ld1	{v24.2d},[x3],#16
1283	ext	v25.16b,v25.16b,v25.16b,#8
1284	ext	v5.16b,v0.16b,v1.16b,#8
1285	ext	v6.16b,v2.16b,v0.16b,#8
1286	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1287.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1288	ext	v7.16b,v17.16b,v18.16b,#8
1289.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1290.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1291	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1292.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1293	add	v24.2d,v24.2d,v22.2d
1294	ld1	{v25.2d},[x3],#16
1295	ext	v24.16b,v24.16b,v24.16b,#8
1296	ext	v5.16b,v3.16b,v0.16b,#8
1297	ext	v6.16b,v4.16b,v3.16b,#8
1298	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1299.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1300	ext	v7.16b,v18.16b,v19.16b,#8
1301.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1302.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1303	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1304.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1305	add	v25.2d,v25.2d,v23.2d
1306	ld1	{v24.2d},[x3],#16
1307	ext	v25.16b,v25.16b,v25.16b,#8
1308	ext	v5.16b,v2.16b,v3.16b,#8
1309	ext	v6.16b,v1.16b,v2.16b,#8
1310	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1311.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1312	ext	v7.16b,v19.16b,v20.16b,#8
1313.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1314.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1315	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1316.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1317	add	v24.2d,v24.2d,v16.2d
1318	ld1	{v25.2d},[x3],#16
1319	ext	v24.16b,v24.16b,v24.16b,#8
1320	ext	v5.16b,v4.16b,v2.16b,#8
1321	ext	v6.16b,v0.16b,v4.16b,#8
1322	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1323.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1324	ext	v7.16b,v20.16b,v21.16b,#8
1325.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1326.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1327	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1328.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1329	add	v25.2d,v25.2d,v17.2d
1330	ld1	{v24.2d},[x3],#16
1331	ext	v25.16b,v25.16b,v25.16b,#8
1332	ext	v5.16b,v1.16b,v4.16b,#8
1333	ext	v6.16b,v3.16b,v1.16b,#8
1334	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1335.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1336	ext	v7.16b,v21.16b,v22.16b,#8
1337.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1338.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1339	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1340.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1341	add	v24.2d,v24.2d,v18.2d
1342	ld1	{v25.2d},[x3],#16
1343	ext	v24.16b,v24.16b,v24.16b,#8
1344	ext	v5.16b,v0.16b,v1.16b,#8
1345	ext	v6.16b,v2.16b,v0.16b,#8
1346	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1347.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1348	ext	v7.16b,v22.16b,v23.16b,#8
1349.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1350.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1351	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1352.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1353	add	v25.2d,v25.2d,v19.2d
1354	ld1	{v24.2d},[x3],#16
1355	ext	v25.16b,v25.16b,v25.16b,#8
1356	ext	v5.16b,v3.16b,v0.16b,#8
1357	ext	v6.16b,v4.16b,v3.16b,#8
1358	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1359.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1360	ext	v7.16b,v23.16b,v16.16b,#8
1361.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1362.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1363	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1364.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1365	add	v24.2d,v24.2d,v20.2d
1366	ld1	{v25.2d},[x3],#16
1367	ext	v24.16b,v24.16b,v24.16b,#8
1368	ext	v5.16b,v2.16b,v3.16b,#8
1369	ext	v6.16b,v1.16b,v2.16b,#8
1370	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1371.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1372	ext	v7.16b,v16.16b,v17.16b,#8
1373.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1374.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1375	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1376.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1377	add	v25.2d,v25.2d,v21.2d
1378	ld1	{v24.2d},[x3],#16
1379	ext	v25.16b,v25.16b,v25.16b,#8
1380	ext	v5.16b,v4.16b,v2.16b,#8
1381	ext	v6.16b,v0.16b,v4.16b,#8
1382	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1383.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1384	ext	v7.16b,v17.16b,v18.16b,#8
1385.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1386.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1387	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1388.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1389	add	v24.2d,v24.2d,v22.2d
1390	ld1	{v25.2d},[x3],#16
1391	ext	v24.16b,v24.16b,v24.16b,#8
1392	ext	v5.16b,v1.16b,v4.16b,#8
1393	ext	v6.16b,v3.16b,v1.16b,#8
1394	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1395.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1396	ext	v7.16b,v18.16b,v19.16b,#8
1397.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1398.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1399	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1400.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1401	add	v25.2d,v25.2d,v23.2d
1402	ld1	{v24.2d},[x3],#16
1403	ext	v25.16b,v25.16b,v25.16b,#8
1404	ext	v5.16b,v0.16b,v1.16b,#8
1405	ext	v6.16b,v2.16b,v0.16b,#8
1406	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1407.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1408	ext	v7.16b,v19.16b,v20.16b,#8
1409.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1410.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1411	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1412.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1413	add	v24.2d,v24.2d,v16.2d
1414	ld1	{v25.2d},[x3],#16
1415	ext	v24.16b,v24.16b,v24.16b,#8
1416	ext	v5.16b,v3.16b,v0.16b,#8
1417	ext	v6.16b,v4.16b,v3.16b,#8
1418	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1419.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1420	ext	v7.16b,v20.16b,v21.16b,#8
1421.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1422.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1423	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1424.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1425	add	v25.2d,v25.2d,v17.2d
1426	ld1	{v24.2d},[x3],#16
1427	ext	v25.16b,v25.16b,v25.16b,#8
1428	ext	v5.16b,v2.16b,v3.16b,#8
1429	ext	v6.16b,v1.16b,v2.16b,#8
1430	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1431.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1432	ext	v7.16b,v21.16b,v22.16b,#8
1433.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1434.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1435	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1436.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1437	add	v24.2d,v24.2d,v18.2d
1438	ld1	{v25.2d},[x3],#16
1439	ext	v24.16b,v24.16b,v24.16b,#8
1440	ext	v5.16b,v4.16b,v2.16b,#8
1441	ext	v6.16b,v0.16b,v4.16b,#8
1442	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1443.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1444	ext	v7.16b,v22.16b,v23.16b,#8
1445.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1446.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1447	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1448.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1449	add	v25.2d,v25.2d,v19.2d
1450	ld1	{v24.2d},[x3],#16
1451	ext	v25.16b,v25.16b,v25.16b,#8
1452	ext	v5.16b,v1.16b,v4.16b,#8
1453	ext	v6.16b,v3.16b,v1.16b,#8
1454	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1455.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1456	ext	v7.16b,v23.16b,v16.16b,#8
1457.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1458.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1459	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1460.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1461	add	v24.2d,v24.2d,v20.2d
1462	ld1	{v25.2d},[x3],#16
1463	ext	v24.16b,v24.16b,v24.16b,#8
1464	ext	v5.16b,v0.16b,v1.16b,#8
1465	ext	v6.16b,v2.16b,v0.16b,#8
1466	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1467.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1468	ext	v7.16b,v16.16b,v17.16b,#8
1469.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1470.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1471	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1472.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1473	add	v25.2d,v25.2d,v21.2d
1474	ld1	{v24.2d},[x3],#16
1475	ext	v25.16b,v25.16b,v25.16b,#8
1476	ext	v5.16b,v3.16b,v0.16b,#8
1477	ext	v6.16b,v4.16b,v3.16b,#8
1478	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1479.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1480	ext	v7.16b,v17.16b,v18.16b,#8
1481.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1482.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1483	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1484.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1485	add	v24.2d,v24.2d,v22.2d
1486	ld1	{v25.2d},[x3],#16
1487	ext	v24.16b,v24.16b,v24.16b,#8
1488	ext	v5.16b,v2.16b,v3.16b,#8
1489	ext	v6.16b,v1.16b,v2.16b,#8
1490	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1491.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1492	ext	v7.16b,v18.16b,v19.16b,#8
1493.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1494.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1495	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1496.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1497	add	v25.2d,v25.2d,v23.2d
1498	ld1	{v24.2d},[x3],#16
1499	ext	v25.16b,v25.16b,v25.16b,#8
1500	ext	v5.16b,v4.16b,v2.16b,#8
1501	ext	v6.16b,v0.16b,v4.16b,#8
1502	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1503.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1504	ext	v7.16b,v19.16b,v20.16b,#8
1505.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1506.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1507	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1508.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Rounds 64-79: eight groups with no further schedule extension. After a
// schedule vector has been added to its K pair it is free, so each group
// refills that vector (v16, v17, ... v23 in turn) with the next 16 input
// bytes and byte-reverses each 64-bit lane with rev64, preparing the next
// pass. On the final pass x1 was rewound at the loop head, so these loads
// re-read the block that was just processed and the result is unused.
// The last group has no K load; x3 is rewound by 80*8 bytes (the forty
// 16-byte K loads of this pass) back to the start of .LK512 instead.
1509	ld1	{v25.2d},[x3],#16
1510	add	v24.2d,v24.2d,v16.2d
1511	ld1	{v16.16b},[x1],#16		// load next input
1512	ext	v24.16b,v24.16b,v24.16b,#8
1513	ext	v5.16b,v1.16b,v4.16b,#8
1514	ext	v6.16b,v3.16b,v1.16b,#8
1515	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1516.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1517	rev64	v16.16b,v16.16b
1518	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1519.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1520	ld1	{v24.2d},[x3],#16
1521	add	v25.2d,v25.2d,v17.2d
1522	ld1	{v17.16b},[x1],#16		// load next input
1523	ext	v25.16b,v25.16b,v25.16b,#8
1524	ext	v5.16b,v0.16b,v1.16b,#8
1525	ext	v6.16b,v2.16b,v0.16b,#8
1526	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1527.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1528	rev64	v17.16b,v17.16b
1529	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1530.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1531	ld1	{v25.2d},[x3],#16
1532	add	v24.2d,v24.2d,v18.2d
1533	ld1	{v18.16b},[x1],#16		// load next input
1534	ext	v24.16b,v24.16b,v24.16b,#8
1535	ext	v5.16b,v3.16b,v0.16b,#8
1536	ext	v6.16b,v4.16b,v3.16b,#8
1537	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1538.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1539	rev64	v18.16b,v18.16b
1540	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1541.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1542	ld1	{v24.2d},[x3],#16
1543	add	v25.2d,v25.2d,v19.2d
1544	ld1	{v19.16b},[x1],#16		// load next input
1545	ext	v25.16b,v25.16b,v25.16b,#8
1546	ext	v5.16b,v2.16b,v3.16b,#8
1547	ext	v6.16b,v1.16b,v2.16b,#8
1548	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1549.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1550	rev64	v19.16b,v19.16b
1551	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1552.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1553	ld1	{v25.2d},[x3],#16
1554	add	v24.2d,v24.2d,v20.2d
1555	ld1	{v20.16b},[x1],#16		// load next input
1556	ext	v24.16b,v24.16b,v24.16b,#8
1557	ext	v5.16b,v4.16b,v2.16b,#8
1558	ext	v6.16b,v0.16b,v4.16b,#8
1559	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1560.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1561	rev64	v20.16b,v20.16b
1562	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1563.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1564	ld1	{v24.2d},[x3],#16
1565	add	v25.2d,v25.2d,v21.2d
1566	ld1	{v21.16b},[x1],#16		// load next input
1567	ext	v25.16b,v25.16b,v25.16b,#8
1568	ext	v5.16b,v1.16b,v4.16b,#8
1569	ext	v6.16b,v3.16b,v1.16b,#8
1570	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1571.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1572	rev64	v21.16b,v21.16b
1573	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1574.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1575	ld1	{v25.2d},[x3],#16
1576	add	v24.2d,v24.2d,v22.2d
1577	ld1	{v22.16b},[x1],#16		// load next input
1578	ext	v24.16b,v24.16b,v24.16b,#8
1579	ext	v5.16b,v0.16b,v1.16b,#8
1580	ext	v6.16b,v2.16b,v0.16b,#8
1581	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1582.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1583	rev64	v22.16b,v22.16b
1584	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1585.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1586	sub	x3,x3,#80*8	// rewind
1587	add	v25.2d,v25.2d,v23.2d
1588	ld1	{v23.16b},[x1],#16		// load next input
1589	ext	v25.16b,v25.16b,v25.16b,#8
1590	ext	v5.16b,v3.16b,v0.16b,#8
1591	ext	v6.16b,v4.16b,v3.16b,#8
1592	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1593.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1594	rev64	v23.16b,v23.16b
1595	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1596.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the state saved in v26-v29 at the loop head to the result of the 80
// rounds; after forty groups the state is back in v0-v3.
1597	add	v0.2d,v0.2d,v26.2d			// accumulate
1598	add	v1.2d,v1.2d,v27.2d
1599	add	v2.2d,v2.2d,v28.2d
1600	add	v3.2d,v3.2d,v29.2d
1601
// x2 was already decremented at the loop head; repeat while blocks remain.
1602	cbnz	x2,.Loop_hw
1603
1604	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1605
// Epilogue: restore x29 only and release the 16-byte frame. x30 is not
// reloaded; it is never written inside this function, so it still holds the
// return address used by ret.
1606	ldr	x29,[sp],#16
1607	ret
1608.size	sha512_block_armv8,.-sha512_block_armv8
1609#endif
1610