xref: /freebsd/sys/crypto/openssl/aarch64/sha512-armv8.S (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
3// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
4//
5// Licensed under the OpenSSL license (the "License").  You may not use
6// this file except in compliance with the License.  You can obtain a copy
7// in the file LICENSE in the source distribution or at
8// https://www.openssl.org/source/license.html
9
10// ====================================================================
11// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12// project. The module is, however, dual licensed under OpenSSL and
13// CRYPTOGAMS licenses depending on where you obtain it. For further
14// details see http://www.openssl.org/~appro/cryptogams/.
15//
16// Permission to use under GPLv2 terms is granted.
17// ====================================================================
18//
19// SHA256/512 for ARMv8.
20//
21// Performance in cycles per processed byte and improvement coefficient
22// over code generated with "default" compiler:
23//
24//		SHA256-hw	SHA256(*)	SHA512
25// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
26// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
27// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
28// Denver	2.01		10.5 (+26%)	6.70 (+8%)
29// X-Gene			20.0 (+100%)	12.8 (+300%(***))
30// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
31// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
32//
33// (*)	Software SHA256 results are of lesser relevance, presented
34//	mostly for informational purposes.
35// (**)	The result is a trade-off: it's possible to improve it by
36//	10% (or by 1 cycle per round), but at the cost of 20% loss
37//	on Cortex-A53 (or by 4 cycles per round).
38// (***)	Super-impressive coefficients over gcc-generated code are an
39//	indication of some compiler "pathology", most notably code
40//	generated with -mgeneral-regs-only is significantly faster
41//	and the gap is only 40-90%.
42//
43// October 2016.
44//
45// Originally it was reckoned that it makes no sense to implement NEON
46// version of SHA256 for 64-bit processors. This is because performance
47// improvement on most wide-spread Cortex-A5x processors was observed
48// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49// observed that 32-bit NEON SHA256 performs significantly better than
50// 64-bit scalar version on *some* of the more recent processors. As a
51// result, a 64-bit NEON version of SHA256 was added to provide the best
52// all-round performance. For example it executes ~30% faster on X-Gene
53// and Mongoose. [For reference, NEON version of SHA512 is bound to
54// deliver much less improvement, likely *negative* on Cortex-A5x.
55// Which is why NEON support is limited to SHA256.]
56
57#ifndef	__KERNEL__
58# include "arm_arch.h"
59#endif
60
61.text
62
63
// sha512_block_data_order -- entry point and prologue of the scalar
// (general-purpose register) code path.
//
// Register use on entry, as established by the instructions below:
//   x0  pointer to the eight 64-bit hash state words ("context")
//   x1  pointer to the input data
//   x2  number of 128-byte blocks (shifted left by 7 and added to x1 to
//       form the end-of-input pointer)
// There is no test for a zero block count: the block loop that follows is
// bottom-tested, so one block is always processed before x1 is compared
// with the end pointer.
//
// Stack frame (128 bytes, x29 = frame base) plus 32 bytes of scratch:
//   [x29,#0]        saved x29, x30
//   [x29,#16..#95]  saved x19..x28
//   [x29,#96]       x0 (context pointer)
//   [x29,#104]      end-of-input pointer
//   [x29,#112]      input pointer (written inside .Loop)
//   [sp,#0..#31]    scratch slots used for spilled message words
//
// After the prologue: x20..x27 hold the working variables a..h and x30
// (already saved above) is reused as the pointer into the .LK512 table.
64.hidden	OPENSSL_armcap_P
65.globl	sha512_block_data_order
66.type	sha512_block_data_order,%function
67.align	6
68sha512_block_data_order:
// Run-time dispatch, compiled out when __KERNEL__ is defined. The value
// stored at .LOPENSSL_armcap_P (defined outside this part of the file) is
// added to that label's own address to form the address of a 32-bit
// capability word -- presumably OPENSSL_armcap_P; confirm at the label's
// definition. If the ARMV8_SHA512 bit is set, control goes to .Lv8_entry
// (also outside this part of the file) instead of the code below.
69#ifndef	__KERNEL__
70# ifdef	__ILP32__
71	ldrsw	x16,.LOPENSSL_armcap_P
72# else
73	ldr	x16,.LOPENSSL_armcap_P
74# endif
75	adr	x17,.LOPENSSL_armcap_P
76	add	x16,x16,x17
77	ldr	w16,[x16]
78	tst	w16,#ARMV8_SHA512
79	b.ne	.Lv8_entry
80#endif
// paciasp is emitted as a raw encoding via .inst; the matching autiasp is
// emitted the same way just before ret.
81.inst	0xd503233f				// paciasp
82	stp	x29,x30,[sp,#-128]!
83	add	x29,sp,#0
84
// Save callee-saved x19..x28, then reserve the 32-byte scratch area below
// the frame.
85	stp	x19,x20,[sp,#16]
86	stp	x21,x22,[sp,#32]
87	stp	x23,x24,[sp,#48]
88	stp	x25,x26,[sp,#64]
89	stp	x27,x28,[sp,#80]
90	sub	sp,sp,#4*8
91
92	ldp	x20,x21,[x0]				// load context
93	ldp	x22,x23,[x0,#2*8]
94	ldp	x24,x25,[x0,#4*8]
95	add	x2,x1,x2,lsl#7	// end of input
96	ldp	x26,x27,[x0,#6*8]
97	adr	x30,.LK512
// x0 and x2 are both overwritten during the rounds, so keep copies in the
// frame.
98	stp	x0,x2,[x29,#96]
99
// .Loop -- start of one 128-byte block: rounds 0..15.
//
// The 16 message words X[0..15] are loaded from the input two at a time
// and byte-reversed with rev unless __AARCH64EB__ is defined. They end up
// in x3..x15, x0, x1, x2 (in that order).
//
// The working variables a..h start in x20..x27 and are never moved;
// instead each successive round is written with the register roles
// rotated by one position.
//   x30      walks the K table (post-incremented by 8 on every fetch)
//   x19/x28  alternate between "next K word" and the a^b value that is
//            reused as b^c by the following round
//   x16/x17  per-round temporaries (Sigma1(e), Ch(e,f,g), Sigma0(a))
// One further register per round is borrowed as scratch for the Sigma
// computations; from round 11 on that register holds a live message word,
// which is therefore spilled to [sp,#0..#24] first.
//
// The "h+=Sigma0(a)" addition of each round is issued at the start of the
// next round; the commented-out add marks where it logically belongs.
100.Loop:
101	ldp	x3,x4,[x1],#2*8
102	ldr	x19,[x30],#8			// *K++
103	eor	x28,x21,x22				// magic seed
// Saved after the first post-increment, so the stored value is block
// start + 16. x1 itself is overwritten by the last message load below.
104	str	x1,[x29,#112]
105#ifndef	__AARCH64EB__
106	rev	x3,x3			// 0
107#endif
108	ror	x16,x24,#14
109	add	x27,x27,x19			// h+=K[i]
110	eor	x6,x24,x24,ror#23
111	and	x17,x25,x24
112	bic	x19,x26,x24
113	add	x27,x27,x3			// h+=X[i]
114	orr	x17,x17,x19			// Ch(e,f,g)
115	eor	x19,x20,x21			// a^b, b^c in next round
116	eor	x16,x16,x6,ror#18	// Sigma1(e)
117	ror	x6,x20,#28
118	add	x27,x27,x17			// h+=Ch(e,f,g)
119	eor	x17,x20,x20,ror#5
120	add	x27,x27,x16			// h+=Sigma1(e)
121	and	x28,x28,x19			// (b^c)&=(a^b)
122	add	x23,x23,x27			// d+=h
123	eor	x28,x28,x21			// Maj(a,b,c)
124	eor	x17,x6,x17,ror#34	// Sigma0(a)
125	add	x27,x27,x28			// h+=Maj(a,b,c)
126	ldr	x28,[x30],#8		// *K++, x19 in next round
127	//add	x27,x27,x17			// h+=Sigma0(a)
128#ifndef	__AARCH64EB__
129	rev	x4,x4			// 1
130#endif
131	ldp	x5,x6,[x1],#2*8
132	add	x27,x27,x17			// h+=Sigma0(a)
133	ror	x16,x23,#14
134	add	x26,x26,x28			// h+=K[i]
135	eor	x7,x23,x23,ror#23
136	and	x17,x24,x23
137	bic	x28,x25,x23
138	add	x26,x26,x4			// h+=X[i]
139	orr	x17,x17,x28			// Ch(e,f,g)
140	eor	x28,x27,x20			// a^b, b^c in next round
141	eor	x16,x16,x7,ror#18	// Sigma1(e)
142	ror	x7,x27,#28
143	add	x26,x26,x17			// h+=Ch(e,f,g)
144	eor	x17,x27,x27,ror#5
145	add	x26,x26,x16			// h+=Sigma1(e)
146	and	x19,x19,x28			// (b^c)&=(a^b)
147	add	x22,x22,x26			// d+=h
148	eor	x19,x19,x20			// Maj(a,b,c)
149	eor	x17,x7,x17,ror#34	// Sigma0(a)
150	add	x26,x26,x19			// h+=Maj(a,b,c)
151	ldr	x19,[x30],#8		// *K++, x28 in next round
152	//add	x26,x26,x17			// h+=Sigma0(a)
153#ifndef	__AARCH64EB__
154	rev	x5,x5			// 2
155#endif
156	add	x26,x26,x17			// h+=Sigma0(a)
157	ror	x16,x22,#14
158	add	x25,x25,x19			// h+=K[i]
159	eor	x8,x22,x22,ror#23
160	and	x17,x23,x22
161	bic	x19,x24,x22
162	add	x25,x25,x5			// h+=X[i]
163	orr	x17,x17,x19			// Ch(e,f,g)
164	eor	x19,x26,x27			// a^b, b^c in next round
165	eor	x16,x16,x8,ror#18	// Sigma1(e)
166	ror	x8,x26,#28
167	add	x25,x25,x17			// h+=Ch(e,f,g)
168	eor	x17,x26,x26,ror#5
169	add	x25,x25,x16			// h+=Sigma1(e)
170	and	x28,x28,x19			// (b^c)&=(a^b)
171	add	x21,x21,x25			// d+=h
172	eor	x28,x28,x27			// Maj(a,b,c)
173	eor	x17,x8,x17,ror#34	// Sigma0(a)
174	add	x25,x25,x28			// h+=Maj(a,b,c)
175	ldr	x28,[x30],#8		// *K++, x19 in next round
176	//add	x25,x25,x17			// h+=Sigma0(a)
177#ifndef	__AARCH64EB__
178	rev	x6,x6			// 3
179#endif
180	ldp	x7,x8,[x1],#2*8
181	add	x25,x25,x17			// h+=Sigma0(a)
182	ror	x16,x21,#14
183	add	x24,x24,x28			// h+=K[i]
184	eor	x9,x21,x21,ror#23
185	and	x17,x22,x21
186	bic	x28,x23,x21
187	add	x24,x24,x6			// h+=X[i]
188	orr	x17,x17,x28			// Ch(e,f,g)
189	eor	x28,x25,x26			// a^b, b^c in next round
190	eor	x16,x16,x9,ror#18	// Sigma1(e)
191	ror	x9,x25,#28
192	add	x24,x24,x17			// h+=Ch(e,f,g)
193	eor	x17,x25,x25,ror#5
194	add	x24,x24,x16			// h+=Sigma1(e)
195	and	x19,x19,x28			// (b^c)&=(a^b)
196	add	x20,x20,x24			// d+=h
197	eor	x19,x19,x26			// Maj(a,b,c)
198	eor	x17,x9,x17,ror#34	// Sigma0(a)
199	add	x24,x24,x19			// h+=Maj(a,b,c)
200	ldr	x19,[x30],#8		// *K++, x28 in next round
201	//add	x24,x24,x17			// h+=Sigma0(a)
202#ifndef	__AARCH64EB__
203	rev	x7,x7			// 4
204#endif
205	add	x24,x24,x17			// h+=Sigma0(a)
206	ror	x16,x20,#14
207	add	x23,x23,x19			// h+=K[i]
208	eor	x10,x20,x20,ror#23
209	and	x17,x21,x20
210	bic	x19,x22,x20
211	add	x23,x23,x7			// h+=X[i]
212	orr	x17,x17,x19			// Ch(e,f,g)
213	eor	x19,x24,x25			// a^b, b^c in next round
214	eor	x16,x16,x10,ror#18	// Sigma1(e)
215	ror	x10,x24,#28
216	add	x23,x23,x17			// h+=Ch(e,f,g)
217	eor	x17,x24,x24,ror#5
218	add	x23,x23,x16			// h+=Sigma1(e)
219	and	x28,x28,x19			// (b^c)&=(a^b)
220	add	x27,x27,x23			// d+=h
221	eor	x28,x28,x25			// Maj(a,b,c)
222	eor	x17,x10,x17,ror#34	// Sigma0(a)
223	add	x23,x23,x28			// h+=Maj(a,b,c)
224	ldr	x28,[x30],#8		// *K++, x19 in next round
225	//add	x23,x23,x17			// h+=Sigma0(a)
226#ifndef	__AARCH64EB__
227	rev	x8,x8			// 5
228#endif
229	ldp	x9,x10,[x1],#2*8
230	add	x23,x23,x17			// h+=Sigma0(a)
231	ror	x16,x27,#14
232	add	x22,x22,x28			// h+=K[i]
233	eor	x11,x27,x27,ror#23
234	and	x17,x20,x27
235	bic	x28,x21,x27
236	add	x22,x22,x8			// h+=X[i]
237	orr	x17,x17,x28			// Ch(e,f,g)
238	eor	x28,x23,x24			// a^b, b^c in next round
239	eor	x16,x16,x11,ror#18	// Sigma1(e)
240	ror	x11,x23,#28
241	add	x22,x22,x17			// h+=Ch(e,f,g)
242	eor	x17,x23,x23,ror#5
243	add	x22,x22,x16			// h+=Sigma1(e)
244	and	x19,x19,x28			// (b^c)&=(a^b)
245	add	x26,x26,x22			// d+=h
246	eor	x19,x19,x24			// Maj(a,b,c)
247	eor	x17,x11,x17,ror#34	// Sigma0(a)
248	add	x22,x22,x19			// h+=Maj(a,b,c)
249	ldr	x19,[x30],#8		// *K++, x28 in next round
250	//add	x22,x22,x17			// h+=Sigma0(a)
251#ifndef	__AARCH64EB__
252	rev	x9,x9			// 6
253#endif
254	add	x22,x22,x17			// h+=Sigma0(a)
255	ror	x16,x26,#14
256	add	x21,x21,x19			// h+=K[i]
257	eor	x12,x26,x26,ror#23
258	and	x17,x27,x26
259	bic	x19,x20,x26
260	add	x21,x21,x9			// h+=X[i]
261	orr	x17,x17,x19			// Ch(e,f,g)
262	eor	x19,x22,x23			// a^b, b^c in next round
263	eor	x16,x16,x12,ror#18	// Sigma1(e)
264	ror	x12,x22,#28
265	add	x21,x21,x17			// h+=Ch(e,f,g)
266	eor	x17,x22,x22,ror#5
267	add	x21,x21,x16			// h+=Sigma1(e)
268	and	x28,x28,x19			// (b^c)&=(a^b)
269	add	x25,x25,x21			// d+=h
270	eor	x28,x28,x23			// Maj(a,b,c)
271	eor	x17,x12,x17,ror#34	// Sigma0(a)
272	add	x21,x21,x28			// h+=Maj(a,b,c)
273	ldr	x28,[x30],#8		// *K++, x19 in next round
274	//add	x21,x21,x17			// h+=Sigma0(a)
275#ifndef	__AARCH64EB__
276	rev	x10,x10			// 7
277#endif
278	ldp	x11,x12,[x1],#2*8
279	add	x21,x21,x17			// h+=Sigma0(a)
280	ror	x16,x25,#14
281	add	x20,x20,x28			// h+=K[i]
282	eor	x13,x25,x25,ror#23
283	and	x17,x26,x25
284	bic	x28,x27,x25
285	add	x20,x20,x10			// h+=X[i]
286	orr	x17,x17,x28			// Ch(e,f,g)
287	eor	x28,x21,x22			// a^b, b^c in next round
288	eor	x16,x16,x13,ror#18	// Sigma1(e)
289	ror	x13,x21,#28
290	add	x20,x20,x17			// h+=Ch(e,f,g)
291	eor	x17,x21,x21,ror#5
292	add	x20,x20,x16			// h+=Sigma1(e)
293	and	x19,x19,x28			// (b^c)&=(a^b)
294	add	x24,x24,x20			// d+=h
295	eor	x19,x19,x22			// Maj(a,b,c)
296	eor	x17,x13,x17,ror#34	// Sigma0(a)
297	add	x20,x20,x19			// h+=Maj(a,b,c)
298	ldr	x19,[x30],#8		// *K++, x28 in next round
299	//add	x20,x20,x17			// h+=Sigma0(a)
300#ifndef	__AARCH64EB__
301	rev	x11,x11			// 8
302#endif
303	add	x20,x20,x17			// h+=Sigma0(a)
304	ror	x16,x24,#14
305	add	x27,x27,x19			// h+=K[i]
306	eor	x14,x24,x24,ror#23
307	and	x17,x25,x24
308	bic	x19,x26,x24
309	add	x27,x27,x11			// h+=X[i]
310	orr	x17,x17,x19			// Ch(e,f,g)
311	eor	x19,x20,x21			// a^b, b^c in next round
312	eor	x16,x16,x14,ror#18	// Sigma1(e)
313	ror	x14,x20,#28
314	add	x27,x27,x17			// h+=Ch(e,f,g)
315	eor	x17,x20,x20,ror#5
316	add	x27,x27,x16			// h+=Sigma1(e)
317	and	x28,x28,x19			// (b^c)&=(a^b)
318	add	x23,x23,x27			// d+=h
319	eor	x28,x28,x21			// Maj(a,b,c)
320	eor	x17,x14,x17,ror#34	// Sigma0(a)
321	add	x27,x27,x28			// h+=Maj(a,b,c)
322	ldr	x28,[x30],#8		// *K++, x19 in next round
323	//add	x27,x27,x17			// h+=Sigma0(a)
324#ifndef	__AARCH64EB__
325	rev	x12,x12			// 9
326#endif
327	ldp	x13,x14,[x1],#2*8
328	add	x27,x27,x17			// h+=Sigma0(a)
329	ror	x16,x23,#14
330	add	x26,x26,x28			// h+=K[i]
331	eor	x15,x23,x23,ror#23
332	and	x17,x24,x23
333	bic	x28,x25,x23
334	add	x26,x26,x12			// h+=X[i]
335	orr	x17,x17,x28			// Ch(e,f,g)
336	eor	x28,x27,x20			// a^b, b^c in next round
337	eor	x16,x16,x15,ror#18	// Sigma1(e)
338	ror	x15,x27,#28
339	add	x26,x26,x17			// h+=Ch(e,f,g)
340	eor	x17,x27,x27,ror#5
341	add	x26,x26,x16			// h+=Sigma1(e)
342	and	x19,x19,x28			// (b^c)&=(a^b)
343	add	x22,x22,x26			// d+=h
344	eor	x19,x19,x20			// Maj(a,b,c)
345	eor	x17,x15,x17,ror#34	// Sigma0(a)
346	add	x26,x26,x19			// h+=Maj(a,b,c)
347	ldr	x19,[x30],#8		// *K++, x28 in next round
348	//add	x26,x26,x17			// h+=Sigma0(a)
349#ifndef	__AARCH64EB__
350	rev	x13,x13			// 10
351#endif
352	add	x26,x26,x17			// h+=Sigma0(a)
353	ror	x16,x22,#14
354	add	x25,x25,x19			// h+=K[i]
// x0 (context pointer, already saved at [x29,#96]) is used as the scratch
// register in this round and receives a message word in the next one.
355	eor	x0,x22,x22,ror#23
356	and	x17,x23,x22
357	bic	x19,x24,x22
358	add	x25,x25,x13			// h+=X[i]
359	orr	x17,x17,x19			// Ch(e,f,g)
360	eor	x19,x26,x27			// a^b, b^c in next round
361	eor	x16,x16,x0,ror#18	// Sigma1(e)
362	ror	x0,x26,#28
363	add	x25,x25,x17			// h+=Ch(e,f,g)
364	eor	x17,x26,x26,ror#5
365	add	x25,x25,x16			// h+=Sigma1(e)
366	and	x28,x28,x19			// (b^c)&=(a^b)
367	add	x21,x21,x25			// d+=h
368	eor	x28,x28,x27			// Maj(a,b,c)
369	eor	x17,x0,x17,ror#34	// Sigma0(a)
370	add	x25,x25,x28			// h+=Maj(a,b,c)
371	ldr	x28,[x30],#8		// *K++, x19 in next round
372	//add	x25,x25,x17			// h+=Sigma0(a)
373#ifndef	__AARCH64EB__
374	rev	x14,x14			// 11
375#endif
376	ldp	x15,x0,[x1],#2*8
377	add	x25,x25,x17			// h+=Sigma0(a)
// Spill X[3] (x6): x6 is this round's scratch register.
378	str	x6,[sp,#24]
379	ror	x16,x21,#14
380	add	x24,x24,x28			// h+=K[i]
381	eor	x6,x21,x21,ror#23
382	and	x17,x22,x21
383	bic	x28,x23,x21
384	add	x24,x24,x14			// h+=X[i]
385	orr	x17,x17,x28			// Ch(e,f,g)
386	eor	x28,x25,x26			// a^b, b^c in next round
387	eor	x16,x16,x6,ror#18	// Sigma1(e)
388	ror	x6,x25,#28
389	add	x24,x24,x17			// h+=Ch(e,f,g)
390	eor	x17,x25,x25,ror#5
391	add	x24,x24,x16			// h+=Sigma1(e)
392	and	x19,x19,x28			// (b^c)&=(a^b)
393	add	x20,x20,x24			// d+=h
394	eor	x19,x19,x26			// Maj(a,b,c)
395	eor	x17,x6,x17,ror#34	// Sigma0(a)
396	add	x24,x24,x19			// h+=Maj(a,b,c)
397	ldr	x19,[x30],#8		// *K++, x28 in next round
398	//add	x24,x24,x17			// h+=Sigma0(a)
399#ifndef	__AARCH64EB__
400	rev	x15,x15			// 12
401#endif
402	add	x24,x24,x17			// h+=Sigma0(a)
// Spill X[4] (x7): x7 is this round's scratch register.
403	str	x7,[sp,#0]
404	ror	x16,x20,#14
405	add	x23,x23,x19			// h+=K[i]
406	eor	x7,x20,x20,ror#23
407	and	x17,x21,x20
408	bic	x19,x22,x20
409	add	x23,x23,x15			// h+=X[i]
410	orr	x17,x17,x19			// Ch(e,f,g)
411	eor	x19,x24,x25			// a^b, b^c in next round
412	eor	x16,x16,x7,ror#18	// Sigma1(e)
413	ror	x7,x24,#28
414	add	x23,x23,x17			// h+=Ch(e,f,g)
415	eor	x17,x24,x24,ror#5
416	add	x23,x23,x16			// h+=Sigma1(e)
417	and	x28,x28,x19			// (b^c)&=(a^b)
418	add	x27,x27,x23			// d+=h
419	eor	x28,x28,x25			// Maj(a,b,c)
420	eor	x17,x7,x17,ror#34	// Sigma0(a)
421	add	x23,x23,x28			// h+=Maj(a,b,c)
422	ldr	x28,[x30],#8		// *K++, x19 in next round
423	//add	x23,x23,x17			// h+=Sigma0(a)
424#ifndef	__AARCH64EB__
425	rev	x0,x0			// 13
426#endif
// Last message load: X[14], X[15] replace the input pointer (x1) and the
// end-of-input pointer (x2). Both have copies in the frame ([x29,#112]
// and [x29,#104]); note there is no post-increment on this load.
427	ldp	x1,x2,[x1]
428	add	x23,x23,x17			// h+=Sigma0(a)
// Spill X[5] (x8): x8 is this round's scratch register.
429	str	x8,[sp,#8]
430	ror	x16,x27,#14
431	add	x22,x22,x28			// h+=K[i]
432	eor	x8,x27,x27,ror#23
433	and	x17,x20,x27
434	bic	x28,x21,x27
435	add	x22,x22,x0			// h+=X[i]
436	orr	x17,x17,x28			// Ch(e,f,g)
437	eor	x28,x23,x24			// a^b, b^c in next round
438	eor	x16,x16,x8,ror#18	// Sigma1(e)
439	ror	x8,x23,#28
440	add	x22,x22,x17			// h+=Ch(e,f,g)
441	eor	x17,x23,x23,ror#5
442	add	x22,x22,x16			// h+=Sigma1(e)
443	and	x19,x19,x28			// (b^c)&=(a^b)
444	add	x26,x26,x22			// d+=h
445	eor	x19,x19,x24			// Maj(a,b,c)
446	eor	x17,x8,x17,ror#34	// Sigma0(a)
447	add	x22,x22,x19			// h+=Maj(a,b,c)
448	ldr	x19,[x30],#8		// *K++, x28 in next round
449	//add	x22,x22,x17			// h+=Sigma0(a)
450#ifndef	__AARCH64EB__
451	rev	x1,x1			// 14
452#endif
// Reload X[3] into x6; spill X[6] (x9), this round's scratch register.
453	ldr	x6,[sp,#24]
454	add	x22,x22,x17			// h+=Sigma0(a)
455	str	x9,[sp,#16]
456	ror	x16,x26,#14
457	add	x21,x21,x19			// h+=K[i]
458	eor	x9,x26,x26,ror#23
459	and	x17,x27,x26
460	bic	x19,x20,x26
461	add	x21,x21,x1			// h+=X[i]
462	orr	x17,x17,x19			// Ch(e,f,g)
463	eor	x19,x22,x23			// a^b, b^c in next round
464	eor	x16,x16,x9,ror#18	// Sigma1(e)
465	ror	x9,x22,#28
466	add	x21,x21,x17			// h+=Ch(e,f,g)
467	eor	x17,x22,x22,ror#5
468	add	x21,x21,x16			// h+=Sigma1(e)
469	and	x28,x28,x19			// (b^c)&=(a^b)
470	add	x25,x25,x21			// d+=h
471	eor	x28,x28,x23			// Maj(a,b,c)
472	eor	x17,x9,x17,ror#34	// Sigma0(a)
473	add	x21,x21,x28			// h+=Maj(a,b,c)
474	ldr	x28,[x30],#8		// *K++, x19 in next round
475	//add	x21,x21,x17			// h+=Sigma0(a)
476#ifndef	__AARCH64EB__
477	rev	x2,x2			// 15
478#endif
// Round 15 uses the same instruction layout as the .Loop_16_xx rounds: in
// addition to the round itself it starts the message schedule by updating
// X[0] (x3) += X[9] (x12) + sigma0(X[1]) (from x4) + sigma1(X[14]) (from x1).
// x9 and x8 (X[6] and X[5], both spilled above) and x10 (X[7], spilled
// here) serve as scratch.
479	ldr	x7,[sp,#0]
480	add	x21,x21,x17			// h+=Sigma0(a)
481	str	x10,[sp,#24]
482	ror	x16,x25,#14
483	add	x20,x20,x28			// h+=K[i]
484	ror	x9,x4,#1
485	and	x17,x26,x25
486	ror	x8,x1,#19
487	bic	x28,x27,x25
488	ror	x10,x21,#28
489	add	x20,x20,x2			// h+=X[i]
490	eor	x16,x16,x25,ror#18
491	eor	x9,x9,x4,ror#8
492	orr	x17,x17,x28			// Ch(e,f,g)
493	eor	x28,x21,x22			// a^b, b^c in next round
494	eor	x16,x16,x25,ror#41	// Sigma1(e)
495	eor	x10,x10,x21,ror#34
496	add	x20,x20,x17			// h+=Ch(e,f,g)
497	and	x19,x19,x28			// (b^c)&=(a^b)
498	eor	x8,x8,x1,ror#61
499	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
500	add	x20,x20,x16			// h+=Sigma1(e)
501	eor	x19,x19,x22			// Maj(a,b,c)
502	eor	x17,x10,x21,ror#39	// Sigma0(a)
503	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
504	add	x3,x3,x12
505	add	x24,x24,x20			// d+=h
506	add	x20,x20,x19			// h+=Maj(a,b,c)
507	ldr	x19,[x30],#8		// *K++, x28 in next round
508	add	x3,x3,x9
509	add	x20,x20,x17			// h+=Sigma0(a)
510	add	x3,x3,x8
// .Loop_16_xx -- rounds after the first 16, sixteen rounds per pass.
//
// Each round adds the schedule word W prepared by the previous round and,
// interleaved with the round arithmetic, prepares the following word in
// place:
//   X[j] += X[j+9] + sigma0(X[j+1]) + sigma1(X[j+14])      (indices mod 16)
// with sigma0(x) = ror1 ^ ror8 ^ lsr7 and sigma1(x) = ror19 ^ ror61 ^ lsr6,
// as spelled out by the instruction operands below.
//
// The 16 live words rotate through x3..x15, x0, x1, x2. Every round needs
// three scratch registers (sigma0, sigma1, Sigma0(a)); the one taken for
// Sigma0(a) still holds a live word, so each round opens with an ldr that
// brings back a word spilled earlier and a str that spills the register
// about to be used as scratch. The four spill slots are [sp,#0..#24].
//
// Working-variable roles rotate exactly as in rounds 0..15, and x19/x28
// keep alternating between the next K word and the a^b value.
//
// The pass repeats while the K word fetched for the next round is
// non-zero (cbnz at the bottom); presumably the .LK512 table ends with a
// zero terminator -- the end of the table is outside this part of the
// file, confirm there.
511.Loop_16_xx:
// W = x3; next word built in x4
512	ldr	x8,[sp,#8]
513	str	x11,[sp,#0]
514	ror	x16,x24,#14
515	add	x27,x27,x19			// h+=K[i]
516	ror	x10,x5,#1
517	and	x17,x25,x24
518	ror	x9,x2,#19
519	bic	x19,x26,x24
520	ror	x11,x20,#28
521	add	x27,x27,x3			// h+=X[i]
522	eor	x16,x16,x24,ror#18
523	eor	x10,x10,x5,ror#8
524	orr	x17,x17,x19			// Ch(e,f,g)
525	eor	x19,x20,x21			// a^b, b^c in next round
526	eor	x16,x16,x24,ror#41	// Sigma1(e)
527	eor	x11,x11,x20,ror#34
528	add	x27,x27,x17			// h+=Ch(e,f,g)
529	and	x28,x28,x19			// (b^c)&=(a^b)
530	eor	x9,x9,x2,ror#61
531	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
532	add	x27,x27,x16			// h+=Sigma1(e)
533	eor	x28,x28,x21			// Maj(a,b,c)
534	eor	x17,x11,x20,ror#39	// Sigma0(a)
535	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
536	add	x4,x4,x13
537	add	x23,x23,x27			// d+=h
538	add	x27,x27,x28			// h+=Maj(a,b,c)
539	ldr	x28,[x30],#8		// *K++, x19 in next round
540	add	x4,x4,x10
541	add	x27,x27,x17			// h+=Sigma0(a)
542	add	x4,x4,x9
// W = x4; next word built in x5
543	ldr	x9,[sp,#16]
544	str	x12,[sp,#8]
545	ror	x16,x23,#14
546	add	x26,x26,x28			// h+=K[i]
547	ror	x11,x6,#1
548	and	x17,x24,x23
549	ror	x10,x3,#19
550	bic	x28,x25,x23
551	ror	x12,x27,#28
552	add	x26,x26,x4			// h+=X[i]
553	eor	x16,x16,x23,ror#18
554	eor	x11,x11,x6,ror#8
555	orr	x17,x17,x28			// Ch(e,f,g)
556	eor	x28,x27,x20			// a^b, b^c in next round
557	eor	x16,x16,x23,ror#41	// Sigma1(e)
558	eor	x12,x12,x27,ror#34
559	add	x26,x26,x17			// h+=Ch(e,f,g)
560	and	x19,x19,x28			// (b^c)&=(a^b)
561	eor	x10,x10,x3,ror#61
562	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
563	add	x26,x26,x16			// h+=Sigma1(e)
564	eor	x19,x19,x20			// Maj(a,b,c)
565	eor	x17,x12,x27,ror#39	// Sigma0(a)
566	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
567	add	x5,x5,x14
568	add	x22,x22,x26			// d+=h
569	add	x26,x26,x19			// h+=Maj(a,b,c)
570	ldr	x19,[x30],#8		// *K++, x28 in next round
571	add	x5,x5,x11
572	add	x26,x26,x17			// h+=Sigma0(a)
573	add	x5,x5,x10
// W = x5; next word built in x6
574	ldr	x10,[sp,#24]
575	str	x13,[sp,#16]
576	ror	x16,x22,#14
577	add	x25,x25,x19			// h+=K[i]
578	ror	x12,x7,#1
579	and	x17,x23,x22
580	ror	x11,x4,#19
581	bic	x19,x24,x22
582	ror	x13,x26,#28
583	add	x25,x25,x5			// h+=X[i]
584	eor	x16,x16,x22,ror#18
585	eor	x12,x12,x7,ror#8
586	orr	x17,x17,x19			// Ch(e,f,g)
587	eor	x19,x26,x27			// a^b, b^c in next round
588	eor	x16,x16,x22,ror#41	// Sigma1(e)
589	eor	x13,x13,x26,ror#34
590	add	x25,x25,x17			// h+=Ch(e,f,g)
591	and	x28,x28,x19			// (b^c)&=(a^b)
592	eor	x11,x11,x4,ror#61
593	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
594	add	x25,x25,x16			// h+=Sigma1(e)
595	eor	x28,x28,x27			// Maj(a,b,c)
596	eor	x17,x13,x26,ror#39	// Sigma0(a)
597	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
598	add	x6,x6,x15
599	add	x21,x21,x25			// d+=h
600	add	x25,x25,x28			// h+=Maj(a,b,c)
601	ldr	x28,[x30],#8		// *K++, x19 in next round
602	add	x6,x6,x12
603	add	x25,x25,x17			// h+=Sigma0(a)
604	add	x6,x6,x11
// W = x6; next word built in x7
605	ldr	x11,[sp,#0]
606	str	x14,[sp,#24]
607	ror	x16,x21,#14
608	add	x24,x24,x28			// h+=K[i]
609	ror	x13,x8,#1
610	and	x17,x22,x21
611	ror	x12,x5,#19
612	bic	x28,x23,x21
613	ror	x14,x25,#28
614	add	x24,x24,x6			// h+=X[i]
615	eor	x16,x16,x21,ror#18
616	eor	x13,x13,x8,ror#8
617	orr	x17,x17,x28			// Ch(e,f,g)
618	eor	x28,x25,x26			// a^b, b^c in next round
619	eor	x16,x16,x21,ror#41	// Sigma1(e)
620	eor	x14,x14,x25,ror#34
621	add	x24,x24,x17			// h+=Ch(e,f,g)
622	and	x19,x19,x28			// (b^c)&=(a^b)
623	eor	x12,x12,x5,ror#61
624	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
625	add	x24,x24,x16			// h+=Sigma1(e)
626	eor	x19,x19,x26			// Maj(a,b,c)
627	eor	x17,x14,x25,ror#39	// Sigma0(a)
628	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
629	add	x7,x7,x0
630	add	x20,x20,x24			// d+=h
631	add	x24,x24,x19			// h+=Maj(a,b,c)
632	ldr	x19,[x30],#8		// *K++, x28 in next round
633	add	x7,x7,x13
634	add	x24,x24,x17			// h+=Sigma0(a)
635	add	x7,x7,x12
// W = x7; next word built in x8
636	ldr	x12,[sp,#8]
637	str	x15,[sp,#0]
638	ror	x16,x20,#14
639	add	x23,x23,x19			// h+=K[i]
640	ror	x14,x9,#1
641	and	x17,x21,x20
642	ror	x13,x6,#19
643	bic	x19,x22,x20
644	ror	x15,x24,#28
645	add	x23,x23,x7			// h+=X[i]
646	eor	x16,x16,x20,ror#18
647	eor	x14,x14,x9,ror#8
648	orr	x17,x17,x19			// Ch(e,f,g)
649	eor	x19,x24,x25			// a^b, b^c in next round
650	eor	x16,x16,x20,ror#41	// Sigma1(e)
651	eor	x15,x15,x24,ror#34
652	add	x23,x23,x17			// h+=Ch(e,f,g)
653	and	x28,x28,x19			// (b^c)&=(a^b)
654	eor	x13,x13,x6,ror#61
655	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
656	add	x23,x23,x16			// h+=Sigma1(e)
657	eor	x28,x28,x25			// Maj(a,b,c)
658	eor	x17,x15,x24,ror#39	// Sigma0(a)
659	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
660	add	x8,x8,x1
661	add	x27,x27,x23			// d+=h
662	add	x23,x23,x28			// h+=Maj(a,b,c)
663	ldr	x28,[x30],#8		// *K++, x19 in next round
664	add	x8,x8,x14
665	add	x23,x23,x17			// h+=Sigma0(a)
666	add	x8,x8,x13
// W = x8; next word built in x9
667	ldr	x13,[sp,#16]
668	str	x0,[sp,#8]
669	ror	x16,x27,#14
670	add	x22,x22,x28			// h+=K[i]
671	ror	x15,x10,#1
672	and	x17,x20,x27
673	ror	x14,x7,#19
674	bic	x28,x21,x27
675	ror	x0,x23,#28
676	add	x22,x22,x8			// h+=X[i]
677	eor	x16,x16,x27,ror#18
678	eor	x15,x15,x10,ror#8
679	orr	x17,x17,x28			// Ch(e,f,g)
680	eor	x28,x23,x24			// a^b, b^c in next round
681	eor	x16,x16,x27,ror#41	// Sigma1(e)
682	eor	x0,x0,x23,ror#34
683	add	x22,x22,x17			// h+=Ch(e,f,g)
684	and	x19,x19,x28			// (b^c)&=(a^b)
685	eor	x14,x14,x7,ror#61
686	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
687	add	x22,x22,x16			// h+=Sigma1(e)
688	eor	x19,x19,x24			// Maj(a,b,c)
689	eor	x17,x0,x23,ror#39	// Sigma0(a)
690	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
691	add	x9,x9,x2
692	add	x26,x26,x22			// d+=h
693	add	x22,x22,x19			// h+=Maj(a,b,c)
694	ldr	x19,[x30],#8		// *K++, x28 in next round
695	add	x9,x9,x15
696	add	x22,x22,x17			// h+=Sigma0(a)
697	add	x9,x9,x14
// W = x9; next word built in x10
698	ldr	x14,[sp,#24]
699	str	x1,[sp,#16]
700	ror	x16,x26,#14
701	add	x21,x21,x19			// h+=K[i]
702	ror	x0,x11,#1
703	and	x17,x27,x26
704	ror	x15,x8,#19
705	bic	x19,x20,x26
706	ror	x1,x22,#28
707	add	x21,x21,x9			// h+=X[i]
708	eor	x16,x16,x26,ror#18
709	eor	x0,x0,x11,ror#8
710	orr	x17,x17,x19			// Ch(e,f,g)
711	eor	x19,x22,x23			// a^b, b^c in next round
712	eor	x16,x16,x26,ror#41	// Sigma1(e)
713	eor	x1,x1,x22,ror#34
714	add	x21,x21,x17			// h+=Ch(e,f,g)
715	and	x28,x28,x19			// (b^c)&=(a^b)
716	eor	x15,x15,x8,ror#61
717	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
718	add	x21,x21,x16			// h+=Sigma1(e)
719	eor	x28,x28,x23			// Maj(a,b,c)
720	eor	x17,x1,x22,ror#39	// Sigma0(a)
721	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
722	add	x10,x10,x3
723	add	x25,x25,x21			// d+=h
724	add	x21,x21,x28			// h+=Maj(a,b,c)
725	ldr	x28,[x30],#8		// *K++, x19 in next round
726	add	x10,x10,x0
727	add	x21,x21,x17			// h+=Sigma0(a)
728	add	x10,x10,x15
// W = x10; next word built in x11
729	ldr	x15,[sp,#0]
730	str	x2,[sp,#24]
731	ror	x16,x25,#14
732	add	x20,x20,x28			// h+=K[i]
733	ror	x1,x12,#1
734	and	x17,x26,x25
735	ror	x0,x9,#19
736	bic	x28,x27,x25
737	ror	x2,x21,#28
738	add	x20,x20,x10			// h+=X[i]
739	eor	x16,x16,x25,ror#18
740	eor	x1,x1,x12,ror#8
741	orr	x17,x17,x28			// Ch(e,f,g)
742	eor	x28,x21,x22			// a^b, b^c in next round
743	eor	x16,x16,x25,ror#41	// Sigma1(e)
744	eor	x2,x2,x21,ror#34
745	add	x20,x20,x17			// h+=Ch(e,f,g)
746	and	x19,x19,x28			// (b^c)&=(a^b)
747	eor	x0,x0,x9,ror#61
748	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
749	add	x20,x20,x16			// h+=Sigma1(e)
750	eor	x19,x19,x22			// Maj(a,b,c)
751	eor	x17,x2,x21,ror#39	// Sigma0(a)
752	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
753	add	x11,x11,x4
754	add	x24,x24,x20			// d+=h
755	add	x20,x20,x19			// h+=Maj(a,b,c)
756	ldr	x19,[x30],#8		// *K++, x28 in next round
757	add	x11,x11,x1
758	add	x20,x20,x17			// h+=Sigma0(a)
759	add	x11,x11,x0
// W = x11; next word built in x12
760	ldr	x0,[sp,#8]
761	str	x3,[sp,#0]
762	ror	x16,x24,#14
763	add	x27,x27,x19			// h+=K[i]
764	ror	x2,x13,#1
765	and	x17,x25,x24
766	ror	x1,x10,#19
767	bic	x19,x26,x24
768	ror	x3,x20,#28
769	add	x27,x27,x11			// h+=X[i]
770	eor	x16,x16,x24,ror#18
771	eor	x2,x2,x13,ror#8
772	orr	x17,x17,x19			// Ch(e,f,g)
773	eor	x19,x20,x21			// a^b, b^c in next round
774	eor	x16,x16,x24,ror#41	// Sigma1(e)
775	eor	x3,x3,x20,ror#34
776	add	x27,x27,x17			// h+=Ch(e,f,g)
777	and	x28,x28,x19			// (b^c)&=(a^b)
778	eor	x1,x1,x10,ror#61
779	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
780	add	x27,x27,x16			// h+=Sigma1(e)
781	eor	x28,x28,x21			// Maj(a,b,c)
782	eor	x17,x3,x20,ror#39	// Sigma0(a)
783	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
784	add	x12,x12,x5
785	add	x23,x23,x27			// d+=h
786	add	x27,x27,x28			// h+=Maj(a,b,c)
787	ldr	x28,[x30],#8		// *K++, x19 in next round
788	add	x12,x12,x2
789	add	x27,x27,x17			// h+=Sigma0(a)
790	add	x12,x12,x1
// W = x12; next word built in x13
791	ldr	x1,[sp,#16]
792	str	x4,[sp,#8]
793	ror	x16,x23,#14
794	add	x26,x26,x28			// h+=K[i]
795	ror	x3,x14,#1
796	and	x17,x24,x23
797	ror	x2,x11,#19
798	bic	x28,x25,x23
799	ror	x4,x27,#28
800	add	x26,x26,x12			// h+=X[i]
801	eor	x16,x16,x23,ror#18
802	eor	x3,x3,x14,ror#8
803	orr	x17,x17,x28			// Ch(e,f,g)
804	eor	x28,x27,x20			// a^b, b^c in next round
805	eor	x16,x16,x23,ror#41	// Sigma1(e)
806	eor	x4,x4,x27,ror#34
807	add	x26,x26,x17			// h+=Ch(e,f,g)
808	and	x19,x19,x28			// (b^c)&=(a^b)
809	eor	x2,x2,x11,ror#61
810	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
811	add	x26,x26,x16			// h+=Sigma1(e)
812	eor	x19,x19,x20			// Maj(a,b,c)
813	eor	x17,x4,x27,ror#39	// Sigma0(a)
814	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
815	add	x13,x13,x6
816	add	x22,x22,x26			// d+=h
817	add	x26,x26,x19			// h+=Maj(a,b,c)
818	ldr	x19,[x30],#8		// *K++, x28 in next round
819	add	x13,x13,x3
820	add	x26,x26,x17			// h+=Sigma0(a)
821	add	x13,x13,x2
// W = x13; next word built in x14
822	ldr	x2,[sp,#24]
823	str	x5,[sp,#16]
824	ror	x16,x22,#14
825	add	x25,x25,x19			// h+=K[i]
826	ror	x4,x15,#1
827	and	x17,x23,x22
828	ror	x3,x12,#19
829	bic	x19,x24,x22
830	ror	x5,x26,#28
831	add	x25,x25,x13			// h+=X[i]
832	eor	x16,x16,x22,ror#18
833	eor	x4,x4,x15,ror#8
834	orr	x17,x17,x19			// Ch(e,f,g)
835	eor	x19,x26,x27			// a^b, b^c in next round
836	eor	x16,x16,x22,ror#41	// Sigma1(e)
837	eor	x5,x5,x26,ror#34
838	add	x25,x25,x17			// h+=Ch(e,f,g)
839	and	x28,x28,x19			// (b^c)&=(a^b)
840	eor	x3,x3,x12,ror#61
841	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
842	add	x25,x25,x16			// h+=Sigma1(e)
843	eor	x28,x28,x27			// Maj(a,b,c)
844	eor	x17,x5,x26,ror#39	// Sigma0(a)
845	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
846	add	x14,x14,x7
847	add	x21,x21,x25			// d+=h
848	add	x25,x25,x28			// h+=Maj(a,b,c)
849	ldr	x28,[x30],#8		// *K++, x19 in next round
850	add	x14,x14,x4
851	add	x25,x25,x17			// h+=Sigma0(a)
852	add	x14,x14,x3
// W = x14; next word built in x15
853	ldr	x3,[sp,#0]
854	str	x6,[sp,#24]
855	ror	x16,x21,#14
856	add	x24,x24,x28			// h+=K[i]
857	ror	x5,x0,#1
858	and	x17,x22,x21
859	ror	x4,x13,#19
860	bic	x28,x23,x21
861	ror	x6,x25,#28
862	add	x24,x24,x14			// h+=X[i]
863	eor	x16,x16,x21,ror#18
864	eor	x5,x5,x0,ror#8
865	orr	x17,x17,x28			// Ch(e,f,g)
866	eor	x28,x25,x26			// a^b, b^c in next round
867	eor	x16,x16,x21,ror#41	// Sigma1(e)
868	eor	x6,x6,x25,ror#34
869	add	x24,x24,x17			// h+=Ch(e,f,g)
870	and	x19,x19,x28			// (b^c)&=(a^b)
871	eor	x4,x4,x13,ror#61
872	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
873	add	x24,x24,x16			// h+=Sigma1(e)
874	eor	x19,x19,x26			// Maj(a,b,c)
875	eor	x17,x6,x25,ror#39	// Sigma0(a)
876	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
877	add	x15,x15,x8
878	add	x20,x20,x24			// d+=h
879	add	x24,x24,x19			// h+=Maj(a,b,c)
880	ldr	x19,[x30],#8		// *K++, x28 in next round
881	add	x15,x15,x5
882	add	x24,x24,x17			// h+=Sigma0(a)
883	add	x15,x15,x4
// W = x15; next word built in x0
884	ldr	x4,[sp,#8]
885	str	x7,[sp,#0]
886	ror	x16,x20,#14
887	add	x23,x23,x19			// h+=K[i]
888	ror	x6,x1,#1
889	and	x17,x21,x20
890	ror	x5,x14,#19
891	bic	x19,x22,x20
892	ror	x7,x24,#28
893	add	x23,x23,x15			// h+=X[i]
894	eor	x16,x16,x20,ror#18
895	eor	x6,x6,x1,ror#8
896	orr	x17,x17,x19			// Ch(e,f,g)
897	eor	x19,x24,x25			// a^b, b^c in next round
898	eor	x16,x16,x20,ror#41	// Sigma1(e)
899	eor	x7,x7,x24,ror#34
900	add	x23,x23,x17			// h+=Ch(e,f,g)
901	and	x28,x28,x19			// (b^c)&=(a^b)
902	eor	x5,x5,x14,ror#61
903	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
904	add	x23,x23,x16			// h+=Sigma1(e)
905	eor	x28,x28,x25			// Maj(a,b,c)
906	eor	x17,x7,x24,ror#39	// Sigma0(a)
907	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
908	add	x0,x0,x9
909	add	x27,x27,x23			// d+=h
910	add	x23,x23,x28			// h+=Maj(a,b,c)
911	ldr	x28,[x30],#8		// *K++, x19 in next round
912	add	x0,x0,x6
913	add	x23,x23,x17			// h+=Sigma0(a)
914	add	x0,x0,x5
// W = x0; next word built in x1
915	ldr	x5,[sp,#16]
916	str	x8,[sp,#8]
917	ror	x16,x27,#14
918	add	x22,x22,x28			// h+=K[i]
919	ror	x7,x2,#1
920	and	x17,x20,x27
921	ror	x6,x15,#19
922	bic	x28,x21,x27
923	ror	x8,x23,#28
924	add	x22,x22,x0			// h+=X[i]
925	eor	x16,x16,x27,ror#18
926	eor	x7,x7,x2,ror#8
927	orr	x17,x17,x28			// Ch(e,f,g)
928	eor	x28,x23,x24			// a^b, b^c in next round
929	eor	x16,x16,x27,ror#41	// Sigma1(e)
930	eor	x8,x8,x23,ror#34
931	add	x22,x22,x17			// h+=Ch(e,f,g)
932	and	x19,x19,x28			// (b^c)&=(a^b)
933	eor	x6,x6,x15,ror#61
934	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
935	add	x22,x22,x16			// h+=Sigma1(e)
936	eor	x19,x19,x24			// Maj(a,b,c)
937	eor	x17,x8,x23,ror#39	// Sigma0(a)
938	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
939	add	x1,x1,x10
940	add	x26,x26,x22			// d+=h
941	add	x22,x22,x19			// h+=Maj(a,b,c)
942	ldr	x19,[x30],#8		// *K++, x28 in next round
943	add	x1,x1,x7
944	add	x22,x22,x17			// h+=Sigma0(a)
945	add	x1,x1,x6
// W = x1; next word built in x2
946	ldr	x6,[sp,#24]
947	str	x9,[sp,#16]
948	ror	x16,x26,#14
949	add	x21,x21,x19			// h+=K[i]
950	ror	x8,x3,#1
951	and	x17,x27,x26
952	ror	x7,x0,#19
953	bic	x19,x20,x26
954	ror	x9,x22,#28
955	add	x21,x21,x1			// h+=X[i]
956	eor	x16,x16,x26,ror#18
957	eor	x8,x8,x3,ror#8
958	orr	x17,x17,x19			// Ch(e,f,g)
959	eor	x19,x22,x23			// a^b, b^c in next round
960	eor	x16,x16,x26,ror#41	// Sigma1(e)
961	eor	x9,x9,x22,ror#34
962	add	x21,x21,x17			// h+=Ch(e,f,g)
963	and	x28,x28,x19			// (b^c)&=(a^b)
964	eor	x7,x7,x0,ror#61
965	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
966	add	x21,x21,x16			// h+=Sigma1(e)
967	eor	x28,x28,x23			// Maj(a,b,c)
968	eor	x17,x9,x22,ror#39	// Sigma0(a)
969	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
970	add	x2,x2,x11
971	add	x25,x25,x21			// d+=h
972	add	x21,x21,x28			// h+=Maj(a,b,c)
973	ldr	x28,[x30],#8		// *K++, x19 in next round
974	add	x2,x2,x8
975	add	x21,x21,x17			// h+=Sigma0(a)
976	add	x2,x2,x7
// W = x2; next word built in x3 (consumed by the first round of the next
// pass, if there is one)
977	ldr	x7,[sp,#0]
978	str	x10,[sp,#24]
979	ror	x16,x25,#14
980	add	x20,x20,x28			// h+=K[i]
981	ror	x9,x4,#1
982	and	x17,x26,x25
983	ror	x8,x1,#19
984	bic	x28,x27,x25
985	ror	x10,x21,#28
986	add	x20,x20,x2			// h+=X[i]
987	eor	x16,x16,x25,ror#18
988	eor	x9,x9,x4,ror#8
989	orr	x17,x17,x28			// Ch(e,f,g)
990	eor	x28,x21,x22			// a^b, b^c in next round
991	eor	x16,x16,x25,ror#41	// Sigma1(e)
992	eor	x10,x10,x21,ror#34
993	add	x20,x20,x17			// h+=Ch(e,f,g)
994	and	x19,x19,x28			// (b^c)&=(a^b)
995	eor	x8,x8,x1,ror#61
996	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
997	add	x20,x20,x16			// h+=Sigma1(e)
998	eor	x19,x19,x22			// Maj(a,b,c)
999	eor	x17,x10,x21,ror#39	// Sigma0(a)
1000	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
1001	add	x3,x3,x12
1002	add	x24,x24,x20			// d+=h
1003	add	x20,x20,x19			// h+=Maj(a,b,c)
1004	ldr	x19,[x30],#8		// *K++, x28 in next round
1005	add	x3,x3,x9
1006	add	x20,x20,x17			// h+=Sigma0(a)
1007	add	x3,x3,x8
// x19 is the K word just fetched for the next round; zero ends the rounds.
1008	cbnz	x19,.Loop_16_xx
1009
// End of one block: fold the working variables back into the hash state,
// advance to the next block, and either loop or return.
//
// x0, x1 and x2 were used for message words during the rounds, so the
// context pointer, end-of-input pointer and input pointer are reloaded
// from the frame.
1010	ldp	x0,x2,[x29,#96]
1011	ldr	x1,[x29,#112]
// Move the K pointer back by 648 bytes (81 eight-byte entries), i.e. by
// everything fetched through x30 for this block -- presumably back to
// .LK512.
1012	sub	x30,x30,#648		// rewind
1013
1014	ldp	x3,x4,[x0]
1015	ldp	x5,x6,[x0,#2*8]
// The saved pointer was taken after the first 16-byte load of the block,
// so adding 14*8 brings it to the start of the next 128-byte block.
1016	add	x1,x1,#14*8			// advance input pointer
1017	ldp	x7,x8,[x0,#4*8]
// state[i] += working variable i; at this point a..h are back in x20..x27.
1018	add	x20,x20,x3
1019	ldp	x9,x10,[x0,#6*8]
1020	add	x21,x21,x4
1021	add	x22,x22,x5
1022	add	x23,x23,x6
1023	stp	x20,x21,[x0]
1024	add	x24,x24,x7
1025	add	x25,x25,x8
1026	stp	x22,x23,[x0,#2*8]
1027	add	x26,x26,x9
1028	add	x27,x27,x10
// Flags from this compare are consumed by the b.ne below; the stp
// instructions in between do not modify them.
1029	cmp	x1,x2
1030	stp	x24,x25,[x0,#4*8]
1031	stp	x26,x27,[x0,#6*8]
1032	b.ne	.Loop
1033
// Epilogue: restore callee-saved x19..x28 relative to x29, release the
// 32-byte scratch area, pop the 128-byte frame (restoring x29 and x30),
// then authenticate the return address and return.
1034	ldp	x19,x20,[x29,#16]
1035	add	sp,sp,#4*8
1036	ldp	x21,x22,[x29,#32]
1037	ldp	x23,x24,[x29,#48]
1038	ldp	x25,x26,[x29,#64]
1039	ldp	x27,x28,[x29,#80]
1040	ldp	x29,x30,[sp],#128
1041.inst	0xd50323bf				// autiasp
1042	ret
1043.size	sha512_block_data_order,.-sha512_block_data_order
1044
// .LK512: table of round constants consumed by both code paths in this file.
//
// Layout: 40 .quad lines x 2 = 80 64-bit constants, followed by one zero
// quadword.  The zero entry acts as an end-of-table sentinel for the scalar
// path, which loads "*K++" and leaves its round loop when the loaded value is
// zero (cbnz on the freshly loaded K), then rewinds by 648 = 81*8 bytes.
// The hardware path below never reads the sentinel: it performs 40 16-byte
// loads and rewinds by 80*8 bytes.
//
// .align 6 requests 2^6 = 64-byte alignment for the start of the table.
1045.align	6
1046.type	.LK512,%object
1047.LK512:
1048.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1049.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1050.quad	0x3956c25bf348b538,0x59f111f1b605d019
1051.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1052.quad	0xd807aa98a3030242,0x12835b0145706fbe
1053.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1054.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1055.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1056.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1057.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1058.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1059.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1060.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1061.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1062.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1063.quad	0x06ca6351e003826f,0x142929670a0e6e70
1064.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1065.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1066.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1067.quad	0x81c2c92e47edaee6,0x92722c851482353b
1068.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1069.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1070.quad	0xd192e819d6ef5218,0xd69906245565a910
1071.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1072.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1073.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1074.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1075.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1076.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1077.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1078.quad	0x90befffa23631e28,0xa4506cebde82bde9
1079.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1080.quad	0xca273eceea26619c,0xd186b8c721c0c207
1081.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1082.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1083.quad	0x113f9804bef90dae,0x1b710b35131c471b
1084.quad	0x28db77f523047d84,0x32caab7b40c72493
1085.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1086.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1087.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1088.quad	0	// terminator
1089.size	.LK512,.-.LK512
// .LOPENSSL_armcap_P: position-independent reference to OPENSSL_armcap_P.
// The stored value is "OPENSSL_armcap_P - ." i.e. the distance from this word
// to the symbol, so adding the word's own address yields the symbol address.
// It is 32 bits wide when __ILP32__ is defined and 64 bits wide otherwise, and
// is 2^3 = 8-byte aligned.  The whole word is omitted when __KERNEL__ is
// defined.
// NOTE(review): the code that reads this word is not in this part of the file;
// presumably it is the capability dispatch at the scalar entry point - confirm.
1090#ifndef	__KERNEL__
1091.align	3
1092.LOPENSSL_armcap_P:
1093# ifdef	__ILP32__
1094.long	OPENSSL_armcap_P-.
1095# else
1096.quad	OPENSSL_armcap_P-.
1097# endif
1098#endif
// NUL-terminated ASCII identification string embedded in the object:
//   "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// No label is attached to it in the lines shown here.
1099.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to 2^2 = 4 bytes after the odd-length string so that whatever
// follows starts on an instruction boundary.  The second directive is a
// repeat of the first and has no additional effect.
1100.align	2
1101.align	2
// ---------------------------------------------------------------------------
// sha512_block_armv8 / .Lv8_entry
//
// Block transform built on the SHA512 instructions sha512h, sha512h2,
// sha512su0 and sha512su1.  Those four are emitted as raw .inst words; the
// trailing comment on each .inst line gives the intended mnemonic.
// NOTE(review): presumably encoded this way so that assemblers lacking the
// SHA512 extension can still build the file - confirm against the generator.
// Only assembled when __KERNEL__ is not defined.
//
// Register use, as established by the code below:
//   x0       in   pointer to the 64-byte hash state (loaded into v0-v3 on
//                 entry, stored back from v0-v3 on exit)
//   x1       in   pointer to the input; consumed 128 bytes per iteration
//   x2       in   number of 128-byte blocks; decremented once per iteration
//                 and the loop ends when it reaches zero.  A zero count is not
//                 checked for here - assumes the caller passes x2 >= 1.
//   x3            walking pointer into .LK512 (rewound by 80*8 per block)
//   x4            x1-128, i.e. start of the block being processed
//   v0-v3         working state (v4 is the rotating fifth register)
//   v5-v7         ext temporaries
//   v16-v23       16-entry message schedule, two 64-bit words per register
//   v24,v25       alternating "K + W" buffers
//   v26-v29       copy of the state at the start of the block
// Only v0-v7 and v16-v29 are written; x30 is never written.
//
// Structure of one iteration: 40 two-round steps.  Each step loads 16 bytes
// (two constants) from .LK512.  The first 32 steps also extend the message
// schedule (sha512su0/sha512su1); the last 8 steps instead refill v16-v23
// with the next input block.
// ---------------------------------------------------------------------------
1102#ifndef	__KERNEL__
1103.type	sha512_block_armv8,%function
1104.align	6
1105sha512_block_armv8:
1106.Lv8_entry:
// Frame record: x29/x30 are pushed and x29 is pointed at the record.
1107	stp	x29,x30,[sp,#-16]!
1108	add	x29,sp,#0
1109
// Fetch the first 128-byte block (x1 is post-incremented by 128 in total).
1110	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1111	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1112
1113	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1114	adr	x3,.LK512
1115
// Reverse the byte order inside every 64-bit lane of the input words.
1116	rev64	v16.16b,v16.16b
1117	rev64	v17.16b,v17.16b
1118	rev64	v18.16b,v18.16b
1119	rev64	v19.16b,v19.16b
1120	rev64	v20.16b,v20.16b
1121	rev64	v21.16b,v21.16b
1122	rev64	v22.16b,v22.16b
1123	rev64	v23.16b,v23.16b
// Jump over the alignment padding that precedes the loop head.
1124	b	.Loop_hw
1125
1126.align	4
// Per-block loop.  On entry x1 already points 128 bytes past the block held
// in v16-v23.  After "subs", x4 = start of the current block; if this was the
// last block (x2 became zero) csel moves x1 back to x4, so the "load next
// input" loads further down re-read the current block instead of touching
// memory beyond the end of the input.
1127.Loop_hw:
1128	ld1	{v24.2d},[x3],#16
1129	subs	x2,x2,#1
1130	sub	x4,x1,#128
// orr Vd,Vn,Vn is a register copy: keep the incoming state for the final add.
1131	orr	v26.16b,v0.16b,v0.16b			// offload
1132	orr	v27.16b,v1.16b,v1.16b
1133	orr	v28.16b,v2.16b,v2.16b
1134	orr	v29.16b,v3.16b,v3.16b
1135	csel	x1,x1,x4,ne			// conditional rewind
// Steps 1-32: K+W is formed in v24/v25 and its two 64-bit halves are swapped
// with "ext ...,#8" before being added into the state; the schedule register
// consumed by the step is then replaced via sha512su0/sha512su1.
1136	add	v24.2d,v24.2d,v16.2d
1137	ld1	{v25.2d},[x3],#16
1138	ext	v24.16b,v24.16b,v24.16b,#8
1139	ext	v5.16b,v2.16b,v3.16b,#8
1140	ext	v6.16b,v1.16b,v2.16b,#8
1141	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1142.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1143	ext	v7.16b,v20.16b,v21.16b,#8
1144.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1145.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1146	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1147.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1148	add	v25.2d,v25.2d,v17.2d
1149	ld1	{v24.2d},[x3],#16
1150	ext	v25.16b,v25.16b,v25.16b,#8
1151	ext	v5.16b,v4.16b,v2.16b,#8
1152	ext	v6.16b,v0.16b,v4.16b,#8
1153	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1154.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1155	ext	v7.16b,v21.16b,v22.16b,#8
1156.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1157.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1158	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1159.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1160	add	v24.2d,v24.2d,v18.2d
1161	ld1	{v25.2d},[x3],#16
1162	ext	v24.16b,v24.16b,v24.16b,#8
1163	ext	v5.16b,v1.16b,v4.16b,#8
1164	ext	v6.16b,v3.16b,v1.16b,#8
1165	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1166.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1167	ext	v7.16b,v22.16b,v23.16b,#8
1168.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1169.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1170	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1171.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1172	add	v25.2d,v25.2d,v19.2d
1173	ld1	{v24.2d},[x3],#16
1174	ext	v25.16b,v25.16b,v25.16b,#8
1175	ext	v5.16b,v0.16b,v1.16b,#8
1176	ext	v6.16b,v2.16b,v0.16b,#8
1177	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1178.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1179	ext	v7.16b,v23.16b,v16.16b,#8
1180.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1181.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1182	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1183.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1184	add	v24.2d,v24.2d,v20.2d
1185	ld1	{v25.2d},[x3],#16
1186	ext	v24.16b,v24.16b,v24.16b,#8
1187	ext	v5.16b,v3.16b,v0.16b,#8
1188	ext	v6.16b,v4.16b,v3.16b,#8
1189	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1190.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1191	ext	v7.16b,v16.16b,v17.16b,#8
1192.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1193.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1194	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1195.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1196	add	v25.2d,v25.2d,v21.2d
1197	ld1	{v24.2d},[x3],#16
1198	ext	v25.16b,v25.16b,v25.16b,#8
1199	ext	v5.16b,v2.16b,v3.16b,#8
1200	ext	v6.16b,v1.16b,v2.16b,#8
1201	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1202.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1203	ext	v7.16b,v17.16b,v18.16b,#8
1204.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1205.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1206	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1207.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1208	add	v24.2d,v24.2d,v22.2d
1209	ld1	{v25.2d},[x3],#16
1210	ext	v24.16b,v24.16b,v24.16b,#8
1211	ext	v5.16b,v4.16b,v2.16b,#8
1212	ext	v6.16b,v0.16b,v4.16b,#8
1213	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1214.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1215	ext	v7.16b,v18.16b,v19.16b,#8
1216.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1217.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1218	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1219.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1220	add	v25.2d,v25.2d,v23.2d
1221	ld1	{v24.2d},[x3],#16
1222	ext	v25.16b,v25.16b,v25.16b,#8
1223	ext	v5.16b,v1.16b,v4.16b,#8
1224	ext	v6.16b,v3.16b,v1.16b,#8
1225	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1226.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1227	ext	v7.16b,v19.16b,v20.16b,#8
1228.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1229.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1230	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1231.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1232	add	v24.2d,v24.2d,v16.2d
1233	ld1	{v25.2d},[x3],#16
1234	ext	v24.16b,v24.16b,v24.16b,#8
1235	ext	v5.16b,v0.16b,v1.16b,#8
1236	ext	v6.16b,v2.16b,v0.16b,#8
1237	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1238.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1239	ext	v7.16b,v20.16b,v21.16b,#8
1240.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1241.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1242	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1243.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1244	add	v25.2d,v25.2d,v17.2d
1245	ld1	{v24.2d},[x3],#16
1246	ext	v25.16b,v25.16b,v25.16b,#8
1247	ext	v5.16b,v3.16b,v0.16b,#8
1248	ext	v6.16b,v4.16b,v3.16b,#8
1249	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1250.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1251	ext	v7.16b,v21.16b,v22.16b,#8
1252.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1253.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1254	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1255.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1256	add	v24.2d,v24.2d,v18.2d
1257	ld1	{v25.2d},[x3],#16
1258	ext	v24.16b,v24.16b,v24.16b,#8
1259	ext	v5.16b,v2.16b,v3.16b,#8
1260	ext	v6.16b,v1.16b,v2.16b,#8
1261	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1262.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1263	ext	v7.16b,v22.16b,v23.16b,#8
1264.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1265.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1266	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1267.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1268	add	v25.2d,v25.2d,v19.2d
1269	ld1	{v24.2d},[x3],#16
1270	ext	v25.16b,v25.16b,v25.16b,#8
1271	ext	v5.16b,v4.16b,v2.16b,#8
1272	ext	v6.16b,v0.16b,v4.16b,#8
1273	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1274.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1275	ext	v7.16b,v23.16b,v16.16b,#8
1276.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1277.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1278	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1279.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1280	add	v24.2d,v24.2d,v20.2d
1281	ld1	{v25.2d},[x3],#16
1282	ext	v24.16b,v24.16b,v24.16b,#8
1283	ext	v5.16b,v1.16b,v4.16b,#8
1284	ext	v6.16b,v3.16b,v1.16b,#8
1285	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1286.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1287	ext	v7.16b,v16.16b,v17.16b,#8
1288.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1289.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1290	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1291.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1292	add	v25.2d,v25.2d,v21.2d
1293	ld1	{v24.2d},[x3],#16
1294	ext	v25.16b,v25.16b,v25.16b,#8
1295	ext	v5.16b,v0.16b,v1.16b,#8
1296	ext	v6.16b,v2.16b,v0.16b,#8
1297	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1298.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1299	ext	v7.16b,v17.16b,v18.16b,#8
1300.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1301.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1302	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1303.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1304	add	v24.2d,v24.2d,v22.2d
1305	ld1	{v25.2d},[x3],#16
1306	ext	v24.16b,v24.16b,v24.16b,#8
1307	ext	v5.16b,v3.16b,v0.16b,#8
1308	ext	v6.16b,v4.16b,v3.16b,#8
1309	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1310.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1311	ext	v7.16b,v18.16b,v19.16b,#8
1312.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1313.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1314	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1315.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1316	add	v25.2d,v25.2d,v23.2d
1317	ld1	{v24.2d},[x3],#16
1318	ext	v25.16b,v25.16b,v25.16b,#8
1319	ext	v5.16b,v2.16b,v3.16b,#8
1320	ext	v6.16b,v1.16b,v2.16b,#8
1321	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1322.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1323	ext	v7.16b,v19.16b,v20.16b,#8
1324.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1325.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1326	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1327.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1328	add	v24.2d,v24.2d,v16.2d
1329	ld1	{v25.2d},[x3],#16
1330	ext	v24.16b,v24.16b,v24.16b,#8
1331	ext	v5.16b,v4.16b,v2.16b,#8
1332	ext	v6.16b,v0.16b,v4.16b,#8
1333	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1334.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1335	ext	v7.16b,v20.16b,v21.16b,#8
1336.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1337.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1338	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1339.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1340	add	v25.2d,v25.2d,v17.2d
1341	ld1	{v24.2d},[x3],#16
1342	ext	v25.16b,v25.16b,v25.16b,#8
1343	ext	v5.16b,v1.16b,v4.16b,#8
1344	ext	v6.16b,v3.16b,v1.16b,#8
1345	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1346.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1347	ext	v7.16b,v21.16b,v22.16b,#8
1348.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1349.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1350	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1351.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1352	add	v24.2d,v24.2d,v18.2d
1353	ld1	{v25.2d},[x3],#16
1354	ext	v24.16b,v24.16b,v24.16b,#8
1355	ext	v5.16b,v0.16b,v1.16b,#8
1356	ext	v6.16b,v2.16b,v0.16b,#8
1357	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1358.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1359	ext	v7.16b,v22.16b,v23.16b,#8
1360.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1361.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1362	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1363.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1364	add	v25.2d,v25.2d,v19.2d
1365	ld1	{v24.2d},[x3],#16
1366	ext	v25.16b,v25.16b,v25.16b,#8
1367	ext	v5.16b,v3.16b,v0.16b,#8
1368	ext	v6.16b,v4.16b,v3.16b,#8
1369	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1370.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1371	ext	v7.16b,v23.16b,v16.16b,#8
1372.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1373.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1374	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1375.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1376	add	v24.2d,v24.2d,v20.2d
1377	ld1	{v25.2d},[x3],#16
1378	ext	v24.16b,v24.16b,v24.16b,#8
1379	ext	v5.16b,v2.16b,v3.16b,#8
1380	ext	v6.16b,v1.16b,v2.16b,#8
1381	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1382.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1383	ext	v7.16b,v16.16b,v17.16b,#8
1384.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1385.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1386	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1387.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1388	add	v25.2d,v25.2d,v21.2d
1389	ld1	{v24.2d},[x3],#16
1390	ext	v25.16b,v25.16b,v25.16b,#8
1391	ext	v5.16b,v4.16b,v2.16b,#8
1392	ext	v6.16b,v0.16b,v4.16b,#8
1393	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1394.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1395	ext	v7.16b,v17.16b,v18.16b,#8
1396.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1397.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1398	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1399.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1400	add	v24.2d,v24.2d,v22.2d
1401	ld1	{v25.2d},[x3],#16
1402	ext	v24.16b,v24.16b,v24.16b,#8
1403	ext	v5.16b,v1.16b,v4.16b,#8
1404	ext	v6.16b,v3.16b,v1.16b,#8
1405	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1406.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1407	ext	v7.16b,v18.16b,v19.16b,#8
1408.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1409.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1410	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1411.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1412	add	v25.2d,v25.2d,v23.2d
1413	ld1	{v24.2d},[x3],#16
1414	ext	v25.16b,v25.16b,v25.16b,#8
1415	ext	v5.16b,v0.16b,v1.16b,#8
1416	ext	v6.16b,v2.16b,v0.16b,#8
1417	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1418.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1419	ext	v7.16b,v19.16b,v20.16b,#8
1420.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1421.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1422	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1423.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1424	add	v24.2d,v24.2d,v16.2d
1425	ld1	{v25.2d},[x3],#16
1426	ext	v24.16b,v24.16b,v24.16b,#8
1427	ext	v5.16b,v3.16b,v0.16b,#8
1428	ext	v6.16b,v4.16b,v3.16b,#8
1429	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1430.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1431	ext	v7.16b,v20.16b,v21.16b,#8
1432.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1433.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1434	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1435.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1436	add	v25.2d,v25.2d,v17.2d
1437	ld1	{v24.2d},[x3],#16
1438	ext	v25.16b,v25.16b,v25.16b,#8
1439	ext	v5.16b,v2.16b,v3.16b,#8
1440	ext	v6.16b,v1.16b,v2.16b,#8
1441	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1442.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1443	ext	v7.16b,v21.16b,v22.16b,#8
1444.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1445.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1446	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1447.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1448	add	v24.2d,v24.2d,v18.2d
1449	ld1	{v25.2d},[x3],#16
1450	ext	v24.16b,v24.16b,v24.16b,#8
1451	ext	v5.16b,v4.16b,v2.16b,#8
1452	ext	v6.16b,v0.16b,v4.16b,#8
1453	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1454.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1455	ext	v7.16b,v22.16b,v23.16b,#8
1456.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1457.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1458	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1459.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1460	add	v25.2d,v25.2d,v19.2d
1461	ld1	{v24.2d},[x3],#16
1462	ext	v25.16b,v25.16b,v25.16b,#8
1463	ext	v5.16b,v1.16b,v4.16b,#8
1464	ext	v6.16b,v3.16b,v1.16b,#8
1465	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1466.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1467	ext	v7.16b,v23.16b,v16.16b,#8
1468.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1469.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1470	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1471.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1472	add	v24.2d,v24.2d,v20.2d
1473	ld1	{v25.2d},[x3],#16
1474	ext	v24.16b,v24.16b,v24.16b,#8
1475	ext	v5.16b,v0.16b,v1.16b,#8
1476	ext	v6.16b,v2.16b,v0.16b,#8
1477	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1478.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1479	ext	v7.16b,v16.16b,v17.16b,#8
1480.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1481.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1482	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1483.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1484	add	v25.2d,v25.2d,v21.2d
1485	ld1	{v24.2d},[x3],#16
1486	ext	v25.16b,v25.16b,v25.16b,#8
1487	ext	v5.16b,v3.16b,v0.16b,#8
1488	ext	v6.16b,v4.16b,v3.16b,#8
1489	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1490.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1491	ext	v7.16b,v17.16b,v18.16b,#8
1492.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1493.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1494	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1495.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1496	add	v24.2d,v24.2d,v22.2d
1497	ld1	{v25.2d},[x3],#16
1498	ext	v24.16b,v24.16b,v24.16b,#8
1499	ext	v5.16b,v2.16b,v3.16b,#8
1500	ext	v6.16b,v1.16b,v2.16b,#8
1501	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1502.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1503	ext	v7.16b,v18.16b,v19.16b,#8
1504.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1505.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1506	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1507.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1508	add	v25.2d,v25.2d,v23.2d
1509	ld1	{v24.2d},[x3],#16
1510	ext	v25.16b,v25.16b,v25.16b,#8
1511	ext	v5.16b,v4.16b,v2.16b,#8
1512	ext	v6.16b,v0.16b,v4.16b,#8
1513	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1514.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1515	ext	v7.16b,v19.16b,v20.16b,#8
1516.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1517.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1518	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1519.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Steps 33-40: no sha512su0/sha512su1 any more.  As soon as a schedule
// register has been folded into v24/v25 it is overwritten with 16 bytes of
// the next block (or of the current block again, if x1 was rewound above)
// and byte-reversed, ready for the next iteration.
1520	ld1	{v25.2d},[x3],#16
1521	add	v24.2d,v24.2d,v16.2d
1522	ld1	{v16.16b},[x1],#16		// load next input
1523	ext	v24.16b,v24.16b,v24.16b,#8
1524	ext	v5.16b,v1.16b,v4.16b,#8
1525	ext	v6.16b,v3.16b,v1.16b,#8
1526	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1527.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1528	rev64	v16.16b,v16.16b
1529	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1530.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1531	ld1	{v24.2d},[x3],#16
1532	add	v25.2d,v25.2d,v17.2d
1533	ld1	{v17.16b},[x1],#16		// load next input
1534	ext	v25.16b,v25.16b,v25.16b,#8
1535	ext	v5.16b,v0.16b,v1.16b,#8
1536	ext	v6.16b,v2.16b,v0.16b,#8
1537	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1538.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1539	rev64	v17.16b,v17.16b
1540	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1541.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1542	ld1	{v25.2d},[x3],#16
1543	add	v24.2d,v24.2d,v18.2d
1544	ld1	{v18.16b},[x1],#16		// load next input
1545	ext	v24.16b,v24.16b,v24.16b,#8
1546	ext	v5.16b,v3.16b,v0.16b,#8
1547	ext	v6.16b,v4.16b,v3.16b,#8
1548	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1549.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1550	rev64	v18.16b,v18.16b
1551	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1552.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1553	ld1	{v24.2d},[x3],#16
1554	add	v25.2d,v25.2d,v19.2d
1555	ld1	{v19.16b},[x1],#16		// load next input
1556	ext	v25.16b,v25.16b,v25.16b,#8
1557	ext	v5.16b,v2.16b,v3.16b,#8
1558	ext	v6.16b,v1.16b,v2.16b,#8
1559	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1560.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1561	rev64	v19.16b,v19.16b
1562	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1563.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1564	ld1	{v25.2d},[x3],#16
1565	add	v24.2d,v24.2d,v20.2d
1566	ld1	{v20.16b},[x1],#16		// load next input
1567	ext	v24.16b,v24.16b,v24.16b,#8
1568	ext	v5.16b,v4.16b,v2.16b,#8
1569	ext	v6.16b,v0.16b,v4.16b,#8
1570	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1571.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1572	rev64	v20.16b,v20.16b
1573	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1574.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1575	ld1	{v24.2d},[x3],#16
1576	add	v25.2d,v25.2d,v21.2d
1577	ld1	{v21.16b},[x1],#16		// load next input
1578	ext	v25.16b,v25.16b,v25.16b,#8
1579	ext	v5.16b,v1.16b,v4.16b,#8
1580	ext	v6.16b,v3.16b,v1.16b,#8
1581	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1582.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1583	rev64	v21.16b,v21.16b
1584	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1585.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1586	ld1	{v25.2d},[x3],#16
1587	add	v24.2d,v24.2d,v22.2d
1588	ld1	{v22.16b},[x1],#16		// load next input
1589	ext	v24.16b,v24.16b,v24.16b,#8
1590	ext	v5.16b,v0.16b,v1.16b,#8
1591	ext	v6.16b,v2.16b,v0.16b,#8
1592	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1593.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1594	rev64	v22.16b,v22.16b
1595	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1596.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// All 40 16-byte constant loads are done: point x3 back at .LK512.
1597	sub	x3,x3,#80*8	// rewind
1598	add	v25.2d,v25.2d,v23.2d
1599	ld1	{v23.16b},[x1],#16		// load next input
1600	ext	v25.16b,v25.16b,v25.16b,#8
1601	ext	v5.16b,v3.16b,v0.16b,#8
1602	ext	v6.16b,v4.16b,v3.16b,#8
1603	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1604.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1605	rev64	v23.16b,v23.16b
1606	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1607.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the state saved at the top of the iteration to the new working state.
1608	add	v0.2d,v0.2d,v26.2d			// accumulate
1609	add	v1.2d,v1.2d,v27.2d
1610	add	v2.2d,v2.2d,v28.2d
1611	add	v3.2d,v3.2d,v29.2d
1612
// x2 was decremented by the subs at the loop head; flags are not relied on
// here, the register itself is tested.
1613	cbnz	x2,.Loop_hw
1614
1615	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1616
// Pop the 16-byte frame record.  Only x29 is reloaded; x30 was never
// modified in this function, so its saved copy is simply discarded.
1617	ldr	x29,[sp],#16
1618	ret
1619.size	sha512_block_armv8,.-sha512_block_armv8
1620#endif
1621