xref: /freebsd/sys/crypto/openssl/aarch64/sha512-armv8.S (revision 25fb30bd9abc492359ad1f66901a06cb8cd08370)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
3// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
4//
5// Licensed under the OpenSSL license (the "License").  You may not use
6// this file except in compliance with the License.  You can obtain a copy
7// in the file LICENSE in the source distribution or at
8// https://www.openssl.org/source/license.html
9
10// ====================================================================
11// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12// project. The module is, however, dual licensed under OpenSSL and
13// CRYPTOGAMS licenses depending on where you obtain it. For further
14// details see http://www.openssl.org/~appro/cryptogams/.
15//
16// Permission to use under GPLv2 terms is granted.
17// ====================================================================
18//
19// SHA256/512 for ARMv8.
20//
21// Performance in cycles per processed byte and improvement coefficient
22// over code generated with "default" compiler:
23//
24//		SHA256-hw	SHA256(*)	SHA512
25// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
26// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
27// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
28// Denver	2.01		10.5 (+26%)	6.70 (+8%)
29// X-Gene			20.0 (+100%)	12.8 (+300%(***))
30// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
31// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
32//
33// (*)	Software SHA256 results are of lesser relevance, presented
34//	mostly for informational purposes.
35// (**)	The result is a trade-off: it's possible to improve it by
36//	10% (or by 1 cycle per round), but at the cost of 20% loss
37//	on Cortex-A53 (or by 4 cycles per round).
38// (***)	Super-impressive coefficients over gcc-generated code are
39//	indication of some compiler "pathology", most notably code
40//	generated with -mgeneral-regs-only is significantly faster
41//	and the gap is only 40-90%.
42//
43// October 2016.
44//
45// Originally it was reckoned that it makes no sense to implement NEON
46// version of SHA256 for 64-bit processors. This is because performance
47// improvement on most wide-spread Cortex-A5x processors was observed
48// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49// observed that 32-bit NEON SHA256 performs significantly better than
50// 64-bit scalar version on *some* of the more recent processors. As a
51// result, a 64-bit NEON version of SHA256 was added to provide best
52// all-round performance. For example it executes ~30% faster on X-Gene
53// and Mongoose. [For reference, NEON version of SHA512 is bound to
54// deliver much less improvement, likely *negative* on Cortex-A5x.
55// Which is why NEON support is limited to SHA256.]
56
57#ifndef	__KERNEL__
58# include "arm_arch.h"
59#endif
60
61.text
62
63
// sha512_block_data_order: SHA-512 block function implemented with
// general-purpose registers only.
//
// Inputs, as consumed by the code below:
//   x0 - pointer to the eight 64-bit hash state words (loaded into x20-x27
//        here and written back after every processed block)
//   x1 - pointer to the input data
//   x2 - number of 128-byte blocks; converted below into an end-of-input
//        pointer, x1 + (x2 << 7)
// NOTE(review): presumably matches the C prototype
//   void sha512_block_data_order(SHA512_CTX *ctx, const void *in, size_t num)
// -- confirm against the caller.
//
// Stack layout after the prologue (x29 = base of the 128-byte frame):
//   [x29,#0]     saved x29/x30
//   [x29,#16]    saved x19-x28 (five pairs, up to offset 95)
//   [x29,#96]    x0 (state pointer)
//   [x29,#104]   end-of-input pointer
//   [x29,#112]   input pointer, stored inside .Loop
//   [sp,#0..#31] 32 bytes of scratch below the frame, used by the round code
//                to spill message words
64.globl	sha512_block_data_order
65.type	sha512_block_data_order,%function
66.align	6
67sha512_block_data_order:
68#ifndef	__KERNEL__
// Non-kernel builds only: the value stored at .LOPENSSL_armcap_P (defined
// outside this section) is added to the address of that label to form a
// pointer; the 32-bit word it points at is loaded and, if the ARMV8_SHA512
// bit is set, control transfers to .Lv8_entry (also outside this section).
69# ifdef	__ILP32__
70	ldrsw	x16,.LOPENSSL_armcap_P
71# else
72	ldr	x16,.LOPENSSL_armcap_P
73# endif
74	adr	x17,.LOPENSSL_armcap_P
75	add	x16,x16,x17
76	ldr	w16,[x16]
77	tst	w16,#ARMV8_SHA512
78	b.ne	.Lv8_entry
79#endif
80.inst	0xd503233f				// paciasp
// Allocate the 128-byte frame, saving the frame pointer and link register.
81	stp	x29,x30,[sp,#-128]!
82	add	x29,sp,#0
83
// Save x19-x28; all of them are overwritten by the round code.
84	stp	x19,x20,[sp,#16]
85	stp	x21,x22,[sp,#32]
86	stp	x23,x24,[sp,#48]
87	stp	x25,x26,[sp,#64]
88	stp	x27,x28,[sp,#80]
// Reserve the 32-byte scratch area; the frame itself stays reachable via x29.
89	sub	sp,sp,#4*8
90
91	ldp	x20,x21,[x0]				// load context
92	ldp	x22,x23,[x0,#2*8]
93	ldp	x24,x25,[x0,#4*8]
94	add	x2,x1,x2,lsl#7	// end of input
95	ldp	x26,x27,[x0,#6*8]
// x30 was saved above, so it is reused as the running pointer into the
// .LK512 round-constant table.
96	adr	x30,.LK512
// Keep the state pointer and the end pointer in the frame: x0 and x2 are
// overwritten with message words by the round code.
97	stp	x0,x2,[x29,#96]
98
// .Loop: top of the per-block loop; the code from here to .Loop_16_xx performs
// rounds 0-15 of one 128-byte input block.
//
// Register roles:
//   x20-x27  working variables a..h. Values are never moved between rounds;
//            each round instead uses the registers shifted by one position
//            (round 0 uses x27 as h and x23 as d, round 1 uses x26 as h and
//            x22 as d, and so on).
//   x19,x28  alternate between holding the round constant just fetched
//            through x30 and the a^b value carried into the next round,
//            where it serves as b^c for Maj.
//   x16,x17  per-round temporaries (Sigma1, Ch, Sigma0).
//   x3-x15,x0,x1,x2  the sixteen message words X[0]..X[15], loaded in pairs
//            and byte-reversed with rev unless __AARCH64EB__ is defined.
//
// Formulas as implemented in rounds 0-14:
//   Sigma1(e) = ror(e,14) ^ ror(e ^ ror(e,23), 18)  (rotations 14, 18, 41)
//   Sigma0(a) = ror(a,28) ^ ror(a ^ ror(a,5), 34)   (rotations 28, 34, 39)
//   Ch(e,f,g) = (f & e) | (g & ~e)
//   Maj(a,b,c) = ((b^c) & (a^b)) ^ b
// The closing "h+=Sigma0(a)" of each of these rounds is executed at the start
// of the following round; the commented-out add marks where it logically
// belongs.
//
// Message-word registers are also used as temporaries, so words that are
// still needed are spilled to the 32-byte scratch area at sp and reloaded
// later.
99.Loop:
100	ldp	x3,x4,[x1],#2*8
101	ldr	x19,[x30],#8			// *K++
102	eor	x28,x21,x22				// magic seed
// Save the input pointer (already advanced by 16 bytes): x1 is overwritten
// with a message word later in this block.
103	str	x1,[x29,#112]
104#ifndef	__AARCH64EB__
105	rev	x3,x3			// 0
106#endif
107	ror	x16,x24,#14
108	add	x27,x27,x19			// h+=K[i]
109	eor	x6,x24,x24,ror#23
110	and	x17,x25,x24
111	bic	x19,x26,x24
112	add	x27,x27,x3			// h+=X[i]
113	orr	x17,x17,x19			// Ch(e,f,g)
114	eor	x19,x20,x21			// a^b, b^c in next round
115	eor	x16,x16,x6,ror#18	// Sigma1(e)
116	ror	x6,x20,#28
117	add	x27,x27,x17			// h+=Ch(e,f,g)
118	eor	x17,x20,x20,ror#5
119	add	x27,x27,x16			// h+=Sigma1(e)
120	and	x28,x28,x19			// (b^c)&=(a^b)
121	add	x23,x23,x27			// d+=h
122	eor	x28,x28,x21			// Maj(a,b,c)
123	eor	x17,x6,x17,ror#34	// Sigma0(a)
124	add	x27,x27,x28			// h+=Maj(a,b,c)
125	ldr	x28,[x30],#8		// *K++, x19 in next round
126	//add	x27,x27,x17			// h+=Sigma0(a)
127#ifndef	__AARCH64EB__
128	rev	x4,x4			// 1
129#endif
130	ldp	x5,x6,[x1],#2*8
131	add	x27,x27,x17			// h+=Sigma0(a)
132	ror	x16,x23,#14
133	add	x26,x26,x28			// h+=K[i]
134	eor	x7,x23,x23,ror#23
135	and	x17,x24,x23
136	bic	x28,x25,x23
137	add	x26,x26,x4			// h+=X[i]
138	orr	x17,x17,x28			// Ch(e,f,g)
139	eor	x28,x27,x20			// a^b, b^c in next round
140	eor	x16,x16,x7,ror#18	// Sigma1(e)
141	ror	x7,x27,#28
142	add	x26,x26,x17			// h+=Ch(e,f,g)
143	eor	x17,x27,x27,ror#5
144	add	x26,x26,x16			// h+=Sigma1(e)
145	and	x19,x19,x28			// (b^c)&=(a^b)
146	add	x22,x22,x26			// d+=h
147	eor	x19,x19,x20			// Maj(a,b,c)
148	eor	x17,x7,x17,ror#34	// Sigma0(a)
149	add	x26,x26,x19			// h+=Maj(a,b,c)
150	ldr	x19,[x30],#8		// *K++, x28 in next round
151	//add	x26,x26,x17			// h+=Sigma0(a)
152#ifndef	__AARCH64EB__
153	rev	x5,x5			// 2
154#endif
155	add	x26,x26,x17			// h+=Sigma0(a)
156	ror	x16,x22,#14
157	add	x25,x25,x19			// h+=K[i]
158	eor	x8,x22,x22,ror#23
159	and	x17,x23,x22
160	bic	x19,x24,x22
161	add	x25,x25,x5			// h+=X[i]
162	orr	x17,x17,x19			// Ch(e,f,g)
163	eor	x19,x26,x27			// a^b, b^c in next round
164	eor	x16,x16,x8,ror#18	// Sigma1(e)
165	ror	x8,x26,#28
166	add	x25,x25,x17			// h+=Ch(e,f,g)
167	eor	x17,x26,x26,ror#5
168	add	x25,x25,x16			// h+=Sigma1(e)
169	and	x28,x28,x19			// (b^c)&=(a^b)
170	add	x21,x21,x25			// d+=h
171	eor	x28,x28,x27			// Maj(a,b,c)
172	eor	x17,x8,x17,ror#34	// Sigma0(a)
173	add	x25,x25,x28			// h+=Maj(a,b,c)
174	ldr	x28,[x30],#8		// *K++, x19 in next round
175	//add	x25,x25,x17			// h+=Sigma0(a)
176#ifndef	__AARCH64EB__
177	rev	x6,x6			// 3
178#endif
179	ldp	x7,x8,[x1],#2*8
180	add	x25,x25,x17			// h+=Sigma0(a)
181	ror	x16,x21,#14
182	add	x24,x24,x28			// h+=K[i]
183	eor	x9,x21,x21,ror#23
184	and	x17,x22,x21
185	bic	x28,x23,x21
186	add	x24,x24,x6			// h+=X[i]
187	orr	x17,x17,x28			// Ch(e,f,g)
188	eor	x28,x25,x26			// a^b, b^c in next round
189	eor	x16,x16,x9,ror#18	// Sigma1(e)
190	ror	x9,x25,#28
191	add	x24,x24,x17			// h+=Ch(e,f,g)
192	eor	x17,x25,x25,ror#5
193	add	x24,x24,x16			// h+=Sigma1(e)
194	and	x19,x19,x28			// (b^c)&=(a^b)
195	add	x20,x20,x24			// d+=h
196	eor	x19,x19,x26			// Maj(a,b,c)
197	eor	x17,x9,x17,ror#34	// Sigma0(a)
198	add	x24,x24,x19			// h+=Maj(a,b,c)
199	ldr	x19,[x30],#8		// *K++, x28 in next round
200	//add	x24,x24,x17			// h+=Sigma0(a)
201#ifndef	__AARCH64EB__
202	rev	x7,x7			// 4
203#endif
204	add	x24,x24,x17			// h+=Sigma0(a)
205	ror	x16,x20,#14
206	add	x23,x23,x19			// h+=K[i]
207	eor	x10,x20,x20,ror#23
208	and	x17,x21,x20
209	bic	x19,x22,x20
210	add	x23,x23,x7			// h+=X[i]
211	orr	x17,x17,x19			// Ch(e,f,g)
212	eor	x19,x24,x25			// a^b, b^c in next round
213	eor	x16,x16,x10,ror#18	// Sigma1(e)
214	ror	x10,x24,#28
215	add	x23,x23,x17			// h+=Ch(e,f,g)
216	eor	x17,x24,x24,ror#5
217	add	x23,x23,x16			// h+=Sigma1(e)
218	and	x28,x28,x19			// (b^c)&=(a^b)
219	add	x27,x27,x23			// d+=h
220	eor	x28,x28,x25			// Maj(a,b,c)
221	eor	x17,x10,x17,ror#34	// Sigma0(a)
222	add	x23,x23,x28			// h+=Maj(a,b,c)
223	ldr	x28,[x30],#8		// *K++, x19 in next round
224	//add	x23,x23,x17			// h+=Sigma0(a)
225#ifndef	__AARCH64EB__
226	rev	x8,x8			// 5
227#endif
228	ldp	x9,x10,[x1],#2*8
229	add	x23,x23,x17			// h+=Sigma0(a)
230	ror	x16,x27,#14
231	add	x22,x22,x28			// h+=K[i]
232	eor	x11,x27,x27,ror#23
233	and	x17,x20,x27
234	bic	x28,x21,x27
235	add	x22,x22,x8			// h+=X[i]
236	orr	x17,x17,x28			// Ch(e,f,g)
237	eor	x28,x23,x24			// a^b, b^c in next round
238	eor	x16,x16,x11,ror#18	// Sigma1(e)
239	ror	x11,x23,#28
240	add	x22,x22,x17			// h+=Ch(e,f,g)
241	eor	x17,x23,x23,ror#5
242	add	x22,x22,x16			// h+=Sigma1(e)
243	and	x19,x19,x28			// (b^c)&=(a^b)
244	add	x26,x26,x22			// d+=h
245	eor	x19,x19,x24			// Maj(a,b,c)
246	eor	x17,x11,x17,ror#34	// Sigma0(a)
247	add	x22,x22,x19			// h+=Maj(a,b,c)
248	ldr	x19,[x30],#8		// *K++, x28 in next round
249	//add	x22,x22,x17			// h+=Sigma0(a)
250#ifndef	__AARCH64EB__
251	rev	x9,x9			// 6
252#endif
253	add	x22,x22,x17			// h+=Sigma0(a)
254	ror	x16,x26,#14
255	add	x21,x21,x19			// h+=K[i]
256	eor	x12,x26,x26,ror#23
257	and	x17,x27,x26
258	bic	x19,x20,x26
259	add	x21,x21,x9			// h+=X[i]
260	orr	x17,x17,x19			// Ch(e,f,g)
261	eor	x19,x22,x23			// a^b, b^c in next round
262	eor	x16,x16,x12,ror#18	// Sigma1(e)
263	ror	x12,x22,#28
264	add	x21,x21,x17			// h+=Ch(e,f,g)
265	eor	x17,x22,x22,ror#5
266	add	x21,x21,x16			// h+=Sigma1(e)
267	and	x28,x28,x19			// (b^c)&=(a^b)
268	add	x25,x25,x21			// d+=h
269	eor	x28,x28,x23			// Maj(a,b,c)
270	eor	x17,x12,x17,ror#34	// Sigma0(a)
271	add	x21,x21,x28			// h+=Maj(a,b,c)
272	ldr	x28,[x30],#8		// *K++, x19 in next round
273	//add	x21,x21,x17			// h+=Sigma0(a)
274#ifndef	__AARCH64EB__
275	rev	x10,x10			// 7
276#endif
277	ldp	x11,x12,[x1],#2*8
278	add	x21,x21,x17			// h+=Sigma0(a)
279	ror	x16,x25,#14
280	add	x20,x20,x28			// h+=K[i]
281	eor	x13,x25,x25,ror#23
282	and	x17,x26,x25
283	bic	x28,x27,x25
284	add	x20,x20,x10			// h+=X[i]
285	orr	x17,x17,x28			// Ch(e,f,g)
286	eor	x28,x21,x22			// a^b, b^c in next round
287	eor	x16,x16,x13,ror#18	// Sigma1(e)
288	ror	x13,x21,#28
289	add	x20,x20,x17			// h+=Ch(e,f,g)
290	eor	x17,x21,x21,ror#5
291	add	x20,x20,x16			// h+=Sigma1(e)
292	and	x19,x19,x28			// (b^c)&=(a^b)
293	add	x24,x24,x20			// d+=h
294	eor	x19,x19,x22			// Maj(a,b,c)
295	eor	x17,x13,x17,ror#34	// Sigma0(a)
296	add	x20,x20,x19			// h+=Maj(a,b,c)
297	ldr	x19,[x30],#8		// *K++, x28 in next round
298	//add	x20,x20,x17			// h+=Sigma0(a)
299#ifndef	__AARCH64EB__
300	rev	x11,x11			// 8
301#endif
302	add	x20,x20,x17			// h+=Sigma0(a)
303	ror	x16,x24,#14
304	add	x27,x27,x19			// h+=K[i]
305	eor	x14,x24,x24,ror#23
306	and	x17,x25,x24
307	bic	x19,x26,x24
308	add	x27,x27,x11			// h+=X[i]
309	orr	x17,x17,x19			// Ch(e,f,g)
310	eor	x19,x20,x21			// a^b, b^c in next round
311	eor	x16,x16,x14,ror#18	// Sigma1(e)
312	ror	x14,x20,#28
313	add	x27,x27,x17			// h+=Ch(e,f,g)
314	eor	x17,x20,x20,ror#5
315	add	x27,x27,x16			// h+=Sigma1(e)
316	and	x28,x28,x19			// (b^c)&=(a^b)
317	add	x23,x23,x27			// d+=h
318	eor	x28,x28,x21			// Maj(a,b,c)
319	eor	x17,x14,x17,ror#34	// Sigma0(a)
320	add	x27,x27,x28			// h+=Maj(a,b,c)
321	ldr	x28,[x30],#8		// *K++, x19 in next round
322	//add	x27,x27,x17			// h+=Sigma0(a)
323#ifndef	__AARCH64EB__
324	rev	x12,x12			// 9
325#endif
326	ldp	x13,x14,[x1],#2*8
327	add	x27,x27,x17			// h+=Sigma0(a)
328	ror	x16,x23,#14
329	add	x26,x26,x28			// h+=K[i]
330	eor	x15,x23,x23,ror#23
331	and	x17,x24,x23
332	bic	x28,x25,x23
333	add	x26,x26,x12			// h+=X[i]
334	orr	x17,x17,x28			// Ch(e,f,g)
335	eor	x28,x27,x20			// a^b, b^c in next round
336	eor	x16,x16,x15,ror#18	// Sigma1(e)
337	ror	x15,x27,#28
338	add	x26,x26,x17			// h+=Ch(e,f,g)
339	eor	x17,x27,x27,ror#5
340	add	x26,x26,x16			// h+=Sigma1(e)
341	and	x19,x19,x28			// (b^c)&=(a^b)
342	add	x22,x22,x26			// d+=h
343	eor	x19,x19,x20			// Maj(a,b,c)
344	eor	x17,x15,x17,ror#34	// Sigma0(a)
345	add	x26,x26,x19			// h+=Maj(a,b,c)
346	ldr	x19,[x30],#8		// *K++, x28 in next round
347	//add	x26,x26,x17			// h+=Sigma0(a)
348#ifndef	__AARCH64EB__
349	rev	x13,x13			// 10
350#endif
351	add	x26,x26,x17			// h+=Sigma0(a)
352	ror	x16,x22,#14
353	add	x25,x25,x19			// h+=K[i]
354	eor	x0,x22,x22,ror#23
355	and	x17,x23,x22
356	bic	x19,x24,x22
357	add	x25,x25,x13			// h+=X[i]
358	orr	x17,x17,x19			// Ch(e,f,g)
359	eor	x19,x26,x27			// a^b, b^c in next round
360	eor	x16,x16,x0,ror#18	// Sigma1(e)
361	ror	x0,x26,#28
362	add	x25,x25,x17			// h+=Ch(e,f,g)
363	eor	x17,x26,x26,ror#5
364	add	x25,x25,x16			// h+=Sigma1(e)
365	and	x28,x28,x19			// (b^c)&=(a^b)
366	add	x21,x21,x25			// d+=h
367	eor	x28,x28,x27			// Maj(a,b,c)
368	eor	x17,x0,x17,ror#34	// Sigma0(a)
369	add	x25,x25,x28			// h+=Maj(a,b,c)
370	ldr	x28,[x30],#8		// *K++, x19 in next round
371	//add	x25,x25,x17			// h+=Sigma0(a)
372#ifndef	__AARCH64EB__
373	rev	x14,x14			// 11
374#endif
375	ldp	x15,x0,[x1],#2*8
376	add	x25,x25,x17			// h+=Sigma0(a)
// x6 (message word 3) is about to be reused as a temporary: spill it to the
// scratch area. It is reloaded from the same slot in round 14.
377	str	x6,[sp,#24]
378	ror	x16,x21,#14
379	add	x24,x24,x28			// h+=K[i]
380	eor	x6,x21,x21,ror#23
381	and	x17,x22,x21
382	bic	x28,x23,x21
383	add	x24,x24,x14			// h+=X[i]
384	orr	x17,x17,x28			// Ch(e,f,g)
385	eor	x28,x25,x26			// a^b, b^c in next round
386	eor	x16,x16,x6,ror#18	// Sigma1(e)
387	ror	x6,x25,#28
388	add	x24,x24,x17			// h+=Ch(e,f,g)
389	eor	x17,x25,x25,ror#5
390	add	x24,x24,x16			// h+=Sigma1(e)
391	and	x19,x19,x28			// (b^c)&=(a^b)
392	add	x20,x20,x24			// d+=h
393	eor	x19,x19,x26			// Maj(a,b,c)
394	eor	x17,x6,x17,ror#34	// Sigma0(a)
395	add	x24,x24,x19			// h+=Maj(a,b,c)
396	ldr	x19,[x30],#8		// *K++, x28 in next round
397	//add	x24,x24,x17			// h+=Sigma0(a)
398#ifndef	__AARCH64EB__
399	rev	x15,x15			// 12
400#endif
401	add	x24,x24,x17			// h+=Sigma0(a)
402	str	x7,[sp,#0]
403	ror	x16,x20,#14
404	add	x23,x23,x19			// h+=K[i]
405	eor	x7,x20,x20,ror#23
406	and	x17,x21,x20
407	bic	x19,x22,x20
408	add	x23,x23,x15			// h+=X[i]
409	orr	x17,x17,x19			// Ch(e,f,g)
410	eor	x19,x24,x25			// a^b, b^c in next round
411	eor	x16,x16,x7,ror#18	// Sigma1(e)
412	ror	x7,x24,#28
413	add	x23,x23,x17			// h+=Ch(e,f,g)
414	eor	x17,x24,x24,ror#5
415	add	x23,x23,x16			// h+=Sigma1(e)
416	and	x28,x28,x19			// (b^c)&=(a^b)
417	add	x27,x27,x23			// d+=h
418	eor	x28,x28,x25			// Maj(a,b,c)
419	eor	x17,x7,x17,ror#34	// Sigma0(a)
420	add	x23,x23,x28			// h+=Maj(a,b,c)
421	ldr	x28,[x30],#8		// *K++, x19 in next round
422	//add	x23,x23,x17			// h+=Sigma0(a)
423#ifndef	__AARCH64EB__
424	rev	x0,x0			// 13
425#endif
// Last two message words. No post-increment here: x1 itself receives a
// message word (the input pointer was stored at [x29,#112] above).
426	ldp	x1,x2,[x1]
427	add	x23,x23,x17			// h+=Sigma0(a)
428	str	x8,[sp,#8]
429	ror	x16,x27,#14
430	add	x22,x22,x28			// h+=K[i]
431	eor	x8,x27,x27,ror#23
432	and	x17,x20,x27
433	bic	x28,x21,x27
434	add	x22,x22,x0			// h+=X[i]
435	orr	x17,x17,x28			// Ch(e,f,g)
436	eor	x28,x23,x24			// a^b, b^c in next round
437	eor	x16,x16,x8,ror#18	// Sigma1(e)
438	ror	x8,x23,#28
439	add	x22,x22,x17			// h+=Ch(e,f,g)
440	eor	x17,x23,x23,ror#5
441	add	x22,x22,x16			// h+=Sigma1(e)
442	and	x19,x19,x28			// (b^c)&=(a^b)
443	add	x26,x26,x22			// d+=h
444	eor	x19,x19,x24			// Maj(a,b,c)
445	eor	x17,x8,x17,ror#34	// Sigma0(a)
446	add	x22,x22,x19			// h+=Maj(a,b,c)
447	ldr	x19,[x30],#8		// *K++, x28 in next round
448	//add	x22,x22,x17			// h+=Sigma0(a)
449#ifndef	__AARCH64EB__
450	rev	x1,x1			// 14
451#endif
452	ldr	x6,[sp,#24]
453	add	x22,x22,x17			// h+=Sigma0(a)
454	str	x9,[sp,#16]
455	ror	x16,x26,#14
456	add	x21,x21,x19			// h+=K[i]
457	eor	x9,x26,x26,ror#23
458	and	x17,x27,x26
459	bic	x19,x20,x26
460	add	x21,x21,x1			// h+=X[i]
461	orr	x17,x17,x19			// Ch(e,f,g)
462	eor	x19,x22,x23			// a^b, b^c in next round
463	eor	x16,x16,x9,ror#18	// Sigma1(e)
464	ror	x9,x22,#28
465	add	x21,x21,x17			// h+=Ch(e,f,g)
466	eor	x17,x22,x22,ror#5
467	add	x21,x21,x16			// h+=Sigma1(e)
468	and	x28,x28,x19			// (b^c)&=(a^b)
469	add	x25,x25,x21			// d+=h
470	eor	x28,x28,x23			// Maj(a,b,c)
471	eor	x17,x9,x17,ror#34	// Sigma0(a)
472	add	x21,x21,x28			// h+=Maj(a,b,c)
473	ldr	x28,[x30],#8		// *K++, x19 in next round
474	//add	x21,x21,x17			// h+=Sigma0(a)
475#ifndef	__AARCH64EB__
476	rev	x2,x2			// 15
477#endif
// Round 15 has the same shape as the rounds in .Loop_16_xx: Sigma1 and Sigma0
// are built from three separate rotations, h+=Sigma0(a) is done inside the
// round, and the message schedule starts. x3 becomes
//   X[0] + X[9] (x12) + sigma0(X[1]) (from x4) + sigma1(X[14]) (from x1),
// the word consumed by the first round after .Loop_16_xx.
478	ldr	x7,[sp,#0]
479	add	x21,x21,x17			// h+=Sigma0(a)
480	str	x10,[sp,#24]
481	ror	x16,x25,#14
482	add	x20,x20,x28			// h+=K[i]
483	ror	x9,x4,#1
484	and	x17,x26,x25
485	ror	x8,x1,#19
486	bic	x28,x27,x25
487	ror	x10,x21,#28
488	add	x20,x20,x2			// h+=X[i]
489	eor	x16,x16,x25,ror#18
490	eor	x9,x9,x4,ror#8
491	orr	x17,x17,x28			// Ch(e,f,g)
492	eor	x28,x21,x22			// a^b, b^c in next round
493	eor	x16,x16,x25,ror#41	// Sigma1(e)
494	eor	x10,x10,x21,ror#34
495	add	x20,x20,x17			// h+=Ch(e,f,g)
496	and	x19,x19,x28			// (b^c)&=(a^b)
497	eor	x8,x8,x1,ror#61
498	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
499	add	x20,x20,x16			// h+=Sigma1(e)
500	eor	x19,x19,x22			// Maj(a,b,c)
501	eor	x17,x10,x21,ror#39	// Sigma0(a)
502	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
503	add	x3,x3,x12
504	add	x24,x24,x20			// d+=h
505	add	x20,x20,x19			// h+=Maj(a,b,c)
506	ldr	x19,[x30],#8		// *K++, x28 in next round
507	add	x3,x3,x9
508	add	x20,x20,x17			// h+=Sigma0(a)
509	add	x3,x3,x8
// .Loop_16_xx: rounds 16 and up, sixteen rounds per pass.
//
// Each round performs the compression step using the schedule word produced
// by the preceding round and, interleaved with it, computes the next schedule
// word in place in its message register:
//   X[j] += X[j+9] + sigma0(X[j+1]) + sigma1(X[j+14])
// with
//   sigma0(x) = ror(x,1) ^ ror(x,8) ^ (x >> 7)
//   sigma1(x) = ror(x,19) ^ ror(x,61) ^ (x >> 6)
//   Sigma1(e) = ror(e,14) ^ ror(e,18) ^ ror(e,41)
//   Sigma0(a) = ror(a,28) ^ ror(a,34) ^ ror(a,39)
// x20-x27 are the working variables a..h, used with a one-register shift per
// round; x19/x28 alternate between the round constant fetched through x30 and
// the a^b value carried over for Maj; x16/x17 are temporaries.
// Message-word registers also serve as sigma/Sigma0 temporaries, so each round
// begins with an ldr/str pair that swaps one word in from, and another out to,
// the four 8-byte scratch slots at [sp,#0]..[sp,#24].
510.Loop_16_xx:
511	ldr	x8,[sp,#8]
512	str	x11,[sp,#0]
513	ror	x16,x24,#14
514	add	x27,x27,x19			// h+=K[i]
515	ror	x10,x5,#1
516	and	x17,x25,x24
517	ror	x9,x2,#19
518	bic	x19,x26,x24
519	ror	x11,x20,#28
520	add	x27,x27,x3			// h+=X[i]
521	eor	x16,x16,x24,ror#18
522	eor	x10,x10,x5,ror#8
523	orr	x17,x17,x19			// Ch(e,f,g)
524	eor	x19,x20,x21			// a^b, b^c in next round
525	eor	x16,x16,x24,ror#41	// Sigma1(e)
526	eor	x11,x11,x20,ror#34
527	add	x27,x27,x17			// h+=Ch(e,f,g)
528	and	x28,x28,x19			// (b^c)&=(a^b)
529	eor	x9,x9,x2,ror#61
530	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
531	add	x27,x27,x16			// h+=Sigma1(e)
532	eor	x28,x28,x21			// Maj(a,b,c)
533	eor	x17,x11,x20,ror#39	// Sigma0(a)
534	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
535	add	x4,x4,x13
536	add	x23,x23,x27			// d+=h
537	add	x27,x27,x28			// h+=Maj(a,b,c)
538	ldr	x28,[x30],#8		// *K++, x19 in next round
539	add	x4,x4,x10
540	add	x27,x27,x17			// h+=Sigma0(a)
541	add	x4,x4,x9
542	ldr	x9,[sp,#16]
543	str	x12,[sp,#8]
544	ror	x16,x23,#14
545	add	x26,x26,x28			// h+=K[i]
546	ror	x11,x6,#1
547	and	x17,x24,x23
548	ror	x10,x3,#19
549	bic	x28,x25,x23
550	ror	x12,x27,#28
551	add	x26,x26,x4			// h+=X[i]
552	eor	x16,x16,x23,ror#18
553	eor	x11,x11,x6,ror#8
554	orr	x17,x17,x28			// Ch(e,f,g)
555	eor	x28,x27,x20			// a^b, b^c in next round
556	eor	x16,x16,x23,ror#41	// Sigma1(e)
557	eor	x12,x12,x27,ror#34
558	add	x26,x26,x17			// h+=Ch(e,f,g)
559	and	x19,x19,x28			// (b^c)&=(a^b)
560	eor	x10,x10,x3,ror#61
561	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
562	add	x26,x26,x16			// h+=Sigma1(e)
563	eor	x19,x19,x20			// Maj(a,b,c)
564	eor	x17,x12,x27,ror#39	// Sigma0(a)
565	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
566	add	x5,x5,x14
567	add	x22,x22,x26			// d+=h
568	add	x26,x26,x19			// h+=Maj(a,b,c)
569	ldr	x19,[x30],#8		// *K++, x28 in next round
570	add	x5,x5,x11
571	add	x26,x26,x17			// h+=Sigma0(a)
572	add	x5,x5,x10
573	ldr	x10,[sp,#24]
574	str	x13,[sp,#16]
575	ror	x16,x22,#14
576	add	x25,x25,x19			// h+=K[i]
577	ror	x12,x7,#1
578	and	x17,x23,x22
579	ror	x11,x4,#19
580	bic	x19,x24,x22
581	ror	x13,x26,#28
582	add	x25,x25,x5			// h+=X[i]
583	eor	x16,x16,x22,ror#18
584	eor	x12,x12,x7,ror#8
585	orr	x17,x17,x19			// Ch(e,f,g)
586	eor	x19,x26,x27			// a^b, b^c in next round
587	eor	x16,x16,x22,ror#41	// Sigma1(e)
588	eor	x13,x13,x26,ror#34
589	add	x25,x25,x17			// h+=Ch(e,f,g)
590	and	x28,x28,x19			// (b^c)&=(a^b)
591	eor	x11,x11,x4,ror#61
592	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
593	add	x25,x25,x16			// h+=Sigma1(e)
594	eor	x28,x28,x27			// Maj(a,b,c)
595	eor	x17,x13,x26,ror#39	// Sigma0(a)
596	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
597	add	x6,x6,x15
598	add	x21,x21,x25			// d+=h
599	add	x25,x25,x28			// h+=Maj(a,b,c)
600	ldr	x28,[x30],#8		// *K++, x19 in next round
601	add	x6,x6,x12
602	add	x25,x25,x17			// h+=Sigma0(a)
603	add	x6,x6,x11
604	ldr	x11,[sp,#0]
605	str	x14,[sp,#24]
606	ror	x16,x21,#14
607	add	x24,x24,x28			// h+=K[i]
608	ror	x13,x8,#1
609	and	x17,x22,x21
610	ror	x12,x5,#19
611	bic	x28,x23,x21
612	ror	x14,x25,#28
613	add	x24,x24,x6			// h+=X[i]
614	eor	x16,x16,x21,ror#18
615	eor	x13,x13,x8,ror#8
616	orr	x17,x17,x28			// Ch(e,f,g)
617	eor	x28,x25,x26			// a^b, b^c in next round
618	eor	x16,x16,x21,ror#41	// Sigma1(e)
619	eor	x14,x14,x25,ror#34
620	add	x24,x24,x17			// h+=Ch(e,f,g)
621	and	x19,x19,x28			// (b^c)&=(a^b)
622	eor	x12,x12,x5,ror#61
623	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
624	add	x24,x24,x16			// h+=Sigma1(e)
625	eor	x19,x19,x26			// Maj(a,b,c)
626	eor	x17,x14,x25,ror#39	// Sigma0(a)
627	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
628	add	x7,x7,x0
629	add	x20,x20,x24			// d+=h
630	add	x24,x24,x19			// h+=Maj(a,b,c)
631	ldr	x19,[x30],#8		// *K++, x28 in next round
632	add	x7,x7,x13
633	add	x24,x24,x17			// h+=Sigma0(a)
634	add	x7,x7,x12
635	ldr	x12,[sp,#8]
636	str	x15,[sp,#0]
637	ror	x16,x20,#14
638	add	x23,x23,x19			// h+=K[i]
639	ror	x14,x9,#1
640	and	x17,x21,x20
641	ror	x13,x6,#19
642	bic	x19,x22,x20
643	ror	x15,x24,#28
644	add	x23,x23,x7			// h+=X[i]
645	eor	x16,x16,x20,ror#18
646	eor	x14,x14,x9,ror#8
647	orr	x17,x17,x19			// Ch(e,f,g)
648	eor	x19,x24,x25			// a^b, b^c in next round
649	eor	x16,x16,x20,ror#41	// Sigma1(e)
650	eor	x15,x15,x24,ror#34
651	add	x23,x23,x17			// h+=Ch(e,f,g)
652	and	x28,x28,x19			// (b^c)&=(a^b)
653	eor	x13,x13,x6,ror#61
654	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
655	add	x23,x23,x16			// h+=Sigma1(e)
656	eor	x28,x28,x25			// Maj(a,b,c)
657	eor	x17,x15,x24,ror#39	// Sigma0(a)
658	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
659	add	x8,x8,x1
660	add	x27,x27,x23			// d+=h
661	add	x23,x23,x28			// h+=Maj(a,b,c)
662	ldr	x28,[x30],#8		// *K++, x19 in next round
663	add	x8,x8,x14
664	add	x23,x23,x17			// h+=Sigma0(a)
665	add	x8,x8,x13
666	ldr	x13,[sp,#16]
667	str	x0,[sp,#8]
668	ror	x16,x27,#14
669	add	x22,x22,x28			// h+=K[i]
670	ror	x15,x10,#1
671	and	x17,x20,x27
672	ror	x14,x7,#19
673	bic	x28,x21,x27
674	ror	x0,x23,#28
675	add	x22,x22,x8			// h+=X[i]
676	eor	x16,x16,x27,ror#18
677	eor	x15,x15,x10,ror#8
678	orr	x17,x17,x28			// Ch(e,f,g)
679	eor	x28,x23,x24			// a^b, b^c in next round
680	eor	x16,x16,x27,ror#41	// Sigma1(e)
681	eor	x0,x0,x23,ror#34
682	add	x22,x22,x17			// h+=Ch(e,f,g)
683	and	x19,x19,x28			// (b^c)&=(a^b)
684	eor	x14,x14,x7,ror#61
685	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
686	add	x22,x22,x16			// h+=Sigma1(e)
687	eor	x19,x19,x24			// Maj(a,b,c)
688	eor	x17,x0,x23,ror#39	// Sigma0(a)
689	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
690	add	x9,x9,x2
691	add	x26,x26,x22			// d+=h
692	add	x22,x22,x19			// h+=Maj(a,b,c)
693	ldr	x19,[x30],#8		// *K++, x28 in next round
694	add	x9,x9,x15
695	add	x22,x22,x17			// h+=Sigma0(a)
696	add	x9,x9,x14
697	ldr	x14,[sp,#24]
698	str	x1,[sp,#16]
699	ror	x16,x26,#14
700	add	x21,x21,x19			// h+=K[i]
701	ror	x0,x11,#1
702	and	x17,x27,x26
703	ror	x15,x8,#19
704	bic	x19,x20,x26
705	ror	x1,x22,#28
706	add	x21,x21,x9			// h+=X[i]
707	eor	x16,x16,x26,ror#18
708	eor	x0,x0,x11,ror#8
709	orr	x17,x17,x19			// Ch(e,f,g)
710	eor	x19,x22,x23			// a^b, b^c in next round
711	eor	x16,x16,x26,ror#41	// Sigma1(e)
712	eor	x1,x1,x22,ror#34
713	add	x21,x21,x17			// h+=Ch(e,f,g)
714	and	x28,x28,x19			// (b^c)&=(a^b)
715	eor	x15,x15,x8,ror#61
716	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
717	add	x21,x21,x16			// h+=Sigma1(e)
718	eor	x28,x28,x23			// Maj(a,b,c)
719	eor	x17,x1,x22,ror#39	// Sigma0(a)
720	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
721	add	x10,x10,x3
722	add	x25,x25,x21			// d+=h
723	add	x21,x21,x28			// h+=Maj(a,b,c)
724	ldr	x28,[x30],#8		// *K++, x19 in next round
725	add	x10,x10,x0
726	add	x21,x21,x17			// h+=Sigma0(a)
727	add	x10,x10,x15
728	ldr	x15,[sp,#0]
729	str	x2,[sp,#24]
730	ror	x16,x25,#14
731	add	x20,x20,x28			// h+=K[i]
732	ror	x1,x12,#1
733	and	x17,x26,x25
734	ror	x0,x9,#19
735	bic	x28,x27,x25
736	ror	x2,x21,#28
737	add	x20,x20,x10			// h+=X[i]
738	eor	x16,x16,x25,ror#18
739	eor	x1,x1,x12,ror#8
740	orr	x17,x17,x28			// Ch(e,f,g)
741	eor	x28,x21,x22			// a^b, b^c in next round
742	eor	x16,x16,x25,ror#41	// Sigma1(e)
743	eor	x2,x2,x21,ror#34
744	add	x20,x20,x17			// h+=Ch(e,f,g)
745	and	x19,x19,x28			// (b^c)&=(a^b)
746	eor	x0,x0,x9,ror#61
747	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
748	add	x20,x20,x16			// h+=Sigma1(e)
749	eor	x19,x19,x22			// Maj(a,b,c)
750	eor	x17,x2,x21,ror#39	// Sigma0(a)
751	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
752	add	x11,x11,x4
753	add	x24,x24,x20			// d+=h
754	add	x20,x20,x19			// h+=Maj(a,b,c)
755	ldr	x19,[x30],#8		// *K++, x28 in next round
756	add	x11,x11,x1
757	add	x20,x20,x17			// h+=Sigma0(a)
758	add	x11,x11,x0
759	ldr	x0,[sp,#8]
760	str	x3,[sp,#0]
761	ror	x16,x24,#14
762	add	x27,x27,x19			// h+=K[i]
763	ror	x2,x13,#1
764	and	x17,x25,x24
765	ror	x1,x10,#19
766	bic	x19,x26,x24
767	ror	x3,x20,#28
768	add	x27,x27,x11			// h+=X[i]
769	eor	x16,x16,x24,ror#18
770	eor	x2,x2,x13,ror#8
771	orr	x17,x17,x19			// Ch(e,f,g)
772	eor	x19,x20,x21			// a^b, b^c in next round
773	eor	x16,x16,x24,ror#41	// Sigma1(e)
774	eor	x3,x3,x20,ror#34
775	add	x27,x27,x17			// h+=Ch(e,f,g)
776	and	x28,x28,x19			// (b^c)&=(a^b)
777	eor	x1,x1,x10,ror#61
778	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
779	add	x27,x27,x16			// h+=Sigma1(e)
780	eor	x28,x28,x21			// Maj(a,b,c)
781	eor	x17,x3,x20,ror#39	// Sigma0(a)
782	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
783	add	x12,x12,x5
784	add	x23,x23,x27			// d+=h
785	add	x27,x27,x28			// h+=Maj(a,b,c)
786	ldr	x28,[x30],#8		// *K++, x19 in next round
787	add	x12,x12,x2
788	add	x27,x27,x17			// h+=Sigma0(a)
789	add	x12,x12,x1
790	ldr	x1,[sp,#16]
791	str	x4,[sp,#8]
792	ror	x16,x23,#14
793	add	x26,x26,x28			// h+=K[i]
794	ror	x3,x14,#1
795	and	x17,x24,x23
796	ror	x2,x11,#19
797	bic	x28,x25,x23
798	ror	x4,x27,#28
799	add	x26,x26,x12			// h+=X[i]
800	eor	x16,x16,x23,ror#18
801	eor	x3,x3,x14,ror#8
802	orr	x17,x17,x28			// Ch(e,f,g)
803	eor	x28,x27,x20			// a^b, b^c in next round
804	eor	x16,x16,x23,ror#41	// Sigma1(e)
805	eor	x4,x4,x27,ror#34
806	add	x26,x26,x17			// h+=Ch(e,f,g)
807	and	x19,x19,x28			// (b^c)&=(a^b)
808	eor	x2,x2,x11,ror#61
809	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
810	add	x26,x26,x16			// h+=Sigma1(e)
811	eor	x19,x19,x20			// Maj(a,b,c)
812	eor	x17,x4,x27,ror#39	// Sigma0(a)
813	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
814	add	x13,x13,x6
815	add	x22,x22,x26			// d+=h
816	add	x26,x26,x19			// h+=Maj(a,b,c)
817	ldr	x19,[x30],#8		// *K++, x28 in next round
818	add	x13,x13,x3
819	add	x26,x26,x17			// h+=Sigma0(a)
820	add	x13,x13,x2
821	ldr	x2,[sp,#24]
822	str	x5,[sp,#16]
823	ror	x16,x22,#14
824	add	x25,x25,x19			// h+=K[i]
825	ror	x4,x15,#1
826	and	x17,x23,x22
827	ror	x3,x12,#19
828	bic	x19,x24,x22
829	ror	x5,x26,#28
830	add	x25,x25,x13			// h+=X[i]
831	eor	x16,x16,x22,ror#18
832	eor	x4,x4,x15,ror#8
833	orr	x17,x17,x19			// Ch(e,f,g)
834	eor	x19,x26,x27			// a^b, b^c in next round
835	eor	x16,x16,x22,ror#41	// Sigma1(e)
836	eor	x5,x5,x26,ror#34
837	add	x25,x25,x17			// h+=Ch(e,f,g)
838	and	x28,x28,x19			// (b^c)&=(a^b)
839	eor	x3,x3,x12,ror#61
840	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
841	add	x25,x25,x16			// h+=Sigma1(e)
842	eor	x28,x28,x27			// Maj(a,b,c)
843	eor	x17,x5,x26,ror#39	// Sigma0(a)
844	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
845	add	x14,x14,x7
846	add	x21,x21,x25			// d+=h
847	add	x25,x25,x28			// h+=Maj(a,b,c)
848	ldr	x28,[x30],#8		// *K++, x19 in next round
849	add	x14,x14,x4
850	add	x25,x25,x17			// h+=Sigma0(a)
851	add	x14,x14,x3
852	ldr	x3,[sp,#0]
853	str	x6,[sp,#24]
854	ror	x16,x21,#14
855	add	x24,x24,x28			// h+=K[i]
856	ror	x5,x0,#1
857	and	x17,x22,x21
858	ror	x4,x13,#19
859	bic	x28,x23,x21
860	ror	x6,x25,#28
861	add	x24,x24,x14			// h+=X[i]
862	eor	x16,x16,x21,ror#18
863	eor	x5,x5,x0,ror#8
864	orr	x17,x17,x28			// Ch(e,f,g)
865	eor	x28,x25,x26			// a^b, b^c in next round
866	eor	x16,x16,x21,ror#41	// Sigma1(e)
867	eor	x6,x6,x25,ror#34
868	add	x24,x24,x17			// h+=Ch(e,f,g)
869	and	x19,x19,x28			// (b^c)&=(a^b)
870	eor	x4,x4,x13,ror#61
871	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
872	add	x24,x24,x16			// h+=Sigma1(e)
873	eor	x19,x19,x26			// Maj(a,b,c)
874	eor	x17,x6,x25,ror#39	// Sigma0(a)
875	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
876	add	x15,x15,x8
877	add	x20,x20,x24			// d+=h
878	add	x24,x24,x19			// h+=Maj(a,b,c)
879	ldr	x19,[x30],#8		// *K++, x28 in next round
880	add	x15,x15,x5
881	add	x24,x24,x17			// h+=Sigma0(a)
882	add	x15,x15,x4
883	ldr	x4,[sp,#8]
884	str	x7,[sp,#0]
885	ror	x16,x20,#14
886	add	x23,x23,x19			// h+=K[i]
887	ror	x6,x1,#1
888	and	x17,x21,x20
889	ror	x5,x14,#19
890	bic	x19,x22,x20
891	ror	x7,x24,#28
892	add	x23,x23,x15			// h+=X[i]
893	eor	x16,x16,x20,ror#18
894	eor	x6,x6,x1,ror#8
895	orr	x17,x17,x19			// Ch(e,f,g)
896	eor	x19,x24,x25			// a^b, b^c in next round
897	eor	x16,x16,x20,ror#41	// Sigma1(e)
898	eor	x7,x7,x24,ror#34
899	add	x23,x23,x17			// h+=Ch(e,f,g)
900	and	x28,x28,x19			// (b^c)&=(a^b)
901	eor	x5,x5,x14,ror#61
902	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
903	add	x23,x23,x16			// h+=Sigma1(e)
904	eor	x28,x28,x25			// Maj(a,b,c)
905	eor	x17,x7,x24,ror#39	// Sigma0(a)
906	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
907	add	x0,x0,x9
908	add	x27,x27,x23			// d+=h
909	add	x23,x23,x28			// h+=Maj(a,b,c)
910	ldr	x28,[x30],#8		// *K++, x19 in next round
911	add	x0,x0,x6
912	add	x23,x23,x17			// h+=Sigma0(a)
913	add	x0,x0,x5
914	ldr	x5,[sp,#16]
915	str	x8,[sp,#8]
916	ror	x16,x27,#14
917	add	x22,x22,x28			// h+=K[i]
918	ror	x7,x2,#1
919	and	x17,x20,x27
920	ror	x6,x15,#19
921	bic	x28,x21,x27
922	ror	x8,x23,#28
923	add	x22,x22,x0			// h+=X[i]
924	eor	x16,x16,x27,ror#18
925	eor	x7,x7,x2,ror#8
926	orr	x17,x17,x28			// Ch(e,f,g)
927	eor	x28,x23,x24			// a^b, b^c in next round
928	eor	x16,x16,x27,ror#41	// Sigma1(e)
929	eor	x8,x8,x23,ror#34
930	add	x22,x22,x17			// h+=Ch(e,f,g)
931	and	x19,x19,x28			// (b^c)&=(a^b)
932	eor	x6,x6,x15,ror#61
933	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
934	add	x22,x22,x16			// h+=Sigma1(e)
935	eor	x19,x19,x24			// Maj(a,b,c)
936	eor	x17,x8,x23,ror#39	// Sigma0(a)
937	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
938	add	x1,x1,x10
939	add	x26,x26,x22			// d+=h
940	add	x22,x22,x19			// h+=Maj(a,b,c)
941	ldr	x19,[x30],#8		// *K++, x28 in next round
942	add	x1,x1,x7
943	add	x22,x22,x17			// h+=Sigma0(a)
944	add	x1,x1,x6
945	ldr	x6,[sp,#24]
946	str	x9,[sp,#16]
947	ror	x16,x26,#14
948	add	x21,x21,x19			// h+=K[i]
949	ror	x8,x3,#1
950	and	x17,x27,x26
951	ror	x7,x0,#19
952	bic	x19,x20,x26
953	ror	x9,x22,#28
954	add	x21,x21,x1			// h+=X[i]
955	eor	x16,x16,x26,ror#18
956	eor	x8,x8,x3,ror#8
957	orr	x17,x17,x19			// Ch(e,f,g)
958	eor	x19,x22,x23			// a^b, b^c in next round
959	eor	x16,x16,x26,ror#41	// Sigma1(e)
960	eor	x9,x9,x22,ror#34
961	add	x21,x21,x17			// h+=Ch(e,f,g)
962	and	x28,x28,x19			// (b^c)&=(a^b)
963	eor	x7,x7,x0,ror#61
964	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
965	add	x21,x21,x16			// h+=Sigma1(e)
966	eor	x28,x28,x23			// Maj(a,b,c)
967	eor	x17,x9,x22,ror#39	// Sigma0(a)
968	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
969	add	x2,x2,x11
970	add	x25,x25,x21			// d+=h
971	add	x21,x21,x28			// h+=Maj(a,b,c)
972	ldr	x28,[x30],#8		// *K++, x19 in next round
973	add	x2,x2,x8
974	add	x21,x21,x17			// h+=Sigma0(a)
975	add	x2,x2,x7
976	ldr	x7,[sp,#0]
977	str	x10,[sp,#24]
978	ror	x16,x25,#14
979	add	x20,x20,x28			// h+=K[i]
980	ror	x9,x4,#1
981	and	x17,x26,x25
982	ror	x8,x1,#19
983	bic	x28,x27,x25
984	ror	x10,x21,#28
985	add	x20,x20,x2			// h+=X[i]
986	eor	x16,x16,x25,ror#18
987	eor	x9,x9,x4,ror#8
988	orr	x17,x17,x28			// Ch(e,f,g)
989	eor	x28,x21,x22			// a^b, b^c in next round
990	eor	x16,x16,x25,ror#41	// Sigma1(e)
991	eor	x10,x10,x21,ror#34
992	add	x20,x20,x17			// h+=Ch(e,f,g)
993	and	x19,x19,x28			// (b^c)&=(a^b)
994	eor	x8,x8,x1,ror#61
995	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
996	add	x20,x20,x16			// h+=Sigma1(e)
997	eor	x19,x19,x22			// Maj(a,b,c)
998	eor	x17,x10,x21,ror#39	// Sigma0(a)
999	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
1000	add	x3,x3,x12
1001	add	x24,x24,x20			// d+=h
1002	add	x20,x20,x19			// h+=Maj(a,b,c)
1003	ldr	x19,[x30],#8		// *K++, x28 in next round
1004	add	x3,x3,x9
1005	add	x20,x20,x17			// h+=Sigma0(a)
1006	add	x3,x3,x8
// x19 holds the table entry just fetched through x30; another 16-round pass
// is made unless that entry is zero.
// NOTE(review): this relies on a zero quadword following the round constants
// in .LK512 -- the end of the table is outside this section, confirm.
1007	cbnz	x19,.Loop_16_xx
1008
// Block finished: recover the state pointer (x0), end-of-input pointer (x2)
// and saved input pointer (x1) from the frame, since all three registers were
// used for message words.
1009	ldp	x0,x2,[x29,#96]
1010	ldr	x1,[x29,#112]
// 648 bytes = 81 quadwords: moves x30 back over everything read from the
// table for this block (presumably 80 constants plus the zero terminator),
// so it again points at .LK512.
1011	sub	x30,x30,#648		// rewind
1012
// Add the previous state words to the working variables and store the sums
// back as the new state; x20-x27 keep the sums for the next block.
1013	ldp	x3,x4,[x0]
1014	ldp	x5,x6,[x0,#2*8]
// x1 was saved after the first 16 bytes of the block had been consumed, so
// adding 14*8 = 112 bytes moves it to the start of the next 128-byte block.
1015	add	x1,x1,#14*8			// advance input pointer
1016	ldp	x7,x8,[x0,#4*8]
1017	add	x20,x20,x3
1018	ldp	x9,x10,[x0,#6*8]
1019	add	x21,x21,x4
1020	add	x22,x22,x5
1021	add	x23,x23,x6
1022	stp	x20,x21,[x0]
1023	add	x24,x24,x7
1024	add	x25,x25,x8
1025	stp	x22,x23,[x0,#2*8]
1026	add	x26,x26,x9
1027	add	x27,x27,x10
// Bottom-tested loop: continue while the input pointer differs from the end
// pointer (the stp instructions in between do not change the flags).
// NOTE(review): because the test is "not equal" and comes after the body, a
// block count of 0 on entry would not stop at the end pointer; callers
// presumably always pass at least one block -- confirm.
1028	cmp	x1,x2
1029	stp	x24,x25,[x0,#4*8]
1030	stp	x26,x27,[x0,#6*8]
1031	b.ne	.Loop
1032
// Epilogue: restore x19-x28 from the frame (addressed through x29), release
// the 32-byte scratch area, then pop the 128-byte frame restoring x29/x30.
1033	ldp	x19,x20,[x29,#16]
1034	add	sp,sp,#4*8
1035	ldp	x21,x22,[x29,#32]
1036	ldp	x23,x24,[x29,#48]
1037	ldp	x25,x26,[x29,#64]
1038	ldp	x27,x28,[x29,#80]
1039	ldp	x29,x30,[sp],#128
1040.inst	0xd50323bf				// autiasp
1041	ret
1042.size	sha512_block_data_order,.-sha512_block_data_order
1043
// .LK512: the 80 SHA-512 round constants as 64-bit words (40 lines of
// two .quad values each), followed by a single zero quadword.
// The scalar code above walks the table with a post-incremented pointer
// and leaves its inner loop when the loaded word is zero (cbnz on the
// loaded value), so the terminator must stay directly after the last
// constant; it then rewinds by 648 bytes (80*8 plus the terminator).
// sha512_block_armv8 reads the table 16 bytes at a time and rewinds by
// 80*8 bytes without touching the terminator.
1044.align	6
1045.type	.LK512,%object
1046.LK512:
1047.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1048.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1049.quad	0x3956c25bf348b538,0x59f111f1b605d019
1050.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1051.quad	0xd807aa98a3030242,0x12835b0145706fbe
1052.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1053.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1054.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1055.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1056.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1057.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1058.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1059.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1060.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1061.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1062.quad	0x06ca6351e003826f,0x142929670a0e6e70
1063.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1064.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1065.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1066.quad	0x81c2c92e47edaee6,0x92722c851482353b
1067.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1068.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1069.quad	0xd192e819d6ef5218,0xd69906245565a910
1070.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1071.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1072.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1073.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1074.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1075.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1076.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1077.quad	0x90befffa23631e28,0xa4506cebde82bde9
1078.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1079.quad	0xca273eceea26619c,0xd186b8c721c0c207
1080.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1081.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1082.quad	0x113f9804bef90dae,0x1b710b35131c471b
1083.quad	0x28db77f523047d84,0x32caab7b40c72493
1084.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1085.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1086.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1087.quad	0	// terminator
1088.size	.LK512,.-.LK512
// .LOPENSSL_armcap_P holds the distance from this word to the
// OPENSSL_armcap_P symbol (symbol address minus the address of the word
// itself), stored as 32 bits under __ILP32__ and 64 bits otherwise.
// Not emitted when __KERNEL__ is defined.
// NOTE(review): presumably read by the run-time capability dispatch at
// the entry of sha512_block_data_order, which is outside this section
// -- confirm there.
1089#ifndef	__KERNEL__
1090.align	3
1091.LOPENSSL_armcap_P:
1092# ifdef	__ILP32__
1093.long	OPENSSL_armcap_P-.
1094# else
1095.quad	OPENSSL_armcap_P-.
1096# endif
1097#endif
// NUL-terminated ASCII identification string:
// "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
1098.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to 4 bytes after the odd-length string (the directive is
// emitted twice; the second one has no further effect).
1099.align	2
1100.align	2
1101#ifndef	__KERNEL__
// ---------------------------------------------------------------------
// sha512_block_armv8 -- SHA-512 block transform using the AArch64
// SHA512 instructions (sha512h, sha512h2, sha512su0, sha512su1), which
// are emitted as raw .inst words with the mnemonic in the comment.
// Only assembled when __KERNEL__ is not defined.
//
// In:    x0 = context: 64 bytes, loaded into v0-v3 on entry and
//             written back on exit
//        x1 = input: 128 bytes are consumed per loop iteration, each
//             64-bit lane byte-reversed with rev64 before use
//        x2 = number of 128-byte blocks; it is decremented before it
//             is tested, so it is presumably required to be non-zero
//             -- TODO confirm against callers
// Uses:  x3 = cursor into .LK512, x4 = scratch
//        v0-v4   rotating working state
//        v5-v7   per-step temporaries
//        v16-v23 message schedule (two 64-bit words per register)
//        v24,v25 round constants plus message words
//        v26-v29 copy of the state at the start of the block
// Flags are modified (subs).  Nothing is returned in registers.
//
// .Lv8_entry is a second label on the same address.
// NOTE(review): presumably the branch target used by run-time dispatch
// in sha512_block_data_order, outside this section -- confirm there.
// ---------------------------------------------------------------------
1102.type	sha512_block_armv8,%function
1103.align	6
1104sha512_block_armv8:
1105.Lv8_entry:
1106	stp	x29,x30,[sp,#-16]!
1107	add	x29,sp,#0
1108
1109	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1110	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1111
1112	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1113	adr	x3,.LK512	// x3 = start of the round-constant table
1114
// Byte-reverse every 64-bit lane of the first block.
1115	rev64	v16.16b,v16.16b
1116	rev64	v17.16b,v17.16b
1117	rev64	v18.16b,v18.16b
1118	rev64	v19.16b,v19.16b
1119	rev64	v20.16b,v20.16b
1120	rev64	v21.16b,v21.16b
1121	rev64	v22.16b,v22.16b
1122	rev64	v23.16b,v23.16b
1123	b	.Loop_hw	// jump over any padding emitted by .align
1124
1125.align	4
// One iteration per 128-byte block.  On entry to each iteration v16-v23
// already hold the byte-reversed block and x1 points just past it.
// The body is 40 unrolled steps; each step loads 16 bytes (two
// constants) from .LK512 and therefore covers two of the 80 rounds.
1126.Loop_hw:
1127	ld1	{v24.2d},[x3],#16
1128	subs	x2,x2,#1	// Z is set when this is the last block
1129	sub	x4,x1,#128	// x4 = start of the block now in v16-v23
1130	orr	v26.16b,v0.16b,v0.16b			// offload
1131	orr	v27.16b,v1.16b,v1.16b
1132	orr	v28.16b,v2.16b,v2.16b
1133	orr	v29.16b,v3.16b,v3.16b
// On the last block x1 is moved back to the current block, so the
// "load next input" loads near the end of the loop re-read this block
// instead of reading beyond it.
1134	csel	x1,x1,x4,ne			// conditional rewind
// Steps 1-8 (rounds 0-15).  Steps 1-32 all have the same shape: one of
// v16-v23 is added to the constants, and sha512su0/sha512su1 update
// that same register in place for its next use eight steps later.
1135	add	v24.2d,v24.2d,v16.2d
1136	ld1	{v25.2d},[x3],#16
1137	ext	v24.16b,v24.16b,v24.16b,#8
1138	ext	v5.16b,v2.16b,v3.16b,#8
1139	ext	v6.16b,v1.16b,v2.16b,#8
1140	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1141.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1142	ext	v7.16b,v20.16b,v21.16b,#8
1143.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1144.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1145	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1146.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1147	add	v25.2d,v25.2d,v17.2d
1148	ld1	{v24.2d},[x3],#16
1149	ext	v25.16b,v25.16b,v25.16b,#8
1150	ext	v5.16b,v4.16b,v2.16b,#8
1151	ext	v6.16b,v0.16b,v4.16b,#8
1152	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1153.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1154	ext	v7.16b,v21.16b,v22.16b,#8
1155.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1156.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1157	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1158.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1159	add	v24.2d,v24.2d,v18.2d
1160	ld1	{v25.2d},[x3],#16
1161	ext	v24.16b,v24.16b,v24.16b,#8
1162	ext	v5.16b,v1.16b,v4.16b,#8
1163	ext	v6.16b,v3.16b,v1.16b,#8
1164	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1165.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1166	ext	v7.16b,v22.16b,v23.16b,#8
1167.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1168.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1169	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1170.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1171	add	v25.2d,v25.2d,v19.2d
1172	ld1	{v24.2d},[x3],#16
1173	ext	v25.16b,v25.16b,v25.16b,#8
1174	ext	v5.16b,v0.16b,v1.16b,#8
1175	ext	v6.16b,v2.16b,v0.16b,#8
1176	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1177.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1178	ext	v7.16b,v23.16b,v16.16b,#8
1179.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1180.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1181	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1182.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1183	add	v24.2d,v24.2d,v20.2d
1184	ld1	{v25.2d},[x3],#16
1185	ext	v24.16b,v24.16b,v24.16b,#8
1186	ext	v5.16b,v3.16b,v0.16b,#8
1187	ext	v6.16b,v4.16b,v3.16b,#8
1188	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1189.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1190	ext	v7.16b,v16.16b,v17.16b,#8
1191.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1192.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1193	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1194.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1195	add	v25.2d,v25.2d,v21.2d
1196	ld1	{v24.2d},[x3],#16
1197	ext	v25.16b,v25.16b,v25.16b,#8
1198	ext	v5.16b,v2.16b,v3.16b,#8
1199	ext	v6.16b,v1.16b,v2.16b,#8
1200	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1201.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1202	ext	v7.16b,v17.16b,v18.16b,#8
1203.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1204.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1205	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1206.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1207	add	v24.2d,v24.2d,v22.2d
1208	ld1	{v25.2d},[x3],#16
1209	ext	v24.16b,v24.16b,v24.16b,#8
1210	ext	v5.16b,v4.16b,v2.16b,#8
1211	ext	v6.16b,v0.16b,v4.16b,#8
1212	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1213.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1214	ext	v7.16b,v18.16b,v19.16b,#8
1215.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1216.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1217	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1218.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1219	add	v25.2d,v25.2d,v23.2d
1220	ld1	{v24.2d},[x3],#16
1221	ext	v25.16b,v25.16b,v25.16b,#8
1222	ext	v5.16b,v1.16b,v4.16b,#8
1223	ext	v6.16b,v3.16b,v1.16b,#8
1224	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1225.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1226	ext	v7.16b,v19.16b,v20.16b,#8
1227.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1228.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1229	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1230.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
// Steps 9-16 (rounds 16-31).
1231	add	v24.2d,v24.2d,v16.2d
1232	ld1	{v25.2d},[x3],#16
1233	ext	v24.16b,v24.16b,v24.16b,#8
1234	ext	v5.16b,v0.16b,v1.16b,#8
1235	ext	v6.16b,v2.16b,v0.16b,#8
1236	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1237.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1238	ext	v7.16b,v20.16b,v21.16b,#8
1239.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1240.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1241	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1242.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1243	add	v25.2d,v25.2d,v17.2d
1244	ld1	{v24.2d},[x3],#16
1245	ext	v25.16b,v25.16b,v25.16b,#8
1246	ext	v5.16b,v3.16b,v0.16b,#8
1247	ext	v6.16b,v4.16b,v3.16b,#8
1248	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1249.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1250	ext	v7.16b,v21.16b,v22.16b,#8
1251.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1252.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1253	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1254.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1255	add	v24.2d,v24.2d,v18.2d
1256	ld1	{v25.2d},[x3],#16
1257	ext	v24.16b,v24.16b,v24.16b,#8
1258	ext	v5.16b,v2.16b,v3.16b,#8
1259	ext	v6.16b,v1.16b,v2.16b,#8
1260	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1261.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1262	ext	v7.16b,v22.16b,v23.16b,#8
1263.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1264.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1265	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1266.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1267	add	v25.2d,v25.2d,v19.2d
1268	ld1	{v24.2d},[x3],#16
1269	ext	v25.16b,v25.16b,v25.16b,#8
1270	ext	v5.16b,v4.16b,v2.16b,#8
1271	ext	v6.16b,v0.16b,v4.16b,#8
1272	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1273.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1274	ext	v7.16b,v23.16b,v16.16b,#8
1275.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1276.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1277	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1278.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1279	add	v24.2d,v24.2d,v20.2d
1280	ld1	{v25.2d},[x3],#16
1281	ext	v24.16b,v24.16b,v24.16b,#8
1282	ext	v5.16b,v1.16b,v4.16b,#8
1283	ext	v6.16b,v3.16b,v1.16b,#8
1284	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1285.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1286	ext	v7.16b,v16.16b,v17.16b,#8
1287.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1288.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1289	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1290.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1291	add	v25.2d,v25.2d,v21.2d
1292	ld1	{v24.2d},[x3],#16
1293	ext	v25.16b,v25.16b,v25.16b,#8
1294	ext	v5.16b,v0.16b,v1.16b,#8
1295	ext	v6.16b,v2.16b,v0.16b,#8
1296	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1297.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1298	ext	v7.16b,v17.16b,v18.16b,#8
1299.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1300.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1301	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1302.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1303	add	v24.2d,v24.2d,v22.2d
1304	ld1	{v25.2d},[x3],#16
1305	ext	v24.16b,v24.16b,v24.16b,#8
1306	ext	v5.16b,v3.16b,v0.16b,#8
1307	ext	v6.16b,v4.16b,v3.16b,#8
1308	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1309.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1310	ext	v7.16b,v18.16b,v19.16b,#8
1311.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1312.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1313	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1314.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1315	add	v25.2d,v25.2d,v23.2d
1316	ld1	{v24.2d},[x3],#16
1317	ext	v25.16b,v25.16b,v25.16b,#8
1318	ext	v5.16b,v2.16b,v3.16b,#8
1319	ext	v6.16b,v1.16b,v2.16b,#8
1320	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1321.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1322	ext	v7.16b,v19.16b,v20.16b,#8
1323.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1324.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1325	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1326.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
// Steps 17-24 (rounds 32-47).
1327	add	v24.2d,v24.2d,v16.2d
1328	ld1	{v25.2d},[x3],#16
1329	ext	v24.16b,v24.16b,v24.16b,#8
1330	ext	v5.16b,v4.16b,v2.16b,#8
1331	ext	v6.16b,v0.16b,v4.16b,#8
1332	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1333.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1334	ext	v7.16b,v20.16b,v21.16b,#8
1335.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1336.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1337	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1338.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1339	add	v25.2d,v25.2d,v17.2d
1340	ld1	{v24.2d},[x3],#16
1341	ext	v25.16b,v25.16b,v25.16b,#8
1342	ext	v5.16b,v1.16b,v4.16b,#8
1343	ext	v6.16b,v3.16b,v1.16b,#8
1344	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1345.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1346	ext	v7.16b,v21.16b,v22.16b,#8
1347.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1348.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1349	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1350.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1351	add	v24.2d,v24.2d,v18.2d
1352	ld1	{v25.2d},[x3],#16
1353	ext	v24.16b,v24.16b,v24.16b,#8
1354	ext	v5.16b,v0.16b,v1.16b,#8
1355	ext	v6.16b,v2.16b,v0.16b,#8
1356	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1357.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1358	ext	v7.16b,v22.16b,v23.16b,#8
1359.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1360.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1361	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1362.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1363	add	v25.2d,v25.2d,v19.2d
1364	ld1	{v24.2d},[x3],#16
1365	ext	v25.16b,v25.16b,v25.16b,#8
1366	ext	v5.16b,v3.16b,v0.16b,#8
1367	ext	v6.16b,v4.16b,v3.16b,#8
1368	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1369.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1370	ext	v7.16b,v23.16b,v16.16b,#8
1371.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1372.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1373	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1374.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1375	add	v24.2d,v24.2d,v20.2d
1376	ld1	{v25.2d},[x3],#16
1377	ext	v24.16b,v24.16b,v24.16b,#8
1378	ext	v5.16b,v2.16b,v3.16b,#8
1379	ext	v6.16b,v1.16b,v2.16b,#8
1380	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1381.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1382	ext	v7.16b,v16.16b,v17.16b,#8
1383.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1384.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1385	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1386.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1387	add	v25.2d,v25.2d,v21.2d
1388	ld1	{v24.2d},[x3],#16
1389	ext	v25.16b,v25.16b,v25.16b,#8
1390	ext	v5.16b,v4.16b,v2.16b,#8
1391	ext	v6.16b,v0.16b,v4.16b,#8
1392	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1393.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1394	ext	v7.16b,v17.16b,v18.16b,#8
1395.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1396.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1397	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1398.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1399	add	v24.2d,v24.2d,v22.2d
1400	ld1	{v25.2d},[x3],#16
1401	ext	v24.16b,v24.16b,v24.16b,#8
1402	ext	v5.16b,v1.16b,v4.16b,#8
1403	ext	v6.16b,v3.16b,v1.16b,#8
1404	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1405.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1406	ext	v7.16b,v18.16b,v19.16b,#8
1407.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1408.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1409	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1410.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1411	add	v25.2d,v25.2d,v23.2d
1412	ld1	{v24.2d},[x3],#16
1413	ext	v25.16b,v25.16b,v25.16b,#8
1414	ext	v5.16b,v0.16b,v1.16b,#8
1415	ext	v6.16b,v2.16b,v0.16b,#8
1416	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1417.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1418	ext	v7.16b,v19.16b,v20.16b,#8
1419.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1420.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1421	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1422.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// Steps 25-32 (rounds 48-63): the last steps that update the schedule.
1423	add	v24.2d,v24.2d,v16.2d
1424	ld1	{v25.2d},[x3],#16
1425	ext	v24.16b,v24.16b,v24.16b,#8
1426	ext	v5.16b,v3.16b,v0.16b,#8
1427	ext	v6.16b,v4.16b,v3.16b,#8
1428	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1429.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1430	ext	v7.16b,v20.16b,v21.16b,#8
1431.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1432.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1433	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1434.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1435	add	v25.2d,v25.2d,v17.2d
1436	ld1	{v24.2d},[x3],#16
1437	ext	v25.16b,v25.16b,v25.16b,#8
1438	ext	v5.16b,v2.16b,v3.16b,#8
1439	ext	v6.16b,v1.16b,v2.16b,#8
1440	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1441.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1442	ext	v7.16b,v21.16b,v22.16b,#8
1443.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1444.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1445	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1446.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1447	add	v24.2d,v24.2d,v18.2d
1448	ld1	{v25.2d},[x3],#16
1449	ext	v24.16b,v24.16b,v24.16b,#8
1450	ext	v5.16b,v4.16b,v2.16b,#8
1451	ext	v6.16b,v0.16b,v4.16b,#8
1452	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1453.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1454	ext	v7.16b,v22.16b,v23.16b,#8
1455.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1456.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1457	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1458.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1459	add	v25.2d,v25.2d,v19.2d
1460	ld1	{v24.2d},[x3],#16
1461	ext	v25.16b,v25.16b,v25.16b,#8
1462	ext	v5.16b,v1.16b,v4.16b,#8
1463	ext	v6.16b,v3.16b,v1.16b,#8
1464	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1465.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1466	ext	v7.16b,v23.16b,v16.16b,#8
1467.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1468.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1469	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1470.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1471	add	v24.2d,v24.2d,v20.2d
1472	ld1	{v25.2d},[x3],#16
1473	ext	v24.16b,v24.16b,v24.16b,#8
1474	ext	v5.16b,v0.16b,v1.16b,#8
1475	ext	v6.16b,v2.16b,v0.16b,#8
1476	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1477.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1478	ext	v7.16b,v16.16b,v17.16b,#8
1479.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1480.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1481	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1482.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1483	add	v25.2d,v25.2d,v21.2d
1484	ld1	{v24.2d},[x3],#16
1485	ext	v25.16b,v25.16b,v25.16b,#8
1486	ext	v5.16b,v3.16b,v0.16b,#8
1487	ext	v6.16b,v4.16b,v3.16b,#8
1488	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1489.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1490	ext	v7.16b,v17.16b,v18.16b,#8
1491.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1492.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1493	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1494.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1495	add	v24.2d,v24.2d,v22.2d
1496	ld1	{v25.2d},[x3],#16
1497	ext	v24.16b,v24.16b,v24.16b,#8
1498	ext	v5.16b,v2.16b,v3.16b,#8
1499	ext	v6.16b,v1.16b,v2.16b,#8
1500	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1501.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1502	ext	v7.16b,v18.16b,v19.16b,#8
1503.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1504.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1505	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1506.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1507	add	v25.2d,v25.2d,v23.2d
1508	ld1	{v24.2d},[x3],#16
1509	ext	v25.16b,v25.16b,v25.16b,#8
1510	ext	v5.16b,v4.16b,v2.16b,#8
1511	ext	v6.16b,v0.16b,v4.16b,#8
1512	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1513.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1514	ext	v7.16b,v19.16b,v20.16b,#8
1515.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1516.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1517	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1518.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
// Steps 33-40 (rounds 64-79): no further schedule updates.  As soon as
// a schedule register has been added to its constants it is reloaded
// from [x1] and byte-reversed, so v16-v23 are ready for the next
// iteration (on the last block these loads re-read the current block,
// see the conditional rewind above).
1519	ld1	{v25.2d},[x3],#16
1520	add	v24.2d,v24.2d,v16.2d
1521	ld1	{v16.16b},[x1],#16		// load next input
1522	ext	v24.16b,v24.16b,v24.16b,#8
1523	ext	v5.16b,v1.16b,v4.16b,#8
1524	ext	v6.16b,v3.16b,v1.16b,#8
1525	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1526.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1527	rev64	v16.16b,v16.16b
1528	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1529.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1530	ld1	{v24.2d},[x3],#16
1531	add	v25.2d,v25.2d,v17.2d
1532	ld1	{v17.16b},[x1],#16		// load next input
1533	ext	v25.16b,v25.16b,v25.16b,#8
1534	ext	v5.16b,v0.16b,v1.16b,#8
1535	ext	v6.16b,v2.16b,v0.16b,#8
1536	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1537.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1538	rev64	v17.16b,v17.16b
1539	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1540.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1541	ld1	{v25.2d},[x3],#16
1542	add	v24.2d,v24.2d,v18.2d
1543	ld1	{v18.16b},[x1],#16		// load next input
1544	ext	v24.16b,v24.16b,v24.16b,#8
1545	ext	v5.16b,v3.16b,v0.16b,#8
1546	ext	v6.16b,v4.16b,v3.16b,#8
1547	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1548.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1549	rev64	v18.16b,v18.16b
1550	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1551.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1552	ld1	{v24.2d},[x3],#16
1553	add	v25.2d,v25.2d,v19.2d
1554	ld1	{v19.16b},[x1],#16		// load next input
1555	ext	v25.16b,v25.16b,v25.16b,#8
1556	ext	v5.16b,v2.16b,v3.16b,#8
1557	ext	v6.16b,v1.16b,v2.16b,#8
1558	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1559.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1560	rev64	v19.16b,v19.16b
1561	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1562.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1563	ld1	{v25.2d},[x3],#16
1564	add	v24.2d,v24.2d,v20.2d
1565	ld1	{v20.16b},[x1],#16		// load next input
1566	ext	v24.16b,v24.16b,v24.16b,#8
1567	ext	v5.16b,v4.16b,v2.16b,#8
1568	ext	v6.16b,v0.16b,v4.16b,#8
1569	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1570.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1571	rev64	v20.16b,v20.16b
1572	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1573.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1574	ld1	{v24.2d},[x3],#16
1575	add	v25.2d,v25.2d,v21.2d
1576	ld1	{v21.16b},[x1],#16		// load next input
1577	ext	v25.16b,v25.16b,v25.16b,#8
1578	ext	v5.16b,v1.16b,v4.16b,#8
1579	ext	v6.16b,v3.16b,v1.16b,#8
1580	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1581.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1582	rev64	v21.16b,v21.16b
1583	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1584.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1585	ld1	{v25.2d},[x3],#16
1586	add	v24.2d,v24.2d,v22.2d
1587	ld1	{v22.16b},[x1],#16		// load next input
1588	ext	v24.16b,v24.16b,v24.16b,#8
1589	ext	v5.16b,v0.16b,v1.16b,#8
1590	ext	v6.16b,v2.16b,v0.16b,#8
1591	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1592.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1593	rev64	v22.16b,v22.16b
1594	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1595.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
// All 40 constant pairs (80*8 bytes) have been read: point x3 back at
// the start of .LK512 for the next iteration.
1596	sub	x3,x3,#80*8	// rewind
1597	add	v25.2d,v25.2d,v23.2d
1598	ld1	{v23.16b},[x1],#16		// load next input
1599	ext	v25.16b,v25.16b,v25.16b,#8
1600	ext	v5.16b,v3.16b,v0.16b,#8
1601	ext	v6.16b,v4.16b,v3.16b,#8
1602	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1603.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1604	rev64	v23.16b,v23.16b
1605	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1606.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
// Add the state saved in v26-v29 at the top of the iteration.
1607	add	v0.2d,v0.2d,v26.2d			// accumulate
1608	add	v1.2d,v1.2d,v27.2d
1609	add	v2.2d,v2.2d,v28.2d
1610	add	v3.2d,v3.2d,v29.2d
1611
1612	cbnz	x2,.Loop_hw	// more blocks remain
1613
1614	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1615
1616	ldr	x29,[sp],#16	// restore x29 and pop the 16-byte frame; x30 is never written in this function
1617	ret
1618.size	sha512_block_armv8,.-sha512_block_armv8
1619#endif
// Declare OPENSSL_armcap_P as a common symbol (.comm with size 4 and
// alignment argument 4) so the reference from .LOPENSSL_armcap_P
// resolves even if no other object defines it.
// Skipped when __KERNEL__ is defined.
1620#ifndef	__KERNEL__
1621.comm	OPENSSL_armcap_P,4,4
1622#endif
1623