/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

// ====================================================================
// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// project. The module is, however, dual licensed under OpenSSL and
// CRYPTOGAMS licenses depending on where you obtain it. For further
// details see http://www.openssl.org/~appro/cryptogams/.
//
// Permission to use under GPLv2 terms is granted.
// ====================================================================
//
// SHA256/512 for ARMv8.
//
// Performance in cycles per processed byte and improvement coefficient
// over code generated with "default" compiler:
//
//		SHA256-hw	SHA256(*)	SHA512
// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
// Denver	2.01		10.5 (+26%)	6.70 (+8%)
// X-Gene			20.0 (+100%)	12.8 (+300%(***))
// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
// ThunderX2	2.54		13.2 (+40%)	8.40 (+18%)
//
// (*)	Software SHA256 results are of lesser relevance, presented
//	mostly for informational purposes.
// (**)	The result is a trade-off: it's possible to improve it by
//	10% (or by 1 cycle per round), but at the cost of 20% loss
//	on Cortex-A53 (or by 4 cycles per round).
// (***)	Super-impressive coefficients over gcc-generated code are
//	indication of some compiler "pathology", most notably code
//	generated with -mgeneral-regs-only is significantly faster
//	and the gap is only 40-90%.
//
// October 2016.
//
// Originally it was reckoned that it makes no sense to implement NEON
// version of SHA256 for 64-bit processors. This is because performance
// improvement on most wide-spread Cortex-A5x processors was observed
// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
// observed that 32-bit NEON SHA256 performs significantly better than
// 64-bit scalar version on *some* of the more recent processors. As
// result 64-bit NEON version of SHA256 was added to provide best
// all-round performance. For example it executes ~30% faster on X-Gene
// and Mongoose. [For reference, NEON version of SHA512 is bound to
// deliver much less improvement, likely *negative* on Cortex-A5x.
// Which is why NEON support is limited to SHA256.]

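// The scalar code below is the standard SHA-512 compression function
// (FIPS 180-4).  For reference, the round functions named in the
// instruction comments expand to:
//
//	Sigma0(a) = ror(a,28) ^ ror(a,34) ^ ror(a,39)
//	Sigma1(e) = ror(e,14) ^ ror(e,18) ^ ror(e,41)
//	sigma0(x) = ror(x,1)  ^ ror(x,8)  ^ (x>>7)
//	sigma1(x) = ror(x,19) ^ ror(x,61) ^ (x>>6)
//	Ch(e,f,g) = (e & f) | (~e & g)
//	Maj(a,b,c) = ((a^b) & (b^c)) ^ b	// the usual majority function
//
// The working variables a..h are kept in x20..x27 (loaded from the
// context pointed to by x0), x19/x28 alternate as the Ch/Maj scratch
// registers, and x30 steps through the .LK512 constant table.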
// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
#ifndef	__KERNEL__

.hidden	OPENSSL_armcap_P
#endif

.text

.globl	sha512_block_data_order
.type	sha512_block_data_order,%function
.align	6
sha512_block_data_order:
	AARCH64_VALID_CALL_TARGET
#ifndef	__KERNEL__
	adrp	x16,OPENSSL_armcap_P
	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
	tst	w16,#ARMV8_SHA512
	b.ne	.Lv8_entry
#endif
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0

	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#4*8
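	// Frame layout: x29/x30 sit at the frame base (x29), x19..x28 at
	// [x29,#16..#95], the context and end-of-input pointers are saved
	// at [x29,#96] and the input pointer at [x29,#112] below; the
	// extra 4*8 bytes carved out here serve as a rotating spill area
	// for message-schedule words (the str/ldr [sp,#0..#24] pairs in
	// the rounds below).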

	ldp	x20,x21,[x0]				// load context
	ldp	x22,x23,[x0,#2*8]
	ldp	x24,x25,[x0,#4*8]
	add	x2,x1,x2,lsl#7	// end of input
	ldp	x26,x27,[x0,#6*8]
	adr	x30,.LK512
	stp	x0,x2,[x29,#96]

.Loop:
	ldp	x3,x4,[x1],#2*8
	ldr	x19,[x30],#8			// *K++
	eor	x28,x21,x22				// magic seed
	str	x1,[x29,#112]
#ifndef	__AARCH64EB__
	rev	x3,x3			// 0
#endif
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	eor	x6,x24,x24,ror#23
	and	x17,x25,x24
	bic	x19,x26,x24
	add	x27,x27,x3			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x6,ror#18	// Sigma1(e)
	ror	x6,x20,#28
	add	x27,x27,x17			// h+=Ch(e,f,g)
	eor	x17,x20,x20,ror#5
	add	x27,x27,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x23,x23,x27			// d+=h
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x6,x17,ror#34	// Sigma0(a)
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x27,x27,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x4,x4			// 1
#endif
	ldp	x5,x6,[x1],#2*8
	add	x27,x27,x17			// h+=Sigma0(a)
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	eor	x7,x23,x23,ror#23
	and	x17,x24,x23
	bic	x28,x25,x23
	add	x26,x26,x4			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x7,ror#18	// Sigma1(e)
	ror	x7,x27,#28
	add	x26,x26,x17			// h+=Ch(e,f,g)
	eor	x17,x27,x27,ror#5
	add	x26,x26,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x22,x22,x26			// d+=h
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x7,x17,ror#34	// Sigma0(a)
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x26,x26,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x5,x5			// 2
#endif
	add	x26,x26,x17			// h+=Sigma0(a)
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	eor	x8,x22,x22,ror#23
	and	x17,x23,x22
	bic	x19,x24,x22
	add	x25,x25,x5			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x8,ror#18	// Sigma1(e)
	ror	x8,x26,#28
	add	x25,x25,x17			// h+=Ch(e,f,g)
	eor	x17,x26,x26,ror#5
	add	x25,x25,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x21,x21,x25			// d+=h
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x8,x17,ror#34	// Sigma0(a)
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x25,x25,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x6,x6			// 3
#endif
	ldp	x7,x8,[x1],#2*8
	add	x25,x25,x17			// h+=Sigma0(a)
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	eor	x9,x21,x21,ror#23
	and	x17,x22,x21
	bic	x28,x23,x21
	add	x24,x24,x6			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x9,ror#18	// Sigma1(e)
	ror	x9,x25,#28
	add	x24,x24,x17			// h+=Ch(e,f,g)
	eor	x17,x25,x25,ror#5
	add	x24,x24,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x20,x20,x24			// d+=h
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x9,x17,ror#34	// Sigma0(a)
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x24,x24,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x7,x7			// 4
#endif
	add	x24,x24,x17			// h+=Sigma0(a)
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	eor	x10,x20,x20,ror#23
	and	x17,x21,x20
	bic	x19,x22,x20
	add	x23,x23,x7			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x10,ror#18	// Sigma1(e)
	ror	x10,x24,#28
	add	x23,x23,x17			// h+=Ch(e,f,g)
	eor	x17,x24,x24,ror#5
	add	x23,x23,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x27,x27,x23			// d+=h
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x10,x17,ror#34	// Sigma0(a)
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x23,x23,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x8,x8			// 5
#endif
	ldp	x9,x10,[x1],#2*8
	add	x23,x23,x17			// h+=Sigma0(a)
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	eor	x11,x27,x27,ror#23
	and	x17,x20,x27
	bic	x28,x21,x27
	add	x22,x22,x8			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x11,ror#18	// Sigma1(e)
	ror	x11,x23,#28
	add	x22,x22,x17			// h+=Ch(e,f,g)
	eor	x17,x23,x23,ror#5
	add	x22,x22,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x26,x26,x22			// d+=h
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x11,x17,ror#34	// Sigma0(a)
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x22,x22,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x9,x9			// 6
#endif
	add	x22,x22,x17			// h+=Sigma0(a)
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	eor	x12,x26,x26,ror#23
	and	x17,x27,x26
	bic	x19,x20,x26
	add	x21,x21,x9			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x12,ror#18	// Sigma1(e)
	ror	x12,x22,#28
	add	x21,x21,x17			// h+=Ch(e,f,g)
	eor	x17,x22,x22,ror#5
	add	x21,x21,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x25,x25,x21			// d+=h
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x12,x17,ror#34	// Sigma0(a)
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x21,x21,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x10,x10			// 7
#endif
	ldp	x11,x12,[x1],#2*8
	add	x21,x21,x17			// h+=Sigma0(a)
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	eor	x13,x25,x25,ror#23
	and	x17,x26,x25
	bic	x28,x27,x25
	add	x20,x20,x10			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x13,ror#18	// Sigma1(e)
	ror	x13,x21,#28
	add	x20,x20,x17			// h+=Ch(e,f,g)
	eor	x17,x21,x21,ror#5
	add	x20,x20,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x24,x24,x20			// d+=h
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x13,x17,ror#34	// Sigma0(a)
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x20,x20,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x11,x11			// 8
#endif
	add	x20,x20,x17			// h+=Sigma0(a)
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	eor	x14,x24,x24,ror#23
	and	x17,x25,x24
	bic	x19,x26,x24
	add	x27,x27,x11			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x14,ror#18	// Sigma1(e)
	ror	x14,x20,#28
	add	x27,x27,x17			// h+=Ch(e,f,g)
	eor	x17,x20,x20,ror#5
	add	x27,x27,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x23,x23,x27			// d+=h
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x14,x17,ror#34	// Sigma0(a)
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x27,x27,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x12,x12			// 9
#endif
	ldp	x13,x14,[x1],#2*8
	add	x27,x27,x17			// h+=Sigma0(a)
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	eor	x15,x23,x23,ror#23
	and	x17,x24,x23
	bic	x28,x25,x23
	add	x26,x26,x12			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x15,ror#18	// Sigma1(e)
	ror	x15,x27,#28
	add	x26,x26,x17			// h+=Ch(e,f,g)
	eor	x17,x27,x27,ror#5
	add	x26,x26,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x22,x22,x26			// d+=h
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x15,x17,ror#34	// Sigma0(a)
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x26,x26,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x13,x13			// 10
#endif
	add	x26,x26,x17			// h+=Sigma0(a)
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	eor	x0,x22,x22,ror#23
	and	x17,x23,x22
	bic	x19,x24,x22
	add	x25,x25,x13			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x0,ror#18	// Sigma1(e)
	ror	x0,x26,#28
	add	x25,x25,x17			// h+=Ch(e,f,g)
	eor	x17,x26,x26,ror#5
	add	x25,x25,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x21,x21,x25			// d+=h
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x0,x17,ror#34	// Sigma0(a)
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x25,x25,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x14,x14			// 11
#endif
	ldp	x15,x0,[x1],#2*8
	add	x25,x25,x17			// h+=Sigma0(a)
	str	x6,[sp,#24]
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	eor	x6,x21,x21,ror#23
	and	x17,x22,x21
	bic	x28,x23,x21
	add	x24,x24,x14			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x6,ror#18	// Sigma1(e)
	ror	x6,x25,#28
	add	x24,x24,x17			// h+=Ch(e,f,g)
	eor	x17,x25,x25,ror#5
	add	x24,x24,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x20,x20,x24			// d+=h
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x6,x17,ror#34	// Sigma0(a)
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x24,x24,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x15,x15			// 12
#endif
	add	x24,x24,x17			// h+=Sigma0(a)
	str	x7,[sp,#0]
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	eor	x7,x20,x20,ror#23
	and	x17,x21,x20
	bic	x19,x22,x20
	add	x23,x23,x15			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x7,ror#18	// Sigma1(e)
	ror	x7,x24,#28
	add	x23,x23,x17			// h+=Ch(e,f,g)
	eor	x17,x24,x24,ror#5
	add	x23,x23,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x27,x27,x23			// d+=h
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x7,x17,ror#34	// Sigma0(a)
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x23,x23,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x0,x0			// 13
#endif
	ldp	x1,x2,[x1]
	add	x23,x23,x17			// h+=Sigma0(a)
	str	x8,[sp,#8]
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	eor	x8,x27,x27,ror#23
	and	x17,x20,x27
	bic	x28,x21,x27
	add	x22,x22,x0			// h+=X[i]
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x8,ror#18	// Sigma1(e)
	ror	x8,x23,#28
	add	x22,x22,x17			// h+=Ch(e,f,g)
	eor	x17,x23,x23,ror#5
	add	x22,x22,x16			// h+=Sigma1(e)
	and	x19,x19,x28			// (b^c)&=(a^b)
	add	x26,x26,x22			// d+=h
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x8,x17,ror#34	// Sigma0(a)
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	//add	x22,x22,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x1,x1			// 14
#endif
	ldr	x6,[sp,#24]
	add	x22,x22,x17			// h+=Sigma0(a)
	str	x9,[sp,#16]
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	eor	x9,x26,x26,ror#23
	and	x17,x27,x26
	bic	x19,x20,x26
	add	x21,x21,x1			// h+=X[i]
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x9,ror#18	// Sigma1(e)
	ror	x9,x22,#28
	add	x21,x21,x17			// h+=Ch(e,f,g)
	eor	x17,x22,x22,ror#5
	add	x21,x21,x16			// h+=Sigma1(e)
	and	x28,x28,x19			// (b^c)&=(a^b)
	add	x25,x25,x21			// d+=h
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x9,x17,ror#34	// Sigma0(a)
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	//add	x21,x21,x17			// h+=Sigma0(a)
#ifndef	__AARCH64EB__
	rev	x2,x2			// 15
#endif
	ldr	x7,[sp,#0]
	add	x21,x21,x17			// h+=Sigma0(a)
	str	x10,[sp,#24]
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	ror	x9,x4,#1
	and	x17,x26,x25
	ror	x8,x1,#19
	bic	x28,x27,x25
	ror	x10,x21,#28
	add	x20,x20,x2			// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x9,x9,x4,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x25,ror#41	// Sigma1(e)
	eor	x10,x10,x21,ror#34
	add	x20,x20,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x8,x8,x1,ror#61
	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
	add	x20,x20,x16			// h+=Sigma1(e)
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x10,x21,ror#39	// Sigma0(a)
	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
	add	x3,x3,x12
	add	x24,x24,x20			// d+=h
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x3,x3,x9
	add	x20,x20,x17			// h+=Sigma0(a)
	add	x3,x3,x8
.Loop_16_xx:
	ldr	x8,[sp,#8]
	str	x11,[sp,#0]
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	ror	x10,x5,#1
	and	x17,x25,x24
	ror	x9,x2,#19
	bic	x19,x26,x24
	ror	x11,x20,#28
	add	x27,x27,x3			// h+=X[i]
	eor	x16,x16,x24,ror#18
	eor	x10,x10,x5,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x24,ror#41	// Sigma1(e)
	eor	x11,x11,x20,ror#34
	add	x27,x27,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x9,x9,x2,ror#61
	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
	add	x27,x27,x16			// h+=Sigma1(e)
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x11,x20,ror#39	// Sigma0(a)
	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
	add	x4,x4,x13
	add	x23,x23,x27			// d+=h
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x4,x4,x10
	add	x27,x27,x17			// h+=Sigma0(a)
	add	x4,x4,x9
	ldr	x9,[sp,#16]
	str	x12,[sp,#8]
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	ror	x11,x6,#1
	and	x17,x24,x23
	ror	x10,x3,#19
	bic	x28,x25,x23
	ror	x12,x27,#28
	add	x26,x26,x4			// h+=X[i]
	eor	x16,x16,x23,ror#18
	eor	x11,x11,x6,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x23,ror#41	// Sigma1(e)
	eor	x12,x12,x27,ror#34
	add	x26,x26,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x10,x10,x3,ror#61
	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
	add	x26,x26,x16			// h+=Sigma1(e)
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x12,x27,ror#39	// Sigma0(a)
	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
	add	x5,x5,x14
	add	x22,x22,x26			// d+=h
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x5,x5,x11
	add	x26,x26,x17			// h+=Sigma0(a)
	add	x5,x5,x10
	ldr	x10,[sp,#24]
	str	x13,[sp,#16]
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	ror	x12,x7,#1
	and	x17,x23,x22
	ror	x11,x4,#19
	bic	x19,x24,x22
	ror	x13,x26,#28
	add	x25,x25,x5			// h+=X[i]
	eor	x16,x16,x22,ror#18
	eor	x12,x12,x7,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x22,ror#41	// Sigma1(e)
	eor	x13,x13,x26,ror#34
	add	x25,x25,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x11,x11,x4,ror#61
	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
	add	x25,x25,x16			// h+=Sigma1(e)
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x13,x26,ror#39	// Sigma0(a)
	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
	add	x6,x6,x15
	add	x21,x21,x25			// d+=h
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x6,x6,x12
	add	x25,x25,x17			// h+=Sigma0(a)
	add	x6,x6,x11
	ldr	x11,[sp,#0]
	str	x14,[sp,#24]
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	ror	x13,x8,#1
	and	x17,x22,x21
	ror	x12,x5,#19
	bic	x28,x23,x21
	ror	x14,x25,#28
	add	x24,x24,x6			// h+=X[i]
	eor	x16,x16,x21,ror#18
	eor	x13,x13,x8,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x21,ror#41	// Sigma1(e)
	eor	x14,x14,x25,ror#34
	add	x24,x24,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x12,x12,x5,ror#61
	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
	add	x24,x24,x16			// h+=Sigma1(e)
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x14,x25,ror#39	// Sigma0(a)
	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
	add	x7,x7,x0
	add	x20,x20,x24			// d+=h
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x7,x7,x13
	add	x24,x24,x17			// h+=Sigma0(a)
	add	x7,x7,x12
	ldr	x12,[sp,#8]
	str	x15,[sp,#0]
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	ror	x14,x9,#1
	and	x17,x21,x20
	ror	x13,x6,#19
	bic	x19,x22,x20
	ror	x15,x24,#28
	add	x23,x23,x7			// h+=X[i]
	eor	x16,x16,x20,ror#18
	eor	x14,x14,x9,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x20,ror#41	// Sigma1(e)
	eor	x15,x15,x24,ror#34
	add	x23,x23,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x13,x13,x6,ror#61
	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
	add	x23,x23,x16			// h+=Sigma1(e)
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x15,x24,ror#39	// Sigma0(a)
	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
	add	x8,x8,x1
	add	x27,x27,x23			// d+=h
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x8,x8,x14
	add	x23,x23,x17			// h+=Sigma0(a)
	add	x8,x8,x13
	ldr	x13,[sp,#16]
	str	x0,[sp,#8]
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	ror	x15,x10,#1
	and	x17,x20,x27
	ror	x14,x7,#19
	bic	x28,x21,x27
	ror	x0,x23,#28
	add	x22,x22,x8			// h+=X[i]
	eor	x16,x16,x27,ror#18
	eor	x15,x15,x10,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x27,ror#41	// Sigma1(e)
	eor	x0,x0,x23,ror#34
	add	x22,x22,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x14,x14,x7,ror#61
	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
	add	x22,x22,x16			// h+=Sigma1(e)
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x0,x23,ror#39	// Sigma0(a)
	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
	add	x9,x9,x2
	add	x26,x26,x22			// d+=h
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x9,x9,x15
	add	x22,x22,x17			// h+=Sigma0(a)
	add	x9,x9,x14
	ldr	x14,[sp,#24]
	str	x1,[sp,#16]
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	ror	x0,x11,#1
	and	x17,x27,x26
	ror	x15,x8,#19
	bic	x19,x20,x26
	ror	x1,x22,#28
	add	x21,x21,x9			// h+=X[i]
	eor	x16,x16,x26,ror#18
	eor	x0,x0,x11,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x26,ror#41	// Sigma1(e)
	eor	x1,x1,x22,ror#34
	add	x21,x21,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x15,x15,x8,ror#61
	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
	add	x21,x21,x16			// h+=Sigma1(e)
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x1,x22,ror#39	// Sigma0(a)
	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
	add	x10,x10,x3
	add	x25,x25,x21			// d+=h
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x10,x10,x0
	add	x21,x21,x17			// h+=Sigma0(a)
	add	x10,x10,x15
	ldr	x15,[sp,#0]
	str	x2,[sp,#24]
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	ror	x1,x12,#1
	and	x17,x26,x25
	ror	x0,x9,#19
	bic	x28,x27,x25
	ror	x2,x21,#28
	add	x20,x20,x10			// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x1,x1,x12,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x25,ror#41	// Sigma1(e)
	eor	x2,x2,x21,ror#34
	add	x20,x20,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x0,x0,x9,ror#61
	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
	add	x20,x20,x16			// h+=Sigma1(e)
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x2,x21,ror#39	// Sigma0(a)
	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
	add	x11,x11,x4
	add	x24,x24,x20			// d+=h
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x11,x11,x1
	add	x20,x20,x17			// h+=Sigma0(a)
	add	x11,x11,x0
	ldr	x0,[sp,#8]
	str	x3,[sp,#0]
	ror	x16,x24,#14
	add	x27,x27,x19			// h+=K[i]
	ror	x2,x13,#1
	and	x17,x25,x24
	ror	x1,x10,#19
	bic	x19,x26,x24
	ror	x3,x20,#28
	add	x27,x27,x11			// h+=X[i]
	eor	x16,x16,x24,ror#18
	eor	x2,x2,x13,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x20,x21			// a^b, b^c in next round
	eor	x16,x16,x24,ror#41	// Sigma1(e)
	eor	x3,x3,x20,ror#34
	add	x27,x27,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x1,x1,x10,ror#61
	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
	add	x27,x27,x16			// h+=Sigma1(e)
	eor	x28,x28,x21			// Maj(a,b,c)
	eor	x17,x3,x20,ror#39	// Sigma0(a)
	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
	add	x12,x12,x5
	add	x23,x23,x27			// d+=h
	add	x27,x27,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x12,x12,x2
	add	x27,x27,x17			// h+=Sigma0(a)
	add	x12,x12,x1
	ldr	x1,[sp,#16]
	str	x4,[sp,#8]
	ror	x16,x23,#14
	add	x26,x26,x28			// h+=K[i]
	ror	x3,x14,#1
	and	x17,x24,x23
	ror	x2,x11,#19
	bic	x28,x25,x23
	ror	x4,x27,#28
	add	x26,x26,x12			// h+=X[i]
	eor	x16,x16,x23,ror#18
	eor	x3,x3,x14,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x27,x20			// a^b, b^c in next round
	eor	x16,x16,x23,ror#41	// Sigma1(e)
	eor	x4,x4,x27,ror#34
	add	x26,x26,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x2,x2,x11,ror#61
	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
	add	x26,x26,x16			// h+=Sigma1(e)
	eor	x19,x19,x20			// Maj(a,b,c)
	eor	x17,x4,x27,ror#39	// Sigma0(a)
	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
	add	x13,x13,x6
	add	x22,x22,x26			// d+=h
	add	x26,x26,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x13,x13,x3
	add	x26,x26,x17			// h+=Sigma0(a)
	add	x13,x13,x2
	ldr	x2,[sp,#24]
	str	x5,[sp,#16]
	ror	x16,x22,#14
	add	x25,x25,x19			// h+=K[i]
	ror	x4,x15,#1
	and	x17,x23,x22
	ror	x3,x12,#19
	bic	x19,x24,x22
	ror	x5,x26,#28
	add	x25,x25,x13			// h+=X[i]
	eor	x16,x16,x22,ror#18
	eor	x4,x4,x15,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x26,x27			// a^b, b^c in next round
	eor	x16,x16,x22,ror#41	// Sigma1(e)
	eor	x5,x5,x26,ror#34
	add	x25,x25,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x3,x3,x12,ror#61
	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
	add	x25,x25,x16			// h+=Sigma1(e)
	eor	x28,x28,x27			// Maj(a,b,c)
	eor	x17,x5,x26,ror#39	// Sigma0(a)
	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
	add	x14,x14,x7
	add	x21,x21,x25			// d+=h
	add	x25,x25,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x14,x14,x4
	add	x25,x25,x17			// h+=Sigma0(a)
	add	x14,x14,x3
	ldr	x3,[sp,#0]
	str	x6,[sp,#24]
	ror	x16,x21,#14
	add	x24,x24,x28			// h+=K[i]
	ror	x5,x0,#1
	and	x17,x22,x21
	ror	x4,x13,#19
	bic	x28,x23,x21
	ror	x6,x25,#28
	add	x24,x24,x14			// h+=X[i]
	eor	x16,x16,x21,ror#18
	eor	x5,x5,x0,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x25,x26			// a^b, b^c in next round
	eor	x16,x16,x21,ror#41	// Sigma1(e)
	eor	x6,x6,x25,ror#34
	add	x24,x24,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x4,x4,x13,ror#61
	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
	add	x24,x24,x16			// h+=Sigma1(e)
	eor	x19,x19,x26			// Maj(a,b,c)
	eor	x17,x6,x25,ror#39	// Sigma0(a)
	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
	add	x15,x15,x8
	add	x20,x20,x24			// d+=h
	add	x24,x24,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x15,x15,x5
	add	x24,x24,x17			// h+=Sigma0(a)
	add	x15,x15,x4
	ldr	x4,[sp,#8]
	str	x7,[sp,#0]
	ror	x16,x20,#14
	add	x23,x23,x19			// h+=K[i]
	ror	x6,x1,#1
	and	x17,x21,x20
	ror	x5,x14,#19
	bic	x19,x22,x20
	ror	x7,x24,#28
	add	x23,x23,x15			// h+=X[i]
	eor	x16,x16,x20,ror#18
	eor	x6,x6,x1,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x24,x25			// a^b, b^c in next round
	eor	x16,x16,x20,ror#41	// Sigma1(e)
	eor	x7,x7,x24,ror#34
	add	x23,x23,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x5,x5,x14,ror#61
	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
	add	x23,x23,x16			// h+=Sigma1(e)
	eor	x28,x28,x25			// Maj(a,b,c)
	eor	x17,x7,x24,ror#39	// Sigma0(a)
	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
	add	x0,x0,x9
	add	x27,x27,x23			// d+=h
	add	x23,x23,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x0,x0,x6
	add	x23,x23,x17			// h+=Sigma0(a)
	add	x0,x0,x5
	ldr	x5,[sp,#16]
	str	x8,[sp,#8]
	ror	x16,x27,#14
	add	x22,x22,x28			// h+=K[i]
	ror	x7,x2,#1
	and	x17,x20,x27
	ror	x6,x15,#19
	bic	x28,x21,x27
	ror	x8,x23,#28
	add	x22,x22,x0			// h+=X[i]
	eor	x16,x16,x27,ror#18
	eor	x7,x7,x2,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x23,x24			// a^b, b^c in next round
	eor	x16,x16,x27,ror#41	// Sigma1(e)
	eor	x8,x8,x23,ror#34
	add	x22,x22,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x6,x6,x15,ror#61
	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
	add	x22,x22,x16			// h+=Sigma1(e)
	eor	x19,x19,x24			// Maj(a,b,c)
	eor	x17,x8,x23,ror#39	// Sigma0(a)
	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
	add	x1,x1,x10
	add	x26,x26,x22			// d+=h
	add	x22,x22,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x1,x1,x7
	add	x22,x22,x17			// h+=Sigma0(a)
	add	x1,x1,x6
	ldr	x6,[sp,#24]
	str	x9,[sp,#16]
	ror	x16,x26,#14
	add	x21,x21,x19			// h+=K[i]
	ror	x8,x3,#1
	and	x17,x27,x26
	ror	x7,x0,#19
	bic	x19,x20,x26
	ror	x9,x22,#28
	add	x21,x21,x1			// h+=X[i]
	eor	x16,x16,x26,ror#18
	eor	x8,x8,x3,ror#8
	orr	x17,x17,x19			// Ch(e,f,g)
	eor	x19,x22,x23			// a^b, b^c in next round
	eor	x16,x16,x26,ror#41	// Sigma1(e)
	eor	x9,x9,x22,ror#34
	add	x21,x21,x17			// h+=Ch(e,f,g)
	and	x28,x28,x19			// (b^c)&=(a^b)
	eor	x7,x7,x0,ror#61
	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
	add	x21,x21,x16			// h+=Sigma1(e)
	eor	x28,x28,x23			// Maj(a,b,c)
	eor	x17,x9,x22,ror#39	// Sigma0(a)
	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
	add	x2,x2,x11
	add	x25,x25,x21			// d+=h
	add	x21,x21,x28			// h+=Maj(a,b,c)
	ldr	x28,[x30],#8		// *K++, x19 in next round
	add	x2,x2,x8
	add	x21,x21,x17			// h+=Sigma0(a)
	add	x2,x2,x7
	ldr	x7,[sp,#0]
	str	x10,[sp,#24]
	ror	x16,x25,#14
	add	x20,x20,x28			// h+=K[i]
	ror	x9,x4,#1
	and	x17,x26,x25
	ror	x8,x1,#19
	bic	x28,x27,x25
	ror	x10,x21,#28
	add	x20,x20,x2			// h+=X[i]
	eor	x16,x16,x25,ror#18
	eor	x9,x9,x4,ror#8
	orr	x17,x17,x28			// Ch(e,f,g)
	eor	x28,x21,x22			// a^b, b^c in next round
	eor	x16,x16,x25,ror#41	// Sigma1(e)
	eor	x10,x10,x21,ror#34
	add	x20,x20,x17			// h+=Ch(e,f,g)
	and	x19,x19,x28			// (b^c)&=(a^b)
	eor	x8,x8,x1,ror#61
	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
	add	x20,x20,x16			// h+=Sigma1(e)
	eor	x19,x19,x22			// Maj(a,b,c)
	eor	x17,x10,x21,ror#39	// Sigma0(a)
	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
	add	x3,x3,x12
	add	x24,x24,x20			// d+=h
	add	x20,x20,x19			// h+=Maj(a,b,c)
	ldr	x19,[x30],#8		// *K++, x28 in next round
	add	x3,x3,x9
	add	x20,x20,x17			// h+=Sigma0(a)
	add	x3,x3,x8
	cbnz	x19,.Loop_16_xx
1006bc3d5698SJohn Baldwin
1007bc3d5698SJohn Baldwin	ldp	x0,x2,[x29,#96]
1008bc3d5698SJohn Baldwin	ldr	x1,[x29,#112]
1009bc3d5698SJohn Baldwin	sub	x30,x30,#648		// rewind
1010bc3d5698SJohn Baldwin
1011bc3d5698SJohn Baldwin	ldp	x3,x4,[x0]
1012bc3d5698SJohn Baldwin	ldp	x5,x6,[x0,#2*8]
1013bc3d5698SJohn Baldwin	add	x1,x1,#14*8			// advance input pointer
1014bc3d5698SJohn Baldwin	ldp	x7,x8,[x0,#4*8]
1015bc3d5698SJohn Baldwin	add	x20,x20,x3
1016bc3d5698SJohn Baldwin	ldp	x9,x10,[x0,#6*8]
1017bc3d5698SJohn Baldwin	add	x21,x21,x4
1018bc3d5698SJohn Baldwin	add	x22,x22,x5
1019bc3d5698SJohn Baldwin	add	x23,x23,x6
1020bc3d5698SJohn Baldwin	stp	x20,x21,[x0]
1021bc3d5698SJohn Baldwin	add	x24,x24,x7
1022bc3d5698SJohn Baldwin	add	x25,x25,x8
1023bc3d5698SJohn Baldwin	stp	x22,x23,[x0,#2*8]
1024bc3d5698SJohn Baldwin	add	x26,x26,x9
1025bc3d5698SJohn Baldwin	add	x27,x27,x10
1026bc3d5698SJohn Baldwin	cmp	x1,x2
1027bc3d5698SJohn Baldwin	stp	x24,x25,[x0,#4*8]
1028bc3d5698SJohn Baldwin	stp	x26,x27,[x0,#6*8]
1029bc3d5698SJohn Baldwin	b.ne	.Loop
1030bc3d5698SJohn Baldwin
1031bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
1032bc3d5698SJohn Baldwin	add	sp,sp,#4*8
1033bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
1034bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
1035bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
1036bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
1037bc3d5698SJohn Baldwin	ldp	x29,x30,[sp],#128
1038*bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1039bc3d5698SJohn Baldwin	ret
1040bc3d5698SJohn Baldwin.size	sha512_block_data_order,.-sha512_block_data_order
1041bc3d5698SJohn Baldwin
1042bc3d5698SJohn Baldwin.align	6
1043bc3d5698SJohn Baldwin.type	.LK512,%object
1044bc3d5698SJohn Baldwin.LK512:
1045bc3d5698SJohn Baldwin.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1046bc3d5698SJohn Baldwin.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1047bc3d5698SJohn Baldwin.quad	0x3956c25bf348b538,0x59f111f1b605d019
1048bc3d5698SJohn Baldwin.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1049bc3d5698SJohn Baldwin.quad	0xd807aa98a3030242,0x12835b0145706fbe
1050bc3d5698SJohn Baldwin.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1051bc3d5698SJohn Baldwin.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1052bc3d5698SJohn Baldwin.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1053bc3d5698SJohn Baldwin.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1054bc3d5698SJohn Baldwin.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1055bc3d5698SJohn Baldwin.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1056bc3d5698SJohn Baldwin.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1057bc3d5698SJohn Baldwin.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1058bc3d5698SJohn Baldwin.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1059bc3d5698SJohn Baldwin.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1060bc3d5698SJohn Baldwin.quad	0x06ca6351e003826f,0x142929670a0e6e70
1061bc3d5698SJohn Baldwin.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1062bc3d5698SJohn Baldwin.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1063bc3d5698SJohn Baldwin.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1064bc3d5698SJohn Baldwin.quad	0x81c2c92e47edaee6,0x92722c851482353b
1065bc3d5698SJohn Baldwin.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1066bc3d5698SJohn Baldwin.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1067bc3d5698SJohn Baldwin.quad	0xd192e819d6ef5218,0xd69906245565a910
1068bc3d5698SJohn Baldwin.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1069bc3d5698SJohn Baldwin.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1070bc3d5698SJohn Baldwin.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1071bc3d5698SJohn Baldwin.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1072bc3d5698SJohn Baldwin.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1073bc3d5698SJohn Baldwin.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1074bc3d5698SJohn Baldwin.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1075bc3d5698SJohn Baldwin.quad	0x90befffa23631e28,0xa4506cebde82bde9
1076bc3d5698SJohn Baldwin.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1077bc3d5698SJohn Baldwin.quad	0xca273eceea26619c,0xd186b8c721c0c207
1078bc3d5698SJohn Baldwin.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1079bc3d5698SJohn Baldwin.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1080bc3d5698SJohn Baldwin.quad	0x113f9804bef90dae,0x1b710b35131c471b
1081bc3d5698SJohn Baldwin.quad	0x28db77f523047d84,0x32caab7b40c72493
1082bc3d5698SJohn Baldwin.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1083bc3d5698SJohn Baldwin.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1084bc3d5698SJohn Baldwin.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1085bc3d5698SJohn Baldwin.quad	0	// terminator
1086bc3d5698SJohn Baldwin.size	.LK512,.-.LK512
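// ASCII for: "SHA512 block transform for ARMv8, CRYPTOGAMS by <appro@openssl.org>"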
1087bc3d5698SJohn Baldwin.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1088bc3d5698SJohn Baldwin.align	2
1089bc3d5698SJohn Baldwin.align	2
1090bc3d5698SJohn Baldwin#ifndef	__KERNEL__
1091bc3d5698SJohn Baldwin.type	sha512_block_armv8,%function
1092bc3d5698SJohn Baldwin.align	6
1093bc3d5698SJohn Baldwinsha512_block_armv8:
1094bc3d5698SJohn Baldwin.Lv8_entry:
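	// Arguments: x0 = hash state (eight 64-bit words), x1 = input,
	// x2 = number of 128-byte blocks.  Uses the FEAT_SHA512 (Armv8.2
	// Crypto) instructions emitted below.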
1095*bd9588bcSAndrew Turner	// Armv8.3-A PAuth: even though x30 is pushed to the stack, it is not popped later
1096bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-16]!
1097bc3d5698SJohn Baldwin	add	x29,sp,#0
1098bc3d5698SJohn Baldwin
1099bc3d5698SJohn Baldwin	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1100bc3d5698SJohn Baldwin	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1101bc3d5698SJohn Baldwin
1102bc3d5698SJohn Baldwin	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1103bc3d5698SJohn Baldwin	adr	x3,.LK512
1104bc3d5698SJohn Baldwin
1105bc3d5698SJohn Baldwin	rev64	v16.16b,v16.16b
1106bc3d5698SJohn Baldwin	rev64	v17.16b,v17.16b
1107bc3d5698SJohn Baldwin	rev64	v18.16b,v18.16b
1108bc3d5698SJohn Baldwin	rev64	v19.16b,v19.16b
1109bc3d5698SJohn Baldwin	rev64	v20.16b,v20.16b
1110bc3d5698SJohn Baldwin	rev64	v21.16b,v21.16b
1111bc3d5698SJohn Baldwin	rev64	v22.16b,v22.16b
1112bc3d5698SJohn Baldwin	rev64	v23.16b,v23.16b
1113bc3d5698SJohn Baldwin	b	.Loop_hw
1114bc3d5698SJohn Baldwin
1115bc3d5698SJohn Baldwin.align	4
1116bc3d5698SJohn Baldwin.Loop_hw:
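	// One iteration per 128-byte block: v0-v3 hold the hash state as
	// four 2x64-bit lanes, v16-v23 hold the 16-word message schedule,
	// and 40 sha512h/sha512h2 pairs below cover the 80 rounds while
	// consuming the 80 constants from .LK512.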
1117bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1118bc3d5698SJohn Baldwin	subs	x2,x2,#1
1119bc3d5698SJohn Baldwin	sub	x4,x1,#128
1120bc3d5698SJohn Baldwin	orr	v26.16b,v0.16b,v0.16b			// offload
1121bc3d5698SJohn Baldwin	orr	v27.16b,v1.16b,v1.16b
1122bc3d5698SJohn Baldwin	orr	v28.16b,v2.16b,v2.16b
1123bc3d5698SJohn Baldwin	orr	v29.16b,v3.16b,v3.16b
1124bc3d5698SJohn Baldwin	csel	x1,x1,x4,ne			// conditional rewind
1125bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v16.2d
1126bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1127bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1128bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1129bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1130bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
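	// The SHA-512 instructions are emitted as raw .inst words (intended
	// mnemonic in the trailing comment) so the file assembles even with
	// toolchains that lack the FEAT_SHA512 mnemonics.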
1131bc3d5698SJohn Baldwin.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1132bc3d5698SJohn Baldwin	ext	v7.16b,v20.16b,v21.16b,#8
1133bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1134bc3d5698SJohn Baldwin.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1135bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1136bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1137bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v17.2d
1138bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1139bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1140bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1141bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1142bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1143bc3d5698SJohn Baldwin.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1144bc3d5698SJohn Baldwin	ext	v7.16b,v21.16b,v22.16b,#8
1145bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1146bc3d5698SJohn Baldwin.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1147bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1148bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1149bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v18.2d
1150bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1151bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1152bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1153bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1154bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1155bc3d5698SJohn Baldwin.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1156bc3d5698SJohn Baldwin	ext	v7.16b,v22.16b,v23.16b,#8
1157bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1158bc3d5698SJohn Baldwin.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1159bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1160bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1161bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v19.2d
1162bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1163bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1164bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1165bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1166bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1167bc3d5698SJohn Baldwin.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1168bc3d5698SJohn Baldwin	ext	v7.16b,v23.16b,v16.16b,#8
1169bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1170bc3d5698SJohn Baldwin.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1171bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1172bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1173bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v20.2d
1174bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1175bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1176bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1177bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1178bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1179bc3d5698SJohn Baldwin.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1180bc3d5698SJohn Baldwin	ext	v7.16b,v16.16b,v17.16b,#8
1181bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1182bc3d5698SJohn Baldwin.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1183bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1184bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1185bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v21.2d
1186bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1187bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1188bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1189bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1190bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1191bc3d5698SJohn Baldwin.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1192bc3d5698SJohn Baldwin	ext	v7.16b,v17.16b,v18.16b,#8
1193bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1194bc3d5698SJohn Baldwin.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1195bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1196bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1197bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v22.2d
1198bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1199bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1200bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1201bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1202bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1203bc3d5698SJohn Baldwin.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1204bc3d5698SJohn Baldwin	ext	v7.16b,v18.16b,v19.16b,#8
1205bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1206bc3d5698SJohn Baldwin.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1207bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1208bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1209bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v23.2d
1210bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1211bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1212bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1213bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1214bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1215bc3d5698SJohn Baldwin.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1216bc3d5698SJohn Baldwin	ext	v7.16b,v19.16b,v20.16b,#8
1217bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1218bc3d5698SJohn Baldwin.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1219bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1220bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1221bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v16.2d
1222bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1223bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1224bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1225bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1226bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1227bc3d5698SJohn Baldwin.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1228bc3d5698SJohn Baldwin	ext	v7.16b,v20.16b,v21.16b,#8
1229bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1230bc3d5698SJohn Baldwin.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1231bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1232bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1233bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v17.2d
1234bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1235bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1236bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1237bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1238bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1239bc3d5698SJohn Baldwin.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1240bc3d5698SJohn Baldwin	ext	v7.16b,v21.16b,v22.16b,#8
1241bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1242bc3d5698SJohn Baldwin.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1243bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1244bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1245bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v18.2d
1246bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1247bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1248bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1249bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1250bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1251bc3d5698SJohn Baldwin.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1252bc3d5698SJohn Baldwin	ext	v7.16b,v22.16b,v23.16b,#8
1253bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1254bc3d5698SJohn Baldwin.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1255bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1256bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1257bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v19.2d
1258bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1259bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1260bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1261bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1262bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1263bc3d5698SJohn Baldwin.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1264bc3d5698SJohn Baldwin	ext	v7.16b,v23.16b,v16.16b,#8
1265bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1266bc3d5698SJohn Baldwin.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1267bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1268bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1269bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v20.2d
1270bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1271bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1272bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1273bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1274bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1275bc3d5698SJohn Baldwin.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1276bc3d5698SJohn Baldwin	ext	v7.16b,v16.16b,v17.16b,#8
1277bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1278bc3d5698SJohn Baldwin.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1279bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1280bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1281bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v21.2d
1282bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1283bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1284bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1285bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1286bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1287bc3d5698SJohn Baldwin.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1288bc3d5698SJohn Baldwin	ext	v7.16b,v17.16b,v18.16b,#8
1289bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1290bc3d5698SJohn Baldwin.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1291bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1292bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1293bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v22.2d
1294bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1295bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1296bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1297bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1298bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1299bc3d5698SJohn Baldwin.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1300bc3d5698SJohn Baldwin	ext	v7.16b,v18.16b,v19.16b,#8
1301bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1302bc3d5698SJohn Baldwin.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1303bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1304bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1305bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v23.2d
1306bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1307bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1308bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1309bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1310bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1311bc3d5698SJohn Baldwin.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1312bc3d5698SJohn Baldwin	ext	v7.16b,v19.16b,v20.16b,#8
1313bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1314bc3d5698SJohn Baldwin.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1315bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1316bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1317bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v16.2d
1318bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1319bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1320bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1321bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1322bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1323bc3d5698SJohn Baldwin.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1324bc3d5698SJohn Baldwin	ext	v7.16b,v20.16b,v21.16b,#8
1325bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1326bc3d5698SJohn Baldwin.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1327bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1328bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1329bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v17.2d
1330bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1331bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1332bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1333bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1334bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1335bc3d5698SJohn Baldwin.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1336bc3d5698SJohn Baldwin	ext	v7.16b,v21.16b,v22.16b,#8
1337bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1338bc3d5698SJohn Baldwin.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1339bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1340bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1341bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v18.2d
1342bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1343bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1344bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1345bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1346bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1347bc3d5698SJohn Baldwin.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1348bc3d5698SJohn Baldwin	ext	v7.16b,v22.16b,v23.16b,#8
1349bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1350bc3d5698SJohn Baldwin.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1351bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1352bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1353bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v19.2d
1354bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1355bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1356bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1357bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1358bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1359bc3d5698SJohn Baldwin.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1360bc3d5698SJohn Baldwin	ext	v7.16b,v23.16b,v16.16b,#8
1361bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1362bc3d5698SJohn Baldwin.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1363bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1364bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1365bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v20.2d
1366bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1367bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1368bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1369bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1370bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1371bc3d5698SJohn Baldwin.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1372bc3d5698SJohn Baldwin	ext	v7.16b,v16.16b,v17.16b,#8
1373bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1374bc3d5698SJohn Baldwin.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1375bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1376bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1377bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v21.2d
1378bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1379bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1380bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1381bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1382bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1383bc3d5698SJohn Baldwin.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1384bc3d5698SJohn Baldwin	ext	v7.16b,v17.16b,v18.16b,#8
1385bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1386bc3d5698SJohn Baldwin.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1387bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1388bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1389bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v22.2d
1390bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1391bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1392bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1393bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1394bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1395bc3d5698SJohn Baldwin.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1396bc3d5698SJohn Baldwin	ext	v7.16b,v18.16b,v19.16b,#8
1397bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1398bc3d5698SJohn Baldwin.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1399bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1400bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1401bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v23.2d
1402bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1403bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1404bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1405bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1406bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1407bc3d5698SJohn Baldwin.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1408bc3d5698SJohn Baldwin	ext	v7.16b,v19.16b,v20.16b,#8
1409bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1410bc3d5698SJohn Baldwin.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1411bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1412bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1413bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v16.2d
1414bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1415bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1416bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1417bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1418bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1419bc3d5698SJohn Baldwin.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1420bc3d5698SJohn Baldwin	ext	v7.16b,v20.16b,v21.16b,#8
1421bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1422bc3d5698SJohn Baldwin.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1423bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1424bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1425bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v17.2d
1426bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1427bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1428bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1429bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1430bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1431bc3d5698SJohn Baldwin.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1432bc3d5698SJohn Baldwin	ext	v7.16b,v21.16b,v22.16b,#8
1433bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1434bc3d5698SJohn Baldwin.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1435bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1436bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1437bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v18.2d
1438bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1439bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1440bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1441bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1442bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1443bc3d5698SJohn Baldwin.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1444bc3d5698SJohn Baldwin	ext	v7.16b,v22.16b,v23.16b,#8
1445bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1446bc3d5698SJohn Baldwin.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1447bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1448bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1449bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v19.2d
1450bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1451bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1452bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1453bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1454bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1455bc3d5698SJohn Baldwin.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1456bc3d5698SJohn Baldwin	ext	v7.16b,v23.16b,v16.16b,#8
1457bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1458bc3d5698SJohn Baldwin.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1459bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1460bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1461bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v20.2d
1462bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1463bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1464bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1465bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1466bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1467bc3d5698SJohn Baldwin.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1468bc3d5698SJohn Baldwin	ext	v7.16b,v16.16b,v17.16b,#8
1469bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1470bc3d5698SJohn Baldwin.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1471bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1472bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1473bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v21.2d
1474bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1475bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1476bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1477bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1478bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1479bc3d5698SJohn Baldwin.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1480bc3d5698SJohn Baldwin	ext	v7.16b,v17.16b,v18.16b,#8
1481bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1482bc3d5698SJohn Baldwin.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1483bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1484bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1485bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v22.2d
1486bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1487bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1488bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1489bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1490bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1491bc3d5698SJohn Baldwin.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1492bc3d5698SJohn Baldwin	ext	v7.16b,v18.16b,v19.16b,#8
1493bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1494bc3d5698SJohn Baldwin.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1495bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1496bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1497bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v23.2d
1498bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1499bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1500bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1501bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1502bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1503bc3d5698SJohn Baldwin.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1504bc3d5698SJohn Baldwin	ext	v7.16b,v19.16b,v20.16b,#8
1505bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1506bc3d5698SJohn Baldwin.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1507bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1508bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
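	// Last 16 rounds: the message schedule is complete, so the next
	// input block is loaded and byte-reversed in parallel.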
1509bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1510bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v16.2d
1511bc3d5698SJohn Baldwin	ld1	{v16.16b},[x1],#16		// load next input
1512bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1513bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1514bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1515bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1516bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1517bc3d5698SJohn Baldwin	rev64	v16.16b,v16.16b
1518bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1519bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1520bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1521bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v17.2d
1522bc3d5698SJohn Baldwin	ld1	{v17.16b},[x1],#16		// load next input
1523bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1524bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1525bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1526bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1527bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1528bc3d5698SJohn Baldwin	rev64	v17.16b,v17.16b
1529bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1530bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1531bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1532bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v18.2d
1533bc3d5698SJohn Baldwin	ld1	{v18.16b},[x1],#16		// load next input
1534bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1535bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1536bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1537bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1538bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1539bc3d5698SJohn Baldwin	rev64	v18.16b,v18.16b
1540bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1541bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1542bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1543bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v19.2d
1544bc3d5698SJohn Baldwin	ld1	{v19.16b},[x1],#16		// load next input
1545bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1546bc3d5698SJohn Baldwin	ext	v5.16b,v2.16b,v3.16b,#8
1547bc3d5698SJohn Baldwin	ext	v6.16b,v1.16b,v2.16b,#8
1548bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1549bc3d5698SJohn Baldwin.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1550bc3d5698SJohn Baldwin	rev64	v19.16b,v19.16b
1551bc3d5698SJohn Baldwin	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1552bc3d5698SJohn Baldwin.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
1553bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1554bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v20.2d
1555bc3d5698SJohn Baldwin	ld1	{v20.16b},[x1],#16		// load next input
1556bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1557bc3d5698SJohn Baldwin	ext	v5.16b,v4.16b,v2.16b,#8
1558bc3d5698SJohn Baldwin	ext	v6.16b,v0.16b,v4.16b,#8
1559bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1560bc3d5698SJohn Baldwin.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1561bc3d5698SJohn Baldwin	rev64	v20.16b,v20.16b
1562bc3d5698SJohn Baldwin	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1563bc3d5698SJohn Baldwin.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
1564bc3d5698SJohn Baldwin	ld1	{v24.2d},[x3],#16
1565bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v21.2d
1566bc3d5698SJohn Baldwin	ld1	{v21.16b},[x1],#16		// load next input
1567bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1568bc3d5698SJohn Baldwin	ext	v5.16b,v1.16b,v4.16b,#8
1569bc3d5698SJohn Baldwin	ext	v6.16b,v3.16b,v1.16b,#8
1570bc3d5698SJohn Baldwin	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1571bc3d5698SJohn Baldwin.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1572bc3d5698SJohn Baldwin	rev64	v21.16b,v21.16b
1573bc3d5698SJohn Baldwin	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1574bc3d5698SJohn Baldwin.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
1575bc3d5698SJohn Baldwin	ld1	{v25.2d},[x3],#16
1576bc3d5698SJohn Baldwin	add	v24.2d,v24.2d,v22.2d
1577bc3d5698SJohn Baldwin	ld1	{v22.16b},[x1],#16		// load next input
1578bc3d5698SJohn Baldwin	ext	v24.16b,v24.16b,v24.16b,#8
1579bc3d5698SJohn Baldwin	ext	v5.16b,v0.16b,v1.16b,#8
1580bc3d5698SJohn Baldwin	ext	v6.16b,v2.16b,v0.16b,#8
1581bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1582bc3d5698SJohn Baldwin.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1583bc3d5698SJohn Baldwin	rev64	v22.16b,v22.16b
1584bc3d5698SJohn Baldwin	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1585bc3d5698SJohn Baldwin.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
1586bc3d5698SJohn Baldwin	sub	x3,x3,#80*8	// rewind
1587bc3d5698SJohn Baldwin	add	v25.2d,v25.2d,v23.2d
1588bc3d5698SJohn Baldwin	ld1	{v23.16b},[x1],#16		// load next input
1589bc3d5698SJohn Baldwin	ext	v25.16b,v25.16b,v25.16b,#8
1590bc3d5698SJohn Baldwin	ext	v5.16b,v3.16b,v0.16b,#8
1591bc3d5698SJohn Baldwin	ext	v6.16b,v4.16b,v3.16b,#8
1592bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1593bc3d5698SJohn Baldwin.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1594bc3d5698SJohn Baldwin	rev64	v23.16b,v23.16b
1595bc3d5698SJohn Baldwin	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1596bc3d5698SJohn Baldwin.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
1597bc3d5698SJohn Baldwin	add	v0.2d,v0.2d,v26.2d			// accumulate
1598bc3d5698SJohn Baldwin	add	v1.2d,v1.2d,v27.2d
1599bc3d5698SJohn Baldwin	add	v2.2d,v2.2d,v28.2d
1600bc3d5698SJohn Baldwin	add	v3.2d,v3.2d,v29.2d
1601bc3d5698SJohn Baldwin
1602bc3d5698SJohn Baldwin	cbnz	x2,.Loop_hw
1603bc3d5698SJohn Baldwin
1604bc3d5698SJohn Baldwin	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1605bc3d5698SJohn Baldwin
1606bc3d5698SJohn Baldwin	ldr	x29,[sp],#16
1607bc3d5698SJohn Baldwin	ret
1608bc3d5698SJohn Baldwin.size	sha512_block_armv8,.-sha512_block_armv8
1609bc3d5698SJohn Baldwin#endif
1610