xref: /freebsd/sys/crypto/openssl/aarch64/armv8-mont.S (revision d2a55e6a9348bb55038dbc6b727ab041085f22db)
1bc3d5698SJohn Baldwin/* Do not modify. This file is auto-generated from armv8-mont.pl. */
2c0855eaaSJohn Baldwin#include "arm_arch.h"
3bd9588bcSAndrew Turner#ifndef	__KERNEL__
4c0855eaaSJohn Baldwin
5c0855eaaSJohn Baldwin.hidden	OPENSSL_armv8_rsa_neonized
6c0855eaaSJohn Baldwin#endif
7bc3d5698SJohn Baldwin.text
8bc3d5698SJohn Baldwin
9bc3d5698SJohn Baldwin.globl	bn_mul_mont
10bc3d5698SJohn Baldwin.type	bn_mul_mont,%function
11bc3d5698SJohn Baldwin.align	5
12bc3d5698SJohn Baldwinbn_mul_mont:
13bd9588bcSAndrew Turner	AARCH64_SIGN_LINK_REGISTER
14c0855eaaSJohn Baldwin.Lbn_mul_mont:
15c0855eaaSJohn Baldwin	tst	x5,#3
16c0855eaaSJohn Baldwin	b.ne	.Lmul_mont
17c0855eaaSJohn Baldwin	cmp	x5,#32
18c0855eaaSJohn Baldwin	b.le	.Lscalar_impl
19c0855eaaSJohn Baldwin#ifndef	__KERNEL__
20*d2a55e6aSEnji Cooper#ifndef	__AARCH64EB__
21c0855eaaSJohn Baldwin	adrp	x17,OPENSSL_armv8_rsa_neonized
22c0855eaaSJohn Baldwin	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
23c0855eaaSJohn Baldwin	cbnz	w17, bn_mul8x_mont_neon
24c0855eaaSJohn Baldwin#endif
25*d2a55e6aSEnji Cooper#endif
26c0855eaaSJohn Baldwin
27c0855eaaSJohn Baldwin.Lscalar_impl:
28bc3d5698SJohn Baldwin	tst	x5,#7
29bc3d5698SJohn Baldwin	b.eq	__bn_sqr8x_mont
30bc3d5698SJohn Baldwin	tst	x5,#3
31bc3d5698SJohn Baldwin	b.eq	__bn_mul4x_mont
32c0855eaaSJohn Baldwin
33bc3d5698SJohn Baldwin.Lmul_mont:
34bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-64]!
35bc3d5698SJohn Baldwin	add	x29,sp,#0
36bc3d5698SJohn Baldwin	stp	x19,x20,[sp,#16]
37bc3d5698SJohn Baldwin	stp	x21,x22,[sp,#32]
38bc3d5698SJohn Baldwin	stp	x23,x24,[sp,#48]
39bc3d5698SJohn Baldwin
40bc3d5698SJohn Baldwin	ldr	x9,[x2],#8		// bp[0]
41bc3d5698SJohn Baldwin	sub	x22,sp,x5,lsl#3
42bc3d5698SJohn Baldwin	ldp	x7,x8,[x1],#16	// ap[0..1]
43bc3d5698SJohn Baldwin	lsl	x5,x5,#3
44bc3d5698SJohn Baldwin	ldr	x4,[x4]		// *n0
45bc3d5698SJohn Baldwin	and	x22,x22,#-16		// ABI says so
46bc3d5698SJohn Baldwin	ldp	x13,x14,[x3],#16	// np[0..1]
47bc3d5698SJohn Baldwin
48bc3d5698SJohn Baldwin	mul	x6,x7,x9		// ap[0]*bp[0]
49bc3d5698SJohn Baldwin	sub	x21,x5,#16		// j=num-2
50bc3d5698SJohn Baldwin	umulh	x7,x7,x9
51bc3d5698SJohn Baldwin	mul	x10,x8,x9		// ap[1]*bp[0]
52bc3d5698SJohn Baldwin	umulh	x11,x8,x9
53bc3d5698SJohn Baldwin
54bc3d5698SJohn Baldwin	mul	x15,x6,x4		// "tp[0]"*n0
55bc3d5698SJohn Baldwin	mov	sp,x22			// alloca
56bc3d5698SJohn Baldwin
57bc3d5698SJohn Baldwin	// (*)	mul	x12,x13,x15	// np[0]*m1
58bc3d5698SJohn Baldwin	umulh	x13,x13,x15
59bc3d5698SJohn Baldwin	mul	x16,x14,x15		// np[1]*m1
60bc3d5698SJohn Baldwin	// (*)	adds	x12,x12,x6	// discarded
61bc3d5698SJohn Baldwin	// (*)	As for removal of first multiplication and addition
62bc3d5698SJohn Baldwin	//	instructions. The outcome of first addition is
63bc3d5698SJohn Baldwin	//	guaranteed to be zero, which leaves two computationally
64bc3d5698SJohn Baldwin	//	significant outcomes: it either carries or not. Then
65bc3d5698SJohn Baldwin	//	question is when does it carry? Is there alternative
66bc3d5698SJohn Baldwin	//	way to deduce it? If you follow operations, you can
67bc3d5698SJohn Baldwin	//	observe that condition for carry is quite simple:
68bc3d5698SJohn Baldwin	//	x6 being non-zero. So that carry can be calculated
69bc3d5698SJohn Baldwin	//	by adding -1 to x6. That's what next instruction does.
70bc3d5698SJohn Baldwin	subs	xzr,x6,#1		// (*)
71bc3d5698SJohn Baldwin	umulh	x17,x14,x15
72bc3d5698SJohn Baldwin	adc	x13,x13,xzr
73bc3d5698SJohn Baldwin	cbz	x21,.L1st_skip
74bc3d5698SJohn Baldwin
75bc3d5698SJohn Baldwin.L1st:
76bc3d5698SJohn Baldwin	ldr	x8,[x1],#8
77bc3d5698SJohn Baldwin	adds	x6,x10,x7
78bc3d5698SJohn Baldwin	sub	x21,x21,#8		// j--
79bc3d5698SJohn Baldwin	adc	x7,x11,xzr
80bc3d5698SJohn Baldwin
81bc3d5698SJohn Baldwin	ldr	x14,[x3],#8
82bc3d5698SJohn Baldwin	adds	x12,x16,x13
83bc3d5698SJohn Baldwin	mul	x10,x8,x9		// ap[j]*bp[0]
84bc3d5698SJohn Baldwin	adc	x13,x17,xzr
85bc3d5698SJohn Baldwin	umulh	x11,x8,x9
86bc3d5698SJohn Baldwin
87bc3d5698SJohn Baldwin	adds	x12,x12,x6
88bc3d5698SJohn Baldwin	mul	x16,x14,x15		// np[j]*m1
89bc3d5698SJohn Baldwin	adc	x13,x13,xzr
90bc3d5698SJohn Baldwin	umulh	x17,x14,x15
91bc3d5698SJohn Baldwin	str	x12,[x22],#8		// tp[j-1]
92bc3d5698SJohn Baldwin	cbnz	x21,.L1st
93bc3d5698SJohn Baldwin
94bc3d5698SJohn Baldwin.L1st_skip:
95bc3d5698SJohn Baldwin	adds	x6,x10,x7
96bc3d5698SJohn Baldwin	sub	x1,x1,x5		// rewind x1
97bc3d5698SJohn Baldwin	adc	x7,x11,xzr
98bc3d5698SJohn Baldwin
99bc3d5698SJohn Baldwin	adds	x12,x16,x13
100bc3d5698SJohn Baldwin	sub	x3,x3,x5		// rewind x3
101bc3d5698SJohn Baldwin	adc	x13,x17,xzr
102bc3d5698SJohn Baldwin
103bc3d5698SJohn Baldwin	adds	x12,x12,x6
104bc3d5698SJohn Baldwin	sub	x20,x5,#8		// i=num-1
105bc3d5698SJohn Baldwin	adcs	x13,x13,x7
106bc3d5698SJohn Baldwin
107bc3d5698SJohn Baldwin	adc	x19,xzr,xzr		// upmost overflow bit
108bc3d5698SJohn Baldwin	stp	x12,x13,[x22]
109bc3d5698SJohn Baldwin
110bc3d5698SJohn Baldwin.Louter:
111bc3d5698SJohn Baldwin	ldr	x9,[x2],#8		// bp[i]
112bc3d5698SJohn Baldwin	ldp	x7,x8,[x1],#16
113bc3d5698SJohn Baldwin	ldr	x23,[sp]		// tp[0]
114bc3d5698SJohn Baldwin	add	x22,sp,#8
115bc3d5698SJohn Baldwin
116bc3d5698SJohn Baldwin	mul	x6,x7,x9		// ap[0]*bp[i]
117bc3d5698SJohn Baldwin	sub	x21,x5,#16		// j=num-2
118bc3d5698SJohn Baldwin	umulh	x7,x7,x9
119bc3d5698SJohn Baldwin	ldp	x13,x14,[x3],#16
120bc3d5698SJohn Baldwin	mul	x10,x8,x9		// ap[1]*bp[i]
121bc3d5698SJohn Baldwin	adds	x6,x6,x23
122bc3d5698SJohn Baldwin	umulh	x11,x8,x9
123bc3d5698SJohn Baldwin	adc	x7,x7,xzr
124bc3d5698SJohn Baldwin
125bc3d5698SJohn Baldwin	mul	x15,x6,x4
126bc3d5698SJohn Baldwin	sub	x20,x20,#8		// i--
127bc3d5698SJohn Baldwin
128bc3d5698SJohn Baldwin	// (*)	mul	x12,x13,x15	// np[0]*m1
129bc3d5698SJohn Baldwin	umulh	x13,x13,x15
130bc3d5698SJohn Baldwin	mul	x16,x14,x15		// np[1]*m1
131bc3d5698SJohn Baldwin	// (*)	adds	x12,x12,x6
132bc3d5698SJohn Baldwin	subs	xzr,x6,#1		// (*)
133bc3d5698SJohn Baldwin	umulh	x17,x14,x15
134bc3d5698SJohn Baldwin	cbz	x21,.Linner_skip
135bc3d5698SJohn Baldwin
136bc3d5698SJohn Baldwin.Linner:
137bc3d5698SJohn Baldwin	ldr	x8,[x1],#8
138bc3d5698SJohn Baldwin	adc	x13,x13,xzr
139bc3d5698SJohn Baldwin	ldr	x23,[x22],#8		// tp[j]
140bc3d5698SJohn Baldwin	adds	x6,x10,x7
141bc3d5698SJohn Baldwin	sub	x21,x21,#8		// j--
142bc3d5698SJohn Baldwin	adc	x7,x11,xzr
143bc3d5698SJohn Baldwin
144bc3d5698SJohn Baldwin	adds	x12,x16,x13
145bc3d5698SJohn Baldwin	ldr	x14,[x3],#8
146bc3d5698SJohn Baldwin	adc	x13,x17,xzr
147bc3d5698SJohn Baldwin
148bc3d5698SJohn Baldwin	mul	x10,x8,x9		// ap[j]*bp[i]
149bc3d5698SJohn Baldwin	adds	x6,x6,x23
150bc3d5698SJohn Baldwin	umulh	x11,x8,x9
151bc3d5698SJohn Baldwin	adc	x7,x7,xzr
152bc3d5698SJohn Baldwin
153bc3d5698SJohn Baldwin	mul	x16,x14,x15		// np[j]*m1
154bc3d5698SJohn Baldwin	adds	x12,x12,x6
155bc3d5698SJohn Baldwin	umulh	x17,x14,x15
156c0855eaaSJohn Baldwin	stur	x12,[x22,#-16]		// tp[j-1]
157bc3d5698SJohn Baldwin	cbnz	x21,.Linner
158bc3d5698SJohn Baldwin
159bc3d5698SJohn Baldwin.Linner_skip:
160bc3d5698SJohn Baldwin	ldr	x23,[x22],#8		// tp[j]
161bc3d5698SJohn Baldwin	adc	x13,x13,xzr
162bc3d5698SJohn Baldwin	adds	x6,x10,x7
163bc3d5698SJohn Baldwin	sub	x1,x1,x5		// rewind x1
164bc3d5698SJohn Baldwin	adc	x7,x11,xzr
165bc3d5698SJohn Baldwin
166bc3d5698SJohn Baldwin	adds	x12,x16,x13
167bc3d5698SJohn Baldwin	sub	x3,x3,x5		// rewind x3
168bc3d5698SJohn Baldwin	adcs	x13,x17,x19
169bc3d5698SJohn Baldwin	adc	x19,xzr,xzr
170bc3d5698SJohn Baldwin
171bc3d5698SJohn Baldwin	adds	x6,x6,x23
172bc3d5698SJohn Baldwin	adc	x7,x7,xzr
173bc3d5698SJohn Baldwin
174bc3d5698SJohn Baldwin	adds	x12,x12,x6
175bc3d5698SJohn Baldwin	adcs	x13,x13,x7
176bc3d5698SJohn Baldwin	adc	x19,x19,xzr		// upmost overflow bit
177bc3d5698SJohn Baldwin	stp	x12,x13,[x22,#-16]
178bc3d5698SJohn Baldwin
179bc3d5698SJohn Baldwin	cbnz	x20,.Louter
180bc3d5698SJohn Baldwin
181bc3d5698SJohn Baldwin	// Final step. We see if result is larger than modulus, and
182bc3d5698SJohn Baldwin	// if it is, subtract the modulus. But comparison implies
183bc3d5698SJohn Baldwin	// subtraction. So we subtract modulus, see if it borrowed,
184bc3d5698SJohn Baldwin	// and conditionally copy original value.
185bc3d5698SJohn Baldwin	ldr	x23,[sp]		// tp[0]
186bc3d5698SJohn Baldwin	add	x22,sp,#8
187bc3d5698SJohn Baldwin	ldr	x14,[x3],#8		// np[0]
188bc3d5698SJohn Baldwin	subs	x21,x5,#8		// j=num-1 and clear borrow
189bc3d5698SJohn Baldwin	mov	x1,x0
190bc3d5698SJohn Baldwin.Lsub:
191bc3d5698SJohn Baldwin	sbcs	x8,x23,x14		// tp[j]-np[j]
192bc3d5698SJohn Baldwin	ldr	x23,[x22],#8
193bc3d5698SJohn Baldwin	sub	x21,x21,#8		// j--
194bc3d5698SJohn Baldwin	ldr	x14,[x3],#8
195bc3d5698SJohn Baldwin	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
196bc3d5698SJohn Baldwin	cbnz	x21,.Lsub
197bc3d5698SJohn Baldwin
198bc3d5698SJohn Baldwin	sbcs	x8,x23,x14
199bc3d5698SJohn Baldwin	sbcs	x19,x19,xzr		// did it borrow?
200bc3d5698SJohn Baldwin	str	x8,[x1],#8		// rp[num-1]
201bc3d5698SJohn Baldwin
202bc3d5698SJohn Baldwin	ldr	x23,[sp]		// tp[0]
203bc3d5698SJohn Baldwin	add	x22,sp,#8
204bc3d5698SJohn Baldwin	ldr	x8,[x0],#8		// rp[0]
205bc3d5698SJohn Baldwin	sub	x5,x5,#8		// num--
206bc3d5698SJohn Baldwin	nop
207bc3d5698SJohn Baldwin.Lcond_copy:
208bc3d5698SJohn Baldwin	sub	x5,x5,#8		// num--
209bc3d5698SJohn Baldwin	csel	x14,x23,x8,lo		// did it borrow?
210bc3d5698SJohn Baldwin	ldr	x23,[x22],#8
211bc3d5698SJohn Baldwin	ldr	x8,[x0],#8
212c0855eaaSJohn Baldwin	stur	xzr,[x22,#-16]		// wipe tp
213c0855eaaSJohn Baldwin	stur	x14,[x0,#-16]
214bc3d5698SJohn Baldwin	cbnz	x5,.Lcond_copy
215bc3d5698SJohn Baldwin
216bc3d5698SJohn Baldwin	csel	x14,x23,x8,lo
217c0855eaaSJohn Baldwin	stur	xzr,[x22,#-8]		// wipe tp
218c0855eaaSJohn Baldwin	stur	x14,[x0,#-8]
219bc3d5698SJohn Baldwin
220bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
221bc3d5698SJohn Baldwin	mov	sp,x29
222bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
223bc3d5698SJohn Baldwin	mov	x0,#1
224bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
225bc3d5698SJohn Baldwin	ldr	x29,[sp],#64
226bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
227bc3d5698SJohn Baldwin	ret
228bc3d5698SJohn Baldwin.size	bn_mul_mont,.-bn_mul_mont
229c0855eaaSJohn Baldwin.type	bn_mul8x_mont_neon,%function
230c0855eaaSJohn Baldwin.align	5
231c0855eaaSJohn Baldwinbn_mul8x_mont_neon:
232bd9588bcSAndrew Turner	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
233bd9588bcSAndrew Turner	// only from bn_mul_mont which has already signed the return address.
234c0855eaaSJohn Baldwin	stp	x29,x30,[sp,#-80]!
235c0855eaaSJohn Baldwin	mov	x16,sp
236c0855eaaSJohn Baldwin	stp	d8,d9,[sp,#16]
237c0855eaaSJohn Baldwin	stp	d10,d11,[sp,#32]
238c0855eaaSJohn Baldwin	stp	d12,d13,[sp,#48]
239c0855eaaSJohn Baldwin	stp	d14,d15,[sp,#64]
240c0855eaaSJohn Baldwin	lsl	x5,x5,#1
241c0855eaaSJohn Baldwin	eor	v14.16b,v14.16b,v14.16b
242c0855eaaSJohn Baldwin
243c0855eaaSJohn Baldwin.align	4
244c0855eaaSJohn Baldwin.LNEON_8n:
245c0855eaaSJohn Baldwin	eor	v6.16b,v6.16b,v6.16b
246c0855eaaSJohn Baldwin	sub	x7,sp,#128
247c0855eaaSJohn Baldwin	eor	v7.16b,v7.16b,v7.16b
248c0855eaaSJohn Baldwin	sub	x7,x7,x5,lsl#4
249c0855eaaSJohn Baldwin	eor	v8.16b,v8.16b,v8.16b
250c0855eaaSJohn Baldwin	and	x7,x7,#-64
251c0855eaaSJohn Baldwin	eor	v9.16b,v9.16b,v9.16b
252c0855eaaSJohn Baldwin	mov	sp,x7		// alloca
253c0855eaaSJohn Baldwin	eor	v10.16b,v10.16b,v10.16b
254c0855eaaSJohn Baldwin	add	x7,x7,#256
255c0855eaaSJohn Baldwin	eor	v11.16b,v11.16b,v11.16b
256c0855eaaSJohn Baldwin	sub	x8,x5,#8
257c0855eaaSJohn Baldwin	eor	v12.16b,v12.16b,v12.16b
258c0855eaaSJohn Baldwin	eor	v13.16b,v13.16b,v13.16b
259c0855eaaSJohn Baldwin
260c0855eaaSJohn Baldwin.LNEON_8n_init:
261c0855eaaSJohn Baldwin	st1	{v6.2d,v7.2d},[x7],#32
262c0855eaaSJohn Baldwin	subs	x8,x8,#8
263c0855eaaSJohn Baldwin	st1	{v8.2d,v9.2d},[x7],#32
264c0855eaaSJohn Baldwin	st1	{v10.2d,v11.2d},[x7],#32
265c0855eaaSJohn Baldwin	st1	{v12.2d,v13.2d},[x7],#32
266c0855eaaSJohn Baldwin	bne	.LNEON_8n_init
267c0855eaaSJohn Baldwin
268c0855eaaSJohn Baldwin	add	x6,sp,#256
269c0855eaaSJohn Baldwin	ld1	{v0.4s,v1.4s},[x1],#32
270c0855eaaSJohn Baldwin	add	x10,sp,#8
271c0855eaaSJohn Baldwin	ldr	s30,[x4],#4
272c0855eaaSJohn Baldwin	mov	x9,x5
273c0855eaaSJohn Baldwin	b	.LNEON_8n_outer
274c0855eaaSJohn Baldwin
275c0855eaaSJohn Baldwin.align	4
276c0855eaaSJohn Baldwin.LNEON_8n_outer:
277c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
278c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
279c0855eaaSJohn Baldwin	add	x7,sp,#128
280c0855eaaSJohn Baldwin	ld1	{v2.4s,v3.4s},[x3],#32
281c0855eaaSJohn Baldwin
282c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[0]
283c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[1]
284c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[2]
285c0855eaaSJohn Baldwin	shl	v29.2d,v6.2d,#16
286c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
287c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[3]
288c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v6.2d
289c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[0]
290c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
291c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[1]
292c0855eaaSJohn Baldwin	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
293c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[2]
294c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
295c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[3]
296c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
297c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[0]
298c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[1]
299c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
300c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[2]
301c0855eaaSJohn Baldwin	ushr	v15.2d,v6.2d,#16
302c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[3]
303c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[0]
304c0855eaaSJohn Baldwin	ext	v6.16b,v6.16b,v6.16b,#8
305c0855eaaSJohn Baldwin	add	v6.2d,v6.2d,v15.2d
306c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[1]
307c0855eaaSJohn Baldwin	ushr	v6.2d,v6.2d,#16
308c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[2]
309c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[3]
310c0855eaaSJohn Baldwin	add	v16.2d,v7.2d,v6.2d
311c0855eaaSJohn Baldwin	ins	v7.d[0],v16.d[0]
312c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
313c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[0]
314c0855eaaSJohn Baldwin	ld1	{v6.2d},[x6],#16
315c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[1]
316c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[2]
317c0855eaaSJohn Baldwin	shl	v29.2d,v7.2d,#16
318c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
319c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[3]
320c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v7.2d
321c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[0]
322c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
323c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[1]
324c0855eaaSJohn Baldwin	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
325c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[2]
326c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
327c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[3]
328c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
329c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[0]
330c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[1]
331c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
332c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[2]
333c0855eaaSJohn Baldwin	ushr	v15.2d,v7.2d,#16
334c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[3]
335c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[0]
336c0855eaaSJohn Baldwin	ext	v7.16b,v7.16b,v7.16b,#8
337c0855eaaSJohn Baldwin	add	v7.2d,v7.2d,v15.2d
338c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[1]
339c0855eaaSJohn Baldwin	ushr	v7.2d,v7.2d,#16
340c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[2]
341c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[3]
342c0855eaaSJohn Baldwin	add	v16.2d,v8.2d,v7.2d
343c0855eaaSJohn Baldwin	ins	v8.d[0],v16.d[0]
344c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
345c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[0]
346c0855eaaSJohn Baldwin	ld1	{v7.2d},[x6],#16
347c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[1]
348c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[2]
349c0855eaaSJohn Baldwin	shl	v29.2d,v8.2d,#16
350c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
351c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[3]
352c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v8.2d
353c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[0]
354c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
355c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[1]
356c0855eaaSJohn Baldwin	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
357c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[2]
358c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
359c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[3]
360c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
361c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[0]
362c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[1]
363c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
364c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[2]
365c0855eaaSJohn Baldwin	ushr	v15.2d,v8.2d,#16
366c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[3]
367c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[0]
368c0855eaaSJohn Baldwin	ext	v8.16b,v8.16b,v8.16b,#8
369c0855eaaSJohn Baldwin	add	v8.2d,v8.2d,v15.2d
370c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[1]
371c0855eaaSJohn Baldwin	ushr	v8.2d,v8.2d,#16
372c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[2]
373c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[3]
374c0855eaaSJohn Baldwin	add	v16.2d,v9.2d,v8.2d
375c0855eaaSJohn Baldwin	ins	v9.d[0],v16.d[0]
376c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
377c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[0]
378c0855eaaSJohn Baldwin	ld1	{v8.2d},[x6],#16
379c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[1]
380c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[2]
381c0855eaaSJohn Baldwin	shl	v29.2d,v9.2d,#16
382c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
383c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[3]
384c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v9.2d
385c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[0]
386c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
387c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[1]
388c0855eaaSJohn Baldwin	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
389c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[2]
390c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
391c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[3]
392c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
393c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[0]
394c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[1]
395c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
396c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[2]
397c0855eaaSJohn Baldwin	ushr	v15.2d,v9.2d,#16
398c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[3]
399c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[0]
400c0855eaaSJohn Baldwin	ext	v9.16b,v9.16b,v9.16b,#8
401c0855eaaSJohn Baldwin	add	v9.2d,v9.2d,v15.2d
402c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[1]
403c0855eaaSJohn Baldwin	ushr	v9.2d,v9.2d,#16
404c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[2]
405c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[3]
406c0855eaaSJohn Baldwin	add	v16.2d,v10.2d,v9.2d
407c0855eaaSJohn Baldwin	ins	v10.d[0],v16.d[0]
408c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
409c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[0]
410c0855eaaSJohn Baldwin	ld1	{v9.2d},[x6],#16
411c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[1]
412c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[2]
413c0855eaaSJohn Baldwin	shl	v29.2d,v10.2d,#16
414c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
415c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[3]
416c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v10.2d
417c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[0]
418c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
419c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[1]
420c0855eaaSJohn Baldwin	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
421c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[2]
422c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
423c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[3]
424c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
425c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[0]
426c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[1]
427c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
428c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[2]
429c0855eaaSJohn Baldwin	ushr	v15.2d,v10.2d,#16
430c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[3]
431c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[0]
432c0855eaaSJohn Baldwin	ext	v10.16b,v10.16b,v10.16b,#8
433c0855eaaSJohn Baldwin	add	v10.2d,v10.2d,v15.2d
434c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[1]
435c0855eaaSJohn Baldwin	ushr	v10.2d,v10.2d,#16
436c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[2]
437c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[3]
438c0855eaaSJohn Baldwin	add	v16.2d,v11.2d,v10.2d
439c0855eaaSJohn Baldwin	ins	v11.d[0],v16.d[0]
440c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
441c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[0]
442c0855eaaSJohn Baldwin	ld1	{v10.2d},[x6],#16
443c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[1]
444c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[2]
445c0855eaaSJohn Baldwin	shl	v29.2d,v11.2d,#16
446c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
447c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[3]
448c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v11.2d
449c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[0]
450c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
451c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[1]
452c0855eaaSJohn Baldwin	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
453c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[2]
454c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
455c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[3]
456c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
457c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[0]
458c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[1]
459c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
460c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[2]
461c0855eaaSJohn Baldwin	ushr	v15.2d,v11.2d,#16
462c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[3]
463c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[0]
464c0855eaaSJohn Baldwin	ext	v11.16b,v11.16b,v11.16b,#8
465c0855eaaSJohn Baldwin	add	v11.2d,v11.2d,v15.2d
466c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[1]
467c0855eaaSJohn Baldwin	ushr	v11.2d,v11.2d,#16
468c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[2]
469c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[3]
470c0855eaaSJohn Baldwin	add	v16.2d,v12.2d,v11.2d
471c0855eaaSJohn Baldwin	ins	v12.d[0],v16.d[0]
472c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
473c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[0]
474c0855eaaSJohn Baldwin	ld1	{v11.2d},[x6],#16
475c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[1]
476c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[2]
477c0855eaaSJohn Baldwin	shl	v29.2d,v12.2d,#16
478c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
479c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[3]
480c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v12.2d
481c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[0]
482c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
483c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[1]
484c0855eaaSJohn Baldwin	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
485c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[2]
486c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
487c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[3]
488c0855eaaSJohn Baldwin	ldr	s28,[x2],#4   // *b++
489c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[0]
490c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[1]
491c0855eaaSJohn Baldwin	uxtl	v28.4s,v28.4h
492c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[2]
493c0855eaaSJohn Baldwin	ushr	v15.2d,v12.2d,#16
494c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[3]
495c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[0]
496c0855eaaSJohn Baldwin	ext	v12.16b,v12.16b,v12.16b,#8
497c0855eaaSJohn Baldwin	add	v12.2d,v12.2d,v15.2d
498c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[1]
499c0855eaaSJohn Baldwin	ushr	v12.2d,v12.2d,#16
500c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[2]
501c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[3]
502c0855eaaSJohn Baldwin	add	v16.2d,v13.2d,v12.2d
503c0855eaaSJohn Baldwin	ins	v13.d[0],v16.d[0]
504c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
505c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[0]
506c0855eaaSJohn Baldwin	ld1	{v12.2d},[x6],#16
507c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[1]
508c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[2]
509c0855eaaSJohn Baldwin	shl	v29.2d,v13.2d,#16
510c0855eaaSJohn Baldwin	ext	v29.16b,v29.16b,v29.16b,#8
511c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[3]
512c0855eaaSJohn Baldwin	add	v29.2d,v29.2d,v13.2d
513c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[0]
514c0855eaaSJohn Baldwin	mul	v29.2s,v29.2s,v30.2s
515c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[1]
516c0855eaaSJohn Baldwin	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
517c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[2]
518c0855eaaSJohn Baldwin	uxtl	v29.4s,v29.4h
519c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[3]
520c0855eaaSJohn Baldwin	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
521c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[0]
522c0855eaaSJohn Baldwin	ld1	{v0.4s,v1.4s},[x1],#32
523c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[1]
524c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[2]
525c0855eaaSJohn Baldwin	mov	v5.16b,v13.16b
526c0855eaaSJohn Baldwin	ushr	v5.2d,v5.2d,#16
527c0855eaaSJohn Baldwin	ext	v13.16b,v13.16b,v13.16b,#8
528c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[3]
529c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[0]
530c0855eaaSJohn Baldwin	add	v13.2d,v13.2d,v5.2d
531c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[1]
532c0855eaaSJohn Baldwin	ushr	v13.2d,v13.2d,#16
533c0855eaaSJohn Baldwin	eor	v15.16b,v15.16b,v15.16b
534c0855eaaSJohn Baldwin	ins	v13.d[1],v15.d[0]
535c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[2]
536c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[3]
537c0855eaaSJohn Baldwin	add	v6.2d,v6.2d,v13.2d
538c0855eaaSJohn Baldwin	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
539c0855eaaSJohn Baldwin	add	x10,sp,#8		// rewind
540c0855eaaSJohn Baldwin	sub	x8,x5,#8
541c0855eaaSJohn Baldwin	b	.LNEON_8n_inner
542c0855eaaSJohn Baldwin
543c0855eaaSJohn Baldwin.align	4
544c0855eaaSJohn Baldwin.LNEON_8n_inner:
545c0855eaaSJohn Baldwin	subs	x8,x8,#8
546c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[0]
547c0855eaaSJohn Baldwin	ld1	{v13.2d},[x6]
548c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[1]
549c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
550c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[2]
551c0855eaaSJohn Baldwin	ld1	{v2.4s,v3.4s},[x3],#32
552c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[3]
553c0855eaaSJohn Baldwin	b.eq	.LInner_jump
554c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
555c0855eaaSJohn Baldwin.LInner_jump:
556c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[0]
557c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[1]
558c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[2]
559c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[3]
560c0855eaaSJohn Baldwin	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
561c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[0]
562c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[1]
563c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[2]
564c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[3]
565c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[0]
566c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[1]
567c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[2]
568c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[3]
569c0855eaaSJohn Baldwin	st1	{v6.2d},[x7],#16
570c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[0]
571c0855eaaSJohn Baldwin	ld1	{v6.2d},[x6]
572c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[1]
573c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
574c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[2]
575c0855eaaSJohn Baldwin	b.eq	.LInner_jump1
576c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
577c0855eaaSJohn Baldwin.LInner_jump1:
578c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[3]
579c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[0]
580c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[1]
581c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[2]
582c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[3]
583c0855eaaSJohn Baldwin	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
584c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[0]
585c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[1]
586c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[2]
587c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[3]
588c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[0]
589c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[1]
590c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[2]
591c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[3]
592c0855eaaSJohn Baldwin	st1	{v7.2d},[x7],#16
593c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[0]
594c0855eaaSJohn Baldwin	ld1	{v7.2d},[x6]
595c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[1]
596c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
597c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[2]
598c0855eaaSJohn Baldwin	b.eq	.LInner_jump2
599c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
600c0855eaaSJohn Baldwin.LInner_jump2:
601c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[3]
602c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[0]
603c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[1]
604c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[2]
605c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[3]
606c0855eaaSJohn Baldwin	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
607c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[0]
608c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[1]
609c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[2]
610c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[3]
611c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[0]
612c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[1]
613c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[2]
614c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[3]
615c0855eaaSJohn Baldwin	st1	{v8.2d},[x7],#16
616c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v0.s[0]
617c0855eaaSJohn Baldwin	ld1	{v8.2d},[x6]
618c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[1]
619c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
620c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[2]
621c0855eaaSJohn Baldwin	b.eq	.LInner_jump3
622c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
623c0855eaaSJohn Baldwin.LInner_jump3:
624c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[3]
625c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v1.s[0]
626c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[1]
627c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[2]
628c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[3]
629c0855eaaSJohn Baldwin	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
630c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v2.s[0]
631c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[1]
632c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[2]
633c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[3]
634c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v3.s[0]
635c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[1]
636c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[2]
637c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[3]
638c0855eaaSJohn Baldwin	st1	{v9.2d},[x7],#16
639c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v0.s[0]
640c0855eaaSJohn Baldwin	ld1	{v9.2d},[x6]
641c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[1]
642c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
643c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[2]
644c0855eaaSJohn Baldwin	b.eq	.LInner_jump4
645c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
646c0855eaaSJohn Baldwin.LInner_jump4:
647c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[3]
648c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v1.s[0]
649c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[1]
650c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[2]
651c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[3]
652c0855eaaSJohn Baldwin	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
653c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v2.s[0]
654c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[1]
655c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[2]
656c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[3]
657c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v3.s[0]
658c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[1]
659c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[2]
660c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[3]
661c0855eaaSJohn Baldwin	st1	{v10.2d},[x7],#16
662c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v0.s[0]
663c0855eaaSJohn Baldwin	ld1	{v10.2d},[x6]
664c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[1]
665c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
666c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[2]
667c0855eaaSJohn Baldwin	b.eq	.LInner_jump5
668c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
669c0855eaaSJohn Baldwin.LInner_jump5:
670c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[3]
671c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v1.s[0]
672c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[1]
673c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[2]
674c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[3]
675c0855eaaSJohn Baldwin	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
676c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v2.s[0]
677c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[1]
678c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[2]
679c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[3]
680c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v3.s[0]
681c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[1]
682c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[2]
683c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[3]
684c0855eaaSJohn Baldwin	st1	{v11.2d},[x7],#16
685c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v0.s[0]
686c0855eaaSJohn Baldwin	ld1	{v11.2d},[x6]
687c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[1]
688c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
689c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[2]
690c0855eaaSJohn Baldwin	b.eq	.LInner_jump6
691c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
692c0855eaaSJohn Baldwin.LInner_jump6:
693c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[3]
694c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v1.s[0]
695c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[1]
696c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[2]
697c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[3]
698c0855eaaSJohn Baldwin	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
699c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v2.s[0]
700c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[1]
701c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[2]
702c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[3]
703c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v3.s[0]
704c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[1]
705c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[2]
706c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[3]
707c0855eaaSJohn Baldwin	st1	{v12.2d},[x7],#16
708c0855eaaSJohn Baldwin	umlal	v13.2d,v28.2s,v0.s[0]
709c0855eaaSJohn Baldwin	ld1	{v12.2d},[x6]
710c0855eaaSJohn Baldwin	umlal	v6.2d,v28.2s,v0.s[1]
711c0855eaaSJohn Baldwin	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
712c0855eaaSJohn Baldwin	umlal	v7.2d,v28.2s,v0.s[2]
713c0855eaaSJohn Baldwin	b.eq	.LInner_jump7
714c0855eaaSJohn Baldwin	add	x6,x6,#16	// don't advance in last iteration
715c0855eaaSJohn Baldwin.LInner_jump7:
716c0855eaaSJohn Baldwin	umlal	v8.2d,v28.2s,v0.s[3]
717c0855eaaSJohn Baldwin	umlal	v9.2d,v28.2s,v1.s[0]
718c0855eaaSJohn Baldwin	umlal	v10.2d,v28.2s,v1.s[1]
719c0855eaaSJohn Baldwin	umlal	v11.2d,v28.2s,v1.s[2]
720c0855eaaSJohn Baldwin	umlal	v12.2d,v28.2s,v1.s[3]
721c0855eaaSJohn Baldwin	b.ne	.LInner_after_rewind8
722c0855eaaSJohn Baldwin	sub	x1,x1,x5,lsl#2	// rewind
723c0855eaaSJohn Baldwin.LInner_after_rewind8:
724c0855eaaSJohn Baldwin	umlal	v13.2d,v29.2s,v2.s[0]
725c0855eaaSJohn Baldwin	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
726c0855eaaSJohn Baldwin	umlal	v6.2d,v29.2s,v2.s[1]
727c0855eaaSJohn Baldwin	ld1	{v0.4s,v1.4s},[x1],#32
728c0855eaaSJohn Baldwin	umlal	v7.2d,v29.2s,v2.s[2]
729c0855eaaSJohn Baldwin	add	x10,sp,#8		// rewind
730c0855eaaSJohn Baldwin	umlal	v8.2d,v29.2s,v2.s[3]
731c0855eaaSJohn Baldwin	umlal	v9.2d,v29.2s,v3.s[0]
732c0855eaaSJohn Baldwin	umlal	v10.2d,v29.2s,v3.s[1]
733c0855eaaSJohn Baldwin	umlal	v11.2d,v29.2s,v3.s[2]
734c0855eaaSJohn Baldwin	st1	{v13.2d},[x7],#16
735c0855eaaSJohn Baldwin	umlal	v12.2d,v29.2s,v3.s[3]
736c0855eaaSJohn Baldwin
737c0855eaaSJohn Baldwin	bne	.LNEON_8n_inner
738c0855eaaSJohn Baldwin	add	x6,sp,#128
739c0855eaaSJohn Baldwin	st1	{v6.2d,v7.2d},[x7],#32
740c0855eaaSJohn Baldwin	eor	v2.16b,v2.16b,v2.16b	// v2
741c0855eaaSJohn Baldwin	st1	{v8.2d,v9.2d},[x7],#32
742c0855eaaSJohn Baldwin	eor	v3.16b,v3.16b,v3.16b	// v3
743c0855eaaSJohn Baldwin	st1	{v10.2d,v11.2d},[x7],#32
744c0855eaaSJohn Baldwin	st1	{v12.2d},[x7]
745c0855eaaSJohn Baldwin
746c0855eaaSJohn Baldwin	subs	x9,x9,#8
747c0855eaaSJohn Baldwin	ld1	{v6.2d,v7.2d},[x6],#32
748c0855eaaSJohn Baldwin	ld1	{v8.2d,v9.2d},[x6],#32
749c0855eaaSJohn Baldwin	ld1	{v10.2d,v11.2d},[x6],#32
750c0855eaaSJohn Baldwin	ld1	{v12.2d,v13.2d},[x6],#32
751c0855eaaSJohn Baldwin
752c0855eaaSJohn Baldwin	b.eq	.LInner_8n_jump_2steps
753c0855eaaSJohn Baldwin	sub	x3,x3,x5,lsl#2	// rewind
754c0855eaaSJohn Baldwin	b	.LNEON_8n_outer
755c0855eaaSJohn Baldwin
756c0855eaaSJohn Baldwin.LInner_8n_jump_2steps:
757c0855eaaSJohn Baldwin	add	x7,sp,#128
758c0855eaaSJohn Baldwin	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
759c0855eaaSJohn Baldwin	mov	v5.16b,v6.16b
760c0855eaaSJohn Baldwin	ushr	v15.2d,v6.2d,#16
761c0855eaaSJohn Baldwin	ext	v6.16b,v6.16b,v6.16b,#8
762c0855eaaSJohn Baldwin	st1	{v2.2d,v3.2d}, [sp],#32
763c0855eaaSJohn Baldwin	add	v6.2d,v6.2d,v15.2d
764c0855eaaSJohn Baldwin	st1	{v2.2d,v3.2d}, [sp],#32
765c0855eaaSJohn Baldwin	ushr	v15.2d,v6.2d,#16
766c0855eaaSJohn Baldwin	st1	{v2.2d,v3.2d}, [sp],#32
767c0855eaaSJohn Baldwin	zip1	v6.4h,v5.4h,v6.4h
768c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
769c0855eaaSJohn Baldwin
770c0855eaaSJohn Baldwin	mov	x8,x5
771c0855eaaSJohn Baldwin	b	.LNEON_tail_entry
772c0855eaaSJohn Baldwin
773c0855eaaSJohn Baldwin.align	4
774c0855eaaSJohn Baldwin.LNEON_tail:
775c0855eaaSJohn Baldwin	add	v6.2d,v6.2d,v15.2d
776c0855eaaSJohn Baldwin	mov	v5.16b,v6.16b
777c0855eaaSJohn Baldwin	ushr	v15.2d,v6.2d,#16
778c0855eaaSJohn Baldwin	ext	v6.16b,v6.16b,v6.16b,#8
779c0855eaaSJohn Baldwin	ld1	{v8.2d,v9.2d}, [x6],#32
780c0855eaaSJohn Baldwin	add	v6.2d,v6.2d,v15.2d
781c0855eaaSJohn Baldwin	ld1	{v10.2d,v11.2d}, [x6],#32
782c0855eaaSJohn Baldwin	ushr	v15.2d,v6.2d,#16
783c0855eaaSJohn Baldwin	ld1	{v12.2d,v13.2d}, [x6],#32
784c0855eaaSJohn Baldwin	zip1	v6.4h,v5.4h,v6.4h
785c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
786c0855eaaSJohn Baldwin
787c0855eaaSJohn Baldwin.LNEON_tail_entry:
788c0855eaaSJohn Baldwin	add	v7.2d,v7.2d,v15.2d
789c0855eaaSJohn Baldwin	st1	{v6.s}[0], [x7],#4
790c0855eaaSJohn Baldwin	ushr	v15.2d,v7.2d,#16
791c0855eaaSJohn Baldwin	mov	v5.16b,v7.16b
792c0855eaaSJohn Baldwin	ext	v7.16b,v7.16b,v7.16b,#8
793c0855eaaSJohn Baldwin	add	v7.2d,v7.2d,v15.2d
794c0855eaaSJohn Baldwin	ushr	v15.2d,v7.2d,#16
795c0855eaaSJohn Baldwin	zip1	v7.4h,v5.4h,v7.4h
796c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
797c0855eaaSJohn Baldwin	add	v8.2d,v8.2d,v15.2d
798c0855eaaSJohn Baldwin	st1	{v7.s}[0], [x7],#4
799c0855eaaSJohn Baldwin	ushr	v15.2d,v8.2d,#16
800c0855eaaSJohn Baldwin	mov	v5.16b,v8.16b
801c0855eaaSJohn Baldwin	ext	v8.16b,v8.16b,v8.16b,#8
802c0855eaaSJohn Baldwin	add	v8.2d,v8.2d,v15.2d
803c0855eaaSJohn Baldwin	ushr	v15.2d,v8.2d,#16
804c0855eaaSJohn Baldwin	zip1	v8.4h,v5.4h,v8.4h
805c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
806c0855eaaSJohn Baldwin	add	v9.2d,v9.2d,v15.2d
807c0855eaaSJohn Baldwin	st1	{v8.s}[0], [x7],#4
808c0855eaaSJohn Baldwin	ushr	v15.2d,v9.2d,#16
809c0855eaaSJohn Baldwin	mov	v5.16b,v9.16b
810c0855eaaSJohn Baldwin	ext	v9.16b,v9.16b,v9.16b,#8
811c0855eaaSJohn Baldwin	add	v9.2d,v9.2d,v15.2d
812c0855eaaSJohn Baldwin	ushr	v15.2d,v9.2d,#16
813c0855eaaSJohn Baldwin	zip1	v9.4h,v5.4h,v9.4h
814c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
815c0855eaaSJohn Baldwin	add	v10.2d,v10.2d,v15.2d
816c0855eaaSJohn Baldwin	st1	{v9.s}[0], [x7],#4
817c0855eaaSJohn Baldwin	ushr	v15.2d,v10.2d,#16
818c0855eaaSJohn Baldwin	mov	v5.16b,v10.16b
819c0855eaaSJohn Baldwin	ext	v10.16b,v10.16b,v10.16b,#8
820c0855eaaSJohn Baldwin	add	v10.2d,v10.2d,v15.2d
821c0855eaaSJohn Baldwin	ushr	v15.2d,v10.2d,#16
822c0855eaaSJohn Baldwin	zip1	v10.4h,v5.4h,v10.4h
823c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
824c0855eaaSJohn Baldwin	add	v11.2d,v11.2d,v15.2d
825c0855eaaSJohn Baldwin	st1	{v10.s}[0], [x7],#4
826c0855eaaSJohn Baldwin	ushr	v15.2d,v11.2d,#16
827c0855eaaSJohn Baldwin	mov	v5.16b,v11.16b
828c0855eaaSJohn Baldwin	ext	v11.16b,v11.16b,v11.16b,#8
829c0855eaaSJohn Baldwin	add	v11.2d,v11.2d,v15.2d
830c0855eaaSJohn Baldwin	ushr	v15.2d,v11.2d,#16
831c0855eaaSJohn Baldwin	zip1	v11.4h,v5.4h,v11.4h
832c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
833c0855eaaSJohn Baldwin	add	v12.2d,v12.2d,v15.2d
834c0855eaaSJohn Baldwin	st1	{v11.s}[0], [x7],#4
835c0855eaaSJohn Baldwin	ushr	v15.2d,v12.2d,#16
836c0855eaaSJohn Baldwin	mov	v5.16b,v12.16b
837c0855eaaSJohn Baldwin	ext	v12.16b,v12.16b,v12.16b,#8
838c0855eaaSJohn Baldwin	add	v12.2d,v12.2d,v15.2d
839c0855eaaSJohn Baldwin	ushr	v15.2d,v12.2d,#16
840c0855eaaSJohn Baldwin	zip1	v12.4h,v5.4h,v12.4h
841c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
842c0855eaaSJohn Baldwin	add	v13.2d,v13.2d,v15.2d
843c0855eaaSJohn Baldwin	st1	{v12.s}[0], [x7],#4
844c0855eaaSJohn Baldwin	ushr	v15.2d,v13.2d,#16
845c0855eaaSJohn Baldwin	mov	v5.16b,v13.16b
846c0855eaaSJohn Baldwin	ext	v13.16b,v13.16b,v13.16b,#8
847c0855eaaSJohn Baldwin	add	v13.2d,v13.2d,v15.2d
848c0855eaaSJohn Baldwin	ushr	v15.2d,v13.2d,#16
849c0855eaaSJohn Baldwin	zip1	v13.4h,v5.4h,v13.4h
850c0855eaaSJohn Baldwin	ins	v15.d[1],v14.d[0]
851c0855eaaSJohn Baldwin	ld1	{v6.2d,v7.2d}, [x6],#32
852c0855eaaSJohn Baldwin	subs	x8,x8,#8
853c0855eaaSJohn Baldwin	st1	{v13.s}[0], [x7],#4
854c0855eaaSJohn Baldwin	bne	.LNEON_tail
855c0855eaaSJohn Baldwin
856c0855eaaSJohn Baldwin	st1	{v15.s}[0], [x7],#4	// top-most bit
857c0855eaaSJohn Baldwin	sub	x3,x3,x5,lsl#2		// rewind x3
858c0855eaaSJohn Baldwin	subs	x1,sp,#0			// clear carry flag
859c0855eaaSJohn Baldwin	add	x2,sp,x5,lsl#2
860c0855eaaSJohn Baldwin
861c0855eaaSJohn Baldwin.LNEON_sub:
862c0855eaaSJohn Baldwin	ldp	w4,w5,[x1],#8
863c0855eaaSJohn Baldwin	ldp	w6,w7,[x1],#8
864c0855eaaSJohn Baldwin	ldp	w8,w9,[x3],#8
865c0855eaaSJohn Baldwin	ldp	w10,w11,[x3],#8
866c0855eaaSJohn Baldwin	sbcs	w8,w4,w8
867c0855eaaSJohn Baldwin	sbcs	w9,w5,w9
868c0855eaaSJohn Baldwin	sbcs	w10,w6,w10
869c0855eaaSJohn Baldwin	sbcs	w11,w7,w11
870c0855eaaSJohn Baldwin	sub	x17,x2,x1
871c0855eaaSJohn Baldwin	stp	w8,w9,[x0],#8
872c0855eaaSJohn Baldwin	stp	w10,w11,[x0],#8
873c0855eaaSJohn Baldwin	cbnz	x17,.LNEON_sub
874c0855eaaSJohn Baldwin
875c0855eaaSJohn Baldwin	ldr	w10, [x1]		// load top-most bit
876c0855eaaSJohn Baldwin	mov	x11,sp
877c0855eaaSJohn Baldwin	eor	v0.16b,v0.16b,v0.16b
878c0855eaaSJohn Baldwin	sub	x11,x2,x11		// this is num*4
879c0855eaaSJohn Baldwin	eor	v1.16b,v1.16b,v1.16b
880c0855eaaSJohn Baldwin	mov	x1,sp
881c0855eaaSJohn Baldwin	sub	x0,x0,x11		// rewind x0
882c0855eaaSJohn Baldwin	mov	x3,x2		// second 3/4th of frame
883c0855eaaSJohn Baldwin	sbcs	w10,w10,wzr		// result is carry flag
884c0855eaaSJohn Baldwin
885c0855eaaSJohn Baldwin.LNEON_copy_n_zap:
886c0855eaaSJohn Baldwin	ldp	w4,w5,[x1],#8
887c0855eaaSJohn Baldwin	ldp	w6,w7,[x1],#8
888c0855eaaSJohn Baldwin	ldp	w8,w9,[x0],#8
889c0855eaaSJohn Baldwin	ldp	w10,w11,[x0]
890c0855eaaSJohn Baldwin	sub	x0,x0,#8
891c0855eaaSJohn Baldwin	b.cs	.LCopy_1
892c0855eaaSJohn Baldwin	mov	w8,w4
893c0855eaaSJohn Baldwin	mov	w9,w5
894c0855eaaSJohn Baldwin	mov	w10,w6
895c0855eaaSJohn Baldwin	mov	w11,w7
896c0855eaaSJohn Baldwin.LCopy_1:
897c0855eaaSJohn Baldwin	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
898c0855eaaSJohn Baldwin	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
899c0855eaaSJohn Baldwin	ldp	w4,w5,[x1],#8
900c0855eaaSJohn Baldwin	ldp	w6,w7,[x1],#8
901c0855eaaSJohn Baldwin	stp	w8,w9,[x0],#8
902c0855eaaSJohn Baldwin	stp	w10,w11,[x0],#8
903c0855eaaSJohn Baldwin	sub	x1,x1,#32
904c0855eaaSJohn Baldwin	ldp	w8,w9,[x0],#8
905c0855eaaSJohn Baldwin	ldp	w10,w11,[x0]
906c0855eaaSJohn Baldwin	sub	x0,x0,#8
907c0855eaaSJohn Baldwin	b.cs	.LCopy_2
908c0855eaaSJohn Baldwin	mov	w8, w4
909c0855eaaSJohn Baldwin	mov	w9, w5
910c0855eaaSJohn Baldwin	mov	w10, w6
911c0855eaaSJohn Baldwin	mov	w11, w7
912c0855eaaSJohn Baldwin.LCopy_2:
913c0855eaaSJohn Baldwin	st1	{v0.2d,v1.2d}, [x1],#32		// wipe
914c0855eaaSJohn Baldwin	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
915c0855eaaSJohn Baldwin	sub	x17,x2,x1		// preserves carry
916c0855eaaSJohn Baldwin	stp	w8,w9,[x0],#8
917c0855eaaSJohn Baldwin	stp	w10,w11,[x0],#8
918c0855eaaSJohn Baldwin	cbnz	x17,.LNEON_copy_n_zap
919c0855eaaSJohn Baldwin
920c0855eaaSJohn Baldwin	mov	sp,x16
921c0855eaaSJohn Baldwin	ldp	d14,d15,[sp,#64]
922c0855eaaSJohn Baldwin	ldp	d12,d13,[sp,#48]
923c0855eaaSJohn Baldwin	ldp	d10,d11,[sp,#32]
924c0855eaaSJohn Baldwin	ldp	d8,d9,[sp,#16]
925c0855eaaSJohn Baldwin	ldr	x29,[sp],#80
926bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
927c0855eaaSJohn Baldwin	ret	// bx lr
928c0855eaaSJohn Baldwin
929c0855eaaSJohn Baldwin.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
930bc3d5698SJohn Baldwin.type	__bn_sqr8x_mont,%function
931bc3d5698SJohn Baldwin.align	5
932bc3d5698SJohn Baldwin__bn_sqr8x_mont:
933bc3d5698SJohn Baldwin	cmp	x1,x2
934bc3d5698SJohn Baldwin	b.ne	__bn_mul4x_mont
935bc3d5698SJohn Baldwin.Lsqr8x_mont:
936bd9588bcSAndrew Turner	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
937bd9588bcSAndrew Turner	// only from bn_mul_mont which has already signed the return address.
938bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-128]!
939bc3d5698SJohn Baldwin	add	x29,sp,#0
940bc3d5698SJohn Baldwin	stp	x19,x20,[sp,#16]
941bc3d5698SJohn Baldwin	stp	x21,x22,[sp,#32]
942bc3d5698SJohn Baldwin	stp	x23,x24,[sp,#48]
943bc3d5698SJohn Baldwin	stp	x25,x26,[sp,#64]
944bc3d5698SJohn Baldwin	stp	x27,x28,[sp,#80]
945bc3d5698SJohn Baldwin	stp	x0,x3,[sp,#96]	// offload rp and np
946bc3d5698SJohn Baldwin
947bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
948bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
949bc3d5698SJohn Baldwin	ldp	x10,x11,[x1,#8*4]
950bc3d5698SJohn Baldwin	ldp	x12,x13,[x1,#8*6]
951bc3d5698SJohn Baldwin
952bc3d5698SJohn Baldwin	sub	x2,sp,x5,lsl#4
953bc3d5698SJohn Baldwin	lsl	x5,x5,#3
954bc3d5698SJohn Baldwin	ldr	x4,[x4]		// *n0
955bc3d5698SJohn Baldwin	mov	sp,x2			// alloca
956bc3d5698SJohn Baldwin	sub	x27,x5,#8*8
957bc3d5698SJohn Baldwin	b	.Lsqr8x_zero_start
958bc3d5698SJohn Baldwin
959bc3d5698SJohn Baldwin.Lsqr8x_zero:
960bc3d5698SJohn Baldwin	sub	x27,x27,#8*8
961bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*0]
962bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*2]
963bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*4]
964bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*6]
965bc3d5698SJohn Baldwin.Lsqr8x_zero_start:
966bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*8]
967bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*10]
968bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*12]
969bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*14]
970bc3d5698SJohn Baldwin	add	x2,x2,#8*16
971bc3d5698SJohn Baldwin	cbnz	x27,.Lsqr8x_zero
972bc3d5698SJohn Baldwin
973bc3d5698SJohn Baldwin	add	x3,x1,x5
974bc3d5698SJohn Baldwin	add	x1,x1,#8*8
975bc3d5698SJohn Baldwin	mov	x19,xzr
976bc3d5698SJohn Baldwin	mov	x20,xzr
977bc3d5698SJohn Baldwin	mov	x21,xzr
978bc3d5698SJohn Baldwin	mov	x22,xzr
979bc3d5698SJohn Baldwin	mov	x23,xzr
980bc3d5698SJohn Baldwin	mov	x24,xzr
981bc3d5698SJohn Baldwin	mov	x25,xzr
982bc3d5698SJohn Baldwin	mov	x26,xzr
983bc3d5698SJohn Baldwin	mov	x2,sp
984bc3d5698SJohn Baldwin	str	x4,[x29,#112]		// offload n0
985bc3d5698SJohn Baldwin
986bc3d5698SJohn Baldwin	// Multiply everything but a[i]*a[i]
987bc3d5698SJohn Baldwin.align	4
988bc3d5698SJohn Baldwin.Lsqr8x_outer_loop:
989bc3d5698SJohn Baldwin        //                                                 a[1]a[0]	(i)
990bc3d5698SJohn Baldwin        //                                             a[2]a[0]
991bc3d5698SJohn Baldwin        //                                         a[3]a[0]
992bc3d5698SJohn Baldwin        //                                     a[4]a[0]
993bc3d5698SJohn Baldwin        //                                 a[5]a[0]
994bc3d5698SJohn Baldwin        //                             a[6]a[0]
995bc3d5698SJohn Baldwin        //                         a[7]a[0]
996bc3d5698SJohn Baldwin        //                                         a[2]a[1]		(ii)
997bc3d5698SJohn Baldwin        //                                     a[3]a[1]
998bc3d5698SJohn Baldwin        //                                 a[4]a[1]
999bc3d5698SJohn Baldwin        //                             a[5]a[1]
1000bc3d5698SJohn Baldwin        //                         a[6]a[1]
1001bc3d5698SJohn Baldwin        //                     a[7]a[1]
1002bc3d5698SJohn Baldwin        //                                 a[3]a[2]			(iii)
1003bc3d5698SJohn Baldwin        //                             a[4]a[2]
1004bc3d5698SJohn Baldwin        //                         a[5]a[2]
1005bc3d5698SJohn Baldwin        //                     a[6]a[2]
1006bc3d5698SJohn Baldwin        //                 a[7]a[2]
1007bc3d5698SJohn Baldwin        //                         a[4]a[3]				(iv)
1008bc3d5698SJohn Baldwin        //                     a[5]a[3]
1009bc3d5698SJohn Baldwin        //                 a[6]a[3]
1010bc3d5698SJohn Baldwin        //             a[7]a[3]
1011bc3d5698SJohn Baldwin        //                 a[5]a[4]					(v)
1012bc3d5698SJohn Baldwin        //             a[6]a[4]
1013bc3d5698SJohn Baldwin        //         a[7]a[4]
1014bc3d5698SJohn Baldwin        //         a[6]a[5]						(vi)
1015bc3d5698SJohn Baldwin        //     a[7]a[5]
1016bc3d5698SJohn Baldwin        // a[7]a[6]							(vii)
1017bc3d5698SJohn Baldwin
1018bc3d5698SJohn Baldwin	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
1019bc3d5698SJohn Baldwin	mul	x15,x8,x6
1020bc3d5698SJohn Baldwin	mul	x16,x9,x6
1021bc3d5698SJohn Baldwin	mul	x17,x10,x6
1022bc3d5698SJohn Baldwin	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
1023bc3d5698SJohn Baldwin	mul	x14,x11,x6
1024bc3d5698SJohn Baldwin	adcs	x21,x21,x15
1025bc3d5698SJohn Baldwin	mul	x15,x12,x6
1026bc3d5698SJohn Baldwin	adcs	x22,x22,x16
1027bc3d5698SJohn Baldwin	mul	x16,x13,x6
1028bc3d5698SJohn Baldwin	adcs	x23,x23,x17
1029bc3d5698SJohn Baldwin	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
1030bc3d5698SJohn Baldwin	adcs	x24,x24,x14
1031bc3d5698SJohn Baldwin	umulh	x14,x8,x6
1032bc3d5698SJohn Baldwin	adcs	x25,x25,x15
1033bc3d5698SJohn Baldwin	umulh	x15,x9,x6
1034bc3d5698SJohn Baldwin	adcs	x26,x26,x16
1035bc3d5698SJohn Baldwin	umulh	x16,x10,x6
1036bc3d5698SJohn Baldwin	stp	x19,x20,[x2],#8*2	// t[0..1]
1037bc3d5698SJohn Baldwin	adc	x19,xzr,xzr		// t[8]
1038bc3d5698SJohn Baldwin	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
1039bc3d5698SJohn Baldwin	umulh	x17,x11,x6
1040bc3d5698SJohn Baldwin	adcs	x22,x22,x14
1041bc3d5698SJohn Baldwin	umulh	x14,x12,x6
1042bc3d5698SJohn Baldwin	adcs	x23,x23,x15
1043bc3d5698SJohn Baldwin	umulh	x15,x13,x6
1044bc3d5698SJohn Baldwin	adcs	x24,x24,x16
1045bc3d5698SJohn Baldwin	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
1046bc3d5698SJohn Baldwin	adcs	x25,x25,x17
1047bc3d5698SJohn Baldwin	mul	x17,x9,x7
1048bc3d5698SJohn Baldwin	adcs	x26,x26,x14
1049bc3d5698SJohn Baldwin	mul	x14,x10,x7
1050bc3d5698SJohn Baldwin	adc	x19,x19,x15
1051bc3d5698SJohn Baldwin
1052bc3d5698SJohn Baldwin	mul	x15,x11,x7
1053bc3d5698SJohn Baldwin	adds	x22,x22,x16
1054bc3d5698SJohn Baldwin	mul	x16,x12,x7
1055bc3d5698SJohn Baldwin	adcs	x23,x23,x17
1056bc3d5698SJohn Baldwin	mul	x17,x13,x7
1057bc3d5698SJohn Baldwin	adcs	x24,x24,x14
1058bc3d5698SJohn Baldwin	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
1059bc3d5698SJohn Baldwin	adcs	x25,x25,x15
1060bc3d5698SJohn Baldwin	umulh	x15,x9,x7
1061bc3d5698SJohn Baldwin	adcs	x26,x26,x16
1062bc3d5698SJohn Baldwin	umulh	x16,x10,x7
1063bc3d5698SJohn Baldwin	adcs	x19,x19,x17
1064bc3d5698SJohn Baldwin	umulh	x17,x11,x7
1065bc3d5698SJohn Baldwin	stp	x21,x22,[x2],#8*2	// t[2..3]
1066bc3d5698SJohn Baldwin	adc	x20,xzr,xzr		// t[9]
1067bc3d5698SJohn Baldwin	adds	x23,x23,x14
1068bc3d5698SJohn Baldwin	umulh	x14,x12,x7
1069bc3d5698SJohn Baldwin	adcs	x24,x24,x15
1070bc3d5698SJohn Baldwin	umulh	x15,x13,x7
1071bc3d5698SJohn Baldwin	adcs	x25,x25,x16
1072bc3d5698SJohn Baldwin	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
1073bc3d5698SJohn Baldwin	adcs	x26,x26,x17
1074bc3d5698SJohn Baldwin	mul	x17,x10,x8
1075bc3d5698SJohn Baldwin	adcs	x19,x19,x14
1076bc3d5698SJohn Baldwin	mul	x14,x11,x8
1077bc3d5698SJohn Baldwin	adc	x20,x20,x15
1078bc3d5698SJohn Baldwin
1079bc3d5698SJohn Baldwin	mul	x15,x12,x8
1080bc3d5698SJohn Baldwin	adds	x24,x24,x16
1081bc3d5698SJohn Baldwin	mul	x16,x13,x8
1082bc3d5698SJohn Baldwin	adcs	x25,x25,x17
1083bc3d5698SJohn Baldwin	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
1084bc3d5698SJohn Baldwin	adcs	x26,x26,x14
1085bc3d5698SJohn Baldwin	umulh	x14,x10,x8
1086bc3d5698SJohn Baldwin	adcs	x19,x19,x15
1087bc3d5698SJohn Baldwin	umulh	x15,x11,x8
1088bc3d5698SJohn Baldwin	adcs	x20,x20,x16
1089bc3d5698SJohn Baldwin	umulh	x16,x12,x8
1090bc3d5698SJohn Baldwin	stp	x23,x24,[x2],#8*2	// t[4..5]
1091bc3d5698SJohn Baldwin	adc	x21,xzr,xzr		// t[10]
1092bc3d5698SJohn Baldwin	adds	x25,x25,x17
1093bc3d5698SJohn Baldwin	umulh	x17,x13,x8
1094bc3d5698SJohn Baldwin	adcs	x26,x26,x14
1095bc3d5698SJohn Baldwin	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
1096bc3d5698SJohn Baldwin	adcs	x19,x19,x15
1097bc3d5698SJohn Baldwin	mul	x15,x11,x9
1098bc3d5698SJohn Baldwin	adcs	x20,x20,x16
1099bc3d5698SJohn Baldwin	mul	x16,x12,x9
1100bc3d5698SJohn Baldwin	adc	x21,x21,x17
1101bc3d5698SJohn Baldwin
1102bc3d5698SJohn Baldwin	mul	x17,x13,x9
1103bc3d5698SJohn Baldwin	adds	x26,x26,x14
1104bc3d5698SJohn Baldwin	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
1105bc3d5698SJohn Baldwin	adcs	x19,x19,x15
1106bc3d5698SJohn Baldwin	umulh	x15,x11,x9
1107bc3d5698SJohn Baldwin	adcs	x20,x20,x16
1108bc3d5698SJohn Baldwin	umulh	x16,x12,x9
1109bc3d5698SJohn Baldwin	adcs	x21,x21,x17
1110bc3d5698SJohn Baldwin	umulh	x17,x13,x9
1111bc3d5698SJohn Baldwin	stp	x25,x26,[x2],#8*2	// t[6..7]
1112bc3d5698SJohn Baldwin	adc	x22,xzr,xzr		// t[11]
1113bc3d5698SJohn Baldwin	adds	x19,x19,x14
1114bc3d5698SJohn Baldwin	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
1115bc3d5698SJohn Baldwin	adcs	x20,x20,x15
1116bc3d5698SJohn Baldwin	mul	x15,x12,x10
1117bc3d5698SJohn Baldwin	adcs	x21,x21,x16
1118bc3d5698SJohn Baldwin	mul	x16,x13,x10
1119bc3d5698SJohn Baldwin	adc	x22,x22,x17
1120bc3d5698SJohn Baldwin
1121bc3d5698SJohn Baldwin	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
1122bc3d5698SJohn Baldwin	adds	x20,x20,x14
1123bc3d5698SJohn Baldwin	umulh	x14,x12,x10
1124bc3d5698SJohn Baldwin	adcs	x21,x21,x15
1125bc3d5698SJohn Baldwin	umulh	x15,x13,x10
1126bc3d5698SJohn Baldwin	adcs	x22,x22,x16
1127bc3d5698SJohn Baldwin	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
1128bc3d5698SJohn Baldwin	adc	x23,xzr,xzr		// t[12]
1129bc3d5698SJohn Baldwin	adds	x21,x21,x17
1130bc3d5698SJohn Baldwin	mul	x17,x13,x11
1131bc3d5698SJohn Baldwin	adcs	x22,x22,x14
1132bc3d5698SJohn Baldwin	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
1133bc3d5698SJohn Baldwin	adc	x23,x23,x15
1134bc3d5698SJohn Baldwin
1135bc3d5698SJohn Baldwin	umulh	x15,x13,x11
1136bc3d5698SJohn Baldwin	adds	x22,x22,x16
1137bc3d5698SJohn Baldwin	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
1138bc3d5698SJohn Baldwin	adcs	x23,x23,x17
1139bc3d5698SJohn Baldwin	umulh	x17,x13,x12		// hi(a[7]*a[6])
1140bc3d5698SJohn Baldwin	adc	x24,xzr,xzr		// t[13]
1141bc3d5698SJohn Baldwin	adds	x23,x23,x14
1142bc3d5698SJohn Baldwin	sub	x27,x3,x1	// done yet?
1143bc3d5698SJohn Baldwin	adc	x24,x24,x15
1144bc3d5698SJohn Baldwin
1145bc3d5698SJohn Baldwin	adds	x24,x24,x16
1146bc3d5698SJohn Baldwin	sub	x14,x3,x5	// rewinded ap
1147bc3d5698SJohn Baldwin	adc	x25,xzr,xzr		// t[14]
1148bc3d5698SJohn Baldwin	add	x25,x25,x17
1149bc3d5698SJohn Baldwin
1150bc3d5698SJohn Baldwin	cbz	x27,.Lsqr8x_outer_break
1151bc3d5698SJohn Baldwin
1152bc3d5698SJohn Baldwin	mov	x4,x6
1153bc3d5698SJohn Baldwin	ldp	x6,x7,[x2,#8*0]
1154bc3d5698SJohn Baldwin	ldp	x8,x9,[x2,#8*2]
1155bc3d5698SJohn Baldwin	ldp	x10,x11,[x2,#8*4]
1156bc3d5698SJohn Baldwin	ldp	x12,x13,[x2,#8*6]
1157bc3d5698SJohn Baldwin	adds	x19,x19,x6
1158bc3d5698SJohn Baldwin	adcs	x20,x20,x7
1159bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1160bc3d5698SJohn Baldwin	adcs	x21,x21,x8
1161bc3d5698SJohn Baldwin	adcs	x22,x22,x9
1162bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1163bc3d5698SJohn Baldwin	adcs	x23,x23,x10
1164bc3d5698SJohn Baldwin	adcs	x24,x24,x11
1165bc3d5698SJohn Baldwin	ldp	x10,x11,[x1,#8*4]
1166bc3d5698SJohn Baldwin	adcs	x25,x25,x12
1167bc3d5698SJohn Baldwin	mov	x0,x1
1168bc3d5698SJohn Baldwin	adcs	x26,xzr,x13
1169bc3d5698SJohn Baldwin	ldp	x12,x13,[x1,#8*6]
1170bc3d5698SJohn Baldwin	add	x1,x1,#8*8
1171bc3d5698SJohn Baldwin	//adc	x28,xzr,xzr		// moved below
1172bc3d5698SJohn Baldwin	mov	x27,#-8*8
1173bc3d5698SJohn Baldwin
1174bc3d5698SJohn Baldwin	//                                                         a[8]a[0]
1175bc3d5698SJohn Baldwin	//                                                     a[9]a[0]
1176bc3d5698SJohn Baldwin	//                                                 a[a]a[0]
1177bc3d5698SJohn Baldwin	//                                             a[b]a[0]
1178bc3d5698SJohn Baldwin	//                                         a[c]a[0]
1179bc3d5698SJohn Baldwin	//                                     a[d]a[0]
1180bc3d5698SJohn Baldwin	//                                 a[e]a[0]
1181bc3d5698SJohn Baldwin	//                             a[f]a[0]
1182bc3d5698SJohn Baldwin	//                                                     a[8]a[1]
1183bc3d5698SJohn Baldwin	//                         a[f]a[1]........................
1184bc3d5698SJohn Baldwin	//                                                 a[8]a[2]
1185bc3d5698SJohn Baldwin	//                     a[f]a[2]........................
1186bc3d5698SJohn Baldwin	//                                             a[8]a[3]
1187bc3d5698SJohn Baldwin	//                 a[f]a[3]........................
1188bc3d5698SJohn Baldwin	//                                         a[8]a[4]
1189bc3d5698SJohn Baldwin	//             a[f]a[4]........................
1190bc3d5698SJohn Baldwin	//                                     a[8]a[5]
1191bc3d5698SJohn Baldwin	//         a[f]a[5]........................
1192bc3d5698SJohn Baldwin	//                                 a[8]a[6]
1193bc3d5698SJohn Baldwin	//     a[f]a[6]........................
1194bc3d5698SJohn Baldwin	//                             a[8]a[7]
1195bc3d5698SJohn Baldwin	// a[f]a[7]........................
1196bc3d5698SJohn Baldwin.Lsqr8x_mul:
1197bc3d5698SJohn Baldwin	mul	x14,x6,x4
1198bc3d5698SJohn Baldwin	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1199bc3d5698SJohn Baldwin	mul	x15,x7,x4
1200bc3d5698SJohn Baldwin	add	x27,x27,#8
1201bc3d5698SJohn Baldwin	mul	x16,x8,x4
1202bc3d5698SJohn Baldwin	mul	x17,x9,x4
1203bc3d5698SJohn Baldwin	adds	x19,x19,x14
1204bc3d5698SJohn Baldwin	mul	x14,x10,x4
1205bc3d5698SJohn Baldwin	adcs	x20,x20,x15
1206bc3d5698SJohn Baldwin	mul	x15,x11,x4
1207bc3d5698SJohn Baldwin	adcs	x21,x21,x16
1208bc3d5698SJohn Baldwin	mul	x16,x12,x4
1209bc3d5698SJohn Baldwin	adcs	x22,x22,x17
1210bc3d5698SJohn Baldwin	mul	x17,x13,x4
1211bc3d5698SJohn Baldwin	adcs	x23,x23,x14
1212bc3d5698SJohn Baldwin	umulh	x14,x6,x4
1213bc3d5698SJohn Baldwin	adcs	x24,x24,x15
1214bc3d5698SJohn Baldwin	umulh	x15,x7,x4
1215bc3d5698SJohn Baldwin	adcs	x25,x25,x16
1216bc3d5698SJohn Baldwin	umulh	x16,x8,x4
1217bc3d5698SJohn Baldwin	adcs	x26,x26,x17
1218bc3d5698SJohn Baldwin	umulh	x17,x9,x4
1219bc3d5698SJohn Baldwin	adc	x28,x28,xzr
1220bc3d5698SJohn Baldwin	str	x19,[x2],#8
1221bc3d5698SJohn Baldwin	adds	x19,x20,x14
1222bc3d5698SJohn Baldwin	umulh	x14,x10,x4
1223bc3d5698SJohn Baldwin	adcs	x20,x21,x15
1224bc3d5698SJohn Baldwin	umulh	x15,x11,x4
1225bc3d5698SJohn Baldwin	adcs	x21,x22,x16
1226bc3d5698SJohn Baldwin	umulh	x16,x12,x4
1227bc3d5698SJohn Baldwin	adcs	x22,x23,x17
1228bc3d5698SJohn Baldwin	umulh	x17,x13,x4
1229bc3d5698SJohn Baldwin	ldr	x4,[x0,x27]
1230bc3d5698SJohn Baldwin	adcs	x23,x24,x14
1231bc3d5698SJohn Baldwin	adcs	x24,x25,x15
1232bc3d5698SJohn Baldwin	adcs	x25,x26,x16
1233bc3d5698SJohn Baldwin	adcs	x26,x28,x17
1234bc3d5698SJohn Baldwin	//adc	x28,xzr,xzr		// moved above
1235bc3d5698SJohn Baldwin	cbnz	x27,.Lsqr8x_mul
1236bc3d5698SJohn Baldwin					// note that carry flag is guaranteed
1237bc3d5698SJohn Baldwin					// to be zero at this point
1238bc3d5698SJohn Baldwin	cmp	x1,x3		// done yet?
1239bc3d5698SJohn Baldwin	b.eq	.Lsqr8x_break
1240bc3d5698SJohn Baldwin
1241bc3d5698SJohn Baldwin	ldp	x6,x7,[x2,#8*0]
1242bc3d5698SJohn Baldwin	ldp	x8,x9,[x2,#8*2]
1243bc3d5698SJohn Baldwin	ldp	x10,x11,[x2,#8*4]
1244bc3d5698SJohn Baldwin	ldp	x12,x13,[x2,#8*6]
1245bc3d5698SJohn Baldwin	adds	x19,x19,x6
1246c0855eaaSJohn Baldwin	ldur	x4,[x0,#-8*8]
1247bc3d5698SJohn Baldwin	adcs	x20,x20,x7
1248bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1249bc3d5698SJohn Baldwin	adcs	x21,x21,x8
1250bc3d5698SJohn Baldwin	adcs	x22,x22,x9
1251bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1252bc3d5698SJohn Baldwin	adcs	x23,x23,x10
1253bc3d5698SJohn Baldwin	adcs	x24,x24,x11
1254bc3d5698SJohn Baldwin	ldp	x10,x11,[x1,#8*4]
1255bc3d5698SJohn Baldwin	adcs	x25,x25,x12
1256bc3d5698SJohn Baldwin	mov	x27,#-8*8
1257bc3d5698SJohn Baldwin	adcs	x26,x26,x13
1258bc3d5698SJohn Baldwin	ldp	x12,x13,[x1,#8*6]
1259bc3d5698SJohn Baldwin	add	x1,x1,#8*8
1260bc3d5698SJohn Baldwin	//adc	x28,xzr,xzr		// moved above
1261bc3d5698SJohn Baldwin	b	.Lsqr8x_mul
1262bc3d5698SJohn Baldwin
1263bc3d5698SJohn Baldwin.align	4
1264bc3d5698SJohn Baldwin.Lsqr8x_break:
1265bc3d5698SJohn Baldwin	ldp	x6,x7,[x0,#8*0]
1266bc3d5698SJohn Baldwin	add	x1,x0,#8*8
1267bc3d5698SJohn Baldwin	ldp	x8,x9,[x0,#8*2]
1268bc3d5698SJohn Baldwin	sub	x14,x3,x1		// is it last iteration?
1269bc3d5698SJohn Baldwin	ldp	x10,x11,[x0,#8*4]
1270bc3d5698SJohn Baldwin	sub	x15,x2,x14
1271bc3d5698SJohn Baldwin	ldp	x12,x13,[x0,#8*6]
1272bc3d5698SJohn Baldwin	cbz	x14,.Lsqr8x_outer_loop
1273bc3d5698SJohn Baldwin
1274bc3d5698SJohn Baldwin	stp	x19,x20,[x2,#8*0]
1275bc3d5698SJohn Baldwin	ldp	x19,x20,[x15,#8*0]
1276bc3d5698SJohn Baldwin	stp	x21,x22,[x2,#8*2]
1277bc3d5698SJohn Baldwin	ldp	x21,x22,[x15,#8*2]
1278bc3d5698SJohn Baldwin	stp	x23,x24,[x2,#8*4]
1279bc3d5698SJohn Baldwin	ldp	x23,x24,[x15,#8*4]
1280bc3d5698SJohn Baldwin	stp	x25,x26,[x2,#8*6]
1281bc3d5698SJohn Baldwin	mov	x2,x15
1282bc3d5698SJohn Baldwin	ldp	x25,x26,[x15,#8*6]
1283bc3d5698SJohn Baldwin	b	.Lsqr8x_outer_loop
1284bc3d5698SJohn Baldwin
1285bc3d5698SJohn Baldwin.align	4
1286bc3d5698SJohn Baldwin.Lsqr8x_outer_break:
1287bc3d5698SJohn Baldwin	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1288bc3d5698SJohn Baldwin	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
1289bc3d5698SJohn Baldwin	ldp	x15,x16,[sp,#8*1]
1290bc3d5698SJohn Baldwin	ldp	x11,x13,[x14,#8*2]
1291bc3d5698SJohn Baldwin	add	x1,x14,#8*4
1292bc3d5698SJohn Baldwin	ldp	x17,x14,[sp,#8*3]
1293bc3d5698SJohn Baldwin
1294bc3d5698SJohn Baldwin	stp	x19,x20,[x2,#8*0]
1295bc3d5698SJohn Baldwin	mul	x19,x7,x7
1296bc3d5698SJohn Baldwin	stp	x21,x22,[x2,#8*2]
1297bc3d5698SJohn Baldwin	umulh	x7,x7,x7
1298bc3d5698SJohn Baldwin	stp	x23,x24,[x2,#8*4]
1299bc3d5698SJohn Baldwin	mul	x8,x9,x9
1300bc3d5698SJohn Baldwin	stp	x25,x26,[x2,#8*6]
1301bc3d5698SJohn Baldwin	mov	x2,sp
1302bc3d5698SJohn Baldwin	umulh	x9,x9,x9
1303bc3d5698SJohn Baldwin	adds	x20,x7,x15,lsl#1
1304bc3d5698SJohn Baldwin	extr	x15,x16,x15,#63
1305bc3d5698SJohn Baldwin	sub	x27,x5,#8*4
1306bc3d5698SJohn Baldwin
1307bc3d5698SJohn Baldwin.Lsqr4x_shift_n_add:
1308bc3d5698SJohn Baldwin	adcs	x21,x8,x15
1309bc3d5698SJohn Baldwin	extr	x16,x17,x16,#63
1310bc3d5698SJohn Baldwin	sub	x27,x27,#8*4
1311bc3d5698SJohn Baldwin	adcs	x22,x9,x16
1312bc3d5698SJohn Baldwin	ldp	x15,x16,[x2,#8*5]
1313bc3d5698SJohn Baldwin	mul	x10,x11,x11
1314bc3d5698SJohn Baldwin	ldp	x7,x9,[x1],#8*2
1315bc3d5698SJohn Baldwin	umulh	x11,x11,x11
1316bc3d5698SJohn Baldwin	mul	x12,x13,x13
1317bc3d5698SJohn Baldwin	umulh	x13,x13,x13
1318bc3d5698SJohn Baldwin	extr	x17,x14,x17,#63
1319bc3d5698SJohn Baldwin	stp	x19,x20,[x2,#8*0]
1320bc3d5698SJohn Baldwin	adcs	x23,x10,x17
1321bc3d5698SJohn Baldwin	extr	x14,x15,x14,#63
1322bc3d5698SJohn Baldwin	stp	x21,x22,[x2,#8*2]
1323bc3d5698SJohn Baldwin	adcs	x24,x11,x14
1324bc3d5698SJohn Baldwin	ldp	x17,x14,[x2,#8*7]
1325bc3d5698SJohn Baldwin	extr	x15,x16,x15,#63
1326bc3d5698SJohn Baldwin	adcs	x25,x12,x15
1327bc3d5698SJohn Baldwin	extr	x16,x17,x16,#63
1328bc3d5698SJohn Baldwin	adcs	x26,x13,x16
1329bc3d5698SJohn Baldwin	ldp	x15,x16,[x2,#8*9]
1330bc3d5698SJohn Baldwin	mul	x6,x7,x7
1331bc3d5698SJohn Baldwin	ldp	x11,x13,[x1],#8*2
1332bc3d5698SJohn Baldwin	umulh	x7,x7,x7
1333bc3d5698SJohn Baldwin	mul	x8,x9,x9
1334bc3d5698SJohn Baldwin	umulh	x9,x9,x9
1335bc3d5698SJohn Baldwin	stp	x23,x24,[x2,#8*4]
1336bc3d5698SJohn Baldwin	extr	x17,x14,x17,#63
1337bc3d5698SJohn Baldwin	stp	x25,x26,[x2,#8*6]
1338bc3d5698SJohn Baldwin	add	x2,x2,#8*8
1339bc3d5698SJohn Baldwin	adcs	x19,x6,x17
1340bc3d5698SJohn Baldwin	extr	x14,x15,x14,#63
1341bc3d5698SJohn Baldwin	adcs	x20,x7,x14
1342bc3d5698SJohn Baldwin	ldp	x17,x14,[x2,#8*3]
1343bc3d5698SJohn Baldwin	extr	x15,x16,x15,#63
1344bc3d5698SJohn Baldwin	cbnz	x27,.Lsqr4x_shift_n_add
1345bc3d5698SJohn Baldwin	ldp	x1,x4,[x29,#104]	// pull np and n0
1346bc3d5698SJohn Baldwin
1347bc3d5698SJohn Baldwin	adcs	x21,x8,x15
1348bc3d5698SJohn Baldwin	extr	x16,x17,x16,#63
1349bc3d5698SJohn Baldwin	adcs	x22,x9,x16
1350bc3d5698SJohn Baldwin	ldp	x15,x16,[x2,#8*5]
1351bc3d5698SJohn Baldwin	mul	x10,x11,x11
1352bc3d5698SJohn Baldwin	umulh	x11,x11,x11
1353bc3d5698SJohn Baldwin	stp	x19,x20,[x2,#8*0]
1354bc3d5698SJohn Baldwin	mul	x12,x13,x13
1355bc3d5698SJohn Baldwin	umulh	x13,x13,x13
1356bc3d5698SJohn Baldwin	stp	x21,x22,[x2,#8*2]
1357bc3d5698SJohn Baldwin	extr	x17,x14,x17,#63
1358bc3d5698SJohn Baldwin	adcs	x23,x10,x17
1359bc3d5698SJohn Baldwin	extr	x14,x15,x14,#63
1360bc3d5698SJohn Baldwin	ldp	x19,x20,[sp,#8*0]
1361bc3d5698SJohn Baldwin	adcs	x24,x11,x14
1362bc3d5698SJohn Baldwin	extr	x15,x16,x15,#63
1363bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1364bc3d5698SJohn Baldwin	adcs	x25,x12,x15
1365bc3d5698SJohn Baldwin	extr	x16,xzr,x16,#63
1366bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1367bc3d5698SJohn Baldwin	adc	x26,x13,x16
1368bc3d5698SJohn Baldwin	ldp	x10,x11,[x1,#8*4]
1369bc3d5698SJohn Baldwin
1370bc3d5698SJohn Baldwin	// Reduce by 512 bits per iteration
1371bc3d5698SJohn Baldwin	mul	x28,x4,x19		// t[0]*n0
1372bc3d5698SJohn Baldwin	ldp	x12,x13,[x1,#8*6]
1373bc3d5698SJohn Baldwin	add	x3,x1,x5
1374bc3d5698SJohn Baldwin	ldp	x21,x22,[sp,#8*2]
1375bc3d5698SJohn Baldwin	stp	x23,x24,[x2,#8*4]
1376bc3d5698SJohn Baldwin	ldp	x23,x24,[sp,#8*4]
1377bc3d5698SJohn Baldwin	stp	x25,x26,[x2,#8*6]
1378bc3d5698SJohn Baldwin	ldp	x25,x26,[sp,#8*6]
1379bc3d5698SJohn Baldwin	add	x1,x1,#8*8
1380bc3d5698SJohn Baldwin	mov	x30,xzr		// initial top-most carry
1381bc3d5698SJohn Baldwin	mov	x2,sp
1382bc3d5698SJohn Baldwin	mov	x27,#8
1383bc3d5698SJohn Baldwin
1384bc3d5698SJohn Baldwin.Lsqr8x_reduction:
1385bc3d5698SJohn Baldwin	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
1386bc3d5698SJohn Baldwin	mul	x15,x7,x28
1387bc3d5698SJohn Baldwin	sub	x27,x27,#1
1388bc3d5698SJohn Baldwin	mul	x16,x8,x28
1389bc3d5698SJohn Baldwin	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
1390bc3d5698SJohn Baldwin	mul	x17,x9,x28
1391bc3d5698SJohn Baldwin	// (*)	adds	xzr,x19,x14
1392bc3d5698SJohn Baldwin	subs	xzr,x19,#1		// (*)
1393bc3d5698SJohn Baldwin	mul	x14,x10,x28
1394bc3d5698SJohn Baldwin	adcs	x19,x20,x15
1395bc3d5698SJohn Baldwin	mul	x15,x11,x28
1396bc3d5698SJohn Baldwin	adcs	x20,x21,x16
1397bc3d5698SJohn Baldwin	mul	x16,x12,x28
1398bc3d5698SJohn Baldwin	adcs	x21,x22,x17
1399bc3d5698SJohn Baldwin	mul	x17,x13,x28
1400bc3d5698SJohn Baldwin	adcs	x22,x23,x14
1401bc3d5698SJohn Baldwin	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
1402bc3d5698SJohn Baldwin	adcs	x23,x24,x15
1403bc3d5698SJohn Baldwin	umulh	x15,x7,x28
1404bc3d5698SJohn Baldwin	adcs	x24,x25,x16
1405bc3d5698SJohn Baldwin	umulh	x16,x8,x28
1406bc3d5698SJohn Baldwin	adcs	x25,x26,x17
1407bc3d5698SJohn Baldwin	umulh	x17,x9,x28
1408bc3d5698SJohn Baldwin	adc	x26,xzr,xzr
1409bc3d5698SJohn Baldwin	adds	x19,x19,x14
1410bc3d5698SJohn Baldwin	umulh	x14,x10,x28
1411bc3d5698SJohn Baldwin	adcs	x20,x20,x15
1412bc3d5698SJohn Baldwin	umulh	x15,x11,x28
1413bc3d5698SJohn Baldwin	adcs	x21,x21,x16
1414bc3d5698SJohn Baldwin	umulh	x16,x12,x28
1415bc3d5698SJohn Baldwin	adcs	x22,x22,x17
1416bc3d5698SJohn Baldwin	umulh	x17,x13,x28
1417bc3d5698SJohn Baldwin	mul	x28,x4,x19		// next t[0]*n0
1418bc3d5698SJohn Baldwin	adcs	x23,x23,x14
1419bc3d5698SJohn Baldwin	adcs	x24,x24,x15
1420bc3d5698SJohn Baldwin	adcs	x25,x25,x16
1421bc3d5698SJohn Baldwin	adc	x26,x26,x17
1422bc3d5698SJohn Baldwin	cbnz	x27,.Lsqr8x_reduction
1423bc3d5698SJohn Baldwin
1424bc3d5698SJohn Baldwin	ldp	x14,x15,[x2,#8*0]
1425bc3d5698SJohn Baldwin	ldp	x16,x17,[x2,#8*2]
1426bc3d5698SJohn Baldwin	mov	x0,x2
1427bc3d5698SJohn Baldwin	sub	x27,x3,x1	// done yet?
1428bc3d5698SJohn Baldwin	adds	x19,x19,x14
1429bc3d5698SJohn Baldwin	adcs	x20,x20,x15
1430bc3d5698SJohn Baldwin	ldp	x14,x15,[x2,#8*4]
1431bc3d5698SJohn Baldwin	adcs	x21,x21,x16
1432bc3d5698SJohn Baldwin	adcs	x22,x22,x17
1433bc3d5698SJohn Baldwin	ldp	x16,x17,[x2,#8*6]
1434bc3d5698SJohn Baldwin	adcs	x23,x23,x14
1435bc3d5698SJohn Baldwin	adcs	x24,x24,x15
1436bc3d5698SJohn Baldwin	adcs	x25,x25,x16
1437bc3d5698SJohn Baldwin	adcs	x26,x26,x17
1438bc3d5698SJohn Baldwin	//adc	x28,xzr,xzr		// moved below
1439bc3d5698SJohn Baldwin	cbz	x27,.Lsqr8x8_post_condition
1440bc3d5698SJohn Baldwin
1441c0855eaaSJohn Baldwin	ldur	x4,[x2,#-8*8]
1442bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1443bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1444bc3d5698SJohn Baldwin	ldp	x10,x11,[x1,#8*4]
1445bc3d5698SJohn Baldwin	mov	x27,#-8*8
1446bc3d5698SJohn Baldwin	ldp	x12,x13,[x1,#8*6]
1447bc3d5698SJohn Baldwin	add	x1,x1,#8*8
1448bc3d5698SJohn Baldwin
1449bc3d5698SJohn Baldwin.Lsqr8x_tail:
1450bc3d5698SJohn Baldwin	mul	x14,x6,x4
1451bc3d5698SJohn Baldwin	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1452bc3d5698SJohn Baldwin	mul	x15,x7,x4
1453bc3d5698SJohn Baldwin	add	x27,x27,#8
1454bc3d5698SJohn Baldwin	mul	x16,x8,x4
1455bc3d5698SJohn Baldwin	mul	x17,x9,x4
1456bc3d5698SJohn Baldwin	adds	x19,x19,x14
1457bc3d5698SJohn Baldwin	mul	x14,x10,x4
1458bc3d5698SJohn Baldwin	adcs	x20,x20,x15
1459bc3d5698SJohn Baldwin	mul	x15,x11,x4
1460bc3d5698SJohn Baldwin	adcs	x21,x21,x16
1461bc3d5698SJohn Baldwin	mul	x16,x12,x4
1462bc3d5698SJohn Baldwin	adcs	x22,x22,x17
1463bc3d5698SJohn Baldwin	mul	x17,x13,x4
1464bc3d5698SJohn Baldwin	adcs	x23,x23,x14
1465bc3d5698SJohn Baldwin	umulh	x14,x6,x4
1466bc3d5698SJohn Baldwin	adcs	x24,x24,x15
1467bc3d5698SJohn Baldwin	umulh	x15,x7,x4
1468bc3d5698SJohn Baldwin	adcs	x25,x25,x16
1469bc3d5698SJohn Baldwin	umulh	x16,x8,x4
1470bc3d5698SJohn Baldwin	adcs	x26,x26,x17
1471bc3d5698SJohn Baldwin	umulh	x17,x9,x4
1472bc3d5698SJohn Baldwin	adc	x28,x28,xzr
1473bc3d5698SJohn Baldwin	str	x19,[x2],#8
1474bc3d5698SJohn Baldwin	adds	x19,x20,x14
1475bc3d5698SJohn Baldwin	umulh	x14,x10,x4
1476bc3d5698SJohn Baldwin	adcs	x20,x21,x15
1477bc3d5698SJohn Baldwin	umulh	x15,x11,x4
1478bc3d5698SJohn Baldwin	adcs	x21,x22,x16
1479bc3d5698SJohn Baldwin	umulh	x16,x12,x4
1480bc3d5698SJohn Baldwin	adcs	x22,x23,x17
1481bc3d5698SJohn Baldwin	umulh	x17,x13,x4
1482bc3d5698SJohn Baldwin	ldr	x4,[x0,x27]
1483bc3d5698SJohn Baldwin	adcs	x23,x24,x14
1484bc3d5698SJohn Baldwin	adcs	x24,x25,x15
1485bc3d5698SJohn Baldwin	adcs	x25,x26,x16
1486bc3d5698SJohn Baldwin	adcs	x26,x28,x17
1487bc3d5698SJohn Baldwin	//adc	x28,xzr,xzr		// moved above
1488bc3d5698SJohn Baldwin	cbnz	x27,.Lsqr8x_tail
1489bc3d5698SJohn Baldwin					// note that carry flag is guaranteed
1490bc3d5698SJohn Baldwin					// to be zero at this point
1491bc3d5698SJohn Baldwin	ldp	x6,x7,[x2,#8*0]
1492bc3d5698SJohn Baldwin	sub	x27,x3,x1	// done yet?
1493bc3d5698SJohn Baldwin	sub	x16,x3,x5	// rewinded np
1494bc3d5698SJohn Baldwin	ldp	x8,x9,[x2,#8*2]
1495bc3d5698SJohn Baldwin	ldp	x10,x11,[x2,#8*4]
1496bc3d5698SJohn Baldwin	ldp	x12,x13,[x2,#8*6]
1497bc3d5698SJohn Baldwin	cbz	x27,.Lsqr8x_tail_break
1498bc3d5698SJohn Baldwin
1499c0855eaaSJohn Baldwin	ldur	x4,[x0,#-8*8]
1500bc3d5698SJohn Baldwin	adds	x19,x19,x6
1501bc3d5698SJohn Baldwin	adcs	x20,x20,x7
1502bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1503bc3d5698SJohn Baldwin	adcs	x21,x21,x8
1504bc3d5698SJohn Baldwin	adcs	x22,x22,x9
1505bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1506bc3d5698SJohn Baldwin	adcs	x23,x23,x10
1507bc3d5698SJohn Baldwin	adcs	x24,x24,x11
1508bc3d5698SJohn Baldwin	ldp	x10,x11,[x1,#8*4]
1509bc3d5698SJohn Baldwin	adcs	x25,x25,x12
1510bc3d5698SJohn Baldwin	mov	x27,#-8*8
1511bc3d5698SJohn Baldwin	adcs	x26,x26,x13
1512bc3d5698SJohn Baldwin	ldp	x12,x13,[x1,#8*6]
1513bc3d5698SJohn Baldwin	add	x1,x1,#8*8
1514bc3d5698SJohn Baldwin	//adc	x28,xzr,xzr		// moved above
1515bc3d5698SJohn Baldwin	b	.Lsqr8x_tail
1516bc3d5698SJohn Baldwin
1517bc3d5698SJohn Baldwin.align	4
1518bc3d5698SJohn Baldwin.Lsqr8x_tail_break:
1519bc3d5698SJohn Baldwin	ldr	x4,[x29,#112]		// pull n0
1520bc3d5698SJohn Baldwin	add	x27,x2,#8*8		// end of current t[num] window
1521bc3d5698SJohn Baldwin
1522bc3d5698SJohn Baldwin	subs	xzr,x30,#1		// "move" top-most carry to carry bit
1523bc3d5698SJohn Baldwin	adcs	x14,x19,x6
1524bc3d5698SJohn Baldwin	adcs	x15,x20,x7
1525bc3d5698SJohn Baldwin	ldp	x19,x20,[x0,#8*0]
1526bc3d5698SJohn Baldwin	adcs	x21,x21,x8
1527bc3d5698SJohn Baldwin	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
1528bc3d5698SJohn Baldwin	adcs	x22,x22,x9
1529bc3d5698SJohn Baldwin	ldp	x8,x9,[x16,#8*2]
1530bc3d5698SJohn Baldwin	adcs	x23,x23,x10
1531bc3d5698SJohn Baldwin	adcs	x24,x24,x11
1532bc3d5698SJohn Baldwin	ldp	x10,x11,[x16,#8*4]
1533bc3d5698SJohn Baldwin	adcs	x25,x25,x12
1534bc3d5698SJohn Baldwin	adcs	x26,x26,x13
1535bc3d5698SJohn Baldwin	ldp	x12,x13,[x16,#8*6]
1536bc3d5698SJohn Baldwin	add	x1,x16,#8*8
1537bc3d5698SJohn Baldwin	adc	x30,xzr,xzr	// top-most carry
1538bc3d5698SJohn Baldwin	mul	x28,x4,x19
1539bc3d5698SJohn Baldwin	stp	x14,x15,[x2,#8*0]
1540bc3d5698SJohn Baldwin	stp	x21,x22,[x2,#8*2]
1541bc3d5698SJohn Baldwin	ldp	x21,x22,[x0,#8*2]
1542bc3d5698SJohn Baldwin	stp	x23,x24,[x2,#8*4]
1543bc3d5698SJohn Baldwin	ldp	x23,x24,[x0,#8*4]
1544bc3d5698SJohn Baldwin	cmp	x27,x29		// did we hit the bottom?
1545bc3d5698SJohn Baldwin	stp	x25,x26,[x2,#8*6]
1546bc3d5698SJohn Baldwin	mov	x2,x0			// slide the window
1547bc3d5698SJohn Baldwin	ldp	x25,x26,[x0,#8*6]
1548bc3d5698SJohn Baldwin	mov	x27,#8
1549bc3d5698SJohn Baldwin	b.ne	.Lsqr8x_reduction
1550bc3d5698SJohn Baldwin
1551bc3d5698SJohn Baldwin	// Final step. We see if result is larger than modulus, and
1552bc3d5698SJohn Baldwin	// if it is, subtract the modulus. But comparison implies
1553bc3d5698SJohn Baldwin	// subtraction. So we subtract modulus, see if it borrowed,
1554bc3d5698SJohn Baldwin	// and conditionally copy original value.
1555bc3d5698SJohn Baldwin	ldr	x0,[x29,#96]		// pull rp
1556bc3d5698SJohn Baldwin	add	x2,x2,#8*8
1557bc3d5698SJohn Baldwin	subs	x14,x19,x6
1558bc3d5698SJohn Baldwin	sbcs	x15,x20,x7
1559bc3d5698SJohn Baldwin	sub	x27,x5,#8*8
1560bc3d5698SJohn Baldwin	mov	x3,x0		// x0 copy
1561bc3d5698SJohn Baldwin
1562bc3d5698SJohn Baldwin.Lsqr8x_sub:
1563bc3d5698SJohn Baldwin	sbcs	x16,x21,x8
1564bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1565bc3d5698SJohn Baldwin	sbcs	x17,x22,x9
1566bc3d5698SJohn Baldwin	stp	x14,x15,[x0,#8*0]
1567bc3d5698SJohn Baldwin	sbcs	x14,x23,x10
1568bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1569bc3d5698SJohn Baldwin	sbcs	x15,x24,x11
1570bc3d5698SJohn Baldwin	stp	x16,x17,[x0,#8*2]
1571bc3d5698SJohn Baldwin	sbcs	x16,x25,x12
1572bc3d5698SJohn Baldwin	ldp	x10,x11,[x1,#8*4]
1573bc3d5698SJohn Baldwin	sbcs	x17,x26,x13
1574bc3d5698SJohn Baldwin	ldp	x12,x13,[x1,#8*6]
1575bc3d5698SJohn Baldwin	add	x1,x1,#8*8
1576bc3d5698SJohn Baldwin	ldp	x19,x20,[x2,#8*0]
1577bc3d5698SJohn Baldwin	sub	x27,x27,#8*8
1578bc3d5698SJohn Baldwin	ldp	x21,x22,[x2,#8*2]
1579bc3d5698SJohn Baldwin	ldp	x23,x24,[x2,#8*4]
1580bc3d5698SJohn Baldwin	ldp	x25,x26,[x2,#8*6]
1581bc3d5698SJohn Baldwin	add	x2,x2,#8*8
1582bc3d5698SJohn Baldwin	stp	x14,x15,[x0,#8*4]
1583bc3d5698SJohn Baldwin	sbcs	x14,x19,x6
1584bc3d5698SJohn Baldwin	stp	x16,x17,[x0,#8*6]
1585bc3d5698SJohn Baldwin	add	x0,x0,#8*8
1586bc3d5698SJohn Baldwin	sbcs	x15,x20,x7
1587bc3d5698SJohn Baldwin	cbnz	x27,.Lsqr8x_sub
1588bc3d5698SJohn Baldwin
1589bc3d5698SJohn Baldwin	sbcs	x16,x21,x8
1590bc3d5698SJohn Baldwin	mov	x2,sp
1591bc3d5698SJohn Baldwin	add	x1,sp,x5
1592bc3d5698SJohn Baldwin	ldp	x6,x7,[x3,#8*0]
1593bc3d5698SJohn Baldwin	sbcs	x17,x22,x9
1594bc3d5698SJohn Baldwin	stp	x14,x15,[x0,#8*0]
1595bc3d5698SJohn Baldwin	sbcs	x14,x23,x10
1596bc3d5698SJohn Baldwin	ldp	x8,x9,[x3,#8*2]
1597bc3d5698SJohn Baldwin	sbcs	x15,x24,x11
1598bc3d5698SJohn Baldwin	stp	x16,x17,[x0,#8*2]
1599bc3d5698SJohn Baldwin	sbcs	x16,x25,x12
1600bc3d5698SJohn Baldwin	ldp	x19,x20,[x1,#8*0]
1601bc3d5698SJohn Baldwin	sbcs	x17,x26,x13
1602bc3d5698SJohn Baldwin	ldp	x21,x22,[x1,#8*2]
1603bc3d5698SJohn Baldwin	sbcs	xzr,x30,xzr	// did it borrow?
1604bc3d5698SJohn Baldwin	ldr	x30,[x29,#8]		// pull return address
1605bc3d5698SJohn Baldwin	stp	x14,x15,[x0,#8*4]
1606bc3d5698SJohn Baldwin	stp	x16,x17,[x0,#8*6]
1607bc3d5698SJohn Baldwin
1608bc3d5698SJohn Baldwin	sub	x27,x5,#8*4
1609bc3d5698SJohn Baldwin.Lsqr4x_cond_copy:
1610bc3d5698SJohn Baldwin	sub	x27,x27,#8*4
1611bc3d5698SJohn Baldwin	csel	x14,x19,x6,lo
1612bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*0]
1613bc3d5698SJohn Baldwin	csel	x15,x20,x7,lo
1614bc3d5698SJohn Baldwin	ldp	x6,x7,[x3,#8*4]
1615bc3d5698SJohn Baldwin	ldp	x19,x20,[x1,#8*4]
1616bc3d5698SJohn Baldwin	csel	x16,x21,x8,lo
1617bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*2]
1618bc3d5698SJohn Baldwin	add	x2,x2,#8*4
1619bc3d5698SJohn Baldwin	csel	x17,x22,x9,lo
1620bc3d5698SJohn Baldwin	ldp	x8,x9,[x3,#8*6]
1621bc3d5698SJohn Baldwin	ldp	x21,x22,[x1,#8*6]
1622bc3d5698SJohn Baldwin	add	x1,x1,#8*4
1623bc3d5698SJohn Baldwin	stp	x14,x15,[x3,#8*0]
1624bc3d5698SJohn Baldwin	stp	x16,x17,[x3,#8*2]
1625bc3d5698SJohn Baldwin	add	x3,x3,#8*4
1626bc3d5698SJohn Baldwin	stp	xzr,xzr,[x1,#8*0]
1627bc3d5698SJohn Baldwin	stp	xzr,xzr,[x1,#8*2]
1628bc3d5698SJohn Baldwin	cbnz	x27,.Lsqr4x_cond_copy
1629bc3d5698SJohn Baldwin
1630bc3d5698SJohn Baldwin	csel	x14,x19,x6,lo
1631bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*0]
1632bc3d5698SJohn Baldwin	csel	x15,x20,x7,lo
1633bc3d5698SJohn Baldwin	stp	xzr,xzr,[x2,#8*2]
1634bc3d5698SJohn Baldwin	csel	x16,x21,x8,lo
1635bc3d5698SJohn Baldwin	csel	x17,x22,x9,lo
1636bc3d5698SJohn Baldwin	stp	x14,x15,[x3,#8*0]
1637bc3d5698SJohn Baldwin	stp	x16,x17,[x3,#8*2]
1638bc3d5698SJohn Baldwin
1639bc3d5698SJohn Baldwin	b	.Lsqr8x_done
1640bc3d5698SJohn Baldwin
1641bc3d5698SJohn Baldwin.align	4
1642bc3d5698SJohn Baldwin.Lsqr8x8_post_condition:
1643bc3d5698SJohn Baldwin	adc	x28,xzr,xzr
1644bc3d5698SJohn Baldwin	ldr	x30,[x29,#8]		// pull return address
1645bc3d5698SJohn Baldwin	// x19-7,x28 hold result, x6-7 hold modulus
1646bc3d5698SJohn Baldwin	subs	x6,x19,x6
1647bc3d5698SJohn Baldwin	ldr	x1,[x29,#96]		// pull rp
1648bc3d5698SJohn Baldwin	sbcs	x7,x20,x7
1649bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*0]
1650bc3d5698SJohn Baldwin	sbcs	x8,x21,x8
1651bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*2]
1652bc3d5698SJohn Baldwin	sbcs	x9,x22,x9
1653bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*4]
1654bc3d5698SJohn Baldwin	sbcs	x10,x23,x10
1655bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*6]
1656bc3d5698SJohn Baldwin	sbcs	x11,x24,x11
1657bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*8]
1658bc3d5698SJohn Baldwin	sbcs	x12,x25,x12
1659bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*10]
1660bc3d5698SJohn Baldwin	sbcs	x13,x26,x13
1661bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*12]
1662bc3d5698SJohn Baldwin	sbcs	x28,x28,xzr	// did it borrow?
1663bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*14]
1664bc3d5698SJohn Baldwin
1665bc3d5698SJohn Baldwin	// x6-7 hold result-modulus
1666bc3d5698SJohn Baldwin	csel	x6,x19,x6,lo
1667bc3d5698SJohn Baldwin	csel	x7,x20,x7,lo
1668bc3d5698SJohn Baldwin	csel	x8,x21,x8,lo
1669bc3d5698SJohn Baldwin	csel	x9,x22,x9,lo
1670bc3d5698SJohn Baldwin	stp	x6,x7,[x1,#8*0]
1671bc3d5698SJohn Baldwin	csel	x10,x23,x10,lo
1672bc3d5698SJohn Baldwin	csel	x11,x24,x11,lo
1673bc3d5698SJohn Baldwin	stp	x8,x9,[x1,#8*2]
1674bc3d5698SJohn Baldwin	csel	x12,x25,x12,lo
1675bc3d5698SJohn Baldwin	csel	x13,x26,x13,lo
1676bc3d5698SJohn Baldwin	stp	x10,x11,[x1,#8*4]
1677bc3d5698SJohn Baldwin	stp	x12,x13,[x1,#8*6]
1678bc3d5698SJohn Baldwin
1679bc3d5698SJohn Baldwin.Lsqr8x_done:
1680bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
1681bc3d5698SJohn Baldwin	mov	sp,x29
1682bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
1683bc3d5698SJohn Baldwin	mov	x0,#1
1684bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
1685bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
1686bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
1687bc3d5698SJohn Baldwin	ldr	x29,[sp],#128
1688bd9588bcSAndrew Turner	// x30 is loaded earlier
1689bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
1690bc3d5698SJohn Baldwin	ret
1691bc3d5698SJohn Baldwin.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
1692bc3d5698SJohn Baldwin.type	__bn_mul4x_mont,%function
1693bc3d5698SJohn Baldwin.align	5
1694bc3d5698SJohn Baldwin__bn_mul4x_mont:
1695bd9588bcSAndrew Turner	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
1696bd9588bcSAndrew Turner	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
1697bc3d5698SJohn Baldwin	stp	x29,x30,[sp,#-128]!
1698bc3d5698SJohn Baldwin	add	x29,sp,#0
1699bc3d5698SJohn Baldwin	stp	x19,x20,[sp,#16]
1700bc3d5698SJohn Baldwin	stp	x21,x22,[sp,#32]
1701bc3d5698SJohn Baldwin	stp	x23,x24,[sp,#48]
1702bc3d5698SJohn Baldwin	stp	x25,x26,[sp,#64]
1703bc3d5698SJohn Baldwin	stp	x27,x28,[sp,#80]
1704bc3d5698SJohn Baldwin
1705bc3d5698SJohn Baldwin	sub	x26,sp,x5,lsl#3
1706bc3d5698SJohn Baldwin	lsl	x5,x5,#3
1707bc3d5698SJohn Baldwin	ldr	x4,[x4]		// *n0
1708bc3d5698SJohn Baldwin	sub	sp,x26,#8*4		// alloca
1709bc3d5698SJohn Baldwin
1710bc3d5698SJohn Baldwin	add	x10,x2,x5
1711bc3d5698SJohn Baldwin	add	x27,x1,x5
1712bc3d5698SJohn Baldwin	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
1713bc3d5698SJohn Baldwin
1714bc3d5698SJohn Baldwin	ldr	x24,[x2,#8*0]		// b[0]
1715bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]	// a[0..3]
1716bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1717bc3d5698SJohn Baldwin	add	x1,x1,#8*4
1718bc3d5698SJohn Baldwin	mov	x19,xzr
1719bc3d5698SJohn Baldwin	mov	x20,xzr
1720bc3d5698SJohn Baldwin	mov	x21,xzr
1721bc3d5698SJohn Baldwin	mov	x22,xzr
1722bc3d5698SJohn Baldwin	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1723bc3d5698SJohn Baldwin	ldp	x16,x17,[x3,#8*2]
1724bc3d5698SJohn Baldwin	adds	x3,x3,#8*4		// clear carry bit
1725bc3d5698SJohn Baldwin	mov	x0,xzr
1726bc3d5698SJohn Baldwin	mov	x28,#0
1727bc3d5698SJohn Baldwin	mov	x26,sp
1728bc3d5698SJohn Baldwin
1729bc3d5698SJohn Baldwin.Loop_mul4x_1st_reduction:
1730bc3d5698SJohn Baldwin	mul	x10,x6,x24		// lo(a[0..3]*b[0])
1731bc3d5698SJohn Baldwin	adc	x0,x0,xzr	// modulo-scheduled
1732bc3d5698SJohn Baldwin	mul	x11,x7,x24
1733bc3d5698SJohn Baldwin	add	x28,x28,#8
1734bc3d5698SJohn Baldwin	mul	x12,x8,x24
1735bc3d5698SJohn Baldwin	and	x28,x28,#31
1736bc3d5698SJohn Baldwin	mul	x13,x9,x24
1737bc3d5698SJohn Baldwin	adds	x19,x19,x10
1738bc3d5698SJohn Baldwin	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
1739bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1740bc3d5698SJohn Baldwin	mul	x25,x19,x4		// t[0]*n0
1741bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1742bc3d5698SJohn Baldwin	umulh	x11,x7,x24
1743bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1744bc3d5698SJohn Baldwin	umulh	x12,x8,x24
1745bc3d5698SJohn Baldwin	adc	x23,xzr,xzr
1746bc3d5698SJohn Baldwin	umulh	x13,x9,x24
1747bc3d5698SJohn Baldwin	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1748bc3d5698SJohn Baldwin	adds	x20,x20,x10
1749bc3d5698SJohn Baldwin	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
1750bc3d5698SJohn Baldwin	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1751bc3d5698SJohn Baldwin	adcs	x21,x21,x11
1752bc3d5698SJohn Baldwin	mul	x11,x15,x25
1753bc3d5698SJohn Baldwin	adcs	x22,x22,x12
1754bc3d5698SJohn Baldwin	mul	x12,x16,x25
1755bc3d5698SJohn Baldwin	adc	x23,x23,x13		// can't overflow
1756bc3d5698SJohn Baldwin	mul	x13,x17,x25
1757bc3d5698SJohn Baldwin	// (*)	adds	xzr,x19,x10
1758bc3d5698SJohn Baldwin	subs	xzr,x19,#1		// (*)
1759bc3d5698SJohn Baldwin	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1760bc3d5698SJohn Baldwin	adcs	x19,x20,x11
1761bc3d5698SJohn Baldwin	umulh	x11,x15,x25
1762bc3d5698SJohn Baldwin	adcs	x20,x21,x12
1763bc3d5698SJohn Baldwin	umulh	x12,x16,x25
1764bc3d5698SJohn Baldwin	adcs	x21,x22,x13
1765bc3d5698SJohn Baldwin	umulh	x13,x17,x25
1766bc3d5698SJohn Baldwin	adcs	x22,x23,x0
1767bc3d5698SJohn Baldwin	adc	x0,xzr,xzr
1768bc3d5698SJohn Baldwin	adds	x19,x19,x10
1769bc3d5698SJohn Baldwin	sub	x10,x27,x1
1770bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1771bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1772bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1773bc3d5698SJohn Baldwin	//adc	x0,x0,xzr
1774bc3d5698SJohn Baldwin	cbnz	x28,.Loop_mul4x_1st_reduction
1775bc3d5698SJohn Baldwin
1776bc3d5698SJohn Baldwin	cbz	x10,.Lmul4x4_post_condition
1777bc3d5698SJohn Baldwin
1778bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1779bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1780bc3d5698SJohn Baldwin	add	x1,x1,#8*4
1781bc3d5698SJohn Baldwin	ldr	x25,[sp]		// a[0]*n0
1782bc3d5698SJohn Baldwin	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1783bc3d5698SJohn Baldwin	ldp	x16,x17,[x3,#8*2]
1784bc3d5698SJohn Baldwin	add	x3,x3,#8*4
1785bc3d5698SJohn Baldwin
1786bc3d5698SJohn Baldwin.Loop_mul4x_1st_tail:
1787bc3d5698SJohn Baldwin	mul	x10,x6,x24		// lo(a[4..7]*b[i])
1788bc3d5698SJohn Baldwin	adc	x0,x0,xzr	// modulo-scheduled
1789bc3d5698SJohn Baldwin	mul	x11,x7,x24
1790bc3d5698SJohn Baldwin	add	x28,x28,#8
1791bc3d5698SJohn Baldwin	mul	x12,x8,x24
1792bc3d5698SJohn Baldwin	and	x28,x28,#31
1793bc3d5698SJohn Baldwin	mul	x13,x9,x24
1794bc3d5698SJohn Baldwin	adds	x19,x19,x10
1795bc3d5698SJohn Baldwin	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
1796bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1797bc3d5698SJohn Baldwin	umulh	x11,x7,x24
1798bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1799bc3d5698SJohn Baldwin	umulh	x12,x8,x24
1800bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1801bc3d5698SJohn Baldwin	umulh	x13,x9,x24
1802bc3d5698SJohn Baldwin	adc	x23,xzr,xzr
1803bc3d5698SJohn Baldwin	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1804bc3d5698SJohn Baldwin	adds	x20,x20,x10
1805bc3d5698SJohn Baldwin	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
1806bc3d5698SJohn Baldwin	adcs	x21,x21,x11
1807bc3d5698SJohn Baldwin	mul	x11,x15,x25
1808bc3d5698SJohn Baldwin	adcs	x22,x22,x12
1809bc3d5698SJohn Baldwin	mul	x12,x16,x25
1810bc3d5698SJohn Baldwin	adc	x23,x23,x13		// can't overflow
1811bc3d5698SJohn Baldwin	mul	x13,x17,x25
1812bc3d5698SJohn Baldwin	adds	x19,x19,x10
1813bc3d5698SJohn Baldwin	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
1814bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1815bc3d5698SJohn Baldwin	umulh	x11,x15,x25
1816bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1817bc3d5698SJohn Baldwin	umulh	x12,x16,x25
1818bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1819bc3d5698SJohn Baldwin	adcs	x23,x23,x0
1820bc3d5698SJohn Baldwin	umulh	x13,x17,x25
1821bc3d5698SJohn Baldwin	adc	x0,xzr,xzr
1822bc3d5698SJohn Baldwin	ldr	x25,[sp,x28]		// next t[0]*n0
1823bc3d5698SJohn Baldwin	str	x19,[x26],#8		// result!!!
1824bc3d5698SJohn Baldwin	adds	x19,x20,x10
1825bc3d5698SJohn Baldwin	sub	x10,x27,x1		// done yet?
1826bc3d5698SJohn Baldwin	adcs	x20,x21,x11
1827bc3d5698SJohn Baldwin	adcs	x21,x22,x12
1828bc3d5698SJohn Baldwin	adcs	x22,x23,x13
1829bc3d5698SJohn Baldwin	//adc	x0,x0,xzr
1830bc3d5698SJohn Baldwin	cbnz	x28,.Loop_mul4x_1st_tail
1831bc3d5698SJohn Baldwin
1832bc3d5698SJohn Baldwin	sub	x11,x27,x5	// rewinded x1
1833bc3d5698SJohn Baldwin	cbz	x10,.Lmul4x_proceed
1834bc3d5698SJohn Baldwin
1835bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1836bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1837bc3d5698SJohn Baldwin	add	x1,x1,#8*4
1838bc3d5698SJohn Baldwin	ldp	x14,x15,[x3,#8*0]
1839bc3d5698SJohn Baldwin	ldp	x16,x17,[x3,#8*2]
1840bc3d5698SJohn Baldwin	add	x3,x3,#8*4
1841bc3d5698SJohn Baldwin	b	.Loop_mul4x_1st_tail
1842bc3d5698SJohn Baldwin
1843bc3d5698SJohn Baldwin.align	5
1844bc3d5698SJohn Baldwin.Lmul4x_proceed:
1845bc3d5698SJohn Baldwin	ldr	x24,[x2,#8*4]!		// *++b
1846bc3d5698SJohn Baldwin	adc	x30,x0,xzr
1847bc3d5698SJohn Baldwin	ldp	x6,x7,[x11,#8*0]	// a[0..3]
1848bc3d5698SJohn Baldwin	sub	x3,x3,x5		// rewind np
1849bc3d5698SJohn Baldwin	ldp	x8,x9,[x11,#8*2]
1850bc3d5698SJohn Baldwin	add	x1,x11,#8*4
1851bc3d5698SJohn Baldwin
1852bc3d5698SJohn Baldwin	stp	x19,x20,[x26,#8*0]	// result!!!
1853bc3d5698SJohn Baldwin	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1854bc3d5698SJohn Baldwin	stp	x21,x22,[x26,#8*2]	// result!!!
1855bc3d5698SJohn Baldwin	ldp	x21,x22,[sp,#8*6]
1856bc3d5698SJohn Baldwin
1857bc3d5698SJohn Baldwin	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1858bc3d5698SJohn Baldwin	mov	x26,sp
1859bc3d5698SJohn Baldwin	ldp	x16,x17,[x3,#8*2]
1860bc3d5698SJohn Baldwin	adds	x3,x3,#8*4		// clear carry bit
1861bc3d5698SJohn Baldwin	mov	x0,xzr
1862bc3d5698SJohn Baldwin
1863bc3d5698SJohn Baldwin.align	4
1864bc3d5698SJohn Baldwin.Loop_mul4x_reduction:
1865bc3d5698SJohn Baldwin	mul	x10,x6,x24		// lo(a[0..3]*b[4])
1866bc3d5698SJohn Baldwin	adc	x0,x0,xzr	// modulo-scheduled
1867bc3d5698SJohn Baldwin	mul	x11,x7,x24
1868bc3d5698SJohn Baldwin	add	x28,x28,#8
1869bc3d5698SJohn Baldwin	mul	x12,x8,x24
1870bc3d5698SJohn Baldwin	and	x28,x28,#31
1871bc3d5698SJohn Baldwin	mul	x13,x9,x24
1872bc3d5698SJohn Baldwin	adds	x19,x19,x10
1873bc3d5698SJohn Baldwin	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
1874bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1875bc3d5698SJohn Baldwin	mul	x25,x19,x4		// t[0]*n0
1876bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1877bc3d5698SJohn Baldwin	umulh	x11,x7,x24
1878bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1879bc3d5698SJohn Baldwin	umulh	x12,x8,x24
1880bc3d5698SJohn Baldwin	adc	x23,xzr,xzr
1881bc3d5698SJohn Baldwin	umulh	x13,x9,x24
1882bc3d5698SJohn Baldwin	ldr	x24,[x2,x28]		// next b[i]
1883bc3d5698SJohn Baldwin	adds	x20,x20,x10
1884bc3d5698SJohn Baldwin	// (*)	mul	x10,x14,x25
1885bc3d5698SJohn Baldwin	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1886bc3d5698SJohn Baldwin	adcs	x21,x21,x11
1887bc3d5698SJohn Baldwin	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
1888bc3d5698SJohn Baldwin	adcs	x22,x22,x12
1889bc3d5698SJohn Baldwin	mul	x12,x16,x25
1890bc3d5698SJohn Baldwin	adc	x23,x23,x13		// can't overflow
1891bc3d5698SJohn Baldwin	mul	x13,x17,x25
1892bc3d5698SJohn Baldwin	// (*)	adds	xzr,x19,x10
1893bc3d5698SJohn Baldwin	subs	xzr,x19,#1		// (*)
1894bc3d5698SJohn Baldwin	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
1895bc3d5698SJohn Baldwin	adcs	x19,x20,x11
1896bc3d5698SJohn Baldwin	umulh	x11,x15,x25
1897bc3d5698SJohn Baldwin	adcs	x20,x21,x12
1898bc3d5698SJohn Baldwin	umulh	x12,x16,x25
1899bc3d5698SJohn Baldwin	adcs	x21,x22,x13
1900bc3d5698SJohn Baldwin	umulh	x13,x17,x25
1901bc3d5698SJohn Baldwin	adcs	x22,x23,x0
1902bc3d5698SJohn Baldwin	adc	x0,xzr,xzr
1903bc3d5698SJohn Baldwin	adds	x19,x19,x10
1904bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1905bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1906bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1907bc3d5698SJohn Baldwin	//adc	x0,x0,xzr
1908bc3d5698SJohn Baldwin	cbnz	x28,.Loop_mul4x_reduction
1909bc3d5698SJohn Baldwin
1910bc3d5698SJohn Baldwin	adc	x0,x0,xzr
1911bc3d5698SJohn Baldwin	ldp	x10,x11,[x26,#8*4]	// t[4..7]
1912bc3d5698SJohn Baldwin	ldp	x12,x13,[x26,#8*6]
1913bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1914bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1915bc3d5698SJohn Baldwin	add	x1,x1,#8*4
1916bc3d5698SJohn Baldwin	adds	x19,x19,x10
1917bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1918bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1919bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1920bc3d5698SJohn Baldwin	//adc	x0,x0,xzr
1921bc3d5698SJohn Baldwin
1922bc3d5698SJohn Baldwin	ldr	x25,[sp]		// t[0]*n0
1923bc3d5698SJohn Baldwin	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1924bc3d5698SJohn Baldwin	ldp	x16,x17,[x3,#8*2]
1925bc3d5698SJohn Baldwin	add	x3,x3,#8*4
1926bc3d5698SJohn Baldwin
1927bc3d5698SJohn Baldwin.align	4
1928bc3d5698SJohn Baldwin.Loop_mul4x_tail:
1929bc3d5698SJohn Baldwin	mul	x10,x6,x24		// lo(a[4..7]*b[4])
1930bc3d5698SJohn Baldwin	adc	x0,x0,xzr	// modulo-scheduled
1931bc3d5698SJohn Baldwin	mul	x11,x7,x24
1932bc3d5698SJohn Baldwin	add	x28,x28,#8
1933bc3d5698SJohn Baldwin	mul	x12,x8,x24
1934bc3d5698SJohn Baldwin	and	x28,x28,#31
1935bc3d5698SJohn Baldwin	mul	x13,x9,x24
1936bc3d5698SJohn Baldwin	adds	x19,x19,x10
1937bc3d5698SJohn Baldwin	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
1938bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1939bc3d5698SJohn Baldwin	umulh	x11,x7,x24
1940bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1941bc3d5698SJohn Baldwin	umulh	x12,x8,x24
1942bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1943bc3d5698SJohn Baldwin	umulh	x13,x9,x24
1944bc3d5698SJohn Baldwin	adc	x23,xzr,xzr
1945bc3d5698SJohn Baldwin	ldr	x24,[x2,x28]		// next b[i]
1946bc3d5698SJohn Baldwin	adds	x20,x20,x10
1947bc3d5698SJohn Baldwin	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1948bc3d5698SJohn Baldwin	adcs	x21,x21,x11
1949bc3d5698SJohn Baldwin	mul	x11,x15,x25
1950bc3d5698SJohn Baldwin	adcs	x22,x22,x12
1951bc3d5698SJohn Baldwin	mul	x12,x16,x25
1952bc3d5698SJohn Baldwin	adc	x23,x23,x13		// can't overflow
1953bc3d5698SJohn Baldwin	mul	x13,x17,x25
1954bc3d5698SJohn Baldwin	adds	x19,x19,x10
1955bc3d5698SJohn Baldwin	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1956bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1957bc3d5698SJohn Baldwin	umulh	x11,x15,x25
1958bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1959bc3d5698SJohn Baldwin	umulh	x12,x16,x25
1960bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1961bc3d5698SJohn Baldwin	umulh	x13,x17,x25
1962bc3d5698SJohn Baldwin	adcs	x23,x23,x0
1963bc3d5698SJohn Baldwin	ldr	x25,[sp,x28]		// next a[0]*n0
1964bc3d5698SJohn Baldwin	adc	x0,xzr,xzr
1965bc3d5698SJohn Baldwin	str	x19,[x26],#8		// result!!!
1966bc3d5698SJohn Baldwin	adds	x19,x20,x10
1967bc3d5698SJohn Baldwin	sub	x10,x27,x1		// done yet?
1968bc3d5698SJohn Baldwin	adcs	x20,x21,x11
1969bc3d5698SJohn Baldwin	adcs	x21,x22,x12
1970bc3d5698SJohn Baldwin	adcs	x22,x23,x13
1971bc3d5698SJohn Baldwin	//adc	x0,x0,xzr
1972bc3d5698SJohn Baldwin	cbnz	x28,.Loop_mul4x_tail
1973bc3d5698SJohn Baldwin
1974bc3d5698SJohn Baldwin	sub	x11,x3,x5		// rewinded np?
1975bc3d5698SJohn Baldwin	adc	x0,x0,xzr
1976bc3d5698SJohn Baldwin	cbz	x10,.Loop_mul4x_break
1977bc3d5698SJohn Baldwin
1978bc3d5698SJohn Baldwin	ldp	x10,x11,[x26,#8*4]
1979bc3d5698SJohn Baldwin	ldp	x12,x13,[x26,#8*6]
1980bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]
1981bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
1982bc3d5698SJohn Baldwin	add	x1,x1,#8*4
1983bc3d5698SJohn Baldwin	adds	x19,x19,x10
1984bc3d5698SJohn Baldwin	adcs	x20,x20,x11
1985bc3d5698SJohn Baldwin	adcs	x21,x21,x12
1986bc3d5698SJohn Baldwin	adcs	x22,x22,x13
1987bc3d5698SJohn Baldwin	//adc	x0,x0,xzr
1988bc3d5698SJohn Baldwin	ldp	x14,x15,[x3,#8*0]
1989bc3d5698SJohn Baldwin	ldp	x16,x17,[x3,#8*2]
1990bc3d5698SJohn Baldwin	add	x3,x3,#8*4
1991bc3d5698SJohn Baldwin	b	.Loop_mul4x_tail
1992bc3d5698SJohn Baldwin
1993bc3d5698SJohn Baldwin.align	4
1994bc3d5698SJohn Baldwin.Loop_mul4x_break:
1995bc3d5698SJohn Baldwin	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
1996bc3d5698SJohn Baldwin	adds	x19,x19,x30
1997bc3d5698SJohn Baldwin	add	x2,x2,#8*4		// bp++
1998bc3d5698SJohn Baldwin	adcs	x20,x20,xzr
1999bc3d5698SJohn Baldwin	sub	x1,x1,x5		// rewind ap
2000bc3d5698SJohn Baldwin	adcs	x21,x21,xzr
2001bc3d5698SJohn Baldwin	stp	x19,x20,[x26,#8*0]	// result!!!
2002bc3d5698SJohn Baldwin	adcs	x22,x22,xzr
2003bc3d5698SJohn Baldwin	ldp	x19,x20,[sp,#8*4]	// t[0..3]
2004bc3d5698SJohn Baldwin	adc	x30,x0,xzr
2005bc3d5698SJohn Baldwin	stp	x21,x22,[x26,#8*2]	// result!!!
2006bc3d5698SJohn Baldwin	cmp	x2,x13			// done yet?
2007bc3d5698SJohn Baldwin	ldp	x21,x22,[sp,#8*6]
2008bc3d5698SJohn Baldwin	ldp	x14,x15,[x11,#8*0]	// n[0..3]
2009bc3d5698SJohn Baldwin	ldp	x16,x17,[x11,#8*2]
2010bc3d5698SJohn Baldwin	add	x3,x11,#8*4
2011bc3d5698SJohn Baldwin	b.eq	.Lmul4x_post
2012bc3d5698SJohn Baldwin
2013bc3d5698SJohn Baldwin	ldr	x24,[x2]
2014bc3d5698SJohn Baldwin	ldp	x6,x7,[x1,#8*0]	// a[0..3]
2015bc3d5698SJohn Baldwin	ldp	x8,x9,[x1,#8*2]
2016bc3d5698SJohn Baldwin	adds	x1,x1,#8*4		// clear carry bit
2017bc3d5698SJohn Baldwin	mov	x0,xzr
2018bc3d5698SJohn Baldwin	mov	x26,sp
2019bc3d5698SJohn Baldwin	b	.Loop_mul4x_reduction
2020bc3d5698SJohn Baldwin
2021bc3d5698SJohn Baldwin.align	4
2022bc3d5698SJohn Baldwin.Lmul4x_post:
2023bc3d5698SJohn Baldwin	// Final step. We see if result is larger than modulus, and
2024bc3d5698SJohn Baldwin	// if it is, subtract the modulus. But comparison implies
2025bc3d5698SJohn Baldwin	// subtraction. So we subtract modulus, see if it borrowed,
2026bc3d5698SJohn Baldwin	// and conditionally copy original value.
2027bc3d5698SJohn Baldwin	mov	x0,x12
2028bc3d5698SJohn Baldwin	mov	x27,x12		// x0 copy
2029bc3d5698SJohn Baldwin	subs	x10,x19,x14
2030bc3d5698SJohn Baldwin	add	x26,sp,#8*8
2031bc3d5698SJohn Baldwin	sbcs	x11,x20,x15
2032bc3d5698SJohn Baldwin	sub	x28,x5,#8*4
2033bc3d5698SJohn Baldwin
2034bc3d5698SJohn Baldwin.Lmul4x_sub:
2035bc3d5698SJohn Baldwin	sbcs	x12,x21,x16
2036bc3d5698SJohn Baldwin	ldp	x14,x15,[x3,#8*0]
2037bc3d5698SJohn Baldwin	sub	x28,x28,#8*4
2038bc3d5698SJohn Baldwin	ldp	x19,x20,[x26,#8*0]
2039bc3d5698SJohn Baldwin	sbcs	x13,x22,x17
2040bc3d5698SJohn Baldwin	ldp	x16,x17,[x3,#8*2]
2041bc3d5698SJohn Baldwin	add	x3,x3,#8*4
2042bc3d5698SJohn Baldwin	ldp	x21,x22,[x26,#8*2]
2043bc3d5698SJohn Baldwin	add	x26,x26,#8*4
2044bc3d5698SJohn Baldwin	stp	x10,x11,[x0,#8*0]
2045bc3d5698SJohn Baldwin	sbcs	x10,x19,x14
2046bc3d5698SJohn Baldwin	stp	x12,x13,[x0,#8*2]
2047bc3d5698SJohn Baldwin	add	x0,x0,#8*4
2048bc3d5698SJohn Baldwin	sbcs	x11,x20,x15
2049bc3d5698SJohn Baldwin	cbnz	x28,.Lmul4x_sub
2050bc3d5698SJohn Baldwin
2051bc3d5698SJohn Baldwin	sbcs	x12,x21,x16
2052bc3d5698SJohn Baldwin	mov	x26,sp
2053bc3d5698SJohn Baldwin	add	x1,sp,#8*4
2054bc3d5698SJohn Baldwin	ldp	x6,x7,[x27,#8*0]
2055bc3d5698SJohn Baldwin	sbcs	x13,x22,x17
2056bc3d5698SJohn Baldwin	stp	x10,x11,[x0,#8*0]
2057bc3d5698SJohn Baldwin	ldp	x8,x9,[x27,#8*2]
2058bc3d5698SJohn Baldwin	stp	x12,x13,[x0,#8*2]
2059bc3d5698SJohn Baldwin	ldp	x19,x20,[x1,#8*0]
2060bc3d5698SJohn Baldwin	ldp	x21,x22,[x1,#8*2]
2061bc3d5698SJohn Baldwin	sbcs	xzr,x30,xzr	// did it borrow?
2062bc3d5698SJohn Baldwin	ldr	x30,[x29,#8]		// pull return address
2063bc3d5698SJohn Baldwin
2064bc3d5698SJohn Baldwin	sub	x28,x5,#8*4
2065bc3d5698SJohn Baldwin.Lmul4x_cond_copy:
2066bc3d5698SJohn Baldwin	sub	x28,x28,#8*4
2067bc3d5698SJohn Baldwin	csel	x10,x19,x6,lo
2068bc3d5698SJohn Baldwin	stp	xzr,xzr,[x26,#8*0]
2069bc3d5698SJohn Baldwin	csel	x11,x20,x7,lo
2070bc3d5698SJohn Baldwin	ldp	x6,x7,[x27,#8*4]
2071bc3d5698SJohn Baldwin	ldp	x19,x20,[x1,#8*4]
2072bc3d5698SJohn Baldwin	csel	x12,x21,x8,lo
2073bc3d5698SJohn Baldwin	stp	xzr,xzr,[x26,#8*2]
2074bc3d5698SJohn Baldwin	add	x26,x26,#8*4
2075bc3d5698SJohn Baldwin	csel	x13,x22,x9,lo
2076bc3d5698SJohn Baldwin	ldp	x8,x9,[x27,#8*6]
2077bc3d5698SJohn Baldwin	ldp	x21,x22,[x1,#8*6]
2078bc3d5698SJohn Baldwin	add	x1,x1,#8*4
2079bc3d5698SJohn Baldwin	stp	x10,x11,[x27,#8*0]
2080bc3d5698SJohn Baldwin	stp	x12,x13,[x27,#8*2]
2081bc3d5698SJohn Baldwin	add	x27,x27,#8*4
2082bc3d5698SJohn Baldwin	cbnz	x28,.Lmul4x_cond_copy
2083bc3d5698SJohn Baldwin
2084bc3d5698SJohn Baldwin	csel	x10,x19,x6,lo
2085bc3d5698SJohn Baldwin	stp	xzr,xzr,[x26,#8*0]
2086bc3d5698SJohn Baldwin	csel	x11,x20,x7,lo
2087bc3d5698SJohn Baldwin	stp	xzr,xzr,[x26,#8*2]
2088bc3d5698SJohn Baldwin	csel	x12,x21,x8,lo
2089bc3d5698SJohn Baldwin	stp	xzr,xzr,[x26,#8*3]
2090bc3d5698SJohn Baldwin	csel	x13,x22,x9,lo
2091bc3d5698SJohn Baldwin	stp	xzr,xzr,[x26,#8*4]
2092bc3d5698SJohn Baldwin	stp	x10,x11,[x27,#8*0]
2093bc3d5698SJohn Baldwin	stp	x12,x13,[x27,#8*2]
2094bc3d5698SJohn Baldwin
2095bc3d5698SJohn Baldwin	b	.Lmul4x_done
2096bc3d5698SJohn Baldwin
2097bc3d5698SJohn Baldwin.align	4
2098bc3d5698SJohn Baldwin.Lmul4x4_post_condition:
2099bc3d5698SJohn Baldwin	adc	x0,x0,xzr
2100bc3d5698SJohn Baldwin	ldr	x1,[x29,#96]		// pull rp
2101bc3d5698SJohn Baldwin	// x19-3,x0 hold result, x14-7 hold modulus
2102bc3d5698SJohn Baldwin	subs	x6,x19,x14
2103bc3d5698SJohn Baldwin	ldr	x30,[x29,#8]		// pull return address
2104bc3d5698SJohn Baldwin	sbcs	x7,x20,x15
2105bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*0]
2106bc3d5698SJohn Baldwin	sbcs	x8,x21,x16
2107bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*2]
2108bc3d5698SJohn Baldwin	sbcs	x9,x22,x17
2109bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*4]
2110bc3d5698SJohn Baldwin	sbcs	xzr,x0,xzr		// did it borrow?
2111bc3d5698SJohn Baldwin	stp	xzr,xzr,[sp,#8*6]
2112bc3d5698SJohn Baldwin
2113bc3d5698SJohn Baldwin	// x6-3 hold result-modulus
2114bc3d5698SJohn Baldwin	csel	x6,x19,x6,lo
2115bc3d5698SJohn Baldwin	csel	x7,x20,x7,lo
2116bc3d5698SJohn Baldwin	csel	x8,x21,x8,lo
2117bc3d5698SJohn Baldwin	csel	x9,x22,x9,lo
2118bc3d5698SJohn Baldwin	stp	x6,x7,[x1,#8*0]
2119bc3d5698SJohn Baldwin	stp	x8,x9,[x1,#8*2]
2120bc3d5698SJohn Baldwin
2121bc3d5698SJohn Baldwin.Lmul4x_done:
2122bc3d5698SJohn Baldwin	ldp	x19,x20,[x29,#16]
2123bc3d5698SJohn Baldwin	mov	sp,x29
2124bc3d5698SJohn Baldwin	ldp	x21,x22,[x29,#32]
2125bc3d5698SJohn Baldwin	mov	x0,#1
2126bc3d5698SJohn Baldwin	ldp	x23,x24,[x29,#48]
2127bc3d5698SJohn Baldwin	ldp	x25,x26,[x29,#64]
2128bc3d5698SJohn Baldwin	ldp	x27,x28,[x29,#80]
2129bc3d5698SJohn Baldwin	ldr	x29,[sp],#128
2130bd9588bcSAndrew Turner	// x30 loaded earlier
2131bd9588bcSAndrew Turner	AARCH64_VALIDATE_LINK_REGISTER
2132bc3d5698SJohn Baldwin	ret
2133bc3d5698SJohn Baldwin.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// Embedded identification string (NUL-terminated ASCII):
//   "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// Kept as raw .byte values by the CRYPTOGAMS generator; the trailing
// .align directives pad the section so any following code is aligned.
2135bc3d5698SJohn Baldwin.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2136bc3d5698SJohn Baldwin.align	2
2137bc3d5698SJohn Baldwin.align	4
2137