/* Do not modify. This file is auto-generated from armv8-mont.pl. */
#include "arm_arch.h"
#ifndef	__KERNEL__

.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
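// Annotation (not generator output): bn_mul_mont(rp,ap,bp,np,&n0,num)
// computes rp[] = ap[]*bp[] * 2^(-64*num) mod np[], arguments in x0-x5.
// n0 is passed by reference ("ldr x4,[x4]" below) and holds
// -np[0]^-1 mod 2^64, the usual Montgomery constant.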
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	x5,#3
	b.ne	.Lmul_mont
	cmp	x5,#32
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
#ifndef	__AARCH64EB__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif
#endif

.Lscalar_impl:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont

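// Annotation (not generator output): the dispatch above picks an
// implementation by num (x5): num%4 != 0 takes the generic .Lmul_mont;
// num <= 32 goes scalar; larger sizes use bn_mul8x_mont_neon when
// OPENSSL_armv8_rsa_neonized is set (userland, little-endian only).
// In .Lscalar_impl, num%8 == 0 prefers __bn_sqr8x_mont and num%4 == 0
// falls back to __bn_mul4x_mont.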
.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for the removal of the first multiplication and
	//	addition instructions: the outcome of the first addition
	//	is guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't. The
	//	question then is: when does it carry? Is there an
	//	alternative way to deduce it? If you follow the operations,
	//	you can observe that the condition for a carry is quite
	//	simple: x6 being non-zero. So the carry can be calculated
	//	by adding -1 to x6, which is what the next instruction does.
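	// Annotation (illustrative, not generator output): m1 = tp[0]*n0
	// mod 2^64 was chosen so that tp[0] + np[0]*m1 == 0 mod 2^64,
	// hence the discarded sum's low 64 bits are exactly zero.  In C:
	//	uint64_t lo = x6 + x12;	// == 0 modulo 2^64
	//	int carry = (x6 != 0);	// carry out iff x6 != 0
	// "subs xzr,x6,#1" sets C precisely when x6 >= 1, i.e. when x6 is
	// non-zero, reproducing that carry without the mul/adds pair.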
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, check whether
	// it borrowed, and conditionally copy the original value.
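	// Annotation (illustrative sketch, not generator output): the tail
	// below is a constant-time conditional subtraction.  In C terms,
	// with a hypothetical helper sub_words():
	//	borrow = sub_words(rp, tp, np, num);	// .Lsub: rp = tp - np
	//	keep_tp = borrow && !upmost_bit;	// sbcs x19,x19,xzr
	//	for (j = 0; j < num; j++)	// .Lcond_copy
	//		rp[j] = keep_tp ? tp[j] : rp[j];
	// "lo" (C clear) after the sbcs chain means the combined value
	// upmost:tp was below np, so the original tp is kept.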
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	x5,x5,#1
	eor	v14.16b,v14.16b,v14.16b

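	// Annotation (not generator output): the NEON path operates on
	// 32-bit half-limbs so that umlal can accumulate 32x32->64-bit
	// products in the 2x64-bit lanes of v6-v13; num (x5) was doubled
	// above accordingly, and the b[] and m[] words are "smashed" into
	// 32-bit pieces as the comments below put it.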
.align	4
.LNEON_8n:
	eor	v6.16b,v6.16b,v6.16b
	sub	x7,sp,#128
	eor	v7.16b,v7.16b,v7.16b
	sub	x7,x7,x5,lsl#4
	eor	v8.16b,v8.16b,v8.16b
	and	x7,x7,#-64
	eor	v9.16b,v9.16b,v9.16b
	mov	sp,x7		// alloca
	eor	v10.16b,v10.16b,v10.16b
	add	x7,x7,#256
	eor	v11.16b,v11.16b,v11.16b
	sub	x8,x5,#8
	eor	v12.16b,v12.16b,v12.16b
	eor	v13.16b,v13.16b,v13.16b

.LNEON_8n_init:
	st1	{v6.2d,v7.2d},[x7],#32
	subs	x8,x8,#8
	st1	{v8.2d,v9.2d},[x7],#32
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d,v13.2d},[x7],#32
	bne	.LNEON_8n_init

	add	x6,sp,#256
	ld1	{v0.4s,v1.4s},[x1],#32
	add	x10,sp,#8
	ldr	s30,[x4],#4
	mov	x9,x5
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	s28,[x2],#4   // *b++
	uxtl	v28.4s,v28.4h
	add	x7,sp,#128
	ld1	{v2.4s,v3.4s},[x3],#32

	umlal	v6.2d,v28.2s,v0.s[0]
	umlal	v7.2d,v28.2s,v0.s[1]
	umlal	v8.2d,v28.2s,v0.s[2]
	shl	v29.2d,v6.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v9.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v6.2d
	umlal	v10.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v11.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	v12.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v13.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v8.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v6.2d,#16
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	ext	v6.16b,v6.16b,v6.16b,#8
	add	v6.2d,v6.2d,v15.2d
	umlal	v11.2d,v29.2s,v3.s[1]
	ushr	v6.2d,v6.2d,#16
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	add	v16.2d,v7.2d,v6.2d
	ins	v7.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6],#16
	umlal	v8.2d,v28.2s,v0.s[1]
	umlal	v9.2d,v28.2s,v0.s[2]
	shl	v29.2d,v7.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v10.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v7.2d
	umlal	v11.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v12.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
	umlal	v13.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v6.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v9.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v7.2d,#16
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	umlal	v12.2d,v29.2s,v3.s[1]
	ushr	v7.2d,v7.2d,#16
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	add	v16.2d,v8.2d,v7.2d
	ins	v8.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6],#16
	umlal	v9.2d,v28.2s,v0.s[1]
	umlal	v10.2d,v28.2s,v0.s[2]
	shl	v29.2d,v8.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v11.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v8.2d
	umlal	v12.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v13.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
	umlal	v6.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v7.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v10.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v8.2d,#16
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	umlal	v13.2d,v29.2s,v3.s[1]
	ushr	v8.2d,v8.2d,#16
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	add	v16.2d,v9.2d,v8.2d
	ins	v9.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6],#16
	umlal	v10.2d,v28.2s,v0.s[1]
	umlal	v11.2d,v28.2s,v0.s[2]
	shl	v29.2d,v9.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v12.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v9.2d
	umlal	v13.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v6.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
	umlal	v7.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v8.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v11.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v9.2d,#16
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	umlal	v6.2d,v29.2s,v3.s[1]
	ushr	v9.2d,v9.2d,#16
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	add	v16.2d,v10.2d,v9.2d
	ins	v10.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6],#16
	umlal	v11.2d,v28.2s,v0.s[1]
	umlal	v12.2d,v28.2s,v0.s[2]
	shl	v29.2d,v10.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v13.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v10.2d
	umlal	v6.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v7.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
	umlal	v8.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v9.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v12.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v10.2d,#16
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	umlal	v7.2d,v29.2s,v3.s[1]
	ushr	v10.2d,v10.2d,#16
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	add	v16.2d,v11.2d,v10.2d
	ins	v11.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6],#16
	umlal	v12.2d,v28.2s,v0.s[1]
	umlal	v13.2d,v28.2s,v0.s[2]
	shl	v29.2d,v11.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v6.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v11.2d
	umlal	v7.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v8.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
	umlal	v9.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v10.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v13.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v11.2d,#16
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	umlal	v8.2d,v29.2s,v3.s[1]
	ushr	v11.2d,v11.2d,#16
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	add	v16.2d,v12.2d,v11.2d
	ins	v12.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6],#16
	umlal	v13.2d,v28.2s,v0.s[1]
	umlal	v6.2d,v28.2s,v0.s[2]
	shl	v29.2d,v12.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v7.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v12.2d
	umlal	v8.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v9.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
	umlal	v10.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v11.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v6.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v12.2d,#16
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	umlal	v9.2d,v29.2s,v3.s[1]
	ushr	v12.2d,v12.2d,#16
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	add	v16.2d,v13.2d,v12.2d
	ins	v13.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6],#16
	umlal	v6.2d,v28.2s,v0.s[1]
	umlal	v7.2d,v28.2s,v0.s[2]
	shl	v29.2d,v13.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v8.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v13.2d
	umlal	v9.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v10.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
	umlal	v11.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v12.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v6.2d,v29.2s,v2.s[1]
	umlal	v7.2d,v29.2s,v2.s[2]
	mov	v5.16b,v13.16b
	ushr	v5.2d,v5.2d,#16
	ext	v13.16b,v13.16b,v13.16b,#8
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	add	v13.2d,v13.2d,v5.2d
	umlal	v10.2d,v29.2s,v3.s[1]
	ushr	v13.2d,v13.2d,#16
	eor	v15.16b,v15.16b,v15.16b
	ins	v13.d[1],v15.d[0]
	umlal	v11.2d,v29.2s,v3.s[2]
	umlal	v12.2d,v29.2s,v3.s[3]
	add	v6.2d,v6.2d,v13.2d
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
	add	x10,sp,#8		// rewind
	sub	x8,x5,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	x8,x8,#8
	umlal	v6.2d,v28.2s,v0.s[0]
	ld1	{v13.2d},[x6]
	umlal	v7.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
	umlal	v8.2d,v28.2s,v0.s[2]
	ld1	{v2.4s,v3.4s},[x3],#32
	umlal	v9.2d,v28.2s,v0.s[3]
	b.eq	.LInner_jump
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump:
	umlal	v10.2d,v28.2s,v1.s[0]
	umlal	v11.2d,v28.2s,v1.s[1]
	umlal	v12.2d,v28.2s,v1.s[2]
	umlal	v13.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	umlal	v8.2d,v29.2s,v2.s[2]
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	umlal	v11.2d,v29.2s,v3.s[1]
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	st1	{v6.2d},[x7],#16
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6]
	umlal	v8.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
	umlal	v9.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump1
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump1:
	umlal	v10.2d,v28.2s,v0.s[3]
	umlal	v11.2d,v28.2s,v1.s[0]
	umlal	v12.2d,v28.2s,v1.s[1]
	umlal	v13.2d,v28.2s,v1.s[2]
	umlal	v6.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	umlal	v9.2d,v29.2s,v2.s[2]
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	umlal	v12.2d,v29.2s,v3.s[1]
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	st1	{v7.2d},[x7],#16
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6]
	umlal	v9.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
	umlal	v10.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump2
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump2:
	umlal	v11.2d,v28.2s,v0.s[3]
	umlal	v12.2d,v28.2s,v1.s[0]
	umlal	v13.2d,v28.2s,v1.s[1]
	umlal	v6.2d,v28.2s,v1.s[2]
	umlal	v7.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	umlal	v10.2d,v29.2s,v2.s[2]
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	umlal	v13.2d,v29.2s,v3.s[1]
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	st1	{v8.2d},[x7],#16
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6]
	umlal	v10.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
	umlal	v11.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump3
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump3:
	umlal	v12.2d,v28.2s,v0.s[3]
	umlal	v13.2d,v28.2s,v1.s[0]
	umlal	v6.2d,v28.2s,v1.s[1]
	umlal	v7.2d,v28.2s,v1.s[2]
	umlal	v8.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	umlal	v11.2d,v29.2s,v2.s[2]
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	umlal	v6.2d,v29.2s,v3.s[1]
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	st1	{v9.2d},[x7],#16
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6]
	umlal	v11.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
	umlal	v12.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump4
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump4:
	umlal	v13.2d,v28.2s,v0.s[3]
	umlal	v6.2d,v28.2s,v1.s[0]
	umlal	v7.2d,v28.2s,v1.s[1]
	umlal	v8.2d,v28.2s,v1.s[2]
	umlal	v9.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	umlal	v12.2d,v29.2s,v2.s[2]
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	umlal	v7.2d,v29.2s,v3.s[1]
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	st1	{v10.2d},[x7],#16
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6]
	umlal	v12.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
	umlal	v13.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump5
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump5:
	umlal	v6.2d,v28.2s,v0.s[3]
	umlal	v7.2d,v28.2s,v1.s[0]
	umlal	v8.2d,v28.2s,v1.s[1]
	umlal	v9.2d,v28.2s,v1.s[2]
	umlal	v10.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	umlal	v13.2d,v29.2s,v2.s[2]
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	umlal	v8.2d,v29.2s,v3.s[1]
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	st1	{v11.2d},[x7],#16
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6]
	umlal	v13.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
	umlal	v6.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump6
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump6:
	umlal	v7.2d,v28.2s,v0.s[3]
	umlal	v8.2d,v28.2s,v1.s[0]
	umlal	v9.2d,v28.2s,v1.s[1]
	umlal	v10.2d,v28.2s,v1.s[2]
	umlal	v11.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	umlal	v6.2d,v29.2s,v2.s[2]
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	umlal	v9.2d,v29.2s,v3.s[1]
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	st1	{v12.2d},[x7],#16
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6]
	umlal	v6.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
	umlal	v7.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump7
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump7:
	umlal	v8.2d,v28.2s,v0.s[3]
	umlal	v9.2d,v28.2s,v1.s[0]
	umlal	v10.2d,v28.2s,v1.s[1]
	umlal	v11.2d,v28.2s,v1.s[2]
	umlal	v12.2d,v28.2s,v1.s[3]
	b.ne	.LInner_after_rewind8
	sub	x1,x1,x5,lsl#2	// rewind
.LInner_after_rewind8:
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v6.2d,v29.2s,v2.s[1]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v7.2d,v29.2s,v2.s[2]
	add	x10,sp,#8		// rewind
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	umlal	v10.2d,v29.2s,v3.s[1]
	umlal	v11.2d,v29.2s,v3.s[2]
	st1	{v13.2d},[x7],#16
	umlal	v12.2d,v29.2s,v3.s[3]

	bne	.LNEON_8n_inner
	add	x6,sp,#128
	st1	{v6.2d,v7.2d},[x7],#32
	eor	v2.16b,v2.16b,v2.16b	// v2
	st1	{v8.2d,v9.2d},[x7],#32
	eor	v3.16b,v3.16b,v3.16b	// v3
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d},[x7]

	subs	x9,x9,#8
	ld1	{v6.2d,v7.2d},[x6],#32
	ld1	{v8.2d,v9.2d},[x6],#32
	ld1	{v10.2d,v11.2d},[x6],#32
	ld1	{v12.2d,v13.2d},[x6],#32

	b.eq	.LInner_8n_jump_2steps
	sub	x3,x3,x5,lsl#2	// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	x7,sp,#128
	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	st1	{v2.2d,v3.2d}, [sp],#32
	add	v6.2d,v6.2d,v15.2d
	st1	{v2.2d,v3.2d}, [sp],#32
	ushr	v15.2d,v6.2d,#16
	st1	{v2.2d,v3.2d}, [sp],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

	mov	x8,x5
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	v6.2d,v6.2d,v15.2d
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	ld1	{v8.2d,v9.2d}, [x6],#32
	add	v6.2d,v6.2d,v15.2d
	ld1	{v10.2d,v11.2d}, [x6],#32
	ushr	v15.2d,v6.2d,#16
	ld1	{v12.2d,v13.2d}, [x6],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

.LNEON_tail_entry:
	add	v7.2d,v7.2d,v15.2d
	st1	{v6.s}[0], [x7],#4
	ushr	v15.2d,v7.2d,#16
	mov	v5.16b,v7.16b
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	ushr	v15.2d,v7.2d,#16
	zip1	v7.4h,v5.4h,v7.4h
	ins	v15.d[1],v14.d[0]
	add	v8.2d,v8.2d,v15.2d
	st1	{v7.s}[0], [x7],#4
	ushr	v15.2d,v8.2d,#16
	mov	v5.16b,v8.16b
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	ushr	v15.2d,v8.2d,#16
	zip1	v8.4h,v5.4h,v8.4h
	ins	v15.d[1],v14.d[0]
	add	v9.2d,v9.2d,v15.2d
	st1	{v8.s}[0], [x7],#4
	ushr	v15.2d,v9.2d,#16
	mov	v5.16b,v9.16b
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	ushr	v15.2d,v9.2d,#16
	zip1	v9.4h,v5.4h,v9.4h
	ins	v15.d[1],v14.d[0]
	add	v10.2d,v10.2d,v15.2d
	st1	{v9.s}[0], [x7],#4
	ushr	v15.2d,v10.2d,#16
	mov	v5.16b,v10.16b
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	ushr	v15.2d,v10.2d,#16
	zip1	v10.4h,v5.4h,v10.4h
	ins	v15.d[1],v14.d[0]
	add	v11.2d,v11.2d,v15.2d
	st1	{v10.s}[0], [x7],#4
	ushr	v15.2d,v11.2d,#16
	mov	v5.16b,v11.16b
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	ushr	v15.2d,v11.2d,#16
	zip1	v11.4h,v5.4h,v11.4h
	ins	v15.d[1],v14.d[0]
	add	v12.2d,v12.2d,v15.2d
	st1	{v11.s}[0], [x7],#4
	ushr	v15.2d,v12.2d,#16
	mov	v5.16b,v12.16b
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	ushr	v15.2d,v12.2d,#16
	zip1	v12.4h,v5.4h,v12.4h
	ins	v15.d[1],v14.d[0]
	add	v13.2d,v13.2d,v15.2d
	st1	{v12.s}[0], [x7],#4
	ushr	v15.2d,v13.2d,#16
	mov	v5.16b,v13.16b
	ext	v13.16b,v13.16b,v13.16b,#8
	add	v13.2d,v13.2d,v15.2d
	ushr	v15.2d,v13.2d,#16
	zip1	v13.4h,v5.4h,v13.4h
	ins	v15.d[1],v14.d[0]
	ld1	{v6.2d,v7.2d}, [x6],#32
	subs	x8,x8,#8
	st1	{v13.s}[0], [x7],#4
	bne	.LNEON_tail

	st1	{v15.s}[0], [x7],#4	// top-most bit
	sub	x3,x3,x5,lsl#2		// rewind x3
	subs	x1,sp,#0			// clear carry flag
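	// Annotation (not generator output): subtracting #0 cannot borrow,
	// so this also leaves C=1 and the sbcs chain in .LNEON_sub starts
	// without an initial borrow.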
	add	x2,sp,x5,lsl#2

.LNEON_sub:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x3],#8
	ldp	w10,w11,[x3],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,x2,x1
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [x1]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,x2,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	x1,sp
	sub	x0,x0,x11		// rewind x0
	mov	x3,x2		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	sub	x1,x1,#32
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [x1],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	sub	x17,x2,x1		// preserves carry
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret	// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
        //                                                 a[1]a[0]	(i)
        //                                             a[2]a[0]
        //                                         a[3]a[0]
        //                                     a[4]a[0]
        //                                 a[5]a[0]
        //                             a[6]a[0]
        //                         a[7]a[0]
        //                                         a[2]a[1]		(ii)
        //                                     a[3]a[1]
        //                                 a[4]a[1]
        //                             a[5]a[1]
        //                         a[6]a[1]
        //                     a[7]a[1]
        //                                 a[3]a[2]			(iii)
        //                             a[4]a[2]
        //                         a[5]a[2]
        //                     a[6]a[2]
        //                 a[7]a[2]
        //                         a[4]a[3]				(iv)
        //                     a[5]a[3]
        //                 a[6]a[3]
        //             a[7]a[3]
        //                 a[5]a[4]					(v)
        //             a[6]a[4]
        //         a[7]a[4]
        //         a[6]a[5]						(vi)
        //     a[7]a[5]
        // a[7]a[6]							(vii)
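        // Annotation (not generator output): each row above is one
        // off-diagonal product a[i]*a[j], i<j, computed only once; the
        // doubling and the a[i]*a[i] diagonal are applied later, at
        // .Lsqr8x_outer_break.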

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1	// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5	// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3		// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldur	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
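	// Annotation (worked identity, not generator output): with
	// a = sum_{i} a[i]*2^(64*i),
	//	a^2 = sum_{i} a[i]^2 * 2^(128*i)
	//	    + 2 * sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
	// The cross products already sit in t[]; the extr/lsl#1 chain
	// below doubles them while the mul/umulh pairs fold in the
	// a[i]*a[i] diagonal.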
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr		// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
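	// Annotation (not generator output): same (*) trick as in
	// .Lmul_mont above: C is set iff x19 is non-zero, standing in
	// for the discarded addition of lo(n[0]*(t[0]*n0)).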
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1	// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldur	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1	// done yet?
	sub	x16,x3,x5	// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldur	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
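	// Annotation (not generator output): x30 holds the saved top-most
	// carry, 0 or 1; x30-1 borrows only when x30 is 0, so this leaves
	// C equal to that carry and the adcs chain below resumes with it.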
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr	// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29		// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, check whether
	// it borrowed, and conditionally copy the original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0		// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26 and x28 hold the result, x6-x13 hold the modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr	// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But comparison
	// implies subtraction, so we subtract the modulus, check whether
	// it borrowed, and conditionally copy the original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22 and x0 hold the result, x14-x17 hold the modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4