xref: /freebsd/sys/crypto/openssl/aarch64/armv8-mont.S (revision 06c3fb2749bda94cb5201f81ffdb8fa6c3161b2e)
1/* Do not modify. This file is auto-generated from armv8-mont.pl. */
2#include "arm_arch.h"
3#ifndef	__KERNEL__
4
5.hidden	OPENSSL_armv8_rsa_neonized
6#endif
7.text
8
// bn_mul_mont: word-by-word Montgomery multiplication, public entry point.
// AArch64, GNU as syntax; the file is passed through the C preprocessor.
//
// Register roles, as used by the code in this block:
//   x0 = rp   destination, num 64-bit words are written
//   x1 = ap   first operand, read as 64-bit words
//   x2 = bp   second operand, read as 64-bit words
//   x3 = np   modulus, read as 64-bit words
//   x4 = address of n0; only the first 64-bit word is loaded here
//   x5 = num  count of 64-bit words (turned into a byte count at .Lmul_mont)
// Result: x0 = 1 when the generic path in this block runs to completion.
//
// Dispatch performed before any stack frame is built:
//   num not a multiple of 4            -> generic loop at .Lmul_mont
//   num > 32 (signed compare), non-kernel build and the word at
//   OPENSSL_armv8_rsa_neonized non-zero -> bn_mul8x_mont_neon
//   num a multiple of 8                -> __bn_sqr8x_mont
//   num a multiple of 4                -> __bn_mul4x_mont
// The three targets above are reached with plain branches, so they return
// directly to the caller of bn_mul_mont.
//
// Generic path: saves x19-x24 plus x29/x30 in a 64-byte frame, carves a
// num-word scratch vector tp below it, and zeroes tp again before returning.
9.globl	bn_mul_mont
10.type	bn_mul_mont,%function
11.align	5
12bn_mul_mont:
	// Macro is not defined in this file. Per the note at the top of
	// bn_mul8x_mont_neon it signs the return address on entry.
13	AARCH64_SIGN_LINK_REGISTER
14.Lbn_mul_mont:
	// Low two bits of num set: none of the unrolled variants apply.
15	tst	x5,#3
16	b.ne	.Lmul_mont
	// Small operands (num <= 32) stay on the scalar implementations.
17	cmp	x5,#32
18	b.le	.Lscalar_impl
19#ifndef	__KERNEL__
	// Load the 32-bit flag word and tail-branch to the NEON routine when it
	// is non-zero. x17 is only a scratch register here.
20	adrp	x17,OPENSSL_armv8_rsa_neonized
21	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
22	cbnz	w17, bn_mul8x_mont_neon
23#endif
24
25.Lscalar_impl:
26	tst	x5,#7
27	b.eq	__bn_sqr8x_mont
	// NOTE(review): num is already known to be a multiple of 4 at this
	// point (the entry test branched away otherwise), so this b.eq looks
	// always taken and .Lmul_mont is only entered through the entry test.
28	tst	x5,#3
29	b.eq	__bn_mul4x_mont
30
31.Lmul_mont:
	// Prologue: 64-byte frame, x29 = frame base used to unwind the alloca.
32	stp	x29,x30,[sp,#-64]!
33	add	x29,sp,#0
34	stp	x19,x20,[sp,#16]
35	stp	x21,x22,[sp,#32]
36	stp	x23,x24,[sp,#48]
37
	// From here on: x9 = bp[i], x4 = value of n0, x5 = num*8 (bytes),
	// x22 = write cursor into tp, x21 = inner byte counter j,
	// x20 = outer byte counter i, x19 = top overflow bit of the running sum,
	// x15 = m1 = low word of tp[0]*n0 for the current outer pass.
38	ldr	x9,[x2],#8		// bp[0]
39	sub	x22,sp,x5,lsl#3
40	ldp	x7,x8,[x1],#16	// ap[0..1]
41	lsl	x5,x5,#3
42	ldr	x4,[x4]		// *n0
43	and	x22,x22,#-16		// ABI says so
44	ldp	x13,x14,[x3],#16	// np[0..1]
45
46	mul	x6,x7,x9		// ap[0]*bp[0]
47	sub	x21,x5,#16		// j=num-2
48	umulh	x7,x7,x9
49	mul	x10,x8,x9		// ap[1]*bp[0]
50	umulh	x11,x8,x9
51
52	mul	x15,x6,x4		// "tp[0]"*n0
53	mov	sp,x22			// alloca
54
55	// (*)	mul	x12,x13,x15	// np[0]*m1
56	umulh	x13,x13,x15
57	mul	x16,x14,x15		// np[1]*m1
58	// (*)	adds	x12,x12,x6	// discarded
59	// (*)	As for removal of first multiplication and addition
60	//	instructions. The outcome of first addition is
61	//	guaranteed to be zero, which leaves two computationally
62	//	significant outcomes: it either carries or not. Then
63	//	question is when does it carry? Is there alternative
64	//	way to deduce it? If you follow operations, you can
65	//	observe that condition for carry is quite simple:
66	//	x6 being non-zero. So that carry can be calculated
67	//	by adding -1 to x6. That's what next instruction does.
68	subs	xzr,x6,#1		// (*)
69	umulh	x17,x14,x15
70	adc	x13,x13,xzr
	// num == 2: nothing left for the first-pass loop.
71	cbz	x21,.L1st_skip
72
	// First pass (i = 0): tp = (ap*bp[0] + np*m1) with the low word dropped.
	// x6/x7 carry the ap*bp[0] chain, x12/x13 carry the np*m1 chain.
73.L1st:
74	ldr	x8,[x1],#8
75	adds	x6,x10,x7
76	sub	x21,x21,#8		// j--
77	adc	x7,x11,xzr
78
79	ldr	x14,[x3],#8
80	adds	x12,x16,x13
81	mul	x10,x8,x9		// ap[j]*bp[0]
82	adc	x13,x17,xzr
83	umulh	x11,x8,x9
84
85	adds	x12,x12,x6
86	mul	x16,x14,x15		// np[j]*m1
87	adc	x13,x13,xzr
88	umulh	x17,x14,x15
89	str	x12,[x22],#8		// tp[j-1]
90	cbnz	x21,.L1st
91
	// Drain the pipeline of the first pass and store the top two words.
92.L1st_skip:
93	adds	x6,x10,x7
94	sub	x1,x1,x5		// rewind x1
95	adc	x7,x11,xzr
96
97	adds	x12,x16,x13
98	sub	x3,x3,x5		// rewind x3
99	adc	x13,x17,xzr
100
101	adds	x12,x12,x6
102	sub	x20,x5,#8		// i=num-1
103	adcs	x13,x13,x7
104
105	adc	x19,xzr,xzr		// upmost overflow bit
106	stp	x12,x13,[x22]
107
	// Remaining passes: tp = (tp + ap*bp[i] + np*m1) with the low word
	// dropped. x23 holds the tp[j] word being folded in.
108.Louter:
109	ldr	x9,[x2],#8		// bp[i]
110	ldp	x7,x8,[x1],#16
111	ldr	x23,[sp]		// tp[0]
112	add	x22,sp,#8
113
114	mul	x6,x7,x9		// ap[0]*bp[i]
115	sub	x21,x5,#16		// j=num-2
116	umulh	x7,x7,x9
117	ldp	x13,x14,[x3],#16
118	mul	x10,x8,x9		// ap[1]*bp[i]
119	adds	x6,x6,x23
120	umulh	x11,x8,x9
121	adc	x7,x7,xzr
122
123	mul	x15,x6,x4
124	sub	x20,x20,#8		// i--
125
126	// (*)	mul	x12,x13,x15	// np[0]*m1
127	umulh	x13,x13,x15
128	mul	x16,x14,x15		// np[1]*m1
129	// (*)	adds	x12,x12,x6
	// The carry produced here is consumed by the first adc at .Linner or
	// .Linner_skip. On later .Linner iterations that same adc instead
	// picks up the carry of the adds x12,x12,x6 at the loop bottom.
130	subs	xzr,x6,#1		// (*)
131	umulh	x17,x14,x15
132	cbz	x21,.Linner_skip
133
134.Linner:
135	ldr	x8,[x1],#8
136	adc	x13,x13,xzr
137	ldr	x23,[x22],#8		// tp[j]
138	adds	x6,x10,x7
139	sub	x21,x21,#8		// j--
140	adc	x7,x11,xzr
141
142	adds	x12,x16,x13
143	ldr	x14,[x3],#8
144	adc	x13,x17,xzr
145
146	mul	x10,x8,x9		// ap[j]*bp[i]
147	adds	x6,x6,x23
148	umulh	x11,x8,x9
149	adc	x7,x7,xzr
150
151	mul	x16,x14,x15		// np[j]*m1
152	adds	x12,x12,x6
153	umulh	x17,x14,x15
154	stur	x12,[x22,#-16]		// tp[j-1]
155	cbnz	x21,.Linner
156
	// Drain the pipeline of this pass; the previous overflow bit in x19 is
	// added into the top word and a new overflow bit is collected.
157.Linner_skip:
158	ldr	x23,[x22],#8		// tp[j]
159	adc	x13,x13,xzr
160	adds	x6,x10,x7
161	sub	x1,x1,x5		// rewind x1
162	adc	x7,x11,xzr
163
164	adds	x12,x16,x13
165	sub	x3,x3,x5		// rewind x3
166	adcs	x13,x17,x19
167	adc	x19,xzr,xzr
168
169	adds	x6,x6,x23
170	adc	x7,x7,xzr
171
172	adds	x12,x12,x6
173	adcs	x13,x13,x7
174	adc	x19,x19,xzr		// upmost overflow bit
175	stp	x12,x13,[x22,#-16]
176
177	cbnz	x20,.Louter
178
179	// Final step. We see if result is larger than modulus, and
180	// if it is, subtract the modulus. But comparison implies
181	// subtraction. So we subtract modulus, see if it borrowed,
182	// and conditionally copy original value.
183	ldr	x23,[sp]		// tp[0]
184	add	x22,sp,#8
185	ldr	x14,[x3],#8		// np[0]
186	subs	x21,x5,#8		// j=num-1 and clear borrow
187	mov	x1,x0
	// rp = tp - np, word by word. The borrow travels in the carry flag;
	// nothing inside the loop other than sbcs writes the flags.
188.Lsub:
189	sbcs	x8,x23,x14		// tp[j]-np[j]
190	ldr	x23,[x22],#8
191	sub	x21,x21,#8		// j--
192	ldr	x14,[x3],#8
193	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
194	cbnz	x21,.Lsub
195
196	sbcs	x8,x23,x14
197	sbcs	x19,x19,xzr		// did it borrow?
198	str	x8,[x1],#8		// rp[num-1]
199
	// Select pass. Condition lo (carry clear) means the subtraction
	// borrowed, so the unreduced tp word is written back over rp; otherwise
	// the already stored difference is kept. No instruction from the last
	// sbcs down to the final csel writes the flags. Every tp word is
	// overwritten with zero as it is consumed.
200	ldr	x23,[sp]		// tp[0]
201	add	x22,sp,#8
202	ldr	x8,[x0],#8		// rp[0]
203	sub	x5,x5,#8		// num--
204	nop
205.Lcond_copy:
206	sub	x5,x5,#8		// num--
207	csel	x14,x23,x8,lo		// did it borrow?
208	ldr	x23,[x22],#8
209	ldr	x8,[x0],#8
210	stur	xzr,[x22,#-16]		// wipe tp
211	stur	x14,[x0,#-16]
212	cbnz	x5,.Lcond_copy
213
214	csel	x14,x23,x8,lo
215	stur	xzr,[x22,#-8]		// wipe tp
216	stur	x14,[x0,#-8]
217
	// Epilogue: sp is restored from x29, which releases tp. Only x29 is
	// reloaded from the frame record; x30 is not written anywhere on this
	// path. The return value is the constant 1.
218	ldp	x19,x20,[x29,#16]
219	mov	sp,x29
220	ldp	x21,x22,[x29,#32]
221	mov	x0,#1
222	ldp	x23,x24,[x29,#48]
223	ldr	x29,[sp],#64
224	AARCH64_VALIDATE_LINK_REGISTER
225	ret
226.size	bn_mul_mont,.-bn_mul_mont
// bn_mul8x_mont_neon: NEON variant of the Montgomery multiplication.
// Not a global symbol; it is entered by a branch from bn_mul_mont and
// therefore returns directly to the caller of bn_mul_mont.
//
// Register roles on entry, as used by the code below:
//   x0 = rp, x1 = ap, x2 = bp, x3 = np, x4 = address of n0, x5 = num.
// x5 is doubled at once and from then on counts 32-bit words; all operand
// accesses in this routine are 32 bits wide.
// Result: x0 = 1, the same value the scalar exit of bn_mul_mont returns.
//
// Working registers:
//   v0,v1   eight 32-bit words of a
//   v2,v3   eight 32-bit words of n
//   v6-v13  eight accumulators, two 64-bit lanes each
//   v28     one 32-bit word of b split into two 16-bit halves ("smashed")
//   v29     the per-step multiplier m, smashed the same way
//   v30     low 32 bits loaded from [x4]
//   v14     zero, used to clear the upper lane of v15 in the tail
//   x16     sp right after the 80-byte frame was pushed (restored at exit)
//   x9/x8   outer / inner counters, both stepping by 8
//   x6/x7   read / write cursors into the accumulator area
//   x10     cursor into the saved smashed b and m words at sp+8
//
// Stack: 80-byte frame with x29, x30 and d8-d15, followed by a scratch area
// of 128 + 16*x5 bytes aligned down to 64. The first 128 bytes at sp hold
// the sixteen smashed b/m values of the current pass; accumulators follow.
// The scratch area is overwritten with zeros before returning.
//
// NOTE: this file is marked as generated from armv8-mont.pl, so any change
// made here has to be mirrored in that generator.
227.type	bn_mul8x_mont_neon,%function
228.align	5
229bn_mul8x_mont_neon:
230	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
231	// only from bn_mul_mont which has already signed the return address.
232	stp	x29,x30,[sp,#-80]!
233	mov	x16,sp
234	stp	d8,d9,[sp,#16]
235	stp	d10,d11,[sp,#32]
236	stp	d12,d13,[sp,#48]
237	stp	d14,d15,[sp,#64]
238	lsl	x5,x5,#1
239	eor	v14.16b,v14.16b,v14.16b
240
	// Allocate the scratch area and clear the eight accumulators.
241.align	4
242.LNEON_8n:
243	eor	v6.16b,v6.16b,v6.16b
244	sub	x7,sp,#128
245	eor	v7.16b,v7.16b,v7.16b
246	sub	x7,x7,x5,lsl#4
247	eor	v8.16b,v8.16b,v8.16b
248	and	x7,x7,#-64
249	eor	v9.16b,v9.16b,v9.16b
250	mov	sp,x7		// alloca
251	eor	v10.16b,v10.16b,v10.16b
252	add	x7,x7,#256
253	eor	v11.16b,v11.16b,v11.16b
254	sub	x8,x5,#8
255	eor	v12.16b,v12.16b,v12.16b
256	eor	v13.16b,v13.16b,v13.16b
257
	// Zero the accumulator slots starting at sp+256, 128 bytes per turn.
258.LNEON_8n_init:
259	st1	{v6.2d,v7.2d},[x7],#32
260	subs	x8,x8,#8
261	st1	{v8.2d,v9.2d},[x7],#32
262	st1	{v10.2d,v11.2d},[x7],#32
263	st1	{v12.2d,v13.2d},[x7],#32
264	bne	.LNEON_8n_init
265
266	add	x6,sp,#256
267	ld1	{v0.4s,v1.4s},[x1],#32
268	add	x10,sp,#8
269	ldr	s30,[x4],#4
270	mov	x9,x5
271	b	.LNEON_8n_outer
272
	// Outer loop: one pass consumes eight 32-bit words of b. Each of the
	// eight unrolled steps below
	//   - adds a[0..7] * smashed b word into the accumulators,
	//   - builds m from the lowest accumulator (the two 16-bit-radix lanes
	//     are recombined, multiplied by v30, then split into halves again),
	//   - adds n[0..7] * m, folds the carry of the lowest accumulator into
	//     the next one and rotates the accumulator window by one register,
	//   - saves the smashed b and m values for the inner loop.
273.align	4
274.LNEON_8n_outer:
275	ldr	s28,[x2],#4   // *b++
276	uxtl	v28.4s,v28.4h
277	add	x7,sp,#128
278	ld1	{v2.4s,v3.4s},[x3],#32
279
280	umlal	v6.2d,v28.2s,v0.s[0]
281	umlal	v7.2d,v28.2s,v0.s[1]
282	umlal	v8.2d,v28.2s,v0.s[2]
283	shl	v29.2d,v6.2d,#16
284	ext	v29.16b,v29.16b,v29.16b,#8
285	umlal	v9.2d,v28.2s,v0.s[3]
286	add	v29.2d,v29.2d,v6.2d
287	umlal	v10.2d,v28.2s,v1.s[0]
288	mul	v29.2s,v29.2s,v30.2s
289	umlal	v11.2d,v28.2s,v1.s[1]
290	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
291	umlal	v12.2d,v28.2s,v1.s[2]
292	uxtl	v29.4s,v29.4h
293	umlal	v13.2d,v28.2s,v1.s[3]
294	ldr	s28,[x2],#4   // *b++
295	umlal	v6.2d,v29.2s,v2.s[0]
296	umlal	v7.2d,v29.2s,v2.s[1]
297	uxtl	v28.4s,v28.4h
298	umlal	v8.2d,v29.2s,v2.s[2]
299	ushr	v15.2d,v6.2d,#16
300	umlal	v9.2d,v29.2s,v2.s[3]
301	umlal	v10.2d,v29.2s,v3.s[0]
302	ext	v6.16b,v6.16b,v6.16b,#8
303	add	v6.2d,v6.2d,v15.2d
304	umlal	v11.2d,v29.2s,v3.s[1]
305	ushr	v6.2d,v6.2d,#16
306	umlal	v12.2d,v29.2s,v3.s[2]
307	umlal	v13.2d,v29.2s,v3.s[3]
308	add	v16.2d,v7.2d,v6.2d
309	ins	v7.d[0],v16.d[0]
310	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
311	umlal	v7.2d,v28.2s,v0.s[0]
312	ld1	{v6.2d},[x6],#16
313	umlal	v8.2d,v28.2s,v0.s[1]
314	umlal	v9.2d,v28.2s,v0.s[2]
315	shl	v29.2d,v7.2d,#16
316	ext	v29.16b,v29.16b,v29.16b,#8
317	umlal	v10.2d,v28.2s,v0.s[3]
318	add	v29.2d,v29.2d,v7.2d
319	umlal	v11.2d,v28.2s,v1.s[0]
320	mul	v29.2s,v29.2s,v30.2s
321	umlal	v12.2d,v28.2s,v1.s[1]
322	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
323	umlal	v13.2d,v28.2s,v1.s[2]
324	uxtl	v29.4s,v29.4h
325	umlal	v6.2d,v28.2s,v1.s[3]
326	ldr	s28,[x2],#4   // *b++
327	umlal	v7.2d,v29.2s,v2.s[0]
328	umlal	v8.2d,v29.2s,v2.s[1]
329	uxtl	v28.4s,v28.4h
330	umlal	v9.2d,v29.2s,v2.s[2]
331	ushr	v15.2d,v7.2d,#16
332	umlal	v10.2d,v29.2s,v2.s[3]
333	umlal	v11.2d,v29.2s,v3.s[0]
334	ext	v7.16b,v7.16b,v7.16b,#8
335	add	v7.2d,v7.2d,v15.2d
336	umlal	v12.2d,v29.2s,v3.s[1]
337	ushr	v7.2d,v7.2d,#16
338	umlal	v13.2d,v29.2s,v3.s[2]
339	umlal	v6.2d,v29.2s,v3.s[3]
340	add	v16.2d,v8.2d,v7.2d
341	ins	v8.d[0],v16.d[0]
342	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
343	umlal	v8.2d,v28.2s,v0.s[0]
344	ld1	{v7.2d},[x6],#16
345	umlal	v9.2d,v28.2s,v0.s[1]
346	umlal	v10.2d,v28.2s,v0.s[2]
347	shl	v29.2d,v8.2d,#16
348	ext	v29.16b,v29.16b,v29.16b,#8
349	umlal	v11.2d,v28.2s,v0.s[3]
350	add	v29.2d,v29.2d,v8.2d
351	umlal	v12.2d,v28.2s,v1.s[0]
352	mul	v29.2s,v29.2s,v30.2s
353	umlal	v13.2d,v28.2s,v1.s[1]
354	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
355	umlal	v6.2d,v28.2s,v1.s[2]
356	uxtl	v29.4s,v29.4h
357	umlal	v7.2d,v28.2s,v1.s[3]
358	ldr	s28,[x2],#4   // *b++
359	umlal	v8.2d,v29.2s,v2.s[0]
360	umlal	v9.2d,v29.2s,v2.s[1]
361	uxtl	v28.4s,v28.4h
362	umlal	v10.2d,v29.2s,v2.s[2]
363	ushr	v15.2d,v8.2d,#16
364	umlal	v11.2d,v29.2s,v2.s[3]
365	umlal	v12.2d,v29.2s,v3.s[0]
366	ext	v8.16b,v8.16b,v8.16b,#8
367	add	v8.2d,v8.2d,v15.2d
368	umlal	v13.2d,v29.2s,v3.s[1]
369	ushr	v8.2d,v8.2d,#16
370	umlal	v6.2d,v29.2s,v3.s[2]
371	umlal	v7.2d,v29.2s,v3.s[3]
372	add	v16.2d,v9.2d,v8.2d
373	ins	v9.d[0],v16.d[0]
374	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
375	umlal	v9.2d,v28.2s,v0.s[0]
376	ld1	{v8.2d},[x6],#16
377	umlal	v10.2d,v28.2s,v0.s[1]
378	umlal	v11.2d,v28.2s,v0.s[2]
379	shl	v29.2d,v9.2d,#16
380	ext	v29.16b,v29.16b,v29.16b,#8
381	umlal	v12.2d,v28.2s,v0.s[3]
382	add	v29.2d,v29.2d,v9.2d
383	umlal	v13.2d,v28.2s,v1.s[0]
384	mul	v29.2s,v29.2s,v30.2s
385	umlal	v6.2d,v28.2s,v1.s[1]
386	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
387	umlal	v7.2d,v28.2s,v1.s[2]
388	uxtl	v29.4s,v29.4h
389	umlal	v8.2d,v28.2s,v1.s[3]
390	ldr	s28,[x2],#4   // *b++
391	umlal	v9.2d,v29.2s,v2.s[0]
392	umlal	v10.2d,v29.2s,v2.s[1]
393	uxtl	v28.4s,v28.4h
394	umlal	v11.2d,v29.2s,v2.s[2]
395	ushr	v15.2d,v9.2d,#16
396	umlal	v12.2d,v29.2s,v2.s[3]
397	umlal	v13.2d,v29.2s,v3.s[0]
398	ext	v9.16b,v9.16b,v9.16b,#8
399	add	v9.2d,v9.2d,v15.2d
400	umlal	v6.2d,v29.2s,v3.s[1]
401	ushr	v9.2d,v9.2d,#16
402	umlal	v7.2d,v29.2s,v3.s[2]
403	umlal	v8.2d,v29.2s,v3.s[3]
404	add	v16.2d,v10.2d,v9.2d
405	ins	v10.d[0],v16.d[0]
406	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
407	umlal	v10.2d,v28.2s,v0.s[0]
408	ld1	{v9.2d},[x6],#16
409	umlal	v11.2d,v28.2s,v0.s[1]
410	umlal	v12.2d,v28.2s,v0.s[2]
411	shl	v29.2d,v10.2d,#16
412	ext	v29.16b,v29.16b,v29.16b,#8
413	umlal	v13.2d,v28.2s,v0.s[3]
414	add	v29.2d,v29.2d,v10.2d
415	umlal	v6.2d,v28.2s,v1.s[0]
416	mul	v29.2s,v29.2s,v30.2s
417	umlal	v7.2d,v28.2s,v1.s[1]
418	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
419	umlal	v8.2d,v28.2s,v1.s[2]
420	uxtl	v29.4s,v29.4h
421	umlal	v9.2d,v28.2s,v1.s[3]
422	ldr	s28,[x2],#4   // *b++
423	umlal	v10.2d,v29.2s,v2.s[0]
424	umlal	v11.2d,v29.2s,v2.s[1]
425	uxtl	v28.4s,v28.4h
426	umlal	v12.2d,v29.2s,v2.s[2]
427	ushr	v15.2d,v10.2d,#16
428	umlal	v13.2d,v29.2s,v2.s[3]
429	umlal	v6.2d,v29.2s,v3.s[0]
430	ext	v10.16b,v10.16b,v10.16b,#8
431	add	v10.2d,v10.2d,v15.2d
432	umlal	v7.2d,v29.2s,v3.s[1]
433	ushr	v10.2d,v10.2d,#16
434	umlal	v8.2d,v29.2s,v3.s[2]
435	umlal	v9.2d,v29.2s,v3.s[3]
436	add	v16.2d,v11.2d,v10.2d
437	ins	v11.d[0],v16.d[0]
438	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
439	umlal	v11.2d,v28.2s,v0.s[0]
440	ld1	{v10.2d},[x6],#16
441	umlal	v12.2d,v28.2s,v0.s[1]
442	umlal	v13.2d,v28.2s,v0.s[2]
443	shl	v29.2d,v11.2d,#16
444	ext	v29.16b,v29.16b,v29.16b,#8
445	umlal	v6.2d,v28.2s,v0.s[3]
446	add	v29.2d,v29.2d,v11.2d
447	umlal	v7.2d,v28.2s,v1.s[0]
448	mul	v29.2s,v29.2s,v30.2s
449	umlal	v8.2d,v28.2s,v1.s[1]
450	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
451	umlal	v9.2d,v28.2s,v1.s[2]
452	uxtl	v29.4s,v29.4h
453	umlal	v10.2d,v28.2s,v1.s[3]
454	ldr	s28,[x2],#4   // *b++
455	umlal	v11.2d,v29.2s,v2.s[0]
456	umlal	v12.2d,v29.2s,v2.s[1]
457	uxtl	v28.4s,v28.4h
458	umlal	v13.2d,v29.2s,v2.s[2]
459	ushr	v15.2d,v11.2d,#16
460	umlal	v6.2d,v29.2s,v2.s[3]
461	umlal	v7.2d,v29.2s,v3.s[0]
462	ext	v11.16b,v11.16b,v11.16b,#8
463	add	v11.2d,v11.2d,v15.2d
464	umlal	v8.2d,v29.2s,v3.s[1]
465	ushr	v11.2d,v11.2d,#16
466	umlal	v9.2d,v29.2s,v3.s[2]
467	umlal	v10.2d,v29.2s,v3.s[3]
468	add	v16.2d,v12.2d,v11.2d
469	ins	v12.d[0],v16.d[0]
470	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
471	umlal	v12.2d,v28.2s,v0.s[0]
472	ld1	{v11.2d},[x6],#16
473	umlal	v13.2d,v28.2s,v0.s[1]
474	umlal	v6.2d,v28.2s,v0.s[2]
475	shl	v29.2d,v12.2d,#16
476	ext	v29.16b,v29.16b,v29.16b,#8
477	umlal	v7.2d,v28.2s,v0.s[3]
478	add	v29.2d,v29.2d,v12.2d
479	umlal	v8.2d,v28.2s,v1.s[0]
480	mul	v29.2s,v29.2s,v30.2s
481	umlal	v9.2d,v28.2s,v1.s[1]
482	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
483	umlal	v10.2d,v28.2s,v1.s[2]
484	uxtl	v29.4s,v29.4h
485	umlal	v11.2d,v28.2s,v1.s[3]
486	ldr	s28,[x2],#4   // *b++
487	umlal	v12.2d,v29.2s,v2.s[0]
488	umlal	v13.2d,v29.2s,v2.s[1]
489	uxtl	v28.4s,v28.4h
490	umlal	v6.2d,v29.2s,v2.s[2]
491	ushr	v15.2d,v12.2d,#16
492	umlal	v7.2d,v29.2s,v2.s[3]
493	umlal	v8.2d,v29.2s,v3.s[0]
494	ext	v12.16b,v12.16b,v12.16b,#8
495	add	v12.2d,v12.2d,v15.2d
496	umlal	v9.2d,v29.2s,v3.s[1]
497	ushr	v12.2d,v12.2d,#16
498	umlal	v10.2d,v29.2s,v3.s[2]
499	umlal	v11.2d,v29.2s,v3.s[3]
500	add	v16.2d,v13.2d,v12.2d
501	ins	v13.d[0],v16.d[0]
502	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
503	umlal	v13.2d,v28.2s,v0.s[0]
504	ld1	{v12.2d},[x6],#16
505	umlal	v6.2d,v28.2s,v0.s[1]
506	umlal	v7.2d,v28.2s,v0.s[2]
507	shl	v29.2d,v13.2d,#16
508	ext	v29.16b,v29.16b,v29.16b,#8
509	umlal	v8.2d,v28.2s,v0.s[3]
510	add	v29.2d,v29.2d,v13.2d
511	umlal	v9.2d,v28.2s,v1.s[0]
512	mul	v29.2s,v29.2s,v30.2s
513	umlal	v10.2d,v28.2s,v1.s[1]
514	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
515	umlal	v11.2d,v28.2s,v1.s[2]
516	uxtl	v29.4s,v29.4h
517	umlal	v12.2d,v28.2s,v1.s[3]
518	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
519	umlal	v13.2d,v29.2s,v2.s[0]
520	ld1	{v0.4s,v1.4s},[x1],#32
521	umlal	v6.2d,v29.2s,v2.s[1]
522	umlal	v7.2d,v29.2s,v2.s[2]
523	mov	v5.16b,v13.16b
524	ushr	v5.2d,v5.2d,#16
525	ext	v13.16b,v13.16b,v13.16b,#8
526	umlal	v8.2d,v29.2s,v2.s[3]
527	umlal	v9.2d,v29.2s,v3.s[0]
528	add	v13.2d,v13.2d,v5.2d
529	umlal	v10.2d,v29.2s,v3.s[1]
530	ushr	v13.2d,v13.2d,#16
531	eor	v15.16b,v15.16b,v15.16b
532	ins	v13.d[1],v15.d[0]
533	umlal	v11.2d,v29.2s,v3.s[2]
534	umlal	v12.2d,v29.2s,v3.s[3]
535	add	v6.2d,v6.2d,v13.2d
536	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
537	add	x10,sp,#8		// rewind
538	sub	x8,x5,#8
539	b	.LNEON_8n_inner
540
	// Inner loop: replays the eight saved smashed b and m values against
	// the next eight words of a and n. The flags written by the subs at the
	// top stay live for the whole body (no other instruction in the loop
	// writes them); they steer every .LInner_jump* skip, the rewind of x1
	// and the loop branch at the bottom.
541.align	4
542.LNEON_8n_inner:
543	subs	x8,x8,#8
544	umlal	v6.2d,v28.2s,v0.s[0]
545	ld1	{v13.2d},[x6]
546	umlal	v7.2d,v28.2s,v0.s[1]
547	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
548	umlal	v8.2d,v28.2s,v0.s[2]
549	ld1	{v2.4s,v3.4s},[x3],#32
550	umlal	v9.2d,v28.2s,v0.s[3]
551	b.eq	.LInner_jump
552	add	x6,x6,#16	// don't advance in last iteration
553.LInner_jump:
554	umlal	v10.2d,v28.2s,v1.s[0]
555	umlal	v11.2d,v28.2s,v1.s[1]
556	umlal	v12.2d,v28.2s,v1.s[2]
557	umlal	v13.2d,v28.2s,v1.s[3]
558	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
559	umlal	v6.2d,v29.2s,v2.s[0]
560	umlal	v7.2d,v29.2s,v2.s[1]
561	umlal	v8.2d,v29.2s,v2.s[2]
562	umlal	v9.2d,v29.2s,v2.s[3]
563	umlal	v10.2d,v29.2s,v3.s[0]
564	umlal	v11.2d,v29.2s,v3.s[1]
565	umlal	v12.2d,v29.2s,v3.s[2]
566	umlal	v13.2d,v29.2s,v3.s[3]
567	st1	{v6.2d},[x7],#16
568	umlal	v7.2d,v28.2s,v0.s[0]
569	ld1	{v6.2d},[x6]
570	umlal	v8.2d,v28.2s,v0.s[1]
571	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
572	umlal	v9.2d,v28.2s,v0.s[2]
573	b.eq	.LInner_jump1
574	add	x6,x6,#16	// don't advance in last iteration
575.LInner_jump1:
576	umlal	v10.2d,v28.2s,v0.s[3]
577	umlal	v11.2d,v28.2s,v1.s[0]
578	umlal	v12.2d,v28.2s,v1.s[1]
579	umlal	v13.2d,v28.2s,v1.s[2]
580	umlal	v6.2d,v28.2s,v1.s[3]
581	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
582	umlal	v7.2d,v29.2s,v2.s[0]
583	umlal	v8.2d,v29.2s,v2.s[1]
584	umlal	v9.2d,v29.2s,v2.s[2]
585	umlal	v10.2d,v29.2s,v2.s[3]
586	umlal	v11.2d,v29.2s,v3.s[0]
587	umlal	v12.2d,v29.2s,v3.s[1]
588	umlal	v13.2d,v29.2s,v3.s[2]
589	umlal	v6.2d,v29.2s,v3.s[3]
590	st1	{v7.2d},[x7],#16
591	umlal	v8.2d,v28.2s,v0.s[0]
592	ld1	{v7.2d},[x6]
593	umlal	v9.2d,v28.2s,v0.s[1]
594	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
595	umlal	v10.2d,v28.2s,v0.s[2]
596	b.eq	.LInner_jump2
597	add	x6,x6,#16	// don't advance in last iteration
598.LInner_jump2:
599	umlal	v11.2d,v28.2s,v0.s[3]
600	umlal	v12.2d,v28.2s,v1.s[0]
601	umlal	v13.2d,v28.2s,v1.s[1]
602	umlal	v6.2d,v28.2s,v1.s[2]
603	umlal	v7.2d,v28.2s,v1.s[3]
604	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
605	umlal	v8.2d,v29.2s,v2.s[0]
606	umlal	v9.2d,v29.2s,v2.s[1]
607	umlal	v10.2d,v29.2s,v2.s[2]
608	umlal	v11.2d,v29.2s,v2.s[3]
609	umlal	v12.2d,v29.2s,v3.s[0]
610	umlal	v13.2d,v29.2s,v3.s[1]
611	umlal	v6.2d,v29.2s,v3.s[2]
612	umlal	v7.2d,v29.2s,v3.s[3]
613	st1	{v8.2d},[x7],#16
614	umlal	v9.2d,v28.2s,v0.s[0]
615	ld1	{v8.2d},[x6]
616	umlal	v10.2d,v28.2s,v0.s[1]
617	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
618	umlal	v11.2d,v28.2s,v0.s[2]
619	b.eq	.LInner_jump3
620	add	x6,x6,#16	// don't advance in last iteration
621.LInner_jump3:
622	umlal	v12.2d,v28.2s,v0.s[3]
623	umlal	v13.2d,v28.2s,v1.s[0]
624	umlal	v6.2d,v28.2s,v1.s[1]
625	umlal	v7.2d,v28.2s,v1.s[2]
626	umlal	v8.2d,v28.2s,v1.s[3]
627	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
628	umlal	v9.2d,v29.2s,v2.s[0]
629	umlal	v10.2d,v29.2s,v2.s[1]
630	umlal	v11.2d,v29.2s,v2.s[2]
631	umlal	v12.2d,v29.2s,v2.s[3]
632	umlal	v13.2d,v29.2s,v3.s[0]
633	umlal	v6.2d,v29.2s,v3.s[1]
634	umlal	v7.2d,v29.2s,v3.s[2]
635	umlal	v8.2d,v29.2s,v3.s[3]
636	st1	{v9.2d},[x7],#16
637	umlal	v10.2d,v28.2s,v0.s[0]
638	ld1	{v9.2d},[x6]
639	umlal	v11.2d,v28.2s,v0.s[1]
640	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
641	umlal	v12.2d,v28.2s,v0.s[2]
642	b.eq	.LInner_jump4
643	add	x6,x6,#16	// don't advance in last iteration
644.LInner_jump4:
645	umlal	v13.2d,v28.2s,v0.s[3]
646	umlal	v6.2d,v28.2s,v1.s[0]
647	umlal	v7.2d,v28.2s,v1.s[1]
648	umlal	v8.2d,v28.2s,v1.s[2]
649	umlal	v9.2d,v28.2s,v1.s[3]
650	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
651	umlal	v10.2d,v29.2s,v2.s[0]
652	umlal	v11.2d,v29.2s,v2.s[1]
653	umlal	v12.2d,v29.2s,v2.s[2]
654	umlal	v13.2d,v29.2s,v2.s[3]
655	umlal	v6.2d,v29.2s,v3.s[0]
656	umlal	v7.2d,v29.2s,v3.s[1]
657	umlal	v8.2d,v29.2s,v3.s[2]
658	umlal	v9.2d,v29.2s,v3.s[3]
659	st1	{v10.2d},[x7],#16
660	umlal	v11.2d,v28.2s,v0.s[0]
661	ld1	{v10.2d},[x6]
662	umlal	v12.2d,v28.2s,v0.s[1]
663	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
664	umlal	v13.2d,v28.2s,v0.s[2]
665	b.eq	.LInner_jump5
666	add	x6,x6,#16	// don't advance in last iteration
667.LInner_jump5:
668	umlal	v6.2d,v28.2s,v0.s[3]
669	umlal	v7.2d,v28.2s,v1.s[0]
670	umlal	v8.2d,v28.2s,v1.s[1]
671	umlal	v9.2d,v28.2s,v1.s[2]
672	umlal	v10.2d,v28.2s,v1.s[3]
673	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
674	umlal	v11.2d,v29.2s,v2.s[0]
675	umlal	v12.2d,v29.2s,v2.s[1]
676	umlal	v13.2d,v29.2s,v2.s[2]
677	umlal	v6.2d,v29.2s,v2.s[3]
678	umlal	v7.2d,v29.2s,v3.s[0]
679	umlal	v8.2d,v29.2s,v3.s[1]
680	umlal	v9.2d,v29.2s,v3.s[2]
681	umlal	v10.2d,v29.2s,v3.s[3]
682	st1	{v11.2d},[x7],#16
683	umlal	v12.2d,v28.2s,v0.s[0]
684	ld1	{v11.2d},[x6]
685	umlal	v13.2d,v28.2s,v0.s[1]
686	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
687	umlal	v6.2d,v28.2s,v0.s[2]
688	b.eq	.LInner_jump6
689	add	x6,x6,#16	// don't advance in last iteration
690.LInner_jump6:
691	umlal	v7.2d,v28.2s,v0.s[3]
692	umlal	v8.2d,v28.2s,v1.s[0]
693	umlal	v9.2d,v28.2s,v1.s[1]
694	umlal	v10.2d,v28.2s,v1.s[2]
695	umlal	v11.2d,v28.2s,v1.s[3]
696	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
697	umlal	v12.2d,v29.2s,v2.s[0]
698	umlal	v13.2d,v29.2s,v2.s[1]
699	umlal	v6.2d,v29.2s,v2.s[2]
700	umlal	v7.2d,v29.2s,v2.s[3]
701	umlal	v8.2d,v29.2s,v3.s[0]
702	umlal	v9.2d,v29.2s,v3.s[1]
703	umlal	v10.2d,v29.2s,v3.s[2]
704	umlal	v11.2d,v29.2s,v3.s[3]
705	st1	{v12.2d},[x7],#16
706	umlal	v13.2d,v28.2s,v0.s[0]
707	ld1	{v12.2d},[x6]
708	umlal	v6.2d,v28.2s,v0.s[1]
709	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
710	umlal	v7.2d,v28.2s,v0.s[2]
711	b.eq	.LInner_jump7
712	add	x6,x6,#16	// don't advance in last iteration
713.LInner_jump7:
714	umlal	v8.2d,v28.2s,v0.s[3]
715	umlal	v9.2d,v28.2s,v1.s[0]
716	umlal	v10.2d,v28.2s,v1.s[1]
717	umlal	v11.2d,v28.2s,v1.s[2]
718	umlal	v12.2d,v28.2s,v1.s[3]
719	b.ne	.LInner_after_rewind8
720	sub	x1,x1,x5,lsl#2	// rewind
721.LInner_after_rewind8:
722	umlal	v13.2d,v29.2s,v2.s[0]
723	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
724	umlal	v6.2d,v29.2s,v2.s[1]
725	ld1	{v0.4s,v1.4s},[x1],#32
726	umlal	v7.2d,v29.2s,v2.s[2]
727	add	x10,sp,#8		// rewind
728	umlal	v8.2d,v29.2s,v2.s[3]
729	umlal	v9.2d,v29.2s,v3.s[0]
730	umlal	v10.2d,v29.2s,v3.s[1]
731	umlal	v11.2d,v29.2s,v3.s[2]
732	st1	{v13.2d},[x7],#16
733	umlal	v12.2d,v29.2s,v3.s[3]
734
735	bne	.LNEON_8n_inner
	// End of one outer pass: spill the live accumulators, step the outer
	// counter and reload the first eight accumulators from sp+128. v2/v3
	// are zeroed here for the stack wipe that starts after the last pass.
736	add	x6,sp,#128
737	st1	{v6.2d,v7.2d},[x7],#32
738	eor	v2.16b,v2.16b,v2.16b	// v2
739	st1	{v8.2d,v9.2d},[x7],#32
740	eor	v3.16b,v3.16b,v3.16b	// v3
741	st1	{v10.2d,v11.2d},[x7],#32
742	st1	{v12.2d},[x7]
743
744	subs	x9,x9,#8
745	ld1	{v6.2d,v7.2d},[x6],#32
746	ld1	{v8.2d,v9.2d},[x6],#32
747	ld1	{v10.2d,v11.2d},[x6],#32
748	ld1	{v12.2d,v13.2d},[x6],#32
749
750	b.eq	.LInner_8n_jump_2steps
751	sub	x3,x3,x5,lsl#2	// rewind
752	b	.LNEON_8n_outer
753
	// All passes done. The four zero stores post-increment sp itself, so
	// the 128-byte smashed b/m area is wiped and sp ends up equal to x7,
	// the place where the tail writes the 32-bit result words.
754.LInner_8n_jump_2steps:
755	add	x7,sp,#128
756	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
757	mov	v5.16b,v6.16b
758	ushr	v15.2d,v6.2d,#16
759	ext	v6.16b,v6.16b,v6.16b,#8
760	st1	{v2.2d,v3.2d}, [sp],#32
761	add	v6.2d,v6.2d,v15.2d
762	st1	{v2.2d,v3.2d}, [sp],#32
763	ushr	v15.2d,v6.2d,#16
764	st1	{v2.2d,v3.2d}, [sp],#32
765	zip1	v6.4h,v5.4h,v6.4h
766	ins	v15.d[1],v14.d[0]
767
768	mov	x8,x5
769	b	.LNEON_tail_entry
770
	// Tail: turn each accumulator (two 16-bit-radix lanes) into one 32-bit
	// result word. v15 carries the overflow into the next accumulator and
	// its upper lane is cleared from v14 after every step. Eight words are
	// emitted per turn; x8 counts the remaining words.
771.align	4
772.LNEON_tail:
773	add	v6.2d,v6.2d,v15.2d
774	mov	v5.16b,v6.16b
775	ushr	v15.2d,v6.2d,#16
776	ext	v6.16b,v6.16b,v6.16b,#8
777	ld1	{v8.2d,v9.2d}, [x6],#32
778	add	v6.2d,v6.2d,v15.2d
779	ld1	{v10.2d,v11.2d}, [x6],#32
780	ushr	v15.2d,v6.2d,#16
781	ld1	{v12.2d,v13.2d}, [x6],#32
782	zip1	v6.4h,v5.4h,v6.4h
783	ins	v15.d[1],v14.d[0]
784
785.LNEON_tail_entry:
786	add	v7.2d,v7.2d,v15.2d
787	st1	{v6.s}[0], [x7],#4
788	ushr	v15.2d,v7.2d,#16
789	mov	v5.16b,v7.16b
790	ext	v7.16b,v7.16b,v7.16b,#8
791	add	v7.2d,v7.2d,v15.2d
792	ushr	v15.2d,v7.2d,#16
793	zip1	v7.4h,v5.4h,v7.4h
794	ins	v15.d[1],v14.d[0]
795	add	v8.2d,v8.2d,v15.2d
796	st1	{v7.s}[0], [x7],#4
797	ushr	v15.2d,v8.2d,#16
798	mov	v5.16b,v8.16b
799	ext	v8.16b,v8.16b,v8.16b,#8
800	add	v8.2d,v8.2d,v15.2d
801	ushr	v15.2d,v8.2d,#16
802	zip1	v8.4h,v5.4h,v8.4h
803	ins	v15.d[1],v14.d[0]
804	add	v9.2d,v9.2d,v15.2d
805	st1	{v8.s}[0], [x7],#4
806	ushr	v15.2d,v9.2d,#16
807	mov	v5.16b,v9.16b
808	ext	v9.16b,v9.16b,v9.16b,#8
809	add	v9.2d,v9.2d,v15.2d
810	ushr	v15.2d,v9.2d,#16
811	zip1	v9.4h,v5.4h,v9.4h
812	ins	v15.d[1],v14.d[0]
813	add	v10.2d,v10.2d,v15.2d
814	st1	{v9.s}[0], [x7],#4
815	ushr	v15.2d,v10.2d,#16
816	mov	v5.16b,v10.16b
817	ext	v10.16b,v10.16b,v10.16b,#8
818	add	v10.2d,v10.2d,v15.2d
819	ushr	v15.2d,v10.2d,#16
820	zip1	v10.4h,v5.4h,v10.4h
821	ins	v15.d[1],v14.d[0]
822	add	v11.2d,v11.2d,v15.2d
823	st1	{v10.s}[0], [x7],#4
824	ushr	v15.2d,v11.2d,#16
825	mov	v5.16b,v11.16b
826	ext	v11.16b,v11.16b,v11.16b,#8
827	add	v11.2d,v11.2d,v15.2d
828	ushr	v15.2d,v11.2d,#16
829	zip1	v11.4h,v5.4h,v11.4h
830	ins	v15.d[1],v14.d[0]
831	add	v12.2d,v12.2d,v15.2d
832	st1	{v11.s}[0], [x7],#4
833	ushr	v15.2d,v12.2d,#16
834	mov	v5.16b,v12.16b
835	ext	v12.16b,v12.16b,v12.16b,#8
836	add	v12.2d,v12.2d,v15.2d
837	ushr	v15.2d,v12.2d,#16
838	zip1	v12.4h,v5.4h,v12.4h
839	ins	v15.d[1],v14.d[0]
840	add	v13.2d,v13.2d,v15.2d
841	st1	{v12.s}[0], [x7],#4
842	ushr	v15.2d,v13.2d,#16
843	mov	v5.16b,v13.16b
844	ext	v13.16b,v13.16b,v13.16b,#8
845	add	v13.2d,v13.2d,v15.2d
846	ushr	v15.2d,v13.2d,#16
847	zip1	v13.4h,v5.4h,v13.4h
848	ins	v15.d[1],v14.d[0]
849	ld1	{v6.2d,v7.2d}, [x6],#32
850	subs	x8,x8,#8
851	st1	{v13.s}[0], [x7],#4
852	bne	.LNEON_tail
853
	// x1 = start of the result words (sp), x2 = their end. The subs with
	// #0 leaves the carry flag set, which is the "no borrow pending" state
	// the sbcs chain below starts from.
854	st1	{v15.s}[0], [x7],#4	// top-most bit
855	sub	x3,x3,x5,lsl#2		// rewind x3
856	subs	x1,sp,#0			// clear carry flag
857	add	x2,sp,x5,lsl#2
858
	// rp = result - np, four 32-bit words per turn. The loads into w4/w5
	// overwrite the word count in x5; from here the end pointer x2 is used
	// instead. Only the sbcs instructions write the flags inside the loop.
859.LNEON_sub:
860	ldp	w4,w5,[x1],#8
861	ldp	w6,w7,[x1],#8
862	ldp	w8,w9,[x3],#8
863	ldp	w10,w11,[x3],#8
864	sbcs	w8,w4,w8
865	sbcs	w9,w5,w9
866	sbcs	w10,w6,w10
867	sbcs	w11,w7,w11
868	sub	x17,x2,x1
869	stp	w8,w9,[x0],#8
870	stp	w10,w11,[x0],#8
871	cbnz	x17,.LNEON_sub
872
873	ldr	w10, [x1]		// load top-most bit
874	mov	x11,sp
875	eor	v0.16b,v0.16b,v0.16b
876	sub	x11,x2,x11		// this is num*4
877	eor	v1.16b,v1.16b,v1.16b
878	mov	x1,sp
879	sub	x0,x0,x11		// rewind x0
880	mov	x3,x2		// second 3/4th of frame
881	sbcs	w10,w10,wzr		// result is carry flag
882
	// Select and wipe. Carry set means the subtraction did not borrow, so
	// the difference already stored in rp is kept (the movs are skipped).
	// Carry clear means it borrowed, so the unreduced words read through x1
	// replace it. The carry flag is not written anywhere in this loop.
	// x1 and x3 are used to overwrite the scratch area with zeros as the
	// loop advances; x0 ends up one past the last word of rp.
883.LNEON_copy_n_zap:
884	ldp	w4,w5,[x1],#8
885	ldp	w6,w7,[x1],#8
886	ldp	w8,w9,[x0],#8
887	ldp	w10,w11,[x0]
888	sub	x0,x0,#8
889	b.cs	.LCopy_1
890	mov	w8,w4
891	mov	w9,w5
892	mov	w10,w6
893	mov	w11,w7
894.LCopy_1:
895	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
896	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
897	ldp	w4,w5,[x1],#8
898	ldp	w6,w7,[x1],#8
899	stp	w8,w9,[x0],#8
900	stp	w10,w11,[x0],#8
901	sub	x1,x1,#32
902	ldp	w8,w9,[x0],#8
903	ldp	w10,w11,[x0]
904	sub	x0,x0,#8
905	b.cs	.LCopy_2
906	mov	w8, w4
907	mov	w9, w5
908	mov	w10, w6
909	mov	w11, w7
910.LCopy_2:
911	st1	{v0.2d,v1.2d}, [x1],#32		// wipe
912	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
913	sub	x17,x2,x1		// preserves carry
914	stp	w8,w9,[x0],#8
915	stp	w10,w11,[x0],#8
916	cbnz	x17,.LNEON_copy_n_zap
917
	// Epilogue: drop the scratch area, restore d8-d15 and x29. x30 is not
	// reloaded; nothing in this routine writes it.
918	mov	sp,x16
919	ldp	d14,d15,[sp,#64]
920	ldp	d12,d13,[sp,#48]
921	ldp	d10,d11,[sp,#32]
922	ldp	d8,d9,[sp,#16]
	// Return 1, the same value the scalar exit of bn_mul_mont produces.
	// Without this x0 would still hold the rp cursor left behind by
	// .LNEON_copy_n_zap, so the value seen by the caller would depend on
	// the address of rp. NOTE(review): the caller is not visible here; if
	// it reads a 32-bit result, an rp buffer ending on a 4 GiB boundary
	// would have been reported as 0.
	mov	x0,#1
923	ldr	x29,[sp],#80
924	AARCH64_VALIDATE_LINK_REGISTER
925	ret	// bx lr
926
927.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
928.type	__bn_sqr8x_mont,%function
929.align	5
930__bn_sqr8x_mont:
931	cmp	x1,x2
932	b.ne	__bn_mul4x_mont
933.Lsqr8x_mont:
934	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
935	// only from bn_mul_mont which has already signed the return address.
936	stp	x29,x30,[sp,#-128]!
937	add	x29,sp,#0
938	stp	x19,x20,[sp,#16]
939	stp	x21,x22,[sp,#32]
940	stp	x23,x24,[sp,#48]
941	stp	x25,x26,[sp,#64]
942	stp	x27,x28,[sp,#80]
943	stp	x0,x3,[sp,#96]	// offload rp and np
944
945	ldp	x6,x7,[x1,#8*0]
946	ldp	x8,x9,[x1,#8*2]
947	ldp	x10,x11,[x1,#8*4]
948	ldp	x12,x13,[x1,#8*6]
949
950	sub	x2,sp,x5,lsl#4
951	lsl	x5,x5,#3
952	ldr	x4,[x4]		// *n0
953	mov	sp,x2			// alloca
954	sub	x27,x5,#8*8
955	b	.Lsqr8x_zero_start
956
957.Lsqr8x_zero:
958	sub	x27,x27,#8*8
959	stp	xzr,xzr,[x2,#8*0]
960	stp	xzr,xzr,[x2,#8*2]
961	stp	xzr,xzr,[x2,#8*4]
962	stp	xzr,xzr,[x2,#8*6]
963.Lsqr8x_zero_start:
964	stp	xzr,xzr,[x2,#8*8]
965	stp	xzr,xzr,[x2,#8*10]
966	stp	xzr,xzr,[x2,#8*12]
967	stp	xzr,xzr,[x2,#8*14]
968	add	x2,x2,#8*16
969	cbnz	x27,.Lsqr8x_zero
970
971	add	x3,x1,x5
972	add	x1,x1,#8*8
973	mov	x19,xzr
974	mov	x20,xzr
975	mov	x21,xzr
976	mov	x22,xzr
977	mov	x23,xzr
978	mov	x24,xzr
979	mov	x25,xzr
980	mov	x26,xzr
981	mov	x2,sp
982	str	x4,[x29,#112]		// offload n0
983
984	// Multiply everything but a[i]*a[i]
985.align	4
986.Lsqr8x_outer_loop:
987        //                                                 a[1]a[0]	(i)
988        //                                             a[2]a[0]
989        //                                         a[3]a[0]
990        //                                     a[4]a[0]
991        //                                 a[5]a[0]
992        //                             a[6]a[0]
993        //                         a[7]a[0]
994        //                                         a[2]a[1]		(ii)
995        //                                     a[3]a[1]
996        //                                 a[4]a[1]
997        //                             a[5]a[1]
998        //                         a[6]a[1]
999        //                     a[7]a[1]
1000        //                                 a[3]a[2]			(iii)
1001        //                             a[4]a[2]
1002        //                         a[5]a[2]
1003        //                     a[6]a[2]
1004        //                 a[7]a[2]
1005        //                         a[4]a[3]				(iv)
1006        //                     a[5]a[3]
1007        //                 a[6]a[3]
1008        //             a[7]a[3]
1009        //                 a[5]a[4]					(v)
1010        //             a[6]a[4]
1011        //         a[7]a[4]
1012        //         a[6]a[5]						(vi)
1013        //     a[7]a[5]
1014        // a[7]a[6]							(vii)
1015
1016	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
1017	mul	x15,x8,x6
1018	mul	x16,x9,x6
1019	mul	x17,x10,x6
1020	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
1021	mul	x14,x11,x6
1022	adcs	x21,x21,x15
1023	mul	x15,x12,x6
1024	adcs	x22,x22,x16
1025	mul	x16,x13,x6
1026	adcs	x23,x23,x17
1027	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
1028	adcs	x24,x24,x14
1029	umulh	x14,x8,x6
1030	adcs	x25,x25,x15
1031	umulh	x15,x9,x6
1032	adcs	x26,x26,x16
1033	umulh	x16,x10,x6
1034	stp	x19,x20,[x2],#8*2	// t[0..1]
1035	adc	x19,xzr,xzr		// t[8]
1036	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
1037	umulh	x17,x11,x6
1038	adcs	x22,x22,x14
1039	umulh	x14,x12,x6
1040	adcs	x23,x23,x15
1041	umulh	x15,x13,x6
1042	adcs	x24,x24,x16
1043	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
1044	adcs	x25,x25,x17
1045	mul	x17,x9,x7
1046	adcs	x26,x26,x14
1047	mul	x14,x10,x7
1048	adc	x19,x19,x15
1049
1050	mul	x15,x11,x7
1051	adds	x22,x22,x16
1052	mul	x16,x12,x7
1053	adcs	x23,x23,x17
1054	mul	x17,x13,x7
1055	adcs	x24,x24,x14
1056	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
1057	adcs	x25,x25,x15
1058	umulh	x15,x9,x7
1059	adcs	x26,x26,x16
1060	umulh	x16,x10,x7
1061	adcs	x19,x19,x17
1062	umulh	x17,x11,x7
1063	stp	x21,x22,[x2],#8*2	// t[2..3]
1064	adc	x20,xzr,xzr		// t[9]
1065	adds	x23,x23,x14
1066	umulh	x14,x12,x7
1067	adcs	x24,x24,x15
1068	umulh	x15,x13,x7
1069	adcs	x25,x25,x16
1070	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
1071	adcs	x26,x26,x17
1072	mul	x17,x10,x8
1073	adcs	x19,x19,x14
1074	mul	x14,x11,x8
1075	adc	x20,x20,x15
1076
1077	mul	x15,x12,x8
1078	adds	x24,x24,x16
1079	mul	x16,x13,x8
1080	adcs	x25,x25,x17
1081	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
1082	adcs	x26,x26,x14
1083	umulh	x14,x10,x8
1084	adcs	x19,x19,x15
1085	umulh	x15,x11,x8
1086	adcs	x20,x20,x16
1087	umulh	x16,x12,x8
1088	stp	x23,x24,[x2],#8*2	// t[4..5]
1089	adc	x21,xzr,xzr		// t[10]
1090	adds	x25,x25,x17
1091	umulh	x17,x13,x8
1092	adcs	x26,x26,x14
1093	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
1094	adcs	x19,x19,x15
1095	mul	x15,x11,x9
1096	adcs	x20,x20,x16
1097	mul	x16,x12,x9
1098	adc	x21,x21,x17
1099
1100	mul	x17,x13,x9
1101	adds	x26,x26,x14
1102	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
1103	adcs	x19,x19,x15
1104	umulh	x15,x11,x9
1105	adcs	x20,x20,x16
1106	umulh	x16,x12,x9
1107	adcs	x21,x21,x17
1108	umulh	x17,x13,x9
1109	stp	x25,x26,[x2],#8*2	// t[6..7]
1110	adc	x22,xzr,xzr		// t[11]
1111	adds	x19,x19,x14
1112	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
1113	adcs	x20,x20,x15
1114	mul	x15,x12,x10
1115	adcs	x21,x21,x16
1116	mul	x16,x13,x10
1117	adc	x22,x22,x17
1118
1119	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
1120	adds	x20,x20,x14
1121	umulh	x14,x12,x10
1122	adcs	x21,x21,x15
1123	umulh	x15,x13,x10
1124	adcs	x22,x22,x16
1125	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
1126	adc	x23,xzr,xzr		// t[12]
1127	adds	x21,x21,x17
1128	mul	x17,x13,x11
1129	adcs	x22,x22,x14
1130	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
1131	adc	x23,x23,x15
1132
1133	umulh	x15,x13,x11
1134	adds	x22,x22,x16
1135	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
1136	adcs	x23,x23,x17
1137	umulh	x17,x13,x12		// hi(a[7]*a[6])
1138	adc	x24,xzr,xzr		// t[13]
1139	adds	x23,x23,x14
1140	sub	x27,x3,x1	// done yet?
1141	adc	x24,x24,x15
1142
1143	adds	x24,x24,x16
1144	sub	x14,x3,x5	// rewinded ap
1145	adc	x25,xzr,xzr		// t[14]
1146	add	x25,x25,x17
1147
1148	cbz	x27,.Lsqr8x_outer_break
1149
1150	mov	x4,x6
1151	ldp	x6,x7,[x2,#8*0]
1152	ldp	x8,x9,[x2,#8*2]
1153	ldp	x10,x11,[x2,#8*4]
1154	ldp	x12,x13,[x2,#8*6]
1155	adds	x19,x19,x6
1156	adcs	x20,x20,x7
1157	ldp	x6,x7,[x1,#8*0]
1158	adcs	x21,x21,x8
1159	adcs	x22,x22,x9
1160	ldp	x8,x9,[x1,#8*2]
1161	adcs	x23,x23,x10
1162	adcs	x24,x24,x11
1163	ldp	x10,x11,[x1,#8*4]
1164	adcs	x25,x25,x12
1165	mov	x0,x1
1166	adcs	x26,xzr,x13
1167	ldp	x12,x13,[x1,#8*6]
1168	add	x1,x1,#8*8
1169	//adc	x28,xzr,xzr		// moved below
1170	mov	x27,#-8*8
1171
1172	//                                                         a[8]a[0]
1173	//                                                     a[9]a[0]
1174	//                                                 a[a]a[0]
1175	//                                             a[b]a[0]
1176	//                                         a[c]a[0]
1177	//                                     a[d]a[0]
1178	//                                 a[e]a[0]
1179	//                             a[f]a[0]
1180	//                                                     a[8]a[1]
1181	//                         a[f]a[1]........................
1182	//                                                 a[8]a[2]
1183	//                     a[f]a[2]........................
1184	//                                             a[8]a[3]
1185	//                 a[f]a[3]........................
1186	//                                         a[8]a[4]
1187	//             a[f]a[4]........................
1188	//                                     a[8]a[5]
1189	//         a[f]a[5]........................
1190	//                                 a[8]a[6]
1191	//     a[f]a[6]........................
1192	//                             a[8]a[7]
1193	// a[f]a[7]........................
1194.Lsqr8x_mul:
1195	mul	x14,x6,x4
1196	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1197	mul	x15,x7,x4
1198	add	x27,x27,#8
1199	mul	x16,x8,x4
1200	mul	x17,x9,x4
1201	adds	x19,x19,x14
1202	mul	x14,x10,x4
1203	adcs	x20,x20,x15
1204	mul	x15,x11,x4
1205	adcs	x21,x21,x16
1206	mul	x16,x12,x4
1207	adcs	x22,x22,x17
1208	mul	x17,x13,x4
1209	adcs	x23,x23,x14
1210	umulh	x14,x6,x4
1211	adcs	x24,x24,x15
1212	umulh	x15,x7,x4
1213	adcs	x25,x25,x16
1214	umulh	x16,x8,x4
1215	adcs	x26,x26,x17
1216	umulh	x17,x9,x4
1217	adc	x28,x28,xzr
1218	str	x19,[x2],#8
1219	adds	x19,x20,x14
1220	umulh	x14,x10,x4
1221	adcs	x20,x21,x15
1222	umulh	x15,x11,x4
1223	adcs	x21,x22,x16
1224	umulh	x16,x12,x4
1225	adcs	x22,x23,x17
1226	umulh	x17,x13,x4
1227	ldr	x4,[x0,x27]
1228	adcs	x23,x24,x14
1229	adcs	x24,x25,x15
1230	adcs	x25,x26,x16
1231	adcs	x26,x28,x17
1232	//adc	x28,xzr,xzr		// moved above
1233	cbnz	x27,.Lsqr8x_mul
1234					// note that carry flag is guaranteed
1235					// to be zero at this point
1236	cmp	x1,x3		// done yet?
1237	b.eq	.Lsqr8x_break
1238
1239	ldp	x6,x7,[x2,#8*0]
1240	ldp	x8,x9,[x2,#8*2]
1241	ldp	x10,x11,[x2,#8*4]
1242	ldp	x12,x13,[x2,#8*6]
1243	adds	x19,x19,x6
1244	ldur	x4,[x0,#-8*8]
1245	adcs	x20,x20,x7
1246	ldp	x6,x7,[x1,#8*0]
1247	adcs	x21,x21,x8
1248	adcs	x22,x22,x9
1249	ldp	x8,x9,[x1,#8*2]
1250	adcs	x23,x23,x10
1251	adcs	x24,x24,x11
1252	ldp	x10,x11,[x1,#8*4]
1253	adcs	x25,x25,x12
1254	mov	x27,#-8*8
1255	adcs	x26,x26,x13
1256	ldp	x12,x13,[x1,#8*6]
1257	add	x1,x1,#8*8
1258	//adc	x28,xzr,xzr		// moved above
1259	b	.Lsqr8x_mul
1260
1261.align	4
1262.Lsqr8x_break:
1263	ldp	x6,x7,[x0,#8*0]
1264	add	x1,x0,#8*8
1265	ldp	x8,x9,[x0,#8*2]
1266	sub	x14,x3,x1		// is it last iteration?
1267	ldp	x10,x11,[x0,#8*4]
1268	sub	x15,x2,x14
1269	ldp	x12,x13,[x0,#8*6]
1270	cbz	x14,.Lsqr8x_outer_loop
1271
1272	stp	x19,x20,[x2,#8*0]
1273	ldp	x19,x20,[x15,#8*0]
1274	stp	x21,x22,[x2,#8*2]
1275	ldp	x21,x22,[x15,#8*2]
1276	stp	x23,x24,[x2,#8*4]
1277	ldp	x23,x24,[x15,#8*4]
1278	stp	x25,x26,[x2,#8*6]
1279	mov	x2,x15
1280	ldp	x25,x26,[x15,#8*6]
1281	b	.Lsqr8x_outer_loop
1282
1283.align	4
1284.Lsqr8x_outer_break:
1285	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1286	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
1287	ldp	x15,x16,[sp,#8*1]
1288	ldp	x11,x13,[x14,#8*2]
1289	add	x1,x14,#8*4
1290	ldp	x17,x14,[sp,#8*3]
1291
1292	stp	x19,x20,[x2,#8*0]
1293	mul	x19,x7,x7
1294	stp	x21,x22,[x2,#8*2]
1295	umulh	x7,x7,x7
1296	stp	x23,x24,[x2,#8*4]
1297	mul	x8,x9,x9
1298	stp	x25,x26,[x2,#8*6]
1299	mov	x2,sp
1300	umulh	x9,x9,x9
1301	adds	x20,x7,x15,lsl#1
1302	extr	x15,x16,x15,#63
1303	sub	x27,x5,#8*4
1304
1305.Lsqr4x_shift_n_add:
1306	adcs	x21,x8,x15
1307	extr	x16,x17,x16,#63
1308	sub	x27,x27,#8*4
1309	adcs	x22,x9,x16
1310	ldp	x15,x16,[x2,#8*5]
1311	mul	x10,x11,x11
1312	ldp	x7,x9,[x1],#8*2
1313	umulh	x11,x11,x11
1314	mul	x12,x13,x13
1315	umulh	x13,x13,x13
1316	extr	x17,x14,x17,#63
1317	stp	x19,x20,[x2,#8*0]
1318	adcs	x23,x10,x17
1319	extr	x14,x15,x14,#63
1320	stp	x21,x22,[x2,#8*2]
1321	adcs	x24,x11,x14
1322	ldp	x17,x14,[x2,#8*7]
1323	extr	x15,x16,x15,#63
1324	adcs	x25,x12,x15
1325	extr	x16,x17,x16,#63
1326	adcs	x26,x13,x16
1327	ldp	x15,x16,[x2,#8*9]
1328	mul	x6,x7,x7
1329	ldp	x11,x13,[x1],#8*2
1330	umulh	x7,x7,x7
1331	mul	x8,x9,x9
1332	umulh	x9,x9,x9
1333	stp	x23,x24,[x2,#8*4]
1334	extr	x17,x14,x17,#63
1335	stp	x25,x26,[x2,#8*6]
1336	add	x2,x2,#8*8
1337	adcs	x19,x6,x17
1338	extr	x14,x15,x14,#63
1339	adcs	x20,x7,x14
1340	ldp	x17,x14,[x2,#8*3]
1341	extr	x15,x16,x15,#63
1342	cbnz	x27,.Lsqr4x_shift_n_add
1343	ldp	x1,x4,[x29,#104]	// pull np and n0
1344
1345	adcs	x21,x8,x15
1346	extr	x16,x17,x16,#63
1347	adcs	x22,x9,x16
1348	ldp	x15,x16,[x2,#8*5]
1349	mul	x10,x11,x11
1350	umulh	x11,x11,x11
1351	stp	x19,x20,[x2,#8*0]
1352	mul	x12,x13,x13
1353	umulh	x13,x13,x13
1354	stp	x21,x22,[x2,#8*2]
1355	extr	x17,x14,x17,#63
1356	adcs	x23,x10,x17
1357	extr	x14,x15,x14,#63
1358	ldp	x19,x20,[sp,#8*0]
1359	adcs	x24,x11,x14
1360	extr	x15,x16,x15,#63
1361	ldp	x6,x7,[x1,#8*0]
1362	adcs	x25,x12,x15
1363	extr	x16,xzr,x16,#63
1364	ldp	x8,x9,[x1,#8*2]
1365	adc	x26,x13,x16
1366	ldp	x10,x11,[x1,#8*4]
1367
1368	// Reduce by 512 bits per iteration
1369	mul	x28,x4,x19		// t[0]*n0
1370	ldp	x12,x13,[x1,#8*6]
1371	add	x3,x1,x5
1372	ldp	x21,x22,[sp,#8*2]
1373	stp	x23,x24,[x2,#8*4]
1374	ldp	x23,x24,[sp,#8*4]
1375	stp	x25,x26,[x2,#8*6]
1376	ldp	x25,x26,[sp,#8*6]
1377	add	x1,x1,#8*8
1378	mov	x30,xzr		// initial top-most carry
1379	mov	x2,sp
1380	mov	x27,#8
1381
1382.Lsqr8x_reduction:
1383	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
1384	mul	x15,x7,x28
1385	sub	x27,x27,#1
1386	mul	x16,x8,x28
1387	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
1388	mul	x17,x9,x28
1389	// (*)	adds	xzr,x19,x14
1390	subs	xzr,x19,#1		// (*)
1391	mul	x14,x10,x28
1392	adcs	x19,x20,x15
1393	mul	x15,x11,x28
1394	adcs	x20,x21,x16
1395	mul	x16,x12,x28
1396	adcs	x21,x22,x17
1397	mul	x17,x13,x28
1398	adcs	x22,x23,x14
1399	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
1400	adcs	x23,x24,x15
1401	umulh	x15,x7,x28
1402	adcs	x24,x25,x16
1403	umulh	x16,x8,x28
1404	adcs	x25,x26,x17
1405	umulh	x17,x9,x28
1406	adc	x26,xzr,xzr
1407	adds	x19,x19,x14
1408	umulh	x14,x10,x28
1409	adcs	x20,x20,x15
1410	umulh	x15,x11,x28
1411	adcs	x21,x21,x16
1412	umulh	x16,x12,x28
1413	adcs	x22,x22,x17
1414	umulh	x17,x13,x28
1415	mul	x28,x4,x19		// next t[0]*n0
1416	adcs	x23,x23,x14
1417	adcs	x24,x24,x15
1418	adcs	x25,x25,x16
1419	adc	x26,x26,x17
1420	cbnz	x27,.Lsqr8x_reduction
1421
1422	ldp	x14,x15,[x2,#8*0]
1423	ldp	x16,x17,[x2,#8*2]
1424	mov	x0,x2
1425	sub	x27,x3,x1	// done yet?
1426	adds	x19,x19,x14
1427	adcs	x20,x20,x15
1428	ldp	x14,x15,[x2,#8*4]
1429	adcs	x21,x21,x16
1430	adcs	x22,x22,x17
1431	ldp	x16,x17,[x2,#8*6]
1432	adcs	x23,x23,x14
1433	adcs	x24,x24,x15
1434	adcs	x25,x25,x16
1435	adcs	x26,x26,x17
1436	//adc	x28,xzr,xzr		// moved below
1437	cbz	x27,.Lsqr8x8_post_condition
1438
1439	ldur	x4,[x2,#-8*8]
1440	ldp	x6,x7,[x1,#8*0]
1441	ldp	x8,x9,[x1,#8*2]
1442	ldp	x10,x11,[x1,#8*4]
1443	mov	x27,#-8*8
1444	ldp	x12,x13,[x1,#8*6]
1445	add	x1,x1,#8*8
1446
1447.Lsqr8x_tail:
1448	mul	x14,x6,x4
1449	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1450	mul	x15,x7,x4
1451	add	x27,x27,#8
1452	mul	x16,x8,x4
1453	mul	x17,x9,x4
1454	adds	x19,x19,x14
1455	mul	x14,x10,x4
1456	adcs	x20,x20,x15
1457	mul	x15,x11,x4
1458	adcs	x21,x21,x16
1459	mul	x16,x12,x4
1460	adcs	x22,x22,x17
1461	mul	x17,x13,x4
1462	adcs	x23,x23,x14
1463	umulh	x14,x6,x4
1464	adcs	x24,x24,x15
1465	umulh	x15,x7,x4
1466	adcs	x25,x25,x16
1467	umulh	x16,x8,x4
1468	adcs	x26,x26,x17
1469	umulh	x17,x9,x4
1470	adc	x28,x28,xzr
1471	str	x19,[x2],#8
1472	adds	x19,x20,x14
1473	umulh	x14,x10,x4
1474	adcs	x20,x21,x15
1475	umulh	x15,x11,x4
1476	adcs	x21,x22,x16
1477	umulh	x16,x12,x4
1478	adcs	x22,x23,x17
1479	umulh	x17,x13,x4
1480	ldr	x4,[x0,x27]
1481	adcs	x23,x24,x14
1482	adcs	x24,x25,x15
1483	adcs	x25,x26,x16
1484	adcs	x26,x28,x17
1485	//adc	x28,xzr,xzr		// moved above
1486	cbnz	x27,.Lsqr8x_tail
1487					// note that carry flag is guaranteed
1488					// to be zero at this point
1489	ldp	x6,x7,[x2,#8*0]
1490	sub	x27,x3,x1	// done yet?
1491	sub	x16,x3,x5	// rewinded np
1492	ldp	x8,x9,[x2,#8*2]
1493	ldp	x10,x11,[x2,#8*4]
1494	ldp	x12,x13,[x2,#8*6]
1495	cbz	x27,.Lsqr8x_tail_break
1496
1497	ldur	x4,[x0,#-8*8]
1498	adds	x19,x19,x6
1499	adcs	x20,x20,x7
1500	ldp	x6,x7,[x1,#8*0]
1501	adcs	x21,x21,x8
1502	adcs	x22,x22,x9
1503	ldp	x8,x9,[x1,#8*2]
1504	adcs	x23,x23,x10
1505	adcs	x24,x24,x11
1506	ldp	x10,x11,[x1,#8*4]
1507	adcs	x25,x25,x12
1508	mov	x27,#-8*8
1509	adcs	x26,x26,x13
1510	ldp	x12,x13,[x1,#8*6]
1511	add	x1,x1,#8*8
1512	//adc	x28,xzr,xzr		// moved above
1513	b	.Lsqr8x_tail
1514
1515.align	4
1516.Lsqr8x_tail_break:
1517	ldr	x4,[x29,#112]		// pull n0
1518	add	x27,x2,#8*8		// end of current t[num] window
1519
1520	subs	xzr,x30,#1		// "move" top-most carry to carry bit
1521	adcs	x14,x19,x6
1522	adcs	x15,x20,x7
1523	ldp	x19,x20,[x0,#8*0]
1524	adcs	x21,x21,x8
1525	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
1526	adcs	x22,x22,x9
1527	ldp	x8,x9,[x16,#8*2]
1528	adcs	x23,x23,x10
1529	adcs	x24,x24,x11
1530	ldp	x10,x11,[x16,#8*4]
1531	adcs	x25,x25,x12
1532	adcs	x26,x26,x13
1533	ldp	x12,x13,[x16,#8*6]
1534	add	x1,x16,#8*8
1535	adc	x30,xzr,xzr	// top-most carry
1536	mul	x28,x4,x19
1537	stp	x14,x15,[x2,#8*0]
1538	stp	x21,x22,[x2,#8*2]
1539	ldp	x21,x22,[x0,#8*2]
1540	stp	x23,x24,[x2,#8*4]
1541	ldp	x23,x24,[x0,#8*4]
1542	cmp	x27,x29		// did we hit the bottom?
1543	stp	x25,x26,[x2,#8*6]
1544	mov	x2,x0			// slide the window
1545	ldp	x25,x26,[x0,#8*6]
1546	mov	x27,#8
1547	b.ne	.Lsqr8x_reduction
1548
1549	// Final step. We see if result is larger than modulus, and
1550	// if it is, subtract the modulus. But comparison implies
1551	// subtraction. So we subtract modulus, see if it borrowed,
1552	// and conditionally copy original value.
1553	ldr	x0,[x29,#96]		// pull rp
1554	add	x2,x2,#8*8
1555	subs	x14,x19,x6
1556	sbcs	x15,x20,x7
1557	sub	x27,x5,#8*8
1558	mov	x3,x0		// x0 copy
1559
1560.Lsqr8x_sub:
1561	sbcs	x16,x21,x8
1562	ldp	x6,x7,[x1,#8*0]
1563	sbcs	x17,x22,x9
1564	stp	x14,x15,[x0,#8*0]
1565	sbcs	x14,x23,x10
1566	ldp	x8,x9,[x1,#8*2]
1567	sbcs	x15,x24,x11
1568	stp	x16,x17,[x0,#8*2]
1569	sbcs	x16,x25,x12
1570	ldp	x10,x11,[x1,#8*4]
1571	sbcs	x17,x26,x13
1572	ldp	x12,x13,[x1,#8*6]
1573	add	x1,x1,#8*8
1574	ldp	x19,x20,[x2,#8*0]
1575	sub	x27,x27,#8*8
1576	ldp	x21,x22,[x2,#8*2]
1577	ldp	x23,x24,[x2,#8*4]
1578	ldp	x25,x26,[x2,#8*6]
1579	add	x2,x2,#8*8
1580	stp	x14,x15,[x0,#8*4]
1581	sbcs	x14,x19,x6
1582	stp	x16,x17,[x0,#8*6]
1583	add	x0,x0,#8*8
1584	sbcs	x15,x20,x7
1585	cbnz	x27,.Lsqr8x_sub
1586
1587	sbcs	x16,x21,x8
1588	mov	x2,sp
1589	add	x1,sp,x5
1590	ldp	x6,x7,[x3,#8*0]
1591	sbcs	x17,x22,x9
1592	stp	x14,x15,[x0,#8*0]
1593	sbcs	x14,x23,x10
1594	ldp	x8,x9,[x3,#8*2]
1595	sbcs	x15,x24,x11
1596	stp	x16,x17,[x0,#8*2]
1597	sbcs	x16,x25,x12
1598	ldp	x19,x20,[x1,#8*0]
1599	sbcs	x17,x26,x13
1600	ldp	x21,x22,[x1,#8*2]
1601	sbcs	xzr,x30,xzr	// did it borrow?
1602	ldr	x30,[x29,#8]		// pull return address
1603	stp	x14,x15,[x0,#8*4]
1604	stp	x16,x17,[x0,#8*6]
1605
1606	sub	x27,x5,#8*4
1607.Lsqr4x_cond_copy:
1608	sub	x27,x27,#8*4
1609	csel	x14,x19,x6,lo
1610	stp	xzr,xzr,[x2,#8*0]
1611	csel	x15,x20,x7,lo
1612	ldp	x6,x7,[x3,#8*4]
1613	ldp	x19,x20,[x1,#8*4]
1614	csel	x16,x21,x8,lo
1615	stp	xzr,xzr,[x2,#8*2]
1616	add	x2,x2,#8*4
1617	csel	x17,x22,x9,lo
1618	ldp	x8,x9,[x3,#8*6]
1619	ldp	x21,x22,[x1,#8*6]
1620	add	x1,x1,#8*4
1621	stp	x14,x15,[x3,#8*0]
1622	stp	x16,x17,[x3,#8*2]
1623	add	x3,x3,#8*4
1624	stp	xzr,xzr,[x1,#8*0]
1625	stp	xzr,xzr,[x1,#8*2]
1626	cbnz	x27,.Lsqr4x_cond_copy
1627
1628	csel	x14,x19,x6,lo
1629	stp	xzr,xzr,[x2,#8*0]
1630	csel	x15,x20,x7,lo
1631	stp	xzr,xzr,[x2,#8*2]
1632	csel	x16,x21,x8,lo
1633	csel	x17,x22,x9,lo
1634	stp	x14,x15,[x3,#8*0]
1635	stp	x16,x17,[x3,#8*2]
1636
1637	b	.Lsqr8x_done
1638
1639.align	4
1640.Lsqr8x8_post_condition:
1641	adc	x28,xzr,xzr
1642	ldr	x30,[x29,#8]		// pull return address
1643	// x19-7,x28 hold result, x6-7 hold modulus
1644	subs	x6,x19,x6
1645	ldr	x1,[x29,#96]		// pull rp
1646	sbcs	x7,x20,x7
1647	stp	xzr,xzr,[sp,#8*0]
1648	sbcs	x8,x21,x8
1649	stp	xzr,xzr,[sp,#8*2]
1650	sbcs	x9,x22,x9
1651	stp	xzr,xzr,[sp,#8*4]
1652	sbcs	x10,x23,x10
1653	stp	xzr,xzr,[sp,#8*6]
1654	sbcs	x11,x24,x11
1655	stp	xzr,xzr,[sp,#8*8]
1656	sbcs	x12,x25,x12
1657	stp	xzr,xzr,[sp,#8*10]
1658	sbcs	x13,x26,x13
1659	stp	xzr,xzr,[sp,#8*12]
1660	sbcs	x28,x28,xzr	// did it borrow?
1661	stp	xzr,xzr,[sp,#8*14]
1662
1663	// x6-7 hold result-modulus
1664	csel	x6,x19,x6,lo
1665	csel	x7,x20,x7,lo
1666	csel	x8,x21,x8,lo
1667	csel	x9,x22,x9,lo
1668	stp	x6,x7,[x1,#8*0]
1669	csel	x10,x23,x10,lo
1670	csel	x11,x24,x11,lo
1671	stp	x8,x9,[x1,#8*2]
1672	csel	x12,x25,x12,lo
1673	csel	x13,x26,x13,lo
1674	stp	x10,x11,[x1,#8*4]
1675	stp	x12,x13,[x1,#8*6]
1676
1677.Lsqr8x_done:
1678	ldp	x19,x20,[x29,#16]
1679	mov	sp,x29
1680	ldp	x21,x22,[x29,#32]
1681	mov	x0,#1
1682	ldp	x23,x24,[x29,#48]
1683	ldp	x25,x26,[x29,#64]
1684	ldp	x27,x28,[x29,#80]
1685	ldr	x29,[sp],#128
1686	// x30 is loaded earlier
1687	AARCH64_VALIDATE_LINK_REGISTER
1688	ret
1689.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// __bn_mul4x_mont: Montgomery multiplication that walks a[], n[] and the
// running result four 64-bit limbs at a time.
//
// Reached by a branch (not a call) from bn_mul_mont, whose dispatcher only
// comes here when num is a multiple of 4, so the arguments are still in the
// registers bn_mul_mont received them in:
//   x0 = rp   result pointer (parked at [x29,#96], written at the very end)
//   x1 = ap   first operand
//   x2 = bp   second operand
//   x3 = np   modulus
//   x4 = &n0  pointer to the Montgomery constant, dereferenced below
//   x5 = num  limb count; converted to a byte count (num*8) on entry
// Returns x0 = 1.
//
// Frame: 128 bytes at x29 holding x29/x30, x19-x28, then rp and &b[num] at
// offsets 96 and 104.  Directly below it num+4 limbs of scratch are
// allocated:
//   sp[0..3]      the four t[0]*n0 multipliers produced by a reduction pass
//   sp[4..num+3]  the running result t[0..num-1]
// The whole scratch area is zeroed before returning.
//
// Register roles inside the loops:
//   x6-x9    current four limbs of a[]
//   x14-x17  current four limbs of n[]
//   x10-x13  product temporaries
//   x19-x22  accumulator window, x23 its spill-over limb
//   x24      current b[i];  x25 current t[0]*n0 multiplier
//   x26      scratch-area cursor;  x27 = &a[num]
//   x28      byte index cycling 8,16,24,0 over four b[] / multiplier slots
//   x0       carry handed from one pass to the next (after rp is saved)
//   x30      top-most carry of a row; the return address is reloaded from
//            the frame before the function returns
//
// NOTE: this file is generated from armv8-mont.pl; any change made here has
// to be mirrored in the generator.
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca: num+4 limbs in total

	add	x10,x2,x5
	add	x27,x1,x5		// &a[num]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

	// First row, first four limbs: multiply a[0..3] by b[0..3] in turn and
	// reduce as we go, saving each t[0]*n0 multiplier in sp[0..3].
	//
	// (*) The low limb of n[0]*(t[0]*n0) added to t[0] is zero by
	// construction, so the multiplication and addition are omitted and only
	// the carry is reproduced: it is set exactly when t[0] is non-zero,
	// which is what "subs xzr,t0,#1" yields.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// zero once a[] is exhausted
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num == 4: result is in registers

	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

	// First row, remaining limbs: same b[0..3] and the saved multipliers
	// applied to the next four limbs of a[] and n[]; finished limbs are
	// written out through x26.
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

	// First row finished: store its top four limbs, latch the top-most
	// carry in x30 and set up the next group of four b[] limbs.
.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

	// Subsequent rows, first four limbs: as .Loop_mul4x_1st_reduction, but
	// accumulating onto the t[0..3] loaded from the scratch area.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

	// Subsequent rows, remaining limbs: as .Loop_mul4x_1st_tail, with the
	// previous row's t[] limbs folded in between groups of four.
.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

	// End of a row: fold in the previous top-most carry, store the top
	// four limbs, and either start the next group of b[] or finish.
.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

	// rp[] = t[] - n[], four limbs per iteration, borrow kept in flags.
.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp			// x26 = start of scratch, to be wiped
	add	x1,sp,#8*4		// x1 = &t[0]
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Select t[] (on borrow) or the difference already in rp[], four limbs
	// per iteration, zeroing the scratch area behind the read position.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	// Last group.  x26 now points at scratch limb num-4 and every limb of
	// t[] has already been loaded, so the four stores below clear limbs
	// num-4 .. num+3, i.e. everything that is left of the num+4 limb
	// scratch area.  (Offsets #8*3/#8*4 were used for the last two stores
	// before; they overlapped the second store and left the two top limbs
	// of t[] behind on the stack.)
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*4]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*6]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

	// num == 4: the whole result is still in x19-x22 (plus carry in x0),
	// so subtract, select and wipe the 8-limb scratch area inline.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

	// Common exit: restore callee-saved registers, drop the frame and
	// return 1.
.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// NUL-terminated identification string embedded after the code; the bytes
// are the same as the former decimal .byte list.
.asciz	"Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align	2
.align	4
2135