// xref: /freebsd/sys/crypto/openssl/aarch64/armv8-mont.S (revision ee12faa062c04a49bf6fe4e6867bad8606e2413f)
/* Do not modify. This file is auto-generated from armv8-mont.pl. */
#ifndef	__KERNEL__
# include "arm_arch.h"

.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0, int num)
//
// x0 = rp (result), x1 = ap, x2 = bp, x3 = np (modulus), x4 = &n0
// (Montgomery n0' constant, loaded below), x5 = num (64-bit limb count).
//
// Entry point first dispatches to specialized implementations; the
// fall-through path (.Lmul_mont) is the baseline one-word-at-a-time
// Montgomery multiplication used when num is not a multiple of 4.
.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
.Lbn_mul_mont:
	tst	x5,#3			// num % 4 != 0 -> baseline path
	b.ne	.Lmul_mont
	cmp	x5,#32			// small sizes stay scalar
	b.le	.Lscalar_impl
#ifndef	__KERNEL__
	// Runtime capability flag set by OpenSSL init code; when non-zero,
	// large even-sized operands go to the NEON implementation.
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	x5,#7			// num % 8 == 0 -> 8x squaring-capable path
	b.eq	__bn_sqr8x_mont
	tst	x5,#3			// num % 4 == 0 -> 4x path
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	// Baseline implementation. Save frame and callee-saved x19-x24.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// x22 = scratch tp[num] below sp
	ldp	x7,x8,[x1],#16	// ap[0..1]
	lsl	x5,x5,#3		// num is a byte count from here on
	ldr	x4,[x4]		// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	// First pass (i=0): tp = ap*bp[0], reduced by m1 = tp[0]*n0 mod 2^64.
	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

	// Loop over remaining limbs j of the first pass; stores tp[j-1]
	// each iteration, carries kept in x7 (mul chain) and x13 (red chain).
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	// Close the first pass: fold the two carry chains, record the
	// top overflow bit in x19, and rewind ap/np for the outer loop.
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

	// Outer loop over remaining bp words: tp += ap*bp[i], then reduce
	// with m1 = (tp[0]+ap[0]*bp[i])*n0 as in the first pass.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*) same carry trick as above
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	// Close one outer iteration; x19 accumulates the running
	// top overflow bit across iterations.
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Branch-free conditional copy: csel picks tp (no borrow) or the
	// subtracted value already in rp, while wiping tp on the stack.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	// Restore callee-saved registers and return 1.
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont
// bn_mul8x_mont_neon — NEON Montgomery multiplication, reached from
// bn_mul_mont for num > 32 when the OPENSSL_armv8_rsa_neonized flag is set.
//
// Same argument registers as bn_mul_mont (x0=rp, x1=ap, x2=bp, x3=np,
// x4=&n0, x5=num). x5 is doubled on entry (lsl #1): the loops below count
// 32-bit half-limbs, and b/np words are consumed 32 bits at a time
// (ldr s28/[x2],#4 etc.). Products are accumulated in 64-bit lanes of
// v6-v13 with the shl/ushr #16 pattern splitting values into 16-bit
// digit slices for deferred carry propagation.
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Prologue: save frame and callee-saved d8-d15 (low halves of v8-v15).
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp			// x16 = original sp, restored at the end
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	x5,x5,#1		// num in 32-bit half-limbs from here on
	eor	v14.16b,v14.16b,v14.16b	// v14 = 0, reused as carry filler

.align	4
.LNEON_8n:
	// Allocate the stack frame (interleaved with zeroing v6..v13).
	eor	v6.16b,v6.16b,v6.16b
	sub	x7,sp,#128
	eor	v7.16b,v7.16b,v7.16b
	sub	x7,x7,x5,lsl#4
	eor	v8.16b,v8.16b,v8.16b
	and	x7,x7,#-64		// 64-byte align the frame
	eor	v9.16b,v9.16b,v9.16b
	mov	sp,x7		// alloca
	eor	v10.16b,v10.16b,v10.16b
	add	x7,x7,#256
	eor	v11.16b,v11.16b,v11.16b
	sub	x8,x5,#8
	eor	v12.16b,v12.16b,v12.16b
	eor	v13.16b,v13.16b,v13.16b

	// Zero-fill the accumulator area of the frame.
.LNEON_8n_init:
	st1	{v6.2d,v7.2d},[x7],#32
	subs	x8,x8,#8
	st1	{v8.2d,v9.2d},[x7],#32
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d,v13.2d},[x7],#32
	bne	.LNEON_8n_init

	add	x6,sp,#256		// x6 = accumulator read pointer
	ld1	{v0.4s,v1.4s},[x1],#32	// first 8 half-limbs of a
	add	x10,sp,#8		// x10 = stash area for smashed b/m words
	ldr	s30,[x4],#4		// n0 (32-bit)
	mov	x9,x5			// x9 = outer countdown
	b	.LNEON_8n_outer

	// Outer pass: consume 8 words of b. Each of the 8 unrolled stanzas
	// below multiplies one b word into the rotating accumulators
	// v6..v13, derives the Montgomery multiplier m = lo32(acc)*n0
	// (mul v29,v29,v30), folds in m*np, and stashes the "smashed"
	// b and m words for reuse by the inner loop.
.align	4
.LNEON_8n_outer:
	ldr	s28,[x2],#4   // *b++
	uxtl	v28.4s,v28.4h
	add	x7,sp,#128
	ld1	{v2.4s,v3.4s},[x3],#32

	umlal	v6.2d,v28.2s,v0.s[0]
	umlal	v7.2d,v28.2s,v0.s[1]
	umlal	v8.2d,v28.2s,v0.s[2]
	shl	v29.2d,v6.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v9.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v6.2d
	umlal	v10.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v11.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	v12.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v13.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v8.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v6.2d,#16
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	ext	v6.16b,v6.16b,v6.16b,#8
	add	v6.2d,v6.2d,v15.2d
	umlal	v11.2d,v29.2s,v3.s[1]
	ushr	v6.2d,v6.2d,#16
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	add	v16.2d,v7.2d,v6.2d
	ins	v7.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6],#16
	umlal	v8.2d,v28.2s,v0.s[1]
	umlal	v9.2d,v28.2s,v0.s[2]
	shl	v29.2d,v7.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v10.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v7.2d
	umlal	v11.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v12.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
	umlal	v13.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v6.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v9.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v7.2d,#16
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	umlal	v12.2d,v29.2s,v3.s[1]
	ushr	v7.2d,v7.2d,#16
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	add	v16.2d,v8.2d,v7.2d
	ins	v8.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6],#16
	umlal	v9.2d,v28.2s,v0.s[1]
	umlal	v10.2d,v28.2s,v0.s[2]
	shl	v29.2d,v8.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v11.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v8.2d
	umlal	v12.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v13.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
	umlal	v6.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v7.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v10.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v8.2d,#16
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	umlal	v13.2d,v29.2s,v3.s[1]
	ushr	v8.2d,v8.2d,#16
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	add	v16.2d,v9.2d,v8.2d
	ins	v9.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6],#16
	umlal	v10.2d,v28.2s,v0.s[1]
	umlal	v11.2d,v28.2s,v0.s[2]
	shl	v29.2d,v9.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v12.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v9.2d
	umlal	v13.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v6.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
	umlal	v7.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v8.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v11.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v9.2d,#16
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	umlal	v6.2d,v29.2s,v3.s[1]
	ushr	v9.2d,v9.2d,#16
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	add	v16.2d,v10.2d,v9.2d
	ins	v10.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6],#16
	umlal	v11.2d,v28.2s,v0.s[1]
	umlal	v12.2d,v28.2s,v0.s[2]
	shl	v29.2d,v10.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v13.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v10.2d
	umlal	v6.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v7.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
	umlal	v8.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v9.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v12.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v10.2d,#16
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	umlal	v7.2d,v29.2s,v3.s[1]
	ushr	v10.2d,v10.2d,#16
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	add	v16.2d,v11.2d,v10.2d
	ins	v11.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6],#16
	umlal	v12.2d,v28.2s,v0.s[1]
	umlal	v13.2d,v28.2s,v0.s[2]
	shl	v29.2d,v11.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v6.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v11.2d
	umlal	v7.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v8.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
	umlal	v9.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v10.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v13.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v11.2d,#16
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	umlal	v8.2d,v29.2s,v3.s[1]
	ushr	v11.2d,v11.2d,#16
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	add	v16.2d,v12.2d,v11.2d
	ins	v12.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6],#16
	umlal	v13.2d,v28.2s,v0.s[1]
	umlal	v6.2d,v28.2s,v0.s[2]
	shl	v29.2d,v12.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v7.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v12.2d
	umlal	v8.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v9.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
	umlal	v10.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v11.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4   // *b++
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v6.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v12.2d,#16
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	umlal	v9.2d,v29.2s,v3.s[1]
	ushr	v12.2d,v12.2d,#16
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	add	v16.2d,v13.2d,v12.2d
	ins	v13.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6],#16
	umlal	v6.2d,v28.2s,v0.s[1]
	umlal	v7.2d,v28.2s,v0.s[2]
	shl	v29.2d,v13.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v8.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v13.2d
	umlal	v9.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v10.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
	umlal	v11.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v12.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v6.2d,v29.2s,v2.s[1]
	umlal	v7.2d,v29.2s,v2.s[2]
	mov	v5.16b,v13.16b
	ushr	v5.2d,v5.2d,#16
	ext	v13.16b,v13.16b,v13.16b,#8
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	add	v13.2d,v13.2d,v5.2d
	umlal	v10.2d,v29.2s,v3.s[1]
	ushr	v13.2d,v13.2d,#16
	eor	v15.16b,v15.16b,v15.16b
	ins	v13.d[1],v15.d[0]
	umlal	v11.2d,v29.2s,v3.s[2]
	umlal	v12.2d,v29.2s,v3.s[3]
	add	v6.2d,v6.2d,v13.2d
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
	add	x10,sp,#8		// rewind
	sub	x8,x5,#8
	b	.LNEON_8n_inner

	// Inner loop: replay the stashed b[8i+0..7] and m[8i+0..7] words
	// against the remaining a (v0/v1) and np (v2/v3) half-limbs,
	// streaming accumulators out via x7 and back in via x6. The
	// b.eq/.LInner_jump* pairs skip the x6 advance on the last lap.
.align	4
.LNEON_8n_inner:
	subs	x8,x8,#8
	umlal	v6.2d,v28.2s,v0.s[0]
	ld1	{v13.2d},[x6]
	umlal	v7.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
	umlal	v8.2d,v28.2s,v0.s[2]
	ld1	{v2.4s,v3.4s},[x3],#32
	umlal	v9.2d,v28.2s,v0.s[3]
	b.eq	.LInner_jump
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump:
	umlal	v10.2d,v28.2s,v1.s[0]
	umlal	v11.2d,v28.2s,v1.s[1]
	umlal	v12.2d,v28.2s,v1.s[2]
	umlal	v13.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	umlal	v8.2d,v29.2s,v2.s[2]
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	umlal	v11.2d,v29.2s,v3.s[1]
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	st1	{v6.2d},[x7],#16
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6]
	umlal	v8.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
	umlal	v9.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump1
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump1:
	umlal	v10.2d,v28.2s,v0.s[3]
	umlal	v11.2d,v28.2s,v1.s[0]
	umlal	v12.2d,v28.2s,v1.s[1]
	umlal	v13.2d,v28.2s,v1.s[2]
	umlal	v6.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	umlal	v9.2d,v29.2s,v2.s[2]
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	umlal	v12.2d,v29.2s,v3.s[1]
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	st1	{v7.2d},[x7],#16
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6]
	umlal	v9.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
	umlal	v10.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump2
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump2:
	umlal	v11.2d,v28.2s,v0.s[3]
	umlal	v12.2d,v28.2s,v1.s[0]
	umlal	v13.2d,v28.2s,v1.s[1]
	umlal	v6.2d,v28.2s,v1.s[2]
	umlal	v7.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	umlal	v10.2d,v29.2s,v2.s[2]
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	umlal	v13.2d,v29.2s,v3.s[1]
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	st1	{v8.2d},[x7],#16
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6]
	umlal	v10.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
	umlal	v11.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump3
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump3:
	umlal	v12.2d,v28.2s,v0.s[3]
	umlal	v13.2d,v28.2s,v1.s[0]
	umlal	v6.2d,v28.2s,v1.s[1]
	umlal	v7.2d,v28.2s,v1.s[2]
	umlal	v8.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	umlal	v11.2d,v29.2s,v2.s[2]
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	umlal	v6.2d,v29.2s,v3.s[1]
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	st1	{v9.2d},[x7],#16
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6]
	umlal	v11.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
	umlal	v12.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump4
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump4:
	umlal	v13.2d,v28.2s,v0.s[3]
	umlal	v6.2d,v28.2s,v1.s[0]
	umlal	v7.2d,v28.2s,v1.s[1]
	umlal	v8.2d,v28.2s,v1.s[2]
	umlal	v9.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	umlal	v12.2d,v29.2s,v2.s[2]
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	umlal	v7.2d,v29.2s,v3.s[1]
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	st1	{v10.2d},[x7],#16
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6]
	umlal	v12.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
	umlal	v13.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump5
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump5:
	umlal	v6.2d,v28.2s,v0.s[3]
	umlal	v7.2d,v28.2s,v1.s[0]
	umlal	v8.2d,v28.2s,v1.s[1]
	umlal	v9.2d,v28.2s,v1.s[2]
	umlal	v10.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	umlal	v13.2d,v29.2s,v2.s[2]
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	umlal	v8.2d,v29.2s,v3.s[1]
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	st1	{v11.2d},[x7],#16
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6]
	umlal	v13.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
	umlal	v6.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump6
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump6:
	umlal	v7.2d,v28.2s,v0.s[3]
	umlal	v8.2d,v28.2s,v1.s[0]
	umlal	v9.2d,v28.2s,v1.s[1]
	umlal	v10.2d,v28.2s,v1.s[2]
	umlal	v11.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	umlal	v6.2d,v29.2s,v2.s[2]
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	umlal	v9.2d,v29.2s,v3.s[1]
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	st1	{v12.2d},[x7],#16
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6]
	umlal	v6.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
	umlal	v7.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump7
	add	x6,x6,#16	// don't advance in last iteration
.LInner_jump7:
	umlal	v8.2d,v28.2s,v0.s[3]
	umlal	v9.2d,v28.2s,v1.s[0]
	umlal	v10.2d,v28.2s,v1.s[1]
	umlal	v11.2d,v28.2s,v1.s[2]
	umlal	v12.2d,v28.2s,v1.s[3]
	b.ne	.LInner_after_rewind8
	sub	x1,x1,x5,lsl#2	// rewind
.LInner_after_rewind8:
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v6.2d,v29.2s,v2.s[1]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v7.2d,v29.2s,v2.s[2]
	add	x10,sp,#8		// rewind
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	umlal	v10.2d,v29.2s,v3.s[1]
	umlal	v11.2d,v29.2s,v3.s[2]
	st1	{v13.2d},[x7],#16
	umlal	v12.2d,v29.2s,v3.s[3]

	bne	.LNEON_8n_inner
	// Outer-pass epilogue: flush accumulators, reload for next 8 b words
	// or fall through to the carry-propagation tail when b is exhausted.
	add	x6,sp,#128
	st1	{v6.2d,v7.2d},[x7],#32
	eor	v2.16b,v2.16b,v2.16b	// v2
	st1	{v8.2d,v9.2d},[x7],#32
	eor	v3.16b,v3.16b,v3.16b	// v3
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d},[x7]

	subs	x9,x9,#8
	ld1	{v6.2d,v7.2d},[x6],#32
	ld1	{v8.2d,v9.2d},[x6],#32
	ld1	{v10.2d,v11.2d},[x6],#32
	ld1	{v12.2d,v13.2d},[x6],#32

	b.eq	.LInner_8n_jump_2steps
	sub	x3,x3,x5,lsl#2	// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	// All of b consumed: begin wiping the stack frame (v2/v3 are zero)
	// and enter the carry-propagation tail.
	add	x7,sp,#128
	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	st1	{v2.2d,v3.2d}, [sp],#32
	add	v6.2d,v6.2d,v15.2d
	st1	{v2.2d,v3.2d}, [sp],#32
	ushr	v15.2d,v6.2d,#16
	st1	{v2.2d,v3.2d}, [sp],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

	mov	x8,x5
	b	.LNEON_tail_entry

	// Tail: fold the deferred 16-bit carries lane-to-lane
	// (ushr/ext/add/zip1) and store one finished 32-bit word per digit.
.align	4
.LNEON_tail:
	add	v6.2d,v6.2d,v15.2d
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	ld1	{v8.2d,v9.2d}, [x6],#32
	add	v6.2d,v6.2d,v15.2d
	ld1	{v10.2d,v11.2d}, [x6],#32
	ushr	v15.2d,v6.2d,#16
	ld1	{v12.2d,v13.2d}, [x6],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

.LNEON_tail_entry:
	add	v7.2d,v7.2d,v15.2d
	st1	{v6.s}[0], [x7],#4
	ushr	v15.2d,v7.2d,#16
	mov	v5.16b,v7.16b
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	ushr	v15.2d,v7.2d,#16
	zip1	v7.4h,v5.4h,v7.4h
	ins	v15.d[1],v14.d[0]
	add	v8.2d,v8.2d,v15.2d
	st1	{v7.s}[0], [x7],#4
	ushr	v15.2d,v8.2d,#16
	mov	v5.16b,v8.16b
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	ushr	v15.2d,v8.2d,#16
	zip1	v8.4h,v5.4h,v8.4h
	ins	v15.d[1],v14.d[0]
	add	v9.2d,v9.2d,v15.2d
	st1	{v8.s}[0], [x7],#4
	ushr	v15.2d,v9.2d,#16
	mov	v5.16b,v9.16b
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	ushr	v15.2d,v9.2d,#16
	zip1	v9.4h,v5.4h,v9.4h
	ins	v15.d[1],v14.d[0]
	add	v10.2d,v10.2d,v15.2d
	st1	{v9.s}[0], [x7],#4
	ushr	v15.2d,v10.2d,#16
	mov	v5.16b,v10.16b
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	ushr	v15.2d,v10.2d,#16
	zip1	v10.4h,v5.4h,v10.4h
	ins	v15.d[1],v14.d[0]
	add	v11.2d,v11.2d,v15.2d
	st1	{v10.s}[0], [x7],#4
	ushr	v15.2d,v11.2d,#16
	mov	v5.16b,v11.16b
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	ushr	v15.2d,v11.2d,#16
	zip1	v11.4h,v5.4h,v11.4h
	ins	v15.d[1],v14.d[0]
	add	v12.2d,v12.2d,v15.2d
	st1	{v11.s}[0], [x7],#4
	ushr	v15.2d,v12.2d,#16
	mov	v5.16b,v12.16b
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	ushr	v15.2d,v12.2d,#16
	zip1	v12.4h,v5.4h,v12.4h
	ins	v15.d[1],v14.d[0]
	add	v13.2d,v13.2d,v15.2d
	st1	{v12.s}[0], [x7],#4
	ushr	v15.2d,v13.2d,#16
	mov	v5.16b,v13.16b
	ext	v13.16b,v13.16b,v13.16b,#8
	add	v13.2d,v13.2d,v15.2d
	ushr	v15.2d,v13.2d,#16
	zip1	v13.4h,v5.4h,v13.4h
	ins	v15.d[1],v14.d[0]
	ld1	{v6.2d,v7.2d}, [x6],#32
	subs	x8,x8,#8
	st1	{v13.s}[0], [x7],#4
	bne	.LNEON_tail

	st1	{v15.s}[0], [x7],#4	// top-most bit
	// Conditional final subtraction: compute tp - np into rp with
	// borrow tracked across the whole length (32-bit limbs).
	sub	x3,x3,x5,lsl#2		// rewind x3
	subs	x1,sp,#0			// clear carry flag
	add	x2,sp,x5,lsl#2

.LNEON_sub:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x3],#8
	ldp	w10,w11,[x3],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,x2,x1
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [x1]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,x2,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	x1,sp
	sub	x0,x0,x11		// rewind x0
	mov	x3,x2		// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

	// Carry set => keep the subtracted value already in rp; carry clear
	// => copy tp back over rp. Either way, wipe tp with zeroes (v0/v1).
.LNEON_copy_n_zap:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	sub	x1,x1,#32
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [x1],#32		// wipe
	st1	{v0.2d,v1.2d}, [x3],#32		// wipe
	sub	x17,x2,x1		// preserves carry
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_copy_n_zap

	// Epilogue: restore sp (saved in x16) and callee-saved d8-d15.
	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	ret	// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
923.type	__bn_sqr8x_mont,%function
924.align	5
925__bn_sqr8x_mont:
926	cmp	x1,x2
927	b.ne	__bn_mul4x_mont
928.Lsqr8x_mont:
929.inst	0xd503233f		// paciasp
930	stp	x29,x30,[sp,#-128]!
931	add	x29,sp,#0
932	stp	x19,x20,[sp,#16]
933	stp	x21,x22,[sp,#32]
934	stp	x23,x24,[sp,#48]
935	stp	x25,x26,[sp,#64]
936	stp	x27,x28,[sp,#80]
937	stp	x0,x3,[sp,#96]	// offload rp and np
938
939	ldp	x6,x7,[x1,#8*0]
940	ldp	x8,x9,[x1,#8*2]
941	ldp	x10,x11,[x1,#8*4]
942	ldp	x12,x13,[x1,#8*6]
943
944	sub	x2,sp,x5,lsl#4
945	lsl	x5,x5,#3
946	ldr	x4,[x4]		// *n0
947	mov	sp,x2			// alloca
948	sub	x27,x5,#8*8
949	b	.Lsqr8x_zero_start
950
951.Lsqr8x_zero:
952	sub	x27,x27,#8*8
953	stp	xzr,xzr,[x2,#8*0]
954	stp	xzr,xzr,[x2,#8*2]
955	stp	xzr,xzr,[x2,#8*4]
956	stp	xzr,xzr,[x2,#8*6]
957.Lsqr8x_zero_start:
958	stp	xzr,xzr,[x2,#8*8]
959	stp	xzr,xzr,[x2,#8*10]
960	stp	xzr,xzr,[x2,#8*12]
961	stp	xzr,xzr,[x2,#8*14]
962	add	x2,x2,#8*16
963	cbnz	x27,.Lsqr8x_zero
964
965	add	x3,x1,x5
966	add	x1,x1,#8*8
967	mov	x19,xzr
968	mov	x20,xzr
969	mov	x21,xzr
970	mov	x22,xzr
971	mov	x23,xzr
972	mov	x24,xzr
973	mov	x25,xzr
974	mov	x26,xzr
975	mov	x2,sp
976	str	x4,[x29,#112]		// offload n0
977
978	// Multiply everything but a[i]*a[i]
979.align	4
980.Lsqr8x_outer_loop:
981        //                                                 a[1]a[0]	(i)
982        //                                             a[2]a[0]
983        //                                         a[3]a[0]
984        //                                     a[4]a[0]
985        //                                 a[5]a[0]
986        //                             a[6]a[0]
987        //                         a[7]a[0]
988        //                                         a[2]a[1]		(ii)
989        //                                     a[3]a[1]
990        //                                 a[4]a[1]
991        //                             a[5]a[1]
992        //                         a[6]a[1]
993        //                     a[7]a[1]
994        //                                 a[3]a[2]			(iii)
995        //                             a[4]a[2]
996        //                         a[5]a[2]
997        //                     a[6]a[2]
998        //                 a[7]a[2]
999        //                         a[4]a[3]				(iv)
1000        //                     a[5]a[3]
1001        //                 a[6]a[3]
1002        //             a[7]a[3]
1003        //                 a[5]a[4]					(v)
1004        //             a[6]a[4]
1005        //         a[7]a[4]
1006        //         a[6]a[5]						(vi)
1007        //     a[7]a[5]
1008        // a[7]a[6]							(vii)
1009
1010	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
1011	mul	x15,x8,x6
1012	mul	x16,x9,x6
1013	mul	x17,x10,x6
1014	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
1015	mul	x14,x11,x6
1016	adcs	x21,x21,x15
1017	mul	x15,x12,x6
1018	adcs	x22,x22,x16
1019	mul	x16,x13,x6
1020	adcs	x23,x23,x17
1021	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
1022	adcs	x24,x24,x14
1023	umulh	x14,x8,x6
1024	adcs	x25,x25,x15
1025	umulh	x15,x9,x6
1026	adcs	x26,x26,x16
1027	umulh	x16,x10,x6
1028	stp	x19,x20,[x2],#8*2	// t[0..1]
1029	adc	x19,xzr,xzr		// t[8]
1030	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
1031	umulh	x17,x11,x6
1032	adcs	x22,x22,x14
1033	umulh	x14,x12,x6
1034	adcs	x23,x23,x15
1035	umulh	x15,x13,x6
1036	adcs	x24,x24,x16
1037	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
1038	adcs	x25,x25,x17
1039	mul	x17,x9,x7
1040	adcs	x26,x26,x14
1041	mul	x14,x10,x7
1042	adc	x19,x19,x15
1043
1044	mul	x15,x11,x7
1045	adds	x22,x22,x16
1046	mul	x16,x12,x7
1047	adcs	x23,x23,x17
1048	mul	x17,x13,x7
1049	adcs	x24,x24,x14
1050	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
1051	adcs	x25,x25,x15
1052	umulh	x15,x9,x7
1053	adcs	x26,x26,x16
1054	umulh	x16,x10,x7
1055	adcs	x19,x19,x17
1056	umulh	x17,x11,x7
1057	stp	x21,x22,[x2],#8*2	// t[2..3]
1058	adc	x20,xzr,xzr		// t[9]
1059	adds	x23,x23,x14
1060	umulh	x14,x12,x7
1061	adcs	x24,x24,x15
1062	umulh	x15,x13,x7
1063	adcs	x25,x25,x16
1064	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
1065	adcs	x26,x26,x17
1066	mul	x17,x10,x8
1067	adcs	x19,x19,x14
1068	mul	x14,x11,x8
1069	adc	x20,x20,x15
1070
1071	mul	x15,x12,x8
1072	adds	x24,x24,x16
1073	mul	x16,x13,x8
1074	adcs	x25,x25,x17
1075	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
1076	adcs	x26,x26,x14
1077	umulh	x14,x10,x8
1078	adcs	x19,x19,x15
1079	umulh	x15,x11,x8
1080	adcs	x20,x20,x16
1081	umulh	x16,x12,x8
1082	stp	x23,x24,[x2],#8*2	// t[4..5]
1083	adc	x21,xzr,xzr		// t[10]
1084	adds	x25,x25,x17
1085	umulh	x17,x13,x8
1086	adcs	x26,x26,x14
1087	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
1088	adcs	x19,x19,x15
1089	mul	x15,x11,x9
1090	adcs	x20,x20,x16
1091	mul	x16,x12,x9
1092	adc	x21,x21,x17
1093
1094	mul	x17,x13,x9
1095	adds	x26,x26,x14
1096	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
1097	adcs	x19,x19,x15
1098	umulh	x15,x11,x9
1099	adcs	x20,x20,x16
1100	umulh	x16,x12,x9
1101	adcs	x21,x21,x17
1102	umulh	x17,x13,x9
1103	stp	x25,x26,[x2],#8*2	// t[6..7]
1104	adc	x22,xzr,xzr		// t[11]
1105	adds	x19,x19,x14
1106	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
1107	adcs	x20,x20,x15
1108	mul	x15,x12,x10
1109	adcs	x21,x21,x16
1110	mul	x16,x13,x10
1111	adc	x22,x22,x17
1112
1113	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
1114	adds	x20,x20,x14
1115	umulh	x14,x12,x10
1116	adcs	x21,x21,x15
1117	umulh	x15,x13,x10
1118	adcs	x22,x22,x16
1119	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
1120	adc	x23,xzr,xzr		// t[12]
1121	adds	x21,x21,x17
1122	mul	x17,x13,x11
1123	adcs	x22,x22,x14
1124	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
1125	adc	x23,x23,x15
1126
1127	umulh	x15,x13,x11
1128	adds	x22,x22,x16
1129	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
1130	adcs	x23,x23,x17
1131	umulh	x17,x13,x12		// hi(a[7]*a[6])
1132	adc	x24,xzr,xzr		// t[13]
1133	adds	x23,x23,x14
1134	sub	x27,x3,x1	// done yet?
1135	adc	x24,x24,x15
1136
1137	adds	x24,x24,x16
1138	sub	x14,x3,x5	// rewinded ap
1139	adc	x25,xzr,xzr		// t[14]
1140	add	x25,x25,x17
1141
1142	cbz	x27,.Lsqr8x_outer_break
1143
1144	mov	x4,x6
1145	ldp	x6,x7,[x2,#8*0]
1146	ldp	x8,x9,[x2,#8*2]
1147	ldp	x10,x11,[x2,#8*4]
1148	ldp	x12,x13,[x2,#8*6]
1149	adds	x19,x19,x6
1150	adcs	x20,x20,x7
1151	ldp	x6,x7,[x1,#8*0]
1152	adcs	x21,x21,x8
1153	adcs	x22,x22,x9
1154	ldp	x8,x9,[x1,#8*2]
1155	adcs	x23,x23,x10
1156	adcs	x24,x24,x11
1157	ldp	x10,x11,[x1,#8*4]
1158	adcs	x25,x25,x12
1159	mov	x0,x1
1160	adcs	x26,xzr,x13
1161	ldp	x12,x13,[x1,#8*6]
1162	add	x1,x1,#8*8
1163	//adc	x28,xzr,xzr		// moved below
1164	mov	x27,#-8*8
1165
1166	//                                                         a[8]a[0]
1167	//                                                     a[9]a[0]
1168	//                                                 a[a]a[0]
1169	//                                             a[b]a[0]
1170	//                                         a[c]a[0]
1171	//                                     a[d]a[0]
1172	//                                 a[e]a[0]
1173	//                             a[f]a[0]
1174	//                                                     a[8]a[1]
1175	//                         a[f]a[1]........................
1176	//                                                 a[8]a[2]
1177	//                     a[f]a[2]........................
1178	//                                             a[8]a[3]
1179	//                 a[f]a[3]........................
1180	//                                         a[8]a[4]
1181	//             a[f]a[4]........................
1182	//                                     a[8]a[5]
1183	//         a[f]a[5]........................
1184	//                                 a[8]a[6]
1185	//     a[f]a[6]........................
1186	//                             a[8]a[7]
1187	// a[f]a[7]........................
1188.Lsqr8x_mul:
1189	mul	x14,x6,x4
1190	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1191	mul	x15,x7,x4
1192	add	x27,x27,#8
1193	mul	x16,x8,x4
1194	mul	x17,x9,x4
1195	adds	x19,x19,x14
1196	mul	x14,x10,x4
1197	adcs	x20,x20,x15
1198	mul	x15,x11,x4
1199	adcs	x21,x21,x16
1200	mul	x16,x12,x4
1201	adcs	x22,x22,x17
1202	mul	x17,x13,x4
1203	adcs	x23,x23,x14
1204	umulh	x14,x6,x4
1205	adcs	x24,x24,x15
1206	umulh	x15,x7,x4
1207	adcs	x25,x25,x16
1208	umulh	x16,x8,x4
1209	adcs	x26,x26,x17
1210	umulh	x17,x9,x4
1211	adc	x28,x28,xzr
1212	str	x19,[x2],#8
1213	adds	x19,x20,x14
1214	umulh	x14,x10,x4
1215	adcs	x20,x21,x15
1216	umulh	x15,x11,x4
1217	adcs	x21,x22,x16
1218	umulh	x16,x12,x4
1219	adcs	x22,x23,x17
1220	umulh	x17,x13,x4
1221	ldr	x4,[x0,x27]
1222	adcs	x23,x24,x14
1223	adcs	x24,x25,x15
1224	adcs	x25,x26,x16
1225	adcs	x26,x28,x17
1226	//adc	x28,xzr,xzr		// moved above
1227	cbnz	x27,.Lsqr8x_mul
1228					// note that carry flag is guaranteed
1229					// to be zero at this point
1230	cmp	x1,x3		// done yet?
1231	b.eq	.Lsqr8x_break
1232
1233	ldp	x6,x7,[x2,#8*0]
1234	ldp	x8,x9,[x2,#8*2]
1235	ldp	x10,x11,[x2,#8*4]
1236	ldp	x12,x13,[x2,#8*6]
1237	adds	x19,x19,x6
1238	ldur	x4,[x0,#-8*8]
1239	adcs	x20,x20,x7
1240	ldp	x6,x7,[x1,#8*0]
1241	adcs	x21,x21,x8
1242	adcs	x22,x22,x9
1243	ldp	x8,x9,[x1,#8*2]
1244	adcs	x23,x23,x10
1245	adcs	x24,x24,x11
1246	ldp	x10,x11,[x1,#8*4]
1247	adcs	x25,x25,x12
1248	mov	x27,#-8*8
1249	adcs	x26,x26,x13
1250	ldp	x12,x13,[x1,#8*6]
1251	add	x1,x1,#8*8
1252	//adc	x28,xzr,xzr		// moved above
1253	b	.Lsqr8x_mul
1254
1255.align	4
1256.Lsqr8x_break:
1257	ldp	x6,x7,[x0,#8*0]
1258	add	x1,x0,#8*8
1259	ldp	x8,x9,[x0,#8*2]
1260	sub	x14,x3,x1		// is it last iteration?
1261	ldp	x10,x11,[x0,#8*4]
1262	sub	x15,x2,x14
1263	ldp	x12,x13,[x0,#8*6]
1264	cbz	x14,.Lsqr8x_outer_loop
1265
1266	stp	x19,x20,[x2,#8*0]
1267	ldp	x19,x20,[x15,#8*0]
1268	stp	x21,x22,[x2,#8*2]
1269	ldp	x21,x22,[x15,#8*2]
1270	stp	x23,x24,[x2,#8*4]
1271	ldp	x23,x24,[x15,#8*4]
1272	stp	x25,x26,[x2,#8*6]
1273	mov	x2,x15
1274	ldp	x25,x26,[x15,#8*6]
1275	b	.Lsqr8x_outer_loop
1276
1277.align	4
1278.Lsqr8x_outer_break:
1279	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1280	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
1281	ldp	x15,x16,[sp,#8*1]
1282	ldp	x11,x13,[x14,#8*2]
1283	add	x1,x14,#8*4
1284	ldp	x17,x14,[sp,#8*3]
1285
1286	stp	x19,x20,[x2,#8*0]
1287	mul	x19,x7,x7
1288	stp	x21,x22,[x2,#8*2]
1289	umulh	x7,x7,x7
1290	stp	x23,x24,[x2,#8*4]
1291	mul	x8,x9,x9
1292	stp	x25,x26,[x2,#8*6]
1293	mov	x2,sp
1294	umulh	x9,x9,x9
1295	adds	x20,x7,x15,lsl#1
1296	extr	x15,x16,x15,#63
1297	sub	x27,x5,#8*4
1298
1299.Lsqr4x_shift_n_add:
1300	adcs	x21,x8,x15
1301	extr	x16,x17,x16,#63
1302	sub	x27,x27,#8*4
1303	adcs	x22,x9,x16
1304	ldp	x15,x16,[x2,#8*5]
1305	mul	x10,x11,x11
1306	ldp	x7,x9,[x1],#8*2
1307	umulh	x11,x11,x11
1308	mul	x12,x13,x13
1309	umulh	x13,x13,x13
1310	extr	x17,x14,x17,#63
1311	stp	x19,x20,[x2,#8*0]
1312	adcs	x23,x10,x17
1313	extr	x14,x15,x14,#63
1314	stp	x21,x22,[x2,#8*2]
1315	adcs	x24,x11,x14
1316	ldp	x17,x14,[x2,#8*7]
1317	extr	x15,x16,x15,#63
1318	adcs	x25,x12,x15
1319	extr	x16,x17,x16,#63
1320	adcs	x26,x13,x16
1321	ldp	x15,x16,[x2,#8*9]
1322	mul	x6,x7,x7
1323	ldp	x11,x13,[x1],#8*2
1324	umulh	x7,x7,x7
1325	mul	x8,x9,x9
1326	umulh	x9,x9,x9
1327	stp	x23,x24,[x2,#8*4]
1328	extr	x17,x14,x17,#63
1329	stp	x25,x26,[x2,#8*6]
1330	add	x2,x2,#8*8
1331	adcs	x19,x6,x17
1332	extr	x14,x15,x14,#63
1333	adcs	x20,x7,x14
1334	ldp	x17,x14,[x2,#8*3]
1335	extr	x15,x16,x15,#63
1336	cbnz	x27,.Lsqr4x_shift_n_add
1337	ldp	x1,x4,[x29,#104]	// pull np and n0
1338
1339	adcs	x21,x8,x15
1340	extr	x16,x17,x16,#63
1341	adcs	x22,x9,x16
1342	ldp	x15,x16,[x2,#8*5]
1343	mul	x10,x11,x11
1344	umulh	x11,x11,x11
1345	stp	x19,x20,[x2,#8*0]
1346	mul	x12,x13,x13
1347	umulh	x13,x13,x13
1348	stp	x21,x22,[x2,#8*2]
1349	extr	x17,x14,x17,#63
1350	adcs	x23,x10,x17
1351	extr	x14,x15,x14,#63
1352	ldp	x19,x20,[sp,#8*0]
1353	adcs	x24,x11,x14
1354	extr	x15,x16,x15,#63
1355	ldp	x6,x7,[x1,#8*0]
1356	adcs	x25,x12,x15
1357	extr	x16,xzr,x16,#63
1358	ldp	x8,x9,[x1,#8*2]
1359	adc	x26,x13,x16
1360	ldp	x10,x11,[x1,#8*4]
1361
1362	// Reduce by 512 bits per iteration
1363	mul	x28,x4,x19		// t[0]*n0
1364	ldp	x12,x13,[x1,#8*6]
1365	add	x3,x1,x5
1366	ldp	x21,x22,[sp,#8*2]
1367	stp	x23,x24,[x2,#8*4]
1368	ldp	x23,x24,[sp,#8*4]
1369	stp	x25,x26,[x2,#8*6]
1370	ldp	x25,x26,[sp,#8*6]
1371	add	x1,x1,#8*8
1372	mov	x30,xzr		// initial top-most carry
1373	mov	x2,sp
1374	mov	x27,#8
1375
1376.Lsqr8x_reduction:
1377	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
1378	mul	x15,x7,x28
1379	sub	x27,x27,#1
1380	mul	x16,x8,x28
1381	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
1382	mul	x17,x9,x28
1383	// (*)	adds	xzr,x19,x14
1384	subs	xzr,x19,#1		// (*)
1385	mul	x14,x10,x28
1386	adcs	x19,x20,x15
1387	mul	x15,x11,x28
1388	adcs	x20,x21,x16
1389	mul	x16,x12,x28
1390	adcs	x21,x22,x17
1391	mul	x17,x13,x28
1392	adcs	x22,x23,x14
1393	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
1394	adcs	x23,x24,x15
1395	umulh	x15,x7,x28
1396	adcs	x24,x25,x16
1397	umulh	x16,x8,x28
1398	adcs	x25,x26,x17
1399	umulh	x17,x9,x28
1400	adc	x26,xzr,xzr
1401	adds	x19,x19,x14
1402	umulh	x14,x10,x28
1403	adcs	x20,x20,x15
1404	umulh	x15,x11,x28
1405	adcs	x21,x21,x16
1406	umulh	x16,x12,x28
1407	adcs	x22,x22,x17
1408	umulh	x17,x13,x28
1409	mul	x28,x4,x19		// next t[0]*n0
1410	adcs	x23,x23,x14
1411	adcs	x24,x24,x15
1412	adcs	x25,x25,x16
1413	adc	x26,x26,x17
1414	cbnz	x27,.Lsqr8x_reduction
1415
1416	ldp	x14,x15,[x2,#8*0]
1417	ldp	x16,x17,[x2,#8*2]
1418	mov	x0,x2
1419	sub	x27,x3,x1	// done yet?
1420	adds	x19,x19,x14
1421	adcs	x20,x20,x15
1422	ldp	x14,x15,[x2,#8*4]
1423	adcs	x21,x21,x16
1424	adcs	x22,x22,x17
1425	ldp	x16,x17,[x2,#8*6]
1426	adcs	x23,x23,x14
1427	adcs	x24,x24,x15
1428	adcs	x25,x25,x16
1429	adcs	x26,x26,x17
1430	//adc	x28,xzr,xzr		// moved below
1431	cbz	x27,.Lsqr8x8_post_condition
1432
1433	ldur	x4,[x2,#-8*8]
1434	ldp	x6,x7,[x1,#8*0]
1435	ldp	x8,x9,[x1,#8*2]
1436	ldp	x10,x11,[x1,#8*4]
1437	mov	x27,#-8*8
1438	ldp	x12,x13,[x1,#8*6]
1439	add	x1,x1,#8*8
1440
1441.Lsqr8x_tail:
1442	mul	x14,x6,x4
1443	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
1444	mul	x15,x7,x4
1445	add	x27,x27,#8
1446	mul	x16,x8,x4
1447	mul	x17,x9,x4
1448	adds	x19,x19,x14
1449	mul	x14,x10,x4
1450	adcs	x20,x20,x15
1451	mul	x15,x11,x4
1452	adcs	x21,x21,x16
1453	mul	x16,x12,x4
1454	adcs	x22,x22,x17
1455	mul	x17,x13,x4
1456	adcs	x23,x23,x14
1457	umulh	x14,x6,x4
1458	adcs	x24,x24,x15
1459	umulh	x15,x7,x4
1460	adcs	x25,x25,x16
1461	umulh	x16,x8,x4
1462	adcs	x26,x26,x17
1463	umulh	x17,x9,x4
1464	adc	x28,x28,xzr
1465	str	x19,[x2],#8
1466	adds	x19,x20,x14
1467	umulh	x14,x10,x4
1468	adcs	x20,x21,x15
1469	umulh	x15,x11,x4
1470	adcs	x21,x22,x16
1471	umulh	x16,x12,x4
1472	adcs	x22,x23,x17
1473	umulh	x17,x13,x4
1474	ldr	x4,[x0,x27]
1475	adcs	x23,x24,x14
1476	adcs	x24,x25,x15
1477	adcs	x25,x26,x16
1478	adcs	x26,x28,x17
1479	//adc	x28,xzr,xzr		// moved above
1480	cbnz	x27,.Lsqr8x_tail
1481					// note that carry flag is guaranteed
1482					// to be zero at this point
1483	ldp	x6,x7,[x2,#8*0]
1484	sub	x27,x3,x1	// done yet?
1485	sub	x16,x3,x5	// rewinded np
1486	ldp	x8,x9,[x2,#8*2]
1487	ldp	x10,x11,[x2,#8*4]
1488	ldp	x12,x13,[x2,#8*6]
1489	cbz	x27,.Lsqr8x_tail_break
1490
1491	ldur	x4,[x0,#-8*8]
1492	adds	x19,x19,x6
1493	adcs	x20,x20,x7
1494	ldp	x6,x7,[x1,#8*0]
1495	adcs	x21,x21,x8
1496	adcs	x22,x22,x9
1497	ldp	x8,x9,[x1,#8*2]
1498	adcs	x23,x23,x10
1499	adcs	x24,x24,x11
1500	ldp	x10,x11,[x1,#8*4]
1501	adcs	x25,x25,x12
1502	mov	x27,#-8*8
1503	adcs	x26,x26,x13
1504	ldp	x12,x13,[x1,#8*6]
1505	add	x1,x1,#8*8
1506	//adc	x28,xzr,xzr		// moved above
1507	b	.Lsqr8x_tail
1508
1509.align	4
1510.Lsqr8x_tail_break:
1511	ldr	x4,[x29,#112]		// pull n0
1512	add	x27,x2,#8*8		// end of current t[num] window
1513
1514	subs	xzr,x30,#1		// "move" top-most carry to carry bit
1515	adcs	x14,x19,x6
1516	adcs	x15,x20,x7
1517	ldp	x19,x20,[x0,#8*0]
1518	adcs	x21,x21,x8
1519	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
1520	adcs	x22,x22,x9
1521	ldp	x8,x9,[x16,#8*2]
1522	adcs	x23,x23,x10
1523	adcs	x24,x24,x11
1524	ldp	x10,x11,[x16,#8*4]
1525	adcs	x25,x25,x12
1526	adcs	x26,x26,x13
1527	ldp	x12,x13,[x16,#8*6]
1528	add	x1,x16,#8*8
1529	adc	x30,xzr,xzr	// top-most carry
1530	mul	x28,x4,x19
1531	stp	x14,x15,[x2,#8*0]
1532	stp	x21,x22,[x2,#8*2]
1533	ldp	x21,x22,[x0,#8*2]
1534	stp	x23,x24,[x2,#8*4]
1535	ldp	x23,x24,[x0,#8*4]
1536	cmp	x27,x29		// did we hit the bottom?
1537	stp	x25,x26,[x2,#8*6]
1538	mov	x2,x0			// slide the window
1539	ldp	x25,x26,[x0,#8*6]
1540	mov	x27,#8
1541	b.ne	.Lsqr8x_reduction
1542
1543	// Final step. We see if result is larger than modulus, and
1544	// if it is, subtract the modulus. But comparison implies
1545	// subtraction. So we subtract modulus, see if it borrowed,
1546	// and conditionally copy original value.
1547	ldr	x0,[x29,#96]		// pull rp
1548	add	x2,x2,#8*8
1549	subs	x14,x19,x6
1550	sbcs	x15,x20,x7
1551	sub	x27,x5,#8*8
1552	mov	x3,x0		// x0 copy
1553
1554.Lsqr8x_sub:
1555	sbcs	x16,x21,x8
1556	ldp	x6,x7,[x1,#8*0]
1557	sbcs	x17,x22,x9
1558	stp	x14,x15,[x0,#8*0]
1559	sbcs	x14,x23,x10
1560	ldp	x8,x9,[x1,#8*2]
1561	sbcs	x15,x24,x11
1562	stp	x16,x17,[x0,#8*2]
1563	sbcs	x16,x25,x12
1564	ldp	x10,x11,[x1,#8*4]
1565	sbcs	x17,x26,x13
1566	ldp	x12,x13,[x1,#8*6]
1567	add	x1,x1,#8*8
1568	ldp	x19,x20,[x2,#8*0]
1569	sub	x27,x27,#8*8
1570	ldp	x21,x22,[x2,#8*2]
1571	ldp	x23,x24,[x2,#8*4]
1572	ldp	x25,x26,[x2,#8*6]
1573	add	x2,x2,#8*8
1574	stp	x14,x15,[x0,#8*4]
1575	sbcs	x14,x19,x6
1576	stp	x16,x17,[x0,#8*6]
1577	add	x0,x0,#8*8
1578	sbcs	x15,x20,x7
1579	cbnz	x27,.Lsqr8x_sub
1580
1581	sbcs	x16,x21,x8
1582	mov	x2,sp
1583	add	x1,sp,x5
1584	ldp	x6,x7,[x3,#8*0]
1585	sbcs	x17,x22,x9
1586	stp	x14,x15,[x0,#8*0]
1587	sbcs	x14,x23,x10
1588	ldp	x8,x9,[x3,#8*2]
1589	sbcs	x15,x24,x11
1590	stp	x16,x17,[x0,#8*2]
1591	sbcs	x16,x25,x12
1592	ldp	x19,x20,[x1,#8*0]
1593	sbcs	x17,x26,x13
1594	ldp	x21,x22,[x1,#8*2]
1595	sbcs	xzr,x30,xzr	// did it borrow?
1596	ldr	x30,[x29,#8]		// pull return address
1597	stp	x14,x15,[x0,#8*4]
1598	stp	x16,x17,[x0,#8*6]
1599
1600	sub	x27,x5,#8*4
1601.Lsqr4x_cond_copy:
1602	sub	x27,x27,#8*4
1603	csel	x14,x19,x6,lo
1604	stp	xzr,xzr,[x2,#8*0]
1605	csel	x15,x20,x7,lo
1606	ldp	x6,x7,[x3,#8*4]
1607	ldp	x19,x20,[x1,#8*4]
1608	csel	x16,x21,x8,lo
1609	stp	xzr,xzr,[x2,#8*2]
1610	add	x2,x2,#8*4
1611	csel	x17,x22,x9,lo
1612	ldp	x8,x9,[x3,#8*6]
1613	ldp	x21,x22,[x1,#8*6]
1614	add	x1,x1,#8*4
1615	stp	x14,x15,[x3,#8*0]
1616	stp	x16,x17,[x3,#8*2]
1617	add	x3,x3,#8*4
1618	stp	xzr,xzr,[x1,#8*0]
1619	stp	xzr,xzr,[x1,#8*2]
1620	cbnz	x27,.Lsqr4x_cond_copy
1621
1622	csel	x14,x19,x6,lo
1623	stp	xzr,xzr,[x2,#8*0]
1624	csel	x15,x20,x7,lo
1625	stp	xzr,xzr,[x2,#8*2]
1626	csel	x16,x21,x8,lo
1627	csel	x17,x22,x9,lo
1628	stp	x14,x15,[x3,#8*0]
1629	stp	x16,x17,[x3,#8*2]
1630
1631	b	.Lsqr8x_done
1632
1633.align	4
1634.Lsqr8x8_post_condition:
1635	adc	x28,xzr,xzr
1636	ldr	x30,[x29,#8]		// pull return address
1637	// x19-7,x28 hold result, x6-7 hold modulus
1638	subs	x6,x19,x6
1639	ldr	x1,[x29,#96]		// pull rp
1640	sbcs	x7,x20,x7
1641	stp	xzr,xzr,[sp,#8*0]
1642	sbcs	x8,x21,x8
1643	stp	xzr,xzr,[sp,#8*2]
1644	sbcs	x9,x22,x9
1645	stp	xzr,xzr,[sp,#8*4]
1646	sbcs	x10,x23,x10
1647	stp	xzr,xzr,[sp,#8*6]
1648	sbcs	x11,x24,x11
1649	stp	xzr,xzr,[sp,#8*8]
1650	sbcs	x12,x25,x12
1651	stp	xzr,xzr,[sp,#8*10]
1652	sbcs	x13,x26,x13
1653	stp	xzr,xzr,[sp,#8*12]
1654	sbcs	x28,x28,xzr	// did it borrow?
1655	stp	xzr,xzr,[sp,#8*14]
1656
1657	// x6-7 hold result-modulus
1658	csel	x6,x19,x6,lo
1659	csel	x7,x20,x7,lo
1660	csel	x8,x21,x8,lo
1661	csel	x9,x22,x9,lo
1662	stp	x6,x7,[x1,#8*0]
1663	csel	x10,x23,x10,lo
1664	csel	x11,x24,x11,lo
1665	stp	x8,x9,[x1,#8*2]
1666	csel	x12,x25,x12,lo
1667	csel	x13,x26,x13,lo
1668	stp	x10,x11,[x1,#8*4]
1669	stp	x12,x13,[x1,#8*6]
1670
1671.Lsqr8x_done:
1672	ldp	x19,x20,[x29,#16]
1673	mov	sp,x29
1674	ldp	x21,x22,[x29,#32]
1675	mov	x0,#1
1676	ldp	x23,x24,[x29,#48]
1677	ldp	x25,x26,[x29,#64]
1678	ldp	x27,x28,[x29,#80]
1679	ldr	x29,[sp],#128
1680.inst	0xd50323bf		// autiasp
1681	ret
1682.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// __bn_mul4x_mont: Montgomery multiplication processing 4 limbs per pass.
// Reached from bn_mul_mont when num is a multiple of 4 (but not of 8,
// which takes the sqr8x/neon paths first — see dispatch at file top).
// In:  x0 = rp (result), x1 = ap, x2 = bp, x3 = np (modulus),
//      x4 = &n0 (Montgomery constant; presumably -1/n[0] mod 2^64 —
//      standard bn_mul_mont contract, confirm against caller), x5 = num.
// Out: rp[0..num-1] = a*b*2^(-64*num) mod n; returns 1 in x0.
// Stack: num*8 bytes of t[] plus 4 aside slots below sp; the scratch
// area is wiped with zeros before return.
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!	// frame: x29/x30 + callee-saved + offload area
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// room for num limbs of t[]
	lsl	x5,x5,#3		// num *= 8: byte count from here on
	ldr	x4,[x4]		// *n0
	sub	sp,x26,#8*4		// alloca, 4 extra slots for aside t[0]*n0 values

	add	x10,x2,x5		// &b[num]
	add	x27,x1,x5		// &a[num]
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr		// t[0..3] = 0
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr			// x0 = modulo-scheduled carry
	mov	x28,#0			// x28 = b-index in bytes, cycles 8,16,24,0
	mov	x26,sp			// x26 = t[] write pointer

// First pass: for each of the 4 b limbs in the current window,
// accumulate a[0..3]*b[i] and fold in the Montgomery reduction by
// t[0]*n0 within the same iteration (4 iterations, wrapped by x28).
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// wrap b-index at 4 limbs (32 bytes)
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) low limb of t[0]+n[0]*m is zero by
					//     construction; it carries iff x19!=0,
					//     which x19-1 reproduces (cf. .Lmul_mont)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// done yet? (&a[num] - ap)
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: result still in registers

	// advance to a[4..7]/n[4..7] for the tail of the first pass
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Tail of first pass: multiply remaining a limbs by b[i] and remaining
// n limbs by the aside t[0]*n0 values saved above; emit finished limbs.
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5	// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

// Move on to the next window of 4 b limbs.
.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 = top-most carry of this pass
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

// Same structure as .Loop_mul4x_1st_reduction, but accumulating into
// previously computed t[] limbs instead of zeros.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry deduction, see first loop
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]	// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Tail loop for subsequent passes, mirroring .Loop_mul4x_1st_tail.
.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr	// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

// End of one outer pass: commit top limbs, step b by 4 limbs, rewind
// a/n, and either loop for the next window or fall through to post.
.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// fold in previous pass's top-most carry
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr		// x30 = new top-most carry
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]	// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12		// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

// Subtract n[] from t[], storing the difference to rp[].
.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr	// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
// Branch-free select between pre-subtraction t[] (x1) and t[]-n[]
// already in rp[] (x27), zeroing the stack scratch as we go.
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo		// borrowed -> keep pre-subtraction t[]
	stp	xzr,xzr,[x26,#8*0]	// wipe aside t[0]*n0 scratch
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

// Special case num==4: the whole result is still in registers, so
// subtract/select inline without the store/reload loops above.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe stack scratch
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

// Epilogue: restore callee-saved registers, unwind frame, return 1.
.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
2123.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
2124.align	2
2125.align	4
2126