xref: /freebsd/sys/crypto/openssl/aarch64/armv8-mont.S (revision 184c1b943937986c81e1996d999d21626ec7a4ff)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from armv8-mont.pl. */
3.text
4
// bn_mul_mont -- Montgomery multiplication: generic word-at-a-time path
// plus dispatch to the unrolled variants defined later in this file.
//
// Register roles on entry (as the code below uses them):
//   x0  rp    destination vector
//   x1  ap    first multiplicand
//   x2  bp    second multiplicand
//   x3  np    modulus
//   x4  &n0   pointer; the word it points at is loaded into x4 as n0
//   x5  num   length in 64-bit words (converted to bytes, num*8, below)
// Returns with x0 = 1.
//
// Dispatch: num a multiple of 8 -> __bn_sqr8x_mont (which hands over to
// __bn_mul4x_mont when ap != bp); otherwise num a multiple of 4 ->
// __bn_mul4x_mont; anything else runs .Lmul_mont.
// NOTE(review): .Lmul_mont loads two words of ap[] and np[] before its
// first loop test, so it presumably expects num >= 2 -- confirm with caller.
//
// Stack: 64-byte frame (x29/x30, x19-x24) with a scratch vector tp[] of
// at least num words below it (rounded down to 16 bytes); tp[] is zeroed
// again while the result is copied out.
5.globl	bn_mul_mont
6.type	bn_mul_mont,%function
7.align	5
8bn_mul_mont:
9	tst	x5,#7
10	b.eq	__bn_sqr8x_mont
11	tst	x5,#3
12	b.eq	__bn_mul4x_mont
13.Lmul_mont:
14	stp	x29,x30,[sp,#-64]!
15	add	x29,sp,#0
16	stp	x19,x20,[sp,#16]
17	stp	x21,x22,[sp,#32]
18	stp	x23,x24,[sp,#48]
19
// First pass (i=0): tp[] = ap[]*bp[0] + np[]*m1 with the lowest word
// dropped (each sum is stored one slot down, at tp[j-1]).
// m1 = lo(ap[0]*bp[0])*n0.  x5 becomes num*8, x21 is the j byte counter.
20	ldr	x9,[x2],#8		// bp[0]
21	sub	x22,sp,x5,lsl#3
22	ldp	x7,x8,[x1],#16	// ap[0..1]
23	lsl	x5,x5,#3
24	ldr	x4,[x4]		// *n0
25	and	x22,x22,#-16		// ABI says so
26	ldp	x13,x14,[x3],#16	// np[0..1]
27
28	mul	x6,x7,x9		// ap[0]*bp[0]
29	sub	x21,x5,#16		// j=num-2
30	umulh	x7,x7,x9
31	mul	x10,x8,x9		// ap[1]*bp[0]
32	umulh	x11,x8,x9
33
34	mul	x15,x6,x4		// "tp[0]"*n0
35	mov	sp,x22			// alloca
36
37	// (*)	mul	x12,x13,x15	// np[0]*m1
38	umulh	x13,x13,x15
39	mul	x16,x14,x15		// np[1]*m1
40	// (*)	adds	x12,x12,x6	// discarded
41	// (*)	As for removal of first multiplication and addition
42	//	instructions. The outcome of first addition is
43	//	guaranteed to be zero, which leaves two computationally
44	//	significant outcomes: it either carries or not. Then
45	//	question is when does it carry? Is there alternative
46	//	way to deduce it? If you follow operations, you can
47	//	observe that condition for carry is quite simple:
48	//	x6 being non-zero. So that carry can be calculated
49	//	by adding -1 to x6. That's what next instruction does.
50	subs	xzr,x6,#1		// (*)
51	umulh	x17,x14,x15
52	adc	x13,x13,xzr
53	cbz	x21,.L1st_skip
54
// x10:x11 = ap[j]*bp[0] and x16:x17 = np[j]*m1 are computed one iteration
// ahead of the additions that consume them; x7 and x13 hold the running
// high words of the two product chains.
55.L1st:
56	ldr	x8,[x1],#8
57	adds	x6,x10,x7
58	sub	x21,x21,#8		// j--
59	adc	x7,x11,xzr
60
61	ldr	x14,[x3],#8
62	adds	x12,x16,x13
63	mul	x10,x8,x9		// ap[j]*bp[0]
64	adc	x13,x17,xzr
65	umulh	x11,x8,x9
66
67	adds	x12,x12,x6
68	mul	x16,x14,x15		// np[j]*m1
69	adc	x13,x13,xzr
70	umulh	x17,x14,x15
71	str	x12,[x22],#8		// tp[j-1]
72	cbnz	x21,.L1st
73
// Drain the last pending products, store the top two words of tp[] and
// capture the carry out of the pass in x19.
74.L1st_skip:
75	adds	x6,x10,x7
76	sub	x1,x1,x5		// rewind x1
77	adc	x7,x11,xzr
78
79	adds	x12,x16,x13
80	sub	x3,x3,x5		// rewind x3
81	adc	x13,x17,xzr
82
83	adds	x12,x12,x6
84	sub	x20,x5,#8		// i=num-1
85	adcs	x13,x13,x7
86
87	adc	x19,xzr,xzr		// upmost overflow bit
88	stp	x12,x13,[x22]
89
// Remaining passes: tp[] = tp[] + ap[]*bp[i] + np[]*m1, again dropping the
// lowest word.  x20 is the i byte counter; x19 carries the upmost overflow
// bit from one pass into the next.
90.Louter:
91	ldr	x9,[x2],#8		// bp[i]
92	ldp	x7,x8,[x1],#16
93	ldr	x23,[sp]		// tp[0]
94	add	x22,sp,#8
95
96	mul	x6,x7,x9		// ap[0]*bp[i]
97	sub	x21,x5,#16		// j=num-2
98	umulh	x7,x7,x9
99	ldp	x13,x14,[x3],#16
100	mul	x10,x8,x9		// ap[1]*bp[i]
101	adds	x6,x6,x23
102	umulh	x11,x8,x9
103	adc	x7,x7,xzr
104
105	mul	x15,x6,x4
106	sub	x20,x20,#8		// i--
107
108	// (*)	mul	x12,x13,x15	// np[0]*m1
109	umulh	x13,x13,x15
110	mul	x16,x14,x15		// np[1]*m1
111	// (*)	adds	x12,x12,x6
// The carry produced here is consumed by the "adc x13,x13,xzr" at the top
// of .Linner or .Linner_skip; no flag-setting instruction sits in between.
112	subs	xzr,x6,#1		// (*)
113	umulh	x17,x14,x15
114	cbz	x21,.Linner_skip
115
116.Linner:
117	ldr	x8,[x1],#8
118	adc	x13,x13,xzr
119	ldr	x23,[x22],#8		// tp[j]
120	adds	x6,x10,x7
121	sub	x21,x21,#8		// j--
122	adc	x7,x11,xzr
123
124	adds	x12,x16,x13
125	ldr	x14,[x3],#8
126	adc	x13,x17,xzr
127
128	mul	x10,x8,x9		// ap[j]*bp[i]
129	adds	x6,x6,x23
130	umulh	x11,x8,x9
131	adc	x7,x7,xzr
132
133	mul	x16,x14,x15		// np[j]*m1
134	adds	x12,x12,x6
135	umulh	x17,x14,x15
136	str	x12,[x22,#-16]		// tp[j-1]
137	cbnz	x21,.Linner
138
// Drain the pass: fold in the previous overflow bit (x19) and produce the
// new one.
139.Linner_skip:
140	ldr	x23,[x22],#8		// tp[j]
141	adc	x13,x13,xzr
142	adds	x6,x10,x7
143	sub	x1,x1,x5		// rewind x1
144	adc	x7,x11,xzr
145
146	adds	x12,x16,x13
147	sub	x3,x3,x5		// rewind x3
148	adcs	x13,x17,x19
149	adc	x19,xzr,xzr
150
151	adds	x6,x6,x23
152	adc	x7,x7,xzr
153
154	adds	x12,x12,x6
155	adcs	x13,x13,x7
156	adc	x19,x19,xzr		// upmost overflow bit
157	stp	x12,x13,[x22,#-16]
158
159	cbnz	x20,.Louter
160
161	// Final step. We see if result is larger than modulus, and
162	// if it is, subtract the modulus. But comparison implies
163	// subtraction. So we subtract modulus, see if it borrowed,
164	// and conditionally copy original value.
165	ldr	x23,[sp]		// tp[0]
166	add	x22,sp,#8
167	ldr	x14,[x3],#8		// np[0]
168	subs	x21,x5,#8		// j=num-1 and clear borrow
169	mov	x1,x0
// rp[] = tp[] - np[], borrow propagated through sbcs; x1 walks rp.
170.Lsub:
171	sbcs	x8,x23,x14		// tp[j]-np[j]
172	ldr	x23,[x22],#8
173	sub	x21,x21,#8		// j--
174	ldr	x14,[x3],#8
175	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
176	cbnz	x21,.Lsub
177
178	sbcs	x8,x23,x14
179	sbcs	x19,x19,xzr		// did it borrow?
180	str	x8,[x1],#8		// rp[num-1]
181
182	ldr	x23,[sp]		// tp[0]
183	add	x22,sp,#8
184	ldr	x8,[x0],#8		// rp[0]
185	sub	x5,x5,#8		// num--
186	nop
// Per word: keep tp[j] (x23) if the subtraction borrowed ("lo"), else keep
// the difference already in rp[j] (x8).  The choice is made with csel and
// every tp[] word is overwritten with zero as it is consumed.
187.Lcond_copy:
188	sub	x5,x5,#8		// num--
189	csel	x14,x23,x8,lo		// did it borrow?
190	ldr	x23,[x22],#8
191	ldr	x8,[x0],#8
192	str	xzr,[x22,#-16]		// wipe tp
193	str	x14,[x0,#-16]
194	cbnz	x5,.Lcond_copy
195
196	csel	x14,x23,x8,lo
197	str	xzr,[x22,#-8]		// wipe tp
198	str	x14,[x0,#-8]
199
// Epilogue.  Only x29 is reloaded from the x29/x30 slot: nothing on the
// .Lmul_mont path writes x30.
200	ldp	x19,x20,[x29,#16]
201	mov	sp,x29
202	ldp	x21,x22,[x29,#32]
203	mov	x0,#1
204	ldp	x23,x24,[x29,#48]
205	ldr	x29,[sp],#64
206	ret
207.size	bn_mul_mont,.-bn_mul_mont
// __bn_sqr8x_mont -- Montgomery squaring, processed eight words at a time.
//
// Entered by branch from bn_mul_mont when num is a multiple of 8, with the
// same register arguments (x0 rp, x1 ap, x2 bp, x3 np, x4 &n0, x5 num).
// If ap != bp this is not a squaring and control goes to __bn_mul4x_mont.
// Returns with x0 = 1.
//
// Frame (128 bytes at x29): x29/x30, x19-x28, rp at +96, np at +104,
// n0 at +112.  Below the frame sits a scratch vector t[] of 2*num words.
// x30 is reused as the top-most carry during reduction, so the return
// address is reloaded from [x29,#8] before returning; paciasp/autiasp are
// emitted as raw .inst words around the body.
//
// Phases: (1) .Lsqr8x_outer_loop/.Lsqr8x_mul accumulate all cross products
// a[i]*a[j], i<j; (2) .Lsqr4x_shift_n_add doubles that and adds the
// squares a[i]*a[i]; (3) .Lsqr8x_reduction/.Lsqr8x_tail reduce 512 bits
// per pass; (4) subtract the modulus and conditionally copy to rp.
208.type	__bn_sqr8x_mont,%function
209.align	5
210__bn_sqr8x_mont:
211	cmp	x1,x2
212	b.ne	__bn_mul4x_mont
213.Lsqr8x_mont:
214.inst	0xd503233f		// paciasp
215	stp	x29,x30,[sp,#-128]!
216	add	x29,sp,#0
217	stp	x19,x20,[sp,#16]
218	stp	x21,x22,[sp,#32]
219	stp	x23,x24,[sp,#48]
220	stp	x25,x26,[sp,#64]
221	stp	x27,x28,[sp,#80]
222	stp	x0,x3,[sp,#96]	// offload rp and np
223
// x6-x13 = a[0..7]
224	ldp	x6,x7,[x1,#8*0]
225	ldp	x8,x9,[x1,#8*2]
226	ldp	x10,x11,[x1,#8*4]
227	ldp	x12,x13,[x1,#8*6]
228
// Reserve num*16 bytes (2*num words) for t[] and turn x5 into num*8.
229	sub	x2,sp,x5,lsl#4
230	lsl	x5,x5,#3
231	ldr	x4,[x4]		// *n0
232	mov	sp,x2			// alloca
233	sub	x27,x5,#8*8
234	b	.Lsqr8x_zero_start
235
// Zero t[].  The loop is entered at .Lsqr8x_zero_start, so the lowest
// eight words are skipped; the first .Lsqr8x_outer_loop pass stores
// t[0..7] there itself.
236.Lsqr8x_zero:
237	sub	x27,x27,#8*8
238	stp	xzr,xzr,[x2,#8*0]
239	stp	xzr,xzr,[x2,#8*2]
240	stp	xzr,xzr,[x2,#8*4]
241	stp	xzr,xzr,[x2,#8*6]
242.Lsqr8x_zero_start:
243	stp	xzr,xzr,[x2,#8*8]
244	stp	xzr,xzr,[x2,#8*10]
245	stp	xzr,xzr,[x2,#8*12]
246	stp	xzr,xzr,[x2,#8*14]
247	add	x2,x2,#8*16
248	cbnz	x27,.Lsqr8x_zero
249
// x3 = &a[num], x1 = &a[8], x19-x26 = current eight-word window of t[],
// x2 = t[] write pointer.
250	add	x3,x1,x5
251	add	x1,x1,#8*8
252	mov	x19,xzr
253	mov	x20,xzr
254	mov	x21,xzr
255	mov	x22,xzr
256	mov	x23,xzr
257	mov	x24,xzr
258	mov	x25,xzr
259	mov	x26,xzr
260	mov	x2,sp
261	str	x4,[x29,#112]		// offload n0
262
263	// Multiply everything but a[i]*a[i]
264.align	4
265.Lsqr8x_outer_loop:
266        //                                                 a[1]a[0]	(i)
267        //                                             a[2]a[0]
268        //                                         a[3]a[0]
269        //                                     a[4]a[0]
270        //                                 a[5]a[0]
271        //                             a[6]a[0]
272        //                         a[7]a[0]
273        //                                         a[2]a[1]		(ii)
274        //                                     a[3]a[1]
275        //                                 a[4]a[1]
276        //                             a[5]a[1]
277        //                         a[6]a[1]
278        //                     a[7]a[1]
279        //                                 a[3]a[2]			(iii)
280        //                             a[4]a[2]
281        //                         a[5]a[2]
282        //                     a[6]a[2]
283        //                 a[7]a[2]
284        //                         a[4]a[3]				(iv)
285        //                     a[5]a[3]
286        //                 a[6]a[3]
287        //             a[7]a[3]
288        //                 a[5]a[4]					(v)
289        //             a[6]a[4]
290        //         a[7]a[4]
291        //         a[6]a[5]						(vi)
292        //     a[7]a[5]
293        // a[7]a[6]							(vii)
294
295	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
296	mul	x15,x8,x6
297	mul	x16,x9,x6
298	mul	x17,x10,x6
299	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
300	mul	x14,x11,x6
301	adcs	x21,x21,x15
302	mul	x15,x12,x6
303	adcs	x22,x22,x16
304	mul	x16,x13,x6
305	adcs	x23,x23,x17
306	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
307	adcs	x24,x24,x14
308	umulh	x14,x8,x6
309	adcs	x25,x25,x15
310	umulh	x15,x9,x6
311	adcs	x26,x26,x16
312	umulh	x16,x10,x6
313	stp	x19,x20,[x2],#8*2	// t[0..1]
314	adc	x19,xzr,xzr		// t[8]
315	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
316	umulh	x17,x11,x6
317	adcs	x22,x22,x14
318	umulh	x14,x12,x6
319	adcs	x23,x23,x15
320	umulh	x15,x13,x6
321	adcs	x24,x24,x16
322	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
323	adcs	x25,x25,x17
324	mul	x17,x9,x7
325	adcs	x26,x26,x14
326	mul	x14,x10,x7
327	adc	x19,x19,x15
328
329	mul	x15,x11,x7
330	adds	x22,x22,x16
331	mul	x16,x12,x7
332	adcs	x23,x23,x17
333	mul	x17,x13,x7
334	adcs	x24,x24,x14
335	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
336	adcs	x25,x25,x15
337	umulh	x15,x9,x7
338	adcs	x26,x26,x16
339	umulh	x16,x10,x7
340	adcs	x19,x19,x17
341	umulh	x17,x11,x7
342	stp	x21,x22,[x2],#8*2	// t[2..3]
343	adc	x20,xzr,xzr		// t[9]
344	adds	x23,x23,x14
345	umulh	x14,x12,x7
346	adcs	x24,x24,x15
347	umulh	x15,x13,x7
348	adcs	x25,x25,x16
349	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
350	adcs	x26,x26,x17
351	mul	x17,x10,x8
352	adcs	x19,x19,x14
353	mul	x14,x11,x8
354	adc	x20,x20,x15
355
356	mul	x15,x12,x8
357	adds	x24,x24,x16
358	mul	x16,x13,x8
359	adcs	x25,x25,x17
360	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
361	adcs	x26,x26,x14
362	umulh	x14,x10,x8
363	adcs	x19,x19,x15
364	umulh	x15,x11,x8
365	adcs	x20,x20,x16
366	umulh	x16,x12,x8
367	stp	x23,x24,[x2],#8*2	// t[4..5]
368	adc	x21,xzr,xzr		// t[10]
369	adds	x25,x25,x17
370	umulh	x17,x13,x8
371	adcs	x26,x26,x14
372	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
373	adcs	x19,x19,x15
374	mul	x15,x11,x9
375	adcs	x20,x20,x16
376	mul	x16,x12,x9
377	adc	x21,x21,x17
378
379	mul	x17,x13,x9
380	adds	x26,x26,x14
381	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
382	adcs	x19,x19,x15
383	umulh	x15,x11,x9
384	adcs	x20,x20,x16
385	umulh	x16,x12,x9
386	adcs	x21,x21,x17
387	umulh	x17,x13,x9
388	stp	x25,x26,[x2],#8*2	// t[6..7]
389	adc	x22,xzr,xzr		// t[11]
390	adds	x19,x19,x14
391	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
392	adcs	x20,x20,x15
393	mul	x15,x12,x10
394	adcs	x21,x21,x16
395	mul	x16,x13,x10
396	adc	x22,x22,x17
397
398	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
399	adds	x20,x20,x14
400	umulh	x14,x12,x10
401	adcs	x21,x21,x15
402	umulh	x15,x13,x10
403	adcs	x22,x22,x16
404	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
405	adc	x23,xzr,xzr		// t[12]
406	adds	x21,x21,x17
407	mul	x17,x13,x11
408	adcs	x22,x22,x14
409	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
410	adc	x23,x23,x15
411
412	umulh	x15,x13,x11
413	adds	x22,x22,x16
414	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
415	adcs	x23,x23,x17
416	umulh	x17,x13,x12		// hi(a[7]*a[6])
417	adc	x24,xzr,xzr		// t[13]
418	adds	x23,x23,x14
419	sub	x27,x3,x1	// done yet?
420	adc	x24,x24,x15
421
422	adds	x24,x24,x16
423	sub	x14,x3,x5	// rewound ap
424	adc	x25,xzr,xzr		// t[14]
425	add	x25,x25,x17
426
427	cbz	x27,.Lsqr8x_outer_break
428
// More input remains: add the stored t[] window into x19-x26, keep a[0] of
// the finished group in x4 as the first multiplier, and load the next
// eight words of a[] into x6-x13.  x0 remembers the start of that group.
429	mov	x4,x6
430	ldp	x6,x7,[x2,#8*0]
431	ldp	x8,x9,[x2,#8*2]
432	ldp	x10,x11,[x2,#8*4]
433	ldp	x12,x13,[x2,#8*6]
434	adds	x19,x19,x6
435	adcs	x20,x20,x7
436	ldp	x6,x7,[x1,#8*0]
437	adcs	x21,x21,x8
438	adcs	x22,x22,x9
439	ldp	x8,x9,[x1,#8*2]
440	adcs	x23,x23,x10
441	adcs	x24,x24,x11
442	ldp	x10,x11,[x1,#8*4]
443	adcs	x25,x25,x12
444	mov	x0,x1
445	adcs	x26,xzr,x13
446	ldp	x12,x13,[x1,#8*6]
447	add	x1,x1,#8*8
448	//adc	x28,xzr,xzr		// moved below
449	mov	x27,#-8*8
450
451	//                                                         a[8]a[0]
452	//                                                     a[9]a[0]
453	//                                                 a[a]a[0]
454	//                                             a[b]a[0]
455	//                                         a[c]a[0]
456	//                                     a[d]a[0]
457	//                                 a[e]a[0]
458	//                             a[f]a[0]
459	//                                                     a[8]a[1]
460	//                         a[f]a[1]........................
461	//                                                 a[8]a[2]
462	//                     a[f]a[2]........................
463	//                                             a[8]a[3]
464	//                 a[f]a[3]........................
465	//                                         a[8]a[4]
466	//             a[f]a[4]........................
467	//                                     a[8]a[5]
468	//         a[f]a[5]........................
469	//                                 a[8]a[6]
470	//     a[f]a[6]........................
471	//                             a[8]a[7]
472	// a[f]a[7]........................
// Eight iterations: multiply x6-x13 by one word x4, emit one word of t[]
// and shift the x19-x26 window down by one.  x27 runs from -64 up to 0 and
// indexes the next multiplier relative to x0; x28 collects the carry.
473.Lsqr8x_mul:
474	mul	x14,x6,x4
475	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
476	mul	x15,x7,x4
477	add	x27,x27,#8
478	mul	x16,x8,x4
479	mul	x17,x9,x4
480	adds	x19,x19,x14
481	mul	x14,x10,x4
482	adcs	x20,x20,x15
483	mul	x15,x11,x4
484	adcs	x21,x21,x16
485	mul	x16,x12,x4
486	adcs	x22,x22,x17
487	mul	x17,x13,x4
488	adcs	x23,x23,x14
489	umulh	x14,x6,x4
490	adcs	x24,x24,x15
491	umulh	x15,x7,x4
492	adcs	x25,x25,x16
493	umulh	x16,x8,x4
494	adcs	x26,x26,x17
495	umulh	x17,x9,x4
496	adc	x28,x28,xzr
497	str	x19,[x2],#8
498	adds	x19,x20,x14
499	umulh	x14,x10,x4
500	adcs	x20,x21,x15
501	umulh	x15,x11,x4
502	adcs	x21,x22,x16
503	umulh	x16,x12,x4
504	adcs	x22,x23,x17
505	umulh	x17,x13,x4
506	ldr	x4,[x0,x27]
507	adcs	x23,x24,x14
508	adcs	x24,x25,x15
509	adcs	x25,x26,x16
510	adcs	x26,x28,x17
511	//adc	x28,xzr,xzr		// moved above
512	cbnz	x27,.Lsqr8x_mul
513					// note that carry flag is guaranteed
514					// to be zero at this point
515	cmp	x1,x3		// done yet?
516	b.eq	.Lsqr8x_break
517
// Another eight-word group of a[] follows: merge the stored t[] window,
// reload the first multiplier and the next group, and go round again.
518	ldp	x6,x7,[x2,#8*0]
519	ldp	x8,x9,[x2,#8*2]
520	ldp	x10,x11,[x2,#8*4]
521	ldp	x12,x13,[x2,#8*6]
522	adds	x19,x19,x6
523	ldr	x4,[x0,#-8*8]
524	adcs	x20,x20,x7
525	ldp	x6,x7,[x1,#8*0]
526	adcs	x21,x21,x8
527	adcs	x22,x22,x9
528	ldp	x8,x9,[x1,#8*2]
529	adcs	x23,x23,x10
530	adcs	x24,x24,x11
531	ldp	x10,x11,[x1,#8*4]
532	adcs	x25,x25,x12
533	mov	x27,#-8*8
534	adcs	x26,x26,x13
535	ldp	x12,x13,[x1,#8*6]
536	add	x1,x1,#8*8
537	//adc	x28,xzr,xzr		// moved above
538	b	.Lsqr8x_mul
539
// End of a[] reached for the current multiplier group.  Reload the group
// at x0 as the next a[0..7] for .Lsqr8x_outer_loop; unless this is the
// last group, spill the window and rewind x2 to the matching t[] position.
540.align	4
541.Lsqr8x_break:
542	ldp	x6,x7,[x0,#8*0]
543	add	x1,x0,#8*8
544	ldp	x8,x9,[x0,#8*2]
545	sub	x14,x3,x1		// is it last iteration?
546	ldp	x10,x11,[x0,#8*4]
547	sub	x15,x2,x14
548	ldp	x12,x13,[x0,#8*6]
549	cbz	x14,.Lsqr8x_outer_loop
550
551	stp	x19,x20,[x2,#8*0]
552	ldp	x19,x20,[x15,#8*0]
553	stp	x21,x22,[x2,#8*2]
554	ldp	x21,x22,[x15,#8*2]
555	stp	x23,x24,[x2,#8*4]
556	ldp	x23,x24,[x15,#8*4]
557	stp	x25,x26,[x2,#8*6]
558	mov	x2,x15
559	ldp	x25,x26,[x15,#8*6]
560	b	.Lsqr8x_outer_loop
561
562.align	4
563.Lsqr8x_outer_break:
564	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
565	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
566	ldp	x15,x16,[sp,#8*1]
567	ldp	x11,x13,[x14,#8*2]
568	add	x1,x14,#8*4
569	ldp	x17,x14,[sp,#8*3]
570
571	stp	x19,x20,[x2,#8*0]
572	mul	x19,x7,x7
573	stp	x21,x22,[x2,#8*2]
574	umulh	x7,x7,x7
575	stp	x23,x24,[x2,#8*4]
576	mul	x8,x9,x9
577	stp	x25,x26,[x2,#8*6]
578	mov	x2,sp
579	umulh	x9,x9,x9
580	adds	x20,x7,x15,lsl#1
581	extr	x15,x16,x15,#63
582	sub	x27,x5,#8*4
583
// Each "extr xd,xhi,xlo,#63" forms one word of 2*t[] from two neighbouring
// words; the squares a[i]*a[i] are added through the adcs chain.  Four
// words of a[] (eight of t[]) are handled per iteration.
584.Lsqr4x_shift_n_add:
585	adcs	x21,x8,x15
586	extr	x16,x17,x16,#63
587	sub	x27,x27,#8*4
588	adcs	x22,x9,x16
589	ldp	x15,x16,[x2,#8*5]
590	mul	x10,x11,x11
591	ldp	x7,x9,[x1],#8*2
592	umulh	x11,x11,x11
593	mul	x12,x13,x13
594	umulh	x13,x13,x13
595	extr	x17,x14,x17,#63
596	stp	x19,x20,[x2,#8*0]
597	adcs	x23,x10,x17
598	extr	x14,x15,x14,#63
599	stp	x21,x22,[x2,#8*2]
600	adcs	x24,x11,x14
601	ldp	x17,x14,[x2,#8*7]
602	extr	x15,x16,x15,#63
603	adcs	x25,x12,x15
604	extr	x16,x17,x16,#63
605	adcs	x26,x13,x16
606	ldp	x15,x16,[x2,#8*9]
607	mul	x6,x7,x7
608	ldp	x11,x13,[x1],#8*2
609	umulh	x7,x7,x7
610	mul	x8,x9,x9
611	umulh	x9,x9,x9
612	stp	x23,x24,[x2,#8*4]
613	extr	x17,x14,x17,#63
614	stp	x25,x26,[x2,#8*6]
615	add	x2,x2,#8*8
616	adcs	x19,x6,x17
617	extr	x14,x15,x14,#63
618	adcs	x20,x7,x14
619	ldp	x17,x14,[x2,#8*3]
620	extr	x15,x16,x15,#63
621	cbnz	x27,.Lsqr4x_shift_n_add
622	ldp	x1,x4,[x29,#104]	// pull np and n0
623
// Last eight words of the doubled sum, interleaved with loading n[0..7]
// (x6-x13) and t[0..7] (x19-x26) for the reduction.
624	adcs	x21,x8,x15
625	extr	x16,x17,x16,#63
626	adcs	x22,x9,x16
627	ldp	x15,x16,[x2,#8*5]
628	mul	x10,x11,x11
629	umulh	x11,x11,x11
630	stp	x19,x20,[x2,#8*0]
631	mul	x12,x13,x13
632	umulh	x13,x13,x13
633	stp	x21,x22,[x2,#8*2]
634	extr	x17,x14,x17,#63
635	adcs	x23,x10,x17
636	extr	x14,x15,x14,#63
637	ldp	x19,x20,[sp,#8*0]
638	adcs	x24,x11,x14
639	extr	x15,x16,x15,#63
640	ldp	x6,x7,[x1,#8*0]
641	adcs	x25,x12,x15
642	extr	x16,xzr,x16,#63
643	ldp	x8,x9,[x1,#8*2]
644	adc	x26,x13,x16
645	ldp	x10,x11,[x1,#8*4]
646
647	// Reduce by 512 bits per iteration
648	mul	x28,x4,x19		// t[0]*n0
649	ldp	x12,x13,[x1,#8*6]
650	add	x3,x1,x5
651	ldp	x21,x22,[sp,#8*2]
652	stp	x23,x24,[x2,#8*4]
653	ldp	x23,x24,[sp,#8*4]
654	stp	x25,x26,[x2,#8*6]
655	ldp	x25,x26,[sp,#8*6]
656	add	x1,x1,#8*8
// From here on x30 no longer holds the return address.
657	mov	x30,xzr		// initial top-most carry
658	mov	x2,sp
659	mov	x27,#8
660
// Eight iterations (x27 counts down from 8): each adds n[0..7]*(t[0]*n0)
// to the window, which clears its lowest word, and shifts the window down
// by one word.  The multiplier x28 is saved over the consumed t[] slot for
// .Lsqr8x_tail.
661.Lsqr8x_reduction:
662	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
663	mul	x15,x7,x28
664	sub	x27,x27,#1
665	mul	x16,x8,x28
666	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
667	mul	x17,x9,x28
668	// (*)	adds	xzr,x19,x14
669	subs	xzr,x19,#1		// (*)
670	mul	x14,x10,x28
671	adcs	x19,x20,x15
672	mul	x15,x11,x28
673	adcs	x20,x21,x16
674	mul	x16,x12,x28
675	adcs	x21,x22,x17
676	mul	x17,x13,x28
677	adcs	x22,x23,x14
678	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
679	adcs	x23,x24,x15
680	umulh	x15,x7,x28
681	adcs	x24,x25,x16
682	umulh	x16,x8,x28
683	adcs	x25,x26,x17
684	umulh	x17,x9,x28
685	adc	x26,xzr,xzr
686	adds	x19,x19,x14
687	umulh	x14,x10,x28
688	adcs	x20,x20,x15
689	umulh	x15,x11,x28
690	adcs	x21,x21,x16
691	umulh	x16,x12,x28
692	adcs	x22,x22,x17
693	umulh	x17,x13,x28
694	mul	x28,x4,x19		// next t[0]*n0
695	adcs	x23,x23,x14
696	adcs	x24,x24,x15
697	adcs	x25,x25,x16
698	adc	x26,x26,x17
699	cbnz	x27,.Lsqr8x_reduction
700
// Add the next eight stored words of t[].  If x1 already equals the end of
// n[] (x3), i.e. num == 8, finish in .Lsqr8x8_post_condition.
701	ldp	x14,x15,[x2,#8*0]
702	ldp	x16,x17,[x2,#8*2]
703	mov	x0,x2
704	sub	x27,x3,x1	// done yet?
705	adds	x19,x19,x14
706	adcs	x20,x20,x15
707	ldp	x14,x15,[x2,#8*4]
708	adcs	x21,x21,x16
709	adcs	x22,x22,x17
710	ldp	x16,x17,[x2,#8*6]
711	adcs	x23,x23,x14
712	adcs	x24,x24,x15
713	adcs	x25,x25,x16
714	adcs	x26,x26,x17
715	//adc	x28,xzr,xzr		// moved below
716	cbz	x27,.Lsqr8x8_post_condition
717
718	ldr	x4,[x2,#-8*8]
719	ldp	x6,x7,[x1,#8*0]
720	ldp	x8,x9,[x1,#8*2]
721	ldp	x10,x11,[x1,#8*4]
722	mov	x27,#-8*8
723	ldp	x12,x13,[x1,#8*6]
724	add	x1,x1,#8*8
725
// Same inner shape as .Lsqr8x_mul, but x6-x13 hold the next eight words of
// n[] and x4 steps through the eight multipliers saved by
// .Lsqr8x_reduction (read relative to x0 with x27 = -64..0).
726.Lsqr8x_tail:
727	mul	x14,x6,x4
728	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
729	mul	x15,x7,x4
730	add	x27,x27,#8
731	mul	x16,x8,x4
732	mul	x17,x9,x4
733	adds	x19,x19,x14
734	mul	x14,x10,x4
735	adcs	x20,x20,x15
736	mul	x15,x11,x4
737	adcs	x21,x21,x16
738	mul	x16,x12,x4
739	adcs	x22,x22,x17
740	mul	x17,x13,x4
741	adcs	x23,x23,x14
742	umulh	x14,x6,x4
743	adcs	x24,x24,x15
744	umulh	x15,x7,x4
745	adcs	x25,x25,x16
746	umulh	x16,x8,x4
747	adcs	x26,x26,x17
748	umulh	x17,x9,x4
749	adc	x28,x28,xzr
750	str	x19,[x2],#8
751	adds	x19,x20,x14
752	umulh	x14,x10,x4
753	adcs	x20,x21,x15
754	umulh	x15,x11,x4
755	adcs	x21,x22,x16
756	umulh	x16,x12,x4
757	adcs	x22,x23,x17
758	umulh	x17,x13,x4
759	ldr	x4,[x0,x27]
760	adcs	x23,x24,x14
761	adcs	x24,x25,x15
762	adcs	x25,x26,x16
763	adcs	x26,x28,x17
764	//adc	x28,xzr,xzr		// moved above
765	cbnz	x27,.Lsqr8x_tail
766					// note that carry flag is guaranteed
767					// to be zero at this point
768	ldp	x6,x7,[x2,#8*0]
769	sub	x27,x3,x1	// done yet?
770	sub	x16,x3,x5	// rewound np
771	ldp	x8,x9,[x2,#8*2]
772	ldp	x10,x11,[x2,#8*4]
773	ldp	x12,x13,[x2,#8*6]
774	cbz	x27,.Lsqr8x_tail_break
775
776	ldr	x4,[x0,#-8*8]
777	adds	x19,x19,x6
778	adcs	x20,x20,x7
779	ldp	x6,x7,[x1,#8*0]
780	adcs	x21,x21,x8
781	adcs	x22,x22,x9
782	ldp	x8,x9,[x1,#8*2]
783	adcs	x23,x23,x10
784	adcs	x24,x24,x11
785	ldp	x10,x11,[x1,#8*4]
786	adcs	x25,x25,x12
787	mov	x27,#-8*8
788	adcs	x26,x26,x13
789	ldp	x12,x13,[x1,#8*6]
790	add	x1,x1,#8*8
791	//adc	x28,xzr,xzr		// moved above
792	b	.Lsqr8x_tail
793
// One full sweep over n[] is done.  Fold in the carried top bit (x30),
// store the window, slide to the next eight words of t[] (x0) and restart
// the reduction unless the window end (x27) has reached the frame at x29.
794.align	4
795.Lsqr8x_tail_break:
796	ldr	x4,[x29,#112]		// pull n0
797	add	x27,x2,#8*8		// end of current t[num] window
798
799	subs	xzr,x30,#1		// "move" top-most carry to carry bit
800	adcs	x14,x19,x6
801	adcs	x15,x20,x7
802	ldp	x19,x20,[x0,#8*0]
803	adcs	x21,x21,x8
804	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
805	adcs	x22,x22,x9
806	ldp	x8,x9,[x16,#8*2]
807	adcs	x23,x23,x10
808	adcs	x24,x24,x11
809	ldp	x10,x11,[x16,#8*4]
810	adcs	x25,x25,x12
811	adcs	x26,x26,x13
812	ldp	x12,x13,[x16,#8*6]
813	add	x1,x16,#8*8
814	adc	x30,xzr,xzr	// top-most carry
815	mul	x28,x4,x19
816	stp	x14,x15,[x2,#8*0]
817	stp	x21,x22,[x2,#8*2]
818	ldp	x21,x22,[x0,#8*2]
819	stp	x23,x24,[x2,#8*4]
820	ldp	x23,x24,[x0,#8*4]
821	cmp	x27,x29		// did we hit the bottom?
822	stp	x25,x26,[x2,#8*6]
823	mov	x2,x0			// slide the window
824	ldp	x25,x26,[x0,#8*6]
825	mov	x27,#8
826	b.ne	.Lsqr8x_reduction
827
828	// Final step. We see if result is larger than modulus, and
829	// if it is, subtract the modulus. But comparison implies
830	// subtraction. So we subtract modulus, see if it borrowed,
831	// and conditionally copy original value.
832	ldr	x0,[x29,#96]		// pull rp
833	add	x2,x2,#8*8
834	subs	x14,x19,x6
835	sbcs	x15,x20,x7
836	sub	x27,x5,#8*8
837	mov	x3,x0		// x0 copy
838
// rp[] = t[] - n[], eight words per iteration; x2 walks the reduced t[],
// x1 walks n[], x0 walks rp.
839.Lsqr8x_sub:
840	sbcs	x16,x21,x8
841	ldp	x6,x7,[x1,#8*0]
842	sbcs	x17,x22,x9
843	stp	x14,x15,[x0,#8*0]
844	sbcs	x14,x23,x10
845	ldp	x8,x9,[x1,#8*2]
846	sbcs	x15,x24,x11
847	stp	x16,x17,[x0,#8*2]
848	sbcs	x16,x25,x12
849	ldp	x10,x11,[x1,#8*4]
850	sbcs	x17,x26,x13
851	ldp	x12,x13,[x1,#8*6]
852	add	x1,x1,#8*8
853	ldp	x19,x20,[x2,#8*0]
854	sub	x27,x27,#8*8
855	ldp	x21,x22,[x2,#8*2]
856	ldp	x23,x24,[x2,#8*4]
857	ldp	x25,x26,[x2,#8*6]
858	add	x2,x2,#8*8
859	stp	x14,x15,[x0,#8*4]
860	sbcs	x14,x19,x6
861	stp	x16,x17,[x0,#8*6]
862	add	x0,x0,#8*8
863	sbcs	x15,x20,x7
864	cbnz	x27,.Lsqr8x_sub
865
// Last eight words of the subtraction; meanwhile set up the copy pass:
// x2 = bottom of scratch, x1 = sp+num*8 (the unsubtracted value), x3 = rp.
866	sbcs	x16,x21,x8
867	mov	x2,sp
868	add	x1,sp,x5
869	ldp	x6,x7,[x3,#8*0]
870	sbcs	x17,x22,x9
871	stp	x14,x15,[x0,#8*0]
872	sbcs	x14,x23,x10
873	ldp	x8,x9,[x3,#8*2]
874	sbcs	x15,x24,x11
875	stp	x16,x17,[x0,#8*2]
876	sbcs	x16,x25,x12
877	ldp	x19,x20,[x1,#8*0]
878	sbcs	x17,x26,x13
879	ldp	x21,x22,[x1,#8*2]
880	sbcs	xzr,x30,xzr	// did it borrow?
881	ldr	x30,[x29,#8]		// pull return address
882	stp	x14,x15,[x0,#8*4]
883	stp	x16,x17,[x0,#8*6]
884
885	sub	x27,x5,#8*4
// Four words per iteration: csel keeps the unsubtracted word (x19-x22)
// when the subtraction borrowed, otherwise the difference already in rp
// (x6-x9).  Scratch is zeroed on the way: the low half through x2, the
// high half through x1 after x1 has been advanced.
// NOTE(review): as written, the four words at sp+num*8 (loaded above,
// before the first advance of x1) are not overwritten by any store in this
// routine -- confirm whether leaving them on the stack is intended.
886.Lsqr4x_cond_copy:
887	sub	x27,x27,#8*4
888	csel	x14,x19,x6,lo
889	stp	xzr,xzr,[x2,#8*0]
890	csel	x15,x20,x7,lo
891	ldp	x6,x7,[x3,#8*4]
892	ldp	x19,x20,[x1,#8*4]
893	csel	x16,x21,x8,lo
894	stp	xzr,xzr,[x2,#8*2]
895	add	x2,x2,#8*4
896	csel	x17,x22,x9,lo
897	ldp	x8,x9,[x3,#8*6]
898	ldp	x21,x22,[x1,#8*6]
899	add	x1,x1,#8*4
900	stp	x14,x15,[x3,#8*0]
901	stp	x16,x17,[x3,#8*2]
902	add	x3,x3,#8*4
903	stp	xzr,xzr,[x1,#8*0]
904	stp	xzr,xzr,[x1,#8*2]
905	cbnz	x27,.Lsqr4x_cond_copy
906
907	csel	x14,x19,x6,lo
908	stp	xzr,xzr,[x2,#8*0]
909	csel	x15,x20,x7,lo
910	stp	xzr,xzr,[x2,#8*2]
911	csel	x16,x21,x8,lo
912	csel	x17,x22,x9,lo
913	stp	x14,x15,[x3,#8*0]
914	stp	x16,x17,[x3,#8*2]
915
916	b	.Lsqr8x_done
917
// num == 8 shortcut: the whole result is in registers, so subtract, select
// and store without another pass through memory, zeroing all 16 scratch
// words with the interleaved stp instructions.
918.align	4
919.Lsqr8x8_post_condition:
920	adc	x28,xzr,xzr
921	ldr	x30,[x29,#8]		// pull return address
922	// x19-x26,x28 hold result, x6-x13 hold modulus
923	subs	x6,x19,x6
924	ldr	x1,[x29,#96]		// pull rp
925	sbcs	x7,x20,x7
926	stp	xzr,xzr,[sp,#8*0]
927	sbcs	x8,x21,x8
928	stp	xzr,xzr,[sp,#8*2]
929	sbcs	x9,x22,x9
930	stp	xzr,xzr,[sp,#8*4]
931	sbcs	x10,x23,x10
932	stp	xzr,xzr,[sp,#8*6]
933	sbcs	x11,x24,x11
934	stp	xzr,xzr,[sp,#8*8]
935	sbcs	x12,x25,x12
936	stp	xzr,xzr,[sp,#8*10]
937	sbcs	x13,x26,x13
938	stp	xzr,xzr,[sp,#8*12]
939	sbcs	x28,x28,xzr	// did it borrow?
940	stp	xzr,xzr,[sp,#8*14]
941
942	// x6-x13 hold result-modulus
943	csel	x6,x19,x6,lo
944	csel	x7,x20,x7,lo
945	csel	x8,x21,x8,lo
946	csel	x9,x22,x9,lo
947	stp	x6,x7,[x1,#8*0]
948	csel	x10,x23,x10,lo
949	csel	x11,x24,x11,lo
950	stp	x8,x9,[x1,#8*2]
951	csel	x12,x25,x12,lo
952	csel	x13,x26,x13,lo
953	stp	x10,x11,[x1,#8*4]
954	stp	x12,x13,[x1,#8*6]
955
// Common epilogue: restore x19-x28, drop scratch and frame, return 1.
// x30 was already reloaded on both paths that reach this label.
956.Lsqr8x_done:
957	ldp	x19,x20,[x29,#16]
958	mov	sp,x29
959	ldp	x21,x22,[x29,#32]
960	mov	x0,#1
961	ldp	x23,x24,[x29,#48]
962	ldp	x25,x26,[x29,#64]
963	ldp	x27,x28,[x29,#80]
964	ldr	x29,[sp],#128
965.inst	0xd50323bf		// autiasp
966	ret
967.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
// __bn_mul4x_mont -- Montgomery multiplication, four words at a time.
//
// Entered by branch from bn_mul_mont (num a multiple of 4 but not of 8) or
// from __bn_sqr8x_mont (num a multiple of 8 and ap != bp), with the same
// register arguments (x0 rp, x1 ap, x2 bp, x3 np, x4 &n0, x5 num).
// Returns with x0 = 1.
//
// Frame (128 bytes at x29): x29/x30, x19-x28, rp at +96, &b[num] at +104.
// Below it num+4 words of scratch: the lowest four hold the t[0]*n0
// multipliers of the current group, the rest holds t[].
// x28 is a byte offset cycling 8,16,24,0 ("and #31"); it indexes both the
// next b[] word and the saved multipliers, and reaching 0 ends each loop.
// x30 is reused to carry the top bit between passes, so the return address
// is reloaded from [x29,#8] before returning.
968.type	__bn_mul4x_mont,%function
969.align	5
970__bn_mul4x_mont:
971.inst	0xd503233f		// paciasp
972	stp	x29,x30,[sp,#-128]!
973	add	x29,sp,#0
974	stp	x19,x20,[sp,#16]
975	stp	x21,x22,[sp,#32]
976	stp	x23,x24,[sp,#48]
977	stp	x25,x26,[sp,#64]
978	stp	x27,x28,[sp,#80]
979
980	sub	x26,sp,x5,lsl#3
981	lsl	x5,x5,#3
982	ldr	x4,[x4]		// *n0
983	sub	sp,x26,#8*4		// alloca
984
// x27 = &a[num] (end marker for the ap walks).
985	add	x10,x2,x5
986	add	x27,x1,x5
987	stp	x0,x10,[x29,#96]	// offload rp and &b[num]
988
989	ldr	x24,[x2,#8*0]		// b[0]
990	ldp	x6,x7,[x1,#8*0]	// a[0..3]
991	ldp	x8,x9,[x1,#8*2]
992	add	x1,x1,#8*4
993	mov	x19,xzr
994	mov	x20,xzr
995	mov	x21,xzr
996	mov	x22,xzr
997	ldp	x14,x15,[x3,#8*0]	// n[0..3]
998	ldp	x16,x17,[x3,#8*2]
999	adds	x3,x3,#8*4		// clear carry bit
1000	mov	x0,xzr
1001	mov	x28,#0
1002	mov	x26,sp
1003
// First group of b[] (b[0..3]) against a[0..3]/n[0..3], with t[] starting
// at zero.  Each of the four iterations multiplies by one b[] word,
// derives t[0]*n0, saves it at [x26] and reduces by one word.
1004.Loop_mul4x_1st_reduction:
1005	mul	x10,x6,x24		// lo(a[0..3]*b[0])
1006	adc	x0,x0,xzr	// modulo-scheduled
1007	mul	x11,x7,x24
1008	add	x28,x28,#8
1009	mul	x12,x8,x24
1010	and	x28,x28,#31
1011	mul	x13,x9,x24
1012	adds	x19,x19,x10
1013	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
1014	adcs	x20,x20,x11
1015	mul	x25,x19,x4		// t[0]*n0
1016	adcs	x21,x21,x12
1017	umulh	x11,x7,x24
1018	adcs	x22,x22,x13
1019	umulh	x12,x8,x24
1020	adc	x23,xzr,xzr
1021	umulh	x13,x9,x24
1022	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1023	adds	x20,x20,x10
1024	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
1025	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1026	adcs	x21,x21,x11
1027	mul	x11,x15,x25
1028	adcs	x22,x22,x12
1029	mul	x12,x16,x25
1030	adc	x23,x23,x13		// can't overflow
1031	mul	x13,x17,x25
1032	// (*)	adds	xzr,x19,x10
1033	subs	xzr,x19,#1		// (*)
1034	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1035	adcs	x19,x20,x11
1036	umulh	x11,x15,x25
1037	adcs	x20,x21,x12
1038	umulh	x12,x16,x25
1039	adcs	x21,x22,x13
1040	umulh	x13,x17,x25
1041	adcs	x22,x23,x0
1042	adc	x0,xzr,xzr
1043	adds	x19,x19,x10
1044	sub	x10,x27,x1
1045	adcs	x20,x20,x11
1046	adcs	x21,x21,x12
1047	adcs	x22,x22,x13
1048	//adc	x0,x0,xzr
1049	cbnz	x28,.Loop_mul4x_1st_reduction
1050
// x10 == 0 means a[] is only four words long (num == 4).
1051	cbz	x10,.Lmul4x4_post_condition
1052
1053	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1054	ldp	x8,x9,[x1,#8*2]
1055	add	x1,x1,#8*4
1056	ldr	x25,[sp]		// t[0]*n0
1057	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1058	ldp	x16,x17,[x3,#8*2]
1059	add	x3,x3,#8*4
1060
// Remaining words of a[]/n[] for the first b[] group: reuse the four saved
// multipliers (x25, read from [sp,x28]) and emit one word of t[] per
// iteration.
1061.Loop_mul4x_1st_tail:
1062	mul	x10,x6,x24		// lo(a[4..7]*b[i])
1063	adc	x0,x0,xzr	// modulo-scheduled
1064	mul	x11,x7,x24
1065	add	x28,x28,#8
1066	mul	x12,x8,x24
1067	and	x28,x28,#31
1068	mul	x13,x9,x24
1069	adds	x19,x19,x10
1070	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
1071	adcs	x20,x20,x11
1072	umulh	x11,x7,x24
1073	adcs	x21,x21,x12
1074	umulh	x12,x8,x24
1075	adcs	x22,x22,x13
1076	umulh	x13,x9,x24
1077	adc	x23,xzr,xzr
1078	ldr	x24,[x2,x28]		// next b[i] (or b[0])
1079	adds	x20,x20,x10
1080	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1081	adcs	x21,x21,x11
1082	mul	x11,x15,x25
1083	adcs	x22,x22,x12
1084	mul	x12,x16,x25
1085	adc	x23,x23,x13		// can't overflow
1086	mul	x13,x17,x25
1087	adds	x19,x19,x10
1088	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1089	adcs	x20,x20,x11
1090	umulh	x11,x15,x25
1091	adcs	x21,x21,x12
1092	umulh	x12,x16,x25
1093	adcs	x22,x22,x13
1094	adcs	x23,x23,x0
1095	umulh	x13,x17,x25
1096	adc	x0,xzr,xzr
1097	ldr	x25,[sp,x28]		// next t[0]*n0
1098	str	x19,[x26],#8		// result!!!
1099	adds	x19,x20,x10
1100	sub	x10,x27,x1		// done yet?
1101	adcs	x20,x21,x11
1102	adcs	x21,x22,x12
1103	adcs	x22,x23,x13
1104	//adc	x0,x0,xzr
1105	cbnz	x28,.Loop_mul4x_1st_tail
1106
1107	sub	x11,x27,x5	// rewound ap (x11 = &a[0])
1108	cbz	x10,.Lmul4x_proceed
1109
1110	ldp	x6,x7,[x1,#8*0]
1111	ldp	x8,x9,[x1,#8*2]
1112	add	x1,x1,#8*4
1113	ldp	x14,x15,[x3,#8*0]
1114	ldp	x16,x17,[x3,#8*2]
1115	add	x3,x3,#8*4
1116	b	.Loop_mul4x_1st_tail
1117
// First b[] group finished: advance bp by four words, store the top four
// words of t[], keep the carry in x30, and reload a[0..3], n[0..3] and
// t[0..3] for the next group.
1118.align	5
1119.Lmul4x_proceed:
1120	ldr	x24,[x2,#8*4]!		// *++b
1121	adc	x30,x0,xzr
1122	ldp	x6,x7,[x11,#8*0]	// a[0..3]
1123	sub	x3,x3,x5		// rewind np
1124	ldp	x8,x9,[x11,#8*2]
1125	add	x1,x11,#8*4
1126
1127	stp	x19,x20,[x26,#8*0]	// result!!!
1128	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1129	stp	x21,x22,[x26,#8*2]	// result!!!
1130	ldp	x21,x22,[sp,#8*6]
1131
1132	ldp	x14,x15,[x3,#8*0]	// n[0..3]
1133	mov	x26,sp
1134	ldp	x16,x17,[x3,#8*2]
1135	adds	x3,x3,#8*4		// clear carry bit
1136	mov	x0,xzr
1137
// Same as .Loop_mul4x_1st_reduction, but starting from the existing t[].
1138.align	4
1139.Loop_mul4x_reduction:
1140	mul	x10,x6,x24		// lo(a[0..3]*b[4])
1141	adc	x0,x0,xzr	// modulo-scheduled
1142	mul	x11,x7,x24
1143	add	x28,x28,#8
1144	mul	x12,x8,x24
1145	and	x28,x28,#31
1146	mul	x13,x9,x24
1147	adds	x19,x19,x10
1148	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
1149	adcs	x20,x20,x11
1150	mul	x25,x19,x4		// t[0]*n0
1151	adcs	x21,x21,x12
1152	umulh	x11,x7,x24
1153	adcs	x22,x22,x13
1154	umulh	x12,x8,x24
1155	adc	x23,xzr,xzr
1156	umulh	x13,x9,x24
1157	ldr	x24,[x2,x28]		// next b[i]
1158	adds	x20,x20,x10
1159	// (*)	mul	x10,x14,x25
1160	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
1161	adcs	x21,x21,x11
1162	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
1163	adcs	x22,x22,x12
1164	mul	x12,x16,x25
1165	adc	x23,x23,x13		// can't overflow
1166	mul	x13,x17,x25
1167	// (*)	adds	xzr,x19,x10
1168	subs	xzr,x19,#1		// (*)
1169	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
1170	adcs	x19,x20,x11
1171	umulh	x11,x15,x25
1172	adcs	x20,x21,x12
1173	umulh	x12,x16,x25
1174	adcs	x21,x22,x13
1175	umulh	x13,x17,x25
1176	adcs	x22,x23,x0
1177	adc	x0,xzr,xzr
1178	adds	x19,x19,x10
1179	adcs	x20,x20,x11
1180	adcs	x21,x21,x12
1181	adcs	x22,x22,x13
1182	//adc	x0,x0,xzr
1183	cbnz	x28,.Loop_mul4x_reduction
1184
// Merge t[4..7] into the window and load a[4..7]/n[4..7] for the tail.
1185	adc	x0,x0,xzr
1186	ldp	x10,x11,[x26,#8*4]	// t[4..7]
1187	ldp	x12,x13,[x26,#8*6]
1188	ldp	x6,x7,[x1,#8*0]	// a[4..7]
1189	ldp	x8,x9,[x1,#8*2]
1190	add	x1,x1,#8*4
1191	adds	x19,x19,x10
1192	adcs	x20,x20,x11
1193	adcs	x21,x21,x12
1194	adcs	x22,x22,x13
1195	//adc	x0,x0,xzr
1196
1197	ldr	x25,[sp]		// t[0]*n0
1198	ldp	x14,x15,[x3,#8*0]	// n[4..7]
1199	ldp	x16,x17,[x3,#8*2]
1200	add	x3,x3,#8*4
1201
// Same as .Loop_mul4x_1st_tail, applied on top of the existing t[].
1202.align	4
1203.Loop_mul4x_tail:
1204	mul	x10,x6,x24		// lo(a[4..7]*b[4])
1205	adc	x0,x0,xzr	// modulo-scheduled
1206	mul	x11,x7,x24
1207	add	x28,x28,#8
1208	mul	x12,x8,x24
1209	and	x28,x28,#31
1210	mul	x13,x9,x24
1211	adds	x19,x19,x10
1212	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
1213	adcs	x20,x20,x11
1214	umulh	x11,x7,x24
1215	adcs	x21,x21,x12
1216	umulh	x12,x8,x24
1217	adcs	x22,x22,x13
1218	umulh	x13,x9,x24
1219	adc	x23,xzr,xzr
1220	ldr	x24,[x2,x28]		// next b[i]
1221	adds	x20,x20,x10
1222	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
1223	adcs	x21,x21,x11
1224	mul	x11,x15,x25
1225	adcs	x22,x22,x12
1226	mul	x12,x16,x25
1227	adc	x23,x23,x13		// can't overflow
1228	mul	x13,x17,x25
1229	adds	x19,x19,x10
1230	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
1231	adcs	x20,x20,x11
1232	umulh	x11,x15,x25
1233	adcs	x21,x21,x12
1234	umulh	x12,x16,x25
1235	adcs	x22,x22,x13
1236	umulh	x13,x17,x25
1237	adcs	x23,x23,x0
1238	ldr	x25,[sp,x28]		// next t[0]*n0
1239	adc	x0,xzr,xzr
1240	str	x19,[x26],#8		// result!!!
1241	adds	x19,x20,x10
1242	sub	x10,x27,x1		// done yet?
1243	adcs	x20,x21,x11
1244	adcs	x21,x22,x12
1245	adcs	x22,x23,x13
1246	//adc	x0,x0,xzr
1247	cbnz	x28,.Loop_mul4x_tail
1248
1249	sub	x11,x3,x5		// rewound np
1250	adc	x0,x0,xzr
1251	cbz	x10,.Loop_mul4x_break
1252
1253	ldp	x10,x11,[x26,#8*4]
1254	ldp	x12,x13,[x26,#8*6]
1255	ldp	x6,x7,[x1,#8*0]
1256	ldp	x8,x9,[x1,#8*2]
1257	add	x1,x1,#8*4
1258	adds	x19,x19,x10
1259	adcs	x20,x20,x11
1260	adcs	x21,x21,x12
1261	adcs	x22,x22,x13
1262	//adc	x0,x0,xzr
1263	ldp	x14,x15,[x3,#8*0]
1264	ldp	x16,x17,[x3,#8*2]
1265	add	x3,x3,#8*4
1266	b	.Loop_mul4x_tail
1267
// One b[] group done: add the carry kept in x30, store the top of t[],
// advance bp and either start the next group or, once bp reaches &b[num]
// (x13), go to the final subtraction.
1268.align	4
1269.Loop_mul4x_break:
1270	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
1271	adds	x19,x19,x30
1272	add	x2,x2,#8*4		// bp++
1273	adcs	x20,x20,xzr
1274	sub	x1,x1,x5		// rewind ap
1275	adcs	x21,x21,xzr
1276	stp	x19,x20,[x26,#8*0]	// result!!!
1277	adcs	x22,x22,xzr
1278	ldp	x19,x20,[sp,#8*4]	// t[0..3]
1279	adc	x30,x0,xzr
1280	stp	x21,x22,[x26,#8*2]	// result!!!
1281	cmp	x2,x13			// done yet?
1282	ldp	x21,x22,[sp,#8*6]
1283	ldp	x14,x15,[x11,#8*0]	// n[0..3]
1284	ldp	x16,x17,[x11,#8*2]
1285	add	x3,x11,#8*4
1286	b.eq	.Lmul4x_post
1287
1288	ldr	x24,[x2]
1289	ldp	x6,x7,[x1,#8*0]	// a[0..3]
1290	ldp	x8,x9,[x1,#8*2]
1291	adds	x1,x1,#8*4		// clear carry bit
1292	mov	x0,xzr
1293	mov	x26,sp
1294	b	.Loop_mul4x_reduction
1295
1296.align	4
1297.Lmul4x_post:
1298	// Final step. We see if result is larger than modulus, and
1299	// if it is, subtract the modulus. But comparison implies
1300	// subtraction. So we subtract modulus, see if it borrowed,
1301	// and conditionally copy original value.
1302	mov	x0,x12
1303	mov	x27,x12		// x0 copy
1304	subs	x10,x19,x14
1305	add	x26,sp,#8*8
1306	sbcs	x11,x20,x15
1307	sub	x28,x5,#8*4
1308
// rp[] = t[] - n[], four words per iteration.
1309.Lmul4x_sub:
1310	sbcs	x12,x21,x16
1311	ldp	x14,x15,[x3,#8*0]
1312	sub	x28,x28,#8*4
1313	ldp	x19,x20,[x26,#8*0]
1314	sbcs	x13,x22,x17
1315	ldp	x16,x17,[x3,#8*2]
1316	add	x3,x3,#8*4
1317	ldp	x21,x22,[x26,#8*2]
1318	add	x26,x26,#8*4
1319	stp	x10,x11,[x0,#8*0]
1320	sbcs	x10,x19,x14
1321	stp	x12,x13,[x0,#8*2]
1322	add	x0,x0,#8*4
1323	sbcs	x11,x20,x15
1324	cbnz	x28,.Lmul4x_sub
1325
// Set up the copy pass: x26 = bottom of scratch (wipe pointer), x1 = t[]
// at sp+32, x27 = rp.
1326	sbcs	x12,x21,x16
1327	mov	x26,sp
1328	add	x1,sp,#8*4
1329	ldp	x6,x7,[x27,#8*0]
1330	sbcs	x13,x22,x17
1331	stp	x10,x11,[x0,#8*0]
1332	ldp	x8,x9,[x27,#8*2]
1333	stp	x12,x13,[x0,#8*2]
1334	ldp	x19,x20,[x1,#8*0]
1335	ldp	x21,x22,[x1,#8*2]
1336	sbcs	xzr,x30,xzr	// did it borrow?
1337	ldr	x30,[x29,#8]		// pull return address
1338
1339	sub	x28,x5,#8*4
// Four words per iteration: csel keeps t[] (x19-x22) when the subtraction
// borrowed, otherwise the difference already in rp (x6-x9); scratch is
// zeroed through x26 as the loop advances.
1340.Lmul4x_cond_copy:
1341	sub	x28,x28,#8*4
1342	csel	x10,x19,x6,lo
1343	stp	xzr,xzr,[x26,#8*0]
1344	csel	x11,x20,x7,lo
1345	ldp	x6,x7,[x27,#8*4]
1346	ldp	x19,x20,[x1,#8*4]
1347	csel	x12,x21,x8,lo
1348	stp	xzr,xzr,[x26,#8*2]
1349	add	x26,x26,#8*4
1350	csel	x13,x22,x9,lo
1351	ldp	x8,x9,[x27,#8*6]
1352	ldp	x21,x22,[x1,#8*6]
1353	add	x1,x1,#8*4
1354	stp	x10,x11,[x27,#8*0]
1355	stp	x12,x13,[x27,#8*2]
1356	add	x27,x27,#8*4
1357	cbnz	x28,.Lmul4x_cond_copy
1358
// NOTE(review): the four stores below use offsets 0, 2, 3 and 4 words, so
// they overlap and together clear six words from x26.  With x26 at
// sp+num*8-32 here, the two highest scratch words (sp+num*8+16 and +24,
// written at .Lmul4x_proceed/.Loop_mul4x_break) are left as they were.
// Offsets 0, 2, 4 and 6 would cover all eight remaining words -- confirm
// intent; any change belongs in the generator script, not in this file.
1359	csel	x10,x19,x6,lo
1360	stp	xzr,xzr,[x26,#8*0]
1361	csel	x11,x20,x7,lo
1362	stp	xzr,xzr,[x26,#8*2]
1363	csel	x12,x21,x8,lo
1364	stp	xzr,xzr,[x26,#8*3]
1365	csel	x13,x22,x9,lo
1366	stp	xzr,xzr,[x26,#8*4]
1367	stp	x10,x11,[x27,#8*0]
1368	stp	x12,x13,[x27,#8*2]
1369
1370	b	.Lmul4x_done
1371
// num == 4 shortcut: the whole result is in registers, so subtract, select
// and store directly, zeroing all eight scratch words on the way.
1372.align	4
1373.Lmul4x4_post_condition:
1374	adc	x0,x0,xzr
1375	ldr	x1,[x29,#96]		// pull rp
1376	// x19-x22,x0 hold result, x14-x17 hold modulus
1377	subs	x6,x19,x14
1378	ldr	x30,[x29,#8]		// pull return address
1379	sbcs	x7,x20,x15
1380	stp	xzr,xzr,[sp,#8*0]
1381	sbcs	x8,x21,x16
1382	stp	xzr,xzr,[sp,#8*2]
1383	sbcs	x9,x22,x17
1384	stp	xzr,xzr,[sp,#8*4]
1385	sbcs	xzr,x0,xzr		// did it borrow?
1386	stp	xzr,xzr,[sp,#8*6]
1387
1388	// x6-x9 hold result-modulus
1389	csel	x6,x19,x6,lo
1390	csel	x7,x20,x7,lo
1391	csel	x8,x21,x8,lo
1392	csel	x9,x22,x9,lo
1393	stp	x6,x7,[x1,#8*0]
1394	stp	x8,x9,[x1,#8*2]
1395
// Common epilogue: restore x19-x28, drop scratch and frame, return 1.
// x30 was already reloaded on both paths that reach this label.
1396.Lmul4x_done:
1397	ldp	x19,x20,[x29,#16]
1398	mov	sp,x29
1399	ldp	x21,x22,[x29,#32]
1400	mov	x0,#1
1401	ldp	x23,x24,[x29,#48]
1402	ldp	x25,x26,[x29,#64]
1403	ldp	x27,x28,[x29,#80]
1404	ldr	x29,[sp],#128
1405.inst	0xd50323bf		// autiasp
1406	ret
1407.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// NUL-terminated ASCII identification string; the bytes spell
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
// It is not referenced by any code in this file.
1408.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Pad after the string; the second, larger alignment request is the one
// that determines the final padding.
1409.align	2
1410.align	4
1411