xref: /freebsd/sys/crypto/openssl/amd64/x86_64-mont.S (revision a90b9d0159070121c221b966469c3e36d912bf82)
1/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
2.text
3
4
5
/*
 * bn_mul_mont -- global entry point and generic one-word-at-a-time path.
 * (The ident string at the end of this file labels the code "Montgomery
 * Multiplication for x86_64".)
 *
 * Register use as visible in the code below.  The names in parentheses are
 * the conventional OpenSSL argument names and are an assumption here --
 * confirm against the C prototype:
 *   %rdi  result array, written by .Lsub / .Lcopy                    (rp)
 *   %rsi  first operand, read as (%rsi,index,8)                      (ap)
 *   %rdx  second operand, copied to %r12 and read one word per row   (bp)
 *   %rcx  array multiplied by the per-row factor in %rbp and finally
 *         subtracted from the temporary                              (np)
 *   %r8   pointer; only its first 64-bit word is loaded              (n0)
 *   %r9d  word count, zero-extended on entry                         (num)
 * "tp" in the comments below means the temporary vector at (%rsp).
 *
 * Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are pushed on entry to the
 * generic path and reloaded in the epilogue.
 */
6.globl	bn_mul_mont
7.type	bn_mul_mont,@function
8.align	16
9bn_mul_mont:
10.cfi_startproc
	/* Writing %r9d zero-extends the count into the full %r9. */
11	movl	%r9d,%r9d
	/* Keep the entry %rsp in %rax; the CFA is tracked through %rax. */
12	movq	%rsp,%rax
13.cfi_def_cfa_register	%rax
	/* Count not a multiple of 4, or below 8: take the generic path. */
14	testl	$3,%r9d
15	jnz	.Lmul_enter
16	cmpl	$8,%r9d
17	jb	.Lmul_enter
	/* %r11d = capability word that .Lmul4x_enter tests against 0x80100. */
18	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	/* Operand pointers differ: four-way multiply. */
19	cmpq	%rsi,%rdx
20	jne	.Lmul4x_enter
	/* Same pointer for both operands and count a multiple of 8: squaring path. */
21	testl	$7,%r9d
22	jz	.Lsqr8x_enter
23	jmp	.Lmul4x_enter
24
25.align	16
26.Lmul_enter:
	/* Save callee-saved registers just below the entry %rsp. */
27	pushq	%rbx
28.cfi_offset	%rbx,-16
29	pushq	%rbp
30.cfi_offset	%rbp,-24
31	pushq	%r12
32.cfi_offset	%r12,-32
33	pushq	%r13
34.cfi_offset	%r13,-40
35	pushq	%r14
36.cfi_offset	%r14,-48
37	pushq	%r15
38.cfi_offset	%r15,-56
39
	/*
	 * %r10 = (%rsp - 8*num - 16) rounded down to a 1024-byte boundary:
	 * the bottom of the temporary.  %r9 is negated only for the lea.
	 */
40	negq	%r9
41	movq	%rsp,%r11
42	leaq	-16(%rsp,%r9,8),%r10
43	negq	%r9
44	andq	$-1024,%r10
45
46
47
48
49
50
51
52
53
	/*
	 * Lower %rsp to %r10 in steps of at most 4096 bytes, reading one word
	 * at every step (the labels call this a page walk).
	 */
54	subq	%r10,%r11
55	andq	$-4096,%r11
56	leaq	(%r10,%r11,1),%rsp
57	movq	(%rsp),%r11
58	cmpq	%r10,%rsp
59	ja	.Lmul_page_walk
60	jmp	.Lmul_page_walk_done
61
62.align	16
63.Lmul_page_walk:
64	leaq	-4096(%rsp),%rsp
65	movq	(%rsp),%r11
66	cmpq	%r10,%rsp
67	ja	.Lmul_page_walk
68.Lmul_page_walk_done:
69
	/* Store the entry %rsp in the frame at word index num+1. */
70	movq	%rax,8(%rsp,%r9,8)
	/* DWARF expression: CFA = *(%rsp + 8 + %r9*8) + 8, i.e. the slot written above. */
71.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
72.Lmul_body:
	/* %r12 = second operand, %r8 = word loaded through %r8, %rbx = its word 0, %rax = (%rsi)[0]. */
73	movq	%rdx,%r12
74	movq	(%r8),%r8
75	movq	(%r12),%rbx
76	movq	(%rsi),%rax
77
	/* %r14 = row index i = 0, %r15 = column index j = 0. */
78	xorq	%r14,%r14
79	xorq	%r15,%r15
80
	/*
	 * %rbp = low 64 bits of (low word of (%rsi)[0]*%rbx) * %r8: the factor
	 * applied to the (%rcx) array for this row.  %r11 keeps the high word
	 * of the first product.
	 */
81	movq	%r8,%rbp
82	mulq	%rbx
83	movq	%rax,%r10
84	movq	(%rcx),%rax
85
86	imulq	%r10,%rbp
87	movq	%rdx,%r11
88
	/* Add (%rcx)[0]*%rbp; the low sum word is not stored, only its carry goes on in %r13. */
89	mulq	%rbp
90	addq	%rax,%r10
91	movq	8(%rsi),%rax
92	adcq	$0,%rdx
93	movq	%rdx,%r13
94
95	leaq	1(%r15),%r15
96	jmp	.L1st_enter
97
	/*
	 * Row 0: for every column accumulate (%rsi)[j]*%rbx and (%rcx)[j]*%rbp
	 * and store the sum words into tp at -16(%rsp,%r15,8).
	 */
98.align	16
99.L1st:
100	addq	%rax,%r13
101	movq	(%rsi,%r15,8),%rax
102	adcq	$0,%rdx
103	addq	%r11,%r13
104	movq	%r10,%r11
105	adcq	$0,%rdx
106	movq	%r13,-16(%rsp,%r15,8)
107	movq	%rdx,%r13
108
109.L1st_enter:
110	mulq	%rbx
111	addq	%rax,%r11
112	movq	(%rcx,%r15,8),%rax
113	adcq	$0,%rdx
114	leaq	1(%r15),%r15
115	movq	%rdx,%r10
116
117	mulq	%rbp
118	cmpq	%r9,%r15
119	jne	.L1st
120
	/* Row 0 tail: store tp[num-2]; %rax is reloaded with (%rsi)[0] for the next row. */
121	addq	%rax,%r13
122	movq	(%rsi),%rax
123	adcq	$0,%rdx
124	addq	%r11,%r13
125	adcq	$0,%rdx
126	movq	%r13,-16(%rsp,%r15,8)
127	movq	%rdx,%r13
128	movq	%r10,%r11
129
	/* Store tp[num-1] and the carry-out word tp[num]. */
130	xorq	%rdx,%rdx
131	addq	%r11,%r13
132	adcq	$0,%rdx
133	movq	%r13,-8(%rsp,%r9,8)
134	movq	%rdx,(%rsp,%r9,8)
135
136	leaq	1(%r14),%r14
137	jmp	.Louter
	/*
	 * Rows 1..num-1: same scheme as row 0 with %rbx = (%r12)[i], but every
	 * column also adds the previous tp word read back from the stack.
	 */
138.align	16
139.Louter:
140	movq	(%r12,%r14,8),%rbx
141	xorq	%r15,%r15
142	movq	%r8,%rbp
143	movq	(%rsp),%r10
144	mulq	%rbx
145	addq	%rax,%r10
146	movq	(%rcx),%rax
147	adcq	$0,%rdx
148
	/* %rbp = low 64 bits of (tp[0] + low product word) * %r8. */
149	imulq	%r10,%rbp
150	movq	%rdx,%r11
151
152	mulq	%rbp
153	addq	%rax,%r10
154	movq	8(%rsi),%rax
155	adcq	$0,%rdx
156	movq	8(%rsp),%r10
157	movq	%rdx,%r13
158
159	leaq	1(%r15),%r15
160	jmp	.Linner_enter
161
162.align	16
163.Linner:
164	addq	%rax,%r13
165	movq	(%rsi,%r15,8),%rax
166	adcq	$0,%rdx
167	addq	%r10,%r13
168	movq	(%rsp,%r15,8),%r10
169	adcq	$0,%rdx
170	movq	%r13,-16(%rsp,%r15,8)
171	movq	%rdx,%r13
172
173.Linner_enter:
174	mulq	%rbx
175	addq	%rax,%r11
176	movq	(%rcx,%r15,8),%rax
177	adcq	$0,%rdx
178	addq	%r11,%r10
179	movq	%rdx,%r11
180	adcq	$0,%r11
181	leaq	1(%r15),%r15
182
183	mulq	%rbp
184	cmpq	%r9,%r15
185	jne	.Linner
186
	/* Row tail: store tp[num-2]; %r10 picks up the previous tp[num]. */
187	addq	%rax,%r13
188	movq	(%rsi),%rax
189	adcq	$0,%rdx
190	addq	%r10,%r13
191	movq	(%rsp,%r15,8),%r10
192	adcq	$0,%rdx
193	movq	%r13,-16(%rsp,%r15,8)
194	movq	%rdx,%r13
195
	/* Store tp[num-1] and the new carry-out word tp[num]. */
196	xorq	%rdx,%rdx
197	addq	%r11,%r13
198	adcq	$0,%rdx
199	addq	%r10,%r13
200	adcq	$0,%rdx
201	movq	%r13,-8(%rsp,%r9,8)
202	movq	%rdx,(%rsp,%r9,8)
203
204	leaq	1(%r14),%r14
205	cmpq	%r9,%r14
206	jb	.Louter
207
	/*
	 * Final subtraction: (%rdi)[i] = tp[i] - (%rcx)[i] with borrow.  The
	 * xorq clears CF for the first sbbq; the mov, lea and dec instructions
	 * inside the loop leave CF untouched, so the borrow chains across
	 * iterations.
	 */
208	xorq	%r14,%r14
209	movq	(%rsp),%rax
210	movq	%r9,%r15
211
212.align	16
213.Lsub:	sbbq	(%rcx,%r14,8),%rax
214	movq	%rax,(%rdi,%r14,8)
215	movq	8(%rsp,%r14,8),%rax
216	leaq	1(%r14),%r14
217	decq	%r15
218	jnz	.Lsub
219
	/*
	 * %rax holds tp[num] here; subtracting the final borrow turns it into
	 * the select mask for tp, and %rbx = ~%rax is the mask for the
	 * difference already stored in (%rdi).  The masks are expected to be
	 * 0 / all-ones.
	 */
220	sbbq	$0,%rax
221	movq	$-1,%rbx
222	xorq	%rax,%rbx
223	xorq	%r14,%r14
224	movq	%r9,%r15
225
	/*
	 * Branch-free select: (%rdi)[i] = ((%rdi)[i] & %rbx) | (tp[i] & %rax).
	 * Each tp word is overwritten with %r9 as it is consumed, so the
	 * intermediate value is not left on the stack.
	 */
226.Lcopy:
227	movq	(%rdi,%r14,8),%rcx
228	movq	(%rsp,%r14,8),%rdx
229	andq	%rbx,%rcx
230	andq	%rax,%rdx
231	movq	%r9,(%rsp,%r14,8)
232	orq	%rcx,%rdx
233	movq	%rdx,(%rdi,%r14,8)
234	leaq	1(%r14),%r14
235	subq	$1,%r15
236	jnz	.Lcopy
237
	/* %rsi = entry %rsp saved in the prologue; reload the callee-saved registers below it. */
238	movq	8(%rsp,%r9,8),%rsi
239.cfi_def_cfa	%rsi,8
	/* Return value 1. */
240	movq	$1,%rax
241	movq	-48(%rsi),%r15
242.cfi_restore	%r15
243	movq	-40(%rsi),%r14
244.cfi_restore	%r14
245	movq	-32(%rsi),%r13
246.cfi_restore	%r13
247	movq	-24(%rsi),%r12
248.cfi_restore	%r12
249	movq	-16(%rsi),%rbp
250.cfi_restore	%rbp
251	movq	-8(%rsi),%rbx
252.cfi_restore	%rbx
253	leaq	(%rsi),%rsp
254.cfi_def_cfa_register	%rsp
255.Lmul_epilogue:
	/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
256	.byte	0xf3,0xc3
257.cfi_endproc
258.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont -- four-words-per-iteration variant.  It follows the same
 * structure as the generic path of bn_mul_mont (row 0, remaining rows,
 * final subtraction, masked select) with the column loops unrolled by 4.
 * Registers on entry are the same as for bn_mul_mont: %rdi result,
 * %rsi / %rdx operands, %rcx array used with the per-row factor, %r8
 * pointer to one word, %r9d word count.  "tp" below is the temporary
 * vector at (%rsp).
 *
 * bn_mul_mont jumps to .Lmul4x_enter only for counts that are multiples
 * of 4 and at least 8.  .Lmul4x_enter tests %r11d, which bn_mul_mont loads
 * from OPENSSL_ia32cap_P+8 before jumping here; the bn_mul4x_mont label
 * itself does not load it.  When both bits of 0x80100 are set, control
 * moves on to .Lmulx4x_enter (presumably the BMI2 and ADX feature bits,
 * since that path uses mulx/adcx/adox -- confirm against the OpenSSL
 * capability-vector documentation).
 *
 * Returns 1 in %rax; %rbx, %rbp, %r12-%r15 are saved and restored.
 */
259.type	bn_mul4x_mont,@function
260.align	16
261bn_mul4x_mont:
262.cfi_startproc
263	movl	%r9d,%r9d
264	movq	%rsp,%rax
265.cfi_def_cfa_register	%rax
266.Lmul4x_enter:
	/* Both capability bits set: hand over to the mulx-based routine. */
267	andl	$0x80100,%r11d
268	cmpl	$0x80100,%r11d
269	je	.Lmulx4x_enter
270	pushq	%rbx
271.cfi_offset	%rbx,-16
272	pushq	%rbp
273.cfi_offset	%rbp,-24
274	pushq	%r12
275.cfi_offset	%r12,-32
276	pushq	%r13
277.cfi_offset	%r13,-40
278	pushq	%r14
279.cfi_offset	%r14,-48
280	pushq	%r15
281.cfi_offset	%r15,-56
282
	/* %r10 = (%rsp - 8*num - 32) rounded down to a 1024-byte boundary. */
283	negq	%r9
284	movq	%rsp,%r11
285	leaq	-32(%rsp,%r9,8),%r10
286	negq	%r9
287	andq	$-1024,%r10
288
	/* Lower %rsp to %r10 in steps of at most 4096 bytes, reading a word at each step. */
289	subq	%r10,%r11
290	andq	$-4096,%r11
291	leaq	(%r10,%r11,1),%rsp
292	movq	(%rsp),%r11
293	cmpq	%r10,%rsp
294	ja	.Lmul4x_page_walk
295	jmp	.Lmul4x_page_walk_done
296
297.Lmul4x_page_walk:
298	leaq	-4096(%rsp),%rsp
299	movq	(%rsp),%r11
300	cmpq	%r10,%rsp
301	ja	.Lmul4x_page_walk
302.Lmul4x_page_walk_done:
303
	/* Entry %rsp goes to frame word num+1. */
304	movq	%rax,8(%rsp,%r9,8)
	/* DWARF expression: CFA = *(%rsp + 8 + %r9*8) + 8. */
305.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
306.Lmul4x_body:
	/* The result pointer is parked at frame word num+2 because %rdi is used as an accumulator below. */
307	movq	%rdi,16(%rsp,%r9,8)
308	movq	%rdx,%r12
309	movq	(%r8),%r8
310	movq	(%r12),%rbx
311	movq	(%rsi),%rax
312
	/* %r14 = row index, %r15 = column index. */
313	xorq	%r14,%r14
314	xorq	%r15,%r15
315
	/* %rbp = low 64 bits of (low word of (%rsi)[0]*%rbx) * %r8: per-row factor for the (%rcx) array. */
316	movq	%r8,%rbp
317	mulq	%rbx
318	movq	%rax,%r10
319	movq	(%rcx),%rax
320
321	imulq	%r10,%rbp
322	movq	%rdx,%r11
323
324	mulq	%rbp
325	addq	%rax,%r10
326	movq	8(%rsi),%rax
327	adcq	$0,%rdx
328	movq	%rdx,%rdi
329
330	mulq	%rbx
331	addq	%rax,%r11
332	movq	8(%rcx),%rax
333	adcq	$0,%rdx
334	movq	%rdx,%r10
335
336	mulq	%rbp
337	addq	%rax,%rdi
338	movq	16(%rsi),%rax
339	adcq	$0,%rdx
340	addq	%r11,%rdi
341	leaq	4(%r15),%r15
342	adcq	$0,%rdx
343	movq	%rdi,(%rsp)
344	movq	%rdx,%r13
345	jmp	.L1st4x
	/*
	 * Row 0, four columns per iteration.  %r10/%r11 alternate as carry of
	 * the (%rsi)[j]*%rbx products, %r13/%rdi alternate as carry of the
	 * (%rcx)[j]*%rbp products; one tp word is stored per column.
	 */
346.align	16
347.L1st4x:
348	mulq	%rbx
349	addq	%rax,%r10
350	movq	-16(%rcx,%r15,8),%rax
351	adcq	$0,%rdx
352	movq	%rdx,%r11
353
354	mulq	%rbp
355	addq	%rax,%r13
356	movq	-8(%rsi,%r15,8),%rax
357	adcq	$0,%rdx
358	addq	%r10,%r13
359	adcq	$0,%rdx
360	movq	%r13,-24(%rsp,%r15,8)
361	movq	%rdx,%rdi
362
363	mulq	%rbx
364	addq	%rax,%r11
365	movq	-8(%rcx,%r15,8),%rax
366	adcq	$0,%rdx
367	movq	%rdx,%r10
368
369	mulq	%rbp
370	addq	%rax,%rdi
371	movq	(%rsi,%r15,8),%rax
372	adcq	$0,%rdx
373	addq	%r11,%rdi
374	adcq	$0,%rdx
375	movq	%rdi,-16(%rsp,%r15,8)
376	movq	%rdx,%r13
377
378	mulq	%rbx
379	addq	%rax,%r10
380	movq	(%rcx,%r15,8),%rax
381	adcq	$0,%rdx
382	movq	%rdx,%r11
383
384	mulq	%rbp
385	addq	%rax,%r13
386	movq	8(%rsi,%r15,8),%rax
387	adcq	$0,%rdx
388	addq	%r10,%r13
389	adcq	$0,%rdx
390	movq	%r13,-8(%rsp,%r15,8)
391	movq	%rdx,%rdi
392
393	mulq	%rbx
394	addq	%rax,%r11
395	movq	8(%rcx,%r15,8),%rax
396	adcq	$0,%rdx
397	leaq	4(%r15),%r15
398	movq	%rdx,%r10
399
400	mulq	%rbp
401	addq	%rax,%rdi
402	movq	-16(%rsi,%r15,8),%rax
403	adcq	$0,%rdx
404	addq	%r11,%rdi
405	adcq	$0,%rdx
406	movq	%rdi,-32(%rsp,%r15,8)
407	movq	%rdx,%r13
408	cmpq	%r9,%r15
409	jb	.L1st4x
410
	/* Row 0 tail: last two columns, then %rax is reloaded with (%rsi)[0] for the next row. */
411	mulq	%rbx
412	addq	%rax,%r10
413	movq	-16(%rcx,%r15,8),%rax
414	adcq	$0,%rdx
415	movq	%rdx,%r11
416
417	mulq	%rbp
418	addq	%rax,%r13
419	movq	-8(%rsi,%r15,8),%rax
420	adcq	$0,%rdx
421	addq	%r10,%r13
422	adcq	$0,%rdx
423	movq	%r13,-24(%rsp,%r15,8)
424	movq	%rdx,%rdi
425
426	mulq	%rbx
427	addq	%rax,%r11
428	movq	-8(%rcx,%r15,8),%rax
429	adcq	$0,%rdx
430	movq	%rdx,%r10
431
432	mulq	%rbp
433	addq	%rax,%rdi
434	movq	(%rsi),%rax
435	adcq	$0,%rdx
436	addq	%r11,%rdi
437	adcq	$0,%rdx
438	movq	%rdi,-16(%rsp,%r15,8)
439	movq	%rdx,%r13
440
	/* Store the top word and the carry-out word of row 0. */
441	xorq	%rdi,%rdi
442	addq	%r10,%r13
443	adcq	$0,%rdi
444	movq	%r13,-8(%rsp,%r15,8)
445	movq	%rdi,(%rsp,%r15,8)
446
447	leaq	1(%r14),%r14
	/*
	 * Remaining rows: %rbx = (%r12)[i]; identical to row 0 except that the
	 * previous tp words are added in from the stack.
	 */
448.align	4
449.Louter4x:
450	movq	(%r12,%r14,8),%rbx
451	xorq	%r15,%r15
452	movq	(%rsp),%r10
453	movq	%r8,%rbp
454	mulq	%rbx
455	addq	%rax,%r10
456	movq	(%rcx),%rax
457	adcq	$0,%rdx
458
	/* %rbp = low 64 bits of (tp[0] + low product word) * %r8. */
459	imulq	%r10,%rbp
460	movq	%rdx,%r11
461
462	mulq	%rbp
463	addq	%rax,%r10
464	movq	8(%rsi),%rax
465	adcq	$0,%rdx
466	movq	%rdx,%rdi
467
468	mulq	%rbx
469	addq	%rax,%r11
470	movq	8(%rcx),%rax
471	adcq	$0,%rdx
472	addq	8(%rsp),%r11
473	adcq	$0,%rdx
474	movq	%rdx,%r10
475
476	mulq	%rbp
477	addq	%rax,%rdi
478	movq	16(%rsi),%rax
479	adcq	$0,%rdx
480	addq	%r11,%rdi
481	leaq	4(%r15),%r15
482	adcq	$0,%rdx
483	movq	%rdi,(%rsp)
484	movq	%rdx,%r13
485	jmp	.Linner4x
486.align	16
487.Linner4x:
488	mulq	%rbx
489	addq	%rax,%r10
490	movq	-16(%rcx,%r15,8),%rax
491	adcq	$0,%rdx
492	addq	-16(%rsp,%r15,8),%r10
493	adcq	$0,%rdx
494	movq	%rdx,%r11
495
496	mulq	%rbp
497	addq	%rax,%r13
498	movq	-8(%rsi,%r15,8),%rax
499	adcq	$0,%rdx
500	addq	%r10,%r13
501	adcq	$0,%rdx
502	movq	%r13,-24(%rsp,%r15,8)
503	movq	%rdx,%rdi
504
505	mulq	%rbx
506	addq	%rax,%r11
507	movq	-8(%rcx,%r15,8),%rax
508	adcq	$0,%rdx
509	addq	-8(%rsp,%r15,8),%r11
510	adcq	$0,%rdx
511	movq	%rdx,%r10
512
513	mulq	%rbp
514	addq	%rax,%rdi
515	movq	(%rsi,%r15,8),%rax
516	adcq	$0,%rdx
517	addq	%r11,%rdi
518	adcq	$0,%rdx
519	movq	%rdi,-16(%rsp,%r15,8)
520	movq	%rdx,%r13
521
522	mulq	%rbx
523	addq	%rax,%r10
524	movq	(%rcx,%r15,8),%rax
525	adcq	$0,%rdx
526	addq	(%rsp,%r15,8),%r10
527	adcq	$0,%rdx
528	movq	%rdx,%r11
529
530	mulq	%rbp
531	addq	%rax,%r13
532	movq	8(%rsi,%r15,8),%rax
533	adcq	$0,%rdx
534	addq	%r10,%r13
535	adcq	$0,%rdx
536	movq	%r13,-8(%rsp,%r15,8)
537	movq	%rdx,%rdi
538
539	mulq	%rbx
540	addq	%rax,%r11
541	movq	8(%rcx,%r15,8),%rax
542	adcq	$0,%rdx
543	addq	8(%rsp,%r15,8),%r11
544	adcq	$0,%rdx
545	leaq	4(%r15),%r15
546	movq	%rdx,%r10
547
548	mulq	%rbp
549	addq	%rax,%rdi
550	movq	-16(%rsi,%r15,8),%rax
551	adcq	$0,%rdx
552	addq	%r11,%rdi
553	adcq	$0,%rdx
554	movq	%rdi,-32(%rsp,%r15,8)
555	movq	%rdx,%r13
556	cmpq	%r9,%r15
557	jb	.Linner4x
558
	/* Row tail: last two columns; the row index %r14 is advanced here. */
559	mulq	%rbx
560	addq	%rax,%r10
561	movq	-16(%rcx,%r15,8),%rax
562	adcq	$0,%rdx
563	addq	-16(%rsp,%r15,8),%r10
564	adcq	$0,%rdx
565	movq	%rdx,%r11
566
567	mulq	%rbp
568	addq	%rax,%r13
569	movq	-8(%rsi,%r15,8),%rax
570	adcq	$0,%rdx
571	addq	%r10,%r13
572	adcq	$0,%rdx
573	movq	%r13,-24(%rsp,%r15,8)
574	movq	%rdx,%rdi
575
576	mulq	%rbx
577	addq	%rax,%r11
578	movq	-8(%rcx,%r15,8),%rax
579	adcq	$0,%rdx
580	addq	-8(%rsp,%r15,8),%r11
581	adcq	$0,%rdx
582	leaq	1(%r14),%r14
583	movq	%rdx,%r10
584
585	mulq	%rbp
586	addq	%rax,%rdi
587	movq	(%rsi),%rax
588	adcq	$0,%rdx
589	addq	%r11,%rdi
590	adcq	$0,%rdx
591	movq	%rdi,-16(%rsp,%r15,8)
592	movq	%rdx,%r13
593
	/* Top word plus the previous carry-out word tp[num]; store the new top word and carry-out. */
594	xorq	%rdi,%rdi
595	addq	%r10,%r13
596	adcq	$0,%rdi
597	addq	(%rsp,%r9,8),%r13
598	adcq	$0,%rdi
599	movq	%r13,-8(%rsp,%r15,8)
600	movq	%rdi,(%rsp,%r15,8)
601
602	cmpq	%r9,%r14
603	jb	.Louter4x
	/*
	 * Final subtraction, four words per iteration: reload the result
	 * pointer into %rdi, %rsi = tp, loop count %r15 = (num-4)/4 (non-zero
	 * for the counts bn_mul_mont routes here).  The subq starts the borrow
	 * chain; the mov, lea and dec instructions in the loop leave CF
	 * untouched.
	 */
604	movq	16(%rsp,%r9,8),%rdi
605	leaq	-4(%r9),%r15
606	movq	0(%rsp),%rax
607	movq	8(%rsp),%rdx
608	shrq	$2,%r15
609	leaq	(%rsp),%rsi
610	xorq	%r14,%r14
611
612	subq	0(%rcx),%rax
613	movq	16(%rsi),%rbx
614	movq	24(%rsi),%rbp
615	sbbq	8(%rcx),%rdx
616
617.Lsub4x:
618	movq	%rax,0(%rdi,%r14,8)
619	movq	%rdx,8(%rdi,%r14,8)
620	sbbq	16(%rcx,%r14,8),%rbx
621	movq	32(%rsi,%r14,8),%rax
622	movq	40(%rsi,%r14,8),%rdx
623	sbbq	24(%rcx,%r14,8),%rbp
624	movq	%rbx,16(%rdi,%r14,8)
625	movq	%rbp,24(%rdi,%r14,8)
626	sbbq	32(%rcx,%r14,8),%rax
627	movq	48(%rsi,%r14,8),%rbx
628	movq	56(%rsi,%r14,8),%rbp
629	sbbq	40(%rcx,%r14,8),%rdx
630	leaq	4(%r14),%r14
631	decq	%r15
632	jnz	.Lsub4x
633
	/* Last four words; %rax is then loaded with the carry-out word tp[num]. */
634	movq	%rax,0(%rdi,%r14,8)
635	movq	32(%rsi,%r14,8),%rax
636	sbbq	16(%rcx,%r14,8),%rbx
637	movq	%rdx,8(%rdi,%r14,8)
638	sbbq	24(%rcx,%r14,8),%rbp
639	movq	%rbx,16(%rdi,%r14,8)
640
	/*
	 * %rax = tp[num] - borrow becomes the select mask.  It is moved to
	 * %xmm4 and broadcast; %xmm5 = ~%xmm4; %xmm0 = 0 is used to wipe tp.
	 */
641	sbbq	$0,%rax
642	movq	%rbp,24(%rdi,%r14,8)
643	pxor	%xmm0,%xmm0
	/* Hand-encoded: movq %rax,%xmm4 */
644.byte	102,72,15,110,224
645	pcmpeqd	%xmm5,%xmm5
646	pshufd	$0,%xmm4,%xmm4
647	movq	%r9,%r15
648	pxor	%xmm4,%xmm5
649	shrq	$2,%r15
	/* %rax is now the byte offset into tp and the result. */
650	xorl	%eax,%eax
651
652	jmp	.Lcopy4x
	/*
	 * Branch-free select, 32 bytes per iteration:
	 * result = (tp & %xmm4) | (result & %xmm5); tp is zeroed as it is read.
	 */
653.align	16
654.Lcopy4x:
655	movdqa	(%rsp,%rax,1),%xmm1
656	movdqu	(%rdi,%rax,1),%xmm2
657	pand	%xmm4,%xmm1
658	pand	%xmm5,%xmm2
659	movdqa	16(%rsp,%rax,1),%xmm3
660	movdqa	%xmm0,(%rsp,%rax,1)
661	por	%xmm2,%xmm1
662	movdqu	16(%rdi,%rax,1),%xmm2
663	movdqu	%xmm1,(%rdi,%rax,1)
664	pand	%xmm4,%xmm3
665	pand	%xmm5,%xmm2
666	movdqa	%xmm0,16(%rsp,%rax,1)
667	por	%xmm2,%xmm3
668	movdqu	%xmm3,16(%rdi,%rax,1)
669	leaq	32(%rax),%rax
670	decq	%r15
671	jnz	.Lcopy4x
	/* %rsi = entry %rsp saved in the prologue; return 1 and reload callee-saved registers. */
672	movq	8(%rsp,%r9,8),%rsi
673.cfi_def_cfa	%rsi, 8
674	movq	$1,%rax
675	movq	-48(%rsi),%r15
676.cfi_restore	%r15
677	movq	-40(%rsi),%r14
678.cfi_restore	%r14
679	movq	-32(%rsi),%r13
680.cfi_restore	%r13
681	movq	-24(%rsi),%r12
682.cfi_restore	%r12
683	movq	-16(%rsi),%rbp
684.cfi_restore	%rbp
685	movq	-8(%rsi),%rbx
686.cfi_restore	%rbx
687	leaq	(%rsi),%rsp
688.cfi_def_cfa_register	%rsp
689.Lmul4x_epilogue:
	/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
690	.byte	0xf3,0xc3
691.cfi_endproc
692.size	bn_mul4x_mont,.-bn_mul4x_mont
693
694
695
/*
 * bn_sqr8x_mont -- squaring wrapper.  bn_mul_mont jumps to .Lsqr8x_enter
 * when both operand pointers are equal and the word count is a multiple
 * of 8.  This routine builds the stack frame, calls bn_sqrx8x_internal or
 * bn_sqr8x_internal (neither is defined in this file) and then performs
 * the final subtraction and masked select into the result.
 *
 * Registers on entry: %rdi result, %rsi operand, %rcx array subtracted at
 * the end (parked in %xmm2 across the call), %r8 pointer to one word,
 * %r9d word count.  Returns 1 in %rax; %rbx, %rbp, %r12-%r15 are saved
 * and restored.
 *
 * NOTE(review): the registers read after the calls (%rax, %rbx, %rbp,
 * %rcx, %r8, %r9, %rdi) are used as left by the internal routines; their
 * exact contract is not visible here and should be checked against the
 * file that defines them.
 */
696.type	bn_sqr8x_mont,@function
697.align	32
698bn_sqr8x_mont:
699.cfi_startproc
700	movq	%rsp,%rax
701.cfi_def_cfa_register	%rax
702.Lsqr8x_enter:
703	pushq	%rbx
704.cfi_offset	%rbx,-16
705	pushq	%rbp
706.cfi_offset	%rbp,-24
707	pushq	%r12
708.cfi_offset	%r12,-32
709	pushq	%r13
710.cfi_offset	%r13,-40
711	pushq	%r14
712.cfi_offset	%r14,-48
713	pushq	%r15
714.cfi_offset	%r15,-56
715.Lsqr8x_prologue:
716
	/* %r10 = 32*num, %r9 = -(8*num); the 32-bit moves also zero-extend the count. */
717	movl	%r9d,%r10d
718	shll	$3,%r9d
719	shlq	$3+2,%r10
720	negq	%r9
721
722
723
724
725
726
	/*
	 * Frame placement.  %r11 = (%rsp - 64 - 16*num - %rsi) mod 4096, i.e.
	 * the distance between the candidate frame and the input modulo 4096;
	 * the frame is shifted down by an amount derived from it.
	 * NOTE(review): presumably this keeps the frame from aliasing the
	 * input at %rsi modulo 4096 -- the intent is not stated in this file.
	 */
727	leaq	-64(%rsp,%r9,2),%r11
728	movq	%rsp,%rbp
	/* %r8 = the single word the fifth argument points to. */
729	movq	(%r8),%r8
730	subq	%rsi,%r11
731	andq	$4095,%r11
732	cmpq	%r11,%r10
733	jb	.Lsqr8x_sp_alt
734	subq	%r11,%rbp
735	leaq	-64(%rbp,%r9,2),%rbp
736	jmp	.Lsqr8x_sp_done
737
738.align	32
739.Lsqr8x_sp_alt:
	/* %r10 = 4096 - 64 - 16*num; shift down by max(%r11 - %r10, 0) (cmovc zeroes a negative difference). */
740	leaq	4096-64(,%r9,2),%r10
741	leaq	-64(%rbp,%r9,2),%rbp
742	subq	%r10,%r11
743	movq	$0,%r10
744	cmovcq	%r10,%r11
745	subq	%r11,%rbp
746.Lsqr8x_sp_done:
	/* Align the frame to 64 bytes, then lower %rsp to it in steps of at most 4096 bytes, reading a word at each step. */
747	andq	$-64,%rbp
748	movq	%rsp,%r11
749	subq	%rbp,%r11
750	andq	$-4096,%r11
751	leaq	(%r11,%rbp,1),%rsp
752	movq	(%rsp),%r10
753	cmpq	%rbp,%rsp
754	ja	.Lsqr8x_page_walk
755	jmp	.Lsqr8x_page_walk_done
756
757.align	16
758.Lsqr8x_page_walk:
759	leaq	-4096(%rsp),%rsp
760	movq	(%rsp),%r10
761	cmpq	%rbp,%rsp
762	ja	.Lsqr8x_page_walk
763.Lsqr8x_page_walk_done:
764
	/* %r10 = -(8*num), %r9 = 8*num. */
765	movq	%r9,%r10
766	negq	%r9
767
	/* Frame slots: 32(%rsp) = word loaded through %r8, 40(%rsp) = entry %rsp. */
768	movq	%r8,32(%rsp)
769	movq	%rax,40(%rsp)
	/* DWARF expression: CFA = *(%rsp + 40) + 8. */
770.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
771.Lsqr8x_body:
772
	/* Hand-encoded: movq %rcx,%xmm2 */
773.byte	102,72,15,110,209
774	pxor	%xmm0,%xmm0
	/* Hand-encoded: movq %rdi,%xmm1 (result pointer kept across the call) */
775.byte	102,72,15,110,207
	/* Hand-encoded: movq %r10,%xmm3 */
776.byte	102,73,15,110,218
	/* Both bits of 0x80100 set in the capability word: use the "x" internal routine. */
777	movl	OPENSSL_ia32cap_P+8(%rip),%eax
778	andl	$0x80100,%eax
779	cmpl	$0x80100,%eax
780	jne	.Lsqr8x_nox
781
782	call	bn_sqrx8x_internal
783
784
785
786
	/* Set up the subtract loop from the routine's outputs: %rbx = source, %r9 = %rdx = byte count in %rcx. */
787	leaq	(%r8,%rcx,1),%rbx
788	movq	%rcx,%r9
789	movq	%rcx,%rdx
	/* Hand-encoded: movq %xmm1,%rdi (restore the result pointer) */
790.byte	102,72,15,126,207
	/* %rcx /= 32 (arithmetic shift): loop count for .Lsqr8x_sub, which counts up to zero. */
791	sarq	$3+2,%rcx
792	jmp	.Lsqr8x_sub
793
794.align	32
795.Lsqr8x_nox:
796	call	bn_sqr8x_internal
797
798
799
800
	/* Same set-up as above, with the source derived from %rdi and the count taken from %r9. */
801	leaq	(%rdi,%r9,1),%rbx
802	movq	%r9,%rcx
803	movq	%r9,%rdx
	/* Hand-encoded: movq %xmm1,%rdi (restore the result pointer) */
804.byte	102,72,15,126,207
805	sarq	$3+2,%rcx
806	jmp	.Lsqr8x_sub
807
	/*
	 * Subtract (%rbp)[] from (%rbx)[] into (%rdi)[], four words per
	 * iteration.  CF on entry is whatever the sarq above left (the last
	 * bit shifted out); lea, mov and inc leave CF untouched, so the borrow
	 * chains across iterations.  %rcx counts up until it reaches zero.
	 */
808.align	32
809.Lsqr8x_sub:
810	movq	0(%rbx),%r12
811	movq	8(%rbx),%r13
812	movq	16(%rbx),%r14
813	movq	24(%rbx),%r15
814	leaq	32(%rbx),%rbx
815	sbbq	0(%rbp),%r12
816	sbbq	8(%rbp),%r13
817	sbbq	16(%rbp),%r14
818	sbbq	24(%rbp),%r15
819	leaq	32(%rbp),%rbp
820	movq	%r12,0(%rdi)
821	movq	%r13,8(%rdi)
822	movq	%r14,16(%rdi)
823	movq	%r15,24(%rdi)
824	leaq	32(%rdi),%rdi
825	incq	%rcx
826	jnz	.Lsqr8x_sub
827
	/* Fold the final borrow into %rax (select mask) and step %rbx / %rdi back by adding %r9. */
828	sbbq	$0,%rax
829	leaq	(%rbx,%r9,1),%rbx
830	leaq	(%rdi,%r9,1),%rdi
831
	/* Hand-encoded: movq %rax,%xmm1 ; then broadcast the mask's low dword. */
832.byte	102,72,15,110,200
833	pxor	%xmm0,%xmm0
834	pshufd	$0,%xmm1,%xmm1
	/* %rsi = entry %rsp saved in the prologue. */
835	movq	40(%rsp),%rsi
836.cfi_def_cfa	%rsi,8
837	jmp	.Lsqr8x_cond_copy
838
	/*
	 * Branch-free select, 32 bytes per iteration:
	 * result = (temp & %xmm1) | (result & (%xmm1 == 0)).  The temporary
	 * words at (%rbx) and the 32 bytes at offset %rdx from them are
	 * zeroed as the loop advances.  %r9 counts up by 32 until zero.
	 */
839.align	32
840.Lsqr8x_cond_copy:
841	movdqa	0(%rbx),%xmm2
842	movdqa	16(%rbx),%xmm3
843	leaq	32(%rbx),%rbx
844	movdqu	0(%rdi),%xmm4
845	movdqu	16(%rdi),%xmm5
846	leaq	32(%rdi),%rdi
847	movdqa	%xmm0,-32(%rbx)
848	movdqa	%xmm0,-16(%rbx)
849	movdqa	%xmm0,-32(%rbx,%rdx,1)
850	movdqa	%xmm0,-16(%rbx,%rdx,1)
	/* %xmm0 (zero) compared with the mask gives the inverted mask. */
851	pcmpeqd	%xmm1,%xmm0
852	pand	%xmm1,%xmm2
853	pand	%xmm1,%xmm3
854	pand	%xmm0,%xmm4
855	pand	%xmm0,%xmm5
	/* Back to zero for the stores at the top of the next iteration. */
856	pxor	%xmm0,%xmm0
857	por	%xmm2,%xmm4
858	por	%xmm3,%xmm5
859	movdqu	%xmm4,-32(%rdi)
860	movdqu	%xmm5,-16(%rdi)
861	addq	$32,%r9
862	jnz	.Lsqr8x_cond_copy
863
	/* Return 1 and reload callee-saved registers from below the entry %rsp. */
864	movq	$1,%rax
865	movq	-48(%rsi),%r15
866.cfi_restore	%r15
867	movq	-40(%rsi),%r14
868.cfi_restore	%r14
869	movq	-32(%rsi),%r13
870.cfi_restore	%r13
871	movq	-24(%rsi),%r12
872.cfi_restore	%r12
873	movq	-16(%rsi),%rbp
874.cfi_restore	%rbp
875	movq	-8(%rsi),%rbx
876.cfi_restore	%rbx
877	leaq	(%rsi),%rsp
878.cfi_def_cfa_register	%rsp
879.Lsqr8x_epilogue:
	/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
880	.byte	0xf3,0xc3
881.cfi_endproc
882.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * bn_mulx4x_mont -- four-words-per-iteration variant built on mulx with
 * two interleaved carry chains (adcx uses CF, adox uses OF).  It is
 * reached through .Lmulx4x_enter from .Lmul4x_enter when both bits of
 * 0x80100 are set in the capability word.
 *
 * Registers on entry: %rdi result, %rsi first operand, %rdx second
 * operand, %rcx array used with the per-row factor, %r8 pointer to one
 * word, %r9d word count.
 *
 * Frame layout, as established by the stores below:
 *    0(%rsp)  8*num (operand length in bytes)
 *    8(%rsp)  pointer to the next word of the second operand
 *   16(%rsp)  end of the second operand (%rdx + 8*num)
 *   24(%rsp)  word loaded through %r8
 *   32(%rsp)  result pointer
 *   40(%rsp)  entry %rsp
 *   48(%rsp)  inner loop count, 8*num/32 - 1
 *   64(%rsp)  start of the temporary vector
 *
 * Returns 1 in %rax; %rbx, %rbp, %r12-%r15 are saved and restored.
 */
883.type	bn_mulx4x_mont,@function
884.align	32
885bn_mulx4x_mont:
886.cfi_startproc
887	movq	%rsp,%rax
888.cfi_def_cfa_register	%rax
889.Lmulx4x_enter:
890	pushq	%rbx
891.cfi_offset	%rbx,-16
892	pushq	%rbp
893.cfi_offset	%rbp,-24
894	pushq	%r12
895.cfi_offset	%r12,-32
896	pushq	%r13
897.cfi_offset	%r13,-40
898	pushq	%r14
899.cfi_offset	%r14,-48
900	pushq	%r15
901.cfi_offset	%r15,-56
902.Lmulx4x_prologue:
903
	/* %r9 = 8*num (the 32-bit shift zero-extends), %r10 = -%r9, %r8 = word loaded through %r8. */
904	shll	$3,%r9d
905	xorq	%r10,%r10
906	subq	%r9,%r10
907	movq	(%r8),%r8
	/* Frame bottom: (%rsp - 72 - 8*num) rounded down to 128 bytes; %rsp is lowered to it in steps of at most 4096 bytes, reading a word at each step. */
908	leaq	-72(%rsp,%r10,1),%rbp
909	andq	$-128,%rbp
910	movq	%rsp,%r11
911	subq	%rbp,%r11
912	andq	$-4096,%r11
913	leaq	(%r11,%rbp,1),%rsp
914	movq	(%rsp),%r10
915	cmpq	%rbp,%rsp
916	ja	.Lmulx4x_page_walk
917	jmp	.Lmulx4x_page_walk_done
918
919.align	16
920.Lmulx4x_page_walk:
921	leaq	-4096(%rsp),%rsp
922	movq	(%rsp),%r10
923	cmpq	%rbp,%rsp
924	ja	.Lmulx4x_page_walk
925.Lmulx4x_page_walk_done:
926
	/* %r10 = end of the second operand. */
927	leaq	(%rdx,%r9,1),%r10
928
929
930
931
932
933
934
935
936
937
938
939
	/* Fill the frame slots listed in the header; %r9 becomes the inner loop count 8*num/32 - 1. */
940	movq	%r9,0(%rsp)
941	shrq	$5,%r9
942	movq	%r10,16(%rsp)
943	subq	$1,%r9
944	movq	%r8,24(%rsp)
945	movq	%rdi,32(%rsp)
946	movq	%rax,40(%rsp)
	/* DWARF expression: CFA = *(%rsp + 40) + 8. */
947.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
948	movq	%r9,48(%rsp)
949	jmp	.Lmulx4x_body
950
951.align	32
952.Lmulx4x_body:
	/* %rdx = %r9 = word 0 of the second operand (mulx multiplies by %rdx); %rbx = temporary + 32. */
953	leaq	8(%rdx),%rdi
954	movq	(%rdx),%rdx
955	leaq	64+32(%rsp),%rbx
956	movq	%rdx,%r9
957
	/* mulxq src,lo,hi: first four product words of row 0. */
958	mulxq	0(%rsi),%r8,%rax
959	mulxq	8(%rsi),%r11,%r14
960	addq	%rax,%r11
961	movq	%rdi,8(%rsp)
962	mulxq	16(%rsi),%r12,%r13
963	adcq	%r14,%r12
964	adcq	$0,%r13
965
	/* %r8 = low 64 bits of (lowest product word * 24(%rsp)): per-row factor.  xorq zeroes %rbp and clears CF and OF. */
966	movq	%r8,%rdi
967	imulq	24(%rsp),%r8
968	xorq	%rbp,%rbp
969
970	mulxq	24(%rsi),%rax,%r14
971	movq	%r8,%rdx
972	leaq	32(%rsi),%rsi
973	adcxq	%rax,%r13
974	adcxq	%rbp,%r14
975
	/* Add factor * (%rcx)[0..3]; the lowest sum word (in %rdi) is not stored. */
976	mulxq	0(%rcx),%rax,%r10
977	adcxq	%rax,%rdi
978	adoxq	%r11,%r10
979	mulxq	8(%rcx),%rax,%r11
980	adcxq	%rax,%r10
981	adoxq	%r12,%r11
	/* Hand-encoded: mulxq 16(%rcx),%rax,%r12 (with a 32-bit displacement) */
982.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
	/* %rdi = inner loop count. */
983	movq	48(%rsp),%rdi
984	movq	%r10,-32(%rbx)
985	adcxq	%rax,%r11
986	adoxq	%r13,%r12
987	mulxq	24(%rcx),%rax,%r15
988	movq	%r9,%rdx
989	movq	%r11,-24(%rbx)
990	adcxq	%rax,%r12
991	adoxq	%rbp,%r15
992	leaq	32(%rcx),%rcx
993	movq	%r12,-16(%rbx)
994
995	jmp	.Lmulx4x_1st
996
	/*
	 * Row 0, remaining columns, four per iteration: products with %r9
	 * (second-operand word) followed by products with %r8 (factor);
	 * %rdx is switched between the two.
	 */
997.align	32
998.Lmulx4x_1st:
999	adcxq	%rbp,%r15
1000	mulxq	0(%rsi),%r10,%rax
1001	adcxq	%r14,%r10
1002	mulxq	8(%rsi),%r11,%r14
1003	adcxq	%rax,%r11
1004	mulxq	16(%rsi),%r12,%rax
1005	adcxq	%r14,%r12
1006	mulxq	24(%rsi),%r13,%r14
	/* Two 0x67 prefix bytes in front of the next movq; presumably code-size padding -- the reason is not stated here. */
1007.byte	0x67,0x67
1008	movq	%r8,%rdx
1009	adcxq	%rax,%r13
1010	adcxq	%rbp,%r14
1011	leaq	32(%rsi),%rsi
1012	leaq	32(%rbx),%rbx
1013
1014	adoxq	%r15,%r10
1015	mulxq	0(%rcx),%rax,%r15
1016	adcxq	%rax,%r10
1017	adoxq	%r15,%r11
1018	mulxq	8(%rcx),%rax,%r15
1019	adcxq	%rax,%r11
1020	adoxq	%r15,%r12
1021	mulxq	16(%rcx),%rax,%r15
1022	movq	%r10,-40(%rbx)
1023	adcxq	%rax,%r12
1024	movq	%r11,-32(%rbx)
1025	adoxq	%r15,%r13
1026	mulxq	24(%rcx),%rax,%r15
1027	movq	%r9,%rdx
1028	movq	%r12,-24(%rbx)
1029	adcxq	%rax,%r13
1030	adoxq	%rbp,%r15
1031	leaq	32(%rcx),%rcx
1032	movq	%r13,-16(%rbx)
1033
1034	decq	%rdi
1035	jnz	.Lmulx4x_1st
1036
	/* Row 0 tail: %rax = byte length, %rdi = next second-operand word pointer; store the top word and keep the carry-out in %r15 as 0 or -1. */
1037	movq	0(%rsp),%rax
1038	movq	8(%rsp),%rdi
1039	adcq	%rbp,%r15
1040	addq	%r15,%r14
1041	sbbq	%r15,%r15
1042	movq	%r14,-8(%rbx)
1043	jmp	.Lmulx4x_outer
1044
	/*
	 * Remaining rows.  Rewind %rsi and %rcx by the byte length, store the
	 * previous carry-out word, reset %rbx and repeat the row computation
	 * with the previous temporary words added in through the adox chain.
	 */
1045.align	32
1046.Lmulx4x_outer:
1047	movq	(%rdi),%rdx
1048	leaq	8(%rdi),%rdi
1049	subq	%rax,%rsi
1050	movq	%r15,(%rbx)
1051	leaq	64+32(%rsp),%rbx
1052	subq	%rax,%rcx
1053
1054	mulxq	0(%rsi),%r8,%r11
	/* xorl zeroes %rbp and clears CF and OF for the two carry chains. */
1055	xorl	%ebp,%ebp
1056	movq	%rdx,%r9
1057	mulxq	8(%rsi),%r14,%r12
1058	adoxq	-32(%rbx),%r8
1059	adcxq	%r14,%r11
1060	mulxq	16(%rsi),%r15,%r13
1061	adoxq	-24(%rbx),%r11
1062	adcxq	%r15,%r12
1063	adoxq	-16(%rbx),%r12
1064	adcxq	%rbp,%r13
1065	adoxq	%rbp,%r13
1066
	/* Save the advanced second-operand pointer; %r8 = low 64 bits of (lowest sum word * 24(%rsp)). */
1067	movq	%rdi,8(%rsp)
1068	movq	%r8,%r15
1069	imulq	24(%rsp),%r8
1070	xorl	%ebp,%ebp
1071
1072	mulxq	24(%rsi),%rax,%r14
1073	movq	%r8,%rdx
1074	adcxq	%rax,%r13
1075	adoxq	-8(%rbx),%r13
1076	adcxq	%rbp,%r14
1077	leaq	32(%rsi),%rsi
1078	adoxq	%rbp,%r14
1079
1080	mulxq	0(%rcx),%rax,%r10
1081	adcxq	%rax,%r15
1082	adoxq	%r11,%r10
1083	mulxq	8(%rcx),%rax,%r11
1084	adcxq	%rax,%r10
1085	adoxq	%r12,%r11
1086	mulxq	16(%rcx),%rax,%r12
1087	movq	%r10,-32(%rbx)
1088	adcxq	%rax,%r11
1089	adoxq	%r13,%r12
1090	mulxq	24(%rcx),%rax,%r15
1091	movq	%r9,%rdx
1092	movq	%r11,-24(%rbx)
1093	leaq	32(%rcx),%rcx
1094	adcxq	%rax,%r12
1095	adoxq	%rbp,%r15
	/* %rdi = inner loop count. */
1096	movq	48(%rsp),%rdi
1097	movq	%r12,-16(%rbx)
1098
1099	jmp	.Lmulx4x_inner
1100
	/* Inner columns, four per iteration: products with %r9 plus the previous temporary words at (%rbx), then products with the factor %r8. */
1101.align	32
1102.Lmulx4x_inner:
1103	mulxq	0(%rsi),%r10,%rax
1104	adcxq	%rbp,%r15
1105	adoxq	%r14,%r10
1106	mulxq	8(%rsi),%r11,%r14
1107	adcxq	0(%rbx),%r10
1108	adoxq	%rax,%r11
1109	mulxq	16(%rsi),%r12,%rax
1110	adcxq	8(%rbx),%r11
1111	adoxq	%r14,%r12
1112	mulxq	24(%rsi),%r13,%r14
1113	movq	%r8,%rdx
1114	adcxq	16(%rbx),%r12
1115	adoxq	%rax,%r13
1116	adcxq	24(%rbx),%r13
1117	adoxq	%rbp,%r14
1118	leaq	32(%rsi),%rsi
1119	leaq	32(%rbx),%rbx
1120	adcxq	%rbp,%r14
1121
1122	adoxq	%r15,%r10
1123	mulxq	0(%rcx),%rax,%r15
1124	adcxq	%rax,%r10
1125	adoxq	%r15,%r11
1126	mulxq	8(%rcx),%rax,%r15
1127	adcxq	%rax,%r11
1128	adoxq	%r15,%r12
1129	mulxq	16(%rcx),%rax,%r15
1130	movq	%r10,-40(%rbx)
1131	adcxq	%rax,%r12
1132	adoxq	%r15,%r13
1133	mulxq	24(%rcx),%rax,%r15
1134	movq	%r9,%rdx
1135	movq	%r11,-32(%rbx)
1136	movq	%r12,-24(%rbx)
1137	adcxq	%rax,%r13
1138	adoxq	%rbp,%r15
1139	leaq	32(%rcx),%rcx
1140	movq	%r13,-16(%rbx)
1141
1142	decq	%rdi
1143	jnz	.Lmulx4x_inner
1144
	/*
	 * Row tail.  "subq 0(%rbx),%rbp" computes 0 - (previous carry-out
	 * word), which sets CF exactly when that word is non-zero; the adcq
	 * then folds it into the top word and sbbq records the new carry-out
	 * in %r15 as 0 or -1.
	 */
1145	movq	0(%rsp),%rax
1146	movq	8(%rsp),%rdi
1147	adcq	%rbp,%r15
1148	subq	0(%rbx),%rbp
1149	adcq	%r15,%r14
1150	sbbq	%r15,%r15
1151	movq	%r14,-8(%rbx)
1152
	/* Loop until the second-operand pointer reaches its end. */
1153	cmpq	16(%rsp),%rdi
1154	jne	.Lmulx4x_outer
1155
	/*
	 * Final subtraction set-up: %rbx = temporary, %rcx rewound to its
	 * start, %r15 = carry-out as 0 or 1, %rdx = byte length, %rax = byte
	 * length / 32 (loop count), %rdi = result pointer.  CF entering the
	 * loop is what the shrq left (the last bit shifted out).
	 */
1156	leaq	64(%rsp),%rbx
1157	subq	%rax,%rcx
1158	negq	%r15
1159	movq	%rax,%rdx
1160	shrq	$3+2,%rax
1161	movq	32(%rsp),%rdi
1162	jmp	.Lmulx4x_sub
1163
	/* result[] = temporary[] - (%rcx)[], four words per iteration; mov, lea and dec leave CF untouched so the borrow chains across iterations. */
1164.align	32
1165.Lmulx4x_sub:
1166	movq	0(%rbx),%r11
1167	movq	8(%rbx),%r12
1168	movq	16(%rbx),%r13
1169	movq	24(%rbx),%r14
1170	leaq	32(%rbx),%rbx
1171	sbbq	0(%rcx),%r11
1172	sbbq	8(%rcx),%r12
1173	sbbq	16(%rcx),%r13
1174	sbbq	24(%rcx),%r14
1175	leaq	32(%rcx),%rcx
1176	movq	%r11,0(%rdi)
1177	movq	%r12,8(%rdi)
1178	movq	%r13,16(%rdi)
1179	movq	%r14,24(%rdi)
1180	leaq	32(%rdi),%rdi
1181	decq	%rax
1182	jnz	.Lmulx4x_sub
1183
	/* %r15 = carry-out - borrow becomes the select mask; rewind %rbx and %rdi to the start. */
1184	sbbq	$0,%r15
1185	leaq	64(%rsp),%rbx
1186	subq	%rdx,%rdi
1187
	/* Hand-encoded: movq %r15,%xmm1 ; then broadcast the mask's low dword. */
1188.byte	102,73,15,110,207
1189	pxor	%xmm0,%xmm0
1190	pshufd	$0,%xmm1,%xmm1
	/* %rsi = entry %rsp saved in the prologue. */
1191	movq	40(%rsp),%rsi
1192.cfi_def_cfa	%rsi,8
1193	jmp	.Lmulx4x_cond_copy
1194
	/*
	 * Branch-free select, 32 bytes per iteration:
	 * result = (temp & %xmm1) | (result & (%xmm1 == 0)); the temporary is
	 * zeroed as it is read.  %rdx counts the byte length down to zero.
	 */
1195.align	32
1196.Lmulx4x_cond_copy:
1197	movdqa	0(%rbx),%xmm2
1198	movdqa	16(%rbx),%xmm3
1199	leaq	32(%rbx),%rbx
1200	movdqu	0(%rdi),%xmm4
1201	movdqu	16(%rdi),%xmm5
1202	leaq	32(%rdi),%rdi
1203	movdqa	%xmm0,-32(%rbx)
1204	movdqa	%xmm0,-16(%rbx)
	/* %xmm0 (zero) compared with the mask gives the inverted mask. */
1205	pcmpeqd	%xmm1,%xmm0
1206	pand	%xmm1,%xmm2
1207	pand	%xmm1,%xmm3
1208	pand	%xmm0,%xmm4
1209	pand	%xmm0,%xmm5
	/* Back to zero for the stores at the top of the next iteration. */
1210	pxor	%xmm0,%xmm0
1211	por	%xmm2,%xmm4
1212	por	%xmm3,%xmm5
1213	movdqu	%xmm4,-32(%rdi)
1214	movdqu	%xmm5,-16(%rdi)
1215	subq	$32,%rdx
1216	jnz	.Lmulx4x_cond_copy
1217
	/* %rdx is zero here: clear the word just past the copied temporary (the carry-out slot). */
1218	movq	%rdx,(%rbx)
1219
	/* Return 1 and reload callee-saved registers from below the entry %rsp. */
1220	movq	$1,%rax
1221	movq	-48(%rsi),%r15
1222.cfi_restore	%r15
1223	movq	-40(%rsi),%r14
1224.cfi_restore	%r14
1225	movq	-32(%rsi),%r13
1226.cfi_restore	%r13
1227	movq	-24(%rsi),%r12
1228.cfi_restore	%r12
1229	movq	-16(%rsi),%rbp
1230.cfi_restore	%rbp
1231	movq	-8(%rsi),%rbx
1232.cfi_restore	%rbx
1233	leaq	(%rsi),%rsp
1234.cfi_def_cfa_register	%rsp
1235.Lmulx4x_epilogue:
	/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
1236	.byte	0xf3,0xc3
1237.cfi_endproc
1238.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/*
 * Identification string, NUL-terminated ASCII.  The bytes decode to
 * "Montgomery Multiplication for x86_64, CRYPTOGAMS by" followed by the
 * author's e-mail address in angle brackets.
 */
1239.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1240.align	16
/*
 * ELF note in section ".note.gnu.property".  As laid out below: name size
 * (1f - 0f), descriptor size (4f - 1f), note type 5, the name "GNU" plus
 * NUL, then one property record made of type 0xc0000002, data size
 * (3f - 2f) and the 32-bit value 3, padded to an 8-byte boundary.
 *
 * NOTE(review): in the x86-64 psABI these numbers appear to correspond to
 * NT_GNU_PROPERTY_TYPE_0, GNU_PROPERTY_X86_FEATURE_1_AND and the
 * IBT | SHSTK bits -- confirm against the psABI.  If so, note that none
 * of the entry points in this file begin with endbr64; check whether
 * bn_mul_mont can be reached through an indirect call on IBT-enforcing
 * systems.  Any change belongs in the generator script, not in this
 * generated file.
 */
1241	.section ".note.gnu.property", "a"
1242	.p2align 3
1243	.long 1f - 0f
1244	.long 4f - 1f
1245	.long 5
12460:
1247	# "GNU" encoded with .byte, since .asciz isn't supported
1248	# on Solaris.
1249	.byte 0x47
1250	.byte 0x4e
1251	.byte 0x55
1252	.byte 0
12531:
1254	.p2align 3
1255	.long 0xc0000002
1256	.long 3f - 2f
12572:
1258	.long 3
12593:
1260	.p2align 3
12614:
12614:
1262