/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */
.text



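/*
 * rsaz_512_sqr: cnt back-to-back 512-bit Montgomery squarings.
 * Assumed C prototype, not part of this file (cf. OpenSSL's rsaz_exp.h):
 *   void rsaz_512_sqr(uint64_t ret[8], const uint64_t a[8],
 *                     const uint64_t m[8], uint64_t m0, int cnt);
 * Dispatches on OPENSSL_ia32cap_P: mask 0x80100 selects the BMI2+ADX
 * path (.Loop_sqrx), otherwise the plain MULQ path (.Loop_sqr) runs.
 */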
.globl	rsaz_512_sqr
.type	rsaz_512_sqr,@function
.align	32
rsaz_512_sqr:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lsqr_body:
.byte	102,72,15,110,202
	movq	(%rsi),%rdx
	movq	8(%rsi),%rax
	movq	%rcx,128(%rsp)
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Loop_sqrx
	jmp	.Loop_sqr

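/*
 * MULQ squaring loop: each iteration forms the full 8x8-limb square in
 * the 128-byte scratch area at (%rsp), reduces it via __rsaz_512_reduce,
 * folds in the high half and finishes with __rsaz_512_subtract.
 */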
.align	32
.Loop_sqr:
	movl	%r8d,128+8(%rsp)

	movq	%rdx,%rbx
	movq	%rax,%rbp
	mulq	%rdx
	movq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	%rbx,%rax
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r8,%r8
	movq	%rdx,%r15
	adcq	$0,%rcx

	mulq	%rax
	addq	%r8,%rdx
	adcq	$0,%rcx

	movq	%rax,(%rsp)
	movq	%rdx,8(%rsp)


	movq	16(%rsi),%rax
	mulq	%rbp
	addq	%rax,%r10
	movq	24(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r12
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r13
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r14
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%rbp
	addq	%rax,%r15
	movq	%rbp,%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	adcq	$0,%rdx

	xorq	%rbx,%rbx
	addq	%r9,%r9
	movq	%rdx,%r8
	adcq	%r10,%r10
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	movq	16(%rsi),%rbp
	addq	%rax,%r9
	movq	24(%rsi),%rax
	adcq	%rdx,%r10
	adcq	$0,%rbx

	movq	%r9,16(%rsp)
	movq	%r10,24(%rsp)


	mulq	%rbp
	addq	%rax,%r12
	movq	32(%rsi),%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r13
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r13
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r14
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r14
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r15
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r15
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%rbp
	addq	%rax,%r8
	movq	%rbp,%rax
	adcq	$0,%rdx
	addq	%rcx,%r8
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r11,%r11
	movq	%rdx,%r9
	adcq	%r12,%r12
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	movq	24(%rsi),%r10
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	%rdx,%r12
	adcq	$0,%rcx

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)


	movq	%rax,%r11
	mulq	%r10
	addq	%rax,%r14
	movq	40(%rsi),%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	movq	%rax,%r12
	mulq	%r10
	addq	%rax,%r15
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r15
	movq	%rdx,%rbx
	adcq	$0,%rbx

	movq	%rax,%rbp
	mulq	%r10
	addq	%rax,%r8
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rbx,%r8
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r10
	addq	%rax,%r9
	movq	%r10,%rax
	adcq	$0,%rdx
	addq	%rbx,%r9
	adcq	$0,%rdx

	xorq	%rbx,%rbx
	addq	%r13,%r13
	movq	%rdx,%r10
	adcq	%r14,%r14
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%rax,%r13
	movq	%r12,%rax
	adcq	%rdx,%r14
	adcq	$0,%rbx

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)


	mulq	%r11
	addq	%rax,%r8
	movq	%rbp,%rax
	movq	%rdx,%rcx
	adcq	$0,%rcx

	mulq	%r11
	addq	%rax,%r9
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%rcx,%r9
	movq	%rdx,%rcx
	adcq	$0,%rcx

	movq	%rax,%r14
	mulq	%r11
	addq	%rax,%r10
	movq	%r11,%rax
	adcq	$0,%rdx
	addq	%rcx,%r10
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r15,%r15
	movq	%rdx,%r11
	adcq	%r8,%r8
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	addq	%rax,%r15
	movq	%rbp,%rax
	adcq	%rdx,%r8
	adcq	$0,%rcx

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)


	mulq	%r12
	addq	%rax,%r10
	movq	%r14,%rax
	movq	%rdx,%rbx
	adcq	$0,%rbx

	mulq	%r12
	addq	%rax,%r11
	movq	%r12,%rax
	adcq	$0,%rdx
	addq	%rbx,%r11
	adcq	$0,%rdx

	xorq	%rbx,%rbx
	addq	%r9,%r9
	movq	%rdx,%r12
	adcq	%r10,%r10
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%rax,%r9
	movq	%r14,%rax
	adcq	%rdx,%r10
	adcq	$0,%rbx

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)


	mulq	%rbp
	addq	%rax,%r12
	movq	%rbp,%rax
	adcq	$0,%rdx

	xorq	%rcx,%rcx
	addq	%r11,%r11
	movq	%rdx,%r13
	adcq	%r12,%r12
	adcq	$0,%rcx

	mulq	%rax

	addq	%rbx,%rax
	addq	%rax,%r11
	movq	%r14,%rax
	adcq	%rdx,%r12
	adcq	$0,%rcx

	movq	%r11,96(%rsp)
	movq	%r12,104(%rsp)


	xorq	%rbx,%rbx
	addq	%r13,%r13
	adcq	$0,%rbx

	mulq	%rax

	addq	%rcx,%rax
	addq	%r13,%rax
	adcq	%rbx,%rdx

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15
.byte	102,72,15,126,205

	movq	%rax,112(%rsp)
	movq	%rdx,120(%rsp)

	call	__rsaz_512_reduce

	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqr
	jmp	.Lsqr_tail

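/*
 * BMI2/ADX squaring loop: same structure as .Loop_sqr, but built on
 * MULX with the ADCX/ADOX dual carry chains and reduced through
 * __rsaz_512_reducex.
 */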
.align	32
.Loop_sqrx:
	movl	%r8d,128+8(%rsp)
.byte	102,72,15,110,199

	mulxq	%rax,%r8,%r9
	movq	%rax,%rbx

	mulxq	16(%rsi),%rcx,%r10
	xorq	%rbp,%rbp

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rcx,%r9

.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
	adcxq	%rax,%r10

.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
	adcxq	%rcx,%r11

	mulxq	48(%rsi),%rcx,%r14
	adcxq	%rax,%r12
	adcxq	%rcx,%r13

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rax,%r14
	adcxq	%rbp,%r15

	mulxq	%rdx,%rax,%rdi
	movq	%rbx,%rdx
	xorq	%rcx,%rcx
	adoxq	%r8,%r8
	adcxq	%rdi,%r8
	adoxq	%rbp,%rcx
	adcxq	%rbp,%rcx

	movq	%rax,(%rsp)
	movq	%r8,8(%rsp)


.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

	mulxq	24(%rsi),%rdi,%r8
	adoxq	%rdi,%r11
.byte	0x66
	adcxq	%r8,%r12

	mulxq	32(%rsi),%rax,%rbx
	adoxq	%rax,%r12
	adcxq	%rbx,%r13

	mulxq	40(%rsi),%rdi,%r8
	adoxq	%rdi,%r13
	adcxq	%r8,%r14

.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
	adoxq	%rdi,%r15
	adcxq	%rbp,%r8
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r8
.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00

	xorq	%rbx,%rbx
	adoxq	%r9,%r9

	adcxq	%rcx,%rax
	adoxq	%r10,%r10
	adcxq	%rax,%r9
	adoxq	%rbp,%rbx
	adcxq	%rdi,%r10
	adcxq	%rbp,%rbx

	movq	%r9,16(%rsp)
.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00


	mulxq	24(%rsi),%rdi,%r9
	adoxq	%rdi,%r12
	adcxq	%r9,%r13

	mulxq	32(%rsi),%rax,%rcx
	adoxq	%rax,%r13
	adcxq	%rcx,%r14

.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
	adoxq	%rdi,%r14
	adcxq	%r9,%r15

.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
	adoxq	%rax,%r15
	adcxq	%rcx,%r8

	mulxq	56(%rsi),%rdi,%r9
	adoxq	%rdi,%r8
	adcxq	%rbp,%r9
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r9
	movq	24(%rsi),%rdx

	xorq	%rcx,%rcx
	adoxq	%r11,%r11

	adcxq	%rbx,%rax
	adoxq	%r12,%r12
	adcxq	%rax,%r11
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r12
	adcxq	%rbp,%rcx

	movq	%r11,32(%rsp)
	movq	%r12,40(%rsp)


	mulxq	32(%rsi),%rax,%rbx
	adoxq	%rax,%r14
	adcxq	%rbx,%r15

	mulxq	40(%rsi),%rdi,%r10
	adoxq	%rdi,%r15
	adcxq	%r10,%r8

	mulxq	48(%rsi),%rax,%rbx
	adoxq	%rax,%r8
	adcxq	%rbx,%r9

	mulxq	56(%rsi),%rdi,%r10
	adoxq	%rdi,%r9
	adcxq	%rbp,%r10
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r10
	movq	32(%rsi),%rdx

	xorq	%rbx,%rbx
	adoxq	%r13,%r13

	adcxq	%rcx,%rax
	adoxq	%r14,%r14
	adcxq	%rax,%r13
	adoxq	%rbp,%rbx
	adcxq	%rdi,%r14
	adcxq	%rbp,%rbx

	movq	%r13,48(%rsp)
	movq	%r14,56(%rsp)


	mulxq	40(%rsi),%rdi,%r11
	adoxq	%rdi,%r8
	adcxq	%r11,%r9

	mulxq	48(%rsi),%rax,%rcx
	adoxq	%rax,%r9
	adcxq	%rcx,%r10

	mulxq	56(%rsi),%rdi,%r11
	adoxq	%rdi,%r10
	adcxq	%rbp,%r11
	mulxq	%rdx,%rax,%rdi
	movq	40(%rsi),%rdx
	adoxq	%rbp,%r11

	xorq	%rcx,%rcx
	adoxq	%r15,%r15

	adcxq	%rbx,%rax
	adoxq	%r8,%r8
	adcxq	%rax,%r15
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r8
	adcxq	%rbp,%rcx

	movq	%r15,64(%rsp)
	movq	%r8,72(%rsp)


.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
	adoxq	%rax,%r10
	adcxq	%rbx,%r11

.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
	adoxq	%rdi,%r11
	adcxq	%rbp,%r12
	mulxq	%rdx,%rax,%rdi
	adoxq	%rbp,%r12
	movq	48(%rsi),%rdx

	xorq	%rbx,%rbx
	adoxq	%r9,%r9

	adcxq	%rcx,%rax
	adoxq	%r10,%r10
	adcxq	%rax,%r9
	adcxq	%rdi,%r10
	adoxq	%rbp,%rbx
	adcxq	%rbp,%rbx

	movq	%r9,80(%rsp)
	movq	%r10,88(%rsp)


.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
	adoxq	%rax,%r12
	adoxq	%rbp,%r13

	mulxq	%rdx,%rax,%rdi
	xorq	%rcx,%rcx
	movq	56(%rsi),%rdx
	adoxq	%r11,%r11

	adcxq	%rbx,%rax
	adoxq	%r12,%r12
	adcxq	%rax,%r11
	adoxq	%rbp,%rcx
	adcxq	%rdi,%r12
	adcxq	%rbp,%rcx

.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00


	mulxq	%rdx,%rax,%rdx
	xorq	%rbx,%rbx
	adoxq	%r13,%r13

	adcxq	%rcx,%rax
	adoxq	%rbp,%rbx
	adcxq	%r13,%rax
	adcxq	%rdx,%rbx

.byte	102,72,15,126,199
.byte	102,72,15,126,205

	movq	128(%rsp),%rdx
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	movq	%rax,112(%rsp)
	movq	%rbx,120(%rsp)

	call	__rsaz_512_reducex

	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	movq	%r8,%rdx
	movq	%r9,%rax
	movl	128+8(%rsp),%r8d
	movq	%rdi,%rsi

	decl	%r8d
	jnz	.Loop_sqrx

.Lsqr_tail:

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lsqr_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	rsaz_512_sqr,.-rsaz_512_sqr
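
/*
 * rsaz_512_mul: one 512-bit Montgomery multiplication.
 * Assumed C prototype (cf. rsaz_exp.h):
 *   void rsaz_512_mul(uint64_t ret[8], const uint64_t a[8],
 *                     const uint64_t b[8], const uint64_t m[8],
 *                     uint64_t m0);
 */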
.globl	rsaz_512_mul
.type	rsaz_512_mul,@function
.align	32
rsaz_512_mul:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_body:
.byte	102,72,15,110,199
.byte	102,72,15,110,201
	movq	%r8,128(%rsp)
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx
	movq	(%rdx),%rbx
	movq	%rdx,%rbp
	call	__rsaz_512_mul

.byte	102,72,15,126,199
.byte	102,72,15,126,205

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_tail

.align	32
.Lmulx:
	movq	%rdx,%rbp
	movq	(%rdx),%rdx
	call	__rsaz_512_mulx

.byte	102,72,15,126,199
.byte	102,72,15,126,205

	movq	128(%rsp),%rdx
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex
.Lmul_tail:
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	rsaz_512_mul,.-rsaz_512_mul
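
/*
 * rsaz_512_mul_gather4: Montgomery multiplication by table entry pwr.
 * The entry is fetched with an SSE2 masked gather that touches every
 * table line, so the memory access pattern is independent of pwr.
 * Assumed C prototype (cf. rsaz_exp.h):
 *   void rsaz_512_mul_gather4(uint64_t ret[8], const uint64_t a[8],
 *                             const uint64_t *tbl, const uint64_t m[8],
 *                             uint64_t m0, int pwr);
 */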
.globl	rsaz_512_mul_gather4
.type	rsaz_512_mul_gather4,@function
.align	32
rsaz_512_mul_gather4:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$152,%rsp
.cfi_adjust_cfa_offset	152
.Lmul_gather4_body:
	movd	%r9d,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7

	movdqa	0(%rdx),%xmm8
	movdqa	16(%rdx),%xmm9
	movdqa	32(%rdx),%xmm10
	movdqa	48(%rdx),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rdx),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rdx),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rdx),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rdx),%xmm15
	leaq	128(%rdx),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_gather
.byte	102,76,15,126,195

	movq	%r8,128(%rsp)
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	movq	(%rsi),%rax
	movq	8(%rsi),%rcx
	mulq	%rbx
	movq	%rax,(%rsp)
	movq	%rcx,%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rsp),%rdi
	movl	$7,%ecx
	jmp	.Loop_mul_gather

.align	32
.Loop_mul_gather:
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,195

	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul_gather

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	128+8(%rsp),%rdi
	movq	128+16(%rsp),%rbp

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_gather_tail

.align	32
.Lmulx_gather:
.byte	102,76,15,126,194

	movq	%r8,128(%rsp)
	movq	%rdi,128+8(%rsp)
	movq	%rcx,128+16(%rsp)

	mulxq	(%rsi),%rbx,%r8
	movq	%rbx,(%rsp)
	xorl	%edi,%edi

	mulxq	8(%rsi),%rax,%r9

	mulxq	16(%rsi),%rbx,%r10
	adcxq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcxq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcxq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	adcxq	%rbx,%r13
	adcxq	%rax,%r14
.byte	0x67
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	movq	$-7,%rcx
	jmp	.Loop_mulx_gather

.align	32
.Loop_mulx_gather:
	movdqa	0(%rbp),%xmm8
	movdqa	16(%rbp),%xmm9
	movdqa	32(%rbp),%xmm10
	movdqa	48(%rbp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rbp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rbp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rbp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rbp),%xmm15
	leaq	128(%rbp),%rbp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
.byte	102,76,15,126,194

.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
	adcxq	%rax,%r13
.byte	0x67
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	%rbx,64(%rsp,%rcx,8)
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	movq	%r8,%rbx
	adcxq	%rdi,%r15

	incq	%rcx
	jnz	.Loop_mulx_gather

	movq	%r8,64(%rsp)
	movq	%r9,64+8(%rsp)
	movq	%r10,64+16(%rsp)
	movq	%r11,64+24(%rsp)
	movq	%r12,64+32(%rsp)
	movq	%r13,64+40(%rsp)
	movq	%r14,64+48(%rsp)
	movq	%r15,64+56(%rsp)

	movq	128(%rsp),%rdx
	movq	128+8(%rsp),%rdi
	movq	128+16(%rsp),%rbp

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_gather_tail:
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_gather4_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
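
/*
 * rsaz_512_mul_scatter4: Montgomery multiplication whose result is
 * also scattered into table slot pwr at a 128-byte stride.
 * Assumed C prototype (cf. rsaz_exp.h):
 *   void rsaz_512_mul_scatter4(uint64_t ret[8], const uint64_t a[8],
 *                              const uint64_t m[8], uint64_t m0,
 *                              uint64_t *tbl, int pwr);
 */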
.globl	rsaz_512_mul_scatter4
.type	rsaz_512_mul_scatter4,@function
.align	32
rsaz_512_mul_scatter4:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	movl	%r9d,%r9d
	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_scatter4_body:
	leaq	(%r8,%r9,8),%r8
.byte	102,72,15,110,199
.byte	102,72,15,110,202
.byte	102,73,15,110,208
	movq	%rcx,128(%rsp)

	movq	%rdi,%rbp
	movl	$0x80100,%r11d
	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx_scatter
	movq	(%rdi),%rbx
	call	__rsaz_512_mul

.byte	102,72,15,126,199
.byte	102,72,15,126,205

	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reduce
	jmp	.Lmul_scatter_tail

.align	32
.Lmulx_scatter:
	movq	(%rdi),%rdx
	call	__rsaz_512_mulx

.byte	102,72,15,126,199
.byte	102,72,15,126,205

	movq	128(%rsp),%rdx
	movq	(%rsp),%r8
	movq	8(%rsp),%r9
	movq	16(%rsp),%r10
	movq	24(%rsp),%r11
	movq	32(%rsp),%r12
	movq	40(%rsp),%r13
	movq	48(%rsp),%r14
	movq	56(%rsp),%r15

	call	__rsaz_512_reducex

.Lmul_scatter_tail:
	addq	64(%rsp),%r8
	adcq	72(%rsp),%r9
	adcq	80(%rsp),%r10
	adcq	88(%rsp),%r11
	adcq	96(%rsp),%r12
	adcq	104(%rsp),%r13
	adcq	112(%rsp),%r14
	adcq	120(%rsp),%r15
.byte	102,72,15,126,214
	sbbq	%rcx,%rcx

	call	__rsaz_512_subtract

	movq	%r8,0(%rsi)
	movq	%r9,128(%rsi)
	movq	%r10,256(%rsi)
	movq	%r11,384(%rsi)
	movq	%r12,512(%rsi)
	movq	%r13,640(%rsi)
	movq	%r14,768(%rsi)
	movq	%r15,896(%rsi)

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_scatter4_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
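
/*
 * rsaz_512_mul_by_one: Montgomery reduction of a, i.e. conversion out
 * of Montgomery form.  Assumed C prototype (cf. rsaz_exp.h):
 *   void rsaz_512_mul_by_one(uint64_t ret[8], const uint64_t a[8],
 *                            const uint64_t m[8], uint64_t m0);
 */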
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,@function
.align	32
rsaz_512_mul_by_one:
.cfi_startproc
	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56

	subq	$128+24,%rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
	movq	%rdx,%rbp
	movq	%rcx,128(%rsp)

	movq	(%rsi),%r8
	pxor	%xmm0,%xmm0
	movq	8(%rsi),%r9
	movq	16(%rsi),%r10
	movq	24(%rsi),%r11
	movq	32(%rsi),%r12
	movq	40(%rsi),%r13
	movq	48(%rsi),%r14
	movq	56(%rsi),%r15

	movdqa	%xmm0,(%rsp)
	movdqa	%xmm0,16(%rsp)
	movdqa	%xmm0,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	movdqa	%xmm0,64(%rsp)
	movdqa	%xmm0,80(%rsp)
	movdqa	%xmm0,96(%rsp)
	andl	$0x80100,%eax
	cmpl	$0x80100,%eax
	je	.Lby_one_callx
	call	__rsaz_512_reduce
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp),%rdx
	call	__rsaz_512_reducex
.Lby_one_tail:
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	leaq	128+24+48(%rsp),%rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax),%r15
.cfi_restore	%r15
	movq	-40(%rax),%r14
.cfi_restore	%r14
	movq	-32(%rax),%r13
.cfi_restore	%r13
	movq	-24(%rax),%r12
.cfi_restore	%r12
	movq	-16(%rax),%rbp
.cfi_restore	%rbp
	movq	-8(%rax),%rbx
.cfi_restore	%rbx
	leaq	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
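
/*
 * __rsaz_512_reduce: 8-round Montgomery reduction, MULQ flavour.
 * Expects the low product half in r8-r15, the modulus at (%rbp) and
 * m0 (-1/m mod 2^64) at the caller's 128(%rsp), addressed here as
 * 128+8(%rsp) past the return address.
 */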
.type	__rsaz_512_reduce,@function
.align	32
__rsaz_512_reduce:
.cfi_startproc
	movq	%r8,%rbx
	imulq	128+8(%rsp),%rbx
	movq	0(%rbp),%rax
	movl	$8,%ecx
	jmp	.Lreduction_loop

.align	32
.Lreduction_loop:
	mulq	%rbx
	movq	8(%rbp),%rax
	negq	%r8
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rbp),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rbp),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rbp),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	128+8(%rsp),%rsi


	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rbp),%rax
	adcq	$0,%rdx
	imulq	%r8,%rsi
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rbp),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rbp),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	movq	%rsi,%rbx
	addq	%rax,%r15
	movq	0(%rbp),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	decl	%ecx
	jne	.Lreduction_loop

	.byte	0xf3,0xc3
.cfi_endproc
.size	__rsaz_512_reduce,.-__rsaz_512_reduce
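
/*
 * __rsaz_512_reducex: the same reduction using MULX/ADCX/ADOX; m0 is
 * passed in %rdx rather than on the stack.
 */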
.type	__rsaz_512_reducex,@function
.align	32
__rsaz_512_reducex:
.cfi_startproc

	imulq	%r8,%rdx
	xorq	%rsi,%rsi
	movl	$8,%ecx
	jmp	.Lreduction_loopx

.align	32
.Lreduction_loopx:
	movq	%r8,%rbx
	mulxq	0(%rbp),%rax,%r8
	adcxq	%rbx,%rax
	adoxq	%r9,%r8

	mulxq	8(%rbp),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rbp),%rbx,%r10
	adcxq	%rbx,%r9
	adoxq	%r11,%r10

	mulxq	24(%rbp),%rbx,%r11
	adcxq	%rbx,%r10
	adoxq	%r12,%r11

.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
	movq	%rdx,%rax
	movq	%r8,%rdx
	adcxq	%rbx,%r11
	adoxq	%r13,%r12

	mulxq	128+8(%rsp),%rbx,%rdx
	movq	%rax,%rdx

	mulxq	40(%rbp),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rbp),%rax,%r15
	movq	%rbx,%rdx
	adcxq	%rax,%r14
	adoxq	%rsi,%r15
	adcxq	%rsi,%r15

	decl	%ecx
	jne	.Lreduction_loopx

	.byte	0xf3,0xc3
.cfi_endproc
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
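
/*
 * __rsaz_512_subtract: branch-free final subtraction.  %rcx is an
 * all-ones or all-zero mask produced by the caller's sbbq; the negated
 * modulus is ANDed with it and added, so m is subtracted exactly when
 * the mask is set.
 */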
.type	__rsaz_512_subtract,@function
.align	32
__rsaz_512_subtract:
.cfi_startproc
	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	movq	0(%rbp),%r8
	movq	8(%rbp),%r9
	negq	%r8
	notq	%r9
	andq	%rcx,%r8
	movq	16(%rbp),%r10
	andq	%rcx,%r9
	notq	%r10
	movq	24(%rbp),%r11
	andq	%rcx,%r10
	notq	%r11
	movq	32(%rbp),%r12
	andq	%rcx,%r11
	notq	%r12
	movq	40(%rbp),%r13
	andq	%rcx,%r12
	notq	%r13
	movq	48(%rbp),%r14
	andq	%rcx,%r13
	notq	%r14
	movq	56(%rbp),%r15
	andq	%rcx,%r14
	notq	%r15
	andq	%rcx,%r15

	addq	(%rdi),%r8
	adcq	8(%rdi),%r9
	adcq	16(%rdi),%r10
	adcq	24(%rdi),%r11
	adcq	32(%rdi),%r12
	adcq	40(%rdi),%r13
	adcq	48(%rdi),%r14
	adcq	56(%rdi),%r15

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
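
/*
 * __rsaz_512_mul: schoolbook 8x8-limb MULQ multiply of (%rsi) by the
 * limbs at (%rbp); the 1024-bit product lands in the caller's stack
 * scratch area (8(%rsp) here, past the return address).
 */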
.type	__rsaz_512_mul,@function
.align	32
__rsaz_512_mul:
.cfi_startproc
	leaq	8(%rsp),%rdi

	movq	(%rsi),%rax
	mulq	%rbx
	movq	%rax,(%rdi)
	movq	8(%rsi),%rax
	movq	%rdx,%r8

	mulq	%rbx
	addq	%rax,%r8
	movq	16(%rsi),%rax
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r9
	movq	24(%rsi),%rax
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r10
	movq	32(%rsi),%rax
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r11
	movq	40(%rsi),%rax
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r12
	movq	48(%rsi),%rax
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r13
	movq	56(%rsi),%rax
	movq	%rdx,%r14
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r14
	movq	(%rsi),%rax
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rbp),%rbp
	leaq	8(%rdi),%rdi

	movl	$7,%ecx
	jmp	.Loop_mul

.align	32
.Loop_mul:
	movq	(%rbp),%rbx
	mulq	%rbx
	addq	%rax,%r8
	movq	8(%rsi),%rax
	movq	%r8,(%rdi)
	movq	%rdx,%r8
	adcq	$0,%r8

	mulq	%rbx
	addq	%rax,%r9
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r9,%r8
	movq	%rdx,%r9
	adcq	$0,%r9

	mulq	%rbx
	addq	%rax,%r10
	movq	24(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r9
	movq	%rdx,%r10
	adcq	$0,%r10

	mulq	%rbx
	addq	%rax,%r11
	movq	32(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11

	mulq	%rbx
	addq	%rax,%r12
	movq	40(%rsi),%rax
	adcq	$0,%rdx
	addq	%r12,%r11
	movq	%rdx,%r12
	adcq	$0,%r12

	mulq	%rbx
	addq	%rax,%r13
	movq	48(%rsi),%rax
	adcq	$0,%rdx
	addq	%r13,%r12
	movq	%rdx,%r13
	adcq	$0,%r13

	mulq	%rbx
	addq	%rax,%r14
	movq	56(%rsi),%rax
	adcq	$0,%rdx
	addq	%r14,%r13
	movq	%rdx,%r14
	leaq	8(%rbp),%rbp
	adcq	$0,%r14

	mulq	%rbx
	addq	%rax,%r15
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r15,%r14
	movq	%rdx,%r15
	adcq	$0,%r15

	leaq	8(%rdi),%rdi

	decl	%ecx
	jnz	.Loop_mul

	movq	%r8,(%rdi)
	movq	%r9,8(%rdi)
	movq	%r10,16(%rdi)
	movq	%r11,24(%rdi)
	movq	%r12,32(%rdi)
	movq	%r13,40(%rdi)
	movq	%r14,48(%rdi)
	movq	%r15,56(%rdi)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__rsaz_512_mul,.-__rsaz_512_mul
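
/*
 * __rsaz_512_mulx: the multiply above rebuilt on MULX with ADCX/ADOX
 * carry chains; the first limb of b arrives in %rdx.
 */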
.type	__rsaz_512_mulx,@function
.align	32
__rsaz_512_mulx:
.cfi_startproc
	mulxq	(%rsi),%rbx,%r8
	movq	$-6,%rcx

	mulxq	8(%rsi),%rax,%r9
	movq	%rbx,8(%rsp)

	mulxq	16(%rsi),%rbx,%r10
	adcq	%rax,%r8

	mulxq	24(%rsi),%rax,%r11
	adcq	%rbx,%r9

	mulxq	32(%rsi),%rbx,%r12
	adcq	%rax,%r10

	mulxq	40(%rsi),%rax,%r13
	adcq	%rbx,%r11

	mulxq	48(%rsi),%rbx,%r14
	adcq	%rax,%r12

	mulxq	56(%rsi),%rax,%r15
	movq	8(%rbp),%rdx
	adcq	%rbx,%r13
	adcq	%rax,%r14
	adcq	$0,%r15

	xorq	%rdi,%rdi
	jmp	.Loop_mulx

.align	32
.Loop_mulx:
	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

	mulxq	8(%rsi),%rax,%r9
	adcxq	%rax,%r8
	adoxq	%r10,%r9

	mulxq	16(%rsi),%rax,%r10
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

	mulxq	48(%rsi),%rax,%r14
	adcxq	%rax,%r13
	adoxq	%r15,%r14

	mulxq	56(%rsi),%rax,%r15
	movq	64(%rbp,%rcx,8),%rdx
	movq	%rbx,8+64-8(%rsp,%rcx,8)
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

	incq	%rcx
	jnz	.Loop_mulx

	movq	%r8,%rbx
	mulxq	(%rsi),%rax,%r8
	adcxq	%rax,%rbx
	adoxq	%r9,%r8

.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
	adcxq	%rax,%r8
	adoxq	%r10,%r9

.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
	adcxq	%rax,%r9
	adoxq	%r11,%r10

	mulxq	24(%rsi),%rax,%r11
	adcxq	%rax,%r10
	adoxq	%r12,%r11

	mulxq	32(%rsi),%rax,%r12
	adcxq	%rax,%r11
	adoxq	%r13,%r12

	mulxq	40(%rsi),%rax,%r13
	adcxq	%rax,%r12
	adoxq	%r14,%r13

.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
	adcxq	%rax,%r13
	adoxq	%r15,%r14

.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
	adcxq	%rax,%r14
	adoxq	%rdi,%r15
	adcxq	%rdi,%r15

	movq	%rbx,8+64-8(%rsp)
	movq	%r8,8+64(%rsp)
	movq	%r9,8+64+8(%rsp)
	movq	%r10,8+64+16(%rsp)
	movq	%r11,8+64+24(%rsp)
	movq	%r12,8+64+32(%rsp)
	movq	%r13,8+64+40(%rsp)
	movq	%r14,8+64+48(%rsp)
	movq	%r15,8+64+56(%rsp)

	.byte	0xf3,0xc3
.cfi_endproc
.size	__rsaz_512_mulx,.-__rsaz_512_mulx
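
/*
 * rsaz_512_scatter4: store eight limbs into table slot idx, one limb
 * every 128 bytes.  Assumed C prototype (cf. rsaz_exp.h):
 *   void rsaz_512_scatter4(uint64_t *tbl, const uint64_t val[8], int idx);
 */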
.globl	rsaz_512_scatter4
.type	rsaz_512_scatter4,@function
.align	16
rsaz_512_scatter4:
.cfi_startproc
	leaq	(%rdi,%rdx,8),%rdi
	movl	$8,%r9d
	jmp	.Loop_scatter
.align	16
.Loop_scatter:
	movq	(%rsi),%rax
	leaq	8(%rsi),%rsi
	movq	%rax,(%rdi)
	leaq	128(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_scatter
	.byte	0xf3,0xc3
.cfi_endproc
.size	rsaz_512_scatter4,.-rsaz_512_scatter4

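/*
 * rsaz_512_gather4: constant-time counterpart of the scatter above;
 * SSE2 compare masks (built from .Linc) select slot %edx while every
 * table line is read, hiding the index from the memory access pattern.
 */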
.globl	rsaz_512_gather4
.type	rsaz_512_gather4,@function
.align	16
rsaz_512_gather4:
.cfi_startproc
	movd	%edx,%xmm8
	movdqa	.Linc+16(%rip),%xmm1
	movdqa	.Linc(%rip),%xmm0

	pshufd	$0,%xmm8,%xmm8
	movdqa	%xmm1,%xmm7
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm8,%xmm0
	movdqa	%xmm7,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm8,%xmm1
	movdqa	%xmm7,%xmm4
	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm8,%xmm2
	movdqa	%xmm7,%xmm5
	paddd	%xmm3,%xmm4
	pcmpeqd	%xmm8,%xmm3
	movdqa	%xmm7,%xmm6
	paddd	%xmm4,%xmm5
	pcmpeqd	%xmm8,%xmm4
	paddd	%xmm5,%xmm6
	pcmpeqd	%xmm8,%xmm5
	paddd	%xmm6,%xmm7
	pcmpeqd	%xmm8,%xmm6
	pcmpeqd	%xmm8,%xmm7
	movl	$8,%r9d
	jmp	.Loop_gather
.align	16
.Loop_gather:
	movdqa	0(%rsi),%xmm8
	movdqa	16(%rsi),%xmm9
	movdqa	32(%rsi),%xmm10
	movdqa	48(%rsi),%xmm11
	pand	%xmm0,%xmm8
	movdqa	64(%rsi),%xmm12
	pand	%xmm1,%xmm9
	movdqa	80(%rsi),%xmm13
	pand	%xmm2,%xmm10
	movdqa	96(%rsi),%xmm14
	pand	%xmm3,%xmm11
	movdqa	112(%rsi),%xmm15
	leaq	128(%rsi),%rsi
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9

	por	%xmm9,%xmm8
	pshufd	$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,(%rdi)
	leaq	8(%rdi),%rdi
	decl	%r9d
	jnz	.Loop_gather
	.byte	0xf3,0xc3
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size	rsaz_512_gather4,.-rsaz_512_gather4

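/* .Linc: dword increments used to build the gather selection masks. */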
.align	64
.Linc:
.long	0,0, 1,1
.long	2,2, 2,2
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4:
