xref: /freebsd/sys/crypto/openssl/amd64/rsaz-x86_64.S (revision 184c1b943937986c81e1996d999d21626ec7a4ff)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from rsaz-x86_64.pl. */
3.text
4
5
6
/*
 * rsaz_512_sqr: repeated 8-limb (512-bit) squaring, each followed by a call
 * to a reduce helper and to __rsaz_512_subtract.
 *
 * Register use on entry, as read by the code below:
 *   %rdi  destination, eight 64-bit limbs (written by __rsaz_512_subtract);
 *         it also becomes the source of the next iteration
 *   %rsi  source, eight 64-bit limbs at 0..56(%rsi)
 *   %rdx  pointer parked in %xmm1 and handed to the helpers in %rbp
 *   %rcx  64-bit value saved at 128(%rsp); the reduce helpers multiply by it
 *   %r8d  iteration count, kept at 128+8(%rsp)
 * NOTE(review): presumably (ret, a, modulus, n0, count) as declared in
 * OpenSSL's rsaz_exp.c - confirm against the C prototype.
 *
 * Stack frame: 0..120(%rsp) hold the 16-limb square, 128(%rsp) the saved
 * %rcx, 128+8(%rsp) the loop counter.
 */
7.globl	rsaz_512_sqr
8.type	rsaz_512_sqr,@function
9.align	32
10rsaz_512_sqr:
11.cfi_startproc
/* Save the callee-saved registers used as limb accumulators. */
12	pushq	%rbx
13.cfi_adjust_cfa_offset	8
14.cfi_offset	%rbx,-16
15	pushq	%rbp
16.cfi_adjust_cfa_offset	8
17.cfi_offset	%rbp,-24
18	pushq	%r12
19.cfi_adjust_cfa_offset	8
20.cfi_offset	%r12,-32
21	pushq	%r13
22.cfi_adjust_cfa_offset	8
23.cfi_offset	%r13,-40
24	pushq	%r14
25.cfi_adjust_cfa_offset	8
26.cfi_offset	%r14,-48
27	pushq	%r15
28.cfi_adjust_cfa_offset	8
29.cfi_offset	%r15,-56
30
31	subq	$128+24,%rsp
32.cfi_adjust_cfa_offset	128+24
33.Lsqr_body:
/* Hand-encoded movq %rdx,%xmm1: park the third argument in %xmm1. */
34.byte	102,72,15,110,202
35	movq	(%rsi),%rdx
36	movq	8(%rsi),%rax
37	movq	%rcx,128(%rsp)
/*
 * Take the mulx/adcx/adox path only when both bits of mask 0x80100 are set
 * in the third dword of OPENSSL_ia32cap_P.
 * NOTE(review): presumably the BMI2 and ADX feature bits - confirm.
 */
38	movl	$0x80100,%r11d
39	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
40	cmpl	$0x80100,%r11d
41	je	.Loop_sqrx
42	jmp	.Loop_sqr
43
44.align	32
/*
 * mulq-based path. On every pass %rdx = a[0] and %rax = a[1], where a[] is
 * the current source at (%rsi).
 */
45.Loop_sqr:
46	movl	%r8d,128+8(%rsp)
47
/* Row 0: cross products a[1..7] * a[0], accumulated in %r8..%r15. */
48	movq	%rdx,%rbx
49	movq	%rax,%rbp
50	mulq	%rdx
51	movq	%rax,%r8
52	movq	16(%rsi),%rax
53	movq	%rdx,%r9
54
55	mulq	%rbx
56	addq	%rax,%r9
57	movq	24(%rsi),%rax
58	movq	%rdx,%r10
59	adcq	$0,%r10
60
61	mulq	%rbx
62	addq	%rax,%r10
63	movq	32(%rsi),%rax
64	movq	%rdx,%r11
65	adcq	$0,%r11
66
67	mulq	%rbx
68	addq	%rax,%r11
69	movq	40(%rsi),%rax
70	movq	%rdx,%r12
71	adcq	$0,%r12
72
73	mulq	%rbx
74	addq	%rax,%r12
75	movq	48(%rsi),%rax
76	movq	%rdx,%r13
77	adcq	$0,%r13
78
79	mulq	%rbx
80	addq	%rax,%r13
81	movq	56(%rsi),%rax
82	movq	%rdx,%r14
83	adcq	$0,%r14
84
85	mulq	%rbx
86	addq	%rax,%r14
87	movq	%rbx,%rax
88	adcq	$0,%rdx
89
/*
 * Double %r8 (carry out collected in %rcx), add the high half of a[0]^2,
 * and store result limbs 0 and 1.
 */
90	xorq	%rcx,%rcx
91	addq	%r8,%r8
92	movq	%rdx,%r15
93	adcq	$0,%rcx
94
95	mulq	%rax
96	addq	%r8,%rdx
97	adcq	$0,%rcx
98
99	movq	%rax,(%rsp)
100	movq	%rdx,8(%rsp)
101
102
/* Row 1: cross products a[2..7] * a[1] (%rbp = a[1]). */
103	movq	16(%rsi),%rax
104	mulq	%rbp
105	addq	%rax,%r10
106	movq	24(%rsi),%rax
107	movq	%rdx,%rbx
108	adcq	$0,%rbx
109
110	mulq	%rbp
111	addq	%rax,%r11
112	movq	32(%rsi),%rax
113	adcq	$0,%rdx
114	addq	%rbx,%r11
115	movq	%rdx,%rbx
116	adcq	$0,%rbx
117
118	mulq	%rbp
119	addq	%rax,%r12
120	movq	40(%rsi),%rax
121	adcq	$0,%rdx
122	addq	%rbx,%r12
123	movq	%rdx,%rbx
124	adcq	$0,%rbx
125
126	mulq	%rbp
127	addq	%rax,%r13
128	movq	48(%rsi),%rax
129	adcq	$0,%rdx
130	addq	%rbx,%r13
131	movq	%rdx,%rbx
132	adcq	$0,%rbx
133
134	mulq	%rbp
135	addq	%rax,%r14
136	movq	56(%rsi),%rax
137	adcq	$0,%rdx
138	addq	%rbx,%r14
139	movq	%rdx,%rbx
140	adcq	$0,%rbx
141
142	mulq	%rbp
143	addq	%rax,%r15
144	movq	%rbp,%rax
145	adcq	$0,%rdx
146	addq	%rbx,%r15
147	adcq	$0,%rdx
148
/*
 * Double %r9:%r10 (carry out in %rbx), add a[1]^2 plus the carry left in
 * %rcx, and store result limbs 2 and 3.
 */
149	xorq	%rbx,%rbx
150	addq	%r9,%r9
151	movq	%rdx,%r8
152	adcq	%r10,%r10
153	adcq	$0,%rbx
154
155	mulq	%rax
156
157	addq	%rcx,%rax
158	movq	16(%rsi),%rbp
159	addq	%rax,%r9
160	movq	24(%rsi),%rax
161	adcq	%rdx,%r10
162	adcq	$0,%rbx
163
164	movq	%r9,16(%rsp)
165	movq	%r10,24(%rsp)
166
167
/* Row 2: cross products a[3..7] * a[2] (%rbp = a[2]). */
168	mulq	%rbp
169	addq	%rax,%r12
170	movq	32(%rsi),%rax
171	movq	%rdx,%rcx
172	adcq	$0,%rcx
173
174	mulq	%rbp
175	addq	%rax,%r13
176	movq	40(%rsi),%rax
177	adcq	$0,%rdx
178	addq	%rcx,%r13
179	movq	%rdx,%rcx
180	adcq	$0,%rcx
181
182	mulq	%rbp
183	addq	%rax,%r14
184	movq	48(%rsi),%rax
185	adcq	$0,%rdx
186	addq	%rcx,%r14
187	movq	%rdx,%rcx
188	adcq	$0,%rcx
189
190	mulq	%rbp
191	addq	%rax,%r15
192	movq	56(%rsi),%rax
193	adcq	$0,%rdx
194	addq	%rcx,%r15
195	movq	%rdx,%rcx
196	adcq	$0,%rcx
197
198	mulq	%rbp
199	addq	%rax,%r8
200	movq	%rbp,%rax
201	adcq	$0,%rdx
202	addq	%rcx,%r8
203	adcq	$0,%rdx
204
/* Double %r11:%r12, add a[2]^2 plus carry %rbx, store result limbs 4 and 5. */
205	xorq	%rcx,%rcx
206	addq	%r11,%r11
207	movq	%rdx,%r9
208	adcq	%r12,%r12
209	adcq	$0,%rcx
210
211	mulq	%rax
212
213	addq	%rbx,%rax
214	movq	24(%rsi),%r10
215	addq	%rax,%r11
216	movq	32(%rsi),%rax
217	adcq	%rdx,%r12
218	adcq	$0,%rcx
219
220	movq	%r11,32(%rsp)
221	movq	%r12,40(%rsp)
222
223
/*
 * Row 3: cross products a[4..7] * a[3] (%r10 = a[3]). Copies of a[4], a[5]
 * and a[6] are kept in %r11, %r12 and %rbp for the remaining rows.
 */
224	movq	%rax,%r11
225	mulq	%r10
226	addq	%rax,%r14
227	movq	40(%rsi),%rax
228	movq	%rdx,%rbx
229	adcq	$0,%rbx
230
231	movq	%rax,%r12
232	mulq	%r10
233	addq	%rax,%r15
234	movq	48(%rsi),%rax
235	adcq	$0,%rdx
236	addq	%rbx,%r15
237	movq	%rdx,%rbx
238	adcq	$0,%rbx
239
240	movq	%rax,%rbp
241	mulq	%r10
242	addq	%rax,%r8
243	movq	56(%rsi),%rax
244	adcq	$0,%rdx
245	addq	%rbx,%r8
246	movq	%rdx,%rbx
247	adcq	$0,%rbx
248
249	mulq	%r10
250	addq	%rax,%r9
251	movq	%r10,%rax
252	adcq	$0,%rdx
253	addq	%rbx,%r9
254	adcq	$0,%rdx
255
/* Double %r13:%r14, add a[3]^2 plus carry %rcx, store result limbs 6 and 7. */
256	xorq	%rbx,%rbx
257	addq	%r13,%r13
258	movq	%rdx,%r10
259	adcq	%r14,%r14
260	adcq	$0,%rbx
261
262	mulq	%rax
263
264	addq	%rcx,%rax
265	addq	%rax,%r13
266	movq	%r12,%rax
267	adcq	%rdx,%r14
268	adcq	$0,%rbx
269
270	movq	%r13,48(%rsp)
271	movq	%r14,56(%rsp)
272
273
/* Row 4: cross products a[5..7] * a[4] (%r11 = a[4]); %r14 receives a[7]. */
274	mulq	%r11
275	addq	%rax,%r8
276	movq	%rbp,%rax
277	movq	%rdx,%rcx
278	adcq	$0,%rcx
279
280	mulq	%r11
281	addq	%rax,%r9
282	movq	56(%rsi),%rax
283	adcq	$0,%rdx
284	addq	%rcx,%r9
285	movq	%rdx,%rcx
286	adcq	$0,%rcx
287
288	movq	%rax,%r14
289	mulq	%r11
290	addq	%rax,%r10
291	movq	%r11,%rax
292	adcq	$0,%rdx
293	addq	%rcx,%r10
294	adcq	$0,%rdx
295
/* Double %r15:%r8, add a[4]^2 plus carry %rbx, store result limbs 8 and 9. */
296	xorq	%rcx,%rcx
297	addq	%r15,%r15
298	movq	%rdx,%r11
299	adcq	%r8,%r8
300	adcq	$0,%rcx
301
302	mulq	%rax
303
304	addq	%rbx,%rax
305	addq	%rax,%r15
306	movq	%rbp,%rax
307	adcq	%rdx,%r8
308	adcq	$0,%rcx
309
310	movq	%r15,64(%rsp)
311	movq	%r8,72(%rsp)
312
313
/* Row 5: cross products a[6..7] * a[5] (%r12 = a[5]). */
314	mulq	%r12
315	addq	%rax,%r10
316	movq	%r14,%rax
317	movq	%rdx,%rbx
318	adcq	$0,%rbx
319
320	mulq	%r12
321	addq	%rax,%r11
322	movq	%r12,%rax
323	adcq	$0,%rdx
324	addq	%rbx,%r11
325	adcq	$0,%rdx
326
/* Double %r9:%r10, add a[5]^2 plus carry %rcx, store result limbs 10 and 11. */
327	xorq	%rbx,%rbx
328	addq	%r9,%r9
329	movq	%rdx,%r12
330	adcq	%r10,%r10
331	adcq	$0,%rbx
332
333	mulq	%rax
334
335	addq	%rcx,%rax
336	addq	%rax,%r9
337	movq	%r14,%rax
338	adcq	%rdx,%r10
339	adcq	$0,%rbx
340
341	movq	%r9,80(%rsp)
342	movq	%r10,88(%rsp)
343
344
/* Row 6: the single cross product a[7] * a[6] (%rbp = a[6]). */
345	mulq	%rbp
346	addq	%rax,%r12
347	movq	%rbp,%rax
348	adcq	$0,%rdx
349
/* Double %r11:%r12, add a[6]^2 plus carry %rbx, store result limbs 12 and 13. */
350	xorq	%rcx,%rcx
351	addq	%r11,%r11
352	movq	%rdx,%r13
353	adcq	%r12,%r12
354	adcq	$0,%rcx
355
356	mulq	%rax
357
358	addq	%rbx,%rax
359	addq	%rax,%r11
360	movq	%r14,%rax
361	adcq	%rdx,%r12
362	adcq	$0,%rcx
363
364	movq	%r11,96(%rsp)
365	movq	%r12,104(%rsp)
366
367
/* Double %r13, add a[7]^2 plus carry %rcx: result limbs 14 and 15 in %rax:%rdx. */
368	xorq	%rbx,%rbx
369	addq	%r13,%r13
370	adcq	$0,%rbx
371
372	mulq	%rax
373
374	addq	%rcx,%rax
375	addq	%r13,%rax
376	adcq	%rbx,%rdx
377
/* Reload the low eight limbs into %r8..%r15 for the reduce helper. */
378	movq	(%rsp),%r8
379	movq	8(%rsp),%r9
380	movq	16(%rsp),%r10
381	movq	24(%rsp),%r11
382	movq	32(%rsp),%r12
383	movq	40(%rsp),%r13
384	movq	48(%rsp),%r14
385	movq	56(%rsp),%r15
/* Hand-encoded movq %xmm1,%rbp: recover the pointer parked at entry. */
386.byte	102,72,15,126,205
387
388	movq	%rax,112(%rsp)
389	movq	%rdx,120(%rsp)
390
391	call	__rsaz_512_reduce
392
/*
 * Add the high eight limbs to the helper's output. sbbq turns the final
 * carry into an all-ones/zero mask in %rcx for __rsaz_512_subtract.
 */
393	addq	64(%rsp),%r8
394	adcq	72(%rsp),%r9
395	adcq	80(%rsp),%r10
396	adcq	88(%rsp),%r11
397	adcq	96(%rsp),%r12
398	adcq	104(%rsp),%r13
399	adcq	112(%rsp),%r14
400	adcq	120(%rsp),%r15
401	sbbq	%rcx,%rcx
402
403	call	__rsaz_512_subtract
404
/*
 * Set up the next pass: %rdx/%rax = result limbs 0/1, and the destination
 * buffer becomes the source.
 */
405	movq	%r8,%rdx
406	movq	%r9,%rax
407	movl	128+8(%rsp),%r8d
408	movq	%rdi,%rsi
409
410	decl	%r8d
411	jnz	.Loop_sqr
412	jmp	.Lsqr_tail
413
414.align	32
/*
 * mulx/adcx/adox path. %rdx holds the current multiplier limb, %rbp is
 * zeroed once and then used as a zero operand, and %rdi is scratch, so the
 * destination pointer is parked in %xmm0 for the duration of the pass.
 */
415.Loop_sqrx:
416	movl	%r8d,128+8(%rsp)
/* Hand-encoded movq %rdi,%xmm0. */
417.byte	102,72,15,110,199
418
/* Row 0: cross products a[1..7] * a[0] (%rdx = a[0], %rax = a[1]). */
419	mulxq	%rax,%r8,%r9
420	movq	%rax,%rbx
421
422	mulxq	16(%rsi),%rcx,%r10
423	xorq	%rbp,%rbp
424
425	mulxq	24(%rsi),%rax,%r11
426	adcxq	%rcx,%r9
427
/* Hand-encoded mulxq 32(%rsi),%rcx,%r12. */
428.byte	0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00
429	adcxq	%rax,%r10
430
/* Hand-encoded mulxq 40(%rsi),%rax,%r13. */
431.byte	0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00
432	adcxq	%rcx,%r11
433
434	mulxq	48(%rsi),%rcx,%r14
435	adcxq	%rax,%r12
436	adcxq	%rcx,%r13
437
438	mulxq	56(%rsi),%rax,%r15
439	adcxq	%rax,%r14
440	adcxq	%rbp,%r15
441
/* a[0]^2; double %r8 on the OF chain, add the square's high half on the CF chain. */
442	mulxq	%rdx,%rax,%rdi
443	movq	%rbx,%rdx
444	xorq	%rcx,%rcx
445	adoxq	%r8,%r8
446	adcxq	%rdi,%r8
447	adoxq	%rbp,%rcx
448	adcxq	%rbp,%rcx
449
450	movq	%rax,(%rsp)
451	movq	%r8,8(%rsp)
452
453
/* Row 1: cross products a[2..7] * a[1] (%rdx = a[1]). */
/* Hand-encoded mulxq 16(%rsi),%rax,%rbx. */
454.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00
455	adoxq	%rax,%r10
456	adcxq	%rbx,%r11
457
458	mulxq	24(%rsi),%rdi,%r8
459	adoxq	%rdi,%r11
/* A lone 0x66 prefix byte glued onto the next instruction; presumably code-alignment padding. */
460.byte	0x66
461	adcxq	%r8,%r12
462
463	mulxq	32(%rsi),%rax,%rbx
464	adoxq	%rax,%r12
465	adcxq	%rbx,%r13
466
467	mulxq	40(%rsi),%rdi,%r8
468	adoxq	%rdi,%r13
469	adcxq	%r8,%r14
470
/* Hand-encoded mulxq 48(%rsi),%rax,%rbx. */
471.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
472	adoxq	%rax,%r14
473	adcxq	%rbx,%r15
474
/* Hand-encoded mulxq 56(%rsi),%rdi,%r8. */
475.byte	0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00
476	adoxq	%rdi,%r15
477	adcxq	%rbp,%r8
/* a[1]^2, then load the next multiplier a[2]. */
478	mulxq	%rdx,%rax,%rdi
479	adoxq	%rbp,%r8
/* Hand-encoded movq 16(%rsi),%rdx. */
480.byte	0x48,0x8b,0x96,0x10,0x00,0x00,0x00
481
/* Double %r9:%r10, add the square plus carry %rcx; result limbs 2 and 3. */
482	xorq	%rbx,%rbx
483	adoxq	%r9,%r9
484
485	adcxq	%rcx,%rax
486	adoxq	%r10,%r10
487	adcxq	%rax,%r9
488	adoxq	%rbp,%rbx
489	adcxq	%rdi,%r10
490	adcxq	%rbp,%rbx
491
492	movq	%r9,16(%rsp)
/* Hand-encoded movq %r10,24(%rsp). */
493.byte	0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00
494
495
/* Row 2: cross products a[3..7] * a[2] (%rdx = a[2]). */
496	mulxq	24(%rsi),%rdi,%r9
497	adoxq	%rdi,%r12
498	adcxq	%r9,%r13
499
500	mulxq	32(%rsi),%rax,%rcx
501	adoxq	%rax,%r13
502	adcxq	%rcx,%r14
503
/* Hand-encoded mulxq 40(%rsi),%rdi,%r9. */
504.byte	0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00
505	adoxq	%rdi,%r14
506	adcxq	%r9,%r15
507
/* Hand-encoded mulxq 48(%rsi),%rax,%rcx. */
508.byte	0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00
509	adoxq	%rax,%r15
510	adcxq	%rcx,%r8
511
512	mulxq	56(%rsi),%rdi,%r9
513	adoxq	%rdi,%r8
514	adcxq	%rbp,%r9
/* a[2]^2, then load the next multiplier a[3]. */
515	mulxq	%rdx,%rax,%rdi
516	adoxq	%rbp,%r9
517	movq	24(%rsi),%rdx
518
/* Double %r11:%r12, add the square plus carry %rbx; result limbs 4 and 5. */
519	xorq	%rcx,%rcx
520	adoxq	%r11,%r11
521
522	adcxq	%rbx,%rax
523	adoxq	%r12,%r12
524	adcxq	%rax,%r11
525	adoxq	%rbp,%rcx
526	adcxq	%rdi,%r12
527	adcxq	%rbp,%rcx
528
529	movq	%r11,32(%rsp)
530	movq	%r12,40(%rsp)
531
532
/* Row 3: cross products a[4..7] * a[3] (%rdx = a[3]). */
533	mulxq	32(%rsi),%rax,%rbx
534	adoxq	%rax,%r14
535	adcxq	%rbx,%r15
536
537	mulxq	40(%rsi),%rdi,%r10
538	adoxq	%rdi,%r15
539	adcxq	%r10,%r8
540
541	mulxq	48(%rsi),%rax,%rbx
542	adoxq	%rax,%r8
543	adcxq	%rbx,%r9
544
545	mulxq	56(%rsi),%rdi,%r10
546	adoxq	%rdi,%r9
547	adcxq	%rbp,%r10
/* a[3]^2, then load the next multiplier a[4]. */
548	mulxq	%rdx,%rax,%rdi
549	adoxq	%rbp,%r10
550	movq	32(%rsi),%rdx
551
/* Double %r13:%r14, add the square plus carry %rcx; result limbs 6 and 7. */
552	xorq	%rbx,%rbx
553	adoxq	%r13,%r13
554
555	adcxq	%rcx,%rax
556	adoxq	%r14,%r14
557	adcxq	%rax,%r13
558	adoxq	%rbp,%rbx
559	adcxq	%rdi,%r14
560	adcxq	%rbp,%rbx
561
562	movq	%r13,48(%rsp)
563	movq	%r14,56(%rsp)
564
565
/* Row 4: cross products a[5..7] * a[4] (%rdx = a[4]). */
566	mulxq	40(%rsi),%rdi,%r11
567	adoxq	%rdi,%r8
568	adcxq	%r11,%r9
569
570	mulxq	48(%rsi),%rax,%rcx
571	adoxq	%rax,%r9
572	adcxq	%rcx,%r10
573
574	mulxq	56(%rsi),%rdi,%r11
575	adoxq	%rdi,%r10
576	adcxq	%rbp,%r11
/* a[4]^2, then load the next multiplier a[5]. */
577	mulxq	%rdx,%rax,%rdi
578	movq	40(%rsi),%rdx
579	adoxq	%rbp,%r11
580
/* Double %r15:%r8, add the square plus carry %rbx; result limbs 8 and 9. */
581	xorq	%rcx,%rcx
582	adoxq	%r15,%r15
583
584	adcxq	%rbx,%rax
585	adoxq	%r8,%r8
586	adcxq	%rax,%r15
587	adoxq	%rbp,%rcx
588	adcxq	%rdi,%r8
589	adcxq	%rbp,%rcx
590
591	movq	%r15,64(%rsp)
592	movq	%r8,72(%rsp)
593
594
/* Row 5: cross products a[6..7] * a[5] (%rdx = a[5]). */
/* Hand-encoded mulxq 48(%rsi),%rax,%rbx. */
595.byte	0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00
596	adoxq	%rax,%r10
597	adcxq	%rbx,%r11
598
/* Hand-encoded mulxq 56(%rsi),%rdi,%r12. */
599.byte	0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00
600	adoxq	%rdi,%r11
601	adcxq	%rbp,%r12
/* a[5]^2, then load the next multiplier a[6]. */
602	mulxq	%rdx,%rax,%rdi
603	adoxq	%rbp,%r12
604	movq	48(%rsi),%rdx
605
/* Double %r9:%r10, add the square plus carry %rcx; result limbs 10 and 11. */
606	xorq	%rbx,%rbx
607	adoxq	%r9,%r9
608
609	adcxq	%rcx,%rax
610	adoxq	%r10,%r10
611	adcxq	%rax,%r9
612	adcxq	%rdi,%r10
613	adoxq	%rbp,%rbx
614	adcxq	%rbp,%rbx
615
616	movq	%r9,80(%rsp)
617	movq	%r10,88(%rsp)
618
619
/* Row 6: the single cross product a[7] * a[6] (%rdx = a[6]). */
/* Hand-encoded mulxq 56(%rsi),%rax,%r13. */
620.byte	0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00
621	adoxq	%rax,%r12
622	adoxq	%rbp,%r13
623
/* a[6]^2, then load the last multiplier a[7]. */
624	mulxq	%rdx,%rax,%rdi
625	xorq	%rcx,%rcx
626	movq	56(%rsi),%rdx
627	adoxq	%r11,%r11
628
629	adcxq	%rbx,%rax
630	adoxq	%r12,%r12
631	adcxq	%rax,%r11
632	adoxq	%rbp,%rcx
633	adcxq	%rdi,%r12
634	adcxq	%rbp,%rcx
635
/* Hand-encoded movq %r11,96(%rsp) and movq %r12,104(%rsp): result limbs 12 and 13. */
636.byte	0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00
637.byte	0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00
638
639
/* a[7]^2 plus the doubled %r13 and carry %rcx: result limbs 14 and 15 in %rax:%rbx. */
640	mulxq	%rdx,%rax,%rdx
641	xorq	%rbx,%rbx
642	adoxq	%r13,%r13
643
644	adcxq	%rcx,%rax
645	adoxq	%rbp,%rbx
646	adcxq	%r13,%rax
647	adcxq	%rdx,%rbx
648
/* Hand-encoded movq %xmm0,%rdi and movq %xmm1,%rbp: restore both parked pointers. */
649.byte	102,72,15,126,199
650.byte	102,72,15,126,205
651
/* __rsaz_512_reducex takes the value saved at 128(%rsp) in %rdx. */
652	movq	128(%rsp),%rdx
653	movq	(%rsp),%r8
654	movq	8(%rsp),%r9
655	movq	16(%rsp),%r10
656	movq	24(%rsp),%r11
657	movq	32(%rsp),%r12
658	movq	40(%rsp),%r13
659	movq	48(%rsp),%r14
660	movq	56(%rsp),%r15
661
662	movq	%rax,112(%rsp)
663	movq	%rbx,120(%rsp)
664
665	call	__rsaz_512_reducex
666
/* Add the high eight limbs; the final carry becomes the mask in %rcx. */
667	addq	64(%rsp),%r8
668	adcq	72(%rsp),%r9
669	adcq	80(%rsp),%r10
670	adcq	88(%rsp),%r11
671	adcq	96(%rsp),%r12
672	adcq	104(%rsp),%r13
673	adcq	112(%rsp),%r14
674	adcq	120(%rsp),%r15
675	sbbq	%rcx,%rcx
676
677	call	__rsaz_512_subtract
678
/* Set up the next pass exactly as the mulq path does. */
679	movq	%r8,%rdx
680	movq	%r9,%rax
681	movl	128+8(%rsp),%r8d
682	movq	%rdi,%rsi
683
684	decl	%r8d
685	jnz	.Loop_sqrx
686
687.Lsqr_tail:
688
/*
 * Epilogue: %rax = stack pointer value after the six pushes were undone
 * (152 bytes of locals + 48 bytes of saved registers); restore the
 * callee-saved registers relative to it.
 */
689	leaq	128+24+48(%rsp),%rax
690.cfi_def_cfa	%rax,8
691	movq	-48(%rax),%r15
692.cfi_restore	%r15
693	movq	-40(%rax),%r14
694.cfi_restore	%r14
695	movq	-32(%rax),%r13
696.cfi_restore	%r13
697	movq	-24(%rax),%r12
698.cfi_restore	%r12
699	movq	-16(%rax),%rbp
700.cfi_restore	%rbp
701	movq	-8(%rax),%rbx
702.cfi_restore	%rbx
703	leaq	(%rax),%rsp
704.cfi_def_cfa_register	%rsp
705.Lsqr_epilogue:
/* Bytes 0xf3,0xc3 encode "rep ret". */
706	.byte	0xf3,0xc3
707.cfi_endproc
708.size	rsaz_512_sqr,.-rsaz_512_sqr
/*
 * rsaz_512_mul: one 8x8-limb multiplication followed by a reduce helper and
 * __rsaz_512_subtract.
 *
 * Register use on entry, as read by the code below:
 *   %rdi  destination, parked in %xmm0 across the multiply helper
 *   %rsi  first operand, passed through to __rsaz_512_mul[x]
 *   %rdx  second operand: copied to %rbp, first limb preloaded for the helper
 *   %rcx  pointer parked in %xmm1, later handed to the reduce and subtract
 *         helpers in %rbp
 *   %r8   64-bit value saved at 128(%rsp); the reduce helpers multiply by it
 * NOTE(review): presumably (ret, a, b, modulus, n0) as declared in
 * OpenSSL's rsaz_exp.c - confirm.
 *
 * The helpers are expected to leave the 16-limb product at 0..120(%rsp);
 * this function reads it back from there.
 */
709.globl	rsaz_512_mul
710.type	rsaz_512_mul,@function
711.align	32
712rsaz_512_mul:
713.cfi_startproc
714	pushq	%rbx
715.cfi_adjust_cfa_offset	8
716.cfi_offset	%rbx,-16
717	pushq	%rbp
718.cfi_adjust_cfa_offset	8
719.cfi_offset	%rbp,-24
720	pushq	%r12
721.cfi_adjust_cfa_offset	8
722.cfi_offset	%r12,-32
723	pushq	%r13
724.cfi_adjust_cfa_offset	8
725.cfi_offset	%r13,-40
726	pushq	%r14
727.cfi_adjust_cfa_offset	8
728.cfi_offset	%r14,-48
729	pushq	%r15
730.cfi_adjust_cfa_offset	8
731.cfi_offset	%r15,-56
732
733	subq	$128+24,%rsp
734.cfi_adjust_cfa_offset	128+24
735.Lmul_body:
/* Hand-encoded movq %rdi,%xmm0 and movq %rcx,%xmm1. */
736.byte	102,72,15,110,199
737.byte	102,72,15,110,201
738	movq	%r8,128(%rsp)
/*
 * Take the mulx path only when both bits of mask 0x80100 are set in the
 * third dword of OPENSSL_ia32cap_P.
 * NOTE(review): presumably the BMI2 and ADX feature bits - confirm.
 */
739	movl	$0x80100,%r11d
740	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
741	cmpl	$0x80100,%r11d
742	je	.Lmulx
/* mulq path: %rbx = first limb of the second operand, %rbp = its address. */
743	movq	(%rdx),%rbx
744	movq	%rdx,%rbp
745	call	__rsaz_512_mul
746
/* Hand-encoded movq %xmm0,%rdi and movq %xmm1,%rbp: restore the parked pointers. */
747.byte	102,72,15,126,199
748.byte	102,72,15,126,205
749
/* Low eight product limbs go to the reduce helper in %r8..%r15. */
750	movq	(%rsp),%r8
751	movq	8(%rsp),%r9
752	movq	16(%rsp),%r10
753	movq	24(%rsp),%r11
754	movq	32(%rsp),%r12
755	movq	40(%rsp),%r13
756	movq	48(%rsp),%r14
757	movq	56(%rsp),%r15
758
759	call	__rsaz_512_reduce
760	jmp	.Lmul_tail
761
762.align	32
/* mulx path: %rdx = first limb of the second operand, %rbp = its address. */
763.Lmulx:
764	movq	%rdx,%rbp
765	movq	(%rdx),%rdx
766	call	__rsaz_512_mulx
767
/* Hand-encoded movq %xmm0,%rdi and movq %xmm1,%rbp. */
768.byte	102,72,15,126,199
769.byte	102,72,15,126,205
770
/* __rsaz_512_reducex takes the value saved at 128(%rsp) in %rdx. */
771	movq	128(%rsp),%rdx
772	movq	(%rsp),%r8
773	movq	8(%rsp),%r9
774	movq	16(%rsp),%r10
775	movq	24(%rsp),%r11
776	movq	32(%rsp),%r12
777	movq	40(%rsp),%r13
778	movq	48(%rsp),%r14
779	movq	56(%rsp),%r15
780
781	call	__rsaz_512_reducex
782.Lmul_tail:
/*
 * Add the high eight product limbs. sbbq turns the final carry into an
 * all-ones/zero mask in %rcx for __rsaz_512_subtract.
 */
783	addq	64(%rsp),%r8
784	adcq	72(%rsp),%r9
785	adcq	80(%rsp),%r10
786	adcq	88(%rsp),%r11
787	adcq	96(%rsp),%r12
788	adcq	104(%rsp),%r13
789	adcq	112(%rsp),%r14
790	adcq	120(%rsp),%r15
791	sbbq	%rcx,%rcx
792
793	call	__rsaz_512_subtract
794
/* Epilogue: restore the callee-saved registers and unwind the frame. */
795	leaq	128+24+48(%rsp),%rax
796.cfi_def_cfa	%rax,8
797	movq	-48(%rax),%r15
798.cfi_restore	%r15
799	movq	-40(%rax),%r14
800.cfi_restore	%r14
801	movq	-32(%rax),%r13
802.cfi_restore	%r13
803	movq	-24(%rax),%r12
804.cfi_restore	%r12
805	movq	-16(%rax),%rbp
806.cfi_restore	%rbp
807	movq	-8(%rax),%rbx
808.cfi_restore	%rbx
809	leaq	(%rax),%rsp
810.cfi_def_cfa_register	%rsp
811.Lmul_epilogue:
/* Bytes 0xf3,0xc3 encode "rep ret". */
812	.byte	0xf3,0xc3
813.cfi_endproc
814.size	rsaz_512_mul,.-rsaz_512_mul
/*
 * rsaz_512_mul_gather4: 8x8-limb multiplication in which each limb of the
 * second operand is selected from a table by masking, followed by a reduce
 * helper and __rsaz_512_subtract.
 *
 * Register use on entry, as read by the code below:
 *   %rdi  destination, saved at 128+8(%rsp)
 *   %rsi  first operand, eight limbs at 0..56(%rsi)
 *   %rdx  table base; each limb of the second operand occupies one 128-byte
 *         row, i.e. sixteen 64-bit candidates
 *   %rcx  pointer saved at 128+16(%rsp), later handed to the reduce and
 *         subtract helpers in %rbp
 *   %r8   64-bit value saved at 128(%rsp); the reduce helpers multiply by it
 *   %r9d  index of the candidate to select
 * NOTE(review): presumably (ret, a, table, modulus, n0, power) as declared
 * in OpenSSL's rsaz_exp.c - confirm.
 *
 * Every row is loaded in full and ANDed with masks derived from %r9d, so
 * the addresses read do not depend on the index value.
 */
815.globl	rsaz_512_mul_gather4
816.type	rsaz_512_mul_gather4,@function
817.align	32
818rsaz_512_mul_gather4:
819.cfi_startproc
820	pushq	%rbx
821.cfi_adjust_cfa_offset	8
822.cfi_offset	%rbx,-16
823	pushq	%rbp
824.cfi_adjust_cfa_offset	8
825.cfi_offset	%rbp,-24
826	pushq	%r12
827.cfi_adjust_cfa_offset	8
828.cfi_offset	%r12,-32
829	pushq	%r13
830.cfi_adjust_cfa_offset	8
831.cfi_offset	%r13,-40
832	pushq	%r14
833.cfi_adjust_cfa_offset	8
834.cfi_offset	%r14,-48
835	pushq	%r15
836.cfi_adjust_cfa_offset	8
837.cfi_offset	%r15,-56
838
839	subq	$152,%rsp
840.cfi_adjust_cfa_offset	152
841.Lmul_gather4_body:
/*
 * Build eight selection masks in %xmm0..%xmm7: the index is broadcast to
 * all four dwords of %xmm8 and compared against eight counter vectors
 * derived from .Linc by repeated paddd.
 * NOTE(review): .Linc is defined outside this view; presumably the first
 * vector enumerates the first candidates and the second is the per-step
 * increment - confirm.
 */
842	movd	%r9d,%xmm8
843	movdqa	.Linc+16(%rip),%xmm1
844	movdqa	.Linc(%rip),%xmm0
845
846	pshufd	$0,%xmm8,%xmm8
847	movdqa	%xmm1,%xmm7
848	movdqa	%xmm1,%xmm2
849	paddd	%xmm0,%xmm1
850	pcmpeqd	%xmm8,%xmm0
851	movdqa	%xmm7,%xmm3
852	paddd	%xmm1,%xmm2
853	pcmpeqd	%xmm8,%xmm1
854	movdqa	%xmm7,%xmm4
855	paddd	%xmm2,%xmm3
856	pcmpeqd	%xmm8,%xmm2
857	movdqa	%xmm7,%xmm5
858	paddd	%xmm3,%xmm4
859	pcmpeqd	%xmm8,%xmm3
860	movdqa	%xmm7,%xmm6
861	paddd	%xmm4,%xmm5
862	pcmpeqd	%xmm8,%xmm4
863	paddd	%xmm5,%xmm6
864	pcmpeqd	%xmm8,%xmm5
865	paddd	%xmm6,%xmm7
866	pcmpeqd	%xmm8,%xmm6
867	pcmpeqd	%xmm8,%xmm7
868
/*
 * Select limb 0 of the second operand: load the whole first row, AND each
 * 16-byte piece with its mask and OR everything together. %rbp is left
 * pointing at the next row.
 */
869	movdqa	0(%rdx),%xmm8
870	movdqa	16(%rdx),%xmm9
871	movdqa	32(%rdx),%xmm10
872	movdqa	48(%rdx),%xmm11
873	pand	%xmm0,%xmm8
874	movdqa	64(%rdx),%xmm12
875	pand	%xmm1,%xmm9
876	movdqa	80(%rdx),%xmm13
877	pand	%xmm2,%xmm10
878	movdqa	96(%rdx),%xmm14
879	pand	%xmm3,%xmm11
880	movdqa	112(%rdx),%xmm15
881	leaq	128(%rdx),%rbp
882	pand	%xmm4,%xmm12
883	pand	%xmm5,%xmm13
884	pand	%xmm6,%xmm14
885	pand	%xmm7,%xmm15
886	por	%xmm10,%xmm8
887	por	%xmm11,%xmm9
888	por	%xmm12,%xmm8
889	por	%xmm13,%xmm9
890	por	%xmm14,%xmm8
891	por	%xmm15,%xmm9
892
/* pshufd $0x4e swaps the two 64-bit halves, so the OR folds the selected limb into the low half of %xmm8. */
893	por	%xmm9,%xmm8
894	pshufd	$0x4e,%xmm8,%xmm9
895	por	%xmm9,%xmm8
/*
 * Take the mulx path only when both bits of mask 0x80100 are set in the
 * third dword of OPENSSL_ia32cap_P.
 * NOTE(review): presumably the BMI2 and ADX feature bits - confirm.
 */
896	movl	$0x80100,%r11d
897	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
898	cmpl	$0x80100,%r11d
899	je	.Lmulx_gather
/* Hand-encoded movq %xmm8,%rbx: the selected limb becomes the multiplier. */
900.byte	102,76,15,126,195
901
902	movq	%r8,128(%rsp)
903	movq	%rdi,128+8(%rsp)
904	movq	%rcx,128+16(%rsp)
905
/* First row: a[0..7] times the selected limb; product limb 0 goes to (%rsp), the rest stay in %r8..%r15. */
906	movq	(%rsi),%rax
907	movq	8(%rsi),%rcx
908	mulq	%rbx
909	movq	%rax,(%rsp)
910	movq	%rcx,%rax
911	movq	%rdx,%r8
912
913	mulq	%rbx
914	addq	%rax,%r8
915	movq	16(%rsi),%rax
916	movq	%rdx,%r9
917	adcq	$0,%r9
918
919	mulq	%rbx
920	addq	%rax,%r9
921	movq	24(%rsi),%rax
922	movq	%rdx,%r10
923	adcq	$0,%r10
924
925	mulq	%rbx
926	addq	%rax,%r10
927	movq	32(%rsi),%rax
928	movq	%rdx,%r11
929	adcq	$0,%r11
930
931	mulq	%rbx
932	addq	%rax,%r11
933	movq	40(%rsi),%rax
934	movq	%rdx,%r12
935	adcq	$0,%r12
936
937	mulq	%rbx
938	addq	%rax,%r12
939	movq	48(%rsi),%rax
940	movq	%rdx,%r13
941	adcq	$0,%r13
942
943	mulq	%rbx
944	addq	%rax,%r13
945	movq	56(%rsi),%rax
946	movq	%rdx,%r14
947	adcq	$0,%r14
948
949	mulq	%rbx
950	addq	%rax,%r14
951	movq	(%rsi),%rax
952	movq	%rdx,%r15
953	adcq	$0,%r15
954
/* %rdi walks the product buffer from 8(%rsp); seven more rows follow. */
955	leaq	8(%rsp),%rdi
956	movl	$7,%ecx
957	jmp	.Loop_mul_gather
958
959.align	32
/*
 * Per pass: select the next limb from the row at (%rbp), multiply a[0..7]
 * by it, accumulate into %r8..%r15 and retire one product limb to (%rdi).
 */
960.Loop_mul_gather:
961	movdqa	0(%rbp),%xmm8
962	movdqa	16(%rbp),%xmm9
963	movdqa	32(%rbp),%xmm10
964	movdqa	48(%rbp),%xmm11
965	pand	%xmm0,%xmm8
966	movdqa	64(%rbp),%xmm12
967	pand	%xmm1,%xmm9
968	movdqa	80(%rbp),%xmm13
969	pand	%xmm2,%xmm10
970	movdqa	96(%rbp),%xmm14
971	pand	%xmm3,%xmm11
972	movdqa	112(%rbp),%xmm15
973	leaq	128(%rbp),%rbp
974	pand	%xmm4,%xmm12
975	pand	%xmm5,%xmm13
976	pand	%xmm6,%xmm14
977	pand	%xmm7,%xmm15
978	por	%xmm10,%xmm8
979	por	%xmm11,%xmm9
980	por	%xmm12,%xmm8
981	por	%xmm13,%xmm9
982	por	%xmm14,%xmm8
983	por	%xmm15,%xmm9
984
985	por	%xmm9,%xmm8
986	pshufd	$0x4e,%xmm8,%xmm9
987	por	%xmm9,%xmm8
/* Hand-encoded movq %xmm8,%rbx. */
988.byte	102,76,15,126,195
989
990	mulq	%rbx
991	addq	%rax,%r8
992	movq	8(%rsi),%rax
993	movq	%r8,(%rdi)
994	movq	%rdx,%r8
995	adcq	$0,%r8
996
997	mulq	%rbx
998	addq	%rax,%r9
999	movq	16(%rsi),%rax
1000	adcq	$0,%rdx
1001	addq	%r9,%r8
1002	movq	%rdx,%r9
1003	adcq	$0,%r9
1004
1005	mulq	%rbx
1006	addq	%rax,%r10
1007	movq	24(%rsi),%rax
1008	adcq	$0,%rdx
1009	addq	%r10,%r9
1010	movq	%rdx,%r10
1011	adcq	$0,%r10
1012
1013	mulq	%rbx
1014	addq	%rax,%r11
1015	movq	32(%rsi),%rax
1016	adcq	$0,%rdx
1017	addq	%r11,%r10
1018	movq	%rdx,%r11
1019	adcq	$0,%r11
1020
1021	mulq	%rbx
1022	addq	%rax,%r12
1023	movq	40(%rsi),%rax
1024	adcq	$0,%rdx
1025	addq	%r12,%r11
1026	movq	%rdx,%r12
1027	adcq	$0,%r12
1028
1029	mulq	%rbx
1030	addq	%rax,%r13
1031	movq	48(%rsi),%rax
1032	adcq	$0,%rdx
1033	addq	%r13,%r12
1034	movq	%rdx,%r13
1035	adcq	$0,%r13
1036
1037	mulq	%rbx
1038	addq	%rax,%r14
1039	movq	56(%rsi),%rax
1040	adcq	$0,%rdx
1041	addq	%r14,%r13
1042	movq	%rdx,%r14
1043	adcq	$0,%r14
1044
1045	mulq	%rbx
1046	addq	%rax,%r15
1047	movq	(%rsi),%rax
1048	adcq	$0,%rdx
1049	addq	%r15,%r14
1050	movq	%rdx,%r15
1051	adcq	$0,%r15
1052
1053	leaq	8(%rdi),%rdi
1054
1055	decl	%ecx
1056	jnz	.Loop_mul_gather
1057
/* Flush the eight limbs still held in registers: product limbs 8..15. */
1058	movq	%r8,(%rdi)
1059	movq	%r9,8(%rdi)
1060	movq	%r10,16(%rdi)
1061	movq	%r11,24(%rdi)
1062	movq	%r12,32(%rdi)
1063	movq	%r13,40(%rdi)
1064	movq	%r14,48(%rdi)
1065	movq	%r15,56(%rdi)
1066
/* Restore the destination and hand the saved fourth argument to the helpers in %rbp. */
1067	movq	128+8(%rsp),%rdi
1068	movq	128+16(%rsp),%rbp
1069
1070	movq	(%rsp),%r8
1071	movq	8(%rsp),%r9
1072	movq	16(%rsp),%r10
1073	movq	24(%rsp),%r11
1074	movq	32(%rsp),%r12
1075	movq	40(%rsp),%r13
1076	movq	48(%rsp),%r14
1077	movq	56(%rsp),%r15
1078
1079	call	__rsaz_512_reduce
1080	jmp	.Lmul_gather_tail
1081
1082.align	32
/* mulx/adcx/adox path. %rdi is zeroed and used as a zero operand. */
1083.Lmulx_gather:
/* Hand-encoded movq %xmm8,%rdx: the selected limb becomes the mulx multiplier. */
1084.byte	102,76,15,126,194
1085
1086	movq	%r8,128(%rsp)
1087	movq	%rdi,128+8(%rsp)
1088	movq	%rcx,128+16(%rsp)
1089
/* First row: a[0..7] times the selected limb. */
1090	mulxq	(%rsi),%rbx,%r8
1091	movq	%rbx,(%rsp)
1092	xorl	%edi,%edi
1093
1094	mulxq	8(%rsi),%rax,%r9
1095
1096	mulxq	16(%rsi),%rbx,%r10
1097	adcxq	%rax,%r8
1098
1099	mulxq	24(%rsi),%rax,%r11
1100	adcxq	%rbx,%r9
1101
1102	mulxq	32(%rsi),%rbx,%r12
1103	adcxq	%rax,%r10
1104
1105	mulxq	40(%rsi),%rax,%r13
1106	adcxq	%rbx,%r11
1107
1108	mulxq	48(%rsi),%rbx,%r14
1109	adcxq	%rax,%r12
1110
1111	mulxq	56(%rsi),%rax,%r15
1112	adcxq	%rbx,%r13
1113	adcxq	%rax,%r14
/* A lone 0x67 prefix byte glued onto the next instruction; presumably code-alignment padding. */
1114.byte	0x67
1115	movq	%r8,%rbx
1116	adcxq	%rdi,%r15
1117
/* %rcx counts from -7 up to 0 and also indexes the store at 64(%rsp,%rcx,8). */
1118	movq	$-7,%rcx
1119	jmp	.Loop_mulx_gather
1120
1121.align	32
1122.Loop_mulx_gather:
1123	movdqa	0(%rbp),%xmm8
1124	movdqa	16(%rbp),%xmm9
1125	movdqa	32(%rbp),%xmm10
1126	movdqa	48(%rbp),%xmm11
1127	pand	%xmm0,%xmm8
1128	movdqa	64(%rbp),%xmm12
1129	pand	%xmm1,%xmm9
1130	movdqa	80(%rbp),%xmm13
1131	pand	%xmm2,%xmm10
1132	movdqa	96(%rbp),%xmm14
1133	pand	%xmm3,%xmm11
1134	movdqa	112(%rbp),%xmm15
1135	leaq	128(%rbp),%rbp
1136	pand	%xmm4,%xmm12
1137	pand	%xmm5,%xmm13
1138	pand	%xmm6,%xmm14
1139	pand	%xmm7,%xmm15
1140	por	%xmm10,%xmm8
1141	por	%xmm11,%xmm9
1142	por	%xmm12,%xmm8
1143	por	%xmm13,%xmm9
1144	por	%xmm14,%xmm8
1145	por	%xmm15,%xmm9
1146
1147	por	%xmm9,%xmm8
1148	pshufd	$0x4e,%xmm8,%xmm9
1149	por	%xmm9,%xmm8
/* Hand-encoded movq %xmm8,%rdx. */
1150.byte	102,76,15,126,194
1151
/* Hand-encoded mulxq 0(%rsi),%rax,%r8. */
1152.byte	0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00
1153	adcxq	%rax,%rbx
1154	adoxq	%r9,%r8
1155
1156	mulxq	8(%rsi),%rax,%r9
1157	adcxq	%rax,%r8
1158	adoxq	%r10,%r9
1159
1160	mulxq	16(%rsi),%rax,%r10
1161	adcxq	%rax,%r9
1162	adoxq	%r11,%r10
1163
/* Hand-encoded mulxq 24(%rsi),%rax,%r11. */
1164.byte	0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00
1165	adcxq	%rax,%r10
1166	adoxq	%r12,%r11
1167
1168	mulxq	32(%rsi),%rax,%r12
1169	adcxq	%rax,%r11
1170	adoxq	%r13,%r12
1171
1172	mulxq	40(%rsi),%rax,%r13
1173	adcxq	%rax,%r12
1174	adoxq	%r14,%r13
1175
/* Hand-encoded mulxq 48(%rsi),%rax,%r14. */
1176.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
1177	adcxq	%rax,%r13
/* A lone 0x67 prefix byte glued onto the next instruction; presumably code-alignment padding. */
1178.byte	0x67
1179	adoxq	%r15,%r14
1180
1181	mulxq	56(%rsi),%rax,%r15
/* Retire one finished product limb; with %rcx = -7..-1 this writes 8(%rsp)..56(%rsp). */
1182	movq	%rbx,64(%rsp,%rcx,8)
1183	adcxq	%rax,%r14
1184	adoxq	%rdi,%r15
1185	movq	%r8,%rbx
1186	adcxq	%rdi,%r15
1187
1188	incq	%rcx
1189	jnz	.Loop_mulx_gather
1190
/* Flush the eight limbs still held in registers: product limbs 8..15. */
1191	movq	%r8,64(%rsp)
1192	movq	%r9,64+8(%rsp)
1193	movq	%r10,64+16(%rsp)
1194	movq	%r11,64+24(%rsp)
1195	movq	%r12,64+32(%rsp)
1196	movq	%r13,64+40(%rsp)
1197	movq	%r14,64+48(%rsp)
1198	movq	%r15,64+56(%rsp)
1199
/* Reload the saved arguments; __rsaz_512_reducex takes 128(%rsp) in %rdx. */
1200	movq	128(%rsp),%rdx
1201	movq	128+8(%rsp),%rdi
1202	movq	128+16(%rsp),%rbp
1203
1204	movq	(%rsp),%r8
1205	movq	8(%rsp),%r9
1206	movq	16(%rsp),%r10
1207	movq	24(%rsp),%r11
1208	movq	32(%rsp),%r12
1209	movq	40(%rsp),%r13
1210	movq	48(%rsp),%r14
1211	movq	56(%rsp),%r15
1212
1213	call	__rsaz_512_reducex
1214
1215.Lmul_gather_tail:
/*
 * Add the high eight product limbs. sbbq turns the final carry into an
 * all-ones/zero mask in %rcx for __rsaz_512_subtract.
 */
1216	addq	64(%rsp),%r8
1217	adcq	72(%rsp),%r9
1218	adcq	80(%rsp),%r10
1219	adcq	88(%rsp),%r11
1220	adcq	96(%rsp),%r12
1221	adcq	104(%rsp),%r13
1222	adcq	112(%rsp),%r14
1223	adcq	120(%rsp),%r15
1224	sbbq	%rcx,%rcx
1225
1226	call	__rsaz_512_subtract
1227
/* Epilogue: restore the callee-saved registers and unwind the frame. */
1228	leaq	128+24+48(%rsp),%rax
1229.cfi_def_cfa	%rax,8
1230	movq	-48(%rax),%r15
1231.cfi_restore	%r15
1232	movq	-40(%rax),%r14
1233.cfi_restore	%r14
1234	movq	-32(%rax),%r13
1235.cfi_restore	%r13
1236	movq	-24(%rax),%r12
1237.cfi_restore	%r12
1238	movq	-16(%rax),%rbp
1239.cfi_restore	%rbp
1240	movq	-8(%rax),%rbx
1241.cfi_restore	%rbx
1242	leaq	(%rax),%rsp
1243.cfi_def_cfa_register	%rsp
1244.Lmul_gather4_epilogue:
/* Bytes 0xf3,0xc3 encode "rep ret". */
1245	.byte	0xf3,0xc3
1246.cfi_endproc
1247.size	rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
/*
 * rsaz_512_mul_scatter4: 8x8-limb multiplication, reduce and subtract as in
 * rsaz_512_mul, after which the eight result limbs are also written into a
 * table at a 128-byte stride.
 *
 * Register use on entry, as read by the code below:
 *   %rdi  destination AND second operand: its first limb is preloaded and
 *         its address passed in %rbp to the multiply helper
 *   %rsi  first operand, passed through to __rsaz_512_mul[x]
 *   %rdx  pointer parked in %xmm1, later handed to the reduce and subtract
 *         helpers in %rbp
 *   %rcx  64-bit value saved at 128(%rsp); the reduce helpers multiply by it
 *   %r8   table base
 *   %r9d  table index, zero-extended; limbs land at %r8 + 8*index + 128*i
 * NOTE(review): presumably (ret, a, modulus, n0, table, power) as declared
 * in OpenSSL's rsaz_exp.c - confirm.
 */
1248.globl	rsaz_512_mul_scatter4
1249.type	rsaz_512_mul_scatter4,@function
1250.align	32
1251rsaz_512_mul_scatter4:
1252.cfi_startproc
1253	pushq	%rbx
1254.cfi_adjust_cfa_offset	8
1255.cfi_offset	%rbx,-16
1256	pushq	%rbp
1257.cfi_adjust_cfa_offset	8
1258.cfi_offset	%rbp,-24
1259	pushq	%r12
1260.cfi_adjust_cfa_offset	8
1261.cfi_offset	%r12,-32
1262	pushq	%r13
1263.cfi_adjust_cfa_offset	8
1264.cfi_offset	%r13,-40
1265	pushq	%r14
1266.cfi_adjust_cfa_offset	8
1267.cfi_offset	%r14,-48
1268	pushq	%r15
1269.cfi_adjust_cfa_offset	8
1270.cfi_offset	%r15,-56
1271
/* Writing %r9d clears the upper 32 bits of %r9 before it is used as an index. */
1272	movl	%r9d,%r9d
1273	subq	$128+24,%rsp
1274.cfi_adjust_cfa_offset	128+24
1275.Lmul_scatter4_body:
/* %r8 = address of the first table slot to write. */
1276	leaq	(%r8,%r9,8),%r8
/* Hand-encoded movq %rdi,%xmm0, movq %rdx,%xmm1 and movq %r8,%xmm2. */
1277.byte	102,72,15,110,199
1278.byte	102,72,15,110,202
1279.byte	102,73,15,110,208
1280	movq	%rcx,128(%rsp)
1281
1282	movq	%rdi,%rbp
/*
 * Take the mulx path only when both bits of mask 0x80100 are set in the
 * third dword of OPENSSL_ia32cap_P.
 * NOTE(review): presumably the BMI2 and ADX feature bits - confirm.
 */
1283	movl	$0x80100,%r11d
1284	andl	OPENSSL_ia32cap_P+8(%rip),%r11d
1285	cmpl	$0x80100,%r11d
1286	je	.Lmulx_scatter
/* mulq path: %rbx = first limb of the second operand. */
1287	movq	(%rdi),%rbx
1288	call	__rsaz_512_mul
1289
/* Hand-encoded movq %xmm0,%rdi and movq %xmm1,%rbp: restore the parked pointers. */
1290.byte	102,72,15,126,199
1291.byte	102,72,15,126,205
1292
1293	movq	(%rsp),%r8
1294	movq	8(%rsp),%r9
1295	movq	16(%rsp),%r10
1296	movq	24(%rsp),%r11
1297	movq	32(%rsp),%r12
1298	movq	40(%rsp),%r13
1299	movq	48(%rsp),%r14
1300	movq	56(%rsp),%r15
1301
1302	call	__rsaz_512_reduce
1303	jmp	.Lmul_scatter_tail
1304
1305.align	32
/* mulx path: %rdx = first limb of the second operand. */
1306.Lmulx_scatter:
1307	movq	(%rdi),%rdx
1308	call	__rsaz_512_mulx
1309
/* Hand-encoded movq %xmm0,%rdi and movq %xmm1,%rbp. */
1310.byte	102,72,15,126,199
1311.byte	102,72,15,126,205
1312
/* __rsaz_512_reducex takes the value saved at 128(%rsp) in %rdx. */
1313	movq	128(%rsp),%rdx
1314	movq	(%rsp),%r8
1315	movq	8(%rsp),%r9
1316	movq	16(%rsp),%r10
1317	movq	24(%rsp),%r11
1318	movq	32(%rsp),%r12
1319	movq	40(%rsp),%r13
1320	movq	48(%rsp),%r14
1321	movq	56(%rsp),%r15
1322
1323	call	__rsaz_512_reducex
1324
1325.Lmul_scatter_tail:
/* Add the high eight product limbs; the final carry becomes the mask in %rcx. */
1326	addq	64(%rsp),%r8
1327	adcq	72(%rsp),%r9
1328	adcq	80(%rsp),%r10
1329	adcq	88(%rsp),%r11
1330	adcq	96(%rsp),%r12
1331	adcq	104(%rsp),%r13
1332	adcq	112(%rsp),%r14
1333	adcq	120(%rsp),%r15
/* Hand-encoded movq %xmm2,%rsi: table slot address. The move leaves CF intact for the sbbq below. */
1334.byte	102,72,15,126,214
1335	sbbq	%rcx,%rcx
1336
1337	call	__rsaz_512_subtract
1338
/* Scatter the result limbs left in %r8..%r15 into the table, one every 128 bytes. */
1339	movq	%r8,0(%rsi)
1340	movq	%r9,128(%rsi)
1341	movq	%r10,256(%rsi)
1342	movq	%r11,384(%rsi)
1343	movq	%r12,512(%rsi)
1344	movq	%r13,640(%rsi)
1345	movq	%r14,768(%rsi)
1346	movq	%r15,896(%rsi)
1347
/* Epilogue: restore the callee-saved registers and unwind the frame. */
1348	leaq	128+24+48(%rsp),%rax
1349.cfi_def_cfa	%rax,8
1350	movq	-48(%rax),%r15
1351.cfi_restore	%r15
1352	movq	-40(%rax),%r14
1353.cfi_restore	%r14
1354	movq	-32(%rax),%r13
1355.cfi_restore	%r13
1356	movq	-24(%rax),%r12
1357.cfi_restore	%r12
1358	movq	-16(%rax),%rbp
1359.cfi_restore	%rbp
1360	movq	-8(%rax),%rbx
1361.cfi_restore	%rbx
1362	leaq	(%rax),%rsp
1363.cfi_def_cfa_register	%rsp
1364.Lmul_scatter4_epilogue:
/* Bytes 0xf3,0xc3 encode "rep ret". */
1365	.byte	0xf3,0xc3
1366.cfi_endproc
1367.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
/*
 * rsaz_512_mul_by_one: run the reduce helper on an 8-limb input with no
 * preceding multiplication, and store the eight resulting limbs.
 *
 * Register use on entry, as read by the code below:
 *   %rdi  destination, eight limbs
 *   %rsi  source, eight limbs loaded into %r8..%r15
 *   %rdx  pointer handed to the reduce helper in %rbp
 *   %rcx  64-bit value saved at 128(%rsp); the reduce helpers multiply by it
 * NOTE(review): presumably (ret, a, modulus, n0), used to convert a value
 * out of Montgomery form - confirm against OpenSSL's rsaz_exp.c.
 *
 * Unlike the other entry points, this one does not call
 * __rsaz_512_subtract.
 */
1368.globl	rsaz_512_mul_by_one
1369.type	rsaz_512_mul_by_one,@function
1370.align	32
1371rsaz_512_mul_by_one:
1372.cfi_startproc
1373	pushq	%rbx
1374.cfi_adjust_cfa_offset	8
1375.cfi_offset	%rbx,-16
1376	pushq	%rbp
1377.cfi_adjust_cfa_offset	8
1378.cfi_offset	%rbp,-24
1379	pushq	%r12
1380.cfi_adjust_cfa_offset	8
1381.cfi_offset	%r12,-32
1382	pushq	%r13
1383.cfi_adjust_cfa_offset	8
1384.cfi_offset	%r13,-40
1385	pushq	%r14
1386.cfi_adjust_cfa_offset	8
1387.cfi_offset	%r14,-48
1388	pushq	%r15
1389.cfi_adjust_cfa_offset	8
1390.cfi_offset	%r15,-56
1391
1392	subq	$128+24,%rsp
1393.cfi_adjust_cfa_offset	128+24
1394.Lmul_by_one_body:
/* Capability word is fetched early and tested after the stack stores below. */
1395	movl	OPENSSL_ia32cap_P+8(%rip),%eax
1396	movq	%rdx,%rbp
1397	movq	%rcx,128(%rsp)
1398
1399	movq	(%rsi),%r8
1400	pxor	%xmm0,%xmm0
1401	movq	8(%rsi),%r9
1402	movq	16(%rsi),%r10
1403	movq	24(%rsi),%r11
1404	movq	32(%rsi),%r12
1405	movq	40(%rsi),%r13
1406	movq	48(%rsi),%r14
1407	movq	56(%rsi),%r15
1408
/* Zero 0..111(%rsp), the first 112 bytes of the stack buffer. */
1409	movdqa	%xmm0,(%rsp)
1410	movdqa	%xmm0,16(%rsp)
1411	movdqa	%xmm0,32(%rsp)
1412	movdqa	%xmm0,48(%rsp)
1413	movdqa	%xmm0,64(%rsp)
1414	movdqa	%xmm0,80(%rsp)
1415	movdqa	%xmm0,96(%rsp)
/*
 * Take the mulx variant only when both bits of mask 0x80100 are set.
 * NOTE(review): presumably the BMI2 and ADX feature bits - confirm.
 */
1416	andl	$0x80100,%eax
1417	cmpl	$0x80100,%eax
1418	je	.Lby_one_callx
1419	call	__rsaz_512_reduce
1420	jmp	.Lby_one_tail
1421.align	32
1422.Lby_one_callx:
/* __rsaz_512_reducex takes the value saved at 128(%rsp) in %rdx. */
1423	movq	128(%rsp),%rdx
1424	call	__rsaz_512_reducex
1425.Lby_one_tail:
/* Store the helper's output limbs. */
1426	movq	%r8,(%rdi)
1427	movq	%r9,8(%rdi)
1428	movq	%r10,16(%rdi)
1429	movq	%r11,24(%rdi)
1430	movq	%r12,32(%rdi)
1431	movq	%r13,40(%rdi)
1432	movq	%r14,48(%rdi)
1433	movq	%r15,56(%rdi)
1434
/* Epilogue: restore the callee-saved registers and unwind the frame. */
1435	leaq	128+24+48(%rsp),%rax
1436.cfi_def_cfa	%rax,8
1437	movq	-48(%rax),%r15
1438.cfi_restore	%r15
1439	movq	-40(%rax),%r14
1440.cfi_restore	%r14
1441	movq	-32(%rax),%r13
1442.cfi_restore	%r13
1443	movq	-24(%rax),%r12
1444.cfi_restore	%r12
1445	movq	-16(%rax),%rbp
1446.cfi_restore	%rbp
1447	movq	-8(%rax),%rbx
1448.cfi_restore	%rbx
1449	leaq	(%rax),%rsp
1450.cfi_def_cfa_register	%rsp
1451.Lmul_by_one_epilogue:
/* Bytes 0xf3,0xc3 encode "rep ret". */
1452	.byte	0xf3,0xc3
1453.cfi_endproc
1454.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
/*
 * __rsaz_512_reduce: internal helper (mulq variant) with a private register
 * contract instead of the C calling convention.
 *
 * In:   %r8..%r15  eight 64-bit limbs
 *       %rbp       pointer to eight 64-bit limbs used as multiplicand
 *       the 64-bit value the caller saved at its 128(%rsp); because of the
 *       pushed return address it is addressed here as 128+8(%rsp)
 * Out:  %r8..%r15  eight limbs after eight reduction passes
 * Uses: %rax, %rbx, %rcx, %rdx, %rsi, flags
 *
 * Each pass computes a multiplier %rbx = low limb * saved value, adds
 * %rbx * (%rbp)[0..7] to the accumulator and shifts it down by one limb.
 * NOTE(review): presumably word-by-word Montgomery reduction with
 * %rbp = modulus and the saved value = n0 - confirm.
 */
1455.type	__rsaz_512_reduce,@function
1456.align	32
1457__rsaz_512_reduce:
1458.cfi_startproc
1459	movq	%r8,%rbx
1460	imulq	128+8(%rsp),%rbx
1461	movq	0(%rbp),%rax
1462	movl	$8,%ecx
1463	jmp	.Lreduction_loop
1464
1465.align	32
1466.Lreduction_loop:
/*
 * The low product limb is never added explicitly: negq sets CF exactly when
 * %r8 is non-zero, and that carry is folded into the high half instead.
 * NOTE(review): this relies on the low limb of the sum being zero by the
 * choice of multiplier - confirm.
 */
1467	mulq	%rbx
1468	movq	8(%rbp),%rax
1469	negq	%r8
1470	movq	%rdx,%r8
1471	adcq	$0,%r8
1472
1473	mulq	%rbx
1474	addq	%rax,%r9
1475	movq	16(%rbp),%rax
1476	adcq	$0,%rdx
1477	addq	%r9,%r8
1478	movq	%rdx,%r9
1479	adcq	$0,%r9
1480
1481	mulq	%rbx
1482	addq	%rax,%r10
1483	movq	24(%rbp),%rax
1484	adcq	$0,%rdx
1485	addq	%r10,%r9
1486	movq	%rdx,%r10
1487	adcq	$0,%r10
1488
1489	mulq	%rbx
1490	addq	%rax,%r11
1491	movq	32(%rbp),%rax
1492	adcq	$0,%rdx
1493	addq	%r11,%r10
/* Start computing the next pass's multiplier; this load does not touch the flags. */
1494	movq	128+8(%rsp),%rsi
1495
1496
1497	adcq	$0,%rdx
1498	movq	%rdx,%r11
1499
1500	mulq	%rbx
1501	addq	%rax,%r12
1502	movq	40(%rbp),%rax
1503	adcq	$0,%rdx
/* %rsi = new low limb (%r8) * saved value; it becomes %rbx at the end of the pass. */
1504	imulq	%r8,%rsi
1505	addq	%r12,%r11
1506	movq	%rdx,%r12
1507	adcq	$0,%r12
1508
1509	mulq	%rbx
1510	addq	%rax,%r13
1511	movq	48(%rbp),%rax
1512	adcq	$0,%rdx
1513	addq	%r13,%r12
1514	movq	%rdx,%r13
1515	adcq	$0,%r13
1516
1517	mulq	%rbx
1518	addq	%rax,%r14
1519	movq	56(%rbp),%rax
1520	adcq	$0,%rdx
1521	addq	%r14,%r13
1522	movq	%rdx,%r14
1523	adcq	$0,%r14
1524
1525	mulq	%rbx
1526	movq	%rsi,%rbx
1527	addq	%rax,%r15
1528	movq	0(%rbp),%rax
1529	adcq	$0,%rdx
1530	addq	%r15,%r14
1531	movq	%rdx,%r15
1532	adcq	$0,%r15
1533
1534	decl	%ecx
1535	jne	.Lreduction_loop
1536
/* Bytes 0xf3,0xc3 encode "rep ret". */
1537	.byte	0xf3,0xc3
1538.cfi_endproc
1539.size	__rsaz_512_reduce,.-__rsaz_512_reduce
/*
 * __rsaz_512_reducex: mulx/adcx/adox variant of __rsaz_512_reduce, with a
 * private register contract instead of the C calling convention.
 *
 * In:   %r8..%r15  eight 64-bit limbs
 *       %rbp       pointer to eight 64-bit limbs used as multiplicand
 *       %rdx       the value the caller saved at its 128(%rsp); the same
 *                  slot is re-read here as 128+8(%rsp) because of the
 *                  pushed return address
 * Out:  %r8..%r15  eight limbs after eight reduction passes
 * Uses: %rax, %rbx, %rcx, %rdx, %rsi, flags
 *
 * %rsi is zeroed and used as a zero operand; the xorq also clears CF and OF
 * before the first adcx/adox.
 * NOTE(review): presumably word-by-word Montgomery reduction with
 * %rbp = modulus and the saved value = n0 - confirm.
 */
1540.type	__rsaz_512_reducex,@function
1541.align	32
1542__rsaz_512_reducex:
1543.cfi_startproc
1544
/* First multiplier: %rdx = low limb * saved value. */
1545	imulq	%r8,%rdx
1546	xorq	%rsi,%rsi
1547	movl	$8,%ecx
1548	jmp	.Lreduction_loopx
1549
1550.align	32
1551.Lreduction_loopx:
/* The low limb is added only to produce the carry; the sum in %rax is discarded. */
1552	movq	%r8,%rbx
1553	mulxq	0(%rbp),%rax,%r8
1554	adcxq	%rbx,%rax
1555	adoxq	%r9,%r8
1556
1557	mulxq	8(%rbp),%rax,%r9
1558	adcxq	%rax,%r8
1559	adoxq	%r10,%r9
1560
1561	mulxq	16(%rbp),%rbx,%r10
1562	adcxq	%rbx,%r9
1563	adoxq	%r11,%r10
1564
1565	mulxq	24(%rbp),%rbx,%r11
1566	adcxq	%rbx,%r10
1567	adoxq	%r12,%r11
1568
/* Hand-encoded mulxq 32(%rbp),%rbx,%r12. */
1569.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
/* Park the current multiplier in %rax while %rdx is borrowed to compute the next one. */
1570	movq	%rdx,%rax
1571	movq	%r8,%rdx
1572	adcxq	%rbx,%r11
1573	adoxq	%r13,%r12
1574
/* %rbx = low half of (new low limb * saved value); the high half lands in %rdx and is overwritten at once. */
1575	mulxq	128+8(%rsp),%rbx,%rdx
1576	movq	%rax,%rdx
1577
1578	mulxq	40(%rbp),%rax,%r13
1579	adcxq	%rax,%r12
1580	adoxq	%r14,%r13
1581
/* Hand-encoded mulxq 48(%rbp),%rax,%r14. */
1582.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00
1583	adcxq	%rax,%r13
1584	adoxq	%r15,%r14
1585
1586	mulxq	56(%rbp),%rax,%r15
/* Install the multiplier for the next pass. */
1587	movq	%rbx,%rdx
1588	adcxq	%rax,%r14
1589	adoxq	%rsi,%r15
1590	adcxq	%rsi,%r15
1591
1592	decl	%ecx
1593	jne	.Lreduction_loopx
1594
/* Bytes 0xf3,0xc3 encode "rep ret". */
1595	.byte	0xf3,0xc3
1596.cfi_endproc
1597.size	__rsaz_512_reducex,.-__rsaz_512_reducex
/*
 * __rsaz_512_subtract: internal helper that stores an 8-limb value and then
 * adds a masked, complemented copy of the limbs at (%rbp) to it, without
 * branching on the mask.
 *
 * In:   %r8..%r15  eight 64-bit limbs
 *       %rdi       destination, eight limbs
 *       %rbp       pointer to eight 64-bit limbs
 *       %rcx       mask; the callers in this file pass all-ones or zero
 *                  (produced by sbbq %rcx,%rcx)
 * Out:  (%rdi) and %r8..%r15 hold the input plus
 *       mask AND (neg of limb 0, not of limbs 1..7)
 * Uses: flags
 *
 * NOTE(review): neg on limb 0 with not on the others equals the multi-limb
 * two's complement only when limb 0 is non-zero; presumably (%rbp) is an
 * odd modulus, making this a conditional subtraction - confirm.
 */
1598.type	__rsaz_512_subtract,@function
1599.align	32
1600__rsaz_512_subtract:
1601.cfi_startproc
/* Store the unmodified value first. */
1602	movq	%r8,(%rdi)
1603	movq	%r9,8(%rdi)
1604	movq	%r10,16(%rdi)
1605	movq	%r11,24(%rdi)
1606	movq	%r12,32(%rdi)
1607	movq	%r13,40(%rdi)
1608	movq	%r14,48(%rdi)
1609	movq	%r15,56(%rdi)
1610
/* Build the masked complement of the limbs at (%rbp) in %r8..%r15. */
1611	movq	0(%rbp),%r8
1612	movq	8(%rbp),%r9
1613	negq	%r8
1614	notq	%r9
1615	andq	%rcx,%r8
1616	movq	16(%rbp),%r10
1617	andq	%rcx,%r9
1618	notq	%r10
1619	movq	24(%rbp),%r11
1620	andq	%rcx,%r10
1621	notq	%r11
1622	movq	32(%rbp),%r12
1623	andq	%rcx,%r11
1624	notq	%r12
1625	movq	40(%rbp),%r13
1626	andq	%rcx,%r12
1627	notq	%r13
1628	movq	48(%rbp),%r14
1629	andq	%rcx,%r13
1630	notq	%r14
1631	movq	56(%rbp),%r15
1632	andq	%rcx,%r14
1633	notq	%r15
1634	andq	%rcx,%r15
1635
/* Add the stored value back with a carry chain across all eight limbs. */
1636	addq	(%rdi),%r8
1637	adcq	8(%rdi),%r9
1638	adcq	16(%rdi),%r10
1639	adcq	24(%rdi),%r11
1640	adcq	32(%rdi),%r12
1641	adcq	40(%rdi),%r13
1642	adcq	48(%rdi),%r14
1643	adcq	56(%rdi),%r15
1644
/* Write the final limbs; they also remain in %r8..%r15 for the caller. */
1645	movq	%r8,(%rdi)
1646	movq	%r9,8(%rdi)
1647	movq	%r10,16(%rdi)
1648	movq	%r11,24(%rdi)
1649	movq	%r12,32(%rdi)
1650	movq	%r13,40(%rdi)
1651	movq	%r14,48(%rdi)
1652	movq	%r15,56(%rdi)
1653
/* Bytes 0xf3,0xc3 encode "rep ret". */
1654	.byte	0xf3,0xc3
1655.cfi_endproc
1656.size	__rsaz_512_subtract,.-__rsaz_512_subtract
/*
 * __rsaz_512_mul -- internal 8x8-word (512 x 512 -> 1024 bit) multiply
 * built on the legacy one-operand mulq (%rdx:%rax = %rax * operand).
 *
 * In:   %rsi = pointer to the eight 64-bit words a[0..7]
 *       %rbx = b[0], preloaded by the caller
 *       %rbp = pointer to b[0..7]; only b[1..7] are read from memory here
 * Out:  sixteen 64-bit result words stored at 8(%rsp)..8+120(%rsp), with
 *       %rsp taken at entry.  When entered through "call" this is the
 *       caller's (%rsp)..120(%rsp).
 *       %r8..%r15 also hold the upper eight result words on return.
 * Clobbers: %rax, %rdx, %rcx, %rbx, %rdi, %r8-%r15, flags;
 *           %rbp is advanced by 64 bytes.
 *
 * Structure: one row for b[0], then seven loop iterations for b[1..7].
 * Each row emits one finished low word to (%rdi) and keeps an eight-word
 * running window in %r8..%r15.
 */
1657.type	__rsaz_512_mul,@function
1658.align	32
1659__rsaz_512_mul:
1660.cfi_startproc
	/* %rdi = output cursor, starting at the slot just above the return address. */
1661	leaq	8(%rsp),%rdi
1662
	/* Row 0: a[0..7] * b[0].  Low word goes to memory, the rest to %r8..%r15. */
1663	movq	(%rsi),%rax
1664	mulq	%rbx
1665	movq	%rax,(%rdi)
1666	movq	8(%rsi),%rax
1667	movq	%rdx,%r8
1668
1669	mulq	%rbx
1670	addq	%rax,%r8
1671	movq	16(%rsi),%rax
1672	movq	%rdx,%r9
1673	adcq	$0,%r9
1674
1675	mulq	%rbx
1676	addq	%rax,%r9
1677	movq	24(%rsi),%rax
1678	movq	%rdx,%r10
1679	adcq	$0,%r10
1680
1681	mulq	%rbx
1682	addq	%rax,%r10
1683	movq	32(%rsi),%rax
1684	movq	%rdx,%r11
1685	adcq	$0,%r11
1686
1687	mulq	%rbx
1688	addq	%rax,%r11
1689	movq	40(%rsi),%rax
1690	movq	%rdx,%r12
1691	adcq	$0,%r12
1692
1693	mulq	%rbx
1694	addq	%rax,%r12
1695	movq	48(%rsi),%rax
1696	movq	%rdx,%r13
1697	adcq	$0,%r13
1698
1699	mulq	%rbx
1700	addq	%rax,%r13
1701	movq	56(%rsi),%rax
1702	movq	%rdx,%r14
1703	adcq	$0,%r14
1704
	/* Last product of row 0; %rax is reloaded with a[0] for the next row. */
1705	mulq	%rbx
1706	addq	%rax,%r14
1707	movq	(%rsi),%rax
1708	movq	%rdx,%r15
1709	adcq	$0,%r15
1710
	/* Step to b[1] and to the next output word; leaq leaves the flags alone. */
1711	leaq	8(%rbp),%rbp
1712	leaq	8(%rdi),%rdi
1713
	/* Seven remaining rows: b[1]..b[7]. */
1714	movl	$7,%ecx
1715	jmp	.Loop_mul
1716
1717.align	32
1718.Loop_mul:
	/*
	 * Loop invariant on entry: %rax = a[0], (%rbp) = current b word,
	 * %r8..%r15 = running window, (%rdi) = next output slot.
	 */
1719	movq	(%rbp),%rbx
1720	mulq	%rbx
1721	addq	%rax,%r8
1722	movq	8(%rsi),%rax
	/* %r8 is now final for this column: emit it, then reuse %r8 for the carry. */
1723	movq	%r8,(%rdi)
1724	movq	%rdx,%r8
1725	adcq	$0,%r8
1726
	/*
	 * Each following group adds the low product half into the old window
	 * word, folds that sum into the word below it, and keeps the high
	 * half plus carries as the new value of the old register.
	 */
1727	mulq	%rbx
1728	addq	%rax,%r9
1729	movq	16(%rsi),%rax
1730	adcq	$0,%rdx
1731	addq	%r9,%r8
1732	movq	%rdx,%r9
1733	adcq	$0,%r9
1734
1735	mulq	%rbx
1736	addq	%rax,%r10
1737	movq	24(%rsi),%rax
1738	adcq	$0,%rdx
1739	addq	%r10,%r9
1740	movq	%rdx,%r10
1741	adcq	$0,%r10
1742
1743	mulq	%rbx
1744	addq	%rax,%r11
1745	movq	32(%rsi),%rax
1746	adcq	$0,%rdx
1747	addq	%r11,%r10
1748	movq	%rdx,%r11
1749	adcq	$0,%r11
1750
1751	mulq	%rbx
1752	addq	%rax,%r12
1753	movq	40(%rsi),%rax
1754	adcq	$0,%rdx
1755	addq	%r12,%r11
1756	movq	%rdx,%r12
1757	adcq	$0,%r12
1758
1759	mulq	%rbx
1760	addq	%rax,%r13
1761	movq	48(%rsi),%rax
1762	adcq	$0,%rdx
1763	addq	%r13,%r12
1764	movq	%rdx,%r13
1765	adcq	$0,%r13
1766
1767	mulq	%rbx
1768	addq	%rax,%r14
1769	movq	56(%rsi),%rax
1770	adcq	$0,%rdx
1771	addq	%r14,%r13
1772	movq	%rdx,%r14
	/* Pointer bump sits between addq and adcq; safe because leaq does not touch CF. */
1773	leaq	8(%rbp),%rbp
1774	adcq	$0,%r14
1775
	/* Last product of the row; %rax is reloaded with a[0] for the next iteration. */
1776	mulq	%rbx
1777	addq	%rax,%r15
1778	movq	(%rsi),%rax
1779	adcq	$0,%rdx
1780	addq	%r15,%r14
1781	movq	%rdx,%r15
1782	adcq	$0,%r15
1783
1784	leaq	8(%rdi),%rdi
1785
1786	decl	%ecx
1787	jnz	.Loop_mul
1788
	/* Flush the window: these are the upper eight words of the product. */
1789	movq	%r8,(%rdi)
1790	movq	%r9,8(%rdi)
1791	movq	%r10,16(%rdi)
1792	movq	%r11,24(%rdi)
1793	movq	%r12,32(%rdi)
1794	movq	%r13,40(%rdi)
1795	movq	%r14,48(%rdi)
1796	movq	%r15,56(%rdi)
1797
	/* 0xf3,0xc3 is the two-byte encoding of "rep ret". */
1798	.byte	0xf3,0xc3
1799.cfi_endproc
1800.size	__rsaz_512_mul,.-__rsaz_512_mul
/*
 * __rsaz_512_mulx -- internal 8x8-word multiply using mulx/adcx/adox
 * (these instructions belong to the BMI2 and ADX extensions).
 *
 * AT&T operand order used below: "mulxq src,lo,hi" computes
 * hi:lo = %rdx * src and does not modify any flag.
 *
 * In:   %rdx = b[0], preloaded by the caller (implicit mulx operand)
 *       %rsi = pointer to the eight 64-bit words a[0..7]
 *       %rbp = pointer to b[0..7]; b[1..7] are read from memory here
 * Out:  sixteen 64-bit result words at 8(%rsp)..8+120(%rsp), with %rsp
 *       taken at entry; %r8..%r15 also hold the upper eight words.
 * Clobbers: %rax, %rbx, %rcx, %rdx, %rdi (left at zero), %r8-%r15, flags.
 *           %rsi and %rbp are only read.
 *
 * NOTE(review): the first adcq of row 0 consumes whatever CF the caller
 * left; nothing in this routine sets or clears CF before it.  Callers
 * presumably arrive with CF clear -- confirm at each call site.
 */
1801.type	__rsaz_512_mulx,@function
1802.align	32
1803__rsaz_512_mulx:
1804.cfi_startproc
	/* Row 0: a[0..7] * b[0], low halves alternate between %rbx and %rax. */
1805	mulxq	(%rsi),%rbx,%r8
	/* Loop counter: runs -6..-1, i.e. six iterations, and doubles as an index. */
1806	movq	$-6,%rcx
1807
1808	mulxq	8(%rsi),%rax,%r9
	/* Result word 0. */
1809	movq	%rbx,8(%rsp)
1810
1811	mulxq	16(%rsi),%rbx,%r10
1812	adcq	%rax,%r8
1813
1814	mulxq	24(%rsi),%rax,%r11
1815	adcq	%rbx,%r9
1816
1817	mulxq	32(%rsi),%rbx,%r12
1818	adcq	%rax,%r10
1819
1820	mulxq	40(%rsi),%rax,%r13
1821	adcq	%rbx,%r11
1822
1823	mulxq	48(%rsi),%rbx,%r14
1824	adcq	%rax,%r12
1825
1826	mulxq	56(%rsi),%rax,%r15
	/* Load b[1] for the first loop iteration; movq does not disturb CF. */
1827	movq	8(%rbp),%rdx
1828	adcq	%rbx,%r13
1829	adcq	%rax,%r14
1830	adcq	$0,%r15
1831
	/* %rdi = 0 is the zero addend below; xorq also clears both CF and OF. */
1832	xorq	%rdi,%rdi
1833	jmp	.Loop_mulx
1834
1835.align	32
1836.Loop_mulx:
	/*
	 * One row per iteration (b[1]..b[6]).  Two independent carry chains:
	 * adcx adds the low product halves through CF, adox adds the previous
	 * window word into the high half through OF.  %rbx collects the
	 * finished low word of the row.
	 */
1837	movq	%r8,%rbx
1838	mulxq	(%rsi),%rax,%r8
1839	adcxq	%rax,%rbx
1840	adoxq	%r9,%r8
1841
1842	mulxq	8(%rsi),%rax,%r9
1843	adcxq	%rax,%r8
1844	adoxq	%r10,%r9
1845
1846	mulxq	16(%rsi),%rax,%r10
1847	adcxq	%rax,%r9
1848	adoxq	%r11,%r10
1849
1850	mulxq	24(%rsi),%rax,%r11
1851	adcxq	%rax,%r10
1852	adoxq	%r12,%r11
1853
	/* Hand-encoded: 0x3e prefix + "mulxq 32(%rsi),%rax,%r12" with a 32-bit displacement. */
1854.byte	0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00
1855	adcxq	%rax,%r11
1856	adoxq	%r13,%r12
1857
1858	mulxq	40(%rsi),%rax,%r13
1859	adcxq	%rax,%r12
1860	adoxq	%r14,%r13
1861
1862	mulxq	48(%rsi),%rax,%r14
1863	adcxq	%rax,%r13
1864	adoxq	%r15,%r14
1865
1866	mulxq	56(%rsi),%rax,%r15
	/* Next multiplier word: %rcx = -6..-1 addresses 16(%rbp)..56(%rbp). */
1867	movq	64(%rbp,%rcx,8),%rdx
	/* Finished low word: %rcx = -6..-1 addresses 16(%rsp)..56(%rsp). */
1868	movq	%rbx,8+64-8(%rsp,%rcx,8)
1869	adcxq	%rax,%r14
	/* Fold both chains' final carries into the top word (%rdi is zero). */
1870	adoxq	%rdi,%r15
1871	adcxq	%rdi,%r15
1872
	/* incq does not modify CF. */
1873	incq	%rcx
1874	jnz	.Loop_mulx
1875
	/* Final row, using the multiplier word loaded by the last iteration (56(%rbp)). */
1876	movq	%r8,%rbx
1877	mulxq	(%rsi),%rax,%r8
1878	adcxq	%rax,%rbx
1879	adoxq	%r9,%r8
1880
	/* Hand-encoded "mulxq 8(%rsi),%rax,%r9" with a 32-bit displacement. */
1881.byte	0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00
1882	adcxq	%rax,%r8
1883	adoxq	%r10,%r9
1884
	/* Hand-encoded "mulxq 16(%rsi),%rax,%r10" with a 32-bit displacement. */
1885.byte	0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00
1886	adcxq	%rax,%r9
1887	adoxq	%r11,%r10
1888
1889	mulxq	24(%rsi),%rax,%r11
1890	adcxq	%rax,%r10
1891	adoxq	%r12,%r11
1892
1893	mulxq	32(%rsi),%rax,%r12
1894	adcxq	%rax,%r11
1895	adoxq	%r13,%r12
1896
1897	mulxq	40(%rsi),%rax,%r13
1898	adcxq	%rax,%r12
1899	adoxq	%r14,%r13
1900
	/* Hand-encoded "mulxq 48(%rsi),%rax,%r14" with a 32-bit displacement. */
1901.byte	0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00
1902	adcxq	%rax,%r13
1903	adoxq	%r15,%r14
1904
	/* Hand-encoded "mulxq 56(%rsi),%rax,%r15" with a 32-bit displacement. */
1905.byte	0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00
1906	adcxq	%rax,%r14
1907	adoxq	%rdi,%r15
1908	adcxq	%rdi,%r15
1909
	/* Result word 7, then the upper eight words from the window. */
1910	movq	%rbx,8+64-8(%rsp)
1911	movq	%r8,8+64(%rsp)
1912	movq	%r9,8+64+8(%rsp)
1913	movq	%r10,8+64+16(%rsp)
1914	movq	%r11,8+64+24(%rsp)
1915	movq	%r12,8+64+32(%rsp)
1916	movq	%r13,8+64+40(%rsp)
1917	movq	%r14,8+64+48(%rsp)
1918	movq	%r15,8+64+56(%rsp)
1919
	/* 0xf3,0xc3 is the two-byte encoding of "rep ret". */
1920	.byte	0xf3,0xc3
1921.cfi_endproc
1922.size	__rsaz_512_mulx,.-__rsaz_512_mulx
/*
 * rsaz_512_scatter4 -- exported; stores eight 64-bit words into a table
 * with a 128-byte stride between consecutive words.
 *
 * In:   %rdi = table base
 *       %rsi = pointer to eight source qwords
 *       %rdx = column index, scaled by 8 bytes
 * Effect: source word i (i = 0..7) is written to %rdi + 8*%rdx + 128*i.
 *         The 128-byte stride leaves room for 16 qword columns per row,
 *         matching the 128-byte rows that rsaz_512_gather4 reads.
 * Clobbers: %rax, %rsi, %rdi, %r9, flags.
 *
 * NOTE(review): the address computation uses all 64 bits of %rdx, whereas
 * rsaz_512_gather4 reads only %edx.  If the C prototype passes a 32-bit
 * int, this relies on the caller leaving the upper half clean -- confirm.
 */
1923.globl	rsaz_512_scatter4
1924.type	rsaz_512_scatter4,@function
1925.align	16
1926rsaz_512_scatter4:
1927.cfi_startproc
	/* Point %rdi at the selected column of row 0. */
1928	leaq	(%rdi,%rdx,8),%rdi
	/* Eight words to copy. */
1929	movl	$8,%r9d
1930	jmp	.Loop_scatter
1931.align	16
1932.Loop_scatter:
	/* Copy one qword, then step the source by 8 and the table by one row. */
1933	movq	(%rsi),%rax
1934	leaq	8(%rsi),%rsi
1935	movq	%rax,(%rdi)
1936	leaq	128(%rdi),%rdi
1937	decl	%r9d
1938	jnz	.Loop_scatter
	/* 0xf3,0xc3 is the two-byte encoding of "rep ret". */
1939	.byte	0xf3,0xc3
1940.cfi_endproc
1941.size	rsaz_512_scatter4,.-rsaz_512_scatter4
1942
/*
 * rsaz_512_gather4 -- exported; reads back one column of eight 64-bit
 * words from a table of 128-byte rows (16 qword columns per row).
 *
 * In:   %rdi = destination (eight qwords are written)
 *       %rsi = table base; loaded with movdqa, so it must be 16-byte aligned
 *       %edx = column index; only 0..15 match a mask
 * Effect: for each of eight rows, all 128 bytes of the row are loaded,
 *         ANDed with per-column equality masks, ORed together, and the
 *         surviving qword is stored to the destination.  Every row is read
 *         in full whatever the index is; neither the addresses nor the
 *         branch depend on %edx.  An index outside 0..15 matches no mask
 *         and yields all-zero output.
 * Clobbers: %xmm0-%xmm15, %rsi, %rdi, %r9, flags.
 */
1943.globl	rsaz_512_gather4
1944.type	rsaz_512_gather4,@function
1945.align	16
1946rsaz_512_gather4:
1947.cfi_startproc
	/* %xmm1 = {2,2,2,2} (step), %xmm0 = {0,0,1,1} (column ids of the first 16 bytes). */
1948	movd	%edx,%xmm8
1949	movdqa	.Linc+16(%rip),%xmm1
1950	movdqa	.Linc(%rip),%xmm0
1951
	/* Broadcast the index to all four dwords of %xmm8. */
1952	pshufd	$0,%xmm8,%xmm8
	/*
	 * Build eight masks, %xmm0..%xmm7.  Each paddd derives the next pair
	 * of column ids ({2,2,3,3}, {4,4,5,5}, ... {14,14,15,15}) from the
	 * previous pair plus the step kept in %xmm7; each pcmpeqd then turns
	 * a finished id vector into an all-ones/all-zero mask per 64-bit lane.
	 * An id vector is only overwritten by its compare after the next one
	 * has been derived from it, so the order below matters.
	 */
1953	movdqa	%xmm1,%xmm7
1954	movdqa	%xmm1,%xmm2
1955	paddd	%xmm0,%xmm1
1956	pcmpeqd	%xmm8,%xmm0
1957	movdqa	%xmm7,%xmm3
1958	paddd	%xmm1,%xmm2
1959	pcmpeqd	%xmm8,%xmm1
1960	movdqa	%xmm7,%xmm4
1961	paddd	%xmm2,%xmm3
1962	pcmpeqd	%xmm8,%xmm2
1963	movdqa	%xmm7,%xmm5
1964	paddd	%xmm3,%xmm4
1965	pcmpeqd	%xmm8,%xmm3
1966	movdqa	%xmm7,%xmm6
1967	paddd	%xmm4,%xmm5
1968	pcmpeqd	%xmm8,%xmm4
1969	paddd	%xmm5,%xmm6
1970	pcmpeqd	%xmm8,%xmm5
1971	paddd	%xmm6,%xmm7
1972	pcmpeqd	%xmm8,%xmm6
1973	pcmpeqd	%xmm8,%xmm7
	/* Eight rows to process; %xmm8 is free for reuse from here on. */
1974	movl	$8,%r9d
1975	jmp	.Loop_gather
1976.align	16
1977.Loop_gather:
	/* Load the whole 128-byte row and mask each 16-byte slice. */
1978	movdqa	0(%rsi),%xmm8
1979	movdqa	16(%rsi),%xmm9
1980	movdqa	32(%rsi),%xmm10
1981	movdqa	48(%rsi),%xmm11
1982	pand	%xmm0,%xmm8
1983	movdqa	64(%rsi),%xmm12
1984	pand	%xmm1,%xmm9
1985	movdqa	80(%rsi),%xmm13
1986	pand	%xmm2,%xmm10
1987	movdqa	96(%rsi),%xmm14
1988	pand	%xmm3,%xmm11
1989	movdqa	112(%rsi),%xmm15
1990	leaq	128(%rsi),%rsi
1991	pand	%xmm4,%xmm12
1992	pand	%xmm5,%xmm13
1993	pand	%xmm6,%xmm14
1994	pand	%xmm7,%xmm15
	/* OR-reduce the eight masked slices in two parallel chains. */
1995	por	%xmm10,%xmm8
1996	por	%xmm11,%xmm9
1997	por	%xmm12,%xmm8
1998	por	%xmm13,%xmm9
1999	por	%xmm14,%xmm8
2000	por	%xmm15,%xmm9
2001
2002	por	%xmm9,%xmm8
	/* 0x4e swaps the two 64-bit halves, so the OR folds the high lane onto the low one. */
2003	pshufd	$0x4e,%xmm8,%xmm9
2004	por	%xmm9,%xmm8
	/* Store the low 64 bits: the selected word of this row. */
2005	movq	%xmm8,(%rdi)
2006	leaq	8(%rdi),%rdi
2007	decl	%r9d
2008	jnz	.Loop_gather
	/* 0xf3,0xc3 is the two-byte encoding of "rep ret". */
2009	.byte	0xf3,0xc3
	/* End-of-function marker label; it is not referenced anywhere in the visible part of this file. */
2010.LSEH_end_rsaz_512_gather4:
2011.cfi_endproc
2012.size	rsaz_512_gather4,.-rsaz_512_gather4
2013
/*
 * .Linc -- constants read by rsaz_512_gather4 with aligned 16-byte loads.
 * First row {0,0,1,1}: column ids for the two qwords of the first slice.
 * Second row {2,2,2,2} (at .Linc+16): step added to reach the next slice.
 */
2014.align	64
2015.Linc:
2016.long	0,0, 1,1
2017.long	2,2, 2,2
2018