xref: /freebsd/sys/crypto/openssl/amd64/x86_64-mont5.S (revision c203bd70b5957f85616424b6fa374479372d06e3)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
3.text
4
5
6
/*
 * bn_mul_mont_gather5
 *
 * Word-at-a-time multiply-and-reduce loop whose multiplier words are fetched
 * from a table through compare-generated masks instead of by direct indexing.
 * Register use as seen below (the argument meanings presumably follow
 * OpenSSL's bn_mul_mont_gather5(rp, ap, table, np, n0, num, power) prototype
 * -- confirm against x86_64-mont5.pl):
 *   %rdi     result words are stored here
 *   %rsi     words multiplied by the gathered word in %rbx
 *   %rdx     table base
 *   %rcx     words multiplied by the per-pass factor in %rbp, and subtracted
 *            from the temp words at the end
 *   %r8      pointer to one word (dereferenced once)
 *   %r9d     word count
 *   8(%rsp)  on entry: 32-bit value compared against the counters to select
 *            the table entry
 * The path in this block returns 1 in %rax.  When the count is a multiple of 8
 * control leaves for .Lmul4x_enter instead.
 * The isolated ".byte 0x67" lines emit an address-size prefix byte ahead of the
 * following instruction; presumably instruction padding -- confirm.
 */
7.globl	bn_mul_mont_gather5
8.type	bn_mul_mont_gather5,@function
9.align	64
10bn_mul_mont_gather5:
11.cfi_startproc
/* zero-extend the 32-bit count; keep the entry %rsp in %rax */
12	movl	%r9d,%r9d
13	movq	%rsp,%rax
14.cfi_def_cfa_register	%rax
/* count not a multiple of 8: stay here; otherwise load the dword at
   OPENSSL_ia32cap_P+8 into %r11d and continue at .Lmul4x_enter */
15	testl	$7,%r9d
16	jnz	.Lmul_enter
17	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
18	jmp	.Lmul4x_enter
19
20.align	16
21.Lmul_enter:
/* low 32 bits of the first stack argument -> %xmm5, then save callee-saved regs */
22	movd	8(%rsp),%xmm5
23	pushq	%rbx
24.cfi_offset	%rbx,-16
25	pushq	%rbp
26.cfi_offset	%rbp,-24
27	pushq	%r12
28.cfi_offset	%r12,-32
29	pushq	%r13
30.cfi_offset	%r13,-40
31	pushq	%r14
32.cfi_offset	%r14,-48
33	pushq	%r15
34.cfi_offset	%r15,-56
35
/* %r10 = %rsp - 280 - 8*count, rounded down to a 1024-byte boundary */
36	negq	%r9
37	movq	%rsp,%r11
38	leaq	-280(%rsp,%r9,8),%r10
39	negq	%r9
40	andq	$-1024,%r10
41
42
43
44
45
46
47
48
49
/* lower %rsp to %r10 in 4096-byte steps, reading one word at each step */
50	subq	%r10,%r11
51	andq	$-4096,%r11
52	leaq	(%r10,%r11,1),%rsp
53	movq	(%rsp),%r11
54	cmpq	%r10,%rsp
55	ja	.Lmul_page_walk
56	jmp	.Lmul_page_walk_done
57
58.Lmul_page_walk:
59	leaq	-4096(%rsp),%rsp
60	movq	(%rsp),%r11
61	cmpq	%r10,%rsp
62	ja	.Lmul_page_walk
63.Lmul_page_walk_done:
64
/* %r10 = address of .Linc (defined outside this block); the entry %rsp is
   saved at 8(%rsp,%r9,8) and reloaded from there in the epilogue */
65	leaq	.Linc(%rip),%r10
66	movq	%rax,8(%rsp,%r9,8)
67.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
68.Lmul_body:
69
/* %r12 = table base + 128; %xmm0/%xmm1 = the two 16-byte constants at .Linc;
   %r10 = 16-byte aligned base of the mask area */
70	leaq	128(%rdx),%r12
71	movdqa	0(%r10),%xmm0
72	movdqa	16(%r10),%xmm1
73	leaq	24-112(%rsp,%r9,8),%r10
74	andq	$-16,%r10
75
/* Build sixteen 16-byte masks at 112(%r10)..352(%r10).  %xmm5 is broadcast to
   all four dwords; each mask is pcmpeqd of it against a counter vector, and
   each successive counter is the previous one plus the second .Linc constant
   (kept in %xmm4). */
76	pshufd	$0,%xmm5,%xmm5
77	movdqa	%xmm1,%xmm4
78	movdqa	%xmm1,%xmm2
79	paddd	%xmm0,%xmm1
80	pcmpeqd	%xmm5,%xmm0
81.byte	0x67
82	movdqa	%xmm4,%xmm3
83	paddd	%xmm1,%xmm2
84	pcmpeqd	%xmm5,%xmm1
85	movdqa	%xmm0,112(%r10)
86	movdqa	%xmm4,%xmm0
87
88	paddd	%xmm2,%xmm3
89	pcmpeqd	%xmm5,%xmm2
90	movdqa	%xmm1,128(%r10)
91	movdqa	%xmm4,%xmm1
92
93	paddd	%xmm3,%xmm0
94	pcmpeqd	%xmm5,%xmm3
95	movdqa	%xmm2,144(%r10)
96	movdqa	%xmm4,%xmm2
97
98	paddd	%xmm0,%xmm1
99	pcmpeqd	%xmm5,%xmm0
100	movdqa	%xmm3,160(%r10)
101	movdqa	%xmm4,%xmm3
102	paddd	%xmm1,%xmm2
103	pcmpeqd	%xmm5,%xmm1
104	movdqa	%xmm0,176(%r10)
105	movdqa	%xmm4,%xmm0
106
107	paddd	%xmm2,%xmm3
108	pcmpeqd	%xmm5,%xmm2
109	movdqa	%xmm1,192(%r10)
110	movdqa	%xmm4,%xmm1
111
112	paddd	%xmm3,%xmm0
113	pcmpeqd	%xmm5,%xmm3
114	movdqa	%xmm2,208(%r10)
115	movdqa	%xmm4,%xmm2
116
117	paddd	%xmm0,%xmm1
118	pcmpeqd	%xmm5,%xmm0
119	movdqa	%xmm3,224(%r10)
120	movdqa	%xmm4,%xmm3
121	paddd	%xmm1,%xmm2
122	pcmpeqd	%xmm5,%xmm1
123	movdqa	%xmm0,240(%r10)
124	movdqa	%xmm4,%xmm0
125
126	paddd	%xmm2,%xmm3
127	pcmpeqd	%xmm5,%xmm2
128	movdqa	%xmm1,256(%r10)
129	movdqa	%xmm4,%xmm1
130
131	paddd	%xmm3,%xmm0
132	pcmpeqd	%xmm5,%xmm3
133	movdqa	%xmm2,272(%r10)
134	movdqa	%xmm4,%xmm2
135
136	paddd	%xmm0,%xmm1
137	pcmpeqd	%xmm5,%xmm0
138	movdqa	%xmm3,288(%r10)
139	movdqa	%xmm4,%xmm3
140	paddd	%xmm1,%xmm2
141	pcmpeqd	%xmm5,%xmm1
142	movdqa	%xmm0,304(%r10)
143
144	paddd	%xmm2,%xmm3
145.byte	0x67
146	pcmpeqd	%xmm5,%xmm2
147	movdqa	%xmm1,320(%r10)
148
149	pcmpeqd	%xmm5,%xmm3
150	movdqa	%xmm2,336(%r10)
/* Gather from the 256-byte row at -128(%r12)..112(%r12): every 16-byte chunk
   is ANDed with its mask (the last four masks are still in %xmm0..%xmm3) and
   all results are ORed together; every chunk of the row is read regardless of
   the index. */
151	pand	64(%r12),%xmm0
152
153	pand	80(%r12),%xmm1
154	pand	96(%r12),%xmm2
155	movdqa	%xmm3,352(%r10)
156	pand	112(%r12),%xmm3
157	por	%xmm2,%xmm0
158	por	%xmm3,%xmm1
159	movdqa	-128(%r12),%xmm4
160	movdqa	-112(%r12),%xmm5
161	movdqa	-96(%r12),%xmm2
162	pand	112(%r10),%xmm4
163	movdqa	-80(%r12),%xmm3
164	pand	128(%r10),%xmm5
165	por	%xmm4,%xmm0
166	pand	144(%r10),%xmm2
167	por	%xmm5,%xmm1
168	pand	160(%r10),%xmm3
169	por	%xmm2,%xmm0
170	por	%xmm3,%xmm1
171	movdqa	-64(%r12),%xmm4
172	movdqa	-48(%r12),%xmm5
173	movdqa	-32(%r12),%xmm2
174	pand	176(%r10),%xmm4
175	movdqa	-16(%r12),%xmm3
176	pand	192(%r10),%xmm5
177	por	%xmm4,%xmm0
178	pand	208(%r10),%xmm2
179	por	%xmm5,%xmm1
180	pand	224(%r10),%xmm3
181	por	%xmm2,%xmm0
182	por	%xmm3,%xmm1
183	movdqa	0(%r12),%xmm4
184	movdqa	16(%r12),%xmm5
185	movdqa	32(%r12),%xmm2
186	pand	240(%r10),%xmm4
187	movdqa	48(%r12),%xmm3
188	pand	256(%r10),%xmm5
189	por	%xmm4,%xmm0
190	pand	272(%r10),%xmm2
191	por	%xmm5,%xmm1
192	pand	288(%r10),%xmm3
193	por	%xmm2,%xmm0
194	por	%xmm3,%xmm1
/* fold the two accumulators and the two 64-bit halves, step %r12 to the next
   256-byte row; the .byte sequence encodes movq %xmm0,%rbx */
195	por	%xmm1,%xmm0
196	pshufd	$0x4e,%xmm0,%xmm1
197	por	%xmm1,%xmm0
198	leaq	256(%r12),%r12
199.byte	102,72,15,126,195
200
/* %r8 = the word %r8 pointed to; %r14 = row counter, %r15 = word index */
201	movq	(%r8),%r8
202	movq	(%rsi),%rax
203
204	xorq	%r14,%r14
205	xorq	%r15,%r15
206
/* First pass.  %rbp = %r8 * low word of ((%rsi)[0] * %rbx); the loop then
   accumulates (%rsi)[i]*%rbx + (%rcx)[i]*%rbp and stores each sum one word
   lower in the temp area at (%rsp).  Only the carry of the lowest sum is kept. */
207	movq	%r8,%rbp
208	mulq	%rbx
209	movq	%rax,%r10
210	movq	(%rcx),%rax
211
212	imulq	%r10,%rbp
213	movq	%rdx,%r11
214
215	mulq	%rbp
216	addq	%rax,%r10
217	movq	8(%rsi),%rax
218	adcq	$0,%rdx
219	movq	%rdx,%r13
220
221	leaq	1(%r15),%r15
222	jmp	.L1st_enter
223
224.align	16
225.L1st:
226	addq	%rax,%r13
227	movq	(%rsi,%r15,8),%rax
228	adcq	$0,%rdx
229	addq	%r11,%r13
230	movq	%r10,%r11
231	adcq	$0,%rdx
232	movq	%r13,-16(%rsp,%r15,8)
233	movq	%rdx,%r13
234
235.L1st_enter:
236	mulq	%rbx
237	addq	%rax,%r11
238	movq	(%rcx,%r15,8),%rax
239	adcq	$0,%rdx
240	leaq	1(%r15),%r15
241	movq	%rdx,%r10
242
243	mulq	%rbp
244	cmpq	%r9,%r15
245	jne	.L1st
246
247
/* finish the pass: last two sums and the carry-out word go to the top of the
   temp area */
248	addq	%rax,%r13
249	adcq	$0,%rdx
250	addq	%r11,%r13
251	adcq	$0,%rdx
252	movq	%r13,-16(%rsp,%r9,8)
253	movq	%rdx,%r13
254	movq	%r10,%r11
255
256	xorq	%rdx,%rdx
257	addq	%r11,%r13
258	adcq	$0,%rdx
259	movq	%r13,-8(%rsp,%r9,8)
260	movq	%rdx,(%rsp,%r9,8)
261
262	leaq	1(%r14),%r14
263	jmp	.Louter
264.align	16
/* Outer loop, repeated while %r14 is below the word count.  %rdx = mask area
   + 128, so the masks are addressed at -128(%rdx)..112(%rdx); the next table
   row is gathered into %xmm0 the same way as above. */
265.Louter:
266	leaq	24+128(%rsp,%r9,8),%rdx
267	andq	$-16,%rdx
268	pxor	%xmm4,%xmm4
269	pxor	%xmm5,%xmm5
270	movdqa	-128(%r12),%xmm0
271	movdqa	-112(%r12),%xmm1
272	movdqa	-96(%r12),%xmm2
273	movdqa	-80(%r12),%xmm3
274	pand	-128(%rdx),%xmm0
275	pand	-112(%rdx),%xmm1
276	por	%xmm0,%xmm4
277	pand	-96(%rdx),%xmm2
278	por	%xmm1,%xmm5
279	pand	-80(%rdx),%xmm3
280	por	%xmm2,%xmm4
281	por	%xmm3,%xmm5
282	movdqa	-64(%r12),%xmm0
283	movdqa	-48(%r12),%xmm1
284	movdqa	-32(%r12),%xmm2
285	movdqa	-16(%r12),%xmm3
286	pand	-64(%rdx),%xmm0
287	pand	-48(%rdx),%xmm1
288	por	%xmm0,%xmm4
289	pand	-32(%rdx),%xmm2
290	por	%xmm1,%xmm5
291	pand	-16(%rdx),%xmm3
292	por	%xmm2,%xmm4
293	por	%xmm3,%xmm5
294	movdqa	0(%r12),%xmm0
295	movdqa	16(%r12),%xmm1
296	movdqa	32(%r12),%xmm2
297	movdqa	48(%r12),%xmm3
298	pand	0(%rdx),%xmm0
299	pand	16(%rdx),%xmm1
300	por	%xmm0,%xmm4
301	pand	32(%rdx),%xmm2
302	por	%xmm1,%xmm5
303	pand	48(%rdx),%xmm3
304	por	%xmm2,%xmm4
305	por	%xmm3,%xmm5
306	movdqa	64(%r12),%xmm0
307	movdqa	80(%r12),%xmm1
308	movdqa	96(%r12),%xmm2
309	movdqa	112(%r12),%xmm3
310	pand	64(%rdx),%xmm0
311	pand	80(%rdx),%xmm1
312	por	%xmm0,%xmm4
313	pand	96(%rdx),%xmm2
314	por	%xmm1,%xmm5
315	pand	112(%rdx),%xmm3
316	por	%xmm2,%xmm4
317	por	%xmm3,%xmm5
318	por	%xmm5,%xmm4
319	pshufd	$0x4e,%xmm4,%xmm0
320	por	%xmm4,%xmm0
321	leaq	256(%r12),%r12
322
/* the .byte sequence encodes movq %xmm0,%rbx (the gathered word) */
323	movq	(%rsi),%rax
324.byte	102,72,15,126,195
325
/* same shape as the first pass, but each product is also added to the temp
   word already stored at (%rsp) */
326	xorq	%r15,%r15
327	movq	%r8,%rbp
328	movq	(%rsp),%r10
329
330	mulq	%rbx
331	addq	%rax,%r10
332	movq	(%rcx),%rax
333	adcq	$0,%rdx
334
335	imulq	%r10,%rbp
336	movq	%rdx,%r11
337
338	mulq	%rbp
339	addq	%rax,%r10
340	movq	8(%rsi),%rax
341	adcq	$0,%rdx
342	movq	8(%rsp),%r10
343	movq	%rdx,%r13
344
345	leaq	1(%r15),%r15
346	jmp	.Linner_enter
347
348.align	16
349.Linner:
350	addq	%rax,%r13
351	movq	(%rsi,%r15,8),%rax
352	adcq	$0,%rdx
353	addq	%r10,%r13
354	movq	(%rsp,%r15,8),%r10
355	adcq	$0,%rdx
356	movq	%r13,-16(%rsp,%r15,8)
357	movq	%rdx,%r13
358
359.Linner_enter:
360	mulq	%rbx
361	addq	%rax,%r11
362	movq	(%rcx,%r15,8),%rax
363	adcq	$0,%rdx
364	addq	%r11,%r10
365	movq	%rdx,%r11
366	adcq	$0,%r11
367	leaq	1(%r15),%r15
368
369	mulq	%rbp
370	cmpq	%r9,%r15
371	jne	.Linner
372
373	addq	%rax,%r13
374	adcq	$0,%rdx
375	addq	%r10,%r13
376	movq	(%rsp,%r9,8),%r10
377	adcq	$0,%rdx
378	movq	%r13,-16(%rsp,%r9,8)
379	movq	%rdx,%r13
380
381	xorq	%rdx,%rdx
382	addq	%r11,%r13
383	adcq	$0,%rdx
384	addq	%r10,%r13
385	adcq	$0,%rdx
386	movq	%r13,-8(%rsp,%r9,8)
387	movq	%rdx,(%rsp,%r9,8)
388
389	leaq	1(%r14),%r14
390	cmpq	%r9,%r14
391	jb	.Louter
392
/* Subtract the (%rcx) words from the temp words and store the difference at
   (%rdi).  The xorq clears CF for the first sbbq; leaq and decq leave CF
   untouched, so the borrow is carried from one iteration to the next. */
393	xorq	%r14,%r14
394	movq	(%rsp),%rax
395	leaq	(%rsp),%rsi
396	movq	%r9,%r15
397	jmp	.Lsub
398.align	16
399.Lsub:	sbbq	(%rcx,%r14,8),%rax
400	movq	%rax,(%rdi,%r14,8)
401	movq	8(%rsi,%r14,8),%rax
402	leaq	1(%r14),%r14
403	decq	%r15
404	jnz	.Lsub
405
/* %rax = top temp word minus the final borrow, used as a select mask
   (expected to be 0 or all ones -- confirm); %rbx = its complement */
406	sbbq	$0,%rax
407	movq	$-1,%rbx
408	xorq	%rax,%rbx
409	xorq	%r14,%r14
410	movq	%r9,%r15
411
/* branch-free select per word:
   (%rdi)[i] = ((%rdi)[i] & %rbx) | (temp[i] & %rax);
   the temp word is overwritten with the loop index as it is consumed */
412.Lcopy:
413	movq	(%rdi,%r14,8),%rcx
414	movq	(%rsp,%r14,8),%rdx
415	andq	%rbx,%rcx
416	andq	%rax,%rdx
417	movq	%r14,(%rsp,%r14,8)
418	orq	%rcx,%rdx
419	movq	%rdx,(%rdi,%r14,8)
420	leaq	1(%r14),%r14
421	subq	$1,%r15
422	jnz	.Lcopy
423
/* reload the saved entry %rsp, set the return value to 1, and restore the
   callee-saved registers from the slots just below the entry %rsp */
424	movq	8(%rsp,%r9,8),%rsi
425.cfi_def_cfa	%rsi,8
426	movq	$1,%rax
427
428	movq	-48(%rsi),%r15
429.cfi_restore	%r15
430	movq	-40(%rsi),%r14
431.cfi_restore	%r14
432	movq	-32(%rsi),%r13
433.cfi_restore	%r13
434	movq	-24(%rsi),%r12
435.cfi_restore	%r12
436	movq	-16(%rsi),%rbp
437.cfi_restore	%rbp
438	movq	-8(%rsi),%rbx
439.cfi_restore	%rbx
440	leaq	(%rsi),%rsp
441.cfi_def_cfa_register	%rsp
442.Lmul_epilogue:
/* bytes f3 c3: "rep ret" */
443	.byte	0xf3,0xc3
444.cfi_endproc
445.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5
 *
 * Frame set-up and tear-down around a single call to mul4x_internal.
 * No .globl directive for this symbol appears in this block.
 * bn_mul_mont_gather5 jumps to .Lmul4x_enter with %rax = entry %rsp and
 * %r11d = the dword at OPENSSL_ia32cap_P+8; the top-of-function path sets
 * %rax itself but does not load %r11d.
 * Returns 1 in %rax on the path that stays in this block.
 */
446.type	bn_mul4x_mont_gather5,@function
447.align	32
448bn_mul4x_mont_gather5:
449.cfi_startproc
450.byte	0x67
451	movq	%rsp,%rax
452.cfi_def_cfa_register	%rax
453.Lmul4x_enter:
/* if bits 3, 8 and 19 (mask 0x80108) are all set in %r11d, continue at
   .Lmulx4x_enter, which is outside this block.  NOTE(review): per the
   OPENSSL_ia32cap documentation these bits should be BMI1, BMI2 and ADX --
   confirm. */
454	andl	$0x80108,%r11d
455	cmpl	$0x80108,%r11d
456	je	.Lmulx4x_enter
457	pushq	%rbx
458.cfi_offset	%rbx,-16
459	pushq	%rbp
460.cfi_offset	%rbp,-24
461	pushq	%r12
462.cfi_offset	%r12,-32
463	pushq	%r13
464.cfi_offset	%r13,-40
465	pushq	%r14
466.cfi_offset	%r14,-48
467	pushq	%r15
468.cfi_offset	%r15,-56
469.Lmul4x_prologue:
470
/* %r9 = word count * 8 (byte length), %r10 = 3 * byte length, then %r9 is
   negated for the address arithmetic below */
471.byte	0x67
472	shll	$3,%r9d
473	leaq	(%r9,%r9,2),%r10
474	negq	%r9
475
476
477
478
479
480
481
482
483
484
/* Choose the frame address in %rbp: at least 320 + 2*length bytes below %rsp,
   lowered further by an amount derived from the 4096-byte page offset of that
   address relative to %rdi (presumably to keep the frame and the %rdi buffer
   from aliasing within a page -- confirm against x86_64-mont5.pl). */
485	leaq	-320(%rsp,%r9,2),%r11
486	movq	%rsp,%rbp
487	subq	%rdi,%r11
488	andq	$4095,%r11
489	cmpq	%r11,%r10
490	jb	.Lmul4xsp_alt
491	subq	%r11,%rbp
492	leaq	-320(%rbp,%r9,2),%rbp
493	jmp	.Lmul4xsp_done
494
495.align	32
496.Lmul4xsp_alt:
497	leaq	4096-320(,%r9,2),%r10
498	leaq	-320(%rbp,%r9,2),%rbp
499	subq	%r10,%r11
500	movq	$0,%r10
501	cmovcq	%r10,%r11
502	subq	%r11,%rbp
503.Lmul4xsp_done:
/* align the frame to 64 bytes, then lower %rsp to it in 4096-byte steps,
   reading one word at each step */
504	andq	$-64,%rbp
505	movq	%rsp,%r11
506	subq	%rbp,%r11
507	andq	$-4096,%r11
508	leaq	(%r11,%rbp,1),%rsp
509	movq	(%rsp),%r10
510	cmpq	%rbp,%rsp
511	ja	.Lmul4x_page_walk
512	jmp	.Lmul4x_page_walk_done
513
514.Lmul4x_page_walk:
515	leaq	-4096(%rsp),%rsp
516	movq	(%rsp),%r10
517	cmpq	%rbp,%rsp
518	ja	.Lmul4x_page_walk
519.Lmul4x_page_walk_done:
520
/* %r9 back to the positive byte length; entry %rsp saved at 40(%rsp) */
521	negq	%r9
522
523	movq	%rax,40(%rsp)
524.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
525.Lmul4x_body:
526
527	call	mul4x_internal
528
/* reload the saved entry %rsp, return 1, restore callee-saved registers */
529	movq	40(%rsp),%rsi
530.cfi_def_cfa	%rsi,8
531	movq	$1,%rax
532
533	movq	-48(%rsi),%r15
534.cfi_restore	%r15
535	movq	-40(%rsi),%r14
536.cfi_restore	%r14
537	movq	-32(%rsi),%r13
538.cfi_restore	%r13
539	movq	-24(%rsi),%r12
540.cfi_restore	%r12
541	movq	-16(%rsi),%rbp
542.cfi_restore	%rbp
543	movq	-8(%rsi),%rbx
544.cfi_restore	%rbx
545	leaq	(%rsi),%rsp
546.cfi_def_cfa_register	%rsp
547.Lmul4x_epilogue:
/* bytes f3 c3: "rep ret" */
548	.byte	0xf3,0xc3
549.cfi_endproc
550.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
551
/*
 * mul4x_internal
 *
 * Shared body reached by `call` from bn_mul4x_mont_gather5 and bn_power5.
 * State expected on entry, as set up by those callers:
 *   %rax  the caller's entry %rsp, so 8(%rax) is the first stack argument
 *         (the 32-bit value that selects the table entry)
 *   %rdx  table base
 *   %rsi  words multiplied by the gathered word in %rbx
 *   %rcx  words multiplied by the per-pass factor in %rbp
 *   %rdi  saved at 56+8(%rsp) and reloaded before the final jump
 *   %r8   pointer to one word (dereferenced below)
 *   %r9   byte length (word count * 8; set that way by bn_mul4x_mont_gather5)
 * Frame offsets are written as N+8; presumably the +8 accounts for the return
 * address pushed by the call -- confirm against x86_64-mont5.pl.
 * This block has no return instruction: it ends with a jump to
 * .Lsqr4x_sub_entry, which is outside this block.
 */
552.type	mul4x_internal,@function
553.align	32
554mul4x_internal:
555.cfi_startproc
/* %r13 = table base + 128 + 32*length: the end pointer that %r12 is compared
   against at the bottom of .Louter4x.  %r10 = base of the mask area,
   %r12 = table base + 128. */
556	shlq	$5,%r9
557	movd	8(%rax),%xmm5
558	leaq	.Linc(%rip),%rax
559	leaq	128(%rdx,%r9,1),%r13
560	shrq	$5,%r9
561	movdqa	0(%rax),%xmm0
562	movdqa	16(%rax),%xmm1
563	leaq	88-112(%rsp,%r9,1),%r10
564	leaq	128(%rdx),%r12
565
/* Build sixteen 16-byte masks at 112(%r10)..352(%r10).  %xmm5 is broadcast to
   all four dwords; each mask is pcmpeqd of it against a counter vector, and
   each successive counter is the previous one plus the second .Linc constant
   (kept in %xmm4). */
566	pshufd	$0,%xmm5,%xmm5
567	movdqa	%xmm1,%xmm4
568.byte	0x67,0x67
569	movdqa	%xmm1,%xmm2
570	paddd	%xmm0,%xmm1
571	pcmpeqd	%xmm5,%xmm0
572.byte	0x67
573	movdqa	%xmm4,%xmm3
574	paddd	%xmm1,%xmm2
575	pcmpeqd	%xmm5,%xmm1
576	movdqa	%xmm0,112(%r10)
577	movdqa	%xmm4,%xmm0
578
579	paddd	%xmm2,%xmm3
580	pcmpeqd	%xmm5,%xmm2
581	movdqa	%xmm1,128(%r10)
582	movdqa	%xmm4,%xmm1
583
584	paddd	%xmm3,%xmm0
585	pcmpeqd	%xmm5,%xmm3
586	movdqa	%xmm2,144(%r10)
587	movdqa	%xmm4,%xmm2
588
589	paddd	%xmm0,%xmm1
590	pcmpeqd	%xmm5,%xmm0
591	movdqa	%xmm3,160(%r10)
592	movdqa	%xmm4,%xmm3
593	paddd	%xmm1,%xmm2
594	pcmpeqd	%xmm5,%xmm1
595	movdqa	%xmm0,176(%r10)
596	movdqa	%xmm4,%xmm0
597
598	paddd	%xmm2,%xmm3
599	pcmpeqd	%xmm5,%xmm2
600	movdqa	%xmm1,192(%r10)
601	movdqa	%xmm4,%xmm1
602
603	paddd	%xmm3,%xmm0
604	pcmpeqd	%xmm5,%xmm3
605	movdqa	%xmm2,208(%r10)
606	movdqa	%xmm4,%xmm2
607
608	paddd	%xmm0,%xmm1
609	pcmpeqd	%xmm5,%xmm0
610	movdqa	%xmm3,224(%r10)
611	movdqa	%xmm4,%xmm3
612	paddd	%xmm1,%xmm2
613	pcmpeqd	%xmm5,%xmm1
614	movdqa	%xmm0,240(%r10)
615	movdqa	%xmm4,%xmm0
616
617	paddd	%xmm2,%xmm3
618	pcmpeqd	%xmm5,%xmm2
619	movdqa	%xmm1,256(%r10)
620	movdqa	%xmm4,%xmm1
621
622	paddd	%xmm3,%xmm0
623	pcmpeqd	%xmm5,%xmm3
624	movdqa	%xmm2,272(%r10)
625	movdqa	%xmm4,%xmm2
626
627	paddd	%xmm0,%xmm1
628	pcmpeqd	%xmm5,%xmm0
629	movdqa	%xmm3,288(%r10)
630	movdqa	%xmm4,%xmm3
631	paddd	%xmm1,%xmm2
632	pcmpeqd	%xmm5,%xmm1
633	movdqa	%xmm0,304(%r10)
634
635	paddd	%xmm2,%xmm3
636.byte	0x67
637	pcmpeqd	%xmm5,%xmm2
638	movdqa	%xmm1,320(%r10)
639
640	pcmpeqd	%xmm5,%xmm3
641	movdqa	%xmm2,336(%r10)
/* Gather from the 256-byte row at -128(%r12)..112(%r12): every 16-byte chunk
   is ANDed with its mask (the last four masks are still in %xmm0..%xmm3) and
   all results are ORed together; every chunk of the row is read regardless of
   the index. */
642	pand	64(%r12),%xmm0
643
644	pand	80(%r12),%xmm1
645	pand	96(%r12),%xmm2
646	movdqa	%xmm3,352(%r10)
647	pand	112(%r12),%xmm3
648	por	%xmm2,%xmm0
649	por	%xmm3,%xmm1
650	movdqa	-128(%r12),%xmm4
651	movdqa	-112(%r12),%xmm5
652	movdqa	-96(%r12),%xmm2
653	pand	112(%r10),%xmm4
654	movdqa	-80(%r12),%xmm3
655	pand	128(%r10),%xmm5
656	por	%xmm4,%xmm0
657	pand	144(%r10),%xmm2
658	por	%xmm5,%xmm1
659	pand	160(%r10),%xmm3
660	por	%xmm2,%xmm0
661	por	%xmm3,%xmm1
662	movdqa	-64(%r12),%xmm4
663	movdqa	-48(%r12),%xmm5
664	movdqa	-32(%r12),%xmm2
665	pand	176(%r10),%xmm4
666	movdqa	-16(%r12),%xmm3
667	pand	192(%r10),%xmm5
668	por	%xmm4,%xmm0
669	pand	208(%r10),%xmm2
670	por	%xmm5,%xmm1
671	pand	224(%r10),%xmm3
672	por	%xmm2,%xmm0
673	por	%xmm3,%xmm1
674	movdqa	0(%r12),%xmm4
675	movdqa	16(%r12),%xmm5
676	movdqa	32(%r12),%xmm2
677	pand	240(%r10),%xmm4
678	movdqa	48(%r12),%xmm3
679	pand	256(%r10),%xmm5
680	por	%xmm4,%xmm0
681	pand	272(%r10),%xmm2
682	por	%xmm5,%xmm1
683	pand	288(%r10),%xmm3
684	por	%xmm2,%xmm0
685	por	%xmm3,%xmm1
/* fold the accumulators and the two 64-bit halves, step %r12 to the next row;
   the .byte sequence encodes movq %xmm0,%rbx */
686	por	%xmm1,%xmm0
687	pshufd	$0x4e,%xmm0,%xmm1
688	por	%xmm1,%xmm0
689	leaq	256(%r12),%r12
690.byte	102,72,15,126,195
691
/* spill the table end pointer and %rdi into the frame */
692	movq	%r13,16+8(%rsp)
693	movq	%rdi,56+8(%rsp)
694
/* %r8 = the word %r8 pointed to.  %rsi is moved to the end of its array and
   %r9 negated, so the array is indexed with negative byte offsets that count
   up towards zero. */
695	movq	(%r8),%r8
696	movq	(%rsi),%rax
697	leaq	(%rsi,%r9,1),%rsi
698	negq	%r9
699
/* First pass.  %rbp = %r8 * low word of (first %rsi word * %rbx); %r14 walks
   the temp area starting at 64+8(%rsp).  Only the carry of the lowest sum is
   kept. */
700	movq	%r8,%rbp
701	mulq	%rbx
702	movq	%rax,%r10
703	movq	(%rcx),%rax
704
705	imulq	%r10,%rbp
706	leaq	64+8(%rsp),%r14
707	movq	%rdx,%r11
708
709	mulq	%rbp
710	addq	%rax,%r10
711	movq	8(%rsi,%r9,1),%rax
712	adcq	$0,%rdx
713	movq	%rdx,%rdi
714
715	mulq	%rbx
716	addq	%rax,%r11
717	movq	8(%rcx),%rax
718	adcq	$0,%rdx
719	movq	%rdx,%r10
720
721	mulq	%rbp
722	addq	%rax,%rdi
723	movq	16(%rsi,%r9,1),%rax
724	adcq	$0,%rdx
725	addq	%r11,%rdi
726	leaq	32(%r9),%r15
727	leaq	32(%rcx),%rcx
728	adcq	$0,%rdx
729	movq	%rdi,(%r14)
730	movq	%rdx,%r13
731	jmp	.L1st4x
732
733.align	32
/* four words per iteration; %r15 is a negative byte offset stepped by 32
   until it reaches zero, %rcx and %r14 advance by 32 bytes each time */
734.L1st4x:
735	mulq	%rbx
736	addq	%rax,%r10
737	movq	-16(%rcx),%rax
738	leaq	32(%r14),%r14
739	adcq	$0,%rdx
740	movq	%rdx,%r11
741
742	mulq	%rbp
743	addq	%rax,%r13
744	movq	-8(%rsi,%r15,1),%rax
745	adcq	$0,%rdx
746	addq	%r10,%r13
747	adcq	$0,%rdx
748	movq	%r13,-24(%r14)
749	movq	%rdx,%rdi
750
751	mulq	%rbx
752	addq	%rax,%r11
753	movq	-8(%rcx),%rax
754	adcq	$0,%rdx
755	movq	%rdx,%r10
756
757	mulq	%rbp
758	addq	%rax,%rdi
759	movq	(%rsi,%r15,1),%rax
760	adcq	$0,%rdx
761	addq	%r11,%rdi
762	adcq	$0,%rdx
763	movq	%rdi,-16(%r14)
764	movq	%rdx,%r13
765
766	mulq	%rbx
767	addq	%rax,%r10
768	movq	0(%rcx),%rax
769	adcq	$0,%rdx
770	movq	%rdx,%r11
771
772	mulq	%rbp
773	addq	%rax,%r13
774	movq	8(%rsi,%r15,1),%rax
775	adcq	$0,%rdx
776	addq	%r10,%r13
777	adcq	$0,%rdx
778	movq	%r13,-8(%r14)
779	movq	%rdx,%rdi
780
781	mulq	%rbx
782	addq	%rax,%r11
783	movq	8(%rcx),%rax
784	adcq	$0,%rdx
785	movq	%rdx,%r10
786
787	mulq	%rbp
788	addq	%rax,%rdi
789	movq	16(%rsi,%r15,1),%rax
790	adcq	$0,%rdx
791	addq	%r11,%rdi
792	leaq	32(%rcx),%rcx
793	adcq	$0,%rdx
794	movq	%rdi,(%r14)
795	movq	%rdx,%r13
796
797	addq	$32,%r15
798	jnz	.L1st4x
799
/* tail of the first pass: the last two words, then reload %rax with the first
   %rsi word for the next pass */
800	mulq	%rbx
801	addq	%rax,%r10
802	movq	-16(%rcx),%rax
803	leaq	32(%r14),%r14
804	adcq	$0,%rdx
805	movq	%rdx,%r11
806
807	mulq	%rbp
808	addq	%rax,%r13
809	movq	-8(%rsi),%rax
810	adcq	$0,%rdx
811	addq	%r10,%r13
812	adcq	$0,%rdx
813	movq	%r13,-24(%r14)
814	movq	%rdx,%rdi
815
816	mulq	%rbx
817	addq	%rax,%r11
818	movq	-8(%rcx),%rax
819	adcq	$0,%rdx
820	movq	%rdx,%r10
821
822	mulq	%rbp
823	addq	%rax,%rdi
824	movq	(%rsi,%r9,1),%rax
825	adcq	$0,%rdx
826	addq	%r11,%rdi
827	adcq	$0,%rdx
828	movq	%rdi,-16(%r14)
829	movq	%rdx,%r13
830
/* rewind %rcx to the start of its array (%r9 holds the negative length);
   %rdi receives the carry out of the top word */
831	leaq	(%rcx,%r9,1),%rcx
832
833	xorq	%rdi,%rdi
834	addq	%r10,%r13
835	adcq	$0,%rdi
836	movq	%r13,-8(%r14)
837
838	jmp	.Louter4x
839
840.align	32
/* Outer loop, repeated while %r12 is below the saved table end pointer.
   %rdx = mask area + 128, so the masks are addressed at -128(%rdx)..112(%rdx);
   the next table row is gathered into %xmm0 as before. */
841.Louter4x:
842	leaq	16+128(%r14),%rdx
843	pxor	%xmm4,%xmm4
844	pxor	%xmm5,%xmm5
845	movdqa	-128(%r12),%xmm0
846	movdqa	-112(%r12),%xmm1
847	movdqa	-96(%r12),%xmm2
848	movdqa	-80(%r12),%xmm3
849	pand	-128(%rdx),%xmm0
850	pand	-112(%rdx),%xmm1
851	por	%xmm0,%xmm4
852	pand	-96(%rdx),%xmm2
853	por	%xmm1,%xmm5
854	pand	-80(%rdx),%xmm3
855	por	%xmm2,%xmm4
856	por	%xmm3,%xmm5
857	movdqa	-64(%r12),%xmm0
858	movdqa	-48(%r12),%xmm1
859	movdqa	-32(%r12),%xmm2
860	movdqa	-16(%r12),%xmm3
861	pand	-64(%rdx),%xmm0
862	pand	-48(%rdx),%xmm1
863	por	%xmm0,%xmm4
864	pand	-32(%rdx),%xmm2
865	por	%xmm1,%xmm5
866	pand	-16(%rdx),%xmm3
867	por	%xmm2,%xmm4
868	por	%xmm3,%xmm5
869	movdqa	0(%r12),%xmm0
870	movdqa	16(%r12),%xmm1
871	movdqa	32(%r12),%xmm2
872	movdqa	48(%r12),%xmm3
873	pand	0(%rdx),%xmm0
874	pand	16(%rdx),%xmm1
875	por	%xmm0,%xmm4
876	pand	32(%rdx),%xmm2
877	por	%xmm1,%xmm5
878	pand	48(%rdx),%xmm3
879	por	%xmm2,%xmm4
880	por	%xmm3,%xmm5
881	movdqa	64(%r12),%xmm0
882	movdqa	80(%r12),%xmm1
883	movdqa	96(%r12),%xmm2
884	movdqa	112(%r12),%xmm3
885	pand	64(%rdx),%xmm0
886	pand	80(%rdx),%xmm1
887	por	%xmm0,%xmm4
888	pand	96(%rdx),%xmm2
889	por	%xmm1,%xmm5
890	pand	112(%rdx),%xmm3
891	por	%xmm2,%xmm4
892	por	%xmm3,%xmm5
893	por	%xmm5,%xmm4
894	pshufd	$0x4e,%xmm4,%xmm0
895	por	%xmm4,%xmm0
896	leaq	256(%r12),%r12
/* the .byte sequence encodes movq %xmm0,%rbx (the gathered word) */
897.byte	102,72,15,126,195
898
/* same shape as the first pass, but each product is also added to the temp
   word left by the previous pass; the previous pass's carry word (%rdi) is
   stored at (%r14) before %r14 is rewound to the start of the temp area */
899	movq	(%r14,%r9,1),%r10
900	movq	%r8,%rbp
901	mulq	%rbx
902	addq	%rax,%r10
903	movq	(%rcx),%rax
904	adcq	$0,%rdx
905
906	imulq	%r10,%rbp
907	movq	%rdx,%r11
908	movq	%rdi,(%r14)
909
910	leaq	(%r14,%r9,1),%r14
911
912	mulq	%rbp
913	addq	%rax,%r10
914	movq	8(%rsi,%r9,1),%rax
915	adcq	$0,%rdx
916	movq	%rdx,%rdi
917
918	mulq	%rbx
919	addq	%rax,%r11
920	movq	8(%rcx),%rax
921	adcq	$0,%rdx
922	addq	8(%r14),%r11
923	adcq	$0,%rdx
924	movq	%rdx,%r10
925
926	mulq	%rbp
927	addq	%rax,%rdi
928	movq	16(%rsi,%r9,1),%rax
929	adcq	$0,%rdx
930	addq	%r11,%rdi
931	leaq	32(%r9),%r15
932	leaq	32(%rcx),%rcx
933	adcq	$0,%rdx
934	movq	%rdx,%r13
935	jmp	.Linner4x
936
937.align	32
938.Linner4x:
939	mulq	%rbx
940	addq	%rax,%r10
941	movq	-16(%rcx),%rax
942	adcq	$0,%rdx
943	addq	16(%r14),%r10
944	leaq	32(%r14),%r14
945	adcq	$0,%rdx
946	movq	%rdx,%r11
947
948	mulq	%rbp
949	addq	%rax,%r13
950	movq	-8(%rsi,%r15,1),%rax
951	adcq	$0,%rdx
952	addq	%r10,%r13
953	adcq	$0,%rdx
954	movq	%rdi,-32(%r14)
955	movq	%rdx,%rdi
956
957	mulq	%rbx
958	addq	%rax,%r11
959	movq	-8(%rcx),%rax
960	adcq	$0,%rdx
961	addq	-8(%r14),%r11
962	adcq	$0,%rdx
963	movq	%rdx,%r10
964
965	mulq	%rbp
966	addq	%rax,%rdi
967	movq	(%rsi,%r15,1),%rax
968	adcq	$0,%rdx
969	addq	%r11,%rdi
970	adcq	$0,%rdx
971	movq	%r13,-24(%r14)
972	movq	%rdx,%r13
973
974	mulq	%rbx
975	addq	%rax,%r10
976	movq	0(%rcx),%rax
977	adcq	$0,%rdx
978	addq	(%r14),%r10
979	adcq	$0,%rdx
980	movq	%rdx,%r11
981
982	mulq	%rbp
983	addq	%rax,%r13
984	movq	8(%rsi,%r15,1),%rax
985	adcq	$0,%rdx
986	addq	%r10,%r13
987	adcq	$0,%rdx
988	movq	%rdi,-16(%r14)
989	movq	%rdx,%rdi
990
991	mulq	%rbx
992	addq	%rax,%r11
993	movq	8(%rcx),%rax
994	adcq	$0,%rdx
995	addq	8(%r14),%r11
996	adcq	$0,%rdx
997	movq	%rdx,%r10
998
999	mulq	%rbp
1000	addq	%rax,%rdi
1001	movq	16(%rsi,%r15,1),%rax
1002	adcq	$0,%rdx
1003	addq	%r11,%rdi
1004	leaq	32(%rcx),%rcx
1005	adcq	$0,%rdx
1006	movq	%r13,-8(%r14)
1007	movq	%rdx,%r13
1008
1009	addq	$32,%r15
1010	jnz	.Linner4x
1011
1012	mulq	%rbx
1013	addq	%rax,%r10
1014	movq	-16(%rcx),%rax
1015	adcq	$0,%rdx
1016	addq	16(%r14),%r10
1017	leaq	32(%r14),%r14
1018	adcq	$0,%rdx
1019	movq	%rdx,%r11
1020
1021	mulq	%rbp
1022	addq	%rax,%r13
1023	movq	-8(%rsi),%rax
1024	adcq	$0,%rdx
1025	addq	%r10,%r13
1026	adcq	$0,%rdx
1027	movq	%rdi,-32(%r14)
1028	movq	%rdx,%rdi
1029
/* for the last product the operands trade places: %rax takes the factor from
   %rbp and %rbp takes the last %rcx word, so that word is still in %rbp after
   the loop */
1030	mulq	%rbx
1031	addq	%rax,%r11
1032	movq	%rbp,%rax
1033	movq	-8(%rcx),%rbp
1034	adcq	$0,%rdx
1035	addq	-8(%r14),%r11
1036	adcq	$0,%rdx
1037	movq	%rdx,%r10
1038
1039	mulq	%rbp
1040	addq	%rax,%rdi
1041	movq	(%rsi,%r9,1),%rax
1042	adcq	$0,%rdx
1043	addq	%r11,%rdi
1044	adcq	$0,%rdx
1045	movq	%r13,-24(%r14)
1046	movq	%rdx,%r13
1047
1048	movq	%rdi,-16(%r14)
1049	leaq	(%rcx,%r9,1),%rcx
1050
1051	xorq	%rdi,%rdi
1052	addq	%r10,%r13
1053	adcq	$0,%rdi
1054	addq	(%r14),%r13
1055	adcq	$0,%rdi
1056	movq	%r13,-8(%r14)
1057
/* another row while %r12 is below the end pointer saved at 16+8(%rsp) */
1058	cmpq	16+8(%rsp),%r12
1059	jb	.Louter4x
/* Hand-off to .Lsqr4x_sub_entry (outside this block):
   %rax = -(%rdi | borrow of (%rbp - %r13)), %rbx = start of the temp words,
   %rbp = %rcx array with its first four words preloaded into %r12 (minus 1),
   %r13, %r14 and %r15, %rcx = %r9 shifted right arithmetically by 5,
   %rdi = the value saved at 56+8(%rsp).  How the target uses these registers
   is not visible here. */
1060	xorq	%rax,%rax
1061	subq	%r13,%rbp
1062	adcq	%r15,%r15
1063	orq	%r15,%rdi
1064	subq	%rdi,%rax
1065	leaq	(%r14,%r9,1),%rbx
1066	movq	(%rcx),%r12
1067	leaq	(%rcx),%rbp
1068	movq	%r9,%rcx
1069	sarq	$3+2,%rcx
1070	movq	56+8(%rsp),%rdi
1071	decq	%r12
1072	xorq	%r10,%r10
1073	movq	8(%rbp),%r13
1074	movq	16(%rbp),%r14
1075	movq	24(%rbp),%r15
1076	jmp	.Lsqr4x_sub_entry
1077.cfi_endproc
1078.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5
 *
 * Sets up a stack frame, calls __bn_sqr8x_internal followed by
 * __bn_post4x_internal five times in a row, then calls mul4x_internal once,
 * and returns 1 in %rax.
 * Arguments presumably mirror bn_mul_mont_gather5 (rp, ap, table, np, n0,
 * num, power) -- confirm against x86_64-mont5.pl.  As used here: %rdi, %rsi,
 * %rdx and %rcx are pointers, %r8 points to one word, %r9d is the word count,
 * and the first stack argument is read later by mul4x_internal via %rax.
 */
1079.globl	bn_power5
1080.type	bn_power5,@function
1081.align	32
1082bn_power5:
1083.cfi_startproc
1084	movq	%rsp,%rax
1085.cfi_def_cfa_register	%rax
/* if bits 3, 8 and 19 (mask 0x80108) are all set in the dword at
   OPENSSL_ia32cap_P+8, continue at .Lpowerx5_enter, which is outside this
   block.  NOTE(review): per the OPENSSL_ia32cap documentation these bits
   should be BMI1, BMI2 and ADX -- confirm. */
1086	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1087	andl	$0x80108,%r11d
1088	cmpl	$0x80108,%r11d
1089	je	.Lpowerx5_enter
1090	pushq	%rbx
1091.cfi_offset	%rbx,-16
1092	pushq	%rbp
1093.cfi_offset	%rbp,-24
1094	pushq	%r12
1095.cfi_offset	%r12,-32
1096	pushq	%r13
1097.cfi_offset	%r13,-40
1098	pushq	%r14
1099.cfi_offset	%r14,-48
1100	pushq	%r15
1101.cfi_offset	%r15,-56
1102.Lpower5_prologue:
1103
/* %r9 = word count * 8 (byte length), %r10d = 3 * byte length, %r9 negated
   for the address arithmetic below; %r8 = the word %r8 pointed to */
1104	shll	$3,%r9d
1105	leal	(%r9,%r9,2),%r10d
1106	negq	%r9
1107	movq	(%r8),%r8
1108
1109
1110
1111
1112
1113
1114
1115
/* Choose the frame address in %rbp: at least 320 + 2*length bytes below %rsp,
   lowered further by an amount derived from the 4096-byte page offset of that
   address relative to %rdi (presumably to keep the frame and the %rdi buffer
   from aliasing within a page -- confirm against x86_64-mont5.pl). */
1116	leaq	-320(%rsp,%r9,2),%r11
1117	movq	%rsp,%rbp
1118	subq	%rdi,%r11
1119	andq	$4095,%r11
1120	cmpq	%r11,%r10
1121	jb	.Lpwr_sp_alt
1122	subq	%r11,%rbp
1123	leaq	-320(%rbp,%r9,2),%rbp
1124	jmp	.Lpwr_sp_done
1125
1126.align	32
1127.Lpwr_sp_alt:
1128	leaq	4096-320(,%r9,2),%r10
1129	leaq	-320(%rbp,%r9,2),%rbp
1130	subq	%r10,%r11
1131	movq	$0,%r10
1132	cmovcq	%r10,%r11
1133	subq	%r11,%rbp
1134.Lpwr_sp_done:
/* align the frame to 64 bytes, then lower %rsp to it in 4096-byte steps,
   reading one word at each step */
1135	andq	$-64,%rbp
1136	movq	%rsp,%r11
1137	subq	%rbp,%r11
1138	andq	$-4096,%r11
1139	leaq	(%r11,%rbp,1),%rsp
1140	movq	(%rsp),%r10
1141	cmpq	%rbp,%rsp
1142	ja	.Lpwr_page_walk
1143	jmp	.Lpwr_page_walk_done
1144
1145.Lpwr_page_walk:
1146	leaq	-4096(%rsp),%rsp
1147	movq	(%rsp),%r10
1148	cmpq	%rbp,%rsp
1149	ja	.Lpwr_page_walk
1150.Lpwr_page_walk_done:
1151
/* %r10 = negative byte length, %r9 = positive byte length */
1152	movq	%r9,%r10
1153	negq	%r9
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
/* frame slots: 32(%rsp) = the word loaded from (%r8), 40(%rsp) = entry %rsp */
1164	movq	%r8,32(%rsp)
1165	movq	%rax,40(%rsp)
1166.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1167.Lpower5_body:
/* the four .byte sequences encode, in order: movq %rdi,%xmm1 ; movq %rcx,%xmm2 ;
   movq %r10,%xmm3 ; movq %rdx,%xmm4 -- the values are parked in XMM registers
   across the calls below */
1168.byte	102,72,15,110,207
1169.byte	102,72,15,110,209
1170.byte	102,73,15,110,218
1171.byte	102,72,15,110,226
1172
/* five sqr8x/post4x pairs; __bn_post4x_internal is defined outside this block */
1173	call	__bn_sqr8x_internal
1174	call	__bn_post4x_internal
1175	call	__bn_sqr8x_internal
1176	call	__bn_post4x_internal
1177	call	__bn_sqr8x_internal
1178	call	__bn_post4x_internal
1179	call	__bn_sqr8x_internal
1180	call	__bn_post4x_internal
1181	call	__bn_sqr8x_internal
1182	call	__bn_post4x_internal
1183
/* the two .byte sequences encode movq %xmm2,%rcx and movq %xmm4,%rdx; then
   %rdi = %rsi, %rax = saved entry %rsp, %r8 = address of the 32(%rsp) slot,
   which is the register state mul4x_internal reads */
1184.byte	102,72,15,126,209
1185.byte	102,72,15,126,226
1186	movq	%rsi,%rdi
1187	movq	40(%rsp),%rax
1188	leaq	32(%rsp),%r8
1189
1190	call	mul4x_internal
1191
/* reload the saved entry %rsp, return 1, restore callee-saved registers */
1192	movq	40(%rsp),%rsi
1193.cfi_def_cfa	%rsi,8
1194	movq	$1,%rax
1195	movq	-48(%rsi),%r15
1196.cfi_restore	%r15
1197	movq	-40(%rsi),%r14
1198.cfi_restore	%r14
1199	movq	-32(%rsi),%r13
1200.cfi_restore	%r13
1201	movq	-24(%rsi),%r12
1202.cfi_restore	%r12
1203	movq	-16(%rsi),%rbp
1204.cfi_restore	%rbp
1205	movq	-8(%rsi),%rbx
1206.cfi_restore	%rbx
1207	leaq	(%rsi),%rsp
1208.cfi_def_cfa_register	%rsp
1209.Lpower5_epilogue:
/* bytes f3 c3: "rep ret" */
1210	.byte	0xf3,0xc3
1211.cfi_endproc
1212.size	bn_power5,.-bn_power5
1213
1214.globl	bn_sqr8x_internal
1215.hidden	bn_sqr8x_internal
1216.type	bn_sqr8x_internal,@function
1217.align	32
1218bn_sqr8x_internal:
1219__bn_sqr8x_internal:
1220.cfi_startproc
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294	leaq	32(%r10),%rbp
1295	leaq	(%rsi,%r9,1),%rsi
1296
1297	movq	%r9,%rcx
1298
1299
1300	movq	-32(%rsi,%rbp,1),%r14
1301	leaq	48+8(%rsp,%r9,2),%rdi
1302	movq	-24(%rsi,%rbp,1),%rax
1303	leaq	-32(%rdi,%rbp,1),%rdi
1304	movq	-16(%rsi,%rbp,1),%rbx
1305	movq	%rax,%r15
1306
1307	mulq	%r14
1308	movq	%rax,%r10
1309	movq	%rbx,%rax
1310	movq	%rdx,%r11
1311	movq	%r10,-24(%rdi,%rbp,1)
1312
1313	mulq	%r14
1314	addq	%rax,%r11
1315	movq	%rbx,%rax
1316	adcq	$0,%rdx
1317	movq	%r11,-16(%rdi,%rbp,1)
1318	movq	%rdx,%r10
1319
1320
1321	movq	-8(%rsi,%rbp,1),%rbx
1322	mulq	%r15
1323	movq	%rax,%r12
1324	movq	%rbx,%rax
1325	movq	%rdx,%r13
1326
1327	leaq	(%rbp),%rcx
1328	mulq	%r14
1329	addq	%rax,%r10
1330	movq	%rbx,%rax
1331	movq	%rdx,%r11
1332	adcq	$0,%r11
1333	addq	%r12,%r10
1334	adcq	$0,%r11
1335	movq	%r10,-8(%rdi,%rcx,1)
1336	jmp	.Lsqr4x_1st
1337
1338.align	32
1339.Lsqr4x_1st:
1340	movq	(%rsi,%rcx,1),%rbx
1341	mulq	%r15
1342	addq	%rax,%r13
1343	movq	%rbx,%rax
1344	movq	%rdx,%r12
1345	adcq	$0,%r12
1346
1347	mulq	%r14
1348	addq	%rax,%r11
1349	movq	%rbx,%rax
1350	movq	8(%rsi,%rcx,1),%rbx
1351	movq	%rdx,%r10
1352	adcq	$0,%r10
1353	addq	%r13,%r11
1354	adcq	$0,%r10
1355
1356
1357	mulq	%r15
1358	addq	%rax,%r12
1359	movq	%rbx,%rax
1360	movq	%r11,(%rdi,%rcx,1)
1361	movq	%rdx,%r13
1362	adcq	$0,%r13
1363
1364	mulq	%r14
1365	addq	%rax,%r10
1366	movq	%rbx,%rax
1367	movq	16(%rsi,%rcx,1),%rbx
1368	movq	%rdx,%r11
1369	adcq	$0,%r11
1370	addq	%r12,%r10
1371	adcq	$0,%r11
1372
1373	mulq	%r15
1374	addq	%rax,%r13
1375	movq	%rbx,%rax
1376	movq	%r10,8(%rdi,%rcx,1)
1377	movq	%rdx,%r12
1378	adcq	$0,%r12
1379
1380	mulq	%r14
1381	addq	%rax,%r11
1382	movq	%rbx,%rax
1383	movq	24(%rsi,%rcx,1),%rbx
1384	movq	%rdx,%r10
1385	adcq	$0,%r10
1386	addq	%r13,%r11
1387	adcq	$0,%r10
1388
1389
1390	mulq	%r15
1391	addq	%rax,%r12
1392	movq	%rbx,%rax
1393	movq	%r11,16(%rdi,%rcx,1)
1394	movq	%rdx,%r13
1395	adcq	$0,%r13
1396	leaq	32(%rcx),%rcx
1397
1398	mulq	%r14
1399	addq	%rax,%r10
1400	movq	%rbx,%rax
1401	movq	%rdx,%r11
1402	adcq	$0,%r11
1403	addq	%r12,%r10
1404	adcq	$0,%r11
1405	movq	%r10,-8(%rdi,%rcx,1)
1406
1407	cmpq	$0,%rcx
1408	jne	.Lsqr4x_1st
1409
1410	mulq	%r15
1411	addq	%rax,%r13
1412	leaq	16(%rbp),%rbp
1413	adcq	$0,%rdx
1414	addq	%r11,%r13
1415	adcq	$0,%rdx
1416
1417	movq	%r13,(%rdi)
1418	movq	%rdx,%r12
1419	movq	%rdx,8(%rdi)
1420	jmp	.Lsqr4x_outer
1421
1422.align	32
1423.Lsqr4x_outer:
1424	movq	-32(%rsi,%rbp,1),%r14
1425	leaq	48+8(%rsp,%r9,2),%rdi
1426	movq	-24(%rsi,%rbp,1),%rax
1427	leaq	-32(%rdi,%rbp,1),%rdi
1428	movq	-16(%rsi,%rbp,1),%rbx
1429	movq	%rax,%r15
1430
1431	mulq	%r14
1432	movq	-24(%rdi,%rbp,1),%r10
1433	addq	%rax,%r10
1434	movq	%rbx,%rax
1435	adcq	$0,%rdx
1436	movq	%r10,-24(%rdi,%rbp,1)
1437	movq	%rdx,%r11
1438
1439	mulq	%r14
1440	addq	%rax,%r11
1441	movq	%rbx,%rax
1442	adcq	$0,%rdx
1443	addq	-16(%rdi,%rbp,1),%r11
1444	movq	%rdx,%r10
1445	adcq	$0,%r10
1446	movq	%r11,-16(%rdi,%rbp,1)
1447
1448	xorq	%r12,%r12
1449
1450	movq	-8(%rsi,%rbp,1),%rbx
1451	mulq	%r15
1452	addq	%rax,%r12
1453	movq	%rbx,%rax
1454	adcq	$0,%rdx
1455	addq	-8(%rdi,%rbp,1),%r12
1456	movq	%rdx,%r13
1457	adcq	$0,%r13
1458
1459	mulq	%r14
1460	addq	%rax,%r10
1461	movq	%rbx,%rax
1462	adcq	$0,%rdx
1463	addq	%r12,%r10
1464	movq	%rdx,%r11
1465	adcq	$0,%r11
1466	movq	%r10,-8(%rdi,%rbp,1)
1467
1468	leaq	(%rbp),%rcx
1469	jmp	.Lsqr4x_inner
1470
1471.align	32
1472.Lsqr4x_inner:
1473	movq	(%rsi,%rcx,1),%rbx
1474	mulq	%r15
1475	addq	%rax,%r13
1476	movq	%rbx,%rax
1477	movq	%rdx,%r12
1478	adcq	$0,%r12
1479	addq	(%rdi,%rcx,1),%r13
1480	adcq	$0,%r12
1481
1482.byte	0x67
1483	mulq	%r14
1484	addq	%rax,%r11
1485	movq	%rbx,%rax
1486	movq	8(%rsi,%rcx,1),%rbx
1487	movq	%rdx,%r10
1488	adcq	$0,%r10
1489	addq	%r13,%r11
1490	adcq	$0,%r10
1491
1492	mulq	%r15
1493	addq	%rax,%r12
1494	movq	%r11,(%rdi,%rcx,1)
1495	movq	%rbx,%rax
1496	movq	%rdx,%r13
1497	adcq	$0,%r13
1498	addq	8(%rdi,%rcx,1),%r12
1499	leaq	16(%rcx),%rcx
1500	adcq	$0,%r13
1501
1502	mulq	%r14
1503	addq	%rax,%r10
1504	movq	%rbx,%rax
1505	adcq	$0,%rdx
1506	addq	%r12,%r10
1507	movq	%rdx,%r11
1508	adcq	$0,%r11
1509	movq	%r10,-8(%rdi,%rcx,1)
1510
1511	cmpq	$0,%rcx
1512	jne	.Lsqr4x_inner
1513
1514.byte	0x67
1515	mulq	%r15
1516	addq	%rax,%r13
1517	adcq	$0,%rdx
1518	addq	%r11,%r13
1519	adcq	$0,%rdx
1520
1521	movq	%r13,(%rdi)
1522	movq	%rdx,%r12
1523	movq	%rdx,8(%rdi)
1524
1525	addq	$16,%rbp
1526	jnz	.Lsqr4x_outer
1527
1528
1529	movq	-32(%rsi),%r14
1530	leaq	48+8(%rsp,%r9,2),%rdi
1531	movq	-24(%rsi),%rax
1532	leaq	-32(%rdi,%rbp,1),%rdi
1533	movq	-16(%rsi),%rbx
1534	movq	%rax,%r15
1535
1536	mulq	%r14
1537	addq	%rax,%r10
1538	movq	%rbx,%rax
1539	movq	%rdx,%r11
1540	adcq	$0,%r11
1541
1542	mulq	%r14
1543	addq	%rax,%r11
1544	movq	%rbx,%rax
1545	movq	%r10,-24(%rdi)
1546	movq	%rdx,%r10
1547	adcq	$0,%r10
1548	addq	%r13,%r11
1549	movq	-8(%rsi),%rbx
1550	adcq	$0,%r10
1551
1552	mulq	%r15
1553	addq	%rax,%r12
1554	movq	%rbx,%rax
1555	movq	%r11,-16(%rdi)
1556	movq	%rdx,%r13
1557	adcq	$0,%r13
1558
1559	mulq	%r14
1560	addq	%rax,%r10
1561	movq	%rbx,%rax
1562	movq	%rdx,%r11
1563	adcq	$0,%r11
1564	addq	%r12,%r10
1565	adcq	$0,%r11
1566	movq	%r10,-8(%rdi)
1567
1568	mulq	%r15
1569	addq	%rax,%r13
1570	movq	-16(%rsi),%rax
1571	adcq	$0,%rdx
1572	addq	%r11,%r13
1573	adcq	$0,%rdx
1574
1575	movq	%r13,(%rdi)
1576	movq	%rdx,%r12
1577	movq	%rdx,8(%rdi)
1578
1579	mulq	%rbx
1580	addq	$16,%rbp
1581	xorq	%r14,%r14
1582	subq	%r9,%rbp
1583	xorq	%r15,%r15
1584
1585	addq	%r12,%rax
1586	adcq	$0,%rdx
1587	movq	%rax,8(%rdi)
1588	movq	%rdx,16(%rdi)
1589	movq	%r15,24(%rdi)
1590
1591	movq	-16(%rsi,%rbp,1),%rax
1592	leaq	48+8(%rsp),%rdi
1593	xorq	%r10,%r10
1594	movq	8(%rdi),%r11
1595
1596	leaq	(%r14,%r10,2),%r12
1597	shrq	$63,%r10
1598	leaq	(%rcx,%r11,2),%r13
1599	shrq	$63,%r11
1600	orq	%r10,%r13
1601	movq	16(%rdi),%r10
1602	movq	%r11,%r14
1603	mulq	%rax
1604	negq	%r15
1605	movq	24(%rdi),%r11
1606	adcq	%rax,%r12
1607	movq	-8(%rsi,%rbp,1),%rax
1608	movq	%r12,(%rdi)
1609	adcq	%rdx,%r13
1610
1611	leaq	(%r14,%r10,2),%rbx
1612	movq	%r13,8(%rdi)
1613	sbbq	%r15,%r15
1614	shrq	$63,%r10
1615	leaq	(%rcx,%r11,2),%r8
1616	shrq	$63,%r11
1617	orq	%r10,%r8
1618	movq	32(%rdi),%r10
1619	movq	%r11,%r14
1620	mulq	%rax
1621	negq	%r15
1622	movq	40(%rdi),%r11
1623	adcq	%rax,%rbx
1624	movq	0(%rsi,%rbp,1),%rax
1625	movq	%rbx,16(%rdi)
1626	adcq	%rdx,%r8
1627	leaq	16(%rbp),%rbp
1628	movq	%r8,24(%rdi)
1629	sbbq	%r15,%r15
1630	leaq	64(%rdi),%rdi
1631	jmp	.Lsqr4x_shift_n_add
1632
1633.align	32
1634.Lsqr4x_shift_n_add:
1635	leaq	(%r14,%r10,2),%r12
1636	shrq	$63,%r10
1637	leaq	(%rcx,%r11,2),%r13
1638	shrq	$63,%r11
1639	orq	%r10,%r13
1640	movq	-16(%rdi),%r10
1641	movq	%r11,%r14
1642	mulq	%rax
1643	negq	%r15
1644	movq	-8(%rdi),%r11
1645	adcq	%rax,%r12
1646	movq	-8(%rsi,%rbp,1),%rax
1647	movq	%r12,-32(%rdi)
1648	adcq	%rdx,%r13
1649
1650	leaq	(%r14,%r10,2),%rbx
1651	movq	%r13,-24(%rdi)
1652	sbbq	%r15,%r15
1653	shrq	$63,%r10
1654	leaq	(%rcx,%r11,2),%r8
1655	shrq	$63,%r11
1656	orq	%r10,%r8
1657	movq	0(%rdi),%r10
1658	movq	%r11,%r14
1659	mulq	%rax
1660	negq	%r15
1661	movq	8(%rdi),%r11
1662	adcq	%rax,%rbx
1663	movq	0(%rsi,%rbp,1),%rax
1664	movq	%rbx,-16(%rdi)
1665	adcq	%rdx,%r8
1666
1667	leaq	(%r14,%r10,2),%r12
1668	movq	%r8,-8(%rdi)
1669	sbbq	%r15,%r15
1670	shrq	$63,%r10
1671	leaq	(%rcx,%r11,2),%r13
1672	shrq	$63,%r11
1673	orq	%r10,%r13
1674	movq	16(%rdi),%r10
1675	movq	%r11,%r14
1676	mulq	%rax
1677	negq	%r15
1678	movq	24(%rdi),%r11
1679	adcq	%rax,%r12
1680	movq	8(%rsi,%rbp,1),%rax
1681	movq	%r12,0(%rdi)
1682	adcq	%rdx,%r13
1683
1684	leaq	(%r14,%r10,2),%rbx
1685	movq	%r13,8(%rdi)
1686	sbbq	%r15,%r15
1687	shrq	$63,%r10
1688	leaq	(%rcx,%r11,2),%r8
1689	shrq	$63,%r11
1690	orq	%r10,%r8
1691	movq	32(%rdi),%r10
1692	movq	%r11,%r14
1693	mulq	%rax
1694	negq	%r15
1695	movq	40(%rdi),%r11
1696	adcq	%rax,%rbx
1697	movq	16(%rsi,%rbp,1),%rax
1698	movq	%rbx,16(%rdi)
1699	adcq	%rdx,%r8
1700	movq	%r8,24(%rdi)
1701	sbbq	%r15,%r15
1702	leaq	64(%rdi),%rdi
1703	addq	$32,%rbp
1704	jnz	.Lsqr4x_shift_n_add
1705
1706	leaq	(%r14,%r10,2),%r12
1707.byte	0x67
1708	shrq	$63,%r10
1709	leaq	(%rcx,%r11,2),%r13
1710	shrq	$63,%r11
1711	orq	%r10,%r13
1712	movq	-16(%rdi),%r10
1713	movq	%r11,%r14
1714	mulq	%rax
1715	negq	%r15
1716	movq	-8(%rdi),%r11
1717	adcq	%rax,%r12
1718	movq	-8(%rsi),%rax
1719	movq	%r12,-32(%rdi)
1720	adcq	%rdx,%r13
1721
1722	leaq	(%r14,%r10,2),%rbx
1723	movq	%r13,-24(%rdi)
1724	sbbq	%r15,%r15
1725	shrq	$63,%r10
1726	leaq	(%rcx,%r11,2),%r8
1727	shrq	$63,%r11
1728	orq	%r10,%r8
1729	mulq	%rax
1730	negq	%r15
1731	adcq	%rax,%rbx
1732	adcq	%rdx,%r8
1733	movq	%rbx,-16(%rdi)
1734	movq	%r8,-8(%rdi)
1735.byte	102,72,15,126,213
1736__bn_sqr8x_reduction:
1737	xorq	%rax,%rax
1738	leaq	(%r9,%rbp,1),%rcx
1739	leaq	48+8(%rsp,%r9,2),%rdx
1740	movq	%rcx,0+8(%rsp)
1741	leaq	48+8(%rsp,%r9,1),%rdi
1742	movq	%rdx,8+8(%rsp)
1743	negq	%r9
1744	jmp	.L8x_reduction_loop
1745
1746.align	32
1747.L8x_reduction_loop:
1748	leaq	(%rdi,%r9,1),%rdi
1749.byte	0x66
1750	movq	0(%rdi),%rbx
1751	movq	8(%rdi),%r9
1752	movq	16(%rdi),%r10
1753	movq	24(%rdi),%r11
1754	movq	32(%rdi),%r12
1755	movq	40(%rdi),%r13
1756	movq	48(%rdi),%r14
1757	movq	56(%rdi),%r15
1758	movq	%rax,(%rdx)
1759	leaq	64(%rdi),%rdi
1760
1761.byte	0x67
1762	movq	%rbx,%r8
1763	imulq	32+8(%rsp),%rbx
1764	movq	0(%rbp),%rax
1765	movl	$8,%ecx
1766	jmp	.L8x_reduce
1767
1768.align	32
1769.L8x_reduce:
1770	mulq	%rbx
1771	movq	8(%rbp),%rax
1772	negq	%r8
1773	movq	%rdx,%r8
1774	adcq	$0,%r8
1775
1776	mulq	%rbx
1777	addq	%rax,%r9
1778	movq	16(%rbp),%rax
1779	adcq	$0,%rdx
1780	addq	%r9,%r8
1781	movq	%rbx,48-8+8(%rsp,%rcx,8)
1782	movq	%rdx,%r9
1783	adcq	$0,%r9
1784
1785	mulq	%rbx
1786	addq	%rax,%r10
1787	movq	24(%rbp),%rax
1788	adcq	$0,%rdx
1789	addq	%r10,%r9
1790	movq	32+8(%rsp),%rsi
1791	movq	%rdx,%r10
1792	adcq	$0,%r10
1793
1794	mulq	%rbx
1795	addq	%rax,%r11
1796	movq	32(%rbp),%rax
1797	adcq	$0,%rdx
1798	imulq	%r8,%rsi
1799	addq	%r11,%r10
1800	movq	%rdx,%r11
1801	adcq	$0,%r11
1802
1803	mulq	%rbx
1804	addq	%rax,%r12
1805	movq	40(%rbp),%rax
1806	adcq	$0,%rdx
1807	addq	%r12,%r11
1808	movq	%rdx,%r12
1809	adcq	$0,%r12
1810
1811	mulq	%rbx
1812	addq	%rax,%r13
1813	movq	48(%rbp),%rax
1814	adcq	$0,%rdx
1815	addq	%r13,%r12
1816	movq	%rdx,%r13
1817	adcq	$0,%r13
1818
1819	mulq	%rbx
1820	addq	%rax,%r14
1821	movq	56(%rbp),%rax
1822	adcq	$0,%rdx
1823	addq	%r14,%r13
1824	movq	%rdx,%r14
1825	adcq	$0,%r14
1826
1827	mulq	%rbx
1828	movq	%rsi,%rbx
1829	addq	%rax,%r15
1830	movq	0(%rbp),%rax
1831	adcq	$0,%rdx
1832	addq	%r15,%r14
1833	movq	%rdx,%r15
1834	adcq	$0,%r15
1835
1836	decl	%ecx
1837	jnz	.L8x_reduce
1838
1839	leaq	64(%rbp),%rbp
1840	xorq	%rax,%rax
1841	movq	8+8(%rsp),%rdx
1842	cmpq	0+8(%rsp),%rbp
1843	jae	.L8x_no_tail
1844
1845.byte	0x66
1846	addq	0(%rdi),%r8
1847	adcq	8(%rdi),%r9
1848	adcq	16(%rdi),%r10
1849	adcq	24(%rdi),%r11
1850	adcq	32(%rdi),%r12
1851	adcq	40(%rdi),%r13
1852	adcq	48(%rdi),%r14
1853	adcq	56(%rdi),%r15
1854	sbbq	%rsi,%rsi
1855
1856	movq	48+56+8(%rsp),%rbx
1857	movl	$8,%ecx
1858	movq	0(%rbp),%rax
1859	jmp	.L8x_tail
1860
1861.align	32
1862.L8x_tail:
1863	mulq	%rbx
1864	addq	%rax,%r8
1865	movq	8(%rbp),%rax
1866	movq	%r8,(%rdi)
1867	movq	%rdx,%r8
1868	adcq	$0,%r8
1869
1870	mulq	%rbx
1871	addq	%rax,%r9
1872	movq	16(%rbp),%rax
1873	adcq	$0,%rdx
1874	addq	%r9,%r8
1875	leaq	8(%rdi),%rdi
1876	movq	%rdx,%r9
1877	adcq	$0,%r9
1878
1879	mulq	%rbx
1880	addq	%rax,%r10
1881	movq	24(%rbp),%rax
1882	adcq	$0,%rdx
1883	addq	%r10,%r9
1884	movq	%rdx,%r10
1885	adcq	$0,%r10
1886
1887	mulq	%rbx
1888	addq	%rax,%r11
1889	movq	32(%rbp),%rax
1890	adcq	$0,%rdx
1891	addq	%r11,%r10
1892	movq	%rdx,%r11
1893	adcq	$0,%r11
1894
1895	mulq	%rbx
1896	addq	%rax,%r12
1897	movq	40(%rbp),%rax
1898	adcq	$0,%rdx
1899	addq	%r12,%r11
1900	movq	%rdx,%r12
1901	adcq	$0,%r12
1902
1903	mulq	%rbx
1904	addq	%rax,%r13
1905	movq	48(%rbp),%rax
1906	adcq	$0,%rdx
1907	addq	%r13,%r12
1908	movq	%rdx,%r13
1909	adcq	$0,%r13
1910
1911	mulq	%rbx
1912	addq	%rax,%r14
1913	movq	56(%rbp),%rax
1914	adcq	$0,%rdx
1915	addq	%r14,%r13
1916	movq	%rdx,%r14
1917	adcq	$0,%r14
1918
1919	mulq	%rbx
1920	movq	48-16+8(%rsp,%rcx,8),%rbx
1921	addq	%rax,%r15
1922	adcq	$0,%rdx
1923	addq	%r15,%r14
1924	movq	0(%rbp),%rax
1925	movq	%rdx,%r15
1926	adcq	$0,%r15
1927
1928	decl	%ecx
1929	jnz	.L8x_tail
1930
1931	leaq	64(%rbp),%rbp
1932	movq	8+8(%rsp),%rdx
1933	cmpq	0+8(%rsp),%rbp
1934	jae	.L8x_tail_done
1935
1936	movq	48+56+8(%rsp),%rbx
1937	negq	%rsi
1938	movq	0(%rbp),%rax
1939	adcq	0(%rdi),%r8
1940	adcq	8(%rdi),%r9
1941	adcq	16(%rdi),%r10
1942	adcq	24(%rdi),%r11
1943	adcq	32(%rdi),%r12
1944	adcq	40(%rdi),%r13
1945	adcq	48(%rdi),%r14
1946	adcq	56(%rdi),%r15
1947	sbbq	%rsi,%rsi
1948
1949	movl	$8,%ecx
1950	jmp	.L8x_tail
1951
1952.align	32
1953.L8x_tail_done:
1954	xorq	%rax,%rax
1955	addq	(%rdx),%r8
1956	adcq	$0,%r9
1957	adcq	$0,%r10
1958	adcq	$0,%r11
1959	adcq	$0,%r12
1960	adcq	$0,%r13
1961	adcq	$0,%r14
1962	adcq	$0,%r15
1963	adcq	$0,%rax
1964
1965	negq	%rsi
1966.L8x_no_tail:
1967	adcq	0(%rdi),%r8
1968	adcq	8(%rdi),%r9
1969	adcq	16(%rdi),%r10
1970	adcq	24(%rdi),%r11
1971	adcq	32(%rdi),%r12
1972	adcq	40(%rdi),%r13
1973	adcq	48(%rdi),%r14
1974	adcq	56(%rdi),%r15
1975	adcq	$0,%rax
1976	movq	-8(%rbp),%rcx
1977	xorq	%rsi,%rsi
1978
1979.byte	102,72,15,126,213
1980
1981	movq	%r8,0(%rdi)
1982	movq	%r9,8(%rdi)
1983.byte	102,73,15,126,217
1984	movq	%r10,16(%rdi)
1985	movq	%r11,24(%rdi)
1986	movq	%r12,32(%rdi)
1987	movq	%r13,40(%rdi)
1988	movq	%r14,48(%rdi)
1989	movq	%r15,56(%rdi)
1990	leaq	64(%rdi),%rdi
1991
1992	cmpq	%rdx,%rdi
1993	jb	.L8x_reduction_loop
1994	.byte	0xf3,0xc3
1995.cfi_endproc
1996.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal -- masked subtraction pass, four 64-bit words per
 * iteration, with no data-dependent branch on the selector.
 *
 * In (as read by the code below):
 *   rbp  = pointer to the words that are complemented and added
 *          (NOTE(review): presumably the modulus -- confirm against callers)
 *   rdi  = used only to form the source pointer rbx = rdi + r9
 *   r9   = byte count; it is shifted right by 5 and counted UP to zero, so it
 *          is expected to be negative and a multiple of 32 on entry
 *   rax  = selector, negated into an AND mask (0 stays 0, 1 becomes all-ones)
 *   xmm1 = 64-bit value copied into both rdi (store pointer) and rsi
 * Out:
 *   r10 = r9 as passed in, r9 = -r9; rdi/rbx/rbp advanced past the data.
 *
 * Per word: dst = src + (mask & ~w) + carry.  The very first word is
 * decremented before the NOT, and ~(w0 - 1) == -w0, which supplies the "+1"
 * of a two's-complement negation; with an all-ones mask (and w0 != 0) the
 * loop therefore computes src - w, with a zero mask it copies src.
 */
1997.type	__bn_post4x_internal,@function
1998.align	32
1999__bn_post4x_internal:
2000.cfi_startproc
2001	movq	0(%rbp),%r12
2002	leaq	(%rdi,%r9,1),%rbx	/* source pointer, formed before rdi is replaced */
2003	movq	%r9,%rcx
2004.byte	102,72,15,126,207	/* encodes: movq %xmm1,%rdi */
2005	negq	%rax	/* selector -> AND mask */
2006.byte	102,72,15,126,206	/* encodes: movq %xmm1,%rsi */
2007	sarq	$3+2,%rcx	/* bytes / 32 = number of 4-word groups (sign kept) */
2008	decq	%r12	/* so that the NOT below yields -w0 for the first word */
2009	xorq	%r10,%r10	/* r10 carries the saved borrow between iterations; start at 0 */
2010	movq	8(%rbp),%r13
2011	movq	16(%rbp),%r14
2012	movq	24(%rbp),%r15
2013	jmp	.Lsqr4x_sub_entry
2014
2015.align	16
2016.Lsqr4x_sub:
2017	movq	0(%rbp),%r12
2018	movq	8(%rbp),%r13
2019	movq	16(%rbp),%r14
2020	movq	24(%rbp),%r15
2021.Lsqr4x_sub_entry:
2022	leaq	32(%rbp),%rbp
2023	notq	%r12
2024	notq	%r13
2025	notq	%r14
2026	notq	%r15
2027	andq	%rax,%r12
2028	andq	%rax,%r13
2029	andq	%rax,%r14
2030	andq	%rax,%r15
2031
2032	negq	%r10	/* CF = (r10 != 0): restore the carry saved by sbbq below */
2033	adcq	0(%rbx),%r12
2034	adcq	8(%rbx),%r13
2035	adcq	16(%rbx),%r14
2036	adcq	24(%rbx),%r15
2037	movq	%r12,0(%rdi)
2038	leaq	32(%rbx),%rbx
2039	movq	%r13,8(%rdi)
2040	sbbq	%r10,%r10	/* save CF as 0 / -1; lea and mov around it leave flags alone */
2041	movq	%r14,16(%rdi)
2042	movq	%r15,24(%rdi)
2043	leaq	32(%rdi),%rdi
2044
2045	incq	%rcx
2046	jnz	.Lsqr4x_sub
2047
2048	movq	%r9,%r10
2049	negq	%r9
2050	.byte	0xf3,0xc3	/* encodes: rep ret */
2051.cfi_endproc
2052.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_from_montgomery -- exported dispatcher.
 *
 * Tests the low three bits of r9d (the sixth integer argument under the
 * System V AMD64 convention; NOTE(review): presumably the word count --
 * confirm against the C prototype).  When it is a multiple of 8 the routine
 * tail-jumps to bn_from_mont8x with all argument registers untouched;
 * otherwise it returns 0 in eax without doing any work.
 */
2053.globl	bn_from_montgomery
2054.type	bn_from_montgomery,@function
2055.align	32
2056bn_from_montgomery:
2057.cfi_startproc
2058	testl	$7,%r9d
2059	jz	bn_from_mont8x	/* tail jump: bn_from_mont8x returns to our caller */
2060	xorl	%eax,%eax	/* unsupported size: return 0 */
2061	.byte	0xf3,0xc3	/* encodes: rep ret */
2062.cfi_endproc
2063.size	bn_from_montgomery,.-bn_from_montgomery
2064
/*
 * bn_from_mont8x -- reached only via the tail jump in bn_from_montgomery.
 *
 * Register use on entry (as read below):
 *   rdi = saved in xmm1 (and, on the MULX path, rebased before the call)
 *   rsi = source words copied into the stack frame
 *   rcx = copied to rbp and xmm2 for the reduction routines
 *   r8  = pointer; the 64-bit word it points to is stored at 32(%rsp)
 *   r9d = word count, converted to a byte count (<< 3)
 * Steps:
 *   1. save callee-saved registers, build and probe a 64-byte aligned frame;
 *   2. copy the r9 source bytes to 48(%rsp) and zero the r9 bytes after them;
 *   3. run the reduction + post-processing pair, choosing the MULX/ADX
 *      variants when the capability bits allow it;
 *   4. wipe the frame buffer, restore registers and return 1 in rax.
 */
2065.type	bn_from_mont8x,@function
2066.align	32
2067bn_from_mont8x:
2068.cfi_startproc
2069.byte	0x67	/* prefix byte attached to the next instruction; NOTE(review): presumably generator padding */
2070	movq	%rsp,%rax	/* rax = entry rsp, kept for the epilogue and the CFA */
2071.cfi_def_cfa_register	%rax
2072	pushq	%rbx
2073.cfi_offset	%rbx,-16
2074	pushq	%rbp
2075.cfi_offset	%rbp,-24
2076	pushq	%r12
2077.cfi_offset	%r12,-32
2078	pushq	%r13
2079.cfi_offset	%r13,-40
2080	pushq	%r14
2081.cfi_offset	%r14,-48
2082	pushq	%r15
2083.cfi_offset	%r15,-56
2084.Lfrom_prologue:
2085
2086	shll	$3,%r9d	/* words -> bytes (32-bit write also clears the upper half of r9) */
2087	leaq	(%r9,%r9,2),%r10	/* r10 = 3 * byte count */
2088	negq	%r9
2089	movq	(%r8),%r8	/* load the word r8 points to */
2090
2091
2092
2093
2094
2095
2096
2097
/*
 * Pick the frame base in rbp: 320 + 2*bytes below rsp, moved further down
 * by an amount derived from (frame - rdi) mod 4096.
 * NOTE(review): presumably this keeps the frame from aliasing the rdi
 * buffers modulo 4 KiB -- confirm against x86_64-mont5.pl.
 */
2098	leaq	-320(%rsp,%r9,2),%r11
2099	movq	%rsp,%rbp
2100	subq	%rdi,%r11
2101	andq	$4095,%r11
2102	cmpq	%r11,%r10
2103	jb	.Lfrom_sp_alt
2104	subq	%r11,%rbp
2105	leaq	-320(%rbp,%r9,2),%rbp
2106	jmp	.Lfrom_sp_done
2107
2108.align	32
2109.Lfrom_sp_alt:
2110	leaq	4096-320(,%r9,2),%r10
2111	leaq	-320(%rbp,%r9,2),%rbp
2112	subq	%r10,%r11
2113	movq	$0,%r10	/* mov (not xor) so the borrow from subq survives for cmovc */
2114	cmovcq	%r10,%r11	/* clamp the adjustment to 0 when the subtraction borrowed */
2115	subq	%r11,%rbp
2116.Lfrom_sp_done:
2117	andq	$-64,%rbp	/* 64-byte align the frame */
/*
 * Move rsp down to rbp one 4096-byte page at a time, reading a word from
 * each page on the way so that no page between the old and new rsp is
 * skipped.
 */
2118	movq	%rsp,%r11
2119	subq	%rbp,%r11
2120	andq	$-4096,%r11
2121	leaq	(%r11,%rbp,1),%rsp
2122	movq	(%rsp),%r10
2123	cmpq	%rbp,%rsp
2124	ja	.Lfrom_page_walk
2125	jmp	.Lfrom_page_walk_done
2126
2127.Lfrom_page_walk:
2128	leaq	-4096(%rsp),%rsp
2129	movq	(%rsp),%r10
2130	cmpq	%rbp,%rsp
2131	ja	.Lfrom_page_walk
2132.Lfrom_page_walk_done:
2133
2134	movq	%r9,%r10	/* r10 = -bytes */
2135	negq	%r9	/* r9  = +bytes */
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146	movq	%r8,32(%rsp)	/* word loaded from (r8) */
2147	movq	%rax,40(%rsp)	/* entry rsp, reloaded in the epilogue */
/* CFA expression: DW_OP_breg7 +40, DW_OP_deref, DW_OP_plus_uconst 8, i.e. CFA = *(rsp+40) + 8 */
2148.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2149.Lfrom_body:
2150	movq	%r9,%r11	/* r11 = bytes left to copy */
2151	leaq	48(%rsp),%rax	/* rax = start of the frame buffer */
2152	pxor	%xmm0,%xmm0
2153	jmp	.Lmul_by_1
2154
/*
 * Copy 64 source bytes per iteration into the buffer (unaligned loads,
 * aligned stores) and zero the 64 bytes located r9 bytes further up, so the
 * buffer ends up holding the input followed by an equally long zero block.
 */
2155.align	32
2156.Lmul_by_1:
2157	movdqu	(%rsi),%xmm1
2158	movdqu	16(%rsi),%xmm2
2159	movdqu	32(%rsi),%xmm3
2160	movdqa	%xmm0,(%rax,%r9,1)
2161	movdqu	48(%rsi),%xmm4
2162	movdqa	%xmm0,16(%rax,%r9,1)
2163.byte	0x48,0x8d,0xb6,0x40,0x00,0x00,0x00	/* encodes: leaq 64(%rsi),%rsi (32-bit displacement form) */
2164	movdqa	%xmm1,(%rax)
2165	movdqa	%xmm0,32(%rax,%r9,1)
2166	movdqa	%xmm2,16(%rax)
2167	movdqa	%xmm0,48(%rax,%r9,1)
2168	movdqa	%xmm3,32(%rax)
2169	movdqa	%xmm4,48(%rax)
2170	leaq	64(%rax),%rax
2171	subq	$64,%r11
2172	jnz	.Lmul_by_1
2173
2174.byte	102,72,15,110,207	/* encodes: movq %rdi,%xmm1 */
2175.byte	102,72,15,110,209	/* encodes: movq %rcx,%xmm2 */
2176.byte	0x67
2177	movq	%rcx,%rbp
2178.byte	102,73,15,110,218	/* encodes: movq %r10,%xmm3 */
/*
 * Capability check: all of bits 3, 8 and 19 (mask 0x80108) must be set in
 * the dword at OPENSSL_ia32cap_P+8 to take the MULX/ADCX/ADOX path.
 * NOTE(review): presumably CPUID.(EAX=7):EBX, i.e. BMI1, BMI2 and ADX --
 * confirm against the OPENSSL_ia32cap documentation.
 */
2179	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
2180	andl	$0x80108,%r11d
2181	cmpl	$0x80108,%r11d
2182	jne	.Lfrom_mont_nox
2183
2184	leaq	(%rax,%r9,1),%rdi	/* rax is at the end of the copied input here; rdi = that + bytes */
2185	call	__bn_sqrx8x_reduction
2186	call	__bn_postx4x_internal
2187
2188	pxor	%xmm0,%xmm0
2189	leaq	48(%rsp),%rax
2190	jmp	.Lfrom_mont_zero
2191
2192.align	32
2193.Lfrom_mont_nox:
2194	call	__bn_sqr8x_reduction
2195	call	__bn_post4x_internal
2196
2197	pxor	%xmm0,%xmm0
2198	leaq	48(%rsp),%rax
2199	jmp	.Lfrom_mont_zero
2200
/*
 * Wipe the frame buffer: 64 bytes are cleared per iteration while r9 drops
 * by 32, so 2*r9 bytes starting at 48(%rsp) are zeroed.  The reload of rsi
 * from 40(%rsp) is inside the loop and simply repeats each pass.
 */
2201.align	32
2202.Lfrom_mont_zero:
2203	movq	40(%rsp),%rsi
2204.cfi_def_cfa	%rsi,8
2205	movdqa	%xmm0,0(%rax)
2206	movdqa	%xmm0,16(%rax)
2207	movdqa	%xmm0,32(%rax)
2208	movdqa	%xmm0,48(%rax)
2209	leaq	64(%rax),%rax
2210	subq	$32,%r9
2211	jnz	.Lfrom_mont_zero
2212
2213	movq	$1,%rax	/* return value 1 */
/* Restore callee-saved registers from the slots pushed below the entry rsp. */
2214	movq	-48(%rsi),%r15
2215.cfi_restore	%r15
2216	movq	-40(%rsi),%r14
2217.cfi_restore	%r14
2218	movq	-32(%rsi),%r13
2219.cfi_restore	%r13
2220	movq	-24(%rsi),%r12
2221.cfi_restore	%r12
2222	movq	-16(%rsi),%rbp
2223.cfi_restore	%rbp
2224	movq	-8(%rsi),%rbx
2225.cfi_restore	%rbx
2226	leaq	(%rsi),%rsp	/* back to the entry rsp */
2227.cfi_def_cfa_register	%rsp
2228.Lfrom_epilogue:
2229	.byte	0xf3,0xc3	/* encodes: rep ret */
2230.cfi_endproc
2231.size	bn_from_mont8x,.-bn_from_mont8x
/*
 * bn_mulx4x_mont_gather5 -- frame set-up wrapper around mulx4x_internal.
 *
 * The symbol has no .globl directive in this range; .Lmulx4x_enter is a
 * second entry point placed after rax has been loaded with the entry rsp.
 * NOTE(review): presumably reached from the dispatcher earlier in the file
 * with rax already set -- confirm.
 *
 * What the wrapper itself does:
 *   - saves the callee-saved registers;
 *   - turns r9d into a byte count (<< 3), then negates it, so
 *     mulx4x_internal receives r9 = -8 * word count;
 *   - replaces r8 by the 64-bit word it points to and stores it at 32(%rsp);
 *   - builds a 64-byte aligned, page-probed frame of 320 + 2*bytes;
 *   - stores the entry rsp at 40(%rsp), calls mulx4x_internal, restores the
 *     registers and returns 1 in rax.
 * rdi, rsi, rdx and rcx are passed through to mulx4x_internal unchanged.
 */
2232.type	bn_mulx4x_mont_gather5,@function
2233.align	32
2234bn_mulx4x_mont_gather5:
2235.cfi_startproc
2236	movq	%rsp,%rax	/* rax = entry rsp */
2237.cfi_def_cfa_register	%rax
2238.Lmulx4x_enter:
2239	pushq	%rbx
2240.cfi_offset	%rbx,-16
2241	pushq	%rbp
2242.cfi_offset	%rbp,-24
2243	pushq	%r12
2244.cfi_offset	%r12,-32
2245	pushq	%r13
2246.cfi_offset	%r13,-40
2247	pushq	%r14
2248.cfi_offset	%r14,-48
2249	pushq	%r15
2250.cfi_offset	%r15,-56
2251.Lmulx4x_prologue:
2252
2253	shll	$3,%r9d	/* words -> bytes */
2254	leaq	(%r9,%r9,2),%r10	/* r10 = 3 * byte count */
2255	negq	%r9	/* r9 stays negative for mulx4x_internal */
2256	movq	(%r8),%r8	/* load the word r8 points to */
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
/*
 * Pick the frame base in rbp: 320 + 2*bytes below rsp, moved further down
 * by an amount derived from (frame - rdi) mod 4096.
 * NOTE(review): presumably avoids 4 KiB aliasing with the rdi buffers --
 * confirm against x86_64-mont5.pl.
 */
2267	leaq	-320(%rsp,%r9,2),%r11
2268	movq	%rsp,%rbp
2269	subq	%rdi,%r11
2270	andq	$4095,%r11
2271	cmpq	%r11,%r10
2272	jb	.Lmulx4xsp_alt
2273	subq	%r11,%rbp
2274	leaq	-320(%rbp,%r9,2),%rbp
2275	jmp	.Lmulx4xsp_done
2276
2277.Lmulx4xsp_alt:
2278	leaq	4096-320(,%r9,2),%r10
2279	leaq	-320(%rbp,%r9,2),%rbp
2280	subq	%r10,%r11
2281	movq	$0,%r10	/* mov (not xor) keeps the borrow from subq for cmovc */
2282	cmovcq	%r10,%r11	/* clamp the adjustment to 0 on borrow */
2283	subq	%r11,%rbp
2284.Lmulx4xsp_done:
2285	andq	$-64,%rbp	/* 64-byte align the frame */
/* Step rsp down to rbp one 4096-byte page at a time, touching each page. */
2286	movq	%rsp,%r11
2287	subq	%rbp,%r11
2288	andq	$-4096,%r11
2289	leaq	(%r11,%rbp,1),%rsp
2290	movq	(%rsp),%r10
2291	cmpq	%rbp,%rsp
2292	ja	.Lmulx4x_page_walk
2293	jmp	.Lmulx4x_page_walk_done
2294
2295.Lmulx4x_page_walk:
2296	leaq	-4096(%rsp),%rsp
2297	movq	(%rsp),%r10
2298	cmpq	%rbp,%rsp
2299	ja	.Lmulx4x_page_walk
2300.Lmulx4x_page_walk_done:
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314	movq	%r8,32(%rsp)	/* read back by mulx4x_internal as 32+8(%rsp) */
2315	movq	%rax,40(%rsp)	/* entry rsp, reloaded in the epilogue */
/* CFA expression: DW_OP_breg7 +40, DW_OP_deref, DW_OP_plus_uconst 8, i.e. CFA = *(rsp+40) + 8 */
2316.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2317.Lmulx4x_body:
2318	call	mulx4x_internal
2319
2320	movq	40(%rsp),%rsi	/* rsi = entry rsp */
2321.cfi_def_cfa	%rsi,8
2322	movq	$1,%rax	/* return value 1 */
2323
2324	movq	-48(%rsi),%r15
2325.cfi_restore	%r15
2326	movq	-40(%rsi),%r14
2327.cfi_restore	%r14
2328	movq	-32(%rsi),%r13
2329.cfi_restore	%r13
2330	movq	-24(%rsi),%r12
2331.cfi_restore	%r12
2332	movq	-16(%rsi),%rbp
2333.cfi_restore	%rbp
2334	movq	-8(%rsi),%rbx
2335.cfi_restore	%rbx
2336	leaq	(%rsi),%rsp
2337.cfi_def_cfa_register	%rsp
2338.Lmulx4x_epilogue:
2339	.byte	0xf3,0xc3	/* encodes: rep ret */
2340.cfi_endproc
2341.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2342
/*
 * mulx4x_internal -- MULX/ADCX/ADOX multiply-and-reduce core, entered with
 * `call` from bn_mulx4x_mont_gather5 and bn_powerx5.
 *
 * The call pushed a return address, so every N+8(%rsp) slot used here is the
 * caller's N(%rsp); 32+8(%rsp) is the word both callers stored at 32(%rsp).
 *
 * In (as used below):
 *   rax = caller's entry rsp; the dword at 8(%rax) is the gather index
 *         (NOTE(review): presumably the first stack-passed argument)
 *   rdx = base of the gather table (256 bytes are consumed per outer pass)
 *   rsi = words multiplied by the gathered word
 *   rcx = words multiplied by the per-pass reduction factor
 *         (NOTE(review): presumably a[] and the modulus -- confirm in .pl)
 *   rdi = saved at 56+8(%rsp), handed back in rdx before the final jump
 *   r9  = negative byte length (bn_mulx4x_mont_gather5 passes -8 * words)
 * Frame slots written: 0+8 = byte length, 8+8 = next table pointer,
 *   16+8 = table end marker, 24+8 = inner-loop count, 56+8 = saved rdi.
 * Exit: jumps to .Lsqrx4x_sub_entry, which is defined outside this routine,
 *   with rax = 0 or all-ones depending on the final carry/compare below.
 */
2343.type	mulx4x_internal,@function
2344.align	32
2345mulx4x_internal:
2346.cfi_startproc
2347	movq	%r9,8(%rsp)
2348	movq	%r9,%r10
2349	negq	%r9	/* r9 = +bytes */
2350	shlq	$5,%r9	/* bytes * 32 = total table size in bytes */
2351	negq	%r10	/* r10 = +bytes */
2352	leaq	128(%rdx,%r9,1),%r13	/* table end marker (same +128 bias as rdi below) */
2353	shrq	$5+5,%r9	/* (bytes*32) >> 10 = bytes / 32 = groups of four words */
2354	movd	8(%rax),%xmm5	/* 32-bit gather index */
2355	subq	$1,%r9	/* first group is handled outside the inner loops */
2356	leaq	.Linc(%rip),%rax
2357	movq	%r13,16+8(%rsp)
2358	movq	%r9,24+8(%rsp)
2359	movq	%rdi,56+8(%rsp)
2360	movdqa	0(%rax),%xmm0	/* counter seed from .Linc (defined elsewhere in the file) */
2361	movdqa	16(%rax),%xmm1	/* counter increment from .Linc */
2362	leaq	88-112(%rsp,%r10,1),%r10	/* mask area = 112(%r10) = rsp + bytes + 88 */
2363	leaq	128(%rdx),%rdi	/* biased table pointer: rows span -128..112(%rdi) */
2364
/*
 * Build sixteen 128-bit select masks at 112(%r10)..352(%r10): running
 * counters are compared (pcmpeqd) with the index broadcast to all four
 * lanes, so the lanes whose counter equals the index become all-ones.
 * The last four masks also stay live in xmm0..xmm3 for the first gather.
 */
2365	pshufd	$0,%xmm5,%xmm5	/* broadcast the index */
2366	movdqa	%xmm1,%xmm4
2367.byte	0x67
2368	movdqa	%xmm1,%xmm2
2369.byte	0x67
2370	paddd	%xmm0,%xmm1
2371	pcmpeqd	%xmm5,%xmm0
2372	movdqa	%xmm4,%xmm3
2373	paddd	%xmm1,%xmm2
2374	pcmpeqd	%xmm5,%xmm1
2375	movdqa	%xmm0,112(%r10)
2376	movdqa	%xmm4,%xmm0
2377
2378	paddd	%xmm2,%xmm3
2379	pcmpeqd	%xmm5,%xmm2
2380	movdqa	%xmm1,128(%r10)
2381	movdqa	%xmm4,%xmm1
2382
2383	paddd	%xmm3,%xmm0
2384	pcmpeqd	%xmm5,%xmm3
2385	movdqa	%xmm2,144(%r10)
2386	movdqa	%xmm4,%xmm2
2387
2388	paddd	%xmm0,%xmm1
2389	pcmpeqd	%xmm5,%xmm0
2390	movdqa	%xmm3,160(%r10)
2391	movdqa	%xmm4,%xmm3
2392	paddd	%xmm1,%xmm2
2393	pcmpeqd	%xmm5,%xmm1
2394	movdqa	%xmm0,176(%r10)
2395	movdqa	%xmm4,%xmm0
2396
2397	paddd	%xmm2,%xmm3
2398	pcmpeqd	%xmm5,%xmm2
2399	movdqa	%xmm1,192(%r10)
2400	movdqa	%xmm4,%xmm1
2401
2402	paddd	%xmm3,%xmm0
2403	pcmpeqd	%xmm5,%xmm3
2404	movdqa	%xmm2,208(%r10)
2405	movdqa	%xmm4,%xmm2
2406
2407	paddd	%xmm0,%xmm1
2408	pcmpeqd	%xmm5,%xmm0
2409	movdqa	%xmm3,224(%r10)
2410	movdqa	%xmm4,%xmm3
2411	paddd	%xmm1,%xmm2
2412	pcmpeqd	%xmm5,%xmm1
2413	movdqa	%xmm0,240(%r10)
2414	movdqa	%xmm4,%xmm0
2415
2416	paddd	%xmm2,%xmm3
2417	pcmpeqd	%xmm5,%xmm2
2418	movdqa	%xmm1,256(%r10)
2419	movdqa	%xmm4,%xmm1
2420
2421	paddd	%xmm3,%xmm0
2422	pcmpeqd	%xmm5,%xmm3
2423	movdqa	%xmm2,272(%r10)
2424	movdqa	%xmm4,%xmm2
2425
2426	paddd	%xmm0,%xmm1
2427	pcmpeqd	%xmm5,%xmm0
2428	movdqa	%xmm3,288(%r10)
2429	movdqa	%xmm4,%xmm3
2430.byte	0x67
2431	paddd	%xmm1,%xmm2
2432	pcmpeqd	%xmm5,%xmm1
2433	movdqa	%xmm0,304(%r10)
2434
2435	paddd	%xmm2,%xmm3
2436	pcmpeqd	%xmm5,%xmm2
2437	movdqa	%xmm1,320(%r10)
2438
2439	pcmpeqd	%xmm5,%xmm3
2440	movdqa	%xmm2,336(%r10)
2441
/*
 * First gather: all sixteen 16-byte vectors of the 256-byte row at
 * -128..112(%rdi) are loaded and ANDed with their masks, then merged, so the
 * memory access pattern does not depend on the index.
 */
2442	pand	64(%rdi),%xmm0
2443	pand	80(%rdi),%xmm1
2444	pand	96(%rdi),%xmm2
2445	movdqa	%xmm3,352(%r10)
2446	pand	112(%rdi),%xmm3
2447	por	%xmm2,%xmm0
2448	por	%xmm3,%xmm1
2449	movdqa	-128(%rdi),%xmm4
2450	movdqa	-112(%rdi),%xmm5
2451	movdqa	-96(%rdi),%xmm2
2452	pand	112(%r10),%xmm4
2453	movdqa	-80(%rdi),%xmm3
2454	pand	128(%r10),%xmm5
2455	por	%xmm4,%xmm0
2456	pand	144(%r10),%xmm2
2457	por	%xmm5,%xmm1
2458	pand	160(%r10),%xmm3
2459	por	%xmm2,%xmm0
2460	por	%xmm3,%xmm1
2461	movdqa	-64(%rdi),%xmm4
2462	movdqa	-48(%rdi),%xmm5
2463	movdqa	-32(%rdi),%xmm2
2464	pand	176(%r10),%xmm4
2465	movdqa	-16(%rdi),%xmm3
2466	pand	192(%r10),%xmm5
2467	por	%xmm4,%xmm0
2468	pand	208(%r10),%xmm2
2469	por	%xmm5,%xmm1
2470	pand	224(%r10),%xmm3
2471	por	%xmm2,%xmm0
2472	por	%xmm3,%xmm1
2473	movdqa	0(%rdi),%xmm4
2474	movdqa	16(%rdi),%xmm5
2475	movdqa	32(%rdi),%xmm2
2476	pand	240(%r10),%xmm4
2477	movdqa	48(%rdi),%xmm3
2478	pand	256(%r10),%xmm5
2479	por	%xmm4,%xmm0
2480	pand	272(%r10),%xmm2
2481	por	%xmm5,%xmm1
2482	pand	288(%r10),%xmm3
2483	por	%xmm2,%xmm0
2484	por	%xmm3,%xmm1
2485	pxor	%xmm1,%xmm0	/* merge the two accumulators (NOTE(review): equals OR if at most one mask lane is set) */
2486	pshufd	$0x4e,%xmm0,%xmm1	/* swap the 64-bit halves */
2487	por	%xmm1,%xmm0	/* fold both halves into the low qword */
2488	leaq	256(%rdi),%rdi	/* next table row */
2489.byte	102,72,15,126,194	/* encodes: movq %xmm0,%rdx -- gathered word becomes the MULX multiplier */
2490	leaq	64+32+8(%rsp),%rbx	/* rbx = result pointer, 32 bytes past the first stored word */
2491
/*
 * First pass, first four words: products of the gathered word with
 * 0..24(%rsi), then the reduction multiplier r8 = low product word times
 * the word at 32+8(%rsp), then its products with 0..24(%rcx).
 */
2492	movq	%rdx,%r9	/* keep the gathered word; rdx alternates between the two multipliers */
2493	mulxq	0(%rsi),%r8,%rax
2494	mulxq	8(%rsi),%r11,%r12
2495	addq	%rax,%r11
2496	mulxq	16(%rsi),%rax,%r13
2497	adcq	%rax,%r12
2498	adcq	$0,%r13
2499	mulxq	24(%rsi),%rax,%r14
2500
2501	movq	%r8,%r15
2502	imulq	32+8(%rsp),%r8
2503	xorq	%rbp,%rbp	/* rbp = 0 and CF = OF = 0: starts both the adcx and adox chains */
2504	movq	%r8,%rdx
2505
2506	movq	%rdi,8+8(%rsp)	/* park the table pointer; rdi becomes the loop counter */
2507
2508	leaq	32(%rsi),%rsi
2509	adcxq	%rax,%r13
2510	adcxq	%rbp,%r14
2511
2512	mulxq	0(%rcx),%rax,%r10
2513	adcxq	%rax,%r15	/* low word sum is discarded; only its carry is kept */
2514	adoxq	%r11,%r10
2515	mulxq	8(%rcx),%rax,%r11
2516	adcxq	%rax,%r10
2517	adoxq	%r12,%r11
2518	mulxq	16(%rcx),%rax,%r12
2519	movq	24+8(%rsp),%rdi	/* inner-loop count */
2520	movq	%r10,-32(%rbx)
2521	adcxq	%rax,%r11
2522	adoxq	%r13,%r12
2523	mulxq	24(%rcx),%rax,%r15
2524	movq	%r9,%rdx
2525	movq	%r11,-24(%rbx)
2526	adcxq	%rax,%r12
2527	adoxq	%rbp,%r15
2528	leaq	32(%rcx),%rcx
2529	movq	%r12,-16(%rbx)
2530	jmp	.Lmulx4x_1st
2531
/*
 * First-pass loop: four more words per iteration.  The lea/mov/dec
 * instructions inside do not modify CF, so the adcx carry survives from one
 * iteration to the next.
 */
2532.align	32
2533.Lmulx4x_1st:
2534	adcxq	%rbp,%r15
2535	mulxq	0(%rsi),%r10,%rax
2536	adcxq	%r14,%r10
2537	mulxq	8(%rsi),%r11,%r14
2538	adcxq	%rax,%r11
2539	mulxq	16(%rsi),%r12,%rax
2540	adcxq	%r14,%r12
2541	mulxq	24(%rsi),%r13,%r14
2542.byte	0x67,0x67
2543	movq	%r8,%rdx	/* switch to the reduction multiplier */
2544	adcxq	%rax,%r13
2545	adcxq	%rbp,%r14
2546	leaq	32(%rsi),%rsi
2547	leaq	32(%rbx),%rbx
2548
2549	adoxq	%r15,%r10
2550	mulxq	0(%rcx),%rax,%r15
2551	adcxq	%rax,%r10
2552	adoxq	%r15,%r11
2553	mulxq	8(%rcx),%rax,%r15
2554	adcxq	%rax,%r11
2555	adoxq	%r15,%r12
2556	mulxq	16(%rcx),%rax,%r15
2557	movq	%r10,-40(%rbx)
2558	adcxq	%rax,%r12
2559	movq	%r11,-32(%rbx)
2560	adoxq	%r15,%r13
2561	mulxq	24(%rcx),%rax,%r15
2562	movq	%r9,%rdx	/* back to the gathered word */
2563	movq	%r12,-24(%rbx)
2564	adcxq	%rax,%r13
2565	adoxq	%rbp,%r15
2566	leaq	32(%rcx),%rcx
2567	movq	%r13,-16(%rbx)
2568
2569	decq	%rdi
2570	jnz	.Lmulx4x_1st
2571
2572	movq	8(%rsp),%rax	/* rax = the negative byte length saved at entry */
2573	adcq	%rbp,%r15
2574	leaq	(%rsi,%rax,1),%rsi	/* rewind rsi to its first word */
2575	addq	%r15,%r14
2576	movq	8+8(%rsp),%rdi	/* table pointer for the next row */
2577	adcq	%rbp,%rbp	/* rbp = carry out of the top word */
2578	movq	%r14,-8(%rbx)
2579	jmp	.Lmulx4x_outer
2580
/*
 * Outer loop: one pass per remaining 256-byte table row.  Gather the next
 * multiplier with the stored masks (read relative to rbx), then accumulate
 * its products and the reduction products onto the previous result.
 */
2581.align	32
2582.Lmulx4x_outer:
2583	leaq	16-256(%rbx),%r10	/* masks are read at 256..496(%r10) */
2584	pxor	%xmm4,%xmm4
2585.byte	0x67,0x67
2586	pxor	%xmm5,%xmm5
2587	movdqa	-128(%rdi),%xmm0
2588	movdqa	-112(%rdi),%xmm1
2589	movdqa	-96(%rdi),%xmm2
2590	pand	256(%r10),%xmm0
2591	movdqa	-80(%rdi),%xmm3
2592	pand	272(%r10),%xmm1
2593	por	%xmm0,%xmm4
2594	pand	288(%r10),%xmm2
2595	por	%xmm1,%xmm5
2596	pand	304(%r10),%xmm3
2597	por	%xmm2,%xmm4
2598	por	%xmm3,%xmm5
2599	movdqa	-64(%rdi),%xmm0
2600	movdqa	-48(%rdi),%xmm1
2601	movdqa	-32(%rdi),%xmm2
2602	pand	320(%r10),%xmm0
2603	movdqa	-16(%rdi),%xmm3
2604	pand	336(%r10),%xmm1
2605	por	%xmm0,%xmm4
2606	pand	352(%r10),%xmm2
2607	por	%xmm1,%xmm5
2608	pand	368(%r10),%xmm3
2609	por	%xmm2,%xmm4
2610	por	%xmm3,%xmm5
2611	movdqa	0(%rdi),%xmm0
2612	movdqa	16(%rdi),%xmm1
2613	movdqa	32(%rdi),%xmm2
2614	pand	384(%r10),%xmm0
2615	movdqa	48(%rdi),%xmm3
2616	pand	400(%r10),%xmm1
2617	por	%xmm0,%xmm4
2618	pand	416(%r10),%xmm2
2619	por	%xmm1,%xmm5
2620	pand	432(%r10),%xmm3
2621	por	%xmm2,%xmm4
2622	por	%xmm3,%xmm5
2623	movdqa	64(%rdi),%xmm0
2624	movdqa	80(%rdi),%xmm1
2625	movdqa	96(%rdi),%xmm2
2626	pand	448(%r10),%xmm0
2627	movdqa	112(%rdi),%xmm3
2628	pand	464(%r10),%xmm1
2629	por	%xmm0,%xmm4
2630	pand	480(%r10),%xmm2
2631	por	%xmm1,%xmm5
2632	pand	496(%r10),%xmm3
2633	por	%xmm2,%xmm4
2634	por	%xmm3,%xmm5
2635	por	%xmm5,%xmm4
2636	pshufd	$0x4e,%xmm4,%xmm0	/* swap the 64-bit halves */
2637	por	%xmm4,%xmm0	/* fold both halves into the low qword */
2638	leaq	256(%rdi),%rdi	/* next table row */
2639.byte	102,72,15,126,194	/* encodes: movq %xmm0,%rdx */
2640
2641	movq	%rbp,(%rbx)	/* store the carry word of the previous pass */
2642	leaq	32(%rbx,%rax,1),%rbx	/* rewind rbx (rax = negative byte length) */
2643	mulxq	0(%rsi),%r8,%r11
2644	xorq	%rbp,%rbp	/* rbp = 0, CF = OF = 0 */
2645	movq	%rdx,%r9
2646	mulxq	8(%rsi),%r14,%r12
2647	adoxq	-32(%rbx),%r8
2648	adcxq	%r14,%r11
2649	mulxq	16(%rsi),%r15,%r13
2650	adoxq	-24(%rbx),%r11
2651	adcxq	%r15,%r12
2652	mulxq	24(%rsi),%rdx,%r14
2653	adoxq	-16(%rbx),%r12
2654	adcxq	%rdx,%r13
2655	leaq	(%rcx,%rax,1),%rcx	/* rewind rcx to its first word */
2656	leaq	32(%rsi),%rsi
2657	adoxq	-8(%rbx),%r13
2658	adcxq	%rbp,%r14
2659	adoxq	%rbp,%r14
2660
2661	movq	%r8,%r15
2662	imulq	32+8(%rsp),%r8	/* reduction multiplier for this pass */
2663
2664	movq	%r8,%rdx
2665	xorq	%rbp,%rbp	/* restart both carry chains */
2666	movq	%rdi,8+8(%rsp)
2667
2668	mulxq	0(%rcx),%rax,%r10
2669	adcxq	%rax,%r15	/* low word sum is discarded; only its carry is kept */
2670	adoxq	%r11,%r10
2671	mulxq	8(%rcx),%rax,%r11
2672	adcxq	%rax,%r10
2673	adoxq	%r12,%r11
2674	mulxq	16(%rcx),%rax,%r12
2675	adcxq	%rax,%r11
2676	adoxq	%r13,%r12
2677	mulxq	24(%rcx),%rax,%r15
2678	movq	%r9,%rdx
2679	movq	24+8(%rsp),%rdi	/* inner-loop count */
2680	movq	%r10,-32(%rbx)
2681	adcxq	%rax,%r12
2682	movq	%r11,-24(%rbx)
2683	adoxq	%rbp,%r15
2684	movq	%r12,-16(%rbx)
2685	leaq	32(%rcx),%rcx
2686	jmp	.Lmulx4x_inner
2687
/*
 * Inner loop: same shape as .Lmulx4x_1st, but the previous result words at
 * 0..24(%rbx) are added in through the adcx chain.
 */
2688.align	32
2689.Lmulx4x_inner:
2690	mulxq	0(%rsi),%r10,%rax
2691	adcxq	%rbp,%r15
2692	adoxq	%r14,%r10
2693	mulxq	8(%rsi),%r11,%r14
2694	adcxq	0(%rbx),%r10
2695	adoxq	%rax,%r11
2696	mulxq	16(%rsi),%r12,%rax
2697	adcxq	8(%rbx),%r11
2698	adoxq	%r14,%r12
2699	mulxq	24(%rsi),%r13,%r14
2700	movq	%r8,%rdx	/* switch to the reduction multiplier */
2701	adcxq	16(%rbx),%r12
2702	adoxq	%rax,%r13
2703	adcxq	24(%rbx),%r13
2704	adoxq	%rbp,%r14
2705	leaq	32(%rsi),%rsi
2706	leaq	32(%rbx),%rbx
2707	adcxq	%rbp,%r14
2708
2709	adoxq	%r15,%r10
2710	mulxq	0(%rcx),%rax,%r15
2711	adcxq	%rax,%r10
2712	adoxq	%r15,%r11
2713	mulxq	8(%rcx),%rax,%r15
2714	adcxq	%rax,%r11
2715	adoxq	%r15,%r12
2716	mulxq	16(%rcx),%rax,%r15
2717	movq	%r10,-40(%rbx)
2718	adcxq	%rax,%r12
2719	adoxq	%r15,%r13
2720	movq	%r11,-32(%rbx)
2721	mulxq	24(%rcx),%rax,%r15
2722	movq	%r9,%rdx	/* back to the gathered word */
2723	leaq	32(%rcx),%rcx
2724	movq	%r12,-24(%rbx)
2725	adcxq	%rax,%r13
2726	adoxq	%rbp,%r15
2727	movq	%r13,-16(%rbx)
2728
2729	decq	%rdi
2730	jnz	.Lmulx4x_inner
2731
2732	movq	0+8(%rsp),%rax	/* rax = negative byte length */
2733	adcq	%rbp,%r15
2734	subq	0(%rbx),%rdi	/* rdi is 0 here: CF = (stored carry word != 0); rdi is reloaded next */
2735	movq	8+8(%rsp),%rdi
2736	movq	16+8(%rsp),%r10	/* table end marker */
2737	adcq	%r15,%r14
2738	leaq	(%rsi,%rax,1),%rsi	/* rewind rsi */
2739	adcq	%rbp,%rbp	/* rbp = carry out of the top word */
2740	movq	%r14,-8(%rbx)
2741
2742	cmpq	%r10,%rdi
2743	jb	.Lmulx4x_outer	/* more table rows to process */
2744
/*
 * Hand-off to .Lsqrx4x_sub_entry (outside this routine): rax becomes 0 or
 * all-ones from (carry word | borrow of word_at_-8(%rcx) - r14), rcx the
 * negative group count, rbp points at the rewound rcx words, r12..r15 hold
 * its first four words (the first one decremented), rdx = saved rdi.
 * NOTE(review): presumably selects whether the final subtraction applies --
 * confirm against the definition of .Lsqrx4x_sub_entry.
 */
2745	movq	-8(%rcx),%r10
2746	movq	%rbp,%r8
2747	movq	(%rcx,%rax,1),%r12
2748	leaq	(%rcx,%rax,1),%rbp
2749	movq	%rax,%rcx
2750	leaq	(%rbx,%rax,1),%rdi
2751	xorl	%eax,%eax
2752	xorq	%r15,%r15
2753	subq	%r14,%r10
2754	adcq	%r15,%r15	/* r15 = borrow of the subtraction above */
2755	orq	%r15,%r8
2756	sarq	$3+2,%rcx	/* negative bytes / 32 */
2757	subq	%r8,%rax	/* rax = 0 - (0 or 1) */
2758	movq	56+8(%rsp),%rdx
2759	decq	%r12
2760	movq	8(%rbp),%r13
2761	xorq	%r8,%r8
2762	movq	16(%rbp),%r14
2763	movq	24(%rbp),%r15
2764	jmp	.Lsqrx4x_sub_entry
2765.cfi_endproc
2766.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5 -- five rounds of (__bn_sqrx8x_internal, __bn_postx4x_internal)
 * followed by one mulx4x_internal call, then return 1 in rax.
 *
 * The symbol has no .globl directive in this range; .Lpowerx5_enter is a
 * second entry point placed after rax has been loaded with the entry rsp.
 * NOTE(review): presumably reached from the non-MULX power routine earlier
 * in the file with rax already set -- confirm.
 *
 * Set-up done here:
 *   - callee-saved registers pushed; r9d turned into a byte count (<< 3);
 *   - r8 replaced by the word it points to, stored at 32(%rsp);
 *   - 64-byte aligned, page-probed frame of 320 + 2*bytes;
 *   - rdi, rcx, r10 (= -bytes) and rdx parked in xmm1..xmm4, xmm0 zeroed;
 *   - entry rsp stored at 40(%rsp).
 */
2767.type	bn_powerx5,@function
2768.align	32
2769bn_powerx5:
2770.cfi_startproc
2771	movq	%rsp,%rax	/* rax = entry rsp */
2772.cfi_def_cfa_register	%rax
2773.Lpowerx5_enter:
2774	pushq	%rbx
2775.cfi_offset	%rbx,-16
2776	pushq	%rbp
2777.cfi_offset	%rbp,-24
2778	pushq	%r12
2779.cfi_offset	%r12,-32
2780	pushq	%r13
2781.cfi_offset	%r13,-40
2782	pushq	%r14
2783.cfi_offset	%r14,-48
2784	pushq	%r15
2785.cfi_offset	%r15,-56
2786.Lpowerx5_prologue:
2787
2788	shll	$3,%r9d	/* words -> bytes */
2789	leaq	(%r9,%r9,2),%r10	/* r10 = 3 * byte count */
2790	negq	%r9
2791	movq	(%r8),%r8	/* load the word r8 points to */
2792
2793
2794
2795
2796
2797
2798
2799
/*
 * Pick the frame base in rbp: 320 + 2*bytes below rsp, moved further down
 * by an amount derived from (frame - rdi) mod 4096.
 * NOTE(review): presumably avoids 4 KiB aliasing with the rdi buffers --
 * confirm against x86_64-mont5.pl.
 */
2800	leaq	-320(%rsp,%r9,2),%r11
2801	movq	%rsp,%rbp
2802	subq	%rdi,%r11
2803	andq	$4095,%r11
2804	cmpq	%r11,%r10
2805	jb	.Lpwrx_sp_alt
2806	subq	%r11,%rbp
2807	leaq	-320(%rbp,%r9,2),%rbp
2808	jmp	.Lpwrx_sp_done
2809
2810.align	32
2811.Lpwrx_sp_alt:
2812	leaq	4096-320(,%r9,2),%r10
2813	leaq	-320(%rbp,%r9,2),%rbp
2814	subq	%r10,%r11
2815	movq	$0,%r10	/* mov (not xor) keeps the borrow from subq for cmovc */
2816	cmovcq	%r10,%r11	/* clamp the adjustment to 0 on borrow */
2817	subq	%r11,%rbp
2818.Lpwrx_sp_done:
2819	andq	$-64,%rbp	/* 64-byte align the frame */
/* Step rsp down to rbp one 4096-byte page at a time, touching each page. */
2820	movq	%rsp,%r11
2821	subq	%rbp,%r11
2822	andq	$-4096,%r11
2823	leaq	(%r11,%rbp,1),%rsp
2824	movq	(%rsp),%r10
2825	cmpq	%rbp,%rsp
2826	ja	.Lpwrx_page_walk
2827	jmp	.Lpwrx_page_walk_done
2828
2829.Lpwrx_page_walk:
2830	leaq	-4096(%rsp),%rsp
2831	movq	(%rsp),%r10
2832	cmpq	%rbp,%rsp
2833	ja	.Lpwrx_page_walk
2834.Lpwrx_page_walk_done:
2835
2836	movq	%r9,%r10	/* r10 = -bytes */
2837	negq	%r9	/* r9  = +bytes */
2838
2839
2840
2841
2842
2843
2844
2845
2846
2847
2848
2849
2850	pxor	%xmm0,%xmm0
2851.byte	102,72,15,110,207	/* encodes: movq %rdi,%xmm1 */
2852.byte	102,72,15,110,209	/* encodes: movq %rcx,%xmm2 */
2853.byte	102,73,15,110,218	/* encodes: movq %r10,%xmm3 */
2854.byte	102,72,15,110,226	/* encodes: movq %rdx,%xmm4 */
2855	movq	%r8,32(%rsp)	/* read back by mulx4x_internal as 32+8(%rsp) */
2856	movq	%rax,40(%rsp)	/* entry rsp, reloaded below and in the epilogue */
/* CFA expression: DW_OP_breg7 +40, DW_OP_deref, DW_OP_plus_uconst 8, i.e. CFA = *(rsp+40) + 8 */
2857.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2858.Lpowerx5_body:
2859
/* Five identical square + post-process rounds. */
2860	call	__bn_sqrx8x_internal
2861	call	__bn_postx4x_internal
2862	call	__bn_sqrx8x_internal
2863	call	__bn_postx4x_internal
2864	call	__bn_sqrx8x_internal
2865	call	__bn_postx4x_internal
2866	call	__bn_sqrx8x_internal
2867	call	__bn_postx4x_internal
2868	call	__bn_sqrx8x_internal
2869	call	__bn_postx4x_internal
2870
/*
 * Re-establish the mulx4x_internal inputs: r9 from r10 and rdi from rsi as
 * left by the last __bn_postx4x_internal call (not visible in this range --
 * NOTE(review): confirm r10 is the negative byte length there), rcx and rdx
 * from the copies parked in xmm2 / xmm4, rax = entry rsp.
 */
2871	movq	%r10,%r9
2872	movq	%rsi,%rdi
2873.byte	102,72,15,126,209	/* encodes: movq %xmm2,%rcx */
2874.byte	102,72,15,126,226	/* encodes: movq %xmm4,%rdx */
2875	movq	40(%rsp),%rax
2876
2877	call	mulx4x_internal
2878
2879	movq	40(%rsp),%rsi	/* rsi = entry rsp */
2880.cfi_def_cfa	%rsi,8
2881	movq	$1,%rax	/* return value 1 */
2882
2883	movq	-48(%rsi),%r15
2884.cfi_restore	%r15
2885	movq	-40(%rsi),%r14
2886.cfi_restore	%r14
2887	movq	-32(%rsi),%r13
2888.cfi_restore	%r13
2889	movq	-24(%rsi),%r12
2890.cfi_restore	%r12
2891	movq	-16(%rsi),%rbp
2892.cfi_restore	%rbp
2893	movq	-8(%rsi),%rbx
2894.cfi_restore	%rbx
2895	leaq	(%rsi),%rsp
2896.cfi_def_cfa_register	%rsp
2897.Lpowerx5_epilogue:
2898	.byte	0xf3,0xc3	/* encodes: rep ret */
2899.cfi_endproc
2900.size	bn_powerx5,.-bn_powerx5
2901
2902.globl	bn_sqrx8x_internal
2903.hidden	bn_sqrx8x_internal
2904.type	bn_sqrx8x_internal,@function
2905.align	32
2906bn_sqrx8x_internal:
2907__bn_sqrx8x_internal:
2908.cfi_startproc
2909
2910
2911
2912
2913
2914
2915
2916
2917
2918
2919
2920
2921
2922
2923
2924
2925
2926
2927
2928
2929
2930
2931
2932
2933
2934
2935
2936
2937
2938
2939
2940
2941
2942
2943
2944
2945
2946
2947
2948
2949	leaq	48+8(%rsp),%rdi
2950	leaq	(%rsi,%r9,1),%rbp
2951	movq	%r9,0+8(%rsp)
2952	movq	%rbp,8+8(%rsp)
2953	jmp	.Lsqr8x_zero_start
2954
2955.align	32
2956.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2957.Lsqrx8x_zero:
2958.byte	0x3e
2959	movdqa	%xmm0,0(%rdi)
2960	movdqa	%xmm0,16(%rdi)
2961	movdqa	%xmm0,32(%rdi)
2962	movdqa	%xmm0,48(%rdi)
2963.Lsqr8x_zero_start:
2964	movdqa	%xmm0,64(%rdi)
2965	movdqa	%xmm0,80(%rdi)
2966	movdqa	%xmm0,96(%rdi)
2967	movdqa	%xmm0,112(%rdi)
2968	leaq	128(%rdi),%rdi
2969	subq	$64,%r9
2970	jnz	.Lsqrx8x_zero
2971
2972	movq	0(%rsi),%rdx
2973
2974	xorq	%r10,%r10
2975	xorq	%r11,%r11
2976	xorq	%r12,%r12
2977	xorq	%r13,%r13
2978	xorq	%r14,%r14
2979	xorq	%r15,%r15
2980	leaq	48+8(%rsp),%rdi
2981	xorq	%rbp,%rbp
2982	jmp	.Lsqrx8x_outer_loop
2983
2984.align	32
2985.Lsqrx8x_outer_loop:
2986	mulxq	8(%rsi),%r8,%rax
2987	adcxq	%r9,%r8
2988	adoxq	%rax,%r10
2989	mulxq	16(%rsi),%r9,%rax
2990	adcxq	%r10,%r9
2991	adoxq	%rax,%r11
2992.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2993	adcxq	%r11,%r10
2994	adoxq	%rax,%r12
2995.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2996	adcxq	%r12,%r11
2997	adoxq	%rax,%r13
2998	mulxq	40(%rsi),%r12,%rax
2999	adcxq	%r13,%r12
3000	adoxq	%rax,%r14
3001	mulxq	48(%rsi),%r13,%rax
3002	adcxq	%r14,%r13
3003	adoxq	%r15,%rax
3004	mulxq	56(%rsi),%r14,%r15
3005	movq	8(%rsi),%rdx
3006	adcxq	%rax,%r14
3007	adoxq	%rbp,%r15
3008	adcq	64(%rdi),%r15
3009	movq	%r8,8(%rdi)
3010	movq	%r9,16(%rdi)
3011	sbbq	%rcx,%rcx
3012	xorq	%rbp,%rbp
3013
3014
3015	mulxq	16(%rsi),%r8,%rbx
3016	mulxq	24(%rsi),%r9,%rax
3017	adcxq	%r10,%r8
3018	adoxq	%rbx,%r9
3019	mulxq	32(%rsi),%r10,%rbx
3020	adcxq	%r11,%r9
3021	adoxq	%rax,%r10
3022.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
3023	adcxq	%r12,%r10
3024	adoxq	%rbx,%r11
3025.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
3026	adcxq	%r13,%r11
3027	adoxq	%r14,%r12
3028.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
3029	movq	16(%rsi),%rdx
3030	adcxq	%rax,%r12
3031	adoxq	%rbx,%r13
3032	adcxq	%r15,%r13
3033	adoxq	%rbp,%r14
3034	adcxq	%rbp,%r14
3035
3036	movq	%r8,24(%rdi)
3037	movq	%r9,32(%rdi)
3038
3039	mulxq	24(%rsi),%r8,%rbx
3040	mulxq	32(%rsi),%r9,%rax
3041	adcxq	%r10,%r8
3042	adoxq	%rbx,%r9
3043	mulxq	40(%rsi),%r10,%rbx
3044	adcxq	%r11,%r9
3045	adoxq	%rax,%r10
3046.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
3047	adcxq	%r12,%r10
3048	adoxq	%r13,%r11
3049.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
3050.byte	0x3e
3051	movq	24(%rsi),%rdx
3052	adcxq	%rbx,%r11
3053	adoxq	%rax,%r12
3054	adcxq	%r14,%r12
3055	movq	%r8,40(%rdi)
3056	movq	%r9,48(%rdi)
3057	mulxq	32(%rsi),%r8,%rax
3058	adoxq	%rbp,%r13
3059	adcxq	%rbp,%r13
3060
3061	mulxq	40(%rsi),%r9,%rbx
3062	adcxq	%r10,%r8
3063	adoxq	%rax,%r9
3064	mulxq	48(%rsi),%r10,%rax
3065	adcxq	%r11,%r9
3066	adoxq	%r12,%r10
3067	mulxq	56(%rsi),%r11,%r12
3068	movq	32(%rsi),%rdx
3069	movq	40(%rsi),%r14
3070	adcxq	%rbx,%r10
3071	adoxq	%rax,%r11
3072	movq	48(%rsi),%r15
3073	adcxq	%r13,%r11
3074	adoxq	%rbp,%r12
3075	adcxq	%rbp,%r12
3076
3077	movq	%r8,56(%rdi)
3078	movq	%r9,64(%rdi)
3079
3080	mulxq	%r14,%r9,%rax
3081	movq	56(%rsi),%r8
3082	adcxq	%r10,%r9
3083	mulxq	%r15,%r10,%rbx
3084	adoxq	%rax,%r10
3085	adcxq	%r11,%r10
3086	mulxq	%r8,%r11,%rax
3087	movq	%r14,%rdx
3088	adoxq	%rbx,%r11
3089	adcxq	%r12,%r11
3090
3091	adcxq	%rbp,%rax
3092
3093	mulxq	%r15,%r14,%rbx
3094	mulxq	%r8,%r12,%r13
3095	movq	%r15,%rdx
3096	leaq	64(%rsi),%rsi
3097	adcxq	%r14,%r11
3098	adoxq	%rbx,%r12
3099	adcxq	%rax,%r12
3100	adoxq	%rbp,%r13
3101
3102.byte	0x67,0x67
3103	mulxq	%r8,%r8,%r14
3104	adcxq	%r8,%r13
3105	adcxq	%rbp,%r14
3106
3107	cmpq	8+8(%rsp),%rsi
3108	je	.Lsqrx8x_outer_break
3109
3110	negq	%rcx
3111	movq	$-8,%rcx
3112	movq	%rbp,%r15
3113	movq	64(%rdi),%r8
3114	adcxq	72(%rdi),%r9
3115	adcxq	80(%rdi),%r10
3116	adcxq	88(%rdi),%r11
3117	adcq	96(%rdi),%r12
3118	adcq	104(%rdi),%r13
3119	adcq	112(%rdi),%r14
3120	adcq	120(%rdi),%r15
3121	leaq	(%rsi),%rbp
3122	leaq	128(%rdi),%rdi
3123	sbbq	%rax,%rax
3124
3125	movq	-64(%rsi),%rdx
3126	movq	%rax,16+8(%rsp)
3127	movq	%rdi,24+8(%rsp)
3128
3129
3130	xorl	%eax,%eax
3131	jmp	.Lsqrx8x_loop
3132
3133.align	32
3134.Lsqrx8x_loop:
3135	movq	%r8,%rbx
3136	mulxq	0(%rbp),%rax,%r8
3137	adcxq	%rax,%rbx
3138	adoxq	%r9,%r8
3139
3140	mulxq	8(%rbp),%rax,%r9
3141	adcxq	%rax,%r8
3142	adoxq	%r10,%r9
3143
3144	mulxq	16(%rbp),%rax,%r10
3145	adcxq	%rax,%r9
3146	adoxq	%r11,%r10
3147
3148	mulxq	24(%rbp),%rax,%r11
3149	adcxq	%rax,%r10
3150	adoxq	%r12,%r11
3151
3152.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3153	adcxq	%rax,%r11
3154	adoxq	%r13,%r12
3155
3156	mulxq	40(%rbp),%rax,%r13
3157	adcxq	%rax,%r12
3158	adoxq	%r14,%r13
3159
3160	mulxq	48(%rbp),%rax,%r14
3161	movq	%rbx,(%rdi,%rcx,8)
3162	movl	$0,%ebx
3163	adcxq	%rax,%r13
3164	adoxq	%r15,%r14
3165
3166.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
3167	movq	8(%rsi,%rcx,8),%rdx
3168	adcxq	%rax,%r14
3169	adoxq	%rbx,%r15
3170	adcxq	%rbx,%r15
3171
3172.byte	0x67
3173	incq	%rcx
3174	jnz	.Lsqrx8x_loop
3175
3176	leaq	64(%rbp),%rbp
3177	movq	$-8,%rcx
3178	cmpq	8+8(%rsp),%rbp
3179	je	.Lsqrx8x_break
3180
3181	subq	16+8(%rsp),%rbx
3182.byte	0x66
3183	movq	-64(%rsi),%rdx
3184	adcxq	0(%rdi),%r8
3185	adcxq	8(%rdi),%r9
3186	adcq	16(%rdi),%r10
3187	adcq	24(%rdi),%r11
3188	adcq	32(%rdi),%r12
3189	adcq	40(%rdi),%r13
3190	adcq	48(%rdi),%r14
3191	adcq	56(%rdi),%r15
3192	leaq	64(%rdi),%rdi
3193.byte	0x67
3194	sbbq	%rax,%rax
3195	xorl	%ebx,%ebx
3196	movq	%rax,16+8(%rsp)
3197	jmp	.Lsqrx8x_loop
3198
3199.align	32
3200.Lsqrx8x_break:
3201	xorq	%rbp,%rbp
3202	subq	16+8(%rsp),%rbx
3203	adcxq	%rbp,%r8
3204	movq	24+8(%rsp),%rcx
3205	adcxq	%rbp,%r9
3206	movq	0(%rsi),%rdx
3207	adcq	$0,%r10
3208	movq	%r8,0(%rdi)
3209	adcq	$0,%r11
3210	adcq	$0,%r12
3211	adcq	$0,%r13
3212	adcq	$0,%r14
3213	adcq	$0,%r15
3214	cmpq	%rcx,%rdi
3215	je	.Lsqrx8x_outer_loop
3216
3217	movq	%r9,8(%rdi)
3218	movq	8(%rcx),%r9
3219	movq	%r10,16(%rdi)
3220	movq	16(%rcx),%r10
3221	movq	%r11,24(%rdi)
3222	movq	24(%rcx),%r11
3223	movq	%r12,32(%rdi)
3224	movq	32(%rcx),%r12
3225	movq	%r13,40(%rdi)
3226	movq	40(%rcx),%r13
3227	movq	%r14,48(%rdi)
3228	movq	48(%rcx),%r14
3229	movq	%r15,56(%rdi)
3230	movq	56(%rcx),%r15
3231	movq	%rcx,%rdi
3232	jmp	.Lsqrx8x_outer_loop
3233
3234.align	32
3235.Lsqrx8x_outer_break:
3236	movq	%r9,72(%rdi)
3237.byte	102,72,15,126,217
3238	movq	%r10,80(%rdi)
3239	movq	%r11,88(%rdi)
3240	movq	%r12,96(%rdi)
3241	movq	%r13,104(%rdi)
3242	movq	%r14,112(%rdi)
3243	leaq	48+8(%rsp),%rdi
3244	movq	(%rsi,%rcx,1),%rdx
3245
3246	movq	8(%rdi),%r11
3247	xorq	%r10,%r10
3248	movq	0+8(%rsp),%r9
3249	adoxq	%r11,%r11
3250	movq	16(%rdi),%r12
3251	movq	24(%rdi),%r13
3252
3253
3254.align	32
3255.Lsqrx4x_shift_n_add:
3256	mulxq	%rdx,%rax,%rbx
3257	adoxq	%r12,%r12
3258	adcxq	%r10,%rax
3259.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3260.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3261	adoxq	%r13,%r13
3262	adcxq	%r11,%rbx
3263	movq	40(%rdi),%r11
3264	movq	%rax,0(%rdi)
3265	movq	%rbx,8(%rdi)
3266
3267	mulxq	%rdx,%rax,%rbx
3268	adoxq	%r10,%r10
3269	adcxq	%r12,%rax
3270	movq	16(%rsi,%rcx,1),%rdx
3271	movq	48(%rdi),%r12
3272	adoxq	%r11,%r11
3273	adcxq	%r13,%rbx
3274	movq	56(%rdi),%r13
3275	movq	%rax,16(%rdi)
3276	movq	%rbx,24(%rdi)
3277
3278	mulxq	%rdx,%rax,%rbx
3279	adoxq	%r12,%r12
3280	adcxq	%r10,%rax
3281	movq	24(%rsi,%rcx,1),%rdx
3282	leaq	32(%rcx),%rcx
3283	movq	64(%rdi),%r10
3284	adoxq	%r13,%r13
3285	adcxq	%r11,%rbx
3286	movq	72(%rdi),%r11
3287	movq	%rax,32(%rdi)
3288	movq	%rbx,40(%rdi)
3289
3290	mulxq	%rdx,%rax,%rbx
3291	adoxq	%r10,%r10
3292	adcxq	%r12,%rax
3293	jrcxz	.Lsqrx4x_shift_n_add_break
3294.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3295	adoxq	%r11,%r11
3296	adcxq	%r13,%rbx
3297	movq	80(%rdi),%r12
3298	movq	88(%rdi),%r13
3299	movq	%rax,48(%rdi)
3300	movq	%rbx,56(%rdi)
3301	leaq	64(%rdi),%rdi
3302	nop
3303	jmp	.Lsqrx4x_shift_n_add
3304
3305.align	32
3306.Lsqrx4x_shift_n_add_break:
3307	adcxq	%r13,%rbx
3308	movq	%rax,48(%rdi)
3309	movq	%rbx,56(%rdi)
3310	leaq	64(%rdi),%rdi
3311.byte	102,72,15,126,213
3312__bn_sqrx8x_reduction:
3313	xorl	%eax,%eax
3314	movq	32+8(%rsp),%rbx
3315	movq	48+8(%rsp),%rdx
3316	leaq	-64(%rbp,%r9,1),%rcx
3317
3318	movq	%rcx,0+8(%rsp)
3319	movq	%rdi,8+8(%rsp)
3320
3321	leaq	48+8(%rsp),%rdi
3322	jmp	.Lsqrx8x_reduction_loop
3323
3324.align	32
3325.Lsqrx8x_reduction_loop:
3326	movq	8(%rdi),%r9
3327	movq	16(%rdi),%r10
3328	movq	24(%rdi),%r11
3329	movq	32(%rdi),%r12
3330	movq	%rdx,%r8
3331	imulq	%rbx,%rdx
3332	movq	40(%rdi),%r13
3333	movq	48(%rdi),%r14
3334	movq	56(%rdi),%r15
3335	movq	%rax,24+8(%rsp)
3336
3337	leaq	64(%rdi),%rdi
3338	xorq	%rsi,%rsi
3339	movq	$-8,%rcx
3340	jmp	.Lsqrx8x_reduce
3341
3342.align	32
3343.Lsqrx8x_reduce:
3344	movq	%r8,%rbx
3345	mulxq	0(%rbp),%rax,%r8
3346	adcxq	%rbx,%rax
3347	adoxq	%r9,%r8
3348
3349	mulxq	8(%rbp),%rbx,%r9
3350	adcxq	%rbx,%r8
3351	adoxq	%r10,%r9
3352
3353	mulxq	16(%rbp),%rbx,%r10
3354	adcxq	%rbx,%r9
3355	adoxq	%r11,%r10
3356
3357	mulxq	24(%rbp),%rbx,%r11
3358	adcxq	%rbx,%r10
3359	adoxq	%r12,%r11
3360
3361.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3362	movq	%rdx,%rax
3363	movq	%r8,%rdx
3364	adcxq	%rbx,%r11
3365	adoxq	%r13,%r12
3366
3367	mulxq	32+8(%rsp),%rbx,%rdx
3368	movq	%rax,%rdx
3369	movq	%rax,64+48+8(%rsp,%rcx,8)
3370
3371	mulxq	40(%rbp),%rax,%r13
3372	adcxq	%rax,%r12
3373	adoxq	%r14,%r13
3374
3375	mulxq	48(%rbp),%rax,%r14
3376	adcxq	%rax,%r13
3377	adoxq	%r15,%r14
3378
3379	mulxq	56(%rbp),%rax,%r15
3380	movq	%rbx,%rdx
3381	adcxq	%rax,%r14
3382	adoxq	%rsi,%r15
3383	adcxq	%rsi,%r15
3384
3385.byte	0x67,0x67,0x67
3386	incq	%rcx
3387	jnz	.Lsqrx8x_reduce
3388
3389	movq	%rsi,%rax
3390	cmpq	0+8(%rsp),%rbp
3391	jae	.Lsqrx8x_no_tail
3392
3393	movq	48+8(%rsp),%rdx
3394	addq	0(%rdi),%r8
3395	leaq	64(%rbp),%rbp
3396	movq	$-8,%rcx
3397	adcxq	8(%rdi),%r9
3398	adcxq	16(%rdi),%r10
3399	adcq	24(%rdi),%r11
3400	adcq	32(%rdi),%r12
3401	adcq	40(%rdi),%r13
3402	adcq	48(%rdi),%r14
3403	adcq	56(%rdi),%r15
3404	leaq	64(%rdi),%rdi
3405	sbbq	%rax,%rax
3406
3407	xorq	%rsi,%rsi
3408	movq	%rax,16+8(%rsp)
3409	jmp	.Lsqrx8x_tail
3410
3411.align	32
3412.Lsqrx8x_tail:
3413	movq	%r8,%rbx
3414	mulxq	0(%rbp),%rax,%r8
3415	adcxq	%rax,%rbx
3416	adoxq	%r9,%r8
3417
3418	mulxq	8(%rbp),%rax,%r9
3419	adcxq	%rax,%r8
3420	adoxq	%r10,%r9
3421
3422	mulxq	16(%rbp),%rax,%r10
3423	adcxq	%rax,%r9
3424	adoxq	%r11,%r10
3425
3426	mulxq	24(%rbp),%rax,%r11
3427	adcxq	%rax,%r10
3428	adoxq	%r12,%r11
3429
3430.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3431	adcxq	%rax,%r11
3432	adoxq	%r13,%r12
3433
3434	mulxq	40(%rbp),%rax,%r13
3435	adcxq	%rax,%r12
3436	adoxq	%r14,%r13
3437
3438	mulxq	48(%rbp),%rax,%r14
3439	adcxq	%rax,%r13
3440	adoxq	%r15,%r14
3441
3442	mulxq	56(%rbp),%rax,%r15
3443	movq	72+48+8(%rsp,%rcx,8),%rdx
3444	adcxq	%rax,%r14
3445	adoxq	%rsi,%r15
3446	movq	%rbx,(%rdi,%rcx,8)
3447	movq	%r8,%rbx
3448	adcxq	%rsi,%r15
3449
3450	incq	%rcx
3451	jnz	.Lsqrx8x_tail
3452
3453	cmpq	0+8(%rsp),%rbp
3454	jae	.Lsqrx8x_tail_done
3455
3456	subq	16+8(%rsp),%rsi
3457	movq	48+8(%rsp),%rdx
3458	leaq	64(%rbp),%rbp
3459	adcq	0(%rdi),%r8
3460	adcq	8(%rdi),%r9
3461	adcq	16(%rdi),%r10
3462	adcq	24(%rdi),%r11
3463	adcq	32(%rdi),%r12
3464	adcq	40(%rdi),%r13
3465	adcq	48(%rdi),%r14
3466	adcq	56(%rdi),%r15
3467	leaq	64(%rdi),%rdi
3468	sbbq	%rax,%rax
3469	subq	$8,%rcx
3470
3471	xorq	%rsi,%rsi
3472	movq	%rax,16+8(%rsp)
3473	jmp	.Lsqrx8x_tail
3474
3475.align	32
3476.Lsqrx8x_tail_done:
3477	xorq	%rax,%rax
3478	addq	24+8(%rsp),%r8
3479	adcq	$0,%r9
3480	adcq	$0,%r10
3481	adcq	$0,%r11
3482	adcq	$0,%r12
3483	adcq	$0,%r13
3484	adcq	$0,%r14
3485	adcq	$0,%r15
3486	adcq	$0,%rax
3487
3488	subq	16+8(%rsp),%rsi
3489.Lsqrx8x_no_tail:
3490	adcq	0(%rdi),%r8
3491.byte	102,72,15,126,217
3492	adcq	8(%rdi),%r9
3493	movq	56(%rbp),%rsi
3494.byte	102,72,15,126,213
3495	adcq	16(%rdi),%r10
3496	adcq	24(%rdi),%r11
3497	adcq	32(%rdi),%r12
3498	adcq	40(%rdi),%r13
3499	adcq	48(%rdi),%r14
3500	adcq	56(%rdi),%r15
3501	adcq	$0,%rax
3502
3503	movq	32+8(%rsp),%rbx
3504	movq	64(%rdi,%rcx,1),%rdx
3505
3506	movq	%r8,0(%rdi)
3507	leaq	64(%rdi),%r8
3508	movq	%r9,8(%rdi)
3509	movq	%r10,16(%rdi)
3510	movq	%r11,24(%rdi)
3511	movq	%r12,32(%rdi)
3512	movq	%r13,40(%rdi)
3513	movq	%r14,48(%rdi)
3514	movq	%r15,56(%rdi)
3515
3516	leaq	64(%rdi,%rcx,1),%rdi
3517	cmpq	8+8(%rsp),%r8
3518	jb	.Lsqrx8x_reduction_loop
3519	.byte	0xf3,0xc3
3520.cfi_endproc
3521.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
/*
 * __bn_postx4x_internal
 *
 * Writes out[i] = t[i] + (~n[i] & mask) + carry, four 64-bit words per loop
 * pass, where n is read from (%rbp), t from (%rdi) and out is the pointer
 * taken from %xmm1.
 *
 * Inputs as used by the code below:
 *   %rax  - negated on entry and then used as the AND mask
 *           (presumably 0 or 1 from the caller, giving 0 or all-ones;
 *           TODO confirm against the callers)
 *   %rcx  - shifted right arithmetically by 5 and counted up to zero, so it
 *           is presumably a negative byte count, 32 bytes per pass
 *           (TODO confirm against the callers)
 *   %rbp  - source n, advanced by 32 bytes per pass
 *   %rdi  - source t, advanced by 32 bytes per pass
 *   %xmm1 - copied into %rdx (output pointer) and into %rsi
 *
 * On return %r10 holds the entry value of %rcx and %r9 holds its negation.
 * Overwrites %rax, %rcx, %rdx, %rsi, %rdi, %rbp, %r8-%r15 and the flags.
 */
3522.align	32
3523__bn_postx4x_internal:
3524.cfi_startproc
3525	movq	0(%rbp),%r12
3526	movq	%rcx,%r10
3527	movq	%rcx,%r9
3528	negq	%rax
3529	sarq	$3+2,%rcx
3530
/* The two .byte sequences encode movq %xmm1,%rdx and movq %xmm1,%rsi. */
3531.byte	102,72,15,126,202
3532.byte	102,72,15,126,206
/*
 * Only the very first word is decremented before being inverted below:
 * ~(w - 1) == -w, which supplies the "+1" of a two's-complement negation.
 * With an all-ones mask the loop therefore adds the negated multi-word value
 * at (%rbp), i.e. subtracts it (presumably the modulus; assumes its lowest
 * word is non-zero so the decrement does not borrow).
 */
3533	decq	%r12
3534	movq	8(%rbp),%r13
3535	xorq	%r8,%r8
3536	movq	16(%rbp),%r14
3537	movq	24(%rbp),%r15
3538	jmp	.Lsqrx4x_sub_entry
3539
3540.align	16
3541.Lsqrx4x_sub:
3542	movq	0(%rbp),%r12
3543	movq	8(%rbp),%r13
3544	movq	16(%rbp),%r14
3545	movq	24(%rbp),%r15
3546.Lsqrx4x_sub_entry:
/* andn computes (~second operand) & first operand: %r12 = ~%r12 & %rax, etc. */
3547	andnq	%rax,%r12,%r12
3548	leaq	32(%rbp),%rbp
3549	andnq	%rax,%r13,%r13
3550	andnq	%rax,%r14,%r14
3551	andnq	%rax,%r15,%r15
3552
/* %r8 is 0 or -1 from the previous pass; negq turns it back into CF. */
3553	negq	%r8
3554	adcq	0(%rdi),%r12
3555	adcq	8(%rdi),%r13
3556	adcq	16(%rdi),%r14
3557	adcq	24(%rdi),%r15
3558	movq	%r12,0(%rdx)
3559	leaq	32(%rdi),%rdi
3560	movq	%r13,8(%rdx)
/* Save the carry out of the four adcq's as 0 / -1; mov and lea leave CF intact. */
3561	sbbq	%r8,%r8
3562	movq	%r14,16(%rdx)
3563	movq	%r15,24(%rdx)
3564	leaq	32(%rdx),%rdx
3565
3566	incq	%rcx
3567	jnz	.Lsqrx4x_sub
3568
3569	negq	%r9
3570
/* 0xf3,0xc3 is the two-byte "rep ret" return encoding. */
3571	.byte	0xf3,0xc3
3572.cfi_endproc
3573.size	__bn_postx4x_internal,.-__bn_postx4x_internal
/*
 * bn_get_bits5
 *
 * Returns in %eax the 5-bit field that starts at bit offset %esi of the
 * little-endian bit string at %rdi.
 *
 *   %rdi - base address of the data
 *   %esi - bit offset of the field
 *
 * The data is addressed as 16-bit words: word index = offset >> 4, shift =
 * offset & 15.  A field starting above bit 11 would not fit inside one
 * 16-bit load (11 + 5 = 16), so in that case the load address is moved up
 * by one byte and the shift reduced by 8.
 * Overwrites %rax, %rcx, %rsi, %r10, %r11 and the flags.
 */
3574.globl	bn_get_bits5
3575.type	bn_get_bits5,@function
3576.align	16
3577bn_get_bits5:
3578.cfi_startproc
3579	leaq	0(%rdi),%r10
3580	leaq	1(%rdi),%r11
3581	movl	%esi,%ecx
3582	shrl	$4,%esi
3583	andl	$15,%ecx
3584	leal	-8(%rcx),%eax
/* Branch-free selection: if shift > 11 use base+1 and shift-8. */
3585	cmpl	$11,%ecx
3586	cmovaq	%r11,%r10
3587	cmoval	%eax,%ecx
3588	movzwl	(%r10,%rsi,2),%eax
3589	shrl	%cl,%eax
3590	andl	$31,%eax
/* 0xf3,0xc3 is the two-byte "rep ret" return encoding. */
3591	.byte	0xf3,0xc3
3592.cfi_endproc
3593.size	bn_get_bits5,.-bn_get_bits5
3594
/*
 * bn_scatter5
 *
 * Copies %esi consecutive 64-bit words from (%rdi) into a strided table:
 * word k is stored at %rdx + 8*%rcx + 256*k.
 *
 *   %rdi - source words
 *   %esi - number of words to copy; zero returns immediately
 *   %rdx - table base
 *   %rcx - slot index inside each 256-byte row (8 bytes per slot, so a row
 *          has room for 32 slots)
 *
 * Overwrites %rax, %rdx, %rsi, %rdi and the flags.
 */
3595.globl	bn_scatter5
3596.type	bn_scatter5,@function
3597.align	16
3598bn_scatter5:
3599.cfi_startproc
3600	cmpl	$0,%esi
3601	jz	.Lscatter_epilogue
/* %rdx = address of the chosen slot in the first row. */
3602	leaq	(%rdx,%rcx,8),%rdx
3603.Lscatter:
3604	movq	(%rdi),%rax
3605	leaq	8(%rdi),%rdi
3606	movq	%rax,(%rdx)
/* Step to the same slot in the next 256-byte row. */
3607	leaq	256(%rdx),%rdx
3608	subl	$1,%esi
3609	jnz	.Lscatter
3610.Lscatter_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" return encoding. */
3611	.byte	0xf3,0xc3
3612.cfi_endproc
3613.size	bn_scatter5,.-bn_scatter5
3614
/*
 * bn_gather5
 *
 * Reads slot %ecx out of each 256-byte table row and writes the selected
 * 64-bit words consecutively to (%rdi).
 *
 *   %rdi - output words
 *   %esi - number of rows / output words
 *   %rdx - table base (rows of 256 bytes, 32 slots of 8 bytes each)
 *   %ecx - slot index to select
 *
 * The selection is done with masks instead of an indexed load: sixteen
 * 16-byte masks are built on the stack, each covering two adjacent slots,
 * and every loop pass loads all 256 bytes of the row, ANDs them with the
 * masks and ORs the results together.  The addresses touched therefore do
 * not depend on %ecx.
 *
 * NOTE(review): the gather loop is bottom-tested and there is no check for
 * %esi == 0 on entry, so callers presumably always pass a non-zero count;
 * confirm against the callers.
 * The table rows are read with movdqa, so %rdx must be 16-byte aligned.
 * Overwrites %rax, %rsi, %rdi, %r10, %r11, %xmm0-%xmm5 and the flags;
 * %rsp is restored from %r10 before returning.
 */
3615.globl	bn_gather5
3616.type	bn_gather5,@function
3617.align	32
3618bn_gather5:
3619.LSEH_begin_bn_gather5:
3620.cfi_startproc
3621
/*
 * The two .byte sequences encode leaq (%rsp),%r10 (save the incoming stack
 * pointer) and subq $0x108,%rsp (reserve 264 bytes, enough for 256 bytes of
 * masks after the 16-byte alignment below).
 */
3622.byte	0x4c,0x8d,0x14,0x24
3623.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3624	leaq	.Linc(%rip),%rax
3625	andq	$-16,%rsp
3626
3627	movd	%ecx,%xmm5
/* %xmm0 = {0,0,1,1} (slot numbers of the first pair), %xmm1 = {2,2,2,2} (step). */
3628	movdqa	0(%rax),%xmm0
3629	movdqa	16(%rax),%xmm1
/* Both pointers are biased by +128 so all 16 offsets fit in -128..112. */
3630	leaq	128(%rdx),%r11
3631	leaq	128(%rsp),%rax
3632
/* Broadcast the slot index to all four dwords of %xmm5; %xmm4 keeps the step. */
3633	pshufd	$0,%xmm5,%xmm5
3634	movdqa	%xmm1,%xmm4
3635	movdqa	%xmm1,%xmm2
/*
 * Mask generation, rotating through %xmm0-%xmm3: paddd produces the slot
 * numbers of the next pair (previous pair + step), pcmpeqd turns the
 * current pair into an all-ones / all-zeros mask against the index, and
 * movdqa stores that mask.  Masks land at -128(%rax) .. 112(%rax) for slot
 * pairs (0,1), (2,3), ... (30,31).
 */
3636	paddd	%xmm0,%xmm1
3637	pcmpeqd	%xmm5,%xmm0
3638	movdqa	%xmm4,%xmm3
3639
3640	paddd	%xmm1,%xmm2
3641	pcmpeqd	%xmm5,%xmm1
3642	movdqa	%xmm0,-128(%rax)
3643	movdqa	%xmm4,%xmm0
3644
3645	paddd	%xmm2,%xmm3
3646	pcmpeqd	%xmm5,%xmm2
3647	movdqa	%xmm1,-112(%rax)
3648	movdqa	%xmm4,%xmm1
3649
3650	paddd	%xmm3,%xmm0
3651	pcmpeqd	%xmm5,%xmm3
3652	movdqa	%xmm2,-96(%rax)
3653	movdqa	%xmm4,%xmm2
3654	paddd	%xmm0,%xmm1
3655	pcmpeqd	%xmm5,%xmm0
3656	movdqa	%xmm3,-80(%rax)
3657	movdqa	%xmm4,%xmm3
3658
3659	paddd	%xmm1,%xmm2
3660	pcmpeqd	%xmm5,%xmm1
3661	movdqa	%xmm0,-64(%rax)
3662	movdqa	%xmm4,%xmm0
3663
3664	paddd	%xmm2,%xmm3
3665	pcmpeqd	%xmm5,%xmm2
3666	movdqa	%xmm1,-48(%rax)
3667	movdqa	%xmm4,%xmm1
3668
3669	paddd	%xmm3,%xmm0
3670	pcmpeqd	%xmm5,%xmm3
3671	movdqa	%xmm2,-32(%rax)
3672	movdqa	%xmm4,%xmm2
3673	paddd	%xmm0,%xmm1
3674	pcmpeqd	%xmm5,%xmm0
3675	movdqa	%xmm3,-16(%rax)
3676	movdqa	%xmm4,%xmm3
3677
3678	paddd	%xmm1,%xmm2
3679	pcmpeqd	%xmm5,%xmm1
3680	movdqa	%xmm0,0(%rax)
3681	movdqa	%xmm4,%xmm0
3682
3683	paddd	%xmm2,%xmm3
3684	pcmpeqd	%xmm5,%xmm2
3685	movdqa	%xmm1,16(%rax)
3686	movdqa	%xmm4,%xmm1
3687
3688	paddd	%xmm3,%xmm0
3689	pcmpeqd	%xmm5,%xmm3
3690	movdqa	%xmm2,32(%rax)
3691	movdqa	%xmm4,%xmm2
3692	paddd	%xmm0,%xmm1
3693	pcmpeqd	%xmm5,%xmm0
3694	movdqa	%xmm3,48(%rax)
3695	movdqa	%xmm4,%xmm3
3696
3697	paddd	%xmm1,%xmm2
3698	pcmpeqd	%xmm5,%xmm1
3699	movdqa	%xmm0,64(%rax)
3700	movdqa	%xmm4,%xmm0
3701
3702	paddd	%xmm2,%xmm3
3703	pcmpeqd	%xmm5,%xmm2
3704	movdqa	%xmm1,80(%rax)
3705	movdqa	%xmm4,%xmm1
3706
3707	paddd	%xmm3,%xmm0
3708	pcmpeqd	%xmm5,%xmm3
3709	movdqa	%xmm2,96(%rax)
3710	movdqa	%xmm4,%xmm2
3711	movdqa	%xmm3,112(%rax)
3712	jmp	.Lgather
3713
3714.align	32
/*
 * One pass per output word: %xmm4 and %xmm5 are two independent OR
 * accumulators over the sixteen masked 16-byte chunks of the current row.
 */
3715.Lgather:
3716	pxor	%xmm4,%xmm4
3717	pxor	%xmm5,%xmm5
3718	movdqa	-128(%r11),%xmm0
3719	movdqa	-112(%r11),%xmm1
3720	movdqa	-96(%r11),%xmm2
3721	pand	-128(%rax),%xmm0
3722	movdqa	-80(%r11),%xmm3
3723	pand	-112(%rax),%xmm1
3724	por	%xmm0,%xmm4
3725	pand	-96(%rax),%xmm2
3726	por	%xmm1,%xmm5
3727	pand	-80(%rax),%xmm3
3728	por	%xmm2,%xmm4
3729	por	%xmm3,%xmm5
3730	movdqa	-64(%r11),%xmm0
3731	movdqa	-48(%r11),%xmm1
3732	movdqa	-32(%r11),%xmm2
3733	pand	-64(%rax),%xmm0
3734	movdqa	-16(%r11),%xmm3
3735	pand	-48(%rax),%xmm1
3736	por	%xmm0,%xmm4
3737	pand	-32(%rax),%xmm2
3738	por	%xmm1,%xmm5
3739	pand	-16(%rax),%xmm3
3740	por	%xmm2,%xmm4
3741	por	%xmm3,%xmm5
3742	movdqa	0(%r11),%xmm0
3743	movdqa	16(%r11),%xmm1
3744	movdqa	32(%r11),%xmm2
3745	pand	0(%rax),%xmm0
3746	movdqa	48(%r11),%xmm3
3747	pand	16(%rax),%xmm1
3748	por	%xmm0,%xmm4
3749	pand	32(%rax),%xmm2
3750	por	%xmm1,%xmm5
3751	pand	48(%rax),%xmm3
3752	por	%xmm2,%xmm4
3753	por	%xmm3,%xmm5
3754	movdqa	64(%r11),%xmm0
3755	movdqa	80(%r11),%xmm1
3756	movdqa	96(%r11),%xmm2
3757	pand	64(%rax),%xmm0
3758	movdqa	112(%r11),%xmm3
3759	pand	80(%rax),%xmm1
3760	por	%xmm0,%xmm4
3761	pand	96(%rax),%xmm2
3762	por	%xmm1,%xmm5
3763	pand	112(%rax),%xmm3
3764	por	%xmm2,%xmm4
3765	por	%xmm3,%xmm5
/* Merge the two accumulators and advance to the next 256-byte row. */
3766	por	%xmm5,%xmm4
3767	leaq	256(%r11),%r11
/*
 * pshufd $0x4e swaps the two 64-bit halves; OR-ing them leaves the selected
 * word in the low half regardless of which slot of the pair matched.
 */
3768	pshufd	$0x4e,%xmm4,%xmm0
3769	por	%xmm4,%xmm0
3770	movq	%xmm0,(%rdi)
3771	leaq	8(%rdi),%rdi
3772	subl	$1,%esi
3773	jnz	.Lgather
3774
/* Restore the stack pointer saved in %r10 on entry, then "rep ret". */
3775	leaq	(%r10),%rsp
3776	.byte	0xf3,0xc3
3777.LSEH_end_bn_gather5:
3778.cfi_endproc
3779.size	bn_gather5,.-bn_gather5
/*
 * .Linc: constants for the slot-selection mask generators.
 *   first  16 bytes: dwords {0,0,1,1} - slot numbers of the first slot pair
 *   second 16 bytes: dwords {2,2,2,2} - step added to reach the next pair
 * Loaded with movdqa, hence the alignment.
 */
3780.align	64
3781.Linc:
3782.long	0,0, 1,1
3783.long	2,2, 2,2
/*
 * NUL-terminated ASCII identification string:
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro@openssl.org>"
 */
3784.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3785