xref: /freebsd/sys/crypto/openssl/amd64/x86_64-mont5.S (revision 734e82fe33aa764367791a7d603b383996c6b40b)
1/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
2.text
3
4
5
/*
 * bn_mul_mont_gather5 - exported entry point (.globl, 64-byte aligned).
 *
 * Inputs as the code below uses them (System V AMD64 passes integer
 * arguments in rdi, rsi, rdx, rcx, r8, r9 and then on the stack):
 *   rdi     - destination; result words are written to (%rdi,i,8)
 *   rsi     - first operand; words are read from (%rsi,i,8)
 *   rdx     - base of the table the second operand is gathered from,
 *             256 bytes per outer pass
 *   rcx     - words read from (%rcx,i,8); multiplied by rbp in each pass
 *             and subtracted from the result in .Lsub
 *   r8      - pointer to one 64-bit constant, loaded through (%r8)
 *   r9d     - word count (zero-extended into r9 on entry)
 *   8(%rsp) - 32-bit selector that is compared with the counters
 *             loaded from .Linc
 * Returns 1 in rax.
 * NOTE(review): presumably these are rp, ap, table, np, n0, num and power
 * of OpenSSL's Montgomery multiply-with-gather routine - confirm against
 * x86_64-mont5.pl.
 *
 * When the low three bits of the word count are all zero, control moves
 * to .Lmul4x_enter (inside bn_mul4x_mont_gather5) with
 * OPENSSL_ia32cap_P+8 already loaded into r11d; otherwise the
 * one-word-per-iteration code at .Lmul_enter runs.
 *
 * NOTE(review): the stray ".byte 0x67" lines emit an address-size prefix
 * in front of register-only instructions, where it has no architectural
 * effect; presumably code-alignment padding - confirm.
 */
6.globl	bn_mul_mont_gather5
7.type	bn_mul_mont_gather5,@function
8.align	64
9bn_mul_mont_gather5:
10.cfi_startproc
/* A 32-bit register write clears bits 63:32, so this zero-extends r9d. */
11	movl	%r9d,%r9d
/* rax = rsp at entry; it is stored into the new frame once that exists. */
12	movq	%rsp,%rax
13.cfi_def_cfa_register	%rax
/* Non-zero low three bits: count is not a multiple of 8 -> generic path. */
14	testl	$7,%r9d
15	jnz	.Lmul_enter
16	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
17	jmp	.Lmul4x_enter
18
19.align	16
20.Lmul_enter:
/* Fetch the stack-passed 32-bit selector before rsp moves. */
21	movd	8(%rsp),%xmm5
/* Save the callee-saved registers; the epilogue reloads them from
 * -8 .. -48 relative to the saved entry rsp. */
22	pushq	%rbx
23.cfi_offset	%rbx,-16
24	pushq	%rbp
25.cfi_offset	%rbp,-24
26	pushq	%r12
27.cfi_offset	%r12,-32
28	pushq	%r13
29.cfi_offset	%r13,-40
30	pushq	%r14
31.cfi_offset	%r14,-48
32	pushq	%r15
33.cfi_offset	%r15,-56
34
/* r10 = (rsp - 280 - 8*r9) rounded down to a multiple of 1024: the lowest
 * address of the new frame. r9 is negated only for the address
 * computation and restored immediately afterwards. */
35	negq	%r9
36	movq	%rsp,%r11
37	leaq	-280(%rsp,%r9,8),%r10
38	negq	%r9
39	andq	$-1024,%r10
40
41
42
43
44
45
46
47
48
/* Lower rsp to r10 while reading one word from every 4096-byte page on
 * the way: rsp first becomes r10 plus a whole number of pages, then steps
 * down one page per iteration until it is no longer above r10.
 * NOTE(review): presumably so that stack guard pages are touched in
 * order - confirm. */
49	subq	%r10,%r11
50	andq	$-4096,%r11
51	leaq	(%r10,%r11,1),%rsp
52	movq	(%rsp),%r11
53	cmpq	%r10,%rsp
54	ja	.Lmul_page_walk
55	jmp	.Lmul_page_walk_done
56
57.Lmul_page_walk:
58	leaq	-4096(%rsp),%rsp
59	movq	(%rsp),%r11
60	cmpq	%r10,%rsp
61	ja	.Lmul_page_walk
62.Lmul_page_walk_done:
63
64	leaq	.Linc(%rip),%r10
/* Park the entry rsp just above the (r9+1)-word temporary vector. */
65	movq	%rax,8(%rsp,%r9,8)
/* DWARF expression: CFA = *(rsp + 8 + r9*8) + 8, i.e. the slot just written. */
66.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
67.Lmul_body:
68
/* r12 = table + 128 so that displacements -128..112 span one 256-byte row. */
69	leaq	128(%rdx),%r12
70	movdqa	0(%r10),%xmm0
71	movdqa	16(%r10),%xmm1
/* r10 = 16-byte aligned base such that 112(%r10)..352(%r10) hold 16 masks. */
72	leaq	24-112(%rsp,%r9,8),%r10
73	andq	$-16,%r10
74
/*
 * Build 16 comparison masks. xmm5 becomes the selector broadcast to all
 * four dwords; xmm4 keeps a copy of the second .Linc vector, which is
 * used as the increment. xmm0..xmm3 rotate through successive counter
 * vectors (paddd), and pcmpeqd turns each into an all-ones/all-zeros
 * mask per dword that is then stored on the stack.
 * NOTE(review): .Linc is defined outside this view; presumably its
 * contents make each mask cover one 64-bit column of the row - confirm.
 */
75	pshufd	$0,%xmm5,%xmm5
76	movdqa	%xmm1,%xmm4
77	movdqa	%xmm1,%xmm2
78	paddd	%xmm0,%xmm1
79	pcmpeqd	%xmm5,%xmm0
80.byte	0x67
81	movdqa	%xmm4,%xmm3
82	paddd	%xmm1,%xmm2
83	pcmpeqd	%xmm5,%xmm1
84	movdqa	%xmm0,112(%r10)
85	movdqa	%xmm4,%xmm0
86
87	paddd	%xmm2,%xmm3
88	pcmpeqd	%xmm5,%xmm2
89	movdqa	%xmm1,128(%r10)
90	movdqa	%xmm4,%xmm1
91
92	paddd	%xmm3,%xmm0
93	pcmpeqd	%xmm5,%xmm3
94	movdqa	%xmm2,144(%r10)
95	movdqa	%xmm4,%xmm2
96
97	paddd	%xmm0,%xmm1
98	pcmpeqd	%xmm5,%xmm0
99	movdqa	%xmm3,160(%r10)
100	movdqa	%xmm4,%xmm3
101	paddd	%xmm1,%xmm2
102	pcmpeqd	%xmm5,%xmm1
103	movdqa	%xmm0,176(%r10)
104	movdqa	%xmm4,%xmm0
105
106	paddd	%xmm2,%xmm3
107	pcmpeqd	%xmm5,%xmm2
108	movdqa	%xmm1,192(%r10)
109	movdqa	%xmm4,%xmm1
110
111	paddd	%xmm3,%xmm0
112	pcmpeqd	%xmm5,%xmm3
113	movdqa	%xmm2,208(%r10)
114	movdqa	%xmm4,%xmm2
115
116	paddd	%xmm0,%xmm1
117	pcmpeqd	%xmm5,%xmm0
118	movdqa	%xmm3,224(%r10)
119	movdqa	%xmm4,%xmm3
120	paddd	%xmm1,%xmm2
121	pcmpeqd	%xmm5,%xmm1
122	movdqa	%xmm0,240(%r10)
123	movdqa	%xmm4,%xmm0
124
125	paddd	%xmm2,%xmm3
126	pcmpeqd	%xmm5,%xmm2
127	movdqa	%xmm1,256(%r10)
128	movdqa	%xmm4,%xmm1
129
130	paddd	%xmm3,%xmm0
131	pcmpeqd	%xmm5,%xmm3
132	movdqa	%xmm2,272(%r10)
133	movdqa	%xmm4,%xmm2
134
135	paddd	%xmm0,%xmm1
136	pcmpeqd	%xmm5,%xmm0
137	movdqa	%xmm3,288(%r10)
138	movdqa	%xmm4,%xmm3
139	paddd	%xmm1,%xmm2
140	pcmpeqd	%xmm5,%xmm1
141	movdqa	%xmm0,304(%r10)
142
143	paddd	%xmm2,%xmm3
144.byte	0x67
145	pcmpeqd	%xmm5,%xmm2
146	movdqa	%xmm1,320(%r10)
147
148	pcmpeqd	%xmm5,%xmm3
149	movdqa	%xmm2,336(%r10)
/*
 * Gather the first table word. Every 16-byte chunk of the row is ANDed
 * with its mask and the results are ORed together, so all 256 bytes are
 * read whatever the selector is. The last four masks are still live in
 * xmm0..xmm3, so chunks 64..112(%r12) are combined first straight from
 * registers; the other twelve masks are read back from the stack.
 * NOTE(review): presumably done this way to keep the memory access
 * pattern independent of the selector - confirm.
 */
150	pand	64(%r12),%xmm0
151
152	pand	80(%r12),%xmm1
153	pand	96(%r12),%xmm2
154	movdqa	%xmm3,352(%r10)
155	pand	112(%r12),%xmm3
156	por	%xmm2,%xmm0
157	por	%xmm3,%xmm1
158	movdqa	-128(%r12),%xmm4
159	movdqa	-112(%r12),%xmm5
160	movdqa	-96(%r12),%xmm2
161	pand	112(%r10),%xmm4
162	movdqa	-80(%r12),%xmm3
163	pand	128(%r10),%xmm5
164	por	%xmm4,%xmm0
165	pand	144(%r10),%xmm2
166	por	%xmm5,%xmm1
167	pand	160(%r10),%xmm3
168	por	%xmm2,%xmm0
169	por	%xmm3,%xmm1
170	movdqa	-64(%r12),%xmm4
171	movdqa	-48(%r12),%xmm5
172	movdqa	-32(%r12),%xmm2
173	pand	176(%r10),%xmm4
174	movdqa	-16(%r12),%xmm3
175	pand	192(%r10),%xmm5
176	por	%xmm4,%xmm0
177	pand	208(%r10),%xmm2
178	por	%xmm5,%xmm1
179	pand	224(%r10),%xmm3
180	por	%xmm2,%xmm0
181	por	%xmm3,%xmm1
182	movdqa	0(%r12),%xmm4
183	movdqa	16(%r12),%xmm5
184	movdqa	32(%r12),%xmm2
185	pand	240(%r10),%xmm4
186	movdqa	48(%r12),%xmm3
187	pand	256(%r10),%xmm5
188	por	%xmm4,%xmm0
189	pand	272(%r10),%xmm2
190	por	%xmm5,%xmm1
191	pand	288(%r10),%xmm3
192	por	%xmm2,%xmm0
193	por	%xmm3,%xmm1
194	por	%xmm1,%xmm0
/* pshufd $0x4e swaps the two 64-bit halves; the OR folds both halves
 * into the low qword. */
195	pshufd	$0x4e,%xmm0,%xmm1
196	por	%xmm1,%xmm0
/* Step to the next 256-byte table row for the following pass. */
197	leaq	256(%r12),%r12
/* Raw encoding of: movq %xmm0,%rbx  (rbx = gathered word). */
198.byte	102,72,15,126,195
199
/* r8 now holds the 64-bit constant itself rather than a pointer to it. */
200	movq	(%r8),%r8
201	movq	(%rsi),%rax
202
/* r14 = outer pass counter, r15 = word index inside a pass. */
203	xorq	%r14,%r14
204	xorq	%r15,%r15
205
/*
 * First pass. rdx:rax = (%rsi)[0] * rbx; rbp = r8 * (low word of that
 * product), the multiplier applied to the (%rcx) words for this pass.
 * NOTE(review): this has the shape of word-serial Montgomery
 * multiplication with rbp as the per-pass reduction factor - confirm.
 */
206	movq	%r8,%rbp
207	mulq	%rbx
208	movq	%rax,%r10
209	movq	(%rcx),%rax
210
211	imulq	%r10,%rbp
212	movq	%rdx,%r11
213
214	mulq	%rbp
215	addq	%rax,%r10
216	movq	8(%rsi),%rax
217	adcq	$0,%rdx
218	movq	%rdx,%r13
219
220	leaq	1(%r15),%r15
221	jmp	.L1st_enter
222
223.align	16
/* Inner loop of the first pass: accumulate (%rsi)[j]*rbx and
 * (%rcx)[j]*rbp with carries in r10/r11/r13, storing each finished word
 * into the temporary vector at rsp. Runs until r15 == r9. */
224.L1st:
225	addq	%rax,%r13
226	movq	(%rsi,%r15,8),%rax
227	adcq	$0,%rdx
228	addq	%r11,%r13
229	movq	%r10,%r11
230	adcq	$0,%rdx
231	movq	%r13,-16(%rsp,%r15,8)
232	movq	%rdx,%r13
233
234.L1st_enter:
235	mulq	%rbx
236	addq	%rax,%r11
237	movq	(%rcx,%r15,8),%rax
238	adcq	$0,%rdx
239	leaq	1(%r15),%r15
240	movq	%rdx,%r10
241
242	mulq	%rbp
243	cmpq	%r9,%r15
244	jne	.L1st
245
246
/* Drain the pipeline: store the last two words and the carry word at
 * index r9 of the temporary vector. */
247	addq	%rax,%r13
248	adcq	$0,%rdx
249	addq	%r11,%r13
250	adcq	$0,%rdx
251	movq	%r13,-16(%rsp,%r9,8)
252	movq	%rdx,%r13
253	movq	%r10,%r11
254
255	xorq	%rdx,%rdx
256	addq	%r11,%r13
257	adcq	$0,%rdx
258	movq	%r13,-8(%rsp,%r9,8)
259	movq	%rdx,(%rsp,%r9,8)
260
261	leaq	1(%r14),%r14
262	jmp	.Louter
263.align	16
/*
 * Outer loop: one pass per remaining table row (r14 counts up to r9).
 * rdx is biased by 128 so that -128(%rdx)..112(%rdx) address the same 16
 * masks that were stored at 112(%r10)..352(%r10) above.
 */
264.Louter:
265	leaq	24+128(%rsp,%r9,8),%rdx
266	andq	$-16,%rdx
267	pxor	%xmm4,%xmm4
268	pxor	%xmm5,%xmm5
269	movdqa	-128(%r12),%xmm0
270	movdqa	-112(%r12),%xmm1
271	movdqa	-96(%r12),%xmm2
272	movdqa	-80(%r12),%xmm3
273	pand	-128(%rdx),%xmm0
274	pand	-112(%rdx),%xmm1
275	por	%xmm0,%xmm4
276	pand	-96(%rdx),%xmm2
277	por	%xmm1,%xmm5
278	pand	-80(%rdx),%xmm3
279	por	%xmm2,%xmm4
280	por	%xmm3,%xmm5
281	movdqa	-64(%r12),%xmm0
282	movdqa	-48(%r12),%xmm1
283	movdqa	-32(%r12),%xmm2
284	movdqa	-16(%r12),%xmm3
285	pand	-64(%rdx),%xmm0
286	pand	-48(%rdx),%xmm1
287	por	%xmm0,%xmm4
288	pand	-32(%rdx),%xmm2
289	por	%xmm1,%xmm5
290	pand	-16(%rdx),%xmm3
291	por	%xmm2,%xmm4
292	por	%xmm3,%xmm5
293	movdqa	0(%r12),%xmm0
294	movdqa	16(%r12),%xmm1
295	movdqa	32(%r12),%xmm2
296	movdqa	48(%r12),%xmm3
297	pand	0(%rdx),%xmm0
298	pand	16(%rdx),%xmm1
299	por	%xmm0,%xmm4
300	pand	32(%rdx),%xmm2
301	por	%xmm1,%xmm5
302	pand	48(%rdx),%xmm3
303	por	%xmm2,%xmm4
304	por	%xmm3,%xmm5
305	movdqa	64(%r12),%xmm0
306	movdqa	80(%r12),%xmm1
307	movdqa	96(%r12),%xmm2
308	movdqa	112(%r12),%xmm3
309	pand	64(%rdx),%xmm0
310	pand	80(%rdx),%xmm1
311	por	%xmm0,%xmm4
312	pand	96(%rdx),%xmm2
313	por	%xmm1,%xmm5
314	pand	112(%rdx),%xmm3
315	por	%xmm2,%xmm4
316	por	%xmm3,%xmm5
317	por	%xmm5,%xmm4
318	pshufd	$0x4e,%xmm4,%xmm0
319	por	%xmm4,%xmm0
320	leaq	256(%r12),%r12
321
322	movq	(%rsi),%rax
/* Raw encoding of: movq %xmm0,%rbx  (rbx = word gathered for this pass). */
323.byte	102,72,15,126,195
324
/* Same scheme as the first pass, except that the previous contents of the
 * temporary vector at rsp are added in as well. */
325	xorq	%r15,%r15
326	movq	%r8,%rbp
327	movq	(%rsp),%r10
328
329	mulq	%rbx
330	addq	%rax,%r10
331	movq	(%rcx),%rax
332	adcq	$0,%rdx
333
334	imulq	%r10,%rbp
335	movq	%rdx,%r11
336
337	mulq	%rbp
338	addq	%rax,%r10
339	movq	8(%rsi),%rax
340	adcq	$0,%rdx
341	movq	8(%rsp),%r10
342	movq	%rdx,%r13
343
344	leaq	1(%r15),%r15
345	jmp	.Linner_enter
346
347.align	16
348.Linner:
349	addq	%rax,%r13
350	movq	(%rsi,%r15,8),%rax
351	adcq	$0,%rdx
352	addq	%r10,%r13
353	movq	(%rsp,%r15,8),%r10
354	adcq	$0,%rdx
355	movq	%r13,-16(%rsp,%r15,8)
356	movq	%rdx,%r13
357
358.Linner_enter:
359	mulq	%rbx
360	addq	%rax,%r11
361	movq	(%rcx,%r15,8),%rax
362	adcq	$0,%rdx
363	addq	%r11,%r10
364	movq	%rdx,%r11
365	adcq	$0,%r11
366	leaq	1(%r15),%r15
367
368	mulq	%rbp
369	cmpq	%r9,%r15
370	jne	.Linner
371
372	addq	%rax,%r13
373	adcq	$0,%rdx
374	addq	%r10,%r13
375	movq	(%rsp,%r9,8),%r10
376	adcq	$0,%rdx
377	movq	%r13,-16(%rsp,%r9,8)
378	movq	%rdx,%r13
379
380	xorq	%rdx,%rdx
381	addq	%r11,%r13
382	adcq	$0,%rdx
383	addq	%r10,%r13
384	adcq	$0,%rdx
385	movq	%r13,-8(%rsp,%r9,8)
386	movq	%rdx,(%rsp,%r9,8)
387
388	leaq	1(%r14),%r14
389	cmpq	%r9,%r14
390	jb	.Louter
391
/*
 * Final subtraction: (%rdi)[i] = temp[i] - (%rcx)[i] with borrow.
 * The xorq clears CF, so the first sbbq starts without a borrow; the
 * movq/leaq/decq/jnz in between leave CF untouched, which lets the
 * borrow travel from one loop iteration to the next.
 */
392	xorq	%r14,%r14
393	movq	(%rsp),%rax
394	leaq	(%rsp),%rsi
395	movq	%r9,%r15
396	jmp	.Lsub
397.align	16
398.Lsub:	sbbq	(%rcx,%r14,8),%rax
399	movq	%rax,(%rdi,%r14,8)
400	movq	8(%rsi,%r14,8),%rax
401	leaq	1(%r14),%r14
402	decq	%r15
403	jnz	.Lsub
404
/* rax = (carry word at temp[r9]) - borrow; rbx = ~rax. The two are used
 * as complementary AND masks in the select loop below. */
405	sbbq	$0,%rax
406	movq	$-1,%rbx
407	xorq	%rax,%rbx
408	xorq	%r14,%r14
409	movq	%r9,%r15
410
/* Branch-free select: each result word becomes
 * ((%rdi)[i] & rbx) | (temp[i] & rax). The temporary word is overwritten
 * with the loop index as it is consumed. */
411.Lcopy:
412	movq	(%rdi,%r14,8),%rcx
413	movq	(%rsp,%r14,8),%rdx
414	andq	%rbx,%rcx
415	andq	%rax,%rdx
416	movq	%r14,(%rsp,%r14,8)
417	orq	%rcx,%rdx
418	movq	%rdx,(%rdi,%r14,8)
419	leaq	1(%r14),%r14
420	subq	$1,%r15
421	jnz	.Lcopy
422
/* Epilogue: fetch the saved entry rsp, set the return value to 1 and
 * reload the callee-saved registers from just below the entry rsp. */
423	movq	8(%rsp,%r9,8),%rsi
424.cfi_def_cfa	%rsi,8
425	movq	$1,%rax
426
427	movq	-48(%rsi),%r15
428.cfi_restore	%r15
429	movq	-40(%rsi),%r14
430.cfi_restore	%r14
431	movq	-32(%rsi),%r13
432.cfi_restore	%r13
433	movq	-24(%rsi),%r12
434.cfi_restore	%r12
435	movq	-16(%rsi),%rbp
436.cfi_restore	%rbp
437	movq	-8(%rsi),%rbx
438.cfi_restore	%rbx
439	leaq	(%rsi),%rsp
440.cfi_def_cfa_register	%rsp
441.Lmul_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding of the return. */
442	.byte	0xf3,0xc3
443.cfi_endproc
444.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 - four-words-per-iteration variant. No .globl is
 * visible for it in this view.
 *
 * Takes the same register/stack inputs as bn_mul_mont_gather5 (rdi, rsi,
 * rdx, rcx, r8, r9 and the 32-bit value at 8(entry rsp)); the actual
 * arithmetic is done by mul4x_internal. Returns 1 in rax.
 *
 * bn_mul_mont_gather5 jumps straight to .Lmul4x_enter with rax = entry
 * rsp and r11d = OPENSSL_ia32cap_P+8.
 * NOTE(review): the fall-through path from the function's own label does
 * not load r11d before it is tested - presumably the symbol is never
 * called directly; confirm.
 *
 * NOTE(review): the ".byte 0x67" lines put an address-size prefix in
 * front of register-only instructions, where it has no architectural
 * effect; presumably alignment padding - confirm.
 */
445.type	bn_mul4x_mont_gather5,@function
446.align	32
447bn_mul4x_mont_gather5:
448.cfi_startproc
449.byte	0x67
450	movq	%rsp,%rax
451.cfi_def_cfa_register	%rax
452.Lmul4x_enter:
/* If every bit of 0x80108 is set in r11d, use .Lmulx4x_enter (outside
 * this view) instead.
 * NOTE(review): presumably the BMI1/BMI2/ADX capability bits - confirm
 * against the OPENSSL_ia32cap documentation. */
453	andl	$0x80108,%r11d
454	cmpl	$0x80108,%r11d
455	je	.Lmulx4x_enter
456	pushq	%rbx
457.cfi_offset	%rbx,-16
458	pushq	%rbp
459.cfi_offset	%rbp,-24
460	pushq	%r12
461.cfi_offset	%r12,-32
462	pushq	%r13
463.cfi_offset	%r13,-40
464	pushq	%r14
465.cfi_offset	%r14,-48
466	pushq	%r15
467.cfi_offset	%r15,-56
468.Lmul4x_prologue:
469
470.byte	0x67
/* r9 = word count * 8 (length in bytes); r10 = 3 * that; then r9 is
 * negated so it can be used as a negative offset below. */
471	shll	$3,%r9d
472	leaq	(%r9,%r9,2),%r10
473	negq	%r9
474
475
476
477
478
479
480
481
482
483
/*
 * Choose the frame base rbp. r11 = ((rsp - 320 - 2*bytes) - rdi) mod 4096
 * is the page-offset distance between the candidate frame and the
 * destination buffer. When 3*bytes >= r11 the frame is simply moved down
 * by r11; otherwise .Lmul4xsp_alt moves it down by
 * max(r11 - (4096 - 320 - 2*bytes), 0).
 * NOTE(review): presumably this avoids the frame and the destination
 * sharing page offsets - confirm against the generator's comments.
 */
484	leaq	-320(%rsp,%r9,2),%r11
485	movq	%rsp,%rbp
486	subq	%rdi,%r11
487	andq	$4095,%r11
488	cmpq	%r11,%r10
489	jb	.Lmul4xsp_alt
490	subq	%r11,%rbp
491	leaq	-320(%rbp,%r9,2),%rbp
492	jmp	.Lmul4xsp_done
493
494.align	32
495.Lmul4xsp_alt:
496	leaq	4096-320(,%r9,2),%r10
497	leaq	-320(%rbp,%r9,2),%rbp
498	subq	%r10,%r11
499	movq	$0,%r10
/* The subq above borrowed (r11 < r10): clamp the adjustment to zero. */
500	cmovcq	%r10,%r11
501	subq	%r11,%rbp
502.Lmul4xsp_done:
/* Align the frame to 64 bytes, then lower rsp to it while reading one
 * word from every 4096-byte page on the way down. */
503	andq	$-64,%rbp
504	movq	%rsp,%r11
505	subq	%rbp,%r11
506	andq	$-4096,%r11
507	leaq	(%r11,%rbp,1),%rsp
508	movq	(%rsp),%r10
509	cmpq	%rbp,%rsp
510	ja	.Lmul4x_page_walk
511	jmp	.Lmul4x_page_walk_done
512
513.Lmul4x_page_walk:
514	leaq	-4096(%rsp),%rsp
515	movq	(%rsp),%r10
516	cmpq	%rbp,%rsp
517	ja	.Lmul4x_page_walk
518.Lmul4x_page_walk_done:
519
/* Back to a positive byte length for mul4x_internal. */
520	negq	%r9
521
/* Save the entry rsp in the frame; the epilogue reads it back from here. */
522	movq	%rax,40(%rsp)
/* DWARF expression: CFA = *(rsp + 40) + 8. */
523.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
524.Lmul4x_body:
525
526	call	mul4x_internal
527
/* Epilogue: recover the entry rsp, return 1, restore callee-saved
 * registers from just below the entry rsp. */
528	movq	40(%rsp),%rsi
529.cfi_def_cfa	%rsi,8
530	movq	$1,%rax
531
532	movq	-48(%rsi),%r15
533.cfi_restore	%r15
534	movq	-40(%rsi),%r14
535.cfi_restore	%r14
536	movq	-32(%rsi),%r13
537.cfi_restore	%r13
538	movq	-24(%rsi),%r12
539.cfi_restore	%r12
540	movq	-16(%rsi),%rbp
541.cfi_restore	%rbp
542	movq	-8(%rsi),%rbx
543.cfi_restore	%rbx
544	leaq	(%rsi),%rsp
545.cfi_def_cfa_register	%rsp
546.Lmul4x_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding of the return. */
547	.byte	0xf3,0xc3
548.cfi_endproc
549.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
550
/*
 * mul4x_internal - shared multiply core, called from
 * bn_mul4x_mont_gather5 and bn_power5. It does not follow the normal
 * calling convention; inputs as read below:
 *   rax - the caller's entry rsp; 8(%rax) supplies the 32-bit selector
 *   rdx - table base (256 bytes are consumed per outer pass)
 *   rsi - first operand words
 *   rcx - words multiplied by the per-pass factor rbp
 *   r8  - pointer to one 64-bit constant
 *   r9  - operand length in bytes (positive on entry)
 *   rdi - destination pointer; only saved here, at 56+8(%rsp)
 *   rsp - the caller's frame. Offsets below carry an extra +8 because
 *         the call pushed a return address.
 * The routine does not return by itself: it ends by jumping to
 * .Lsqr4x_sub_entry (outside this view) with registers prepared for it.
 *
 * NOTE(review): the ".byte 0x67" lines put address-size prefixes in front
 * of register-only instructions, where they have no architectural
 * effect; presumably alignment padding - confirm.
 */
551.type	mul4x_internal,@function
552.align	32
553mul4x_internal:
554.cfi_startproc
/* r13 = rdx + 128 + 32*r9: the value r12 is compared against at the
 * bottom of .Louter4x to end the outer loop. r9 is restored right after. */
555	shlq	$5,%r9
556	movd	8(%rax),%xmm5
557	leaq	.Linc(%rip),%rax
558	leaq	128(%rdx,%r9,1),%r13
559	shrq	$5,%r9
560	movdqa	0(%rax),%xmm0
561	movdqa	16(%rax),%xmm1
/* r10 = base such that 112(%r10)..352(%r10) hold the 16 masks. */
562	leaq	88-112(%rsp,%r9,1),%r10
/* r12 = table + 128 so displacements -128..112 span one 256-byte row. */
563	leaq	128(%rdx),%r12
564
/*
 * Build 16 comparison masks: xmm5 is the selector broadcast to four
 * dwords, xmm4 a copy of the second .Linc vector used as the increment;
 * xmm0..xmm3 rotate through counter vectors and pcmpeqd converts each
 * into a per-dword all-ones/all-zeros mask stored on the stack.
 * NOTE(review): .Linc is defined outside this view - confirm its layout.
 */
565	pshufd	$0,%xmm5,%xmm5
566	movdqa	%xmm1,%xmm4
567.byte	0x67,0x67
568	movdqa	%xmm1,%xmm2
569	paddd	%xmm0,%xmm1
570	pcmpeqd	%xmm5,%xmm0
571.byte	0x67
572	movdqa	%xmm4,%xmm3
573	paddd	%xmm1,%xmm2
574	pcmpeqd	%xmm5,%xmm1
575	movdqa	%xmm0,112(%r10)
576	movdqa	%xmm4,%xmm0
577
578	paddd	%xmm2,%xmm3
579	pcmpeqd	%xmm5,%xmm2
580	movdqa	%xmm1,128(%r10)
581	movdqa	%xmm4,%xmm1
582
583	paddd	%xmm3,%xmm0
584	pcmpeqd	%xmm5,%xmm3
585	movdqa	%xmm2,144(%r10)
586	movdqa	%xmm4,%xmm2
587
588	paddd	%xmm0,%xmm1
589	pcmpeqd	%xmm5,%xmm0
590	movdqa	%xmm3,160(%r10)
591	movdqa	%xmm4,%xmm3
592	paddd	%xmm1,%xmm2
593	pcmpeqd	%xmm5,%xmm1
594	movdqa	%xmm0,176(%r10)
595	movdqa	%xmm4,%xmm0
596
597	paddd	%xmm2,%xmm3
598	pcmpeqd	%xmm5,%xmm2
599	movdqa	%xmm1,192(%r10)
600	movdqa	%xmm4,%xmm1
601
602	paddd	%xmm3,%xmm0
603	pcmpeqd	%xmm5,%xmm3
604	movdqa	%xmm2,208(%r10)
605	movdqa	%xmm4,%xmm2
606
607	paddd	%xmm0,%xmm1
608	pcmpeqd	%xmm5,%xmm0
609	movdqa	%xmm3,224(%r10)
610	movdqa	%xmm4,%xmm3
611	paddd	%xmm1,%xmm2
612	pcmpeqd	%xmm5,%xmm1
613	movdqa	%xmm0,240(%r10)
614	movdqa	%xmm4,%xmm0
615
616	paddd	%xmm2,%xmm3
617	pcmpeqd	%xmm5,%xmm2
618	movdqa	%xmm1,256(%r10)
619	movdqa	%xmm4,%xmm1
620
621	paddd	%xmm3,%xmm0
622	pcmpeqd	%xmm5,%xmm3
623	movdqa	%xmm2,272(%r10)
624	movdqa	%xmm4,%xmm2
625
626	paddd	%xmm0,%xmm1
627	pcmpeqd	%xmm5,%xmm0
628	movdqa	%xmm3,288(%r10)
629	movdqa	%xmm4,%xmm3
630	paddd	%xmm1,%xmm2
631	pcmpeqd	%xmm5,%xmm1
632	movdqa	%xmm0,304(%r10)
633
634	paddd	%xmm2,%xmm3
635.byte	0x67
636	pcmpeqd	%xmm5,%xmm2
637	movdqa	%xmm1,320(%r10)
638
639	pcmpeqd	%xmm5,%xmm3
640	movdqa	%xmm2,336(%r10)
/*
 * Gather the first table word: every 16-byte chunk of the row is ANDed
 * with its mask and ORed into xmm0/xmm1, so all 256 bytes are read
 * whatever the selector is. The last four masks are still in xmm0..xmm3
 * and are applied to chunks 64..112(%r12) directly.
 */
641	pand	64(%r12),%xmm0
642
643	pand	80(%r12),%xmm1
644	pand	96(%r12),%xmm2
645	movdqa	%xmm3,352(%r10)
646	pand	112(%r12),%xmm3
647	por	%xmm2,%xmm0
648	por	%xmm3,%xmm1
649	movdqa	-128(%r12),%xmm4
650	movdqa	-112(%r12),%xmm5
651	movdqa	-96(%r12),%xmm2
652	pand	112(%r10),%xmm4
653	movdqa	-80(%r12),%xmm3
654	pand	128(%r10),%xmm5
655	por	%xmm4,%xmm0
656	pand	144(%r10),%xmm2
657	por	%xmm5,%xmm1
658	pand	160(%r10),%xmm3
659	por	%xmm2,%xmm0
660	por	%xmm3,%xmm1
661	movdqa	-64(%r12),%xmm4
662	movdqa	-48(%r12),%xmm5
663	movdqa	-32(%r12),%xmm2
664	pand	176(%r10),%xmm4
665	movdqa	-16(%r12),%xmm3
666	pand	192(%r10),%xmm5
667	por	%xmm4,%xmm0
668	pand	208(%r10),%xmm2
669	por	%xmm5,%xmm1
670	pand	224(%r10),%xmm3
671	por	%xmm2,%xmm0
672	por	%xmm3,%xmm1
673	movdqa	0(%r12),%xmm4
674	movdqa	16(%r12),%xmm5
675	movdqa	32(%r12),%xmm2
676	pand	240(%r10),%xmm4
677	movdqa	48(%r12),%xmm3
678	pand	256(%r10),%xmm5
679	por	%xmm4,%xmm0
680	pand	272(%r10),%xmm2
681	por	%xmm5,%xmm1
682	pand	288(%r10),%xmm3
683	por	%xmm2,%xmm0
684	por	%xmm3,%xmm1
685	por	%xmm1,%xmm0
/* Swap the 64-bit halves and OR: the selected qword lands in the low half. */
686	pshufd	$0x4e,%xmm0,%xmm1
687	por	%xmm1,%xmm0
688	leaq	256(%r12),%r12
/* Raw encoding of: movq %xmm0,%rbx  (rbx = gathered word). */
689.byte	102,72,15,126,195
690
/* Spill the end-of-table marker and the destination pointer. */
691	movq	%r13,16+8(%rsp)
692	movq	%rdi,56+8(%rsp)
693
/* r8 = the 64-bit constant itself. rsi is advanced to the end of its
 * operand and r9 negated, so (%rsi,%r9,1) addresses the first word and
 * an index counting up to zero walks the operand. */
694	movq	(%r8),%r8
695	movq	(%rsi),%rax
696	leaq	(%rsi,%r9,1),%rsi
697	negq	%r9
698
/* rbp = r8 * low word of (first rsi word * rbx): the factor applied to
 * the (%rcx) words in this pass. r14 points at the temporary vector. */
699	movq	%r8,%rbp
700	mulq	%rbx
701	movq	%rax,%r10
702	movq	(%rcx),%rax
703
704	imulq	%r10,%rbp
705	leaq	64+8(%rsp),%r14
706	movq	%rdx,%r11
707
708	mulq	%rbp
709	addq	%rax,%r10
710	movq	8(%rsi,%r9,1),%rax
711	adcq	$0,%rdx
712	movq	%rdx,%rdi
713
714	mulq	%rbx
715	addq	%rax,%r11
716	movq	8(%rcx),%rax
717	adcq	$0,%rdx
718	movq	%rdx,%r10
719
720	mulq	%rbp
721	addq	%rax,%rdi
722	movq	16(%rsi,%r9,1),%rax
723	adcq	$0,%rdx
724	addq	%r11,%rdi
/* r15 = byte index relative to the end of the operand; runs up to zero. */
725	leaq	32(%r9),%r15
726	leaq	32(%rcx),%rcx
727	adcq	$0,%rdx
728	movq	%rdi,(%r14)
729	movq	%rdx,%r13
730	jmp	.L1st4x
731
732.align	32
/* First pass, four words per iteration: alternating rsi-word*rbx and
 * rcx-word*rbp products are accumulated through r10/r11 and r13/rdi and
 * stored to the temporary vector via r14. */
733.L1st4x:
734	mulq	%rbx
735	addq	%rax,%r10
736	movq	-16(%rcx),%rax
737	leaq	32(%r14),%r14
738	adcq	$0,%rdx
739	movq	%rdx,%r11
740
741	mulq	%rbp
742	addq	%rax,%r13
743	movq	-8(%rsi,%r15,1),%rax
744	adcq	$0,%rdx
745	addq	%r10,%r13
746	adcq	$0,%rdx
747	movq	%r13,-24(%r14)
748	movq	%rdx,%rdi
749
750	mulq	%rbx
751	addq	%rax,%r11
752	movq	-8(%rcx),%rax
753	adcq	$0,%rdx
754	movq	%rdx,%r10
755
756	mulq	%rbp
757	addq	%rax,%rdi
758	movq	(%rsi,%r15,1),%rax
759	adcq	$0,%rdx
760	addq	%r11,%rdi
761	adcq	$0,%rdx
762	movq	%rdi,-16(%r14)
763	movq	%rdx,%r13
764
765	mulq	%rbx
766	addq	%rax,%r10
767	movq	0(%rcx),%rax
768	adcq	$0,%rdx
769	movq	%rdx,%r11
770
771	mulq	%rbp
772	addq	%rax,%r13
773	movq	8(%rsi,%r15,1),%rax
774	adcq	$0,%rdx
775	addq	%r10,%r13
776	adcq	$0,%rdx
777	movq	%r13,-8(%r14)
778	movq	%rdx,%rdi
779
780	mulq	%rbx
781	addq	%rax,%r11
782	movq	8(%rcx),%rax
783	adcq	$0,%rdx
784	movq	%rdx,%r10
785
786	mulq	%rbp
787	addq	%rax,%rdi
788	movq	16(%rsi,%r15,1),%rax
789	adcq	$0,%rdx
790	addq	%r11,%rdi
791	leaq	32(%rcx),%rcx
792	adcq	$0,%rdx
793	movq	%rdi,(%r14)
794	movq	%rdx,%r13
795
/* The addq sets ZF when r15 reaches zero, which ends the loop. */
796	addq	$32,%r15
797	jnz	.L1st4x
798
/* Tail of the first pass: the last two word pairs, handled outside the loop. */
799	mulq	%rbx
800	addq	%rax,%r10
801	movq	-16(%rcx),%rax
802	leaq	32(%r14),%r14
803	adcq	$0,%rdx
804	movq	%rdx,%r11
805
806	mulq	%rbp
807	addq	%rax,%r13
808	movq	-8(%rsi),%rax
809	adcq	$0,%rdx
810	addq	%r10,%r13
811	adcq	$0,%rdx
812	movq	%r13,-24(%r14)
813	movq	%rdx,%rdi
814
815	mulq	%rbx
816	addq	%rax,%r11
817	movq	-8(%rcx),%rax
818	adcq	$0,%rdx
819	movq	%rdx,%r10
820
821	mulq	%rbp
822	addq	%rax,%rdi
/* rax is preloaded with the first rsi word for the next pass. */
823	movq	(%rsi,%r9,1),%rax
824	adcq	$0,%rdx
825	addq	%r11,%rdi
826	adcq	$0,%rdx
827	movq	%rdi,-16(%r14)
828	movq	%rdx,%r13
829
/* Rewind rcx to the start of its operand (r9 is negative here). */
830	leaq	(%rcx,%r9,1),%rcx
831
/* rdi = carry out of the top word; it is stored at (%r14) at the start
 * of the next pass. */
832	xorq	%rdi,%rdi
833	addq	%r10,%r13
834	adcq	$0,%rdi
835	movq	%r13,-8(%r14)
836
837	jmp	.Louter4x
838
839.align	32
/*
 * Outer loop: one pass per remaining 256-byte table row. rdx is biased so
 * that -128(%rdx)..112(%rdx) address the 16 masks built above, relative
 * to the current r14.
 */
840.Louter4x:
841	leaq	16+128(%r14),%rdx
842	pxor	%xmm4,%xmm4
843	pxor	%xmm5,%xmm5
844	movdqa	-128(%r12),%xmm0
845	movdqa	-112(%r12),%xmm1
846	movdqa	-96(%r12),%xmm2
847	movdqa	-80(%r12),%xmm3
848	pand	-128(%rdx),%xmm0
849	pand	-112(%rdx),%xmm1
850	por	%xmm0,%xmm4
851	pand	-96(%rdx),%xmm2
852	por	%xmm1,%xmm5
853	pand	-80(%rdx),%xmm3
854	por	%xmm2,%xmm4
855	por	%xmm3,%xmm5
856	movdqa	-64(%r12),%xmm0
857	movdqa	-48(%r12),%xmm1
858	movdqa	-32(%r12),%xmm2
859	movdqa	-16(%r12),%xmm3
860	pand	-64(%rdx),%xmm0
861	pand	-48(%rdx),%xmm1
862	por	%xmm0,%xmm4
863	pand	-32(%rdx),%xmm2
864	por	%xmm1,%xmm5
865	pand	-16(%rdx),%xmm3
866	por	%xmm2,%xmm4
867	por	%xmm3,%xmm5
868	movdqa	0(%r12),%xmm0
869	movdqa	16(%r12),%xmm1
870	movdqa	32(%r12),%xmm2
871	movdqa	48(%r12),%xmm3
872	pand	0(%rdx),%xmm0
873	pand	16(%rdx),%xmm1
874	por	%xmm0,%xmm4
875	pand	32(%rdx),%xmm2
876	por	%xmm1,%xmm5
877	pand	48(%rdx),%xmm3
878	por	%xmm2,%xmm4
879	por	%xmm3,%xmm5
880	movdqa	64(%r12),%xmm0
881	movdqa	80(%r12),%xmm1
882	movdqa	96(%r12),%xmm2
883	movdqa	112(%r12),%xmm3
884	pand	64(%rdx),%xmm0
885	pand	80(%rdx),%xmm1
886	por	%xmm0,%xmm4
887	pand	96(%rdx),%xmm2
888	por	%xmm1,%xmm5
889	pand	112(%rdx),%xmm3
890	por	%xmm2,%xmm4
891	por	%xmm3,%xmm5
892	por	%xmm5,%xmm4
893	pshufd	$0x4e,%xmm4,%xmm0
894	por	%xmm4,%xmm0
895	leaq	256(%r12),%r12
/* Raw encoding of: movq %xmm0,%rbx  (rbx = word gathered for this pass). */
896.byte	102,72,15,126,195
897
/* r10 = first word of the temporary vector (r14 currently points past
 * its end, r9 is negative). The pending carry in rdi is stored as the
 * vector's top word before r14 is rewound. */
898	movq	(%r14,%r9,1),%r10
899	movq	%r8,%rbp
900	mulq	%rbx
901	addq	%rax,%r10
902	movq	(%rcx),%rax
903	adcq	$0,%rdx
904
905	imulq	%r10,%rbp
906	movq	%rdx,%r11
907	movq	%rdi,(%r14)
908
909	leaq	(%r14,%r9,1),%r14
910
911	mulq	%rbp
912	addq	%rax,%r10
913	movq	8(%rsi,%r9,1),%rax
914	adcq	$0,%rdx
915	movq	%rdx,%rdi
916
917	mulq	%rbx
918	addq	%rax,%r11
919	movq	8(%rcx),%rax
920	adcq	$0,%rdx
921	addq	8(%r14),%r11
922	adcq	$0,%rdx
923	movq	%rdx,%r10
924
925	mulq	%rbp
926	addq	%rax,%rdi
927	movq	16(%rsi,%r9,1),%rax
928	adcq	$0,%rdx
929	addq	%r11,%rdi
930	leaq	32(%r9),%r15
931	leaq	32(%rcx),%rcx
932	adcq	$0,%rdx
933	movq	%rdx,%r13
934	jmp	.Linner4x
935
936.align	32
/* Inner loop, four words per iteration: same product scheme as .L1st4x,
 * with the previous contents of the temporary vector added in before
 * each word is written back. */
937.Linner4x:
938	mulq	%rbx
939	addq	%rax,%r10
940	movq	-16(%rcx),%rax
941	adcq	$0,%rdx
942	addq	16(%r14),%r10
943	leaq	32(%r14),%r14
944	adcq	$0,%rdx
945	movq	%rdx,%r11
946
947	mulq	%rbp
948	addq	%rax,%r13
949	movq	-8(%rsi,%r15,1),%rax
950	adcq	$0,%rdx
951	addq	%r10,%r13
952	adcq	$0,%rdx
953	movq	%rdi,-32(%r14)
954	movq	%rdx,%rdi
955
956	mulq	%rbx
957	addq	%rax,%r11
958	movq	-8(%rcx),%rax
959	adcq	$0,%rdx
960	addq	-8(%r14),%r11
961	adcq	$0,%rdx
962	movq	%rdx,%r10
963
964	mulq	%rbp
965	addq	%rax,%rdi
966	movq	(%rsi,%r15,1),%rax
967	adcq	$0,%rdx
968	addq	%r11,%rdi
969	adcq	$0,%rdx
970	movq	%r13,-24(%r14)
971	movq	%rdx,%r13
972
973	mulq	%rbx
974	addq	%rax,%r10
975	movq	0(%rcx),%rax
976	adcq	$0,%rdx
977	addq	(%r14),%r10
978	adcq	$0,%rdx
979	movq	%rdx,%r11
980
981	mulq	%rbp
982	addq	%rax,%r13
983	movq	8(%rsi,%r15,1),%rax
984	adcq	$0,%rdx
985	addq	%r10,%r13
986	adcq	$0,%rdx
987	movq	%rdi,-16(%r14)
988	movq	%rdx,%rdi
989
990	mulq	%rbx
991	addq	%rax,%r11
992	movq	8(%rcx),%rax
993	adcq	$0,%rdx
994	addq	8(%r14),%r11
995	adcq	$0,%rdx
996	movq	%rdx,%r10
997
998	mulq	%rbp
999	addq	%rax,%rdi
1000	movq	16(%rsi,%r15,1),%rax
1001	adcq	$0,%rdx
1002	addq	%r11,%rdi
1003	leaq	32(%rcx),%rcx
1004	adcq	$0,%rdx
1005	movq	%r13,-8(%r14)
1006	movq	%rdx,%r13
1007
1008	addq	$32,%r15
1009	jnz	.Linner4x
1010
/* Tail of the pass: last two word pairs. */
1011	mulq	%rbx
1012	addq	%rax,%r10
1013	movq	-16(%rcx),%rax
1014	adcq	$0,%rdx
1015	addq	16(%r14),%r10
1016	leaq	32(%r14),%r14
1017	adcq	$0,%rdx
1018	movq	%rdx,%r11
1019
1020	mulq	%rbp
1021	addq	%rax,%r13
1022	movq	-8(%rsi),%rax
1023	adcq	$0,%rdx
1024	addq	%r10,%r13
1025	adcq	$0,%rdx
1026	movq	%rdi,-32(%r14)
1027	movq	%rdx,%rdi
1028
1029	mulq	%rbx
1030	addq	%rax,%r11
/* The operands are swapped for the last product: rax takes the per-pass
 * factor and rbp takes the last (%rcx) word, so that word is still in
 * rbp after the loop for the comparison below. */
1031	movq	%rbp,%rax
1032	movq	-8(%rcx),%rbp
1033	adcq	$0,%rdx
1034	addq	-8(%r14),%r11
1035	adcq	$0,%rdx
1036	movq	%rdx,%r10
1037
1038	mulq	%rbp
1039	addq	%rax,%rdi
1040	movq	(%rsi,%r9,1),%rax
1041	adcq	$0,%rdx
1042	addq	%r11,%rdi
1043	adcq	$0,%rdx
1044	movq	%r13,-24(%r14)
1045	movq	%rdx,%r13
1046
1047	movq	%rdi,-16(%r14)
1048	leaq	(%rcx,%r9,1),%rcx
1049
1050	xorq	%rdi,%rdi
1051	addq	%r10,%r13
1052	adcq	$0,%rdi
1053	addq	(%r14),%r13
1054	adcq	$0,%rdi
1055	movq	%r13,-8(%r14)
1056
/* Loop until r12 reaches the end-of-table marker saved on entry. */
1057	cmpq	16+8(%rsp),%r12
1058	jb	.Louter4x
/*
 * Hand-off to the shared subtraction code. r15 is zero after the inner
 * loop, so adcq %r15,%r15 captures the borrow of (last rcx word - top
 * result word); it is ORed with the carry in rdi and negated into rax
 * (0 or all-ones). rbx is rewound to the start of the temporary vector,
 * rbp takes the rcx pointer, rcx becomes r9 >> 5 (arithmetic shift of
 * the negative byte length), rdi is the saved destination, r12 is the
 * first rcx word minus one, and r13..r15 are its next three words.
 * NOTE(review): the register contract of .Lsqr4x_sub_entry is outside
 * this view - confirm.
 */
1059	xorq	%rax,%rax
1060	subq	%r13,%rbp
1061	adcq	%r15,%r15
1062	orq	%r15,%rdi
1063	subq	%rdi,%rax
1064	leaq	(%r14,%r9,1),%rbx
1065	movq	(%rcx),%r12
1066	leaq	(%rcx),%rbp
1067	movq	%r9,%rcx
1068	sarq	$3+2,%rcx
1069	movq	56+8(%rsp),%rdi
1070	decq	%r12
1071	xorq	%r10,%r10
1072	movq	8(%rbp),%r13
1073	movq	16(%rbp),%r14
1074	movq	24(%rbp),%r15
1075	jmp	.Lsqr4x_sub_entry
1076.cfi_endproc
1077.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5 - exported entry point (.globl).
 *
 * Runs five rounds of __bn_sqr8x_internal followed by
 * __bn_post4x_internal, then one mul4x_internal call, and returns 1 in
 * rax. Register/stack inputs are consumed the same way as in
 * bn_mul4x_mont_gather5: rdi, rsi, rdx, rcx, r8 (pointer to a 64-bit
 * constant), r9d (word count) and the 32-bit value at 8(entry rsp),
 * which mul4x_internal reads through rax.
 * NOTE(review): presumably this computes five Montgomery squarings of
 * the input followed by a multiplication with a gathered table entry -
 * confirm against x86_64-mont5.pl.
 *
 * When every bit of 0x80108 is set in OPENSSL_ia32cap_P+8 the
 * .Lpowerx5_enter path (outside this view) is taken instead.
 */
1078.globl	bn_power5
1079.type	bn_power5,@function
1080.align	32
1081bn_power5:
1082.cfi_startproc
/* rax = rsp at entry; saved into the frame at 40(%rsp) further down. */
1083	movq	%rsp,%rax
1084.cfi_def_cfa_register	%rax
1085	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1086	andl	$0x80108,%r11d
1087	cmpl	$0x80108,%r11d
1088	je	.Lpowerx5_enter
1089	pushq	%rbx
1090.cfi_offset	%rbx,-16
1091	pushq	%rbp
1092.cfi_offset	%rbp,-24
1093	pushq	%r12
1094.cfi_offset	%r12,-32
1095	pushq	%r13
1096.cfi_offset	%r13,-40
1097	pushq	%r14
1098.cfi_offset	%r14,-48
1099	pushq	%r15
1100.cfi_offset	%r15,-56
1101.Lpower5_prologue:
1102
/* r9 = word count * 8 (bytes); r10 = 3 * that (the 32-bit leal
 * zero-extends); r9 is then negated. r8 = the 64-bit constant itself. */
1103	shll	$3,%r9d
1104	leal	(%r9,%r9,2),%r10d
1105	negq	%r9
1106	movq	(%r8),%r8
1107
1108
1109
1110
1111
1112
1113
1114
/*
 * Choose the frame base rbp from the page-offset distance between the
 * candidate frame (rsp - 320 - 2*bytes) and the destination rdi; the
 * same computation as in bn_mul4x_mont_gather5.
 * NOTE(review): presumably avoids the frame and the destination sharing
 * page offsets - confirm.
 */
1115	leaq	-320(%rsp,%r9,2),%r11
1116	movq	%rsp,%rbp
1117	subq	%rdi,%r11
1118	andq	$4095,%r11
1119	cmpq	%r11,%r10
1120	jb	.Lpwr_sp_alt
1121	subq	%r11,%rbp
1122	leaq	-320(%rbp,%r9,2),%rbp
1123	jmp	.Lpwr_sp_done
1124
1125.align	32
1126.Lpwr_sp_alt:
1127	leaq	4096-320(,%r9,2),%r10
1128	leaq	-320(%rbp,%r9,2),%rbp
1129	subq	%r10,%r11
1130	movq	$0,%r10
/* The subq above borrowed (r11 < r10): clamp the adjustment to zero. */
1131	cmovcq	%r10,%r11
1132	subq	%r11,%rbp
1133.Lpwr_sp_done:
/* Align the frame to 64 bytes, then lower rsp to it while reading one
 * word from every 4096-byte page on the way down. */
1134	andq	$-64,%rbp
1135	movq	%rsp,%r11
1136	subq	%rbp,%r11
1137	andq	$-4096,%r11
1138	leaq	(%r11,%rbp,1),%rsp
1139	movq	(%rsp),%r10
1140	cmpq	%rbp,%rsp
1141	ja	.Lpwr_page_walk
1142	jmp	.Lpwr_page_walk_done
1143
1144.Lpwr_page_walk:
1145	leaq	-4096(%rsp),%rsp
1146	movq	(%rsp),%r10
1147	cmpq	%rbp,%rsp
1148	ja	.Lpwr_page_walk
1149.Lpwr_page_walk_done:
1150
/* r10 = negative byte length, r9 = positive byte length. */
1151	movq	%r9,%r10
1152	negq	%r9
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
/* Frame slots: 32(%rsp) = the 64-bit constant, 40(%rsp) = entry rsp. */
1163	movq	%r8,32(%rsp)
1164	movq	%rax,40(%rsp)
/* DWARF expression: CFA = *(rsp + 40) + 8. */
1165.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1166.Lpower5_body:
/* Raw encodings that stash general registers in XMM registers:
 *   movq %rdi,%xmm1 ; movq %rcx,%xmm2 ; movq %r10,%xmm3 ; movq %rdx,%xmm4 */
1167.byte	102,72,15,110,207
1168.byte	102,72,15,110,209
1169.byte	102,73,15,110,218
1170.byte	102,72,15,110,226
1171
/* Five square-and-postprocess rounds.
 * NOTE(review): this relies on each helper pair leaving the registers
 * and XMM stashes in the state the next call expects; the helper bodies
 * are only partly visible here - confirm. */
1172	call	__bn_sqr8x_internal
1173	call	__bn_post4x_internal
1174	call	__bn_sqr8x_internal
1175	call	__bn_post4x_internal
1176	call	__bn_sqr8x_internal
1177	call	__bn_post4x_internal
1178	call	__bn_sqr8x_internal
1179	call	__bn_post4x_internal
1180	call	__bn_sqr8x_internal
1181	call	__bn_post4x_internal
1182
/* Raw encodings: movq %xmm2,%rcx ; movq %xmm4,%rdx  (restore the stashed
 * rcx and rdx for the multiply). */
1183.byte	102,72,15,126,209
1184.byte	102,72,15,126,226
/* Set up mul4x_internal: the destination becomes the current rsi, rax
 * is the saved entry rsp (so 8(%rax) is the stack-passed selector), and
 * r8 points at the frame copy of the constant because mul4x_internal
 * dereferences r8 itself. */
1185	movq	%rsi,%rdi
1186	movq	40(%rsp),%rax
1187	leaq	32(%rsp),%r8
1188
1189	call	mul4x_internal
1190
/* Epilogue: recover the entry rsp, return 1, restore callee-saved
 * registers from just below the entry rsp. */
1191	movq	40(%rsp),%rsi
1192.cfi_def_cfa	%rsi,8
1193	movq	$1,%rax
1194	movq	-48(%rsi),%r15
1195.cfi_restore	%r15
1196	movq	-40(%rsi),%r14
1197.cfi_restore	%r14
1198	movq	-32(%rsi),%r13
1199.cfi_restore	%r13
1200	movq	-24(%rsi),%r12
1201.cfi_restore	%r12
1202	movq	-16(%rsi),%rbp
1203.cfi_restore	%rbp
1204	movq	-8(%rsi),%rbx
1205.cfi_restore	%rbx
1206	leaq	(%rsi),%rsp
1207.cfi_def_cfa_register	%rsp
1208.Lpower5_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding of the return. */
1209	.byte	0xf3,0xc3
1210.cfi_endproc
1211.size	bn_power5,.-bn_power5
1212
1213.globl	bn_sqr8x_internal
1214.hidden	bn_sqr8x_internal
1215.type	bn_sqr8x_internal,@function
1216.align	32
1217bn_sqr8x_internal:
1218__bn_sqr8x_internal:
1219.cfi_startproc
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293	leaq	32(%r10),%rbp
1294	leaq	(%rsi,%r9,1),%rsi
1295
1296	movq	%r9,%rcx
1297
1298
1299	movq	-32(%rsi,%rbp,1),%r14
1300	leaq	48+8(%rsp,%r9,2),%rdi
1301	movq	-24(%rsi,%rbp,1),%rax
1302	leaq	-32(%rdi,%rbp,1),%rdi
1303	movq	-16(%rsi,%rbp,1),%rbx
1304	movq	%rax,%r15
1305
1306	mulq	%r14
1307	movq	%rax,%r10
1308	movq	%rbx,%rax
1309	movq	%rdx,%r11
1310	movq	%r10,-24(%rdi,%rbp,1)
1311
1312	mulq	%r14
1313	addq	%rax,%r11
1314	movq	%rbx,%rax
1315	adcq	$0,%rdx
1316	movq	%r11,-16(%rdi,%rbp,1)
1317	movq	%rdx,%r10
1318
1319
1320	movq	-8(%rsi,%rbp,1),%rbx
1321	mulq	%r15
1322	movq	%rax,%r12
1323	movq	%rbx,%rax
1324	movq	%rdx,%r13
1325
1326	leaq	(%rbp),%rcx
1327	mulq	%r14
1328	addq	%rax,%r10
1329	movq	%rbx,%rax
1330	movq	%rdx,%r11
1331	adcq	$0,%r11
1332	addq	%r12,%r10
1333	adcq	$0,%r11
1334	movq	%r10,-8(%rdi,%rcx,1)
1335	jmp	.Lsqr4x_1st
1336
1337.align	32
1338.Lsqr4x_1st:
1339	movq	(%rsi,%rcx,1),%rbx
1340	mulq	%r15
1341	addq	%rax,%r13
1342	movq	%rbx,%rax
1343	movq	%rdx,%r12
1344	adcq	$0,%r12
1345
1346	mulq	%r14
1347	addq	%rax,%r11
1348	movq	%rbx,%rax
1349	movq	8(%rsi,%rcx,1),%rbx
1350	movq	%rdx,%r10
1351	adcq	$0,%r10
1352	addq	%r13,%r11
1353	adcq	$0,%r10
1354
1355
1356	mulq	%r15
1357	addq	%rax,%r12
1358	movq	%rbx,%rax
1359	movq	%r11,(%rdi,%rcx,1)
1360	movq	%rdx,%r13
1361	adcq	$0,%r13
1362
1363	mulq	%r14
1364	addq	%rax,%r10
1365	movq	%rbx,%rax
1366	movq	16(%rsi,%rcx,1),%rbx
1367	movq	%rdx,%r11
1368	adcq	$0,%r11
1369	addq	%r12,%r10
1370	adcq	$0,%r11
1371
1372	mulq	%r15
1373	addq	%rax,%r13
1374	movq	%rbx,%rax
1375	movq	%r10,8(%rdi,%rcx,1)
1376	movq	%rdx,%r12
1377	adcq	$0,%r12
1378
1379	mulq	%r14
1380	addq	%rax,%r11
1381	movq	%rbx,%rax
1382	movq	24(%rsi,%rcx,1),%rbx
1383	movq	%rdx,%r10
1384	adcq	$0,%r10
1385	addq	%r13,%r11
1386	adcq	$0,%r10
1387
1388
1389	mulq	%r15
1390	addq	%rax,%r12
1391	movq	%rbx,%rax
1392	movq	%r11,16(%rdi,%rcx,1)
1393	movq	%rdx,%r13
1394	adcq	$0,%r13
1395	leaq	32(%rcx),%rcx
1396
1397	mulq	%r14
1398	addq	%rax,%r10
1399	movq	%rbx,%rax
1400	movq	%rdx,%r11
1401	adcq	$0,%r11
1402	addq	%r12,%r10
1403	adcq	$0,%r11
1404	movq	%r10,-8(%rdi,%rcx,1)
1405
1406	cmpq	$0,%rcx
1407	jne	.Lsqr4x_1st
1408
1409	mulq	%r15
1410	addq	%rax,%r13
1411	leaq	16(%rbp),%rbp
1412	adcq	$0,%rdx
1413	addq	%r11,%r13
1414	adcq	$0,%rdx
1415
1416	movq	%r13,(%rdi)
1417	movq	%rdx,%r12
1418	movq	%rdx,8(%rdi)
1419	jmp	.Lsqr4x_outer
1420
1421.align	32
1422.Lsqr4x_outer:
1423	movq	-32(%rsi,%rbp,1),%r14
1424	leaq	48+8(%rsp,%r9,2),%rdi
1425	movq	-24(%rsi,%rbp,1),%rax
1426	leaq	-32(%rdi,%rbp,1),%rdi
1427	movq	-16(%rsi,%rbp,1),%rbx
1428	movq	%rax,%r15
1429
1430	mulq	%r14
1431	movq	-24(%rdi,%rbp,1),%r10
1432	addq	%rax,%r10
1433	movq	%rbx,%rax
1434	adcq	$0,%rdx
1435	movq	%r10,-24(%rdi,%rbp,1)
1436	movq	%rdx,%r11
1437
1438	mulq	%r14
1439	addq	%rax,%r11
1440	movq	%rbx,%rax
1441	adcq	$0,%rdx
1442	addq	-16(%rdi,%rbp,1),%r11
1443	movq	%rdx,%r10
1444	adcq	$0,%r10
1445	movq	%r11,-16(%rdi,%rbp,1)
1446
1447	xorq	%r12,%r12
1448
1449	movq	-8(%rsi,%rbp,1),%rbx
1450	mulq	%r15
1451	addq	%rax,%r12
1452	movq	%rbx,%rax
1453	adcq	$0,%rdx
1454	addq	-8(%rdi,%rbp,1),%r12
1455	movq	%rdx,%r13
1456	adcq	$0,%r13
1457
1458	mulq	%r14
1459	addq	%rax,%r10
1460	movq	%rbx,%rax
1461	adcq	$0,%rdx
1462	addq	%r12,%r10
1463	movq	%rdx,%r11
1464	adcq	$0,%r11
1465	movq	%r10,-8(%rdi,%rbp,1)
1466
1467	leaq	(%rbp),%rcx
1468	jmp	.Lsqr4x_inner
1469
1470.align	32
1471.Lsqr4x_inner:
1472	movq	(%rsi,%rcx,1),%rbx
1473	mulq	%r15
1474	addq	%rax,%r13
1475	movq	%rbx,%rax
1476	movq	%rdx,%r12
1477	adcq	$0,%r12
1478	addq	(%rdi,%rcx,1),%r13
1479	adcq	$0,%r12
1480
1481.byte	0x67
1482	mulq	%r14
1483	addq	%rax,%r11
1484	movq	%rbx,%rax
1485	movq	8(%rsi,%rcx,1),%rbx
1486	movq	%rdx,%r10
1487	adcq	$0,%r10
1488	addq	%r13,%r11
1489	adcq	$0,%r10
1490
1491	mulq	%r15
1492	addq	%rax,%r12
1493	movq	%r11,(%rdi,%rcx,1)
1494	movq	%rbx,%rax
1495	movq	%rdx,%r13
1496	adcq	$0,%r13
1497	addq	8(%rdi,%rcx,1),%r12
1498	leaq	16(%rcx),%rcx
1499	adcq	$0,%r13
1500
1501	mulq	%r14
1502	addq	%rax,%r10
1503	movq	%rbx,%rax
1504	adcq	$0,%rdx
1505	addq	%r12,%r10
1506	movq	%rdx,%r11
1507	adcq	$0,%r11
1508	movq	%r10,-8(%rdi,%rcx,1)
1509
1510	cmpq	$0,%rcx
1511	jne	.Lsqr4x_inner
1512
1513.byte	0x67
1514	mulq	%r15
1515	addq	%rax,%r13
1516	adcq	$0,%rdx
1517	addq	%r11,%r13
1518	adcq	$0,%rdx
1519
1520	movq	%r13,(%rdi)
1521	movq	%rdx,%r12
1522	movq	%rdx,8(%rdi)
1523
1524	addq	$16,%rbp
1525	jnz	.Lsqr4x_outer
1526
1527
1528	movq	-32(%rsi),%r14
1529	leaq	48+8(%rsp,%r9,2),%rdi
1530	movq	-24(%rsi),%rax
1531	leaq	-32(%rdi,%rbp,1),%rdi
1532	movq	-16(%rsi),%rbx
1533	movq	%rax,%r15
1534
1535	mulq	%r14
1536	addq	%rax,%r10
1537	movq	%rbx,%rax
1538	movq	%rdx,%r11
1539	adcq	$0,%r11
1540
1541	mulq	%r14
1542	addq	%rax,%r11
1543	movq	%rbx,%rax
1544	movq	%r10,-24(%rdi)
1545	movq	%rdx,%r10
1546	adcq	$0,%r10
1547	addq	%r13,%r11
1548	movq	-8(%rsi),%rbx
1549	adcq	$0,%r10
1550
1551	mulq	%r15
1552	addq	%rax,%r12
1553	movq	%rbx,%rax
1554	movq	%r11,-16(%rdi)
1555	movq	%rdx,%r13
1556	adcq	$0,%r13
1557
1558	mulq	%r14
1559	addq	%rax,%r10
1560	movq	%rbx,%rax
1561	movq	%rdx,%r11
1562	adcq	$0,%r11
1563	addq	%r12,%r10
1564	adcq	$0,%r11
1565	movq	%r10,-8(%rdi)
1566
1567	mulq	%r15
1568	addq	%rax,%r13
1569	movq	-16(%rsi),%rax
1570	adcq	$0,%rdx
1571	addq	%r11,%r13
1572	adcq	$0,%rdx
1573
1574	movq	%r13,(%rdi)
1575	movq	%rdx,%r12
1576	movq	%rdx,8(%rdi)
1577
1578	mulq	%rbx
1579	addq	$16,%rbp
1580	xorq	%r14,%r14
1581	subq	%r9,%rbp
1582	xorq	%r15,%r15
1583
1584	addq	%r12,%rax
1585	adcq	$0,%rdx
1586	movq	%rax,8(%rdi)
1587	movq	%rdx,16(%rdi)
1588	movq	%r15,24(%rdi)
1589
1590	movq	-16(%rsi,%rbp,1),%rax
1591	leaq	48+8(%rsp),%rdi
1592	xorq	%r10,%r10
1593	movq	8(%rdi),%r11
1594
1595	leaq	(%r14,%r10,2),%r12
1596	shrq	$63,%r10
1597	leaq	(%rcx,%r11,2),%r13
1598	shrq	$63,%r11
1599	orq	%r10,%r13
1600	movq	16(%rdi),%r10
1601	movq	%r11,%r14
1602	mulq	%rax
1603	negq	%r15
1604	movq	24(%rdi),%r11
1605	adcq	%rax,%r12
1606	movq	-8(%rsi,%rbp,1),%rax
1607	movq	%r12,(%rdi)
1608	adcq	%rdx,%r13
1609
1610	leaq	(%r14,%r10,2),%rbx
1611	movq	%r13,8(%rdi)
1612	sbbq	%r15,%r15
1613	shrq	$63,%r10
1614	leaq	(%rcx,%r11,2),%r8
1615	shrq	$63,%r11
1616	orq	%r10,%r8
1617	movq	32(%rdi),%r10
1618	movq	%r11,%r14
1619	mulq	%rax
1620	negq	%r15
1621	movq	40(%rdi),%r11
1622	adcq	%rax,%rbx
1623	movq	0(%rsi,%rbp,1),%rax
1624	movq	%rbx,16(%rdi)
1625	adcq	%rdx,%r8
1626	leaq	16(%rbp),%rbp
1627	movq	%r8,24(%rdi)
1628	sbbq	%r15,%r15
1629	leaq	64(%rdi),%rdi
1630	jmp	.Lsqr4x_shift_n_add
1631
1632.align	32
1633.Lsqr4x_shift_n_add:
1634	leaq	(%r14,%r10,2),%r12
1635	shrq	$63,%r10
1636	leaq	(%rcx,%r11,2),%r13
1637	shrq	$63,%r11
1638	orq	%r10,%r13
1639	movq	-16(%rdi),%r10
1640	movq	%r11,%r14
1641	mulq	%rax
1642	negq	%r15
1643	movq	-8(%rdi),%r11
1644	adcq	%rax,%r12
1645	movq	-8(%rsi,%rbp,1),%rax
1646	movq	%r12,-32(%rdi)
1647	adcq	%rdx,%r13
1648
1649	leaq	(%r14,%r10,2),%rbx
1650	movq	%r13,-24(%rdi)
1651	sbbq	%r15,%r15
1652	shrq	$63,%r10
1653	leaq	(%rcx,%r11,2),%r8
1654	shrq	$63,%r11
1655	orq	%r10,%r8
1656	movq	0(%rdi),%r10
1657	movq	%r11,%r14
1658	mulq	%rax
1659	negq	%r15
1660	movq	8(%rdi),%r11
1661	adcq	%rax,%rbx
1662	movq	0(%rsi,%rbp,1),%rax
1663	movq	%rbx,-16(%rdi)
1664	adcq	%rdx,%r8
1665
1666	leaq	(%r14,%r10,2),%r12
1667	movq	%r8,-8(%rdi)
1668	sbbq	%r15,%r15
1669	shrq	$63,%r10
1670	leaq	(%rcx,%r11,2),%r13
1671	shrq	$63,%r11
1672	orq	%r10,%r13
1673	movq	16(%rdi),%r10
1674	movq	%r11,%r14
1675	mulq	%rax
1676	negq	%r15
1677	movq	24(%rdi),%r11
1678	adcq	%rax,%r12
1679	movq	8(%rsi,%rbp,1),%rax
1680	movq	%r12,0(%rdi)
1681	adcq	%rdx,%r13
1682
1683	leaq	(%r14,%r10,2),%rbx
1684	movq	%r13,8(%rdi)
1685	sbbq	%r15,%r15
1686	shrq	$63,%r10
1687	leaq	(%rcx,%r11,2),%r8
1688	shrq	$63,%r11
1689	orq	%r10,%r8
1690	movq	32(%rdi),%r10
1691	movq	%r11,%r14
1692	mulq	%rax
1693	negq	%r15
1694	movq	40(%rdi),%r11
1695	adcq	%rax,%rbx
1696	movq	16(%rsi,%rbp,1),%rax
1697	movq	%rbx,16(%rdi)
1698	adcq	%rdx,%r8
1699	movq	%r8,24(%rdi)
1700	sbbq	%r15,%r15
1701	leaq	64(%rdi),%rdi
1702	addq	$32,%rbp
1703	jnz	.Lsqr4x_shift_n_add
1704
1705	leaq	(%r14,%r10,2),%r12
1706.byte	0x67
1707	shrq	$63,%r10
1708	leaq	(%rcx,%r11,2),%r13
1709	shrq	$63,%r11
1710	orq	%r10,%r13
1711	movq	-16(%rdi),%r10
1712	movq	%r11,%r14
1713	mulq	%rax
1714	negq	%r15
1715	movq	-8(%rdi),%r11
1716	adcq	%rax,%r12
1717	movq	-8(%rsi),%rax
1718	movq	%r12,-32(%rdi)
1719	adcq	%rdx,%r13
1720
1721	leaq	(%r14,%r10,2),%rbx
1722	movq	%r13,-24(%rdi)
1723	sbbq	%r15,%r15
1724	shrq	$63,%r10
1725	leaq	(%rcx,%r11,2),%r8
1726	shrq	$63,%r11
1727	orq	%r10,%r8
1728	mulq	%rax
1729	negq	%r15
1730	adcq	%rax,%rbx
1731	adcq	%rdx,%r8
1732	movq	%rbx,-16(%rdi)
1733	movq	%r8,-8(%rdi)
1734.byte	102,72,15,126,213
1735__bn_sqr8x_reduction:
1736	xorq	%rax,%rax
1737	leaq	(%r9,%rbp,1),%rcx
1738	leaq	48+8(%rsp,%r9,2),%rdx
1739	movq	%rcx,0+8(%rsp)
1740	leaq	48+8(%rsp,%r9,1),%rdi
1741	movq	%rdx,8+8(%rsp)
1742	negq	%r9
1743	jmp	.L8x_reduction_loop
1744
1745.align	32
1746.L8x_reduction_loop:
1747	leaq	(%rdi,%r9,1),%rdi
1748.byte	0x66
1749	movq	0(%rdi),%rbx
1750	movq	8(%rdi),%r9
1751	movq	16(%rdi),%r10
1752	movq	24(%rdi),%r11
1753	movq	32(%rdi),%r12
1754	movq	40(%rdi),%r13
1755	movq	48(%rdi),%r14
1756	movq	56(%rdi),%r15
1757	movq	%rax,(%rdx)
1758	leaq	64(%rdi),%rdi
1759
1760.byte	0x67
1761	movq	%rbx,%r8
1762	imulq	32+8(%rsp),%rbx
1763	movq	0(%rbp),%rax
1764	movl	$8,%ecx
1765	jmp	.L8x_reduce
1766
1767.align	32
1768.L8x_reduce:
1769	mulq	%rbx
1770	movq	8(%rbp),%rax
1771	negq	%r8
1772	movq	%rdx,%r8
1773	adcq	$0,%r8
1774
1775	mulq	%rbx
1776	addq	%rax,%r9
1777	movq	16(%rbp),%rax
1778	adcq	$0,%rdx
1779	addq	%r9,%r8
1780	movq	%rbx,48-8+8(%rsp,%rcx,8)
1781	movq	%rdx,%r9
1782	adcq	$0,%r9
1783
1784	mulq	%rbx
1785	addq	%rax,%r10
1786	movq	24(%rbp),%rax
1787	adcq	$0,%rdx
1788	addq	%r10,%r9
1789	movq	32+8(%rsp),%rsi
1790	movq	%rdx,%r10
1791	adcq	$0,%r10
1792
1793	mulq	%rbx
1794	addq	%rax,%r11
1795	movq	32(%rbp),%rax
1796	adcq	$0,%rdx
1797	imulq	%r8,%rsi
1798	addq	%r11,%r10
1799	movq	%rdx,%r11
1800	adcq	$0,%r11
1801
1802	mulq	%rbx
1803	addq	%rax,%r12
1804	movq	40(%rbp),%rax
1805	adcq	$0,%rdx
1806	addq	%r12,%r11
1807	movq	%rdx,%r12
1808	adcq	$0,%r12
1809
1810	mulq	%rbx
1811	addq	%rax,%r13
1812	movq	48(%rbp),%rax
1813	adcq	$0,%rdx
1814	addq	%r13,%r12
1815	movq	%rdx,%r13
1816	adcq	$0,%r13
1817
1818	mulq	%rbx
1819	addq	%rax,%r14
1820	movq	56(%rbp),%rax
1821	adcq	$0,%rdx
1822	addq	%r14,%r13
1823	movq	%rdx,%r14
1824	adcq	$0,%r14
1825
1826	mulq	%rbx
1827	movq	%rsi,%rbx
1828	addq	%rax,%r15
1829	movq	0(%rbp),%rax
1830	adcq	$0,%rdx
1831	addq	%r15,%r14
1832	movq	%rdx,%r15
1833	adcq	$0,%r15
1834
1835	decl	%ecx
1836	jnz	.L8x_reduce
1837
1838	leaq	64(%rbp),%rbp
1839	xorq	%rax,%rax
1840	movq	8+8(%rsp),%rdx
1841	cmpq	0+8(%rsp),%rbp
1842	jae	.L8x_no_tail
1843
1844.byte	0x66
1845	addq	0(%rdi),%r8
1846	adcq	8(%rdi),%r9
1847	adcq	16(%rdi),%r10
1848	adcq	24(%rdi),%r11
1849	adcq	32(%rdi),%r12
1850	adcq	40(%rdi),%r13
1851	adcq	48(%rdi),%r14
1852	adcq	56(%rdi),%r15
1853	sbbq	%rsi,%rsi
1854
1855	movq	48+56+8(%rsp),%rbx
1856	movl	$8,%ecx
1857	movq	0(%rbp),%rax
1858	jmp	.L8x_tail
1859
1860.align	32
1861.L8x_tail:
1862	mulq	%rbx
1863	addq	%rax,%r8
1864	movq	8(%rbp),%rax
1865	movq	%r8,(%rdi)
1866	movq	%rdx,%r8
1867	adcq	$0,%r8
1868
1869	mulq	%rbx
1870	addq	%rax,%r9
1871	movq	16(%rbp),%rax
1872	adcq	$0,%rdx
1873	addq	%r9,%r8
1874	leaq	8(%rdi),%rdi
1875	movq	%rdx,%r9
1876	adcq	$0,%r9
1877
1878	mulq	%rbx
1879	addq	%rax,%r10
1880	movq	24(%rbp),%rax
1881	adcq	$0,%rdx
1882	addq	%r10,%r9
1883	movq	%rdx,%r10
1884	adcq	$0,%r10
1885
1886	mulq	%rbx
1887	addq	%rax,%r11
1888	movq	32(%rbp),%rax
1889	adcq	$0,%rdx
1890	addq	%r11,%r10
1891	movq	%rdx,%r11
1892	adcq	$0,%r11
1893
1894	mulq	%rbx
1895	addq	%rax,%r12
1896	movq	40(%rbp),%rax
1897	adcq	$0,%rdx
1898	addq	%r12,%r11
1899	movq	%rdx,%r12
1900	adcq	$0,%r12
1901
1902	mulq	%rbx
1903	addq	%rax,%r13
1904	movq	48(%rbp),%rax
1905	adcq	$0,%rdx
1906	addq	%r13,%r12
1907	movq	%rdx,%r13
1908	adcq	$0,%r13
1909
1910	mulq	%rbx
1911	addq	%rax,%r14
1912	movq	56(%rbp),%rax
1913	adcq	$0,%rdx
1914	addq	%r14,%r13
1915	movq	%rdx,%r14
1916	adcq	$0,%r14
1917
1918	mulq	%rbx
1919	movq	48-16+8(%rsp,%rcx,8),%rbx
1920	addq	%rax,%r15
1921	adcq	$0,%rdx
1922	addq	%r15,%r14
1923	movq	0(%rbp),%rax
1924	movq	%rdx,%r15
1925	adcq	$0,%r15
1926
1927	decl	%ecx
1928	jnz	.L8x_tail
1929
1930	leaq	64(%rbp),%rbp
1931	movq	8+8(%rsp),%rdx
1932	cmpq	0+8(%rsp),%rbp
1933	jae	.L8x_tail_done
1934
1935	movq	48+56+8(%rsp),%rbx
1936	negq	%rsi
1937	movq	0(%rbp),%rax
1938	adcq	0(%rdi),%r8
1939	adcq	8(%rdi),%r9
1940	adcq	16(%rdi),%r10
1941	adcq	24(%rdi),%r11
1942	adcq	32(%rdi),%r12
1943	adcq	40(%rdi),%r13
1944	adcq	48(%rdi),%r14
1945	adcq	56(%rdi),%r15
1946	sbbq	%rsi,%rsi
1947
1948	movl	$8,%ecx
1949	jmp	.L8x_tail
1950
1951.align	32
1952.L8x_tail_done:
1953	xorq	%rax,%rax
1954	addq	(%rdx),%r8
1955	adcq	$0,%r9
1956	adcq	$0,%r10
1957	adcq	$0,%r11
1958	adcq	$0,%r12
1959	adcq	$0,%r13
1960	adcq	$0,%r14
1961	adcq	$0,%r15
1962	adcq	$0,%rax
1963
1964	negq	%rsi
1965.L8x_no_tail:
1966	adcq	0(%rdi),%r8
1967	adcq	8(%rdi),%r9
1968	adcq	16(%rdi),%r10
1969	adcq	24(%rdi),%r11
1970	adcq	32(%rdi),%r12
1971	adcq	40(%rdi),%r13
1972	adcq	48(%rdi),%r14
1973	adcq	56(%rdi),%r15
1974	adcq	$0,%rax
1975	movq	-8(%rbp),%rcx
1976	xorq	%rsi,%rsi
1977
1978.byte	102,72,15,126,213
1979
1980	movq	%r8,0(%rdi)
1981	movq	%r9,8(%rdi)
1982.byte	102,73,15,126,217
1983	movq	%r10,16(%rdi)
1984	movq	%r11,24(%rdi)
1985	movq	%r12,32(%rdi)
1986	movq	%r13,40(%rdi)
1987	movq	%r14,48(%rdi)
1988	movq	%r15,56(%rdi)
1989	leaq	64(%rdi),%rdi
1990
1991	cmpq	%rdx,%rdi
1992	jb	.L8x_reduction_loop
1993	.byte	0xf3,0xc3
1994.cfi_endproc
1995.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal: masked final subtraction, four 64-bit words per pass.
 *
 * For every word i it stores  src[i] + (~sub[i] & mask) + carry  to the
 * output, where
 *   src  = (%rdi on entry) + %r9           (pointer kept in %rbx)
 *   sub  = %rbp                            (advanced by 32 bytes per pass)
 *   mask = -%rax
 *   out  = pointer restored from %xmm1     (%rdi; also copied to %rsi)
 * The very first word of sub is decremented before being complemented, and
 * ~(x-1) == -x, so with an all-ones mask the chain adds the two's-complement
 * negation of sub, i.e. subtracts it.  With mask == 0 src is copied unchanged.
 *
 * NOTE(review): %rax is presumably a 0/1 top-carry flag from the caller and
 * %rbp presumably points at the modulus - confirm against the callers.
 * %r9 must be negative (a negated byte count, multiple of 32): the pass
 * counter is %r9 >> 5 and is incremented until it reaches zero.
 * On exit %r10 = entry %r9 and %r9 = -(entry %r9).
 */
1996.type	__bn_post4x_internal,@function
1997.align	32
1998__bn_post4x_internal:
1999.cfi_startproc
2000	movq	0(%rbp),%r12
2001	leaq	(%rdi,%r9,1),%rbx	/* src pointer, taken before %rdi is reloaded */
2002	movq	%r9,%rcx
2003.byte	102,72,15,126,207	/* movq %xmm1,%rdi : output pointer */
2004	negq	%rax	/* build the select mask */
2005.byte	102,72,15,126,206	/* movq %xmm1,%rsi */
2006	sarq	$3+2,%rcx	/* bytes -> (negative) count of 32-byte groups */
2007	decq	%r12	/* ~(x-1) == -x supplies the +1 of the negation */
2008	xorq	%r10,%r10	/* no incoming carry */
2009	movq	8(%rbp),%r13
2010	movq	16(%rbp),%r14
2011	movq	24(%rbp),%r15
2012	jmp	.Lsqr4x_sub_entry
2013
2014.align	16
2015.Lsqr4x_sub:
2016	movq	0(%rbp),%r12
2017	movq	8(%rbp),%r13
2018	movq	16(%rbp),%r14
2019	movq	24(%rbp),%r15
2020.Lsqr4x_sub_entry:
2021	leaq	32(%rbp),%rbp
2022	notq	%r12
2023	notq	%r13
2024	notq	%r14
2025	notq	%r15
2026	andq	%rax,%r12
2027	andq	%rax,%r13
2028	andq	%rax,%r14
2029	andq	%rax,%r15
2030
2031	negq	%r10	/* CF = carry saved by the previous pass (%r10 is 0 or -1) */
2032	adcq	0(%rbx),%r12
2033	adcq	8(%rbx),%r13
2034	adcq	16(%rbx),%r14
2035	adcq	24(%rbx),%r15
2036	movq	%r12,0(%rdi)
2037	leaq	32(%rbx),%rbx
2038	movq	%r13,8(%rdi)
2039	sbbq	%r10,%r10	/* save CF as 0 / -1 across the flag-clobbering incq */
2040	movq	%r14,16(%rdi)
2041	movq	%r15,24(%rdi)
2042	leaq	32(%rdi),%rdi
2043
2044	incq	%rcx
2045	jnz	.Lsqr4x_sub
2046
2047	movq	%r9,%r10
2048	negq	%r9
2049	.byte	0xf3,0xc3	/* rep ret */
2050.cfi_endproc
2051.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_mulx4x_mont_gather5: stack-frame wrapper around mulx4x_internal.
 *
 * Saves the callee-saved registers, converts %r9d from words to bytes
 * (x8), replaces %r8 by the 64-bit value it points to, carves an aligned
 * scratch frame below the stack pointer, records the loaded %r8 value and
 * the entry %rsp in that frame, and calls mulx4x_internal.  Returns 1 in
 * %rax after restoring registers and the stack pointer.
 *
 * Frame slots written here (offsets from the new %rsp):
 *   32(%rsp) = value loaded from (%r8)
 *   40(%rsp) = %rsp as it was on entry (kept in %rax until then)
 * NOTE(review): %rdi/%rsi/%rdx/%rcx are passed through untouched to
 * mulx4x_internal; presumably rp, ap, table, np, and (%r8) is n0 - confirm
 * against the C prototype.
 */
2052.type	bn_mulx4x_mont_gather5,@function
2053.align	32
2054bn_mulx4x_mont_gather5:
2055.cfi_startproc
2056	movq	%rsp,%rax	/* keep the entry stack pointer for the epilogue */
2057.cfi_def_cfa_register	%rax
2058.Lmulx4x_enter:
2059	pushq	%rbx
2060.cfi_offset	%rbx,-16
2061	pushq	%rbp
2062.cfi_offset	%rbp,-24
2063	pushq	%r12
2064.cfi_offset	%r12,-32
2065	pushq	%r13
2066.cfi_offset	%r13,-40
2067	pushq	%r14
2068.cfi_offset	%r14,-48
2069	pushq	%r15
2070.cfi_offset	%r15,-56
2071.Lmulx4x_prologue:
2072
2073	shll	$3,%r9d	/* word count -> byte count (upper 32 bits cleared) */
2074	leaq	(%r9,%r9,2),%r10	/* 3 * byte count */
2075	negq	%r9	/* %r9 stays negated for mulx4x_internal */
2076	movq	(%r8),%r8
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
/*
 * Choose the frame base in %rbp: 320 + 2*bytes below %rsp, shifted further
 * down by an amount derived from the distance to %rdi modulo 4096.
 * NOTE(review): presumably this avoids the frame and the output buffer
 * aliasing at page granularity - confirm with the generator's comments.
 */
2087	leaq	-320(%rsp,%r9,2),%r11
2088	movq	%rsp,%rbp
2089	subq	%rdi,%r11
2090	andq	$4095,%r11
2091	cmpq	%r11,%r10
2092	jb	.Lmulx4xsp_alt
2093	subq	%r11,%rbp
2094	leaq	-320(%rbp,%r9,2),%rbp
2095	jmp	.Lmulx4xsp_done
2096
2097.Lmulx4xsp_alt:
2098	leaq	4096-320(,%r9,2),%r10
2099	leaq	-320(%rbp,%r9,2),%rbp
2100	subq	%r10,%r11
2101	movq	$0,%r10	/* mov (not xor) keeps CF from the subq for cmovc */
2102	cmovcq	%r10,%r11
2103	subq	%r11,%rbp
2104.Lmulx4xsp_done:
2105	andq	$-64,%rbp	/* 64-byte align the frame */
/* Move %rsp down to the frame, reading one word in every 4096-byte page on the way. */
2106	movq	%rsp,%r11
2107	subq	%rbp,%r11
2108	andq	$-4096,%r11
2109	leaq	(%r11,%rbp,1),%rsp
2110	movq	(%rsp),%r10
2111	cmpq	%rbp,%rsp
2112	ja	.Lmulx4x_page_walk
2113	jmp	.Lmulx4x_page_walk_done
2114
2115.Lmulx4x_page_walk:
2116	leaq	-4096(%rsp),%rsp
2117	movq	(%rsp),%r10
2118	cmpq	%rbp,%rsp
2119	ja	.Lmulx4x_page_walk
2120.Lmulx4x_page_walk_done:
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134	movq	%r8,32(%rsp)
2135	movq	%rax,40(%rsp)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8 */
2136.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2137.Lmulx4x_body:
2138	call	mulx4x_internal
2139
2140	movq	40(%rsp),%rsi	/* entry stack pointer */
2141.cfi_def_cfa	%rsi,8
2142	movq	$1,%rax	/* return value */
2143
2144	movq	-48(%rsi),%r15
2145.cfi_restore	%r15
2146	movq	-40(%rsi),%r14
2147.cfi_restore	%r14
2148	movq	-32(%rsi),%r13
2149.cfi_restore	%r13
2150	movq	-24(%rsi),%r12
2151.cfi_restore	%r12
2152	movq	-16(%rsi),%rbp
2153.cfi_restore	%rbp
2154	movq	-8(%rsi),%rbx
2155.cfi_restore	%rbx
2156	leaq	(%rsi),%rsp
2157.cfi_def_cfa_register	%rsp
2158.Lmulx4x_epilogue:
2159	.byte	0xf3,0xc3	/* rep ret */
2160.cfi_endproc
2161.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2162
/*
 * mulx4x_internal: MULX/ADCX/ADOX multiply-and-reduce core, four words per
 * step, with a masked gather of one 64-bit word per outer pass from a
 * 256-byte-per-row table.
 *
 * Reached by `call`, so every "N+8(%rsp)" below addresses slot N of the
 * caller's frame.  Register state expected on entry, as set up by the
 * callers visible in this file:
 *   %r9  = negated length in bytes
 *   %rax = the public entry point's original %rsp; 8(%rax) is read as a
 *          32-bit gather index
 *   %rdx = table base, %rdi = output pointer (parked in slot 56)
 *   %rsi, %rcx = the two word vectors multiplied against
 *          (NOTE(review): presumably a[] and the modulus n[] - confirm)
 *   32+8(%rsp) = per-word reduction multiplier (presumably n0)
 * Frame slots written here: 0 = entry %r9, 8 = current table row pointer,
 * 16 = table end pointer, 24 = inner loop count, 56 = output pointer.
 *
 * The routine does not return by itself: it ends by jumping to
 * .Lsqrx4x_sub_entry, which is defined outside this routine.
 */
2163.type	mulx4x_internal,@function
2164.align	32
2165mulx4x_internal:
2166.cfi_startproc
2167	movq	%r9,8(%rsp)	/* slot 0: negated byte length */
2168	movq	%r9,%r10
2169	negq	%r9
2170	shlq	$5,%r9	/* bytes * 32 */
2171	negq	%r10
2172	leaq	128(%rdx,%r9,1),%r13	/* table base + 32*bytes, biased by +128 like %rdi below */
2173	shrq	$5+5,%r9	/* back to bytes / 32 = number of 4-word groups */
2174	movd	8(%rax),%xmm5	/* 32-bit gather index */
2175	subq	$1,%r9	/* one group is handled outside the inner loops */
2176	leaq	.Linc(%rip),%rax
2177	movq	%r13,16+8(%rsp)
2178	movq	%r9,24+8(%rsp)
2179	movq	%rdi,56+8(%rsp)
2180	movdqa	0(%rax),%xmm0
2181	movdqa	16(%rax),%xmm1
2182	leaq	88-112(%rsp,%r10,1),%r10	/* mask area base, biased by -112 */
2183	leaq	128(%rdx),%rdi	/* row pointer biased by +128: offsets -128..112 span 256 bytes */
2184
/*
 * Build sixteen 16-byte compare masks at 112(%r10)..352(%r10): counters
 * start from the first .Linc vector and are stepped with paddd, each one is
 * compared (pcmpeqd) with the broadcast index.  .Linc is defined elsewhere
 * in the file.  The last four masks also stay live in %xmm0-%xmm3.
 */
2185	pshufd	$0,%xmm5,%xmm5	/* broadcast the index to all four dwords */
2186	movdqa	%xmm1,%xmm4	/* %xmm4 = second .Linc vector, reused as the seed of every counter */
2187.byte	0x67
2188	movdqa	%xmm1,%xmm2
2189.byte	0x67
2190	paddd	%xmm0,%xmm1
2191	pcmpeqd	%xmm5,%xmm0
2192	movdqa	%xmm4,%xmm3
2193	paddd	%xmm1,%xmm2
2194	pcmpeqd	%xmm5,%xmm1
2195	movdqa	%xmm0,112(%r10)
2196	movdqa	%xmm4,%xmm0
2197
2198	paddd	%xmm2,%xmm3
2199	pcmpeqd	%xmm5,%xmm2
2200	movdqa	%xmm1,128(%r10)
2201	movdqa	%xmm4,%xmm1
2202
2203	paddd	%xmm3,%xmm0
2204	pcmpeqd	%xmm5,%xmm3
2205	movdqa	%xmm2,144(%r10)
2206	movdqa	%xmm4,%xmm2
2207
2208	paddd	%xmm0,%xmm1
2209	pcmpeqd	%xmm5,%xmm0
2210	movdqa	%xmm3,160(%r10)
2211	movdqa	%xmm4,%xmm3
2212	paddd	%xmm1,%xmm2
2213	pcmpeqd	%xmm5,%xmm1
2214	movdqa	%xmm0,176(%r10)
2215	movdqa	%xmm4,%xmm0
2216
2217	paddd	%xmm2,%xmm3
2218	pcmpeqd	%xmm5,%xmm2
2219	movdqa	%xmm1,192(%r10)
2220	movdqa	%xmm4,%xmm1
2221
2222	paddd	%xmm3,%xmm0
2223	pcmpeqd	%xmm5,%xmm3
2224	movdqa	%xmm2,208(%r10)
2225	movdqa	%xmm4,%xmm2
2226
2227	paddd	%xmm0,%xmm1
2228	pcmpeqd	%xmm5,%xmm0
2229	movdqa	%xmm3,224(%r10)
2230	movdqa	%xmm4,%xmm3
2231	paddd	%xmm1,%xmm2
2232	pcmpeqd	%xmm5,%xmm1
2233	movdqa	%xmm0,240(%r10)
2234	movdqa	%xmm4,%xmm0
2235
2236	paddd	%xmm2,%xmm3
2237	pcmpeqd	%xmm5,%xmm2
2238	movdqa	%xmm1,256(%r10)
2239	movdqa	%xmm4,%xmm1
2240
2241	paddd	%xmm3,%xmm0
2242	pcmpeqd	%xmm5,%xmm3
2243	movdqa	%xmm2,272(%r10)
2244	movdqa	%xmm4,%xmm2
2245
2246	paddd	%xmm0,%xmm1
2247	pcmpeqd	%xmm5,%xmm0
2248	movdqa	%xmm3,288(%r10)
2249	movdqa	%xmm4,%xmm3
2250.byte	0x67
2251	paddd	%xmm1,%xmm2
2252	pcmpeqd	%xmm5,%xmm1
2253	movdqa	%xmm0,304(%r10)
2254
2255	paddd	%xmm2,%xmm3
2256	pcmpeqd	%xmm5,%xmm2
2257	movdqa	%xmm1,320(%r10)
2258
2259	pcmpeqd	%xmm5,%xmm3
2260	movdqa	%xmm2,336(%r10)
2261
/*
 * Gather the first word: AND every 16-byte lane of the 256-byte row with
 * its mask and OR the results.  All sixteen lanes are read whatever the
 * index is.  The last four lanes use the masks still held in registers.
 */
2262	pand	64(%rdi),%xmm0
2263	pand	80(%rdi),%xmm1
2264	pand	96(%rdi),%xmm2
2265	movdqa	%xmm3,352(%r10)
2266	pand	112(%rdi),%xmm3
2267	por	%xmm2,%xmm0
2268	por	%xmm3,%xmm1
2269	movdqa	-128(%rdi),%xmm4
2270	movdqa	-112(%rdi),%xmm5
2271	movdqa	-96(%rdi),%xmm2
2272	pand	112(%r10),%xmm4
2273	movdqa	-80(%rdi),%xmm3
2274	pand	128(%r10),%xmm5
2275	por	%xmm4,%xmm0
2276	pand	144(%r10),%xmm2
2277	por	%xmm5,%xmm1
2278	pand	160(%r10),%xmm3
2279	por	%xmm2,%xmm0
2280	por	%xmm3,%xmm1
2281	movdqa	-64(%rdi),%xmm4
2282	movdqa	-48(%rdi),%xmm5
2283	movdqa	-32(%rdi),%xmm2
2284	pand	176(%r10),%xmm4
2285	movdqa	-16(%rdi),%xmm3
2286	pand	192(%r10),%xmm5
2287	por	%xmm4,%xmm0
2288	pand	208(%r10),%xmm2
2289	por	%xmm5,%xmm1
2290	pand	224(%r10),%xmm3
2291	por	%xmm2,%xmm0
2292	por	%xmm3,%xmm1
2293	movdqa	0(%rdi),%xmm4
2294	movdqa	16(%rdi),%xmm5
2295	movdqa	32(%rdi),%xmm2
2296	pand	240(%r10),%xmm4
2297	movdqa	48(%rdi),%xmm3
2298	pand	256(%r10),%xmm5
2299	por	%xmm4,%xmm0
2300	pand	272(%r10),%xmm2
2301	por	%xmm5,%xmm1
2302	pand	288(%r10),%xmm3
2303	por	%xmm2,%xmm0
2304	por	%xmm3,%xmm1
/* NOTE(review): pxor equals por here only while the two accumulators hold disjoint bits (at most one mask set); the outer-loop gather below uses por. */
2305	pxor	%xmm1,%xmm0
2306	pshufd	$0x4e,%xmm0,%xmm1	/* swap the two 64-bit halves */
2307	por	%xmm1,%xmm0	/* fold both halves into the low qword */
2308	leaq	256(%rdi),%rdi	/* next table row */
2309.byte	102,72,15,126,194	/* movq %xmm0,%rdx : gathered word becomes the MULX multiplier */
2310	leaq	64+32+8(%rsp),%rbx	/* accumulator cursor in the frame */
2311
/* First pass, first four words: products with the gathered word, then the reduction multiplier in %r8. */
2312	movq	%rdx,%r9	/* keep the gathered word; %rdx alternates with %r8 */
2313	mulxq	0(%rsi),%r8,%rax
2314	mulxq	8(%rsi),%r11,%r12
2315	addq	%rax,%r11
2316	mulxq	16(%rsi),%rax,%r13
2317	adcq	%rax,%r12
2318	adcq	$0,%r13
2319	mulxq	24(%rsi),%rax,%r14
2320
2321	movq	%r8,%r15
2322	imulq	32+8(%rsp),%r8	/* low word * frame slot 32 */
2323	xorq	%rbp,%rbp	/* %rbp = 0, also clears CF and OF for the two carry chains */
2324	movq	%r8,%rdx
2325
2326	movq	%rdi,8+8(%rsp)	/* park the table row pointer; %rdi becomes the loop counter */
2327
2328	leaq	32(%rsi),%rsi
2329	adcxq	%rax,%r13
2330	adcxq	%rbp,%r14
2331
2332	mulxq	0(%rcx),%rax,%r10
2333	adcxq	%rax,%r15
2334	adoxq	%r11,%r10
2335	mulxq	8(%rcx),%rax,%r11
2336	adcxq	%rax,%r10
2337	adoxq	%r12,%r11
2338	mulxq	16(%rcx),%rax,%r12
2339	movq	24+8(%rsp),%rdi	/* inner loop count */
2340	movq	%r10,-32(%rbx)
2341	adcxq	%rax,%r11
2342	adoxq	%r13,%r12
2343	mulxq	24(%rcx),%rax,%r15
2344	movq	%r9,%rdx
2345	movq	%r11,-24(%rbx)
2346	adcxq	%rax,%r12
2347	adoxq	%rbp,%r15
2348	leaq	32(%rcx),%rcx
2349	movq	%r12,-16(%rbx)
2350	jmp	.Lmulx4x_1st
2351
/* First pass, remaining groups: four words of (%rsi) and (%rcx) per iteration, %rdi iterations. */
2352.align	32
2353.Lmulx4x_1st:
2354	adcxq	%rbp,%r15
2355	mulxq	0(%rsi),%r10,%rax
2356	adcxq	%r14,%r10
2357	mulxq	8(%rsi),%r11,%r14
2358	adcxq	%rax,%r11
2359	mulxq	16(%rsi),%r12,%rax
2360	adcxq	%r14,%r12
2361	mulxq	24(%rsi),%r13,%r14
2362.byte	0x67,0x67
2363	movq	%r8,%rdx
2364	adcxq	%rax,%r13
2365	adcxq	%rbp,%r14
2366	leaq	32(%rsi),%rsi
2367	leaq	32(%rbx),%rbx
2368
2369	adoxq	%r15,%r10
2370	mulxq	0(%rcx),%rax,%r15
2371	adcxq	%rax,%r10
2372	adoxq	%r15,%r11
2373	mulxq	8(%rcx),%rax,%r15
2374	adcxq	%rax,%r11
2375	adoxq	%r15,%r12
2376	mulxq	16(%rcx),%rax,%r15
2377	movq	%r10,-40(%rbx)
2378	adcxq	%rax,%r12
2379	movq	%r11,-32(%rbx)
2380	adoxq	%r15,%r13
2381	mulxq	24(%rcx),%rax,%r15
2382	movq	%r9,%rdx
2383	movq	%r12,-24(%rbx)
2384	adcxq	%rax,%r13
2385	adoxq	%rbp,%r15
2386	leaq	32(%rcx),%rcx
2387	movq	%r13,-16(%rbx)
2388
2389	decq	%rdi
2390	jnz	.Lmulx4x_1st
2391
2392	movq	8(%rsp),%rax	/* slot 0: negated byte length */
2393	adcq	%rbp,%r15
2394	leaq	(%rsi,%rax,1),%rsi	/* rewind %rsi to the start of its vector */
2395	addq	%r15,%r14
2396	movq	8+8(%rsp),%rdi	/* table row pointer */
2397	adcq	%rbp,%rbp	/* top carry of this pass, 0 or 1 */
2398	movq	%r14,-8(%rbx)
2399	jmp	.Lmulx4x_outer
2400
/*
 * Outer loop: gather the next table word with the masks built above, which
 * now sit at 16(%rbx)..256(%rbx) (addressed as 256..496 off %r10), then
 * multiply-accumulate it into the frame vector and reduce again.
 */
2401.align	32
2402.Lmulx4x_outer:
2403	leaq	16-256(%rbx),%r10
2404	pxor	%xmm4,%xmm4
2405.byte	0x67,0x67
2406	pxor	%xmm5,%xmm5
2407	movdqa	-128(%rdi),%xmm0
2408	movdqa	-112(%rdi),%xmm1
2409	movdqa	-96(%rdi),%xmm2
2410	pand	256(%r10),%xmm0
2411	movdqa	-80(%rdi),%xmm3
2412	pand	272(%r10),%xmm1
2413	por	%xmm0,%xmm4
2414	pand	288(%r10),%xmm2
2415	por	%xmm1,%xmm5
2416	pand	304(%r10),%xmm3
2417	por	%xmm2,%xmm4
2418	por	%xmm3,%xmm5
2419	movdqa	-64(%rdi),%xmm0
2420	movdqa	-48(%rdi),%xmm1
2421	movdqa	-32(%rdi),%xmm2
2422	pand	320(%r10),%xmm0
2423	movdqa	-16(%rdi),%xmm3
2424	pand	336(%r10),%xmm1
2425	por	%xmm0,%xmm4
2426	pand	352(%r10),%xmm2
2427	por	%xmm1,%xmm5
2428	pand	368(%r10),%xmm3
2429	por	%xmm2,%xmm4
2430	por	%xmm3,%xmm5
2431	movdqa	0(%rdi),%xmm0
2432	movdqa	16(%rdi),%xmm1
2433	movdqa	32(%rdi),%xmm2
2434	pand	384(%r10),%xmm0
2435	movdqa	48(%rdi),%xmm3
2436	pand	400(%r10),%xmm1
2437	por	%xmm0,%xmm4
2438	pand	416(%r10),%xmm2
2439	por	%xmm1,%xmm5
2440	pand	432(%r10),%xmm3
2441	por	%xmm2,%xmm4
2442	por	%xmm3,%xmm5
2443	movdqa	64(%rdi),%xmm0
2444	movdqa	80(%rdi),%xmm1
2445	movdqa	96(%rdi),%xmm2
2446	pand	448(%r10),%xmm0
2447	movdqa	112(%rdi),%xmm3
2448	pand	464(%r10),%xmm1
2449	por	%xmm0,%xmm4
2450	pand	480(%r10),%xmm2
2451	por	%xmm1,%xmm5
2452	pand	496(%r10),%xmm3
2453	por	%xmm2,%xmm4
2454	por	%xmm3,%xmm5
2455	por	%xmm5,%xmm4
2456	pshufd	$0x4e,%xmm4,%xmm0
2457	por	%xmm4,%xmm0
2458	leaq	256(%rdi),%rdi
2459.byte	102,72,15,126,194	/* movq %xmm0,%rdx : gathered word */
2460
2461	movq	%rbp,(%rbx)	/* store the previous pass's top carry */
2462	leaq	32(%rbx,%rax,1),%rbx	/* rewind the accumulator cursor (%rax = negated byte length) */
2463	mulxq	0(%rsi),%r8,%r11
2464	xorq	%rbp,%rbp	/* zero, and clear CF/OF */
2465	movq	%rdx,%r9
2466	mulxq	8(%rsi),%r14,%r12
2467	adoxq	-32(%rbx),%r8
2468	adcxq	%r14,%r11
2469	mulxq	16(%rsi),%r15,%r13
2470	adoxq	-24(%rbx),%r11
2471	adcxq	%r15,%r12
2472	mulxq	24(%rsi),%rdx,%r14
2473	adoxq	-16(%rbx),%r12
2474	adcxq	%rdx,%r13
2475	leaq	(%rcx,%rax,1),%rcx	/* rewind %rcx to the start of its vector */
2476	leaq	32(%rsi),%rsi
2477	adoxq	-8(%rbx),%r13
2478	adcxq	%rbp,%r14
2479	adoxq	%rbp,%r14
2480
2481	movq	%r8,%r15
2482	imulq	32+8(%rsp),%r8	/* reduction multiplier for this pass */
2483
2484	movq	%r8,%rdx
2485	xorq	%rbp,%rbp
2486	movq	%rdi,8+8(%rsp)
2487
2488	mulxq	0(%rcx),%rax,%r10
2489	adcxq	%rax,%r15
2490	adoxq	%r11,%r10
2491	mulxq	8(%rcx),%rax,%r11
2492	adcxq	%rax,%r10
2493	adoxq	%r12,%r11
2494	mulxq	16(%rcx),%rax,%r12
2495	adcxq	%rax,%r11
2496	adoxq	%r13,%r12
2497	mulxq	24(%rcx),%rax,%r15
2498	movq	%r9,%rdx
2499	movq	24+8(%rsp),%rdi	/* inner loop count */
2500	movq	%r10,-32(%rbx)
2501	adcxq	%rax,%r12
2502	movq	%r11,-24(%rbx)
2503	adoxq	%rbp,%r15
2504	movq	%r12,-16(%rbx)
2505	leaq	32(%rcx),%rcx
2506	jmp	.Lmulx4x_inner
2507
/* Inner loop: like .Lmulx4x_1st but also adds the words already stored at (%rbx). */
2508.align	32
2509.Lmulx4x_inner:
2510	mulxq	0(%rsi),%r10,%rax
2511	adcxq	%rbp,%r15
2512	adoxq	%r14,%r10
2513	mulxq	8(%rsi),%r11,%r14
2514	adcxq	0(%rbx),%r10
2515	adoxq	%rax,%r11
2516	mulxq	16(%rsi),%r12,%rax
2517	adcxq	8(%rbx),%r11
2518	adoxq	%r14,%r12
2519	mulxq	24(%rsi),%r13,%r14
2520	movq	%r8,%rdx
2521	adcxq	16(%rbx),%r12
2522	adoxq	%rax,%r13
2523	adcxq	24(%rbx),%r13
2524	adoxq	%rbp,%r14
2525	leaq	32(%rsi),%rsi
2526	leaq	32(%rbx),%rbx
2527	adcxq	%rbp,%r14
2528
2529	adoxq	%r15,%r10
2530	mulxq	0(%rcx),%rax,%r15
2531	adcxq	%rax,%r10
2532	adoxq	%r15,%r11
2533	mulxq	8(%rcx),%rax,%r15
2534	adcxq	%rax,%r11
2535	adoxq	%r15,%r12
2536	mulxq	16(%rcx),%rax,%r15
2537	movq	%r10,-40(%rbx)
2538	adcxq	%rax,%r12
2539	adoxq	%r15,%r13
2540	movq	%r11,-32(%rbx)
2541	mulxq	24(%rcx),%rax,%r15
2542	movq	%r9,%rdx
2543	leaq	32(%rcx),%rcx
2544	movq	%r12,-24(%rbx)
2545	adcxq	%rax,%r13
2546	adoxq	%rbp,%r15
2547	movq	%r13,-16(%rbx)
2548
2549	decq	%rdi
2550	jnz	.Lmulx4x_inner
2551
2552	movq	0+8(%rsp),%rax	/* negated byte length */
2553	adcq	%rbp,%r15
2554	subq	0(%rbx),%rdi	/* %rdi is 0 here: CF = (stored top carry != 0); %rdi is reloaded next */
2555	movq	8+8(%rsp),%rdi
2556	movq	16+8(%rsp),%r10	/* table end pointer */
2557	adcq	%r15,%r14
2558	leaq	(%rsi,%rax,1),%rsi
2559	adcq	%rbp,%rbp
2560	movq	%r14,-8(%rbx)
2561
2562	cmpq	%r10,%rdi
2563	jb	.Lmulx4x_outer	/* more table rows to consume */
2564
/*
 * Set up registers for the shared subtraction tail at .Lsqrx4x_sub_entry
 * (outside this routine): %rax = -(top carry | borrow of last word compare),
 * %rcx = negated group count, %rbp/%rdi rewound, %rdx = output pointer from
 * slot 56, %r12-%r15 preloaded from (%rbp) with the first word decremented.
 * NOTE(review): the exact contract of that entry is not visible here.
 */
2565	movq	-8(%rcx),%r10
2566	movq	%rbp,%r8
2567	movq	(%rcx,%rax,1),%r12
2568	leaq	(%rcx,%rax,1),%rbp
2569	movq	%rax,%rcx
2570	leaq	(%rbx,%rax,1),%rdi
2571	xorl	%eax,%eax
2572	xorq	%r15,%r15
2573	subq	%r14,%r10
2574	adcq	%r15,%r15
2575	orq	%r15,%r8
2576	sarq	$3+2,%rcx
2577	subq	%r8,%rax
2578	movq	56+8(%rsp),%rdx
2579	decq	%r12
2580	movq	8(%rbp),%r13
2581	xorq	%r8,%r8
2582	movq	16(%rbp),%r14
2583	movq	24(%rbp),%r15
2584	jmp	.Lsqrx4x_sub_entry
2585.cfi_endproc
2586.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5: five rounds of (__bn_sqrx8x_internal, __bn_postx4x_internal)
 * followed by one mulx4x_internal call, all inside one scratch frame.
 *
 * The prologue mirrors bn_mulx4x_mont_gather5: save callee-saved registers,
 * turn %r9d into a byte count (x8), load the value at (%r8), place an
 * aligned frame of 320 + 2*bytes below %rsp, and touch every page on the
 * way down.  Arguments that must survive the internal calls are parked in
 * SSE registers:
 *   %xmm1 = %rdi, %xmm2 = %rcx, %xmm3 = negated byte count, %xmm4 = %rdx
 * and %xmm0 is zeroed for the callee.
 * Frame slots: 32(%rsp) = value loaded from (%r8), 40(%rsp) = entry %rsp.
 * Returns 1 in %rax.
 * NOTE(review): presumably %rdi = rp, %rsi = ap, %rdx = table, %rcx = np,
 * (%r8) = n0 - confirm against the C prototype.
 */
2587.type	bn_powerx5,@function
2588.align	32
2589bn_powerx5:
2590.cfi_startproc
2591	movq	%rsp,%rax	/* entry stack pointer, kept for the frame slot and epilogue */
2592.cfi_def_cfa_register	%rax
2593.Lpowerx5_enter:
2594	pushq	%rbx
2595.cfi_offset	%rbx,-16
2596	pushq	%rbp
2597.cfi_offset	%rbp,-24
2598	pushq	%r12
2599.cfi_offset	%r12,-32
2600	pushq	%r13
2601.cfi_offset	%r13,-40
2602	pushq	%r14
2603.cfi_offset	%r14,-48
2604	pushq	%r15
2605.cfi_offset	%r15,-56
2606.Lpowerx5_prologue:
2607
2608	shll	$3,%r9d	/* words -> bytes */
2609	leaq	(%r9,%r9,2),%r10	/* 3 * bytes */
2610	negq	%r9
2611	movq	(%r8),%r8
2612
2613
2614
2615
2616
2617
2618
2619
/* Frame placement relative to %rdi modulo 4096, same scheme as bn_mulx4x_mont_gather5. */
2620	leaq	-320(%rsp,%r9,2),%r11
2621	movq	%rsp,%rbp
2622	subq	%rdi,%r11
2623	andq	$4095,%r11
2624	cmpq	%r11,%r10
2625	jb	.Lpwrx_sp_alt
2626	subq	%r11,%rbp
2627	leaq	-320(%rbp,%r9,2),%rbp
2628	jmp	.Lpwrx_sp_done
2629
2630.align	32
2631.Lpwrx_sp_alt:
2632	leaq	4096-320(,%r9,2),%r10
2633	leaq	-320(%rbp,%r9,2),%rbp
2634	subq	%r10,%r11
2635	movq	$0,%r10	/* mov keeps CF from the subq for cmovc */
2636	cmovcq	%r10,%r11
2637	subq	%r11,%rbp
2638.Lpwrx_sp_done:
2639	andq	$-64,%rbp
/* Lower %rsp to the frame, reading one word per 4096-byte page. */
2640	movq	%rsp,%r11
2641	subq	%rbp,%r11
2642	andq	$-4096,%r11
2643	leaq	(%r11,%rbp,1),%rsp
2644	movq	(%rsp),%r10
2645	cmpq	%rbp,%rsp
2646	ja	.Lpwrx_page_walk
2647	jmp	.Lpwrx_page_walk_done
2648
2649.Lpwrx_page_walk:
2650	leaq	-4096(%rsp),%rsp
2651	movq	(%rsp),%r10
2652	cmpq	%rbp,%rsp
2653	ja	.Lpwrx_page_walk
2654.Lpwrx_page_walk_done:
2655
2656	movq	%r9,%r10	/* %r10 = negated byte count */
2657	negq	%r9	/* %r9 = positive byte count for the squaring routine */
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670	pxor	%xmm0,%xmm0
2671.byte	102,72,15,110,207	/* movq %rdi,%xmm1 */
2672.byte	102,72,15,110,209	/* movq %rcx,%xmm2 */
2673.byte	102,73,15,110,218	/* movq %r10,%xmm3 */
2674.byte	102,72,15,110,226	/* movq %rdx,%xmm4 */
2675	movq	%r8,32(%rsp)
2676	movq	%rax,40(%rsp)
/* DW_CFA_def_cfa_expression: CFA = *(%rsp + 40) + 8 */
2677.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2678.Lpowerx5_body:
2679
2680	call	__bn_sqrx8x_internal
2681	call	__bn_postx4x_internal
2682	call	__bn_sqrx8x_internal
2683	call	__bn_postx4x_internal
2684	call	__bn_sqrx8x_internal
2685	call	__bn_postx4x_internal
2686	call	__bn_sqrx8x_internal
2687	call	__bn_postx4x_internal
2688	call	__bn_sqrx8x_internal
2689	call	__bn_postx4x_internal
2690
/* Re-create the register state mulx4x_internal expects. */
2691	movq	%r10,%r9
2692	movq	%rsi,%rdi
2693.byte	102,72,15,126,209	/* movq %xmm2,%rcx */
2694.byte	102,72,15,126,226	/* movq %xmm4,%rdx */
2695	movq	40(%rsp),%rax	/* entry %rsp: mulx4x_internal reads 8(%rax) */
2696
2697	call	mulx4x_internal
2698
2699	movq	40(%rsp),%rsi
2700.cfi_def_cfa	%rsi,8
2701	movq	$1,%rax	/* return value */
2702
2703	movq	-48(%rsi),%r15
2704.cfi_restore	%r15
2705	movq	-40(%rsi),%r14
2706.cfi_restore	%r14
2707	movq	-32(%rsi),%r13
2708.cfi_restore	%r13
2709	movq	-24(%rsi),%r12
2710.cfi_restore	%r12
2711	movq	-16(%rsi),%rbp
2712.cfi_restore	%rbp
2713	movq	-8(%rsi),%rbx
2714.cfi_restore	%rbx
2715	leaq	(%rsi),%rsp
2716.cfi_def_cfa_register	%rsp
2717.Lpowerx5_epilogue:
2718	.byte	0xf3,0xc3	/* rep ret */
2719.cfi_endproc
2720.size	bn_powerx5,.-bn_powerx5
2721
/*
 * bn_sqrx8x_internal / __bn_sqrx8x_internal: squaring part.
 *
 * Builds the double-width square of the vector at %rsi (%r9 bytes long) in
 * the frame area starting at 48+8(%rsp), in three phases:
 *   1. zero the scratch area;
 *   2. accumulate the off-diagonal products a[i]*a[j], eight words of the
 *      multiplier side at a time (.Lsqrx8x_outer_loop / .Lsqrx8x_loop);
 *   3. double those products and add the squares a[i]*a[i]
 *      (.Lsqrx4x_shift_n_add).
 * Execution then falls through into __bn_sqrx8x_reduction.
 *
 * Reached by `call`, so "N+8(%rsp)" is slot N of the caller's frame.
 * Expected on entry (as set up by bn_powerx5 in this file): %rsi = input,
 * %r9 = positive byte length, %xmm0 = 0, %xmm2 and %xmm3 = values parked by
 * the caller (read back into %rbp and %rcx below).
 * Frame slots written: 0 = byte length, 8 = end of input,
 * 16 = saved carry (0 / -1), 24 = resume pointer into the scratch area.
 */
2722.globl	bn_sqrx8x_internal
2723.hidden	bn_sqrx8x_internal
2724.type	bn_sqrx8x_internal,@function
2725.align	32
2726bn_sqrx8x_internal:
2727__bn_sqrx8x_internal:
2728.cfi_startproc
2729
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769	leaq	48+8(%rsp),%rdi	/* scratch area */
2770	leaq	(%rsi,%r9,1),%rbp	/* end of input */
2771	movq	%r9,0+8(%rsp)
2772	movq	%rbp,8+8(%rsp)
2773	jmp	.Lsqr8x_zero_start
2774
/*
 * Zeroing loop.  It is entered in the middle, so the first 64 bytes of the
 * scratch area are skipped; 2*%r9 - 64 bytes are cleared from offset 64 on.
 * %r9 is zero when the loop ends.
 */
2775.align	32
2776.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00	/* 12-byte nop */
2777.Lsqrx8x_zero:
2778.byte	0x3e
2779	movdqa	%xmm0,0(%rdi)
2780	movdqa	%xmm0,16(%rdi)
2781	movdqa	%xmm0,32(%rdi)
2782	movdqa	%xmm0,48(%rdi)
2783.Lsqr8x_zero_start:
2784	movdqa	%xmm0,64(%rdi)
2785	movdqa	%xmm0,80(%rdi)
2786	movdqa	%xmm0,96(%rdi)
2787	movdqa	%xmm0,112(%rdi)
2788	leaq	128(%rdi),%rdi
2789	subq	$64,%r9
2790	jnz	.Lsqrx8x_zero
2791
2792	movq	0(%rsi),%rdx	/* first multiplier word */
2793
2794	xorq	%r10,%r10
2795	xorq	%r11,%r11
2796	xorq	%r12,%r12
2797	xorq	%r13,%r13
2798	xorq	%r14,%r14
2799	xorq	%r15,%r15
2800	leaq	48+8(%rsp),%rdi
2801	xorq	%rbp,%rbp	/* zero register; also leaves CF = OF = 0 */
2802	jmp	.Lsqrx8x_outer_loop
2803
/*
 * Outer loop: products of the current 8-word block with itself, above the
 * diagonal only.  %rdx walks through 0(%rsi)..48(%rsi) as the multiplier.
 * Several MULX instructions are emitted as raw bytes; each is decoded in
 * its trailing comment.
 */
2804.align	32
2805.Lsqrx8x_outer_loop:
2806	mulxq	8(%rsi),%r8,%rax
2807	adcxq	%r9,%r8
2808	adoxq	%rax,%r10
2809	mulxq	16(%rsi),%r9,%rax
2810	adcxq	%r10,%r9
2811	adoxq	%rax,%r11
2812.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00	/* mulxq 24(%rsi),%r10,%rax */
2813	adcxq	%r11,%r10
2814	adoxq	%rax,%r12
2815.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00	/* mulxq 32(%rsi),%r11,%rax */
2816	adcxq	%r12,%r11
2817	adoxq	%rax,%r13
2818	mulxq	40(%rsi),%r12,%rax
2819	adcxq	%r13,%r12
2820	adoxq	%rax,%r14
2821	mulxq	48(%rsi),%r13,%rax
2822	adcxq	%r14,%r13
2823	adoxq	%r15,%rax
2824	mulxq	56(%rsi),%r14,%r15
2825	movq	8(%rsi),%rdx	/* next multiplier word */
2826	adcxq	%rax,%r14
2827	adoxq	%rbp,%r15
2828	adcq	64(%rdi),%r15
2829	movq	%r8,8(%rdi)
2830	movq	%r9,16(%rdi)
2831	sbbq	%rcx,%rcx	/* remember the carry as 0 / -1 */
2832	xorq	%rbp,%rbp
2833
2834
2835	mulxq	16(%rsi),%r8,%rbx
2836	mulxq	24(%rsi),%r9,%rax
2837	adcxq	%r10,%r8
2838	adoxq	%rbx,%r9
2839	mulxq	32(%rsi),%r10,%rbx
2840	adcxq	%r11,%r9
2841	adoxq	%rax,%r10
2842.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00	/* mulxq 40(%rsi),%r11,%rax */
2843	adcxq	%r12,%r10
2844	adoxq	%rbx,%r11
2845.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%r12,%rbx */
2846	adcxq	%r13,%r11
2847	adoxq	%r14,%r12
2848.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00	/* mulxq 56(%rsi),%r13,%r14 */
2849	movq	16(%rsi),%rdx
2850	adcxq	%rax,%r12
2851	adoxq	%rbx,%r13
2852	adcxq	%r15,%r13
2853	adoxq	%rbp,%r14
2854	adcxq	%rbp,%r14
2855
2856	movq	%r8,24(%rdi)
2857	movq	%r9,32(%rdi)
2858
2859	mulxq	24(%rsi),%r8,%rbx
2860	mulxq	32(%rsi),%r9,%rax
2861	adcxq	%r10,%r8
2862	adoxq	%rbx,%r9
2863	mulxq	40(%rsi),%r10,%rbx
2864	adcxq	%r11,%r9
2865	adoxq	%rax,%r10
2866.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00	/* mulxq 48(%rsi),%r11,%rax */
2867	adcxq	%r12,%r10
2868	adoxq	%r13,%r11
2869.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00	/* mulxq 56(%rsi),%r12,%r13 */
2870.byte	0x3e
2871	movq	24(%rsi),%rdx
2872	adcxq	%rbx,%r11
2873	adoxq	%rax,%r12
2874	adcxq	%r14,%r12
2875	movq	%r8,40(%rdi)
2876	movq	%r9,48(%rdi)
2877	mulxq	32(%rsi),%r8,%rax
2878	adoxq	%rbp,%r13
2879	adcxq	%rbp,%r13
2880
2881	mulxq	40(%rsi),%r9,%rbx
2882	adcxq	%r10,%r8
2883	adoxq	%rax,%r9
2884	mulxq	48(%rsi),%r10,%rax
2885	adcxq	%r11,%r9
2886	adoxq	%r12,%r10
2887	mulxq	56(%rsi),%r11,%r12
2888	movq	32(%rsi),%rdx
2889	movq	40(%rsi),%r14
2890	adcxq	%rbx,%r10
2891	adoxq	%rax,%r11
2892	movq	48(%rsi),%r15
2893	adcxq	%r13,%r11
2894	adoxq	%rbp,%r12
2895	adcxq	%rbp,%r12
2896
2897	movq	%r8,56(%rdi)
2898	movq	%r9,64(%rdi)
2899
/* Last three input words of the block are held in %r14, %r15, %r8 and multiplied register-to-register. */
2900	mulxq	%r14,%r9,%rax
2901	movq	56(%rsi),%r8
2902	adcxq	%r10,%r9
2903	mulxq	%r15,%r10,%rbx
2904	adoxq	%rax,%r10
2905	adcxq	%r11,%r10
2906	mulxq	%r8,%r11,%rax
2907	movq	%r14,%rdx
2908	adoxq	%rbx,%r11
2909	adcxq	%r12,%r11
2910
2911	adcxq	%rbp,%rax
2912
2913	mulxq	%r15,%r14,%rbx
2914	mulxq	%r8,%r12,%r13
2915	movq	%r15,%rdx
2916	leaq	64(%rsi),%rsi	/* advance to the next 8-word block */
2917	adcxq	%r14,%r11
2918	adoxq	%rbx,%r12
2919	adcxq	%rax,%r12
2920	adoxq	%rbp,%r13
2921
2922.byte	0x67,0x67
2923	mulxq	%r8,%r8,%r14
2924	adcxq	%r8,%r13
2925	adcxq	%rbp,%r14
2926
2927	cmpq	8+8(%rsp),%rsi
2928	je	.Lsqrx8x_outer_break	/* the block just done was the last one */
2929
2930	negq	%rcx	/* CF = carry remembered by the sbbq above */
2931	movq	$-8,%rcx	/* word counter for .Lsqrx8x_loop (mov leaves CF alone) */
2932	movq	%rbp,%r15
2933	movq	64(%rdi),%r8
2934	adcxq	72(%rdi),%r9
2935	adcxq	80(%rdi),%r10
2936	adcxq	88(%rdi),%r11
2937	adcq	96(%rdi),%r12
2938	adcq	104(%rdi),%r13
2939	adcq	112(%rdi),%r14
2940	adcq	120(%rdi),%r15
2941	leaq	(%rsi),%rbp	/* %rbp walks the remaining input blocks */
2942	leaq	128(%rdi),%rdi
2943	sbbq	%rax,%rax
2944
2945	movq	-64(%rsi),%rdx	/* first word of the block just left */
2946	movq	%rax,16+8(%rsp)
2947	movq	%rdi,24+8(%rsp)
2948
2949
2950	xorl	%eax,%eax	/* clear CF and OF */
2951	jmp	.Lsqrx8x_loop
2952
/*
 * Inner loop: multiply one word of the previous block (%rdx) by the eight
 * words at (%rbp), add to the running window %r8-%r15 and retire the lowest
 * word to (%rdi,%rcx,8).  Runs eight times per block (%rcx from -8 to 0).
 */
2953.align	32
2954.Lsqrx8x_loop:
2955	movq	%r8,%rbx
2956	mulxq	0(%rbp),%rax,%r8
2957	adcxq	%rax,%rbx
2958	adoxq	%r9,%r8
2959
2960	mulxq	8(%rbp),%rax,%r9
2961	adcxq	%rax,%r8
2962	adoxq	%r10,%r9
2963
2964	mulxq	16(%rbp),%rax,%r10
2965	adcxq	%rax,%r9
2966	adoxq	%r11,%r10
2967
2968	mulxq	24(%rbp),%rax,%r11
2969	adcxq	%rax,%r10
2970	adoxq	%r12,%r11
2971
2972.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	/* mulxq 32(%rbp),%rax,%r12 */
2973	adcxq	%rax,%r11
2974	adoxq	%r13,%r12
2975
2976	mulxq	40(%rbp),%rax,%r13
2977	adcxq	%rax,%r12
2978	adoxq	%r14,%r13
2979
2980	mulxq	48(%rbp),%rax,%r14
2981	movq	%rbx,(%rdi,%rcx,8)
2982	movl	$0,%ebx	/* zero without touching the live CF/OF chains */
2983	adcxq	%rax,%r13
2984	adoxq	%r15,%r14
2985
2986.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00	/* mulxq 56(%rbp),%rax,%r15 */
2987	movq	8(%rsi,%rcx,8),%rdx	/* next multiplier word */
2988	adcxq	%rax,%r14
2989	adoxq	%rbx,%r15
2990	adcxq	%rbx,%r15
2991
2992.byte	0x67
2993	incq	%rcx
2994	jnz	.Lsqrx8x_loop
2995
2996	leaq	64(%rbp),%rbp
2997	movq	$-8,%rcx
2998	cmpq	8+8(%rsp),%rbp
2999	je	.Lsqrx8x_break
3000
3001	subq	16+8(%rsp),%rbx	/* %rbx is 0: CF = saved carry */
3002.byte	0x66
3003	movq	-64(%rsi),%rdx
3004	adcxq	0(%rdi),%r8
3005	adcxq	8(%rdi),%r9
3006	adcq	16(%rdi),%r10
3007	adcq	24(%rdi),%r11
3008	adcq	32(%rdi),%r12
3009	adcq	40(%rdi),%r13
3010	adcq	48(%rdi),%r14
3011	adcq	56(%rdi),%r15
3012	leaq	64(%rdi),%rdi
3013.byte	0x67
3014	sbbq	%rax,%rax
3015	xorl	%ebx,%ebx
3016	movq	%rax,16+8(%rsp)
3017	jmp	.Lsqrx8x_loop
3018
/* All remaining blocks processed for this multiplier block: fold in the saved carry and return to the outer loop. */
3019.align	32
3020.Lsqrx8x_break:
3021	xorq	%rbp,%rbp
3022	subq	16+8(%rsp),%rbx
3023	adcxq	%rbp,%r8
3024	movq	24+8(%rsp),%rcx	/* resume pointer stored before the inner loop */
3025	adcxq	%rbp,%r9
3026	movq	0(%rsi),%rdx
3027	adcq	$0,%r10
3028	movq	%r8,0(%rdi)
3029	adcq	$0,%r11
3030	adcq	$0,%r12
3031	adcq	$0,%r13
3032	adcq	$0,%r14
3033	adcq	$0,%r15
3034	cmpq	%rcx,%rdi
3035	je	.Lsqrx8x_outer_loop
3036
/* Otherwise spill the window here and reload it from the resume position. */
3037	movq	%r9,8(%rdi)
3038	movq	8(%rcx),%r9
3039	movq	%r10,16(%rdi)
3040	movq	16(%rcx),%r10
3041	movq	%r11,24(%rdi)
3042	movq	24(%rcx),%r11
3043	movq	%r12,32(%rdi)
3044	movq	32(%rcx),%r12
3045	movq	%r13,40(%rdi)
3046	movq	40(%rcx),%r13
3047	movq	%r14,48(%rdi)
3048	movq	48(%rcx),%r14
3049	movq	%r15,56(%rdi)
3050	movq	56(%rcx),%r15
3051	movq	%rcx,%rdi
3052	jmp	.Lsqrx8x_outer_loop
3053
3054.align	32
3055.Lsqrx8x_outer_break:
3056	movq	%r9,72(%rdi)
3057.byte	102,72,15,126,217	/* movq %xmm3,%rcx : negated byte count parked by the caller */
3058	movq	%r10,80(%rdi)
3059	movq	%r11,88(%rdi)
3060	movq	%r12,96(%rdi)
3061	movq	%r13,104(%rdi)
3062	movq	%r14,112(%rdi)
3063	leaq	48+8(%rsp),%rdi
3064	movq	(%rsi,%rcx,1),%rdx	/* %rsi is at the end of the input, so this is its first word */
3065
3066	movq	8(%rdi),%r11
3067	xorq	%r10,%r10
3068	movq	0+8(%rsp),%r9	/* byte length */
3069	adoxq	%r11,%r11
3070	movq	16(%rdi),%r12
3071	movq	24(%rdi),%r13
3072
3073
/*
 * Shift-and-add: each "adoxq r,r" doubles a stored word while carrying the
 * shifted-out bit along the OF chain; each "mulxq %rdx,..." squares one
 * input word, which is added along the CF chain.  Four input words (eight
 * output words) per iteration; %rcx counts up by 32 bytes and jrcxz leaves
 * the loop when it reaches zero.
 */
3074.align	32
3075.Lsqrx4x_shift_n_add:
3076	mulxq	%rdx,%rax,%rbx
3077	adoxq	%r12,%r12
3078	adcxq	%r10,%rax
3079.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00	/* movq 8(%rsi,%rcx,1),%rdx (long encoding) */
3080.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00	/* movq 32(%rdi),%r10 (long encoding) */
3081	adoxq	%r13,%r13
3082	adcxq	%r11,%rbx
3083	movq	40(%rdi),%r11
3084	movq	%rax,0(%rdi)
3085	movq	%rbx,8(%rdi)
3086
3087	mulxq	%rdx,%rax,%rbx
3088	adoxq	%r10,%r10
3089	adcxq	%r12,%rax
3090	movq	16(%rsi,%rcx,1),%rdx
3091	movq	48(%rdi),%r12
3092	adoxq	%r11,%r11
3093	adcxq	%r13,%rbx
3094	movq	56(%rdi),%r13
3095	movq	%rax,16(%rdi)
3096	movq	%rbx,24(%rdi)
3097
3098	mulxq	%rdx,%rax,%rbx
3099	adoxq	%r12,%r12
3100	adcxq	%r10,%rax
3101	movq	24(%rsi,%rcx,1),%rdx
3102	leaq	32(%rcx),%rcx
3103	movq	64(%rdi),%r10
3104	adoxq	%r13,%r13
3105	adcxq	%r11,%rbx
3106	movq	72(%rdi),%r11
3107	movq	%rax,32(%rdi)
3108	movq	%rbx,40(%rdi)
3109
3110	mulxq	%rdx,%rax,%rbx
3111	adoxq	%r10,%r10
3112	adcxq	%r12,%rax
3113	jrcxz	.Lsqrx4x_shift_n_add_break
3114.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00	/* movq 0(%rsi,%rcx,1),%rdx (long encoding) */
3115	adoxq	%r11,%r11
3116	adcxq	%r13,%rbx
3117	movq	80(%rdi),%r12
3118	movq	88(%rdi),%r13
3119	movq	%rax,48(%rdi)
3120	movq	%rbx,56(%rdi)
3121	leaq	64(%rdi),%rdi
3122	nop
3123	jmp	.Lsqrx4x_shift_n_add
3124
3125.align	32
3126.Lsqrx4x_shift_n_add_break:
3127	adcxq	%r13,%rbx
3128	movq	%rax,48(%rdi)
3129	movq	%rbx,56(%rdi)
3130	leaq	64(%rdi),%rdi	/* %rdi = end of the double-width result */
3131.byte	102,72,15,126,213	/* movq %xmm2,%rbp : pointer parked by the caller */
/*
 * __bn_sqrx8x_reduction: word-by-word reduction of the double-width value
 * held in the frame from 48+8(%rsp) up to %rdi, eight words per round.
 * It is a second entry label inside the bn_sqrx8x_internal procedure.
 *
 * Inputs read here: %rbp = vector multiplied by the per-word multipliers
 * (NOTE(review): presumably the modulus - bn_powerx5 parks its %rcx in
 * %xmm2, which the code above moves into %rbp), %r9 = byte length,
 * %rdi = end of the value, 32+8(%rsp) = multiplier constant (presumably n0).
 * Frame slots written: 0 = %rbp + %r9 - 64, 8 = end of value,
 * 16 = saved carry (0 / -1), 24 = top carry of the previous round, and the
 * eight multipliers of the current round at 48+8(%rsp)..48+8+56(%rsp).
 * On exit %rax holds the final top carry and %rcx/%rbp have been reloaded
 * from %xmm3/%xmm2.
 */
3132__bn_sqrx8x_reduction:
3133	xorl	%eax,%eax	/* no top carry yet */
3134	movq	32+8(%rsp),%rbx
3135	movq	48+8(%rsp),%rdx	/* lowest word of the value */
3136	leaq	-64(%rbp,%r9,1),%rcx
3137
3138	movq	%rcx,0+8(%rsp)
3139	movq	%rdi,8+8(%rsp)
3140
3141	leaq	48+8(%rsp),%rdi
3142	jmp	.Lsqrx8x_reduction_loop
3143
3144.align	32
3145.Lsqrx8x_reduction_loop:
3146	movq	8(%rdi),%r9
3147	movq	16(%rdi),%r10
3148	movq	24(%rdi),%r11
3149	movq	32(%rdi),%r12
3150	movq	%rdx,%r8
3151	imulq	%rbx,%rdx	/* first multiplier of the round = low word * constant */
3152	movq	40(%rdi),%r13
3153	movq	48(%rdi),%r14
3154	movq	56(%rdi),%r15
3155	movq	%rax,24+8(%rsp)	/* park the previous round's top carry */
3156
3157	leaq	64(%rdi),%rdi
3158	xorq	%rsi,%rsi	/* zero register; clears CF and OF */
3159	movq	$-8,%rcx
3160	jmp	.Lsqrx8x_reduce
3161
/*
 * Eight iterations: add multiplier * (%rbp)[0..7] to the window %r8-%r15,
 * which cancels and drops the lowest word, and derive the next multiplier
 * from the new lowest word.  Each multiplier is saved in the frame for the
 * tail loop.
 */
3162.align	32
3163.Lsqrx8x_reduce:
3164	movq	%r8,%rbx
3165	mulxq	0(%rbp),%rax,%r8
3166	adcxq	%rbx,%rax
3167	adoxq	%r9,%r8
3168
3169	mulxq	8(%rbp),%rbx,%r9
3170	adcxq	%rbx,%r8
3171	adoxq	%r10,%r9
3172
3173	mulxq	16(%rbp),%rbx,%r10
3174	adcxq	%rbx,%r9
3175	adoxq	%r11,%r10
3176
3177	mulxq	24(%rbp),%rbx,%r11
3178	adcxq	%rbx,%r10
3179	adoxq	%r12,%r11
3180
3181.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00	/* mulxq 32(%rbp),%rbx,%r12 */
3182	movq	%rdx,%rax	/* keep the current multiplier */
3183	movq	%r8,%rdx
3184	adcxq	%rbx,%r11
3185	adoxq	%r13,%r12
3186
3187	mulxq	32+8(%rsp),%rbx,%rdx	/* %rbx = next multiplier (low half); high half is discarded */
3188	movq	%rax,%rdx
3189	movq	%rax,64+48+8(%rsp,%rcx,8)	/* save the current multiplier */
3190
3191	mulxq	40(%rbp),%rax,%r13
3192	adcxq	%rax,%r12
3193	adoxq	%r14,%r13
3194
3195	mulxq	48(%rbp),%rax,%r14
3196	adcxq	%rax,%r13
3197	adoxq	%r15,%r14
3198
3199	mulxq	56(%rbp),%rax,%r15
3200	movq	%rbx,%rdx
3201	adcxq	%rax,%r14
3202	adoxq	%rsi,%r15
3203	adcxq	%rsi,%r15
3204
3205.byte	0x67,0x67,0x67
3206	incq	%rcx
3207	jnz	.Lsqrx8x_reduce
3208
3209	movq	%rsi,%rax	/* %rax = 0 */
3210	cmpq	0+8(%rsp),%rbp
3211	jae	.Lsqrx8x_no_tail	/* vector is only eight words long */
3212
3213	movq	48+8(%rsp),%rdx	/* first saved multiplier */
3214	addq	0(%rdi),%r8
3215	leaq	64(%rbp),%rbp
3216	movq	$-8,%rcx
3217	adcxq	8(%rdi),%r9
3218	adcxq	16(%rdi),%r10
3219	adcq	24(%rdi),%r11
3220	adcq	32(%rdi),%r12
3221	adcq	40(%rdi),%r13
3222	adcq	48(%rdi),%r14
3223	adcq	56(%rdi),%r15
3224	leaq	64(%rdi),%rdi
3225	sbbq	%rax,%rax
3226
3227	xorq	%rsi,%rsi
3228	movq	%rax,16+8(%rsp)
3229	jmp	.Lsqrx8x_tail
3230
/* Tail: apply the eight saved multipliers to each further 8-word block of (%rbp). */
3231.align	32
3232.Lsqrx8x_tail:
3233	movq	%r8,%rbx
3234	mulxq	0(%rbp),%rax,%r8
3235	adcxq	%rax,%rbx
3236	adoxq	%r9,%r8
3237
3238	mulxq	8(%rbp),%rax,%r9
3239	adcxq	%rax,%r8
3240	adoxq	%r10,%r9
3241
3242	mulxq	16(%rbp),%rax,%r10
3243	adcxq	%rax,%r9
3244	adoxq	%r11,%r10
3245
3246	mulxq	24(%rbp),%rax,%r11
3247	adcxq	%rax,%r10
3248	adoxq	%r12,%r11
3249
3250.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00	/* mulxq 32(%rbp),%rax,%r12 */
3251	adcxq	%rax,%r11
3252	adoxq	%r13,%r12
3253
3254	mulxq	40(%rbp),%rax,%r13
3255	adcxq	%rax,%r12
3256	adoxq	%r14,%r13
3257
3258	mulxq	48(%rbp),%rax,%r14
3259	adcxq	%rax,%r13
3260	adoxq	%r15,%r14
3261
3262	mulxq	56(%rbp),%rax,%r15
3263	movq	72+48+8(%rsp,%rcx,8),%rdx	/* next saved multiplier */
3264	adcxq	%rax,%r14
3265	adoxq	%rsi,%r15
3266	movq	%rbx,(%rdi,%rcx,8)
3267	movq	%r8,%rbx
3268	adcxq	%rsi,%r15
3269
3270	incq	%rcx
3271	jnz	.Lsqrx8x_tail
3272
3273	cmpq	0+8(%rsp),%rbp
3274	jae	.Lsqrx8x_tail_done
3275
3276	subq	16+8(%rsp),%rsi	/* %rsi is 0: CF = saved carry */
3277	movq	48+8(%rsp),%rdx
3278	leaq	64(%rbp),%rbp
3279	adcq	0(%rdi),%r8
3280	adcq	8(%rdi),%r9
3281	adcq	16(%rdi),%r10
3282	adcq	24(%rdi),%r11
3283	adcq	32(%rdi),%r12
3284	adcq	40(%rdi),%r13
3285	adcq	48(%rdi),%r14
3286	adcq	56(%rdi),%r15
3287	leaq	64(%rdi),%rdi
3288	sbbq	%rax,%rax
3289	subq	$8,%rcx	/* %rcx is 0 here, so this restarts the counter at -8 */
3290
3291	xorq	%rsi,%rsi
3292	movq	%rax,16+8(%rsp)
3293	jmp	.Lsqrx8x_tail
3294
3295.align	32
3296.Lsqrx8x_tail_done:
3297	xorq	%rax,%rax
3298	addq	24+8(%rsp),%r8	/* top carry parked at the start of the round */
3299	adcq	$0,%r9
3300	adcq	$0,%r10
3301	adcq	$0,%r11
3302	adcq	$0,%r12
3303	adcq	$0,%r13
3304	adcq	$0,%r14
3305	adcq	$0,%r15
3306	adcq	$0,%rax
3307
3308	subq	16+8(%rsp),%rsi	/* CF = saved carry */
3309.Lsqrx8x_no_tail:
3310	adcq	0(%rdi),%r8
3311.byte	102,72,15,126,217	/* movq %xmm3,%rcx */
3312	adcq	8(%rdi),%r9
3313	movq	56(%rbp),%rsi
3314.byte	102,72,15,126,213	/* movq %xmm2,%rbp : rewind to the start of the vector */
3315	adcq	16(%rdi),%r10
3316	adcq	24(%rdi),%r11
3317	adcq	32(%rdi),%r12
3318	adcq	40(%rdi),%r13
3319	adcq	48(%rdi),%r14
3320	adcq	56(%rdi),%r15
3321	adcq	$0,%rax	/* top carry for the next round / the caller */
3322
3323	movq	32+8(%rsp),%rbx
3324	movq	64(%rdi,%rcx,1),%rdx	/* lowest word of the next round's window */
3325
3326	movq	%r8,0(%rdi)
3327	leaq	64(%rdi),%r8
3328	movq	%r9,8(%rdi)
3329	movq	%r10,16(%rdi)
3330	movq	%r11,24(%rdi)
3331	movq	%r12,32(%rdi)
3332	movq	%r13,40(%rdi)
3333	movq	%r14,48(%rdi)
3334	movq	%r15,56(%rdi)
3335
3336	leaq	64(%rdi,%rcx,1),%rdi
3337	cmpq	8+8(%rsp),%r8
3338	jb	.Lsqrx8x_reduction_loop
3339	.byte	0xf3,0xc3	/* rep ret */
3340.cfi_endproc
3341.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
/*
 * __bn_postx4x_internal -- masked multi-word add of a negated vector,
 * four qwords (32 bytes) per loop iteration.
 *
 * Per qword i the loop stores  [rdi][i] + (mask & ~[rbp][i]) + carry,
 * where mask = -rax.  The very first qword read from rbp is decremented
 * before it is complemented (~(x-1) == -x), so for a non-zero low qword
 * the masked operand is the two's-complement negation of the whole
 * vector at rbp.  Net effect: out = [rdi] - [rbp] when mask is all ones,
 * out = [rdi] when mask is zero.  There is no branch on the mask, so the
 * instruction and memory-access sequence is the same in both cases.
 *
 * Registers as used by the code below.  This is a private convention,
 * not the C ABI: rbp and r12-r15 are overwritten without being saved in
 * this routine.
 *   rbp   in   pointer to the vector that is complemented and masked
 *   rdi   in   pointer to the vector that is added
 *   xmm1  in   low qword is copied to rdx (store pointer) and to rsi
 *   rax   in   negated to form the mask -- presumably 0 or 1 on entry;
 *              TODO confirm against the callers
 *   rcx   in   shifted right arithmetically by 5, then counted up to
 *              zero, so it has to be negative -- presumably minus the
 *              vector length in bytes; TODO confirm against the callers
 *   r9    out  entry value of rcx, negated just before returning
 *   r10        receives a copy of the entry rcx; not read again here
 *   r8         addition carry kept between iterations (0 or -1)
 *   rbp, rdi and rdx each advance by 32 bytes per iteration.
 * andn is a BMI1 instruction.
 */
3342.align	32
3343__bn_postx4x_internal:
3344.cfi_startproc
3345	movq	0(%rbp),%r12
3346	movq	%rcx,%r10
3347	movq	%rcx,%r9
/* rax = -rax: turns the incoming value into the AND mask used below */
3348	negq	%rax
/* rcx >>= 5 (arithmetic): one loop pass per 4 qwords, sign preserved */
3349	sarq	$3+2,%rcx
3350
/* Hand-encoded: movq %xmm1,%rdx  and  movq %xmm1,%rsi */
3351.byte	102,72,15,126,202
3352.byte	102,72,15,126,206
/* Low qword only: x-1 now, ~(x-1) == -x after the andn at the entry */
3353	decq	%r12
3354	movq	8(%rbp),%r13
/* r8 = 0: no carry going into the first group */
3355	xorq	%r8,%r8
3356	movq	16(%rbp),%r14
3357	movq	24(%rbp),%r15
3358	jmp	.Lsqrx4x_sub_entry
3359
3360.align	16
3361.Lsqrx4x_sub:
/* Later groups: plain loads, the qwords are only complemented */
3362	movq	0(%rbp),%r12
3363	movq	8(%rbp),%r13
3364	movq	16(%rbp),%r14
3365	movq	24(%rbp),%r15
3366.Lsqrx4x_sub_entry:
/* rN = ~rN & mask (AT&T andn: dest = ~second operand & first operand) */
3367	andnq	%rax,%r12,%r12
3368	leaq	32(%rbp),%rbp
3369	andnq	%rax,%r13,%r13
3370	andnq	%rax,%r14,%r14
3371	andnq	%rax,%r15,%r15
3372
/* Restore the saved carry: neg sets CF exactly when r8 was non-zero */
3373	negq	%r8
3374	adcq	0(%rdi),%r12
3375	adcq	8(%rdi),%r13
3376	adcq	16(%rdi),%r14
3377	adcq	24(%rdi),%r15
/* mov and lea leave the flags alone, so CF survives until the sbb */
3378	movq	%r12,0(%rdx)
3379	leaq	32(%rdi),%rdi
3380	movq	%r13,8(%rdx)
/* Save the carry out of this group as 0 / -1 for the next pass */
3381	sbbq	%r8,%r8
3382	movq	%r14,16(%rdx)
3383	movq	%r15,24(%rdx)
3384	leaq	32(%rdx),%rdx
3385
3386	incq	%rcx
3387	jnz	.Lsqrx4x_sub
3388
/* r9 held the entry rcx; hand it back with the sign flipped */
3389	negq	%r9
3390
/* 0xf3,0xc3 is the two-byte "rep ret" encoding */
3391	.byte	0xf3,0xc3
3392.cfi_endproc
3393.size	__bn_postx4x_internal,.-__bn_postx4x_internal
/*
 * bn_get_bits5 -- return the 5-bit field that starts at a given bit
 * offset inside a little-endian byte array.
 *
 * Registers as used by the code:
 *   rdi  in   base address of the array
 *   esi  in   bit offset of the field's least significant bit
 *   eax  out  the field, in the range 0..31
 * Scratch: ecx, esi, r10, r11, flags.
 *
 * The field is taken from a single 16-bit load.  esi >> 4 selects the
 * 16-bit word and esi & 15 is the bit position inside it.  Positions
 * 0..11 leave all five bits inside that word.  For positions 12..15
 * the load address is moved up by one byte and the shift count reduced
 * by 8, which keeps the field inside the 16 bits that are read.  Both
 * adjustments use cmov, so there is no branch on the offset.
 */
3394.globl	bn_get_bits5
3395.type	bn_get_bits5,@function
3396.align	16
3397bn_get_bits5:
3398.cfi_startproc
/* r10 = base, r11 = base + 1: the two candidate load addresses */
3399	leaq	0(%rdi),%r10
3400	leaq	1(%rdi),%r11
3401	movl	%esi,%ecx
/* esi = 16-bit word index; the 32-bit write zero-extends into rsi */
3402	shrl	$4,%esi
/* ecx = bit position inside the word, eax = that position minus 8 */
3403	andl	$15,%ecx
3404	leal	-8(%rcx),%eax
/* Position above 11 (unsigned): use base + 1 and position - 8 */
3405	cmpl	$11,%ecx
3406	cmovaq	%r11,%r10
3407	cmoval	%eax,%ecx
3408	movzwl	(%r10,%rsi,2),%eax
3409	shrl	%cl,%eax
3410	andl	$31,%eax
/* 0xf3,0xc3 is the two-byte "rep ret" encoding */
3411	.byte	0xf3,0xc3
3412.cfi_endproc
3413.size	bn_get_bits5,.-bn_get_bits5
3414
/*
 * bn_scatter5 -- copy consecutive qwords from a source vector into one
 * column of a table whose rows are 256 bytes apart.
 *
 * Registers as used by the code:
 *   rdi  in   source; read one qword at a time, advanced by 8
 *   esi  in   number of qwords to copy; zero returns immediately
 *   rdx  in   table base
 *   rcx  in   column index; the first store goes to rdx + rcx*8 (all
 *             64 bits of rcx take part in the address calculation)
 * Scratch: rax, rdx, rdi, esi, flags.
 *
 * Qword k of the source is stored at  rdx + rcx*8 + k*256.
 */
3415.globl	bn_scatter5
3416.type	bn_scatter5,@function
3417.align	16
3418bn_scatter5:
3419.cfi_startproc
/* Nothing to copy when the count is zero */
3420	cmpl	$0,%esi
3421	jz	.Lscatter_epilogue
/* rdx = address of the selected column in the first row */
3422	leaq	(%rdx,%rcx,8),%rdx
3423.Lscatter:
3424	movq	(%rdi),%rax
3425	leaq	8(%rdi),%rdi
3426	movq	%rax,(%rdx)
/* Step to the same column in the next 256-byte row */
3427	leaq	256(%rdx),%rdx
3428	subl	$1,%esi
3429	jnz	.Lscatter
3430.Lscatter_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding */
3431	.byte	0xf3,0xc3
3432.cfi_endproc
3433.size	bn_scatter5,.-bn_scatter5
3434
/*
 * bn_gather5 -- read one 64-bit column out of a table whose rows are
 * 256 bytes (32 qwords) wide.  Every byte of every row is loaded and
 * combined with a mask, so the addresses touched do not depend on the
 * column index.
 *
 * Registers as used by the code:
 *   rdi  in   destination; one qword is stored per row, advanced by 8
 *   esi  in   number of rows; the count is tested only after the first
 *             pass through the loop, so it must be non-zero
 *   rdx  in   table base; read with movdqa, so it must be 16-byte
 *             aligned
 *   ecx  in   column index, compared with the lane numbers 0..31; any
 *             other value matches no lane and the stored qwords are zero
 * Scratch: rax, r10, r11, xmm0-xmm5, flags, 0x108 bytes of stack.
 *
 * Stage 1 builds sixteen 16-byte masks on the stack.  Mask m is all
 * ones in its low qword when the index equals 2m and in its high qword
 * when the index equals 2m+1.  The lane numbers come from the two
 * vectors at .Linc: {0,0,1,1} is the first counter and {2,2,2,2} the
 * step added for each following mask.
 * Stage 2 (.Lgather) ANDs each 16-byte chunk of a row with its mask,
 * ORs the sixteen results together and folds the two qword halves.
 *
 * Unwind info: the entry rsp is kept in r10 while rsp is lowered and
 * aligned, so the CFA is described relative to r10 for that span and
 * relative to rsp again once rsp has been restored.  Without these two
 * .cfi_def_cfa_register directives the unwinder would compute the CFA
 * from the moved rsp and read a wrong return address inside the body.
 * NOTE: this file is generated from x86_64-mont5.pl (see the first line
 * of the file); the same directives have to be carried in the generator.
 */
3435.globl	bn_gather5
3436.type	bn_gather5,@function
3437.align	32
3438bn_gather5:
3439.LSEH_begin_bn_gather5:
3440.cfi_startproc
3441
/* Hand-encoded: leaq (%rsp),%r10 -- r10 keeps the entry stack pointer */
3442.byte	0x4c,0x8d,0x14,0x24
/* r10 is not written again before the epilogue: CFA = r10 + 8 */
.cfi_def_cfa_register	%r10
/* Hand-encoded: subq $0x108,%rsp -- room for the 16 masks (256 bytes) */
3443.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00
3444	leaq	.Linc(%rip),%rax
/* The masks are stored with movdqa, which needs 16-byte alignment */
3445	andq	$-16,%rsp
3446
3447	movd	%ecx,%xmm5
/* xmm0 = first lane counter {0,0,1,1}, xmm1 = step {2,2,2,2} */
3448	movdqa	0(%rax),%xmm0
3449	movdqa	16(%rax),%xmm1
/* Bias both pointers by 128 so every offset below fits in one byte */
3450	leaq	128(%rdx),%r11
3451	leaq	128(%rsp),%rax
3452
/* xmm5 = index in all four dwords; xmm4 keeps a copy of the step */
3453	pshufd	$0,%xmm5,%xmm5
3454	movdqa	%xmm1,%xmm4
3455	movdqa	%xmm1,%xmm2
/*
 * Software-pipelined mask generation over xmm0..xmm3.  For each
 * register in turn: paddd produces the next lane counter from the
 * previous one, pcmpeqd turns the previous counter into its mask,
 * movdqa stores a finished mask, and the register is reloaded with the
 * step from xmm4.
 */
3456	paddd	%xmm0,%xmm1
3457	pcmpeqd	%xmm5,%xmm0
3458	movdqa	%xmm4,%xmm3
3459
3460	paddd	%xmm1,%xmm2
3461	pcmpeqd	%xmm5,%xmm1
3462	movdqa	%xmm0,-128(%rax)
3463	movdqa	%xmm4,%xmm0
3464
3465	paddd	%xmm2,%xmm3
3466	pcmpeqd	%xmm5,%xmm2
3467	movdqa	%xmm1,-112(%rax)
3468	movdqa	%xmm4,%xmm1
3469
3470	paddd	%xmm3,%xmm0
3471	pcmpeqd	%xmm5,%xmm3
3472	movdqa	%xmm2,-96(%rax)
3473	movdqa	%xmm4,%xmm2
3474	paddd	%xmm0,%xmm1
3475	pcmpeqd	%xmm5,%xmm0
3476	movdqa	%xmm3,-80(%rax)
3477	movdqa	%xmm4,%xmm3
3478
3479	paddd	%xmm1,%xmm2
3480	pcmpeqd	%xmm5,%xmm1
3481	movdqa	%xmm0,-64(%rax)
3482	movdqa	%xmm4,%xmm0
3483
3484	paddd	%xmm2,%xmm3
3485	pcmpeqd	%xmm5,%xmm2
3486	movdqa	%xmm1,-48(%rax)
3487	movdqa	%xmm4,%xmm1
3488
3489	paddd	%xmm3,%xmm0
3490	pcmpeqd	%xmm5,%xmm3
3491	movdqa	%xmm2,-32(%rax)
3492	movdqa	%xmm4,%xmm2
3493	paddd	%xmm0,%xmm1
3494	pcmpeqd	%xmm5,%xmm0
3495	movdqa	%xmm3,-16(%rax)
3496	movdqa	%xmm4,%xmm3
3497
3498	paddd	%xmm1,%xmm2
3499	pcmpeqd	%xmm5,%xmm1
3500	movdqa	%xmm0,0(%rax)
3501	movdqa	%xmm4,%xmm0
3502
3503	paddd	%xmm2,%xmm3
3504	pcmpeqd	%xmm5,%xmm2
3505	movdqa	%xmm1,16(%rax)
3506	movdqa	%xmm4,%xmm1
3507
3508	paddd	%xmm3,%xmm0
3509	pcmpeqd	%xmm5,%xmm3
3510	movdqa	%xmm2,32(%rax)
3511	movdqa	%xmm4,%xmm2
3512	paddd	%xmm0,%xmm1
3513	pcmpeqd	%xmm5,%xmm0
3514	movdqa	%xmm3,48(%rax)
3515	movdqa	%xmm4,%xmm3
3516
3517	paddd	%xmm1,%xmm2
3518	pcmpeqd	%xmm5,%xmm1
3519	movdqa	%xmm0,64(%rax)
3520	movdqa	%xmm4,%xmm0
3521
3522	paddd	%xmm2,%xmm3
3523	pcmpeqd	%xmm5,%xmm2
3524	movdqa	%xmm1,80(%rax)
3525	movdqa	%xmm4,%xmm1
3526
3527	paddd	%xmm3,%xmm0
3528	pcmpeqd	%xmm5,%xmm3
3529	movdqa	%xmm2,96(%rax)
3530	movdqa	%xmm4,%xmm2
/* Last mask (lanes 30 and 31) */
3531	movdqa	%xmm3,112(%rax)
3532	jmp	.Lgather
3533
3534.align	32
3535.Lgather:
/*
 * One pass per table row.  xmm4 and xmm5 are two OR accumulators;
 * r11 - 128 is the start of the current row and rax - 128 the start of
 * the mask array.
 */
3536	pxor	%xmm4,%xmm4
3537	pxor	%xmm5,%xmm5
3538	movdqa	-128(%r11),%xmm0
3539	movdqa	-112(%r11),%xmm1
3540	movdqa	-96(%r11),%xmm2
3541	pand	-128(%rax),%xmm0
3542	movdqa	-80(%r11),%xmm3
3543	pand	-112(%rax),%xmm1
3544	por	%xmm0,%xmm4
3545	pand	-96(%rax),%xmm2
3546	por	%xmm1,%xmm5
3547	pand	-80(%rax),%xmm3
3548	por	%xmm2,%xmm4
3549	por	%xmm3,%xmm5
3550	movdqa	-64(%r11),%xmm0
3551	movdqa	-48(%r11),%xmm1
3552	movdqa	-32(%r11),%xmm2
3553	pand	-64(%rax),%xmm0
3554	movdqa	-16(%r11),%xmm3
3555	pand	-48(%rax),%xmm1
3556	por	%xmm0,%xmm4
3557	pand	-32(%rax),%xmm2
3558	por	%xmm1,%xmm5
3559	pand	-16(%rax),%xmm3
3560	por	%xmm2,%xmm4
3561	por	%xmm3,%xmm5
3562	movdqa	0(%r11),%xmm0
3563	movdqa	16(%r11),%xmm1
3564	movdqa	32(%r11),%xmm2
3565	pand	0(%rax),%xmm0
3566	movdqa	48(%r11),%xmm3
3567	pand	16(%rax),%xmm1
3568	por	%xmm0,%xmm4
3569	pand	32(%rax),%xmm2
3570	por	%xmm1,%xmm5
3571	pand	48(%rax),%xmm3
3572	por	%xmm2,%xmm4
3573	por	%xmm3,%xmm5
3574	movdqa	64(%r11),%xmm0
3575	movdqa	80(%r11),%xmm1
3576	movdqa	96(%r11),%xmm2
3577	pand	64(%rax),%xmm0
3578	movdqa	112(%r11),%xmm3
3579	pand	80(%rax),%xmm1
3580	por	%xmm0,%xmm4
3581	pand	96(%rax),%xmm2
3582	por	%xmm1,%xmm5
3583	pand	112(%rax),%xmm3
3584	por	%xmm2,%xmm4
3585	por	%xmm3,%xmm5
/* Merge the two accumulators and advance to the next row */
3586	por	%xmm5,%xmm4
3587	leaq	256(%r11),%r11
/* 0x4e swaps the qword halves; the OR leaves the selected qword low */
3588	pshufd	$0x4e,%xmm4,%xmm0
3589	por	%xmm4,%xmm0
3590	movq	%xmm0,(%rdi)
3591	leaq	8(%rdi),%rdi
3592	subl	$1,%esi
3593	jnz	.Lgather
3594
/* Drop the mask area: rsp = value saved in r10 at entry */
3595	leaq	(%r10),%rsp
/* rsp is valid again: CFA = rsp + 8 for the return instruction */
.cfi_def_cfa_register	%rsp
/* 0xf3,0xc3 is the two-byte "rep ret" encoding */
3596	.byte	0xf3,0xc3
3597.LSEH_end_bn_gather5:
3598.cfi_endproc
3599.size	bn_gather5,.-bn_gather5
/*
 * .Linc -- two 16-byte constant vectors of 32-bit lanes, loaded with
 * movdqa from offsets 0 and 16 by the mask-building code in this file
 * (hence the alignment):
 *   offset  0: {0,0,1,1}  first lane counter
 *   offset 16: {2,2,2,2}  step added to get each following counter
 */
3600.align	64
3601.Linc:
3602.long	0,0, 1,1
3603.long	2,2, 2,2
/*
 * NUL-terminated ASCII banner; the bytes spell "Montgomery
 * Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
 * <appro@openssl.org>".
 */
3604.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3605