xref: /freebsd/sys/crypto/openssl/amd64/x86_64-mont5.S (revision 51015e6d0f570239b0c2088dc6cf2b018928375d)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
3.text
4
5
6
/*
 * bn_mul_mont_gather5 - word-by-word multiply-and-reduce of two multi-word
 * integers, where the words of the second operand are picked out of a table
 * with SSE2 compare masks instead of a selector-dependent address.
 * (Presumably Montgomery multiplication, going by the name - TODO confirm.)
 *
 * Register use on entry, as read from the code below:
 *   %rdi     result vector, written by .Lsub and .Lcopy
 *   %rsi     first operand, read one 64-bit word at a time
 *   %rdx     table base; 256 bytes are consumed per selected word
 *   %rcx     vector multiplied by the per-pass factor in %rbp and subtracted
 *            at the end (presumably the modulus - TODO confirm)
 *   %r8      pointer to a 64-bit constant used to derive the per-pass factor
 *   %r9d     number of 64-bit words
 *   8(%rsp)  32-bit selector compared against the counters built from .Linc
 *
 * Returns 1 in %rax.  When the word count is a multiple of 8 control is
 * transferred to .Lmul4x_enter instead of the generic path.
 */
7.globl	bn_mul_mont_gather5
8.type	bn_mul_mont_gather5,@function
9.align	64
10bn_mul_mont_gather5:
11.cfi_startproc
/* Zero-extend the word count and keep the incoming stack pointer in %rax. */
12	movl	%r9d,%r9d
13	movq	%rsp,%rax
14.cfi_def_cfa_register	%rax
/*
 * Word count not a multiple of 8: take the generic path below.  Otherwise
 * load the dword at OPENSSL_ia32cap_P+8 into %r11d and continue at
 * .Lmul4x_enter, which tests it.
 */
15	testl	$7,%r9d
16	jnz	.Lmul_enter
17	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
18	jmp	.Lmul4x_enter
19
20.align	16
21.Lmul_enter:
/* Fetch the 32-bit selector from the stack before %rsp is moved. */
22	movd	8(%rsp),%xmm5
/* Save the callee-saved registers; the epilogue reloads them via %rsi. */
23	pushq	%rbx
24.cfi_offset	%rbx,-16
25	pushq	%rbp
26.cfi_offset	%rbp,-24
27	pushq	%r12
28.cfi_offset	%r12,-32
29	pushq	%r13
30.cfi_offset	%r13,-40
31	pushq	%r14
32.cfi_offset	%r14,-48
33	pushq	%r15
34.cfi_offset	%r15,-56
35
/*
 * %r10 = (%rsp - 280 - 8*count) rounded down to a multiple of 1024: the
 * address the new frame will start at.  %r9 is negated only for the lea.
 */
36	negq	%r9
37	movq	%rsp,%r11
38	leaq	-280(%rsp,%r9,8),%r10
39	negq	%r9
40	andq	$-1024,%r10
41
42
43
44
45
46
47
48
49
/*
 * Move %rsp down to %r10 in steps of at most 4096 bytes, reading one word
 * at every step so that each page on the way is touched in order.
 */
50	subq	%r10,%r11
51	andq	$-4096,%r11
52	leaq	(%r10,%r11,1),%rsp
53	movq	(%rsp),%r11
54	cmpq	%r10,%rsp
55	ja	.Lmul_page_walk
56	jmp	.Lmul_page_walk_done
57
58.Lmul_page_walk:
59	leaq	-4096(%rsp),%rsp
60	movq	(%rsp),%r11
61	cmpq	%r10,%rsp
62	ja	.Lmul_page_walk
63.Lmul_page_walk_done:
64
/*
 * %r10 = address of .Linc (defined outside this part of the file).
 * The original stack pointer is parked at 8(%rsp,%r9,8) for the epilogue.
 */
65	leaq	.Linc(%rip),%r10
66	movq	%rax,8(%rsp,%r9,8)
67.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
68.Lmul_body:
69
/*
 * %r12 = table base + 128, so the 16 vectors of one table row are reachable
 * with displacements -128..112.  %xmm0/%xmm1 receive the two vectors stored
 * at .Linc.  %r10 becomes the 16-byte aligned base of the mask area; the
 * masks live at 112(%r10)..352(%r10).
 */
70	leaq	128(%rdx),%r12
71	movdqa	0(%r10),%xmm0
72	movdqa	16(%r10),%xmm1
73	leaq	24-112(%rsp,%r9,8),%r10
74	andq	$-16,%r10
75
/*
 * Build the 16 selection masks.  The selector is broadcast to all four
 * dwords of %xmm5; %xmm4 keeps the second .Linc vector as the increment.
 * Each step adds the increment to a running counter vector, compares a
 * counter vector with the selector (pcmpeqd gives all-ones on equality) and
 * stores the resulting mask.  The counters rotate through %xmm0..%xmm3.
 * NOTE(review): the actual counter values depend on the contents of .Linc,
 * which is not visible here.
 */
76	pshufd	$0,%xmm5,%xmm5
77	movdqa	%xmm1,%xmm4
78	movdqa	%xmm1,%xmm2
79	paddd	%xmm0,%xmm1
80	pcmpeqd	%xmm5,%xmm0
81.byte	0x67
82	movdqa	%xmm4,%xmm3
83	paddd	%xmm1,%xmm2
84	pcmpeqd	%xmm5,%xmm1
85	movdqa	%xmm0,112(%r10)
86	movdqa	%xmm4,%xmm0
87
88	paddd	%xmm2,%xmm3
89	pcmpeqd	%xmm5,%xmm2
90	movdqa	%xmm1,128(%r10)
91	movdqa	%xmm4,%xmm1
92
93	paddd	%xmm3,%xmm0
94	pcmpeqd	%xmm5,%xmm3
95	movdqa	%xmm2,144(%r10)
96	movdqa	%xmm4,%xmm2
97
98	paddd	%xmm0,%xmm1
99	pcmpeqd	%xmm5,%xmm0
100	movdqa	%xmm3,160(%r10)
101	movdqa	%xmm4,%xmm3
102	paddd	%xmm1,%xmm2
103	pcmpeqd	%xmm5,%xmm1
104	movdqa	%xmm0,176(%r10)
105	movdqa	%xmm4,%xmm0
106
107	paddd	%xmm2,%xmm3
108	pcmpeqd	%xmm5,%xmm2
109	movdqa	%xmm1,192(%r10)
110	movdqa	%xmm4,%xmm1
111
112	paddd	%xmm3,%xmm0
113	pcmpeqd	%xmm5,%xmm3
114	movdqa	%xmm2,208(%r10)
115	movdqa	%xmm4,%xmm2
116
117	paddd	%xmm0,%xmm1
118	pcmpeqd	%xmm5,%xmm0
119	movdqa	%xmm3,224(%r10)
120	movdqa	%xmm4,%xmm3
121	paddd	%xmm1,%xmm2
122	pcmpeqd	%xmm5,%xmm1
123	movdqa	%xmm0,240(%r10)
124	movdqa	%xmm4,%xmm0
125
126	paddd	%xmm2,%xmm3
127	pcmpeqd	%xmm5,%xmm2
128	movdqa	%xmm1,256(%r10)
129	movdqa	%xmm4,%xmm1
130
131	paddd	%xmm3,%xmm0
132	pcmpeqd	%xmm5,%xmm3
133	movdqa	%xmm2,272(%r10)
134	movdqa	%xmm4,%xmm2
135
136	paddd	%xmm0,%xmm1
137	pcmpeqd	%xmm5,%xmm0
138	movdqa	%xmm3,288(%r10)
139	movdqa	%xmm4,%xmm3
140	paddd	%xmm1,%xmm2
141	pcmpeqd	%xmm5,%xmm1
142	movdqa	%xmm0,304(%r10)
143
144	paddd	%xmm2,%xmm3
145.byte	0x67
146	pcmpeqd	%xmm5,%xmm2
147	movdqa	%xmm1,320(%r10)
148
149	pcmpeqd	%xmm5,%xmm3
150	movdqa	%xmm2,336(%r10)
/*
 * First gather.  The last four masks are still in %xmm0..%xmm3 and are
 * applied directly to the table vectors at 64..112(%r12); the other twelve
 * table vectors are ANDed with the masks stored above.  Everything is ORed
 * together, so every table vector is read whatever the selector is.
 */
151	pand	64(%r12),%xmm0
152
153	pand	80(%r12),%xmm1
154	pand	96(%r12),%xmm2
155	movdqa	%xmm3,352(%r10)
156	pand	112(%r12),%xmm3
157	por	%xmm2,%xmm0
158	por	%xmm3,%xmm1
159	movdqa	-128(%r12),%xmm4
160	movdqa	-112(%r12),%xmm5
161	movdqa	-96(%r12),%xmm2
162	pand	112(%r10),%xmm4
163	movdqa	-80(%r12),%xmm3
164	pand	128(%r10),%xmm5
165	por	%xmm4,%xmm0
166	pand	144(%r10),%xmm2
167	por	%xmm5,%xmm1
168	pand	160(%r10),%xmm3
169	por	%xmm2,%xmm0
170	por	%xmm3,%xmm1
171	movdqa	-64(%r12),%xmm4
172	movdqa	-48(%r12),%xmm5
173	movdqa	-32(%r12),%xmm2
174	pand	176(%r10),%xmm4
175	movdqa	-16(%r12),%xmm3
176	pand	192(%r10),%xmm5
177	por	%xmm4,%xmm0
178	pand	208(%r10),%xmm2
179	por	%xmm5,%xmm1
180	pand	224(%r10),%xmm3
181	por	%xmm2,%xmm0
182	por	%xmm3,%xmm1
183	movdqa	0(%r12),%xmm4
184	movdqa	16(%r12),%xmm5
185	movdqa	32(%r12),%xmm2
186	pand	240(%r10),%xmm4
187	movdqa	48(%r12),%xmm3
188	pand	256(%r10),%xmm5
189	por	%xmm4,%xmm0
190	pand	272(%r10),%xmm2
191	por	%xmm5,%xmm1
192	pand	288(%r10),%xmm3
193	por	%xmm2,%xmm0
194	por	%xmm3,%xmm1
/*
 * Merge the two accumulators, fold the upper qword onto the lower one
 * (pshufd 0x4e swaps the qwords) and step %r12 to the next 256-byte row.
 * The .byte sequence encodes movq %xmm0,%rbx: %rbx = selected word.
 */
195	por	%xmm1,%xmm0
196	pshufd	$0x4e,%xmm0,%xmm1
197	por	%xmm1,%xmm0
198	leaq	256(%r12),%r12
199.byte	102,72,15,126,195
200
/*
 * First pass.  %r8 = the 64-bit constant pointed to by %r8, %rax = word 0
 * of the %rsi vector, %r14 = outer index (0), %r15 = inner index (0).
 */
201	movq	(%r8),%r8
202	movq	(%rsi),%rax
203
204	xorq	%r14,%r14
205	xorq	%r15,%r15
206
/*
 * %r11:%r10 = word 0 of (%rsi) times %rbx.  %rbp = %r8 * %r10 (low 64 bits)
 * is the factor for this pass.  Word 0 of (%rcx) times %rbp is then added
 * to %r10; only the carry out of that addition is kept, in %r13.
 */
207	movq	%r8,%rbp
208	mulq	%rbx
209	movq	%rax,%r10
210	movq	(%rcx),%rax
211
212	imulq	%r10,%rbp
213	movq	%rdx,%r11
214
215	mulq	%rbp
216	addq	%rax,%r10
217	movq	8(%rsi),%rax
218	adcq	$0,%rdx
219	movq	%rdx,%r13
220
221	leaq	1(%r15),%r15
222	jmp	.L1st_enter
223
224.align	16
/*
 * .L1st: for every remaining index, add the product with %rbp and the
 * product with %rbx, store the sum two slots behind the index at
 * -16(%rsp,%r15,8) and carry the high halves forward in %r13 and
 * %r10/%r11.  The loop ends when %r15 reaches the word count in %r9.
 */
225.L1st:
226	addq	%rax,%r13
227	movq	(%rsi,%r15,8),%rax
228	adcq	$0,%rdx
229	addq	%r11,%r13
230	movq	%r10,%r11
231	adcq	$0,%rdx
232	movq	%r13,-16(%rsp,%r15,8)
233	movq	%rdx,%r13
234
235.L1st_enter:
236	mulq	%rbx
237	addq	%rax,%r11
238	movq	(%rcx,%r15,8),%rax
239	adcq	$0,%rdx
240	leaq	1(%r15),%r15
241	movq	%rdx,%r10
242
243	mulq	%rbp
244	cmpq	%r9,%r15
245	jne	.L1st
246
247
/* Loop tail: store the last two result words and the carry word. */
248	addq	%rax,%r13
249	adcq	$0,%rdx
250	addq	%r11,%r13
251	adcq	$0,%rdx
252	movq	%r13,-16(%rsp,%r9,8)
253	movq	%rdx,%r13
254	movq	%r10,%r11
255
256	xorq	%rdx,%rdx
257	addq	%r11,%r13
258	adcq	$0,%rdx
259	movq	%r13,-8(%rsp,%r9,8)
260	movq	%rdx,(%rsp,%r9,8)
261
262	leaq	1(%r14),%r14
263	jmp	.Louter
264.align	16
/*
 * .Louter: one pass per remaining selected word.  %rdx = mask area + 128
 * (same 16-byte aligned area as above), so the 16 stored masks sit at
 * -128..112(%rdx).  The gather below mirrors the first one, but takes all
 * 16 masks from memory.
 */
265.Louter:
266	leaq	24+128(%rsp,%r9,8),%rdx
267	andq	$-16,%rdx
268	pxor	%xmm4,%xmm4
269	pxor	%xmm5,%xmm5
270	movdqa	-128(%r12),%xmm0
271	movdqa	-112(%r12),%xmm1
272	movdqa	-96(%r12),%xmm2
273	movdqa	-80(%r12),%xmm3
274	pand	-128(%rdx),%xmm0
275	pand	-112(%rdx),%xmm1
276	por	%xmm0,%xmm4
277	pand	-96(%rdx),%xmm2
278	por	%xmm1,%xmm5
279	pand	-80(%rdx),%xmm3
280	por	%xmm2,%xmm4
281	por	%xmm3,%xmm5
282	movdqa	-64(%r12),%xmm0
283	movdqa	-48(%r12),%xmm1
284	movdqa	-32(%r12),%xmm2
285	movdqa	-16(%r12),%xmm3
286	pand	-64(%rdx),%xmm0
287	pand	-48(%rdx),%xmm1
288	por	%xmm0,%xmm4
289	pand	-32(%rdx),%xmm2
290	por	%xmm1,%xmm5
291	pand	-16(%rdx),%xmm3
292	por	%xmm2,%xmm4
293	por	%xmm3,%xmm5
294	movdqa	0(%r12),%xmm0
295	movdqa	16(%r12),%xmm1
296	movdqa	32(%r12),%xmm2
297	movdqa	48(%r12),%xmm3
298	pand	0(%rdx),%xmm0
299	pand	16(%rdx),%xmm1
300	por	%xmm0,%xmm4
301	pand	32(%rdx),%xmm2
302	por	%xmm1,%xmm5
303	pand	48(%rdx),%xmm3
304	por	%xmm2,%xmm4
305	por	%xmm3,%xmm5
306	movdqa	64(%r12),%xmm0
307	movdqa	80(%r12),%xmm1
308	movdqa	96(%r12),%xmm2
309	movdqa	112(%r12),%xmm3
310	pand	64(%rdx),%xmm0
311	pand	80(%rdx),%xmm1
312	por	%xmm0,%xmm4
313	pand	96(%rdx),%xmm2
314	por	%xmm1,%xmm5
315	pand	112(%rdx),%xmm3
316	por	%xmm2,%xmm4
317	por	%xmm3,%xmm5
318	por	%xmm5,%xmm4
319	pshufd	$0x4e,%xmm4,%xmm0
320	por	%xmm4,%xmm0
321	leaq	256(%r12),%r12
322
/* %rax = word 0 of (%rsi); the .byte sequence is movq %xmm0,%rbx. */
323	movq	(%rsi),%rax
324.byte	102,72,15,126,195
325
/*
 * Start of the pass: same as the first pass, except that the previous
 * result word at (%rsp) is added in before the factor %rbp is derived.
 */
326	xorq	%r15,%r15
327	movq	%r8,%rbp
328	movq	(%rsp),%r10
329
330	mulq	%rbx
331	addq	%rax,%r10
332	movq	(%rcx),%rax
333	adcq	$0,%rdx
334
335	imulq	%r10,%rbp
336	movq	%rdx,%r11
337
338	mulq	%rbp
339	addq	%rax,%r10
340	movq	8(%rsi),%rax
341	adcq	$0,%rdx
342	movq	8(%rsp),%r10
343	movq	%rdx,%r13
344
345	leaq	1(%r15),%r15
346	jmp	.Linner_enter
347
348.align	16
/*
 * .Linner: like .L1st, with the previous result word (%rsp,%r15,8) added
 * into each sum before it is written back two slots lower.
 */
349.Linner:
350	addq	%rax,%r13
351	movq	(%rsi,%r15,8),%rax
352	adcq	$0,%rdx
353	addq	%r10,%r13
354	movq	(%rsp,%r15,8),%r10
355	adcq	$0,%rdx
356	movq	%r13,-16(%rsp,%r15,8)
357	movq	%rdx,%r13
358
359.Linner_enter:
360	mulq	%rbx
361	addq	%rax,%r11
362	movq	(%rcx,%r15,8),%rax
363	adcq	$0,%rdx
364	addq	%r11,%r10
365	movq	%rdx,%r11
366	adcq	$0,%r11
367	leaq	1(%r15),%r15
368
369	mulq	%rbp
370	cmpq	%r9,%r15
371	jne	.Linner
372
/* Pass tail: fold in the old carry word and store the top of the result. */
373	addq	%rax,%r13
374	adcq	$0,%rdx
375	addq	%r10,%r13
376	movq	(%rsp,%r9,8),%r10
377	adcq	$0,%rdx
378	movq	%r13,-16(%rsp,%r9,8)
379	movq	%rdx,%r13
380
381	xorq	%rdx,%rdx
382	addq	%r11,%r13
383	adcq	$0,%rdx
384	addq	%r10,%r13
385	adcq	$0,%rdx
386	movq	%r13,-8(%rsp,%r9,8)
387	movq	%rdx,(%rsp,%r9,8)
388
/* Next pass while the outer index is below the word count. */
389	leaq	1(%r14),%r14
390	cmpq	%r9,%r14
391	jb	.Louter
392
/*
 * Final subtraction.  The xorq clears both the index and CF; the loop
 * body only uses mov/lea/dec besides sbbq, so the borrow survives from one
 * iteration to the next.  Each iteration stores
 * (stack word - (%rcx) word - borrow) into the result vector at (%rdi).
 */
393	xorq	%r14,%r14
394	movq	(%rsp),%rax
395	leaq	(%rsp),%rsi
396	movq	%r9,%r15
397	jmp	.Lsub
398.align	16
399.Lsub:	sbbq	(%rcx,%r14,8),%rax
400	movq	%rax,(%rdi,%r14,8)
401	movq	8(%rsi,%r14,8),%rax
402	leaq	1(%r14),%r14
403	decq	%r15
404	jnz	.Lsub
405
/*
 * %rax = carry word minus the final borrow, %rbx = its bitwise complement.
 * The two are used as complementary AND masks below (presumably %rax is
 * either 0 or all-ones at this point - TODO confirm).
 */
406	sbbq	$0,%rax
407	movq	$-1,%rbx
408	xorq	%rax,%rbx
409	xorq	%r14,%r14
410	movq	%r9,%r15
411
/*
 * .Lcopy: result word = (difference & %rbx) | (stack word & %rax), chosen
 * with masks rather than a branch.  Each stack word is overwritten with the
 * loop index after it has been read, so the temporary does not stay behind.
 */
412.Lcopy:
413	movq	(%rdi,%r14,8),%rcx
414	movq	(%rsp,%r14,8),%rdx
415	andq	%rbx,%rcx
416	andq	%rax,%rdx
417	movq	%r14,(%rsp,%r14,8)
418	orq	%rcx,%rdx
419	movq	%rdx,(%rdi,%r14,8)
420	leaq	1(%r14),%r14
421	subq	$1,%r15
422	jnz	.Lcopy
423
/*
 * Epilogue: %rsi = original stack pointer saved at .Lmul_body, return
 * value 1, reload the callee-saved registers from below it, restore %rsp.
 */
424	movq	8(%rsp,%r9,8),%rsi
425.cfi_def_cfa	%rsi,8
426	movq	$1,%rax
427
428	movq	-48(%rsi),%r15
429.cfi_restore	%r15
430	movq	-40(%rsi),%r14
431.cfi_restore	%r14
432	movq	-32(%rsi),%r13
433.cfi_restore	%r13
434	movq	-24(%rsi),%r12
435.cfi_restore	%r12
436	movq	-16(%rsi),%rbp
437.cfi_restore	%rbp
438	movq	-8(%rsi),%rbx
439.cfi_restore	%rbx
440	leaq	(%rsi),%rsp
441.cfi_def_cfa_register	%rsp
442.Lmul_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding of the return. */
443	.byte	0xf3,0xc3
444.cfi_endproc
445.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 - frame setup and teardown around mul4x_internal.
 *
 * Not declared .globl in this part of the file; bn_mul_mont_gather5 jumps
 * to .Lmul4x_enter with %rax = incoming stack pointer and %r11d = the dword
 * at OPENSSL_ia32cap_P+8.  The argument registers are passed through to
 * mul4x_internal untouched, apart from %r9, which is scaled from a word
 * count to a byte count.  Returns 1 in %rax.
 */
446.type	bn_mul4x_mont_gather5,@function
447.align	32
448bn_mul4x_mont_gather5:
449.cfi_startproc
450.byte	0x67
451	movq	%rsp,%rax
452.cfi_def_cfa_register	%rax
453.Lmul4x_enter:
/*
 * If all bits of 0x80108 are set in %r11d, hand over to .Lmulx4x_enter
 * (defined outside this part of the file).
 * NOTE(review): presumably these are the BMI1/BMI2/ADX capability bits -
 * confirm against the OPENSSL_ia32cap_P layout.
 */
454	andl	$0x80108,%r11d
455	cmpl	$0x80108,%r11d
456	je	.Lmulx4x_enter
/* Save the callee-saved registers; the epilogue reloads them via %rsi. */
457	pushq	%rbx
458.cfi_offset	%rbx,-16
459	pushq	%rbp
460.cfi_offset	%rbp,-24
461	pushq	%r12
462.cfi_offset	%r12,-32
463	pushq	%r13
464.cfi_offset	%r13,-40
465	pushq	%r14
466.cfi_offset	%r14,-48
467	pushq	%r15
468.cfi_offset	%r15,-56
469.Lmul4x_prologue:
470
/*
 * %r9 = word count * 8 (length in bytes), %r10 = 3 * that length, then
 * %r9 is negated so it can be used as a negative offset in the leas below.
 */
471.byte	0x67
472	shll	$3,%r9d
473	leaq	(%r9,%r9,2),%r10
474	negq	%r9
475
476
477
478
479
480
481
482
483
484
/*
 * Choose the frame address in %rbp.  %r11 = ((%rsp - 320 - 2*len) - %rdi)
 * modulo 4096, i.e. the distance between the candidate frame and the result
 * vector within a 4 KiB window.  If 3*len is not below that distance the
 * frame is simply moved down by it; otherwise .Lmul4xsp_alt is used.
 * NOTE(review): presumably this keeps the frame and the result vector from
 * aliasing modulo 4096 - confirm against the generator script.
 */
485	leaq	-320(%rsp,%r9,2),%r11
486	movq	%rsp,%rbp
487	subq	%rdi,%r11
488	andq	$4095,%r11
489	cmpq	%r11,%r10
490	jb	.Lmul4xsp_alt
491	subq	%r11,%rbp
492	leaq	-320(%rbp,%r9,2),%rbp
493	jmp	.Lmul4xsp_done
494
495.align	32
/*
 * Alternative placement: move the frame down by
 * max(0, %r11 - (4096 - 320 - 2*len)).  The zero is loaded with movq rather
 * than xor so that the carry flag from the subq is still valid for cmovcq.
 */
496.Lmul4xsp_alt:
497	leaq	4096-320(,%r9,2),%r10
498	leaq	-320(%rbp,%r9,2),%rbp
499	subq	%r10,%r11
500	movq	$0,%r10
501	cmovcq	%r10,%r11
502	subq	%r11,%rbp
503.Lmul4xsp_done:
/*
 * Align the frame to 64 bytes, then move %rsp down to it in steps of at
 * most 4096 bytes, reading one word per step so every page is touched.
 */
504	andq	$-64,%rbp
505	movq	%rsp,%r11
506	subq	%rbp,%r11
507	andq	$-4096,%r11
508	leaq	(%r11,%rbp,1),%rsp
509	movq	(%rsp),%r10
510	cmpq	%rbp,%rsp
511	ja	.Lmul4x_page_walk
512	jmp	.Lmul4x_page_walk_done
513
514.Lmul4x_page_walk:
515	leaq	-4096(%rsp),%rsp
516	movq	(%rsp),%r10
517	cmpq	%rbp,%rsp
518	ja	.Lmul4x_page_walk
519.Lmul4x_page_walk_done:
520
/* %r9 back to the positive byte length expected by mul4x_internal. */
521	negq	%r9
522
/* Park the original stack pointer at 40(%rsp) for the epilogue. */
523	movq	%rax,40(%rsp)
524.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
525.Lmul4x_body:
526
/* %rax still holds the original stack pointer; mul4x_internal reads 8(%rax). */
527	call	mul4x_internal
528
/*
 * Epilogue: %rsi = saved original stack pointer, return value 1, reload
 * the callee-saved registers from below it, restore %rsp.
 */
529	movq	40(%rsp),%rsi
530.cfi_def_cfa	%rsi,8
531	movq	$1,%rax
532
533	movq	-48(%rsi),%r15
534.cfi_restore	%r15
535	movq	-40(%rsi),%r14
536.cfi_restore	%r14
537	movq	-32(%rsi),%r13
538.cfi_restore	%r13
539	movq	-24(%rsi),%r12
540.cfi_restore	%r12
541	movq	-16(%rsi),%rbp
542.cfi_restore	%rbp
543	movq	-8(%rsi),%rbx
544.cfi_restore	%rbx
545	leaq	(%rsi),%rsp
546.cfi_def_cfa_register	%rsp
547.Lmul4x_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding of the return. */
548	.byte	0xf3,0xc3
549.cfi_endproc
550.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
551
/*
 * mul4x_internal - 4-way unrolled multiply-and-reduce core, called by
 * bn_mul4x_mont_gather5 and bn_power5.
 *
 * Inputs, as used below:
 *   %rax  original stack pointer of the public entry point; the 32-bit
 *         selector is read from 8(%rax)
 *   %rdx  table base; 256 bytes are consumed per selected word
 *   %rsi  first operand vector
 *   %rcx  vector multiplied by the per-pass factor in %rbp (presumably the
 *         modulus - TODO confirm)
 *   %r8   pointer to a 64-bit constant used to derive the per-pass factor
 *   %r9   operand length in bytes
 *   %rdi  value stashed at 56+8(%rsp) and reloaded before the final jump
 *   %rsp  frame prepared by the caller; displacements written as N+8 are
 *         slot N of that frame plus the return address pushed by the call
 *
 * There is no ret here: the routine ends by jumping to .Lsqr4x_sub_entry,
 * which is defined outside this part of the file.
 */
552.type	mul4x_internal,@function
553.align	32
554mul4x_internal:
555.cfi_startproc
/*
 * %r13 = table base + 128 + 32*len, used as the end marker for %r12 at the
 * bottom of .Louter4x; %r9 is scaled up only for this lea and then
 * restored.  %xmm5 = selector, %xmm0/%xmm1 = the two vectors at .Linc
 * (defined outside this part of the file), %r10 = base of the mask area
 * (masks at 112..352(%r10)), %r12 = table base + 128.
 */
556	shlq	$5,%r9
557	movd	8(%rax),%xmm5
558	leaq	.Linc(%rip),%rax
559	leaq	128(%rdx,%r9,1),%r13
560	shrq	$5,%r9
561	movdqa	0(%rax),%xmm0
562	movdqa	16(%rax),%xmm1
563	leaq	88-112(%rsp,%r9,1),%r10
564	leaq	128(%rdx),%r12
565
/*
 * Build the 16 selection masks: broadcast the selector, keep the second
 * .Linc vector in %xmm4 as the increment, and for each step advance a
 * counter vector, compare a counter vector with the selector and store the
 * all-ones/zero result.  The counters rotate through %xmm0..%xmm3.
 */
566	pshufd	$0,%xmm5,%xmm5
567	movdqa	%xmm1,%xmm4
568.byte	0x67,0x67
569	movdqa	%xmm1,%xmm2
570	paddd	%xmm0,%xmm1
571	pcmpeqd	%xmm5,%xmm0
572.byte	0x67
573	movdqa	%xmm4,%xmm3
574	paddd	%xmm1,%xmm2
575	pcmpeqd	%xmm5,%xmm1
576	movdqa	%xmm0,112(%r10)
577	movdqa	%xmm4,%xmm0
578
579	paddd	%xmm2,%xmm3
580	pcmpeqd	%xmm5,%xmm2
581	movdqa	%xmm1,128(%r10)
582	movdqa	%xmm4,%xmm1
583
584	paddd	%xmm3,%xmm0
585	pcmpeqd	%xmm5,%xmm3
586	movdqa	%xmm2,144(%r10)
587	movdqa	%xmm4,%xmm2
588
589	paddd	%xmm0,%xmm1
590	pcmpeqd	%xmm5,%xmm0
591	movdqa	%xmm3,160(%r10)
592	movdqa	%xmm4,%xmm3
593	paddd	%xmm1,%xmm2
594	pcmpeqd	%xmm5,%xmm1
595	movdqa	%xmm0,176(%r10)
596	movdqa	%xmm4,%xmm0
597
598	paddd	%xmm2,%xmm3
599	pcmpeqd	%xmm5,%xmm2
600	movdqa	%xmm1,192(%r10)
601	movdqa	%xmm4,%xmm1
602
603	paddd	%xmm3,%xmm0
604	pcmpeqd	%xmm5,%xmm3
605	movdqa	%xmm2,208(%r10)
606	movdqa	%xmm4,%xmm2
607
608	paddd	%xmm0,%xmm1
609	pcmpeqd	%xmm5,%xmm0
610	movdqa	%xmm3,224(%r10)
611	movdqa	%xmm4,%xmm3
612	paddd	%xmm1,%xmm2
613	pcmpeqd	%xmm5,%xmm1
614	movdqa	%xmm0,240(%r10)
615	movdqa	%xmm4,%xmm0
616
617	paddd	%xmm2,%xmm3
618	pcmpeqd	%xmm5,%xmm2
619	movdqa	%xmm1,256(%r10)
620	movdqa	%xmm4,%xmm1
621
622	paddd	%xmm3,%xmm0
623	pcmpeqd	%xmm5,%xmm3
624	movdqa	%xmm2,272(%r10)
625	movdqa	%xmm4,%xmm2
626
627	paddd	%xmm0,%xmm1
628	pcmpeqd	%xmm5,%xmm0
629	movdqa	%xmm3,288(%r10)
630	movdqa	%xmm4,%xmm3
631	paddd	%xmm1,%xmm2
632	pcmpeqd	%xmm5,%xmm1
633	movdqa	%xmm0,304(%r10)
634
635	paddd	%xmm2,%xmm3
636.byte	0x67
637	pcmpeqd	%xmm5,%xmm2
638	movdqa	%xmm1,320(%r10)
639
640	pcmpeqd	%xmm5,%xmm3
641	movdqa	%xmm2,336(%r10)
/*
 * First gather: the last four masks are still in %xmm0..%xmm3 and are
 * applied directly to the table vectors at 64..112(%r12); the remaining
 * twelve table vectors are ANDed with the stored masks and ORed in.
 */
642	pand	64(%r12),%xmm0
643
644	pand	80(%r12),%xmm1
645	pand	96(%r12),%xmm2
646	movdqa	%xmm3,352(%r10)
647	pand	112(%r12),%xmm3
648	por	%xmm2,%xmm0
649	por	%xmm3,%xmm1
650	movdqa	-128(%r12),%xmm4
651	movdqa	-112(%r12),%xmm5
652	movdqa	-96(%r12),%xmm2
653	pand	112(%r10),%xmm4
654	movdqa	-80(%r12),%xmm3
655	pand	128(%r10),%xmm5
656	por	%xmm4,%xmm0
657	pand	144(%r10),%xmm2
658	por	%xmm5,%xmm1
659	pand	160(%r10),%xmm3
660	por	%xmm2,%xmm0
661	por	%xmm3,%xmm1
662	movdqa	-64(%r12),%xmm4
663	movdqa	-48(%r12),%xmm5
664	movdqa	-32(%r12),%xmm2
665	pand	176(%r10),%xmm4
666	movdqa	-16(%r12),%xmm3
667	pand	192(%r10),%xmm5
668	por	%xmm4,%xmm0
669	pand	208(%r10),%xmm2
670	por	%xmm5,%xmm1
671	pand	224(%r10),%xmm3
672	por	%xmm2,%xmm0
673	por	%xmm3,%xmm1
674	movdqa	0(%r12),%xmm4
675	movdqa	16(%r12),%xmm5
676	movdqa	32(%r12),%xmm2
677	pand	240(%r10),%xmm4
678	movdqa	48(%r12),%xmm3
679	pand	256(%r10),%xmm5
680	por	%xmm4,%xmm0
681	pand	272(%r10),%xmm2
682	por	%xmm5,%xmm1
683	pand	288(%r10),%xmm3
684	por	%xmm2,%xmm0
685	por	%xmm3,%xmm1
/*
 * Merge the accumulators, fold the upper qword onto the lower one and step
 * %r12 to the next 256-byte table row.  The .byte sequence encodes
 * movq %xmm0,%rbx: %rbx = selected word.
 */
686	por	%xmm1,%xmm0
687	pshufd	$0x4e,%xmm0,%xmm1
688	por	%xmm1,%xmm0
689	leaq	256(%r12),%r12
690.byte	102,72,15,126,195
691
/* Stash the table end marker and %rdi in the frame. */
692	movq	%r13,16+8(%rsp)
693	movq	%rdi,56+8(%rsp)
694
/*
 * %r8 = the 64-bit constant, %rax = word 0 of the %rsi vector.  %rsi is
 * moved to the end of its vector and %r9 negated, so (%rsi,%r9,1) is word 0
 * and negative offsets counting up to zero drive the loops.
 */
695	movq	(%r8),%r8
696	movq	(%rsi),%rax
697	leaq	(%rsi,%r9,1),%rsi
698	negq	%r9
699
/*
 * First pass, words 0 and 1.  %rbp = %r8 * low word of (word 0 * %rbx) is
 * the factor for this pass; %r14 points at the output area at 64+8(%rsp).
 * Adding word 0 of (%rcx) times %rbp to %r10 only produces a carry.
 */
700	movq	%r8,%rbp
701	mulq	%rbx
702	movq	%rax,%r10
703	movq	(%rcx),%rax
704
705	imulq	%r10,%rbp
706	leaq	64+8(%rsp),%r14
707	movq	%rdx,%r11
708
709	mulq	%rbp
710	addq	%rax,%r10
711	movq	8(%rsi,%r9,1),%rax
712	adcq	$0,%rdx
713	movq	%rdx,%rdi
714
715	mulq	%rbx
716	addq	%rax,%r11
717	movq	8(%rcx),%rax
718	adcq	$0,%rdx
719	movq	%rdx,%r10
720
721	mulq	%rbp
722	addq	%rax,%rdi
723	movq	16(%rsi,%r9,1),%rax
724	adcq	$0,%rdx
725	addq	%r11,%rdi
726	leaq	32(%r9),%r15
727	leaq	32(%rcx),%rcx
728	adcq	$0,%rdx
729	movq	%rdi,(%r14)
730	movq	%rdx,%r13
731	jmp	.L1st4x
732
733.align	32
/*
 * .L1st4x: four words per iteration.  Each word takes two multiplies (by
 * %rbx and by %rbp); carries alternate between %r10/%r11 and %r13/%rdi.
 * %r15 is a negative byte offset from the end of the %rsi vector and the
 * loop stops when it reaches zero.
 */
734.L1st4x:
735	mulq	%rbx
736	addq	%rax,%r10
737	movq	-16(%rcx),%rax
738	leaq	32(%r14),%r14
739	adcq	$0,%rdx
740	movq	%rdx,%r11
741
742	mulq	%rbp
743	addq	%rax,%r13
744	movq	-8(%rsi,%r15,1),%rax
745	adcq	$0,%rdx
746	addq	%r10,%r13
747	adcq	$0,%rdx
748	movq	%r13,-24(%r14)
749	movq	%rdx,%rdi
750
751	mulq	%rbx
752	addq	%rax,%r11
753	movq	-8(%rcx),%rax
754	adcq	$0,%rdx
755	movq	%rdx,%r10
756
757	mulq	%rbp
758	addq	%rax,%rdi
759	movq	(%rsi,%r15,1),%rax
760	adcq	$0,%rdx
761	addq	%r11,%rdi
762	adcq	$0,%rdx
763	movq	%rdi,-16(%r14)
764	movq	%rdx,%r13
765
766	mulq	%rbx
767	addq	%rax,%r10
768	movq	0(%rcx),%rax
769	adcq	$0,%rdx
770	movq	%rdx,%r11
771
772	mulq	%rbp
773	addq	%rax,%r13
774	movq	8(%rsi,%r15,1),%rax
775	adcq	$0,%rdx
776	addq	%r10,%r13
777	adcq	$0,%rdx
778	movq	%r13,-8(%r14)
779	movq	%rdx,%rdi
780
781	mulq	%rbx
782	addq	%rax,%r11
783	movq	8(%rcx),%rax
784	adcq	$0,%rdx
785	movq	%rdx,%r10
786
787	mulq	%rbp
788	addq	%rax,%rdi
789	movq	16(%rsi,%r15,1),%rax
790	adcq	$0,%rdx
791	addq	%r11,%rdi
792	leaq	32(%rcx),%rcx
793	adcq	$0,%rdx
794	movq	%rdi,(%r14)
795	movq	%rdx,%r13
796
797	addq	$32,%r15
798	jnz	.L1st4x
799
/*
 * Tail of the first pass: the last two words.  %rax is reloaded with word 0
 * of the %rsi vector for the next pass, %rcx is rewound to the start of its
 * vector, and the final carry is left in %rdi.
 */
800	mulq	%rbx
801	addq	%rax,%r10
802	movq	-16(%rcx),%rax
803	leaq	32(%r14),%r14
804	adcq	$0,%rdx
805	movq	%rdx,%r11
806
807	mulq	%rbp
808	addq	%rax,%r13
809	movq	-8(%rsi),%rax
810	adcq	$0,%rdx
811	addq	%r10,%r13
812	adcq	$0,%rdx
813	movq	%r13,-24(%r14)
814	movq	%rdx,%rdi
815
816	mulq	%rbx
817	addq	%rax,%r11
818	movq	-8(%rcx),%rax
819	adcq	$0,%rdx
820	movq	%rdx,%r10
821
822	mulq	%rbp
823	addq	%rax,%rdi
824	movq	(%rsi,%r9,1),%rax
825	adcq	$0,%rdx
826	addq	%r11,%rdi
827	adcq	$0,%rdx
828	movq	%rdi,-16(%r14)
829	movq	%rdx,%r13
830
831	leaq	(%rcx,%r9,1),%rcx
832
833	xorq	%rdi,%rdi
834	addq	%r10,%r13
835	adcq	$0,%rdi
836	movq	%r13,-8(%r14)
837
838	jmp	.Louter4x
839
840.align	32
/*
 * .Louter4x: one pass per remaining selected word.  The mask area is
 * addressed relative to %r14 (%rdx = %r14 + 16 + 128, masks at
 * -128..112(%rdx)); the gather takes all 16 masks from memory.
 */
841.Louter4x:
842	leaq	16+128(%r14),%rdx
843	pxor	%xmm4,%xmm4
844	pxor	%xmm5,%xmm5
845	movdqa	-128(%r12),%xmm0
846	movdqa	-112(%r12),%xmm1
847	movdqa	-96(%r12),%xmm2
848	movdqa	-80(%r12),%xmm3
849	pand	-128(%rdx),%xmm0
850	pand	-112(%rdx),%xmm1
851	por	%xmm0,%xmm4
852	pand	-96(%rdx),%xmm2
853	por	%xmm1,%xmm5
854	pand	-80(%rdx),%xmm3
855	por	%xmm2,%xmm4
856	por	%xmm3,%xmm5
857	movdqa	-64(%r12),%xmm0
858	movdqa	-48(%r12),%xmm1
859	movdqa	-32(%r12),%xmm2
860	movdqa	-16(%r12),%xmm3
861	pand	-64(%rdx),%xmm0
862	pand	-48(%rdx),%xmm1
863	por	%xmm0,%xmm4
864	pand	-32(%rdx),%xmm2
865	por	%xmm1,%xmm5
866	pand	-16(%rdx),%xmm3
867	por	%xmm2,%xmm4
868	por	%xmm3,%xmm5
869	movdqa	0(%r12),%xmm0
870	movdqa	16(%r12),%xmm1
871	movdqa	32(%r12),%xmm2
872	movdqa	48(%r12),%xmm3
873	pand	0(%rdx),%xmm0
874	pand	16(%rdx),%xmm1
875	por	%xmm0,%xmm4
876	pand	32(%rdx),%xmm2
877	por	%xmm1,%xmm5
878	pand	48(%rdx),%xmm3
879	por	%xmm2,%xmm4
880	por	%xmm3,%xmm5
881	movdqa	64(%r12),%xmm0
882	movdqa	80(%r12),%xmm1
883	movdqa	96(%r12),%xmm2
884	movdqa	112(%r12),%xmm3
885	pand	64(%rdx),%xmm0
886	pand	80(%rdx),%xmm1
887	por	%xmm0,%xmm4
888	pand	96(%rdx),%xmm2
889	por	%xmm1,%xmm5
890	pand	112(%rdx),%xmm3
891	por	%xmm2,%xmm4
892	por	%xmm3,%xmm5
893	por	%xmm5,%xmm4
894	pshufd	$0x4e,%xmm4,%xmm0
895	por	%xmm4,%xmm0
896	leaq	256(%r12),%r12
/* movq %xmm0,%rbx: %rbx = word selected for this pass. */
897.byte	102,72,15,126,195
898
/*
 * Pass start.  %r10 = word 0 of the previous result ((%r14,%r9,1), with %r9
 * negative).  %rax already holds word 0 of the %rsi vector from the end of
 * the previous pass.  The previous top carry in %rdi is stored at (%r14)
 * before %r14 is rewound to the start of the result.
 */
899	movq	(%r14,%r9,1),%r10
900	movq	%r8,%rbp
901	mulq	%rbx
902	addq	%rax,%r10
903	movq	(%rcx),%rax
904	adcq	$0,%rdx
905
906	imulq	%r10,%rbp
907	movq	%rdx,%r11
908	movq	%rdi,(%r14)
909
910	leaq	(%r14,%r9,1),%r14
911
912	mulq	%rbp
913	addq	%rax,%r10
914	movq	8(%rsi,%r9,1),%rax
915	adcq	$0,%rdx
916	movq	%rdx,%rdi
917
918	mulq	%rbx
919	addq	%rax,%r11
920	movq	8(%rcx),%rax
921	adcq	$0,%rdx
922	addq	8(%r14),%r11
923	adcq	$0,%rdx
924	movq	%rdx,%r10
925
926	mulq	%rbp
927	addq	%rax,%rdi
928	movq	16(%rsi,%r9,1),%rax
929	adcq	$0,%rdx
930	addq	%r11,%rdi
931	leaq	32(%r9),%r15
932	leaq	32(%rcx),%rcx
933	adcq	$0,%rdx
934	movq	%rdx,%r13
935	jmp	.Linner4x
936
937.align	32
/*
 * .Linner4x: same four-word structure as .L1st4x, with the previous result
 * word added into each product sum.  Each sum is written one word lower
 * than the word that was read, so the result shifts down by one word per
 * pass.
 */
938.Linner4x:
939	mulq	%rbx
940	addq	%rax,%r10
941	movq	-16(%rcx),%rax
942	adcq	$0,%rdx
943	addq	16(%r14),%r10
944	leaq	32(%r14),%r14
945	adcq	$0,%rdx
946	movq	%rdx,%r11
947
948	mulq	%rbp
949	addq	%rax,%r13
950	movq	-8(%rsi,%r15,1),%rax
951	adcq	$0,%rdx
952	addq	%r10,%r13
953	adcq	$0,%rdx
954	movq	%rdi,-32(%r14)
955	movq	%rdx,%rdi
956
957	mulq	%rbx
958	addq	%rax,%r11
959	movq	-8(%rcx),%rax
960	adcq	$0,%rdx
961	addq	-8(%r14),%r11
962	adcq	$0,%rdx
963	movq	%rdx,%r10
964
965	mulq	%rbp
966	addq	%rax,%rdi
967	movq	(%rsi,%r15,1),%rax
968	adcq	$0,%rdx
969	addq	%r11,%rdi
970	adcq	$0,%rdx
971	movq	%r13,-24(%r14)
972	movq	%rdx,%r13
973
974	mulq	%rbx
975	addq	%rax,%r10
976	movq	0(%rcx),%rax
977	adcq	$0,%rdx
978	addq	(%r14),%r10
979	adcq	$0,%rdx
980	movq	%rdx,%r11
981
982	mulq	%rbp
983	addq	%rax,%r13
984	movq	8(%rsi,%r15,1),%rax
985	adcq	$0,%rdx
986	addq	%r10,%r13
987	adcq	$0,%rdx
988	movq	%rdi,-16(%r14)
989	movq	%rdx,%rdi
990
991	mulq	%rbx
992	addq	%rax,%r11
993	movq	8(%rcx),%rax
994	adcq	$0,%rdx
995	addq	8(%r14),%r11
996	adcq	$0,%rdx
997	movq	%rdx,%r10
998
999	mulq	%rbp
1000	addq	%rax,%rdi
1001	movq	16(%rsi,%r15,1),%rax
1002	adcq	$0,%rdx
1003	addq	%r11,%rdi
1004	leaq	32(%rcx),%rcx
1005	adcq	$0,%rdx
1006	movq	%r13,-8(%r14)
1007	movq	%rdx,%r13
1008
1009	addq	$32,%r15
1010	jnz	.Linner4x
1011
/*
 * Tail of the pass.  For the last multiply the roles are swapped: the
 * factor moves from %rbp to %rax and %rbp receives the last word of the
 * (%rcx) vector, so that word is still in %rbp after the loop exits.
 */
1012	mulq	%rbx
1013	addq	%rax,%r10
1014	movq	-16(%rcx),%rax
1015	adcq	$0,%rdx
1016	addq	16(%r14),%r10
1017	leaq	32(%r14),%r14
1018	adcq	$0,%rdx
1019	movq	%rdx,%r11
1020
1021	mulq	%rbp
1022	addq	%rax,%r13
1023	movq	-8(%rsi),%rax
1024	adcq	$0,%rdx
1025	addq	%r10,%r13
1026	adcq	$0,%rdx
1027	movq	%rdi,-32(%r14)
1028	movq	%rdx,%rdi
1029
1030	mulq	%rbx
1031	addq	%rax,%r11
1032	movq	%rbp,%rax
1033	movq	-8(%rcx),%rbp
1034	adcq	$0,%rdx
1035	addq	-8(%r14),%r11
1036	adcq	$0,%rdx
1037	movq	%rdx,%r10
1038
1039	mulq	%rbp
1040	addq	%rax,%rdi
1041	movq	(%rsi,%r9,1),%rax
1042	adcq	$0,%rdx
1043	addq	%r11,%rdi
1044	adcq	$0,%rdx
1045	movq	%r13,-24(%r14)
1046	movq	%rdx,%r13
1047
1048	movq	%rdi,-16(%r14)
1049	leaq	(%rcx,%r9,1),%rcx
1050
/* Top word = carries + previous top word at (%r14); new carry in %rdi. */
1051	xorq	%rdi,%rdi
1052	addq	%r10,%r13
1053	adcq	$0,%rdi
1054	addq	(%r14),%r13
1055	adcq	$0,%rdi
1056	movq	%r13,-8(%r14)
1057
/* Repeat until %r12 reaches the table end marker stored at 16+8(%rsp). */
1058	cmpq	16+8(%rsp),%r12
1059	jb	.Louter4x
/*
 * Hand-over to .Lsqr4x_sub_entry (defined outside this part of the file).
 * %r15 is zero when the inner loop exits, so adcq turns it into the borrow
 * of (last (%rcx) word - top result word); that is ORed with the carry in
 * %rdi and negated into %rax (0, or all-ones when the OR is 1).
 * Register state at the jump, as set up here:
 *   %rbx  = %r14 + %r9 (start of the result on the stack)
 *   %rbp  = start of the (%rcx) vector
 *   %r12  = its word 0 minus 1; %r13/%r14/%r15 = its words 1/2/3
 *   %rcx  = %r9 arithmetically shifted right by 5 (%r9 is the negative byte
 *           length here)
 *   %rdi  = value stashed at 56+8(%rsp) on entry
 *   %r10  = 0
 * NOTE(review): how the target consumes these values is not visible here.
 */
1060	xorq	%rax,%rax
1061	subq	%r13,%rbp
1062	adcq	%r15,%r15
1063	orq	%r15,%rdi
1064	subq	%rdi,%rax
1065	leaq	(%r14,%r9,1),%rbx
1066	movq	(%rcx),%r12
1067	leaq	(%rcx),%rbp
1068	movq	%r9,%rcx
1069	sarq	$3+2,%rcx
1070	movq	56+8(%rsp),%rdi
1071	decq	%r12
1072	xorq	%r10,%r10
1073	movq	8(%rbp),%r13
1074	movq	16(%rbp),%r14
1075	movq	24(%rbp),%r15
1076	jmp	.Lsqr4x_sub_entry
1077.cfi_endproc
1078.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5 - runs __bn_sqr8x_internal followed by __bn_post4x_internal
 * five times, then finishes with one call to mul4x_internal.
 * (Presumably five modular squarings followed by a multiplication by a
 * table entry - TODO confirm against the callers.)
 *
 * Register use on entry, as read from the code below:
 *   %rdi     parked in %xmm1 before the calls
 *   %rsi     input of the first squaring; after the squarings, whatever the
 *            internal routines leave in %rsi is passed to mul4x_internal as
 *            %rdi
 *   %rdx     parked in %xmm4, restored for mul4x_internal (table base)
 *   %rcx     parked in %xmm2, restored for mul4x_internal
 *   %r8      pointer to a 64-bit constant, which is copied to 32(%rsp)
 *   %r9d     number of 64-bit words
 *   8(%rsp)  stack argument, read later by mul4x_internal through %rax
 *
 * Returns 1 in %rax.
 */
1079.globl	bn_power5
1080.type	bn_power5,@function
1081.align	32
1082bn_power5:
1083.cfi_startproc
/* Keep the incoming stack pointer in %rax. */
1084	movq	%rsp,%rax
1085.cfi_def_cfa_register	%rax
/*
 * If all bits of 0x80108 are set in the dword at OPENSSL_ia32cap_P+8, hand
 * over to .Lpowerx5_enter (defined outside this part of the file).
 * NOTE(review): presumably the BMI1/BMI2/ADX capability bits - confirm
 * against the OPENSSL_ia32cap_P layout.
 */
1086	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
1087	andl	$0x80108,%r11d
1088	cmpl	$0x80108,%r11d
1089	je	.Lpowerx5_enter
/* Save the callee-saved registers; the epilogue reloads them via %rsi. */
1090	pushq	%rbx
1091.cfi_offset	%rbx,-16
1092	pushq	%rbp
1093.cfi_offset	%rbp,-24
1094	pushq	%r12
1095.cfi_offset	%r12,-32
1096	pushq	%r13
1097.cfi_offset	%r13,-40
1098	pushq	%r14
1099.cfi_offset	%r14,-48
1100	pushq	%r15
1101.cfi_offset	%r15,-56
1102.Lpower5_prologue:
1103
/*
 * %r9 = word count * 8 (length in bytes), %r10 = 3 * that length, %r9 then
 * negated for the leas below; %r8 = the 64-bit constant it pointed to.
 */
1104	shll	$3,%r9d
1105	leal	(%r9,%r9,2),%r10d
1106	negq	%r9
1107	movq	(%r8),%r8
1108
1109
1110
1111
1112
1113
1114
1115
/*
 * Choose the frame address in %rbp, with the same arithmetic as in
 * bn_mul4x_mont_gather5: %r11 = ((%rsp - 320 - 2*len) - %rdi) modulo 4096.
 * If 3*len is not below that distance the frame is moved down by it;
 * otherwise .Lpwr_sp_alt is used.
 * NOTE(review): presumably this avoids aliasing between the frame and the
 * result vector modulo 4096 - confirm against the generator script.
 */
1116	leaq	-320(%rsp,%r9,2),%r11
1117	movq	%rsp,%rbp
1118	subq	%rdi,%r11
1119	andq	$4095,%r11
1120	cmpq	%r11,%r10
1121	jb	.Lpwr_sp_alt
1122	subq	%r11,%rbp
1123	leaq	-320(%rbp,%r9,2),%rbp
1124	jmp	.Lpwr_sp_done
1125
1126.align	32
/*
 * Alternative placement: move the frame down by
 * max(0, %r11 - (4096 - 320 - 2*len)).  The zero is loaded with movq rather
 * than xor so that the carry flag from the subq is still valid for cmovcq.
 */
1127.Lpwr_sp_alt:
1128	leaq	4096-320(,%r9,2),%r10
1129	leaq	-320(%rbp,%r9,2),%rbp
1130	subq	%r10,%r11
1131	movq	$0,%r10
1132	cmovcq	%r10,%r11
1133	subq	%r11,%rbp
1134.Lpwr_sp_done:
/*
 * Align the frame to 64 bytes, then move %rsp down to it in steps of at
 * most 4096 bytes, reading one word per step so every page is touched.
 */
1135	andq	$-64,%rbp
1136	movq	%rsp,%r11
1137	subq	%rbp,%r11
1138	andq	$-4096,%r11
1139	leaq	(%r11,%rbp,1),%rsp
1140	movq	(%rsp),%r10
1141	cmpq	%rbp,%rsp
1142	ja	.Lpwr_page_walk
1143	jmp	.Lpwr_page_walk_done
1144
1145.Lpwr_page_walk:
1146	leaq	-4096(%rsp),%rsp
1147	movq	(%rsp),%r10
1148	cmpq	%rbp,%rsp
1149	ja	.Lpwr_page_walk
1150.Lpwr_page_walk_done:
1151
/* %r10 = negative byte length, %r9 = positive byte length. */
1152	movq	%r9,%r10
1153	negq	%r9
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
/*
 * Frame slots: 32(%rsp) = the 64-bit constant (the internal routines see it
 * at 32+8(%rsp) once their return address is pushed), 40(%rsp) = original
 * stack pointer for the epilogue.
 */
1164	movq	%r8,32(%rsp)
1165	movq	%rax,40(%rsp)
1166.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
1167.Lpower5_body:
/*
 * Park values in xmm registers across the calls.  The .byte sequences
 * encode, in order:
 *   movq %rdi,%xmm1
 *   movq %rcx,%xmm2
 *   movq %r10,%xmm3
 *   movq %rdx,%xmm4
 */
1168.byte	102,72,15,110,207
1169.byte	102,72,15,110,209
1170.byte	102,73,15,110,218
1171.byte	102,72,15,110,226
1172
/* Five rounds of __bn_sqr8x_internal + __bn_post4x_internal. */
1173	call	__bn_sqr8x_internal
1174	call	__bn_post4x_internal
1175	call	__bn_sqr8x_internal
1176	call	__bn_post4x_internal
1177	call	__bn_sqr8x_internal
1178	call	__bn_post4x_internal
1179	call	__bn_sqr8x_internal
1180	call	__bn_post4x_internal
1181	call	__bn_sqr8x_internal
1182	call	__bn_post4x_internal
1183
/*
 * Set up the call to mul4x_internal.  The .byte sequences encode
 * movq %xmm2,%rcx and movq %xmm4,%rdx.  %rdi = %rsi as left by the calls
 * above, %rax = original stack pointer (mul4x_internal reads 8(%rax)),
 * %r8 = address of the frame slot holding the 64-bit constant.
 */
1184.byte	102,72,15,126,209
1185.byte	102,72,15,126,226
1186	movq	%rsi,%rdi
1187	movq	40(%rsp),%rax
1188	leaq	32(%rsp),%r8
1189
1190	call	mul4x_internal
1191
/*
 * Epilogue: %rsi = saved original stack pointer, return value 1, reload
 * the callee-saved registers from below it, restore %rsp.
 */
1192	movq	40(%rsp),%rsi
1193.cfi_def_cfa	%rsi,8
1194	movq	$1,%rax
1195	movq	-48(%rsi),%r15
1196.cfi_restore	%r15
1197	movq	-40(%rsi),%r14
1198.cfi_restore	%r14
1199	movq	-32(%rsi),%r13
1200.cfi_restore	%r13
1201	movq	-24(%rsi),%r12
1202.cfi_restore	%r12
1203	movq	-16(%rsi),%rbp
1204.cfi_restore	%rbp
1205	movq	-8(%rsi),%rbx
1206.cfi_restore	%rbx
1207	leaq	(%rsi),%rsp
1208.cfi_def_cfa_register	%rsp
1209.Lpower5_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding of the return. */
1210	.byte	0xf3,0xc3
1211.cfi_endproc
1212.size	bn_power5,.-bn_power5
1213
1214.globl	bn_sqr8x_internal
1215.hidden	bn_sqr8x_internal
1216.type	bn_sqr8x_internal,@function
1217.align	32
1218bn_sqr8x_internal:
1219__bn_sqr8x_internal:
1220.cfi_startproc
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294	leaq	32(%r10),%rbp
1295	leaq	(%rsi,%r9,1),%rsi
1296
1297	movq	%r9,%rcx
1298
1299
1300	movq	-32(%rsi,%rbp,1),%r14
1301	leaq	48+8(%rsp,%r9,2),%rdi
1302	movq	-24(%rsi,%rbp,1),%rax
1303	leaq	-32(%rdi,%rbp,1),%rdi
1304	movq	-16(%rsi,%rbp,1),%rbx
1305	movq	%rax,%r15
1306
1307	mulq	%r14
1308	movq	%rax,%r10
1309	movq	%rbx,%rax
1310	movq	%rdx,%r11
1311	movq	%r10,-24(%rdi,%rbp,1)
1312
1313	mulq	%r14
1314	addq	%rax,%r11
1315	movq	%rbx,%rax
1316	adcq	$0,%rdx
1317	movq	%r11,-16(%rdi,%rbp,1)
1318	movq	%rdx,%r10
1319
1320
1321	movq	-8(%rsi,%rbp,1),%rbx
1322	mulq	%r15
1323	movq	%rax,%r12
1324	movq	%rbx,%rax
1325	movq	%rdx,%r13
1326
1327	leaq	(%rbp),%rcx
1328	mulq	%r14
1329	addq	%rax,%r10
1330	movq	%rbx,%rax
1331	movq	%rdx,%r11
1332	adcq	$0,%r11
1333	addq	%r12,%r10
1334	adcq	$0,%r11
1335	movq	%r10,-8(%rdi,%rcx,1)
1336	jmp	.Lsqr4x_1st
1337
1338.align	32
1339.Lsqr4x_1st:
1340	movq	(%rsi,%rcx,1),%rbx
1341	mulq	%r15
1342	addq	%rax,%r13
1343	movq	%rbx,%rax
1344	movq	%rdx,%r12
1345	adcq	$0,%r12
1346
1347	mulq	%r14
1348	addq	%rax,%r11
1349	movq	%rbx,%rax
1350	movq	8(%rsi,%rcx,1),%rbx
1351	movq	%rdx,%r10
1352	adcq	$0,%r10
1353	addq	%r13,%r11
1354	adcq	$0,%r10
1355
1356
1357	mulq	%r15
1358	addq	%rax,%r12
1359	movq	%rbx,%rax
1360	movq	%r11,(%rdi,%rcx,1)
1361	movq	%rdx,%r13
1362	adcq	$0,%r13
1363
1364	mulq	%r14
1365	addq	%rax,%r10
1366	movq	%rbx,%rax
1367	movq	16(%rsi,%rcx,1),%rbx
1368	movq	%rdx,%r11
1369	adcq	$0,%r11
1370	addq	%r12,%r10
1371	adcq	$0,%r11
1372
1373	mulq	%r15
1374	addq	%rax,%r13
1375	movq	%rbx,%rax
1376	movq	%r10,8(%rdi,%rcx,1)
1377	movq	%rdx,%r12
1378	adcq	$0,%r12
1379
1380	mulq	%r14
1381	addq	%rax,%r11
1382	movq	%rbx,%rax
1383	movq	24(%rsi,%rcx,1),%rbx
1384	movq	%rdx,%r10
1385	adcq	$0,%r10
1386	addq	%r13,%r11
1387	adcq	$0,%r10
1388
1389
1390	mulq	%r15
1391	addq	%rax,%r12
1392	movq	%rbx,%rax
1393	movq	%r11,16(%rdi,%rcx,1)
1394	movq	%rdx,%r13
1395	adcq	$0,%r13
1396	leaq	32(%rcx),%rcx
1397
1398	mulq	%r14
1399	addq	%rax,%r10
1400	movq	%rbx,%rax
1401	movq	%rdx,%r11
1402	adcq	$0,%r11
1403	addq	%r12,%r10
1404	adcq	$0,%r11
1405	movq	%r10,-8(%rdi,%rcx,1)
1406
1407	cmpq	$0,%rcx
1408	jne	.Lsqr4x_1st
1409
1410	mulq	%r15
1411	addq	%rax,%r13
1412	leaq	16(%rbp),%rbp
1413	adcq	$0,%rdx
1414	addq	%r11,%r13
1415	adcq	$0,%rdx
1416
1417	movq	%r13,(%rdi)
1418	movq	%rdx,%r12
1419	movq	%rdx,8(%rdi)
1420	jmp	.Lsqr4x_outer
1421
1422.align	32
1423.Lsqr4x_outer:
1424	movq	-32(%rsi,%rbp,1),%r14
1425	leaq	48+8(%rsp,%r9,2),%rdi
1426	movq	-24(%rsi,%rbp,1),%rax
1427	leaq	-32(%rdi,%rbp,1),%rdi
1428	movq	-16(%rsi,%rbp,1),%rbx
1429	movq	%rax,%r15
1430
1431	mulq	%r14
1432	movq	-24(%rdi,%rbp,1),%r10
1433	addq	%rax,%r10
1434	movq	%rbx,%rax
1435	adcq	$0,%rdx
1436	movq	%r10,-24(%rdi,%rbp,1)
1437	movq	%rdx,%r11
1438
1439	mulq	%r14
1440	addq	%rax,%r11
1441	movq	%rbx,%rax
1442	adcq	$0,%rdx
1443	addq	-16(%rdi,%rbp,1),%r11
1444	movq	%rdx,%r10
1445	adcq	$0,%r10
1446	movq	%r11,-16(%rdi,%rbp,1)
1447
1448	xorq	%r12,%r12
1449
1450	movq	-8(%rsi,%rbp,1),%rbx
1451	mulq	%r15
1452	addq	%rax,%r12
1453	movq	%rbx,%rax
1454	adcq	$0,%rdx
1455	addq	-8(%rdi,%rbp,1),%r12
1456	movq	%rdx,%r13
1457	adcq	$0,%r13
1458
1459	mulq	%r14
1460	addq	%rax,%r10
1461	movq	%rbx,%rax
1462	adcq	$0,%rdx
1463	addq	%r12,%r10
1464	movq	%rdx,%r11
1465	adcq	$0,%r11
1466	movq	%r10,-8(%rdi,%rbp,1)
1467
1468	leaq	(%rbp),%rcx
1469	jmp	.Lsqr4x_inner
1470
1471.align	32
1472.Lsqr4x_inner:
1473	movq	(%rsi,%rcx,1),%rbx
1474	mulq	%r15
1475	addq	%rax,%r13
1476	movq	%rbx,%rax
1477	movq	%rdx,%r12
1478	adcq	$0,%r12
1479	addq	(%rdi,%rcx,1),%r13
1480	adcq	$0,%r12
1481
1482.byte	0x67
1483	mulq	%r14
1484	addq	%rax,%r11
1485	movq	%rbx,%rax
1486	movq	8(%rsi,%rcx,1),%rbx
1487	movq	%rdx,%r10
1488	adcq	$0,%r10
1489	addq	%r13,%r11
1490	adcq	$0,%r10
1491
1492	mulq	%r15
1493	addq	%rax,%r12
1494	movq	%r11,(%rdi,%rcx,1)
1495	movq	%rbx,%rax
1496	movq	%rdx,%r13
1497	adcq	$0,%r13
1498	addq	8(%rdi,%rcx,1),%r12
1499	leaq	16(%rcx),%rcx
1500	adcq	$0,%r13
1501
1502	mulq	%r14
1503	addq	%rax,%r10
1504	movq	%rbx,%rax
1505	adcq	$0,%rdx
1506	addq	%r12,%r10
1507	movq	%rdx,%r11
1508	adcq	$0,%r11
1509	movq	%r10,-8(%rdi,%rcx,1)
1510
1511	cmpq	$0,%rcx
1512	jne	.Lsqr4x_inner
1513
1514.byte	0x67
1515	mulq	%r15
1516	addq	%rax,%r13
1517	adcq	$0,%rdx
1518	addq	%r11,%r13
1519	adcq	$0,%rdx
1520
1521	movq	%r13,(%rdi)
1522	movq	%rdx,%r12
1523	movq	%rdx,8(%rdi)
1524
1525	addq	$16,%rbp
1526	jnz	.Lsqr4x_outer
1527
1528
1529	movq	-32(%rsi),%r14
1530	leaq	48+8(%rsp,%r9,2),%rdi
1531	movq	-24(%rsi),%rax
1532	leaq	-32(%rdi,%rbp,1),%rdi
1533	movq	-16(%rsi),%rbx
1534	movq	%rax,%r15
1535
1536	mulq	%r14
1537	addq	%rax,%r10
1538	movq	%rbx,%rax
1539	movq	%rdx,%r11
1540	adcq	$0,%r11
1541
1542	mulq	%r14
1543	addq	%rax,%r11
1544	movq	%rbx,%rax
1545	movq	%r10,-24(%rdi)
1546	movq	%rdx,%r10
1547	adcq	$0,%r10
1548	addq	%r13,%r11
1549	movq	-8(%rsi),%rbx
1550	adcq	$0,%r10
1551
1552	mulq	%r15
1553	addq	%rax,%r12
1554	movq	%rbx,%rax
1555	movq	%r11,-16(%rdi)
1556	movq	%rdx,%r13
1557	adcq	$0,%r13
1558
1559	mulq	%r14
1560	addq	%rax,%r10
1561	movq	%rbx,%rax
1562	movq	%rdx,%r11
1563	adcq	$0,%r11
1564	addq	%r12,%r10
1565	adcq	$0,%r11
1566	movq	%r10,-8(%rdi)
1567
1568	mulq	%r15
1569	addq	%rax,%r13
1570	movq	-16(%rsi),%rax
1571	adcq	$0,%rdx
1572	addq	%r11,%r13
1573	adcq	$0,%rdx
1574
1575	movq	%r13,(%rdi)
1576	movq	%rdx,%r12
1577	movq	%rdx,8(%rdi)
1578
1579	mulq	%rbx
1580	addq	$16,%rbp
1581	xorq	%r14,%r14
1582	subq	%r9,%rbp
1583	xorq	%r15,%r15
1584
1585	addq	%r12,%rax
1586	adcq	$0,%rdx
1587	movq	%rax,8(%rdi)
1588	movq	%rdx,16(%rdi)
1589	movq	%r15,24(%rdi)
1590
1591	movq	-16(%rsi,%rbp,1),%rax
1592	leaq	48+8(%rsp),%rdi
1593	xorq	%r10,%r10
1594	movq	8(%rdi),%r11
1595
1596	leaq	(%r14,%r10,2),%r12
1597	shrq	$63,%r10
1598	leaq	(%rcx,%r11,2),%r13
1599	shrq	$63,%r11
1600	orq	%r10,%r13
1601	movq	16(%rdi),%r10
1602	movq	%r11,%r14
1603	mulq	%rax
1604	negq	%r15
1605	movq	24(%rdi),%r11
1606	adcq	%rax,%r12
1607	movq	-8(%rsi,%rbp,1),%rax
1608	movq	%r12,(%rdi)
1609	adcq	%rdx,%r13
1610
1611	leaq	(%r14,%r10,2),%rbx
1612	movq	%r13,8(%rdi)
1613	sbbq	%r15,%r15
1614	shrq	$63,%r10
1615	leaq	(%rcx,%r11,2),%r8
1616	shrq	$63,%r11
1617	orq	%r10,%r8
1618	movq	32(%rdi),%r10
1619	movq	%r11,%r14
1620	mulq	%rax
1621	negq	%r15
1622	movq	40(%rdi),%r11
1623	adcq	%rax,%rbx
1624	movq	0(%rsi,%rbp,1),%rax
1625	movq	%rbx,16(%rdi)
1626	adcq	%rdx,%r8
1627	leaq	16(%rbp),%rbp
1628	movq	%r8,24(%rdi)
1629	sbbq	%r15,%r15
1630	leaq	64(%rdi),%rdi
1631	jmp	.Lsqr4x_shift_n_add
1632
1633.align	32
1634.Lsqr4x_shift_n_add:
1635	leaq	(%r14,%r10,2),%r12
1636	shrq	$63,%r10
1637	leaq	(%rcx,%r11,2),%r13
1638	shrq	$63,%r11
1639	orq	%r10,%r13
1640	movq	-16(%rdi),%r10
1641	movq	%r11,%r14
1642	mulq	%rax
1643	negq	%r15
1644	movq	-8(%rdi),%r11
1645	adcq	%rax,%r12
1646	movq	-8(%rsi,%rbp,1),%rax
1647	movq	%r12,-32(%rdi)
1648	adcq	%rdx,%r13
1649
1650	leaq	(%r14,%r10,2),%rbx
1651	movq	%r13,-24(%rdi)
1652	sbbq	%r15,%r15
1653	shrq	$63,%r10
1654	leaq	(%rcx,%r11,2),%r8
1655	shrq	$63,%r11
1656	orq	%r10,%r8
1657	movq	0(%rdi),%r10
1658	movq	%r11,%r14
1659	mulq	%rax
1660	negq	%r15
1661	movq	8(%rdi),%r11
1662	adcq	%rax,%rbx
1663	movq	0(%rsi,%rbp,1),%rax
1664	movq	%rbx,-16(%rdi)
1665	adcq	%rdx,%r8
1666
1667	leaq	(%r14,%r10,2),%r12
1668	movq	%r8,-8(%rdi)
1669	sbbq	%r15,%r15
1670	shrq	$63,%r10
1671	leaq	(%rcx,%r11,2),%r13
1672	shrq	$63,%r11
1673	orq	%r10,%r13
1674	movq	16(%rdi),%r10
1675	movq	%r11,%r14
1676	mulq	%rax
1677	negq	%r15
1678	movq	24(%rdi),%r11
1679	adcq	%rax,%r12
1680	movq	8(%rsi,%rbp,1),%rax
1681	movq	%r12,0(%rdi)
1682	adcq	%rdx,%r13
1683
1684	leaq	(%r14,%r10,2),%rbx
1685	movq	%r13,8(%rdi)
1686	sbbq	%r15,%r15
1687	shrq	$63,%r10
1688	leaq	(%rcx,%r11,2),%r8
1689	shrq	$63,%r11
1690	orq	%r10,%r8
1691	movq	32(%rdi),%r10
1692	movq	%r11,%r14
1693	mulq	%rax
1694	negq	%r15
1695	movq	40(%rdi),%r11
1696	adcq	%rax,%rbx
1697	movq	16(%rsi,%rbp,1),%rax
1698	movq	%rbx,16(%rdi)
1699	adcq	%rdx,%r8
1700	movq	%r8,24(%rdi)
1701	sbbq	%r15,%r15
1702	leaq	64(%rdi),%rdi
1703	addq	$32,%rbp
1704	jnz	.Lsqr4x_shift_n_add
1705
1706	leaq	(%r14,%r10,2),%r12
1707.byte	0x67
1708	shrq	$63,%r10
1709	leaq	(%rcx,%r11,2),%r13
1710	shrq	$63,%r11
1711	orq	%r10,%r13
1712	movq	-16(%rdi),%r10
1713	movq	%r11,%r14
1714	mulq	%rax
1715	negq	%r15
1716	movq	-8(%rdi),%r11
1717	adcq	%rax,%r12
1718	movq	-8(%rsi),%rax
1719	movq	%r12,-32(%rdi)
1720	adcq	%rdx,%r13
1721
1722	leaq	(%r14,%r10,2),%rbx
1723	movq	%r13,-24(%rdi)
1724	sbbq	%r15,%r15
1725	shrq	$63,%r10
1726	leaq	(%rcx,%r11,2),%r8
1727	shrq	$63,%r11
1728	orq	%r10,%r8
1729	mulq	%rax
1730	negq	%r15
1731	adcq	%rax,%rbx
1732	adcq	%rdx,%r8
1733	movq	%rbx,-16(%rdi)
1734	movq	%r8,-8(%rdi)
1735.byte	102,72,15,126,213
/*
 * __bn_sqr8x_reduction: second entry label inside bn_sqr8x_internal (the
 * .size directive at the end of this block closes bn_sqr8x_internal).
 * Eight-limb slices read through %rbp are multiplied by a per-limb
 * multiplier and accumulated into the scratch vector on the stack, one
 * 64-byte window of that vector per outer pass.  This has the shape of a
 * word-wise Montgomery reduction; presumably %rbp is the modulus and
 * 32+8(%rsp) is n0 - confirm against x86_64-mont5.pl.
 *
 * Read here:
 *   %rbp            start of the vector that is multiplied in
 *   %r9             byte length of that vector
 *   32+8(%rsp)      qword multiplied with the low limb to get a multiplier
 *   48+8(%rsp)...   scratch vector, 2*%r9 bytes, reduced in place
 *   %xmm2, %xmm3    copied back into %rbp and %r9 at the end of each pass
 * Written here:
 *   0+8(%rsp)       %rbp + %r9, end pointer for the walk through %rbp
 *   8+8(%rsp)       48+8(%rsp) + 2*%r9, end pointer of the scratch vector
 *   48+8(%rsp) .. 48+56+8(%rsp)   the eight multipliers of the current pass
 * On exit %rax holds the carry out of the last pass.
 */
1736__bn_sqr8x_reduction:
1737	xorq	%rax,%rax
1738	leaq	(%r9,%rbp,1),%rcx
1739	leaq	48+8(%rsp,%r9,2),%rdx
1740	movq	%rcx,0+8(%rsp)
1741	leaq	48+8(%rsp,%r9,1),%rdi
1742	movq	%rdx,8+8(%rsp)
1743	negq	%r9
1744	jmp	.L8x_reduction_loop
1745
1746.align	32
/* One pass per 64-byte window; %rdi is rewound by %r9 to the window start. */
1747.L8x_reduction_loop:
1748	leaq	(%rdi,%r9,1),%rdi
/* Lone prefix byte ahead of the next instruction (presumably padding). */
1749.byte	0x66
/* Load the eight limbs of the window into %rbx, %r9..%r15. */
1750	movq	0(%rdi),%rbx
1751	movq	8(%rdi),%r9
1752	movq	16(%rdi),%r10
1753	movq	24(%rdi),%r11
1754	movq	32(%rdi),%r12
1755	movq	40(%rdi),%r13
1756	movq	48(%rdi),%r14
1757	movq	56(%rdi),%r15
/* Park the carry of the previous pass (0 on the first) at the end pointer. */
1758	movq	%rax,(%rdx)
1759	leaq	64(%rdi),%rdi
1760
1761.byte	0x67
/* %r8 = low limb, %rbx = low limb * 32+8(%rsp) = first multiplier. */
1762	movq	%rbx,%r8
1763	imulq	32+8(%rsp),%rbx
1764	movq	0(%rbp),%rax
1765	movl	$8,%ecx
1766	jmp	.L8x_reduce
1767
1768.align	32
/*
 * Eight iterations (%ecx counts 8..1).  Each one adds %rbx times the eight
 * qwords at 0..56(%rbp) into %r8..%r15 and shifts the accumulator down by
 * one limb.
 */
1769.L8x_reduce:
1770	mulq	%rbx
1771	movq	8(%rbp),%rax
/*
 * The low product word is not added: negq only sets CF = (%r8 != 0), which
 * is then folded into the high word.
 */
1772	negq	%r8
1773	movq	%rdx,%r8
1774	adcq	$0,%r8
1775
1776	mulq	%rbx
1777	addq	%rax,%r9
1778	movq	16(%rbp),%rax
1779	adcq	$0,%rdx
1780	addq	%r9,%r8
/* Save this iteration's multiplier; the tail loop below reads it back. */
1781	movq	%rbx,48-8+8(%rsp,%rcx,8)
1782	movq	%rdx,%r9
1783	adcq	$0,%r9
1784
1785	mulq	%rbx
1786	addq	%rax,%r10
1787	movq	24(%rbp),%rax
1788	adcq	$0,%rdx
1789	addq	%r10,%r9
/* Start forming the next multiplier in %rsi (completed by imulq below). */
1790	movq	32+8(%rsp),%rsi
1791	movq	%rdx,%r10
1792	adcq	$0,%r10
1793
1794	mulq	%rbx
1795	addq	%rax,%r11
1796	movq	32(%rbp),%rax
1797	adcq	$0,%rdx
/* %rsi = new low limb (%r8) * 32+8(%rsp). */
1798	imulq	%r8,%rsi
1799	addq	%r11,%r10
1800	movq	%rdx,%r11
1801	adcq	$0,%r11
1802
1803	mulq	%rbx
1804	addq	%rax,%r12
1805	movq	40(%rbp),%rax
1806	adcq	$0,%rdx
1807	addq	%r12,%r11
1808	movq	%rdx,%r12
1809	adcq	$0,%r12
1810
1811	mulq	%rbx
1812	addq	%rax,%r13
1813	movq	48(%rbp),%rax
1814	adcq	$0,%rdx
1815	addq	%r13,%r12
1816	movq	%rdx,%r13
1817	adcq	$0,%r13
1818
1819	mulq	%rbx
1820	addq	%rax,%r14
1821	movq	56(%rbp),%rax
1822	adcq	$0,%rdx
1823	addq	%r14,%r13
1824	movq	%rdx,%r14
1825	adcq	$0,%r14
1826
1827	mulq	%rbx
/* Switch to the multiplier for the next iteration. */
1828	movq	%rsi,%rbx
1829	addq	%rax,%r15
1830	movq	0(%rbp),%rax
1831	adcq	$0,%rdx
1832	addq	%r15,%r14
1833	movq	%rdx,%r15
1834	adcq	$0,%r15
1835
1836	decl	%ecx
1837	jnz	.L8x_reduce
1838
/* Advance to the next 64-byte slice; skip the tail if none is left. */
1839	leaq	64(%rbp),%rbp
1840	xorq	%rax,%rax
1841	movq	8+8(%rsp),%rdx
1842	cmpq	0+8(%rsp),%rbp
1843	jae	.L8x_no_tail
1844
1845.byte	0x66
/* Add the next eight scratch limbs; the carry out is kept as 0/-1 in %rsi. */
1846	addq	0(%rdi),%r8
1847	adcq	8(%rdi),%r9
1848	adcq	16(%rdi),%r10
1849	adcq	24(%rdi),%r11
1850	adcq	32(%rdi),%r12
1851	adcq	40(%rdi),%r13
1852	adcq	48(%rdi),%r14
1853	adcq	56(%rdi),%r15
1854	sbbq	%rsi,%rsi
1855
/* First saved multiplier (stored when %rcx was 8 in .L8x_reduce). */
1856	movq	48+56+8(%rsp),%rbx
1857	movl	$8,%ecx
1858	movq	0(%rbp),%rax
1859	jmp	.L8x_tail
1860
1861.align	32
/*
 * Tail: apply the eight saved multipliers to the current slice at %rbp.
 * Each iteration stores the finished low limb at (%rdi), advances %rdi by
 * 8 and loads the next saved multiplier.
 */
1862.L8x_tail:
1863	mulq	%rbx
1864	addq	%rax,%r8
1865	movq	8(%rbp),%rax
1866	movq	%r8,(%rdi)
1867	movq	%rdx,%r8
1868	adcq	$0,%r8
1869
1870	mulq	%rbx
1871	addq	%rax,%r9
1872	movq	16(%rbp),%rax
1873	adcq	$0,%rdx
1874	addq	%r9,%r8
1875	leaq	8(%rdi),%rdi
1876	movq	%rdx,%r9
1877	adcq	$0,%r9
1878
1879	mulq	%rbx
1880	addq	%rax,%r10
1881	movq	24(%rbp),%rax
1882	adcq	$0,%rdx
1883	addq	%r10,%r9
1884	movq	%rdx,%r10
1885	adcq	$0,%r10
1886
1887	mulq	%rbx
1888	addq	%rax,%r11
1889	movq	32(%rbp),%rax
1890	adcq	$0,%rdx
1891	addq	%r11,%r10
1892	movq	%rdx,%r11
1893	adcq	$0,%r11
1894
1895	mulq	%rbx
1896	addq	%rax,%r12
1897	movq	40(%rbp),%rax
1898	adcq	$0,%rdx
1899	addq	%r12,%r11
1900	movq	%rdx,%r12
1901	adcq	$0,%r12
1902
1903	mulq	%rbx
1904	addq	%rax,%r13
1905	movq	48(%rbp),%rax
1906	adcq	$0,%rdx
1907	addq	%r13,%r12
1908	movq	%rdx,%r13
1909	adcq	$0,%r13
1910
1911	mulq	%rbx
1912	addq	%rax,%r14
1913	movq	56(%rbp),%rax
1914	adcq	$0,%rdx
1915	addq	%r14,%r13
1916	movq	%rdx,%r14
1917	adcq	$0,%r14
1918
1919	mulq	%rbx
/* Next saved multiplier, indexed by the down-counting %rcx. */
1920	movq	48-16+8(%rsp,%rcx,8),%rbx
1921	addq	%rax,%r15
1922	adcq	$0,%rdx
1923	addq	%r15,%r14
1924	movq	0(%rbp),%rax
1925	movq	%rdx,%r15
1926	adcq	$0,%r15
1927
1928	decl	%ecx
1929	jnz	.L8x_tail
1930
1931	leaq	64(%rbp),%rbp
1932	movq	8+8(%rsp),%rdx
1933	cmpq	0+8(%rsp),%rbp
1934	jae	.L8x_tail_done
1935
/* More slices: restore the saved carry into CF (negq) and keep adding. */
1936	movq	48+56+8(%rsp),%rbx
1937	negq	%rsi
1938	movq	0(%rbp),%rax
1939	adcq	0(%rdi),%r8
1940	adcq	8(%rdi),%r9
1941	adcq	16(%rdi),%r10
1942	adcq	24(%rdi),%r11
1943	adcq	32(%rdi),%r12
1944	adcq	40(%rdi),%r13
1945	adcq	48(%rdi),%r14
1946	adcq	56(%rdi),%r15
1947	sbbq	%rsi,%rsi
1948
1949	movl	$8,%ecx
1950	jmp	.L8x_tail
1951
1952.align	32
/* Fold in the carry that was parked at (%rdx) at the start of this pass. */
1953.L8x_tail_done:
1954	xorq	%rax,%rax
1955	addq	(%rdx),%r8
1956	adcq	$0,%r9
1957	adcq	$0,%r10
1958	adcq	$0,%r11
1959	adcq	$0,%r12
1960	adcq	$0,%r13
1961	adcq	$0,%r14
1962	adcq	$0,%r15
1963	adcq	$0,%rax
1964
/* Restore the saved tail carry into CF. */
1965	negq	%rsi
/*
 * Add the top eight scratch limbs and write the window back; %rax collects
 * the carry for the next pass.
 */
1966.L8x_no_tail:
1967	adcq	0(%rdi),%r8
1968	adcq	8(%rdi),%r9
1969	adcq	16(%rdi),%r10
1970	adcq	24(%rdi),%r11
1971	adcq	32(%rdi),%r12
1972	adcq	40(%rdi),%r13
1973	adcq	48(%rdi),%r14
1974	adcq	56(%rdi),%r15
1975	adcq	$0,%rax
1976	movq	-8(%rbp),%rcx
1977	xorq	%rsi,%rsi
1978
/* Encodes movq %xmm2,%rbp. */
1979.byte	102,72,15,126,213
1980
1981	movq	%r8,0(%rdi)
1982	movq	%r9,8(%rdi)
/* Encodes movq %xmm3,%r9. */
1983.byte	102,73,15,126,217
1984	movq	%r10,16(%rdi)
1985	movq	%r11,24(%rdi)
1986	movq	%r12,32(%rdi)
1987	movq	%r13,40(%rdi)
1988	movq	%r14,48(%rdi)
1989	movq	%r15,56(%rdi)
1990	leaq	64(%rdi),%rdi
1991
/* Loop until %rdi reaches the end pointer held in %rdx. */
1992	cmpq	%rdx,%rdi
1993	jb	.L8x_reduction_loop
/* Encodes rep ret. */
1994	.byte	0xf3,0xc3
1995.cfi_endproc
1996.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal: masked subtraction, four limbs per iteration.
 * For every limb it stores  src[i] + (~v[i] & mask) + carry  where v is
 * read through %rbp, src through %rbx and the destination is the pointer
 * copied from %xmm1.  The lowest limb of v is decremented before being
 * complemented, which equals complementing it and adding a carry-in of 1,
 * so with mask = all ones the result is src - v and with mask = 0 it is a
 * plain copy of src.
 *
 * In:  %rbp  = v
 *      %rdi  = pointer such that %rdi + %r9 is src
 *      %r9   = byte count; the loop counts %r9 >> 5 up to zero, so this is
 *              expected to be negative on entry
 *      %rax  = negated to form the mask (presumably 0 or 1 on entry -
 *              confirm against the callers)
 *      %xmm1 = destination pointer
 * Out: %rdi = one past the last limb written, %rsi = %xmm1,
 *      %r10 = %r9 as passed in, %r9 negated.
 */
1997.type	__bn_post4x_internal,@function
1998.align	32
1999__bn_post4x_internal:
2000.cfi_startproc
2001	movq	0(%rbp),%r12
2002	leaq	(%rdi,%r9,1),%rbx
2003	movq	%r9,%rcx
/* Encodes movq %xmm1,%rdi. */
2004.byte	102,72,15,126,207
2005	negq	%rax
/* Encodes movq %xmm1,%rsi. */
2006.byte	102,72,15,126,206
/* %rcx = %r9 / 32: one count per 4-limb iteration. */
2007	sarq	$3+2,%rcx
/* ~(v[0] - 1) = -v[0]; supplies the +1 of the two's complement. */
2008	decq	%r12
/* %r10 carries the inter-iteration carry as 0 / -1. */
2009	xorq	%r10,%r10
2010	movq	8(%rbp),%r13
2011	movq	16(%rbp),%r14
2012	movq	24(%rbp),%r15
2013	jmp	.Lsqr4x_sub_entry
2014
2015.align	16
2016.Lsqr4x_sub:
2017	movq	0(%rbp),%r12
2018	movq	8(%rbp),%r13
2019	movq	16(%rbp),%r14
2020	movq	24(%rbp),%r15
2021.Lsqr4x_sub_entry:
2022	leaq	32(%rbp),%rbp
2023	notq	%r12
2024	notq	%r13
2025	notq	%r14
2026	notq	%r15
2027	andq	%rax,%r12
2028	andq	%rax,%r13
2029	andq	%rax,%r14
2030	andq	%rax,%r15
2031
/* negq turns the saved 0 / -1 back into CF for the adc chain. */
2032	negq	%r10
2033	adcq	0(%rbx),%r12
2034	adcq	8(%rbx),%r13
2035	adcq	16(%rbx),%r14
2036	adcq	24(%rbx),%r15
2037	movq	%r12,0(%rdi)
2038	leaq	32(%rbx),%rbx
2039	movq	%r13,8(%rdi)
/* Save CF for the next iteration (mov and lea leave flags untouched). */
2040	sbbq	%r10,%r10
2041	movq	%r14,16(%rdi)
2042	movq	%r15,24(%rdi)
2043	leaq	32(%rdi),%rdi
2044
2045	incq	%rcx
2046	jnz	.Lsqr4x_sub
2047
2048	movq	%r9,%r10
2049	negq	%r9
/* Encodes rep ret. */
2050	.byte	0xf3,0xc3
2051.cfi_endproc
2052.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_mulx4x_mont_gather5: frame set-up wrapper around mulx4x_internal.
 *
 * Register arguments as used below and in mulx4x_internal:
 *   %rdi  result pointer (saved by mulx4x_internal)
 *   %rsi  first multiplicand, read 4 limbs at a time
 *   %rdx  base of the table the second operand is gathered from
 *   %rcx  second vector multiplied by the reduction multiplier
 *         (presumably the modulus - confirm)
 *   %r8   pointer; the qword it points to is stored at 32(%rsp)
 *   %r9d  limb count, scaled to bytes by the shll below
 *   8(%rsp) on entry is read by mulx4x_internal as the gather index
 * Returns 1 in %rax.  The six callee-saved registers are pushed before
 * the stack is moved and reloaded from the saved original %rsp.
 */
2053.type	bn_mulx4x_mont_gather5,@function
2054.align	32
2055bn_mulx4x_mont_gather5:
2056.cfi_startproc
/* %rax keeps the entry %rsp for the whole function. */
2057	movq	%rsp,%rax
2058.cfi_def_cfa_register	%rax
2059.Lmulx4x_enter:
2060	pushq	%rbx
2061.cfi_offset	%rbx,-16
2062	pushq	%rbp
2063.cfi_offset	%rbp,-24
2064	pushq	%r12
2065.cfi_offset	%r12,-32
2066	pushq	%r13
2067.cfi_offset	%r13,-40
2068	pushq	%r14
2069.cfi_offset	%r14,-48
2070	pushq	%r15
2071.cfi_offset	%r15,-56
2072.Lmulx4x_prologue:
2073
/*
 * %r9 = byte count (the 32-bit shift also clears the upper half),
 * %r10 = 3 * byte count, then %r9 is negated; %r8 = *%r8.
 */
2074	shll	$3,%r9d
2075	leaq	(%r9,%r9,2),%r10
2076	negq	%r9
2077	movq	(%r8),%r8
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
/*
 * Pick the frame address in %rbp.  The candidate is %rsp - 320 - 2*bytes;
 * its distance from %rdi modulo 4096 (%r11) is compared with 3*bytes and
 * the frame is pushed further down by a matching amount.  Presumably this
 * keeps the frame from sharing page offsets with the buffers starting at
 * %rdi - confirm against x86_64-mont5.pl.
 */
2088	leaq	-320(%rsp,%r9,2),%r11
2089	movq	%rsp,%rbp
2090	subq	%rdi,%r11
2091	andq	$4095,%r11
2092	cmpq	%r11,%r10
2093	jb	.Lmulx4xsp_alt
2094	subq	%r11,%rbp
2095	leaq	-320(%rbp,%r9,2),%rbp
2096	jmp	.Lmulx4xsp_done
2097
2098.Lmulx4xsp_alt:
2099	leaq	4096-320(,%r9,2),%r10
2100	leaq	-320(%rbp,%r9,2),%rbp
2101	subq	%r10,%r11
2102	movq	$0,%r10
/* If the subtraction borrowed, use an adjustment of 0. */
2103	cmovcq	%r10,%r11
2104	subq	%r11,%rbp
2105.Lmulx4xsp_done:
/* 64-byte align the frame. */
2106	andq	$-64,%rbp
/*
 * Move %rsp down to %rbp while reading one qword at every 4096-byte step
 * on the way, so no page between the old and the new %rsp is skipped.
 */
2107	movq	%rsp,%r11
2108	subq	%rbp,%r11
2109	andq	$-4096,%r11
2110	leaq	(%r11,%rbp,1),%rsp
2111	movq	(%rsp),%r10
2112	cmpq	%rbp,%rsp
2113	ja	.Lmulx4x_page_walk
2114	jmp	.Lmulx4x_page_walk_done
2115
2116.Lmulx4x_page_walk:
2117	leaq	-4096(%rsp),%rsp
2118	movq	(%rsp),%r10
2119	cmpq	%rbp,%rsp
2120	ja	.Lmulx4x_page_walk
2121.Lmulx4x_page_walk_done:
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
/* Frame slots: 32(%rsp) = value loaded through %r8, 40(%rsp) = entry %rsp. */
2135	movq	%r8,32(%rsp)
2136	movq	%rax,40(%rsp)
/* DWARF expression for the CFA: *(%rsp + 40) + 8. */
2137.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2138.Lmulx4x_body:
2139	call	mulx4x_internal
2140
/* Epilogue: recover the entry %rsp and reload the pushed registers. */
2141	movq	40(%rsp),%rsi
2142.cfi_def_cfa	%rsi,8
2143	movq	$1,%rax
2144
2145	movq	-48(%rsi),%r15
2146.cfi_restore	%r15
2147	movq	-40(%rsi),%r14
2148.cfi_restore	%r14
2149	movq	-32(%rsi),%r13
2150.cfi_restore	%r13
2151	movq	-24(%rsi),%r12
2152.cfi_restore	%r12
2153	movq	-16(%rsi),%rbp
2154.cfi_restore	%rbp
2155	movq	-8(%rsi),%rbx
2156.cfi_restore	%rbx
2157	leaq	(%rsi),%rsp
2158.cfi_def_cfa_register	%rsp
2159.Lmulx4x_epilogue:
/* Encodes rep ret. */
2160	.byte	0xf3,0xc3
2161.cfi_endproc
2162.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2163
/*
 * mulx4x_internal: multiply-and-reduce core using mulx/adcx/adox, called
 * from bn_mulx4x_mont_gather5 and bn_powerx5.  It does not return itself:
 * it ends with a jump to .Lsqrx4x_sub_entry (defined outside this block).
 *
 * In (as set up by the callers):
 *   %rax   entry %rsp of the public function; 8(%rax) is the gather index
 *   %rdx   table base, %rsi multiplicand, %rcx vector used in the
 *          reduction step, %rdi result pointer
 *   %r9    negated byte count
 *   32+8(%rsp)  qword the caller stored at its 32(%rsp)
 * Slots written here are 8 higher than in the caller's view because of
 * the return address:
 *   8(%rsp)     negated byte count
 *   8+8(%rsp)   current table pointer
 *   16+8(%rsp)  table end marker
 *   24+8(%rsp)  inner loop counter
 *   56+8(%rsp)  result pointer
 *   112(%r10)..352(%r10)  sixteen 16-byte select masks
 */
2164.type	mulx4x_internal,@function
2165.align	32
2166mulx4x_internal:
2167.cfi_startproc
2168	movq	%r9,8(%rsp)
2169	movq	%r9,%r10
2170	negq	%r9
2171	shlq	$5,%r9
2172	negq	%r10
/* %r13 = table base + 128 + 32 * byte count: the outer loop's end marker. */
2173	leaq	128(%rdx,%r9,1),%r13
/* %r9 = byte count / 32 - 1: inner loop iterations after the first 4 limbs. */
2174	shrq	$5+5,%r9
2175	movd	8(%rax),%xmm5
2176	subq	$1,%r9
2177	leaq	.Linc(%rip),%rax
2178	movq	%r13,16+8(%rsp)
2179	movq	%r9,24+8(%rsp)
2180	movq	%rdi,56+8(%rsp)
2181	movdqa	0(%rax),%xmm0
2182	movdqa	16(%rax),%xmm1
2183	leaq	88-112(%rsp,%r10,1),%r10
2184	leaq	128(%rdx),%rdi
2185
/*
 * Build sixteen select masks.  The index is broadcast to all dwords of
 * %xmm5; a counter vector seeded from .Linc and stepped by the second
 * .Linc vector (kept in %xmm4) is compared with it dword by dword and each
 * result is stored at 112(%r10) + 16*k.
 */
2186	pshufd	$0,%xmm5,%xmm5
2187	movdqa	%xmm1,%xmm4
2188.byte	0x67
2189	movdqa	%xmm1,%xmm2
2190.byte	0x67
2191	paddd	%xmm0,%xmm1
2192	pcmpeqd	%xmm5,%xmm0
2193	movdqa	%xmm4,%xmm3
2194	paddd	%xmm1,%xmm2
2195	pcmpeqd	%xmm5,%xmm1
2196	movdqa	%xmm0,112(%r10)
2197	movdqa	%xmm4,%xmm0
2198
2199	paddd	%xmm2,%xmm3
2200	pcmpeqd	%xmm5,%xmm2
2201	movdqa	%xmm1,128(%r10)
2202	movdqa	%xmm4,%xmm1
2203
2204	paddd	%xmm3,%xmm0
2205	pcmpeqd	%xmm5,%xmm3
2206	movdqa	%xmm2,144(%r10)
2207	movdqa	%xmm4,%xmm2
2208
2209	paddd	%xmm0,%xmm1
2210	pcmpeqd	%xmm5,%xmm0
2211	movdqa	%xmm3,160(%r10)
2212	movdqa	%xmm4,%xmm3
2213	paddd	%xmm1,%xmm2
2214	pcmpeqd	%xmm5,%xmm1
2215	movdqa	%xmm0,176(%r10)
2216	movdqa	%xmm4,%xmm0
2217
2218	paddd	%xmm2,%xmm3
2219	pcmpeqd	%xmm5,%xmm2
2220	movdqa	%xmm1,192(%r10)
2221	movdqa	%xmm4,%xmm1
2222
2223	paddd	%xmm3,%xmm0
2224	pcmpeqd	%xmm5,%xmm3
2225	movdqa	%xmm2,208(%r10)
2226	movdqa	%xmm4,%xmm2
2227
2228	paddd	%xmm0,%xmm1
2229	pcmpeqd	%xmm5,%xmm0
2230	movdqa	%xmm3,224(%r10)
2231	movdqa	%xmm4,%xmm3
2232	paddd	%xmm1,%xmm2
2233	pcmpeqd	%xmm5,%xmm1
2234	movdqa	%xmm0,240(%r10)
2235	movdqa	%xmm4,%xmm0
2236
2237	paddd	%xmm2,%xmm3
2238	pcmpeqd	%xmm5,%xmm2
2239	movdqa	%xmm1,256(%r10)
2240	movdqa	%xmm4,%xmm1
2241
2242	paddd	%xmm3,%xmm0
2243	pcmpeqd	%xmm5,%xmm3
2244	movdqa	%xmm2,272(%r10)
2245	movdqa	%xmm4,%xmm2
2246
2247	paddd	%xmm0,%xmm1
2248	pcmpeqd	%xmm5,%xmm0
2249	movdqa	%xmm3,288(%r10)
2250	movdqa	%xmm4,%xmm3
2251.byte	0x67
2252	paddd	%xmm1,%xmm2
2253	pcmpeqd	%xmm5,%xmm1
2254	movdqa	%xmm0,304(%r10)
2255
2256	paddd	%xmm2,%xmm3
2257	pcmpeqd	%xmm5,%xmm2
2258	movdqa	%xmm1,320(%r10)
2259
2260	pcmpeqd	%xmm5,%xmm3
2261	movdqa	%xmm2,336(%r10)
2262
/*
 * First gather.  All sixteen 16-byte slots at -128..112(%rdi) are read and
 * ANDed with their masks, so the addresses touched do not depend on the
 * index.  The last four masks are still live in %xmm0..%xmm3.
 */
2263	pand	64(%rdi),%xmm0
2264	pand	80(%rdi),%xmm1
2265	pand	96(%rdi),%xmm2
2266	movdqa	%xmm3,352(%r10)
2267	pand	112(%rdi),%xmm3
2268	por	%xmm2,%xmm0
2269	por	%xmm3,%xmm1
2270	movdqa	-128(%rdi),%xmm4
2271	movdqa	-112(%rdi),%xmm5
2272	movdqa	-96(%rdi),%xmm2
2273	pand	112(%r10),%xmm4
2274	movdqa	-80(%rdi),%xmm3
2275	pand	128(%r10),%xmm5
2276	por	%xmm4,%xmm0
2277	pand	144(%r10),%xmm2
2278	por	%xmm5,%xmm1
2279	pand	160(%r10),%xmm3
2280	por	%xmm2,%xmm0
2281	por	%xmm3,%xmm1
2282	movdqa	-64(%rdi),%xmm4
2283	movdqa	-48(%rdi),%xmm5
2284	movdqa	-32(%rdi),%xmm2
2285	pand	176(%r10),%xmm4
2286	movdqa	-16(%rdi),%xmm3
2287	pand	192(%r10),%xmm5
2288	por	%xmm4,%xmm0
2289	pand	208(%r10),%xmm2
2290	por	%xmm5,%xmm1
2291	pand	224(%r10),%xmm3
2292	por	%xmm2,%xmm0
2293	por	%xmm3,%xmm1
2294	movdqa	0(%rdi),%xmm4
2295	movdqa	16(%rdi),%xmm5
2296	movdqa	32(%rdi),%xmm2
2297	pand	240(%r10),%xmm4
2298	movdqa	48(%rdi),%xmm3
2299	pand	256(%r10),%xmm5
2300	por	%xmm4,%xmm0
2301	pand	272(%r10),%xmm2
2302	por	%xmm5,%xmm1
2303	pand	288(%r10),%xmm3
2304	por	%xmm2,%xmm0
2305	por	%xmm3,%xmm1
/* Merge the two accumulators, then fold the high qword onto the low one. */
2306	pxor	%xmm1,%xmm0
2307	pshufd	$0x4e,%xmm0,%xmm1
2308	por	%xmm1,%xmm0
2309	leaq	256(%rdi),%rdi
/* Encodes movq %xmm0,%rdx: the gathered word becomes the mulx multiplier. */
2310.byte	102,72,15,126,194
2311	leaq	64+32+8(%rsp),%rbx
2312
/*
 * First pass, first four limbs: gathered word (kept in %r9) times
 * 0..24(%rsi); then %r8 = low limb * 32+8(%rsp) is the reduction
 * multiplier, applied to 0..24(%rcx).  Results go to the vector at %rbx.
 */
2313	movq	%rdx,%r9
2314	mulxq	0(%rsi),%r8,%rax
2315	mulxq	8(%rsi),%r11,%r12
2316	addq	%rax,%r11
2317	mulxq	16(%rsi),%rax,%r13
2318	adcq	%rax,%r12
2319	adcq	$0,%r13
2320	mulxq	24(%rsi),%rax,%r14
2321
2322	movq	%r8,%r15
2323	imulq	32+8(%rsp),%r8
/* xorq also clears CF and OF, starting both carry chains. */
2324	xorq	%rbp,%rbp
2325	movq	%r8,%rdx
2326
2327	movq	%rdi,8+8(%rsp)
2328
2329	leaq	32(%rsi),%rsi
2330	adcxq	%rax,%r13
2331	adcxq	%rbp,%r14
2332
2333	mulxq	0(%rcx),%rax,%r10
2334	adcxq	%rax,%r15
2335	adoxq	%r11,%r10
2336	mulxq	8(%rcx),%rax,%r11
2337	adcxq	%rax,%r10
2338	adoxq	%r12,%r11
2339	mulxq	16(%rcx),%rax,%r12
2340	movq	24+8(%rsp),%rdi
2341	movq	%r10,-32(%rbx)
2342	adcxq	%rax,%r11
2343	adoxq	%r13,%r12
2344	mulxq	24(%rcx),%rax,%r15
2345	movq	%r9,%rdx
2346	movq	%r11,-24(%rbx)
2347	adcxq	%rax,%r12
2348	adoxq	%rbp,%r15
2349	leaq	32(%rcx),%rcx
2350	movq	%r12,-16(%rbx)
2351	jmp	.Lmulx4x_1st
2352
2353.align	32
/*
 * First pass, remaining limbs, four per iteration; %rdi counts down from
 * the value saved at 24+8(%rsp).  %rdx alternates between the gathered
 * word (%r9) and the reduction multiplier (%r8).
 */
2354.Lmulx4x_1st:
2355	adcxq	%rbp,%r15
2356	mulxq	0(%rsi),%r10,%rax
2357	adcxq	%r14,%r10
2358	mulxq	8(%rsi),%r11,%r14
2359	adcxq	%rax,%r11
2360	mulxq	16(%rsi),%r12,%rax
2361	adcxq	%r14,%r12
2362	mulxq	24(%rsi),%r13,%r14
2363.byte	0x67,0x67
2364	movq	%r8,%rdx
2365	adcxq	%rax,%r13
2366	adcxq	%rbp,%r14
2367	leaq	32(%rsi),%rsi
2368	leaq	32(%rbx),%rbx
2369
2370	adoxq	%r15,%r10
2371	mulxq	0(%rcx),%rax,%r15
2372	adcxq	%rax,%r10
2373	adoxq	%r15,%r11
2374	mulxq	8(%rcx),%rax,%r15
2375	adcxq	%rax,%r11
2376	adoxq	%r15,%r12
2377	mulxq	16(%rcx),%rax,%r15
2378	movq	%r10,-40(%rbx)
2379	adcxq	%rax,%r12
2380	movq	%r11,-32(%rbx)
2381	adoxq	%r15,%r13
2382	mulxq	24(%rcx),%rax,%r15
2383	movq	%r9,%rdx
2384	movq	%r12,-24(%rbx)
2385	adcxq	%rax,%r13
2386	adoxq	%rbp,%r15
2387	leaq	32(%rcx),%rcx
2388	movq	%r13,-16(%rbx)
2389
2390	decq	%rdi
2391	jnz	.Lmulx4x_1st
2392
/*
 * End of the first pass: %rax = saved negated byte count, which rewinds
 * %rsi; the top limb goes to -8(%rbx) and its carry stays in %rbp.
 */
2393	movq	8(%rsp),%rax
2394	adcq	%rbp,%r15
2395	leaq	(%rsi,%rax,1),%rsi
2396	addq	%r15,%r14
2397	movq	8+8(%rsp),%rdi
2398	adcq	%rbp,%rbp
2399	movq	%r14,-8(%rbx)
2400	jmp	.Lmulx4x_outer
2401
2402.align	32
/*
 * Outer loop: gather the next word from the table at %rdi with the masks
 * stored earlier, then run one more multiply-and-reduce pass that adds
 * into the vector at %rbx.
 */
2403.Lmulx4x_outer:
2404	leaq	16-256(%rbx),%r10
2405	pxor	%xmm4,%xmm4
2406.byte	0x67,0x67
2407	pxor	%xmm5,%xmm5
2408	movdqa	-128(%rdi),%xmm0
2409	movdqa	-112(%rdi),%xmm1
2410	movdqa	-96(%rdi),%xmm2
2411	pand	256(%r10),%xmm0
2412	movdqa	-80(%rdi),%xmm3
2413	pand	272(%r10),%xmm1
2414	por	%xmm0,%xmm4
2415	pand	288(%r10),%xmm2
2416	por	%xmm1,%xmm5
2417	pand	304(%r10),%xmm3
2418	por	%xmm2,%xmm4
2419	por	%xmm3,%xmm5
2420	movdqa	-64(%rdi),%xmm0
2421	movdqa	-48(%rdi),%xmm1
2422	movdqa	-32(%rdi),%xmm2
2423	pand	320(%r10),%xmm0
2424	movdqa	-16(%rdi),%xmm3
2425	pand	336(%r10),%xmm1
2426	por	%xmm0,%xmm4
2427	pand	352(%r10),%xmm2
2428	por	%xmm1,%xmm5
2429	pand	368(%r10),%xmm3
2430	por	%xmm2,%xmm4
2431	por	%xmm3,%xmm5
2432	movdqa	0(%rdi),%xmm0
2433	movdqa	16(%rdi),%xmm1
2434	movdqa	32(%rdi),%xmm2
2435	pand	384(%r10),%xmm0
2436	movdqa	48(%rdi),%xmm3
2437	pand	400(%r10),%xmm1
2438	por	%xmm0,%xmm4
2439	pand	416(%r10),%xmm2
2440	por	%xmm1,%xmm5
2441	pand	432(%r10),%xmm3
2442	por	%xmm2,%xmm4
2443	por	%xmm3,%xmm5
2444	movdqa	64(%rdi),%xmm0
2445	movdqa	80(%rdi),%xmm1
2446	movdqa	96(%rdi),%xmm2
2447	pand	448(%r10),%xmm0
2448	movdqa	112(%rdi),%xmm3
2449	pand	464(%r10),%xmm1
2450	por	%xmm0,%xmm4
2451	pand	480(%r10),%xmm2
2452	por	%xmm1,%xmm5
2453	pand	496(%r10),%xmm3
2454	por	%xmm2,%xmm4
2455	por	%xmm3,%xmm5
2456	por	%xmm5,%xmm4
2457	pshufd	$0x4e,%xmm4,%xmm0
2458	por	%xmm4,%xmm0
2459	leaq	256(%rdi),%rdi
/* Encodes movq %xmm0,%rdx. */
2460.byte	102,72,15,126,194
2461
/* Store the previous pass's top carry, then rewind %rbx with %rax. */
2462	movq	%rbp,(%rbx)
2463	leaq	32(%rbx,%rax,1),%rbx
2464	mulxq	0(%rsi),%r8,%r11
2465	xorq	%rbp,%rbp
2466	movq	%rdx,%r9
2467	mulxq	8(%rsi),%r14,%r12
2468	adoxq	-32(%rbx),%r8
2469	adcxq	%r14,%r11
2470	mulxq	16(%rsi),%r15,%r13
2471	adoxq	-24(%rbx),%r11
2472	adcxq	%r15,%r12
2473	mulxq	24(%rsi),%rdx,%r14
2474	adoxq	-16(%rbx),%r12
2475	adcxq	%rdx,%r13
/* Rewind %rcx as well. */
2476	leaq	(%rcx,%rax,1),%rcx
2477	leaq	32(%rsi),%rsi
2478	adoxq	-8(%rbx),%r13
2479	adcxq	%rbp,%r14
2480	adoxq	%rbp,%r14
2481
/* Reduction multiplier for this pass. */
2482	movq	%r8,%r15
2483	imulq	32+8(%rsp),%r8
2484
2485	movq	%r8,%rdx
2486	xorq	%rbp,%rbp
2487	movq	%rdi,8+8(%rsp)
2488
2489	mulxq	0(%rcx),%rax,%r10
2490	adcxq	%rax,%r15
2491	adoxq	%r11,%r10
2492	mulxq	8(%rcx),%rax,%r11
2493	adcxq	%rax,%r10
2494	adoxq	%r12,%r11
2495	mulxq	16(%rcx),%rax,%r12
2496	adcxq	%rax,%r11
2497	adoxq	%r13,%r12
2498	mulxq	24(%rcx),%rax,%r15
2499	movq	%r9,%rdx
2500	movq	24+8(%rsp),%rdi
2501	movq	%r10,-32(%rbx)
2502	adcxq	%rax,%r12
2503	movq	%r11,-24(%rbx)
2504	adoxq	%rbp,%r15
2505	movq	%r12,-16(%rbx)
2506	leaq	32(%rcx),%rcx
2507	jmp	.Lmulx4x_inner
2508
2509.align	32
/* Same as .Lmulx4x_1st, but also adding the existing limbs at 0..24(%rbx). */
2510.Lmulx4x_inner:
2511	mulxq	0(%rsi),%r10,%rax
2512	adcxq	%rbp,%r15
2513	adoxq	%r14,%r10
2514	mulxq	8(%rsi),%r11,%r14
2515	adcxq	0(%rbx),%r10
2516	adoxq	%rax,%r11
2517	mulxq	16(%rsi),%r12,%rax
2518	adcxq	8(%rbx),%r11
2519	adoxq	%r14,%r12
2520	mulxq	24(%rsi),%r13,%r14
2521	movq	%r8,%rdx
2522	adcxq	16(%rbx),%r12
2523	adoxq	%rax,%r13
2524	adcxq	24(%rbx),%r13
2525	adoxq	%rbp,%r14
2526	leaq	32(%rsi),%rsi
2527	leaq	32(%rbx),%rbx
2528	adcxq	%rbp,%r14
2529
2530	adoxq	%r15,%r10
2531	mulxq	0(%rcx),%rax,%r15
2532	adcxq	%rax,%r10
2533	adoxq	%r15,%r11
2534	mulxq	8(%rcx),%rax,%r15
2535	adcxq	%rax,%r11
2536	adoxq	%r15,%r12
2537	mulxq	16(%rcx),%rax,%r15
2538	movq	%r10,-40(%rbx)
2539	adcxq	%rax,%r12
2540	adoxq	%r15,%r13
2541	movq	%r11,-32(%rbx)
2542	mulxq	24(%rcx),%rax,%r15
2543	movq	%r9,%rdx
2544	leaq	32(%rcx),%rcx
2545	movq	%r12,-24(%rbx)
2546	adcxq	%rax,%r13
2547	adoxq	%rbp,%r15
2548	movq	%r13,-16(%rbx)
2549
2550	decq	%rdi
2551	jnz	.Lmulx4x_inner
2552
2553	movq	0+8(%rsp),%rax
2554	adcq	%rbp,%r15
/*
 * %rdi is 0 here (the loop just counted it down), so this subtraction only
 * sets CF when the carry stored at 0(%rbx) is non-zero; %rdi is reloaded
 * on the next line.
 */
2555	subq	0(%rbx),%rdi
2556	movq	8+8(%rsp),%rdi
2557	movq	16+8(%rsp),%r10
2558	adcq	%r15,%r14
2559	leaq	(%rsi,%rax,1),%rsi
2560	adcq	%rbp,%rbp
2561	movq	%r14,-8(%rbx)
2562
/* Another pass while the table pointer is below the end marker. */
2563	cmpq	%r10,%rdi
2564	jb	.Lmulx4x_outer
2565
/*
 * Done.  Set up the registers read at .Lsqrx4x_sub_entry: %rbp = rewound
 * %rcx vector, %rdi = rewound result vector, %rcx = negated byte count
 * / 32, %rdx = saved result pointer, %r12..%r15 = first four limbs at
 * %rbp with the lowest decremented.  %rax = 0 - (top carry | borrow of
 * -8(%rcx) - top limb).
 */
2566	movq	-8(%rcx),%r10
2567	movq	%rbp,%r8
2568	movq	(%rcx,%rax,1),%r12
2569	leaq	(%rcx,%rax,1),%rbp
2570	movq	%rax,%rcx
2571	leaq	(%rbx,%rax,1),%rdi
2572	xorl	%eax,%eax
2573	xorq	%r15,%r15
2574	subq	%r14,%r10
2575	adcq	%r15,%r15
2576	orq	%r15,%r8
2577	sarq	$3+2,%rcx
2578	subq	%r8,%rax
2579	movq	56+8(%rsp),%rdx
2580	decq	%r12
2581	movq	8(%rbp),%r13
2582	xorq	%r8,%r8
2583	movq	16(%rbp),%r14
2584	movq	24(%rbp),%r15
2585	jmp	.Lsqrx4x_sub_entry
2586.cfi_endproc
2587.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5: runs __bn_sqrx8x_internal + __bn_postx4x_internal five times
 * in a row and then mulx4x_internal once, all on a frame laid out the same
 * way as in bn_mulx4x_mont_gather5.
 *
 * Register arguments as used below:
 *   %rdi, %rcx, %rdx  kept in %xmm1, %xmm2, %xmm4 across the squarings
 *   %rsi              operand pointer handed to the squaring routine;
 *                     copied to %rdi before the final multiplication
 *   %r8               pointer; the qword it points to goes to 32(%rsp)
 *   %r9d              limb count, scaled to bytes by the shll below
 * Returns 1 in %rax.
 */
2588.type	bn_powerx5,@function
2589.align	32
2590bn_powerx5:
2591.cfi_startproc
/* %rax keeps the entry %rsp. */
2592	movq	%rsp,%rax
2593.cfi_def_cfa_register	%rax
2594.Lpowerx5_enter:
2595	pushq	%rbx
2596.cfi_offset	%rbx,-16
2597	pushq	%rbp
2598.cfi_offset	%rbp,-24
2599	pushq	%r12
2600.cfi_offset	%r12,-32
2601	pushq	%r13
2602.cfi_offset	%r13,-40
2603	pushq	%r14
2604.cfi_offset	%r14,-48
2605	pushq	%r15
2606.cfi_offset	%r15,-56
2607.Lpowerx5_prologue:
2608
/* %r9 = byte count, %r10 = 3 * byte count, then %r9 negated; %r8 = *%r8. */
2609	shll	$3,%r9d
2610	leaq	(%r9,%r9,2),%r10
2611	negq	%r9
2612	movq	(%r8),%r8
2613
2614
2615
2616
2617
2618
2619
2620
/*
 * Frame placement: same computation as in bn_mulx4x_mont_gather5 - the
 * candidate %rsp - 320 - 2*bytes is moved down depending on its distance
 * from %rdi modulo 4096.
 */
2621	leaq	-320(%rsp,%r9,2),%r11
2622	movq	%rsp,%rbp
2623	subq	%rdi,%r11
2624	andq	$4095,%r11
2625	cmpq	%r11,%r10
2626	jb	.Lpwrx_sp_alt
2627	subq	%r11,%rbp
2628	leaq	-320(%rbp,%r9,2),%rbp
2629	jmp	.Lpwrx_sp_done
2630
2631.align	32
2632.Lpwrx_sp_alt:
2633	leaq	4096-320(,%r9,2),%r10
2634	leaq	-320(%rbp,%r9,2),%rbp
2635	subq	%r10,%r11
2636	movq	$0,%r10
/* If the subtraction borrowed, use an adjustment of 0. */
2637	cmovcq	%r10,%r11
2638	subq	%r11,%rbp
2639.Lpwrx_sp_done:
/* 64-byte align, then step %rsp down reading a qword every 4096 bytes. */
2640	andq	$-64,%rbp
2641	movq	%rsp,%r11
2642	subq	%rbp,%r11
2643	andq	$-4096,%r11
2644	leaq	(%r11,%rbp,1),%rsp
2645	movq	(%rsp),%r10
2646	cmpq	%rbp,%rsp
2647	ja	.Lpwrx_page_walk
2648	jmp	.Lpwrx_page_walk_done
2649
2650.Lpwrx_page_walk:
2651	leaq	-4096(%rsp),%rsp
2652	movq	(%rsp),%r10
2653	cmpq	%rbp,%rsp
2654	ja	.Lpwrx_page_walk
2655.Lpwrx_page_walk_done:
2656
/* %r10 = negated byte count, %r9 = byte count. */
2657	movq	%r9,%r10
2658	negq	%r9
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
/* %xmm0 = 0: the squaring routine stores it to clear its scratch area. */
2671	pxor	%xmm0,%xmm0
/*
 * The four .byte lines encode movq %rdi,%xmm1 / movq %rcx,%xmm2 /
 * movq %r10,%xmm3 / movq %rdx,%xmm4.
 */
2672.byte	102,72,15,110,207
2673.byte	102,72,15,110,209
2674.byte	102,73,15,110,218
2675.byte	102,72,15,110,226
2676	movq	%r8,32(%rsp)
2677	movq	%rax,40(%rsp)
/* DWARF expression for the CFA: *(%rsp + 40) + 8. */
2678.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2679.Lpowerx5_body:
2680
/* Five squarings, each followed by the post-processing step. */
2681	call	__bn_sqrx8x_internal
2682	call	__bn_postx4x_internal
2683	call	__bn_sqrx8x_internal
2684	call	__bn_postx4x_internal
2685	call	__bn_sqrx8x_internal
2686	call	__bn_postx4x_internal
2687	call	__bn_sqrx8x_internal
2688	call	__bn_postx4x_internal
2689	call	__bn_sqrx8x_internal
2690	call	__bn_postx4x_internal
2691
/*
 * Set up mulx4x_internal: %r9 = %r10, %rdi = %rsi, and the .byte lines
 * encode movq %xmm2,%rcx / movq %xmm4,%rdx; %rax = saved entry %rsp.
 */
2692	movq	%r10,%r9
2693	movq	%rsi,%rdi
2694.byte	102,72,15,126,209
2695.byte	102,72,15,126,226
2696	movq	40(%rsp),%rax
2697
2698	call	mulx4x_internal
2699
/* Epilogue: recover the entry %rsp and reload the pushed registers. */
2700	movq	40(%rsp),%rsi
2701.cfi_def_cfa	%rsi,8
2702	movq	$1,%rax
2703
2704	movq	-48(%rsi),%r15
2705.cfi_restore	%r15
2706	movq	-40(%rsi),%r14
2707.cfi_restore	%r14
2708	movq	-32(%rsi),%r13
2709.cfi_restore	%r13
2710	movq	-24(%rsi),%r12
2711.cfi_restore	%r12
2712	movq	-16(%rsi),%rbp
2713.cfi_restore	%rbp
2714	movq	-8(%rsi),%rbx
2715.cfi_restore	%rbx
2716	leaq	(%rsi),%rsp
2717.cfi_def_cfa_register	%rsp
2718.Lpowerx5_epilogue:
/* Encodes rep ret. */
2719	.byte	0xf3,0xc3
2720.cfi_endproc
2721.size	bn_powerx5,.-bn_powerx5
2722
/*
 * bn_sqrx8x_internal / __bn_sqrx8x_internal: squaring part.  Execution
 * falls through into __bn_sqrx8x_reduction at the end of this block.
 *
 * In:  %rsi   operand a, processed in blocks of eight limbs
 *      %r9    byte length of a (a multiple of 64, given the zeroing loop)
 *      %xmm0  value stored to clear the scratch area (bn_powerx5 zeroes it)
 *      %xmm2, %xmm3  copied to %rbp and %rcx at the end of this part
 * Stack slots used (offsets include the return address):
 *      0+8(%rsp)    byte length
 *      8+8(%rsp)    %rsi + byte length, end of a
 *      16+8(%rsp)   saved carry (0 / -1) between inner-loop rounds
 *      24+8(%rsp)   scratch pointer at which the next outer round resumes
 *      48+8(%rsp).. scratch vector receiving the double-width result
 * Phase 1 accumulates the products a[i]*a[j] for i < j, phase 2
 * (.Lsqrx4x_shift_n_add) doubles them and adds the squares a[i]*a[i].
 */
2723.globl	bn_sqrx8x_internal
2724.hidden	bn_sqrx8x_internal
2725.type	bn_sqrx8x_internal,@function
2726.align	32
2727bn_sqrx8x_internal:
2728__bn_sqrx8x_internal:
2729.cfi_startproc
2730
2731
2732
2733
2734
2735
2736
2737
2738
2739
2740
2741
2742
2743
2744
2745
2746
2747
2748
2749
2750
2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
2762
2763
2764
2765
2766
2767
2768
2769
2770	leaq	48+8(%rsp),%rdi
2771	leaq	(%rsi,%r9,1),%rbp
2772	movq	%r9,0+8(%rsp)
2773	movq	%rbp,8+8(%rsp)
2774	jmp	.Lsqr8x_zero_start
2775
2776.align	32
/* Twelve bytes of prefixed nop used as padding ahead of the loop label. */
2777.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
/*
 * Clear the scratch vector with %xmm0: 128 bytes per iteration while %r9
 * drops by 64.  The first iteration enters at .Lsqr8x_zero_start, so the
 * first 64 bytes are left untouched.  %r9 is 0 when the loop ends.
 */
2778.Lsqrx8x_zero:
2779.byte	0x3e
2780	movdqa	%xmm0,0(%rdi)
2781	movdqa	%xmm0,16(%rdi)
2782	movdqa	%xmm0,32(%rdi)
2783	movdqa	%xmm0,48(%rdi)
2784.Lsqr8x_zero_start:
2785	movdqa	%xmm0,64(%rdi)
2786	movdqa	%xmm0,80(%rdi)
2787	movdqa	%xmm0,96(%rdi)
2788	movdqa	%xmm0,112(%rdi)
2789	leaq	128(%rdi),%rdi
2790	subq	$64,%r9
2791	jnz	.Lsqrx8x_zero
2792
2793	movq	0(%rsi),%rdx
2794
2795	xorq	%r10,%r10
2796	xorq	%r11,%r11
2797	xorq	%r12,%r12
2798	xorq	%r13,%r13
2799	xorq	%r14,%r14
2800	xorq	%r15,%r15
2801	leaq	48+8(%rsp),%rdi
/* %rbp = 0 for the whole outer loop; xorq also clears CF and OF. */
2802	xorq	%rbp,%rbp
2803	jmp	.Lsqrx8x_outer_loop
2804
2805.align	32
/*
 * Outer loop: cross products inside the current 8-limb block at %rsi,
 * added to the running values in %r9..%r15.  %rdx walks a[0], a[1], ...
 */
2806.Lsqrx8x_outer_loop:
/* a[0] * a[1..7] */
2807	mulxq	8(%rsi),%r8,%rax
2808	adcxq	%r9,%r8
2809	adoxq	%rax,%r10
2810	mulxq	16(%rsi),%r9,%rax
2811	adcxq	%r10,%r9
2812	adoxq	%rax,%r11
/* Encodes mulxq 24(%rsi),%r10,%rax. */
2813.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2814	adcxq	%r11,%r10
2815	adoxq	%rax,%r12
/* Encodes mulxq 32(%rsi),%r11,%rax. */
2816.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2817	adcxq	%r12,%r11
2818	adoxq	%rax,%r13
2819	mulxq	40(%rsi),%r12,%rax
2820	adcxq	%r13,%r12
2821	adoxq	%rax,%r14
2822	mulxq	48(%rsi),%r13,%rax
2823	adcxq	%r14,%r13
2824	adoxq	%r15,%rax
2825	mulxq	56(%rsi),%r14,%r15
2826	movq	8(%rsi),%rdx
2827	adcxq	%rax,%r14
2828	adoxq	%rbp,%r15
2829	adcq	64(%rdi),%r15
2830	movq	%r8,8(%rdi)
2831	movq	%r9,16(%rdi)
/* Keep the carry as 0 / -1 in %rcx; the xorq below clears the flags. */
2832	sbbq	%rcx,%rcx
2833	xorq	%rbp,%rbp
2834
2835
/* a[1] * a[2..7] */
2836	mulxq	16(%rsi),%r8,%rbx
2837	mulxq	24(%rsi),%r9,%rax
2838	adcxq	%r10,%r8
2839	adoxq	%rbx,%r9
2840	mulxq	32(%rsi),%r10,%rbx
2841	adcxq	%r11,%r9
2842	adoxq	%rax,%r10
/* Encodes mulxq 40(%rsi),%r11,%rax. */
2843.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2844	adcxq	%r12,%r10
2845	adoxq	%rbx,%r11
/* Encodes mulxq 48(%rsi),%r12,%rbx. */
2846.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2847	adcxq	%r13,%r11
2848	adoxq	%r14,%r12
/* Encodes mulxq 56(%rsi),%r13,%r14. */
2849.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2850	movq	16(%rsi),%rdx
2851	adcxq	%rax,%r12
2852	adoxq	%rbx,%r13
2853	adcxq	%r15,%r13
2854	adoxq	%rbp,%r14
2855	adcxq	%rbp,%r14
2856
2857	movq	%r8,24(%rdi)
2858	movq	%r9,32(%rdi)
2859
/* a[2] * a[3..7] */
2860	mulxq	24(%rsi),%r8,%rbx
2861	mulxq	32(%rsi),%r9,%rax
2862	adcxq	%r10,%r8
2863	adoxq	%rbx,%r9
2864	mulxq	40(%rsi),%r10,%rbx
2865	adcxq	%r11,%r9
2866	adoxq	%rax,%r10
/* Encodes mulxq 48(%rsi),%r11,%rax. */
2867.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2868	adcxq	%r12,%r10
2869	adoxq	%r13,%r11
/* Encodes mulxq 56(%rsi),%r12,%r13. */
2870.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2871.byte	0x3e
2872	movq	24(%rsi),%rdx
2873	adcxq	%rbx,%r11
2874	adoxq	%rax,%r12
2875	adcxq	%r14,%r12
2876	movq	%r8,40(%rdi)
2877	movq	%r9,48(%rdi)
/* a[3] * a[4..7] */
2878	mulxq	32(%rsi),%r8,%rax
2879	adoxq	%rbp,%r13
2880	adcxq	%rbp,%r13
2881
2882	mulxq	40(%rsi),%r9,%rbx
2883	adcxq	%r10,%r8
2884	adoxq	%rax,%r9
2885	mulxq	48(%rsi),%r10,%rax
2886	adcxq	%r11,%r9
2887	adoxq	%r12,%r10
2888	mulxq	56(%rsi),%r11,%r12
2889	movq	32(%rsi),%rdx
2890	movq	40(%rsi),%r14
2891	adcxq	%rbx,%r10
2892	adoxq	%rax,%r11
2893	movq	48(%rsi),%r15
2894	adcxq	%r13,%r11
2895	adoxq	%rbp,%r12
2896	adcxq	%rbp,%r12
2897
2898	movq	%r8,56(%rdi)
2899	movq	%r9,64(%rdi)
2900
/* a[4] * a[5..7]; a[5], a[6], a[7] are held in %r14, %r15, %r8. */
2901	mulxq	%r14,%r9,%rax
2902	movq	56(%rsi),%r8
2903	adcxq	%r10,%r9
2904	mulxq	%r15,%r10,%rbx
2905	adoxq	%rax,%r10
2906	adcxq	%r11,%r10
2907	mulxq	%r8,%r11,%rax
2908	movq	%r14,%rdx
2909	adoxq	%rbx,%r11
2910	adcxq	%r12,%r11
2911
2912	adcxq	%rbp,%rax
2913
/* a[5] * a[6..7] */
2914	mulxq	%r15,%r14,%rbx
2915	mulxq	%r8,%r12,%r13
2916	movq	%r15,%rdx
2917	leaq	64(%rsi),%rsi
2918	adcxq	%r14,%r11
2919	adoxq	%rbx,%r12
2920	adcxq	%rax,%r12
2921	adoxq	%rbp,%r13
2922
2923.byte	0x67,0x67
/* a[6] * a[7] */
2924	mulxq	%r8,%r8,%r14
2925	adcxq	%r8,%r13
2926	adcxq	%rbp,%r14
2927
/* %rsi now points at the next block; stop when that is the end of a. */
2928	cmpq	8+8(%rsp),%rsi
2929	je	.Lsqrx8x_outer_break
2930
/*
 * negq restores the carry saved in %rcx into CF; the following mov
 * instructions do not touch the flags, so the adcx/adc chain consumes it.
 */
2931	negq	%rcx
2932	movq	$-8,%rcx
2933	movq	%rbp,%r15
2934	movq	64(%rdi),%r8
2935	adcxq	72(%rdi),%r9
2936	adcxq	80(%rdi),%r10
2937	adcxq	88(%rdi),%r11
2938	adcq	96(%rdi),%r12
2939	adcq	104(%rdi),%r13
2940	adcq	112(%rdi),%r14
2941	adcq	120(%rdi),%r15
2942	leaq	(%rsi),%rbp
2943	leaq	128(%rdi),%rdi
2944	sbbq	%rax,%rax
2945
/* First multiplier of the finished block; save carry and resume pointer. */
2946	movq	-64(%rsi),%rdx
2947	movq	%rax,16+8(%rsp)
2948	movq	%rdi,24+8(%rsp)
2949
2950
2951	xorl	%eax,%eax
2952	jmp	.Lsqrx8x_loop
2953
2954.align	32
/*
 * Inner loop: the eight limbs at -64..-8(%rsi) (one per iteration, %rcx
 * runs -8..-1) times the eight limbs at %rbp, accumulated in %r8..%r15.
 * The finished low limb is stored at (%rdi,%rcx,8).
 */
2955.Lsqrx8x_loop:
2956	movq	%r8,%rbx
2957	mulxq	0(%rbp),%rax,%r8
2958	adcxq	%rax,%rbx
2959	adoxq	%r9,%r8
2960
2961	mulxq	8(%rbp),%rax,%r9
2962	adcxq	%rax,%r8
2963	adoxq	%r10,%r9
2964
2965	mulxq	16(%rbp),%rax,%r10
2966	adcxq	%rax,%r9
2967	adoxq	%r11,%r10
2968
2969	mulxq	24(%rbp),%rax,%r11
2970	adcxq	%rax,%r10
2971	adoxq	%r12,%r11
2972
/* Encodes mulxq 32(%rbp),%rax,%r12. */
2973.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
2974	adcxq	%rax,%r11
2975	adoxq	%r13,%r12
2976
2977	mulxq	40(%rbp),%rax,%r13
2978	adcxq	%rax,%r12
2979	adoxq	%r14,%r13
2980
2981	mulxq	48(%rbp),%rax,%r14
2982	movq	%rbx,(%rdi,%rcx,8)
/* movl keeps the flags intact, unlike xor. */
2983	movl	$0,%ebx
2984	adcxq	%rax,%r13
2985	adoxq	%r15,%r14
2986
/* Encodes mulxq 56(%rbp),%rax,%r15. */
2987.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
2988	movq	8(%rsi,%rcx,8),%rdx
2989	adcxq	%rax,%r14
2990	adoxq	%rbx,%r15
2991	adcxq	%rbx,%r15
2992
2993.byte	0x67
2994	incq	%rcx
2995	jnz	.Lsqrx8x_loop
2996
2997	leaq	64(%rbp),%rbp
2998	movq	$-8,%rcx
2999	cmpq	8+8(%rsp),%rbp
3000	je	.Lsqrx8x_break
3001
/*
 * %rbx is 0 here, so this subtraction only moves the saved 0 / -1 carry
 * into CF; then the next eight scratch limbs are added.
 */
3002	subq	16+8(%rsp),%rbx
3003.byte	0x66
3004	movq	-64(%rsi),%rdx
3005	adcxq	0(%rdi),%r8
3006	adcxq	8(%rdi),%r9
3007	adcq	16(%rdi),%r10
3008	adcq	24(%rdi),%r11
3009	adcq	32(%rdi),%r12
3010	adcq	40(%rdi),%r13
3011	adcq	48(%rdi),%r14
3012	adcq	56(%rdi),%r15
3013	leaq	64(%rdi),%rdi
3014.byte	0x67
3015	sbbq	%rax,%rax
3016	xorl	%ebx,%ebx
3017	movq	%rax,16+8(%rsp)
3018	jmp	.Lsqrx8x_loop
3019
3020.align	32
/*
 * All blocks above the current one are done: fold in the saved carry and
 * go back to the outer loop for the next block of a.
 */
3021.Lsqrx8x_break:
3022	xorq	%rbp,%rbp
3023	subq	16+8(%rsp),%rbx
3024	adcxq	%rbp,%r8
3025	movq	24+8(%rsp),%rcx
3026	adcxq	%rbp,%r9
3027	movq	0(%rsi),%rdx
3028	adcq	$0,%r10
3029	movq	%r8,0(%rdi)
3030	adcq	$0,%r11
3031	adcq	$0,%r12
3032	adcq	$0,%r13
3033	adcq	$0,%r14
3034	adcq	$0,%r15
3035	cmpq	%rcx,%rdi
3036	je	.Lsqrx8x_outer_loop
3037
/*
 * Otherwise spill the accumulator at %rdi and reload the values stored at
 * the resume pointer before continuing there.
 */
3038	movq	%r9,8(%rdi)
3039	movq	8(%rcx),%r9
3040	movq	%r10,16(%rdi)
3041	movq	16(%rcx),%r10
3042	movq	%r11,24(%rdi)
3043	movq	24(%rcx),%r11
3044	movq	%r12,32(%rdi)
3045	movq	32(%rcx),%r12
3046	movq	%r13,40(%rdi)
3047	movq	40(%rcx),%r13
3048	movq	%r14,48(%rdi)
3049	movq	48(%rcx),%r14
3050	movq	%r15,56(%rdi)
3051	movq	56(%rcx),%r15
3052	movq	%rcx,%rdi
3053	jmp	.Lsqrx8x_outer_loop
3054
3055.align	32
/* Last block: store the remaining limbs and set up for phase 2. */
3056.Lsqrx8x_outer_break:
3057	movq	%r9,72(%rdi)
/* Encodes movq %xmm3,%rcx. */
3058.byte	102,72,15,126,217
3059	movq	%r10,80(%rdi)
3060	movq	%r11,88(%rdi)
3061	movq	%r12,96(%rdi)
3062	movq	%r13,104(%rdi)
3063	movq	%r14,112(%rdi)
3064	leaq	48+8(%rsp),%rdi
/* %rsi is the end of a here; %rcx indexes back from it. */
3065	movq	(%rsi,%rcx,1),%rdx
3066
3067	movq	8(%rdi),%r11
/* xorq clears CF and OF for the two chains used below. */
3068	xorq	%r10,%r10
3069	movq	0+8(%rsp),%r9
3070	adoxq	%r11,%r11
3071	movq	16(%rdi),%r12
3072	movq	24(%rdi),%r13
3073
3074
3075.align	32
/*
 * Phase 2, four limbs of a (eight result limbs) per iteration:
 * "adoxq reg,reg" doubles a scratch limb with the carry travelling in OF,
 * "mulxq %rdx,..." squares the current limb of a, and the adcx chain adds
 * the square to the doubled limbs.  %rcx counts up by 32 to zero.
 */
3076.Lsqrx4x_shift_n_add:
3077	mulxq	%rdx,%rax,%rbx
3078	adoxq	%r12,%r12
3079	adcxq	%r10,%rax
/* Encodes movq 8(%rsi,%rcx,1),%rdx. */
3080.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
/* Encodes movq 32(%rdi),%r10. */
3081.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3082	adoxq	%r13,%r13
3083	adcxq	%r11,%rbx
3084	movq	40(%rdi),%r11
3085	movq	%rax,0(%rdi)
3086	movq	%rbx,8(%rdi)
3087
3088	mulxq	%rdx,%rax,%rbx
3089	adoxq	%r10,%r10
3090	adcxq	%r12,%rax
3091	movq	16(%rsi,%rcx,1),%rdx
3092	movq	48(%rdi),%r12
3093	adoxq	%r11,%r11
3094	adcxq	%r13,%rbx
3095	movq	56(%rdi),%r13
3096	movq	%rax,16(%rdi)
3097	movq	%rbx,24(%rdi)
3098
3099	mulxq	%rdx,%rax,%rbx
3100	adoxq	%r12,%r12
3101	adcxq	%r10,%rax
3102	movq	24(%rsi,%rcx,1),%rdx
3103	leaq	32(%rcx),%rcx
3104	movq	64(%rdi),%r10
3105	adoxq	%r13,%r13
3106	adcxq	%r11,%rbx
3107	movq	72(%rdi),%r11
3108	movq	%rax,32(%rdi)
3109	movq	%rbx,40(%rdi)
3110
3111	mulxq	%rdx,%rax,%rbx
3112	adoxq	%r10,%r10
3113	adcxq	%r12,%rax
/* jrcxz tests %rcx without touching the flags that carry both chains. */
3114	jrcxz	.Lsqrx4x_shift_n_add_break
/* Encodes movq 0(%rsi,%rcx,1),%rdx. */
3115.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3116	adoxq	%r11,%r11
3117	adcxq	%r13,%rbx
3118	movq	80(%rdi),%r12
3119	movq	88(%rdi),%r13
3120	movq	%rax,48(%rdi)
3121	movq	%rbx,56(%rdi)
3122	leaq	64(%rdi),%rdi
3123	nop
3124	jmp	.Lsqrx4x_shift_n_add
3125
3126.align	32
3127.Lsqrx4x_shift_n_add_break:
3128	adcxq	%r13,%rbx
3129	movq	%rax,48(%rdi)
3130	movq	%rbx,56(%rdi)
3131	leaq	64(%rdi),%rdi
/* Encodes movq %xmm2,%rbp. */
3132.byte	102,72,15,126,213
/*
 * __bn_sqrx8x_reduction: second entry label inside bn_sqrx8x_internal
 * (closed by the .size directive at the end of this block); the mulx /
 * adcx / adox counterpart of __bn_sqr8x_reduction.  Eight-limb slices read
 * through %rbp are multiplied by per-limb multipliers and accumulated into
 * the scratch vector at 48+8(%rsp), 64 bytes of it per pass.  Presumably
 * %rbp is the modulus and 32+8(%rsp) is n0 - confirm against
 * x86_64-mont5.pl.
 *
 * In:  %rbp = vector multiplied in, %r9 = its byte length,
 *      %rdi = end of the scratch vector, 32+8(%rsp) = multiplier factor,
 *      %xmm2 / %xmm3 = values copied back into %rbp / %rcx each pass.
 * Slots written: 0+8(%rsp) = %rbp + %r9 - 64, 8+8(%rsp) = scratch end,
 *      16+8(%rsp) = saved tail carry, 24+8(%rsp) = carry of previous pass,
 *      48+8(%rsp)..48+56+8(%rsp) = the eight multipliers of the pass.
 * On exit %rax holds the carry out of the last pass.
 */
3133__bn_sqrx8x_reduction:
3134	xorl	%eax,%eax
3135	movq	32+8(%rsp),%rbx
3136	movq	48+8(%rsp),%rdx
3137	leaq	-64(%rbp,%r9,1),%rcx
3138
3139	movq	%rcx,0+8(%rsp)
3140	movq	%rdi,8+8(%rsp)
3141
3142	leaq	48+8(%rsp),%rdi
3143	jmp	.Lsqrx8x_reduction_loop
3144
3145.align	32
/*
 * One pass per 64-byte window at %rdi.  %rdx arrives holding the lowest
 * limb of the window; it is copied to %r8 and turned into the first
 * multiplier.
 */
3146.Lsqrx8x_reduction_loop:
3147	movq	8(%rdi),%r9
3148	movq	16(%rdi),%r10
3149	movq	24(%rdi),%r11
3150	movq	32(%rdi),%r12
3151	movq	%rdx,%r8
3152	imulq	%rbx,%rdx
3153	movq	40(%rdi),%r13
3154	movq	48(%rdi),%r14
3155	movq	56(%rdi),%r15
/* Park the carry of the previous pass (0 on the first). */
3156	movq	%rax,24+8(%rsp)
3157
3158	leaq	64(%rdi),%rdi
/* %rsi = 0; xorq also clears CF and OF. */
3159	xorq	%rsi,%rsi
3160	movq	$-8,%rcx
3161	jmp	.Lsqrx8x_reduce
3162
3163.align	32
/*
 * Eight iterations (%rcx runs -8..-1): add %rdx times the eight qwords at
 * %rbp into the accumulator and shift it down by one limb.
 */
3164.Lsqrx8x_reduce:
3165	movq	%r8,%rbx
3166	mulxq	0(%rbp),%rax,%r8
/* The sum lands in %rax and is discarded; only its carry is used. */
3167	adcxq	%rbx,%rax
3168	adoxq	%r9,%r8
3169
3170	mulxq	8(%rbp),%rbx,%r9
3171	adcxq	%rbx,%r8
3172	adoxq	%r10,%r9
3173
3174	mulxq	16(%rbp),%rbx,%r10
3175	adcxq	%rbx,%r9
3176	adoxq	%r11,%r10
3177
3178	mulxq	24(%rbp),%rbx,%r11
3179	adcxq	%rbx,%r10
3180	adoxq	%r12,%r11
3181
/* Encodes mulxq 32(%rbp),%rbx,%r12. */
3182.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3183	movq	%rdx,%rax
3184	movq	%r8,%rdx
3185	adcxq	%rbx,%r11
3186	adoxq	%r13,%r12
3187
/*
 * Next multiplier: %rbx = low half of (new low limb * 32+8(%rsp)); the
 * high half written to %rdx is overwritten straight away.  mulx leaves
 * the flags untouched, so both carry chains stay intact.
 */
3188	mulxq	32+8(%rsp),%rbx,%rdx
3189	movq	%rax,%rdx
/* Save the current multiplier for the tail loop. */
3190	movq	%rax,64+48+8(%rsp,%rcx,8)
3191
3192	mulxq	40(%rbp),%rax,%r13
3193	adcxq	%rax,%r12
3194	adoxq	%r14,%r13
3195
3196	mulxq	48(%rbp),%rax,%r14
3197	adcxq	%rax,%r13
3198	adoxq	%r15,%r14
3199
3200	mulxq	56(%rbp),%rax,%r15
3201	movq	%rbx,%rdx
3202	adcxq	%rax,%r14
3203	adoxq	%rsi,%r15
3204	adcxq	%rsi,%r15
3205
3206.byte	0x67,0x67,0x67
3207	incq	%rcx
3208	jnz	.Lsqrx8x_reduce
3209
/* %rax = 0; skip the tail if this slice was the last one. */
3210	movq	%rsi,%rax
3211	cmpq	0+8(%rsp),%rbp
3212	jae	.Lsqrx8x_no_tail
3213
/* First saved multiplier; add the next eight scratch limbs. */
3214	movq	48+8(%rsp),%rdx
3215	addq	0(%rdi),%r8
3216	leaq	64(%rbp),%rbp
3217	movq	$-8,%rcx
3218	adcxq	8(%rdi),%r9
3219	adcxq	16(%rdi),%r10
3220	adcq	24(%rdi),%r11
3221	adcq	32(%rdi),%r12
3222	adcq	40(%rdi),%r13
3223	adcq	48(%rdi),%r14
3224	adcq	56(%rdi),%r15
3225	leaq	64(%rdi),%rdi
3226	sbbq	%rax,%rax
3227
3228	xorq	%rsi,%rsi
3229	movq	%rax,16+8(%rsp)
3230	jmp	.Lsqrx8x_tail
3231
3232.align	32
/*
 * Tail: apply the eight saved multipliers to the current slice at %rbp;
 * each iteration stores one finished limb at (%rdi,%rcx,8).
 */
3233.Lsqrx8x_tail:
3234	movq	%r8,%rbx
3235	mulxq	0(%rbp),%rax,%r8
3236	adcxq	%rax,%rbx
3237	adoxq	%r9,%r8
3238
3239	mulxq	8(%rbp),%rax,%r9
3240	adcxq	%rax,%r8
3241	adoxq	%r10,%r9
3242
3243	mulxq	16(%rbp),%rax,%r10
3244	adcxq	%rax,%r9
3245	adoxq	%r11,%r10
3246
3247	mulxq	24(%rbp),%rax,%r11
3248	adcxq	%rax,%r10
3249	adoxq	%r12,%r11
3250
/* Encodes mulxq 32(%rbp),%rax,%r12. */
3251.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3252	adcxq	%rax,%r11
3253	adoxq	%r13,%r12
3254
3255	mulxq	40(%rbp),%rax,%r13
3256	adcxq	%rax,%r12
3257	adoxq	%r14,%r13
3258
3259	mulxq	48(%rbp),%rax,%r14
3260	adcxq	%rax,%r13
3261	adoxq	%r15,%r14
3262
3263	mulxq	56(%rbp),%rax,%r15
/* Next saved multiplier. */
3264	movq	72+48+8(%rsp,%rcx,8),%rdx
3265	adcxq	%rax,%r14
3266	adoxq	%rsi,%r15
3267	movq	%rbx,(%rdi,%rcx,8)
3268	movq	%r8,%rbx
3269	adcxq	%rsi,%r15
3270
3271	incq	%rcx
3272	jnz	.Lsqrx8x_tail
3273
3274	cmpq	0+8(%rsp),%rbp
3275	jae	.Lsqrx8x_tail_done
3276
/*
 * More slices: %rsi is 0, so the subtraction only moves the saved 0 / -1
 * carry into CF for the adc chain.
 */
3277	subq	16+8(%rsp),%rsi
3278	movq	48+8(%rsp),%rdx
3279	leaq	64(%rbp),%rbp
3280	adcq	0(%rdi),%r8
3281	adcq	8(%rdi),%r9
3282	adcq	16(%rdi),%r10
3283	adcq	24(%rdi),%r11
3284	adcq	32(%rdi),%r12
3285	adcq	40(%rdi),%r13
3286	adcq	48(%rdi),%r14
3287	adcq	56(%rdi),%r15
3288	leaq	64(%rdi),%rdi
3289	sbbq	%rax,%rax
/* %rcx is 0 after the loop; reset the iteration counter to -8. */
3290	subq	$8,%rcx
3291
3292	xorq	%rsi,%rsi
3293	movq	%rax,16+8(%rsp)
3294	jmp	.Lsqrx8x_tail
3295
3296.align	32
/* Fold in the carry parked at 24+8(%rsp) at the start of this pass. */
3297.Lsqrx8x_tail_done:
3298	xorq	%rax,%rax
3299	addq	24+8(%rsp),%r8
3300	adcq	$0,%r9
3301	adcq	$0,%r10
3302	adcq	$0,%r11
3303	adcq	$0,%r12
3304	adcq	$0,%r13
3305	adcq	$0,%r14
3306	adcq	$0,%r15
3307	adcq	$0,%rax
3308
/* Restore the saved tail carry into CF. */
3309	subq	16+8(%rsp),%rsi
/*
 * Add the top eight scratch limbs, write the window back and set up the
 * next pass.  The interleaved mov and .byte instructions leave the flags
 * untouched.
 */
3310.Lsqrx8x_no_tail:
3311	adcq	0(%rdi),%r8
/* Encodes movq %xmm3,%rcx. */
3312.byte	102,72,15,126,217
3313	adcq	8(%rdi),%r9
3314	movq	56(%rbp),%rsi
/* Encodes movq %xmm2,%rbp. */
3315.byte	102,72,15,126,213
3316	adcq	16(%rdi),%r10
3317	adcq	24(%rdi),%r11
3318	adcq	32(%rdi),%r12
3319	adcq	40(%rdi),%r13
3320	adcq	48(%rdi),%r14
3321	adcq	56(%rdi),%r15
3322	adcq	$0,%rax
3323
/* %rbx = multiplier factor, %rdx = lowest limb of the next window. */
3324	movq	32+8(%rsp),%rbx
3325	movq	64(%rdi,%rcx,1),%rdx
3326
3327	movq	%r8,0(%rdi)
3328	leaq	64(%rdi),%r8
3329	movq	%r9,8(%rdi)
3330	movq	%r10,16(%rdi)
3331	movq	%r11,24(%rdi)
3332	movq	%r12,32(%rdi)
3333	movq	%r13,40(%rdi)
3334	movq	%r14,48(%rdi)
3335	movq	%r15,56(%rdi)
3336
/* Move %rdi to the next window; loop until %r8 reaches the scratch end. */
3337	leaq	64(%rdi,%rcx,1),%rdi
3338	cmpq	8+8(%rsp),%r8
3339	jb	.Lsqrx8x_reduction_loop
/* Encodes rep ret. */
3340	.byte	0xf3,0xc3
3341.cfi_endproc
3342.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
/*
 * __bn_postx4x_internal -- masked multi-word subtraction, four 64-bit
 * words per loop iteration.
 *
 * Every word loaded from (%rbp) is bitwise complemented and ANDed with the
 * mask in %rax. The very first word is decremented before the complement,
 * so the complemented words together form the two's-complement negation
 * (this holds as long as that first word is non-zero). The masked words
 * are added with carry to the words at (%rdi) and the sums are stored
 * through the pointer taken from %xmm1. Net effect per call:
 *   mask all ones: out = (%rdi) - (%rbp)
 *   mask zero:     out = (%rdi)
 *
 * In:   %rbp   words to subtract (presumably the modulus -- confirm at caller)
 *       %rdi   words to subtract from
 *       %rax   selector, negated on entry to form the mask (presumably 0 or
 *              1 on entry -- confirm at caller)
 *       %rcx   loop count source: shifted right arithmetically by 5, then
 *              counted up to zero (presumably minus the length in bytes --
 *              confirm at caller)
 *       %xmm1  low 64 bits = output pointer
 * Out:  %r10 = entry value of %rcx, %r9 = minus the entry value of %rcx,
 *       %rsi = output pointer; %rbp, %rdi and %rdx are left advanced past
 *       the processed words.
 * Also overwrites %rax, %rcx, %r8, %r12-%r15 and the flags.
 */
3343.align	32
3344__bn_postx4x_internal:
3345.cfi_startproc
3346	movq	0(%rbp),%r12	/* first word of the subtrahend */
3347	movq	%rcx,%r10	/* keep two copies of the incoming count */
3348	movq	%rcx,%r9
3349	negq	%rax	/* selector -> AND mask */
3350	sarq	$3+2,%rcx	/* count / 32: one loop pass handles 32 bytes */
3351
3352.byte	102,72,15,126,202	/* movq %xmm1,%rdx : output pointer */
3353.byte	102,72,15,126,206	/* movq %xmm1,%rsi : second copy, not advanced here */
3354	decq	%r12	/* ~(x-1) == -x: turns the complement below into a negation */
3355	movq	8(%rbp),%r13
3356	xorq	%r8,%r8	/* saved carry = 0 for the first pass */
3357	movq	16(%rbp),%r14
3358	movq	24(%rbp),%r15
3359	jmp	.Lsqrx4x_sub_entry
3360
3361.align	16
3362.Lsqrx4x_sub:
3363	movq	0(%rbp),%r12	/* next four subtrahend words */
3364	movq	8(%rbp),%r13
3365	movq	16(%rbp),%r14
3366	movq	24(%rbp),%r15
3367.Lsqrx4x_sub_entry:
3368	andnq	%rax,%r12,%r12	/* r12 = ~r12 & mask */
3369	leaq	32(%rbp),%rbp	/* lea leaves the flags alone */
3370	andnq	%rax,%r13,%r13
3371	andnq	%rax,%r14,%r14
3372	andnq	%rax,%r15,%r15
3373
3374	negq	%r8	/* reload CF from the saved carry (0 or -1) */
3375	adcq	0(%rdi),%r12	/* add-with-carry chain across the four words */
3376	adcq	8(%rdi),%r13
3377	adcq	16(%rdi),%r14
3378	adcq	24(%rdi),%r15
3379	movq	%r12,0(%rdx)
3380	leaq	32(%rdi),%rdi
3381	movq	%r13,8(%rdx)
3382	sbbq	%r8,%r8	/* save CF as 0 / -1 for the next pass */
3383	movq	%r14,16(%rdx)
3384	movq	%r15,24(%rdx)
3385	leaq	32(%rdx),%rdx
3386
3387	incq	%rcx	/* counter runs upward and stops at zero */
3388	jnz	.Lsqrx4x_sub
3389
3390	negq	%r9	/* r9 = minus the entry value of %rcx */
3391
3392	.byte	0xf3,0xc3	/* rep ret */
3393.cfi_endproc
3394.size	__bn_postx4x_internal,.-__bn_postx4x_internal
/*
 * bn_get_bits5 -- return a 5-bit field taken from a bit string in memory.
 *
 * In:   %rdi  base address of the data
 *       %esi  bit offset of the field
 * Out:  %eax  the five bits starting at that offset (value 0..31)
 *
 * The field is fetched with a single zero-extended 16-bit load. When the
 * bit position inside the 16-bit unit is above 11, five bits would run
 * past the end of the unit, so the load address is moved up by one byte
 * and the shift count is reduced by 8.
 * Overwrites %ecx, %esi, %r10, %r11 and the flags.
 * NOTE(review): argument registers match the System V AMD64 order for
 * (pointer, int) -- confirm against the C prototype.
 */
3395.globl	bn_get_bits5
3396.type	bn_get_bits5,@function
3397.align	16
3398bn_get_bits5:
3399.cfi_startproc
3400	leaq	0(%rdi),%r10	/* candidate base: data */
3401	leaq	1(%rdi),%r11	/* candidate base: data + 1 byte */
3402	movl	%esi,%ecx
3403	shrl	$4,%esi	/* index of the 16-bit unit */
3404	andl	$15,%ecx	/* bit position inside that unit */
3405	leal	-8(%rcx),%eax	/* shift to use with the byte-shifted base */
3406	cmpl	$11,%ecx
3407	cmovaq	%r11,%r10	/* position > 11: read one byte further on */
3408	cmoval	%eax,%ecx	/* ... and shift by position - 8 */
3409	movzwl	(%r10,%rsi,2),%eax
3410	shrl	%cl,%eax
3411	andl	$31,%eax	/* keep five bits */
3412	.byte	0xf3,0xc3	/* rep ret */
3413.cfi_endproc
3414.size	bn_get_bits5,.-bn_get_bits5
3415
/*
 * bn_scatter5 -- copy consecutive 64-bit words into a table using a
 * 256-byte stride.
 *
 * In:   %rdi  source words (read sequentially)
 *       %esi  number of words to copy; zero returns immediately
 *       %rdx  table base
 *       %rcx  slot index; the first store goes to %rdx + 8 * %rcx
 *
 * Word i of the source is stored at table + 8 * index + 256 * i, i.e. the
 * words of one value are interleaved with the same-numbered words of the
 * other slots in 256-byte rows (32 eight-byte slots per row).
 * Overwrites %rax, %rdx, %rdi, %esi and the flags.
 */
3416.globl	bn_scatter5
3417.type	bn_scatter5,@function
3418.align	16
3419bn_scatter5:
3420.cfi_startproc
3421	cmpl	$0,%esi
3422	jz	.Lscatter_epilogue	/* nothing to copy */
3423	leaq	(%rdx,%rcx,8),%rdx	/* address of the slot in row 0 */
3424.Lscatter:
3425	movq	(%rdi),%rax
3426	leaq	8(%rdi),%rdi
3427	movq	%rax,(%rdx)
3428	leaq	256(%rdx),%rdx	/* same slot, next row */
3429	subl	$1,%esi
3430	jnz	.Lscatter
3431.Lscatter_epilogue:
3432	.byte	0xf3,0xc3	/* rep ret */
3433.cfi_endproc
3434.size	bn_scatter5,.-bn_scatter5
3435
/*
 * bn_gather5 -- read one slot out of a table laid out in 256-byte rows
 * (the layout bn_scatter5 writes), touching every slot of every row.
 *
 * In:   %rdi  output words (written sequentially)
 *       %esi  number of words (rows) to produce
 *       %rdx  table base; rows are read with movdqa, so it has to be
 *             16-byte aligned
 *       %ecx  slot index to select
 *
 * Phase 1 builds sixteen 128-bit masks on the stack. Mask k holds the
 * result of comparing the dword lanes {2k, 2k, 2k+1, 2k+1} with the index,
 * so each 64-bit half is all ones for the matching slot number and zero
 * otherwise.
 * Phase 2, per row: load all 256 bytes, AND them with the masks, OR
 * everything together and fold the two 64-bit halves into one word.
 * The addresses read do not depend on the index value.
 *
 * Stack: 264 bytes are reserved and %rsp is aligned down to 16; the entry
 * %rsp is kept in %r10 and restored before returning.
 * Overwrites %rax, %r10, %r11, %rdi, %esi, %xmm0-%xmm5 and the flags.
 * NOTE(review): the loop body runs before the count is tested, so a count
 * of zero is not handled here -- confirm callers never pass 0.
 */
3436.globl	bn_gather5
3437.type	bn_gather5,@function
3438.align	32
3439bn_gather5:
3440.LSEH_begin_bn_gather5:
3441.cfi_startproc
3442
3443.byte	0x4c,0x8d,0x14,0x24	/* leaq (%rsp),%r10 : remember entry %rsp */
3444.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* subq $0x108,%rsp */
3445	leaq	.Linc(%rip),%rax
3446	andq	$-16,%rsp	/* movdqa stores below need 16-byte alignment */
3447
3448	movd	%ecx,%xmm5	/* slot index */
3449	movdqa	0(%rax),%xmm0	/* lanes {0,0,1,1} */
3450	movdqa	16(%rax),%xmm1	/* step {2,2,2,2} */
3451	leaq	128(%rdx),%r11	/* bias table pointer so offsets fit -128..112 */
3452	leaq	128(%rsp),%rax	/* same bias for the mask area */
3453
3454	pshufd	$0,%xmm5,%xmm5	/* broadcast the index to all four lanes */
3455	movdqa	%xmm1,%xmm4	/* xmm4 keeps the step constant throughout */
3456	movdqa	%xmm1,%xmm2
3457	paddd	%xmm0,%xmm1	/* next lane numbers */
3458	pcmpeqd	%xmm5,%xmm0	/* mask 0 */
3459	movdqa	%xmm4,%xmm3
3460
3461	paddd	%xmm1,%xmm2
3462	pcmpeqd	%xmm5,%xmm1
3463	movdqa	%xmm0,-128(%rax)
3464	movdqa	%xmm4,%xmm0
3465
3466	paddd	%xmm2,%xmm3
3467	pcmpeqd	%xmm5,%xmm2
3468	movdqa	%xmm1,-112(%rax)
3469	movdqa	%xmm4,%xmm1
3470
3471	paddd	%xmm3,%xmm0
3472	pcmpeqd	%xmm5,%xmm3
3473	movdqa	%xmm2,-96(%rax)
3474	movdqa	%xmm4,%xmm2
3475	paddd	%xmm0,%xmm1
3476	pcmpeqd	%xmm5,%xmm0
3477	movdqa	%xmm3,-80(%rax)
3478	movdqa	%xmm4,%xmm3
3479
3480	paddd	%xmm1,%xmm2
3481	pcmpeqd	%xmm5,%xmm1
3482	movdqa	%xmm0,-64(%rax)
3483	movdqa	%xmm4,%xmm0
3484
3485	paddd	%xmm2,%xmm3
3486	pcmpeqd	%xmm5,%xmm2
3487	movdqa	%xmm1,-48(%rax)
3488	movdqa	%xmm4,%xmm1
3489
3490	paddd	%xmm3,%xmm0
3491	pcmpeqd	%xmm5,%xmm3
3492	movdqa	%xmm2,-32(%rax)
3493	movdqa	%xmm4,%xmm2
3494	paddd	%xmm0,%xmm1
3495	pcmpeqd	%xmm5,%xmm0
3496	movdqa	%xmm3,-16(%rax)
3497	movdqa	%xmm4,%xmm3
3498
3499	paddd	%xmm1,%xmm2
3500	pcmpeqd	%xmm5,%xmm1
3501	movdqa	%xmm0,0(%rax)
3502	movdqa	%xmm4,%xmm0
3503
3504	paddd	%xmm2,%xmm3
3505	pcmpeqd	%xmm5,%xmm2
3506	movdqa	%xmm1,16(%rax)
3507	movdqa	%xmm4,%xmm1
3508
3509	paddd	%xmm3,%xmm0
3510	pcmpeqd	%xmm5,%xmm3
3511	movdqa	%xmm2,32(%rax)
3512	movdqa	%xmm4,%xmm2
3513	paddd	%xmm0,%xmm1
3514	pcmpeqd	%xmm5,%xmm0
3515	movdqa	%xmm3,48(%rax)
3516	movdqa	%xmm4,%xmm3
3517
3518	paddd	%xmm1,%xmm2
3519	pcmpeqd	%xmm5,%xmm1
3520	movdqa	%xmm0,64(%rax)
3521	movdqa	%xmm4,%xmm0
3522
3523	paddd	%xmm2,%xmm3
3524	pcmpeqd	%xmm5,%xmm2
3525	movdqa	%xmm1,80(%rax)
3526	movdqa	%xmm4,%xmm1
3527
3528	paddd	%xmm3,%xmm0
3529	pcmpeqd	%xmm5,%xmm3
3530	movdqa	%xmm2,96(%rax)
3531	movdqa	%xmm4,%xmm2
3532	movdqa	%xmm3,112(%rax)	/* mask 15: all 256 mask bytes are now in place */
3533	jmp	.Lgather
3534
3535.align	32
3536.Lgather:
3537	pxor	%xmm4,%xmm4	/* two accumulators, cleared for every row */
3538	pxor	%xmm5,%xmm5
3539	movdqa	-128(%r11),%xmm0	/* row bytes 0..63 */
3540	movdqa	-112(%r11),%xmm1
3541	movdqa	-96(%r11),%xmm2
3542	pand	-128(%rax),%xmm0
3543	movdqa	-80(%r11),%xmm3
3544	pand	-112(%rax),%xmm1
3545	por	%xmm0,%xmm4
3546	pand	-96(%rax),%xmm2
3547	por	%xmm1,%xmm5
3548	pand	-80(%rax),%xmm3
3549	por	%xmm2,%xmm4
3550	por	%xmm3,%xmm5
3551	movdqa	-64(%r11),%xmm0	/* row bytes 64..127 */
3552	movdqa	-48(%r11),%xmm1
3553	movdqa	-32(%r11),%xmm2
3554	pand	-64(%rax),%xmm0
3555	movdqa	-16(%r11),%xmm3
3556	pand	-48(%rax),%xmm1
3557	por	%xmm0,%xmm4
3558	pand	-32(%rax),%xmm2
3559	por	%xmm1,%xmm5
3560	pand	-16(%rax),%xmm3
3561	por	%xmm2,%xmm4
3562	por	%xmm3,%xmm5
3563	movdqa	0(%r11),%xmm0	/* row bytes 128..191 */
3564	movdqa	16(%r11),%xmm1
3565	movdqa	32(%r11),%xmm2
3566	pand	0(%rax),%xmm0
3567	movdqa	48(%r11),%xmm3
3568	pand	16(%rax),%xmm1
3569	por	%xmm0,%xmm4
3570	pand	32(%rax),%xmm2
3571	por	%xmm1,%xmm5
3572	pand	48(%rax),%xmm3
3573	por	%xmm2,%xmm4
3574	por	%xmm3,%xmm5
3575	movdqa	64(%r11),%xmm0	/* row bytes 192..255 */
3576	movdqa	80(%r11),%xmm1
3577	movdqa	96(%r11),%xmm2
3578	pand	64(%rax),%xmm0
3579	movdqa	112(%r11),%xmm3
3580	pand	80(%rax),%xmm1
3581	por	%xmm0,%xmm4
3582	pand	96(%rax),%xmm2
3583	por	%xmm1,%xmm5
3584	pand	112(%rax),%xmm3
3585	por	%xmm2,%xmm4
3586	por	%xmm3,%xmm5
3587	por	%xmm5,%xmm4	/* merge the two accumulators */
3588	leaq	256(%r11),%r11	/* next row */
3589	pshufd	$0x4e,%xmm4,%xmm0	/* swap the 64-bit halves */
3590	por	%xmm4,%xmm0	/* low half now holds the selected word */
3591	movq	%xmm0,(%rdi)
3592	leaq	8(%rdi),%rdi
3593	subl	$1,%esi
3594	jnz	.Lgather
3595
3596	leaq	(%r10),%rsp	/* restore the entry stack pointer */
3597	.byte	0xf3,0xc3	/* rep ret */
3598.LSEH_end_bn_gather5:
3599.cfi_endproc
3600.size	bn_gather5,.-bn_gather5
/*
 * .Linc -- two 128-bit constants used to build the slot-selection masks
 * (bn_gather5 loads them from offsets 0 and 16):
 *   +0   starting lane numbers {0,0,1,1}: one slot number per 64-bit half
 *   +16  per-step increment {2,2,2,2}
 * They are loaded with movdqa, hence the alignment directive.
 * The .byte run that follows is the NUL-terminated ASCII string
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
 * <appro@openssl.org>".
 */
3601.align	64
3602.Linc:
3603.long	0,0, 1,1
3604.long	2,2, 2,2
3605.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
3606