/* xref: /freebsd/sys/crypto/openssl/amd64/x86_64-mont5.S (revision c0855eaa3ee9614804b6bd6a255aa9f71e095f43) */
/* Do not modify. This file is auto-generated from x86_64-mont5.pl. */
.text

/*
 * bn_mul_mont_gather5 -- word-serial multiply-and-reduce whose multiplier
 * words are fetched from a table with a masked gather.
 *
 * NOTE(review): the C prototype is not visible in this file; it is presumably
 * OpenSSL's
 *   int bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap,
 *                           const BN_ULONG *table, const BN_ULONG *np,
 *                           const BN_ULONG *n0, int num, int power);
 * -- confirm against the caller.
 *
 * Register use as read from the code below (SysV AMD64 argument order):
 *   %rdi      result words are stored here
 *   %rsi      first operand, read one 64-bit word at a time
 *   %rdx      table base; one 64-bit multiplier word is gathered from each
 *             256-byte row
 *   %rcx      second word vector, multiplied by the per-row factor in %rbp
 *             (presumably the modulus)
 *   %r8       pointer to a 64-bit constant, dereferenced once
 *   %r9d      word count (zero-extended on entry)
 *   8(%rsp)   32-bit selector, the first stack argument
 * Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are saved and restored.
 *
 * Dispatch: when the word count is a multiple of 8 control leaves through
 * .Lmul4x_enter (inside bn_mul4x_mont_gather5) with the capability word from
 * OPENSSL_ia32cap_P+8 in %r11d; otherwise the generic path below runs.
 *
 * Stack frame of the generic path (after the page walk):
 *   (%rsp) .. (%rsp,%r9,8)     scratch vector tp[0..num]
 *   8(%rsp,%r9,8)              caller's %rsp as it was on entry
 *   above that, 16-aligned     sixteen 16-byte selection masks
 */
.globl	bn_mul_mont_gather5
.type	bn_mul_mont_gather5,@function
.align	64
bn_mul_mont_gather5:
.cfi_startproc
	/* Writing %r9d zero-extends the word count into all of %r9. */
	movl	%r9d,%r9d
	/* %rax keeps the entry %rsp; it is the CFA base until it is spilled. */
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	/* Word count not a multiple of 8: take the generic path. */
	testl	$7,%r9d
	jnz	.Lmul_enter
	/* Otherwise hand over to the 4x code with the capability word in %r11d. */
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	/* Fetch the 32-bit selector before the pushes move %rsp. */
	movd	8(%rsp),%xmm5
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/*
	 * %r10 = (%rsp - 280 - 8*num) rounded down to a 1024-byte boundary:
	 * the new stack top.  The two negq instructions only borrow %r9 as a
	 * negative index for the leaq.
	 */
	negq	%r9
	movq	%rsp,%r11
	leaq	-280(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/*
	 * Page walk: %rsp is first placed a whole multiple of 4096 bytes above
	 * %r10, then lowered 4096 bytes at a time down to %r10, loading one
	 * word at every stop.  The loaded value in %r11 is never used; the
	 * loads exist only to touch each step of the newly claimed stack.
	 */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	leaq	.Linc(%rip),%r10
	/* Spill the entry %rsp just past tp[num]. */
	movq	%rax,8(%rsp,%r9,8)
	/*
	 * DW_CFA_def_cfa_expression:
	 *   breg7(%rsp)+8, breg9(%r9)+0, lit8, mul, plus, deref, plus_uconst 8
	 * i.e. CFA = *(%rsp + 8 + %r9*8) + 8, matching the spill above.
	 */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:

	/*
	 * Build sixteen 16-byte masks, mask[k] stored at 112+16*k(%r10).
	 * A running dword counter vector is compared (pcmpeqd) against the
	 * selector broadcast to all four lanes of %xmm5, so a lane becomes
	 * all-ones exactly where counter == selector.
	 * NOTE(review): .Linc is defined outside this view; the scheme implies
	 * it holds the counter start in its first 16 bytes and the per-step
	 * increment in the next 16 (presumably {0,0,1,1} and {2,2,2,2}, giving
	 * one mask per pair of 64-bit table entries) -- confirm.
	 */
	leaq	128(%rdx),%r12
	movdqa	0(%r10),%xmm0
	movdqa	16(%r10),%xmm1
	leaq	24-112(%rsp,%r9,8),%r10
	andq	$-16,%r10

	pshufd	$0,%xmm5,%xmm5
	movdqa	%xmm1,%xmm4
	movdqa	%xmm1,%xmm2
	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	/* 0x67 prefix byte on a register-only instruction; presumably padding. */
.byte	0x67
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,112(%r10)		/* mask[0] */
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,128(%r10)		/* mask[1] */
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,144(%r10)		/* mask[2] */
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,160(%r10)		/* mask[3] */
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,176(%r10)		/* mask[4] */
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,192(%r10)		/* mask[5] */
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,208(%r10)		/* mask[6] */
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,224(%r10)		/* mask[7] */
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,240(%r10)		/* mask[8] */
	movdqa	%xmm4,%xmm0

	paddd	%xmm2,%xmm3
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,256(%r10)		/* mask[9] */
	movdqa	%xmm4,%xmm1

	paddd	%xmm3,%xmm0
	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,272(%r10)		/* mask[10] */
	movdqa	%xmm4,%xmm2

	paddd	%xmm0,%xmm1
	pcmpeqd	%xmm5,%xmm0
	movdqa	%xmm3,288(%r10)		/* mask[11] */
	movdqa	%xmm4,%xmm3
	paddd	%xmm1,%xmm2
	pcmpeqd	%xmm5,%xmm1
	movdqa	%xmm0,304(%r10)		/* mask[12] */

	paddd	%xmm2,%xmm3
.byte	0x67
	pcmpeqd	%xmm5,%xmm2
	movdqa	%xmm1,320(%r10)		/* mask[13] */

	pcmpeqd	%xmm5,%xmm3
	movdqa	%xmm2,336(%r10)		/* mask[14] */
	/*
	 * Gather from table row 0 (%r12 = table + 128, so displacements
	 * -128..112 span the 256-byte row).  mask[12..15] are still live in
	 * %xmm0-%xmm3 and are applied straight from registers; the other
	 * twelve are re-read from the stack.  All sixteen chunks are loaded
	 * unconditionally, so the addresses touched do not depend on the
	 * selector.
	 */
	pand	64(%r12),%xmm0

	pand	80(%r12),%xmm1
	pand	96(%r12),%xmm2
	movdqa	%xmm3,352(%r10)		/* mask[15] */
	pand	112(%r12),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-128(%r12),%xmm4
	movdqa	-112(%r12),%xmm5
	movdqa	-96(%r12),%xmm2
	pand	112(%r10),%xmm4
	movdqa	-80(%r12),%xmm3
	pand	128(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	144(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	160(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	-64(%r12),%xmm4
	movdqa	-48(%r12),%xmm5
	movdqa	-32(%r12),%xmm2
	pand	176(%r10),%xmm4
	movdqa	-16(%r12),%xmm3
	pand	192(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	208(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	224(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	movdqa	0(%r12),%xmm4
	movdqa	16(%r12),%xmm5
	movdqa	32(%r12),%xmm2
	pand	240(%r10),%xmm4
	movdqa	48(%r12),%xmm3
	pand	256(%r10),%xmm5
	por	%xmm4,%xmm0
	pand	272(%r10),%xmm2
	por	%xmm5,%xmm1
	pand	288(%r10),%xmm3
	por	%xmm2,%xmm0
	por	%xmm3,%xmm1
	por	%xmm1,%xmm0
	/* Swap the two 64-bit halves and OR them so the low qword holds the hit. */
	pshufd	$0x4e,%xmm0,%xmm1
	por	%xmm1,%xmm0
	/* Advance to the next 256-byte table row. */
	leaq	256(%r12),%r12
	/* Hand-encoded 66 48 0F 7E C3 = movq %xmm0,%rbx: multiplier word b. */
.byte	102,72,15,126,195

	/* %r8 now holds the 64-bit constant itself rather than its address. */
	movq	(%r8),%r8
	movq	(%rsi),%rax

	/* %r14 = outer (row) counter, %r15 = inner (word) index. */
	xorq	%r14,%r14
	xorq	%r15,%r15

	movq	%r8,%rbp
	mulq	%rbx			/* %rdx:%rax = ap[0] * b */
	movq	%rax,%r10
	movq	(%rcx),%rax

	/* %rbp = constant * low word of the product: the reduction factor. */
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp			/* %rdx:%rax = np[0] * factor */
	/* Only the carry out of this low-word sum is used below. */
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

	/*
	 * First pass (row 0): for j = 1 .. num-1 accumulate ap[j]*b and
	 * np[j]*factor and store the combined word to tp[j-1], which is
	 * -8(%rsp,%r15,8) before the index increment and -16(%rsp,%r15,8)
	 * after it.
	 */
.align	16
.L1st:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st


	/* Tail of the first pass: store tp[num-2], tp[num-1] and carry tp[num]. */
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
	/*
	 * Outer loop, once per remaining table row (%r14 = 1 .. num-1):
	 * gather the next multiplier word with the stored masks, then run the
	 * same multiply/accumulate pass with the previous tp[] added in.
	 */
.align	16
.Louter:
	/* %rdx-128 is the same 16-aligned address as mask[0] above. */
	leaq	24+128(%rsp,%r9,8),%rdx
	andq	$-16,%rdx
	pxor	%xmm4,%xmm4
	pxor	%xmm5,%xmm5
	movdqa	-128(%r12),%xmm0
	movdqa	-112(%r12),%xmm1
	movdqa	-96(%r12),%xmm2
	movdqa	-80(%r12),%xmm3
	pand	-128(%rdx),%xmm0
	pand	-112(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-80(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	-64(%r12),%xmm0
	movdqa	-48(%r12),%xmm1
	movdqa	-32(%r12),%xmm2
	movdqa	-16(%r12),%xmm3
	pand	-64(%rdx),%xmm0
	pand	-48(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	-32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	-16(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	0(%r12),%xmm0
	movdqa	16(%r12),%xmm1
	movdqa	32(%r12),%xmm2
	movdqa	48(%r12),%xmm3
	pand	0(%rdx),%xmm0
	pand	16(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	32(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	48(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	movdqa	64(%r12),%xmm0
	movdqa	80(%r12),%xmm1
	movdqa	96(%r12),%xmm2
	movdqa	112(%r12),%xmm3
	pand	64(%rdx),%xmm0
	pand	80(%rdx),%xmm1
	por	%xmm0,%xmm4
	pand	96(%rdx),%xmm2
	por	%xmm1,%xmm5
	pand	112(%rdx),%xmm3
	por	%xmm2,%xmm4
	por	%xmm3,%xmm5
	por	%xmm5,%xmm4
	/* Fold the two 64-bit halves, as in the first gather. */
	pshufd	$0x4e,%xmm4,%xmm0
	por	%xmm4,%xmm0
	leaq	256(%r12),%r12

	movq	(%rsi),%rax
	/* movq %xmm0,%rbx: the multiplier word for this row. */
.byte	102,72,15,126,195

	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10		/* tp[0] */

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	/* Reduction factor for this row: constant * (tp[0] + low(ap[0]*b)). */
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	/* Only the carry of this sum is used; %r10 is reloaded right after. */
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10		/* tp[1] */
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	/* Tail of the pass: fold in the old tp[num] and store the top words. */
	addq	%rax,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r9,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r9,8)
	movq	%rdx,%r13

	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/*
	 * Final subtraction: the result words become tp[i] - np[i] with the
	 * borrow chained through CF.  The xorq clears CF for the first sbbq;
	 * inside the loop only leaq, movq and decq follow the sbbq, and none
	 * of them modifies CF.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	leaq	(%rsp),%rsi
	movq	%r9,%r15
	jmp	.Lsub
.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsi,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/*
	 * %rax = tp[num] minus the final borrow and %rbx = ~%rax; the pair is
	 * used as complementary select masks (%rax is expected to be 0 or
	 * all-ones here).
	 */
	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

	/*
	 * Branch-free select per word:
	 *   result[i] = (result[i] & %rbx) | (tp[i] & %rax)
	 * Each tp[i] is overwritten with the loop index on the way, so the
	 * intermediate value does not remain on the stack.
	 */
.Lcopy:
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r14,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Reload the entry %rsp spilled after the page walk. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return value 1 */

	/* Restore callee-saved registers from the slots the pushes filled. */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	/* f3 c3 = "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul_mont_gather5,.-bn_mul_mont_gather5
/*
 * bn_mul4x_mont_gather5 -- frame setup and teardown around mul4x_internal,
 * used when the word count is a multiple of 8.
 *
 * It takes the same register arguments as bn_mul_mont_gather5, which jumps
 * to .Lmul4x_enter with %rax = entry %rsp and %r11d = the capability word
 * loaded from OPENSSL_ia32cap_P+8.  No .globl directive for this symbol
 * appears in the visible part of the file.
 *
 * NOTE(review): when control arrives through the bn_mul4x_mont_gather5 label
 * itself, no visible instruction loads %r11d before it is tested at
 * .Lmul4x_enter -- confirm that nothing enters this way, or that the caller
 * sets %r11d.
 *
 * Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are saved and restored.
 */
.type	bn_mul4x_mont_gather5,@function
.align	32
bn_mul4x_mont_gather5:
.cfi_startproc
	/* 0x67 prefix byte on a register-only instruction; presumably padding. */
.byte	0x67
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	/*
	 * If bits 3, 8 and 19 (mask 0x80108) are all set in the capability
	 * word, continue at .Lmulx4x_enter, which is defined outside this
	 * view.
	 * NOTE(review): presumably the BMI1/BMI2/ADX feature bits -- confirm
	 * against the OPENSSL_ia32cap documentation.
	 */
	andl	$0x80108,%r11d
	cmpl	$0x80108,%r11d
	je	.Lmulx4x_enter
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56
.Lmul4x_prologue:

	/*
	 * %r9 = word count * 8 (byte length; the 32-bit shift zero-extends),
	 * %r10 = 3 * byte length, then %r9 is negated for use as an index.
	 */
.byte	0x67
	shll	$3,%r9d
	leaq	(%r9,%r9,2),%r10
	negq	%r9

	/*
	 * Choose the frame base in %rbp.  The frame needs 320 + 2*bytes.
	 * %r11 = (candidate base - %rdi) mod 4096, and the base is lowered
	 * according to how that distance compares with 3*bytes.
	 * NOTE(review): this looks like it keeps the frame from sharing low
	 * 12 address bits with the buffer at %rdi -- confirm against the
	 * generator script x86_64-mont5.pl.
	 */
	leaq	-320(%rsp,%r9,2),%r11
	movq	%rsp,%rbp
	subq	%rdi,%r11
	andq	$4095,%r11
	cmpq	%r11,%r10
	jb	.Lmul4xsp_alt
	/* 3*bytes >= distance: lower the base by the distance as well. */
	subq	%r11,%rbp
	leaq	-320(%rbp,%r9,2),%rbp
	jmp	.Lmul4xsp_done

.align	32
.Lmul4xsp_alt:
	/*
	 * Lower the base by max(0, distance - (4096 - 320 - 2*bytes)).
	 * %r10 is zeroed with movq rather than xor so that the carry flag
	 * from the subq is still intact for cmovc.
	 */
	leaq	4096-320(,%r9,2),%r10
	leaq	-320(%rbp,%r9,2),%rbp
	subq	%r10,%r11
	movq	$0,%r10
	cmovcq	%r10,%r11
	subq	%r11,%rbp
.Lmul4xsp_done:
	/* Align the frame base down to 64 bytes. */
	andq	$-64,%rbp
	/*
	 * Page walk: place %rsp a whole multiple of 4096 bytes above %rbp,
	 * then lower it 4096 bytes at a time down to %rbp, loading one word
	 * at every stop.  The value loaded into %r10 is not used.
	 */
	movq	%rsp,%r11
	subq	%rbp,%r11
	andq	$-4096,%r11
	leaq	(%r11,%rbp,1),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r10
	cmpq	%rbp,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	/* Back to the positive byte length. */
	negq	%r9

	/* Spill the entry %rsp; the epilogue reloads it from this slot. */
	movq	%rax,40(%rsp)
	/*
	 * DW_CFA_def_cfa_expression: breg7(%rsp)+40, deref, plus_uconst 8
	 * i.e. CFA = *(%rsp + 40) + 8, matching the spill above.
	 */
.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
.Lmul4x_body:

	/*
	 * %rax still holds the entry %rsp at this call; mul4x_internal reads
	 * the 32-bit selector from 8(%rax) near its start.
	 */
	call	mul4x_internal

	movq	40(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax			/* return value 1 */

	/* Restore callee-saved registers from the slots the pushes filled. */
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	/* f3 c3 = "rep ret". */
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
550bc3d5698SJohn Baldwin
551bc3d5698SJohn Baldwin.type	mul4x_internal,@function
552bc3d5698SJohn Baldwin.align	32
553bc3d5698SJohn Baldwinmul4x_internal:
554bc3d5698SJohn Baldwin.cfi_startproc
555bc3d5698SJohn Baldwin	shlq	$5,%r9
556bc3d5698SJohn Baldwin	movd	8(%rax),%xmm5
557bc3d5698SJohn Baldwin	leaq	.Linc(%rip),%rax
558bc3d5698SJohn Baldwin	leaq	128(%rdx,%r9,1),%r13
559bc3d5698SJohn Baldwin	shrq	$5,%r9
560bc3d5698SJohn Baldwin	movdqa	0(%rax),%xmm0
561bc3d5698SJohn Baldwin	movdqa	16(%rax),%xmm1
562bc3d5698SJohn Baldwin	leaq	88-112(%rsp,%r9,1),%r10
563bc3d5698SJohn Baldwin	leaq	128(%rdx),%r12
564bc3d5698SJohn Baldwin
565bc3d5698SJohn Baldwin	pshufd	$0,%xmm5,%xmm5
566bc3d5698SJohn Baldwin	movdqa	%xmm1,%xmm4
567bc3d5698SJohn Baldwin.byte	0x67,0x67
568bc3d5698SJohn Baldwin	movdqa	%xmm1,%xmm2
569bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
570bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
571bc3d5698SJohn Baldwin.byte	0x67
572bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
573bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
574bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
575bc3d5698SJohn Baldwin	movdqa	%xmm0,112(%r10)
576bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
577bc3d5698SJohn Baldwin
578bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
579bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
580bc3d5698SJohn Baldwin	movdqa	%xmm1,128(%r10)
581bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
582bc3d5698SJohn Baldwin
583bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
584bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
585bc3d5698SJohn Baldwin	movdqa	%xmm2,144(%r10)
586bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
587bc3d5698SJohn Baldwin
588bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
589bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
590bc3d5698SJohn Baldwin	movdqa	%xmm3,160(%r10)
591bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
592bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
593bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
594bc3d5698SJohn Baldwin	movdqa	%xmm0,176(%r10)
595bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
596bc3d5698SJohn Baldwin
597bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
598bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
599bc3d5698SJohn Baldwin	movdqa	%xmm1,192(%r10)
600bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
601bc3d5698SJohn Baldwin
602bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
603bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
604bc3d5698SJohn Baldwin	movdqa	%xmm2,208(%r10)
605bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
606bc3d5698SJohn Baldwin
607bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
608bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
609bc3d5698SJohn Baldwin	movdqa	%xmm3,224(%r10)
610bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
611bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
612bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
613bc3d5698SJohn Baldwin	movdqa	%xmm0,240(%r10)
614bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
615bc3d5698SJohn Baldwin
616bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
617bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
618bc3d5698SJohn Baldwin	movdqa	%xmm1,256(%r10)
619bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
620bc3d5698SJohn Baldwin
621bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
622bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
623bc3d5698SJohn Baldwin	movdqa	%xmm2,272(%r10)
624bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
625bc3d5698SJohn Baldwin
626bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
627bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
628bc3d5698SJohn Baldwin	movdqa	%xmm3,288(%r10)
629bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
630bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
631bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
632bc3d5698SJohn Baldwin	movdqa	%xmm0,304(%r10)
633bc3d5698SJohn Baldwin
634bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
635bc3d5698SJohn Baldwin.byte	0x67
636bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
637bc3d5698SJohn Baldwin	movdqa	%xmm1,320(%r10)
638bc3d5698SJohn Baldwin
639bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
640bc3d5698SJohn Baldwin	movdqa	%xmm2,336(%r10)
641bc3d5698SJohn Baldwin	pand	64(%r12),%xmm0
642bc3d5698SJohn Baldwin
643bc3d5698SJohn Baldwin	pand	80(%r12),%xmm1
644bc3d5698SJohn Baldwin	pand	96(%r12),%xmm2
645bc3d5698SJohn Baldwin	movdqa	%xmm3,352(%r10)
646bc3d5698SJohn Baldwin	pand	112(%r12),%xmm3
647bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
648bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
649bc3d5698SJohn Baldwin	movdqa	-128(%r12),%xmm4
650bc3d5698SJohn Baldwin	movdqa	-112(%r12),%xmm5
651bc3d5698SJohn Baldwin	movdqa	-96(%r12),%xmm2
652bc3d5698SJohn Baldwin	pand	112(%r10),%xmm4
653bc3d5698SJohn Baldwin	movdqa	-80(%r12),%xmm3
654bc3d5698SJohn Baldwin	pand	128(%r10),%xmm5
655bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
656bc3d5698SJohn Baldwin	pand	144(%r10),%xmm2
657bc3d5698SJohn Baldwin	por	%xmm5,%xmm1
658bc3d5698SJohn Baldwin	pand	160(%r10),%xmm3
659bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
660bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
661bc3d5698SJohn Baldwin	movdqa	-64(%r12),%xmm4
662bc3d5698SJohn Baldwin	movdqa	-48(%r12),%xmm5
663bc3d5698SJohn Baldwin	movdqa	-32(%r12),%xmm2
664bc3d5698SJohn Baldwin	pand	176(%r10),%xmm4
665bc3d5698SJohn Baldwin	movdqa	-16(%r12),%xmm3
666bc3d5698SJohn Baldwin	pand	192(%r10),%xmm5
667bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
668bc3d5698SJohn Baldwin	pand	208(%r10),%xmm2
669bc3d5698SJohn Baldwin	por	%xmm5,%xmm1
670bc3d5698SJohn Baldwin	pand	224(%r10),%xmm3
671bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
672bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
673bc3d5698SJohn Baldwin	movdqa	0(%r12),%xmm4
674bc3d5698SJohn Baldwin	movdqa	16(%r12),%xmm5
675bc3d5698SJohn Baldwin	movdqa	32(%r12),%xmm2
676bc3d5698SJohn Baldwin	pand	240(%r10),%xmm4
677bc3d5698SJohn Baldwin	movdqa	48(%r12),%xmm3
678bc3d5698SJohn Baldwin	pand	256(%r10),%xmm5
679bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
680bc3d5698SJohn Baldwin	pand	272(%r10),%xmm2
681bc3d5698SJohn Baldwin	por	%xmm5,%xmm1
682bc3d5698SJohn Baldwin	pand	288(%r10),%xmm3
683bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
684bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
685bc3d5698SJohn Baldwin	por	%xmm1,%xmm0
686bc3d5698SJohn Baldwin	pshufd	$0x4e,%xmm0,%xmm1
687bc3d5698SJohn Baldwin	por	%xmm1,%xmm0
688bc3d5698SJohn Baldwin	leaq	256(%r12),%r12
689bc3d5698SJohn Baldwin.byte	102,72,15,126,195
690bc3d5698SJohn Baldwin
691bc3d5698SJohn Baldwin	movq	%r13,16+8(%rsp)
692bc3d5698SJohn Baldwin	movq	%rdi,56+8(%rsp)
693bc3d5698SJohn Baldwin
694bc3d5698SJohn Baldwin	movq	(%r8),%r8
695bc3d5698SJohn Baldwin	movq	(%rsi),%rax
696bc3d5698SJohn Baldwin	leaq	(%rsi,%r9,1),%rsi
697bc3d5698SJohn Baldwin	negq	%r9
698bc3d5698SJohn Baldwin
699bc3d5698SJohn Baldwin	movq	%r8,%rbp
700bc3d5698SJohn Baldwin	mulq	%rbx
701bc3d5698SJohn Baldwin	movq	%rax,%r10
702bc3d5698SJohn Baldwin	movq	(%rcx),%rax
703bc3d5698SJohn Baldwin
704bc3d5698SJohn Baldwin	imulq	%r10,%rbp
705bc3d5698SJohn Baldwin	leaq	64+8(%rsp),%r14
706bc3d5698SJohn Baldwin	movq	%rdx,%r11
707bc3d5698SJohn Baldwin
708bc3d5698SJohn Baldwin	mulq	%rbp
709bc3d5698SJohn Baldwin	addq	%rax,%r10
710bc3d5698SJohn Baldwin	movq	8(%rsi,%r9,1),%rax
711bc3d5698SJohn Baldwin	adcq	$0,%rdx
712bc3d5698SJohn Baldwin	movq	%rdx,%rdi
713bc3d5698SJohn Baldwin
714bc3d5698SJohn Baldwin	mulq	%rbx
715bc3d5698SJohn Baldwin	addq	%rax,%r11
716bc3d5698SJohn Baldwin	movq	8(%rcx),%rax
717bc3d5698SJohn Baldwin	adcq	$0,%rdx
718bc3d5698SJohn Baldwin	movq	%rdx,%r10
719bc3d5698SJohn Baldwin
720bc3d5698SJohn Baldwin	mulq	%rbp
721bc3d5698SJohn Baldwin	addq	%rax,%rdi
722bc3d5698SJohn Baldwin	movq	16(%rsi,%r9,1),%rax
723bc3d5698SJohn Baldwin	adcq	$0,%rdx
724bc3d5698SJohn Baldwin	addq	%r11,%rdi
725bc3d5698SJohn Baldwin	leaq	32(%r9),%r15
726bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
727bc3d5698SJohn Baldwin	adcq	$0,%rdx
728bc3d5698SJohn Baldwin	movq	%rdi,(%r14)
729bc3d5698SJohn Baldwin	movq	%rdx,%r13
730bc3d5698SJohn Baldwin	jmp	.L1st4x
731bc3d5698SJohn Baldwin
732bc3d5698SJohn Baldwin.align	32
733bc3d5698SJohn Baldwin.L1st4x:
734bc3d5698SJohn Baldwin	mulq	%rbx
735bc3d5698SJohn Baldwin	addq	%rax,%r10
736bc3d5698SJohn Baldwin	movq	-16(%rcx),%rax
737bc3d5698SJohn Baldwin	leaq	32(%r14),%r14
738bc3d5698SJohn Baldwin	adcq	$0,%rdx
739bc3d5698SJohn Baldwin	movq	%rdx,%r11
740bc3d5698SJohn Baldwin
741bc3d5698SJohn Baldwin	mulq	%rbp
742bc3d5698SJohn Baldwin	addq	%rax,%r13
743bc3d5698SJohn Baldwin	movq	-8(%rsi,%r15,1),%rax
744bc3d5698SJohn Baldwin	adcq	$0,%rdx
745bc3d5698SJohn Baldwin	addq	%r10,%r13
746bc3d5698SJohn Baldwin	adcq	$0,%rdx
747bc3d5698SJohn Baldwin	movq	%r13,-24(%r14)
748bc3d5698SJohn Baldwin	movq	%rdx,%rdi
749bc3d5698SJohn Baldwin
750bc3d5698SJohn Baldwin	mulq	%rbx
751bc3d5698SJohn Baldwin	addq	%rax,%r11
752bc3d5698SJohn Baldwin	movq	-8(%rcx),%rax
753bc3d5698SJohn Baldwin	adcq	$0,%rdx
754bc3d5698SJohn Baldwin	movq	%rdx,%r10
755bc3d5698SJohn Baldwin
756bc3d5698SJohn Baldwin	mulq	%rbp
757bc3d5698SJohn Baldwin	addq	%rax,%rdi
758bc3d5698SJohn Baldwin	movq	(%rsi,%r15,1),%rax
759bc3d5698SJohn Baldwin	adcq	$0,%rdx
760bc3d5698SJohn Baldwin	addq	%r11,%rdi
761bc3d5698SJohn Baldwin	adcq	$0,%rdx
762bc3d5698SJohn Baldwin	movq	%rdi,-16(%r14)
763bc3d5698SJohn Baldwin	movq	%rdx,%r13
764bc3d5698SJohn Baldwin
765bc3d5698SJohn Baldwin	mulq	%rbx
766bc3d5698SJohn Baldwin	addq	%rax,%r10
767bc3d5698SJohn Baldwin	movq	0(%rcx),%rax
768bc3d5698SJohn Baldwin	adcq	$0,%rdx
769bc3d5698SJohn Baldwin	movq	%rdx,%r11
770bc3d5698SJohn Baldwin
771bc3d5698SJohn Baldwin	mulq	%rbp
772bc3d5698SJohn Baldwin	addq	%rax,%r13
773bc3d5698SJohn Baldwin	movq	8(%rsi,%r15,1),%rax
774bc3d5698SJohn Baldwin	adcq	$0,%rdx
775bc3d5698SJohn Baldwin	addq	%r10,%r13
776bc3d5698SJohn Baldwin	adcq	$0,%rdx
777bc3d5698SJohn Baldwin	movq	%r13,-8(%r14)
778bc3d5698SJohn Baldwin	movq	%rdx,%rdi
779bc3d5698SJohn Baldwin
780bc3d5698SJohn Baldwin	mulq	%rbx
781bc3d5698SJohn Baldwin	addq	%rax,%r11
782bc3d5698SJohn Baldwin	movq	8(%rcx),%rax
783bc3d5698SJohn Baldwin	adcq	$0,%rdx
784bc3d5698SJohn Baldwin	movq	%rdx,%r10
785bc3d5698SJohn Baldwin
786bc3d5698SJohn Baldwin	mulq	%rbp
787bc3d5698SJohn Baldwin	addq	%rax,%rdi
788bc3d5698SJohn Baldwin	movq	16(%rsi,%r15,1),%rax
789bc3d5698SJohn Baldwin	adcq	$0,%rdx
790bc3d5698SJohn Baldwin	addq	%r11,%rdi
791bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
792bc3d5698SJohn Baldwin	adcq	$0,%rdx
793bc3d5698SJohn Baldwin	movq	%rdi,(%r14)
794bc3d5698SJohn Baldwin	movq	%rdx,%r13
795bc3d5698SJohn Baldwin
796bc3d5698SJohn Baldwin	addq	$32,%r15
797bc3d5698SJohn Baldwin	jnz	.L1st4x
798bc3d5698SJohn Baldwin
799bc3d5698SJohn Baldwin	mulq	%rbx
800bc3d5698SJohn Baldwin	addq	%rax,%r10
801bc3d5698SJohn Baldwin	movq	-16(%rcx),%rax
802bc3d5698SJohn Baldwin	leaq	32(%r14),%r14
803bc3d5698SJohn Baldwin	adcq	$0,%rdx
804bc3d5698SJohn Baldwin	movq	%rdx,%r11
805bc3d5698SJohn Baldwin
806bc3d5698SJohn Baldwin	mulq	%rbp
807bc3d5698SJohn Baldwin	addq	%rax,%r13
808bc3d5698SJohn Baldwin	movq	-8(%rsi),%rax
809bc3d5698SJohn Baldwin	adcq	$0,%rdx
810bc3d5698SJohn Baldwin	addq	%r10,%r13
811bc3d5698SJohn Baldwin	adcq	$0,%rdx
812bc3d5698SJohn Baldwin	movq	%r13,-24(%r14)
813bc3d5698SJohn Baldwin	movq	%rdx,%rdi
814bc3d5698SJohn Baldwin
815bc3d5698SJohn Baldwin	mulq	%rbx
816bc3d5698SJohn Baldwin	addq	%rax,%r11
817bc3d5698SJohn Baldwin	movq	-8(%rcx),%rax
818bc3d5698SJohn Baldwin	adcq	$0,%rdx
819bc3d5698SJohn Baldwin	movq	%rdx,%r10
820bc3d5698SJohn Baldwin
821bc3d5698SJohn Baldwin	mulq	%rbp
822bc3d5698SJohn Baldwin	addq	%rax,%rdi
823bc3d5698SJohn Baldwin	movq	(%rsi,%r9,1),%rax
824bc3d5698SJohn Baldwin	adcq	$0,%rdx
825bc3d5698SJohn Baldwin	addq	%r11,%rdi
826bc3d5698SJohn Baldwin	adcq	$0,%rdx
827bc3d5698SJohn Baldwin	movq	%rdi,-16(%r14)
828bc3d5698SJohn Baldwin	movq	%rdx,%r13
829bc3d5698SJohn Baldwin
830bc3d5698SJohn Baldwin	leaq	(%rcx,%r9,1),%rcx
831bc3d5698SJohn Baldwin
832bc3d5698SJohn Baldwin	xorq	%rdi,%rdi
833bc3d5698SJohn Baldwin	addq	%r10,%r13
834bc3d5698SJohn Baldwin	adcq	$0,%rdi
835bc3d5698SJohn Baldwin	movq	%r13,-8(%r14)
836bc3d5698SJohn Baldwin
837bc3d5698SJohn Baldwin	jmp	.Louter4x
838bc3d5698SJohn Baldwin
839bc3d5698SJohn Baldwin.align	32
840bc3d5698SJohn Baldwin.Louter4x:
841bc3d5698SJohn Baldwin	leaq	16+128(%r14),%rdx
842bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm4
843bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm5
844bc3d5698SJohn Baldwin	movdqa	-128(%r12),%xmm0
845bc3d5698SJohn Baldwin	movdqa	-112(%r12),%xmm1
846bc3d5698SJohn Baldwin	movdqa	-96(%r12),%xmm2
847bc3d5698SJohn Baldwin	movdqa	-80(%r12),%xmm3
848bc3d5698SJohn Baldwin	pand	-128(%rdx),%xmm0
849bc3d5698SJohn Baldwin	pand	-112(%rdx),%xmm1
850bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
851bc3d5698SJohn Baldwin	pand	-96(%rdx),%xmm2
852bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
853bc3d5698SJohn Baldwin	pand	-80(%rdx),%xmm3
854bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
855bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
856bc3d5698SJohn Baldwin	movdqa	-64(%r12),%xmm0
857bc3d5698SJohn Baldwin	movdqa	-48(%r12),%xmm1
858bc3d5698SJohn Baldwin	movdqa	-32(%r12),%xmm2
859bc3d5698SJohn Baldwin	movdqa	-16(%r12),%xmm3
860bc3d5698SJohn Baldwin	pand	-64(%rdx),%xmm0
861bc3d5698SJohn Baldwin	pand	-48(%rdx),%xmm1
862bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
863bc3d5698SJohn Baldwin	pand	-32(%rdx),%xmm2
864bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
865bc3d5698SJohn Baldwin	pand	-16(%rdx),%xmm3
866bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
867bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
868bc3d5698SJohn Baldwin	movdqa	0(%r12),%xmm0
869bc3d5698SJohn Baldwin	movdqa	16(%r12),%xmm1
870bc3d5698SJohn Baldwin	movdqa	32(%r12),%xmm2
871bc3d5698SJohn Baldwin	movdqa	48(%r12),%xmm3
872bc3d5698SJohn Baldwin	pand	0(%rdx),%xmm0
873bc3d5698SJohn Baldwin	pand	16(%rdx),%xmm1
874bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
875bc3d5698SJohn Baldwin	pand	32(%rdx),%xmm2
876bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
877bc3d5698SJohn Baldwin	pand	48(%rdx),%xmm3
878bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
879bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
880bc3d5698SJohn Baldwin	movdqa	64(%r12),%xmm0
881bc3d5698SJohn Baldwin	movdqa	80(%r12),%xmm1
882bc3d5698SJohn Baldwin	movdqa	96(%r12),%xmm2
883bc3d5698SJohn Baldwin	movdqa	112(%r12),%xmm3
884bc3d5698SJohn Baldwin	pand	64(%rdx),%xmm0
885bc3d5698SJohn Baldwin	pand	80(%rdx),%xmm1
886bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
887bc3d5698SJohn Baldwin	pand	96(%rdx),%xmm2
888bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
889bc3d5698SJohn Baldwin	pand	112(%rdx),%xmm3
890bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
891bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
892bc3d5698SJohn Baldwin	por	%xmm5,%xmm4
893bc3d5698SJohn Baldwin	pshufd	$0x4e,%xmm4,%xmm0
894bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
895bc3d5698SJohn Baldwin	leaq	256(%r12),%r12
896bc3d5698SJohn Baldwin.byte	102,72,15,126,195
897bc3d5698SJohn Baldwin
898bc3d5698SJohn Baldwin	movq	(%r14,%r9,1),%r10
899bc3d5698SJohn Baldwin	movq	%r8,%rbp
900bc3d5698SJohn Baldwin	mulq	%rbx
901bc3d5698SJohn Baldwin	addq	%rax,%r10
902bc3d5698SJohn Baldwin	movq	(%rcx),%rax
903bc3d5698SJohn Baldwin	adcq	$0,%rdx
904bc3d5698SJohn Baldwin
905bc3d5698SJohn Baldwin	imulq	%r10,%rbp
906bc3d5698SJohn Baldwin	movq	%rdx,%r11
907bc3d5698SJohn Baldwin	movq	%rdi,(%r14)
908bc3d5698SJohn Baldwin
909bc3d5698SJohn Baldwin	leaq	(%r14,%r9,1),%r14
910bc3d5698SJohn Baldwin
911bc3d5698SJohn Baldwin	mulq	%rbp
912bc3d5698SJohn Baldwin	addq	%rax,%r10
913bc3d5698SJohn Baldwin	movq	8(%rsi,%r9,1),%rax
914bc3d5698SJohn Baldwin	adcq	$0,%rdx
915bc3d5698SJohn Baldwin	movq	%rdx,%rdi
916bc3d5698SJohn Baldwin
917bc3d5698SJohn Baldwin	mulq	%rbx
918bc3d5698SJohn Baldwin	addq	%rax,%r11
919bc3d5698SJohn Baldwin	movq	8(%rcx),%rax
920bc3d5698SJohn Baldwin	adcq	$0,%rdx
921bc3d5698SJohn Baldwin	addq	8(%r14),%r11
922bc3d5698SJohn Baldwin	adcq	$0,%rdx
923bc3d5698SJohn Baldwin	movq	%rdx,%r10
924bc3d5698SJohn Baldwin
925bc3d5698SJohn Baldwin	mulq	%rbp
926bc3d5698SJohn Baldwin	addq	%rax,%rdi
927bc3d5698SJohn Baldwin	movq	16(%rsi,%r9,1),%rax
928bc3d5698SJohn Baldwin	adcq	$0,%rdx
929bc3d5698SJohn Baldwin	addq	%r11,%rdi
930bc3d5698SJohn Baldwin	leaq	32(%r9),%r15
931bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
932bc3d5698SJohn Baldwin	adcq	$0,%rdx
933bc3d5698SJohn Baldwin	movq	%rdx,%r13
934bc3d5698SJohn Baldwin	jmp	.Linner4x
935bc3d5698SJohn Baldwin
936bc3d5698SJohn Baldwin.align	32
937bc3d5698SJohn Baldwin.Linner4x:
938bc3d5698SJohn Baldwin	mulq	%rbx
939bc3d5698SJohn Baldwin	addq	%rax,%r10
940bc3d5698SJohn Baldwin	movq	-16(%rcx),%rax
941bc3d5698SJohn Baldwin	adcq	$0,%rdx
942bc3d5698SJohn Baldwin	addq	16(%r14),%r10
943bc3d5698SJohn Baldwin	leaq	32(%r14),%r14
944bc3d5698SJohn Baldwin	adcq	$0,%rdx
945bc3d5698SJohn Baldwin	movq	%rdx,%r11
946bc3d5698SJohn Baldwin
947bc3d5698SJohn Baldwin	mulq	%rbp
948bc3d5698SJohn Baldwin	addq	%rax,%r13
949bc3d5698SJohn Baldwin	movq	-8(%rsi,%r15,1),%rax
950bc3d5698SJohn Baldwin	adcq	$0,%rdx
951bc3d5698SJohn Baldwin	addq	%r10,%r13
952bc3d5698SJohn Baldwin	adcq	$0,%rdx
953bc3d5698SJohn Baldwin	movq	%rdi,-32(%r14)
954bc3d5698SJohn Baldwin	movq	%rdx,%rdi
955bc3d5698SJohn Baldwin
956bc3d5698SJohn Baldwin	mulq	%rbx
957bc3d5698SJohn Baldwin	addq	%rax,%r11
958bc3d5698SJohn Baldwin	movq	-8(%rcx),%rax
959bc3d5698SJohn Baldwin	adcq	$0,%rdx
960bc3d5698SJohn Baldwin	addq	-8(%r14),%r11
961bc3d5698SJohn Baldwin	adcq	$0,%rdx
962bc3d5698SJohn Baldwin	movq	%rdx,%r10
963bc3d5698SJohn Baldwin
964bc3d5698SJohn Baldwin	mulq	%rbp
965bc3d5698SJohn Baldwin	addq	%rax,%rdi
966bc3d5698SJohn Baldwin	movq	(%rsi,%r15,1),%rax
967bc3d5698SJohn Baldwin	adcq	$0,%rdx
968bc3d5698SJohn Baldwin	addq	%r11,%rdi
969bc3d5698SJohn Baldwin	adcq	$0,%rdx
970bc3d5698SJohn Baldwin	movq	%r13,-24(%r14)
971bc3d5698SJohn Baldwin	movq	%rdx,%r13
972bc3d5698SJohn Baldwin
973bc3d5698SJohn Baldwin	mulq	%rbx
974bc3d5698SJohn Baldwin	addq	%rax,%r10
975bc3d5698SJohn Baldwin	movq	0(%rcx),%rax
976bc3d5698SJohn Baldwin	adcq	$0,%rdx
977bc3d5698SJohn Baldwin	addq	(%r14),%r10
978bc3d5698SJohn Baldwin	adcq	$0,%rdx
979bc3d5698SJohn Baldwin	movq	%rdx,%r11
980bc3d5698SJohn Baldwin
981bc3d5698SJohn Baldwin	mulq	%rbp
982bc3d5698SJohn Baldwin	addq	%rax,%r13
983bc3d5698SJohn Baldwin	movq	8(%rsi,%r15,1),%rax
984bc3d5698SJohn Baldwin	adcq	$0,%rdx
985bc3d5698SJohn Baldwin	addq	%r10,%r13
986bc3d5698SJohn Baldwin	adcq	$0,%rdx
987bc3d5698SJohn Baldwin	movq	%rdi,-16(%r14)
988bc3d5698SJohn Baldwin	movq	%rdx,%rdi
989bc3d5698SJohn Baldwin
990bc3d5698SJohn Baldwin	mulq	%rbx
991bc3d5698SJohn Baldwin	addq	%rax,%r11
992bc3d5698SJohn Baldwin	movq	8(%rcx),%rax
993bc3d5698SJohn Baldwin	adcq	$0,%rdx
994bc3d5698SJohn Baldwin	addq	8(%r14),%r11
995bc3d5698SJohn Baldwin	adcq	$0,%rdx
996bc3d5698SJohn Baldwin	movq	%rdx,%r10
997bc3d5698SJohn Baldwin
998bc3d5698SJohn Baldwin	mulq	%rbp
999bc3d5698SJohn Baldwin	addq	%rax,%rdi
1000bc3d5698SJohn Baldwin	movq	16(%rsi,%r15,1),%rax
1001bc3d5698SJohn Baldwin	adcq	$0,%rdx
1002bc3d5698SJohn Baldwin	addq	%r11,%rdi
1003bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
1004bc3d5698SJohn Baldwin	adcq	$0,%rdx
1005bc3d5698SJohn Baldwin	movq	%r13,-8(%r14)
1006bc3d5698SJohn Baldwin	movq	%rdx,%r13
1007bc3d5698SJohn Baldwin
1008bc3d5698SJohn Baldwin	addq	$32,%r15
1009bc3d5698SJohn Baldwin	jnz	.Linner4x
1010bc3d5698SJohn Baldwin
1011bc3d5698SJohn Baldwin	mulq	%rbx
1012bc3d5698SJohn Baldwin	addq	%rax,%r10
1013bc3d5698SJohn Baldwin	movq	-16(%rcx),%rax
1014bc3d5698SJohn Baldwin	adcq	$0,%rdx
1015bc3d5698SJohn Baldwin	addq	16(%r14),%r10
1016bc3d5698SJohn Baldwin	leaq	32(%r14),%r14
1017bc3d5698SJohn Baldwin	adcq	$0,%rdx
1018bc3d5698SJohn Baldwin	movq	%rdx,%r11
1019bc3d5698SJohn Baldwin
1020bc3d5698SJohn Baldwin	mulq	%rbp
1021bc3d5698SJohn Baldwin	addq	%rax,%r13
1022bc3d5698SJohn Baldwin	movq	-8(%rsi),%rax
1023bc3d5698SJohn Baldwin	adcq	$0,%rdx
1024bc3d5698SJohn Baldwin	addq	%r10,%r13
1025bc3d5698SJohn Baldwin	adcq	$0,%rdx
1026bc3d5698SJohn Baldwin	movq	%rdi,-32(%r14)
1027bc3d5698SJohn Baldwin	movq	%rdx,%rdi
1028bc3d5698SJohn Baldwin
1029bc3d5698SJohn Baldwin	mulq	%rbx
1030bc3d5698SJohn Baldwin	addq	%rax,%r11
1031bc3d5698SJohn Baldwin	movq	%rbp,%rax
1032bc3d5698SJohn Baldwin	movq	-8(%rcx),%rbp
1033bc3d5698SJohn Baldwin	adcq	$0,%rdx
1034bc3d5698SJohn Baldwin	addq	-8(%r14),%r11
1035bc3d5698SJohn Baldwin	adcq	$0,%rdx
1036bc3d5698SJohn Baldwin	movq	%rdx,%r10
1037bc3d5698SJohn Baldwin
1038bc3d5698SJohn Baldwin	mulq	%rbp
1039bc3d5698SJohn Baldwin	addq	%rax,%rdi
1040bc3d5698SJohn Baldwin	movq	(%rsi,%r9,1),%rax
1041bc3d5698SJohn Baldwin	adcq	$0,%rdx
1042bc3d5698SJohn Baldwin	addq	%r11,%rdi
1043bc3d5698SJohn Baldwin	adcq	$0,%rdx
1044bc3d5698SJohn Baldwin	movq	%r13,-24(%r14)
1045bc3d5698SJohn Baldwin	movq	%rdx,%r13
1046bc3d5698SJohn Baldwin
1047bc3d5698SJohn Baldwin	movq	%rdi,-16(%r14)
1048bc3d5698SJohn Baldwin	leaq	(%rcx,%r9,1),%rcx
1049bc3d5698SJohn Baldwin
1050bc3d5698SJohn Baldwin	xorq	%rdi,%rdi
1051bc3d5698SJohn Baldwin	addq	%r10,%r13
1052bc3d5698SJohn Baldwin	adcq	$0,%rdi
1053bc3d5698SJohn Baldwin	addq	(%r14),%r13
1054bc3d5698SJohn Baldwin	adcq	$0,%rdi
1055bc3d5698SJohn Baldwin	movq	%r13,-8(%r14)
1056bc3d5698SJohn Baldwin
1057bc3d5698SJohn Baldwin	cmpq	16+8(%rsp),%r12
1058bc3d5698SJohn Baldwin	jb	.Louter4x
1059bc3d5698SJohn Baldwin	xorq	%rax,%rax
1060bc3d5698SJohn Baldwin	subq	%r13,%rbp
1061bc3d5698SJohn Baldwin	adcq	%r15,%r15
1062bc3d5698SJohn Baldwin	orq	%r15,%rdi
1063bc3d5698SJohn Baldwin	subq	%rdi,%rax
1064bc3d5698SJohn Baldwin	leaq	(%r14,%r9,1),%rbx
1065bc3d5698SJohn Baldwin	movq	(%rcx),%r12
1066bc3d5698SJohn Baldwin	leaq	(%rcx),%rbp
1067bc3d5698SJohn Baldwin	movq	%r9,%rcx
1068bc3d5698SJohn Baldwin	sarq	$3+2,%rcx
1069bc3d5698SJohn Baldwin	movq	56+8(%rsp),%rdi
1070bc3d5698SJohn Baldwin	decq	%r12
1071bc3d5698SJohn Baldwin	xorq	%r10,%r10
1072bc3d5698SJohn Baldwin	movq	8(%rbp),%r13
1073bc3d5698SJohn Baldwin	movq	16(%rbp),%r14
1074bc3d5698SJohn Baldwin	movq	24(%rbp),%r15
1075bc3d5698SJohn Baldwin	jmp	.Lsqr4x_sub_entry
1076bc3d5698SJohn Baldwin.cfi_endproc
1077bc3d5698SJohn Baldwin.size	mul4x_internal,.-mul4x_internal
/*
 * bn_power5 -- runs five __bn_sqr8x_internal/__bn_post4x_internal call
 * pairs followed by one mul4x_internal call on a freshly carved-out,
 * 64-byte-aligned stack frame, then returns 1 in %rax.
 *
 * Incoming registers as consumed below (the C-level argument names are not
 * visible in this generated file; NOTE(review): confirm the roles against
 * the bn_power5 prototype in the OpenSSL bn sources):
 *   %rdi  pointer; used for the frame-placement arithmetic, then parked in
 *         %xmm1 (presumably the result pointer)
 *   %rsi  pointer; copied into %rdi just before mul4x_internal
 *   %rdx  parked in %xmm4 across the squarings, restored for the multiply
 *   %rcx  parked in %xmm2 across the squarings, restored for the multiply
 *   %r8   pointer to a 64-bit word; that word is loaded and kept at 32(%rsp)
 *   %r9d  32-bit count, multiplied by 8 below (presumably a length in
 *         64-bit words, so the scaled value is a length in bytes)
 *
 * Frame slots written here: 32(%rsp) = word loaded from (%r8),
 * 40(%rsp) = value of %rsp at function entry.
 *
 * If all bits of mask 0x80108 are set in the dword at OPENSSL_ia32cap_P+8,
 * control leaves through .Lpowerx5_enter (a label outside this block)
 * before any register is pushed.
 */
1078bc3d5698SJohn Baldwin.globl	bn_power5
1079bc3d5698SJohn Baldwin.type	bn_power5,@function
1080bc3d5698SJohn Baldwin.align	32
1081bc3d5698SJohn Baldwinbn_power5:
1082bc3d5698SJohn Baldwin.cfi_startproc
1083bc3d5698SJohn Baldwin	movq	%rsp,%rax	/* rax = entry %rsp; kept as the CFA base while %rsp is moved */
1084bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rax
1085bc3d5698SJohn Baldwin	movl	OPENSSL_ia32cap_P+8(%rip),%r11d	/* capability dword at byte offset 8 */
1086bc3d5698SJohn Baldwin	andl	$0x80108,%r11d	/* keep bits 3, 8 and 19 (presumably BMI1, BMI2, ADX -- TODO confirm) */
1087bc3d5698SJohn Baldwin	cmpl	$0x80108,%r11d
1088bc3d5698SJohn Baldwin	je	.Lpowerx5_enter	/* all three bits set: take the alternative entry defined elsewhere */
1089bc3d5698SJohn Baldwin	pushq	%rbx	/* save the six callee-saved GPRs at entry_rsp-8 .. entry_rsp-48 */
1090bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
1091bc3d5698SJohn Baldwin	pushq	%rbp
1092bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
1093bc3d5698SJohn Baldwin	pushq	%r12
1094bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
1095bc3d5698SJohn Baldwin	pushq	%r13
1096bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
1097bc3d5698SJohn Baldwin	pushq	%r14
1098bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
1099bc3d5698SJohn Baldwin	pushq	%r15
1100bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
1101bc3d5698SJohn Baldwin.Lpower5_prologue:
1102bc3d5698SJohn Baldwin
1103bc3d5698SJohn Baldwin	shll	$3,%r9d	/* r9 = count*8; the 32-bit write also clears the upper half of %r9 */
1104bc3d5698SJohn Baldwin	leal	(%r9,%r9,2),%r10d	/* r10 = 3 * r9 */
1105bc3d5698SJohn Baldwin	negq	%r9	/* r9 = -(count*8), so it can be used as a negative index below */
1106bc3d5698SJohn Baldwin	movq	(%r8),%r8	/* replace the pointer in %r8 with the 64-bit word it points to */
1107bc3d5698SJohn Baldwin
1108bc3d5698SJohn Baldwin
1109bc3d5698SJohn Baldwin
1110bc3d5698SJohn Baldwin
1111bc3d5698SJohn Baldwin
1112bc3d5698SJohn Baldwin
1113bc3d5698SJohn Baldwin
1114bc3d5698SJohn Baldwin
1115bc3d5698SJohn Baldwin	leaq	-320(%rsp,%r9,2),%r11	/* r11 = rsp - 320 - 2*(count*8): tentative frame bottom */
1116bc3d5698SJohn Baldwin	movq	%rsp,%rbp
1117bc3d5698SJohn Baldwin	subq	%rdi,%r11
1118bc3d5698SJohn Baldwin	andq	$4095,%r11	/* r11 = (tentative bottom - rdi) modulo 4096 */
1119bc3d5698SJohn Baldwin	cmpq	%r11,%r10
1120bc3d5698SJohn Baldwin	jb	.Lpwr_sp_alt	/* unsigned: 3*(count*8) < r11 -> use the alternative placement */
1121bc3d5698SJohn Baldwin	subq	%r11,%rbp	/* lower the frame by r11 bytes (NOTE(review): presumably to keep it from sharing page offsets with the %rdi buffer -- confirm against the perlasm source) */
1122bc3d5698SJohn Baldwin	leaq	-320(%rbp,%r9,2),%rbp	/* rbp = rsp - r11 - 320 - 2*(count*8) */
1123bc3d5698SJohn Baldwin	jmp	.Lpwr_sp_done
1124bc3d5698SJohn Baldwin
1125bc3d5698SJohn Baldwin.align	32
1126bc3d5698SJohn Baldwin.Lpwr_sp_alt:
1127bc3d5698SJohn Baldwin	leaq	4096-320(,%r9,2),%r10	/* r10 = 4096 - 320 - 2*(count*8) */
1128bc3d5698SJohn Baldwin	leaq	-320(%rbp,%r9,2),%rbp	/* rbp = rsp - 320 - 2*(count*8) */
1129bc3d5698SJohn Baldwin	subq	%r10,%r11	/* r11 -= r10; CF is set when this borrows */
1130bc3d5698SJohn Baldwin	movq	$0,%r10	/* mov rather than xor: CF from the subq above must survive for cmovc */
1131bc3d5698SJohn Baldwin	cmovcq	%r10,%r11	/* on borrow, clamp the extra displacement to 0 */
1132bc3d5698SJohn Baldwin	subq	%r11,%rbp
1133bc3d5698SJohn Baldwin.Lpwr_sp_done:
1134bc3d5698SJohn Baldwin	andq	$-64,%rbp	/* round the new frame bottom down to a 64-byte boundary */
1135bc3d5698SJohn Baldwin	movq	%rsp,%r11
1136bc3d5698SJohn Baldwin	subq	%rbp,%r11	/* r11 = distance from current %rsp down to the new bottom */
1137bc3d5698SJohn Baldwin	andq	$-4096,%r11	/* ... rounded down to a multiple of 4096 */
1138bc3d5698SJohn Baldwin	leaq	(%r11,%rbp,1),%rsp	/* first stop: a whole number of 4096-byte steps above %rbp */
1139bc3d5698SJohn Baldwin	movq	(%rsp),%r10	/* read one quadword here; %r10 is overwritten later, only the access matters (presumably a stack/guard-page probe) */
1140bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
1141bc3d5698SJohn Baldwin	ja	.Lpwr_page_walk
1142bc3d5698SJohn Baldwin	jmp	.Lpwr_page_walk_done
1143bc3d5698SJohn Baldwin
1144bc3d5698SJohn Baldwin.Lpwr_page_walk:
1145bc3d5698SJohn Baldwin	leaq	-4096(%rsp),%rsp	/* step down 4096 bytes at a time ... */
1146bc3d5698SJohn Baldwin	movq	(%rsp),%r10	/* ... reading one quadword at each step ... */
1147bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
1148bc3d5698SJohn Baldwin	ja	.Lpwr_page_walk	/* ... until %rsp has reached %rbp */
1149bc3d5698SJohn Baldwin.Lpwr_page_walk_done:
1150bc3d5698SJohn Baldwin
1151bc3d5698SJohn Baldwin	movq	%r9,%r10	/* r10 = -(count*8) */
1152bc3d5698SJohn Baldwin	negq	%r9	/* r9 = +(count*8) again */
1153bc3d5698SJohn Baldwin
1154bc3d5698SJohn Baldwin
1155bc3d5698SJohn Baldwin
1156bc3d5698SJohn Baldwin
1157bc3d5698SJohn Baldwin
1158bc3d5698SJohn Baldwin
1159bc3d5698SJohn Baldwin
1160bc3d5698SJohn Baldwin
1161bc3d5698SJohn Baldwin
1162bc3d5698SJohn Baldwin
1163bc3d5698SJohn Baldwin	movq	%r8,32(%rsp)	/* frame slot 32: the word loaded from (%r8) above */
1164bc3d5698SJohn Baldwin	movq	%rax,40(%rsp)	/* frame slot 40: %rsp as it was at function entry */
1165bc3d5698SJohn Baldwin.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08	/* DW_CFA_def_cfa_expression: CFA = *(rsp+40) + 8 */
1166bc3d5698SJohn Baldwin.Lpower5_body:
1167bc3d5698SJohn Baldwin.byte	102,72,15,110,207	/* movq %rdi,%xmm1 */
1168bc3d5698SJohn Baldwin.byte	102,72,15,110,209	/* movq %rcx,%xmm2 */
1169bc3d5698SJohn Baldwin.byte	102,73,15,110,218	/* movq %r10,%xmm3 */
1170bc3d5698SJohn Baldwin.byte	102,72,15,110,226	/* movq %rdx,%xmm4 */
1171bc3d5698SJohn Baldwin
1172bc3d5698SJohn Baldwin	call	__bn_sqr8x_internal	/* pair 1 of 5 (presumably one squaring plus its final reduction per pair -- TODO confirm) */
1173bc3d5698SJohn Baldwin	call	__bn_post4x_internal
1174bc3d5698SJohn Baldwin	call	__bn_sqr8x_internal	/* pair 2 of 5 */
1175bc3d5698SJohn Baldwin	call	__bn_post4x_internal
1176bc3d5698SJohn Baldwin	call	__bn_sqr8x_internal	/* pair 3 of 5 */
1177bc3d5698SJohn Baldwin	call	__bn_post4x_internal
1178bc3d5698SJohn Baldwin	call	__bn_sqr8x_internal	/* pair 4 of 5 */
1179bc3d5698SJohn Baldwin	call	__bn_post4x_internal
1180bc3d5698SJohn Baldwin	call	__bn_sqr8x_internal	/* pair 5 of 5 */
1181bc3d5698SJohn Baldwin	call	__bn_post4x_internal
1182bc3d5698SJohn Baldwin
1183bc3d5698SJohn Baldwin.byte	102,72,15,126,209	/* movq %xmm2,%rcx -- restore the %rcx parked above */
1184bc3d5698SJohn Baldwin.byte	102,72,15,126,226	/* movq %xmm4,%rdx -- restore the %rdx parked above */
1185bc3d5698SJohn Baldwin	movq	%rsi,%rdi
1186bc3d5698SJohn Baldwin	movq	40(%rsp),%rax	/* rax = entry %rsp (presumably so mul4x_internal can reach the caller's stack argument -- verify in mul4x_internal) */
1187bc3d5698SJohn Baldwin	leaq	32(%rsp),%r8	/* r8 = address of the word saved at frame slot 32 */
1188bc3d5698SJohn Baldwin
1189bc3d5698SJohn Baldwin	call	mul4x_internal
1190bc3d5698SJohn Baldwin
1191bc3d5698SJohn Baldwin	movq	40(%rsp),%rsi	/* rsi = entry %rsp; the pushed registers sit just below it */
1192bc3d5698SJohn Baldwin.cfi_def_cfa	%rsi,8
1193bc3d5698SJohn Baldwin	movq	$1,%rax	/* return value 1 */
1194bc3d5698SJohn Baldwin	movq	-48(%rsi),%r15	/* reload callee-saved registers in reverse push order */
1195bc3d5698SJohn Baldwin.cfi_restore	%r15
1196bc3d5698SJohn Baldwin	movq	-40(%rsi),%r14
1197bc3d5698SJohn Baldwin.cfi_restore	%r14
1198bc3d5698SJohn Baldwin	movq	-32(%rsi),%r13
1199bc3d5698SJohn Baldwin.cfi_restore	%r13
1200bc3d5698SJohn Baldwin	movq	-24(%rsi),%r12
1201bc3d5698SJohn Baldwin.cfi_restore	%r12
1202bc3d5698SJohn Baldwin	movq	-16(%rsi),%rbp
1203bc3d5698SJohn Baldwin.cfi_restore	%rbp
1204bc3d5698SJohn Baldwin	movq	-8(%rsi),%rbx
1205bc3d5698SJohn Baldwin.cfi_restore	%rbx
1206bc3d5698SJohn Baldwin	leaq	(%rsi),%rsp	/* drop the whole frame: %rsp back to its entry value */
1207bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
1208bc3d5698SJohn Baldwin.Lpower5_epilogue:
1209bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* encoding of "rep ret" */
1210bc3d5698SJohn Baldwin.cfi_endproc
1211bc3d5698SJohn Baldwin.size	bn_power5,.-bn_power5
1212bc3d5698SJohn Baldwin
1213bc3d5698SJohn Baldwin.globl	bn_sqr8x_internal
1214bc3d5698SJohn Baldwin.hidden	bn_sqr8x_internal
1215bc3d5698SJohn Baldwin.type	bn_sqr8x_internal,@function
1216bc3d5698SJohn Baldwin.align	32
1217bc3d5698SJohn Baldwinbn_sqr8x_internal:
1218bc3d5698SJohn Baldwin__bn_sqr8x_internal:
1219bc3d5698SJohn Baldwin.cfi_startproc
1220bc3d5698SJohn Baldwin
1221bc3d5698SJohn Baldwin
1222bc3d5698SJohn Baldwin
1223bc3d5698SJohn Baldwin
1224bc3d5698SJohn Baldwin
1225bc3d5698SJohn Baldwin
1226bc3d5698SJohn Baldwin
1227bc3d5698SJohn Baldwin
1228bc3d5698SJohn Baldwin
1229bc3d5698SJohn Baldwin
1230bc3d5698SJohn Baldwin
1231bc3d5698SJohn Baldwin
1232bc3d5698SJohn Baldwin
1233bc3d5698SJohn Baldwin
1234bc3d5698SJohn Baldwin
1235bc3d5698SJohn Baldwin
1236bc3d5698SJohn Baldwin
1237bc3d5698SJohn Baldwin
1238bc3d5698SJohn Baldwin
1239bc3d5698SJohn Baldwin
1240bc3d5698SJohn Baldwin
1241bc3d5698SJohn Baldwin
1242bc3d5698SJohn Baldwin
1243bc3d5698SJohn Baldwin
1244bc3d5698SJohn Baldwin
1245bc3d5698SJohn Baldwin
1246bc3d5698SJohn Baldwin
1247bc3d5698SJohn Baldwin
1248bc3d5698SJohn Baldwin
1249bc3d5698SJohn Baldwin
1250bc3d5698SJohn Baldwin
1251bc3d5698SJohn Baldwin
1252bc3d5698SJohn Baldwin
1253bc3d5698SJohn Baldwin
1254bc3d5698SJohn Baldwin
1255bc3d5698SJohn Baldwin
1256bc3d5698SJohn Baldwin
1257bc3d5698SJohn Baldwin
1258bc3d5698SJohn Baldwin
1259bc3d5698SJohn Baldwin
1260bc3d5698SJohn Baldwin
1261bc3d5698SJohn Baldwin
1262bc3d5698SJohn Baldwin
1263bc3d5698SJohn Baldwin
1264bc3d5698SJohn Baldwin
1265bc3d5698SJohn Baldwin
1266bc3d5698SJohn Baldwin
1267bc3d5698SJohn Baldwin
1268bc3d5698SJohn Baldwin
1269bc3d5698SJohn Baldwin
1270bc3d5698SJohn Baldwin
1271bc3d5698SJohn Baldwin
1272bc3d5698SJohn Baldwin
1273bc3d5698SJohn Baldwin
1274bc3d5698SJohn Baldwin
1275bc3d5698SJohn Baldwin
1276bc3d5698SJohn Baldwin
1277bc3d5698SJohn Baldwin
1278bc3d5698SJohn Baldwin
1279bc3d5698SJohn Baldwin
1280bc3d5698SJohn Baldwin
1281bc3d5698SJohn Baldwin
1282bc3d5698SJohn Baldwin
1283bc3d5698SJohn Baldwin
1284bc3d5698SJohn Baldwin
1285bc3d5698SJohn Baldwin
1286bc3d5698SJohn Baldwin
1287bc3d5698SJohn Baldwin
1288bc3d5698SJohn Baldwin
1289bc3d5698SJohn Baldwin
1290bc3d5698SJohn Baldwin
1291bc3d5698SJohn Baldwin
1292bc3d5698SJohn Baldwin
1293bc3d5698SJohn Baldwin	leaq	32(%r10),%rbp
1294bc3d5698SJohn Baldwin	leaq	(%rsi,%r9,1),%rsi
1295bc3d5698SJohn Baldwin
1296bc3d5698SJohn Baldwin	movq	%r9,%rcx
1297bc3d5698SJohn Baldwin
1298bc3d5698SJohn Baldwin
1299bc3d5698SJohn Baldwin	movq	-32(%rsi,%rbp,1),%r14
1300bc3d5698SJohn Baldwin	leaq	48+8(%rsp,%r9,2),%rdi
1301bc3d5698SJohn Baldwin	movq	-24(%rsi,%rbp,1),%rax
1302bc3d5698SJohn Baldwin	leaq	-32(%rdi,%rbp,1),%rdi
1303bc3d5698SJohn Baldwin	movq	-16(%rsi,%rbp,1),%rbx
1304bc3d5698SJohn Baldwin	movq	%rax,%r15
1305bc3d5698SJohn Baldwin
1306bc3d5698SJohn Baldwin	mulq	%r14
1307bc3d5698SJohn Baldwin	movq	%rax,%r10
1308bc3d5698SJohn Baldwin	movq	%rbx,%rax
1309bc3d5698SJohn Baldwin	movq	%rdx,%r11
1310bc3d5698SJohn Baldwin	movq	%r10,-24(%rdi,%rbp,1)
1311bc3d5698SJohn Baldwin
1312bc3d5698SJohn Baldwin	mulq	%r14
1313bc3d5698SJohn Baldwin	addq	%rax,%r11
1314bc3d5698SJohn Baldwin	movq	%rbx,%rax
1315bc3d5698SJohn Baldwin	adcq	$0,%rdx
1316bc3d5698SJohn Baldwin	movq	%r11,-16(%rdi,%rbp,1)
1317bc3d5698SJohn Baldwin	movq	%rdx,%r10
1318bc3d5698SJohn Baldwin
1319bc3d5698SJohn Baldwin
1320bc3d5698SJohn Baldwin	movq	-8(%rsi,%rbp,1),%rbx
1321bc3d5698SJohn Baldwin	mulq	%r15
1322bc3d5698SJohn Baldwin	movq	%rax,%r12
1323bc3d5698SJohn Baldwin	movq	%rbx,%rax
1324bc3d5698SJohn Baldwin	movq	%rdx,%r13
1325bc3d5698SJohn Baldwin
1326bc3d5698SJohn Baldwin	leaq	(%rbp),%rcx
1327bc3d5698SJohn Baldwin	mulq	%r14
1328bc3d5698SJohn Baldwin	addq	%rax,%r10
1329bc3d5698SJohn Baldwin	movq	%rbx,%rax
1330bc3d5698SJohn Baldwin	movq	%rdx,%r11
1331bc3d5698SJohn Baldwin	adcq	$0,%r11
1332bc3d5698SJohn Baldwin	addq	%r12,%r10
1333bc3d5698SJohn Baldwin	adcq	$0,%r11
1334bc3d5698SJohn Baldwin	movq	%r10,-8(%rdi,%rcx,1)
1335bc3d5698SJohn Baldwin	jmp	.Lsqr4x_1st
1336bc3d5698SJohn Baldwin
1337bc3d5698SJohn Baldwin.align	32
1338bc3d5698SJohn Baldwin.Lsqr4x_1st:
1339bc3d5698SJohn Baldwin	movq	(%rsi,%rcx,1),%rbx
1340bc3d5698SJohn Baldwin	mulq	%r15
1341bc3d5698SJohn Baldwin	addq	%rax,%r13
1342bc3d5698SJohn Baldwin	movq	%rbx,%rax
1343bc3d5698SJohn Baldwin	movq	%rdx,%r12
1344bc3d5698SJohn Baldwin	adcq	$0,%r12
1345bc3d5698SJohn Baldwin
1346bc3d5698SJohn Baldwin	mulq	%r14
1347bc3d5698SJohn Baldwin	addq	%rax,%r11
1348bc3d5698SJohn Baldwin	movq	%rbx,%rax
1349bc3d5698SJohn Baldwin	movq	8(%rsi,%rcx,1),%rbx
1350bc3d5698SJohn Baldwin	movq	%rdx,%r10
1351bc3d5698SJohn Baldwin	adcq	$0,%r10
1352bc3d5698SJohn Baldwin	addq	%r13,%r11
1353bc3d5698SJohn Baldwin	adcq	$0,%r10
1354bc3d5698SJohn Baldwin
1355bc3d5698SJohn Baldwin
1356bc3d5698SJohn Baldwin	mulq	%r15
1357bc3d5698SJohn Baldwin	addq	%rax,%r12
1358bc3d5698SJohn Baldwin	movq	%rbx,%rax
1359bc3d5698SJohn Baldwin	movq	%r11,(%rdi,%rcx,1)
1360bc3d5698SJohn Baldwin	movq	%rdx,%r13
1361bc3d5698SJohn Baldwin	adcq	$0,%r13
1362bc3d5698SJohn Baldwin
1363bc3d5698SJohn Baldwin	mulq	%r14
1364bc3d5698SJohn Baldwin	addq	%rax,%r10
1365bc3d5698SJohn Baldwin	movq	%rbx,%rax
1366bc3d5698SJohn Baldwin	movq	16(%rsi,%rcx,1),%rbx
1367bc3d5698SJohn Baldwin	movq	%rdx,%r11
1368bc3d5698SJohn Baldwin	adcq	$0,%r11
1369bc3d5698SJohn Baldwin	addq	%r12,%r10
1370bc3d5698SJohn Baldwin	adcq	$0,%r11
1371bc3d5698SJohn Baldwin
1372bc3d5698SJohn Baldwin	mulq	%r15
1373bc3d5698SJohn Baldwin	addq	%rax,%r13
1374bc3d5698SJohn Baldwin	movq	%rbx,%rax
1375bc3d5698SJohn Baldwin	movq	%r10,8(%rdi,%rcx,1)
1376bc3d5698SJohn Baldwin	movq	%rdx,%r12
1377bc3d5698SJohn Baldwin	adcq	$0,%r12
1378bc3d5698SJohn Baldwin
1379bc3d5698SJohn Baldwin	mulq	%r14
1380bc3d5698SJohn Baldwin	addq	%rax,%r11
1381bc3d5698SJohn Baldwin	movq	%rbx,%rax
1382bc3d5698SJohn Baldwin	movq	24(%rsi,%rcx,1),%rbx
1383bc3d5698SJohn Baldwin	movq	%rdx,%r10
1384bc3d5698SJohn Baldwin	adcq	$0,%r10
1385bc3d5698SJohn Baldwin	addq	%r13,%r11
1386bc3d5698SJohn Baldwin	adcq	$0,%r10
1387bc3d5698SJohn Baldwin
1388bc3d5698SJohn Baldwin
1389bc3d5698SJohn Baldwin	mulq	%r15
1390bc3d5698SJohn Baldwin	addq	%rax,%r12
1391bc3d5698SJohn Baldwin	movq	%rbx,%rax
1392bc3d5698SJohn Baldwin	movq	%r11,16(%rdi,%rcx,1)
1393bc3d5698SJohn Baldwin	movq	%rdx,%r13
1394bc3d5698SJohn Baldwin	adcq	$0,%r13
1395bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
1396bc3d5698SJohn Baldwin
1397bc3d5698SJohn Baldwin	mulq	%r14
1398bc3d5698SJohn Baldwin	addq	%rax,%r10
1399bc3d5698SJohn Baldwin	movq	%rbx,%rax
1400bc3d5698SJohn Baldwin	movq	%rdx,%r11
1401bc3d5698SJohn Baldwin	adcq	$0,%r11
1402bc3d5698SJohn Baldwin	addq	%r12,%r10
1403bc3d5698SJohn Baldwin	adcq	$0,%r11
1404bc3d5698SJohn Baldwin	movq	%r10,-8(%rdi,%rcx,1)
1405bc3d5698SJohn Baldwin
1406bc3d5698SJohn Baldwin	cmpq	$0,%rcx
1407bc3d5698SJohn Baldwin	jne	.Lsqr4x_1st
1408bc3d5698SJohn Baldwin
1409bc3d5698SJohn Baldwin	mulq	%r15
1410bc3d5698SJohn Baldwin	addq	%rax,%r13
1411bc3d5698SJohn Baldwin	leaq	16(%rbp),%rbp
1412bc3d5698SJohn Baldwin	adcq	$0,%rdx
1413bc3d5698SJohn Baldwin	addq	%r11,%r13
1414bc3d5698SJohn Baldwin	adcq	$0,%rdx
1415bc3d5698SJohn Baldwin
1416bc3d5698SJohn Baldwin	movq	%r13,(%rdi)
1417bc3d5698SJohn Baldwin	movq	%rdx,%r12
1418bc3d5698SJohn Baldwin	movq	%rdx,8(%rdi)
1419bc3d5698SJohn Baldwin	jmp	.Lsqr4x_outer
1420bc3d5698SJohn Baldwin
1421bc3d5698SJohn Baldwin.align	32
1422bc3d5698SJohn Baldwin.Lsqr4x_outer:
1423bc3d5698SJohn Baldwin	movq	-32(%rsi,%rbp,1),%r14
1424bc3d5698SJohn Baldwin	leaq	48+8(%rsp,%r9,2),%rdi
1425bc3d5698SJohn Baldwin	movq	-24(%rsi,%rbp,1),%rax
1426bc3d5698SJohn Baldwin	leaq	-32(%rdi,%rbp,1),%rdi
1427bc3d5698SJohn Baldwin	movq	-16(%rsi,%rbp,1),%rbx
1428bc3d5698SJohn Baldwin	movq	%rax,%r15
1429bc3d5698SJohn Baldwin
1430bc3d5698SJohn Baldwin	mulq	%r14
1431bc3d5698SJohn Baldwin	movq	-24(%rdi,%rbp,1),%r10
1432bc3d5698SJohn Baldwin	addq	%rax,%r10
1433bc3d5698SJohn Baldwin	movq	%rbx,%rax
1434bc3d5698SJohn Baldwin	adcq	$0,%rdx
1435bc3d5698SJohn Baldwin	movq	%r10,-24(%rdi,%rbp,1)
1436bc3d5698SJohn Baldwin	movq	%rdx,%r11
1437bc3d5698SJohn Baldwin
1438bc3d5698SJohn Baldwin	mulq	%r14
1439bc3d5698SJohn Baldwin	addq	%rax,%r11
1440bc3d5698SJohn Baldwin	movq	%rbx,%rax
1441bc3d5698SJohn Baldwin	adcq	$0,%rdx
1442bc3d5698SJohn Baldwin	addq	-16(%rdi,%rbp,1),%r11
1443bc3d5698SJohn Baldwin	movq	%rdx,%r10
1444bc3d5698SJohn Baldwin	adcq	$0,%r10
1445bc3d5698SJohn Baldwin	movq	%r11,-16(%rdi,%rbp,1)
1446bc3d5698SJohn Baldwin
1447bc3d5698SJohn Baldwin	xorq	%r12,%r12
1448bc3d5698SJohn Baldwin
1449bc3d5698SJohn Baldwin	movq	-8(%rsi,%rbp,1),%rbx
1450bc3d5698SJohn Baldwin	mulq	%r15
1451bc3d5698SJohn Baldwin	addq	%rax,%r12
1452bc3d5698SJohn Baldwin	movq	%rbx,%rax
1453bc3d5698SJohn Baldwin	adcq	$0,%rdx
1454bc3d5698SJohn Baldwin	addq	-8(%rdi,%rbp,1),%r12
1455bc3d5698SJohn Baldwin	movq	%rdx,%r13
1456bc3d5698SJohn Baldwin	adcq	$0,%r13
1457bc3d5698SJohn Baldwin
1458bc3d5698SJohn Baldwin	mulq	%r14
1459bc3d5698SJohn Baldwin	addq	%rax,%r10
1460bc3d5698SJohn Baldwin	movq	%rbx,%rax
1461bc3d5698SJohn Baldwin	adcq	$0,%rdx
1462bc3d5698SJohn Baldwin	addq	%r12,%r10
1463bc3d5698SJohn Baldwin	movq	%rdx,%r11
1464bc3d5698SJohn Baldwin	adcq	$0,%r11
1465bc3d5698SJohn Baldwin	movq	%r10,-8(%rdi,%rbp,1)
1466bc3d5698SJohn Baldwin
1467bc3d5698SJohn Baldwin	leaq	(%rbp),%rcx
1468bc3d5698SJohn Baldwin	jmp	.Lsqr4x_inner
1469bc3d5698SJohn Baldwin
1470bc3d5698SJohn Baldwin.align	32
1471bc3d5698SJohn Baldwin.Lsqr4x_inner:
1472bc3d5698SJohn Baldwin	movq	(%rsi,%rcx,1),%rbx
1473bc3d5698SJohn Baldwin	mulq	%r15
1474bc3d5698SJohn Baldwin	addq	%rax,%r13
1475bc3d5698SJohn Baldwin	movq	%rbx,%rax
1476bc3d5698SJohn Baldwin	movq	%rdx,%r12
1477bc3d5698SJohn Baldwin	adcq	$0,%r12
1478bc3d5698SJohn Baldwin	addq	(%rdi,%rcx,1),%r13
1479bc3d5698SJohn Baldwin	adcq	$0,%r12
1480bc3d5698SJohn Baldwin
1481bc3d5698SJohn Baldwin.byte	0x67
1482bc3d5698SJohn Baldwin	mulq	%r14
1483bc3d5698SJohn Baldwin	addq	%rax,%r11
1484bc3d5698SJohn Baldwin	movq	%rbx,%rax
1485bc3d5698SJohn Baldwin	movq	8(%rsi,%rcx,1),%rbx
1486bc3d5698SJohn Baldwin	movq	%rdx,%r10
1487bc3d5698SJohn Baldwin	adcq	$0,%r10
1488bc3d5698SJohn Baldwin	addq	%r13,%r11
1489bc3d5698SJohn Baldwin	adcq	$0,%r10
1490bc3d5698SJohn Baldwin
1491bc3d5698SJohn Baldwin	mulq	%r15
1492bc3d5698SJohn Baldwin	addq	%rax,%r12
1493bc3d5698SJohn Baldwin	movq	%r11,(%rdi,%rcx,1)
1494bc3d5698SJohn Baldwin	movq	%rbx,%rax
1495bc3d5698SJohn Baldwin	movq	%rdx,%r13
1496bc3d5698SJohn Baldwin	adcq	$0,%r13
1497bc3d5698SJohn Baldwin	addq	8(%rdi,%rcx,1),%r12
1498bc3d5698SJohn Baldwin	leaq	16(%rcx),%rcx
1499bc3d5698SJohn Baldwin	adcq	$0,%r13
1500bc3d5698SJohn Baldwin
1501bc3d5698SJohn Baldwin	mulq	%r14
1502bc3d5698SJohn Baldwin	addq	%rax,%r10
1503bc3d5698SJohn Baldwin	movq	%rbx,%rax
1504bc3d5698SJohn Baldwin	adcq	$0,%rdx
1505bc3d5698SJohn Baldwin	addq	%r12,%r10
1506bc3d5698SJohn Baldwin	movq	%rdx,%r11
1507bc3d5698SJohn Baldwin	adcq	$0,%r11
1508bc3d5698SJohn Baldwin	movq	%r10,-8(%rdi,%rcx,1)
1509bc3d5698SJohn Baldwin
1510bc3d5698SJohn Baldwin	cmpq	$0,%rcx
1511bc3d5698SJohn Baldwin	jne	.Lsqr4x_inner
1512bc3d5698SJohn Baldwin
1513bc3d5698SJohn Baldwin.byte	0x67
1514bc3d5698SJohn Baldwin	mulq	%r15
1515bc3d5698SJohn Baldwin	addq	%rax,%r13
1516bc3d5698SJohn Baldwin	adcq	$0,%rdx
1517bc3d5698SJohn Baldwin	addq	%r11,%r13
1518bc3d5698SJohn Baldwin	adcq	$0,%rdx
1519bc3d5698SJohn Baldwin
1520bc3d5698SJohn Baldwin	movq	%r13,(%rdi)
1521bc3d5698SJohn Baldwin	movq	%rdx,%r12
1522bc3d5698SJohn Baldwin	movq	%rdx,8(%rdi)
1523bc3d5698SJohn Baldwin
1524bc3d5698SJohn Baldwin	addq	$16,%rbp
1525bc3d5698SJohn Baldwin	jnz	.Lsqr4x_outer
1526bc3d5698SJohn Baldwin
1527bc3d5698SJohn Baldwin
1528bc3d5698SJohn Baldwin	movq	-32(%rsi),%r14
1529bc3d5698SJohn Baldwin	leaq	48+8(%rsp,%r9,2),%rdi
1530bc3d5698SJohn Baldwin	movq	-24(%rsi),%rax
1531bc3d5698SJohn Baldwin	leaq	-32(%rdi,%rbp,1),%rdi
1532bc3d5698SJohn Baldwin	movq	-16(%rsi),%rbx
1533bc3d5698SJohn Baldwin	movq	%rax,%r15
1534bc3d5698SJohn Baldwin
1535bc3d5698SJohn Baldwin	mulq	%r14
1536bc3d5698SJohn Baldwin	addq	%rax,%r10
1537bc3d5698SJohn Baldwin	movq	%rbx,%rax
1538bc3d5698SJohn Baldwin	movq	%rdx,%r11
1539bc3d5698SJohn Baldwin	adcq	$0,%r11
1540bc3d5698SJohn Baldwin
1541bc3d5698SJohn Baldwin	mulq	%r14
1542bc3d5698SJohn Baldwin	addq	%rax,%r11
1543bc3d5698SJohn Baldwin	movq	%rbx,%rax
1544bc3d5698SJohn Baldwin	movq	%r10,-24(%rdi)
1545bc3d5698SJohn Baldwin	movq	%rdx,%r10
1546bc3d5698SJohn Baldwin	adcq	$0,%r10
1547bc3d5698SJohn Baldwin	addq	%r13,%r11
1548bc3d5698SJohn Baldwin	movq	-8(%rsi),%rbx
1549bc3d5698SJohn Baldwin	adcq	$0,%r10
1550bc3d5698SJohn Baldwin
1551bc3d5698SJohn Baldwin	mulq	%r15
1552bc3d5698SJohn Baldwin	addq	%rax,%r12
1553bc3d5698SJohn Baldwin	movq	%rbx,%rax
1554bc3d5698SJohn Baldwin	movq	%r11,-16(%rdi)
1555bc3d5698SJohn Baldwin	movq	%rdx,%r13
1556bc3d5698SJohn Baldwin	adcq	$0,%r13
1557bc3d5698SJohn Baldwin
1558bc3d5698SJohn Baldwin	mulq	%r14
1559bc3d5698SJohn Baldwin	addq	%rax,%r10
1560bc3d5698SJohn Baldwin	movq	%rbx,%rax
1561bc3d5698SJohn Baldwin	movq	%rdx,%r11
1562bc3d5698SJohn Baldwin	adcq	$0,%r11
1563bc3d5698SJohn Baldwin	addq	%r12,%r10
1564bc3d5698SJohn Baldwin	adcq	$0,%r11
1565bc3d5698SJohn Baldwin	movq	%r10,-8(%rdi)
1566bc3d5698SJohn Baldwin
1567bc3d5698SJohn Baldwin	mulq	%r15
1568bc3d5698SJohn Baldwin	addq	%rax,%r13
1569bc3d5698SJohn Baldwin	movq	-16(%rsi),%rax
1570bc3d5698SJohn Baldwin	adcq	$0,%rdx
1571bc3d5698SJohn Baldwin	addq	%r11,%r13
1572bc3d5698SJohn Baldwin	adcq	$0,%rdx
1573bc3d5698SJohn Baldwin
1574bc3d5698SJohn Baldwin	movq	%r13,(%rdi)
1575bc3d5698SJohn Baldwin	movq	%rdx,%r12
1576bc3d5698SJohn Baldwin	movq	%rdx,8(%rdi)
1577bc3d5698SJohn Baldwin
1578bc3d5698SJohn Baldwin	mulq	%rbx
1579bc3d5698SJohn Baldwin	addq	$16,%rbp
1580bc3d5698SJohn Baldwin	xorq	%r14,%r14
1581bc3d5698SJohn Baldwin	subq	%r9,%rbp
1582bc3d5698SJohn Baldwin	xorq	%r15,%r15
1583bc3d5698SJohn Baldwin
1584bc3d5698SJohn Baldwin	addq	%r12,%rax
1585bc3d5698SJohn Baldwin	adcq	$0,%rdx
1586bc3d5698SJohn Baldwin	movq	%rax,8(%rdi)
1587bc3d5698SJohn Baldwin	movq	%rdx,16(%rdi)
1588bc3d5698SJohn Baldwin	movq	%r15,24(%rdi)
1589bc3d5698SJohn Baldwin
1590bc3d5698SJohn Baldwin	movq	-16(%rsi,%rbp,1),%rax
1591bc3d5698SJohn Baldwin	leaq	48+8(%rsp),%rdi
1592bc3d5698SJohn Baldwin	xorq	%r10,%r10
1593bc3d5698SJohn Baldwin	movq	8(%rdi),%r11
1594bc3d5698SJohn Baldwin
1595bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%r12
1596bc3d5698SJohn Baldwin	shrq	$63,%r10
1597bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r13
1598bc3d5698SJohn Baldwin	shrq	$63,%r11
1599bc3d5698SJohn Baldwin	orq	%r10,%r13
1600bc3d5698SJohn Baldwin	movq	16(%rdi),%r10
1601bc3d5698SJohn Baldwin	movq	%r11,%r14
1602bc3d5698SJohn Baldwin	mulq	%rax
1603bc3d5698SJohn Baldwin	negq	%r15
1604bc3d5698SJohn Baldwin	movq	24(%rdi),%r11
1605bc3d5698SJohn Baldwin	adcq	%rax,%r12
1606bc3d5698SJohn Baldwin	movq	-8(%rsi,%rbp,1),%rax
1607bc3d5698SJohn Baldwin	movq	%r12,(%rdi)
1608bc3d5698SJohn Baldwin	adcq	%rdx,%r13
1609bc3d5698SJohn Baldwin
1610bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%rbx
1611bc3d5698SJohn Baldwin	movq	%r13,8(%rdi)
1612bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1613bc3d5698SJohn Baldwin	shrq	$63,%r10
1614bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r8
1615bc3d5698SJohn Baldwin	shrq	$63,%r11
1616bc3d5698SJohn Baldwin	orq	%r10,%r8
1617bc3d5698SJohn Baldwin	movq	32(%rdi),%r10
1618bc3d5698SJohn Baldwin	movq	%r11,%r14
1619bc3d5698SJohn Baldwin	mulq	%rax
1620bc3d5698SJohn Baldwin	negq	%r15
1621bc3d5698SJohn Baldwin	movq	40(%rdi),%r11
1622bc3d5698SJohn Baldwin	adcq	%rax,%rbx
1623bc3d5698SJohn Baldwin	movq	0(%rsi,%rbp,1),%rax
1624bc3d5698SJohn Baldwin	movq	%rbx,16(%rdi)
1625bc3d5698SJohn Baldwin	adcq	%rdx,%r8
1626bc3d5698SJohn Baldwin	leaq	16(%rbp),%rbp
1627bc3d5698SJohn Baldwin	movq	%r8,24(%rdi)
1628bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1629bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1630bc3d5698SJohn Baldwin	jmp	.Lsqr4x_shift_n_add
1631bc3d5698SJohn Baldwin
1632bc3d5698SJohn Baldwin.align	32
1633bc3d5698SJohn Baldwin.Lsqr4x_shift_n_add:
1634bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%r12
1635bc3d5698SJohn Baldwin	shrq	$63,%r10
1636bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r13
1637bc3d5698SJohn Baldwin	shrq	$63,%r11
1638bc3d5698SJohn Baldwin	orq	%r10,%r13
1639bc3d5698SJohn Baldwin	movq	-16(%rdi),%r10
1640bc3d5698SJohn Baldwin	movq	%r11,%r14
1641bc3d5698SJohn Baldwin	mulq	%rax
1642bc3d5698SJohn Baldwin	negq	%r15
1643bc3d5698SJohn Baldwin	movq	-8(%rdi),%r11
1644bc3d5698SJohn Baldwin	adcq	%rax,%r12
1645bc3d5698SJohn Baldwin	movq	-8(%rsi,%rbp,1),%rax
1646bc3d5698SJohn Baldwin	movq	%r12,-32(%rdi)
1647bc3d5698SJohn Baldwin	adcq	%rdx,%r13
1648bc3d5698SJohn Baldwin
1649bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%rbx
1650bc3d5698SJohn Baldwin	movq	%r13,-24(%rdi)
1651bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1652bc3d5698SJohn Baldwin	shrq	$63,%r10
1653bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r8
1654bc3d5698SJohn Baldwin	shrq	$63,%r11
1655bc3d5698SJohn Baldwin	orq	%r10,%r8
1656bc3d5698SJohn Baldwin	movq	0(%rdi),%r10
1657bc3d5698SJohn Baldwin	movq	%r11,%r14
1658bc3d5698SJohn Baldwin	mulq	%rax
1659bc3d5698SJohn Baldwin	negq	%r15
1660bc3d5698SJohn Baldwin	movq	8(%rdi),%r11
1661bc3d5698SJohn Baldwin	adcq	%rax,%rbx
1662bc3d5698SJohn Baldwin	movq	0(%rsi,%rbp,1),%rax
1663bc3d5698SJohn Baldwin	movq	%rbx,-16(%rdi)
1664bc3d5698SJohn Baldwin	adcq	%rdx,%r8
1665bc3d5698SJohn Baldwin
1666bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%r12
1667bc3d5698SJohn Baldwin	movq	%r8,-8(%rdi)
1668bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1669bc3d5698SJohn Baldwin	shrq	$63,%r10
1670bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r13
1671bc3d5698SJohn Baldwin	shrq	$63,%r11
1672bc3d5698SJohn Baldwin	orq	%r10,%r13
1673bc3d5698SJohn Baldwin	movq	16(%rdi),%r10
1674bc3d5698SJohn Baldwin	movq	%r11,%r14
1675bc3d5698SJohn Baldwin	mulq	%rax
1676bc3d5698SJohn Baldwin	negq	%r15
1677bc3d5698SJohn Baldwin	movq	24(%rdi),%r11
1678bc3d5698SJohn Baldwin	adcq	%rax,%r12
1679bc3d5698SJohn Baldwin	movq	8(%rsi,%rbp,1),%rax
1680bc3d5698SJohn Baldwin	movq	%r12,0(%rdi)
1681bc3d5698SJohn Baldwin	adcq	%rdx,%r13
1682bc3d5698SJohn Baldwin
1683bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%rbx
1684bc3d5698SJohn Baldwin	movq	%r13,8(%rdi)
1685bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1686bc3d5698SJohn Baldwin	shrq	$63,%r10
1687bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r8
1688bc3d5698SJohn Baldwin	shrq	$63,%r11
1689bc3d5698SJohn Baldwin	orq	%r10,%r8
1690bc3d5698SJohn Baldwin	movq	32(%rdi),%r10
1691bc3d5698SJohn Baldwin	movq	%r11,%r14
1692bc3d5698SJohn Baldwin	mulq	%rax
1693bc3d5698SJohn Baldwin	negq	%r15
1694bc3d5698SJohn Baldwin	movq	40(%rdi),%r11
1695bc3d5698SJohn Baldwin	adcq	%rax,%rbx
1696bc3d5698SJohn Baldwin	movq	16(%rsi,%rbp,1),%rax
1697bc3d5698SJohn Baldwin	movq	%rbx,16(%rdi)
1698bc3d5698SJohn Baldwin	adcq	%rdx,%r8
1699bc3d5698SJohn Baldwin	movq	%r8,24(%rdi)
1700bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1701bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1702bc3d5698SJohn Baldwin	addq	$32,%rbp
1703bc3d5698SJohn Baldwin	jnz	.Lsqr4x_shift_n_add
1704bc3d5698SJohn Baldwin
1705bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%r12
1706bc3d5698SJohn Baldwin.byte	0x67
1707bc3d5698SJohn Baldwin	shrq	$63,%r10
1708bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r13
1709bc3d5698SJohn Baldwin	shrq	$63,%r11
1710bc3d5698SJohn Baldwin	orq	%r10,%r13
1711bc3d5698SJohn Baldwin	movq	-16(%rdi),%r10
1712bc3d5698SJohn Baldwin	movq	%r11,%r14
1713bc3d5698SJohn Baldwin	mulq	%rax
1714bc3d5698SJohn Baldwin	negq	%r15
1715bc3d5698SJohn Baldwin	movq	-8(%rdi),%r11
1716bc3d5698SJohn Baldwin	adcq	%rax,%r12
1717bc3d5698SJohn Baldwin	movq	-8(%rsi),%rax
1718bc3d5698SJohn Baldwin	movq	%r12,-32(%rdi)
1719bc3d5698SJohn Baldwin	adcq	%rdx,%r13
1720bc3d5698SJohn Baldwin
1721bc3d5698SJohn Baldwin	leaq	(%r14,%r10,2),%rbx
1722bc3d5698SJohn Baldwin	movq	%r13,-24(%rdi)
1723bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1724bc3d5698SJohn Baldwin	shrq	$63,%r10
1725bc3d5698SJohn Baldwin	leaq	(%rcx,%r11,2),%r8
1726bc3d5698SJohn Baldwin	shrq	$63,%r11
1727bc3d5698SJohn Baldwin	orq	%r10,%r8
1728bc3d5698SJohn Baldwin	mulq	%rax
1729bc3d5698SJohn Baldwin	negq	%r15
1730bc3d5698SJohn Baldwin	adcq	%rax,%rbx
1731bc3d5698SJohn Baldwin	adcq	%rdx,%r8
1732bc3d5698SJohn Baldwin	movq	%rbx,-16(%rdi)
1733bc3d5698SJohn Baldwin	movq	%r8,-8(%rdi)
1734bc3d5698SJohn Baldwin.byte	102,72,15,126,213
/*
 * __bn_sqr8x_reduction: named entry label inside bn_sqr8x_internal.
 *
 * Processes the buffer at 48+8(%rsp) in windows of eight 64-bit words.
 * For each window it runs eight multiply-accumulate passes against the
 * words at (%rbp) (.L8x_reduce), then folds the remaining words of that
 * array into the following buffer words (.L8x_tail).
 * NOTE(review): this looks like the word-serial Montgomery reduction of a
 * double-width product t[] by modulus n[] with n0 at 32+8(%rsp) -- confirm
 * against the generator script.
 *
 * Read on entry: %r9 (byte length; negated below), %rbp (array base),
 * 32+8(%rsp) (per-pass multiplier factor), %xmm2/%xmm3 (reloaded into
 * %rbp/%r9 at the end of each window).
 * On return: %rax holds the accumulated top carry, %rcx the word at
 * -8(%rbp) taken before %rbp is reloaded, %rsi is zero.
 */
1735bc3d5698SJohn Baldwin__bn_sqr8x_reduction:
/* rax = 0 seeds the top-most carry; frame slots: 0+8(%rsp) = rbp + r9
 * (end of the array at rbp), 8+8(%rsp) = rsp + 56 + 2*r9 (end of buffer).
 * Presumably the "+8" accounts for the return address -- confirm. */
1736bc3d5698SJohn Baldwin	xorq	%rax,%rax
1737bc3d5698SJohn Baldwin	leaq	(%r9,%rbp,1),%rcx
1738bc3d5698SJohn Baldwin	leaq	48+8(%rsp,%r9,2),%rdx
1739bc3d5698SJohn Baldwin	movq	%rcx,0+8(%rsp)
1740bc3d5698SJohn Baldwin	leaq	48+8(%rsp,%r9,1),%rdi
1741bc3d5698SJohn Baldwin	movq	%rdx,8+8(%rsp)
1742bc3d5698SJohn Baldwin	negq	%r9
1743bc3d5698SJohn Baldwin	jmp	.L8x_reduction_loop
1744bc3d5698SJohn Baldwin
1745bc3d5698SJohn Baldwin.align	32
1746bc3d5698SJohn Baldwin.L8x_reduction_loop:
/* Step rdi back by the (negative) length in r9, then load the next window
 * of eight words into rbx, r9..r15.  r9 is reused as an accumulator from
 * here on and is reloaded from xmm3 near the end of the window. */
1747bc3d5698SJohn Baldwin	leaq	(%rdi,%r9,1),%rdi
/* 0x66 prefix: REX.W on the next instruction overrides it, so it has no
 * effect on the operation (presumably code-alignment padding). */
1748bc3d5698SJohn Baldwin.byte	0x66
1749bc3d5698SJohn Baldwin	movq	0(%rdi),%rbx
1750bc3d5698SJohn Baldwin	movq	8(%rdi),%r9
1751bc3d5698SJohn Baldwin	movq	16(%rdi),%r10
1752bc3d5698SJohn Baldwin	movq	24(%rdi),%r11
1753bc3d5698SJohn Baldwin	movq	32(%rdi),%r12
1754bc3d5698SJohn Baldwin	movq	40(%rdi),%r13
1755bc3d5698SJohn Baldwin	movq	48(%rdi),%r14
1756bc3d5698SJohn Baldwin	movq	56(%rdi),%r15
/* Park the running top carry in the word at the end-of-buffer pointer;
 * it is added back in at .L8x_tail_done. */
1757bc3d5698SJohn Baldwin	movq	%rax,(%rdx)
1758bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1759bc3d5698SJohn Baldwin
/* 0x67 prefix on a register-to-register move: no memory operand, so no
 * effect (presumably padding). */
1760bc3d5698SJohn Baldwin.byte	0x67
/* r8 = window word 0; rbx = word 0 * 32+8(%rsp) (low 64 bits) is the
 * multiplier for the first pass; rax = first word at rbp; ecx = 8 passes. */
1761bc3d5698SJohn Baldwin	movq	%rbx,%r8
1762bc3d5698SJohn Baldwin	imulq	32+8(%rsp),%rbx
1763bc3d5698SJohn Baldwin	movq	0(%rbp),%rax
1764bc3d5698SJohn Baldwin	movl	$8,%ecx
1765bc3d5698SJohn Baldwin	jmp	.L8x_reduce
1766bc3d5698SJohn Baldwin
1767bc3d5698SJohn Baldwin.align	32
1768bc3d5698SJohn Baldwin.L8x_reduce:
/* The low half of this product is discarded; negq sets CF = (r8 != 0) and
 * that bit is added to the high half.  Presumably the low half plus r8 is
 * zero by construction of the multiplier, so CF is exactly the carry out
 * of that sum -- confirm. */
1769bc3d5698SJohn Baldwin	mulq	%rbx
1770bc3d5698SJohn Baldwin	movq	8(%rbp),%rax
1771bc3d5698SJohn Baldwin	negq	%r8
1772bc3d5698SJohn Baldwin	movq	%rdx,%r8
1773bc3d5698SJohn Baldwin	adcq	$0,%r8
1774bc3d5698SJohn Baldwin
/* Each following group: product low half + accumulator k + previous high
 * half -> accumulator k-1; new high half -> accumulator k. */
1775bc3d5698SJohn Baldwin	mulq	%rbx
1776bc3d5698SJohn Baldwin	addq	%rax,%r9
1777bc3d5698SJohn Baldwin	movq	16(%rbp),%rax
1778bc3d5698SJohn Baldwin	adcq	$0,%rdx
1779bc3d5698SJohn Baldwin	addq	%r9,%r8
/* Save this pass's multiplier at 48+8*rcx(%rsp) for replay in .L8x_tail. */
1780bc3d5698SJohn Baldwin	movq	%rbx,48-8+8(%rsp,%rcx,8)
1781bc3d5698SJohn Baldwin	movq	%rdx,%r9
1782bc3d5698SJohn Baldwin	adcq	$0,%r9
1783bc3d5698SJohn Baldwin
1784bc3d5698SJohn Baldwin	mulq	%rbx
1785bc3d5698SJohn Baldwin	addq	%rax,%r10
1786bc3d5698SJohn Baldwin	movq	24(%rbp),%rax
1787bc3d5698SJohn Baldwin	adcq	$0,%rdx
1788bc3d5698SJohn Baldwin	addq	%r10,%r9
/* Start computing the next pass's multiplier: rsi = 32+8(%rsp) ... */
1789bc3d5698SJohn Baldwin	movq	32+8(%rsp),%rsi
1790bc3d5698SJohn Baldwin	movq	%rdx,%r10
1791bc3d5698SJohn Baldwin	adcq	$0,%r10
1792bc3d5698SJohn Baldwin
1793bc3d5698SJohn Baldwin	mulq	%rbx
1794bc3d5698SJohn Baldwin	addq	%rax,%r11
1795bc3d5698SJohn Baldwin	movq	32(%rbp),%rax
1796bc3d5698SJohn Baldwin	adcq	$0,%rdx
/* ... times the new lowest accumulator word r8. */
1797bc3d5698SJohn Baldwin	imulq	%r8,%rsi
1798bc3d5698SJohn Baldwin	addq	%r11,%r10
1799bc3d5698SJohn Baldwin	movq	%rdx,%r11
1800bc3d5698SJohn Baldwin	adcq	$0,%r11
1801bc3d5698SJohn Baldwin
1802bc3d5698SJohn Baldwin	mulq	%rbx
1803bc3d5698SJohn Baldwin	addq	%rax,%r12
1804bc3d5698SJohn Baldwin	movq	40(%rbp),%rax
1805bc3d5698SJohn Baldwin	adcq	$0,%rdx
1806bc3d5698SJohn Baldwin	addq	%r12,%r11
1807bc3d5698SJohn Baldwin	movq	%rdx,%r12
1808bc3d5698SJohn Baldwin	adcq	$0,%r12
1809bc3d5698SJohn Baldwin
1810bc3d5698SJohn Baldwin	mulq	%rbx
1811bc3d5698SJohn Baldwin	addq	%rax,%r13
1812bc3d5698SJohn Baldwin	movq	48(%rbp),%rax
1813bc3d5698SJohn Baldwin	adcq	$0,%rdx
1814bc3d5698SJohn Baldwin	addq	%r13,%r12
1815bc3d5698SJohn Baldwin	movq	%rdx,%r13
1816bc3d5698SJohn Baldwin	adcq	$0,%r13
1817bc3d5698SJohn Baldwin
1818bc3d5698SJohn Baldwin	mulq	%rbx
1819bc3d5698SJohn Baldwin	addq	%rax,%r14
1820bc3d5698SJohn Baldwin	movq	56(%rbp),%rax
1821bc3d5698SJohn Baldwin	adcq	$0,%rdx
1822bc3d5698SJohn Baldwin	addq	%r14,%r13
1823bc3d5698SJohn Baldwin	movq	%rdx,%r14
1824bc3d5698SJohn Baldwin	adcq	$0,%r14
1825bc3d5698SJohn Baldwin
/* Last product of the pass; switch rbx to the next multiplier and reload
 * rax with the first word at rbp for the next pass. */
1826bc3d5698SJohn Baldwin	mulq	%rbx
1827bc3d5698SJohn Baldwin	movq	%rsi,%rbx
1828bc3d5698SJohn Baldwin	addq	%rax,%r15
1829bc3d5698SJohn Baldwin	movq	0(%rbp),%rax
1830bc3d5698SJohn Baldwin	adcq	$0,%rdx
1831bc3d5698SJohn Baldwin	addq	%r15,%r14
1832bc3d5698SJohn Baldwin	movq	%rdx,%r15
1833bc3d5698SJohn Baldwin	adcq	$0,%r15
1834bc3d5698SJohn Baldwin
1835bc3d5698SJohn Baldwin	decl	%ecx
1836bc3d5698SJohn Baldwin	jnz	.L8x_reduce
1837bc3d5698SJohn Baldwin
/* Advance rbp by eight words.  If that reaches the saved end pointer there
 * is no tail: jump with rax = 0 and CF clear (cmpq result for rbp >= end),
 * which the adcq chain at .L8x_no_tail relies on. */
1838bc3d5698SJohn Baldwin	leaq	64(%rbp),%rbp
1839bc3d5698SJohn Baldwin	xorq	%rax,%rax
1840bc3d5698SJohn Baldwin	movq	8+8(%rsp),%rdx
1841bc3d5698SJohn Baldwin	cmpq	0+8(%rsp),%rbp
1842bc3d5698SJohn Baldwin	jae	.L8x_no_tail
1843bc3d5698SJohn Baldwin
/* Add the next eight buffer words into the accumulators; sbbq captures the
 * carry out as 0 / -1 in rsi. */
1844bc3d5698SJohn Baldwin.byte	0x66
1845bc3d5698SJohn Baldwin	addq	0(%rdi),%r8
1846bc3d5698SJohn Baldwin	adcq	8(%rdi),%r9
1847bc3d5698SJohn Baldwin	adcq	16(%rdi),%r10
1848bc3d5698SJohn Baldwin	adcq	24(%rdi),%r11
1849bc3d5698SJohn Baldwin	adcq	32(%rdi),%r12
1850bc3d5698SJohn Baldwin	adcq	40(%rdi),%r13
1851bc3d5698SJohn Baldwin	adcq	48(%rdi),%r14
1852bc3d5698SJohn Baldwin	adcq	56(%rdi),%r15
1853bc3d5698SJohn Baldwin	sbbq	%rsi,%rsi
1854bc3d5698SJohn Baldwin
/* 48+56+8(%rsp) is the slot written by the first .L8x_reduce pass
 * (rcx = 8), i.e. the first saved multiplier. */
1855bc3d5698SJohn Baldwin	movq	48+56+8(%rsp),%rbx
1856bc3d5698SJohn Baldwin	movl	$8,%ecx
1857bc3d5698SJohn Baldwin	movq	0(%rbp),%rax
1858bc3d5698SJohn Baldwin	jmp	.L8x_tail
1859bc3d5698SJohn Baldwin
1860bc3d5698SJohn Baldwin.align	32
1861bc3d5698SJohn Baldwin.L8x_tail:
/* One pass: accumulators += rbx * eight words at rbp; the finished low
 * word is stored to (rdi), rdi advances one word, accumulators shift down. */
1862bc3d5698SJohn Baldwin	mulq	%rbx
1863bc3d5698SJohn Baldwin	addq	%rax,%r8
1864bc3d5698SJohn Baldwin	movq	8(%rbp),%rax
1865bc3d5698SJohn Baldwin	movq	%r8,(%rdi)
1866bc3d5698SJohn Baldwin	movq	%rdx,%r8
1867bc3d5698SJohn Baldwin	adcq	$0,%r8
1868bc3d5698SJohn Baldwin
1869bc3d5698SJohn Baldwin	mulq	%rbx
1870bc3d5698SJohn Baldwin	addq	%rax,%r9
1871bc3d5698SJohn Baldwin	movq	16(%rbp),%rax
1872bc3d5698SJohn Baldwin	adcq	$0,%rdx
1873bc3d5698SJohn Baldwin	addq	%r9,%r8
1874bc3d5698SJohn Baldwin	leaq	8(%rdi),%rdi
1875bc3d5698SJohn Baldwin	movq	%rdx,%r9
1876bc3d5698SJohn Baldwin	adcq	$0,%r9
1877bc3d5698SJohn Baldwin
1878bc3d5698SJohn Baldwin	mulq	%rbx
1879bc3d5698SJohn Baldwin	addq	%rax,%r10
1880bc3d5698SJohn Baldwin	movq	24(%rbp),%rax
1881bc3d5698SJohn Baldwin	adcq	$0,%rdx
1882bc3d5698SJohn Baldwin	addq	%r10,%r9
1883bc3d5698SJohn Baldwin	movq	%rdx,%r10
1884bc3d5698SJohn Baldwin	adcq	$0,%r10
1885bc3d5698SJohn Baldwin
1886bc3d5698SJohn Baldwin	mulq	%rbx
1887bc3d5698SJohn Baldwin	addq	%rax,%r11
1888bc3d5698SJohn Baldwin	movq	32(%rbp),%rax
1889bc3d5698SJohn Baldwin	adcq	$0,%rdx
1890bc3d5698SJohn Baldwin	addq	%r11,%r10
1891bc3d5698SJohn Baldwin	movq	%rdx,%r11
1892bc3d5698SJohn Baldwin	adcq	$0,%r11
1893bc3d5698SJohn Baldwin
1894bc3d5698SJohn Baldwin	mulq	%rbx
1895bc3d5698SJohn Baldwin	addq	%rax,%r12
1896bc3d5698SJohn Baldwin	movq	40(%rbp),%rax
1897bc3d5698SJohn Baldwin	adcq	$0,%rdx
1898bc3d5698SJohn Baldwin	addq	%r12,%r11
1899bc3d5698SJohn Baldwin	movq	%rdx,%r12
1900bc3d5698SJohn Baldwin	adcq	$0,%r12
1901bc3d5698SJohn Baldwin
1902bc3d5698SJohn Baldwin	mulq	%rbx
1903bc3d5698SJohn Baldwin	addq	%rax,%r13
1904bc3d5698SJohn Baldwin	movq	48(%rbp),%rax
1905bc3d5698SJohn Baldwin	adcq	$0,%rdx
1906bc3d5698SJohn Baldwin	addq	%r13,%r12
1907bc3d5698SJohn Baldwin	movq	%rdx,%r13
1908bc3d5698SJohn Baldwin	adcq	$0,%r13
1909bc3d5698SJohn Baldwin
1910bc3d5698SJohn Baldwin	mulq	%rbx
1911bc3d5698SJohn Baldwin	addq	%rax,%r14
1912bc3d5698SJohn Baldwin	movq	56(%rbp),%rax
1913bc3d5698SJohn Baldwin	adcq	$0,%rdx
1914bc3d5698SJohn Baldwin	addq	%r14,%r13
1915bc3d5698SJohn Baldwin	movq	%rdx,%r14
1916bc3d5698SJohn Baldwin	adcq	$0,%r14
1917bc3d5698SJohn Baldwin
/* Fetch the next saved multiplier from 40+8*rcx(%rsp), i.e. the slot
 * written one .L8x_reduce pass later than the current one. */
1918bc3d5698SJohn Baldwin	mulq	%rbx
1919bc3d5698SJohn Baldwin	movq	48-16+8(%rsp,%rcx,8),%rbx
1920bc3d5698SJohn Baldwin	addq	%rax,%r15
1921bc3d5698SJohn Baldwin	adcq	$0,%rdx
1922bc3d5698SJohn Baldwin	addq	%r15,%r14
1923bc3d5698SJohn Baldwin	movq	0(%rbp),%rax
1924bc3d5698SJohn Baldwin	movq	%rdx,%r15
1925bc3d5698SJohn Baldwin	adcq	$0,%r15
1926bc3d5698SJohn Baldwin
1927bc3d5698SJohn Baldwin	decl	%ecx
1928bc3d5698SJohn Baldwin	jnz	.L8x_tail
1929bc3d5698SJohn Baldwin
/* Next eight-word chunk of the array at rbp, or finish the tail. */
1930bc3d5698SJohn Baldwin	leaq	64(%rbp),%rbp
1931bc3d5698SJohn Baldwin	movq	8+8(%rsp),%rdx
1932bc3d5698SJohn Baldwin	cmpq	0+8(%rsp),%rbp
1933bc3d5698SJohn Baldwin	jae	.L8x_tail_done
1934bc3d5698SJohn Baldwin
/* negq rsi turns the saved 0 / -1 back into CF before the adcq chain;
 * the movq loads in between do not modify flags. */
1935bc3d5698SJohn Baldwin	movq	48+56+8(%rsp),%rbx
1936bc3d5698SJohn Baldwin	negq	%rsi
1937bc3d5698SJohn Baldwin	movq	0(%rbp),%rax
1938bc3d5698SJohn Baldwin	adcq	0(%rdi),%r8
1939bc3d5698SJohn Baldwin	adcq	8(%rdi),%r9
1940bc3d5698SJohn Baldwin	adcq	16(%rdi),%r10
1941bc3d5698SJohn Baldwin	adcq	24(%rdi),%r11
1942bc3d5698SJohn Baldwin	adcq	32(%rdi),%r12
1943bc3d5698SJohn Baldwin	adcq	40(%rdi),%r13
1944bc3d5698SJohn Baldwin	adcq	48(%rdi),%r14
1945bc3d5698SJohn Baldwin	adcq	56(%rdi),%r15
1946bc3d5698SJohn Baldwin	sbbq	%rsi,%rsi
1947bc3d5698SJohn Baldwin
1948bc3d5698SJohn Baldwin	movl	$8,%ecx
1949bc3d5698SJohn Baldwin	jmp	.L8x_tail
1950bc3d5698SJohn Baldwin
1951bc3d5698SJohn Baldwin.align	32
1952bc3d5698SJohn Baldwin.L8x_tail_done:
/* Fold in the top carry parked at (rdx) and ripple it through all eight
 * accumulators; rax collects the carry out. */
1953bc3d5698SJohn Baldwin	xorq	%rax,%rax
1954bc3d5698SJohn Baldwin	addq	(%rdx),%r8
1955bc3d5698SJohn Baldwin	adcq	$0,%r9
1956bc3d5698SJohn Baldwin	adcq	$0,%r10
1957bc3d5698SJohn Baldwin	adcq	$0,%r11
1958bc3d5698SJohn Baldwin	adcq	$0,%r12
1959bc3d5698SJohn Baldwin	adcq	$0,%r13
1960bc3d5698SJohn Baldwin	adcq	$0,%r14
1961bc3d5698SJohn Baldwin	adcq	$0,%r15
1962bc3d5698SJohn Baldwin	adcq	$0,%rax
1963bc3d5698SJohn Baldwin
/* Restore the carry saved in rsi for the final add below. */
1964bc3d5698SJohn Baldwin	negq	%rsi
1965bc3d5698SJohn Baldwin.L8x_no_tail:
/* Add the eight buffer words at rdi (with incoming CF); rax accumulates
 * the top carry for the next window / for the caller. */
1966bc3d5698SJohn Baldwin	adcq	0(%rdi),%r8
1967bc3d5698SJohn Baldwin	adcq	8(%rdi),%r9
1968bc3d5698SJohn Baldwin	adcq	16(%rdi),%r10
1969bc3d5698SJohn Baldwin	adcq	24(%rdi),%r11
1970bc3d5698SJohn Baldwin	adcq	32(%rdi),%r12
1971bc3d5698SJohn Baldwin	adcq	40(%rdi),%r13
1972bc3d5698SJohn Baldwin	adcq	48(%rdi),%r14
1973bc3d5698SJohn Baldwin	adcq	56(%rdi),%r15
1974bc3d5698SJohn Baldwin	adcq	$0,%rax
/* rcx = word just below the advanced rbp (presumably the most significant
 * word of the array -- confirm how callers use it); rsi = 0. */
1975bc3d5698SJohn Baldwin	movq	-8(%rbp),%rcx
1976bc3d5698SJohn Baldwin	xorq	%rsi,%rsi
1977bc3d5698SJohn Baldwin
/* Encodes movq %xmm2,%rbp (66 REX.W 0F 7E /r): reload the array base. */
1978bc3d5698SJohn Baldwin.byte	102,72,15,126,213
1979bc3d5698SJohn Baldwin
1980bc3d5698SJohn Baldwin	movq	%r8,0(%rdi)
1981bc3d5698SJohn Baldwin	movq	%r9,8(%rdi)
/* Encodes movq %xmm3,%r9: reload r9, which was used as an accumulator. */
1982bc3d5698SJohn Baldwin.byte	102,73,15,126,217
1983bc3d5698SJohn Baldwin	movq	%r10,16(%rdi)
1984bc3d5698SJohn Baldwin	movq	%r11,24(%rdi)
1985bc3d5698SJohn Baldwin	movq	%r12,32(%rdi)
1986bc3d5698SJohn Baldwin	movq	%r13,40(%rdi)
1987bc3d5698SJohn Baldwin	movq	%r14,48(%rdi)
1988bc3d5698SJohn Baldwin	movq	%r15,56(%rdi)
1989bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
1990bc3d5698SJohn Baldwin
/* Repeat while rdi is still below the end-of-buffer pointer in rdx. */
1991bc3d5698SJohn Baldwin	cmpq	%rdx,%rdi
1992bc3d5698SJohn Baldwin	jb	.L8x_reduction_loop
/* 0xf3,0xc3 encodes "rep ret". */
1993bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1994bc3d5698SJohn Baldwin.cfi_endproc
1995bc3d5698SJohn Baldwin.size	bn_sqr8x_internal,.-bn_sqr8x_internal
/*
 * __bn_post4x_internal: masked add of complemented words, four 64-bit
 * words per iteration.
 *
 * For each word i: out[i] = src[i] + (~sub[i] & mask) + carry, where
 *   sub  = words at (%rbp),
 *   src  = words at entry %rdi + %r9,
 *   out  = words at the pointer held in the low 64 bits of %xmm1,
 *   mask = -(entry %rax).
 * Word 0 of sub is decremented before being complemented, and
 * ~(x - 1) == -x, so word 0 contributes the two's-complement "+1".
 * NOTE(review): with %rax = 1 this presumably yields src - sub (the
 * conditional final subtraction of the modulus) and with %rax = 0 a plain
 * copy; that relies on word 0 of sub being non-zero -- confirm.
 *
 * The loop counter is %r9 >> 5 (arithmetic) counted up to zero, so %r9 is
 * presumably a negative byte length that is a multiple of 32 -- confirm.
 *
 * On return: %r10 = entry %r9, %r9 = -(entry %r9), %rdi/%rbx/%rbp advanced
 * past the processed words, %rsi = low 64 bits of %xmm1.
 * Also modifies %rax, %rcx, %r12-%r15 and flags.
 */
1996bc3d5698SJohn Baldwin.type	__bn_post4x_internal,@function
1997bc3d5698SJohn Baldwin.align	32
1998bc3d5698SJohn Baldwin__bn_post4x_internal:
1999bc3d5698SJohn Baldwin.cfi_startproc
2000bc3d5698SJohn Baldwin	movq	0(%rbp),%r12
/* rbx = src pointer; must be formed before rdi is overwritten below. */
2001bc3d5698SJohn Baldwin	leaq	(%rdi,%r9,1),%rbx
2002bc3d5698SJohn Baldwin	movq	%r9,%rcx
/* Encodes movq %xmm1,%rdi (66 REX.W 0F 7E /r): output pointer. */
2003bc3d5698SJohn Baldwin.byte	102,72,15,126,207
/* rax becomes the AND mask. */
2004bc3d5698SJohn Baldwin	negq	%rax
/* Encodes movq %xmm1,%rsi. */
2005bc3d5698SJohn Baldwin.byte	102,72,15,126,206
/* rcx = r9 / 32 (8 bytes per word, 4 words per iteration). */
2006bc3d5698SJohn Baldwin	sarq	$3+2,%rcx
/* Word 0 only: pre-decrement so that the notq below produces -word0. */
2007bc3d5698SJohn Baldwin	decq	%r12
/* r10 = 0: no incoming carry for the first iteration. */
2008bc3d5698SJohn Baldwin	xorq	%r10,%r10
2009bc3d5698SJohn Baldwin	movq	8(%rbp),%r13
2010bc3d5698SJohn Baldwin	movq	16(%rbp),%r14
2011bc3d5698SJohn Baldwin	movq	24(%rbp),%r15
2012bc3d5698SJohn Baldwin	jmp	.Lsqr4x_sub_entry
2013bc3d5698SJohn Baldwin
2014bc3d5698SJohn Baldwin.align	16
2015bc3d5698SJohn Baldwin.Lsqr4x_sub:
2016bc3d5698SJohn Baldwin	movq	0(%rbp),%r12
2017bc3d5698SJohn Baldwin	movq	8(%rbp),%r13
2018bc3d5698SJohn Baldwin	movq	16(%rbp),%r14
2019bc3d5698SJohn Baldwin	movq	24(%rbp),%r15
2020bc3d5698SJohn Baldwin.Lsqr4x_sub_entry:
2021bc3d5698SJohn Baldwin	leaq	32(%rbp),%rbp
2022bc3d5698SJohn Baldwin	notq	%r12
2023bc3d5698SJohn Baldwin	notq	%r13
2024bc3d5698SJohn Baldwin	notq	%r14
2025bc3d5698SJohn Baldwin	notq	%r15
2026bc3d5698SJohn Baldwin	andq	%rax,%r12
2027bc3d5698SJohn Baldwin	andq	%rax,%r13
2028bc3d5698SJohn Baldwin	andq	%rax,%r14
2029bc3d5698SJohn Baldwin	andq	%rax,%r15
2030bc3d5698SJohn Baldwin
/* negq re-creates CF from the 0 / -1 saved in r10 by the previous
 * iteration's sbbq (the not/and instructions above clobber flags). */
2031bc3d5698SJohn Baldwin	negq	%r10
2032bc3d5698SJohn Baldwin	adcq	0(%rbx),%r12
2033bc3d5698SJohn Baldwin	adcq	8(%rbx),%r13
2034bc3d5698SJohn Baldwin	adcq	16(%rbx),%r14
2035bc3d5698SJohn Baldwin	adcq	24(%rbx),%r15
2036bc3d5698SJohn Baldwin	movq	%r12,0(%rdi)
2037bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
2038bc3d5698SJohn Baldwin	movq	%r13,8(%rdi)
/* Save the carry out as 0 / -1; movq and leaq leave CF untouched. */
2039bc3d5698SJohn Baldwin	sbbq	%r10,%r10
2040bc3d5698SJohn Baldwin	movq	%r14,16(%rdi)
2041bc3d5698SJohn Baldwin	movq	%r15,24(%rdi)
2042bc3d5698SJohn Baldwin	leaq	32(%rdi),%rdi
2043bc3d5698SJohn Baldwin
2044bc3d5698SJohn Baldwin	incq	%rcx
2045bc3d5698SJohn Baldwin	jnz	.Lsqr4x_sub
2046bc3d5698SJohn Baldwin
/* Hand back the length in both signs: r10 = entry r9, r9 = -entry r9. */
2047bc3d5698SJohn Baldwin	movq	%r9,%r10
2048bc3d5698SJohn Baldwin	negq	%r9
/* 0xf3,0xc3 encodes "rep ret". */
2049bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
2050bc3d5698SJohn Baldwin.cfi_endproc
2051bc3d5698SJohn Baldwin.size	__bn_post4x_internal,.-__bn_post4x_internal
/*
 * bn_mulx4x_mont_gather5: frame-setup wrapper around mulx4x_internal.
 *
 * Saves the callee-saved registers, carves an aligned scratch frame out of
 * the stack, stores two values that mulx4x_internal reads back from the
 * frame, calls it, then restores registers and returns 1 in %rax.
 *
 * Registers read here: %rdi (only its address, for frame placement),
 * %r8 (dereferenced once), %r9d (scaled by 8).  %rsi, %rdx and %rcx pass
 * through untouched to mulx4x_internal.
 * NOTE(review): presumably the SysV arguments are (rp, ap, table, np,
 * &n0, num, power) as for bn_mul_mont_gather5 -- confirm against the
 * C prototype.
 */
2052bc3d5698SJohn Baldwin.type	bn_mulx4x_mont_gather5,@function
2053bc3d5698SJohn Baldwin.align	32
2054bc3d5698SJohn Baldwinbn_mulx4x_mont_gather5:
2055bc3d5698SJohn Baldwin.cfi_startproc
/* rax keeps the entry stack pointer; it is the CFA base until the frame
 * slot at 40(%rsp) is written and is used to find the saved registers. */
2056bc3d5698SJohn Baldwin	movq	%rsp,%rax
2057bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rax
/* Secondary entry label; presumably reached by jumps from code that has
 * already copied rsp to rax -- confirm. */
2058bc3d5698SJohn Baldwin.Lmulx4x_enter:
2059bc3d5698SJohn Baldwin	pushq	%rbx
2060bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
2061bc3d5698SJohn Baldwin	pushq	%rbp
2062bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
2063bc3d5698SJohn Baldwin	pushq	%r12
2064bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
2065bc3d5698SJohn Baldwin	pushq	%r13
2066bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
2067bc3d5698SJohn Baldwin	pushq	%r14
2068bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
2069bc3d5698SJohn Baldwin	pushq	%r15
2070bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
2071bc3d5698SJohn Baldwin.Lmulx4x_prologue:
2072bc3d5698SJohn Baldwin
/* r9 = 8 * r9d (the 32-bit shift zero-extends; presumably word count to
 * byte count), r10 = 3 * r9, then r9 = -r9.  r8 = 64-bit value at (r8). */
2073bc3d5698SJohn Baldwin	shll	$3,%r9d
2074bc3d5698SJohn Baldwin	leaq	(%r9,%r9,2),%r10
2075bc3d5698SJohn Baldwin	negq	%r9
2076bc3d5698SJohn Baldwin	movq	(%r8),%r8
2077bc3d5698SJohn Baldwin
2078bc3d5698SJohn Baldwin
2079bc3d5698SJohn Baldwin
2080bc3d5698SJohn Baldwin
2081bc3d5698SJohn Baldwin
2082bc3d5698SJohn Baldwin
2083bc3d5698SJohn Baldwin
2084bc3d5698SJohn Baldwin
2085bc3d5698SJohn Baldwin
2086bc3d5698SJohn Baldwin
/* Candidate frame base = rsp - 320 - 2*bytes.  r11 = (candidate - rdi)
 * mod 4096.  If 3*bytes >= r11 the frame is moved down by r11 so that it
 * has the same page offset as rdi; otherwise the alternative path below
 * is taken.  NOTE(review): presumably this controls how the frame and
 * the buffer at rdi alias modulo the page size -- confirm intent. */
2087bc3d5698SJohn Baldwin	leaq	-320(%rsp,%r9,2),%r11
2088bc3d5698SJohn Baldwin	movq	%rsp,%rbp
2089bc3d5698SJohn Baldwin	subq	%rdi,%r11
2090bc3d5698SJohn Baldwin	andq	$4095,%r11
2091bc3d5698SJohn Baldwin	cmpq	%r11,%r10
2092bc3d5698SJohn Baldwin	jb	.Lmulx4xsp_alt
2093bc3d5698SJohn Baldwin	subq	%r11,%rbp
2094bc3d5698SJohn Baldwin	leaq	-320(%rbp,%r9,2),%rbp
2095bc3d5698SJohn Baldwin	jmp	.Lmulx4xsp_done
2096bc3d5698SJohn Baldwin
2097bc3d5698SJohn Baldwin.Lmulx4xsp_alt:
/* Move the frame down by max(0, r11 - (4096 - 320 - 2*bytes)). */
2098bc3d5698SJohn Baldwin	leaq	4096-320(,%r9,2),%r10
2099bc3d5698SJohn Baldwin	leaq	-320(%rbp,%r9,2),%rbp
2100bc3d5698SJohn Baldwin	subq	%r10,%r11
/* movq (not xorq) is used to zero r10 because it preserves the CF produced
 * by the subq above, which cmovcq consumes. */
2101bc3d5698SJohn Baldwin	movq	$0,%r10
2102bc3d5698SJohn Baldwin	cmovcq	%r10,%r11
2103bc3d5698SJohn Baldwin	subq	%r11,%rbp
2104bc3d5698SJohn Baldwin.Lmulx4xsp_done:
/* Align the frame base down to 64 bytes, then set rsp to the frame base
 * plus the page-rounded distance from the old rsp, so that rsp can be
 * lowered to rbp in 4096-byte steps. */
2105bc3d5698SJohn Baldwin	andq	$-64,%rbp
2106bc3d5698SJohn Baldwin	movq	%rsp,%r11
2107bc3d5698SJohn Baldwin	subq	%rbp,%r11
2108bc3d5698SJohn Baldwin	andq	$-4096,%r11
2109bc3d5698SJohn Baldwin	leaq	(%r11,%rbp,1),%rsp
/* The loads into r10 here and in the loop only touch the memory; the
 * value is not used in this function. */
2110bc3d5698SJohn Baldwin	movq	(%rsp),%r10
2111bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
2112bc3d5698SJohn Baldwin	ja	.Lmulx4x_page_walk
2113bc3d5698SJohn Baldwin	jmp	.Lmulx4x_page_walk_done
2114bc3d5698SJohn Baldwin
2115bc3d5698SJohn Baldwin.Lmulx4x_page_walk:
/* Step rsp down one 4096-byte page at a time, reading each page, until it
 * reaches rbp (presumably so stack guard pages are hit in order). */
2116bc3d5698SJohn Baldwin	leaq	-4096(%rsp),%rsp
2117bc3d5698SJohn Baldwin	movq	(%rsp),%r10
2118bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
2119bc3d5698SJohn Baldwin	ja	.Lmulx4x_page_walk
2120bc3d5698SJohn Baldwin.Lmulx4x_page_walk_done:
2121bc3d5698SJohn Baldwin
2122bc3d5698SJohn Baldwin
2123bc3d5698SJohn Baldwin
2124bc3d5698SJohn Baldwin
2125bc3d5698SJohn Baldwin
2126bc3d5698SJohn Baldwin
2127bc3d5698SJohn Baldwin
2128bc3d5698SJohn Baldwin
2129bc3d5698SJohn Baldwin
2130bc3d5698SJohn Baldwin
2131bc3d5698SJohn Baldwin
2132bc3d5698SJohn Baldwin
2133bc3d5698SJohn Baldwin
/* Frame slots: 32(%rsp) = value loaded from (r8), 40(%rsp) = entry rsp.
 * Inside the callee these appear 8 bytes higher because of the pushed
 * return address (mulx4x_internal reads 32+8(%rsp)). */
2134bc3d5698SJohn Baldwin	movq	%r8,32(%rsp)
2135bc3d5698SJohn Baldwin	movq	%rax,40(%rsp)
/* DWARF: DW_CFA_def_cfa_expression, 5 bytes: DW_OP_breg7(rsp)+40,
 * DW_OP_deref, DW_OP_plus_uconst 8, i.e. CFA = *(rsp + 40) + 8. */
2136bc3d5698SJohn Baldwin.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2137bc3d5698SJohn Baldwin.Lmulx4x_body:
2138bc3d5698SJohn Baldwin	call	mulx4x_internal
2139bc3d5698SJohn Baldwin
/* rsi = entry rsp saved above; saved registers sit just below it. */
2140bc3d5698SJohn Baldwin	movq	40(%rsp),%rsi
2141bc3d5698SJohn Baldwin.cfi_def_cfa	%rsi,8
/* Return value: always 1. */
2142bc3d5698SJohn Baldwin	movq	$1,%rax
2143bc3d5698SJohn Baldwin
2144bc3d5698SJohn Baldwin	movq	-48(%rsi),%r15
2145bc3d5698SJohn Baldwin.cfi_restore	%r15
2146bc3d5698SJohn Baldwin	movq	-40(%rsi),%r14
2147bc3d5698SJohn Baldwin.cfi_restore	%r14
2148bc3d5698SJohn Baldwin	movq	-32(%rsi),%r13
2149bc3d5698SJohn Baldwin.cfi_restore	%r13
2150bc3d5698SJohn Baldwin	movq	-24(%rsi),%r12
2151bc3d5698SJohn Baldwin.cfi_restore	%r12
2152bc3d5698SJohn Baldwin	movq	-16(%rsi),%rbp
2153bc3d5698SJohn Baldwin.cfi_restore	%rbp
2154bc3d5698SJohn Baldwin	movq	-8(%rsi),%rbx
2155bc3d5698SJohn Baldwin.cfi_restore	%rbx
/* Drop the whole scratch frame in one step by restoring the entry rsp. */
2156bc3d5698SJohn Baldwin	leaq	(%rsi),%rsp
2157bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
2158bc3d5698SJohn Baldwin.Lmulx4x_epilogue:
/* 0xf3,0xc3 encodes "rep ret". */
2159bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
2160bc3d5698SJohn Baldwin.cfi_endproc
2161bc3d5698SJohn Baldwin.size	bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5
2162bc3d5698SJohn Baldwin
2163bc3d5698SJohn Baldwin.type	mulx4x_internal,@function
2164bc3d5698SJohn Baldwin.align	32
2165bc3d5698SJohn Baldwinmulx4x_internal:
2166bc3d5698SJohn Baldwin.cfi_startproc
2167bc3d5698SJohn Baldwin	movq	%r9,8(%rsp)
2168bc3d5698SJohn Baldwin	movq	%r9,%r10
2169bc3d5698SJohn Baldwin	negq	%r9
2170bc3d5698SJohn Baldwin	shlq	$5,%r9
2171bc3d5698SJohn Baldwin	negq	%r10
2172bc3d5698SJohn Baldwin	leaq	128(%rdx,%r9,1),%r13
2173bc3d5698SJohn Baldwin	shrq	$5+5,%r9
2174bc3d5698SJohn Baldwin	movd	8(%rax),%xmm5
2175bc3d5698SJohn Baldwin	subq	$1,%r9
2176bc3d5698SJohn Baldwin	leaq	.Linc(%rip),%rax
2177bc3d5698SJohn Baldwin	movq	%r13,16+8(%rsp)
2178bc3d5698SJohn Baldwin	movq	%r9,24+8(%rsp)
2179bc3d5698SJohn Baldwin	movq	%rdi,56+8(%rsp)
2180bc3d5698SJohn Baldwin	movdqa	0(%rax),%xmm0
2181bc3d5698SJohn Baldwin	movdqa	16(%rax),%xmm1
2182bc3d5698SJohn Baldwin	leaq	88-112(%rsp,%r10,1),%r10
2183bc3d5698SJohn Baldwin	leaq	128(%rdx),%rdi
2184bc3d5698SJohn Baldwin
2185bc3d5698SJohn Baldwin	pshufd	$0,%xmm5,%xmm5
2186bc3d5698SJohn Baldwin	movdqa	%xmm1,%xmm4
2187bc3d5698SJohn Baldwin.byte	0x67
2188bc3d5698SJohn Baldwin	movdqa	%xmm1,%xmm2
2189bc3d5698SJohn Baldwin.byte	0x67
2190bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
2191bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
2192bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
2193bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
2194bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
2195bc3d5698SJohn Baldwin	movdqa	%xmm0,112(%r10)
2196bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
2197bc3d5698SJohn Baldwin
2198bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
2199bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
2200bc3d5698SJohn Baldwin	movdqa	%xmm1,128(%r10)
2201bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
2202bc3d5698SJohn Baldwin
2203bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
2204bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
2205bc3d5698SJohn Baldwin	movdqa	%xmm2,144(%r10)
2206bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
2207bc3d5698SJohn Baldwin
2208bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
2209bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
2210bc3d5698SJohn Baldwin	movdqa	%xmm3,160(%r10)
2211bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
2212bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
2213bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
2214bc3d5698SJohn Baldwin	movdqa	%xmm0,176(%r10)
2215bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
2216bc3d5698SJohn Baldwin
2217bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
2218bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
2219bc3d5698SJohn Baldwin	movdqa	%xmm1,192(%r10)
2220bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
2221bc3d5698SJohn Baldwin
2222bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
2223bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
2224bc3d5698SJohn Baldwin	movdqa	%xmm2,208(%r10)
2225bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
2226bc3d5698SJohn Baldwin
2227bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
2228bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
2229bc3d5698SJohn Baldwin	movdqa	%xmm3,224(%r10)
2230bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
2231bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
2232bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
2233bc3d5698SJohn Baldwin	movdqa	%xmm0,240(%r10)
2234bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
2235bc3d5698SJohn Baldwin
2236bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
2237bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
2238bc3d5698SJohn Baldwin	movdqa	%xmm1,256(%r10)
2239bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
2240bc3d5698SJohn Baldwin
2241bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
2242bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
2243bc3d5698SJohn Baldwin	movdqa	%xmm2,272(%r10)
2244bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
2245bc3d5698SJohn Baldwin
2246bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
2247bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
2248bc3d5698SJohn Baldwin	movdqa	%xmm3,288(%r10)
2249bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
2250bc3d5698SJohn Baldwin.byte	0x67
2251bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
2252bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
2253bc3d5698SJohn Baldwin	movdqa	%xmm0,304(%r10)
2254bc3d5698SJohn Baldwin
2255bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
2256bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
2257bc3d5698SJohn Baldwin	movdqa	%xmm1,320(%r10)
2258bc3d5698SJohn Baldwin
2259bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
2260bc3d5698SJohn Baldwin	movdqa	%xmm2,336(%r10)
2261bc3d5698SJohn Baldwin
2262bc3d5698SJohn Baldwin	pand	64(%rdi),%xmm0
2263bc3d5698SJohn Baldwin	pand	80(%rdi),%xmm1
2264bc3d5698SJohn Baldwin	pand	96(%rdi),%xmm2
2265bc3d5698SJohn Baldwin	movdqa	%xmm3,352(%r10)
2266bc3d5698SJohn Baldwin	pand	112(%rdi),%xmm3
2267bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
2268bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
2269bc3d5698SJohn Baldwin	movdqa	-128(%rdi),%xmm4
2270bc3d5698SJohn Baldwin	movdqa	-112(%rdi),%xmm5
2271bc3d5698SJohn Baldwin	movdqa	-96(%rdi),%xmm2
2272bc3d5698SJohn Baldwin	pand	112(%r10),%xmm4
2273bc3d5698SJohn Baldwin	movdqa	-80(%rdi),%xmm3
2274bc3d5698SJohn Baldwin	pand	128(%r10),%xmm5
2275bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
2276bc3d5698SJohn Baldwin	pand	144(%r10),%xmm2
2277bc3d5698SJohn Baldwin	por	%xmm5,%xmm1
2278bc3d5698SJohn Baldwin	pand	160(%r10),%xmm3
2279bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
2280bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
2281bc3d5698SJohn Baldwin	movdqa	-64(%rdi),%xmm4
2282bc3d5698SJohn Baldwin	movdqa	-48(%rdi),%xmm5
2283bc3d5698SJohn Baldwin	movdqa	-32(%rdi),%xmm2
2284bc3d5698SJohn Baldwin	pand	176(%r10),%xmm4
2285bc3d5698SJohn Baldwin	movdqa	-16(%rdi),%xmm3
2286bc3d5698SJohn Baldwin	pand	192(%r10),%xmm5
2287bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
2288bc3d5698SJohn Baldwin	pand	208(%r10),%xmm2
2289bc3d5698SJohn Baldwin	por	%xmm5,%xmm1
2290bc3d5698SJohn Baldwin	pand	224(%r10),%xmm3
2291bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
2292bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
2293bc3d5698SJohn Baldwin	movdqa	0(%rdi),%xmm4
2294bc3d5698SJohn Baldwin	movdqa	16(%rdi),%xmm5
2295bc3d5698SJohn Baldwin	movdqa	32(%rdi),%xmm2
2296bc3d5698SJohn Baldwin	pand	240(%r10),%xmm4
2297bc3d5698SJohn Baldwin	movdqa	48(%rdi),%xmm3
2298bc3d5698SJohn Baldwin	pand	256(%r10),%xmm5
2299bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
2300bc3d5698SJohn Baldwin	pand	272(%r10),%xmm2
2301bc3d5698SJohn Baldwin	por	%xmm5,%xmm1
2302bc3d5698SJohn Baldwin	pand	288(%r10),%xmm3
2303bc3d5698SJohn Baldwin	por	%xmm2,%xmm0
2304bc3d5698SJohn Baldwin	por	%xmm3,%xmm1
2305bc3d5698SJohn Baldwin	pxor	%xmm1,%xmm0
2306bc3d5698SJohn Baldwin	pshufd	$0x4e,%xmm0,%xmm1
2307bc3d5698SJohn Baldwin	por	%xmm1,%xmm0
2308bc3d5698SJohn Baldwin	leaq	256(%rdi),%rdi
2309bc3d5698SJohn Baldwin.byte	102,72,15,126,194
2310bc3d5698SJohn Baldwin	leaq	64+32+8(%rsp),%rbx
2311bc3d5698SJohn Baldwin
2312bc3d5698SJohn Baldwin	movq	%rdx,%r9
2313bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r8,%rax
2314bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r11,%r12
2315bc3d5698SJohn Baldwin	addq	%rax,%r11
2316bc3d5698SJohn Baldwin	mulxq	16(%rsi),%rax,%r13
2317bc3d5698SJohn Baldwin	adcq	%rax,%r12
2318bc3d5698SJohn Baldwin	adcq	$0,%r13
2319bc3d5698SJohn Baldwin	mulxq	24(%rsi),%rax,%r14
2320bc3d5698SJohn Baldwin
2321bc3d5698SJohn Baldwin	movq	%r8,%r15
2322bc3d5698SJohn Baldwin	imulq	32+8(%rsp),%r8
2323bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
2324bc3d5698SJohn Baldwin	movq	%r8,%rdx
2325bc3d5698SJohn Baldwin
2326bc3d5698SJohn Baldwin	movq	%rdi,8+8(%rsp)
2327bc3d5698SJohn Baldwin
2328bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
2329bc3d5698SJohn Baldwin	adcxq	%rax,%r13
2330bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
2331bc3d5698SJohn Baldwin
2332bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r10
2333bc3d5698SJohn Baldwin	adcxq	%rax,%r15
2334bc3d5698SJohn Baldwin	adoxq	%r11,%r10
2335bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r11
2336bc3d5698SJohn Baldwin	adcxq	%rax,%r10
2337bc3d5698SJohn Baldwin	adoxq	%r12,%r11
2338bc3d5698SJohn Baldwin	mulxq	16(%rcx),%rax,%r12
2339bc3d5698SJohn Baldwin	movq	24+8(%rsp),%rdi
2340bc3d5698SJohn Baldwin	movq	%r10,-32(%rbx)
2341bc3d5698SJohn Baldwin	adcxq	%rax,%r11
2342bc3d5698SJohn Baldwin	adoxq	%r13,%r12
2343bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
2344bc3d5698SJohn Baldwin	movq	%r9,%rdx
2345bc3d5698SJohn Baldwin	movq	%r11,-24(%rbx)
2346bc3d5698SJohn Baldwin	adcxq	%rax,%r12
2347bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
2348bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
2349bc3d5698SJohn Baldwin	movq	%r12,-16(%rbx)
2350bc3d5698SJohn Baldwin	jmp	.Lmulx4x_1st
2351bc3d5698SJohn Baldwin
2352bc3d5698SJohn Baldwin.align	32
2353bc3d5698SJohn Baldwin.Lmulx4x_1st:
2354bc3d5698SJohn Baldwin	adcxq	%rbp,%r15
2355bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r10,%rax
2356bc3d5698SJohn Baldwin	adcxq	%r14,%r10
2357bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r11,%r14
2358bc3d5698SJohn Baldwin	adcxq	%rax,%r11
2359bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r12,%rax
2360bc3d5698SJohn Baldwin	adcxq	%r14,%r12
2361bc3d5698SJohn Baldwin	mulxq	24(%rsi),%r13,%r14
2362bc3d5698SJohn Baldwin.byte	0x67,0x67
2363bc3d5698SJohn Baldwin	movq	%r8,%rdx
2364bc3d5698SJohn Baldwin	adcxq	%rax,%r13
2365bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
2366bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
2367bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
2368bc3d5698SJohn Baldwin
2369bc3d5698SJohn Baldwin	adoxq	%r15,%r10
2370bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r15
2371bc3d5698SJohn Baldwin	adcxq	%rax,%r10
2372bc3d5698SJohn Baldwin	adoxq	%r15,%r11
2373bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r15
2374bc3d5698SJohn Baldwin	adcxq	%rax,%r11
2375bc3d5698SJohn Baldwin	adoxq	%r15,%r12
2376bc3d5698SJohn Baldwin	mulxq	16(%rcx),%rax,%r15
2377bc3d5698SJohn Baldwin	movq	%r10,-40(%rbx)
2378bc3d5698SJohn Baldwin	adcxq	%rax,%r12
2379bc3d5698SJohn Baldwin	movq	%r11,-32(%rbx)
2380bc3d5698SJohn Baldwin	adoxq	%r15,%r13
2381bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
2382bc3d5698SJohn Baldwin	movq	%r9,%rdx
2383bc3d5698SJohn Baldwin	movq	%r12,-24(%rbx)
2384bc3d5698SJohn Baldwin	adcxq	%rax,%r13
2385bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
2386bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
2387bc3d5698SJohn Baldwin	movq	%r13,-16(%rbx)
2388bc3d5698SJohn Baldwin
2389bc3d5698SJohn Baldwin	decq	%rdi
2390bc3d5698SJohn Baldwin	jnz	.Lmulx4x_1st
2391bc3d5698SJohn Baldwin
2392bc3d5698SJohn Baldwin	movq	8(%rsp),%rax
2393bc3d5698SJohn Baldwin	adcq	%rbp,%r15
2394bc3d5698SJohn Baldwin	leaq	(%rsi,%rax,1),%rsi
2395bc3d5698SJohn Baldwin	addq	%r15,%r14
2396bc3d5698SJohn Baldwin	movq	8+8(%rsp),%rdi
2397bc3d5698SJohn Baldwin	adcq	%rbp,%rbp
2398bc3d5698SJohn Baldwin	movq	%r14,-8(%rbx)
2399bc3d5698SJohn Baldwin	jmp	.Lmulx4x_outer
2400bc3d5698SJohn Baldwin
2401bc3d5698SJohn Baldwin.align	32
2402bc3d5698SJohn Baldwin.Lmulx4x_outer:
2403bc3d5698SJohn Baldwin	leaq	16-256(%rbx),%r10
2404bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm4
2405bc3d5698SJohn Baldwin.byte	0x67,0x67
2406bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm5
2407bc3d5698SJohn Baldwin	movdqa	-128(%rdi),%xmm0
2408bc3d5698SJohn Baldwin	movdqa	-112(%rdi),%xmm1
2409bc3d5698SJohn Baldwin	movdqa	-96(%rdi),%xmm2
2410bc3d5698SJohn Baldwin	pand	256(%r10),%xmm0
2411bc3d5698SJohn Baldwin	movdqa	-80(%rdi),%xmm3
2412bc3d5698SJohn Baldwin	pand	272(%r10),%xmm1
2413bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
2414bc3d5698SJohn Baldwin	pand	288(%r10),%xmm2
2415bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
2416bc3d5698SJohn Baldwin	pand	304(%r10),%xmm3
2417bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
2418bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
2419bc3d5698SJohn Baldwin	movdqa	-64(%rdi),%xmm0
2420bc3d5698SJohn Baldwin	movdqa	-48(%rdi),%xmm1
2421bc3d5698SJohn Baldwin	movdqa	-32(%rdi),%xmm2
2422bc3d5698SJohn Baldwin	pand	320(%r10),%xmm0
2423bc3d5698SJohn Baldwin	movdqa	-16(%rdi),%xmm3
2424bc3d5698SJohn Baldwin	pand	336(%r10),%xmm1
2425bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
2426bc3d5698SJohn Baldwin	pand	352(%r10),%xmm2
2427bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
2428bc3d5698SJohn Baldwin	pand	368(%r10),%xmm3
2429bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
2430bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
2431bc3d5698SJohn Baldwin	movdqa	0(%rdi),%xmm0
2432bc3d5698SJohn Baldwin	movdqa	16(%rdi),%xmm1
2433bc3d5698SJohn Baldwin	movdqa	32(%rdi),%xmm2
2434bc3d5698SJohn Baldwin	pand	384(%r10),%xmm0
2435bc3d5698SJohn Baldwin	movdqa	48(%rdi),%xmm3
2436bc3d5698SJohn Baldwin	pand	400(%r10),%xmm1
2437bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
2438bc3d5698SJohn Baldwin	pand	416(%r10),%xmm2
2439bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
2440bc3d5698SJohn Baldwin	pand	432(%r10),%xmm3
2441bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
2442bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
2443bc3d5698SJohn Baldwin	movdqa	64(%rdi),%xmm0
2444bc3d5698SJohn Baldwin	movdqa	80(%rdi),%xmm1
2445bc3d5698SJohn Baldwin	movdqa	96(%rdi),%xmm2
2446bc3d5698SJohn Baldwin	pand	448(%r10),%xmm0
2447bc3d5698SJohn Baldwin	movdqa	112(%rdi),%xmm3
2448bc3d5698SJohn Baldwin	pand	464(%r10),%xmm1
2449bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
2450bc3d5698SJohn Baldwin	pand	480(%r10),%xmm2
2451bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
2452bc3d5698SJohn Baldwin	pand	496(%r10),%xmm3
2453bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
2454bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
2455bc3d5698SJohn Baldwin	por	%xmm5,%xmm4
2456bc3d5698SJohn Baldwin	pshufd	$0x4e,%xmm4,%xmm0
2457bc3d5698SJohn Baldwin	por	%xmm4,%xmm0
2458bc3d5698SJohn Baldwin	leaq	256(%rdi),%rdi
2459bc3d5698SJohn Baldwin.byte	102,72,15,126,194
2460bc3d5698SJohn Baldwin
2461bc3d5698SJohn Baldwin	movq	%rbp,(%rbx)
2462bc3d5698SJohn Baldwin	leaq	32(%rbx,%rax,1),%rbx
2463bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r8,%r11
2464bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
2465bc3d5698SJohn Baldwin	movq	%rdx,%r9
2466bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r14,%r12
2467bc3d5698SJohn Baldwin	adoxq	-32(%rbx),%r8
2468bc3d5698SJohn Baldwin	adcxq	%r14,%r11
2469bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r15,%r13
2470bc3d5698SJohn Baldwin	adoxq	-24(%rbx),%r11
2471bc3d5698SJohn Baldwin	adcxq	%r15,%r12
2472bc3d5698SJohn Baldwin	mulxq	24(%rsi),%rdx,%r14
2473bc3d5698SJohn Baldwin	adoxq	-16(%rbx),%r12
2474bc3d5698SJohn Baldwin	adcxq	%rdx,%r13
2475bc3d5698SJohn Baldwin	leaq	(%rcx,%rax,1),%rcx
2476bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
2477bc3d5698SJohn Baldwin	adoxq	-8(%rbx),%r13
2478bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
2479bc3d5698SJohn Baldwin	adoxq	%rbp,%r14
2480bc3d5698SJohn Baldwin
2481bc3d5698SJohn Baldwin	movq	%r8,%r15
2482bc3d5698SJohn Baldwin	imulq	32+8(%rsp),%r8
2483bc3d5698SJohn Baldwin
2484bc3d5698SJohn Baldwin	movq	%r8,%rdx
2485bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
2486bc3d5698SJohn Baldwin	movq	%rdi,8+8(%rsp)
2487bc3d5698SJohn Baldwin
2488bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r10
2489bc3d5698SJohn Baldwin	adcxq	%rax,%r15
2490bc3d5698SJohn Baldwin	adoxq	%r11,%r10
2491bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r11
2492bc3d5698SJohn Baldwin	adcxq	%rax,%r10
2493bc3d5698SJohn Baldwin	adoxq	%r12,%r11
2494bc3d5698SJohn Baldwin	mulxq	16(%rcx),%rax,%r12
2495bc3d5698SJohn Baldwin	adcxq	%rax,%r11
2496bc3d5698SJohn Baldwin	adoxq	%r13,%r12
2497bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
2498bc3d5698SJohn Baldwin	movq	%r9,%rdx
2499bc3d5698SJohn Baldwin	movq	24+8(%rsp),%rdi
2500bc3d5698SJohn Baldwin	movq	%r10,-32(%rbx)
2501bc3d5698SJohn Baldwin	adcxq	%rax,%r12
2502bc3d5698SJohn Baldwin	movq	%r11,-24(%rbx)
2503bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
2504bc3d5698SJohn Baldwin	movq	%r12,-16(%rbx)
2505bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
2506bc3d5698SJohn Baldwin	jmp	.Lmulx4x_inner
2507bc3d5698SJohn Baldwin
2508bc3d5698SJohn Baldwin.align	32
2509bc3d5698SJohn Baldwin.Lmulx4x_inner:
2510bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r10,%rax
2511bc3d5698SJohn Baldwin	adcxq	%rbp,%r15
2512bc3d5698SJohn Baldwin	adoxq	%r14,%r10
2513bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r11,%r14
2514bc3d5698SJohn Baldwin	adcxq	0(%rbx),%r10
2515bc3d5698SJohn Baldwin	adoxq	%rax,%r11
2516bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r12,%rax
2517bc3d5698SJohn Baldwin	adcxq	8(%rbx),%r11
2518bc3d5698SJohn Baldwin	adoxq	%r14,%r12
2519bc3d5698SJohn Baldwin	mulxq	24(%rsi),%r13,%r14
2520bc3d5698SJohn Baldwin	movq	%r8,%rdx
2521bc3d5698SJohn Baldwin	adcxq	16(%rbx),%r12
2522bc3d5698SJohn Baldwin	adoxq	%rax,%r13
2523bc3d5698SJohn Baldwin	adcxq	24(%rbx),%r13
2524bc3d5698SJohn Baldwin	adoxq	%rbp,%r14
2525bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
2526bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
2527bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
2528bc3d5698SJohn Baldwin
2529bc3d5698SJohn Baldwin	adoxq	%r15,%r10
2530bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r15
2531bc3d5698SJohn Baldwin	adcxq	%rax,%r10
2532bc3d5698SJohn Baldwin	adoxq	%r15,%r11
2533bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r15
2534bc3d5698SJohn Baldwin	adcxq	%rax,%r11
2535bc3d5698SJohn Baldwin	adoxq	%r15,%r12
2536bc3d5698SJohn Baldwin	mulxq	16(%rcx),%rax,%r15
2537bc3d5698SJohn Baldwin	movq	%r10,-40(%rbx)
2538bc3d5698SJohn Baldwin	adcxq	%rax,%r12
2539bc3d5698SJohn Baldwin	adoxq	%r15,%r13
2540bc3d5698SJohn Baldwin	movq	%r11,-32(%rbx)
2541bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
2542bc3d5698SJohn Baldwin	movq	%r9,%rdx
2543bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
2544bc3d5698SJohn Baldwin	movq	%r12,-24(%rbx)
2545bc3d5698SJohn Baldwin	adcxq	%rax,%r13
2546bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
2547bc3d5698SJohn Baldwin	movq	%r13,-16(%rbx)
2548bc3d5698SJohn Baldwin
2549bc3d5698SJohn Baldwin	decq	%rdi
2550bc3d5698SJohn Baldwin	jnz	.Lmulx4x_inner
2551bc3d5698SJohn Baldwin
2552bc3d5698SJohn Baldwin	movq	0+8(%rsp),%rax
2553bc3d5698SJohn Baldwin	adcq	%rbp,%r15
2554bc3d5698SJohn Baldwin	subq	0(%rbx),%rdi
2555bc3d5698SJohn Baldwin	movq	8+8(%rsp),%rdi
2556bc3d5698SJohn Baldwin	movq	16+8(%rsp),%r10
2557bc3d5698SJohn Baldwin	adcq	%r15,%r14
2558bc3d5698SJohn Baldwin	leaq	(%rsi,%rax,1),%rsi
2559bc3d5698SJohn Baldwin	adcq	%rbp,%rbp
2560bc3d5698SJohn Baldwin	movq	%r14,-8(%rbx)
2561bc3d5698SJohn Baldwin
2562bc3d5698SJohn Baldwin	cmpq	%r10,%rdi
2563bc3d5698SJohn Baldwin	jb	.Lmulx4x_outer
2564bc3d5698SJohn Baldwin
2565bc3d5698SJohn Baldwin	movq	-8(%rcx),%r10
2566bc3d5698SJohn Baldwin	movq	%rbp,%r8
2567bc3d5698SJohn Baldwin	movq	(%rcx,%rax,1),%r12
2568bc3d5698SJohn Baldwin	leaq	(%rcx,%rax,1),%rbp
2569bc3d5698SJohn Baldwin	movq	%rax,%rcx
2570bc3d5698SJohn Baldwin	leaq	(%rbx,%rax,1),%rdi
2571bc3d5698SJohn Baldwin	xorl	%eax,%eax
2572bc3d5698SJohn Baldwin	xorq	%r15,%r15
2573bc3d5698SJohn Baldwin	subq	%r14,%r10
2574bc3d5698SJohn Baldwin	adcq	%r15,%r15
2575bc3d5698SJohn Baldwin	orq	%r15,%r8
2576bc3d5698SJohn Baldwin	sarq	$3+2,%rcx
2577bc3d5698SJohn Baldwin	subq	%r8,%rax
2578bc3d5698SJohn Baldwin	movq	56+8(%rsp),%rdx
2579bc3d5698SJohn Baldwin	decq	%r12
2580bc3d5698SJohn Baldwin	movq	8(%rbp),%r13
2581bc3d5698SJohn Baldwin	xorq	%r8,%r8
2582bc3d5698SJohn Baldwin	movq	16(%rbp),%r14
2583bc3d5698SJohn Baldwin	movq	24(%rbp),%r15
2584bc3d5698SJohn Baldwin	jmp	.Lsqrx4x_sub_entry
2585bc3d5698SJohn Baldwin.cfi_endproc
2586bc3d5698SJohn Baldwin.size	mulx4x_internal,.-mulx4x_internal
/*
 * bn_powerx5
 *
 * What this block visibly does: saves the callee-saved registers, carves a
 * 64-byte aligned scratch frame out of the stack (touching every 4096-byte
 * page on the way down), runs five __bn_sqrx8x_internal /
 * __bn_postx4x_internal call pairs followed by one mulx4x_internal call,
 * then restores the saved registers and returns 1 in %rax.
 *
 * Registers consumed on entry: %rdi, %rsi, %rdx, %rcx, %r8 (dereferenced)
 * and %r9d (scaled by 8).
 * NOTE(review): presumably these are (result, input, table, modulus, &n0,
 * word count) as in OpenSSL's bn_power5 prototype -- confirm against
 * x86_64-mont5.pl; the prototype is not visible in this chunk.
 */
2587bc3d5698SJohn Baldwin.type	bn_powerx5,@function
2588bc3d5698SJohn Baldwin.align	32
2589bc3d5698SJohn Baldwinbn_powerx5:
2590bc3d5698SJohn Baldwin.cfi_startproc
	/* Keep the entry %rsp in %rax: it is stored at 40(%rsp) further down
	 * and is what the epilogue uses to find the saved registers. */
2591bc3d5698SJohn Baldwin	movq	%rsp,%rax
2592bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rax
	/* NOTE(review): this label sits after the %rax setup, presumably so a
	 * sibling entry point can jump here with %rax already loaded -- confirm. */
2593bc3d5698SJohn Baldwin.Lpowerx5_enter:
	/* Save the callee-saved registers; the CFI offsets below are relative
	 * to the CFA that was just tied to %rax. */
2594bc3d5698SJohn Baldwin	pushq	%rbx
2595bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
2596bc3d5698SJohn Baldwin	pushq	%rbp
2597bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
2598bc3d5698SJohn Baldwin	pushq	%r12
2599bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
2600bc3d5698SJohn Baldwin	pushq	%r13
2601bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
2602bc3d5698SJohn Baldwin	pushq	%r14
2603bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
2604bc3d5698SJohn Baldwin	pushq	%r15
2605bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
2606bc3d5698SJohn Baldwin.Lpowerx5_prologue:
2607bc3d5698SJohn Baldwin
	/* %r9 = %r9d * 8 (the 32-bit shift also clears the upper half of %r9),
	 * %r10 = 3 * %r9, then %r9 is negated.  %r8 is replaced by the 64-bit
	 * word it points to. */
2608bc3d5698SJohn Baldwin	shll	$3,%r9d
2609bc3d5698SJohn Baldwin	leaq	(%r9,%r9,2),%r10
2610bc3d5698SJohn Baldwin	negq	%r9
2611bc3d5698SJohn Baldwin	movq	(%r8),%r8
2612bc3d5698SJohn Baldwin
2613bc3d5698SJohn Baldwin
2614bc3d5698SJohn Baldwin
2615bc3d5698SJohn Baldwin
2616bc3d5698SJohn Baldwin
2617bc3d5698SJohn Baldwin
2618bc3d5698SJohn Baldwin
2619bc3d5698SJohn Baldwin
	/*
	 * Pick the frame base in %rbp.  With %r9 negative here,
	 * -320(%rsp,%r9,2) is %rsp - 320 - 2*|%r9|.  %r11 becomes the distance
	 * from %rdi to that candidate address, reduced modulo 4096.
	 * When %r10 (3*|%r9|) is not below %r11 the frame is moved down by a
	 * further %r11 bytes; otherwise .Lpwrx_sp_alt applies its own adjustment.
	 * NOTE(review): presumably this keeps the frame from aliasing the
	 * buffer at %rdi modulo the page size -- confirm against the generator.
	 */
2620bc3d5698SJohn Baldwin	leaq	-320(%rsp,%r9,2),%r11
2621bc3d5698SJohn Baldwin	movq	%rsp,%rbp
2622bc3d5698SJohn Baldwin	subq	%rdi,%r11
2623bc3d5698SJohn Baldwin	andq	$4095,%r11
2624bc3d5698SJohn Baldwin	cmpq	%r11,%r10
2625bc3d5698SJohn Baldwin	jb	.Lpwrx_sp_alt
2626bc3d5698SJohn Baldwin	subq	%r11,%rbp
2627bc3d5698SJohn Baldwin	leaq	-320(%rbp,%r9,2),%rbp
2628bc3d5698SJohn Baldwin	jmp	.Lpwrx_sp_done
2629bc3d5698SJohn Baldwin
2630bc3d5698SJohn Baldwin.align	32
2631bc3d5698SJohn Baldwin.Lpwrx_sp_alt:
	/* %r10 = 4096 - 320 - 2*|%r9|; %rbp = %rsp - 320 - 2*|%r9|.
	 * %r11 -= %r10, clamped to 0 when the subtraction borrows.  The zero is
	 * loaded with movq (which leaves flags alone) so that CF from the subq
	 * is still valid for cmovc. */
2632bc3d5698SJohn Baldwin	leaq	4096-320(,%r9,2),%r10
2633bc3d5698SJohn Baldwin	leaq	-320(%rbp,%r9,2),%rbp
2634bc3d5698SJohn Baldwin	subq	%r10,%r11
2635bc3d5698SJohn Baldwin	movq	$0,%r10
2636bc3d5698SJohn Baldwin	cmovcq	%r10,%r11
2637bc3d5698SJohn Baldwin	subq	%r11,%rbp
2638bc3d5698SJohn Baldwin.Lpwrx_sp_done:
	/* Align the frame base down to 64 bytes.  Then set %rsp to %rbp plus
	 * the whole-page part of the distance (old %rsp - %rbp) and load from
	 * it, so the first touched address is within one page of the old %rsp. */
2639bc3d5698SJohn Baldwin	andq	$-64,%rbp
2640bc3d5698SJohn Baldwin	movq	%rsp,%r11
2641bc3d5698SJohn Baldwin	subq	%rbp,%r11
2642bc3d5698SJohn Baldwin	andq	$-4096,%r11
2643bc3d5698SJohn Baldwin	leaq	(%r11,%rbp,1),%rsp
2644bc3d5698SJohn Baldwin	movq	(%rsp),%r10
2645bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
2646bc3d5698SJohn Baldwin	ja	.Lpwrx_page_walk
2647bc3d5698SJohn Baldwin	jmp	.Lpwrx_page_walk_done
2648bc3d5698SJohn Baldwin
2649bc3d5698SJohn Baldwin.Lpwrx_page_walk:
	/* Step %rsp down one 4096-byte page at a time, reading a word from each
	 * page, until %rsp is no longer above %rbp.  The loaded value in %r10 is
	 * discarded (overwritten right after the loop). */
2650bc3d5698SJohn Baldwin	leaq	-4096(%rsp),%rsp
2651bc3d5698SJohn Baldwin	movq	(%rsp),%r10
2652bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
2653bc3d5698SJohn Baldwin	ja	.Lpwrx_page_walk
2654bc3d5698SJohn Baldwin.Lpwrx_page_walk_done:
2655bc3d5698SJohn Baldwin
	/* %r10 keeps the negated byte count, %r9 becomes positive again. */
2656bc3d5698SJohn Baldwin	movq	%r9,%r10
2657bc3d5698SJohn Baldwin	negq	%r9
2658bc3d5698SJohn Baldwin
2659bc3d5698SJohn Baldwin
2660bc3d5698SJohn Baldwin
2661bc3d5698SJohn Baldwin
2662bc3d5698SJohn Baldwin
2663bc3d5698SJohn Baldwin
2664bc3d5698SJohn Baldwin
2665bc3d5698SJohn Baldwin
2666bc3d5698SJohn Baldwin
2667bc3d5698SJohn Baldwin
2668bc3d5698SJohn Baldwin
2669bc3d5698SJohn Baldwin
	/* %xmm0 = 0.  The four raw encodings that follow are 64-bit
	 * GPR-to-XMM moves (66 REX.W 0F 6E /r), in order:
	 *   movq %rdi,%xmm1 ; movq %rcx,%xmm2 ; movq %r10,%xmm3 ; movq %rdx,%xmm4
	 * i.e. these values are parked in XMM registers across the calls below. */
2670bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm0
2671bc3d5698SJohn Baldwin.byte	102,72,15,110,207
2672bc3d5698SJohn Baldwin.byte	102,72,15,110,209
2673bc3d5698SJohn Baldwin.byte	102,73,15,110,218
2674bc3d5698SJohn Baldwin.byte	102,72,15,110,226
	/* Frame slots: 32(%rsp) = word loaded from (%r8) in the prologue,
	 * 40(%rsp) = entry %rsp.  The escape bytes are DW_CFA_def_cfa_expression
	 * with the 5-byte expression "DW_OP_breg7 +40; DW_OP_deref;
	 * DW_OP_plus_uconst 8", i.e. CFA = *(%rsp+40) + 8. */
2675bc3d5698SJohn Baldwin	movq	%r8,32(%rsp)
2676bc3d5698SJohn Baldwin	movq	%rax,40(%rsp)
2677bc3d5698SJohn Baldwin.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
2678bc3d5698SJohn Baldwin.Lpowerx5_body:
2679bc3d5698SJohn Baldwin
	/* Five back-to-back sqrx8x/postx4x call pairs.
	 * NOTE(review): presumably five modular squarings, each followed by a
	 * post-processing step; __bn_postx4x_internal is not visible in this
	 * chunk -- confirm. */
2680bc3d5698SJohn Baldwin	call	__bn_sqrx8x_internal
2681bc3d5698SJohn Baldwin	call	__bn_postx4x_internal
2682bc3d5698SJohn Baldwin	call	__bn_sqrx8x_internal
2683bc3d5698SJohn Baldwin	call	__bn_postx4x_internal
2684bc3d5698SJohn Baldwin	call	__bn_sqrx8x_internal
2685bc3d5698SJohn Baldwin	call	__bn_postx4x_internal
2686bc3d5698SJohn Baldwin	call	__bn_sqrx8x_internal
2687bc3d5698SJohn Baldwin	call	__bn_postx4x_internal
2688bc3d5698SJohn Baldwin	call	__bn_sqrx8x_internal
2689bc3d5698SJohn Baldwin	call	__bn_postx4x_internal
2690bc3d5698SJohn Baldwin
	/* Set up registers for mulx4x_internal: %r9 = %r10, %rdi = %rsi, and the
	 * two raw encodings (66 REX.W 0F 7E /r) are movq %xmm2,%rcx and
	 * movq %xmm4,%rdx, bringing back the values parked before the calls.
	 * %rax is reloaded with the entry %rsp saved at 40(%rsp). */
2691bc3d5698SJohn Baldwin	movq	%r10,%r9
2692bc3d5698SJohn Baldwin	movq	%rsi,%rdi
2693bc3d5698SJohn Baldwin.byte	102,72,15,126,209
2694bc3d5698SJohn Baldwin.byte	102,72,15,126,226
2695bc3d5698SJohn Baldwin	movq	40(%rsp),%rax
2696bc3d5698SJohn Baldwin
2697bc3d5698SJohn Baldwin	call	mulx4x_internal
2698bc3d5698SJohn Baldwin
	/* Epilogue: %rsi = entry %rsp, return value 1 in %rax, then reload the
	 * six registers pushed in the prologue from just below the entry %rsp
	 * and switch %rsp back to it. */
2699bc3d5698SJohn Baldwin	movq	40(%rsp),%rsi
2700bc3d5698SJohn Baldwin.cfi_def_cfa	%rsi,8
2701bc3d5698SJohn Baldwin	movq	$1,%rax
2702bc3d5698SJohn Baldwin
2703bc3d5698SJohn Baldwin	movq	-48(%rsi),%r15
2704bc3d5698SJohn Baldwin.cfi_restore	%r15
2705bc3d5698SJohn Baldwin	movq	-40(%rsi),%r14
2706bc3d5698SJohn Baldwin.cfi_restore	%r14
2707bc3d5698SJohn Baldwin	movq	-32(%rsi),%r13
2708bc3d5698SJohn Baldwin.cfi_restore	%r13
2709bc3d5698SJohn Baldwin	movq	-24(%rsi),%r12
2710bc3d5698SJohn Baldwin.cfi_restore	%r12
2711bc3d5698SJohn Baldwin	movq	-16(%rsi),%rbp
2712bc3d5698SJohn Baldwin.cfi_restore	%rbp
2713bc3d5698SJohn Baldwin	movq	-8(%rsi),%rbx
2714bc3d5698SJohn Baldwin.cfi_restore	%rbx
2715bc3d5698SJohn Baldwin	leaq	(%rsi),%rsp
2716bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
2717bc3d5698SJohn Baldwin.Lpowerx5_epilogue:
	/* 0xf3,0xc3 encodes "rep ret". */
2718bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
2719bc3d5698SJohn Baldwin.cfi_endproc
2720bc3d5698SJohn Baldwin.size	bn_powerx5,.-bn_powerx5
2721bc3d5698SJohn Baldwin
2722bc3d5698SJohn Baldwin.globl	bn_sqrx8x_internal
2723bc3d5698SJohn Baldwin.hidden	bn_sqrx8x_internal
2724bc3d5698SJohn Baldwin.type	bn_sqrx8x_internal,@function
2725bc3d5698SJohn Baldwin.align	32
2726bc3d5698SJohn Baldwinbn_sqrx8x_internal:
2727bc3d5698SJohn Baldwin__bn_sqrx8x_internal:
2728bc3d5698SJohn Baldwin.cfi_startproc
2729bc3d5698SJohn Baldwin
2730bc3d5698SJohn Baldwin
2731bc3d5698SJohn Baldwin
2732bc3d5698SJohn Baldwin
2733bc3d5698SJohn Baldwin
2734bc3d5698SJohn Baldwin
2735bc3d5698SJohn Baldwin
2736bc3d5698SJohn Baldwin
2737bc3d5698SJohn Baldwin
2738bc3d5698SJohn Baldwin
2739bc3d5698SJohn Baldwin
2740bc3d5698SJohn Baldwin
2741bc3d5698SJohn Baldwin
2742bc3d5698SJohn Baldwin
2743bc3d5698SJohn Baldwin
2744bc3d5698SJohn Baldwin
2745bc3d5698SJohn Baldwin
2746bc3d5698SJohn Baldwin
2747bc3d5698SJohn Baldwin
2748bc3d5698SJohn Baldwin
2749bc3d5698SJohn Baldwin
2750bc3d5698SJohn Baldwin
2751bc3d5698SJohn Baldwin
2752bc3d5698SJohn Baldwin
2753bc3d5698SJohn Baldwin
2754bc3d5698SJohn Baldwin
2755bc3d5698SJohn Baldwin
2756bc3d5698SJohn Baldwin
2757bc3d5698SJohn Baldwin
2758bc3d5698SJohn Baldwin
2759bc3d5698SJohn Baldwin
2760bc3d5698SJohn Baldwin
2761bc3d5698SJohn Baldwin
2762bc3d5698SJohn Baldwin
2763bc3d5698SJohn Baldwin
2764bc3d5698SJohn Baldwin
2765bc3d5698SJohn Baldwin
2766bc3d5698SJohn Baldwin
2767bc3d5698SJohn Baldwin
2768bc3d5698SJohn Baldwin
2769bc3d5698SJohn Baldwin	leaq	48+8(%rsp),%rdi
2770bc3d5698SJohn Baldwin	leaq	(%rsi,%r9,1),%rbp
2771bc3d5698SJohn Baldwin	movq	%r9,0+8(%rsp)
2772bc3d5698SJohn Baldwin	movq	%rbp,8+8(%rsp)
2773bc3d5698SJohn Baldwin	jmp	.Lsqr8x_zero_start
2774bc3d5698SJohn Baldwin
2775bc3d5698SJohn Baldwin.align	32
2776bc3d5698SJohn Baldwin.byte	0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00
2777bc3d5698SJohn Baldwin.Lsqrx8x_zero:
2778bc3d5698SJohn Baldwin.byte	0x3e
2779bc3d5698SJohn Baldwin	movdqa	%xmm0,0(%rdi)
2780bc3d5698SJohn Baldwin	movdqa	%xmm0,16(%rdi)
2781bc3d5698SJohn Baldwin	movdqa	%xmm0,32(%rdi)
2782bc3d5698SJohn Baldwin	movdqa	%xmm0,48(%rdi)
2783bc3d5698SJohn Baldwin.Lsqr8x_zero_start:
2784bc3d5698SJohn Baldwin	movdqa	%xmm0,64(%rdi)
2785bc3d5698SJohn Baldwin	movdqa	%xmm0,80(%rdi)
2786bc3d5698SJohn Baldwin	movdqa	%xmm0,96(%rdi)
2787bc3d5698SJohn Baldwin	movdqa	%xmm0,112(%rdi)
2788bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
2789bc3d5698SJohn Baldwin	subq	$64,%r9
2790bc3d5698SJohn Baldwin	jnz	.Lsqrx8x_zero
2791bc3d5698SJohn Baldwin
2792bc3d5698SJohn Baldwin	movq	0(%rsi),%rdx
2793bc3d5698SJohn Baldwin
2794bc3d5698SJohn Baldwin	xorq	%r10,%r10
2795bc3d5698SJohn Baldwin	xorq	%r11,%r11
2796bc3d5698SJohn Baldwin	xorq	%r12,%r12
2797bc3d5698SJohn Baldwin	xorq	%r13,%r13
2798bc3d5698SJohn Baldwin	xorq	%r14,%r14
2799bc3d5698SJohn Baldwin	xorq	%r15,%r15
2800bc3d5698SJohn Baldwin	leaq	48+8(%rsp),%rdi
2801bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
2802bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_outer_loop
2803bc3d5698SJohn Baldwin
2804bc3d5698SJohn Baldwin.align	32
2805bc3d5698SJohn Baldwin.Lsqrx8x_outer_loop:
2806bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r8,%rax
2807bc3d5698SJohn Baldwin	adcxq	%r9,%r8
2808bc3d5698SJohn Baldwin	adoxq	%rax,%r10
2809bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r9,%rax
2810bc3d5698SJohn Baldwin	adcxq	%r10,%r9
2811bc3d5698SJohn Baldwin	adoxq	%rax,%r11
2812bc3d5698SJohn Baldwin.byte	0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00
2813bc3d5698SJohn Baldwin	adcxq	%r11,%r10
2814bc3d5698SJohn Baldwin	adoxq	%rax,%r12
2815bc3d5698SJohn Baldwin.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00
2816bc3d5698SJohn Baldwin	adcxq	%r12,%r11
2817bc3d5698SJohn Baldwin	adoxq	%rax,%r13
2818bc3d5698SJohn Baldwin	mulxq	40(%rsi),%r12,%rax
2819bc3d5698SJohn Baldwin	adcxq	%r13,%r12
2820bc3d5698SJohn Baldwin	adoxq	%rax,%r14
2821bc3d5698SJohn Baldwin	mulxq	48(%rsi),%r13,%rax
2822bc3d5698SJohn Baldwin	adcxq	%r14,%r13
2823bc3d5698SJohn Baldwin	adoxq	%r15,%rax
2824bc3d5698SJohn Baldwin	mulxq	56(%rsi),%r14,%r15
2825bc3d5698SJohn Baldwin	movq	8(%rsi),%rdx
2826bc3d5698SJohn Baldwin	adcxq	%rax,%r14
2827bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
2828bc3d5698SJohn Baldwin	adcq	64(%rdi),%r15
2829bc3d5698SJohn Baldwin	movq	%r8,8(%rdi)
2830bc3d5698SJohn Baldwin	movq	%r9,16(%rdi)
2831bc3d5698SJohn Baldwin	sbbq	%rcx,%rcx
2832bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
2833bc3d5698SJohn Baldwin
2834bc3d5698SJohn Baldwin
2835bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r8,%rbx
2836bc3d5698SJohn Baldwin	mulxq	24(%rsi),%r9,%rax
2837bc3d5698SJohn Baldwin	adcxq	%r10,%r8
2838bc3d5698SJohn Baldwin	adoxq	%rbx,%r9
2839bc3d5698SJohn Baldwin	mulxq	32(%rsi),%r10,%rbx
2840bc3d5698SJohn Baldwin	adcxq	%r11,%r9
2841bc3d5698SJohn Baldwin	adoxq	%rax,%r10
2842bc3d5698SJohn Baldwin.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00
2843bc3d5698SJohn Baldwin	adcxq	%r12,%r10
2844bc3d5698SJohn Baldwin	adoxq	%rbx,%r11
2845bc3d5698SJohn Baldwin.byte	0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00
2846bc3d5698SJohn Baldwin	adcxq	%r13,%r11
2847bc3d5698SJohn Baldwin	adoxq	%r14,%r12
2848bc3d5698SJohn Baldwin.byte	0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00
2849bc3d5698SJohn Baldwin	movq	16(%rsi),%rdx
2850bc3d5698SJohn Baldwin	adcxq	%rax,%r12
2851bc3d5698SJohn Baldwin	adoxq	%rbx,%r13
2852bc3d5698SJohn Baldwin	adcxq	%r15,%r13
2853bc3d5698SJohn Baldwin	adoxq	%rbp,%r14
2854bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
2855bc3d5698SJohn Baldwin
2856bc3d5698SJohn Baldwin	movq	%r8,24(%rdi)
2857bc3d5698SJohn Baldwin	movq	%r9,32(%rdi)
2858bc3d5698SJohn Baldwin
2859bc3d5698SJohn Baldwin	mulxq	24(%rsi),%r8,%rbx
2860bc3d5698SJohn Baldwin	mulxq	32(%rsi),%r9,%rax
2861bc3d5698SJohn Baldwin	adcxq	%r10,%r8
2862bc3d5698SJohn Baldwin	adoxq	%rbx,%r9
2863bc3d5698SJohn Baldwin	mulxq	40(%rsi),%r10,%rbx
2864bc3d5698SJohn Baldwin	adcxq	%r11,%r9
2865bc3d5698SJohn Baldwin	adoxq	%rax,%r10
2866bc3d5698SJohn Baldwin.byte	0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00
2867bc3d5698SJohn Baldwin	adcxq	%r12,%r10
2868bc3d5698SJohn Baldwin	adoxq	%r13,%r11
2869bc3d5698SJohn Baldwin.byte	0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00
2870bc3d5698SJohn Baldwin.byte	0x3e
2871bc3d5698SJohn Baldwin	movq	24(%rsi),%rdx
2872bc3d5698SJohn Baldwin	adcxq	%rbx,%r11
2873bc3d5698SJohn Baldwin	adoxq	%rax,%r12
2874bc3d5698SJohn Baldwin	adcxq	%r14,%r12
2875bc3d5698SJohn Baldwin	movq	%r8,40(%rdi)
2876bc3d5698SJohn Baldwin	movq	%r9,48(%rdi)
2877bc3d5698SJohn Baldwin	mulxq	32(%rsi),%r8,%rax
2878bc3d5698SJohn Baldwin	adoxq	%rbp,%r13
2879bc3d5698SJohn Baldwin	adcxq	%rbp,%r13
2880bc3d5698SJohn Baldwin
2881bc3d5698SJohn Baldwin	mulxq	40(%rsi),%r9,%rbx
2882bc3d5698SJohn Baldwin	adcxq	%r10,%r8
2883bc3d5698SJohn Baldwin	adoxq	%rax,%r9
2884bc3d5698SJohn Baldwin	mulxq	48(%rsi),%r10,%rax
2885bc3d5698SJohn Baldwin	adcxq	%r11,%r9
2886bc3d5698SJohn Baldwin	adoxq	%r12,%r10
2887bc3d5698SJohn Baldwin	mulxq	56(%rsi),%r11,%r12
2888bc3d5698SJohn Baldwin	movq	32(%rsi),%rdx
2889bc3d5698SJohn Baldwin	movq	40(%rsi),%r14
2890bc3d5698SJohn Baldwin	adcxq	%rbx,%r10
2891bc3d5698SJohn Baldwin	adoxq	%rax,%r11
2892bc3d5698SJohn Baldwin	movq	48(%rsi),%r15
2893bc3d5698SJohn Baldwin	adcxq	%r13,%r11
2894bc3d5698SJohn Baldwin	adoxq	%rbp,%r12
2895bc3d5698SJohn Baldwin	adcxq	%rbp,%r12
2896bc3d5698SJohn Baldwin
2897bc3d5698SJohn Baldwin	movq	%r8,56(%rdi)
2898bc3d5698SJohn Baldwin	movq	%r9,64(%rdi)
2899bc3d5698SJohn Baldwin
2900bc3d5698SJohn Baldwin	mulxq	%r14,%r9,%rax
2901bc3d5698SJohn Baldwin	movq	56(%rsi),%r8
2902bc3d5698SJohn Baldwin	adcxq	%r10,%r9
2903bc3d5698SJohn Baldwin	mulxq	%r15,%r10,%rbx
2904bc3d5698SJohn Baldwin	adoxq	%rax,%r10
2905bc3d5698SJohn Baldwin	adcxq	%r11,%r10
2906bc3d5698SJohn Baldwin	mulxq	%r8,%r11,%rax
2907bc3d5698SJohn Baldwin	movq	%r14,%rdx
2908bc3d5698SJohn Baldwin	adoxq	%rbx,%r11
2909bc3d5698SJohn Baldwin	adcxq	%r12,%r11
2910bc3d5698SJohn Baldwin
2911bc3d5698SJohn Baldwin	adcxq	%rbp,%rax
2912bc3d5698SJohn Baldwin
2913bc3d5698SJohn Baldwin	mulxq	%r15,%r14,%rbx
2914bc3d5698SJohn Baldwin	mulxq	%r8,%r12,%r13
2915bc3d5698SJohn Baldwin	movq	%r15,%rdx
2916bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
2917bc3d5698SJohn Baldwin	adcxq	%r14,%r11
2918bc3d5698SJohn Baldwin	adoxq	%rbx,%r12
2919bc3d5698SJohn Baldwin	adcxq	%rax,%r12
2920bc3d5698SJohn Baldwin	adoxq	%rbp,%r13
2921bc3d5698SJohn Baldwin
2922bc3d5698SJohn Baldwin.byte	0x67,0x67
2923bc3d5698SJohn Baldwin	mulxq	%r8,%r8,%r14
2924bc3d5698SJohn Baldwin	adcxq	%r8,%r13
2925bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
2926bc3d5698SJohn Baldwin
2927bc3d5698SJohn Baldwin	cmpq	8+8(%rsp),%rsi
2928bc3d5698SJohn Baldwin	je	.Lsqrx8x_outer_break
2929bc3d5698SJohn Baldwin
2930bc3d5698SJohn Baldwin	negq	%rcx
2931bc3d5698SJohn Baldwin	movq	$-8,%rcx
2932bc3d5698SJohn Baldwin	movq	%rbp,%r15
2933bc3d5698SJohn Baldwin	movq	64(%rdi),%r8
2934bc3d5698SJohn Baldwin	adcxq	72(%rdi),%r9
2935bc3d5698SJohn Baldwin	adcxq	80(%rdi),%r10
2936bc3d5698SJohn Baldwin	adcxq	88(%rdi),%r11
2937bc3d5698SJohn Baldwin	adcq	96(%rdi),%r12
2938bc3d5698SJohn Baldwin	adcq	104(%rdi),%r13
2939bc3d5698SJohn Baldwin	adcq	112(%rdi),%r14
2940bc3d5698SJohn Baldwin	adcq	120(%rdi),%r15
2941bc3d5698SJohn Baldwin	leaq	(%rsi),%rbp
2942bc3d5698SJohn Baldwin	leaq	128(%rdi),%rdi
2943bc3d5698SJohn Baldwin	sbbq	%rax,%rax
2944bc3d5698SJohn Baldwin
2945bc3d5698SJohn Baldwin	movq	-64(%rsi),%rdx
2946bc3d5698SJohn Baldwin	movq	%rax,16+8(%rsp)
2947bc3d5698SJohn Baldwin	movq	%rdi,24+8(%rsp)
2948bc3d5698SJohn Baldwin
2949bc3d5698SJohn Baldwin
2950bc3d5698SJohn Baldwin	xorl	%eax,%eax
2951bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_loop
2952bc3d5698SJohn Baldwin
2953bc3d5698SJohn Baldwin.align	32
2954bc3d5698SJohn Baldwin.Lsqrx8x_loop:
2955bc3d5698SJohn Baldwin	movq	%r8,%rbx
2956bc3d5698SJohn Baldwin	mulxq	0(%rbp),%rax,%r8
2957bc3d5698SJohn Baldwin	adcxq	%rax,%rbx
2958bc3d5698SJohn Baldwin	adoxq	%r9,%r8
2959bc3d5698SJohn Baldwin
2960bc3d5698SJohn Baldwin	mulxq	8(%rbp),%rax,%r9
2961bc3d5698SJohn Baldwin	adcxq	%rax,%r8
2962bc3d5698SJohn Baldwin	adoxq	%r10,%r9
2963bc3d5698SJohn Baldwin
2964bc3d5698SJohn Baldwin	mulxq	16(%rbp),%rax,%r10
2965bc3d5698SJohn Baldwin	adcxq	%rax,%r9
2966bc3d5698SJohn Baldwin	adoxq	%r11,%r10
2967bc3d5698SJohn Baldwin
2968bc3d5698SJohn Baldwin	mulxq	24(%rbp),%rax,%r11
2969bc3d5698SJohn Baldwin	adcxq	%rax,%r10
2970bc3d5698SJohn Baldwin	adoxq	%r12,%r11
2971bc3d5698SJohn Baldwin
2972bc3d5698SJohn Baldwin.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
2973bc3d5698SJohn Baldwin	adcxq	%rax,%r11
2974bc3d5698SJohn Baldwin	adoxq	%r13,%r12
2975bc3d5698SJohn Baldwin
2976bc3d5698SJohn Baldwin	mulxq	40(%rbp),%rax,%r13
2977bc3d5698SJohn Baldwin	adcxq	%rax,%r12
2978bc3d5698SJohn Baldwin	adoxq	%r14,%r13
2979bc3d5698SJohn Baldwin
2980bc3d5698SJohn Baldwin	mulxq	48(%rbp),%rax,%r14
2981bc3d5698SJohn Baldwin	movq	%rbx,(%rdi,%rcx,8)
2982bc3d5698SJohn Baldwin	movl	$0,%ebx
2983bc3d5698SJohn Baldwin	adcxq	%rax,%r13
2984bc3d5698SJohn Baldwin	adoxq	%r15,%r14
2985bc3d5698SJohn Baldwin
2986bc3d5698SJohn Baldwin.byte	0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00
2987bc3d5698SJohn Baldwin	movq	8(%rsi,%rcx,8),%rdx
2988bc3d5698SJohn Baldwin	adcxq	%rax,%r14
2989bc3d5698SJohn Baldwin	adoxq	%rbx,%r15
2990bc3d5698SJohn Baldwin	adcxq	%rbx,%r15
2991bc3d5698SJohn Baldwin
2992bc3d5698SJohn Baldwin.byte	0x67
2993bc3d5698SJohn Baldwin	incq	%rcx
2994bc3d5698SJohn Baldwin	jnz	.Lsqrx8x_loop
2995bc3d5698SJohn Baldwin
2996bc3d5698SJohn Baldwin	leaq	64(%rbp),%rbp
2997bc3d5698SJohn Baldwin	movq	$-8,%rcx
2998bc3d5698SJohn Baldwin	cmpq	8+8(%rsp),%rbp
2999bc3d5698SJohn Baldwin	je	.Lsqrx8x_break
3000bc3d5698SJohn Baldwin
3001bc3d5698SJohn Baldwin	subq	16+8(%rsp),%rbx
3002bc3d5698SJohn Baldwin.byte	0x66
3003bc3d5698SJohn Baldwin	movq	-64(%rsi),%rdx
3004bc3d5698SJohn Baldwin	adcxq	0(%rdi),%r8
3005bc3d5698SJohn Baldwin	adcxq	8(%rdi),%r9
3006bc3d5698SJohn Baldwin	adcq	16(%rdi),%r10
3007bc3d5698SJohn Baldwin	adcq	24(%rdi),%r11
3008bc3d5698SJohn Baldwin	adcq	32(%rdi),%r12
3009bc3d5698SJohn Baldwin	adcq	40(%rdi),%r13
3010bc3d5698SJohn Baldwin	adcq	48(%rdi),%r14
3011bc3d5698SJohn Baldwin	adcq	56(%rdi),%r15
3012bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
3013bc3d5698SJohn Baldwin.byte	0x67
3014bc3d5698SJohn Baldwin	sbbq	%rax,%rax
3015bc3d5698SJohn Baldwin	xorl	%ebx,%ebx
3016bc3d5698SJohn Baldwin	movq	%rax,16+8(%rsp)
3017bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_loop
3018bc3d5698SJohn Baldwin
3019bc3d5698SJohn Baldwin.align	32
3020bc3d5698SJohn Baldwin.Lsqrx8x_break:
3021bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
3022bc3d5698SJohn Baldwin	subq	16+8(%rsp),%rbx
3023bc3d5698SJohn Baldwin	adcxq	%rbp,%r8
3024bc3d5698SJohn Baldwin	movq	24+8(%rsp),%rcx
3025bc3d5698SJohn Baldwin	adcxq	%rbp,%r9
3026bc3d5698SJohn Baldwin	movq	0(%rsi),%rdx
3027bc3d5698SJohn Baldwin	adcq	$0,%r10
3028bc3d5698SJohn Baldwin	movq	%r8,0(%rdi)
3029bc3d5698SJohn Baldwin	adcq	$0,%r11
3030bc3d5698SJohn Baldwin	adcq	$0,%r12
3031bc3d5698SJohn Baldwin	adcq	$0,%r13
3032bc3d5698SJohn Baldwin	adcq	$0,%r14
3033bc3d5698SJohn Baldwin	adcq	$0,%r15
3034bc3d5698SJohn Baldwin	cmpq	%rcx,%rdi
3035bc3d5698SJohn Baldwin	je	.Lsqrx8x_outer_loop
3036bc3d5698SJohn Baldwin
3037bc3d5698SJohn Baldwin	movq	%r9,8(%rdi)
3038bc3d5698SJohn Baldwin	movq	8(%rcx),%r9
3039bc3d5698SJohn Baldwin	movq	%r10,16(%rdi)
3040bc3d5698SJohn Baldwin	movq	16(%rcx),%r10
3041bc3d5698SJohn Baldwin	movq	%r11,24(%rdi)
3042bc3d5698SJohn Baldwin	movq	24(%rcx),%r11
3043bc3d5698SJohn Baldwin	movq	%r12,32(%rdi)
3044bc3d5698SJohn Baldwin	movq	32(%rcx),%r12
3045bc3d5698SJohn Baldwin	movq	%r13,40(%rdi)
3046bc3d5698SJohn Baldwin	movq	40(%rcx),%r13
3047bc3d5698SJohn Baldwin	movq	%r14,48(%rdi)
3048bc3d5698SJohn Baldwin	movq	48(%rcx),%r14
3049bc3d5698SJohn Baldwin	movq	%r15,56(%rdi)
3050bc3d5698SJohn Baldwin	movq	56(%rcx),%r15
3051bc3d5698SJohn Baldwin	movq	%rcx,%rdi
3052bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_outer_loop
3053bc3d5698SJohn Baldwin
3054bc3d5698SJohn Baldwin.align	32
3055bc3d5698SJohn Baldwin.Lsqrx8x_outer_break:
3056bc3d5698SJohn Baldwin	movq	%r9,72(%rdi)
3057bc3d5698SJohn Baldwin.byte	102,72,15,126,217
3058bc3d5698SJohn Baldwin	movq	%r10,80(%rdi)
3059bc3d5698SJohn Baldwin	movq	%r11,88(%rdi)
3060bc3d5698SJohn Baldwin	movq	%r12,96(%rdi)
3061bc3d5698SJohn Baldwin	movq	%r13,104(%rdi)
3062bc3d5698SJohn Baldwin	movq	%r14,112(%rdi)
3063bc3d5698SJohn Baldwin	leaq	48+8(%rsp),%rdi
3064bc3d5698SJohn Baldwin	movq	(%rsi,%rcx,1),%rdx
3065bc3d5698SJohn Baldwin
3066bc3d5698SJohn Baldwin	movq	8(%rdi),%r11
3067bc3d5698SJohn Baldwin	xorq	%r10,%r10
3068bc3d5698SJohn Baldwin	movq	0+8(%rsp),%r9
3069bc3d5698SJohn Baldwin	adoxq	%r11,%r11
3070bc3d5698SJohn Baldwin	movq	16(%rdi),%r12
3071bc3d5698SJohn Baldwin	movq	24(%rdi),%r13
3072bc3d5698SJohn Baldwin
3073bc3d5698SJohn Baldwin
3074bc3d5698SJohn Baldwin.align	32
3075bc3d5698SJohn Baldwin.Lsqrx4x_shift_n_add:
3076bc3d5698SJohn Baldwin	mulxq	%rdx,%rax,%rbx
3077bc3d5698SJohn Baldwin	adoxq	%r12,%r12
3078bc3d5698SJohn Baldwin	adcxq	%r10,%rax
3079bc3d5698SJohn Baldwin.byte	0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00
3080bc3d5698SJohn Baldwin.byte	0x4c,0x8b,0x97,0x20,0x00,0x00,0x00
3081bc3d5698SJohn Baldwin	adoxq	%r13,%r13
3082bc3d5698SJohn Baldwin	adcxq	%r11,%rbx
3083bc3d5698SJohn Baldwin	movq	40(%rdi),%r11
3084bc3d5698SJohn Baldwin	movq	%rax,0(%rdi)
3085bc3d5698SJohn Baldwin	movq	%rbx,8(%rdi)
3086bc3d5698SJohn Baldwin
3087bc3d5698SJohn Baldwin	mulxq	%rdx,%rax,%rbx
3088bc3d5698SJohn Baldwin	adoxq	%r10,%r10
3089bc3d5698SJohn Baldwin	adcxq	%r12,%rax
3090bc3d5698SJohn Baldwin	movq	16(%rsi,%rcx,1),%rdx
3091bc3d5698SJohn Baldwin	movq	48(%rdi),%r12
3092bc3d5698SJohn Baldwin	adoxq	%r11,%r11
3093bc3d5698SJohn Baldwin	adcxq	%r13,%rbx
3094bc3d5698SJohn Baldwin	movq	56(%rdi),%r13
3095bc3d5698SJohn Baldwin	movq	%rax,16(%rdi)
3096bc3d5698SJohn Baldwin	movq	%rbx,24(%rdi)
3097bc3d5698SJohn Baldwin
3098bc3d5698SJohn Baldwin	mulxq	%rdx,%rax,%rbx
3099bc3d5698SJohn Baldwin	adoxq	%r12,%r12
3100bc3d5698SJohn Baldwin	adcxq	%r10,%rax
3101bc3d5698SJohn Baldwin	movq	24(%rsi,%rcx,1),%rdx
3102bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
3103bc3d5698SJohn Baldwin	movq	64(%rdi),%r10
3104bc3d5698SJohn Baldwin	adoxq	%r13,%r13
3105bc3d5698SJohn Baldwin	adcxq	%r11,%rbx
3106bc3d5698SJohn Baldwin	movq	72(%rdi),%r11
3107bc3d5698SJohn Baldwin	movq	%rax,32(%rdi)
3108bc3d5698SJohn Baldwin	movq	%rbx,40(%rdi)
3109bc3d5698SJohn Baldwin
3110bc3d5698SJohn Baldwin	mulxq	%rdx,%rax,%rbx
3111bc3d5698SJohn Baldwin	adoxq	%r10,%r10
3112bc3d5698SJohn Baldwin	adcxq	%r12,%rax
3113bc3d5698SJohn Baldwin	jrcxz	.Lsqrx4x_shift_n_add_break
3114bc3d5698SJohn Baldwin.byte	0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00
3115bc3d5698SJohn Baldwin	adoxq	%r11,%r11
3116bc3d5698SJohn Baldwin	adcxq	%r13,%rbx
3117bc3d5698SJohn Baldwin	movq	80(%rdi),%r12
3118bc3d5698SJohn Baldwin	movq	88(%rdi),%r13
3119bc3d5698SJohn Baldwin	movq	%rax,48(%rdi)
3120bc3d5698SJohn Baldwin	movq	%rbx,56(%rdi)
3121bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
3122bc3d5698SJohn Baldwin	nop
3123bc3d5698SJohn Baldwin	jmp	.Lsqrx4x_shift_n_add
3124bc3d5698SJohn Baldwin
3125bc3d5698SJohn Baldwin.align	32
3126bc3d5698SJohn Baldwin.Lsqrx4x_shift_n_add_break:
3127bc3d5698SJohn Baldwin	adcxq	%r13,%rbx
3128bc3d5698SJohn Baldwin	movq	%rax,48(%rdi)
3129bc3d5698SJohn Baldwin	movq	%rbx,56(%rdi)
3130bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
3131bc3d5698SJohn Baldwin.byte	102,72,15,126,213
3132bc3d5698SJohn Baldwin__bn_sqrx8x_reduction:
3133bc3d5698SJohn Baldwin	xorl	%eax,%eax
3134bc3d5698SJohn Baldwin	movq	32+8(%rsp),%rbx
3135bc3d5698SJohn Baldwin	movq	48+8(%rsp),%rdx
3136bc3d5698SJohn Baldwin	leaq	-64(%rbp,%r9,1),%rcx
3137bc3d5698SJohn Baldwin
3138bc3d5698SJohn Baldwin	movq	%rcx,0+8(%rsp)
3139bc3d5698SJohn Baldwin	movq	%rdi,8+8(%rsp)
3140bc3d5698SJohn Baldwin
3141bc3d5698SJohn Baldwin	leaq	48+8(%rsp),%rdi
3142bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_reduction_loop
3143bc3d5698SJohn Baldwin
3144bc3d5698SJohn Baldwin.align	32
3145bc3d5698SJohn Baldwin.Lsqrx8x_reduction_loop:
3146bc3d5698SJohn Baldwin	movq	8(%rdi),%r9
3147bc3d5698SJohn Baldwin	movq	16(%rdi),%r10
3148bc3d5698SJohn Baldwin	movq	24(%rdi),%r11
3149bc3d5698SJohn Baldwin	movq	32(%rdi),%r12
3150bc3d5698SJohn Baldwin	movq	%rdx,%r8
3151bc3d5698SJohn Baldwin	imulq	%rbx,%rdx
3152bc3d5698SJohn Baldwin	movq	40(%rdi),%r13
3153bc3d5698SJohn Baldwin	movq	48(%rdi),%r14
3154bc3d5698SJohn Baldwin	movq	56(%rdi),%r15
3155bc3d5698SJohn Baldwin	movq	%rax,24+8(%rsp)
3156bc3d5698SJohn Baldwin
3157bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
3158bc3d5698SJohn Baldwin	xorq	%rsi,%rsi
3159bc3d5698SJohn Baldwin	movq	$-8,%rcx
3160bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_reduce
3161bc3d5698SJohn Baldwin
3162bc3d5698SJohn Baldwin.align	32
3163bc3d5698SJohn Baldwin.Lsqrx8x_reduce:
3164bc3d5698SJohn Baldwin	movq	%r8,%rbx
3165bc3d5698SJohn Baldwin	mulxq	0(%rbp),%rax,%r8
3166bc3d5698SJohn Baldwin	adcxq	%rbx,%rax
3167bc3d5698SJohn Baldwin	adoxq	%r9,%r8
3168bc3d5698SJohn Baldwin
3169bc3d5698SJohn Baldwin	mulxq	8(%rbp),%rbx,%r9
3170bc3d5698SJohn Baldwin	adcxq	%rbx,%r8
3171bc3d5698SJohn Baldwin	adoxq	%r10,%r9
3172bc3d5698SJohn Baldwin
3173bc3d5698SJohn Baldwin	mulxq	16(%rbp),%rbx,%r10
3174bc3d5698SJohn Baldwin	adcxq	%rbx,%r9
3175bc3d5698SJohn Baldwin	adoxq	%r11,%r10
3176bc3d5698SJohn Baldwin
3177bc3d5698SJohn Baldwin	mulxq	24(%rbp),%rbx,%r11
3178bc3d5698SJohn Baldwin	adcxq	%rbx,%r10
3179bc3d5698SJohn Baldwin	adoxq	%r12,%r11
3180bc3d5698SJohn Baldwin
3181bc3d5698SJohn Baldwin.byte	0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00
3182bc3d5698SJohn Baldwin	movq	%rdx,%rax
3183bc3d5698SJohn Baldwin	movq	%r8,%rdx
3184bc3d5698SJohn Baldwin	adcxq	%rbx,%r11
3185bc3d5698SJohn Baldwin	adoxq	%r13,%r12
3186bc3d5698SJohn Baldwin
3187bc3d5698SJohn Baldwin	mulxq	32+8(%rsp),%rbx,%rdx
3188bc3d5698SJohn Baldwin	movq	%rax,%rdx
3189bc3d5698SJohn Baldwin	movq	%rax,64+48+8(%rsp,%rcx,8)
3190bc3d5698SJohn Baldwin
3191bc3d5698SJohn Baldwin	mulxq	40(%rbp),%rax,%r13
3192bc3d5698SJohn Baldwin	adcxq	%rax,%r12
3193bc3d5698SJohn Baldwin	adoxq	%r14,%r13
3194bc3d5698SJohn Baldwin
3195bc3d5698SJohn Baldwin	mulxq	48(%rbp),%rax,%r14
3196bc3d5698SJohn Baldwin	adcxq	%rax,%r13
3197bc3d5698SJohn Baldwin	adoxq	%r15,%r14
3198bc3d5698SJohn Baldwin
3199bc3d5698SJohn Baldwin	mulxq	56(%rbp),%rax,%r15
3200bc3d5698SJohn Baldwin	movq	%rbx,%rdx
3201bc3d5698SJohn Baldwin	adcxq	%rax,%r14
3202bc3d5698SJohn Baldwin	adoxq	%rsi,%r15
3203bc3d5698SJohn Baldwin	adcxq	%rsi,%r15
3204bc3d5698SJohn Baldwin
3205bc3d5698SJohn Baldwin.byte	0x67,0x67,0x67
3206bc3d5698SJohn Baldwin	incq	%rcx
3207bc3d5698SJohn Baldwin	jnz	.Lsqrx8x_reduce
3208bc3d5698SJohn Baldwin
3209bc3d5698SJohn Baldwin	movq	%rsi,%rax
3210bc3d5698SJohn Baldwin	cmpq	0+8(%rsp),%rbp
3211bc3d5698SJohn Baldwin	jae	.Lsqrx8x_no_tail
3212bc3d5698SJohn Baldwin
3213bc3d5698SJohn Baldwin	movq	48+8(%rsp),%rdx
3214bc3d5698SJohn Baldwin	addq	0(%rdi),%r8
3215bc3d5698SJohn Baldwin	leaq	64(%rbp),%rbp
3216bc3d5698SJohn Baldwin	movq	$-8,%rcx
3217bc3d5698SJohn Baldwin	adcxq	8(%rdi),%r9
3218bc3d5698SJohn Baldwin	adcxq	16(%rdi),%r10
3219bc3d5698SJohn Baldwin	adcq	24(%rdi),%r11
3220bc3d5698SJohn Baldwin	adcq	32(%rdi),%r12
3221bc3d5698SJohn Baldwin	adcq	40(%rdi),%r13
3222bc3d5698SJohn Baldwin	adcq	48(%rdi),%r14
3223bc3d5698SJohn Baldwin	adcq	56(%rdi),%r15
3224bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
3225bc3d5698SJohn Baldwin	sbbq	%rax,%rax
3226bc3d5698SJohn Baldwin
3227bc3d5698SJohn Baldwin	xorq	%rsi,%rsi
3228bc3d5698SJohn Baldwin	movq	%rax,16+8(%rsp)
3229bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_tail
3230bc3d5698SJohn Baldwin
3231bc3d5698SJohn Baldwin.align	32
3232bc3d5698SJohn Baldwin.Lsqrx8x_tail:
3233bc3d5698SJohn Baldwin	movq	%r8,%rbx
3234bc3d5698SJohn Baldwin	mulxq	0(%rbp),%rax,%r8
3235bc3d5698SJohn Baldwin	adcxq	%rax,%rbx
3236bc3d5698SJohn Baldwin	adoxq	%r9,%r8
3237bc3d5698SJohn Baldwin
3238bc3d5698SJohn Baldwin	mulxq	8(%rbp),%rax,%r9
3239bc3d5698SJohn Baldwin	adcxq	%rax,%r8
3240bc3d5698SJohn Baldwin	adoxq	%r10,%r9
3241bc3d5698SJohn Baldwin
3242bc3d5698SJohn Baldwin	mulxq	16(%rbp),%rax,%r10
3243bc3d5698SJohn Baldwin	adcxq	%rax,%r9
3244bc3d5698SJohn Baldwin	adoxq	%r11,%r10
3245bc3d5698SJohn Baldwin
3246bc3d5698SJohn Baldwin	mulxq	24(%rbp),%rax,%r11
3247bc3d5698SJohn Baldwin	adcxq	%rax,%r10
3248bc3d5698SJohn Baldwin	adoxq	%r12,%r11
3249bc3d5698SJohn Baldwin
3250bc3d5698SJohn Baldwin.byte	0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00
3251bc3d5698SJohn Baldwin	adcxq	%rax,%r11
3252bc3d5698SJohn Baldwin	adoxq	%r13,%r12
3253bc3d5698SJohn Baldwin
3254bc3d5698SJohn Baldwin	mulxq	40(%rbp),%rax,%r13
3255bc3d5698SJohn Baldwin	adcxq	%rax,%r12
3256bc3d5698SJohn Baldwin	adoxq	%r14,%r13
3257bc3d5698SJohn Baldwin
3258bc3d5698SJohn Baldwin	mulxq	48(%rbp),%rax,%r14
3259bc3d5698SJohn Baldwin	adcxq	%rax,%r13
3260bc3d5698SJohn Baldwin	adoxq	%r15,%r14
3261bc3d5698SJohn Baldwin
3262bc3d5698SJohn Baldwin	mulxq	56(%rbp),%rax,%r15
3263bc3d5698SJohn Baldwin	movq	72+48+8(%rsp,%rcx,8),%rdx
3264bc3d5698SJohn Baldwin	adcxq	%rax,%r14
3265bc3d5698SJohn Baldwin	adoxq	%rsi,%r15
3266bc3d5698SJohn Baldwin	movq	%rbx,(%rdi,%rcx,8)
3267bc3d5698SJohn Baldwin	movq	%r8,%rbx
3268bc3d5698SJohn Baldwin	adcxq	%rsi,%r15
3269bc3d5698SJohn Baldwin
3270bc3d5698SJohn Baldwin	incq	%rcx
3271bc3d5698SJohn Baldwin	jnz	.Lsqrx8x_tail
3272bc3d5698SJohn Baldwin
3273bc3d5698SJohn Baldwin	cmpq	0+8(%rsp),%rbp
3274bc3d5698SJohn Baldwin	jae	.Lsqrx8x_tail_done
3275bc3d5698SJohn Baldwin
3276bc3d5698SJohn Baldwin	subq	16+8(%rsp),%rsi
3277bc3d5698SJohn Baldwin	movq	48+8(%rsp),%rdx
3278bc3d5698SJohn Baldwin	leaq	64(%rbp),%rbp
3279bc3d5698SJohn Baldwin	adcq	0(%rdi),%r8
3280bc3d5698SJohn Baldwin	adcq	8(%rdi),%r9
3281bc3d5698SJohn Baldwin	adcq	16(%rdi),%r10
3282bc3d5698SJohn Baldwin	adcq	24(%rdi),%r11
3283bc3d5698SJohn Baldwin	adcq	32(%rdi),%r12
3284bc3d5698SJohn Baldwin	adcq	40(%rdi),%r13
3285bc3d5698SJohn Baldwin	adcq	48(%rdi),%r14
3286bc3d5698SJohn Baldwin	adcq	56(%rdi),%r15
3287bc3d5698SJohn Baldwin	leaq	64(%rdi),%rdi
3288bc3d5698SJohn Baldwin	sbbq	%rax,%rax
3289bc3d5698SJohn Baldwin	subq	$8,%rcx
3290bc3d5698SJohn Baldwin
3291bc3d5698SJohn Baldwin	xorq	%rsi,%rsi
3292bc3d5698SJohn Baldwin	movq	%rax,16+8(%rsp)
3293bc3d5698SJohn Baldwin	jmp	.Lsqrx8x_tail
3294bc3d5698SJohn Baldwin
3295bc3d5698SJohn Baldwin.align	32
3296bc3d5698SJohn Baldwin.Lsqrx8x_tail_done:
3297bc3d5698SJohn Baldwin	xorq	%rax,%rax
3298bc3d5698SJohn Baldwin	addq	24+8(%rsp),%r8
3299bc3d5698SJohn Baldwin	adcq	$0,%r9
3300bc3d5698SJohn Baldwin	adcq	$0,%r10
3301bc3d5698SJohn Baldwin	adcq	$0,%r11
3302bc3d5698SJohn Baldwin	adcq	$0,%r12
3303bc3d5698SJohn Baldwin	adcq	$0,%r13
3304bc3d5698SJohn Baldwin	adcq	$0,%r14
3305bc3d5698SJohn Baldwin	adcq	$0,%r15
3306bc3d5698SJohn Baldwin	adcq	$0,%rax
3307bc3d5698SJohn Baldwin
3308bc3d5698SJohn Baldwin	subq	16+8(%rsp),%rsi
3309bc3d5698SJohn Baldwin.Lsqrx8x_no_tail:
3310bc3d5698SJohn Baldwin	adcq	0(%rdi),%r8
3311bc3d5698SJohn Baldwin.byte	102,72,15,126,217
3312bc3d5698SJohn Baldwin	adcq	8(%rdi),%r9
3313bc3d5698SJohn Baldwin	movq	56(%rbp),%rsi
3314bc3d5698SJohn Baldwin.byte	102,72,15,126,213
3315bc3d5698SJohn Baldwin	adcq	16(%rdi),%r10
3316bc3d5698SJohn Baldwin	adcq	24(%rdi),%r11
3317bc3d5698SJohn Baldwin	adcq	32(%rdi),%r12
3318bc3d5698SJohn Baldwin	adcq	40(%rdi),%r13
3319bc3d5698SJohn Baldwin	adcq	48(%rdi),%r14
3320bc3d5698SJohn Baldwin	adcq	56(%rdi),%r15
3321bc3d5698SJohn Baldwin	adcq	$0,%rax
3322bc3d5698SJohn Baldwin
3323bc3d5698SJohn Baldwin	movq	32+8(%rsp),%rbx
3324bc3d5698SJohn Baldwin	movq	64(%rdi,%rcx,1),%rdx
3325bc3d5698SJohn Baldwin
3326bc3d5698SJohn Baldwin	movq	%r8,0(%rdi)
3327bc3d5698SJohn Baldwin	leaq	64(%rdi),%r8
3328bc3d5698SJohn Baldwin	movq	%r9,8(%rdi)
3329bc3d5698SJohn Baldwin	movq	%r10,16(%rdi)
3330bc3d5698SJohn Baldwin	movq	%r11,24(%rdi)
3331bc3d5698SJohn Baldwin	movq	%r12,32(%rdi)
3332bc3d5698SJohn Baldwin	movq	%r13,40(%rdi)
3333bc3d5698SJohn Baldwin	movq	%r14,48(%rdi)
3334bc3d5698SJohn Baldwin	movq	%r15,56(%rdi)
3335bc3d5698SJohn Baldwin
3336bc3d5698SJohn Baldwin	leaq	64(%rdi,%rcx,1),%rdi
3337bc3d5698SJohn Baldwin	cmpq	8+8(%rsp),%r8
3338bc3d5698SJohn Baldwin	jb	.Lsqrx8x_reduction_loop
3339bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
3340bc3d5698SJohn Baldwin.cfi_endproc
3341bc3d5698SJohn Baldwin.size	bn_sqrx8x_internal,.-bn_sqrx8x_internal
/*
 * __bn_postx4x_internal
 *
 * Helper that walks two word arrays in parallel and stores, word by word,
 *     out[i] = tp[i] + (~np[i] & mask) + carry
 * with the carry chained through the whole array.  The lowest np word is
 * decremented before it is complemented, and ~(np[0]-1) == -np[0], so for
 * np[0] != 0 the masked operand is the two's-complement negation of np.
 * Net effect: out = tp - np when mask is all-ones, out = tp when mask is 0.
 *
 * Register use as read from the code below:
 *   In:   %rbp = np (read 32 bytes per iteration)
 *         %rdi = tp (read 32 bytes per iteration)
 *         %rax = negated on entry to form the mask
 *                NOTE(review): presumably 0 or 1 on entry, which would give
 *                mask 0 or all-ones -- confirm against callers.
 *         %rcx = shifted right arithmetically by 5 and then counted up to 0,
 *                four 64-bit words per iteration
 *                NOTE(review): presumably minus the length in bytes, a
 *                multiple of 32 -- confirm against callers.
 *         %xmm1 = low 64 bits are used as the output pointer
 *   Out:  %r9  = minus the entry value of %rcx
 *         %r10 = entry value of %rcx
 *         %rdx, %rdi, %rbp advanced past the processed words
 *         %rsi = copy of the initial output pointer
 *         %r8  = final carry as 0 / -1, %rcx = 0
 *   Also overwrites %r12-%r15 and %rax.
 *
 * Uses andn, a BMI1 instruction.
 */
3342bc3d5698SJohn Baldwin.align	32
3343bc3d5698SJohn Baldwin__bn_postx4x_internal:
3344bc3d5698SJohn Baldwin.cfi_startproc
3345bc3d5698SJohn Baldwin	movq	0(%rbp),%r12
3346bc3d5698SJohn Baldwin	movq	%rcx,%r10
3347bc3d5698SJohn Baldwin	movq	%rcx,%r9
3348bc3d5698SJohn Baldwin	negq	%rax	/* %rax becomes the AND mask */
3349bc3d5698SJohn Baldwin	sarq	$3+2,%rcx	/* count in units of 32 bytes (4 words) */
3350bc3d5698SJohn Baldwin
3351bc3d5698SJohn Baldwin.byte	102,72,15,126,202	/* movq %xmm1,%rdx : output pointer */
3352bc3d5698SJohn Baldwin.byte	102,72,15,126,206	/* movq %xmm1,%rsi : second copy of it */
3353bc3d5698SJohn Baldwin	decq	%r12	/* np[0]-1, so that ~(np[0]-1) == -np[0] */
3354bc3d5698SJohn Baldwin	movq	8(%rbp),%r13
3355bc3d5698SJohn Baldwin	xorq	%r8,%r8	/* no carry into the first group */
3356bc3d5698SJohn Baldwin	movq	16(%rbp),%r14
3357bc3d5698SJohn Baldwin	movq	24(%rbp),%r15
3358bc3d5698SJohn Baldwin	jmp	.Lsqrx4x_sub_entry
3359bc3d5698SJohn Baldwin
3360bc3d5698SJohn Baldwin.align	16
3361bc3d5698SJohn Baldwin.Lsqrx4x_sub:
3362bc3d5698SJohn Baldwin	movq	0(%rbp),%r12
3363bc3d5698SJohn Baldwin	movq	8(%rbp),%r13
3364bc3d5698SJohn Baldwin	movq	16(%rbp),%r14
3365bc3d5698SJohn Baldwin	movq	24(%rbp),%r15
3366bc3d5698SJohn Baldwin.Lsqrx4x_sub_entry:
/* Each andn computes dest = ~np_word & mask. */
3367bc3d5698SJohn Baldwin	andnq	%rax,%r12,%r12
3368bc3d5698SJohn Baldwin	leaq	32(%rbp),%rbp
3369bc3d5698SJohn Baldwin	andnq	%rax,%r13,%r13
3370bc3d5698SJohn Baldwin	andnq	%rax,%r14,%r14
3371bc3d5698SJohn Baldwin	andnq	%rax,%r15,%r15
3372bc3d5698SJohn Baldwin
/* neg turns the saved 0 / -1 in %r8 back into CF for the adc chain. */
3373bc3d5698SJohn Baldwin	negq	%r8
3374bc3d5698SJohn Baldwin	adcq	0(%rdi),%r12
3375bc3d5698SJohn Baldwin	adcq	8(%rdi),%r13
3376bc3d5698SJohn Baldwin	adcq	16(%rdi),%r14
3377bc3d5698SJohn Baldwin	adcq	24(%rdi),%r15
3378bc3d5698SJohn Baldwin	movq	%r12,0(%rdx)
3379bc3d5698SJohn Baldwin	leaq	32(%rdi),%rdi
3380bc3d5698SJohn Baldwin	movq	%r13,8(%rdx)
/* mov and lea leave flags alone, so CF here is still the adc carry-out. */
3381bc3d5698SJohn Baldwin	sbbq	%r8,%r8
3382bc3d5698SJohn Baldwin	movq	%r14,16(%rdx)
3383bc3d5698SJohn Baldwin	movq	%r15,24(%rdx)
3384bc3d5698SJohn Baldwin	leaq	32(%rdx),%rdx
3385bc3d5698SJohn Baldwin
3386bc3d5698SJohn Baldwin	incq	%rcx
3387bc3d5698SJohn Baldwin	jnz	.Lsqrx4x_sub
3388bc3d5698SJohn Baldwin
3389bc3d5698SJohn Baldwin	negq	%r9	/* hand back minus the entry value of %rcx */
3390bc3d5698SJohn Baldwin
3391bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* rep ret */
3392bc3d5698SJohn Baldwin.cfi_endproc
3393bc3d5698SJohn Baldwin.size	__bn_postx4x_internal,.-__bn_postx4x_internal
/*
 * bn_get_bits5
 *
 * Returns in %eax the 5-bit field that starts at bit offset %esi of the
 * buffer at %rdi (value range 0..31).
 *
 *   In:   %rdi = buffer address
 *         %esi = bit offset of the field
 *   Out:  %eax = the five bits, right-aligned
 *   Overwrites %ecx, %esi, %r10, %r11 and flags; uses no stack.
 *
 * The buffer is read as one 16-bit little-endian load.  A field that starts
 * above bit 11 of its 16-bit word would run past that word, so in that case
 * the load address is moved up by one byte and the shift reduced by 8.
 */
3394bc3d5698SJohn Baldwin.globl	bn_get_bits5
3395bc3d5698SJohn Baldwin.type	bn_get_bits5,@function
3396bc3d5698SJohn Baldwin.align	16
3397bc3d5698SJohn Baldwinbn_get_bits5:
3398bc3d5698SJohn Baldwin.cfi_startproc
3399bc3d5698SJohn Baldwin	leaq	0(%rdi),%r10	/* candidate base: buffer */
3400bc3d5698SJohn Baldwin	leaq	1(%rdi),%r11	/* candidate base: buffer + 1 byte */
3401bc3d5698SJohn Baldwin	movl	%esi,%ecx
3402bc3d5698SJohn Baldwin	shrl	$4,%esi	/* index of the 16-bit word */
3403bc3d5698SJohn Baldwin	andl	$15,%ecx	/* bit position inside that word */
3404bc3d5698SJohn Baldwin	leal	-8(%rcx),%eax	/* shift to use with the +1 byte base */
3405bc3d5698SJohn Baldwin	cmpl	$11,%ecx
3406bc3d5698SJohn Baldwin	cmovaq	%r11,%r10	/* position > 11: take the +1 byte base ... */
3407bc3d5698SJohn Baldwin	cmoval	%eax,%ecx	/* ... and the shift reduced by 8 */
3408bc3d5698SJohn Baldwin	movzwl	(%r10,%rsi,2),%eax
3409bc3d5698SJohn Baldwin	shrl	%cl,%eax
3410bc3d5698SJohn Baldwin	andl	$31,%eax	/* keep five bits */
3411bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* rep ret */
3412bc3d5698SJohn Baldwin.cfi_endproc
3413bc3d5698SJohn Baldwin.size	bn_get_bits5,.-bn_get_bits5
3414bc3d5698SJohn Baldwin
/*
 * bn_scatter5
 *
 * Copies %esi 64-bit words from the array at %rdi into a table at %rdx,
 * writing word i to byte offset  i*256 + %rcx*8.  In other words, each source
 * word lands in 8-byte slot %rcx of its own 256-byte row.
 *
 *   In:   %rdi = source words
 *         %esi = number of words; 0 returns immediately
 *         %rdx = table base
 *         %rcx = slot index within each row
 *                NOTE(review): presumably 0..31, since a 256-byte row holds
 *                32 slots -- the code does not check it.
 *   Overwrites %rax, %rdx, %rdi, %esi and flags; uses no stack.
 */
3415bc3d5698SJohn Baldwin.globl	bn_scatter5
3416bc3d5698SJohn Baldwin.type	bn_scatter5,@function
3417bc3d5698SJohn Baldwin.align	16
3418bc3d5698SJohn Baldwinbn_scatter5:
3419bc3d5698SJohn Baldwin.cfi_startproc
3420bc3d5698SJohn Baldwin	cmpl	$0,%esi
3421bc3d5698SJohn Baldwin	jz	.Lscatter_epilogue
3422bc3d5698SJohn Baldwin	leaq	(%rdx,%rcx,8),%rdx	/* address of the chosen slot in row 0 */
3423bc3d5698SJohn Baldwin.Lscatter:
3424bc3d5698SJohn Baldwin	movq	(%rdi),%rax
3425bc3d5698SJohn Baldwin	leaq	8(%rdi),%rdi
3426bc3d5698SJohn Baldwin	movq	%rax,(%rdx)
3427bc3d5698SJohn Baldwin	leaq	256(%rdx),%rdx	/* same slot, next row */
3428bc3d5698SJohn Baldwin	subl	$1,%esi
3429bc3d5698SJohn Baldwin	jnz	.Lscatter
3430bc3d5698SJohn Baldwin.Lscatter_epilogue:
3431bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* rep ret */
3432bc3d5698SJohn Baldwin.cfi_endproc
3433bc3d5698SJohn Baldwin.size	bn_scatter5,.-bn_scatter5
3434bc3d5698SJohn Baldwin
/*
 * bn_gather5
 *
 * Reads back %esi 64-bit words from a table laid out in 256-byte rows of
 * 32 eight-byte slots: output word i is slot %ecx of row i.
 *
 *   In:   %rdi = output array
 *         %esi = number of words; the loop tests the count only after the
 *                first pass, so it must be non-zero
 *         %rdx = table base; rows are loaded with movdqa, so this address
 *                must be 16-byte aligned
 *         %ecx = slot index
 *   Overwrites %rax, %r10, %r11, %rdi, %esi, %xmm0-%xmm5 and flags.
 *   Stack: reserves 264 bytes, aligns %rsp down to 16 and keeps 16 masks of
 *          16 bytes each there; the entry %rsp is held in %r10 and restored
 *          before returning.
 *
 * Method: sixteen 128-bit masks are built first.  Mask k is the dword-wise
 * comparison of {2k, 2k, 2k+1, 2k+1} with the broadcast index, so exactly the
 * 8 bytes of the matching slot are all-ones.  The loop then loads all 256
 * bytes of each row, ANDs them with the masks and ORs everything together.
 * Load addresses depend only on the row pointer, never on the index.
 */
3435bc3d5698SJohn Baldwin.globl	bn_gather5
3436bc3d5698SJohn Baldwin.type	bn_gather5,@function
3437bc3d5698SJohn Baldwin.align	32
3438bc3d5698SJohn Baldwinbn_gather5:
3439bc3d5698SJohn Baldwin.LSEH_begin_bn_gather5:
3440bc3d5698SJohn Baldwin.cfi_startproc
3441bc3d5698SJohn Baldwin
3442bc3d5698SJohn Baldwin.byte	0x4c,0x8d,0x14,0x24	/* leaq (%rsp),%r10 : remember entry %rsp */
3443bc3d5698SJohn Baldwin.byte	0x48,0x81,0xec,0x08,0x01,0x00,0x00	/* subq $0x108,%rsp */
3444bc3d5698SJohn Baldwin	leaq	.Linc(%rip),%rax
3445bc3d5698SJohn Baldwin	andq	$-16,%rsp	/* movdqa stores below need 16-byte alignment */
3446bc3d5698SJohn Baldwin
3447bc3d5698SJohn Baldwin	movd	%ecx,%xmm5
3448bc3d5698SJohn Baldwin	movdqa	0(%rax),%xmm0	/* {0,0,1,1} */
3449bc3d5698SJohn Baldwin	movdqa	16(%rax),%xmm1	/* {2,2,2,2} */
3450bc3d5698SJohn Baldwin	leaq	128(%rdx),%r11	/* row pointer, biased by +128 */
3451bc3d5698SJohn Baldwin	leaq	128(%rsp),%rax	/* mask pointer, biased by +128 */
3452bc3d5698SJohn Baldwin
/*
 * Mask construction.  %xmm5 = index in all four dwords, %xmm4 = {2,2,2,2}.
 * %xmm0-%xmm3 rotate as counters: each step adds 2 to the previous counter,
 * compares the previous one with the index and stores the resulting mask.
 */
3453bc3d5698SJohn Baldwin	pshufd	$0,%xmm5,%xmm5
3454bc3d5698SJohn Baldwin	movdqa	%xmm1,%xmm4
3455bc3d5698SJohn Baldwin	movdqa	%xmm1,%xmm2
3456bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
3457bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
3458bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
3459bc3d5698SJohn Baldwin
3460bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
3461bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
3462bc3d5698SJohn Baldwin	movdqa	%xmm0,-128(%rax)
3463bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
3464bc3d5698SJohn Baldwin
3465bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
3466bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
3467bc3d5698SJohn Baldwin	movdqa	%xmm1,-112(%rax)
3468bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
3469bc3d5698SJohn Baldwin
3470bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
3471bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
3472bc3d5698SJohn Baldwin	movdqa	%xmm2,-96(%rax)
3473bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
3474bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
3475bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
3476bc3d5698SJohn Baldwin	movdqa	%xmm3,-80(%rax)
3477bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
3478bc3d5698SJohn Baldwin
3479bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
3480bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
3481bc3d5698SJohn Baldwin	movdqa	%xmm0,-64(%rax)
3482bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
3483bc3d5698SJohn Baldwin
3484bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
3485bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
3486bc3d5698SJohn Baldwin	movdqa	%xmm1,-48(%rax)
3487bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
3488bc3d5698SJohn Baldwin
3489bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
3490bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
3491bc3d5698SJohn Baldwin	movdqa	%xmm2,-32(%rax)
3492bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
3493bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
3494bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
3495bc3d5698SJohn Baldwin	movdqa	%xmm3,-16(%rax)
3496bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
3497bc3d5698SJohn Baldwin
3498bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
3499bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
3500bc3d5698SJohn Baldwin	movdqa	%xmm0,0(%rax)
3501bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
3502bc3d5698SJohn Baldwin
3503bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
3504bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
3505bc3d5698SJohn Baldwin	movdqa	%xmm1,16(%rax)
3506bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
3507bc3d5698SJohn Baldwin
3508bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
3509bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
3510bc3d5698SJohn Baldwin	movdqa	%xmm2,32(%rax)
3511bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
3512bc3d5698SJohn Baldwin	paddd	%xmm0,%xmm1
3513bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm0
3514bc3d5698SJohn Baldwin	movdqa	%xmm3,48(%rax)
3515bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm3
3516bc3d5698SJohn Baldwin
3517bc3d5698SJohn Baldwin	paddd	%xmm1,%xmm2
3518bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm1
3519bc3d5698SJohn Baldwin	movdqa	%xmm0,64(%rax)
3520bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm0
3521bc3d5698SJohn Baldwin
3522bc3d5698SJohn Baldwin	paddd	%xmm2,%xmm3
3523bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm2
3524bc3d5698SJohn Baldwin	movdqa	%xmm1,80(%rax)
3525bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm1
3526bc3d5698SJohn Baldwin
3527bc3d5698SJohn Baldwin	paddd	%xmm3,%xmm0
3528bc3d5698SJohn Baldwin	pcmpeqd	%xmm5,%xmm3
3529bc3d5698SJohn Baldwin	movdqa	%xmm2,96(%rax)
3530bc3d5698SJohn Baldwin	movdqa	%xmm4,%xmm2
3531bc3d5698SJohn Baldwin	movdqa	%xmm3,112(%rax)
3532bc3d5698SJohn Baldwin	jmp	.Lgather
3533bc3d5698SJohn Baldwin
/*
 * Gather loop: one output word per pass.  %xmm4 and %xmm5 are two independent
 * OR accumulators that are merged after all sixteen masked loads.
 */
3534bc3d5698SJohn Baldwin.align	32
3535bc3d5698SJohn Baldwin.Lgather:
3536bc3d5698SJohn Baldwin	pxor	%xmm4,%xmm4
3537bc3d5698SJohn Baldwin	pxor	%xmm5,%xmm5
3538bc3d5698SJohn Baldwin	movdqa	-128(%r11),%xmm0
3539bc3d5698SJohn Baldwin	movdqa	-112(%r11),%xmm1
3540bc3d5698SJohn Baldwin	movdqa	-96(%r11),%xmm2
3541bc3d5698SJohn Baldwin	pand	-128(%rax),%xmm0
3542bc3d5698SJohn Baldwin	movdqa	-80(%r11),%xmm3
3543bc3d5698SJohn Baldwin	pand	-112(%rax),%xmm1
3544bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
3545bc3d5698SJohn Baldwin	pand	-96(%rax),%xmm2
3546bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
3547bc3d5698SJohn Baldwin	pand	-80(%rax),%xmm3
3548bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
3549bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
3550bc3d5698SJohn Baldwin	movdqa	-64(%r11),%xmm0
3551bc3d5698SJohn Baldwin	movdqa	-48(%r11),%xmm1
3552bc3d5698SJohn Baldwin	movdqa	-32(%r11),%xmm2
3553bc3d5698SJohn Baldwin	pand	-64(%rax),%xmm0
3554bc3d5698SJohn Baldwin	movdqa	-16(%r11),%xmm3
3555bc3d5698SJohn Baldwin	pand	-48(%rax),%xmm1
3556bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
3557bc3d5698SJohn Baldwin	pand	-32(%rax),%xmm2
3558bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
3559bc3d5698SJohn Baldwin	pand	-16(%rax),%xmm3
3560bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
3561bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
3562bc3d5698SJohn Baldwin	movdqa	0(%r11),%xmm0
3563bc3d5698SJohn Baldwin	movdqa	16(%r11),%xmm1
3564bc3d5698SJohn Baldwin	movdqa	32(%r11),%xmm2
3565bc3d5698SJohn Baldwin	pand	0(%rax),%xmm0
3566bc3d5698SJohn Baldwin	movdqa	48(%r11),%xmm3
3567bc3d5698SJohn Baldwin	pand	16(%rax),%xmm1
3568bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
3569bc3d5698SJohn Baldwin	pand	32(%rax),%xmm2
3570bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
3571bc3d5698SJohn Baldwin	pand	48(%rax),%xmm3
3572bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
3573bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
3574bc3d5698SJohn Baldwin	movdqa	64(%r11),%xmm0
3575bc3d5698SJohn Baldwin	movdqa	80(%r11),%xmm1
3576bc3d5698SJohn Baldwin	movdqa	96(%r11),%xmm2
3577bc3d5698SJohn Baldwin	pand	64(%rax),%xmm0
3578bc3d5698SJohn Baldwin	movdqa	112(%r11),%xmm3
3579bc3d5698SJohn Baldwin	pand	80(%rax),%xmm1
3580bc3d5698SJohn Baldwin	por	%xmm0,%xmm4
3581bc3d5698SJohn Baldwin	pand	96(%rax),%xmm2
3582bc3d5698SJohn Baldwin	por	%xmm1,%xmm5
3583bc3d5698SJohn Baldwin	pand	112(%rax),%xmm3
3584bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
3585bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
3586bc3d5698SJohn Baldwin	por	%xmm5,%xmm4
3587bc3d5698SJohn Baldwin	leaq	256(%r11),%r11	/* next row */
3588bc3d5698SJohn Baldwin	pshufd	$0x4e,%xmm4,%xmm0	/* swap the two 64-bit halves */
3589bc3d5698SJohn Baldwin	por	%xmm4,%xmm0	/* low half = even-slot | odd-slot result */
3590bc3d5698SJohn Baldwin	movq	%xmm0,(%rdi)
3591bc3d5698SJohn Baldwin	leaq	8(%rdi),%rdi
3592bc3d5698SJohn Baldwin	subl	$1,%esi
3593bc3d5698SJohn Baldwin	jnz	.Lgather
3594bc3d5698SJohn Baldwin
3595bc3d5698SJohn Baldwin	leaq	(%r10),%rsp	/* restore the entry %rsp */
3596bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* rep ret */
3597bc3d5698SJohn Baldwin.LSEH_end_bn_gather5:
3598bc3d5698SJohn Baldwin.cfi_endproc
3599bc3d5698SJohn Baldwin.size	bn_gather5,.-bn_gather5
/*
 * .Linc: two 16-byte constants loaded with movdqa by bn_gather5 -- the
 * initial slot counter {0,0,1,1} and the per-step increment {2,2,2,2}.
 * The .byte run after them is the NUL-terminated ASCII string
 * "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by
 * <appro@openssl.org>".
 */
3600bc3d5698SJohn Baldwin.align	64
3601bc3d5698SJohn Baldwin.Linc:
3602bc3d5698SJohn Baldwin.long	0,0, 1,1
3603bc3d5698SJohn Baldwin.long	2,2, 2,2
3604bc3d5698SJohn Baldwin.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
/*
 * ELF note placed in .note.gnu.property.  Layout as written below: three
 * 32-bit header words (name size, descriptor size, type), the name "GNU\0"
 * between labels 0 and 1, then one descriptor entry between labels 1 and 4
 * made of a 32-bit property type, a 32-bit data size and the data itself,
 * padded to 8 bytes.
 * NOTE(review): per the x86-64 psABI, note type 5 should be
 * NT_GNU_PROPERTY_TYPE_0, property 0xc0000002 should be
 * GNU_PROPERTY_X86_FEATURE_1_AND, and value 3 should be IBT | SHSTK --
 * confirm against the ABI document.
 */
3605*c0855eaaSJohn Baldwin	.section ".note.gnu.property", "a"
3606*c0855eaaSJohn Baldwin	.p2align 3
3607*c0855eaaSJohn Baldwin	.long 1f - 0f	# size of the name field (labels 0..1)
3608*c0855eaaSJohn Baldwin	.long 4f - 1f	# size of the descriptor (labels 1..4)
3609*c0855eaaSJohn Baldwin	.long 5	# note type
3610*c0855eaaSJohn Baldwin0:
3611*c0855eaaSJohn Baldwin	# "GNU" encoded with .byte, since .asciz isn't supported
3612*c0855eaaSJohn Baldwin	# on Solaris.
3613*c0855eaaSJohn Baldwin	.byte 0x47
3614*c0855eaaSJohn Baldwin	.byte 0x4e
3615*c0855eaaSJohn Baldwin	.byte 0x55
3616*c0855eaaSJohn Baldwin	.byte 0
3617*c0855eaaSJohn Baldwin1:
3618*c0855eaaSJohn Baldwin	.p2align 3
3619*c0855eaaSJohn Baldwin	.long 0xc0000002	# property type
3620*c0855eaaSJohn Baldwin	.long 3f - 2f	# size of the property data (labels 2..3)
3621*c0855eaaSJohn Baldwin2:
3622*c0855eaaSJohn Baldwin	.long 3	# property data
3623*c0855eaaSJohn Baldwin3:
3624*c0855eaaSJohn Baldwin	.p2align 3
3625*c0855eaaSJohn Baldwin4:
3626