/* xref: /freebsd/sys/crypto/openssl/amd64/x86_64-mont.S (revision c0855eaa3ee9614804b6bd6a255aa9f71e095f43) */
/* Do not modify. This file is auto-generated from x86_64-mont.pl. */
.text



/*
 * bn_mul_mont -- exported entry point; word-serial multiply-and-reduce over
 * vectors of 64-bit words ("mont" per the symbol name).
 *
 * Register use on entry, as read by the code below (System V argument order):
 *   %rdi  result vector, num words, written by .Lsub/.Lcopy
 *   %rsi  first operand vector  ("ap" in the comments below)
 *   %rdx  second operand vector ("bp"; kept in %r12)
 *   %rcx  vector that is multiplied by the per-row factor m and subtracted
 *         at the end ("np"; presumably the modulus -- TODO confirm)
 *   %r8   pointer to one word ("n0"); m = low64(t * n0) for every row
 *   %r9d  num, the number of 64-bit words (zero-extended to %r9)
 * NOTE(review): this presumably matches the C prototype
 *   int bn_mul_mont(rp, ap, bp, np, n0, num) -- confirm against the caller.
 *
 * Returns: %rax = 1.
 *
 * Dispatch: if num is not a multiple of 4, or num < 8, the generic loop at
 * .Lmul_enter runs.  Otherwise control leaves for .Lmul4x_enter (with %r11d
 * preloaded from OPENSSL_ia32cap_P+8), or for .Lsqr8x_enter when both operand
 * pointers are equal and num is a multiple of 8.
 *
 * Frame built by the generic path: tp[0..num] starts at (%rsp); the entry
 * value of %rsp is stored at 8(%rsp,num,8), right after tp[num].
 */
.globl	bn_mul_mont
.type	bn_mul_mont,@function
.align	16
bn_mul_mont:
.cfi_startproc
	/* Zero-extend num and remember the entry stack pointer in %rax. */
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
	testl	$3,%r9d
	jnz	.Lmul_enter
	cmpl	$8,%r9d
	jb	.Lmul_enter
	/* Capability word consumed by the code at .Lmul4x_enter. */
	movl	OPENSSL_ia32cap_P+8(%rip),%r11d
	cmpq	%rsi,%rdx
	jne	.Lmul4x_enter
	testl	$7,%r9d
	jz	.Lsqr8x_enter
	jmp	.Lmul4x_enter

.align	16
.Lmul_enter:
	/* Save callee-saved registers; the offsets are relative to the CFA. */
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* %r10 = (%rsp - 16 - 8*num) rounded down to a 1024-byte boundary. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-16(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/*
	 * Move %rsp down to %r10 in steps of at most 4096 bytes, reading one
	 * word at each step so that every page on the way is touched in order.
	 */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
	jmp	.Lmul_page_walk_done

.align	16
.Lmul_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul_page_walk
.Lmul_page_walk_done:

	/* Store the entry %rsp just above tp[num]. */
	movq	%rax,8(%rsp,%r9,8)
	/* DWARF expression: CFA = *(%rsp + 8 + %r9*8) + 8, i.e. the slot above. */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul_body:
	/* %r12 = bp, %r8 = n0 value, %rbx = bp[0], %rax = ap[0]. */
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	/* %r14 = i (row index), %r15 = j (column index). */
	xorq	%r14,%r14
	xorq	%r15,%r15

	/* Row 0: %r11:%r10 = ap[0]*bp[0]; m (%rbp) = low64(%r10 * n0). */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	/* np[0]*m + %r10: the low word is dropped, the high word plus carry
	 * starts the running sum in %r13. */
	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.L1st_enter

.align	16
.L1st:
	/* tp[j-2] = low(np[j-1]*m) + low(ap[j-1]*bp[0]) + carries
	 * (j is already incremented, hence the -16 displacement). */
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	movq	%r10,%r11
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.L1st_enter:
	/* %rax holds ap[j]: form ap[j]*bp[0], then np[j]*m. */
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	1(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.L1st

	/* Row tail: store tp[num-2], reload ap[0] for the next row. */
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13
	movq	%r10,%r11

	/* tp[num-1] = sum of the two high words, tp[num] = its carry. */
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	jmp	.Louter
.align	16
.Louter:
	/* Rows 1..num-1: same as row 0, but the previous tp[] is added in. */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	%r8,%rbp
	movq	(%rsp),%r10
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	/* m = low64((tp[0] + ap[0]*bp[i]) * n0). */
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	8(%rsp),%r10
	movq	%rdx,%r13

	leaq	1(%r15),%r15
	jmp	.Linner_enter

.align	16
.Linner:
	/* Store tp[j-2] and fetch the next old tp word into %r10. */
	addq	%rax,%r13
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

.Linner_enter:
	/* %r10 += low(ap[j]*bp[i]) + previous high word; %r11 = new high word. */
	mulq	%rbx
	addq	%rax,%r11
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%r10
	movq	%rdx,%r11
	adcq	$0,%r11
	leaq	1(%r15),%r15

	mulq	%rbp
	cmpq	%r9,%r15
	jne	.Linner

	/* Row tail: store tp[num-2]; %r10 = old tp[num]; reload ap[0]. */
	addq	%rax,%r13
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	movq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%r13,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	/* tp[num-1] = high words + old tp[num]; tp[num] = resulting carry. */
	xorq	%rdx,%rdx
	addq	%r11,%r13
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r9,8)
	movq	%rdx,(%rsp,%r9,8)

	leaq	1(%r14),%r14
	cmpq	%r9,%r14
	jb	.Louter

	/*
	 * rp[] = tp[] - np[] with borrow propagation.  The xorq clears CF for
	 * the first sbbq; movq/leaq do not write flags and decq leaves CF
	 * untouched, so the borrow survives from one iteration to the next.
	 */
	xorq	%r14,%r14
	movq	(%rsp),%rax
	movq	%r9,%r15

.align	16
.Lsub:	sbbq	(%rcx,%r14,8),%rax
	movq	%rax,(%rdi,%r14,8)
	movq	8(%rsp,%r14,8),%rax
	leaq	1(%r14),%r14
	decq	%r15
	jnz	.Lsub

	/* %rax = tp[num] - borrow, used as the select mask for tp[];
	 * %rbx = ~%rax selects the subtracted value already in rp[]. */
	sbbq	$0,%rax
	movq	$-1,%rbx
	xorq	%rax,%rbx
	xorq	%r14,%r14
	movq	%r9,%r15

.Lcopy:
	/* Branch-free select: rp[i] = (rp[i] & %rbx) | (tp[i] & %rax).
	 * Each tp[i] is overwritten (with the value of num) once it is read. */
	movq	(%rdi,%r14,8),%rcx
	movq	(%rsp,%r14,8),%rdx
	andq	%rbx,%rcx
	andq	%rax,%rdx
	movq	%r9,(%rsp,%r14,8)
	orq	%rcx,%rdx
	movq	%rdx,(%rdi,%r14,8)
	leaq	1(%r14),%r14
	subq	$1,%r15
	jnz	.Lcopy

	/* Epilogue: %rsi = saved entry %rsp, return value 1, then restore the
	 * callee-saved registers from the slots written by the pushes. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi,8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul_epilogue:
	/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul_mont,.-bn_mul_mont
/*
 * bn_mul4x_mont -- file-local (no .globl) variant of the multiply-and-reduce
 * loop with the column loop unrolled four times.
 *
 * It is normally entered at .Lmul4x_enter from bn_mul_mont, which has already
 * zero-extended num into %r9, copied the entry %rsp into %rax and loaded
 * OPENSSL_ia32cap_P+8 into %r11d.  The instructions between the function
 * label and .Lmul4x_enter do not load %r11d.
 *
 * Register use (same roles as in bn_mul_mont):
 *   %rdi result vector, %rsi ap, %rdx bp (kept in %r12), %rcx np,
 *   %r8 pointer to the n0 word, %r9 num.
 * The dispatcher in bn_mul_mont only sends num >= 8 with num % 4 == 0 here.
 *
 * Returns: %rax = 1.
 *
 * Frame: tp[0..num] at (%rsp); entry %rsp saved at 8(%rsp,num,8); the result
 * pointer saved at 16(%rsp,num,8) because %rdi is reused as an accumulator.
 */
.type	bn_mul4x_mont,@function
.align	16
bn_mul4x_mont:
.cfi_startproc
	movl	%r9d,%r9d
	movq	%rsp,%rax
.cfi_def_cfa_register	%rax
.Lmul4x_enter:
	/* If both bits of mask 0x80100 are set in the capability word, use the
	 * code at .Lmulx4x_enter instead (presumably the BMI2 and ADX feature
	 * bits -- TODO confirm against the OPENSSL_ia32cap layout). */
	andl	$0x80100,%r11d
	cmpl	$0x80100,%r11d
	je	.Lmulx4x_enter
	/* Save callee-saved registers; the offsets are relative to the CFA. */
	pushq	%rbx
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_offset	%r15,-56

	/* %r10 = (%rsp - 32 - 8*num) rounded down to a 1024-byte boundary. */
	negq	%r9
	movq	%rsp,%r11
	leaq	-32(%rsp,%r9,8),%r10
	negq	%r9
	andq	$-1024,%r10

	/* Walk %rsp down to %r10 in steps of at most 4096 bytes, reading one
	 * word per step so that every page on the way is touched in order. */
	subq	%r10,%r11
	andq	$-4096,%r11
	leaq	(%r10,%r11,1),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
	jmp	.Lmul4x_page_walk_done

.Lmul4x_page_walk:
	leaq	-4096(%rsp),%rsp
	movq	(%rsp),%r11
	cmpq	%r10,%rsp
	ja	.Lmul4x_page_walk
.Lmul4x_page_walk_done:

	/* Store the entry %rsp just above tp[num]. */
	movq	%rax,8(%rsp,%r9,8)
	/* DWARF expression: CFA = *(%rsp + 8 + %r9*8) + 8, i.e. the slot above. */
.cfi_escape	0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08
.Lmul4x_body:
	/* Save the result pointer; %r12 = bp, %r8 = n0 value,
	 * %rbx = bp[0], %rax = ap[0]. */
	movq	%rdi,16(%rsp,%r9,8)
	movq	%rdx,%r12
	movq	(%r8),%r8
	movq	(%r12),%rbx
	movq	(%rsi),%rax

	/* %r14 = i (row index), %r15 = j (column index). */
	xorq	%r14,%r14
	xorq	%r15,%r15

	/* Row 0: %r11:%r10 = ap[0]*bp[0]; m (%rbp) = low64(%r10 * n0). */
	movq	%r8,%rbp
	mulq	%rbx
	movq	%rax,%r10
	movq	(%rcx),%rax

	imulq	%r10,%rbp
	movq	%rdx,%r11

	/* np[0]*m + %r10: low word dropped, high word plus carry kept in %rdi. */
	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	/* Column 1: ap[1]*bp[0] and np[1]*m give tp[0]. */
	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.L1st4x
.align	16
.L1st4x:
	/* Four columns per iteration.  The running sums alternate between
	 * %r13 and %rdi, the ap*bp high words between %r10 and %r11. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.L1st4x

	/* Row-0 tail: the last two columns, then reload ap[0]. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	/* tp[num-1] = sum of the two high words, tp[num] = its carry. */
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	leaq	1(%r14),%r14
.align	4
.Louter4x:
	/* Rows 1..num-1: same as row 0, but the previous tp[] is added in. */
	movq	(%r12,%r14,8),%rbx
	xorq	%r15,%r15
	movq	(%rsp),%r10
	movq	%r8,%rbp
	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx),%rax
	adcq	$0,%rdx

	/* m = low64((tp[0] + ap[0]*bp[i]) * n0). */
	imulq	%r10,%rbp
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r10
	movq	8(%rsi),%rax
	adcq	$0,%rdx
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx),%rax
	adcq	$0,%rdx
	addq	8(%rsp),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	16(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	leaq	4(%r15),%r15
	adcq	$0,%rdx
	movq	%rdi,(%rsp)
	movq	%rdx,%r13
	jmp	.Linner4x
.align	16
.Linner4x:
	/* Four columns per iteration; each ap*bp partial also absorbs the
	 * old tp word for that column straight from the stack. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	mulq	%rbx
	addq	%rax,%r10
	movq	(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	4(%r15),%r15
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	-16(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-32(%rsp,%r15,8)
	movq	%rdx,%r13
	cmpq	%r9,%r15
	jb	.Linner4x

	/* Row tail: the last two columns; the row index is bumped here and
	 * ap[0] is reloaded for the next row. */
	mulq	%rbx
	addq	%rax,%r10
	movq	-16(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-16(%rsp,%r15,8),%r10
	adcq	$0,%rdx
	movq	%rdx,%r11

	mulq	%rbp
	addq	%rax,%r13
	movq	-8(%rsi,%r15,8),%rax
	adcq	$0,%rdx
	addq	%r10,%r13
	adcq	$0,%rdx
	movq	%r13,-24(%rsp,%r15,8)
	movq	%rdx,%rdi

	mulq	%rbx
	addq	%rax,%r11
	movq	-8(%rcx,%r15,8),%rax
	adcq	$0,%rdx
	addq	-8(%rsp,%r15,8),%r11
	adcq	$0,%rdx
	leaq	1(%r14),%r14
	movq	%rdx,%r10

	mulq	%rbp
	addq	%rax,%rdi
	movq	(%rsi),%rax
	adcq	$0,%rdx
	addq	%r11,%rdi
	adcq	$0,%rdx
	movq	%rdi,-16(%rsp,%r15,8)
	movq	%rdx,%r13

	/* tp[num-1] = high words + old tp[num]; tp[num] = resulting carry. */
	xorq	%rdi,%rdi
	addq	%r10,%r13
	adcq	$0,%rdi
	addq	(%rsp,%r9,8),%r13
	adcq	$0,%rdi
	movq	%r13,-8(%rsp,%r15,8)
	movq	%rdi,(%rsp,%r15,8)

	cmpq	%r9,%r14
	jb	.Louter4x
	/*
	 * rp[] = tp[] - np[], four words per iteration.  %rdi is reloaded with
	 * the saved result pointer, %rsi = tp, %r15 = (num - 4) / 4 loop trips.
	 * The subq starts the borrow chain; movq/leaq do not write flags and
	 * decq leaves CF untouched, so the borrow survives across iterations.
	 */
	movq	16(%rsp,%r9,8),%rdi
	leaq	-4(%r9),%r15
	movq	0(%rsp),%rax
	movq	8(%rsp),%rdx
	shrq	$2,%r15
	leaq	(%rsp),%rsi
	xorq	%r14,%r14

	subq	0(%rcx),%rax
	movq	16(%rsi),%rbx
	movq	24(%rsi),%rbp
	sbbq	8(%rcx),%rdx

.Lsub4x:
	movq	%rax,0(%rdi,%r14,8)
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	16(%rcx,%r14,8),%rbx
	movq	32(%rsi,%r14,8),%rax
	movq	40(%rsi,%r14,8),%rdx
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)
	movq	%rbp,24(%rdi,%r14,8)
	sbbq	32(%rcx,%r14,8),%rax
	movq	48(%rsi,%r14,8),%rbx
	movq	56(%rsi,%r14,8),%rbp
	sbbq	40(%rcx,%r14,8),%rdx
	leaq	4(%r14),%r14
	decq	%r15
	jnz	.Lsub4x

	/* Final four words; the load into %rax fetches tp[num]. */
	movq	%rax,0(%rdi,%r14,8)
	movq	32(%rsi,%r14,8),%rax
	sbbq	16(%rcx,%r14,8),%rbx
	movq	%rdx,8(%rdi,%r14,8)
	sbbq	24(%rcx,%r14,8),%rbp
	movq	%rbx,16(%rdi,%r14,8)

	/* %rax = tp[num] - borrow becomes the select mask: its low dword is
	 * broadcast into %xmm4, and %xmm5 = ~%xmm4. */
	sbbq	$0,%rax
	movq	%rbp,24(%rdi,%r14,8)
	pxor	%xmm0,%xmm0
	/* 66 48 0f 6e e0 = movq %rax,%xmm4 */
.byte	102,72,15,110,224
	pcmpeqd	%xmm5,%xmm5
	pshufd	$0,%xmm4,%xmm4
	movq	%r9,%r15
	pxor	%xmm4,%xmm5
	shrq	$2,%r15
	xorl	%eax,%eax

	jmp	.Lcopy4x
.align	16
.Lcopy4x:
	/* Branch-free select, 32 bytes per iteration:
	 * rp = (tp & %xmm4) | (rp & %xmm5); tp is zeroed once it is read. */
	movdqa	(%rsp,%rax,1),%xmm1
	movdqu	(%rdi,%rax,1),%xmm2
	pand	%xmm4,%xmm1
	pand	%xmm5,%xmm2
	movdqa	16(%rsp,%rax,1),%xmm3
	movdqa	%xmm0,(%rsp,%rax,1)
	por	%xmm2,%xmm1
	movdqu	16(%rdi,%rax,1),%xmm2
	movdqu	%xmm1,(%rdi,%rax,1)
	pand	%xmm4,%xmm3
	pand	%xmm5,%xmm2
	movdqa	%xmm0,16(%rsp,%rax,1)
	por	%xmm2,%xmm3
	movdqu	%xmm3,16(%rdi,%rax,1)
	leaq	32(%rax),%rax
	decq	%r15
	jnz	.Lcopy4x
	/* Epilogue: %rsi = saved entry %rsp, return value 1, then restore the
	 * callee-saved registers from the slots written by the pushes. */
	movq	8(%rsp,%r9,8),%rsi
.cfi_def_cfa	%rsi, 8
	movq	$1,%rax
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lmul4x_epilogue:
	/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
	.byte	0xf3,0xc3
.cfi_endproc
.size	bn_mul4x_mont,.-bn_mul4x_mont
693bc3d5698SJohn Baldwin
694bc3d5698SJohn Baldwin
695bc3d5698SJohn Baldwin
696bc3d5698SJohn Baldwin.type	bn_sqr8x_mont,@function
697bc3d5698SJohn Baldwin.align	32
698bc3d5698SJohn Baldwinbn_sqr8x_mont:
699bc3d5698SJohn Baldwin.cfi_startproc
700bc3d5698SJohn Baldwin	movq	%rsp,%rax
701bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rax
702bc3d5698SJohn Baldwin.Lsqr8x_enter:
703bc3d5698SJohn Baldwin	pushq	%rbx
704bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
705bc3d5698SJohn Baldwin	pushq	%rbp
706bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
707bc3d5698SJohn Baldwin	pushq	%r12
708bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
709bc3d5698SJohn Baldwin	pushq	%r13
710bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
711bc3d5698SJohn Baldwin	pushq	%r14
712bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
713bc3d5698SJohn Baldwin	pushq	%r15
714bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
715bc3d5698SJohn Baldwin.Lsqr8x_prologue:
716bc3d5698SJohn Baldwin
717bc3d5698SJohn Baldwin	movl	%r9d,%r10d
718bc3d5698SJohn Baldwin	shll	$3,%r9d
719bc3d5698SJohn Baldwin	shlq	$3+2,%r10
720bc3d5698SJohn Baldwin	negq	%r9
721bc3d5698SJohn Baldwin
722bc3d5698SJohn Baldwin
723bc3d5698SJohn Baldwin
724bc3d5698SJohn Baldwin
725bc3d5698SJohn Baldwin
726bc3d5698SJohn Baldwin
727bc3d5698SJohn Baldwin	leaq	-64(%rsp,%r9,2),%r11
728bc3d5698SJohn Baldwin	movq	%rsp,%rbp
729bc3d5698SJohn Baldwin	movq	(%r8),%r8
730bc3d5698SJohn Baldwin	subq	%rsi,%r11
731bc3d5698SJohn Baldwin	andq	$4095,%r11
732bc3d5698SJohn Baldwin	cmpq	%r11,%r10
733bc3d5698SJohn Baldwin	jb	.Lsqr8x_sp_alt
734bc3d5698SJohn Baldwin	subq	%r11,%rbp
735bc3d5698SJohn Baldwin	leaq	-64(%rbp,%r9,2),%rbp
736bc3d5698SJohn Baldwin	jmp	.Lsqr8x_sp_done
737bc3d5698SJohn Baldwin
738bc3d5698SJohn Baldwin.align	32
739bc3d5698SJohn Baldwin.Lsqr8x_sp_alt:
740bc3d5698SJohn Baldwin	leaq	4096-64(,%r9,2),%r10
741bc3d5698SJohn Baldwin	leaq	-64(%rbp,%r9,2),%rbp
742bc3d5698SJohn Baldwin	subq	%r10,%r11
743bc3d5698SJohn Baldwin	movq	$0,%r10
744bc3d5698SJohn Baldwin	cmovcq	%r10,%r11
745bc3d5698SJohn Baldwin	subq	%r11,%rbp
746bc3d5698SJohn Baldwin.Lsqr8x_sp_done:
747bc3d5698SJohn Baldwin	andq	$-64,%rbp
748bc3d5698SJohn Baldwin	movq	%rsp,%r11
749bc3d5698SJohn Baldwin	subq	%rbp,%r11
750bc3d5698SJohn Baldwin	andq	$-4096,%r11
751bc3d5698SJohn Baldwin	leaq	(%r11,%rbp,1),%rsp
752bc3d5698SJohn Baldwin	movq	(%rsp),%r10
753bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
754bc3d5698SJohn Baldwin	ja	.Lsqr8x_page_walk
755bc3d5698SJohn Baldwin	jmp	.Lsqr8x_page_walk_done
756bc3d5698SJohn Baldwin
757bc3d5698SJohn Baldwin.align	16
758bc3d5698SJohn Baldwin.Lsqr8x_page_walk:
759bc3d5698SJohn Baldwin	leaq	-4096(%rsp),%rsp
760bc3d5698SJohn Baldwin	movq	(%rsp),%r10
761bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
762bc3d5698SJohn Baldwin	ja	.Lsqr8x_page_walk
763bc3d5698SJohn Baldwin.Lsqr8x_page_walk_done:
764bc3d5698SJohn Baldwin
765bc3d5698SJohn Baldwin	movq	%r9,%r10
766bc3d5698SJohn Baldwin	negq	%r9
767bc3d5698SJohn Baldwin
768bc3d5698SJohn Baldwin	movq	%r8,32(%rsp)
769bc3d5698SJohn Baldwin	movq	%rax,40(%rsp)
770bc3d5698SJohn Baldwin.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
771bc3d5698SJohn Baldwin.Lsqr8x_body:
772bc3d5698SJohn Baldwin
773bc3d5698SJohn Baldwin.byte	102,72,15,110,209
774bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm0
775bc3d5698SJohn Baldwin.byte	102,72,15,110,207
776bc3d5698SJohn Baldwin.byte	102,73,15,110,218
777bc3d5698SJohn Baldwin	movl	OPENSSL_ia32cap_P+8(%rip),%eax
778bc3d5698SJohn Baldwin	andl	$0x80100,%eax
779bc3d5698SJohn Baldwin	cmpl	$0x80100,%eax
780bc3d5698SJohn Baldwin	jne	.Lsqr8x_nox
781bc3d5698SJohn Baldwin
782bc3d5698SJohn Baldwin	call	bn_sqrx8x_internal
783bc3d5698SJohn Baldwin
784bc3d5698SJohn Baldwin
785bc3d5698SJohn Baldwin
786bc3d5698SJohn Baldwin
787bc3d5698SJohn Baldwin	leaq	(%r8,%rcx,1),%rbx
788bc3d5698SJohn Baldwin	movq	%rcx,%r9
789bc3d5698SJohn Baldwin	movq	%rcx,%rdx
790bc3d5698SJohn Baldwin.byte	102,72,15,126,207
791bc3d5698SJohn Baldwin	sarq	$3+2,%rcx
792bc3d5698SJohn Baldwin	jmp	.Lsqr8x_sub
793bc3d5698SJohn Baldwin
794bc3d5698SJohn Baldwin.align	32
795bc3d5698SJohn Baldwin.Lsqr8x_nox:
796bc3d5698SJohn Baldwin	call	bn_sqr8x_internal
797bc3d5698SJohn Baldwin
798bc3d5698SJohn Baldwin
799bc3d5698SJohn Baldwin
800bc3d5698SJohn Baldwin
801bc3d5698SJohn Baldwin	leaq	(%rdi,%r9,1),%rbx
802bc3d5698SJohn Baldwin	movq	%r9,%rcx
803bc3d5698SJohn Baldwin	movq	%r9,%rdx
804bc3d5698SJohn Baldwin.byte	102,72,15,126,207
805bc3d5698SJohn Baldwin	sarq	$3+2,%rcx
806bc3d5698SJohn Baldwin	jmp	.Lsqr8x_sub
807bc3d5698SJohn Baldwin
808bc3d5698SJohn Baldwin.align	32
809bc3d5698SJohn Baldwin.Lsqr8x_sub:
810bc3d5698SJohn Baldwin	movq	0(%rbx),%r12
811bc3d5698SJohn Baldwin	movq	8(%rbx),%r13
812bc3d5698SJohn Baldwin	movq	16(%rbx),%r14
813bc3d5698SJohn Baldwin	movq	24(%rbx),%r15
814bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
815bc3d5698SJohn Baldwin	sbbq	0(%rbp),%r12
816bc3d5698SJohn Baldwin	sbbq	8(%rbp),%r13
817bc3d5698SJohn Baldwin	sbbq	16(%rbp),%r14
818bc3d5698SJohn Baldwin	sbbq	24(%rbp),%r15
819bc3d5698SJohn Baldwin	leaq	32(%rbp),%rbp
820bc3d5698SJohn Baldwin	movq	%r12,0(%rdi)
821bc3d5698SJohn Baldwin	movq	%r13,8(%rdi)
822bc3d5698SJohn Baldwin	movq	%r14,16(%rdi)
823bc3d5698SJohn Baldwin	movq	%r15,24(%rdi)
824bc3d5698SJohn Baldwin	leaq	32(%rdi),%rdi
825bc3d5698SJohn Baldwin	incq	%rcx
826bc3d5698SJohn Baldwin	jnz	.Lsqr8x_sub
827bc3d5698SJohn Baldwin
828bc3d5698SJohn Baldwin	sbbq	$0,%rax
829bc3d5698SJohn Baldwin	leaq	(%rbx,%r9,1),%rbx
830bc3d5698SJohn Baldwin	leaq	(%rdi,%r9,1),%rdi
831bc3d5698SJohn Baldwin
832bc3d5698SJohn Baldwin.byte	102,72,15,110,200
833bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm0
834bc3d5698SJohn Baldwin	pshufd	$0,%xmm1,%xmm1
835bc3d5698SJohn Baldwin	movq	40(%rsp),%rsi
836bc3d5698SJohn Baldwin.cfi_def_cfa	%rsi,8
837bc3d5698SJohn Baldwin	jmp	.Lsqr8x_cond_copy
838bc3d5698SJohn Baldwin
839bc3d5698SJohn Baldwin.align	32
840bc3d5698SJohn Baldwin.Lsqr8x_cond_copy:
841bc3d5698SJohn Baldwin	movdqa	0(%rbx),%xmm2
842bc3d5698SJohn Baldwin	movdqa	16(%rbx),%xmm3
843bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
844bc3d5698SJohn Baldwin	movdqu	0(%rdi),%xmm4
845bc3d5698SJohn Baldwin	movdqu	16(%rdi),%xmm5
846bc3d5698SJohn Baldwin	leaq	32(%rdi),%rdi
847bc3d5698SJohn Baldwin	movdqa	%xmm0,-32(%rbx)
848bc3d5698SJohn Baldwin	movdqa	%xmm0,-16(%rbx)
849bc3d5698SJohn Baldwin	movdqa	%xmm0,-32(%rbx,%rdx,1)
850bc3d5698SJohn Baldwin	movdqa	%xmm0,-16(%rbx,%rdx,1)
851bc3d5698SJohn Baldwin	pcmpeqd	%xmm1,%xmm0
852bc3d5698SJohn Baldwin	pand	%xmm1,%xmm2
853bc3d5698SJohn Baldwin	pand	%xmm1,%xmm3
854bc3d5698SJohn Baldwin	pand	%xmm0,%xmm4
855bc3d5698SJohn Baldwin	pand	%xmm0,%xmm5
856bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm0
857bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
858bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
859bc3d5698SJohn Baldwin	movdqu	%xmm4,-32(%rdi)
860bc3d5698SJohn Baldwin	movdqu	%xmm5,-16(%rdi)
861bc3d5698SJohn Baldwin	addq	$32,%r9
862bc3d5698SJohn Baldwin	jnz	.Lsqr8x_cond_copy
863bc3d5698SJohn Baldwin
864bc3d5698SJohn Baldwin	movq	$1,%rax
865bc3d5698SJohn Baldwin	movq	-48(%rsi),%r15
866bc3d5698SJohn Baldwin.cfi_restore	%r15
867bc3d5698SJohn Baldwin	movq	-40(%rsi),%r14
868bc3d5698SJohn Baldwin.cfi_restore	%r14
869bc3d5698SJohn Baldwin	movq	-32(%rsi),%r13
870bc3d5698SJohn Baldwin.cfi_restore	%r13
871bc3d5698SJohn Baldwin	movq	-24(%rsi),%r12
872bc3d5698SJohn Baldwin.cfi_restore	%r12
873bc3d5698SJohn Baldwin	movq	-16(%rsi),%rbp
874bc3d5698SJohn Baldwin.cfi_restore	%rbp
875bc3d5698SJohn Baldwin	movq	-8(%rsi),%rbx
876bc3d5698SJohn Baldwin.cfi_restore	%rbx
877bc3d5698SJohn Baldwin	leaq	(%rsi),%rsp
878bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
879bc3d5698SJohn Baldwin.Lsqr8x_epilogue:
880bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
881bc3d5698SJohn Baldwin.cfi_endproc
882bc3d5698SJohn Baldwin.size	bn_sqr8x_mont,.-bn_sqr8x_mont
/*
 * bn_mulx4x_mont: word-serial Montgomery multiplication built on
 * MULX/ADCX/ADOX, processing four 64-bit words per inner-loop pass.
 *
 * Register use at entry, as read by the code below (this looks like the
 * bn_mul_mont(rp, ap, bp, np, n0, num) argument order, but the C prototype
 * is not visible in this part of the file - confirm against the caller):
 *   %rdi  destination vector; saved at 32(%rsp), written by .Lmulx4x_sub
 *         and .Lmulx4x_cond_copy
 *   %rsi  first multiplicand, walked four words at a time
 *   %rdx  second multiplicand; one word is consumed per outer pass
 *   %rcx  vector used by the reduction step and subtracted at the end
 *         (presumably the modulus)
 *   %r8   pointer to the 64-bit constant used to derive the per-pass
 *         reduction multiplier
 *   %r9d  length in 64-bit words (only the low 32 bits are used)
 * Returns 1 in %rax.  %rbx, %rbp and %r12-%r15 are saved and restored;
 * %rcx, %rdx, %rsi, %rdi, %r8-%r11, %xmm0-%xmm5 and the flags are
 * overwritten.
 *
 * The inner loops run (length/4 - 1) times with a decrement-then-test
 * exit, so the length has to be a multiple of 4 and at least 8.  The
 * bn_mul_mont dispatcher at the top of the file only leaves its generic
 * path under those conditions; how control reaches .Lmulx4x_enter is not
 * visible here.
 *
 * Stack frame after the prologue (offsets from the new %rsp):
 *    0  length in bytes (8 * word count)
 *    8  pointer to the next word of the second multiplicand
 *   16  end pointer of the second multiplicand
 *   24  the constant loaded from (%r8)
 *   32  destination pointer
 *   40  %rsp as it was on entry (also the base for the unwind expression)
 *   48  inner loop count (word count / 4 - 1)
 *   64  temporary result vector, (word count + 1) words
 */
883bc3d5698SJohn Baldwin.type	bn_mulx4x_mont,@function
884bc3d5698SJohn Baldwin.align	32
885bc3d5698SJohn Baldwinbn_mulx4x_mont:
886bc3d5698SJohn Baldwin.cfi_startproc
/* Keep the entry stack pointer in %rax; it is stored in the frame below. */
887bc3d5698SJohn Baldwin	movq	%rsp,%rax
888bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rax
/* Secondary entry label: code arriving here must already hold the entry
 * %rsp in %rax, because the instruction above is skipped. */
889bc3d5698SJohn Baldwin.Lmulx4x_enter:
890bc3d5698SJohn Baldwin	pushq	%rbx
891bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
892bc3d5698SJohn Baldwin	pushq	%rbp
893bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
894bc3d5698SJohn Baldwin	pushq	%r12
895bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
896bc3d5698SJohn Baldwin	pushq	%r13
897bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
898bc3d5698SJohn Baldwin	pushq	%r14
899bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
900bc3d5698SJohn Baldwin	pushq	%r15
901bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
902bc3d5698SJohn Baldwin.Lmulx4x_prologue:
903bc3d5698SJohn Baldwin
/* %r9 = length in bytes (the 32-bit shift also clears the upper half),
 * %r10 = minus that length, %r8 = the constant itself rather than its
 * address.  %rbp = lowest address of the new frame: 72 bytes of header
 * and top word plus the temporary vector, rounded down to 128 bytes. */
904bc3d5698SJohn Baldwin	shll	$3,%r9d
905bc3d5698SJohn Baldwin	xorq	%r10,%r10
906bc3d5698SJohn Baldwin	subq	%r9,%r10
907bc3d5698SJohn Baldwin	movq	(%r8),%r8
908bc3d5698SJohn Baldwin	leaq	-72(%rsp,%r10,1),%rbp
909bc3d5698SJohn Baldwin	andq	$-128,%rbp
/* Lower %rsp to the frame in steps: first to the highest address that is
 * a whole number of 4096-byte pages above %rbp, then one page at a time,
 * reading one word at every stop so each page is touched in descending
 * order. */
910bc3d5698SJohn Baldwin	movq	%rsp,%r11
911bc3d5698SJohn Baldwin	subq	%rbp,%r11
912bc3d5698SJohn Baldwin	andq	$-4096,%r11
913bc3d5698SJohn Baldwin	leaq	(%r11,%rbp,1),%rsp
914bc3d5698SJohn Baldwin	movq	(%rsp),%r10
915bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
916bc3d5698SJohn Baldwin	ja	.Lmulx4x_page_walk
917bc3d5698SJohn Baldwin	jmp	.Lmulx4x_page_walk_done
918bc3d5698SJohn Baldwin
919bc3d5698SJohn Baldwin.align	16
920bc3d5698SJohn Baldwin.Lmulx4x_page_walk:
921bc3d5698SJohn Baldwin	leaq	-4096(%rsp),%rsp
922bc3d5698SJohn Baldwin	movq	(%rsp),%r10
923bc3d5698SJohn Baldwin	cmpq	%rbp,%rsp
924bc3d5698SJohn Baldwin	ja	.Lmulx4x_page_walk
925bc3d5698SJohn Baldwin.Lmulx4x_page_walk_done:
926bc3d5698SJohn Baldwin
/* %r10 = one past the last word of the second multiplicand; the outer
 * loop stops when its word pointer reaches this address. */
927bc3d5698SJohn Baldwin	leaq	(%rdx,%r9,1),%r10
928bc3d5698SJohn Baldwin
929bc3d5698SJohn Baldwin
930bc3d5698SJohn Baldwin
931bc3d5698SJohn Baldwin
932bc3d5698SJohn Baldwin
933bc3d5698SJohn Baldwin
934bc3d5698SJohn Baldwin
935bc3d5698SJohn Baldwin
936bc3d5698SJohn Baldwin
937bc3d5698SJohn Baldwin
938bc3d5698SJohn Baldwin
939bc3d5698SJohn Baldwin
/* Fill in the frame header described at the top of the function.  %r9 is
 * turned from a byte length into the inner loop count (bytes / 32 - 1)
 * between the stores. */
940bc3d5698SJohn Baldwin	movq	%r9,0(%rsp)
941bc3d5698SJohn Baldwin	shrq	$5,%r9
942bc3d5698SJohn Baldwin	movq	%r10,16(%rsp)
943bc3d5698SJohn Baldwin	subq	$1,%r9
944bc3d5698SJohn Baldwin	movq	%r8,24(%rsp)
945bc3d5698SJohn Baldwin	movq	%rdi,32(%rsp)
946bc3d5698SJohn Baldwin	movq	%rax,40(%rsp)
/* DW_CFA_def_cfa_expression, 5 bytes: DW_OP_breg7 (%rsp) + 40, DW_OP_deref,
 * DW_OP_plus_uconst 8 - the CFA is the saved entry %rsp at 40(%rsp) plus 8. */
947bc3d5698SJohn Baldwin.cfi_escape	0x0f,0x05,0x77,0x28,0x06,0x23,0x08
948bc3d5698SJohn Baldwin	movq	%r9,48(%rsp)
949bc3d5698SJohn Baldwin	jmp	.Lmulx4x_body
950bc3d5698SJohn Baldwin
951bc3d5698SJohn Baldwin.align	32
/* First outer pass, first four words.  %rdx (the implicit MULX operand)
 * and its copy %r9 hold word 0 of the second multiplicand; %rdi becomes
 * the pointer to word 1 and is parked at 8(%rsp); %rbx points four words
 * into the temporary vector so that -32(%rbx) is its first word. */
952bc3d5698SJohn Baldwin.Lmulx4x_body:
953bc3d5698SJohn Baldwin	leaq	8(%rdx),%rdi
954bc3d5698SJohn Baldwin	movq	(%rdx),%rdx
955bc3d5698SJohn Baldwin	leaq	64+32(%rsp),%rbx
956bc3d5698SJohn Baldwin	movq	%rdx,%r9
957bc3d5698SJohn Baldwin
958bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r8,%rax
959bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r11,%r14
960bc3d5698SJohn Baldwin	addq	%rax,%r11
961bc3d5698SJohn Baldwin	movq	%rdi,8(%rsp)
962bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r12,%r13
963bc3d5698SJohn Baldwin	adcq	%r14,%r12
964bc3d5698SJohn Baldwin	adcq	$0,%r13
965bc3d5698SJohn Baldwin
/* %rdi keeps the lowest product word; %r8 becomes the reduction multiplier
 * (lowest word times the constant at 24(%rsp), low 64 bits).  The xorq
 * zeroes %rbp and clears both CF and OF before the ADCX/ADOX chains. */
966bc3d5698SJohn Baldwin	movq	%r8,%rdi
967bc3d5698SJohn Baldwin	imulq	24(%rsp),%r8
968bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
969bc3d5698SJohn Baldwin
970bc3d5698SJohn Baldwin	mulxq	24(%rsi),%rax,%r14
971bc3d5698SJohn Baldwin	movq	%r8,%rdx
972bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
973bc3d5698SJohn Baldwin	adcxq	%rax,%r13
974bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
975bc3d5698SJohn Baldwin
/* Add multiplier * (%rcx)[0..3].  The sum formed in %rdi is never stored
 * (%rdi is reloaded below); only its carry feeds the chain. */
976bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r10
977bc3d5698SJohn Baldwin	adcxq	%rax,%rdi
978bc3d5698SJohn Baldwin	adoxq	%r11,%r10
979bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r11
980bc3d5698SJohn Baldwin	adcxq	%rax,%r10
981bc3d5698SJohn Baldwin	adoxq	%r12,%r11
/* Hand-encoded mulxq 16(%rcx),%rax,%r12 with a 32-bit displacement
 * (VEX C4 62 FB, opcode F6, ModRM A1, disp32 0x10). */
982bc3d5698SJohn Baldwin.byte	0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00
983bc3d5698SJohn Baldwin	movq	48(%rsp),%rdi
984bc3d5698SJohn Baldwin	movq	%r10,-32(%rbx)
985bc3d5698SJohn Baldwin	adcxq	%rax,%r11
986bc3d5698SJohn Baldwin	adoxq	%r13,%r12
987bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
988bc3d5698SJohn Baldwin	movq	%r9,%rdx
989bc3d5698SJohn Baldwin	movq	%r11,-24(%rbx)
990bc3d5698SJohn Baldwin	adcxq	%rax,%r12
991bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
992bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
993bc3d5698SJohn Baldwin	movq	%r12,-16(%rbx)
994bc3d5698SJohn Baldwin
995bc3d5698SJohn Baldwin	jmp	.Lmulx4x_1st
996bc3d5698SJohn Baldwin
997bc3d5698SJohn Baldwin.align	32
/* First outer pass, remaining words, four per iteration; %rdi counts the
 * iterations down.  Each iteration multiplies four words at (%rsi) by the
 * multiplicand word in %r9, then four words at (%rcx) by the reduction
 * multiplier in %r8, and stores four result words one slot lower in the
 * temporary vector.  %rbp stays zero and is used to fold carries. */
998bc3d5698SJohn Baldwin.Lmulx4x_1st:
999bc3d5698SJohn Baldwin	adcxq	%rbp,%r15
1000bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r10,%rax
1001bc3d5698SJohn Baldwin	adcxq	%r14,%r10
1002bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r11,%r14
1003bc3d5698SJohn Baldwin	adcxq	%rax,%r11
1004bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r12,%rax
1005bc3d5698SJohn Baldwin	adcxq	%r14,%r12
1006bc3d5698SJohn Baldwin	mulxq	24(%rsi),%r13,%r14
/* Two 0x67 (address-size) prefix bytes attached to the following
 * register-to-register move; presumably padding for code alignment. */
1007bc3d5698SJohn Baldwin.byte	0x67,0x67
1008bc3d5698SJohn Baldwin	movq	%r8,%rdx
1009bc3d5698SJohn Baldwin	adcxq	%rax,%r13
1010bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
1011bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
1012bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
1013bc3d5698SJohn Baldwin
1014bc3d5698SJohn Baldwin	adoxq	%r15,%r10
1015bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r15
1016bc3d5698SJohn Baldwin	adcxq	%rax,%r10
1017bc3d5698SJohn Baldwin	adoxq	%r15,%r11
1018bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r15
1019bc3d5698SJohn Baldwin	adcxq	%rax,%r11
1020bc3d5698SJohn Baldwin	adoxq	%r15,%r12
1021bc3d5698SJohn Baldwin	mulxq	16(%rcx),%rax,%r15
1022bc3d5698SJohn Baldwin	movq	%r10,-40(%rbx)
1023bc3d5698SJohn Baldwin	adcxq	%rax,%r12
1024bc3d5698SJohn Baldwin	movq	%r11,-32(%rbx)
1025bc3d5698SJohn Baldwin	adoxq	%r15,%r13
1026bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
1027bc3d5698SJohn Baldwin	movq	%r9,%rdx
1028bc3d5698SJohn Baldwin	movq	%r12,-24(%rbx)
1029bc3d5698SJohn Baldwin	adcxq	%rax,%r13
1030bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
1031bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
1032bc3d5698SJohn Baldwin	movq	%r13,-16(%rbx)
1033bc3d5698SJohn Baldwin
/* decq does not modify CF, so the carry left by the last adcxq is still
 * pending when the loop repeats or falls through to the adcq below. */
1034bc3d5698SJohn Baldwin	decq	%rdi
1035bc3d5698SJohn Baldwin	jnz	.Lmulx4x_1st
1036bc3d5698SJohn Baldwin
/* Close the pass: %rax = length in bytes, %rdi = next multiplicand word
 * pointer.  The two top partial sums are combined into the last vector
 * word and %r15 becomes 0 or all-ones according to the carry out. */
1037bc3d5698SJohn Baldwin	movq	0(%rsp),%rax
1038bc3d5698SJohn Baldwin	movq	8(%rsp),%rdi
1039bc3d5698SJohn Baldwin	adcq	%rbp,%r15
1040bc3d5698SJohn Baldwin	addq	%r15,%r14
1041bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1042bc3d5698SJohn Baldwin	movq	%r14,-8(%rbx)
1043bc3d5698SJohn Baldwin	jmp	.Lmulx4x_outer
1044bc3d5698SJohn Baldwin
1045bc3d5698SJohn Baldwin.align	32
/* Start of every later outer pass.  Loads the next multiplicand word,
 * rewinds %rsi and %rcx by the byte length in %rax, writes the previous
 * pass's carry mask into the extra top word of the temporary vector and
 * resets %rbx.  Unlike the first pass, the existing vector words are
 * added into the products (the adoxq instructions with memory operands). */
1046bc3d5698SJohn Baldwin.Lmulx4x_outer:
1047bc3d5698SJohn Baldwin	movq	(%rdi),%rdx
1048bc3d5698SJohn Baldwin	leaq	8(%rdi),%rdi
1049bc3d5698SJohn Baldwin	subq	%rax,%rsi
1050bc3d5698SJohn Baldwin	movq	%r15,(%rbx)
1051bc3d5698SJohn Baldwin	leaq	64+32(%rsp),%rbx
1052bc3d5698SJohn Baldwin	subq	%rax,%rcx
1053bc3d5698SJohn Baldwin
/* xorl zeroes %rbp and clears CF and OF for the two carry chains. */
1054bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r8,%r11
1055bc3d5698SJohn Baldwin	xorl	%ebp,%ebp
1056bc3d5698SJohn Baldwin	movq	%rdx,%r9
1057bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r14,%r12
1058bc3d5698SJohn Baldwin	adoxq	-32(%rbx),%r8
1059bc3d5698SJohn Baldwin	adcxq	%r14,%r11
1060bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r15,%r13
1061bc3d5698SJohn Baldwin	adoxq	-24(%rbx),%r11
1062bc3d5698SJohn Baldwin	adcxq	%r15,%r12
1063bc3d5698SJohn Baldwin	adoxq	-16(%rbx),%r12
1064bc3d5698SJohn Baldwin	adcxq	%rbp,%r13
1065bc3d5698SJohn Baldwin	adoxq	%rbp,%r13
1066bc3d5698SJohn Baldwin
/* Park the advanced word pointer, keep the lowest sum word in %r15 and
 * derive this pass's reduction multiplier in %r8; clear the flags again. */
1067bc3d5698SJohn Baldwin	movq	%rdi,8(%rsp)
1068bc3d5698SJohn Baldwin	movq	%r8,%r15
1069bc3d5698SJohn Baldwin	imulq	24(%rsp),%r8
1070bc3d5698SJohn Baldwin	xorl	%ebp,%ebp
1071bc3d5698SJohn Baldwin
1072bc3d5698SJohn Baldwin	mulxq	24(%rsi),%rax,%r14
1073bc3d5698SJohn Baldwin	movq	%r8,%rdx
1074bc3d5698SJohn Baldwin	adcxq	%rax,%r13
1075bc3d5698SJohn Baldwin	adoxq	-8(%rbx),%r13
1076bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
1077bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
1078bc3d5698SJohn Baldwin	adoxq	%rbp,%r14
1079bc3d5698SJohn Baldwin
/* Reduction part for the first four words.  The sum formed in %r15 is
 * overwritten by a later mulxq without being stored; only its carry is
 * used. */
1080bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r10
1081bc3d5698SJohn Baldwin	adcxq	%rax,%r15
1082bc3d5698SJohn Baldwin	adoxq	%r11,%r10
1083bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r11
1084bc3d5698SJohn Baldwin	adcxq	%rax,%r10
1085bc3d5698SJohn Baldwin	adoxq	%r12,%r11
1086bc3d5698SJohn Baldwin	mulxq	16(%rcx),%rax,%r12
1087bc3d5698SJohn Baldwin	movq	%r10,-32(%rbx)
1088bc3d5698SJohn Baldwin	adcxq	%rax,%r11
1089bc3d5698SJohn Baldwin	adoxq	%r13,%r12
1090bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
1091bc3d5698SJohn Baldwin	movq	%r9,%rdx
1092bc3d5698SJohn Baldwin	movq	%r11,-24(%rbx)
1093bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
1094bc3d5698SJohn Baldwin	adcxq	%rax,%r12
1095bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
1096bc3d5698SJohn Baldwin	movq	48(%rsp),%rdi
1097bc3d5698SJohn Baldwin	movq	%r12,-16(%rbx)
1098bc3d5698SJohn Baldwin
1099bc3d5698SJohn Baldwin	jmp	.Lmulx4x_inner
1100bc3d5698SJohn Baldwin
1101bc3d5698SJohn Baldwin.align	32
/* Later passes, remaining words: same shape as .Lmulx4x_1st, except that
 * the four current vector words at 0..24(%rbx) are added to the products
 * before the reduction terms. */
1102bc3d5698SJohn Baldwin.Lmulx4x_inner:
1103bc3d5698SJohn Baldwin	mulxq	0(%rsi),%r10,%rax
1104bc3d5698SJohn Baldwin	adcxq	%rbp,%r15
1105bc3d5698SJohn Baldwin	adoxq	%r14,%r10
1106bc3d5698SJohn Baldwin	mulxq	8(%rsi),%r11,%r14
1107bc3d5698SJohn Baldwin	adcxq	0(%rbx),%r10
1108bc3d5698SJohn Baldwin	adoxq	%rax,%r11
1109bc3d5698SJohn Baldwin	mulxq	16(%rsi),%r12,%rax
1110bc3d5698SJohn Baldwin	adcxq	8(%rbx),%r11
1111bc3d5698SJohn Baldwin	adoxq	%r14,%r12
1112bc3d5698SJohn Baldwin	mulxq	24(%rsi),%r13,%r14
1113bc3d5698SJohn Baldwin	movq	%r8,%rdx
1114bc3d5698SJohn Baldwin	adcxq	16(%rbx),%r12
1115bc3d5698SJohn Baldwin	adoxq	%rax,%r13
1116bc3d5698SJohn Baldwin	adcxq	24(%rbx),%r13
1117bc3d5698SJohn Baldwin	adoxq	%rbp,%r14
1118bc3d5698SJohn Baldwin	leaq	32(%rsi),%rsi
1119bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
1120bc3d5698SJohn Baldwin	adcxq	%rbp,%r14
1121bc3d5698SJohn Baldwin
1122bc3d5698SJohn Baldwin	adoxq	%r15,%r10
1123bc3d5698SJohn Baldwin	mulxq	0(%rcx),%rax,%r15
1124bc3d5698SJohn Baldwin	adcxq	%rax,%r10
1125bc3d5698SJohn Baldwin	adoxq	%r15,%r11
1126bc3d5698SJohn Baldwin	mulxq	8(%rcx),%rax,%r15
1127bc3d5698SJohn Baldwin	adcxq	%rax,%r11
1128bc3d5698SJohn Baldwin	adoxq	%r15,%r12
1129bc3d5698SJohn Baldwin	mulxq	16(%rcx),%rax,%r15
1130bc3d5698SJohn Baldwin	movq	%r10,-40(%rbx)
1131bc3d5698SJohn Baldwin	adcxq	%rax,%r12
1132bc3d5698SJohn Baldwin	adoxq	%r15,%r13
1133bc3d5698SJohn Baldwin	mulxq	24(%rcx),%rax,%r15
1134bc3d5698SJohn Baldwin	movq	%r9,%rdx
1135bc3d5698SJohn Baldwin	movq	%r11,-32(%rbx)
1136bc3d5698SJohn Baldwin	movq	%r12,-24(%rbx)
1137bc3d5698SJohn Baldwin	adcxq	%rax,%r13
1138bc3d5698SJohn Baldwin	adoxq	%rbp,%r15
1139bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
1140bc3d5698SJohn Baldwin	movq	%r13,-16(%rbx)
1141bc3d5698SJohn Baldwin
1142bc3d5698SJohn Baldwin	decq	%rdi
1143bc3d5698SJohn Baldwin	jnz	.Lmulx4x_inner
1144bc3d5698SJohn Baldwin
/* Close the pass.  The subq computes 0 - (top word), so CF is set exactly
 * when the carry mask stored at the top of the vector by the previous
 * pass is non-zero; that carry is added in with the two top partial sums,
 * and %r15 becomes the new 0 / all-ones carry mask. */
1145bc3d5698SJohn Baldwin	movq	0(%rsp),%rax
1146bc3d5698SJohn Baldwin	movq	8(%rsp),%rdi
1147bc3d5698SJohn Baldwin	adcq	%rbp,%r15
1148bc3d5698SJohn Baldwin	subq	0(%rbx),%rbp
1149bc3d5698SJohn Baldwin	adcq	%r15,%r14
1150bc3d5698SJohn Baldwin	sbbq	%r15,%r15
1151bc3d5698SJohn Baldwin	movq	%r14,-8(%rbx)
1152bc3d5698SJohn Baldwin
/* Repeat until every word of the second multiplicand has been consumed. */
1153bc3d5698SJohn Baldwin	cmpq	16(%rsp),%rdi
1154bc3d5698SJohn Baldwin	jne	.Lmulx4x_outer
1155bc3d5698SJohn Baldwin
/* Set up the final subtraction: %rbx = start of the temporary vector,
 * %rcx rewound to its first word, %r15 = top carry as 0 or 1, %rdx = byte
 * length, %rax = number of four-word groups, %rdi = destination.  The
 * shrq leaves CF equal to the last bit shifted out, which is 0 when the
 * byte length is a multiple of 32, so the borrow chain starts clear. */
1156bc3d5698SJohn Baldwin	leaq	64(%rsp),%rbx
1157bc3d5698SJohn Baldwin	subq	%rax,%rcx
1158bc3d5698SJohn Baldwin	negq	%r15
1159bc3d5698SJohn Baldwin	movq	%rax,%rdx
1160bc3d5698SJohn Baldwin	shrq	$3+2,%rax
1161bc3d5698SJohn Baldwin	movq	32(%rsp),%rdi
1162bc3d5698SJohn Baldwin	jmp	.Lmulx4x_sub
1163bc3d5698SJohn Baldwin
1164bc3d5698SJohn Baldwin.align	32
/* destination = temporary vector - vector at (%rcx), four words per
 * iteration.  The borrow travels in CF; leaq, movq and decq leave CF
 * untouched, so it survives from one iteration to the next. */
1165bc3d5698SJohn Baldwin.Lmulx4x_sub:
1166bc3d5698SJohn Baldwin	movq	0(%rbx),%r11
1167bc3d5698SJohn Baldwin	movq	8(%rbx),%r12
1168bc3d5698SJohn Baldwin	movq	16(%rbx),%r13
1169bc3d5698SJohn Baldwin	movq	24(%rbx),%r14
1170bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
1171bc3d5698SJohn Baldwin	sbbq	0(%rcx),%r11
1172bc3d5698SJohn Baldwin	sbbq	8(%rcx),%r12
1173bc3d5698SJohn Baldwin	sbbq	16(%rcx),%r13
1174bc3d5698SJohn Baldwin	sbbq	24(%rcx),%r14
1175bc3d5698SJohn Baldwin	leaq	32(%rcx),%rcx
1176bc3d5698SJohn Baldwin	movq	%r11,0(%rdi)
1177bc3d5698SJohn Baldwin	movq	%r12,8(%rdi)
1178bc3d5698SJohn Baldwin	movq	%r13,16(%rdi)
1179bc3d5698SJohn Baldwin	movq	%r14,24(%rdi)
1180bc3d5698SJohn Baldwin	leaq	32(%rdi),%rdi
1181bc3d5698SJohn Baldwin	decq	%rax
1182bc3d5698SJohn Baldwin	jnz	.Lmulx4x_sub
1183bc3d5698SJohn Baldwin
/* %r15 = top carry - final borrow.  It is all-ones when the subtraction
 * borrowed without a top carry to absorb it (keep the unsubtracted
 * vector) and zero when the difference already in the destination is the
 * result.  Rewind %rbx and %rdi to the start of their vectors. */
1184bc3d5698SJohn Baldwin	sbbq	$0,%r15
1185bc3d5698SJohn Baldwin	leaq	64(%rsp),%rbx
1186bc3d5698SJohn Baldwin	subq	%rdx,%rdi
1187bc3d5698SJohn Baldwin
/* Hand-encoded movq %r15,%xmm1 (66 49 0F 6E CF); pshufd then replicates
 * the low 32 bits into all four lanes to form the select mask.  %rsi is
 * reloaded with the entry %rsp for the epilogue. */
1188bc3d5698SJohn Baldwin.byte	102,73,15,110,207
1189bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm0
1190bc3d5698SJohn Baldwin	pshufd	$0,%xmm1,%xmm1
1191bc3d5698SJohn Baldwin	movq	40(%rsp),%rsi
1192bc3d5698SJohn Baldwin.cfi_def_cfa	%rsi,8
1193bc3d5698SJohn Baldwin	jmp	.Lmulx4x_cond_copy
1194bc3d5698SJohn Baldwin
1195bc3d5698SJohn Baldwin.align	32
/* Branch-free select, 32 bytes per iteration:
 *   destination = (temporary & mask) | (destination & ~mask)
 * %xmm1 is the mask; %xmm0 is zero on entry to each iteration, is turned
 * into the inverted mask by pcmpeqd, and is zeroed again afterwards.  The
 * temporary words are overwritten with zero as soon as they are loaded. */
1196bc3d5698SJohn Baldwin.Lmulx4x_cond_copy:
1197bc3d5698SJohn Baldwin	movdqa	0(%rbx),%xmm2
1198bc3d5698SJohn Baldwin	movdqa	16(%rbx),%xmm3
1199bc3d5698SJohn Baldwin	leaq	32(%rbx),%rbx
1200bc3d5698SJohn Baldwin	movdqu	0(%rdi),%xmm4
1201bc3d5698SJohn Baldwin	movdqu	16(%rdi),%xmm5
1202bc3d5698SJohn Baldwin	leaq	32(%rdi),%rdi
1203bc3d5698SJohn Baldwin	movdqa	%xmm0,-32(%rbx)
1204bc3d5698SJohn Baldwin	movdqa	%xmm0,-16(%rbx)
1205bc3d5698SJohn Baldwin	pcmpeqd	%xmm1,%xmm0
1206bc3d5698SJohn Baldwin	pand	%xmm1,%xmm2
1207bc3d5698SJohn Baldwin	pand	%xmm1,%xmm3
1208bc3d5698SJohn Baldwin	pand	%xmm0,%xmm4
1209bc3d5698SJohn Baldwin	pand	%xmm0,%xmm5
1210bc3d5698SJohn Baldwin	pxor	%xmm0,%xmm0
1211bc3d5698SJohn Baldwin	por	%xmm2,%xmm4
1212bc3d5698SJohn Baldwin	por	%xmm3,%xmm5
1213bc3d5698SJohn Baldwin	movdqu	%xmm4,-32(%rdi)
1214bc3d5698SJohn Baldwin	movdqu	%xmm5,-16(%rdi)
1215bc3d5698SJohn Baldwin	subq	$32,%rdx
1216bc3d5698SJohn Baldwin	jnz	.Lmulx4x_cond_copy
1217bc3d5698SJohn Baldwin
/* %rdx is zero here, so this clears the extra top word of the temporary
 * vector as well. */
1218bc3d5698SJohn Baldwin	movq	%rdx,(%rbx)
1219bc3d5698SJohn Baldwin
/* Return 1.  Reload the callee-saved registers from just below the entry
 * stack pointer held in %rsi, then restore %rsp from it. */
1220bc3d5698SJohn Baldwin	movq	$1,%rax
1221bc3d5698SJohn Baldwin	movq	-48(%rsi),%r15
1222bc3d5698SJohn Baldwin.cfi_restore	%r15
1223bc3d5698SJohn Baldwin	movq	-40(%rsi),%r14
1224bc3d5698SJohn Baldwin.cfi_restore	%r14
1225bc3d5698SJohn Baldwin	movq	-32(%rsi),%r13
1226bc3d5698SJohn Baldwin.cfi_restore	%r13
1227bc3d5698SJohn Baldwin	movq	-24(%rsi),%r12
1228bc3d5698SJohn Baldwin.cfi_restore	%r12
1229bc3d5698SJohn Baldwin	movq	-16(%rsi),%rbp
1230bc3d5698SJohn Baldwin.cfi_restore	%rbp
1231bc3d5698SJohn Baldwin	movq	-8(%rsi),%rbx
1232bc3d5698SJohn Baldwin.cfi_restore	%rbx
1233bc3d5698SJohn Baldwin	leaq	(%rsi),%rsp
1234bc3d5698SJohn Baldwin.cfi_def_cfa_register	%rsp
1235bc3d5698SJohn Baldwin.Lmulx4x_epilogue:
/* F3 C3 is a ret carrying a rep prefix. */
1236bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1237bc3d5698SJohn Baldwin.cfi_endproc
1238bc3d5698SJohn Baldwin.size	bn_mulx4x_mont,.-bn_mulx4x_mont
/* NUL-terminated ASCII identification string placed after the code:
 * "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro@openssl.org>".
 * It is emitted as data only; nothing in this part of the file refers to it. */
1239bc3d5698SJohn Baldwin.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1240bc3d5698SJohn Baldwin.align	16
	# One ELF note record in an allocated ("a") section, 8-byte aligned.
	# Header words, in order: name size (1f - 0f, the four name bytes),
	# descriptor size (4f - 1f), and note type 5, which is
	# NT_GNU_PROPERTY_TYPE_0 in the Linux gABI extensions.
1241*c0855eaaSJohn Baldwin	.section ".note.gnu.property", "a"
1242*c0855eaaSJohn Baldwin	.p2align 3
1243*c0855eaaSJohn Baldwin	.long 1f - 0f
1244*c0855eaaSJohn Baldwin	.long 4f - 1f
1245*c0855eaaSJohn Baldwin	.long 5
1246*c0855eaaSJohn Baldwin0:
1247*c0855eaaSJohn Baldwin	# "GNU" encoded with .byte, since .asciz isn't supported
1248*c0855eaaSJohn Baldwin	# on Solaris.
1249*c0855eaaSJohn Baldwin	.byte 0x47
1250*c0855eaaSJohn Baldwin	.byte 0x4e
1251*c0855eaaSJohn Baldwin	.byte 0x55
1252*c0855eaaSJohn Baldwin	.byte 0
1253*c0855eaaSJohn Baldwin1:
	# Descriptor: a single property entry made of a type word, a data
	# size word (3f - 2f, four bytes) and the data word itself.
	# Type 0xc0000002 is GNU_PROPERTY_X86_FEATURE_1_AND in the x86-64
	# psABI; the value 3 sets its two lowest bits, which that document
	# assigns to IBT (bit 0) and SHSTK (bit 1).
1254*c0855eaaSJohn Baldwin	.p2align 3
1255*c0855eaaSJohn Baldwin	.long 0xc0000002
1256*c0855eaaSJohn Baldwin	.long 3f - 2f
1257*c0855eaaSJohn Baldwin2:
1258*c0855eaaSJohn Baldwin	.long 3
1259*c0855eaaSJohn Baldwin3:
	# Pad the entry to an 8-byte boundary; label 4 marks the end of the
	# descriptor measured by the header above.
1260*c0855eaaSJohn Baldwin	.p2align 3
1261*c0855eaaSJohn Baldwin4:
1262