xref: /freebsd/sys/crypto/openssl/amd64/poly1305-x86_64.S (revision c0855eaa3ee9614804b6bd6a255aa9f71e095f43)
1bc3d5698SJohn Baldwin/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
2bc3d5698SJohn Baldwin.text
3bc3d5698SJohn Baldwin
4bc3d5698SJohn Baldwin
5bc3d5698SJohn Baldwin
/* Make the three entry points defined below global symbols, each with
   hidden visibility (.globl followed by .hidden for the same name). */
6bc3d5698SJohn Baldwin.globl	poly1305_init
7bc3d5698SJohn Baldwin.hidden	poly1305_init
8bc3d5698SJohn Baldwin.globl	poly1305_blocks
9bc3d5698SJohn Baldwin.hidden	poly1305_blocks
10bc3d5698SJohn Baldwin.globl	poly1305_emit
11bc3d5698SJohn Baldwin.hidden	poly1305_emit
12bc3d5698SJohn Baldwin
/*
 * poly1305_init
 *
 * In:   %rdi = state block
 *       %rsi = key pointer (may be 0)
 *       %rdx = two-slot table that receives routine addresses
 * Out:  %eax = 1 when a key was installed, 0 when %rsi was 0
 *
 * Always clears the three qwords at 0/8/16(%rdi).  With a non-zero
 * %rsi it additionally stores the two masked key qwords at 24/32(%rdi)
 * and writes the addresses of the selected "blocks" and "emit" routines
 * to 0(%rdx) and 8(%rdx).  Only %rax, %rcx, %r9, %r10, %r11 and the
 * flags are written; the stack is not used.
 *
 * NOTE(review): the parameter roles above are read off the register
 * usage, assuming the SysV AMD64 argument order -- confirm with the
 * C caller.
 */
13bc3d5698SJohn Baldwin.type	poly1305_init,@function
14bc3d5698SJohn Baldwin.align	32
15bc3d5698SJohn Baldwinpoly1305_init:
16bc3d5698SJohn Baldwin.cfi_startproc
17bc3d5698SJohn Baldwin	xorq	%rax,%rax	/* 0: stored below, and the no-key return value */
18bc3d5698SJohn Baldwin	movq	%rax,0(%rdi)
19bc3d5698SJohn Baldwin	movq	%rax,8(%rdi)
20bc3d5698SJohn Baldwin	movq	%rax,16(%rdi)
21bc3d5698SJohn Baldwin
/* No key supplied: return with %rax still 0. */
22bc3d5698SJohn Baldwin	cmpq	$0,%rsi
23bc3d5698SJohn Baldwin	je	.Lno_key
24bc3d5698SJohn Baldwin
/* Start with the scalar routines, then upgrade according to two bits of
   the 8 bytes read from OPENSSL_ia32cap_P+4.  Bit 28 selects both _avx
   routines; bit 37 replaces only the blocks routine with the _avx2 one.
   NOTE(review): presumably these are the AVX and AVX2 CPU feature bits,
   judging by the target names -- confirm against the capability vector
   layout. */
25bc3d5698SJohn Baldwin	leaq	poly1305_blocks(%rip),%r10
26bc3d5698SJohn Baldwin	leaq	poly1305_emit(%rip),%r11
27bc3d5698SJohn Baldwin	movq	OPENSSL_ia32cap_P+4(%rip),%r9
28bc3d5698SJohn Baldwin	leaq	poly1305_blocks_avx(%rip),%rax
29bc3d5698SJohn Baldwin	leaq	poly1305_emit_avx(%rip),%rcx
30bc3d5698SJohn Baldwin	btq	$28,%r9	/* CF = bit 28 */
31bc3d5698SJohn Baldwin	cmovcq	%rax,%r10
32bc3d5698SJohn Baldwin	cmovcq	%rcx,%r11
33bc3d5698SJohn Baldwin	leaq	poly1305_blocks_avx2(%rip),%rax
34bc3d5698SJohn Baldwin	btq	$37,%r9	/* CF = bit 37 */
35bc3d5698SJohn Baldwin	cmovcq	%rax,%r10
/* Mask the two key qwords before saving them.  The second mask clears
   the low two bits, which the block routines rely on when they form
   r1 + (r1 >> 2). */
36bc3d5698SJohn Baldwin	movq	$0x0ffffffc0fffffff,%rax
37bc3d5698SJohn Baldwin	movq	$0x0ffffffc0ffffffc,%rcx
38bc3d5698SJohn Baldwin	andq	0(%rsi),%rax
39bc3d5698SJohn Baldwin	andq	8(%rsi),%rcx
40bc3d5698SJohn Baldwin	movq	%rax,24(%rdi)
41bc3d5698SJohn Baldwin	movq	%rcx,32(%rdi)
/* Hand the chosen routine addresses back through the table at %rdx. */
42bc3d5698SJohn Baldwin	movq	%r10,0(%rdx)
43bc3d5698SJohn Baldwin	movq	%r11,8(%rdx)
44bc3d5698SJohn Baldwin	movl	$1,%eax
45bc3d5698SJohn Baldwin.Lno_key:
46bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* the bytes of "rep ret" */
47bc3d5698SJohn Baldwin.cfi_endproc
48bc3d5698SJohn Baldwin.size	poly1305_init,.-poly1305_init
49bc3d5698SJohn Baldwin
/*
 * poly1305_blocks
 *
 * In:   %rdi = state block
 *       %rsi = input
 *       %rdx = input length in bytes
 *       %rcx = value added (with carry) into the top accumulator limb
 *              for every block -- presumably the Poly1305 pad bit;
 *              TODO confirm with the caller.
 *
 * Processes %rdx/16 whole 16-byte blocks and returns at once when that
 * count is zero.  The accumulator h lives as three 64-bit limbs at
 * 0/8/16(%rdi); the masked key r as two limbs at 24/32(%rdi).  Per
 * block: h += block (plus %rcx in the top limb), h *= r, and everything
 * at or above bit 130 is folded back into the low limb times 5
 * (2^130 is congruent to 5 modulo 2^130 - 5).
 *
 * Register roles from .Lblocks_body on:
 *     %r14,%rbx,%rbp = h0,h1,h2      %r11 = r0      %r12 = r1
 *     %r13 = s1 = r1 + (r1 >> 2)     %r15 = blocks left
 *     %rax,%rdx,%r8,%r9,%r10 = scratch
 * s1 equals 5*r1/4 because the mask applied in poly1305_init leaves the
 * low two bits of r1 clear; it lets products that land at 2^128 be
 * folded straight into the low limb.
 *
 * %rbx, %rbp, %r12-%r15 are pushed on entry and reloaded before
 * returning.  .Lblocks is also a jump target from poly1305_blocks_avx.
 */
50bc3d5698SJohn Baldwin.type	poly1305_blocks,@function
51bc3d5698SJohn Baldwin.align	32
52bc3d5698SJohn Baldwinpoly1305_blocks:
53bc3d5698SJohn Baldwin.cfi_startproc
54bc3d5698SJohn Baldwin.Lblocks:
55bc3d5698SJohn Baldwin	shrq	$4,%rdx	/* bytes -> 16-byte blocks; ZF set when none */
56bc3d5698SJohn Baldwin	jz	.Lno_data
57bc3d5698SJohn Baldwin
58bc3d5698SJohn Baldwin	pushq	%rbx
59bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
60bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
61bc3d5698SJohn Baldwin	pushq	%rbp
62bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
63bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
64bc3d5698SJohn Baldwin	pushq	%r12
65bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
66bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
67bc3d5698SJohn Baldwin	pushq	%r13
68bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
69bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
70bc3d5698SJohn Baldwin	pushq	%r14
71bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
72bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
73bc3d5698SJohn Baldwin	pushq	%r15
74bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
75bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
76bc3d5698SJohn Baldwin.Lblocks_body:
77bc3d5698SJohn Baldwin
78bc3d5698SJohn Baldwin	movq	%rdx,%r15	/* loop counter */
79bc3d5698SJohn Baldwin
80bc3d5698SJohn Baldwin	movq	24(%rdi),%r11	/* r0 */
81bc3d5698SJohn Baldwin	movq	32(%rdi),%r13	/* r1, turned into s1 below */
82bc3d5698SJohn Baldwin
83bc3d5698SJohn Baldwin	movq	0(%rdi),%r14	/* h0 */
84bc3d5698SJohn Baldwin	movq	8(%rdi),%rbx	/* h1 */
85bc3d5698SJohn Baldwin	movq	16(%rdi),%rbp	/* h2 */
86bc3d5698SJohn Baldwin
87bc3d5698SJohn Baldwin	movq	%r13,%r12	/* keep r1 */
88bc3d5698SJohn Baldwin	shrq	$2,%r13
89bc3d5698SJohn Baldwin	movq	%r12,%rax	/* first multiplier of the loop body is r1 */
90bc3d5698SJohn Baldwin	addq	%r12,%r13	/* s1 = r1 + (r1 >> 2) */
91bc3d5698SJohn Baldwin	jmp	.Loop
92bc3d5698SJohn Baldwin
93bc3d5698SJohn Baldwin.align	32
94bc3d5698SJohn Baldwin.Loop:
/* h += next 16 input bytes; lea advances %rsi without touching the
   carry that the final adc still needs. */
95bc3d5698SJohn Baldwin	addq	0(%rsi),%r14
96bc3d5698SJohn Baldwin	adcq	8(%rsi),%rbx
97bc3d5698SJohn Baldwin	leaq	16(%rsi),%rsi
98bc3d5698SJohn Baldwin	adcq	%rcx,%rbp
/* h0*r1 -> %r10:%r9 (start of the middle product column) */
99bc3d5698SJohn Baldwin	mulq	%r14
100bc3d5698SJohn Baldwin	movq	%rax,%r9
101bc3d5698SJohn Baldwin	movq	%r11,%rax
102bc3d5698SJohn Baldwin	movq	%rdx,%r10
103bc3d5698SJohn Baldwin
/* h0*r0 -> %r8:%r14 (start of the low product column) */
104bc3d5698SJohn Baldwin	mulq	%r14
105bc3d5698SJohn Baldwin	movq	%rax,%r14
106bc3d5698SJohn Baldwin	movq	%r11,%rax
107bc3d5698SJohn Baldwin	movq	%rdx,%r8
108bc3d5698SJohn Baldwin
/* h1*r0 added into %r10:%r9 */
109bc3d5698SJohn Baldwin	mulq	%rbx
110bc3d5698SJohn Baldwin	addq	%rax,%r9
111bc3d5698SJohn Baldwin	movq	%r13,%rax
112bc3d5698SJohn Baldwin	adcq	%rdx,%r10
113bc3d5698SJohn Baldwin
/* h1*s1 added into %r8:%r14; %rbx is then free and takes a copy of h2 */
114bc3d5698SJohn Baldwin	mulq	%rbx
115bc3d5698SJohn Baldwin	movq	%rbp,%rbx
116bc3d5698SJohn Baldwin	addq	%rax,%r14
117bc3d5698SJohn Baldwin	adcq	%rdx,%r8
118bc3d5698SJohn Baldwin
/* h2*s1 (low 64 bits) added into %r10:%r9 */
119bc3d5698SJohn Baldwin	imulq	%r13,%rbx
120bc3d5698SJohn Baldwin	addq	%rbx,%r9
121bc3d5698SJohn Baldwin	movq	%r8,%rbx
122bc3d5698SJohn Baldwin	adcq	$0,%r10
123bc3d5698SJohn Baldwin
/* middle limb = high half of low column + %r9; top limb = %r10 + h2*r0
   + carry */
124bc3d5698SJohn Baldwin	imulq	%r11,%rbp
125bc3d5698SJohn Baldwin	addq	%r9,%rbx
126bc3d5698SJohn Baldwin	movq	$-4,%rax
127bc3d5698SJohn Baldwin	adcq	%rbp,%r10
128bc3d5698SJohn Baldwin
/* Fold the top limb: keep its low two bits as the new h2 and add
   (top & -4) + (top >> 2), i.e. 5 * (top >> 2), into the low limb. */
129bc3d5698SJohn Baldwin	andq	%r10,%rax
130bc3d5698SJohn Baldwin	movq	%r10,%rbp
131bc3d5698SJohn Baldwin	shrq	$2,%r10
132bc3d5698SJohn Baldwin	andq	$3,%rbp
133bc3d5698SJohn Baldwin	addq	%r10,%rax
134bc3d5698SJohn Baldwin	addq	%rax,%r14
135bc3d5698SJohn Baldwin	adcq	$0,%rbx
136bc3d5698SJohn Baldwin	adcq	$0,%rbp
137bc3d5698SJohn Baldwin	movq	%r12,%rax	/* reload r1 for the next iteration */
138bc3d5698SJohn Baldwin	decq	%r15
139bc3d5698SJohn Baldwin	jnz	.Loop
140bc3d5698SJohn Baldwin
/* Write the accumulator back. */
141bc3d5698SJohn Baldwin	movq	%r14,0(%rdi)
142bc3d5698SJohn Baldwin	movq	%rbx,8(%rdi)
143bc3d5698SJohn Baldwin	movq	%rbp,16(%rdi)
144bc3d5698SJohn Baldwin
/* Reload the saved registers with plain loads, then release the 48
   bytes of pushes in one step. */
145bc3d5698SJohn Baldwin	movq	0(%rsp),%r15
146bc3d5698SJohn Baldwin.cfi_restore	%r15
147bc3d5698SJohn Baldwin	movq	8(%rsp),%r14
148bc3d5698SJohn Baldwin.cfi_restore	%r14
149bc3d5698SJohn Baldwin	movq	16(%rsp),%r13
150bc3d5698SJohn Baldwin.cfi_restore	%r13
151bc3d5698SJohn Baldwin	movq	24(%rsp),%r12
152bc3d5698SJohn Baldwin.cfi_restore	%r12
153bc3d5698SJohn Baldwin	movq	32(%rsp),%rbp
154bc3d5698SJohn Baldwin.cfi_restore	%rbp
155bc3d5698SJohn Baldwin	movq	40(%rsp),%rbx
156bc3d5698SJohn Baldwin.cfi_restore	%rbx
157bc3d5698SJohn Baldwin	leaq	48(%rsp),%rsp
158bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	-48
159bc3d5698SJohn Baldwin.Lno_data:
160bc3d5698SJohn Baldwin.Lblocks_epilogue:
161bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* the bytes of "rep ret" */
162bc3d5698SJohn Baldwin.cfi_endproc
163bc3d5698SJohn Baldwin.size	poly1305_blocks,.-poly1305_blocks
164bc3d5698SJohn Baldwin
/*
 * poly1305_emit
 *
 * In:   %rdi = state block (h as 64-bit limbs at 0/8/16)
 *       %rsi = 16-byte output buffer
 *       %rdx = pointer to a 128-bit value that is added to the result
 *              (presumably the nonce half of the key -- confirm with
 *              the caller)
 *
 * Computes h + 5.  If that sum reaches bit 130 (top limb >= 4, which is
 * the case exactly when h >= 2^130 - 5), the low 128 bits of h + 5 are
 * used; otherwise the low 128 bits of h.  The choice is made with cmov,
 * without a branch.  The 128-bit value at (%rdx) is then added modulo
 * 2^128 and the 16-byte result is stored at (%rsi).
 *
 * Writes %rax, %rcx, %r8, %r9, %r10 and the flags; the stack is not used.
 */
165bc3d5698SJohn Baldwin.type	poly1305_emit,@function
166bc3d5698SJohn Baldwin.align	32
167bc3d5698SJohn Baldwinpoly1305_emit:
168bc3d5698SJohn Baldwin.cfi_startproc
169bc3d5698SJohn Baldwin.Lemit:
170bc3d5698SJohn Baldwin	movq	0(%rdi),%r8
171bc3d5698SJohn Baldwin	movq	8(%rdi),%r9
172bc3d5698SJohn Baldwin	movq	16(%rdi),%r10
173bc3d5698SJohn Baldwin
/* %rcx:%rax keeps the low 128 bits of h while %r10:%r9:%r8 becomes
   h + 5. */
174bc3d5698SJohn Baldwin	movq	%r8,%rax
175bc3d5698SJohn Baldwin	addq	$5,%r8
176bc3d5698SJohn Baldwin	movq	%r9,%rcx
177bc3d5698SJohn Baldwin	adcq	$0,%r9
178bc3d5698SJohn Baldwin	adcq	$0,%r10
179bc3d5698SJohn Baldwin	shrq	$2,%r10	/* non-zero iff h + 5 reached bit 130 */
180bc3d5698SJohn Baldwin	cmovnzq	%r8,%rax
181bc3d5698SJohn Baldwin	cmovnzq	%r9,%rcx
182bc3d5698SJohn Baldwin
/* Add the 128-bit value at (%rdx); the carry out of the high qword is
   dropped. */
183bc3d5698SJohn Baldwin	addq	0(%rdx),%rax
184bc3d5698SJohn Baldwin	adcq	8(%rdx),%rcx
185bc3d5698SJohn Baldwin	movq	%rax,0(%rsi)
186bc3d5698SJohn Baldwin	movq	%rcx,8(%rsi)
187bc3d5698SJohn Baldwin
188bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* the bytes of "rep ret" */
189bc3d5698SJohn Baldwin.cfi_endproc
190bc3d5698SJohn Baldwin.size	poly1305_emit,.-poly1305_emit
/*
 * __poly1305_block -- internal helper: h = h * r with the partial fold
 * of everything at or above bit 130.  No .globl directive is given for
 * it in the part of the file shown here.
 *
 * In (as set up by the callers in this file):
 *       %r14,%rbx,%rbp = h0,h1,h2
 *       %rax = r1      %r11 = r0      %r13 = s1 = r1 + (r1 >> 2)
 * Out:  %r14,%rbx,%rbp = new h0,h1,h2 (h2 is at most 4 on return)
 *
 * Writes %rax, %rdx, %r8, %r9, %r10 and the flags.  %r11, %r12, %r13,
 * %r15, %rsi and %rdi are not written, so callers keep the key and
 * their pointers live across the call.  The instruction sequence is the
 * same as the multiply/fold part of the loop in poly1305_blocks.
 */
191bc3d5698SJohn Baldwin.type	__poly1305_block,@function
192bc3d5698SJohn Baldwin.align	32
193bc3d5698SJohn Baldwin__poly1305_block:
194bc3d5698SJohn Baldwin.cfi_startproc
/* h0*r1 -> %r10:%r9 */
195bc3d5698SJohn Baldwin	mulq	%r14
196bc3d5698SJohn Baldwin	movq	%rax,%r9
197bc3d5698SJohn Baldwin	movq	%r11,%rax
198bc3d5698SJohn Baldwin	movq	%rdx,%r10
199bc3d5698SJohn Baldwin
/* h0*r0 -> %r8:%r14 */
200bc3d5698SJohn Baldwin	mulq	%r14
201bc3d5698SJohn Baldwin	movq	%rax,%r14
202bc3d5698SJohn Baldwin	movq	%r11,%rax
203bc3d5698SJohn Baldwin	movq	%rdx,%r8
204bc3d5698SJohn Baldwin
/* h1*r0 added into %r10:%r9 */
205bc3d5698SJohn Baldwin	mulq	%rbx
206bc3d5698SJohn Baldwin	addq	%rax,%r9
207bc3d5698SJohn Baldwin	movq	%r13,%rax
208bc3d5698SJohn Baldwin	adcq	%rdx,%r10
209bc3d5698SJohn Baldwin
/* h1*s1 added into %r8:%r14; %rbx then takes a copy of h2 */
210bc3d5698SJohn Baldwin	mulq	%rbx
211bc3d5698SJohn Baldwin	movq	%rbp,%rbx
212bc3d5698SJohn Baldwin	addq	%rax,%r14
213bc3d5698SJohn Baldwin	adcq	%rdx,%r8
214bc3d5698SJohn Baldwin
/* h2*s1 (low 64 bits) added into %r10:%r9 */
215bc3d5698SJohn Baldwin	imulq	%r13,%rbx
216bc3d5698SJohn Baldwin	addq	%rbx,%r9
217bc3d5698SJohn Baldwin	movq	%r8,%rbx
218bc3d5698SJohn Baldwin	adcq	$0,%r10
219bc3d5698SJohn Baldwin
/* middle limb = %r8 + %r9; top limb = %r10 + h2*r0 + carry */
220bc3d5698SJohn Baldwin	imulq	%r11,%rbp
221bc3d5698SJohn Baldwin	addq	%r9,%rbx
222bc3d5698SJohn Baldwin	movq	$-4,%rax
223bc3d5698SJohn Baldwin	adcq	%rbp,%r10
224bc3d5698SJohn Baldwin
/* Fold: new h2 = top & 3; add (top & -4) + (top >> 2) = 5 * (top >> 2)
   into the low limb and ripple the carry upward. */
225bc3d5698SJohn Baldwin	andq	%r10,%rax
226bc3d5698SJohn Baldwin	movq	%r10,%rbp
227bc3d5698SJohn Baldwin	shrq	$2,%r10
228bc3d5698SJohn Baldwin	andq	$3,%rbp
229bc3d5698SJohn Baldwin	addq	%r10,%rax
230bc3d5698SJohn Baldwin	addq	%rax,%r14
231bc3d5698SJohn Baldwin	adcq	$0,%rbx
232bc3d5698SJohn Baldwin	adcq	$0,%rbp
233bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* the bytes of "rep ret" */
234bc3d5698SJohn Baldwin.cfi_endproc
235bc3d5698SJohn Baldwin.size	__poly1305_block,.-__poly1305_block
236bc3d5698SJohn Baldwin
/*
 * __poly1305_init_avx -- internal helper: fill the table of key powers
 * that the vector code reads.
 *
 * In:   %rdi = state block, %r11 = r0, %r12 = r1,
 *       %r13 = r1 + (r1 >> 2)  (consumed by __poly1305_block)
 * Out:  the dwords at 48..191(%rdi) are written; %rdi is back at its
 *       entry value on return.
 * Writes %rax, %rdx, %r8, %r9, %r10, %r14, %rbx, %rbp and the flags.
 * The caller visible in this chunk (.Lbase2_64_avx) has pushed the
 * callee-saved ones among these before calling.
 *
 * Starting from h = r, __poly1305_block is called three times, giving
 * r^2, r^3 and r^4 in %r14/%rbx/%rbp.  Each value is cut into five
 * limbs: limb0 = bits 0..25, limb1 = 26..51, limb2 = 52..77,
 * limb3 = 78..103, limb4 = bits 104 and up.
 *
 * Table rows are 16 bytes wide.  Offsets relative to the entry %rdi:
 *      48: limb0     64: limb1     80: 5*limb1    96: limb2
 *     112: 5*limb2  128: limb3    144: 5*limb3   160: limb4
 *     176: 5*limb4
 * Inside every row the four dwords are, in address order, taken from
 * r^2, r^1, r^4, r^3.
 */
237bc3d5698SJohn Baldwin.type	__poly1305_init_avx,@function
238bc3d5698SJohn Baldwin.align	32
239bc3d5698SJohn Baldwin__poly1305_init_avx:
240bc3d5698SJohn Baldwin.cfi_startproc
/* h = r */
241bc3d5698SJohn Baldwin	movq	%r11,%r14
242bc3d5698SJohn Baldwin	movq	%r12,%rbx
243bc3d5698SJohn Baldwin	xorq	%rbp,%rbp
244bc3d5698SJohn Baldwin
/* Bias %rdi by 112 so the whole table is addressed with displacements
   in the range -64..76; undone before returning. */
245bc3d5698SJohn Baldwin	leaq	48+64(%rdi),%rdi
246bc3d5698SJohn Baldwin
/* h = r * r */
247bc3d5698SJohn Baldwin	movq	%r12,%rax
248bc3d5698SJohn Baldwin	call	__poly1305_block
249bc3d5698SJohn Baldwin
/* First two columns, processed side by side: %eax/%r8 walk r^2 (from
   %r14/%rbx/%rbp), %edx/%r9 walk r itself (from %r11/%r12). */
/* limb0 */
250bc3d5698SJohn Baldwin	movl	$0x3ffffff,%eax
251bc3d5698SJohn Baldwin	movl	$0x3ffffff,%edx
252bc3d5698SJohn Baldwin	movq	%r14,%r8
253bc3d5698SJohn Baldwin	andl	%r14d,%eax
254bc3d5698SJohn Baldwin	movq	%r11,%r9
255bc3d5698SJohn Baldwin	andl	%r11d,%edx
256bc3d5698SJohn Baldwin	movl	%eax,-64(%rdi)
257bc3d5698SJohn Baldwin	shrq	$26,%r8
258bc3d5698SJohn Baldwin	movl	%edx,-60(%rdi)
259bc3d5698SJohn Baldwin	shrq	$26,%r9
260bc3d5698SJohn Baldwin
/* limb1 and 5*limb1 (lea x+4x) */
261bc3d5698SJohn Baldwin	movl	$0x3ffffff,%eax
262bc3d5698SJohn Baldwin	movl	$0x3ffffff,%edx
263bc3d5698SJohn Baldwin	andl	%r8d,%eax
264bc3d5698SJohn Baldwin	andl	%r9d,%edx
265bc3d5698SJohn Baldwin	movl	%eax,-48(%rdi)
266bc3d5698SJohn Baldwin	leal	(%rax,%rax,4),%eax
267bc3d5698SJohn Baldwin	movl	%edx,-44(%rdi)
268bc3d5698SJohn Baldwin	leal	(%rdx,%rdx,4),%edx
269bc3d5698SJohn Baldwin	movl	%eax,-32(%rdi)
270bc3d5698SJohn Baldwin	shrq	$26,%r8
271bc3d5698SJohn Baldwin	movl	%edx,-28(%rdi)
272bc3d5698SJohn Baldwin	shrq	$26,%r9
273bc3d5698SJohn Baldwin
/* limb2 straddles the two qwords: (high << 12) | (low >> 52), masked */
274bc3d5698SJohn Baldwin	movq	%rbx,%rax
275bc3d5698SJohn Baldwin	movq	%r12,%rdx
276bc3d5698SJohn Baldwin	shlq	$12,%rax
277bc3d5698SJohn Baldwin	shlq	$12,%rdx
278bc3d5698SJohn Baldwin	orq	%r8,%rax
279bc3d5698SJohn Baldwin	orq	%r9,%rdx
280bc3d5698SJohn Baldwin	andl	$0x3ffffff,%eax
281bc3d5698SJohn Baldwin	andl	$0x3ffffff,%edx
282bc3d5698SJohn Baldwin	movl	%eax,-16(%rdi)
283bc3d5698SJohn Baldwin	leal	(%rax,%rax,4),%eax
284bc3d5698SJohn Baldwin	movl	%edx,-12(%rdi)
285bc3d5698SJohn Baldwin	leal	(%rdx,%rdx,4),%edx
286bc3d5698SJohn Baldwin	movl	%eax,0(%rdi)
287bc3d5698SJohn Baldwin	movq	%rbx,%r8
288bc3d5698SJohn Baldwin	movl	%edx,4(%rdi)
289bc3d5698SJohn Baldwin	movq	%r12,%r9
290bc3d5698SJohn Baldwin
/* limb3 = (high >> 14) masked, and 5*limb3 */
291bc3d5698SJohn Baldwin	movl	$0x3ffffff,%eax
292bc3d5698SJohn Baldwin	movl	$0x3ffffff,%edx
293bc3d5698SJohn Baldwin	shrq	$14,%r8
294bc3d5698SJohn Baldwin	shrq	$14,%r9
295bc3d5698SJohn Baldwin	andl	%r8d,%eax
296bc3d5698SJohn Baldwin	andl	%r9d,%edx
297bc3d5698SJohn Baldwin	movl	%eax,16(%rdi)
298bc3d5698SJohn Baldwin	leal	(%rax,%rax,4),%eax
299bc3d5698SJohn Baldwin	movl	%edx,20(%rdi)
300bc3d5698SJohn Baldwin	leal	(%rdx,%rdx,4),%edx
301bc3d5698SJohn Baldwin	movl	%eax,32(%rdi)
302bc3d5698SJohn Baldwin	shrq	$26,%r8
303bc3d5698SJohn Baldwin	movl	%edx,36(%rdi)
304bc3d5698SJohn Baldwin	shrq	$26,%r9
305bc3d5698SJohn Baldwin
/* limb4: for r^2 it is (high >> 40) | (h2 << 24); for r only
   (r1 >> 40), since r has no third limb. */
306bc3d5698SJohn Baldwin	movq	%rbp,%rax
307bc3d5698SJohn Baldwin	shlq	$24,%rax
308bc3d5698SJohn Baldwin	orq	%rax,%r8
309bc3d5698SJohn Baldwin	movl	%r8d,48(%rdi)
310bc3d5698SJohn Baldwin	leaq	(%r8,%r8,4),%r8
311bc3d5698SJohn Baldwin	movl	%r9d,52(%rdi)
312bc3d5698SJohn Baldwin	leaq	(%r9,%r9,4),%r9
313bc3d5698SJohn Baldwin	movl	%r8d,64(%rdi)
314bc3d5698SJohn Baldwin	movl	%r9d,68(%rdi)
315bc3d5698SJohn Baldwin
/* h = r^2 * r = r^3; stored in the fourth dword of every row */
316bc3d5698SJohn Baldwin	movq	%r12,%rax
317bc3d5698SJohn Baldwin	call	__poly1305_block
318bc3d5698SJohn Baldwin
/* limb0 */
319bc3d5698SJohn Baldwin	movl	$0x3ffffff,%eax
320bc3d5698SJohn Baldwin	movq	%r14,%r8
321bc3d5698SJohn Baldwin	andl	%r14d,%eax
322bc3d5698SJohn Baldwin	shrq	$26,%r8
323bc3d5698SJohn Baldwin	movl	%eax,-52(%rdi)
324bc3d5698SJohn Baldwin
/* limb1, 5*limb1 */
325bc3d5698SJohn Baldwin	movl	$0x3ffffff,%edx
326bc3d5698SJohn Baldwin	andl	%r8d,%edx
327bc3d5698SJohn Baldwin	movl	%edx,-36(%rdi)
328bc3d5698SJohn Baldwin	leal	(%rdx,%rdx,4),%edx
329bc3d5698SJohn Baldwin	shrq	$26,%r8
330bc3d5698SJohn Baldwin	movl	%edx,-20(%rdi)
331bc3d5698SJohn Baldwin
/* limb2, 5*limb2 */
332bc3d5698SJohn Baldwin	movq	%rbx,%rax
333bc3d5698SJohn Baldwin	shlq	$12,%rax
334bc3d5698SJohn Baldwin	orq	%r8,%rax
335bc3d5698SJohn Baldwin	andl	$0x3ffffff,%eax
336bc3d5698SJohn Baldwin	movl	%eax,-4(%rdi)
337bc3d5698SJohn Baldwin	leal	(%rax,%rax,4),%eax
338bc3d5698SJohn Baldwin	movq	%rbx,%r8
339bc3d5698SJohn Baldwin	movl	%eax,12(%rdi)
340bc3d5698SJohn Baldwin
/* limb3, 5*limb3 */
341bc3d5698SJohn Baldwin	movl	$0x3ffffff,%edx
342bc3d5698SJohn Baldwin	shrq	$14,%r8
343bc3d5698SJohn Baldwin	andl	%r8d,%edx
344bc3d5698SJohn Baldwin	movl	%edx,28(%rdi)
345bc3d5698SJohn Baldwin	leal	(%rdx,%rdx,4),%edx
346bc3d5698SJohn Baldwin	shrq	$26,%r8
347bc3d5698SJohn Baldwin	movl	%edx,44(%rdi)
348bc3d5698SJohn Baldwin
/* limb4, 5*limb4 */
349bc3d5698SJohn Baldwin	movq	%rbp,%rax
350bc3d5698SJohn Baldwin	shlq	$24,%rax
351bc3d5698SJohn Baldwin	orq	%rax,%r8
352bc3d5698SJohn Baldwin	movl	%r8d,60(%rdi)
353bc3d5698SJohn Baldwin	leaq	(%r8,%r8,4),%r8
354bc3d5698SJohn Baldwin	movl	%r8d,76(%rdi)
355bc3d5698SJohn Baldwin
/* h = r^3 * r = r^4; stored in the third dword of every row */
356bc3d5698SJohn Baldwin	movq	%r12,%rax
357bc3d5698SJohn Baldwin	call	__poly1305_block
358bc3d5698SJohn Baldwin
/* limb0 */
359bc3d5698SJohn Baldwin	movl	$0x3ffffff,%eax
360bc3d5698SJohn Baldwin	movq	%r14,%r8
361bc3d5698SJohn Baldwin	andl	%r14d,%eax
362bc3d5698SJohn Baldwin	shrq	$26,%r8
363bc3d5698SJohn Baldwin	movl	%eax,-56(%rdi)
364bc3d5698SJohn Baldwin
/* limb1, 5*limb1 */
365bc3d5698SJohn Baldwin	movl	$0x3ffffff,%edx
366bc3d5698SJohn Baldwin	andl	%r8d,%edx
367bc3d5698SJohn Baldwin	movl	%edx,-40(%rdi)
368bc3d5698SJohn Baldwin	leal	(%rdx,%rdx,4),%edx
369bc3d5698SJohn Baldwin	shrq	$26,%r8
370bc3d5698SJohn Baldwin	movl	%edx,-24(%rdi)
371bc3d5698SJohn Baldwin
/* limb2, 5*limb2 */
372bc3d5698SJohn Baldwin	movq	%rbx,%rax
373bc3d5698SJohn Baldwin	shlq	$12,%rax
374bc3d5698SJohn Baldwin	orq	%r8,%rax
375bc3d5698SJohn Baldwin	andl	$0x3ffffff,%eax
376bc3d5698SJohn Baldwin	movl	%eax,-8(%rdi)
377bc3d5698SJohn Baldwin	leal	(%rax,%rax,4),%eax
378bc3d5698SJohn Baldwin	movq	%rbx,%r8
379bc3d5698SJohn Baldwin	movl	%eax,8(%rdi)
380bc3d5698SJohn Baldwin
/* limb3, 5*limb3 */
381bc3d5698SJohn Baldwin	movl	$0x3ffffff,%edx
382bc3d5698SJohn Baldwin	shrq	$14,%r8
383bc3d5698SJohn Baldwin	andl	%r8d,%edx
384bc3d5698SJohn Baldwin	movl	%edx,24(%rdi)
385bc3d5698SJohn Baldwin	leal	(%rdx,%rdx,4),%edx
386bc3d5698SJohn Baldwin	shrq	$26,%r8
387bc3d5698SJohn Baldwin	movl	%edx,40(%rdi)
388bc3d5698SJohn Baldwin
/* limb4, 5*limb4 */
389bc3d5698SJohn Baldwin	movq	%rbp,%rax
390bc3d5698SJohn Baldwin	shlq	$24,%rax
391bc3d5698SJohn Baldwin	orq	%rax,%r8
392bc3d5698SJohn Baldwin	movl	%r8d,56(%rdi)
393bc3d5698SJohn Baldwin	leaq	(%r8,%r8,4),%r8
394bc3d5698SJohn Baldwin	movl	%r8d,72(%rdi)
395bc3d5698SJohn Baldwin
/* Undo the +112 bias applied at entry. */
396bc3d5698SJohn Baldwin	leaq	-48-64(%rdi),%rdi
397bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* the bytes of "rep ret" */
398bc3d5698SJohn Baldwin.cfi_endproc
399bc3d5698SJohn Baldwin.size	__poly1305_init_avx,.-__poly1305_init_avx
400bc3d5698SJohn Baldwin
/*
 * poly1305_blocks_avx -- entry dispatch and the scalar "odd block" path.
 * The function continues in further CFI regions after this one
 * (.Lbase2_64_avx, then .Leven_avx / .Ldo_avx).
 *
 * In:   same registers as poly1305_blocks: %rdi = state block,
 *       %rsi = input, %rdx = length in bytes, %rcx = value added into
 *       the top accumulator limb of each block.
 *
 * The dword at 20(%rdi) is a representation flag.  Non-zero: h is held
 * as five dwords at 0/4/8/12/16(%rdi), the 26-bit-limb form (the flag
 * is set to 1 at .Linit_avx).  Zero: h is held as three 64-bit limbs.
 *
 * Dispatch, in order:
 *     length < 128 and flag == 0   -> scalar loop at .Lblocks
 *     (length & -16) == 0          -> return
 *     flag == 0                    -> .Lbase2_64_avx
 *     length a multiple of 32      -> .Leven_avx
 *     otherwise                    -> fall through to .Lblocks_avx_body
 *
 * The fall-through path converts h to three 64-bit limbs, absorbs one
 * 16-byte block with __poly1305_block, and then either stores h in the
 * 64-bit form (when %rcx is 0), stores it in the 26-bit form (when no
 * input is left), or hands the limbs to the vector code in %xmm0-%xmm4
 * via .Lproceed_avx.
 */
401bc3d5698SJohn Baldwin.type	poly1305_blocks_avx,@function
402bc3d5698SJohn Baldwin.align	32
403bc3d5698SJohn Baldwinpoly1305_blocks_avx:
404bc3d5698SJohn Baldwin.cfi_startproc
405bc3d5698SJohn Baldwin	movl	20(%rdi),%r8d	/* representation flag */
406bc3d5698SJohn Baldwin	cmpq	$128,%rdx
407bc3d5698SJohn Baldwin	jae	.Lblocks_avx
408bc3d5698SJohn Baldwin	testl	%r8d,%r8d
409bc3d5698SJohn Baldwin	jz	.Lblocks	/* short input, h still in 64-bit limbs: scalar code */
410bc3d5698SJohn Baldwin
411bc3d5698SJohn Baldwin.Lblocks_avx:
412bc3d5698SJohn Baldwin	andq	$-16,%rdx	/* drop any partial trailing block */
413bc3d5698SJohn Baldwin	jz	.Lno_data_avx
414bc3d5698SJohn Baldwin
415bc3d5698SJohn Baldwin	vzeroupper
416bc3d5698SJohn Baldwin
417bc3d5698SJohn Baldwin	testl	%r8d,%r8d
418bc3d5698SJohn Baldwin	jz	.Lbase2_64_avx
419bc3d5698SJohn Baldwin
420bc3d5698SJohn Baldwin	testq	$31,%rdx
421bc3d5698SJohn Baldwin	jz	.Leven_avx
422bc3d5698SJohn Baldwin
423bc3d5698SJohn Baldwin	pushq	%rbx
424bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
425bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
426bc3d5698SJohn Baldwin	pushq	%rbp
427bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
428bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
429bc3d5698SJohn Baldwin	pushq	%r12
430bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
431bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
432bc3d5698SJohn Baldwin	pushq	%r13
433bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
434bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
435bc3d5698SJohn Baldwin	pushq	%r14
436bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
437bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
438bc3d5698SJohn Baldwin	pushq	%r15
439bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
440bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
441bc3d5698SJohn Baldwin.Lblocks_avx_body:
442bc3d5698SJohn Baldwin
443bc3d5698SJohn Baldwin	movq	%rdx,%r15	/* bytes left */
444bc3d5698SJohn Baldwin
/* Two 32-bit limb slots per qword: %r8 = slots 0 and 1, %r9 = slots 2
   and 3, %ebp = slot 4. */
445bc3d5698SJohn Baldwin	movq	0(%rdi),%r8
446bc3d5698SJohn Baldwin	movq	8(%rdi),%r9
447bc3d5698SJohn Baldwin	movl	16(%rdi),%ebp
448bc3d5698SJohn Baldwin
449bc3d5698SJohn Baldwin	movq	24(%rdi),%r11	/* r0 */
450bc3d5698SJohn Baldwin	movq	32(%rdi),%r13	/* r1, turned into s1 below */
451bc3d5698SJohn Baldwin
452bc3d5698SJohn Baldwin
/* Recombine the five slots (weights 2^0, 2^26, 2^52, 2^78, 2^104) into
   %rbp:%rbx:%r14.  The two and-masks keep bits 31..63 of each qword, so
   a slot that sat at bit 32 lands at bit 26 (or bit 14) after the
   shift.  NOTE(review): why bit 31 of the low dword is kept by the mask
   as well is not evident from this file -- confirm against the
   generator script. */
453bc3d5698SJohn Baldwin	movl	%r8d,%r14d
454bc3d5698SJohn Baldwin	andq	$-2147483648,%r8
455bc3d5698SJohn Baldwin	movq	%r9,%r12
456bc3d5698SJohn Baldwin	movl	%r9d,%ebx
457bc3d5698SJohn Baldwin	andq	$-2147483648,%r9
458bc3d5698SJohn Baldwin
459bc3d5698SJohn Baldwin	shrq	$6,%r8
460bc3d5698SJohn Baldwin	shlq	$52,%r12
461bc3d5698SJohn Baldwin	addq	%r8,%r14
462bc3d5698SJohn Baldwin	shrq	$12,%rbx
463bc3d5698SJohn Baldwin	shrq	$18,%r9
464bc3d5698SJohn Baldwin	addq	%r12,%r14
465bc3d5698SJohn Baldwin	adcq	%r9,%rbx
466bc3d5698SJohn Baldwin
/* Slot 4 has weight 2^104: its low 24 bits go to bit 40 of the middle
   limb, the rest becomes the top limb. */
467bc3d5698SJohn Baldwin	movq	%rbp,%r8
468bc3d5698SJohn Baldwin	shlq	$40,%r8
469bc3d5698SJohn Baldwin	shrq	$24,%rbp
470bc3d5698SJohn Baldwin	addq	%r8,%rbx
471bc3d5698SJohn Baldwin	adcq	$0,%rbp
472bc3d5698SJohn Baldwin
/* Same fold as in __poly1305_block: keep the low two bits of the top
   limb and add 5 * (top >> 2) into the low limb. */
473bc3d5698SJohn Baldwin	movq	$-4,%r9
474bc3d5698SJohn Baldwin	movq	%rbp,%r8
475bc3d5698SJohn Baldwin	andq	%rbp,%r9
476bc3d5698SJohn Baldwin	shrq	$2,%r8
477bc3d5698SJohn Baldwin	andq	$3,%rbp
478bc3d5698SJohn Baldwin	addq	%r9,%r8
479bc3d5698SJohn Baldwin	addq	%r8,%r14
480bc3d5698SJohn Baldwin	adcq	$0,%rbx
481bc3d5698SJohn Baldwin	adcq	$0,%rbp
482bc3d5698SJohn Baldwin
/* %r12 = r1, %rax = r1 (first multiplier), %r13 = s1 = r1 + (r1 >> 2) */
483bc3d5698SJohn Baldwin	movq	%r13,%r12
484bc3d5698SJohn Baldwin	movq	%r13,%rax
485bc3d5698SJohn Baldwin	shrq	$2,%r13
486bc3d5698SJohn Baldwin	addq	%r12,%r13
487bc3d5698SJohn Baldwin
/* h += one 16-byte block (plus %rcx in the top limb), then h *= r */
488bc3d5698SJohn Baldwin	addq	0(%rsi),%r14
489bc3d5698SJohn Baldwin	adcq	8(%rsi),%rbx
490bc3d5698SJohn Baldwin	leaq	16(%rsi),%rsi
491bc3d5698SJohn Baldwin	adcq	%rcx,%rbp
492bc3d5698SJohn Baldwin
493bc3d5698SJohn Baldwin	call	__poly1305_block
494bc3d5698SJohn Baldwin
495bc3d5698SJohn Baldwin	testq	%rcx,%rcx
496bc3d5698SJohn Baldwin	jz	.Lstore_base2_64_avx
497bc3d5698SJohn Baldwin
498bc3d5698SJohn Baldwin
/* Split h back into five 26-bit limbs:
   %eax = bits 0..25, %edx = 26..51, %r14d = 52..77, %ebx = 78..103,
   %ebp = bits 104 and up.  %r11 and %r12 serve as scratch here. */
499bc3d5698SJohn Baldwin	movq	%r14,%rax
500bc3d5698SJohn Baldwin	movq	%r14,%rdx
501bc3d5698SJohn Baldwin	shrq	$52,%r14
502bc3d5698SJohn Baldwin	movq	%rbx,%r11
503bc3d5698SJohn Baldwin	movq	%rbx,%r12
504bc3d5698SJohn Baldwin	shrq	$26,%rdx
505bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rax
506bc3d5698SJohn Baldwin	shlq	$12,%r11
507bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rdx
508bc3d5698SJohn Baldwin	shrq	$14,%rbx
509bc3d5698SJohn Baldwin	orq	%r11,%r14
510bc3d5698SJohn Baldwin	shlq	$24,%rbp
511bc3d5698SJohn Baldwin	andq	$0x3ffffff,%r14
512bc3d5698SJohn Baldwin	shrq	$40,%r12
513bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rbx
514bc3d5698SJohn Baldwin	orq	%r12,%rbp
515bc3d5698SJohn Baldwin
516bc3d5698SJohn Baldwin	subq	$16,%r15	/* one block consumed */
517bc3d5698SJohn Baldwin	jz	.Lstore_base2_26_avx
518bc3d5698SJohn Baldwin
/* More input remains: pass the limbs to the vector code in registers. */
519bc3d5698SJohn Baldwin	vmovd	%eax,%xmm0
520bc3d5698SJohn Baldwin	vmovd	%edx,%xmm1
521bc3d5698SJohn Baldwin	vmovd	%r14d,%xmm2
522bc3d5698SJohn Baldwin	vmovd	%ebx,%xmm3
523bc3d5698SJohn Baldwin	vmovd	%ebp,%xmm4
524bc3d5698SJohn Baldwin	jmp	.Lproceed_avx
525bc3d5698SJohn Baldwin
526bc3d5698SJohn Baldwin.align	32
527bc3d5698SJohn Baldwin.Lstore_base2_64_avx:
/* Store h as three 64-bit limbs.  The 8-byte store to 16(%rdi) also
   overwrites the flag dword at 20(%rdi) with the upper half of %rbp,
   which is zero because %rbp is at most 4 after __poly1305_block. */
528bc3d5698SJohn Baldwin	movq	%r14,0(%rdi)
529bc3d5698SJohn Baldwin	movq	%rbx,8(%rdi)
530bc3d5698SJohn Baldwin	movq	%rbp,16(%rdi)
531bc3d5698SJohn Baldwin	jmp	.Ldone_avx
532bc3d5698SJohn Baldwin
533bc3d5698SJohn Baldwin.align	16
534bc3d5698SJohn Baldwin.Lstore_base2_26_avx:
/* Store h as five dwords; the flag at 20(%rdi) is left untouched. */
535bc3d5698SJohn Baldwin	movl	%eax,0(%rdi)
536bc3d5698SJohn Baldwin	movl	%edx,4(%rdi)
537bc3d5698SJohn Baldwin	movl	%r14d,8(%rdi)
538bc3d5698SJohn Baldwin	movl	%ebx,12(%rdi)
539bc3d5698SJohn Baldwin	movl	%ebp,16(%rdi)
540bc3d5698SJohn Baldwin.align	16
541bc3d5698SJohn Baldwin.Ldone_avx:
/* Reload the saved registers and release the 48 bytes of pushes. */
542bc3d5698SJohn Baldwin	movq	0(%rsp),%r15
543bc3d5698SJohn Baldwin.cfi_restore	%r15
544bc3d5698SJohn Baldwin	movq	8(%rsp),%r14
545bc3d5698SJohn Baldwin.cfi_restore	%r14
546bc3d5698SJohn Baldwin	movq	16(%rsp),%r13
547bc3d5698SJohn Baldwin.cfi_restore	%r13
548bc3d5698SJohn Baldwin	movq	24(%rsp),%r12
549bc3d5698SJohn Baldwin.cfi_restore	%r12
550bc3d5698SJohn Baldwin	movq	32(%rsp),%rbp
551bc3d5698SJohn Baldwin.cfi_restore	%rbp
552bc3d5698SJohn Baldwin	movq	40(%rsp),%rbx
553bc3d5698SJohn Baldwin.cfi_restore	%rbx
554bc3d5698SJohn Baldwin	leaq	48(%rsp),%rsp
555bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	-48
556bc3d5698SJohn Baldwin.Lno_data_avx:
557bc3d5698SJohn Baldwin.Lblocks_avx_epilogue:
558bc3d5698SJohn Baldwin	.byte	0xf3,0xc3	/* the bytes of "rep ret" */
559bc3d5698SJohn Baldwin.cfi_endproc
560bc3d5698SJohn Baldwin
/*
 * .Lbase2_64_avx -- part of poly1305_blocks_avx, reached when the flag
 * dword at 20(%rdi) is zero, i.e. h is still held as 64-bit limbs.
 *
 * On entry %rdi/%rsi/%rdx/%rcx are as at the function entry, except
 * that %rdx has already been rounded down to a multiple of 16.
 *
 * Steps:
 *   1. If the length is not a multiple of 32, absorb one 16-byte block
 *      with the scalar helper so that the rest is.
 *   2. Split h into five 26-bit limbs in %xmm0-%xmm4, set the flag at
 *      20(%rdi) to 1 and build the key-power table with
 *      __poly1305_init_avx.
 *   3. At .Lproceed_avx (also entered from the odd-block path above):
 *      put the remaining length in %rdx, reload the saved registers
 *      and jump to the vector code at .Ldo_avx.
 */
561bc3d5698SJohn Baldwin.align	32
562bc3d5698SJohn Baldwin.Lbase2_64_avx:
563bc3d5698SJohn Baldwin.cfi_startproc
564bc3d5698SJohn Baldwin	pushq	%rbx
565bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
566bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
567bc3d5698SJohn Baldwin	pushq	%rbp
568bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
569bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
570bc3d5698SJohn Baldwin	pushq	%r12
571bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
572bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
573bc3d5698SJohn Baldwin	pushq	%r13
574bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
575bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
576bc3d5698SJohn Baldwin	pushq	%r14
577bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
578bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
579bc3d5698SJohn Baldwin	pushq	%r15
580bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
581bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
582bc3d5698SJohn Baldwin.Lbase2_64_avx_body:
583bc3d5698SJohn Baldwin
584bc3d5698SJohn Baldwin	movq	%rdx,%r15	/* bytes left */
585bc3d5698SJohn Baldwin
586bc3d5698SJohn Baldwin	movq	24(%rdi),%r11	/* r0 */
587bc3d5698SJohn Baldwin	movq	32(%rdi),%r13	/* r1, turned into s1 below */
588bc3d5698SJohn Baldwin
589bc3d5698SJohn Baldwin	movq	0(%rdi),%r14	/* h0 */
590bc3d5698SJohn Baldwin	movq	8(%rdi),%rbx	/* h1 */
591bc3d5698SJohn Baldwin	movl	16(%rdi),%ebp	/* low 32 bits of h2, zero-extended */
592bc3d5698SJohn Baldwin
/* %r12 = r1, %rax = r1 (first multiplier), %r13 = s1 = r1 + (r1 >> 2) */
593bc3d5698SJohn Baldwin	movq	%r13,%r12
594bc3d5698SJohn Baldwin	movq	%r13,%rax
595bc3d5698SJohn Baldwin	shrq	$2,%r13
596bc3d5698SJohn Baldwin	addq	%r12,%r13
597bc3d5698SJohn Baldwin
598bc3d5698SJohn Baldwin	testq	$31,%rdx
599bc3d5698SJohn Baldwin	jz	.Linit_avx	/* already a multiple of 32 bytes */
600bc3d5698SJohn Baldwin
/* Absorb one block: h += block (plus %rcx in the top limb), h *= r. */
601bc3d5698SJohn Baldwin	addq	0(%rsi),%r14
602bc3d5698SJohn Baldwin	adcq	8(%rsi),%rbx
603bc3d5698SJohn Baldwin	leaq	16(%rsi),%rsi
604bc3d5698SJohn Baldwin	adcq	%rcx,%rbp
605bc3d5698SJohn Baldwin	subq	$16,%r15
606bc3d5698SJohn Baldwin
607bc3d5698SJohn Baldwin	call	__poly1305_block
608bc3d5698SJohn Baldwin
609bc3d5698SJohn Baldwin.Linit_avx:
610bc3d5698SJohn Baldwin
/* Split h into five 26-bit limbs:
   %eax = bits 0..25, %edx = 26..51, %r14d = 52..77, %ebx = 78..103,
   %ebp = bits 104 and up.  Scratch is %r8/%r9 here, so %r11 (r0) and
   %r12 (r1) stay intact for __poly1305_init_avx. */
611bc3d5698SJohn Baldwin	movq	%r14,%rax
612bc3d5698SJohn Baldwin	movq	%r14,%rdx
613bc3d5698SJohn Baldwin	shrq	$52,%r14
614bc3d5698SJohn Baldwin	movq	%rbx,%r8
615bc3d5698SJohn Baldwin	movq	%rbx,%r9
616bc3d5698SJohn Baldwin	shrq	$26,%rdx
617bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rax
618bc3d5698SJohn Baldwin	shlq	$12,%r8
619bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rdx
620bc3d5698SJohn Baldwin	shrq	$14,%rbx
621bc3d5698SJohn Baldwin	orq	%r8,%r14
622bc3d5698SJohn Baldwin	shlq	$24,%rbp
623bc3d5698SJohn Baldwin	andq	$0x3ffffff,%r14
624bc3d5698SJohn Baldwin	shrq	$40,%r9
625bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rbx
626bc3d5698SJohn Baldwin	orq	%r9,%rbp
627bc3d5698SJohn Baldwin
628bc3d5698SJohn Baldwin	vmovd	%eax,%xmm0
629bc3d5698SJohn Baldwin	vmovd	%edx,%xmm1
630bc3d5698SJohn Baldwin	vmovd	%r14d,%xmm2
631bc3d5698SJohn Baldwin	vmovd	%ebx,%xmm3
632bc3d5698SJohn Baldwin	vmovd	%ebp,%xmm4
633bc3d5698SJohn Baldwin	movl	$1,20(%rdi)	/* flag: h is in 26-bit-limb form from now on */
634bc3d5698SJohn Baldwin
635bc3d5698SJohn Baldwin	call	__poly1305_init_avx
636bc3d5698SJohn Baldwin
637bc3d5698SJohn Baldwin.Lproceed_avx:
638bc3d5698SJohn Baldwin	movq	%r15,%rdx	/* remaining length, before %r15 is reloaded */
639bc3d5698SJohn Baldwin
640bc3d5698SJohn Baldwin	movq	0(%rsp),%r15
641bc3d5698SJohn Baldwin.cfi_restore	%r15
642bc3d5698SJohn Baldwin	movq	8(%rsp),%r14
643bc3d5698SJohn Baldwin.cfi_restore	%r14
644bc3d5698SJohn Baldwin	movq	16(%rsp),%r13
645bc3d5698SJohn Baldwin.cfi_restore	%r13
646bc3d5698SJohn Baldwin	movq	24(%rsp),%r12
647bc3d5698SJohn Baldwin.cfi_restore	%r12
648bc3d5698SJohn Baldwin	movq	32(%rsp),%rbp
649bc3d5698SJohn Baldwin.cfi_restore	%rbp
650bc3d5698SJohn Baldwin	movq	40(%rsp),%rbx
651bc3d5698SJohn Baldwin.cfi_restore	%rbx
/* NOTE(review): the value placed in %rax here is not read before
   .Ldo_avx overwrites %rax. */
652bc3d5698SJohn Baldwin	leaq	48(%rsp),%rax
653bc3d5698SJohn Baldwin	leaq	48(%rsp),%rsp
654bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	-48
655bc3d5698SJohn Baldwin.Lbase2_64_avx_epilogue:
656bc3d5698SJohn Baldwin	jmp	.Ldo_avx
657bc3d5698SJohn Baldwin.cfi_endproc
658bc3d5698SJohn Baldwin
659bc3d5698SJohn Baldwin.align	32
660bc3d5698SJohn Baldwin.Leven_avx:
661bc3d5698SJohn Baldwin.cfi_startproc
662bc3d5698SJohn Baldwin	vmovd	0(%rdi),%xmm0
663bc3d5698SJohn Baldwin	vmovd	4(%rdi),%xmm1
664bc3d5698SJohn Baldwin	vmovd	8(%rdi),%xmm2
665bc3d5698SJohn Baldwin	vmovd	12(%rdi),%xmm3
666bc3d5698SJohn Baldwin	vmovd	16(%rdi),%xmm4
667bc3d5698SJohn Baldwin
668bc3d5698SJohn Baldwin.Ldo_avx:
669bc3d5698SJohn Baldwin	leaq	-88(%rsp),%r11
670bc3d5698SJohn Baldwin.cfi_def_cfa	%r11,0x60
671bc3d5698SJohn Baldwin	subq	$0x178,%rsp
672bc3d5698SJohn Baldwin	subq	$64,%rdx
673bc3d5698SJohn Baldwin	leaq	-32(%rsi),%rax
674bc3d5698SJohn Baldwin	cmovcq	%rax,%rsi
675bc3d5698SJohn Baldwin
676bc3d5698SJohn Baldwin	vmovdqu	48(%rdi),%xmm14
677bc3d5698SJohn Baldwin	leaq	112(%rdi),%rdi
678bc3d5698SJohn Baldwin	leaq	.Lconst(%rip),%rcx
679bc3d5698SJohn Baldwin
680bc3d5698SJohn Baldwin
681bc3d5698SJohn Baldwin
682bc3d5698SJohn Baldwin	vmovdqu	32(%rsi),%xmm5
683bc3d5698SJohn Baldwin	vmovdqu	48(%rsi),%xmm6
684bc3d5698SJohn Baldwin	vmovdqa	64(%rcx),%xmm15
685bc3d5698SJohn Baldwin
686bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm5,%xmm7
687bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm6,%xmm8
688bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm6,%xmm5,%xmm9
689bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm6,%xmm5,%xmm5
690bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm8,%xmm7,%xmm8
691bc3d5698SJohn Baldwin
692bc3d5698SJohn Baldwin	vpsrlq	$40,%xmm9,%xmm9
693bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm5,%xmm6
694bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm5,%xmm5
695bc3d5698SJohn Baldwin	vpsrlq	$4,%xmm8,%xmm7
696bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm6,%xmm6
697bc3d5698SJohn Baldwin	vpsrlq	$30,%xmm8,%xmm8
698bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm7,%xmm7
699bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm8,%xmm8
700bc3d5698SJohn Baldwin	vpor	32(%rcx),%xmm9,%xmm9
701bc3d5698SJohn Baldwin
702bc3d5698SJohn Baldwin	jbe	.Lskip_loop_avx
703bc3d5698SJohn Baldwin
704bc3d5698SJohn Baldwin
705bc3d5698SJohn Baldwin	vmovdqu	-48(%rdi),%xmm11
706bc3d5698SJohn Baldwin	vmovdqu	-32(%rdi),%xmm12
707bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm14,%xmm13
708bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm14,%xmm10
709bc3d5698SJohn Baldwin	vmovdqa	%xmm13,-144(%r11)
710bc3d5698SJohn Baldwin	vmovdqa	%xmm10,0(%rsp)
711bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm11,%xmm14
712bc3d5698SJohn Baldwin	vmovdqu	-16(%rdi),%xmm10
713bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm11,%xmm11
714bc3d5698SJohn Baldwin	vmovdqa	%xmm14,-128(%r11)
715bc3d5698SJohn Baldwin	vmovdqa	%xmm11,16(%rsp)
716bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm12,%xmm13
717bc3d5698SJohn Baldwin	vmovdqu	0(%rdi),%xmm11
718bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm12,%xmm12
719bc3d5698SJohn Baldwin	vmovdqa	%xmm13,-112(%r11)
720bc3d5698SJohn Baldwin	vmovdqa	%xmm12,32(%rsp)
721bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm10,%xmm14
722bc3d5698SJohn Baldwin	vmovdqu	16(%rdi),%xmm12
723bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm10,%xmm10
724bc3d5698SJohn Baldwin	vmovdqa	%xmm14,-96(%r11)
725bc3d5698SJohn Baldwin	vmovdqa	%xmm10,48(%rsp)
726bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm11,%xmm13
727bc3d5698SJohn Baldwin	vmovdqu	32(%rdi),%xmm10
728bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm11,%xmm11
729bc3d5698SJohn Baldwin	vmovdqa	%xmm13,-80(%r11)
730bc3d5698SJohn Baldwin	vmovdqa	%xmm11,64(%rsp)
731bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm12,%xmm14
732bc3d5698SJohn Baldwin	vmovdqu	48(%rdi),%xmm11
733bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm12,%xmm12
734bc3d5698SJohn Baldwin	vmovdqa	%xmm14,-64(%r11)
735bc3d5698SJohn Baldwin	vmovdqa	%xmm12,80(%rsp)
736bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm10,%xmm13
737bc3d5698SJohn Baldwin	vmovdqu	64(%rdi),%xmm12
738bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm10,%xmm10
739bc3d5698SJohn Baldwin	vmovdqa	%xmm13,-48(%r11)
740bc3d5698SJohn Baldwin	vmovdqa	%xmm10,96(%rsp)
741bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm11,%xmm14
742bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm11,%xmm11
743bc3d5698SJohn Baldwin	vmovdqa	%xmm14,-32(%r11)
744bc3d5698SJohn Baldwin	vmovdqa	%xmm11,112(%rsp)
745bc3d5698SJohn Baldwin	vpshufd	$0xEE,%xmm12,%xmm13
746bc3d5698SJohn Baldwin	vmovdqa	0(%rsp),%xmm14
747bc3d5698SJohn Baldwin	vpshufd	$0x44,%xmm12,%xmm12
748bc3d5698SJohn Baldwin	vmovdqa	%xmm13,-16(%r11)
749bc3d5698SJohn Baldwin	vmovdqa	%xmm12,128(%rsp)
750bc3d5698SJohn Baldwin
751bc3d5698SJohn Baldwin	jmp	.Loop_avx
752bc3d5698SJohn Baldwin
753bc3d5698SJohn Baldwin.align	32
754bc3d5698SJohn Baldwin.Loop_avx:
755bc3d5698SJohn Baldwin
756bc3d5698SJohn Baldwin
757bc3d5698SJohn Baldwin
758bc3d5698SJohn Baldwin
759bc3d5698SJohn Baldwin
760bc3d5698SJohn Baldwin
761bc3d5698SJohn Baldwin
762bc3d5698SJohn Baldwin
763bc3d5698SJohn Baldwin
764bc3d5698SJohn Baldwin
765bc3d5698SJohn Baldwin
766bc3d5698SJohn Baldwin
767bc3d5698SJohn Baldwin
768bc3d5698SJohn Baldwin
769bc3d5698SJohn Baldwin
770bc3d5698SJohn Baldwin
771bc3d5698SJohn Baldwin
772bc3d5698SJohn Baldwin
773bc3d5698SJohn Baldwin
774bc3d5698SJohn Baldwin
775bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm14,%xmm10
776bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm14,%xmm11
777bc3d5698SJohn Baldwin	vmovdqa	%xmm2,32(%r11)
778bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm14,%xmm12
779bc3d5698SJohn Baldwin	vmovdqa	16(%rsp),%xmm2
780bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm14,%xmm13
781bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm14,%xmm14
782bc3d5698SJohn Baldwin
783bc3d5698SJohn Baldwin	vmovdqa	%xmm0,0(%r11)
784bc3d5698SJohn Baldwin	vpmuludq	32(%rsp),%xmm9,%xmm0
785bc3d5698SJohn Baldwin	vmovdqa	%xmm1,16(%r11)
786bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm2,%xmm1
787bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm10,%xmm10
788bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm14,%xmm14
789bc3d5698SJohn Baldwin	vmovdqa	%xmm3,48(%r11)
790bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm2,%xmm0
791bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm2,%xmm1
792bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm13,%xmm13
793bc3d5698SJohn Baldwin	vmovdqa	48(%rsp),%xmm3
794bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm12,%xmm12
795bc3d5698SJohn Baldwin	vmovdqa	%xmm4,64(%r11)
796bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm2,%xmm2
797bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm3,%xmm0
798bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm11,%xmm11
799bc3d5698SJohn Baldwin
800bc3d5698SJohn Baldwin	vmovdqa	64(%rsp),%xmm4
801bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm14,%xmm14
802bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm3,%xmm1
803bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm3,%xmm3
804bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm13,%xmm13
805bc3d5698SJohn Baldwin	vmovdqa	80(%rsp),%xmm2
806bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm12,%xmm12
807bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm4,%xmm0
808bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm4,%xmm4
809bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm11,%xmm11
810bc3d5698SJohn Baldwin	vmovdqa	96(%rsp),%xmm3
811bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm10,%xmm10
812bc3d5698SJohn Baldwin
813bc3d5698SJohn Baldwin	vmovdqa	128(%rsp),%xmm4
814bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm2,%xmm1
815bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm2,%xmm2
816bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm14,%xmm14
817bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm13,%xmm13
818bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm3,%xmm0
819bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm3,%xmm1
820bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm12,%xmm12
821bc3d5698SJohn Baldwin	vmovdqu	0(%rsi),%xmm0
822bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm11,%xmm11
823bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm3,%xmm3
824bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm4,%xmm7
825bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm10,%xmm10
826bc3d5698SJohn Baldwin
827bc3d5698SJohn Baldwin	vmovdqu	16(%rsi),%xmm1
828bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm11,%xmm11
829bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm4,%xmm8
830bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm4,%xmm9
831bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm0,%xmm2
832bc3d5698SJohn Baldwin	vpaddq	%xmm8,%xmm12,%xmm12
833bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm13,%xmm13
834bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm1,%xmm3
835bc3d5698SJohn Baldwin	vpmuludq	112(%rsp),%xmm5,%xmm9
836bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm4,%xmm5
837bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm1,%xmm0,%xmm4
838bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm14,%xmm14
839bc3d5698SJohn Baldwin	vmovdqa	-144(%r11),%xmm9
840bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm10,%xmm10
841bc3d5698SJohn Baldwin
842bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm1,%xmm0,%xmm0
843bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm3,%xmm2,%xmm3
844bc3d5698SJohn Baldwin
845bc3d5698SJohn Baldwin
846bc3d5698SJohn Baldwin	vpsrldq	$5,%xmm4,%xmm4
847bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm0,%xmm1
848bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm0,%xmm0
849bc3d5698SJohn Baldwin	vpsrlq	$4,%xmm3,%xmm2
850bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm1,%xmm1
851bc3d5698SJohn Baldwin	vpand	0(%rcx),%xmm4,%xmm4
852bc3d5698SJohn Baldwin	vpsrlq	$30,%xmm3,%xmm3
853bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm2,%xmm2
854bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm3,%xmm3
855bc3d5698SJohn Baldwin	vpor	32(%rcx),%xmm4,%xmm4
856bc3d5698SJohn Baldwin
857bc3d5698SJohn Baldwin	vpaddq	0(%r11),%xmm0,%xmm0
858bc3d5698SJohn Baldwin	vpaddq	16(%r11),%xmm1,%xmm1
859bc3d5698SJohn Baldwin	vpaddq	32(%r11),%xmm2,%xmm2
860bc3d5698SJohn Baldwin	vpaddq	48(%r11),%xmm3,%xmm3
861bc3d5698SJohn Baldwin	vpaddq	64(%r11),%xmm4,%xmm4
862bc3d5698SJohn Baldwin
863bc3d5698SJohn Baldwin	leaq	32(%rsi),%rax
864bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
865bc3d5698SJohn Baldwin	subq	$64,%rdx
866bc3d5698SJohn Baldwin	cmovcq	%rax,%rsi
867bc3d5698SJohn Baldwin
868bc3d5698SJohn Baldwin
869bc3d5698SJohn Baldwin
870bc3d5698SJohn Baldwin
871bc3d5698SJohn Baldwin
872bc3d5698SJohn Baldwin
873bc3d5698SJohn Baldwin
874bc3d5698SJohn Baldwin
875bc3d5698SJohn Baldwin
876bc3d5698SJohn Baldwin
877bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm9,%xmm5
878bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm9,%xmm6
879bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm10,%xmm10
880bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm11,%xmm11
881bc3d5698SJohn Baldwin	vmovdqa	-128(%r11),%xmm7
882bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm9,%xmm5
883bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm9,%xmm6
884bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm12,%xmm12
885bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm13,%xmm13
886bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm9,%xmm9
887bc3d5698SJohn Baldwin	vpmuludq	-112(%r11),%xmm4,%xmm5
888bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm14,%xmm14
889bc3d5698SJohn Baldwin
890bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm10,%xmm10
891bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm7,%xmm6
892bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm7,%xmm5
893bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm13,%xmm13
894bc3d5698SJohn Baldwin	vmovdqa	-96(%r11),%xmm8
895bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm14,%xmm14
896bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm7,%xmm6
897bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm7,%xmm7
898bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm12,%xmm12
899bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm11,%xmm11
900bc3d5698SJohn Baldwin
901bc3d5698SJohn Baldwin	vmovdqa	-80(%r11),%xmm9
902bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm8,%xmm5
903bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm8,%xmm6
904bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm14,%xmm14
905bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm13,%xmm13
906bc3d5698SJohn Baldwin	vmovdqa	-64(%r11),%xmm7
907bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm8,%xmm8
908bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm9,%xmm5
909bc3d5698SJohn Baldwin	vpaddq	%xmm8,%xmm12,%xmm12
910bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm11,%xmm11
911bc3d5698SJohn Baldwin	vmovdqa	-48(%r11),%xmm8
912bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm9,%xmm9
913bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm7,%xmm6
914bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm10,%xmm10
915bc3d5698SJohn Baldwin
916bc3d5698SJohn Baldwin	vmovdqa	-16(%r11),%xmm9
917bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm14,%xmm14
918bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm7,%xmm7
919bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm8,%xmm5
920bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm13,%xmm13
921bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm12,%xmm12
922bc3d5698SJohn Baldwin	vmovdqu	32(%rsi),%xmm5
923bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm8,%xmm7
924bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm8,%xmm8
925bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm11,%xmm11
926bc3d5698SJohn Baldwin	vmovdqu	48(%rsi),%xmm6
927bc3d5698SJohn Baldwin	vpaddq	%xmm8,%xmm10,%xmm10
928bc3d5698SJohn Baldwin
929bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm9,%xmm2
930bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm9,%xmm3
931bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm5,%xmm7
932bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm11,%xmm11
933bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm9,%xmm4
934bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm6,%xmm8
935bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm12,%xmm2
936bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm13,%xmm3
937bc3d5698SJohn Baldwin	vpmuludq	-32(%r11),%xmm0,%xmm4
938bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm9,%xmm0
939bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm6,%xmm5,%xmm9
940bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm14,%xmm4
941bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm10,%xmm0
942bc3d5698SJohn Baldwin
943bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm6,%xmm5,%xmm5
944bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm8,%xmm7,%xmm8
945bc3d5698SJohn Baldwin
946bc3d5698SJohn Baldwin
947bc3d5698SJohn Baldwin	vpsrldq	$5,%xmm9,%xmm9
948bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm5,%xmm6
949bc3d5698SJohn Baldwin	vmovdqa	0(%rsp),%xmm14
950bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm5,%xmm5
951bc3d5698SJohn Baldwin	vpsrlq	$4,%xmm8,%xmm7
952bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm6,%xmm6
953bc3d5698SJohn Baldwin	vpand	0(%rcx),%xmm9,%xmm9
954bc3d5698SJohn Baldwin	vpsrlq	$30,%xmm8,%xmm8
955bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm7,%xmm7
956bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm8,%xmm8
957bc3d5698SJohn Baldwin	vpor	32(%rcx),%xmm9,%xmm9
958bc3d5698SJohn Baldwin
959bc3d5698SJohn Baldwin
960bc3d5698SJohn Baldwin
961bc3d5698SJohn Baldwin
962bc3d5698SJohn Baldwin
963bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm3,%xmm13
964bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm3,%xmm3
965bc3d5698SJohn Baldwin	vpaddq	%xmm13,%xmm4,%xmm4
966bc3d5698SJohn Baldwin
967bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm0,%xmm10
968bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm0,%xmm0
969bc3d5698SJohn Baldwin	vpaddq	%xmm10,%xmm11,%xmm1
970bc3d5698SJohn Baldwin
971bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm4,%xmm10
972bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm4,%xmm4
973bc3d5698SJohn Baldwin
974bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm1,%xmm11
975bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm1,%xmm1
976bc3d5698SJohn Baldwin	vpaddq	%xmm11,%xmm2,%xmm2
977bc3d5698SJohn Baldwin
978bc3d5698SJohn Baldwin	vpaddq	%xmm10,%xmm0,%xmm0
979bc3d5698SJohn Baldwin	vpsllq	$2,%xmm10,%xmm10
980bc3d5698SJohn Baldwin	vpaddq	%xmm10,%xmm0,%xmm0
981bc3d5698SJohn Baldwin
982bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm2,%xmm12
983bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm2,%xmm2
984bc3d5698SJohn Baldwin	vpaddq	%xmm12,%xmm3,%xmm3
985bc3d5698SJohn Baldwin
986bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm0,%xmm10
987bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm0,%xmm0
988bc3d5698SJohn Baldwin	vpaddq	%xmm10,%xmm1,%xmm1
989bc3d5698SJohn Baldwin
990bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm3,%xmm13
991bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm3,%xmm3
992bc3d5698SJohn Baldwin	vpaddq	%xmm13,%xmm4,%xmm4
993bc3d5698SJohn Baldwin
994bc3d5698SJohn Baldwin	ja	.Loop_avx
995bc3d5698SJohn Baldwin
996bc3d5698SJohn Baldwin.Lskip_loop_avx:
997bc3d5698SJohn Baldwin
998bc3d5698SJohn Baldwin
999bc3d5698SJohn Baldwin
1000bc3d5698SJohn Baldwin	vpshufd	$0x10,%xmm14,%xmm14
1001bc3d5698SJohn Baldwin	addq	$32,%rdx
1002bc3d5698SJohn Baldwin	jnz	.Long_tail_avx
1003bc3d5698SJohn Baldwin
1004bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm7,%xmm7
1005bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm5,%xmm5
1006bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm6,%xmm6
1007bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm8,%xmm8
1008bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm9,%xmm9
1009bc3d5698SJohn Baldwin
1010bc3d5698SJohn Baldwin.Long_tail_avx:
1011bc3d5698SJohn Baldwin	vmovdqa	%xmm2,32(%r11)
1012bc3d5698SJohn Baldwin	vmovdqa	%xmm0,0(%r11)
1013bc3d5698SJohn Baldwin	vmovdqa	%xmm1,16(%r11)
1014bc3d5698SJohn Baldwin	vmovdqa	%xmm3,48(%r11)
1015bc3d5698SJohn Baldwin	vmovdqa	%xmm4,64(%r11)
1016bc3d5698SJohn Baldwin
1017bc3d5698SJohn Baldwin
1018bc3d5698SJohn Baldwin
1019bc3d5698SJohn Baldwin
1020bc3d5698SJohn Baldwin
1021bc3d5698SJohn Baldwin
1022bc3d5698SJohn Baldwin
1023bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm14,%xmm12
1024bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm14,%xmm10
1025bc3d5698SJohn Baldwin	vpshufd	$0x10,-48(%rdi),%xmm2
1026bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm14,%xmm11
1027bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm14,%xmm13
1028bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm14,%xmm14
1029bc3d5698SJohn Baldwin
1030bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm2,%xmm0
1031bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm14,%xmm14
1032bc3d5698SJohn Baldwin	vpshufd	$0x10,-32(%rdi),%xmm3
1033bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm2,%xmm1
1034bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm13,%xmm13
1035bc3d5698SJohn Baldwin	vpshufd	$0x10,-16(%rdi),%xmm4
1036bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm2,%xmm0
1037bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm12,%xmm12
1038bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm2,%xmm2
1039bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm11,%xmm11
1040bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm3,%xmm3
1041bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm10,%xmm10
1042bc3d5698SJohn Baldwin
1043bc3d5698SJohn Baldwin	vpshufd	$0x10,0(%rdi),%xmm2
1044bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm4,%xmm1
1045bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm14,%xmm14
1046bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm4,%xmm0
1047bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm13,%xmm13
1048bc3d5698SJohn Baldwin	vpshufd	$0x10,16(%rdi),%xmm3
1049bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm4,%xmm4
1050bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm12,%xmm12
1051bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm2,%xmm1
1052bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm11,%xmm11
1053bc3d5698SJohn Baldwin	vpshufd	$0x10,32(%rdi),%xmm4
1054bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm2,%xmm2
1055bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm10,%xmm10
1056bc3d5698SJohn Baldwin
1057bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm3,%xmm0
1058bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm14,%xmm14
1059bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm3,%xmm3
1060bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm13,%xmm13
1061bc3d5698SJohn Baldwin	vpshufd	$0x10,48(%rdi),%xmm2
1062bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm4,%xmm1
1063bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm12,%xmm12
1064bc3d5698SJohn Baldwin	vpshufd	$0x10,64(%rdi),%xmm3
1065bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm4,%xmm0
1066bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm11,%xmm11
1067bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm4,%xmm4
1068bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm10,%xmm10
1069bc3d5698SJohn Baldwin
1070bc3d5698SJohn Baldwin	vpmuludq	%xmm5,%xmm2,%xmm2
1071bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm14,%xmm14
1072bc3d5698SJohn Baldwin	vpmuludq	%xmm9,%xmm3,%xmm1
1073bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm13,%xmm13
1074bc3d5698SJohn Baldwin	vpmuludq	%xmm8,%xmm3,%xmm0
1075bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm12,%xmm12
1076bc3d5698SJohn Baldwin	vpmuludq	%xmm7,%xmm3,%xmm1
1077bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm11,%xmm11
1078bc3d5698SJohn Baldwin	vpmuludq	%xmm6,%xmm3,%xmm3
1079bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm10,%xmm10
1080bc3d5698SJohn Baldwin
1081bc3d5698SJohn Baldwin	jz	.Lshort_tail_avx
1082bc3d5698SJohn Baldwin
1083bc3d5698SJohn Baldwin	vmovdqu	0(%rsi),%xmm0
1084bc3d5698SJohn Baldwin	vmovdqu	16(%rsi),%xmm1
1085bc3d5698SJohn Baldwin
1086bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm0,%xmm2
1087bc3d5698SJohn Baldwin	vpsrldq	$6,%xmm1,%xmm3
1088bc3d5698SJohn Baldwin	vpunpckhqdq	%xmm1,%xmm0,%xmm4
1089bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm1,%xmm0,%xmm0
1090bc3d5698SJohn Baldwin	vpunpcklqdq	%xmm3,%xmm2,%xmm3
1091bc3d5698SJohn Baldwin
1092bc3d5698SJohn Baldwin	vpsrlq	$40,%xmm4,%xmm4
1093bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm0,%xmm1
1094bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm0,%xmm0
1095bc3d5698SJohn Baldwin	vpsrlq	$4,%xmm3,%xmm2
1096bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm1,%xmm1
1097bc3d5698SJohn Baldwin	vpsrlq	$30,%xmm3,%xmm3
1098bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm2,%xmm2
1099bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm3,%xmm3
1100bc3d5698SJohn Baldwin	vpor	32(%rcx),%xmm4,%xmm4
1101bc3d5698SJohn Baldwin
1102bc3d5698SJohn Baldwin	vpshufd	$0x32,-64(%rdi),%xmm9
1103bc3d5698SJohn Baldwin	vpaddq	0(%r11),%xmm0,%xmm0
1104bc3d5698SJohn Baldwin	vpaddq	16(%r11),%xmm1,%xmm1
1105bc3d5698SJohn Baldwin	vpaddq	32(%r11),%xmm2,%xmm2
1106bc3d5698SJohn Baldwin	vpaddq	48(%r11),%xmm3,%xmm3
1107bc3d5698SJohn Baldwin	vpaddq	64(%r11),%xmm4,%xmm4
1108bc3d5698SJohn Baldwin
1109bc3d5698SJohn Baldwin
1110bc3d5698SJohn Baldwin
1111bc3d5698SJohn Baldwin
1112bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm9,%xmm5
1113bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm10,%xmm10
1114bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm9,%xmm6
1115bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm11,%xmm11
1116bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm9,%xmm5
1117bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm12,%xmm12
1118bc3d5698SJohn Baldwin	vpshufd	$0x32,-48(%rdi),%xmm7
1119bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm9,%xmm6
1120bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm13,%xmm13
1121bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm9,%xmm9
1122bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm14,%xmm14
1123bc3d5698SJohn Baldwin
1124bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm7,%xmm5
1125bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm14,%xmm14
1126bc3d5698SJohn Baldwin	vpshufd	$0x32,-32(%rdi),%xmm8
1127bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm7,%xmm6
1128bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm13,%xmm13
1129bc3d5698SJohn Baldwin	vpshufd	$0x32,-16(%rdi),%xmm9
1130bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm7,%xmm5
1131bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm12,%xmm12
1132bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm7,%xmm7
1133bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm11,%xmm11
1134bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm8,%xmm8
1135bc3d5698SJohn Baldwin	vpaddq	%xmm8,%xmm10,%xmm10
1136bc3d5698SJohn Baldwin
1137bc3d5698SJohn Baldwin	vpshufd	$0x32,0(%rdi),%xmm7
1138bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm9,%xmm6
1139bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm14,%xmm14
1140bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm9,%xmm5
1141bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm13,%xmm13
1142bc3d5698SJohn Baldwin	vpshufd	$0x32,16(%rdi),%xmm8
1143bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm9,%xmm9
1144bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm12,%xmm12
1145bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm7,%xmm6
1146bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm11,%xmm11
1147bc3d5698SJohn Baldwin	vpshufd	$0x32,32(%rdi),%xmm9
1148bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm7,%xmm7
1149bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm10,%xmm10
1150bc3d5698SJohn Baldwin
1151bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm8,%xmm5
1152bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm14,%xmm14
1153bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm8,%xmm8
1154bc3d5698SJohn Baldwin	vpaddq	%xmm8,%xmm13,%xmm13
1155bc3d5698SJohn Baldwin	vpshufd	$0x32,48(%rdi),%xmm7
1156bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm9,%xmm6
1157bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm12,%xmm12
1158bc3d5698SJohn Baldwin	vpshufd	$0x32,64(%rdi),%xmm8
1159bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm9,%xmm5
1160bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm11,%xmm11
1161bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm9,%xmm9
1162bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm10,%xmm10
1163bc3d5698SJohn Baldwin
1164bc3d5698SJohn Baldwin	vpmuludq	%xmm0,%xmm7,%xmm7
1165bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm14,%xmm14
1166bc3d5698SJohn Baldwin	vpmuludq	%xmm4,%xmm8,%xmm6
1167bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm13,%xmm13
1168bc3d5698SJohn Baldwin	vpmuludq	%xmm3,%xmm8,%xmm5
1169bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm12,%xmm12
1170bc3d5698SJohn Baldwin	vpmuludq	%xmm2,%xmm8,%xmm6
1171bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm11,%xmm11
1172bc3d5698SJohn Baldwin	vpmuludq	%xmm1,%xmm8,%xmm8
1173bc3d5698SJohn Baldwin	vpaddq	%xmm8,%xmm10,%xmm10
1174bc3d5698SJohn Baldwin
1175bc3d5698SJohn Baldwin.Lshort_tail_avx:
1176bc3d5698SJohn Baldwin
1177bc3d5698SJohn Baldwin
1178bc3d5698SJohn Baldwin
1179bc3d5698SJohn Baldwin	vpsrldq	$8,%xmm14,%xmm9
1180bc3d5698SJohn Baldwin	vpsrldq	$8,%xmm13,%xmm8
1181bc3d5698SJohn Baldwin	vpsrldq	$8,%xmm11,%xmm6
1182bc3d5698SJohn Baldwin	vpsrldq	$8,%xmm10,%xmm5
1183bc3d5698SJohn Baldwin	vpsrldq	$8,%xmm12,%xmm7
1184bc3d5698SJohn Baldwin	vpaddq	%xmm8,%xmm13,%xmm13
1185bc3d5698SJohn Baldwin	vpaddq	%xmm9,%xmm14,%xmm14
1186bc3d5698SJohn Baldwin	vpaddq	%xmm5,%xmm10,%xmm10
1187bc3d5698SJohn Baldwin	vpaddq	%xmm6,%xmm11,%xmm11
1188bc3d5698SJohn Baldwin	vpaddq	%xmm7,%xmm12,%xmm12
1189bc3d5698SJohn Baldwin
1190bc3d5698SJohn Baldwin
1191bc3d5698SJohn Baldwin
1192bc3d5698SJohn Baldwin
1193bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm13,%xmm3
1194bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm13,%xmm13
1195bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm14,%xmm14
1196bc3d5698SJohn Baldwin
1197bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm10,%xmm0
1198bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm10,%xmm10
1199bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm11,%xmm11
1200bc3d5698SJohn Baldwin
1201bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm14,%xmm4
1202bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm14,%xmm14
1203bc3d5698SJohn Baldwin
1204bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm11,%xmm1
1205bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm11,%xmm11
1206bc3d5698SJohn Baldwin	vpaddq	%xmm1,%xmm12,%xmm12
1207bc3d5698SJohn Baldwin
1208bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm10,%xmm10
1209bc3d5698SJohn Baldwin	vpsllq	$2,%xmm4,%xmm4
1210bc3d5698SJohn Baldwin	vpaddq	%xmm4,%xmm10,%xmm10
1211bc3d5698SJohn Baldwin
1212bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm12,%xmm2
1213bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm12,%xmm12
1214bc3d5698SJohn Baldwin	vpaddq	%xmm2,%xmm13,%xmm13
1215bc3d5698SJohn Baldwin
1216bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm10,%xmm0
1217bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm10,%xmm10
1218bc3d5698SJohn Baldwin	vpaddq	%xmm0,%xmm11,%xmm11
1219bc3d5698SJohn Baldwin
1220bc3d5698SJohn Baldwin	vpsrlq	$26,%xmm13,%xmm3
1221bc3d5698SJohn Baldwin	vpand	%xmm15,%xmm13,%xmm13
1222bc3d5698SJohn Baldwin	vpaddq	%xmm3,%xmm14,%xmm14
1223bc3d5698SJohn Baldwin
1224bc3d5698SJohn Baldwin	vmovd	%xmm10,-112(%rdi)
1225bc3d5698SJohn Baldwin	vmovd	%xmm11,-108(%rdi)
1226bc3d5698SJohn Baldwin	vmovd	%xmm12,-104(%rdi)
1227bc3d5698SJohn Baldwin	vmovd	%xmm13,-100(%rdi)
1228bc3d5698SJohn Baldwin	vmovd	%xmm14,-96(%rdi)
1229bc3d5698SJohn Baldwin	leaq	88(%r11),%rsp
1230bc3d5698SJohn Baldwin.cfi_def_cfa	%rsp,8
1231bc3d5698SJohn Baldwin	vzeroupper
1232bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1233bc3d5698SJohn Baldwin.cfi_endproc
1234bc3d5698SJohn Baldwin.size	poly1305_blocks_avx,.-poly1305_blocks_avx
1235bc3d5698SJohn Baldwin
/*
 * poly1305_emit_avx
 *
 * Produces a 16-byte result from the hash state at (%rdi) and stores it at
 * (%rsi).
 *
 *   %rdi  state.  The 32-bit word at 20(%rdi) selects the path: when it is
 *         zero control transfers to .Lemit (defined elsewhere in this file).
 *         Otherwise the five 32-bit words at 0,4,8,12,16(%rdi) are read and
 *         combined with weights 2^0, 2^26, 2^52, 2^78 and 2^104.
 *   %rsi  destination; two 64-bit words (16 bytes) are written.
 *   %rdx  source of a 16-byte addend, read as two 64-bit words
 *         (presumably the nonce/pad -- confirm against the C prototype).
 *
 * Uses only %rax, %rcx, %r8-%r11 and the flags; the only stack access is the
 * return itself.
 */
1236bc3d5698SJohn Baldwin.type	poly1305_emit_avx,@function
1237bc3d5698SJohn Baldwin.align	32
1238bc3d5698SJohn Baldwinpoly1305_emit_avx:
1239bc3d5698SJohn Baldwin.cfi_startproc
	/* Word at 20(%rdi) == 0: let .Lemit handle the state instead. */
1240bc3d5698SJohn Baldwin	cmpl	$0,20(%rdi)
1241bc3d5698SJohn Baldwin	je	.Lemit
1242bc3d5698SJohn Baldwin
	/* Load the five 32-bit words h0..h4 (32-bit moves zero-extend). */
1243bc3d5698SJohn Baldwin	movl	0(%rdi),%eax
1244bc3d5698SJohn Baldwin	movl	4(%rdi),%ecx
1245bc3d5698SJohn Baldwin	movl	8(%rdi),%r8d
1246bc3d5698SJohn Baldwin	movl	12(%rdi),%r11d
1247bc3d5698SJohn Baldwin	movl	16(%rdi),%r10d
1248bc3d5698SJohn Baldwin
	/*
	 * Low 64 bits: %r8 = h0 + (h1 << 26) + (h2 << 52).
	 * h2 straddles the 64-bit boundary, so h2 >> 12 seeds the middle
	 * word %r9 and the carry out of the low word is added to it.
	 */
1249bc3d5698SJohn Baldwin	shlq	$26,%rcx
1250bc3d5698SJohn Baldwin	movq	%r8,%r9
1251bc3d5698SJohn Baldwin	shlq	$52,%r8
1252bc3d5698SJohn Baldwin	addq	%rcx,%rax
1253bc3d5698SJohn Baldwin	shrq	$12,%r9
1254bc3d5698SJohn Baldwin	addq	%rax,%r8
1255bc3d5698SJohn Baldwin	adcq	$0,%r9
1256bc3d5698SJohn Baldwin
	/*
	 * Middle 64 bits: %r9 += (h3 << 14) + (h4 << 40), i.e. weights 2^78
	 * and 2^104 relative to bit 0.  The part of h4 that does not fit
	 * (h4 >> 24) plus the carry becomes the top word %r10 (weight 2^128).
	 */
1257bc3d5698SJohn Baldwin	shlq	$14,%r11
1258bc3d5698SJohn Baldwin	movq	%r10,%rax
1259bc3d5698SJohn Baldwin	shrq	$24,%r10
1260bc3d5698SJohn Baldwin	addq	%r11,%r9
1261bc3d5698SJohn Baldwin	shlq	$40,%rax
1262bc3d5698SJohn Baldwin	addq	%rax,%r9
1263bc3d5698SJohn Baldwin	adcq	$0,%r10
1264bc3d5698SJohn Baldwin
	/*
	 * Fold everything at or above bit 130 back into the low end.
	 * With t = %r10 >> 2, the code adds (t) + (%r10 & -4) = t + 4*t = 5*t
	 * to %r8 and keeps only the low two bits of %r10, using
	 * 2^130 == 5 (mod 2^130 - 5).  Carries ripple through %r9 into %r10.
	 */
1265bc3d5698SJohn Baldwin	movq	%r10,%rax
1266bc3d5698SJohn Baldwin	movq	%r10,%rcx
1267bc3d5698SJohn Baldwin	andq	$3,%r10
1268bc3d5698SJohn Baldwin	shrq	$2,%rax
1269bc3d5698SJohn Baldwin	andq	$-4,%rcx
1270bc3d5698SJohn Baldwin	addq	%rcx,%rax
1271bc3d5698SJohn Baldwin	addq	%rax,%r8
1272bc3d5698SJohn Baldwin	adcq	$0,%r9
1273bc3d5698SJohn Baldwin	adcq	$0,%r10
1274bc3d5698SJohn Baldwin
	/*
	 * Final conditional subtraction of 2^130 - 5, done without a branch:
	 * keep h in %rax:%rcx, compute h + 5 in %r8:%r9:%r10, and if that sum
	 * reaches bit 130 (%r10 >> 2 is nonzero) take its low 128 bits instead.
	 * The movq between addq and adcq is safe because mov leaves the flags
	 * untouched; shrq sets ZF for the two cmovnz instructions.
	 */
1275bc3d5698SJohn Baldwin	movq	%r8,%rax
1276bc3d5698SJohn Baldwin	addq	$5,%r8
1277bc3d5698SJohn Baldwin	movq	%r9,%rcx
1278bc3d5698SJohn Baldwin	adcq	$0,%r9
1279bc3d5698SJohn Baldwin	adcq	$0,%r10
1280bc3d5698SJohn Baldwin	shrq	$2,%r10
1281bc3d5698SJohn Baldwin	cmovnzq	%r8,%rax
1282bc3d5698SJohn Baldwin	cmovnzq	%r9,%rcx
1283bc3d5698SJohn Baldwin
	/*
	 * Add the 128-bit value at (%rdx) with carry between the two words
	 * (the carry out of the high word is discarded) and store 16 bytes
	 * at (%rsi).
	 */
1284bc3d5698SJohn Baldwin	addq	0(%rdx),%rax
1285bc3d5698SJohn Baldwin	adcq	8(%rdx),%rcx
1286bc3d5698SJohn Baldwin	movq	%rax,0(%rsi)
1287bc3d5698SJohn Baldwin	movq	%rcx,8(%rsi)
1288bc3d5698SJohn Baldwin
	/* 0xf3,0xc3 is the raw encoding of "rep ret". */
1289bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1290bc3d5698SJohn Baldwin.cfi_endproc
1291bc3d5698SJohn Baldwin.size	poly1305_emit_avx,.-poly1305_emit_avx
1292bc3d5698SJohn Baldwin.type	poly1305_blocks_avx2,@function
1293bc3d5698SJohn Baldwin.align	32
1294bc3d5698SJohn Baldwinpoly1305_blocks_avx2:
1295bc3d5698SJohn Baldwin.cfi_startproc
1296bc3d5698SJohn Baldwin	movl	20(%rdi),%r8d
1297bc3d5698SJohn Baldwin	cmpq	$128,%rdx
1298bc3d5698SJohn Baldwin	jae	.Lblocks_avx2
1299bc3d5698SJohn Baldwin	testl	%r8d,%r8d
1300bc3d5698SJohn Baldwin	jz	.Lblocks
1301bc3d5698SJohn Baldwin
1302bc3d5698SJohn Baldwin.Lblocks_avx2:
1303bc3d5698SJohn Baldwin	andq	$-16,%rdx
1304bc3d5698SJohn Baldwin	jz	.Lno_data_avx2
1305bc3d5698SJohn Baldwin
1306bc3d5698SJohn Baldwin	vzeroupper
1307bc3d5698SJohn Baldwin
1308bc3d5698SJohn Baldwin	testl	%r8d,%r8d
1309bc3d5698SJohn Baldwin	jz	.Lbase2_64_avx2
1310bc3d5698SJohn Baldwin
1311bc3d5698SJohn Baldwin	testq	$63,%rdx
1312bc3d5698SJohn Baldwin	jz	.Leven_avx2
1313bc3d5698SJohn Baldwin
1314bc3d5698SJohn Baldwin	pushq	%rbx
1315bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1316bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
1317bc3d5698SJohn Baldwin	pushq	%rbp
1318bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1319bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
1320bc3d5698SJohn Baldwin	pushq	%r12
1321bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1322bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
1323bc3d5698SJohn Baldwin	pushq	%r13
1324bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1325bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
1326bc3d5698SJohn Baldwin	pushq	%r14
1327bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1328bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
1329bc3d5698SJohn Baldwin	pushq	%r15
1330bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1331bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
1332bc3d5698SJohn Baldwin.Lblocks_avx2_body:
1333bc3d5698SJohn Baldwin
1334bc3d5698SJohn Baldwin	movq	%rdx,%r15
1335bc3d5698SJohn Baldwin
1336bc3d5698SJohn Baldwin	movq	0(%rdi),%r8
1337bc3d5698SJohn Baldwin	movq	8(%rdi),%r9
1338bc3d5698SJohn Baldwin	movl	16(%rdi),%ebp
1339bc3d5698SJohn Baldwin
1340bc3d5698SJohn Baldwin	movq	24(%rdi),%r11
1341bc3d5698SJohn Baldwin	movq	32(%rdi),%r13
1342bc3d5698SJohn Baldwin
1343bc3d5698SJohn Baldwin
1344bc3d5698SJohn Baldwin	movl	%r8d,%r14d
1345bc3d5698SJohn Baldwin	andq	$-2147483648,%r8
1346bc3d5698SJohn Baldwin	movq	%r9,%r12
1347bc3d5698SJohn Baldwin	movl	%r9d,%ebx
1348bc3d5698SJohn Baldwin	andq	$-2147483648,%r9
1349bc3d5698SJohn Baldwin
1350bc3d5698SJohn Baldwin	shrq	$6,%r8
1351bc3d5698SJohn Baldwin	shlq	$52,%r12
1352bc3d5698SJohn Baldwin	addq	%r8,%r14
1353bc3d5698SJohn Baldwin	shrq	$12,%rbx
1354bc3d5698SJohn Baldwin	shrq	$18,%r9
1355bc3d5698SJohn Baldwin	addq	%r12,%r14
1356bc3d5698SJohn Baldwin	adcq	%r9,%rbx
1357bc3d5698SJohn Baldwin
1358bc3d5698SJohn Baldwin	movq	%rbp,%r8
1359bc3d5698SJohn Baldwin	shlq	$40,%r8
1360bc3d5698SJohn Baldwin	shrq	$24,%rbp
1361bc3d5698SJohn Baldwin	addq	%r8,%rbx
1362bc3d5698SJohn Baldwin	adcq	$0,%rbp
1363bc3d5698SJohn Baldwin
1364bc3d5698SJohn Baldwin	movq	$-4,%r9
1365bc3d5698SJohn Baldwin	movq	%rbp,%r8
1366bc3d5698SJohn Baldwin	andq	%rbp,%r9
1367bc3d5698SJohn Baldwin	shrq	$2,%r8
1368bc3d5698SJohn Baldwin	andq	$3,%rbp
1369bc3d5698SJohn Baldwin	addq	%r9,%r8
1370bc3d5698SJohn Baldwin	addq	%r8,%r14
1371bc3d5698SJohn Baldwin	adcq	$0,%rbx
1372bc3d5698SJohn Baldwin	adcq	$0,%rbp
1373bc3d5698SJohn Baldwin
1374bc3d5698SJohn Baldwin	movq	%r13,%r12
1375bc3d5698SJohn Baldwin	movq	%r13,%rax
1376bc3d5698SJohn Baldwin	shrq	$2,%r13
1377bc3d5698SJohn Baldwin	addq	%r12,%r13
1378bc3d5698SJohn Baldwin
1379bc3d5698SJohn Baldwin.Lbase2_26_pre_avx2:
1380bc3d5698SJohn Baldwin	addq	0(%rsi),%r14
1381bc3d5698SJohn Baldwin	adcq	8(%rsi),%rbx
1382bc3d5698SJohn Baldwin	leaq	16(%rsi),%rsi
1383bc3d5698SJohn Baldwin	adcq	%rcx,%rbp
1384bc3d5698SJohn Baldwin	subq	$16,%r15
1385bc3d5698SJohn Baldwin
1386bc3d5698SJohn Baldwin	call	__poly1305_block
1387bc3d5698SJohn Baldwin	movq	%r12,%rax
1388bc3d5698SJohn Baldwin
1389bc3d5698SJohn Baldwin	testq	$63,%r15
1390bc3d5698SJohn Baldwin	jnz	.Lbase2_26_pre_avx2
1391bc3d5698SJohn Baldwin
1392bc3d5698SJohn Baldwin	testq	%rcx,%rcx
1393bc3d5698SJohn Baldwin	jz	.Lstore_base2_64_avx2
1394bc3d5698SJohn Baldwin
1395bc3d5698SJohn Baldwin
1396bc3d5698SJohn Baldwin	movq	%r14,%rax
1397bc3d5698SJohn Baldwin	movq	%r14,%rdx
1398bc3d5698SJohn Baldwin	shrq	$52,%r14
1399bc3d5698SJohn Baldwin	movq	%rbx,%r11
1400bc3d5698SJohn Baldwin	movq	%rbx,%r12
1401bc3d5698SJohn Baldwin	shrq	$26,%rdx
1402bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rax
1403bc3d5698SJohn Baldwin	shlq	$12,%r11
1404bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rdx
1405bc3d5698SJohn Baldwin	shrq	$14,%rbx
1406bc3d5698SJohn Baldwin	orq	%r11,%r14
1407bc3d5698SJohn Baldwin	shlq	$24,%rbp
1408bc3d5698SJohn Baldwin	andq	$0x3ffffff,%r14
1409bc3d5698SJohn Baldwin	shrq	$40,%r12
1410bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rbx
1411bc3d5698SJohn Baldwin	orq	%r12,%rbp
1412bc3d5698SJohn Baldwin
1413bc3d5698SJohn Baldwin	testq	%r15,%r15
1414bc3d5698SJohn Baldwin	jz	.Lstore_base2_26_avx2
1415bc3d5698SJohn Baldwin
1416bc3d5698SJohn Baldwin	vmovd	%eax,%xmm0
1417bc3d5698SJohn Baldwin	vmovd	%edx,%xmm1
1418bc3d5698SJohn Baldwin	vmovd	%r14d,%xmm2
1419bc3d5698SJohn Baldwin	vmovd	%ebx,%xmm3
1420bc3d5698SJohn Baldwin	vmovd	%ebp,%xmm4
1421bc3d5698SJohn Baldwin	jmp	.Lproceed_avx2
1422bc3d5698SJohn Baldwin
1423bc3d5698SJohn Baldwin.align	32
1424bc3d5698SJohn Baldwin.Lstore_base2_64_avx2:
1425bc3d5698SJohn Baldwin	movq	%r14,0(%rdi)
1426bc3d5698SJohn Baldwin	movq	%rbx,8(%rdi)
1427bc3d5698SJohn Baldwin	movq	%rbp,16(%rdi)
1428bc3d5698SJohn Baldwin	jmp	.Ldone_avx2
1429bc3d5698SJohn Baldwin
1430bc3d5698SJohn Baldwin.align	16
1431bc3d5698SJohn Baldwin.Lstore_base2_26_avx2:
1432bc3d5698SJohn Baldwin	movl	%eax,0(%rdi)
1433bc3d5698SJohn Baldwin	movl	%edx,4(%rdi)
1434bc3d5698SJohn Baldwin	movl	%r14d,8(%rdi)
1435bc3d5698SJohn Baldwin	movl	%ebx,12(%rdi)
1436bc3d5698SJohn Baldwin	movl	%ebp,16(%rdi)
1437bc3d5698SJohn Baldwin.align	16
1438bc3d5698SJohn Baldwin.Ldone_avx2:
1439bc3d5698SJohn Baldwin	movq	0(%rsp),%r15
1440bc3d5698SJohn Baldwin.cfi_restore	%r15
1441bc3d5698SJohn Baldwin	movq	8(%rsp),%r14
1442bc3d5698SJohn Baldwin.cfi_restore	%r14
1443bc3d5698SJohn Baldwin	movq	16(%rsp),%r13
1444bc3d5698SJohn Baldwin.cfi_restore	%r13
1445bc3d5698SJohn Baldwin	movq	24(%rsp),%r12
1446bc3d5698SJohn Baldwin.cfi_restore	%r12
1447bc3d5698SJohn Baldwin	movq	32(%rsp),%rbp
1448bc3d5698SJohn Baldwin.cfi_restore	%rbp
1449bc3d5698SJohn Baldwin	movq	40(%rsp),%rbx
1450bc3d5698SJohn Baldwin.cfi_restore	%rbx
1451bc3d5698SJohn Baldwin	leaq	48(%rsp),%rsp
1452bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	-48
1453bc3d5698SJohn Baldwin.Lno_data_avx2:
1454bc3d5698SJohn Baldwin.Lblocks_avx2_epilogue:
1455bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1456bc3d5698SJohn Baldwin.cfi_endproc
1457bc3d5698SJohn Baldwin
1458bc3d5698SJohn Baldwin.align	32
1459bc3d5698SJohn Baldwin.Lbase2_64_avx2:
1460bc3d5698SJohn Baldwin.cfi_startproc
1461bc3d5698SJohn Baldwin	pushq	%rbx
1462bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1463bc3d5698SJohn Baldwin.cfi_offset	%rbx,-16
1464bc3d5698SJohn Baldwin	pushq	%rbp
1465bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1466bc3d5698SJohn Baldwin.cfi_offset	%rbp,-24
1467bc3d5698SJohn Baldwin	pushq	%r12
1468bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1469bc3d5698SJohn Baldwin.cfi_offset	%r12,-32
1470bc3d5698SJohn Baldwin	pushq	%r13
1471bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1472bc3d5698SJohn Baldwin.cfi_offset	%r13,-40
1473bc3d5698SJohn Baldwin	pushq	%r14
1474bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1475bc3d5698SJohn Baldwin.cfi_offset	%r14,-48
1476bc3d5698SJohn Baldwin	pushq	%r15
1477bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	8
1478bc3d5698SJohn Baldwin.cfi_offset	%r15,-56
1479bc3d5698SJohn Baldwin.Lbase2_64_avx2_body:
1480bc3d5698SJohn Baldwin
1481bc3d5698SJohn Baldwin	movq	%rdx,%r15
1482bc3d5698SJohn Baldwin
1483bc3d5698SJohn Baldwin	movq	24(%rdi),%r11
1484bc3d5698SJohn Baldwin	movq	32(%rdi),%r13
1485bc3d5698SJohn Baldwin
1486bc3d5698SJohn Baldwin	movq	0(%rdi),%r14
1487bc3d5698SJohn Baldwin	movq	8(%rdi),%rbx
1488bc3d5698SJohn Baldwin	movl	16(%rdi),%ebp
1489bc3d5698SJohn Baldwin
1490bc3d5698SJohn Baldwin	movq	%r13,%r12
1491bc3d5698SJohn Baldwin	movq	%r13,%rax
1492bc3d5698SJohn Baldwin	shrq	$2,%r13
1493bc3d5698SJohn Baldwin	addq	%r12,%r13
1494bc3d5698SJohn Baldwin
1495bc3d5698SJohn Baldwin	testq	$63,%rdx
1496bc3d5698SJohn Baldwin	jz	.Linit_avx2
1497bc3d5698SJohn Baldwin
1498bc3d5698SJohn Baldwin.Lbase2_64_pre_avx2:
1499bc3d5698SJohn Baldwin	addq	0(%rsi),%r14
1500bc3d5698SJohn Baldwin	adcq	8(%rsi),%rbx
1501bc3d5698SJohn Baldwin	leaq	16(%rsi),%rsi
1502bc3d5698SJohn Baldwin	adcq	%rcx,%rbp
1503bc3d5698SJohn Baldwin	subq	$16,%r15
1504bc3d5698SJohn Baldwin
1505bc3d5698SJohn Baldwin	call	__poly1305_block
1506bc3d5698SJohn Baldwin	movq	%r12,%rax
1507bc3d5698SJohn Baldwin
1508bc3d5698SJohn Baldwin	testq	$63,%r15
1509bc3d5698SJohn Baldwin	jnz	.Lbase2_64_pre_avx2
1510bc3d5698SJohn Baldwin
1511bc3d5698SJohn Baldwin.Linit_avx2:
1512bc3d5698SJohn Baldwin
1513bc3d5698SJohn Baldwin	movq	%r14,%rax
1514bc3d5698SJohn Baldwin	movq	%r14,%rdx
1515bc3d5698SJohn Baldwin	shrq	$52,%r14
1516bc3d5698SJohn Baldwin	movq	%rbx,%r8
1517bc3d5698SJohn Baldwin	movq	%rbx,%r9
1518bc3d5698SJohn Baldwin	shrq	$26,%rdx
1519bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rax
1520bc3d5698SJohn Baldwin	shlq	$12,%r8
1521bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rdx
1522bc3d5698SJohn Baldwin	shrq	$14,%rbx
1523bc3d5698SJohn Baldwin	orq	%r8,%r14
1524bc3d5698SJohn Baldwin	shlq	$24,%rbp
1525bc3d5698SJohn Baldwin	andq	$0x3ffffff,%r14
1526bc3d5698SJohn Baldwin	shrq	$40,%r9
1527bc3d5698SJohn Baldwin	andq	$0x3ffffff,%rbx
1528bc3d5698SJohn Baldwin	orq	%r9,%rbp
1529bc3d5698SJohn Baldwin
1530bc3d5698SJohn Baldwin	vmovd	%eax,%xmm0
1531bc3d5698SJohn Baldwin	vmovd	%edx,%xmm1
1532bc3d5698SJohn Baldwin	vmovd	%r14d,%xmm2
1533bc3d5698SJohn Baldwin	vmovd	%ebx,%xmm3
1534bc3d5698SJohn Baldwin	vmovd	%ebp,%xmm4
1535bc3d5698SJohn Baldwin	movl	$1,20(%rdi)
1536bc3d5698SJohn Baldwin
1537bc3d5698SJohn Baldwin	call	__poly1305_init_avx
1538bc3d5698SJohn Baldwin
1539bc3d5698SJohn Baldwin.Lproceed_avx2:
1540bc3d5698SJohn Baldwin	movq	%r15,%rdx
1541bc3d5698SJohn Baldwin	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1542bc3d5698SJohn Baldwin	movl	$3221291008,%r11d
1543bc3d5698SJohn Baldwin
1544bc3d5698SJohn Baldwin	movq	0(%rsp),%r15
1545bc3d5698SJohn Baldwin.cfi_restore	%r15
1546bc3d5698SJohn Baldwin	movq	8(%rsp),%r14
1547bc3d5698SJohn Baldwin.cfi_restore	%r14
1548bc3d5698SJohn Baldwin	movq	16(%rsp),%r13
1549bc3d5698SJohn Baldwin.cfi_restore	%r13
1550bc3d5698SJohn Baldwin	movq	24(%rsp),%r12
1551bc3d5698SJohn Baldwin.cfi_restore	%r12
1552bc3d5698SJohn Baldwin	movq	32(%rsp),%rbp
1553bc3d5698SJohn Baldwin.cfi_restore	%rbp
1554bc3d5698SJohn Baldwin	movq	40(%rsp),%rbx
1555bc3d5698SJohn Baldwin.cfi_restore	%rbx
1556bc3d5698SJohn Baldwin	leaq	48(%rsp),%rax
1557bc3d5698SJohn Baldwin	leaq	48(%rsp),%rsp
1558bc3d5698SJohn Baldwin.cfi_adjust_cfa_offset	-48
1559bc3d5698SJohn Baldwin.Lbase2_64_avx2_epilogue:
1560bc3d5698SJohn Baldwin	jmp	.Ldo_avx2
1561bc3d5698SJohn Baldwin.cfi_endproc
1562bc3d5698SJohn Baldwin
1563bc3d5698SJohn Baldwin.align	32
1564bc3d5698SJohn Baldwin.Leven_avx2:
1565bc3d5698SJohn Baldwin.cfi_startproc
1566bc3d5698SJohn Baldwin	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1567bc3d5698SJohn Baldwin	vmovd	0(%rdi),%xmm0
1568bc3d5698SJohn Baldwin	vmovd	4(%rdi),%xmm1
1569bc3d5698SJohn Baldwin	vmovd	8(%rdi),%xmm2
1570bc3d5698SJohn Baldwin	vmovd	12(%rdi),%xmm3
1571bc3d5698SJohn Baldwin	vmovd	16(%rdi),%xmm4
1572bc3d5698SJohn Baldwin
1573bc3d5698SJohn Baldwin.Ldo_avx2:
1574bc3d5698SJohn Baldwin	leaq	-8(%rsp),%r11
1575bc3d5698SJohn Baldwin.cfi_def_cfa	%r11,16
1576bc3d5698SJohn Baldwin	subq	$0x128,%rsp
1577bc3d5698SJohn Baldwin	leaq	.Lconst(%rip),%rcx
1578bc3d5698SJohn Baldwin	leaq	48+64(%rdi),%rdi
1579bc3d5698SJohn Baldwin	vmovdqa	96(%rcx),%ymm7
1580bc3d5698SJohn Baldwin
1581bc3d5698SJohn Baldwin
1582bc3d5698SJohn Baldwin	vmovdqu	-64(%rdi),%xmm9
1583bc3d5698SJohn Baldwin	andq	$-512,%rsp
1584bc3d5698SJohn Baldwin	vmovdqu	-48(%rdi),%xmm10
1585bc3d5698SJohn Baldwin	vmovdqu	-32(%rdi),%xmm6
1586bc3d5698SJohn Baldwin	vmovdqu	-16(%rdi),%xmm11
1587bc3d5698SJohn Baldwin	vmovdqu	0(%rdi),%xmm12
1588bc3d5698SJohn Baldwin	vmovdqu	16(%rdi),%xmm13
1589bc3d5698SJohn Baldwin	leaq	144(%rsp),%rax
1590bc3d5698SJohn Baldwin	vmovdqu	32(%rdi),%xmm14
1591bc3d5698SJohn Baldwin	vpermd	%ymm9,%ymm7,%ymm9
1592bc3d5698SJohn Baldwin	vmovdqu	48(%rdi),%xmm15
1593bc3d5698SJohn Baldwin	vpermd	%ymm10,%ymm7,%ymm10
1594bc3d5698SJohn Baldwin	vmovdqu	64(%rdi),%xmm5
1595bc3d5698SJohn Baldwin	vpermd	%ymm6,%ymm7,%ymm6
1596bc3d5698SJohn Baldwin	vmovdqa	%ymm9,0(%rsp)
1597bc3d5698SJohn Baldwin	vpermd	%ymm11,%ymm7,%ymm11
1598bc3d5698SJohn Baldwin	vmovdqa	%ymm10,32-144(%rax)
1599bc3d5698SJohn Baldwin	vpermd	%ymm12,%ymm7,%ymm12
1600bc3d5698SJohn Baldwin	vmovdqa	%ymm6,64-144(%rax)
1601bc3d5698SJohn Baldwin	vpermd	%ymm13,%ymm7,%ymm13
1602bc3d5698SJohn Baldwin	vmovdqa	%ymm11,96-144(%rax)
1603bc3d5698SJohn Baldwin	vpermd	%ymm14,%ymm7,%ymm14
1604bc3d5698SJohn Baldwin	vmovdqa	%ymm12,128-144(%rax)
1605bc3d5698SJohn Baldwin	vpermd	%ymm15,%ymm7,%ymm15
1606bc3d5698SJohn Baldwin	vmovdqa	%ymm13,160-144(%rax)
1607bc3d5698SJohn Baldwin	vpermd	%ymm5,%ymm7,%ymm5
1608bc3d5698SJohn Baldwin	vmovdqa	%ymm14,192-144(%rax)
1609bc3d5698SJohn Baldwin	vmovdqa	%ymm15,224-144(%rax)
1610bc3d5698SJohn Baldwin	vmovdqa	%ymm5,256-144(%rax)
1611bc3d5698SJohn Baldwin	vmovdqa	64(%rcx),%ymm5
1612bc3d5698SJohn Baldwin
1613bc3d5698SJohn Baldwin
1614bc3d5698SJohn Baldwin
1615bc3d5698SJohn Baldwin	vmovdqu	0(%rsi),%xmm7
1616bc3d5698SJohn Baldwin	vmovdqu	16(%rsi),%xmm8
1617bc3d5698SJohn Baldwin	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1618bc3d5698SJohn Baldwin	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1619bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
1620bc3d5698SJohn Baldwin
1621bc3d5698SJohn Baldwin	vpsrldq	$6,%ymm7,%ymm9
1622bc3d5698SJohn Baldwin	vpsrldq	$6,%ymm8,%ymm10
1623bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1624bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm10,%ymm9,%ymm9
1625bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1626bc3d5698SJohn Baldwin
1627bc3d5698SJohn Baldwin	vpsrlq	$30,%ymm9,%ymm10
1628bc3d5698SJohn Baldwin	vpsrlq	$4,%ymm9,%ymm9
1629bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm7,%ymm8
1630bc3d5698SJohn Baldwin	vpsrlq	$40,%ymm6,%ymm6
1631bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm9,%ymm9
1632bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm7,%ymm7
1633bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm8,%ymm8
1634bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm10,%ymm10
1635bc3d5698SJohn Baldwin	vpor	32(%rcx),%ymm6,%ymm6
1636bc3d5698SJohn Baldwin
1637bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm9,%ymm2
1638bc3d5698SJohn Baldwin	subq	$64,%rdx
1639bc3d5698SJohn Baldwin	jz	.Ltail_avx2
1640bc3d5698SJohn Baldwin	jmp	.Loop_avx2
1641bc3d5698SJohn Baldwin
1642bc3d5698SJohn Baldwin.align	32
1643bc3d5698SJohn Baldwin.Loop_avx2:
1644bc3d5698SJohn Baldwin
1645bc3d5698SJohn Baldwin
1646bc3d5698SJohn Baldwin
1647bc3d5698SJohn Baldwin
1648bc3d5698SJohn Baldwin
1649bc3d5698SJohn Baldwin
1650bc3d5698SJohn Baldwin
1651bc3d5698SJohn Baldwin
1652bc3d5698SJohn Baldwin	vpaddq	%ymm0,%ymm7,%ymm0
1653bc3d5698SJohn Baldwin	vmovdqa	0(%rsp),%ymm7
1654bc3d5698SJohn Baldwin	vpaddq	%ymm1,%ymm8,%ymm1
1655bc3d5698SJohn Baldwin	vmovdqa	32(%rsp),%ymm8
1656bc3d5698SJohn Baldwin	vpaddq	%ymm3,%ymm10,%ymm3
1657bc3d5698SJohn Baldwin	vmovdqa	96(%rsp),%ymm9
1658bc3d5698SJohn Baldwin	vpaddq	%ymm4,%ymm6,%ymm4
1659bc3d5698SJohn Baldwin	vmovdqa	48(%rax),%ymm10
1660bc3d5698SJohn Baldwin	vmovdqa	112(%rax),%ymm5
1661bc3d5698SJohn Baldwin
1662bc3d5698SJohn Baldwin
1663bc3d5698SJohn Baldwin
1664bc3d5698SJohn Baldwin
1665bc3d5698SJohn Baldwin
1666bc3d5698SJohn Baldwin
1667bc3d5698SJohn Baldwin
1668bc3d5698SJohn Baldwin
1669bc3d5698SJohn Baldwin
1670bc3d5698SJohn Baldwin
1671bc3d5698SJohn Baldwin
1672bc3d5698SJohn Baldwin
1673bc3d5698SJohn Baldwin
1674bc3d5698SJohn Baldwin
1675bc3d5698SJohn Baldwin
1676bc3d5698SJohn Baldwin
1677bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm7,%ymm13
1678bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm8,%ymm14
1679bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm9,%ymm15
1680bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm10,%ymm11
1681bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm5,%ymm12
1682bc3d5698SJohn Baldwin
1683bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm8,%ymm6
1684bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm8,%ymm2
1685bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm12,%ymm12
1686bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm13,%ymm13
1687bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm8,%ymm6
1688bc3d5698SJohn Baldwin	vpmuludq	64(%rsp),%ymm4,%ymm2
1689bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm15,%ymm15
1690bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm11,%ymm11
1691bc3d5698SJohn Baldwin	vmovdqa	-16(%rax),%ymm8
1692bc3d5698SJohn Baldwin
1693bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm7,%ymm6
1694bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm7,%ymm2
1695bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm11,%ymm11
1696bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm12,%ymm12
1697bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm7,%ymm6
1698bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm7,%ymm2
1699bc3d5698SJohn Baldwin	vmovdqu	0(%rsi),%xmm7
1700bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm14,%ymm14
1701bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm15,%ymm15
1702bc3d5698SJohn Baldwin	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1703bc3d5698SJohn Baldwin
1704bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm8,%ymm6
1705bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm8,%ymm2
1706bc3d5698SJohn Baldwin	vmovdqu	16(%rsi),%xmm8
1707bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm11,%ymm11
1708bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm12,%ymm12
1709bc3d5698SJohn Baldwin	vmovdqa	16(%rax),%ymm2
1710bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm9,%ymm6
1711bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm9,%ymm9
1712bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm14,%ymm14
1713bc3d5698SJohn Baldwin	vpaddq	%ymm9,%ymm13,%ymm13
1714bc3d5698SJohn Baldwin	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1715bc3d5698SJohn Baldwin	leaq	64(%rsi),%rsi
1716bc3d5698SJohn Baldwin
1717bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm2,%ymm6
1718bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm2,%ymm2
1719bc3d5698SJohn Baldwin	vpsrldq	$6,%ymm7,%ymm9
1720bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm15,%ymm15
1721bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm14,%ymm14
1722bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm10,%ymm6
1723bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm10,%ymm2
1724bc3d5698SJohn Baldwin	vpsrldq	$6,%ymm8,%ymm10
1725bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm12,%ymm12
1726bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm13,%ymm13
1727bc3d5698SJohn Baldwin	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1728bc3d5698SJohn Baldwin
1729bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm5,%ymm3
1730bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm5,%ymm4
1731bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1732bc3d5698SJohn Baldwin	vpaddq	%ymm3,%ymm13,%ymm2
1733bc3d5698SJohn Baldwin	vpaddq	%ymm4,%ymm14,%ymm3
1734bc3d5698SJohn Baldwin	vpunpcklqdq	%ymm10,%ymm9,%ymm10
1735bc3d5698SJohn Baldwin	vpmuludq	80(%rax),%ymm0,%ymm4
1736bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm5,%ymm0
1737bc3d5698SJohn Baldwin	vmovdqa	64(%rcx),%ymm5
1738bc3d5698SJohn Baldwin	vpaddq	%ymm4,%ymm15,%ymm4
1739bc3d5698SJohn Baldwin	vpaddq	%ymm0,%ymm11,%ymm0
1740bc3d5698SJohn Baldwin
1741bc3d5698SJohn Baldwin
1742bc3d5698SJohn Baldwin
1743bc3d5698SJohn Baldwin
1744bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm3,%ymm14
1745bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm3,%ymm3
1746bc3d5698SJohn Baldwin	vpaddq	%ymm14,%ymm4,%ymm4
1747bc3d5698SJohn Baldwin
1748bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm0,%ymm11
1749bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm0,%ymm0
1750bc3d5698SJohn Baldwin	vpaddq	%ymm11,%ymm12,%ymm1
1751bc3d5698SJohn Baldwin
1752bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm4,%ymm15
1753bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm4,%ymm4
1754bc3d5698SJohn Baldwin
1755bc3d5698SJohn Baldwin	vpsrlq	$4,%ymm10,%ymm9
1756bc3d5698SJohn Baldwin
1757bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm1,%ymm12
1758bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm1,%ymm1
1759bc3d5698SJohn Baldwin	vpaddq	%ymm12,%ymm2,%ymm2
1760bc3d5698SJohn Baldwin
1761bc3d5698SJohn Baldwin	vpaddq	%ymm15,%ymm0,%ymm0
1762bc3d5698SJohn Baldwin	vpsllq	$2,%ymm15,%ymm15
1763bc3d5698SJohn Baldwin	vpaddq	%ymm15,%ymm0,%ymm0
1764bc3d5698SJohn Baldwin
1765bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm9,%ymm9
1766bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm7,%ymm8
1767bc3d5698SJohn Baldwin
1768bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm2,%ymm13
1769bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm2,%ymm2
1770bc3d5698SJohn Baldwin	vpaddq	%ymm13,%ymm3,%ymm3
1771bc3d5698SJohn Baldwin
1772bc3d5698SJohn Baldwin	vpaddq	%ymm9,%ymm2,%ymm2
1773bc3d5698SJohn Baldwin	vpsrlq	$30,%ymm10,%ymm10
1774bc3d5698SJohn Baldwin
1775bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm0,%ymm11
1776bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm0,%ymm0
1777bc3d5698SJohn Baldwin	vpaddq	%ymm11,%ymm1,%ymm1
1778bc3d5698SJohn Baldwin
1779bc3d5698SJohn Baldwin	vpsrlq	$40,%ymm6,%ymm6
1780bc3d5698SJohn Baldwin
1781bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm3,%ymm14
1782bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm3,%ymm3
1783bc3d5698SJohn Baldwin	vpaddq	%ymm14,%ymm4,%ymm4
1784bc3d5698SJohn Baldwin
1785bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm7,%ymm7
1786bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm8,%ymm8
1787bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm10,%ymm10
1788bc3d5698SJohn Baldwin	vpor	32(%rcx),%ymm6,%ymm6
1789bc3d5698SJohn Baldwin
1790bc3d5698SJohn Baldwin	subq	$64,%rdx
1791bc3d5698SJohn Baldwin	jnz	.Loop_avx2
1792bc3d5698SJohn Baldwin
1793bc3d5698SJohn Baldwin.byte	0x66,0x90
1794bc3d5698SJohn Baldwin.Ltail_avx2:
1795bc3d5698SJohn Baldwin
1796bc3d5698SJohn Baldwin
1797bc3d5698SJohn Baldwin
1798bc3d5698SJohn Baldwin
1799bc3d5698SJohn Baldwin
1800bc3d5698SJohn Baldwin
1801bc3d5698SJohn Baldwin
1802bc3d5698SJohn Baldwin	vpaddq	%ymm0,%ymm7,%ymm0
1803bc3d5698SJohn Baldwin	vmovdqu	4(%rsp),%ymm7
1804bc3d5698SJohn Baldwin	vpaddq	%ymm1,%ymm8,%ymm1
1805bc3d5698SJohn Baldwin	vmovdqu	36(%rsp),%ymm8
1806bc3d5698SJohn Baldwin	vpaddq	%ymm3,%ymm10,%ymm3
1807bc3d5698SJohn Baldwin	vmovdqu	100(%rsp),%ymm9
1808bc3d5698SJohn Baldwin	vpaddq	%ymm4,%ymm6,%ymm4
1809bc3d5698SJohn Baldwin	vmovdqu	52(%rax),%ymm10
1810bc3d5698SJohn Baldwin	vmovdqu	116(%rax),%ymm5
1811bc3d5698SJohn Baldwin
1812bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm7,%ymm13
1813bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm8,%ymm14
1814bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm9,%ymm15
1815bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm10,%ymm11
1816bc3d5698SJohn Baldwin	vpmuludq	%ymm2,%ymm5,%ymm12
1817bc3d5698SJohn Baldwin
1818bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm8,%ymm6
1819bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm8,%ymm2
1820bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm12,%ymm12
1821bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm13,%ymm13
1822bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm8,%ymm6
1823bc3d5698SJohn Baldwin	vpmuludq	68(%rsp),%ymm4,%ymm2
1824bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm15,%ymm15
1825bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm11,%ymm11
1826bc3d5698SJohn Baldwin
1827bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm7,%ymm6
1828bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm7,%ymm2
1829bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm11,%ymm11
1830bc3d5698SJohn Baldwin	vmovdqu	-12(%rax),%ymm8
1831bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm12,%ymm12
1832bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm7,%ymm6
1833bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm7,%ymm2
1834bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm14,%ymm14
1835bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm15,%ymm15
1836bc3d5698SJohn Baldwin
1837bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm8,%ymm6
1838bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm8,%ymm2
1839bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm11,%ymm11
1840bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm12,%ymm12
1841bc3d5698SJohn Baldwin	vmovdqu	20(%rax),%ymm2
1842bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm9,%ymm6
1843bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm9,%ymm9
1844bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm14,%ymm14
1845bc3d5698SJohn Baldwin	vpaddq	%ymm9,%ymm13,%ymm13
1846bc3d5698SJohn Baldwin
1847bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm2,%ymm6
1848bc3d5698SJohn Baldwin	vpmuludq	%ymm0,%ymm2,%ymm2
1849bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm15,%ymm15
1850bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm14,%ymm14
1851bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm10,%ymm6
1852bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm10,%ymm2
1853bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm12,%ymm12
1854bc3d5698SJohn Baldwin	vpaddq	%ymm2,%ymm13,%ymm13
1855bc3d5698SJohn Baldwin
1856bc3d5698SJohn Baldwin	vpmuludq	%ymm3,%ymm5,%ymm3
1857bc3d5698SJohn Baldwin	vpmuludq	%ymm4,%ymm5,%ymm4
1858bc3d5698SJohn Baldwin	vpaddq	%ymm3,%ymm13,%ymm2
1859bc3d5698SJohn Baldwin	vpaddq	%ymm4,%ymm14,%ymm3
1860bc3d5698SJohn Baldwin	vpmuludq	84(%rax),%ymm0,%ymm4
1861bc3d5698SJohn Baldwin	vpmuludq	%ymm1,%ymm5,%ymm0
1862bc3d5698SJohn Baldwin	vmovdqa	64(%rcx),%ymm5
1863bc3d5698SJohn Baldwin	vpaddq	%ymm4,%ymm15,%ymm4
1864bc3d5698SJohn Baldwin	vpaddq	%ymm0,%ymm11,%ymm0
1865bc3d5698SJohn Baldwin
1866bc3d5698SJohn Baldwin
1867bc3d5698SJohn Baldwin
1868bc3d5698SJohn Baldwin
1869bc3d5698SJohn Baldwin	vpsrldq	$8,%ymm12,%ymm8
1870bc3d5698SJohn Baldwin	vpsrldq	$8,%ymm2,%ymm9
1871bc3d5698SJohn Baldwin	vpsrldq	$8,%ymm3,%ymm10
1872bc3d5698SJohn Baldwin	vpsrldq	$8,%ymm4,%ymm6
1873bc3d5698SJohn Baldwin	vpsrldq	$8,%ymm0,%ymm7
1874bc3d5698SJohn Baldwin	vpaddq	%ymm8,%ymm12,%ymm12
1875bc3d5698SJohn Baldwin	vpaddq	%ymm9,%ymm2,%ymm2
1876bc3d5698SJohn Baldwin	vpaddq	%ymm10,%ymm3,%ymm3
1877bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm4,%ymm4
1878bc3d5698SJohn Baldwin	vpaddq	%ymm7,%ymm0,%ymm0
1879bc3d5698SJohn Baldwin
1880bc3d5698SJohn Baldwin	vpermq	$0x2,%ymm3,%ymm10
1881bc3d5698SJohn Baldwin	vpermq	$0x2,%ymm4,%ymm6
1882bc3d5698SJohn Baldwin	vpermq	$0x2,%ymm0,%ymm7
1883bc3d5698SJohn Baldwin	vpermq	$0x2,%ymm12,%ymm8
1884bc3d5698SJohn Baldwin	vpermq	$0x2,%ymm2,%ymm9
1885bc3d5698SJohn Baldwin	vpaddq	%ymm10,%ymm3,%ymm3
1886bc3d5698SJohn Baldwin	vpaddq	%ymm6,%ymm4,%ymm4
1887bc3d5698SJohn Baldwin	vpaddq	%ymm7,%ymm0,%ymm0
1888bc3d5698SJohn Baldwin	vpaddq	%ymm8,%ymm12,%ymm12
1889bc3d5698SJohn Baldwin	vpaddq	%ymm9,%ymm2,%ymm2
1890bc3d5698SJohn Baldwin
1891bc3d5698SJohn Baldwin
1892bc3d5698SJohn Baldwin
1893bc3d5698SJohn Baldwin
1894bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm3,%ymm14
1895bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm3,%ymm3
1896bc3d5698SJohn Baldwin	vpaddq	%ymm14,%ymm4,%ymm4
1897bc3d5698SJohn Baldwin
1898bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm0,%ymm11
1899bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm0,%ymm0
1900bc3d5698SJohn Baldwin	vpaddq	%ymm11,%ymm12,%ymm1
1901bc3d5698SJohn Baldwin
1902bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm4,%ymm15
1903bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm4,%ymm4
1904bc3d5698SJohn Baldwin
1905bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm1,%ymm12
1906bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm1,%ymm1
1907bc3d5698SJohn Baldwin	vpaddq	%ymm12,%ymm2,%ymm2
1908bc3d5698SJohn Baldwin
1909bc3d5698SJohn Baldwin	vpaddq	%ymm15,%ymm0,%ymm0
1910bc3d5698SJohn Baldwin	vpsllq	$2,%ymm15,%ymm15
1911bc3d5698SJohn Baldwin	vpaddq	%ymm15,%ymm0,%ymm0
1912bc3d5698SJohn Baldwin
1913bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm2,%ymm13
1914bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm2,%ymm2
1915bc3d5698SJohn Baldwin	vpaddq	%ymm13,%ymm3,%ymm3
1916bc3d5698SJohn Baldwin
1917bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm0,%ymm11
1918bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm0,%ymm0
1919bc3d5698SJohn Baldwin	vpaddq	%ymm11,%ymm1,%ymm1
1920bc3d5698SJohn Baldwin
1921bc3d5698SJohn Baldwin	vpsrlq	$26,%ymm3,%ymm14
1922bc3d5698SJohn Baldwin	vpand	%ymm5,%ymm3,%ymm3
1923bc3d5698SJohn Baldwin	vpaddq	%ymm14,%ymm4,%ymm4
1924bc3d5698SJohn Baldwin
1925bc3d5698SJohn Baldwin	vmovd	%xmm0,-112(%rdi)
1926bc3d5698SJohn Baldwin	vmovd	%xmm1,-108(%rdi)
1927bc3d5698SJohn Baldwin	vmovd	%xmm2,-104(%rdi)
1928bc3d5698SJohn Baldwin	vmovd	%xmm3,-100(%rdi)
1929bc3d5698SJohn Baldwin	vmovd	%xmm4,-96(%rdi)
1930bc3d5698SJohn Baldwin	leaq	8(%r11),%rsp
1931bc3d5698SJohn Baldwin.cfi_def_cfa	%rsp,8
1932bc3d5698SJohn Baldwin	vzeroupper
1933bc3d5698SJohn Baldwin	.byte	0xf3,0xc3
1934bc3d5698SJohn Baldwin.cfi_endproc
1935bc3d5698SJohn Baldwin.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
/*
 * Constant pool for the vector code in this file.
 *
 * The AVX2 code above loads the address of .Lconst into %rcx and then
 * reads fixed displacements from it: 32(%rcx) is .L129, 64(%rcx) is
 * .Lmask26 and 96(%rcx) is .Lpermd_avx2.  Each of the first four tables
 * is 8 x 4 = 32 bytes, so their order and size must not change.  The
 * pool starts on a 64-byte boundary, which keeps every one of those
 * 32-byte tables suitably aligned for the vmovdqa loads that read them.
 */
1936bc3d5698SJohn Baldwin.align	64
1937bc3d5698SJohn Baldwin.Lconst:
/* Low 24 bits set in each of four 64-bit lanes. */
1938bc3d5698SJohn Baldwin.Lmask24:
1939bc3d5698SJohn Baldwin.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
/*
 * 16777216 = 1 << 24 in each 64-bit lane.  The AVX2 code ORs this into
 * the lanes holding the high input quadword shifted right by 40, i.e.
 * into the bit position just above the top 24 bits of each 16-byte
 * input block.
 */
1940bc3d5698SJohn Baldwin.L129:
1941bc3d5698SJohn Baldwin.long	16777216,0,16777216,0,16777216,0,16777216,0
/* Low 26 bits set in each 64-bit lane; the AVX2 code keeps it in %ymm5
   and uses it with vpand after every 26-bit shift. */
1942bc3d5698SJohn Baldwin.Lmask26:
1943bc3d5698SJohn Baldwin.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
/* Dword index vector handed to vpermd when the AVX2 code spreads the
   table it reads from the context onto the stack. */
1944bc3d5698SJohn Baldwin.Lpermd_avx2:
1945bc3d5698SJohn Baldwin.long	2,2,2,3,2,0,2,1
/* 16 dword indices.  NOTE(review): no user is visible in this part of
   the file; by name presumably a vpermd index vector for an AVX-512
   path - confirm. */
1946bc3d5698SJohn Baldwin.Lpermd_avx512:
1947bc3d5698SJohn Baldwin.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1948bc3d5698SJohn Baldwin
/*
 * .L2_44_* tables: permutation indices, per-lane shift counts and
 * per-lane masks (44, 44 and 42 bits, then all ones).
 * NOTE(review): no user is visible in this part of the file; the names
 * suggest a code path working in 44/44/42-bit limbs - confirm.
 */
1949bc3d5698SJohn Baldwin.L2_44_inp_permd:
1950bc3d5698SJohn Baldwin.long	0,1,1,2,2,3,7,7
1951bc3d5698SJohn Baldwin.L2_44_inp_shift:
1952bc3d5698SJohn Baldwin.quad	0,12,24,64
1953bc3d5698SJohn Baldwin.L2_44_mask:
1954bc3d5698SJohn Baldwin.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1955bc3d5698SJohn Baldwin.L2_44_shift_rgt:
1956bc3d5698SJohn Baldwin.quad	44,44,42,64
1957bc3d5698SJohn Baldwin.L2_44_shift_lft:
1958bc3d5698SJohn Baldwin.quad	8,8,10,64
1959bc3d5698SJohn Baldwin
/* Eight 64-bit lanes each: a 44-bit mask (2^44 - 1) and a 42-bit mask
   (2^42 - 1), both starting on a 64-byte boundary. */
1960bc3d5698SJohn Baldwin.align	64
1961bc3d5698SJohn Baldwin.Lx_mask44:
1962bc3d5698SJohn Baldwin.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1963bc3d5698SJohn Baldwin.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1964bc3d5698SJohn Baldwin.Lx_mask42:
1965bc3d5698SJohn Baldwin.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1966bc3d5698SJohn Baldwin.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
/* NUL-terminated ASCII identification string; it begins
   "Poly1305 for x86_64, CRYPTOGAMS by" followed by a contact address. */
1967bc3d5698SJohn Baldwin.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
/* Re-align after the odd-length string before the next code follows. */
1968bc3d5698SJohn Baldwin.align	16
/*
 * xor128_encrypt_n_pad
 *
 * Register arguments (System V AMD64 order), named here for reference:
 *   %rdi  out  destination buffer, any alignment (movdqu / movb stores)
 *   %rsi  inp  source buffer, any alignment (movdqu / movb loads)
 *   %rdx  otp  buffer holding the XOR pad; must be 16-byte aligned,
 *              because it is read as a pxor memory operand and written
 *              with movdqa
 *   %rcx  len  number of bytes to process
 *
 * For every i < len: out[i] = inp[i] ^ otp[i], and otp[i] is then
 * overwritten with that same result byte.  When len is not a multiple
 * of 16, the rest of the last 16-byte otp block is filled with zeros.
 *
 * Returns in %rax the otp pointer advanced past the last byte written,
 * i.e. otp + len rounded up to a multiple of 16.
 *
 * Modifies %rax, %rcx, %rdx, %rsi, %rdi, %r10, %xmm0 and the flags.
 * Uses no stack.
 *
 * len == 0 returns otp unchanged and touches no memory.  This needs the
 * explicit test below: without it the byte loop would be entered with a
 * zero counter, and decq would wrap it around to 2^64 - 1 iterations.
 */
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		/* rsi = inp - otp: (%rsi,%rdx) follows inp */
	subq	%rdx,%rdi		/* rdi = out - otp: (%rdi,%rdx) follows out */
	movq	%rcx,%r10		/* keep the full byte count for the tail */
	testq	%rcx,%rcx
	jz	.Ldone_enc		/* len == 0: nothing to do */
	shrq	$4,%rcx			/* rcx = number of whole 16-byte blocks */
	jz	.Ltail_enc		/* fewer than 16 bytes: byte loop only */
/* Keep the vector loop on a 16-byte boundary.  This replaces the
   original one-byte nop, which appears to have served the same purpose
   before the zero-length check was inserted. */
.align	16
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0	/* 16 input bytes */
	pxor	(%rdx),%xmm0		/* XOR with the pad */
	movdqu	%xmm0,(%rdi,%rdx,1)	/* result to out */
	movdqa	%xmm0,(%rdx)		/* result also replaces the pad */
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10		/* r10 = len % 16 */
	jz	.Ldone_enc		/* whole blocks only: no tail, no padding */

.Ltail_enc:
	/* Here 1 <= r10 <= 15, so 1 <= rcx <= 15 pad bytes follow. */
	movq	$16,%rcx
	subq	%r10,%rcx		/* rcx = bytes of zero padding needed */
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al
	xorb	(%rdx),%al
	movb	%al,(%rdi,%rdx,1)	/* result to out */
	movb	%al,(%rdx)		/* result also replaces the pad */
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax		/* al = 0 for the padding stores */
.Loop_enc_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax		/* return the advanced otp pointer */
	.byte	0xf3,0xc3		/* repz ret */
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
2017bc3d5698SJohn Baldwin
/*
 * xor128_decrypt_n_pad
 *
 * Register arguments (System V AMD64 order), named here for reference:
 *   %rdi  out  destination buffer, any alignment (movdqu / movb stores)
 *   %rsi  inp  source buffer, any alignment (movdqu / movb loads)
 *   %rdx  otp  buffer holding the XOR pad; must be 16-byte aligned,
 *              because it is read and written with movdqa
 *   %rcx  len  number of bytes to process
 *
 * For every i < len: out[i] = inp[i] ^ otp[i], and otp[i] is then
 * overwritten with the unmodified input byte inp[i].  When len is not a
 * multiple of 16, the rest of the last 16-byte otp block is filled with
 * zeros.
 *
 * Returns in %rax the otp pointer advanced past the last byte written,
 * i.e. otp + len rounded up to a multiple of 16.
 *
 * Modifies %rax, %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0, %xmm1 and
 * the flags.  Uses no stack.
 *
 * len == 0 returns otp unchanged and touches no memory.  This needs the
 * explicit test below: without it the byte loop would be entered with a
 * zero counter, and decq would wrap it around to 2^64 - 1 iterations.
 */
.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		/* rsi = inp - otp: (%rsi,%rdx) follows inp */
	subq	%rdx,%rdi		/* rdi = out - otp: (%rdi,%rdx) follows out */
	movq	%rcx,%r10		/* keep the full byte count for the tail */
	testq	%rcx,%rcx
	jz	.Ldone_dec		/* len == 0: nothing to do */
	shrq	$4,%rcx			/* rcx = number of whole 16-byte blocks */
	jz	.Ltail_dec		/* fewer than 16 bytes: byte loop only */
/* Keep the vector loop on a 16-byte boundary.  This replaces the
   original one-byte nop, which appears to have served the same purpose
   before the zero-length check was inserted. */
.align	16
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0	/* 16 input bytes */
	movdqa	(%rdx),%xmm1		/* 16 pad bytes */
	pxor	%xmm0,%xmm1		/* xmm1 = input ^ pad */
	movdqu	%xmm1,(%rdi,%rdx,1)	/* result to out */
	movdqa	%xmm0,(%rdx)		/* the input replaces the pad */
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1		/* clear the last result block from xmm1;
					   presumably so no output data is left
					   behind in the register - confirm */
	andq	$15,%r10		/* r10 = len % 16 */
	jz	.Ldone_dec		/* whole blocks only: no tail, no padding */

.Ltail_dec:
	/* Here 1 <= r10 <= 15, so 1 <= rcx <= 15 pad bytes follow. */
	movq	$16,%rcx
	subq	%r10,%rcx		/* rcx = bytes of zero padding needed */
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b	/* input byte */
	movb	(%rdx),%al		/* pad byte */
	xorb	%r11b,%al
	movb	%al,(%rdi,%rdx,1)	/* result to out */
	movb	%r11b,(%rdx)		/* the input replaces the pad */
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax		/* al = 0 for the padding stores */
.Loop_dec_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax		/* return the advanced otp pointer */
	.byte	0xf3,0xc3		/* repz ret */
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
/*
 * ELF note placed in the allocated, read-only section
 * ".note.gnu.property".  Layout: three 32-bit header words (name size,
 * descriptor size, note type), the name bytes, then the descriptor,
 * with the pieces aligned to 8 bytes.
 *
 * NOTE(review): type 5, property 0xc0000002 and value 3 match
 * NT_GNU_PROPERTY_TYPE_0, GNU_PROPERTY_X86_FEATURE_1_AND and IBT|SHSTK
 * in the x86-64 psABI - confirm against the psABI.  If that reading is
 * right, entry points reached through indirect branches need an endbr64
 * landing pad; none is visible at xor128_encrypt_n_pad,
 * xor128_decrypt_n_pad or poly1305_init in this file.
 */
2070*c0855eaaSJohn Baldwin	.section ".note.gnu.property", "a"
2071*c0855eaaSJohn Baldwin	.p2align 3
/* name size: the 4 bytes between labels 0 and 1 */
2072*c0855eaaSJohn Baldwin	.long 1f - 0f
/* descriptor size: everything between labels 1 and 4 */
2073*c0855eaaSJohn Baldwin	.long 4f - 1f
/* note type */
2074*c0855eaaSJohn Baldwin	.long 5
2075*c0855eaaSJohn Baldwin0:
2076*c0855eaaSJohn Baldwin	# "GNU" encoded with .byte, since .asciz isn't supported
2077*c0855eaaSJohn Baldwin	# on Solaris.
2078*c0855eaaSJohn Baldwin	.byte 0x47
2079*c0855eaaSJohn Baldwin	.byte 0x4e
2080*c0855eaaSJohn Baldwin	.byte 0x55
2081*c0855eaaSJohn Baldwin	.byte 0
2082*c0855eaaSJohn Baldwin1:
2083*c0855eaaSJohn Baldwin	.p2align 3
/* descriptor: property type, then the size of its data (labels 2 to 3) */
2084*c0855eaaSJohn Baldwin	.long 0xc0000002
2085*c0855eaaSJohn Baldwin	.long 3f - 2f
2086*c0855eaaSJohn Baldwin2:
/* property data: a single 32-bit word with bits 0 and 1 set */
2087*c0855eaaSJohn Baldwin	.long 3
2088*c0855eaaSJohn Baldwin3:
/* pad the descriptor to an 8-byte boundary; label 4 marks its end */
2089*c0855eaaSJohn Baldwin	.p2align 3
2090*c0855eaaSJohn Baldwin4:
2091