xref: /freebsd/sys/crypto/openssl/i386/poly1305-x86.S (revision 18f21f0355481283ceef0ec10e99554f44c205c2)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
3#ifdef PIC
4.text
5.align	64
/*
 * poly1305_init(arg0, arg1, arg2)
 *
 * Stack arguments (read after the four register pushes below):
 *   arg0 -> %edi : state block.  Dwords at offsets 0..20 are zeroed here and
 *                  the masked key words are written to offsets 24..36.
 *   arg1 -> %esi : pointer to at least 16 bytes of key; may be NULL.
 *   arg2 -> %ebp : two-dword table that receives the addresses of the block
 *                  routine ([0]) and the emit routine ([1]) chosen below.
 *
 * Returns %eax = 0 when arg1 is NULL (only the zeroing is performed and arg2
 * is left untouched), otherwise %eax = 1.
 * %ebp, %ebx, %esi and %edi are saved and restored; %ecx and %edx are
 * clobbered.
 */
6.globl	poly1305_init
7.type	poly1305_init,@function
8.align	16
9poly1305_init:
10.L_poly1305_init_begin:
11	pushl	%ebp
12	pushl	%ebx
13	pushl	%esi
14	pushl	%edi
	/* Four pushes plus the return address: the first argument is at 20(%esp). */
15	movl	20(%esp),%edi
16	movl	24(%esp),%esi
17	movl	28(%esp),%ebp
	/* Clear the first six dwords of the state; %eax stays 0 for the NULL-key return. */
18	xorl	%eax,%eax
19	movl	%eax,(%edi)
20	movl	%eax,4(%edi)
21	movl	%eax,8(%edi)
22	movl	%eax,12(%edi)
23	movl	%eax,16(%edi)
24	movl	%eax,20(%edi)
25	cmpl	$0,%esi
26	je	.L000nokey
	/*
	 * call/pop leaves the run-time address of .L001pic_point in %ebx so the
	 * symbol addresses below can be formed without absolute relocations.
	 */
27	call	.L001pic_point
28.L001pic_point:
29	popl	%ebx
	/* Default choice: the integer-only routines. */
30	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
31	leal	poly1305_emit-.L001pic_point(%ebx),%edx
32	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	/*
	 * 83886080 == 0x05000000: bits 24 and 26 of the first capability dword must
	 * both be set to leave the scalar defaults.
	 * NOTE(review): presumably the FXSR and SSE2 CPUID bits -- confirm against
	 * the OPENSSL_ia32cap_P layout.
	 */
33	movl	(%edi),%ecx
34	andl	$83886080,%ecx
35	cmpl	$83886080,%ecx
36	jne	.L002no_sse2
37	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
38	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	/*
	 * Bit 5 (value 32) of the capability dword at offset 8 additionally selects
	 * the AVX2 block routine; the emit routine stays _poly1305_emit_sse2.
	 * NOTE(review): presumably the AVX2 feature bit -- confirm.
	 */
39	movl	8(%edi),%ecx
40	testl	$32,%ecx
41	jz	.L002no_sse2
42	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
43.L002no_sse2:
	/* %edi was reused for the capability vector; reload the state pointer. */
44	movl	20(%esp),%edi
	/* Publish the selected routines through arg2. */
45	movl	%eax,(%ebp)
46	movl	%edx,4(%ebp)
	/*
	 * Load the first 16 key bytes as four dwords and mask them:
	 * 268435455 == 0x0fffffff for word 0, 268435452 == 0x0ffffffc for words 1..3.
	 */
47	movl	(%esi),%eax
48	movl	4(%esi),%ebx
49	movl	8(%esi),%ecx
50	movl	12(%esi),%edx
51	andl	$268435455,%eax
52	andl	$268435452,%ebx
53	andl	$268435452,%ecx
54	andl	$268435452,%edx
55	movl	%eax,24(%edi)
56	movl	%ebx,28(%edi)
57	movl	%ecx,32(%edi)
58	movl	%edx,36(%edi)
	/* Return value 1: routines were written to arg2. */
59	movl	$1,%eax
60.L000nokey:
61	popl	%edi
62	popl	%esi
63	popl	%ebx
64	popl	%ebp
65	ret
66.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks(arg0, arg1, arg2, arg3) -- integer-only block routine.
 *
 * Stack arguments:
 *   arg0 -> %edi : state block.  Offsets 0..16 hold the five accumulator
 *                  dwords h0..h4, offsets 24..36 the four key dwords r0..r3.
 *   arg1 -> %esi : input pointer.
 *   arg2 -> %ecx : input length in bytes.
 *   arg3         : dword that is added (with carry) into h4 for every block;
 *                  it is read as 96(%esp) once the local frame is allocated.
 *
 * Each loop iteration consumes 16 input bytes: h += block, h *= r, followed
 * by a partial reduction that folds everything above bit 1 of the top word
 * back into h0 multiplied by 5.
 *
 * .Lenter_blocks is also a jump target for _poly1305_blocks_sse2, which
 * arrives with the same four registers pushed and %edi/%esi/%ecx loaded.
 *
 * Local frame after "subl $64,%esp":
 *    0(%esp), 12(%esp), 16(%esp) : h0, h3, h4 of the current block
 *   20(%esp), 24(%esp), 28(%esp) : low three result words d0, d1, d2
 *   36..48(%esp)                 : r0, r1, r2, r3
 *   52..60(%esp)                 : s1, s2, s3 with s_i = r_i + (r_i >> 2)
 *   84(%esp)                     : arg0 (state pointer)
 *   92(%esp)                     : arg2 slot, overwritten with the end pointer
 *   96(%esp)                     : arg3
 */
67.globl	poly1305_blocks
68.type	poly1305_blocks,@function
69.align	16
70poly1305_blocks:
71.L_poly1305_blocks_begin:
72	pushl	%ebp
73	pushl	%ebx
74	pushl	%esi
75	pushl	%edi
76	movl	20(%esp),%edi
77	movl	24(%esp),%esi
78	movl	28(%esp),%ecx
79.Lenter_blocks:
	/*
	 * NOTE(review): the mask is -15 (0xfffffff1), not -16, so bit 0 of the
	 * length survives.  The loop below advances by 16 and exits on an equality
	 * compare with inp+len, so this is only safe if callers always pass a
	 * multiple of 16 -- confirm.
	 */
80	andl	$-15,%ecx
81	jz	.L003nodata
82	subl	$64,%esp
83	movl	24(%edi),%eax
84	movl	28(%edi),%ebx
	/* %ebp = inp + len, the address at which the loop stops. */
85	leal	(%esi,%ecx,1),%ebp
86	movl	32(%edi),%ecx
87	movl	36(%edi),%edx
	/* 92(%esp) is the caller's arg2 slot (64 + 28); it now holds the end pointer. */
88	movl	%ebp,92(%esp)
	/* From here on %ebp is the running input pointer. */
89	movl	%esi,%ebp
	/* Spill r0..r3 and compute s_i = r_i + (r_i >> 2) for i = 1..3. */
90	movl	%eax,36(%esp)
91	movl	%ebx,%eax
92	shrl	$2,%eax
93	movl	%ebx,40(%esp)
94	addl	%ebx,%eax
95	movl	%ecx,%ebx
96	shrl	$2,%ebx
97	movl	%ecx,44(%esp)
98	addl	%ecx,%ebx
99	movl	%edx,%ecx
100	shrl	$2,%ecx
101	movl	%edx,48(%esp)
102	addl	%edx,%ecx
103	movl	%eax,52(%esp)
104	movl	%ebx,56(%esp)
105	movl	%ecx,60(%esp)
	/* Load the accumulator: h0..h4 in %eax, %ebx, %ecx, %esi, %edi. */
106	movl	(%edi),%eax
107	movl	4(%edi),%ebx
108	movl	8(%edi),%ecx
109	movl	12(%edi),%esi
110	movl	16(%edi),%edi
111	jmp	.L004loop
112.align	32
113.L004loop:
	/*
	 * h += 16 input bytes; arg3 goes into h4 together with the final carry.
	 * leal is used for the pointer bump because it leaves the carry intact.
	 */
114	addl	(%ebp),%eax
115	adcl	4(%ebp),%ebx
116	adcl	8(%ebp),%ecx
117	adcl	12(%ebp),%esi
118	leal	16(%ebp),%ebp
119	adcl	96(%esp),%edi
	/* h0, h3, h4 are spilled; h1 and h2 stay in %ebx and %ecx throughout. */
120	movl	%eax,(%esp)
121	movl	%esi,12(%esp)
	/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated in %esi:%edi (high:low). */
122	mull	36(%esp)
123	movl	%edi,16(%esp)
124	movl	%eax,%edi
125	movl	%ebx,%eax
126	movl	%edx,%esi
127	mull	60(%esp)
128	addl	%eax,%edi
129	movl	%ecx,%eax
130	adcl	%edx,%esi
131	mull	56(%esp)
132	addl	%eax,%edi
133	movl	12(%esp),%eax
134	adcl	%edx,%esi
135	mull	52(%esp)
136	addl	%eax,%edi
137	movl	(%esp),%eax
138	adcl	%edx,%esi
	/*
	 * d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1, in %edi:%esi.
	 * The low word of d0 is parked at 20(%esp).  The h4 term uses a 32-bit
	 * imull, so only its low word contributes.
	 */
139	mull	40(%esp)
140	movl	%edi,20(%esp)
141	xorl	%edi,%edi
142	addl	%eax,%esi
143	movl	%ebx,%eax
144	adcl	%edx,%edi
145	mull	36(%esp)
146	addl	%eax,%esi
147	movl	%ecx,%eax
148	adcl	%edx,%edi
149	mull	60(%esp)
150	addl	%eax,%esi
151	movl	12(%esp),%eax
152	adcl	%edx,%edi
153	mull	56(%esp)
154	addl	%eax,%esi
155	movl	16(%esp),%eax
156	adcl	%edx,%edi
157	imull	52(%esp),%eax
158	addl	%eax,%esi
159	movl	(%esp),%eax
160	adcl	$0,%edi
	/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2, in %esi:%edi; d1 low -> 24(%esp). */
161	mull	44(%esp)
162	movl	%esi,24(%esp)
163	xorl	%esi,%esi
164	addl	%eax,%edi
165	movl	%ebx,%eax
166	adcl	%edx,%esi
167	mull	40(%esp)
168	addl	%eax,%edi
169	movl	%ecx,%eax
170	adcl	%edx,%esi
171	mull	36(%esp)
172	addl	%eax,%edi
173	movl	12(%esp),%eax
174	adcl	%edx,%esi
175	mull	60(%esp)
176	addl	%eax,%edi
177	movl	16(%esp),%eax
178	adcl	%edx,%esi
179	imull	56(%esp),%eax
180	addl	%eax,%edi
181	movl	(%esp),%eax
182	adcl	$0,%esi
	/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3, in %edi:%esi; d2 low -> 28(%esp). */
183	mull	48(%esp)
184	movl	%edi,28(%esp)
185	xorl	%edi,%edi
186	addl	%eax,%esi
187	movl	%ebx,%eax
188	adcl	%edx,%edi
189	mull	44(%esp)
190	addl	%eax,%esi
191	movl	%ecx,%eax
192	adcl	%edx,%edi
193	mull	40(%esp)
194	addl	%eax,%esi
195	movl	12(%esp),%eax
196	adcl	%edx,%edi
197	mull	36(%esp)
198	addl	%eax,%esi
199	movl	16(%esp),%ecx
200	adcl	%edx,%edi
201	movl	%ecx,%edx
202	imull	60(%esp),%ecx
203	addl	%ecx,%esi
204	movl	20(%esp),%eax
205	adcl	$0,%edi
	/* Top word: %edx = h4*r0 + carry out of d3. */
206	imull	36(%esp),%edx
207	addl	%edi,%edx
208	movl	24(%esp),%ebx
209	movl	28(%esp),%ecx
	/*
	 * Partial reduction: keep the low two bits of the top word as the new h4
	 * and add (top >> 2) * 5 into h0, rippling the carry through h1..h4.
	 */
210	movl	%edx,%edi
211	shrl	$2,%edx
212	andl	$3,%edi
213	leal	(%edx,%edx,4),%edx
214	addl	%edx,%eax
215	adcl	$0,%ebx
216	adcl	$0,%ecx
217	adcl	$0,%esi
218	adcl	$0,%edi
	/* Continue until the input pointer reaches the end pointer saved at 92(%esp). */
219	cmpl	92(%esp),%ebp
220	jne	.L004loop
	/* 84(%esp) is the arg0 slot (64 + 20): write h0..h4 back to the state. */
221	movl	84(%esp),%edx
222	addl	$64,%esp
223	movl	%eax,(%edx)
224	movl	%ebx,4(%edx)
225	movl	%ecx,8(%edx)
226	movl	%esi,12(%edx)
227	movl	%edi,16(%edx)
228.L003nodata:
229	popl	%edi
230	popl	%esi
231	popl	%ebx
232	popl	%ebp
233	ret
234.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit(arg0, arg1, arg2) -- integer-only finalisation.
 *
 * Stack arguments:
 *   arg0 -> %ebp : state block; the five accumulator dwords h0..h4 are read
 *                  from offsets 0..16.
 *   arg1 -> %edi : 16-byte output buffer.
 *   arg2 -> %ebp : (loaded later) pointer to four dwords that are added to
 *                  the selected value before it is stored.
 *
 * Computes h + 5 and uses the carry into bit 2 of the fifth word to choose
 * between h + 5 and h without branching, then adds the four dwords at arg2
 * and writes the low 128 bits to arg1.
 *
 * .Lenter_emit is also a jump target for _poly1305_emit_sse2, which arrives
 * with the same four registers pushed and %ebp already holding arg0.
 */
235.globl	poly1305_emit
236.type	poly1305_emit,@function
237.align	16
238poly1305_emit:
239.L_poly1305_emit_begin:
240	pushl	%ebp
241	pushl	%ebx
242	pushl	%esi
243	pushl	%edi
244	movl	20(%esp),%ebp
245.Lenter_emit:
246	movl	24(%esp),%edi
247	movl	(%ebp),%eax
248	movl	4(%ebp),%ebx
249	movl	8(%ebp),%ecx
250	movl	12(%ebp),%edx
251	movl	16(%ebp),%esi
	/* g = h + 5 across all five words. */
252	addl	$5,%eax
253	adcl	$0,%ebx
254	adcl	$0,%ecx
255	adcl	$0,%edx
256	adcl	$0,%esi
	/*
	 * %esi = -(g4 >> 2): all ones when the addition carried into bit 2 of the
	 * top word, zero otherwise (assumes h4 is small enough that the shifted
	 * value is 0 or 1 -- TODO confirm against the reduction invariant).
	 */
257	shrl	$2,%esi
258	negl	%esi
	/* Keep g only if the mask is set; the output buffer is used as scratch. */
259	andl	%esi,%eax
260	andl	%esi,%ebx
261	andl	%esi,%ecx
262	andl	%esi,%edx
263	movl	%eax,(%edi)
264	movl	%ebx,4(%edi)
265	movl	%ecx,8(%edi)
266	movl	%edx,12(%edi)
	/* Inverted mask keeps the original h when g was not selected. */
267	notl	%esi
268	movl	(%ebp),%eax
269	movl	4(%ebp),%ebx
270	movl	8(%ebp),%ecx
271	movl	12(%ebp),%edx
	/* The state pointer is no longer needed; %ebp now points at arg2. */
272	movl	28(%esp),%ebp
273	andl	%esi,%eax
274	andl	%esi,%ebx
275	andl	%esi,%ecx
276	andl	%esi,%edx
	/* Merge the two masked candidates. */
277	orl	(%edi),%eax
278	orl	4(%edi),%ebx
279	orl	8(%edi),%ecx
280	orl	12(%edi),%edx
	/* Add the four dwords at arg2; the carry out of the top word is discarded. */
281	addl	(%ebp),%eax
282	adcl	4(%ebp),%ebx
283	adcl	8(%ebp),%ecx
284	adcl	12(%ebp),%edx
285	movl	%eax,(%edi)
286	movl	%ebx,4(%edi)
287	movl	%ecx,8(%edi)
288	movl	%edx,12(%edi)
289	popl	%edi
290	popl	%esi
291	popl	%ebx
292	popl	%ebp
293	ret
294.size	poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- internal helper, register calling convention.
 *
 * On entry:
 *   %edi : state block; the 16 key bytes are read from offset 24.
 *   %ebx : base of a constant table; the limb mask is read from 64(%ebx).
 *          NOTE(review): the table itself is outside this chunk; the mask is
 *          presumably 2^26-1 per 64-bit lane -- confirm.
 *
 * Effect: splits the key r into five limbs, runs the multiply-and-reduce
 * body twice, and stores a table at offsets 48..191 of the state:
 *   48 + 16*i  (i = 0..4) : limb i of four values, one per dword.  Following
 *                           the data flow below the dword order is
 *                           r^4, r^3, r^2, r^1.
 *   128 + 16*j (j = 0..3) : the same for limbs 1..4, each multiplied by 5.
 *
 * On return %edi is restored and %xmm7 still holds the mask from 64(%ebx).
 * Clobbers %ebp (used to save %esp), %ecx, %edx, %xmm0-%xmm7 and flags.
 * Uses a 16-byte aligned scratch frame of up to 224 bytes below the
 * caller's %esp.
 */
295.align	32
296.type	_poly1305_init_sse2,@function
297.align	16
298_poly1305_init_sse2:
299	movdqu	24(%edi),%xmm4
	/* %edi is advanced to the table area and moved back before returning. */
300	leal	48(%edi),%edi
	/* %ebp remembers the incoming %esp; the frame is aligned for movdqa. */
301	movl	%esp,%ebp
302	subl	$224,%esp
303	andl	$-16,%esp
304	movq	64(%ebx),%xmm7
	/*
	 * Split the 128-bit key into limbs at bit offsets 0, 26, 52, 78 and 104.
	 * psrldq $6 moves bit 48 to bit 0, so the shifts by 4 and 30 extract the
	 * limbs starting at bits 52 and 78; psrldq $13 leaves bits 104..127.
	 */
305	movdqa	%xmm4,%xmm0
306	movdqa	%xmm4,%xmm1
307	movdqa	%xmm4,%xmm2
308	pand	%xmm7,%xmm0
309	psrlq	$26,%xmm1
310	psrldq	$6,%xmm2
311	pand	%xmm7,%xmm1
312	movdqa	%xmm2,%xmm3
313	psrlq	$4,%xmm2
314	psrlq	$30,%xmm3
315	pand	%xmm7,%xmm2
316	pand	%xmm7,%xmm3
317	psrldq	$13,%xmm4
	/* %edx -> second scratch area (multiplier copies); two passes in total. */
318	leal	144(%esp),%edx
319	movl	$2,%ecx
320.L005square:
	/* Save the current limbs at 0..64(%esp). */
321	movdqa	%xmm0,(%esp)
322	movdqa	%xmm1,16(%esp)
323	movdqa	%xmm2,32(%esp)
324	movdqa	%xmm3,48(%esp)
325	movdqa	%xmm4,64(%esp)
	/* 5*limb for limbs 1..4 (x*4 + x), saved at 80..128(%esp). */
326	movdqa	%xmm1,%xmm6
327	movdqa	%xmm2,%xmm5
328	pslld	$2,%xmm6
329	pslld	$2,%xmm5
330	paddd	%xmm1,%xmm6
331	paddd	%xmm2,%xmm5
332	movdqa	%xmm6,80(%esp)
333	movdqa	%xmm5,96(%esp)
334	movdqa	%xmm3,%xmm6
335	movdqa	%xmm4,%xmm5
336	pslld	$2,%xmm6
337	pslld	$2,%xmm5
338	paddd	%xmm3,%xmm6
339	paddd	%xmm4,%xmm5
340	movdqa	%xmm6,112(%esp)
341	movdqa	%xmm5,128(%esp)
	/*
	 * pshufd $68 copies the low quadword into both halves, so the multiplier
	 * stored at (%edx) is the lane-0 value in both 64-bit lanes.
	 */
342	pshufd	$68,%xmm0,%xmm6
343	movdqa	%xmm1,%xmm5
344	pshufd	$68,%xmm1,%xmm1
345	pshufd	$68,%xmm2,%xmm2
346	pshufd	$68,%xmm3,%xmm3
347	pshufd	$68,%xmm4,%xmm4
348	movdqa	%xmm6,(%edx)
349	movdqa	%xmm1,16(%edx)
350	movdqa	%xmm2,32(%edx)
351	movdqa	%xmm3,48(%edx)
352	movdqa	%xmm4,64(%edx)
	/*
	 * Schoolbook 5x5 limb product of the saved value (0..128(%esp)) with the
	 * broadcast multiplier (0..64(%edx)); terms that wrap past limb 4 use the
	 * pre-multiplied 5*limb copies.  Results accumulate in %xmm0..%xmm4.
	 */
353	pmuludq	%xmm0,%xmm4
354	pmuludq	%xmm0,%xmm3
355	pmuludq	%xmm0,%xmm2
356	pmuludq	%xmm0,%xmm1
357	pmuludq	%xmm6,%xmm0
358	movdqa	%xmm5,%xmm6
359	pmuludq	48(%edx),%xmm5
360	movdqa	%xmm6,%xmm7
361	pmuludq	32(%edx),%xmm6
362	paddq	%xmm5,%xmm4
363	movdqa	%xmm7,%xmm5
364	pmuludq	16(%edx),%xmm7
365	paddq	%xmm6,%xmm3
366	movdqa	80(%esp),%xmm6
367	pmuludq	(%edx),%xmm5
368	paddq	%xmm7,%xmm2
369	pmuludq	64(%edx),%xmm6
370	movdqa	32(%esp),%xmm7
371	paddq	%xmm5,%xmm1
372	movdqa	%xmm7,%xmm5
373	pmuludq	32(%edx),%xmm7
374	paddq	%xmm6,%xmm0
375	movdqa	%xmm5,%xmm6
376	pmuludq	16(%edx),%xmm5
377	paddq	%xmm7,%xmm4
378	movdqa	96(%esp),%xmm7
379	pmuludq	(%edx),%xmm6
380	paddq	%xmm5,%xmm3
381	movdqa	%xmm7,%xmm5
382	pmuludq	64(%edx),%xmm7
383	paddq	%xmm6,%xmm2
384	pmuludq	48(%edx),%xmm5
385	movdqa	48(%esp),%xmm6
386	paddq	%xmm7,%xmm1
387	movdqa	%xmm6,%xmm7
388	pmuludq	16(%edx),%xmm6
389	paddq	%xmm5,%xmm0
390	movdqa	112(%esp),%xmm5
391	pmuludq	(%edx),%xmm7
392	paddq	%xmm6,%xmm4
393	movdqa	%xmm5,%xmm6
394	pmuludq	64(%edx),%xmm5
395	paddq	%xmm7,%xmm3
396	movdqa	%xmm6,%xmm7
397	pmuludq	48(%edx),%xmm6
398	paddq	%xmm5,%xmm2
399	pmuludq	32(%edx),%xmm7
400	movdqa	64(%esp),%xmm5
401	paddq	%xmm6,%xmm1
402	movdqa	128(%esp),%xmm6
403	pmuludq	(%edx),%xmm5
404	paddq	%xmm7,%xmm0
405	movdqa	%xmm6,%xmm7
406	pmuludq	64(%edx),%xmm6
407	paddq	%xmm5,%xmm4
408	movdqa	%xmm7,%xmm5
409	pmuludq	16(%edx),%xmm7
410	paddq	%xmm6,%xmm3
411	movdqa	%xmm5,%xmm6
412	pmuludq	32(%edx),%xmm5
413	paddq	%xmm7,%xmm0
414	pmuludq	48(%edx),%xmm6
	/* %xmm7 was used as scratch above; reload the mask for the reduction. */
415	movdqa	64(%ebx),%xmm7
416	paddq	%xmm5,%xmm1
417	paddq	%xmm6,%xmm2
	/*
	 * Carry propagation: each limb keeps its masked low part and passes the
	 * rest (>> 26) to the next limb.  The carry out of limb 4 re-enters limb 0
	 * multiplied by 5 (x + 4*x).
	 */
418	movdqa	%xmm3,%xmm5
419	pand	%xmm7,%xmm3
420	psrlq	$26,%xmm5
421	paddq	%xmm4,%xmm5
422	movdqa	%xmm0,%xmm6
423	pand	%xmm7,%xmm0
424	psrlq	$26,%xmm6
425	movdqa	%xmm5,%xmm4
426	paddq	%xmm1,%xmm6
427	psrlq	$26,%xmm5
428	pand	%xmm7,%xmm4
429	movdqa	%xmm6,%xmm1
430	psrlq	$26,%xmm6
431	paddd	%xmm5,%xmm0
432	psllq	$2,%xmm5
433	paddq	%xmm2,%xmm6
434	paddq	%xmm0,%xmm5
435	pand	%xmm7,%xmm1
436	movdqa	%xmm6,%xmm2
437	psrlq	$26,%xmm6
438	pand	%xmm7,%xmm2
439	paddd	%xmm3,%xmm6
440	movdqa	%xmm5,%xmm0
441	psrlq	$26,%xmm5
442	movdqa	%xmm6,%xmm3
443	psrlq	$26,%xmm6
444	pand	%xmm7,%xmm0
445	paddd	%xmm5,%xmm1
446	pand	%xmm7,%xmm3
447	paddd	%xmm6,%xmm4
448	decl	%ecx
449	jz	.L006square_break
	/*
	 * After the first pass: put the saved input (still at 0..64(%esp)) into the
	 * high quadword next to the new product and run the body once more.
	 */
450	punpcklqdq	(%esp),%xmm0
451	punpcklqdq	16(%esp),%xmm1
452	punpcklqdq	32(%esp),%xmm2
453	punpcklqdq	48(%esp),%xmm3
454	punpcklqdq	64(%esp),%xmm4
455	jmp	.L005square
456.L006square_break:
	/*
	 * Pack four 26-bit values per register: the second-pass results move to
	 * the odd dwords, the second-pass inputs (saved at 0..64(%esp)) fill the
	 * even dwords, and pshufd $141 reorders source dwords 1,3,0,2.
	 */
457	psllq	$32,%xmm0
458	psllq	$32,%xmm1
459	psllq	$32,%xmm2
460	psllq	$32,%xmm3
461	psllq	$32,%xmm4
462	por	(%esp),%xmm0
463	por	16(%esp),%xmm1
464	por	32(%esp),%xmm2
465	por	48(%esp),%xmm3
466	por	64(%esp),%xmm4
467	pshufd	$141,%xmm0,%xmm0
468	pshufd	$141,%xmm1,%xmm1
469	pshufd	$141,%xmm2,%xmm2
470	pshufd	$141,%xmm3,%xmm3
471	pshufd	$141,%xmm4,%xmm4
	/* Table entries for limbs 0..4 (state offsets 48..127); unaligned stores. */
472	movdqu	%xmm0,(%edi)
473	movdqu	%xmm1,16(%edi)
474	movdqu	%xmm2,32(%edi)
475	movdqu	%xmm3,48(%edi)
476	movdqu	%xmm4,64(%edi)
	/* 5*limb entries for limbs 1..4 (state offsets 128..191). */
477	movdqa	%xmm1,%xmm6
478	movdqa	%xmm2,%xmm5
479	pslld	$2,%xmm6
480	pslld	$2,%xmm5
481	paddd	%xmm1,%xmm6
482	paddd	%xmm2,%xmm5
483	movdqu	%xmm6,80(%edi)
484	movdqu	%xmm5,96(%edi)
485	movdqa	%xmm3,%xmm6
486	movdqa	%xmm4,%xmm5
487	pslld	$2,%xmm6
488	pslld	$2,%xmm5
489	paddd	%xmm3,%xmm6
490	paddd	%xmm4,%xmm5
491	movdqu	%xmm6,112(%edi)
492	movdqu	%xmm5,128(%edi)
	/* Drop the scratch frame and restore the caller's %edi. */
493	movl	%ebp,%esp
494	leal	-48(%edi),%edi
495	ret
496.size	_poly1305_init_sse2,.-_poly1305_init_sse2
/*
 * _poly1305_blocks_sse2(arg0, arg1, arg2, arg3) -- SSE2 block routine.
 *
 * Stack arguments:
 *   arg0 -> %edi : state block.  Offsets 0..16 hold the accumulator, offset 20
 *                  a flag dword, offset 24 the key, offsets 48..191 the table
 *                  written by _poly1305_init_sse2.
 *   arg1 -> %esi : input pointer.
 *   arg2 -> %ecx : input length in bytes; rounded down to a multiple of 16.
 *   arg3         : dword read from 32(%esp); it is shifted left by 24 and
 *                  added to the top limb when a single odd block is processed.
 *
 * Flag at offset 20 of the state: zero means the accumulator is stored as 32-bit
 * words (the layout used by poly1305_blocks), non-zero means five 26-bit
 * limbs.  This routine sets it to 1 when it converts the accumulator.
 *
 * Control flow:
 *   - length 0 after masking          -> return
 *   - length < 64 and flag == 0       -> jump to .Lenter_blocks (scalar code)
 *   - otherwise convert if needed, process one 16-byte block alone when the
 *     block count is odd, then process the rest two blocks per 64-bit lane
 *     pair, 64 bytes per main-loop iteration.
 *
 * %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and
 * %xmm0-%xmm7 are clobbered.  A 16-byte aligned frame of 528 bytes is used.
 */
497.align	32
498.type	_poly1305_blocks_sse2,@function
499.align	16
500_poly1305_blocks_sse2:
501	pushl	%ebp
502	pushl	%ebx
503	pushl	%esi
504	pushl	%edi
505	movl	20(%esp),%edi
506	movl	24(%esp),%esi
507	movl	28(%esp),%ecx
	/* %eax = representation flag (see header). */
508	movl	20(%edi),%eax
509	andl	$-16,%ecx
510	jz	.L007nodata
	/*
	 * Short input while the accumulator is still in 32-bit words: hand over to
	 * the scalar loop.  The register and stack state here matches what
	 * poly1305_blocks has at .Lenter_blocks.
	 */
511	cmpl	$64,%ecx
512	jae	.L008enter_sse2
513	testl	%eax,%eax
514	jz	.Lenter_blocks
515.align	16
516.L008enter_sse2:
	/* Position-independent address of the constant table -> %ebx. */
517	call	.L009pic_point
518.L009pic_point:
519	popl	%ebx
520	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
521	testl	%eax,%eax
522	jnz	.L010base2_26
	/*
	 * First use: build the table, then convert the accumulator from 32-bit
	 * words to 26-bit limbs.  Overlapping unaligned loads at byte offsets
	 * 0, 3, 6, 9 and 13 combined with shifts by 0, 2, 4, 6 and 0 line up bits
	 * 0, 26, 52, 78 and 104; 67108863 == 0x3ffffff.
	 * %xmm7 is not loaded on this path: it is expected to come back from
	 * _poly1305_init_sse2 still holding the mask from 64(%ebx).
	 */
523	call	_poly1305_init_sse2
524	movl	(%edi),%eax
525	movl	3(%edi),%ecx
526	movl	6(%edi),%edx
527	movl	9(%edi),%esi
528	movl	13(%edi),%ebp
529	movl	$1,20(%edi)
530	shrl	$2,%ecx
531	andl	$67108863,%eax
532	shrl	$4,%edx
533	andl	$67108863,%ecx
534	shrl	$6,%esi
535	andl	$67108863,%edx
536	movd	%eax,%xmm0
537	movd	%ecx,%xmm1
538	movd	%edx,%xmm2
539	movd	%esi,%xmm3
540	movd	%ebp,%xmm4
	/* %esi and %ecx were used as scratch; reload input pointer and length. */
541	movl	24(%esp),%esi
542	movl	28(%esp),%ecx
543	jmp	.L011base2_32
544.align	16
545.L010base2_26:
	/* Accumulator already in limb form: load the five limbs and the mask. */
546	movd	(%edi),%xmm0
547	movd	4(%edi),%xmm1
548	movd	8(%edi),%xmm2
549	movd	12(%edi),%xmm3
550	movd	16(%edi),%xmm4
551	movdqa	64(%ebx),%xmm7
552.L011base2_32:
	/*
	 * %eax = arg3 << 24 (bit 24 of the limb that starts at bit 104 is bit 128).
	 * %ebp saves %esp, %edi is advanced to the table at offset 48.
	 */
553	movl	32(%esp),%eax
554	movl	%esp,%ebp
555	subl	$528,%esp
556	andl	$-16,%esp
557	leal	48(%edi),%edi
558	shll	$24,%eax
	/* Length not a multiple of 32: one 16-byte block is handled on its own first. */
559	testl	$31,%ecx
560	jz	.L012even
	/* Split the block into limbs and add them, plus arg3 << 24, to the accumulator. */
561	movdqu	(%esi),%xmm6
562	leal	16(%esi),%esi
563	movdqa	%xmm6,%xmm5
564	pand	%xmm7,%xmm6
565	paddd	%xmm6,%xmm0
566	movdqa	%xmm5,%xmm6
567	psrlq	$26,%xmm5
568	psrldq	$6,%xmm6
569	pand	%xmm7,%xmm5
570	paddd	%xmm5,%xmm1
571	movdqa	%xmm6,%xmm5
572	psrlq	$4,%xmm6
573	pand	%xmm7,%xmm6
574	paddd	%xmm6,%xmm2
575	movdqa	%xmm5,%xmm6
576	psrlq	$30,%xmm5
577	pand	%xmm7,%xmm5
578	psrldq	$7,%xmm6
579	paddd	%xmm5,%xmm3
580	movd	%eax,%xmm5
581	paddd	%xmm6,%xmm4
	/*
	 * Multiply by the value kept in dword 3 of every 16-byte table entry
	 * (offsets 12, 28, 44, 60, 76 for the limbs and 92, 108, 124, 140 for the
	 * 5*limb copies).
	 */
582	movd	12(%edi),%xmm6
583	paddd	%xmm5,%xmm4
584	movdqa	%xmm0,(%esp)
585	movdqa	%xmm1,16(%esp)
586	movdqa	%xmm2,32(%esp)
587	movdqa	%xmm3,48(%esp)
588	movdqa	%xmm4,64(%esp)
589	pmuludq	%xmm6,%xmm0
590	pmuludq	%xmm6,%xmm1
591	pmuludq	%xmm6,%xmm2
592	movd	28(%edi),%xmm5
593	pmuludq	%xmm6,%xmm3
594	pmuludq	%xmm6,%xmm4
595	movdqa	%xmm5,%xmm6
596	pmuludq	48(%esp),%xmm5
597	movdqa	%xmm6,%xmm7
598	pmuludq	32(%esp),%xmm6
599	paddq	%xmm5,%xmm4
600	movdqa	%xmm7,%xmm5
601	pmuludq	16(%esp),%xmm7
602	paddq	%xmm6,%xmm3
603	movd	92(%edi),%xmm6
604	pmuludq	(%esp),%xmm5
605	paddq	%xmm7,%xmm2
606	pmuludq	64(%esp),%xmm6
607	movd	44(%edi),%xmm7
608	paddq	%xmm5,%xmm1
609	movdqa	%xmm7,%xmm5
610	pmuludq	32(%esp),%xmm7
611	paddq	%xmm6,%xmm0
612	movdqa	%xmm5,%xmm6
613	pmuludq	16(%esp),%xmm5
614	paddq	%xmm7,%xmm4
615	movd	108(%edi),%xmm7
616	pmuludq	(%esp),%xmm6
617	paddq	%xmm5,%xmm3
618	movdqa	%xmm7,%xmm5
619	pmuludq	64(%esp),%xmm7
620	paddq	%xmm6,%xmm2
621	pmuludq	48(%esp),%xmm5
622	movd	60(%edi),%xmm6
623	paddq	%xmm7,%xmm1
624	movdqa	%xmm6,%xmm7
625	pmuludq	16(%esp),%xmm6
626	paddq	%xmm5,%xmm0
627	movd	124(%edi),%xmm5
628	pmuludq	(%esp),%xmm7
629	paddq	%xmm6,%xmm4
630	movdqa	%xmm5,%xmm6
631	pmuludq	64(%esp),%xmm5
632	paddq	%xmm7,%xmm3
633	movdqa	%xmm6,%xmm7
634	pmuludq	48(%esp),%xmm6
635	paddq	%xmm5,%xmm2
636	pmuludq	32(%esp),%xmm7
637	movd	76(%edi),%xmm5
638	paddq	%xmm6,%xmm1
639	movd	140(%edi),%xmm6
640	pmuludq	(%esp),%xmm5
641	paddq	%xmm7,%xmm0
642	movdqa	%xmm6,%xmm7
643	pmuludq	64(%esp),%xmm6
644	paddq	%xmm5,%xmm4
645	movdqa	%xmm7,%xmm5
646	pmuludq	16(%esp),%xmm7
647	paddq	%xmm6,%xmm3
648	movdqa	%xmm5,%xmm6
649	pmuludq	32(%esp),%xmm5
650	paddq	%xmm7,%xmm0
651	pmuludq	48(%esp),%xmm6
	/* Reload the mask (%xmm7 was scratch) and propagate carries between limbs. */
652	movdqa	64(%ebx),%xmm7
653	paddq	%xmm5,%xmm1
654	paddq	%xmm6,%xmm2
655	movdqa	%xmm3,%xmm5
656	pand	%xmm7,%xmm3
657	psrlq	$26,%xmm5
658	paddq	%xmm4,%xmm5
659	movdqa	%xmm0,%xmm6
660	pand	%xmm7,%xmm0
661	psrlq	$26,%xmm6
662	movdqa	%xmm5,%xmm4
663	paddq	%xmm1,%xmm6
664	psrlq	$26,%xmm5
665	pand	%xmm7,%xmm4
666	movdqa	%xmm6,%xmm1
667	psrlq	$26,%xmm6
668	paddd	%xmm5,%xmm0
669	psllq	$2,%xmm5
670	paddq	%xmm2,%xmm6
671	paddq	%xmm0,%xmm5
672	pand	%xmm7,%xmm1
673	movdqa	%xmm6,%xmm2
674	psrlq	$26,%xmm6
675	pand	%xmm7,%xmm2
676	paddd	%xmm3,%xmm6
677	movdqa	%xmm5,%xmm0
678	psrlq	$26,%xmm5
679	movdqa	%xmm6,%xmm3
680	psrlq	$26,%xmm6
681	pand	%xmm7,%xmm0
682	paddd	%xmm5,%xmm1
683	pand	%xmm7,%xmm3
684	paddd	%xmm6,%xmm4
	/* The odd block is consumed; finish if nothing else is left. */
685	subl	$16,%ecx
686	jz	.L013done
687.L012even:
	/*
	 * Expand the table onto the stack.  For every 16-byte entry, pshufd $68
	 * duplicates dwords 0,1 into the copy at 0..128(%edx) and pshufd $238
	 * duplicates dwords 2,3 into the copy at -144..-16(%edx).
	 *
	 * "subl $64,%ecx" sets the flags consumed by the cmovb just below and by
	 * the jbe before the main loop; nothing in between writes EFLAGS.  When
	 * fewer than 64 bytes remain, cmovb backs %esi up by 32 so the
	 * 32(%esi)/48(%esi) loads below pick up the first two remaining blocks.
	 */
688	leal	384(%esp),%edx
689	leal	-32(%esi),%eax
690	subl	$64,%ecx
691	movdqu	(%edi),%xmm5
692	pshufd	$68,%xmm5,%xmm6
693	cmovbl	%eax,%esi
694	pshufd	$238,%xmm5,%xmm5
695	movdqa	%xmm6,(%edx)
	/* %eax -> scratch area at 160(%esp) that holds the current input limbs. */
696	leal	160(%esp),%eax
697	movdqu	16(%edi),%xmm6
698	movdqa	%xmm5,-144(%edx)
699	pshufd	$68,%xmm6,%xmm5
700	pshufd	$238,%xmm6,%xmm6
701	movdqa	%xmm5,16(%edx)
702	movdqu	32(%edi),%xmm5
703	movdqa	%xmm6,-128(%edx)
704	pshufd	$68,%xmm5,%xmm6
705	pshufd	$238,%xmm5,%xmm5
706	movdqa	%xmm6,32(%edx)
707	movdqu	48(%edi),%xmm6
708	movdqa	%xmm5,-112(%edx)
709	pshufd	$68,%xmm6,%xmm5
710	pshufd	$238,%xmm6,%xmm6
711	movdqa	%xmm5,48(%edx)
712	movdqu	64(%edi),%xmm5
713	movdqa	%xmm6,-96(%edx)
714	pshufd	$68,%xmm5,%xmm6
715	pshufd	$238,%xmm5,%xmm5
716	movdqa	%xmm6,64(%edx)
717	movdqu	80(%edi),%xmm6
718	movdqa	%xmm5,-80(%edx)
719	pshufd	$68,%xmm6,%xmm5
720	pshufd	$238,%xmm6,%xmm6
721	movdqa	%xmm5,80(%edx)
722	movdqu	96(%edi),%xmm5
723	movdqa	%xmm6,-64(%edx)
724	pshufd	$68,%xmm5,%xmm6
725	pshufd	$238,%xmm5,%xmm5
726	movdqa	%xmm6,96(%edx)
727	movdqu	112(%edi),%xmm6
728	movdqa	%xmm5,-48(%edx)
729	pshufd	$68,%xmm6,%xmm5
730	pshufd	$238,%xmm6,%xmm6
731	movdqa	%xmm5,112(%edx)
732	movdqu	128(%edi),%xmm5
733	movdqa	%xmm6,-32(%edx)
734	pshufd	$68,%xmm5,%xmm6
735	pshufd	$238,%xmm5,%xmm5
736	movdqa	%xmm6,128(%edx)
737	movdqa	%xmm5,-16(%edx)
	/*
	 * Load two 16-byte blocks and interleave them so that every register holds
	 * the same limb of both blocks, one per 64-bit lane.  Accumulator limbs
	 * 2..4 are parked at 112..144(%esp), limbs 0..1 at 80..96(%esp) below.
	 */
738	movdqu	32(%esi),%xmm5
739	movdqu	48(%esi),%xmm6
740	leal	32(%esi),%esi
741	movdqa	%xmm2,112(%esp)
742	movdqa	%xmm3,128(%esp)
743	movdqa	%xmm4,144(%esp)
744	movdqa	%xmm5,%xmm2
745	movdqa	%xmm6,%xmm3
746	psrldq	$6,%xmm2
747	psrldq	$6,%xmm3
748	movdqa	%xmm5,%xmm4
749	punpcklqdq	%xmm3,%xmm2
750	punpckhqdq	%xmm6,%xmm4
751	punpcklqdq	%xmm6,%xmm5
752	movdqa	%xmm2,%xmm3
753	psrlq	$4,%xmm2
754	psrlq	$30,%xmm3
755	movdqa	%xmm5,%xmm6
756	psrlq	$40,%xmm4
757	psrlq	$26,%xmm6
758	pand	%xmm7,%xmm5
759	pand	%xmm7,%xmm6
760	pand	%xmm7,%xmm2
761	pand	%xmm7,%xmm3
	/*
	 * OR the constant at 0(%ebx) into the top limb of both lanes.
	 * NOTE(review): presumably the 2^128 padding bit (1 << 24 per lane); the
	 * table is outside this chunk -- confirm.
	 */
762	por	(%ebx),%xmm4
763	movdqa	%xmm0,80(%esp)
764	movdqa	%xmm1,96(%esp)
	/* Flags still come from "subl $64,%ecx": skip the loop unless more than 64 bytes were left. */
765	jbe	.L014skip_loop
766	jmp	.L015loop
767.align	32
768.L015loop:
	/*
	 * Main loop, 64 bytes per iteration.  First half: multiply the pair just
	 * loaded (no accumulator added) by the table copy at -144..-16(%edx).
	 */
769	movdqa	-144(%edx),%xmm7
770	movdqa	%xmm6,16(%eax)
771	movdqa	%xmm2,32(%eax)
772	movdqa	%xmm3,48(%eax)
773	movdqa	%xmm4,64(%eax)
774	movdqa	%xmm5,%xmm1
775	pmuludq	%xmm7,%xmm5
776	movdqa	%xmm6,%xmm0
777	pmuludq	%xmm7,%xmm6
778	pmuludq	%xmm7,%xmm2
779	pmuludq	%xmm7,%xmm3
780	pmuludq	%xmm7,%xmm4
781	pmuludq	-16(%edx),%xmm0
782	movdqa	%xmm1,%xmm7
783	pmuludq	-128(%edx),%xmm1
784	paddq	%xmm5,%xmm0
785	movdqa	%xmm7,%xmm5
786	pmuludq	-112(%edx),%xmm7
787	paddq	%xmm6,%xmm1
788	movdqa	%xmm5,%xmm6
789	pmuludq	-96(%edx),%xmm5
790	paddq	%xmm7,%xmm2
791	movdqa	16(%eax),%xmm7
792	pmuludq	-80(%edx),%xmm6
793	paddq	%xmm5,%xmm3
794	movdqa	%xmm7,%xmm5
795	pmuludq	-128(%edx),%xmm7
796	paddq	%xmm6,%xmm4
797	movdqa	%xmm5,%xmm6
798	pmuludq	-112(%edx),%xmm5
799	paddq	%xmm7,%xmm2
800	movdqa	32(%eax),%xmm7
801	pmuludq	-96(%edx),%xmm6
802	paddq	%xmm5,%xmm3
803	movdqa	%xmm7,%xmm5
804	pmuludq	-32(%edx),%xmm7
805	paddq	%xmm6,%xmm4
806	movdqa	%xmm5,%xmm6
807	pmuludq	-16(%edx),%xmm5
808	paddq	%xmm7,%xmm0
809	movdqa	%xmm6,%xmm7
810	pmuludq	-128(%edx),%xmm6
811	paddq	%xmm5,%xmm1
812	movdqa	48(%eax),%xmm5
813	pmuludq	-112(%edx),%xmm7
814	paddq	%xmm6,%xmm3
815	movdqa	%xmm5,%xmm6
816	pmuludq	-48(%edx),%xmm5
817	paddq	%xmm7,%xmm4
818	movdqa	%xmm6,%xmm7
819	pmuludq	-32(%edx),%xmm6
820	paddq	%xmm5,%xmm0
821	movdqa	%xmm7,%xmm5
822	pmuludq	-16(%edx),%xmm7
823	paddq	%xmm6,%xmm1
824	movdqa	64(%eax),%xmm6
825	pmuludq	-128(%edx),%xmm5
826	paddq	%xmm7,%xmm2
827	movdqa	%xmm6,%xmm7
828	pmuludq	-16(%edx),%xmm6
829	paddq	%xmm5,%xmm4
830	movdqa	%xmm7,%xmm5
831	pmuludq	-64(%edx),%xmm7
832	paddq	%xmm6,%xmm3
833	movdqa	%xmm5,%xmm6
834	pmuludq	-48(%edx),%xmm5
835	paddq	%xmm7,%xmm0
836	movdqa	64(%ebx),%xmm7
837	pmuludq	-32(%edx),%xmm6
838	paddq	%xmm5,%xmm1
839	paddq	%xmm6,%xmm2
	/*
	 * Second half: load the two earlier blocks (32 bytes behind the pair above),
	 * split them into limbs, add the parked accumulator and multiply by the
	 * table copy at 0..128(%edx).  Partial sums 2..4 wait at 32..64(%esp).
	 */
840	movdqu	-32(%esi),%xmm5
841	movdqu	-16(%esi),%xmm6
842	leal	32(%esi),%esi
843	movdqa	%xmm2,32(%esp)
844	movdqa	%xmm3,48(%esp)
845	movdqa	%xmm4,64(%esp)
846	movdqa	%xmm5,%xmm2
847	movdqa	%xmm6,%xmm3
848	psrldq	$6,%xmm2
849	psrldq	$6,%xmm3
850	movdqa	%xmm5,%xmm4
851	punpcklqdq	%xmm3,%xmm2
852	punpckhqdq	%xmm6,%xmm4
853	punpcklqdq	%xmm6,%xmm5
854	movdqa	%xmm2,%xmm3
855	psrlq	$4,%xmm2
856	psrlq	$30,%xmm3
857	movdqa	%xmm5,%xmm6
858	psrlq	$40,%xmm4
859	psrlq	$26,%xmm6
860	pand	%xmm7,%xmm5
861	pand	%xmm7,%xmm6
862	pand	%xmm7,%xmm2
863	pand	%xmm7,%xmm3
864	por	(%ebx),%xmm4
	/*
	 * Loop counter update.  The flags from this subl drive the cmovb below and
	 * the "ja" at the bottom of the loop; only SSE moves/arithmetic, lea and
	 * cmov (none of which write EFLAGS) sit in between.
	 */
865	leal	-32(%esi),%eax
866	subl	$64,%ecx
867	paddd	80(%esp),%xmm5
868	paddd	96(%esp),%xmm6
869	paddd	112(%esp),%xmm2
870	paddd	128(%esp),%xmm3
871	paddd	144(%esp),%xmm4
872	cmovbl	%eax,%esi
873	leal	160(%esp),%eax
874	movdqa	(%edx),%xmm7
875	movdqa	%xmm1,16(%esp)
876	movdqa	%xmm6,16(%eax)
877	movdqa	%xmm2,32(%eax)
878	movdqa	%xmm3,48(%eax)
879	movdqa	%xmm4,64(%eax)
880	movdqa	%xmm5,%xmm1
881	pmuludq	%xmm7,%xmm5
882	paddq	%xmm0,%xmm5
883	movdqa	%xmm6,%xmm0
884	pmuludq	%xmm7,%xmm6
885	pmuludq	%xmm7,%xmm2
886	pmuludq	%xmm7,%xmm3
887	pmuludq	%xmm7,%xmm4
888	paddq	16(%esp),%xmm6
889	paddq	32(%esp),%xmm2
890	paddq	48(%esp),%xmm3
891	paddq	64(%esp),%xmm4
892	pmuludq	128(%edx),%xmm0
893	movdqa	%xmm1,%xmm7
894	pmuludq	16(%edx),%xmm1
895	paddq	%xmm5,%xmm0
896	movdqa	%xmm7,%xmm5
897	pmuludq	32(%edx),%xmm7
898	paddq	%xmm6,%xmm1
899	movdqa	%xmm5,%xmm6
900	pmuludq	48(%edx),%xmm5
901	paddq	%xmm7,%xmm2
902	movdqa	16(%eax),%xmm7
903	pmuludq	64(%edx),%xmm6
904	paddq	%xmm5,%xmm3
905	movdqa	%xmm7,%xmm5
906	pmuludq	16(%edx),%xmm7
907	paddq	%xmm6,%xmm4
908	movdqa	%xmm5,%xmm6
909	pmuludq	32(%edx),%xmm5
910	paddq	%xmm7,%xmm2
911	movdqa	32(%eax),%xmm7
912	pmuludq	48(%edx),%xmm6
913	paddq	%xmm5,%xmm3
914	movdqa	%xmm7,%xmm5
915	pmuludq	112(%edx),%xmm7
916	paddq	%xmm6,%xmm4
917	movdqa	%xmm5,%xmm6
918	pmuludq	128(%edx),%xmm5
919	paddq	%xmm7,%xmm0
920	movdqa	%xmm6,%xmm7
921	pmuludq	16(%edx),%xmm6
922	paddq	%xmm5,%xmm1
923	movdqa	48(%eax),%xmm5
924	pmuludq	32(%edx),%xmm7
925	paddq	%xmm6,%xmm3
926	movdqa	%xmm5,%xmm6
927	pmuludq	96(%edx),%xmm5
928	paddq	%xmm7,%xmm4
929	movdqa	%xmm6,%xmm7
930	pmuludq	112(%edx),%xmm6
931	paddq	%xmm5,%xmm0
932	movdqa	%xmm7,%xmm5
933	pmuludq	128(%edx),%xmm7
934	paddq	%xmm6,%xmm1
935	movdqa	64(%eax),%xmm6
936	pmuludq	16(%edx),%xmm5
937	paddq	%xmm7,%xmm2
938	movdqa	%xmm6,%xmm7
939	pmuludq	128(%edx),%xmm6
940	paddq	%xmm5,%xmm4
941	movdqa	%xmm7,%xmm5
942	pmuludq	80(%edx),%xmm7
943	paddq	%xmm6,%xmm3
944	movdqa	%xmm5,%xmm6
945	pmuludq	96(%edx),%xmm5
946	paddq	%xmm7,%xmm0
947	movdqa	64(%ebx),%xmm7
948	pmuludq	112(%edx),%xmm6
949	paddq	%xmm5,%xmm1
950	paddq	%xmm6,%xmm2
	/* Carry propagation, same sequence as after the single-block multiply. */
951	movdqa	%xmm3,%xmm5
952	pand	%xmm7,%xmm3
953	psrlq	$26,%xmm5
954	paddq	%xmm4,%xmm5
955	movdqa	%xmm0,%xmm6
956	pand	%xmm7,%xmm0
957	psrlq	$26,%xmm6
958	movdqa	%xmm5,%xmm4
959	paddq	%xmm1,%xmm6
960	psrlq	$26,%xmm5
961	pand	%xmm7,%xmm4
962	movdqa	%xmm6,%xmm1
963	psrlq	$26,%xmm6
964	paddd	%xmm5,%xmm0
965	psllq	$2,%xmm5
966	paddq	%xmm2,%xmm6
967	paddq	%xmm0,%xmm5
968	pand	%xmm7,%xmm1
969	movdqa	%xmm6,%xmm2
970	psrlq	$26,%xmm6
971	pand	%xmm7,%xmm2
972	paddd	%xmm3,%xmm6
973	movdqa	%xmm5,%xmm0
974	psrlq	$26,%xmm5
975	movdqa	%xmm6,%xmm3
976	psrlq	$26,%xmm6
977	pand	%xmm7,%xmm0
978	paddd	%xmm5,%xmm1
979	pand	%xmm7,%xmm3
980	paddd	%xmm6,%xmm4
	/* Pre-load and split the next pair of blocks; park the accumulator again. */
981	movdqu	32(%esi),%xmm5
982	movdqu	48(%esi),%xmm6
983	leal	32(%esi),%esi
984	movdqa	%xmm2,112(%esp)
985	movdqa	%xmm3,128(%esp)
986	movdqa	%xmm4,144(%esp)
987	movdqa	%xmm5,%xmm2
988	movdqa	%xmm6,%xmm3
989	psrldq	$6,%xmm2
990	psrldq	$6,%xmm3
991	movdqa	%xmm5,%xmm4
992	punpcklqdq	%xmm3,%xmm2
993	punpckhqdq	%xmm6,%xmm4
994	punpcklqdq	%xmm6,%xmm5
995	movdqa	%xmm2,%xmm3
996	psrlq	$4,%xmm2
997	psrlq	$30,%xmm3
998	movdqa	%xmm5,%xmm6
999	psrlq	$40,%xmm4
1000	psrlq	$26,%xmm6
1001	pand	%xmm7,%xmm5
1002	pand	%xmm7,%xmm6
1003	pand	%xmm7,%xmm2
1004	pand	%xmm7,%xmm3
1005	por	(%ebx),%xmm4
1006	movdqa	%xmm0,80(%esp)
1007	movdqa	%xmm1,96(%esp)
	/* Loop while the "subl $64,%ecx" above left a strictly positive remainder. */
1008	ja	.L015loop
1009.L014skip_loop:
	/*
	 * Tail.  pshufd $16 places source dwords 0 and 1 in the low dword of the
	 * two 64-bit lanes, so from here each lane is multiplied by a different
	 * table value.  %ecx + 32 == 0 means exactly two blocks remain: the
	 * accumulator is added to them right away.  Otherwise (long tail) four
	 * blocks remain and the accumulator is added to the earlier pair later.
	 * The flags from this addl are also consumed by the jz before
	 * .L017short_tail; the SSE instructions in between do not write EFLAGS.
	 */
1010	pshufd	$16,-144(%edx),%xmm7
1011	addl	$32,%ecx
1012	jnz	.L016long_tail
1013	paddd	%xmm0,%xmm5
1014	paddd	%xmm1,%xmm6
1015	paddd	112(%esp),%xmm2
1016	paddd	128(%esp),%xmm3
1017	paddd	144(%esp),%xmm4
1018.L016long_tail:
	/* Multiply the current pair by the per-lane values from the -144..-16(%edx) copy. */
1019	movdqa	%xmm5,(%eax)
1020	movdqa	%xmm6,16(%eax)
1021	movdqa	%xmm2,32(%eax)
1022	movdqa	%xmm3,48(%eax)
1023	movdqa	%xmm4,64(%eax)
1024	pmuludq	%xmm7,%xmm5
1025	pmuludq	%xmm7,%xmm6
1026	pmuludq	%xmm7,%xmm2
1027	movdqa	%xmm5,%xmm0
1028	pshufd	$16,-128(%edx),%xmm5
1029	pmuludq	%xmm7,%xmm3
1030	movdqa	%xmm6,%xmm1
1031	pmuludq	%xmm7,%xmm4
1032	movdqa	%xmm5,%xmm6
1033	pmuludq	48(%eax),%xmm5
1034	movdqa	%xmm6,%xmm7
1035	pmuludq	32(%eax),%xmm6
1036	paddq	%xmm5,%xmm4
1037	movdqa	%xmm7,%xmm5
1038	pmuludq	16(%eax),%xmm7
1039	paddq	%xmm6,%xmm3
1040	pshufd	$16,-64(%edx),%xmm6
1041	pmuludq	(%eax),%xmm5
1042	paddq	%xmm7,%xmm2
1043	pmuludq	64(%eax),%xmm6
1044	pshufd	$16,-112(%edx),%xmm7
1045	paddq	%xmm5,%xmm1
1046	movdqa	%xmm7,%xmm5
1047	pmuludq	32(%eax),%xmm7
1048	paddq	%xmm6,%xmm0
1049	movdqa	%xmm5,%xmm6
1050	pmuludq	16(%eax),%xmm5
1051	paddq	%xmm7,%xmm4
1052	pshufd	$16,-48(%edx),%xmm7
1053	pmuludq	(%eax),%xmm6
1054	paddq	%xmm5,%xmm3
1055	movdqa	%xmm7,%xmm5
1056	pmuludq	64(%eax),%xmm7
1057	paddq	%xmm6,%xmm2
1058	pmuludq	48(%eax),%xmm5
1059	pshufd	$16,-96(%edx),%xmm6
1060	paddq	%xmm7,%xmm1
1061	movdqa	%xmm6,%xmm7
1062	pmuludq	16(%eax),%xmm6
1063	paddq	%xmm5,%xmm0
1064	pshufd	$16,-32(%edx),%xmm5
1065	pmuludq	(%eax),%xmm7
1066	paddq	%xmm6,%xmm4
1067	movdqa	%xmm5,%xmm6
1068	pmuludq	64(%eax),%xmm5
1069	paddq	%xmm7,%xmm3
1070	movdqa	%xmm6,%xmm7
1071	pmuludq	48(%eax),%xmm6
1072	paddq	%xmm5,%xmm2
1073	pmuludq	32(%eax),%xmm7
1074	pshufd	$16,-80(%edx),%xmm5
1075	paddq	%xmm6,%xmm1
1076	pshufd	$16,-16(%edx),%xmm6
1077	pmuludq	(%eax),%xmm5
1078	paddq	%xmm7,%xmm0
1079	movdqa	%xmm6,%xmm7
1080	pmuludq	64(%eax),%xmm6
1081	paddq	%xmm5,%xmm4
1082	movdqa	%xmm7,%xmm5
1083	pmuludq	16(%eax),%xmm7
1084	paddq	%xmm6,%xmm3
1085	movdqa	%xmm5,%xmm6
1086	pmuludq	32(%eax),%xmm5
1087	paddq	%xmm7,%xmm0
1088	pmuludq	48(%eax),%xmm6
1089	movdqa	64(%ebx),%xmm7
1090	paddq	%xmm5,%xmm1
1091	paddq	%xmm6,%xmm2
	/* Zero flag still from "addl $32,%ecx": only two blocks were left, skip the second pair. */
1092	jz	.L017short_tail
	/*
	 * Long tail: load the two earlier blocks, add the parked accumulator and
	 * multiply by the per-lane values from the 0..128(%edx) copy, adding the
	 * products to the sums computed above.
	 */
1093	movdqu	-32(%esi),%xmm5
1094	movdqu	-16(%esi),%xmm6
1095	leal	32(%esi),%esi
1096	movdqa	%xmm2,32(%esp)
1097	movdqa	%xmm3,48(%esp)
1098	movdqa	%xmm4,64(%esp)
1099	movdqa	%xmm5,%xmm2
1100	movdqa	%xmm6,%xmm3
1101	psrldq	$6,%xmm2
1102	psrldq	$6,%xmm3
1103	movdqa	%xmm5,%xmm4
1104	punpcklqdq	%xmm3,%xmm2
1105	punpckhqdq	%xmm6,%xmm4
1106	punpcklqdq	%xmm6,%xmm5
1107	movdqa	%xmm2,%xmm3
1108	psrlq	$4,%xmm2
1109	psrlq	$30,%xmm3
1110	movdqa	%xmm5,%xmm6
1111	psrlq	$40,%xmm4
1112	psrlq	$26,%xmm6
1113	pand	%xmm7,%xmm5
1114	pand	%xmm7,%xmm6
1115	pand	%xmm7,%xmm2
1116	pand	%xmm7,%xmm3
1117	por	(%ebx),%xmm4
1118	pshufd	$16,(%edx),%xmm7
1119	paddd	80(%esp),%xmm5
1120	paddd	96(%esp),%xmm6
1121	paddd	112(%esp),%xmm2
1122	paddd	128(%esp),%xmm3
1123	paddd	144(%esp),%xmm4
1124	movdqa	%xmm5,(%esp)
1125	pmuludq	%xmm7,%xmm5
1126	movdqa	%xmm6,16(%esp)
1127	pmuludq	%xmm7,%xmm6
1128	paddq	%xmm5,%xmm0
1129	movdqa	%xmm2,%xmm5
1130	pmuludq	%xmm7,%xmm2
1131	paddq	%xmm6,%xmm1
1132	movdqa	%xmm3,%xmm6
1133	pmuludq	%xmm7,%xmm3
1134	paddq	32(%esp),%xmm2
1135	movdqa	%xmm5,32(%esp)
1136	pshufd	$16,16(%edx),%xmm5
1137	paddq	48(%esp),%xmm3
1138	movdqa	%xmm6,48(%esp)
1139	movdqa	%xmm4,%xmm6
1140	pmuludq	%xmm7,%xmm4
1141	paddq	64(%esp),%xmm4
1142	movdqa	%xmm6,64(%esp)
1143	movdqa	%xmm5,%xmm6
1144	pmuludq	48(%esp),%xmm5
1145	movdqa	%xmm6,%xmm7
1146	pmuludq	32(%esp),%xmm6
1147	paddq	%xmm5,%xmm4
1148	movdqa	%xmm7,%xmm5
1149	pmuludq	16(%esp),%xmm7
1150	paddq	%xmm6,%xmm3
1151	pshufd	$16,80(%edx),%xmm6
1152	pmuludq	(%esp),%xmm5
1153	paddq	%xmm7,%xmm2
1154	pmuludq	64(%esp),%xmm6
1155	pshufd	$16,32(%edx),%xmm7
1156	paddq	%xmm5,%xmm1
1157	movdqa	%xmm7,%xmm5
1158	pmuludq	32(%esp),%xmm7
1159	paddq	%xmm6,%xmm0
1160	movdqa	%xmm5,%xmm6
1161	pmuludq	16(%esp),%xmm5
1162	paddq	%xmm7,%xmm4
1163	pshufd	$16,96(%edx),%xmm7
1164	pmuludq	(%esp),%xmm6
1165	paddq	%xmm5,%xmm3
1166	movdqa	%xmm7,%xmm5
1167	pmuludq	64(%esp),%xmm7
1168	paddq	%xmm6,%xmm2
1169	pmuludq	48(%esp),%xmm5
1170	pshufd	$16,48(%edx),%xmm6
1171	paddq	%xmm7,%xmm1
1172	movdqa	%xmm6,%xmm7
1173	pmuludq	16(%esp),%xmm6
1174	paddq	%xmm5,%xmm0
1175	pshufd	$16,112(%edx),%xmm5
1176	pmuludq	(%esp),%xmm7
1177	paddq	%xmm6,%xmm4
1178	movdqa	%xmm5,%xmm6
1179	pmuludq	64(%esp),%xmm5
1180	paddq	%xmm7,%xmm3
1181	movdqa	%xmm6,%xmm7
1182	pmuludq	48(%esp),%xmm6
1183	paddq	%xmm5,%xmm2
1184	pmuludq	32(%esp),%xmm7
1185	pshufd	$16,64(%edx),%xmm5
1186	paddq	%xmm6,%xmm1
1187	pshufd	$16,128(%edx),%xmm6
1188	pmuludq	(%esp),%xmm5
1189	paddq	%xmm7,%xmm0
1190	movdqa	%xmm6,%xmm7
1191	pmuludq	64(%esp),%xmm6
1192	paddq	%xmm5,%xmm4
1193	movdqa	%xmm7,%xmm5
1194	pmuludq	16(%esp),%xmm7
1195	paddq	%xmm6,%xmm3
1196	movdqa	%xmm5,%xmm6
1197	pmuludq	32(%esp),%xmm5
1198	paddq	%xmm7,%xmm0
1199	pmuludq	48(%esp),%xmm6
1200	movdqa	64(%ebx),%xmm7
1201	paddq	%xmm5,%xmm1
1202	paddq	%xmm6,%xmm2
1203.L017short_tail:
	/*
	 * Horizontal add: pshufd $78 swaps the two quadwords, so after the paddq
	 * each limb register holds the sum of both lanes.  Then carries are
	 * propagated one final time.
	 */
1204	pshufd	$78,%xmm4,%xmm6
1205	pshufd	$78,%xmm3,%xmm5
1206	paddq	%xmm6,%xmm4
1207	paddq	%xmm5,%xmm3
1208	pshufd	$78,%xmm0,%xmm6
1209	pshufd	$78,%xmm1,%xmm5
1210	paddq	%xmm6,%xmm0
1211	paddq	%xmm5,%xmm1
1212	pshufd	$78,%xmm2,%xmm6
1213	movdqa	%xmm3,%xmm5
1214	pand	%xmm7,%xmm3
1215	psrlq	$26,%xmm5
1216	paddq	%xmm6,%xmm2
1217	paddq	%xmm4,%xmm5
1218	movdqa	%xmm0,%xmm6
1219	pand	%xmm7,%xmm0
1220	psrlq	$26,%xmm6
1221	movdqa	%xmm5,%xmm4
1222	paddq	%xmm1,%xmm6
1223	psrlq	$26,%xmm5
1224	pand	%xmm7,%xmm4
1225	movdqa	%xmm6,%xmm1
1226	psrlq	$26,%xmm6
1227	paddd	%xmm5,%xmm0
1228	psllq	$2,%xmm5
1229	paddq	%xmm2,%xmm6
1230	paddq	%xmm0,%xmm5
1231	pand	%xmm7,%xmm1
1232	movdqa	%xmm6,%xmm2
1233	psrlq	$26,%xmm6
1234	pand	%xmm7,%xmm2
1235	paddd	%xmm3,%xmm6
1236	movdqa	%xmm5,%xmm0
1237	psrlq	$26,%xmm5
1238	movdqa	%xmm6,%xmm3
1239	psrlq	$26,%xmm6
1240	pand	%xmm7,%xmm0
1241	paddd	%xmm5,%xmm1
1242	pand	%xmm7,%xmm3
1243	paddd	%xmm6,%xmm4
1244.L013done:
	/* %edi points 48 bytes into the state: store the five limbs at offsets 0..16. */
1245	movd	%xmm0,-48(%edi)
1246	movd	%xmm1,-44(%edi)
1247	movd	%xmm2,-40(%edi)
1248	movd	%xmm3,-36(%edi)
1249	movd	%xmm4,-32(%edi)
	/* Release the aligned frame. */
1250	movl	%ebp,%esp
1251.L007nodata:
1252	popl	%edi
1253	popl	%esi
1254	popl	%ebx
1255	popl	%ebp
1256	ret
1257.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2(arg0, arg1, arg2) -- finalisation for the SSE2 path.
 *
 * Stack arguments:
 *   arg0 -> %ebp : state block (accumulator at offsets 0..16, flag at 20).
 *   arg1 -> %edi : 16-byte output buffer.
 *   arg2 -> %ebp : (loaded later) pointer to four dwords added to the result.
 *
 * If the flag dword at offset 20 is zero the accumulator is still stored as
 * 32-bit words and control transfers to .Lenter_emit in poly1305_emit (same
 * pushes, %ebp already loaded).  Otherwise the five 26-bit limbs are packed
 * back into 32-bit words, reduced once more, and then finished exactly like
 * poly1305_emit: select between h and h + 5, add the dwords at arg2, store.
 *
 * %ebp, %ebx, %esi and %edi are saved and restored; %eax, %ecx, %edx and
 * %xmm0-%xmm3 are clobbered.
 */
1258.align	32
1259.type	_poly1305_emit_sse2,@function
1260.align	16
1261_poly1305_emit_sse2:
1262	pushl	%ebp
1263	pushl	%ebx
1264	pushl	%esi
1265	pushl	%edi
1266	movl	20(%esp),%ebp
1267	cmpl	$0,20(%ebp)
1268	je	.Lenter_emit
	/* Limbs l0..l4 -> %eax, %edi, %ecx, %edx, %esi. */
1269	movl	(%ebp),%eax
1270	movl	4(%ebp),%edi
1271	movl	8(%ebp),%ecx
1272	movl	12(%ebp),%edx
1273	movl	16(%ebp),%esi
	/*
	 * Pack into 32-bit words.  Limb i starts at bit 26*i, so it contributes
	 * (limb << 26, 20, 14, 8) to one word and (limb >> 6, 12, 18, 24) to the
	 * next; carries from each addition ripple into the following word.
	 */
1274	movl	%edi,%ebx
1275	shll	$26,%edi
1276	shrl	$6,%ebx
1277	addl	%edi,%eax
1278	movl	%ecx,%edi
1279	adcl	$0,%ebx
1280	shll	$20,%edi
1281	shrl	$12,%ecx
1282	addl	%edi,%ebx
1283	movl	%edx,%edi
1284	adcl	$0,%ecx
1285	shll	$14,%edi
1286	shrl	$18,%edx
1287	addl	%edi,%ecx
1288	movl	%esi,%edi
1289	adcl	$0,%edx
1290	shll	$8,%edi
1291	shrl	$24,%esi
1292	addl	%edi,%edx
1293	adcl	$0,%esi
	/*
	 * Partial reduction: keep the low two bits of the fifth word and add
	 * (word >> 2) * 5 to the low word.  The loads of arg1 and arg2 are
	 * interleaved; movl does not disturb the carry chain.
	 */
1294	movl	%esi,%edi
1295	andl	$3,%esi
1296	shrl	$2,%edi
1297	leal	(%edi,%edi,4),%ebp
1298	movl	24(%esp),%edi
1299	addl	%ebp,%eax
1300	movl	28(%esp),%ebp
1301	adcl	$0,%ebx
1302	adcl	$0,%ecx
1303	adcl	$0,%edx
1304	adcl	$0,%esi
	/*
	 * g = h + 5.  The unmodified h words are stashed in %xmm0..%xmm3 because
	 * all general registers are in use; movd leaves the flags alone.
	 */
1305	movd	%eax,%xmm0
1306	addl	$5,%eax
1307	movd	%ebx,%xmm1
1308	adcl	$0,%ebx
1309	movd	%ecx,%xmm2
1310	adcl	$0,%ecx
1311	movd	%edx,%xmm3
1312	adcl	$0,%edx
1313	adcl	$0,%esi
	/*
	 * %esi = -(g4 >> 2): all ones when g carried into bit 2 of the fifth word,
	 * zero otherwise (the fifth word was reduced to at most two bits plus a
	 * carry above).
	 */
1314	shrl	$2,%esi
1315	negl	%esi
1316	andl	%esi,%eax
1317	andl	%esi,%ebx
1318	andl	%esi,%ecx
1319	andl	%esi,%edx
	/* Masked g goes to the output buffer as scratch; h is recovered from %xmm0..3. */
1320	movl	%eax,(%edi)
1321	movd	%xmm0,%eax
1322	movl	%ebx,4(%edi)
1323	movd	%xmm1,%ebx
1324	movl	%ecx,8(%edi)
1325	movd	%xmm2,%ecx
1326	movl	%edx,12(%edi)
1327	movd	%xmm3,%edx
	/* Inverted mask keeps h when g was not selected; OR merges the two. */
1328	notl	%esi
1329	andl	%esi,%eax
1330	andl	%esi,%ebx
1331	orl	(%edi),%eax
1332	andl	%esi,%ecx
1333	orl	4(%edi),%ebx
1334	andl	%esi,%edx
1335	orl	8(%edi),%ecx
1336	orl	12(%edi),%edx
	/* Add the four dwords at arg2 and store; the stores do not affect the carry chain. */
1337	addl	(%ebp),%eax
1338	adcl	4(%ebp),%ebx
1339	movl	%eax,(%edi)
1340	adcl	8(%ebp),%ecx
1341	movl	%ebx,4(%edi)
1342	adcl	12(%ebp),%edx
1343	movl	%ecx,8(%edi)
1344	movl	%edx,12(%edi)
1345	popl	%edi
1346	popl	%esi
1347	popl	%ebx
1348	popl	%ebp
1349	ret
1350.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 -- file-local helper (no .globl), called from
 * _poly1305_blocks_avx2 with a non-standard register interface.
 *
 * In:   %edi = base pointer; the 16 bytes at 24(%edi) are the input value v
 *              (presumably the clamped key written by poly1305_init -- confirm)
 *       %ebx = address of .Lconst_sse2; 64(%ebx) is used as the 26-bit mask
 * Out:  nine 16-byte rows written to 48..191(%edi); %edi is restored.
 * Uses: %ebp (keeps the caller's %esp, NOT restored), %ecx, %edx,
 *       %xmm0-%xmm7, flags; 224 bytes of 16-byte-aligned scratch stack.
 *
 * v is split into five 26-bit limbs.  The .L018square loop runs twice:
 * pass 1 multiplies v by itself, pass 2 multiplies the pair (v^2, v) by v^2,
 * each time folding the overflow of the top limb back in multiplied by 5
 * (x + 4x), i.e. arithmetic modulo 2^130-5 with partially reduced limbs.
 */
1351.align	32
1352.type	_poly1305_init_avx2,@function
1353.align	16
1354_poly1305_init_avx2:
1355	vmovdqu	24(%edi),%xmm4
1356	leal	48(%edi),%edi
1357	movl	%esp,%ebp
1358	subl	$224,%esp
1359	andl	$-16,%esp
/* Split the 128-bit input into limbs of 26 bits: xmm0..xmm4 = limb 0..4. */
1360	vmovdqa	64(%ebx),%xmm7
1361	vpand	%xmm7,%xmm4,%xmm0
1362	vpsrlq	$26,%xmm4,%xmm1
1363	vpsrldq	$6,%xmm4,%xmm3
1364	vpand	%xmm7,%xmm1,%xmm1
1365	vpsrlq	$4,%xmm3,%xmm2
1366	vpsrlq	$30,%xmm3,%xmm3
1367	vpand	%xmm7,%xmm2,%xmm2
1368	vpand	%xmm7,%xmm3,%xmm3
1369	vpsrldq	$13,%xmm4,%xmm4
1370	leal	144(%esp),%edx
1371	movl	$2,%ecx
1372.L018square:
/* Save the multiplicand limbs at 0..64(%esp). */
1373	vmovdqa	%xmm0,(%esp)
1374	vmovdqa	%xmm1,16(%esp)
1375	vmovdqa	%xmm2,32(%esp)
1376	vmovdqa	%xmm3,48(%esp)
1377	vmovdqa	%xmm4,64(%esp)
/* 80..128(%esp) = 5 * limb1..limb4 (computed as x + (x << 2)). */
1378	vpslld	$2,%xmm1,%xmm6
1379	vpslld	$2,%xmm2,%xmm5
1380	vpaddd	%xmm1,%xmm6,%xmm6
1381	vpaddd	%xmm2,%xmm5,%xmm5
1382	vmovdqa	%xmm6,80(%esp)
1383	vmovdqa	%xmm5,96(%esp)
1384	vpslld	$2,%xmm3,%xmm6
1385	vpslld	$2,%xmm4,%xmm5
1386	vpaddd	%xmm3,%xmm6,%xmm6
1387	vpaddd	%xmm4,%xmm5,%xmm5
1388	vmovdqa	%xmm6,112(%esp)
1389	vmovdqa	%xmm5,128(%esp)
/* 0..64(%edx) = low quadword of each limb broadcast to both halves (multiplier). */
1390	vpshufd	$68,%xmm0,%xmm5
1391	vmovdqa	%xmm1,%xmm6
1392	vpshufd	$68,%xmm1,%xmm1
1393	vpshufd	$68,%xmm2,%xmm2
1394	vpshufd	$68,%xmm3,%xmm3
1395	vpshufd	$68,%xmm4,%xmm4
1396	vmovdqa	%xmm5,(%edx)
1397	vmovdqa	%xmm1,16(%edx)
1398	vmovdqa	%xmm2,32(%edx)
1399	vmovdqa	%xmm3,48(%edx)
1400	vmovdqa	%xmm4,64(%edx)
/* Schoolbook 5x5 limb product; xmm0..xmm4 accumulate result limbs 0..4. */
1401	vpmuludq	%xmm0,%xmm4,%xmm4
1402	vpmuludq	%xmm0,%xmm3,%xmm3
1403	vpmuludq	%xmm0,%xmm2,%xmm2
1404	vpmuludq	%xmm0,%xmm1,%xmm1
1405	vpmuludq	%xmm0,%xmm5,%xmm0
1406	vpmuludq	48(%edx),%xmm6,%xmm5
1407	vpaddq	%xmm5,%xmm4,%xmm4
1408	vpmuludq	32(%edx),%xmm6,%xmm7
1409	vpaddq	%xmm7,%xmm3,%xmm3
1410	vpmuludq	16(%edx),%xmm6,%xmm5
1411	vpaddq	%xmm5,%xmm2,%xmm2
1412	vmovdqa	80(%esp),%xmm7
1413	vpmuludq	(%edx),%xmm6,%xmm6
1414	vpaddq	%xmm6,%xmm1,%xmm1
1415	vmovdqa	32(%esp),%xmm5
1416	vpmuludq	64(%edx),%xmm7,%xmm7
1417	vpaddq	%xmm7,%xmm0,%xmm0
1418	vpmuludq	32(%edx),%xmm5,%xmm6
1419	vpaddq	%xmm6,%xmm4,%xmm4
1420	vpmuludq	16(%edx),%xmm5,%xmm7
1421	vpaddq	%xmm7,%xmm3,%xmm3
1422	vmovdqa	96(%esp),%xmm6
1423	vpmuludq	(%edx),%xmm5,%xmm5
1424	vpaddq	%xmm5,%xmm2,%xmm2
1425	vpmuludq	64(%edx),%xmm6,%xmm7
1426	vpaddq	%xmm7,%xmm1,%xmm1
1427	vmovdqa	48(%esp),%xmm5
1428	vpmuludq	48(%edx),%xmm6,%xmm6
1429	vpaddq	%xmm6,%xmm0,%xmm0
1430	vpmuludq	16(%edx),%xmm5,%xmm7
1431	vpaddq	%xmm7,%xmm4,%xmm4
1432	vmovdqa	112(%esp),%xmm6
1433	vpmuludq	(%edx),%xmm5,%xmm5
1434	vpaddq	%xmm5,%xmm3,%xmm3
1435	vpmuludq	64(%edx),%xmm6,%xmm7
1436	vpaddq	%xmm7,%xmm2,%xmm2
1437	vpmuludq	48(%edx),%xmm6,%xmm5
1438	vpaddq	%xmm5,%xmm1,%xmm1
1439	vmovdqa	64(%esp),%xmm7
1440	vpmuludq	32(%edx),%xmm6,%xmm6
1441	vpaddq	%xmm6,%xmm0,%xmm0
1442	vmovdqa	128(%esp),%xmm5
1443	vpmuludq	(%edx),%xmm7,%xmm7
1444	vpaddq	%xmm7,%xmm4,%xmm4
1445	vpmuludq	64(%edx),%xmm5,%xmm6
1446	vpaddq	%xmm6,%xmm3,%xmm3
1447	vpmuludq	16(%edx),%xmm5,%xmm7
1448	vpaddq	%xmm7,%xmm0,%xmm0
1449	vpmuludq	32(%edx),%xmm5,%xmm6
1450	vpaddq	%xmm6,%xmm1,%xmm1
1451	vmovdqa	64(%ebx),%xmm7
1452	vpmuludq	48(%edx),%xmm5,%xmm5
1453	vpaddq	%xmm5,%xmm2,%xmm2
/*
 * Carry propagation back to 26-bit limbs: 3->4, 0->1, 4->0 (times 5),
 * 1->2, 2->3, then one more step 0->1 and 3->4.
 */
1454	vpsrlq	$26,%xmm3,%xmm5
1455	vpand	%xmm7,%xmm3,%xmm3
1456	vpsrlq	$26,%xmm0,%xmm6
1457	vpand	%xmm7,%xmm0,%xmm0
1458	vpaddq	%xmm5,%xmm4,%xmm4
1459	vpaddq	%xmm6,%xmm1,%xmm1
1460	vpsrlq	$26,%xmm4,%xmm5
1461	vpand	%xmm7,%xmm4,%xmm4
1462	vpsrlq	$26,%xmm1,%xmm6
1463	vpand	%xmm7,%xmm1,%xmm1
1464	vpaddq	%xmm6,%xmm2,%xmm2
1465	vpaddd	%xmm5,%xmm0,%xmm0
1466	vpsllq	$2,%xmm5,%xmm5
1467	vpsrlq	$26,%xmm2,%xmm6
1468	vpand	%xmm7,%xmm2,%xmm2
1469	vpaddd	%xmm5,%xmm0,%xmm0
1470	vpaddd	%xmm6,%xmm3,%xmm3
1471	vpsrlq	$26,%xmm3,%xmm6
1472	vpsrlq	$26,%xmm0,%xmm5
1473	vpand	%xmm7,%xmm0,%xmm0
1474	vpand	%xmm7,%xmm3,%xmm3
1475	vpaddd	%xmm5,%xmm1,%xmm1
1476	vpaddd	%xmm6,%xmm4,%xmm4
1477	decl	%ecx
1478	jz	.L019square_break
/* After pass 1: pair the new result (low qword) with the saved input (high qword). */
1479	vpunpcklqdq	(%esp),%xmm0,%xmm0
1480	vpunpcklqdq	16(%esp),%xmm1,%xmm1
1481	vpunpcklqdq	32(%esp),%xmm2,%xmm2
1482	vpunpcklqdq	48(%esp),%xmm3,%xmm3
1483	vpunpcklqdq	64(%esp),%xmm4,%xmm4
1484	jmp	.L018square
1485.L019square_break:
/*
 * Interleave the pass-2 results with the values saved at the start of pass 2
 * and reorder dwords (vpshufd $141), so each row holds one limb of the four
 * powers as v^4 : v^3 : v^2 : v (dword 0 first).
 */
1486	vpsllq	$32,%xmm0,%xmm0
1487	vpsllq	$32,%xmm1,%xmm1
1488	vpsllq	$32,%xmm2,%xmm2
1489	vpsllq	$32,%xmm3,%xmm3
1490	vpsllq	$32,%xmm4,%xmm4
1491	vpor	(%esp),%xmm0,%xmm0
1492	vpor	16(%esp),%xmm1,%xmm1
1493	vpor	32(%esp),%xmm2,%xmm2
1494	vpor	48(%esp),%xmm3,%xmm3
1495	vpor	64(%esp),%xmm4,%xmm4
1496	vpshufd	$141,%xmm0,%xmm0
1497	vpshufd	$141,%xmm1,%xmm1
1498	vpshufd	$141,%xmm2,%xmm2
1499	vpshufd	$141,%xmm3,%xmm3
1500	vpshufd	$141,%xmm4,%xmm4
/* Rows 0..4: limbs 0..4 of the powers. */
1501	vmovdqu	%xmm0,(%edi)
1502	vmovdqu	%xmm1,16(%edi)
1503	vmovdqu	%xmm2,32(%edi)
1504	vmovdqu	%xmm3,48(%edi)
1505	vmovdqu	%xmm4,64(%edi)
/* Rows 5..8: 5 * limbs 1..4 of the powers. */
1506	vpslld	$2,%xmm1,%xmm6
1507	vpslld	$2,%xmm2,%xmm5
1508	vpaddd	%xmm1,%xmm6,%xmm6
1509	vpaddd	%xmm2,%xmm5,%xmm5
1510	vmovdqu	%xmm6,80(%edi)
1511	vmovdqu	%xmm5,96(%edi)
1512	vpslld	$2,%xmm3,%xmm6
1513	vpslld	$2,%xmm4,%xmm5
1514	vpaddd	%xmm3,%xmm6,%xmm6
1515	vpaddd	%xmm4,%xmm5,%xmm5
1516	vmovdqu	%xmm6,112(%edi)
1517	vmovdqu	%xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 applied to %edi on entry. */
1518	movl	%ebp,%esp
1519	leal	-48(%edi),%edi
1520	ret
1521.size	_poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2 -- file-local (no .globl); its address is stored by
 * poly1305_init as the "blocks" function pointer when the AVX2 path is chosen.
 *
 * Stack arguments (i386, read at 20/24/28/32(%esp) after the four pushes):
 *   arg0 -> %edi  state pointer (presumably the Poly1305 context)
 *   arg1 -> %esi  input pointer
 *   arg2 -> %ecx  byte count, rounded down to a multiple of 16
 *   arg3 -> %eax  word that is negated and used to pick the pad row for a
 *                 single trailing block (presumably padbit, 0 or 1 -- confirm)
 *
 * State layout as used here:
 *   0..16(%edi)   accumulator: five 32-bit words (base 2^32) while
 *                 20(%edi) == 0, five 26-bit limbs once 20(%edi) != 0
 *   20(%edi)      representation flag, set to 1 here after conversion
 *   48..191(%edi) power table filled by _poly1305_init_avx2
 *
 * If fewer than 64 bytes are requested and the flag is still 0, control is
 * handed to .Lenter_blocks (the scalar poly1305_blocks body) with the
 * registers and stack frame set up identically.
 */
1522.align	32
1523.type	_poly1305_blocks_avx2,@function
1524.align	16
1525_poly1305_blocks_avx2:
1526	pushl	%ebp
1527	pushl	%ebx
1528	pushl	%esi
1529	pushl	%edi
1530	movl	20(%esp),%edi
1531	movl	24(%esp),%esi
1532	movl	28(%esp),%ecx
1533	movl	20(%edi),%eax
1534	andl	$-16,%ecx
1535	jz	.L020nodata
1536	cmpl	$64,%ecx
1537	jae	.L021enter_avx2
1538	testl	%eax,%eax
1539	jz	.Lenter_blocks
1540.L021enter_avx2:
1541	vzeroupper
/* call/pop yields the current address; %ebx = &.Lconst_sse2 without relocations. */
1542	call	.L022pic_point
1543.L022pic_point:
1544	popl	%ebx
1545	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
1546	testl	%eax,%eax
1547	jnz	.L023base2_26
/* First vector use: build the power table, then convert the accumulator. */
1548	call	_poly1305_init_avx2
/*
 * Unaligned 32-bit loads at byte offsets 0,3,6,9,13 followed by shifts of
 * 0,2,4,6,0 extract bits 0-25, 26-51, 52-77, 78-103 and 104+ of the
 * accumulator, i.e. re-express it as five 26-bit limbs.
 */
1549	movl	(%edi),%eax
1550	movl	3(%edi),%ecx
1551	movl	6(%edi),%edx
1552	movl	9(%edi),%esi
1553	movl	13(%edi),%ebp
1554	shrl	$2,%ecx
1555	andl	$67108863,%eax
1556	shrl	$4,%edx
1557	andl	$67108863,%ecx
1558	shrl	$6,%esi
1559	andl	$67108863,%edx
1560	movl	%eax,(%edi)
1561	movl	%ecx,4(%edi)
1562	movl	%edx,8(%edi)
1563	movl	%esi,12(%edi)
1564	movl	%ebp,16(%edi)
1565	movl	$1,20(%edi)
/* %esi/%ecx were used as scratch above: reload input pointer and length. */
1566	movl	24(%esp),%esi
1567	movl	28(%esp),%ecx
1568.L023base2_26:
1569	movl	32(%esp),%eax
/* %ebp keeps the old %esp; 448-byte frame, then %esp rounded down to a 512-byte boundary. */
1570	movl	%esp,%ebp
1571	subl	$448,%esp
1572	andl	$-512,%esp
/*
 * Expand the nine 16-byte table rows into 32-byte rows at
 * -128(%edx)..128(%edx), with %edx = 288(%esp).
 */
1573	vmovdqu	48(%edi),%xmm0
1574	leal	288(%esp),%edx
1575	vmovdqu	64(%edi),%xmm1
1576	vmovdqu	80(%edi),%xmm2
1577	vmovdqu	96(%edi),%xmm3
1578	vmovdqu	112(%edi),%xmm4
1579	leal	48(%edi),%edi
1580	vpermq	$64,%ymm0,%ymm0
1581	vpermq	$64,%ymm1,%ymm1
1582	vpermq	$64,%ymm2,%ymm2
1583	vpermq	$64,%ymm3,%ymm3
1584	vpermq	$64,%ymm4,%ymm4
1585	vpshufd	$200,%ymm0,%ymm0
1586	vpshufd	$200,%ymm1,%ymm1
1587	vpshufd	$200,%ymm2,%ymm2
1588	vpshufd	$200,%ymm3,%ymm3
1589	vpshufd	$200,%ymm4,%ymm4
1590	vmovdqa	%ymm0,-128(%edx)
1591	vmovdqu	80(%edi),%xmm0
1592	vmovdqa	%ymm1,-96(%edx)
1593	vmovdqu	96(%edi),%xmm1
1594	vmovdqa	%ymm2,-64(%edx)
1595	vmovdqu	112(%edi),%xmm2
1596	vmovdqa	%ymm3,-32(%edx)
1597	vmovdqu	128(%edi),%xmm3
1598	vmovdqa	%ymm4,(%edx)
1599	vpermq	$64,%ymm0,%ymm0
1600	vpermq	$64,%ymm1,%ymm1
1601	vpermq	$64,%ymm2,%ymm2
1602	vpermq	$64,%ymm3,%ymm3
1603	vpshufd	$200,%ymm0,%ymm0
1604	vpshufd	$200,%ymm1,%ymm1
1605	vpshufd	$200,%ymm2,%ymm2
1606	vpshufd	$200,%ymm3,%ymm3
1607	vmovdqa	%ymm0,32(%edx)
/* Load the five accumulator limbs (%edi was advanced by 48 above). */
1608	vmovd	-48(%edi),%xmm0
1609	vmovdqa	%ymm1,64(%edx)
1610	vmovd	-44(%edi),%xmm1
1611	vmovdqa	%ymm2,96(%edx)
1612	vmovd	-40(%edi),%xmm2
1613	vmovdqa	%ymm3,128(%edx)
1614	vmovd	-36(%edi),%xmm3
1615	vmovd	-32(%edi),%xmm4
1616	vmovdqa	64(%ebx),%ymm7
1617	negl	%eax
/*
 * If the length is not a multiple of 64, the 1, 2 or 3 odd blocks are run
 * through .L027tail first.  %ebx and %edx are offset so that the tail sees a
 * shifted window of the pad constants and of the power table.
 */
1618	testl	$63,%ecx
1619	jz	.L024even
1620	movl	%ecx,%edx
1621	andl	$-64,%ecx
1622	andl	$63,%edx
1623	vmovdqu	(%esi),%xmm5
1624	cmpl	$32,%edx
1625	jb	.L025one
1626	vmovdqu	16(%esi),%xmm6
/* Flags are still those of the cmpl above (vmovdqu does not modify them). */
1627	je	.L026two
1628	vinserti128	$1,32(%esi),%ymm5,%ymm5
1629	leal	48(%esi),%esi
1630	leal	8(%ebx),%ebx
1631	leal	296(%esp),%edx
1632	jmp	.L027tail
1633.L026two:
1634	leal	32(%esi),%esi
1635	leal	16(%ebx),%ebx
1636	leal	304(%esp),%edx
1637	jmp	.L027tail
1638.L025one:
1639	leal	16(%esi),%esi
1640	vpxor	%ymm6,%ymm6,%ymm6
1641	leal	32(%ebx,%eax,8),%ebx
1642	leal	312(%esp),%edx
1643	jmp	.L027tail
1644.align	32
1645.L024even:
/* Load 64 input bytes: ymm5 = blocks 0 and 2, ymm6 = blocks 1 and 3. */
1646	vmovdqu	(%esi),%xmm5
1647	vmovdqu	16(%esi),%xmm6
1648	vinserti128	$1,32(%esi),%ymm5,%ymm5
1649	vinserti128	$1,48(%esi),%ymm6,%ymm6
1650	leal	64(%esi),%esi
1651	subl	$64,%ecx
1652	jz	.L027tail
1653.L028loop:
/*
 * Per iteration: split the loaded 64 bytes into 26-bit limbs, OR in the
 * constant at (%ebx) on the top limb, add the accumulator, multiply by the
 * table rows at -128..128(%edx), propagate carries, and load the next
 * 64 bytes.  %ecx counts the bytes still to be loaded.
 */
1654	vmovdqa	%ymm2,64(%esp)
1655	vpsrldq	$6,%ymm5,%ymm2
1656	vmovdqa	%ymm0,(%esp)
1657	vpsrldq	$6,%ymm6,%ymm0
1658	vmovdqa	%ymm1,32(%esp)
1659	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1660	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1661	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1662	vpsrlq	$30,%ymm2,%ymm0
1663	vpsrlq	$4,%ymm2,%ymm2
1664	vpsrlq	$26,%ymm5,%ymm6
1665	vpsrlq	$40,%ymm1,%ymm1
1666	vpand	%ymm7,%ymm2,%ymm2
1667	vpand	%ymm7,%ymm5,%ymm5
1668	vpand	%ymm7,%ymm6,%ymm6
1669	vpand	%ymm7,%ymm0,%ymm0
1670	vpor	(%ebx),%ymm1,%ymm1
1671	vpaddq	64(%esp),%ymm2,%ymm2
1672	vpaddq	(%esp),%ymm5,%ymm5
1673	vpaddq	32(%esp),%ymm6,%ymm6
1674	vpaddq	%ymm3,%ymm0,%ymm0
1675	vpaddq	%ymm4,%ymm1,%ymm1
1676	vpmuludq	-96(%edx),%ymm2,%ymm3
1677	vmovdqa	%ymm6,32(%esp)
1678	vpmuludq	-64(%edx),%ymm2,%ymm4
1679	vmovdqa	%ymm0,96(%esp)
1680	vpmuludq	96(%edx),%ymm2,%ymm0
1681	vmovdqa	%ymm1,128(%esp)
1682	vpmuludq	128(%edx),%ymm2,%ymm1
1683	vpmuludq	-128(%edx),%ymm2,%ymm2
1684	vpmuludq	-32(%edx),%ymm5,%ymm7
1685	vpaddq	%ymm7,%ymm3,%ymm3
1686	vpmuludq	(%edx),%ymm5,%ymm6
1687	vpaddq	%ymm6,%ymm4,%ymm4
1688	vpmuludq	-128(%edx),%ymm5,%ymm7
1689	vpaddq	%ymm7,%ymm0,%ymm0
1690	vmovdqa	32(%esp),%ymm7
1691	vpmuludq	-96(%edx),%ymm5,%ymm6
1692	vpaddq	%ymm6,%ymm1,%ymm1
1693	vpmuludq	-64(%edx),%ymm5,%ymm5
1694	vpaddq	%ymm5,%ymm2,%ymm2
1695	vpmuludq	-64(%edx),%ymm7,%ymm6
1696	vpaddq	%ymm6,%ymm3,%ymm3
1697	vpmuludq	-32(%edx),%ymm7,%ymm5
1698	vpaddq	%ymm5,%ymm4,%ymm4
1699	vpmuludq	128(%edx),%ymm7,%ymm6
1700	vpaddq	%ymm6,%ymm0,%ymm0
1701	vmovdqa	96(%esp),%ymm6
1702	vpmuludq	-128(%edx),%ymm7,%ymm5
1703	vpaddq	%ymm5,%ymm1,%ymm1
1704	vpmuludq	-96(%edx),%ymm7,%ymm7
1705	vpaddq	%ymm7,%ymm2,%ymm2
1706	vpmuludq	-128(%edx),%ymm6,%ymm5
1707	vpaddq	%ymm5,%ymm3,%ymm3
1708	vpmuludq	-96(%edx),%ymm6,%ymm7
1709	vpaddq	%ymm7,%ymm4,%ymm4
1710	vpmuludq	64(%edx),%ymm6,%ymm5
1711	vpaddq	%ymm5,%ymm0,%ymm0
1712	vmovdqa	128(%esp),%ymm5
1713	vpmuludq	96(%edx),%ymm6,%ymm7
1714	vpaddq	%ymm7,%ymm1,%ymm1
1715	vpmuludq	128(%edx),%ymm6,%ymm6
1716	vpaddq	%ymm6,%ymm2,%ymm2
1717	vpmuludq	128(%edx),%ymm5,%ymm7
1718	vpaddq	%ymm7,%ymm3,%ymm3
1719	vpmuludq	32(%edx),%ymm5,%ymm6
1720	vpaddq	%ymm6,%ymm0,%ymm0
1721	vpmuludq	-128(%edx),%ymm5,%ymm7
1722	vpaddq	%ymm7,%ymm4,%ymm4
/* Reload the 26-bit mask (ymm7 was used as a temporary above). */
1723	vmovdqa	64(%ebx),%ymm7
1724	vpmuludq	64(%edx),%ymm5,%ymm6
1725	vpaddq	%ymm6,%ymm1,%ymm1
1726	vpmuludq	96(%edx),%ymm5,%ymm5
1727	vpaddq	%ymm5,%ymm2,%ymm2
/* Carry propagation; the carry out of ymm4 is added to ymm0 times 5 (x + 4x). */
1728	vpsrlq	$26,%ymm3,%ymm5
1729	vpand	%ymm7,%ymm3,%ymm3
1730	vpsrlq	$26,%ymm0,%ymm6
1731	vpand	%ymm7,%ymm0,%ymm0
1732	vpaddq	%ymm5,%ymm4,%ymm4
1733	vpaddq	%ymm6,%ymm1,%ymm1
1734	vpsrlq	$26,%ymm4,%ymm5
1735	vpand	%ymm7,%ymm4,%ymm4
1736	vpsrlq	$26,%ymm1,%ymm6
1737	vpand	%ymm7,%ymm1,%ymm1
1738	vpaddq	%ymm6,%ymm2,%ymm2
1739	vpaddq	%ymm5,%ymm0,%ymm0
1740	vpsllq	$2,%ymm5,%ymm5
1741	vpsrlq	$26,%ymm2,%ymm6
1742	vpand	%ymm7,%ymm2,%ymm2
1743	vpaddq	%ymm5,%ymm0,%ymm0
1744	vpaddq	%ymm6,%ymm3,%ymm3
1745	vpsrlq	$26,%ymm3,%ymm6
1746	vpsrlq	$26,%ymm0,%ymm5
1747	vpand	%ymm7,%ymm0,%ymm0
1748	vpand	%ymm7,%ymm3,%ymm3
1749	vpaddq	%ymm5,%ymm1,%ymm1
1750	vpaddq	%ymm6,%ymm4,%ymm4
1751	vmovdqu	(%esi),%xmm5
1752	vmovdqu	16(%esi),%xmm6
1753	vinserti128	$1,32(%esi),%ymm5,%ymm5
1754	vinserti128	$1,48(%esi),%ymm6,%ymm6
1755	leal	64(%esi),%esi
1756	subl	$64,%ecx
1757	jnz	.L028loop
1758.L027tail:
/*
 * Same limb split and multiply as the loop, but the table is read 4 bytes
 * further on (relative to whatever %edx the caller of the tail selected),
 * and afterwards the lanes are summed horizontally into lane 0.
 */
1759	vmovdqa	%ymm2,64(%esp)
1760	vpsrldq	$6,%ymm5,%ymm2
1761	vmovdqa	%ymm0,(%esp)
1762	vpsrldq	$6,%ymm6,%ymm0
1763	vmovdqa	%ymm1,32(%esp)
1764	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1765	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1766	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1767	vpsrlq	$30,%ymm2,%ymm0
1768	vpsrlq	$4,%ymm2,%ymm2
1769	vpsrlq	$26,%ymm5,%ymm6
1770	vpsrlq	$40,%ymm1,%ymm1
1771	vpand	%ymm7,%ymm2,%ymm2
1772	vpand	%ymm7,%ymm5,%ymm5
1773	vpand	%ymm7,%ymm6,%ymm6
1774	vpand	%ymm7,%ymm0,%ymm0
1775	vpor	(%ebx),%ymm1,%ymm1
/* Undo any odd-block offset added to %ebx (.Lconst_sse2 is 64-byte aligned). */
1776	andl	$-64,%ebx
1777	vpaddq	64(%esp),%ymm2,%ymm2
1778	vpaddq	(%esp),%ymm5,%ymm5
1779	vpaddq	32(%esp),%ymm6,%ymm6
1780	vpaddq	%ymm3,%ymm0,%ymm0
1781	vpaddq	%ymm4,%ymm1,%ymm1
1782	vpmuludq	-92(%edx),%ymm2,%ymm3
1783	vmovdqa	%ymm6,32(%esp)
1784	vpmuludq	-60(%edx),%ymm2,%ymm4
1785	vmovdqa	%ymm0,96(%esp)
1786	vpmuludq	100(%edx),%ymm2,%ymm0
1787	vmovdqa	%ymm1,128(%esp)
1788	vpmuludq	132(%edx),%ymm2,%ymm1
1789	vpmuludq	-124(%edx),%ymm2,%ymm2
1790	vpmuludq	-28(%edx),%ymm5,%ymm7
1791	vpaddq	%ymm7,%ymm3,%ymm3
1792	vpmuludq	4(%edx),%ymm5,%ymm6
1793	vpaddq	%ymm6,%ymm4,%ymm4
1794	vpmuludq	-124(%edx),%ymm5,%ymm7
1795	vpaddq	%ymm7,%ymm0,%ymm0
1796	vmovdqa	32(%esp),%ymm7
1797	vpmuludq	-92(%edx),%ymm5,%ymm6
1798	vpaddq	%ymm6,%ymm1,%ymm1
1799	vpmuludq	-60(%edx),%ymm5,%ymm5
1800	vpaddq	%ymm5,%ymm2,%ymm2
1801	vpmuludq	-60(%edx),%ymm7,%ymm6
1802	vpaddq	%ymm6,%ymm3,%ymm3
1803	vpmuludq	-28(%edx),%ymm7,%ymm5
1804	vpaddq	%ymm5,%ymm4,%ymm4
1805	vpmuludq	132(%edx),%ymm7,%ymm6
1806	vpaddq	%ymm6,%ymm0,%ymm0
1807	vmovdqa	96(%esp),%ymm6
1808	vpmuludq	-124(%edx),%ymm7,%ymm5
1809	vpaddq	%ymm5,%ymm1,%ymm1
1810	vpmuludq	-92(%edx),%ymm7,%ymm7
1811	vpaddq	%ymm7,%ymm2,%ymm2
1812	vpmuludq	-124(%edx),%ymm6,%ymm5
1813	vpaddq	%ymm5,%ymm3,%ymm3
1814	vpmuludq	-92(%edx),%ymm6,%ymm7
1815	vpaddq	%ymm7,%ymm4,%ymm4
1816	vpmuludq	68(%edx),%ymm6,%ymm5
1817	vpaddq	%ymm5,%ymm0,%ymm0
1818	vmovdqa	128(%esp),%ymm5
1819	vpmuludq	100(%edx),%ymm6,%ymm7
1820	vpaddq	%ymm7,%ymm1,%ymm1
1821	vpmuludq	132(%edx),%ymm6,%ymm6
1822	vpaddq	%ymm6,%ymm2,%ymm2
1823	vpmuludq	132(%edx),%ymm5,%ymm7
1824	vpaddq	%ymm7,%ymm3,%ymm3
1825	vpmuludq	36(%edx),%ymm5,%ymm6
1826	vpaddq	%ymm6,%ymm0,%ymm0
1827	vpmuludq	-124(%edx),%ymm5,%ymm7
1828	vpaddq	%ymm7,%ymm4,%ymm4
1829	vmovdqa	64(%ebx),%ymm7
1830	vpmuludq	68(%edx),%ymm5,%ymm6
1831	vpaddq	%ymm6,%ymm1,%ymm1
1832	vpmuludq	100(%edx),%ymm5,%ymm5
1833	vpaddq	%ymm5,%ymm2,%ymm2
/* Horizontal add: fold the upper qword of each 128-bit half, then the upper half. */
1834	vpsrldq	$8,%ymm4,%ymm5
1835	vpsrldq	$8,%ymm3,%ymm6
1836	vpaddq	%ymm5,%ymm4,%ymm4
1837	vpsrldq	$8,%ymm0,%ymm5
1838	vpaddq	%ymm6,%ymm3,%ymm3
1839	vpsrldq	$8,%ymm1,%ymm6
1840	vpaddq	%ymm5,%ymm0,%ymm0
1841	vpsrldq	$8,%ymm2,%ymm5
1842	vpaddq	%ymm6,%ymm1,%ymm1
1843	vpermq	$2,%ymm4,%ymm6
1844	vpaddq	%ymm5,%ymm2,%ymm2
1845	vpermq	$2,%ymm3,%ymm5
1846	vpaddq	%ymm6,%ymm4,%ymm4
1847	vpermq	$2,%ymm0,%ymm6
1848	vpaddq	%ymm5,%ymm3,%ymm3
1849	vpermq	$2,%ymm1,%ymm5
1850	vpaddq	%ymm6,%ymm0,%ymm0
1851	vpermq	$2,%ymm2,%ymm6
1852	vpaddq	%ymm5,%ymm1,%ymm1
1853	vpaddq	%ymm6,%ymm2,%ymm2
/* Carry propagation, same sequence as in the loop. */
1854	vpsrlq	$26,%ymm3,%ymm5
1855	vpand	%ymm7,%ymm3,%ymm3
1856	vpsrlq	$26,%ymm0,%ymm6
1857	vpand	%ymm7,%ymm0,%ymm0
1858	vpaddq	%ymm5,%ymm4,%ymm4
1859	vpaddq	%ymm6,%ymm1,%ymm1
1860	vpsrlq	$26,%ymm4,%ymm5
1861	vpand	%ymm7,%ymm4,%ymm4
1862	vpsrlq	$26,%ymm1,%ymm6
1863	vpand	%ymm7,%ymm1,%ymm1
1864	vpaddq	%ymm6,%ymm2,%ymm2
1865	vpaddq	%ymm5,%ymm0,%ymm0
1866	vpsllq	$2,%ymm5,%ymm5
1867	vpsrlq	$26,%ymm2,%ymm6
1868	vpand	%ymm7,%ymm2,%ymm2
1869	vpaddq	%ymm5,%ymm0,%ymm0
1870	vpaddq	%ymm6,%ymm3,%ymm3
1871	vpsrlq	$26,%ymm3,%ymm6
1872	vpsrlq	$26,%ymm0,%ymm5
1873	vpand	%ymm7,%ymm0,%ymm0
1874	vpand	%ymm7,%ymm3,%ymm3
1875	vpaddq	%ymm5,%ymm1,%ymm1
1876	vpaddq	%ymm6,%ymm4,%ymm4
/*
 * %ecx != 0 here only when the tail was entered for the odd leading blocks:
 * keep the accumulator in lane 0, restore the default table pointer and
 * process the remaining multiple of 64 bytes.
 */
1877	cmpl	$0,%ecx
1878	je	.L029done
1879	vpshufd	$252,%xmm0,%xmm0
1880	leal	288(%esp),%edx
1881	vpshufd	$252,%xmm1,%xmm1
1882	vpshufd	$252,%xmm2,%xmm2
1883	vpshufd	$252,%xmm3,%xmm3
1884	vpshufd	$252,%xmm4,%xmm4
1885	jmp	.L024even
1886.align	16
1887.L029done:
/* Store the five accumulator limbs back to 0..16 of the state. */
1888	vmovd	%xmm0,-48(%edi)
1889	vmovd	%xmm1,-44(%edi)
1890	vmovd	%xmm2,-40(%edi)
1891	vmovd	%xmm3,-36(%edi)
1892	vmovd	%xmm4,-32(%edi)
1893	vzeroupper
1894	movl	%ebp,%esp
1895.L020nodata:
1896	popl	%edi
1897	popl	%esi
1898	popl	%ebx
1899	popl	%ebp
1900	ret
1901.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant pool shared by the vector code paths; callers keep its address in
 * %ebx.  It is 64-byte aligned, which the AVX2 tail relies on when it resets
 * %ebx with "andl $-64,%ebx".
 *
 *   +0   four qwords of 1<<24: OR-ed into the top limb of each message block
 *   +32  four zero qwords: the AVX2 odd-block paths read (%ebx) at +8, +16 or
 *        +24/+32, so some or all lanes see zero instead of 1<<24
 *   +64  four qwords of 0x3ffffff: the 26-bit limb mask
 *   +96  0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc: the same values that
 *        poly1305_init applies to the key words as immediates
 */
1902.align	64
1903.Lconst_sse2:
1904.long	16777216,0,16777216,0,16777216,0,16777216,0
1905.long	0,0,0,0,0,0,0,0
1906.long	67108863,0,67108863,0,67108863,0,67108863,0
1907.long	268435455,268435452,268435452,268435452
/* NUL-terminated ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" */
1908.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1909.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1910.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1911.byte	114,103,62,0
1912.align	4
/* Common symbol, 16 bytes, 4-byte aligned: the capability words read by poly1305_init. */
1913.comm	OPENSSL_ia32cap_P,16,4
1914#else
/*
 * poly1305_init (non-PIC build) -- exported, i386 stack-argument convention.
 *
 *   arg0 (20(%esp)) -> %edi  state pointer
 *   arg1 (24(%esp)) -> %esi  pointer to 16 key bytes, may be NULL
 *   arg2 (28(%esp)) -> %ebp  pointer to two 32-bit slots receiving function
 *                            addresses: [0] = blocks routine, [1] = emit routine
 *
 * Always zeroes the 24 bytes at 0..23 of the state.  With a NULL key it
 * returns 0 (the %eax left by xorl) and writes nothing else.  Otherwise it
 * stores the selected routines through arg2, stores the masked key at
 * 24..39 of the state and returns 1.
 *
 * Differs from the PIC variant only in addressing OPENSSL_ia32cap_P
 * absolutely; code labels are still resolved relative to .L001pic_point.
 */
1915.text
1916.align	64
1917.globl	poly1305_init
1918.type	poly1305_init,@function
1919.align	16
1920poly1305_init:
1921.L_poly1305_init_begin:
1922	pushl	%ebp
1923	pushl	%ebx
1924	pushl	%esi
1925	pushl	%edi
1926	movl	20(%esp),%edi
1927	movl	24(%esp),%esi
1928	movl	28(%esp),%ebp
1929	xorl	%eax,%eax
1930	movl	%eax,(%edi)
1931	movl	%eax,4(%edi)
1932	movl	%eax,8(%edi)
1933	movl	%eax,12(%edi)
1934	movl	%eax,16(%edi)
1935	movl	%eax,20(%edi)
1936	cmpl	$0,%esi
1937	je	.L000nokey
/* call/pop: %ebx = run-time address of .L001pic_point. */
1938	call	.L001pic_point
1939.L001pic_point:
1940	popl	%ebx
/* Default choice: the scalar routines. */
1941	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
1942	leal	poly1305_emit-.L001pic_point(%ebx),%edx
1943	leal	OPENSSL_ia32cap_P,%edi
/*
 * 83886080 = 0x05000000: bits 24 and 26 of capability word 0 must both be
 * set for the SSE2 routines (presumably the CPUID FXSR and SSE2 flags --
 * confirm against the OPENSSL_ia32cap documentation).
 */
1944	movl	(%edi),%ecx
1945	andl	$83886080,%ecx
1946	cmpl	$83886080,%ecx
1947	jne	.L002no_sse2
1948	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
1949	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
/*
 * Bit 5 of capability word 2 (byte offset 8) upgrades only the blocks
 * routine to the AVX2 version (presumably the CPUID AVX2 flag -- confirm);
 * the emit routine stays _poly1305_emit_sse2.
 */
1950	movl	8(%edi),%ecx
1951	testl	$32,%ecx
1952	jz	.L002no_sse2
1953	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
1954.L002no_sse2:
/* %edi was reused for the capability pointer: reload the state pointer. */
1955	movl	20(%esp),%edi
1956	movl	%eax,(%ebp)
1957	movl	%edx,4(%ebp)
/* Mask the four key words with 0x0fffffff / 0x0ffffffc and store them. */
1958	movl	(%esi),%eax
1959	movl	4(%esi),%ebx
1960	movl	8(%esi),%ecx
1961	movl	12(%esi),%edx
1962	andl	$268435455,%eax
1963	andl	$268435452,%ebx
1964	andl	$268435452,%ecx
1965	andl	$268435452,%edx
1966	movl	%eax,24(%edi)
1967	movl	%ebx,28(%edi)
1968	movl	%ecx,32(%edi)
1969	movl	%edx,36(%edi)
1970	movl	$1,%eax
1971.L000nokey:
1972	popl	%edi
1973	popl	%esi
1974	popl	%ebx
1975	popl	%ebp
1976	ret
1977.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks (non-PIC build) -- exported scalar block routine,
 * i386 stack-argument convention.
 *
 *   arg0 (20(%esp)) -> %edi  state pointer: accumulator words h0..h4 at
 *                            0..16, multiplier words r0..r3 at 24..36
 *   arg1 (24(%esp)) -> %esi  input pointer
 *   arg2 (28(%esp)) -> %ecx  byte count, rounded down to a multiple of 16
 *   arg3 (32(%esp))          word added (with carry) on top of each block's
 *                            fifth word (presumably padbit -- confirm)
 *
 * For every 16-byte block: h += block (+ arg3 above bit 128), then
 * h = h * r with the part above bit 130 folded back in multiplied by 5.
 *
 * .Lenter_blocks is also the entry used by the vector routines when they
 * fall back to this code; they arrive with the same registers and the same
 * four pushes in place.
 *
 * Frame after "subl $64,%esp":
 *    0..16(%esp)  h0,(unused),(unused),h3,h4 of the current sum
 *   20..28(%esp)  result words d0..d2
 *   36..48(%esp)  r0..r3
 *   52..60(%esp)  s1..s3 = r1..r3 + (r1..r3 >> 2)
 *   84(%esp) arg0, 92(%esp) arg2 slot reused for the end pointer,
 *   96(%esp) arg3
 */
1978.globl	poly1305_blocks
1979.type	poly1305_blocks,@function
1980.align	16
1981poly1305_blocks:
1982.L_poly1305_blocks_begin:
1983	pushl	%ebp
1984	pushl	%ebx
1985	pushl	%esi
1986	pushl	%edi
1987	movl	20(%esp),%edi
1988	movl	24(%esp),%esi
1989	movl	28(%esp),%ecx
1990.Lenter_blocks:
/*
 * Round the length down to a whole number of 16-byte blocks.  The mask used
 * to be $-15, which keeps bit 0: with an odd length the end pointer computed
 * below was never reached by the 16-byte stride and the loop ran past the
 * buffer.  $-16 matches the masking done by the SSE2/AVX2 entry points.
 */
1991	andl	$-16,%ecx
1992	jz	.L003nodata
1993	subl	$64,%esp
1994	movl	24(%edi),%eax
1995	movl	28(%edi),%ebx
/* %ebp = inp + len: loop end pointer, parked in the arg2 slot. */
1996	leal	(%esi,%ecx,1),%ebp
1997	movl	32(%edi),%ecx
1998	movl	36(%edi),%edx
1999	movl	%ebp,92(%esp)
2000	movl	%esi,%ebp
2001	movl	%eax,36(%esp)
2002	movl	%ebx,%eax
2003	shrl	$2,%eax
2004	movl	%ebx,40(%esp)
2005	addl	%ebx,%eax
2006	movl	%ecx,%ebx
2007	shrl	$2,%ebx
2008	movl	%ecx,44(%esp)
2009	addl	%ecx,%ebx
2010	movl	%edx,%ecx
2011	shrl	$2,%ecx
2012	movl	%edx,48(%esp)
2013	addl	%edx,%ecx
2014	movl	%eax,52(%esp)
2015	movl	%ebx,56(%esp)
2016	movl	%ecx,60(%esp)
/* Load h0..h4 into %eax,%ebx,%ecx,%esi,%edi (the state pointer is reloaded from 84(%esp) later). */
2017	movl	(%edi),%eax
2018	movl	4(%edi),%ebx
2019	movl	8(%edi),%ecx
2020	movl	12(%edi),%esi
2021	movl	16(%edi),%edi
2022	jmp	.L004loop
2023.align	32
2024.L004loop:
/* h += block; the carry chain ends in h4 += arg3 + carry. */
2025	addl	(%ebp),%eax
2026	adcl	4(%ebp),%ebx
2027	adcl	8(%ebp),%ecx
2028	adcl	12(%ebp),%esi
2029	leal	16(%ebp),%ebp
2030	adcl	96(%esp),%edi
2031	movl	%eax,(%esp)
2032	movl	%esi,12(%esp)
/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1 (low word kept at 20(%esp)). */
2033	mull	36(%esp)
2034	movl	%edi,16(%esp)
2035	movl	%eax,%edi
2036	movl	%ebx,%eax
2037	movl	%edx,%esi
2038	mull	60(%esp)
2039	addl	%eax,%edi
2040	movl	%ecx,%eax
2041	adcl	%edx,%esi
2042	mull	56(%esp)
2043	addl	%eax,%edi
2044	movl	12(%esp),%eax
2045	adcl	%edx,%esi
2046	mull	52(%esp)
2047	addl	%eax,%edi
2048	movl	(%esp),%eax
2049	adcl	%edx,%esi
/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 (kept at 24(%esp)). */
2050	mull	40(%esp)
2051	movl	%edi,20(%esp)
2052	xorl	%edi,%edi
2053	addl	%eax,%esi
2054	movl	%ebx,%eax
2055	adcl	%edx,%edi
2056	mull	36(%esp)
2057	addl	%eax,%esi
2058	movl	%ecx,%eax
2059	adcl	%edx,%edi
2060	mull	60(%esp)
2061	addl	%eax,%esi
2062	movl	12(%esp),%eax
2063	adcl	%edx,%edi
2064	mull	56(%esp)
2065	addl	%eax,%esi
2066	movl	16(%esp),%eax
2067	adcl	%edx,%edi
2068	imull	52(%esp),%eax
2069	addl	%eax,%esi
2070	movl	(%esp),%eax
2071	adcl	$0,%edi
/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2 (kept at 28(%esp)). */
2072	mull	44(%esp)
2073	movl	%esi,24(%esp)
2074	xorl	%esi,%esi
2075	addl	%eax,%edi
2076	movl	%ebx,%eax
2077	adcl	%edx,%esi
2078	mull	40(%esp)
2079	addl	%eax,%edi
2080	movl	%ecx,%eax
2081	adcl	%edx,%esi
2082	mull	36(%esp)
2083	addl	%eax,%edi
2084	movl	12(%esp),%eax
2085	adcl	%edx,%esi
2086	mull	60(%esp)
2087	addl	%eax,%edi
2088	movl	16(%esp),%eax
2089	adcl	%edx,%esi
2090	imull	56(%esp),%eax
2091	addl	%eax,%edi
2092	movl	(%esp),%eax
2093	adcl	$0,%esi
/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3 (stays in %esi). */
2094	mull	48(%esp)
2095	movl	%edi,28(%esp)
2096	xorl	%edi,%edi
2097	addl	%eax,%esi
2098	movl	%ebx,%eax
2099	adcl	%edx,%edi
2100	mull	44(%esp)
2101	addl	%eax,%esi
2102	movl	%ecx,%eax
2103	adcl	%edx,%edi
2104	mull	40(%esp)
2105	addl	%eax,%esi
2106	movl	12(%esp),%eax
2107	adcl	%edx,%edi
2108	mull	36(%esp)
2109	addl	%eax,%esi
2110	movl	16(%esp),%ecx
2111	adcl	%edx,%edi
2112	movl	%ecx,%edx
2113	imull	60(%esp),%ecx
2114	addl	%ecx,%esi
2115	movl	20(%esp),%eax
2116	adcl	$0,%edi
/* d4 = carry + h4*r0. */
2117	imull	36(%esp),%edx
2118	addl	%edi,%edx
2119	movl	24(%esp),%ebx
2120	movl	28(%esp),%ecx
/* Keep the low two bits of d4 as the new h4; add 5*(d4 >> 2) into h0 and ripple the carry. */
2121	movl	%edx,%edi
2122	shrl	$2,%edx
2123	andl	$3,%edi
2124	leal	(%edx,%edx,4),%edx
2125	addl	%edx,%eax
2126	adcl	$0,%ebx
2127	adcl	$0,%ecx
2128	adcl	$0,%esi
2129	adcl	$0,%edi
2130	cmpl	92(%esp),%ebp
2131	jne	.L004loop
/* Reload the state pointer (arg0) and write h0..h4 back. */
2132	movl	84(%esp),%edx
2133	addl	$64,%esp
2134	movl	%eax,(%edx)
2135	movl	%ebx,4(%edx)
2136	movl	%ecx,8(%edx)
2137	movl	%esi,12(%edx)
2138	movl	%edi,16(%edx)
2139.L003nodata:
2140	popl	%edi
2141	popl	%esi
2142	popl	%ebx
2143	popl	%ebp
2144	ret
2145.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit (non-PIC build) -- exported, i386 stack-argument convention.
 *
 *   arg0 (20(%esp)) -> %ebp  state pointer; accumulator words h0..h4 at 0..16
 *   arg1 (24(%esp)) -> %edi  16-byte output buffer
 *   arg2 (28(%esp)) -> %ebp  pointer to four 32-bit words added to the result
 *                            (presumably the nonce/"s" half of the key -- confirm)
 *
 * Computes g = h + 5 over five words.  If g reaches bit 130 the low four
 * words of g are selected, otherwise the low four words of h; the selection
 * is done with masks rather than a branch.  The selected value plus the four
 * arg2 words (carry out of the top word discarded) is stored to the output.
 */
2146.globl	poly1305_emit
2147.type	poly1305_emit,@function
2148.align	16
2149poly1305_emit:
2150.L_poly1305_emit_begin:
2151	pushl	%ebp
2152	pushl	%ebx
2153	pushl	%esi
2154	pushl	%edi
2155	movl	20(%esp),%ebp
2156.Lenter_emit:
2157	movl	24(%esp),%edi
2158	movl	(%ebp),%eax
2159	movl	4(%ebp),%ebx
2160	movl	8(%ebp),%ecx
2161	movl	12(%ebp),%edx
2162	movl	16(%ebp),%esi
/* g = h + 5, carry rippled through all five words. */
2163	addl	$5,%eax
2164	adcl	$0,%ebx
2165	adcl	$0,%ecx
2166	adcl	$0,%edx
2167	adcl	$0,%esi
/*
 * %esi = -(g4 >> 2): all-ones when g has bit 130 set, zero otherwise
 * (assumes g4 >> 2 is 0 or 1, i.e. h was kept partially reduced -- confirm).
 */
2168	shrl	$2,%esi
2169	negl	%esi
2170	andl	%esi,%eax
2171	andl	%esi,%ebx
2172	andl	%esi,%ecx
2173	andl	%esi,%edx
/* Park (g & mask) in the output buffer while the registers are reused. */
2174	movl	%eax,(%edi)
2175	movl	%ebx,4(%edi)
2176	movl	%ecx,8(%edi)
2177	movl	%edx,12(%edi)
2178	notl	%esi
2179	movl	(%ebp),%eax
2180	movl	4(%ebp),%ebx
2181	movl	8(%ebp),%ecx
2182	movl	12(%ebp),%edx
/* State pointer no longer needed: %ebp now points at the arg2 words. */
2183	movl	28(%esp),%ebp
2184	andl	%esi,%eax
2185	andl	%esi,%ebx
2186	andl	%esi,%ecx
2187	andl	%esi,%edx
/* result = (h & ~mask) | (g & mask) */
2188	orl	(%edi),%eax
2189	orl	4(%edi),%ebx
2190	orl	8(%edi),%ecx
2191	orl	12(%edi),%edx
2192	addl	(%ebp),%eax
2193	adcl	4(%ebp),%ebx
2194	adcl	8(%ebp),%ecx
2195	adcl	12(%ebp),%edx
2196	movl	%eax,(%edi)
2197	movl	%ebx,4(%edi)
2198	movl	%ecx,8(%edi)
2199	movl	%edx,12(%edi)
2200	popl	%edi
2201	popl	%esi
2202	popl	%ebx
2203	popl	%ebp
2204	ret
2205.size	poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- file-local helper (no .globl), called from
 * _poly1305_blocks_sse2 with a non-standard register interface.
 *
 * In:   %edi = base pointer; the 16 bytes at 24(%edi) are the input value v
 *              (presumably the clamped key written by poly1305_init -- confirm)
 *       %ebx = address of .Lconst_sse2; 64(%ebx) is the 26-bit mask
 * Out:  nine 16-byte rows written to 48..191(%edi); %edi is restored.
 * Uses: %ebp (keeps the caller's %esp, NOT restored), %ecx, %edx,
 *       %xmm0-%xmm7, flags; 224 bytes of 16-byte-aligned scratch stack.
 *
 * Same computation as the AVX2 init routine, written with two-operand SSE2
 * instructions: v is split into five 26-bit limbs and the .L005square loop
 * runs twice (v*v, then (v^2, v) * v^2), folding the top-limb overflow back
 * in multiplied by 5.
 */
2206.align	32
2207.type	_poly1305_init_sse2,@function
2208.align	16
2209_poly1305_init_sse2:
2210	movdqu	24(%edi),%xmm4
2211	leal	48(%edi),%edi
2212	movl	%esp,%ebp
2213	subl	$224,%esp
2214	andl	$-16,%esp
/* movq loads only the low 8 bytes of the mask; the upper half of xmm7 is zero here. */
2215	movq	64(%ebx),%xmm7
/* Split the input into limbs of 26 bits: xmm0..xmm4 = limb 0..4. */
2216	movdqa	%xmm4,%xmm0
2217	movdqa	%xmm4,%xmm1
2218	movdqa	%xmm4,%xmm2
2219	pand	%xmm7,%xmm0
2220	psrlq	$26,%xmm1
2221	psrldq	$6,%xmm2
2222	pand	%xmm7,%xmm1
2223	movdqa	%xmm2,%xmm3
2224	psrlq	$4,%xmm2
2225	psrlq	$30,%xmm3
2226	pand	%xmm7,%xmm2
2227	pand	%xmm7,%xmm3
2228	psrldq	$13,%xmm4
2229	leal	144(%esp),%edx
2230	movl	$2,%ecx
2231.L005square:
/* Save the multiplicand limbs at 0..64(%esp). */
2232	movdqa	%xmm0,(%esp)
2233	movdqa	%xmm1,16(%esp)
2234	movdqa	%xmm2,32(%esp)
2235	movdqa	%xmm3,48(%esp)
2236	movdqa	%xmm4,64(%esp)
/* 80..128(%esp) = 5 * limb1..limb4 (computed as x + (x << 2)). */
2237	movdqa	%xmm1,%xmm6
2238	movdqa	%xmm2,%xmm5
2239	pslld	$2,%xmm6
2240	pslld	$2,%xmm5
2241	paddd	%xmm1,%xmm6
2242	paddd	%xmm2,%xmm5
2243	movdqa	%xmm6,80(%esp)
2244	movdqa	%xmm5,96(%esp)
2245	movdqa	%xmm3,%xmm6
2246	movdqa	%xmm4,%xmm5
2247	pslld	$2,%xmm6
2248	pslld	$2,%xmm5
2249	paddd	%xmm3,%xmm6
2250	paddd	%xmm4,%xmm5
2251	movdqa	%xmm6,112(%esp)
2252	movdqa	%xmm5,128(%esp)
/* 0..64(%edx) = low quadword of each limb broadcast to both halves (multiplier). */
2253	pshufd	$68,%xmm0,%xmm6
2254	movdqa	%xmm1,%xmm5
2255	pshufd	$68,%xmm1,%xmm1
2256	pshufd	$68,%xmm2,%xmm2
2257	pshufd	$68,%xmm3,%xmm3
2258	pshufd	$68,%xmm4,%xmm4
2259	movdqa	%xmm6,(%edx)
2260	movdqa	%xmm1,16(%edx)
2261	movdqa	%xmm2,32(%edx)
2262	movdqa	%xmm3,48(%edx)
2263	movdqa	%xmm4,64(%edx)
/* Schoolbook 5x5 limb product; xmm0..xmm4 accumulate result limbs 0..4. */
2264	pmuludq	%xmm0,%xmm4
2265	pmuludq	%xmm0,%xmm3
2266	pmuludq	%xmm0,%xmm2
2267	pmuludq	%xmm0,%xmm1
2268	pmuludq	%xmm6,%xmm0
2269	movdqa	%xmm5,%xmm6
2270	pmuludq	48(%edx),%xmm5
2271	movdqa	%xmm6,%xmm7
2272	pmuludq	32(%edx),%xmm6
2273	paddq	%xmm5,%xmm4
2274	movdqa	%xmm7,%xmm5
2275	pmuludq	16(%edx),%xmm7
2276	paddq	%xmm6,%xmm3
2277	movdqa	80(%esp),%xmm6
2278	pmuludq	(%edx),%xmm5
2279	paddq	%xmm7,%xmm2
2280	pmuludq	64(%edx),%xmm6
2281	movdqa	32(%esp),%xmm7
2282	paddq	%xmm5,%xmm1
2283	movdqa	%xmm7,%xmm5
2284	pmuludq	32(%edx),%xmm7
2285	paddq	%xmm6,%xmm0
2286	movdqa	%xmm5,%xmm6
2287	pmuludq	16(%edx),%xmm5
2288	paddq	%xmm7,%xmm4
2289	movdqa	96(%esp),%xmm7
2290	pmuludq	(%edx),%xmm6
2291	paddq	%xmm5,%xmm3
2292	movdqa	%xmm7,%xmm5
2293	pmuludq	64(%edx),%xmm7
2294	paddq	%xmm6,%xmm2
2295	pmuludq	48(%edx),%xmm5
2296	movdqa	48(%esp),%xmm6
2297	paddq	%xmm7,%xmm1
2298	movdqa	%xmm6,%xmm7
2299	pmuludq	16(%edx),%xmm6
2300	paddq	%xmm5,%xmm0
2301	movdqa	112(%esp),%xmm5
2302	pmuludq	(%edx),%xmm7
2303	paddq	%xmm6,%xmm4
2304	movdqa	%xmm5,%xmm6
2305	pmuludq	64(%edx),%xmm5
2306	paddq	%xmm7,%xmm3
2307	movdqa	%xmm6,%xmm7
2308	pmuludq	48(%edx),%xmm6
2309	paddq	%xmm5,%xmm2
2310	pmuludq	32(%edx),%xmm7
2311	movdqa	64(%esp),%xmm5
2312	paddq	%xmm6,%xmm1
2313	movdqa	128(%esp),%xmm6
2314	pmuludq	(%edx),%xmm5
2315	paddq	%xmm7,%xmm0
2316	movdqa	%xmm6,%xmm7
2317	pmuludq	64(%edx),%xmm6
2318	paddq	%xmm5,%xmm4
2319	movdqa	%xmm7,%xmm5
2320	pmuludq	16(%edx),%xmm7
2321	paddq	%xmm6,%xmm3
2322	movdqa	%xmm5,%xmm6
2323	pmuludq	32(%edx),%xmm5
2324	paddq	%xmm7,%xmm0
2325	pmuludq	48(%edx),%xmm6
/* Full 16-byte mask from here on (both quadword lanes are live in pass 2). */
2326	movdqa	64(%ebx),%xmm7
2327	paddq	%xmm5,%xmm1
2328	paddq	%xmm6,%xmm2
/*
 * Carry propagation back to 26-bit limbs: 3->4, 0->1, 4->0 (times 5),
 * 1->2, 2->3, then one more step 0->1 and 3->4.
 */
2329	movdqa	%xmm3,%xmm5
2330	pand	%xmm7,%xmm3
2331	psrlq	$26,%xmm5
2332	paddq	%xmm4,%xmm5
2333	movdqa	%xmm0,%xmm6
2334	pand	%xmm7,%xmm0
2335	psrlq	$26,%xmm6
2336	movdqa	%xmm5,%xmm4
2337	paddq	%xmm1,%xmm6
2338	psrlq	$26,%xmm5
2339	pand	%xmm7,%xmm4
2340	movdqa	%xmm6,%xmm1
2341	psrlq	$26,%xmm6
2342	paddd	%xmm5,%xmm0
2343	psllq	$2,%xmm5
2344	paddq	%xmm2,%xmm6
2345	paddq	%xmm0,%xmm5
2346	pand	%xmm7,%xmm1
2347	movdqa	%xmm6,%xmm2
2348	psrlq	$26,%xmm6
2349	pand	%xmm7,%xmm2
2350	paddd	%xmm3,%xmm6
2351	movdqa	%xmm5,%xmm0
2352	psrlq	$26,%xmm5
2353	movdqa	%xmm6,%xmm3
2354	psrlq	$26,%xmm6
2355	pand	%xmm7,%xmm0
2356	paddd	%xmm5,%xmm1
2357	pand	%xmm7,%xmm3
2358	paddd	%xmm6,%xmm4
2359	decl	%ecx
2360	jz	.L006square_break
/* After pass 1: pair the new result (low qword) with the saved input (high qword). */
2361	punpcklqdq	(%esp),%xmm0
2362	punpcklqdq	16(%esp),%xmm1
2363	punpcklqdq	32(%esp),%xmm2
2364	punpcklqdq	48(%esp),%xmm3
2365	punpcklqdq	64(%esp),%xmm4
2366	jmp	.L005square
2367.L006square_break:
/*
 * Interleave the pass-2 results with the values saved at the start of pass 2
 * and reorder dwords (pshufd $141), so each row holds one limb of the four
 * powers as v^4 : v^3 : v^2 : v (dword 0 first).
 */
2368	psllq	$32,%xmm0
2369	psllq	$32,%xmm1
2370	psllq	$32,%xmm2
2371	psllq	$32,%xmm3
2372	psllq	$32,%xmm4
2373	por	(%esp),%xmm0
2374	por	16(%esp),%xmm1
2375	por	32(%esp),%xmm2
2376	por	48(%esp),%xmm3
2377	por	64(%esp),%xmm4
2378	pshufd	$141,%xmm0,%xmm0
2379	pshufd	$141,%xmm1,%xmm1
2380	pshufd	$141,%xmm2,%xmm2
2381	pshufd	$141,%xmm3,%xmm3
2382	pshufd	$141,%xmm4,%xmm4
/* Rows 0..4: limbs 0..4 of the powers. */
2383	movdqu	%xmm0,(%edi)
2384	movdqu	%xmm1,16(%edi)
2385	movdqu	%xmm2,32(%edi)
2386	movdqu	%xmm3,48(%edi)
2387	movdqu	%xmm4,64(%edi)
/* Rows 5..8: 5 * limbs 1..4 of the powers. */
2388	movdqa	%xmm1,%xmm6
2389	movdqa	%xmm2,%xmm5
2390	pslld	$2,%xmm6
2391	pslld	$2,%xmm5
2392	paddd	%xmm1,%xmm6
2393	paddd	%xmm2,%xmm5
2394	movdqu	%xmm6,80(%edi)
2395	movdqu	%xmm5,96(%edi)
2396	movdqa	%xmm3,%xmm6
2397	movdqa	%xmm4,%xmm5
2398	pslld	$2,%xmm6
2399	pslld	$2,%xmm5
2400	paddd	%xmm3,%xmm6
2401	paddd	%xmm4,%xmm5
2402	movdqu	%xmm6,112(%edi)
2403	movdqu	%xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 applied to %edi on entry. */
2404	movl	%ebp,%esp
2405	leal	-48(%edi),%edi
2406	ret
2407.size	_poly1305_init_sse2,.-_poly1305_init_sse2
2408.align	32
2409.type	_poly1305_blocks_sse2,@function
2410.align	16
2411_poly1305_blocks_sse2:
2412	pushl	%ebp
2413	pushl	%ebx
2414	pushl	%esi
2415	pushl	%edi
2416	movl	20(%esp),%edi
2417	movl	24(%esp),%esi
2418	movl	28(%esp),%ecx
2419	movl	20(%edi),%eax
2420	andl	$-16,%ecx
2421	jz	.L007nodata
2422	cmpl	$64,%ecx
2423	jae	.L008enter_sse2
2424	testl	%eax,%eax
2425	jz	.Lenter_blocks
2426.align	16
2427.L008enter_sse2:
2428	call	.L009pic_point
2429.L009pic_point:
2430	popl	%ebx
2431	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
2432	testl	%eax,%eax
2433	jnz	.L010base2_26
2434	call	_poly1305_init_sse2
2435	movl	(%edi),%eax
2436	movl	3(%edi),%ecx
2437	movl	6(%edi),%edx
2438	movl	9(%edi),%esi
2439	movl	13(%edi),%ebp
2440	movl	$1,20(%edi)
2441	shrl	$2,%ecx
2442	andl	$67108863,%eax
2443	shrl	$4,%edx
2444	andl	$67108863,%ecx
2445	shrl	$6,%esi
2446	andl	$67108863,%edx
2447	movd	%eax,%xmm0
2448	movd	%ecx,%xmm1
2449	movd	%edx,%xmm2
2450	movd	%esi,%xmm3
2451	movd	%ebp,%xmm4
2452	movl	24(%esp),%esi
2453	movl	28(%esp),%ecx
2454	jmp	.L011base2_32
2455.align	16
2456.L010base2_26:
2457	movd	(%edi),%xmm0
2458	movd	4(%edi),%xmm1
2459	movd	8(%edi),%xmm2
2460	movd	12(%edi),%xmm3
2461	movd	16(%edi),%xmm4
2462	movdqa	64(%ebx),%xmm7
2463.L011base2_32:
2464	movl	32(%esp),%eax
2465	movl	%esp,%ebp
2466	subl	$528,%esp
2467	andl	$-16,%esp
2468	leal	48(%edi),%edi
2469	shll	$24,%eax
2470	testl	$31,%ecx
2471	jz	.L012even
2472	movdqu	(%esi),%xmm6
2473	leal	16(%esi),%esi
2474	movdqa	%xmm6,%xmm5
2475	pand	%xmm7,%xmm6
2476	paddd	%xmm6,%xmm0
2477	movdqa	%xmm5,%xmm6
2478	psrlq	$26,%xmm5
2479	psrldq	$6,%xmm6
2480	pand	%xmm7,%xmm5
2481	paddd	%xmm5,%xmm1
2482	movdqa	%xmm6,%xmm5
2483	psrlq	$4,%xmm6
2484	pand	%xmm7,%xmm6
2485	paddd	%xmm6,%xmm2
2486	movdqa	%xmm5,%xmm6
2487	psrlq	$30,%xmm5
2488	pand	%xmm7,%xmm5
2489	psrldq	$7,%xmm6
2490	paddd	%xmm5,%xmm3
2491	movd	%eax,%xmm5
2492	paddd	%xmm6,%xmm4
2493	movd	12(%edi),%xmm6
2494	paddd	%xmm5,%xmm4
2495	movdqa	%xmm0,(%esp)
2496	movdqa	%xmm1,16(%esp)
2497	movdqa	%xmm2,32(%esp)
2498	movdqa	%xmm3,48(%esp)
2499	movdqa	%xmm4,64(%esp)
2500	pmuludq	%xmm6,%xmm0
2501	pmuludq	%xmm6,%xmm1
2502	pmuludq	%xmm6,%xmm2
2503	movd	28(%edi),%xmm5
2504	pmuludq	%xmm6,%xmm3
2505	pmuludq	%xmm6,%xmm4
2506	movdqa	%xmm5,%xmm6
2507	pmuludq	48(%esp),%xmm5
2508	movdqa	%xmm6,%xmm7
2509	pmuludq	32(%esp),%xmm6
2510	paddq	%xmm5,%xmm4
2511	movdqa	%xmm7,%xmm5
2512	pmuludq	16(%esp),%xmm7
2513	paddq	%xmm6,%xmm3
2514	movd	92(%edi),%xmm6
2515	pmuludq	(%esp),%xmm5
2516	paddq	%xmm7,%xmm2
2517	pmuludq	64(%esp),%xmm6
2518	movd	44(%edi),%xmm7
2519	paddq	%xmm5,%xmm1
2520	movdqa	%xmm7,%xmm5
2521	pmuludq	32(%esp),%xmm7
2522	paddq	%xmm6,%xmm0
2523	movdqa	%xmm5,%xmm6
2524	pmuludq	16(%esp),%xmm5
2525	paddq	%xmm7,%xmm4
2526	movd	108(%edi),%xmm7
2527	pmuludq	(%esp),%xmm6
2528	paddq	%xmm5,%xmm3
2529	movdqa	%xmm7,%xmm5
2530	pmuludq	64(%esp),%xmm7
2531	paddq	%xmm6,%xmm2
2532	pmuludq	48(%esp),%xmm5
2533	movd	60(%edi),%xmm6
2534	paddq	%xmm7,%xmm1
2535	movdqa	%xmm6,%xmm7
2536	pmuludq	16(%esp),%xmm6
2537	paddq	%xmm5,%xmm0
2538	movd	124(%edi),%xmm5
2539	pmuludq	(%esp),%xmm7
2540	paddq	%xmm6,%xmm4
2541	movdqa	%xmm5,%xmm6
2542	pmuludq	64(%esp),%xmm5
2543	paddq	%xmm7,%xmm3
2544	movdqa	%xmm6,%xmm7
2545	pmuludq	48(%esp),%xmm6
2546	paddq	%xmm5,%xmm2
2547	pmuludq	32(%esp),%xmm7
2548	movd	76(%edi),%xmm5
2549	paddq	%xmm6,%xmm1
2550	movd	140(%edi),%xmm6
2551	pmuludq	(%esp),%xmm5
2552	paddq	%xmm7,%xmm0
2553	movdqa	%xmm6,%xmm7
2554	pmuludq	64(%esp),%xmm6
2555	paddq	%xmm5,%xmm4
2556	movdqa	%xmm7,%xmm5
2557	pmuludq	16(%esp),%xmm7
2558	paddq	%xmm6,%xmm3
2559	movdqa	%xmm5,%xmm6
2560	pmuludq	32(%esp),%xmm5
2561	paddq	%xmm7,%xmm0
2562	pmuludq	48(%esp),%xmm6
2563	movdqa	64(%ebx),%xmm7
2564	paddq	%xmm5,%xmm1
2565	paddq	%xmm6,%xmm2
2566	movdqa	%xmm3,%xmm5
2567	pand	%xmm7,%xmm3
2568	psrlq	$26,%xmm5
2569	paddq	%xmm4,%xmm5
2570	movdqa	%xmm0,%xmm6
2571	pand	%xmm7,%xmm0
2572	psrlq	$26,%xmm6
2573	movdqa	%xmm5,%xmm4
2574	paddq	%xmm1,%xmm6
2575	psrlq	$26,%xmm5
2576	pand	%xmm7,%xmm4
2577	movdqa	%xmm6,%xmm1
2578	psrlq	$26,%xmm6
2579	paddd	%xmm5,%xmm0
2580	psllq	$2,%xmm5
2581	paddq	%xmm2,%xmm6
2582	paddq	%xmm0,%xmm5
2583	pand	%xmm7,%xmm1
2584	movdqa	%xmm6,%xmm2
2585	psrlq	$26,%xmm6
2586	pand	%xmm7,%xmm2
2587	paddd	%xmm3,%xmm6
2588	movdqa	%xmm5,%xmm0
2589	psrlq	$26,%xmm5
2590	movdqa	%xmm6,%xmm3
2591	psrlq	$26,%xmm6
2592	pand	%xmm7,%xmm0
2593	paddd	%xmm5,%xmm1
2594	pand	%xmm7,%xmm3
2595	paddd	%xmm6,%xmm4
2596	subl	$16,%ecx
2597	jz	.L013done
2598.L012even:
2599	leal	384(%esp),%edx
2600	leal	-32(%esi),%eax
2601	subl	$64,%ecx
2602	movdqu	(%edi),%xmm5
2603	pshufd	$68,%xmm5,%xmm6
2604	cmovbl	%eax,%esi
2605	pshufd	$238,%xmm5,%xmm5
2606	movdqa	%xmm6,(%edx)
2607	leal	160(%esp),%eax
2608	movdqu	16(%edi),%xmm6
2609	movdqa	%xmm5,-144(%edx)
2610	pshufd	$68,%xmm6,%xmm5
2611	pshufd	$238,%xmm6,%xmm6
2612	movdqa	%xmm5,16(%edx)
2613	movdqu	32(%edi),%xmm5
2614	movdqa	%xmm6,-128(%edx)
2615	pshufd	$68,%xmm5,%xmm6
2616	pshufd	$238,%xmm5,%xmm5
2617	movdqa	%xmm6,32(%edx)
2618	movdqu	48(%edi),%xmm6
2619	movdqa	%xmm5,-112(%edx)
2620	pshufd	$68,%xmm6,%xmm5
2621	pshufd	$238,%xmm6,%xmm6
2622	movdqa	%xmm5,48(%edx)
2623	movdqu	64(%edi),%xmm5
2624	movdqa	%xmm6,-96(%edx)
2625	pshufd	$68,%xmm5,%xmm6
2626	pshufd	$238,%xmm5,%xmm5
2627	movdqa	%xmm6,64(%edx)
2628	movdqu	80(%edi),%xmm6
2629	movdqa	%xmm5,-80(%edx)
2630	pshufd	$68,%xmm6,%xmm5
2631	pshufd	$238,%xmm6,%xmm6
2632	movdqa	%xmm5,80(%edx)
2633	movdqu	96(%edi),%xmm5
2634	movdqa	%xmm6,-64(%edx)
2635	pshufd	$68,%xmm5,%xmm6
2636	pshufd	$238,%xmm5,%xmm5
2637	movdqa	%xmm6,96(%edx)
2638	movdqu	112(%edi),%xmm6
2639	movdqa	%xmm5,-48(%edx)
2640	pshufd	$68,%xmm6,%xmm5
2641	pshufd	$238,%xmm6,%xmm6
2642	movdqa	%xmm5,112(%edx)
2643	movdqu	128(%edi),%xmm5
2644	movdqa	%xmm6,-32(%edx)
2645	pshufd	$68,%xmm5,%xmm6
2646	pshufd	$238,%xmm5,%xmm5
2647	movdqa	%xmm6,128(%edx)
2648	movdqa	%xmm5,-16(%edx)
2649	movdqu	32(%esi),%xmm5
2650	movdqu	48(%esi),%xmm6
2651	leal	32(%esi),%esi
2652	movdqa	%xmm2,112(%esp)
2653	movdqa	%xmm3,128(%esp)
2654	movdqa	%xmm4,144(%esp)
2655	movdqa	%xmm5,%xmm2
2656	movdqa	%xmm6,%xmm3
2657	psrldq	$6,%xmm2
2658	psrldq	$6,%xmm3
2659	movdqa	%xmm5,%xmm4
2660	punpcklqdq	%xmm3,%xmm2
2661	punpckhqdq	%xmm6,%xmm4
2662	punpcklqdq	%xmm6,%xmm5
2663	movdqa	%xmm2,%xmm3
2664	psrlq	$4,%xmm2
2665	psrlq	$30,%xmm3
2666	movdqa	%xmm5,%xmm6
2667	psrlq	$40,%xmm4
2668	psrlq	$26,%xmm6
2669	pand	%xmm7,%xmm5
2670	pand	%xmm7,%xmm6
2671	pand	%xmm7,%xmm2
2672	pand	%xmm7,%xmm3
2673	por	(%ebx),%xmm4
2674	movdqa	%xmm0,80(%esp)
2675	movdqa	%xmm1,96(%esp)
2676	jbe	.L014skip_loop
2677	jmp	.L015loop
2678.align	32
2679.L015loop:
2680	movdqa	-144(%edx),%xmm7
2681	movdqa	%xmm6,16(%eax)
2682	movdqa	%xmm2,32(%eax)
2683	movdqa	%xmm3,48(%eax)
2684	movdqa	%xmm4,64(%eax)
2685	movdqa	%xmm5,%xmm1
2686	pmuludq	%xmm7,%xmm5
2687	movdqa	%xmm6,%xmm0
2688	pmuludq	%xmm7,%xmm6
2689	pmuludq	%xmm7,%xmm2
2690	pmuludq	%xmm7,%xmm3
2691	pmuludq	%xmm7,%xmm4
2692	pmuludq	-16(%edx),%xmm0
2693	movdqa	%xmm1,%xmm7
2694	pmuludq	-128(%edx),%xmm1
2695	paddq	%xmm5,%xmm0
2696	movdqa	%xmm7,%xmm5
2697	pmuludq	-112(%edx),%xmm7
2698	paddq	%xmm6,%xmm1
2699	movdqa	%xmm5,%xmm6
2700	pmuludq	-96(%edx),%xmm5
2701	paddq	%xmm7,%xmm2
2702	movdqa	16(%eax),%xmm7
2703	pmuludq	-80(%edx),%xmm6
2704	paddq	%xmm5,%xmm3
2705	movdqa	%xmm7,%xmm5
2706	pmuludq	-128(%edx),%xmm7
2707	paddq	%xmm6,%xmm4
2708	movdqa	%xmm5,%xmm6
2709	pmuludq	-112(%edx),%xmm5
2710	paddq	%xmm7,%xmm2
2711	movdqa	32(%eax),%xmm7
2712	pmuludq	-96(%edx),%xmm6
2713	paddq	%xmm5,%xmm3
2714	movdqa	%xmm7,%xmm5
2715	pmuludq	-32(%edx),%xmm7
2716	paddq	%xmm6,%xmm4
2717	movdqa	%xmm5,%xmm6
2718	pmuludq	-16(%edx),%xmm5
2719	paddq	%xmm7,%xmm0
2720	movdqa	%xmm6,%xmm7
2721	pmuludq	-128(%edx),%xmm6
2722	paddq	%xmm5,%xmm1
2723	movdqa	48(%eax),%xmm5
2724	pmuludq	-112(%edx),%xmm7
2725	paddq	%xmm6,%xmm3
2726	movdqa	%xmm5,%xmm6
2727	pmuludq	-48(%edx),%xmm5
2728	paddq	%xmm7,%xmm4
2729	movdqa	%xmm6,%xmm7
2730	pmuludq	-32(%edx),%xmm6
2731	paddq	%xmm5,%xmm0
2732	movdqa	%xmm7,%xmm5
2733	pmuludq	-16(%edx),%xmm7
2734	paddq	%xmm6,%xmm1
2735	movdqa	64(%eax),%xmm6
2736	pmuludq	-128(%edx),%xmm5
2737	paddq	%xmm7,%xmm2
2738	movdqa	%xmm6,%xmm7
2739	pmuludq	-16(%edx),%xmm6
2740	paddq	%xmm5,%xmm4
2741	movdqa	%xmm7,%xmm5
2742	pmuludq	-64(%edx),%xmm7
2743	paddq	%xmm6,%xmm3
2744	movdqa	%xmm5,%xmm6
2745	pmuludq	-48(%edx),%xmm5
2746	paddq	%xmm7,%xmm0
2747	movdqa	64(%ebx),%xmm7
2748	pmuludq	-32(%edx),%xmm6
2749	paddq	%xmm5,%xmm1
2750	paddq	%xmm6,%xmm2
2751	movdqu	-32(%esi),%xmm5
2752	movdqu	-16(%esi),%xmm6
2753	leal	32(%esi),%esi
2754	movdqa	%xmm2,32(%esp)
2755	movdqa	%xmm3,48(%esp)
2756	movdqa	%xmm4,64(%esp)
2757	movdqa	%xmm5,%xmm2
2758	movdqa	%xmm6,%xmm3
2759	psrldq	$6,%xmm2
2760	psrldq	$6,%xmm3
2761	movdqa	%xmm5,%xmm4
2762	punpcklqdq	%xmm3,%xmm2
2763	punpckhqdq	%xmm6,%xmm4
2764	punpcklqdq	%xmm6,%xmm5
2765	movdqa	%xmm2,%xmm3
2766	psrlq	$4,%xmm2
2767	psrlq	$30,%xmm3
2768	movdqa	%xmm5,%xmm6
2769	psrlq	$40,%xmm4
2770	psrlq	$26,%xmm6
2771	pand	%xmm7,%xmm5
2772	pand	%xmm7,%xmm6
2773	pand	%xmm7,%xmm2
2774	pand	%xmm7,%xmm3
2775	por	(%ebx),%xmm4
2776	leal	-32(%esi),%eax
2777	subl	$64,%ecx
2778	paddd	80(%esp),%xmm5
2779	paddd	96(%esp),%xmm6
2780	paddd	112(%esp),%xmm2
2781	paddd	128(%esp),%xmm3
2782	paddd	144(%esp),%xmm4
2783	cmovbl	%eax,%esi
2784	leal	160(%esp),%eax
2785	movdqa	(%edx),%xmm7
2786	movdqa	%xmm1,16(%esp)
2787	movdqa	%xmm6,16(%eax)
2788	movdqa	%xmm2,32(%eax)
2789	movdqa	%xmm3,48(%eax)
2790	movdqa	%xmm4,64(%eax)
2791	movdqa	%xmm5,%xmm1
2792	pmuludq	%xmm7,%xmm5
2793	paddq	%xmm0,%xmm5
2794	movdqa	%xmm6,%xmm0
2795	pmuludq	%xmm7,%xmm6
2796	pmuludq	%xmm7,%xmm2
2797	pmuludq	%xmm7,%xmm3
2798	pmuludq	%xmm7,%xmm4
2799	paddq	16(%esp),%xmm6
2800	paddq	32(%esp),%xmm2
2801	paddq	48(%esp),%xmm3
2802	paddq	64(%esp),%xmm4
2803	pmuludq	128(%edx),%xmm0
2804	movdqa	%xmm1,%xmm7
2805	pmuludq	16(%edx),%xmm1
2806	paddq	%xmm5,%xmm0
2807	movdqa	%xmm7,%xmm5
2808	pmuludq	32(%edx),%xmm7
2809	paddq	%xmm6,%xmm1
2810	movdqa	%xmm5,%xmm6
2811	pmuludq	48(%edx),%xmm5
2812	paddq	%xmm7,%xmm2
2813	movdqa	16(%eax),%xmm7
2814	pmuludq	64(%edx),%xmm6
2815	paddq	%xmm5,%xmm3
2816	movdqa	%xmm7,%xmm5
2817	pmuludq	16(%edx),%xmm7
2818	paddq	%xmm6,%xmm4
2819	movdqa	%xmm5,%xmm6
2820	pmuludq	32(%edx),%xmm5
2821	paddq	%xmm7,%xmm2
2822	movdqa	32(%eax),%xmm7
2823	pmuludq	48(%edx),%xmm6
2824	paddq	%xmm5,%xmm3
2825	movdqa	%xmm7,%xmm5
2826	pmuludq	112(%edx),%xmm7
2827	paddq	%xmm6,%xmm4
2828	movdqa	%xmm5,%xmm6
2829	pmuludq	128(%edx),%xmm5
2830	paddq	%xmm7,%xmm0
2831	movdqa	%xmm6,%xmm7
2832	pmuludq	16(%edx),%xmm6
2833	paddq	%xmm5,%xmm1
2834	movdqa	48(%eax),%xmm5
2835	pmuludq	32(%edx),%xmm7
2836	paddq	%xmm6,%xmm3
2837	movdqa	%xmm5,%xmm6
2838	pmuludq	96(%edx),%xmm5
2839	paddq	%xmm7,%xmm4
2840	movdqa	%xmm6,%xmm7
2841	pmuludq	112(%edx),%xmm6
2842	paddq	%xmm5,%xmm0
2843	movdqa	%xmm7,%xmm5
2844	pmuludq	128(%edx),%xmm7
2845	paddq	%xmm6,%xmm1
2846	movdqa	64(%eax),%xmm6
2847	pmuludq	16(%edx),%xmm5
2848	paddq	%xmm7,%xmm2
2849	movdqa	%xmm6,%xmm7
2850	pmuludq	128(%edx),%xmm6
2851	paddq	%xmm5,%xmm4
2852	movdqa	%xmm7,%xmm5
2853	pmuludq	80(%edx),%xmm7
2854	paddq	%xmm6,%xmm3
2855	movdqa	%xmm5,%xmm6
2856	pmuludq	96(%edx),%xmm5
2857	paddq	%xmm7,%xmm0
2858	movdqa	64(%ebx),%xmm7
2859	pmuludq	112(%edx),%xmm6
2860	paddq	%xmm5,%xmm1
2861	paddq	%xmm6,%xmm2
2862	movdqa	%xmm3,%xmm5
2863	pand	%xmm7,%xmm3
2864	psrlq	$26,%xmm5
2865	paddq	%xmm4,%xmm5
2866	movdqa	%xmm0,%xmm6
2867	pand	%xmm7,%xmm0
2868	psrlq	$26,%xmm6
2869	movdqa	%xmm5,%xmm4
2870	paddq	%xmm1,%xmm6
2871	psrlq	$26,%xmm5
2872	pand	%xmm7,%xmm4
2873	movdqa	%xmm6,%xmm1
2874	psrlq	$26,%xmm6
2875	paddd	%xmm5,%xmm0
2876	psllq	$2,%xmm5
2877	paddq	%xmm2,%xmm6
2878	paddq	%xmm0,%xmm5
2879	pand	%xmm7,%xmm1
2880	movdqa	%xmm6,%xmm2
2881	psrlq	$26,%xmm6
2882	pand	%xmm7,%xmm2
2883	paddd	%xmm3,%xmm6
2884	movdqa	%xmm5,%xmm0
2885	psrlq	$26,%xmm5
2886	movdqa	%xmm6,%xmm3
2887	psrlq	$26,%xmm6
2888	pand	%xmm7,%xmm0
2889	paddd	%xmm5,%xmm1
2890	pand	%xmm7,%xmm3
2891	paddd	%xmm6,%xmm4
2892	movdqu	32(%esi),%xmm5
2893	movdqu	48(%esi),%xmm6
2894	leal	32(%esi),%esi
2895	movdqa	%xmm2,112(%esp)
2896	movdqa	%xmm3,128(%esp)
2897	movdqa	%xmm4,144(%esp)
2898	movdqa	%xmm5,%xmm2
2899	movdqa	%xmm6,%xmm3
2900	psrldq	$6,%xmm2
2901	psrldq	$6,%xmm3
2902	movdqa	%xmm5,%xmm4
2903	punpcklqdq	%xmm3,%xmm2
2904	punpckhqdq	%xmm6,%xmm4
2905	punpcklqdq	%xmm6,%xmm5
2906	movdqa	%xmm2,%xmm3
2907	psrlq	$4,%xmm2
2908	psrlq	$30,%xmm3
2909	movdqa	%xmm5,%xmm6
2910	psrlq	$40,%xmm4
2911	psrlq	$26,%xmm6
2912	pand	%xmm7,%xmm5
2913	pand	%xmm7,%xmm6
2914	pand	%xmm7,%xmm2
2915	pand	%xmm7,%xmm3
2916	por	(%ebx),%xmm4
2917	movdqa	%xmm0,80(%esp)
2918	movdqa	%xmm1,96(%esp)
2919	ja	.L015loop
2920.L014skip_loop:
2921	pshufd	$16,-144(%edx),%xmm7
2922	addl	$32,%ecx
2923	jnz	.L016long_tail
2924	paddd	%xmm0,%xmm5
2925	paddd	%xmm1,%xmm6
2926	paddd	112(%esp),%xmm2
2927	paddd	128(%esp),%xmm3
2928	paddd	144(%esp),%xmm4
2929.L016long_tail:
2930	movdqa	%xmm5,(%eax)
2931	movdqa	%xmm6,16(%eax)
2932	movdqa	%xmm2,32(%eax)
2933	movdqa	%xmm3,48(%eax)
2934	movdqa	%xmm4,64(%eax)
2935	pmuludq	%xmm7,%xmm5
2936	pmuludq	%xmm7,%xmm6
2937	pmuludq	%xmm7,%xmm2
2938	movdqa	%xmm5,%xmm0
2939	pshufd	$16,-128(%edx),%xmm5
2940	pmuludq	%xmm7,%xmm3
2941	movdqa	%xmm6,%xmm1
2942	pmuludq	%xmm7,%xmm4
2943	movdqa	%xmm5,%xmm6
2944	pmuludq	48(%eax),%xmm5
2945	movdqa	%xmm6,%xmm7
2946	pmuludq	32(%eax),%xmm6
2947	paddq	%xmm5,%xmm4
2948	movdqa	%xmm7,%xmm5
2949	pmuludq	16(%eax),%xmm7
2950	paddq	%xmm6,%xmm3
2951	pshufd	$16,-64(%edx),%xmm6
2952	pmuludq	(%eax),%xmm5
2953	paddq	%xmm7,%xmm2
2954	pmuludq	64(%eax),%xmm6
2955	pshufd	$16,-112(%edx),%xmm7
2956	paddq	%xmm5,%xmm1
2957	movdqa	%xmm7,%xmm5
2958	pmuludq	32(%eax),%xmm7
2959	paddq	%xmm6,%xmm0
2960	movdqa	%xmm5,%xmm6
2961	pmuludq	16(%eax),%xmm5
2962	paddq	%xmm7,%xmm4
2963	pshufd	$16,-48(%edx),%xmm7
2964	pmuludq	(%eax),%xmm6
2965	paddq	%xmm5,%xmm3
2966	movdqa	%xmm7,%xmm5
2967	pmuludq	64(%eax),%xmm7
2968	paddq	%xmm6,%xmm2
2969	pmuludq	48(%eax),%xmm5
2970	pshufd	$16,-96(%edx),%xmm6
2971	paddq	%xmm7,%xmm1
2972	movdqa	%xmm6,%xmm7
2973	pmuludq	16(%eax),%xmm6
2974	paddq	%xmm5,%xmm0
2975	pshufd	$16,-32(%edx),%xmm5
2976	pmuludq	(%eax),%xmm7
2977	paddq	%xmm6,%xmm4
2978	movdqa	%xmm5,%xmm6
2979	pmuludq	64(%eax),%xmm5
2980	paddq	%xmm7,%xmm3
2981	movdqa	%xmm6,%xmm7
2982	pmuludq	48(%eax),%xmm6
2983	paddq	%xmm5,%xmm2
2984	pmuludq	32(%eax),%xmm7
2985	pshufd	$16,-80(%edx),%xmm5
2986	paddq	%xmm6,%xmm1
2987	pshufd	$16,-16(%edx),%xmm6
2988	pmuludq	(%eax),%xmm5
2989	paddq	%xmm7,%xmm0
2990	movdqa	%xmm6,%xmm7
2991	pmuludq	64(%eax),%xmm6
2992	paddq	%xmm5,%xmm4
2993	movdqa	%xmm7,%xmm5
2994	pmuludq	16(%eax),%xmm7
2995	paddq	%xmm6,%xmm3
2996	movdqa	%xmm5,%xmm6
2997	pmuludq	32(%eax),%xmm5
2998	paddq	%xmm7,%xmm0
2999	pmuludq	48(%eax),%xmm6
3000	movdqa	64(%ebx),%xmm7
3001	paddq	%xmm5,%xmm1
3002	paddq	%xmm6,%xmm2
3003	jz	.L017short_tail
3004	movdqu	-32(%esi),%xmm5
3005	movdqu	-16(%esi),%xmm6
3006	leal	32(%esi),%esi
3007	movdqa	%xmm2,32(%esp)
3008	movdqa	%xmm3,48(%esp)
3009	movdqa	%xmm4,64(%esp)
3010	movdqa	%xmm5,%xmm2
3011	movdqa	%xmm6,%xmm3
3012	psrldq	$6,%xmm2
3013	psrldq	$6,%xmm3
3014	movdqa	%xmm5,%xmm4
3015	punpcklqdq	%xmm3,%xmm2
3016	punpckhqdq	%xmm6,%xmm4
3017	punpcklqdq	%xmm6,%xmm5
3018	movdqa	%xmm2,%xmm3
3019	psrlq	$4,%xmm2
3020	psrlq	$30,%xmm3
3021	movdqa	%xmm5,%xmm6
3022	psrlq	$40,%xmm4
3023	psrlq	$26,%xmm6
3024	pand	%xmm7,%xmm5
3025	pand	%xmm7,%xmm6
3026	pand	%xmm7,%xmm2
3027	pand	%xmm7,%xmm3
3028	por	(%ebx),%xmm4
3029	pshufd	$16,(%edx),%xmm7
3030	paddd	80(%esp),%xmm5
3031	paddd	96(%esp),%xmm6
3032	paddd	112(%esp),%xmm2
3033	paddd	128(%esp),%xmm3
3034	paddd	144(%esp),%xmm4
3035	movdqa	%xmm5,(%esp)
3036	pmuludq	%xmm7,%xmm5
3037	movdqa	%xmm6,16(%esp)
3038	pmuludq	%xmm7,%xmm6
3039	paddq	%xmm5,%xmm0
3040	movdqa	%xmm2,%xmm5
3041	pmuludq	%xmm7,%xmm2
3042	paddq	%xmm6,%xmm1
3043	movdqa	%xmm3,%xmm6
3044	pmuludq	%xmm7,%xmm3
3045	paddq	32(%esp),%xmm2
3046	movdqa	%xmm5,32(%esp)
3047	pshufd	$16,16(%edx),%xmm5
3048	paddq	48(%esp),%xmm3
3049	movdqa	%xmm6,48(%esp)
3050	movdqa	%xmm4,%xmm6
3051	pmuludq	%xmm7,%xmm4
3052	paddq	64(%esp),%xmm4
3053	movdqa	%xmm6,64(%esp)
3054	movdqa	%xmm5,%xmm6
3055	pmuludq	48(%esp),%xmm5
3056	movdqa	%xmm6,%xmm7
3057	pmuludq	32(%esp),%xmm6
3058	paddq	%xmm5,%xmm4
3059	movdqa	%xmm7,%xmm5
3060	pmuludq	16(%esp),%xmm7
3061	paddq	%xmm6,%xmm3
3062	pshufd	$16,80(%edx),%xmm6
3063	pmuludq	(%esp),%xmm5
3064	paddq	%xmm7,%xmm2
3065	pmuludq	64(%esp),%xmm6
3066	pshufd	$16,32(%edx),%xmm7
3067	paddq	%xmm5,%xmm1
3068	movdqa	%xmm7,%xmm5
3069	pmuludq	32(%esp),%xmm7
3070	paddq	%xmm6,%xmm0
3071	movdqa	%xmm5,%xmm6
3072	pmuludq	16(%esp),%xmm5
3073	paddq	%xmm7,%xmm4
3074	pshufd	$16,96(%edx),%xmm7
3075	pmuludq	(%esp),%xmm6
3076	paddq	%xmm5,%xmm3
3077	movdqa	%xmm7,%xmm5
3078	pmuludq	64(%esp),%xmm7
3079	paddq	%xmm6,%xmm2
3080	pmuludq	48(%esp),%xmm5
3081	pshufd	$16,48(%edx),%xmm6
3082	paddq	%xmm7,%xmm1
3083	movdqa	%xmm6,%xmm7
3084	pmuludq	16(%esp),%xmm6
3085	paddq	%xmm5,%xmm0
3086	pshufd	$16,112(%edx),%xmm5
3087	pmuludq	(%esp),%xmm7
3088	paddq	%xmm6,%xmm4
3089	movdqa	%xmm5,%xmm6
3090	pmuludq	64(%esp),%xmm5
3091	paddq	%xmm7,%xmm3
3092	movdqa	%xmm6,%xmm7
3093	pmuludq	48(%esp),%xmm6
3094	paddq	%xmm5,%xmm2
3095	pmuludq	32(%esp),%xmm7
3096	pshufd	$16,64(%edx),%xmm5
3097	paddq	%xmm6,%xmm1
3098	pshufd	$16,128(%edx),%xmm6
3099	pmuludq	(%esp),%xmm5
3100	paddq	%xmm7,%xmm0
3101	movdqa	%xmm6,%xmm7
3102	pmuludq	64(%esp),%xmm6
3103	paddq	%xmm5,%xmm4
3104	movdqa	%xmm7,%xmm5
3105	pmuludq	16(%esp),%xmm7
3106	paddq	%xmm6,%xmm3
3107	movdqa	%xmm5,%xmm6
3108	pmuludq	32(%esp),%xmm5
3109	paddq	%xmm7,%xmm0
3110	pmuludq	48(%esp),%xmm6
3111	movdqa	64(%ebx),%xmm7
3112	paddq	%xmm5,%xmm1
3113	paddq	%xmm6,%xmm2
3114.L017short_tail:
3115	pshufd	$78,%xmm4,%xmm6
3116	pshufd	$78,%xmm3,%xmm5
3117	paddq	%xmm6,%xmm4
3118	paddq	%xmm5,%xmm3
3119	pshufd	$78,%xmm0,%xmm6
3120	pshufd	$78,%xmm1,%xmm5
3121	paddq	%xmm6,%xmm0
3122	paddq	%xmm5,%xmm1
3123	pshufd	$78,%xmm2,%xmm6
3124	movdqa	%xmm3,%xmm5
3125	pand	%xmm7,%xmm3
3126	psrlq	$26,%xmm5
3127	paddq	%xmm6,%xmm2
3128	paddq	%xmm4,%xmm5
3129	movdqa	%xmm0,%xmm6
3130	pand	%xmm7,%xmm0
3131	psrlq	$26,%xmm6
3132	movdqa	%xmm5,%xmm4
3133	paddq	%xmm1,%xmm6
3134	psrlq	$26,%xmm5
3135	pand	%xmm7,%xmm4
3136	movdqa	%xmm6,%xmm1
3137	psrlq	$26,%xmm6
3138	paddd	%xmm5,%xmm0
3139	psllq	$2,%xmm5
3140	paddq	%xmm2,%xmm6
3141	paddq	%xmm0,%xmm5
3142	pand	%xmm7,%xmm1
3143	movdqa	%xmm6,%xmm2
3144	psrlq	$26,%xmm6
3145	pand	%xmm7,%xmm2
3146	paddd	%xmm3,%xmm6
3147	movdqa	%xmm5,%xmm0
3148	psrlq	$26,%xmm5
3149	movdqa	%xmm6,%xmm3
3150	psrlq	$26,%xmm6
3151	pand	%xmm7,%xmm0
3152	paddd	%xmm5,%xmm1
3153	pand	%xmm7,%xmm3
3154	paddd	%xmm6,%xmm4
3155.L013done:
3156	movd	%xmm0,-48(%edi)
3157	movd	%xmm1,-44(%edi)
3158	movd	%xmm2,-40(%edi)
3159	movd	%xmm3,-36(%edi)
3160	movd	%xmm4,-32(%edi)
3161	movl	%ebp,%esp
3162.L007nodata:
3163	popl	%edi
3164	popl	%esi
3165	popl	%ebx
3166	popl	%ebp
3167	ret
3168.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2 -- write the final 16-byte result for the vector code
 * paths.
 *
 * Stack arguments (offsets are valid after the four pushes below):
 *   20(%esp)  pointer to the hash state
 *   24(%esp)  pointer to the 16-byte output buffer
 *   28(%esp)  pointer to four 32-bit words that are added to the result
 *
 * If the word at state+20 is zero, control transfers to .Lenter_emit
 * (defined outside this part of the file) with the four saved registers
 * still on the stack and %ebp holding the state pointer.  Otherwise the
 * five words at state+0..16 are treated as 26-bit limbs: they are packed
 * into 4x32 bits plus a top word, everything above bit 129 is folded back
 * in multiplied by 5, the value is conditionally replaced by value+5 when
 * that addition carries past bit 129, and finally the four words at
 * 28(%esp) are added with carry and the low 128 bits are stored.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi.  Overwrites %eax, %ecx, %edx,
 * %xmm0-%xmm3 and the flags.
 */
3169.align	32
3170.type	_poly1305_emit_sse2,@function
3171.align	16
3172_poly1305_emit_sse2:
3173	pushl	%ebp
3174	pushl	%ebx
3175	pushl	%esi
3176	pushl	%edi
/* ebp = state pointer. */
3177	movl	20(%esp),%ebp
/* The word at state+20 is the flag that _poly1305_blocks_avx2 sets to 1 once
   the accumulator is kept as 26-bit limbs; zero selects the other emit path. */
3178	cmpl	$0,20(%ebp)
3179	je	.Lenter_emit
/* Load the five limbs: eax, edi, ecx, edx, esi = limb 0..4. */
3180	movl	(%ebp),%eax
3181	movl	4(%ebp),%edi
3182	movl	8(%ebp),%ecx
3183	movl	12(%ebp),%edx
3184	movl	16(%ebp),%esi
/* Pack the limbs into 32-bit words eax:ebx:ecx:edx plus top word esi.  Each
   limb is split with a shift pair (26/6, 20/12, 14/18, 8/24); the carry of
   every addl is picked up by the following adcl, the intervening movl does
   not touch the flags. */
3185	movl	%edi,%ebx
3186	shll	$26,%edi
3187	shrl	$6,%ebx
3188	addl	%edi,%eax
3189	movl	%ecx,%edi
3190	adcl	$0,%ebx
3191	shll	$20,%edi
3192	shrl	$12,%ecx
3193	addl	%edi,%ebx
3194	movl	%edx,%edi
3195	adcl	$0,%ecx
3196	shll	$14,%edi
3197	shrl	$18,%edx
3198	addl	%edi,%ecx
3199	movl	%esi,%edi
3200	adcl	$0,%edx
3201	shll	$8,%edi
3202	shrl	$24,%esi
3203	addl	%edi,%edx
3204	adcl	$0,%esi
/* Keep the low two bits of the top word in esi; the bits above them are
   multiplied by 5 (edi + 4*edi) and added back into the low word. */
3205	movl	%esi,%edi
3206	andl	$3,%esi
3207	shrl	$2,%edi
3208	leal	(%edi,%edi,4),%ebp
/* edi = output pointer, ebp = pointer to the four words added at the end;
   both loads sit inside the carry chain because movl preserves the flags. */
3209	movl	24(%esp),%edi
3210	addl	%ebp,%eax
3211	movl	28(%esp),%ebp
3212	adcl	$0,%ebx
3213	adcl	$0,%ecx
3214	adcl	$0,%edx
3215	adcl	$0,%esi
/* Stash the current value in xmm0..xmm3 while computing value+5 in the
   integer registers. */
3216	movd	%eax,%xmm0
3217	addl	$5,%eax
3218	movd	%ebx,%xmm1
3219	adcl	$0,%ebx
3220	movd	%ecx,%xmm2
3221	adcl	$0,%ecx
3222	movd	%edx,%xmm3
3223	adcl	$0,%edx
3224	adcl	$0,%esi
/* esi >>= 2 leaves whatever carried past bit 129 (expected to be 0 or 1);
   negating it gives the select mask: all ones keeps value+5, zero drops it. */
3225	shrl	$2,%esi
3226	negl	%esi
3227	andl	%esi,%eax
3228	andl	%esi,%ebx
3229	andl	%esi,%ecx
3230	andl	%esi,%edx
/* The output buffer doubles as scratch for the masked value+5 words while
   the stashed value is brought back from xmm0..xmm3. */
3231	movl	%eax,(%edi)
3232	movd	%xmm0,%eax
3233	movl	%ebx,4(%edi)
3234	movd	%xmm1,%ebx
3235	movl	%ecx,8(%edi)
3236	movd	%xmm2,%ecx
3237	movl	%edx,12(%edi)
3238	movd	%xmm3,%edx
/* Invert the mask, apply it to the stashed value and merge the two halves,
   giving a branch-free select between value and value+5. */
3239	notl	%esi
3240	andl	%esi,%eax
3241	andl	%esi,%ebx
3242	orl	(%edi),%eax
3243	andl	%esi,%ecx
3244	orl	4(%edi),%ebx
3245	andl	%esi,%edx
3246	orl	8(%edi),%ecx
3247	orl	12(%edi),%edx
/* Add the four words at (%ebp) with carry and store the 128-bit result; the
   carry out of the top word is discarded. */
3248	addl	(%ebp),%eax
3249	adcl	4(%ebp),%ebx
3250	movl	%eax,(%edi)
3251	adcl	8(%ebp),%ecx
3252	movl	%ebx,4(%edi)
3253	adcl	12(%ebp),%edx
3254	movl	%ecx,8(%edi)
3255	movl	%edx,12(%edi)
3256	popl	%edi
3257	popl	%esi
3258	popl	%ebx
3259	popl	%ebp
3260	ret
3261.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 -- build the table of key powers used by
 * _poly1305_blocks_avx2.
 *
 * Register interface (this is a file-local helper, not a cdecl function):
 *   in   %edi  pointer to the hash state
 *   in   %ebx  address of .Lconst_sse2 (64(%ebx) is the 26-bit limb mask)
 *   out  %edi  unchanged on return (advanced by 48 internally, restored)
 *   clobbers %ebp (used to save %esp), %ecx, %edx, %xmm0-%xmm7, flags
 *
 * The 16 bytes at state+24 (where poly1305_init stores the masked key words)
 * are split into five 26-bit limbs.  The multiply-and-reduce body below runs
 * twice (%ecx = 2): the first pass squares the key, the second multiplies
 * the pair {key^2, key} by key^2.  The results are interleaved so that each
 * 16-byte table entry holds one limb of key^4, key^3, key^2, key^1 in that
 * dword order.  Entries are written at state+48 .. state+112 (limbs 0..4)
 * and state+128 .. state+176 (limbs 1..4 multiplied by 5).
 *
 * Uses 224 bytes of 16-byte-aligned stack scratch:
 *   0..64(%esp)     current limbs 0..4
 *   80..128(%esp)   5 * limbs 1..4
 *   0..64(%edx)     limbs 0..4 with the low quadword copied into both halves
 */
3262.align	32
3263.type	_poly1305_init_avx2,@function
3264.align	16
3265_poly1305_init_avx2:
/* Load the 128-bit key and point edi at the table area (state+48). */
3266	vmovdqu	24(%edi),%xmm4
3267	leal	48(%edi),%edi
/* Carve out the aligned scratch frame; ebp remembers the original esp. */
3268	movl	%esp,%ebp
3269	subl	$224,%esp
3270	andl	$-16,%esp
/* Split the key into 26-bit limbs xmm0..xmm4 (bit offsets 0, 26, 52, 78,
   104); xmm7 is the limb mask. */
3271	vmovdqa	64(%ebx),%xmm7
3272	vpand	%xmm7,%xmm4,%xmm0
3273	vpsrlq	$26,%xmm4,%xmm1
3274	vpsrldq	$6,%xmm4,%xmm3
3275	vpand	%xmm7,%xmm1,%xmm1
3276	vpsrlq	$4,%xmm3,%xmm2
3277	vpsrlq	$30,%xmm3,%xmm3
3278	vpand	%xmm7,%xmm2,%xmm2
3279	vpand	%xmm7,%xmm3,%xmm3
3280	vpsrldq	$13,%xmm4,%xmm4
3281	leal	144(%esp),%edx
/* Two passes through the multiply body. */
3282	movl	$2,%ecx
3283.L018square:
/* Save the current limbs; they are one operand of this pass and are merged
   into the output afterwards. */
3284	vmovdqa	%xmm0,(%esp)
3285	vmovdqa	%xmm1,16(%esp)
3286	vmovdqa	%xmm2,32(%esp)
3287	vmovdqa	%xmm3,48(%esp)
3288	vmovdqa	%xmm4,64(%esp)
/* Precompute 5*limb for limbs 1..4 as (limb << 2) + limb. */
3289	vpslld	$2,%xmm1,%xmm6
3290	vpslld	$2,%xmm2,%xmm5
3291	vpaddd	%xmm1,%xmm6,%xmm6
3292	vpaddd	%xmm2,%xmm5,%xmm5
3293	vmovdqa	%xmm6,80(%esp)
3294	vmovdqa	%xmm5,96(%esp)
3295	vpslld	$2,%xmm3,%xmm6
3296	vpslld	$2,%xmm4,%xmm5
3297	vpaddd	%xmm3,%xmm6,%xmm6
3298	vpaddd	%xmm4,%xmm5,%xmm5
3299	vmovdqa	%xmm6,112(%esp)
3300	vmovdqa	%xmm5,128(%esp)
/* Copy the low quadword of every limb into both halves (shuffle 0x44) and
   keep those copies at 0..64(%edx) as the second multiplication operand. */
3301	vpshufd	$68,%xmm0,%xmm5
3302	vmovdqa	%xmm1,%xmm6
3303	vpshufd	$68,%xmm1,%xmm1
3304	vpshufd	$68,%xmm2,%xmm2
3305	vpshufd	$68,%xmm3,%xmm3
3306	vpshufd	$68,%xmm4,%xmm4
3307	vmovdqa	%xmm5,(%edx)
3308	vmovdqa	%xmm1,16(%edx)
3309	vmovdqa	%xmm2,32(%edx)
3310	vmovdqa	%xmm3,48(%edx)
3311	vmovdqa	%xmm4,64(%edx)
/* Schoolbook 5x5 limb multiplication, accumulating into xmm0..xmm4.
   Partial products that would land above limb 4 use the 5*limb values
   instead, so they fold straight into the low limbs.  Start with the
   limb-0 row. */
3312	vpmuludq	%xmm0,%xmm4,%xmm4
3313	vpmuludq	%xmm0,%xmm3,%xmm3
3314	vpmuludq	%xmm0,%xmm2,%xmm2
3315	vpmuludq	%xmm0,%xmm1,%xmm1
3316	vpmuludq	%xmm0,%xmm5,%xmm0
/* Row for limb 1 (xmm6), then 5*limb1 from 80(%esp). */
3317	vpmuludq	48(%edx),%xmm6,%xmm5
3318	vpaddq	%xmm5,%xmm4,%xmm4
3319	vpmuludq	32(%edx),%xmm6,%xmm7
3320	vpaddq	%xmm7,%xmm3,%xmm3
3321	vpmuludq	16(%edx),%xmm6,%xmm5
3322	vpaddq	%xmm5,%xmm2,%xmm2
3323	vmovdqa	80(%esp),%xmm7
3324	vpmuludq	(%edx),%xmm6,%xmm6
3325	vpaddq	%xmm6,%xmm1,%xmm1
/* Row for limb 2 (32(%esp)) and 5*limb2 (96(%esp)). */
3326	vmovdqa	32(%esp),%xmm5
3327	vpmuludq	64(%edx),%xmm7,%xmm7
3328	vpaddq	%xmm7,%xmm0,%xmm0
3329	vpmuludq	32(%edx),%xmm5,%xmm6
3330	vpaddq	%xmm6,%xmm4,%xmm4
3331	vpmuludq	16(%edx),%xmm5,%xmm7
3332	vpaddq	%xmm7,%xmm3,%xmm3
3333	vmovdqa	96(%esp),%xmm6
3334	vpmuludq	(%edx),%xmm5,%xmm5
3335	vpaddq	%xmm5,%xmm2,%xmm2
3336	vpmuludq	64(%edx),%xmm6,%xmm7
3337	vpaddq	%xmm7,%xmm1,%xmm1
/* Row for limb 3 (48(%esp)) and 5*limb3 (112(%esp)). */
3338	vmovdqa	48(%esp),%xmm5
3339	vpmuludq	48(%edx),%xmm6,%xmm6
3340	vpaddq	%xmm6,%xmm0,%xmm0
3341	vpmuludq	16(%edx),%xmm5,%xmm7
3342	vpaddq	%xmm7,%xmm4,%xmm4
3343	vmovdqa	112(%esp),%xmm6
3344	vpmuludq	(%edx),%xmm5,%xmm5
3345	vpaddq	%xmm5,%xmm3,%xmm3
3346	vpmuludq	64(%edx),%xmm6,%xmm7
3347	vpaddq	%xmm7,%xmm2,%xmm2
3348	vpmuludq	48(%edx),%xmm6,%xmm5
3349	vpaddq	%xmm5,%xmm1,%xmm1
/* Row for limb 4 (64(%esp)) and 5*limb4 (128(%esp)). */
3350	vmovdqa	64(%esp),%xmm7
3351	vpmuludq	32(%edx),%xmm6,%xmm6
3352	vpaddq	%xmm6,%xmm0,%xmm0
3353	vmovdqa	128(%esp),%xmm5
3354	vpmuludq	(%edx),%xmm7,%xmm7
3355	vpaddq	%xmm7,%xmm4,%xmm4
3356	vpmuludq	64(%edx),%xmm5,%xmm6
3357	vpaddq	%xmm6,%xmm3,%xmm3
3358	vpmuludq	16(%edx),%xmm5,%xmm7
3359	vpaddq	%xmm7,%xmm0,%xmm0
3360	vpmuludq	32(%edx),%xmm5,%xmm6
3361	vpaddq	%xmm6,%xmm1,%xmm1
/* Reload the limb mask (xmm7 was used as a temporary above). */
3362	vmovdqa	64(%ebx),%xmm7
3363	vpmuludq	48(%edx),%xmm5,%xmm5
3364	vpaddq	%xmm5,%xmm2,%xmm2
/* Carry propagation back to 26-bit limbs.  Two chains are interleaved:
   limb3 -> limb4 -> limb0 and limb0 -> limb1 -> limb2 -> limb3 -> limb4.
   The carry out of limb 4 re-enters limb 0 multiplied by 5: it is added
   once, shifted left by 2, and added again. */
3365	vpsrlq	$26,%xmm3,%xmm5
3366	vpand	%xmm7,%xmm3,%xmm3
3367	vpsrlq	$26,%xmm0,%xmm6
3368	vpand	%xmm7,%xmm0,%xmm0
3369	vpaddq	%xmm5,%xmm4,%xmm4
3370	vpaddq	%xmm6,%xmm1,%xmm1
3371	vpsrlq	$26,%xmm4,%xmm5
3372	vpand	%xmm7,%xmm4,%xmm4
3373	vpsrlq	$26,%xmm1,%xmm6
3374	vpand	%xmm7,%xmm1,%xmm1
3375	vpaddq	%xmm6,%xmm2,%xmm2
3376	vpaddd	%xmm5,%xmm0,%xmm0
3377	vpsllq	$2,%xmm5,%xmm5
3378	vpsrlq	$26,%xmm2,%xmm6
3379	vpand	%xmm7,%xmm2,%xmm2
3380	vpaddd	%xmm5,%xmm0,%xmm0
3381	vpaddd	%xmm6,%xmm3,%xmm3
3382	vpsrlq	$26,%xmm3,%xmm6
3383	vpsrlq	$26,%xmm0,%xmm5
3384	vpand	%xmm7,%xmm0,%xmm0
3385	vpand	%xmm7,%xmm3,%xmm3
3386	vpaddd	%xmm5,%xmm1,%xmm1
3387	vpaddd	%xmm6,%xmm4,%xmm4
3388	decl	%ecx
3389	jz	.L019square_break
/* First pass done: put the fresh product in the low quadword and the saved
   input in the high quadword, then run the body once more. */
3390	vpunpcklqdq	(%esp),%xmm0,%xmm0
3391	vpunpcklqdq	16(%esp),%xmm1,%xmm1
3392	vpunpcklqdq	32(%esp),%xmm2,%xmm2
3393	vpunpcklqdq	48(%esp),%xmm3,%xmm3
3394	vpunpcklqdq	64(%esp),%xmm4,%xmm4
3395	jmp	.L018square
3396.L019square_break:
/* Second pass done: move the new products into the upper dword of each
   quadword and OR in the saved inputs of this pass (lower dwords). */
3397	vpsllq	$32,%xmm0,%xmm0
3398	vpsllq	$32,%xmm1,%xmm1
3399	vpsllq	$32,%xmm2,%xmm2
3400	vpsllq	$32,%xmm3,%xmm3
3401	vpsllq	$32,%xmm4,%xmm4
3402	vpor	(%esp),%xmm0,%xmm0
3403	vpor	16(%esp),%xmm1,%xmm1
3404	vpor	32(%esp),%xmm2,%xmm2
3405	vpor	48(%esp),%xmm3,%xmm3
3406	vpor	64(%esp),%xmm4,%xmm4
/* Shuffle 0x8d selects source dwords 1,3,0,2, which orders each entry as
   key^4, key^3, key^2, key^1. */
3407	vpshufd	$141,%xmm0,%xmm0
3408	vpshufd	$141,%xmm1,%xmm1
3409	vpshufd	$141,%xmm2,%xmm2
3410	vpshufd	$141,%xmm3,%xmm3
3411	vpshufd	$141,%xmm4,%xmm4
/* Store limbs 0..4 of the four powers. */
3412	vmovdqu	%xmm0,(%edi)
3413	vmovdqu	%xmm1,16(%edi)
3414	vmovdqu	%xmm2,32(%edi)
3415	vmovdqu	%xmm3,48(%edi)
3416	vmovdqu	%xmm4,64(%edi)
/* Store 5 * limbs 1..4 of the four powers. */
3417	vpslld	$2,%xmm1,%xmm6
3418	vpslld	$2,%xmm2,%xmm5
3419	vpaddd	%xmm1,%xmm6,%xmm6
3420	vpaddd	%xmm2,%xmm5,%xmm5
3421	vmovdqu	%xmm6,80(%edi)
3422	vmovdqu	%xmm5,96(%edi)
3423	vpslld	$2,%xmm3,%xmm6
3424	vpslld	$2,%xmm4,%xmm5
3425	vpaddd	%xmm3,%xmm6,%xmm6
3426	vpaddd	%xmm4,%xmm5,%xmm5
3427	vmovdqu	%xmm6,112(%edi)
3428	vmovdqu	%xmm5,128(%edi)
/* Drop the scratch frame and restore edi to the state pointer. */
3429	movl	%ebp,%esp
3430	leal	-48(%edi),%edi
3431	ret
3432.size	_poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2 -- absorb input into the accumulator, four 16-byte
 * blocks per 256-bit vector step.
 *
 * Stack arguments (offsets are valid after the four pushes below):
 *   20(%esp)  pointer to the hash state
 *   24(%esp)  pointer to the input
 *   28(%esp)  input length in bytes
 *   32(%esp)  pad selector; it is negated and used as an index into
 *             .Lconst_sse2, so it is assumed to be 0 or 1 -- TODO confirm
 *
 * State fields touched here:
 *   state+0..16   accumulator, five 26-bit limbs once state+20 is non-zero
 *   state+20      flag; set to 1 here after converting the accumulator
 *   state+48..    table written by _poly1305_init_avx2
 *
 * Flow:
 *   - length is rounded down to a multiple of 16; nothing to do if zero
 *   - fewer than 64 bytes while the flag is still zero: jump to
 *     .Lenter_blocks in poly1305_blocks (scalar code, same stack layout)
 *   - otherwise make sure the table exists and the accumulator is in 26-bit
 *     limbs, copy the table to an aligned stack frame, then:
 *       length not a multiple of 64: absorb the 1..3 leading blocks with a
 *         single .L027tail pass, then continue at .L024even
 *       .L028loop: every lane is multiplied by the same table column
 *       .L027tail: lanes are multiplied by per-lane columns (displacements
 *         are the loop's +4) and then summed across lanes
 *
 * Stack frame: 448 bytes, 512-byte aligned.  0..159(%esp) hold spilled
 * accumulator limbs, 160..447(%esp) hold the expanded table addressed
 * through %edx = 288(%esp).
 *
 * Saves and restores %ebp, %ebx, %esi, %edi.  Overwrites %eax, %ecx, %edx,
 * %ymm0-%ymm7 and the flags; vzeroupper is executed on entry to and exit
 * from the vector path.
 */
3433.align	32
3434.type	_poly1305_blocks_avx2,@function
3435.align	16
3436_poly1305_blocks_avx2:
3437	pushl	%ebp
3438	pushl	%ebx
3439	pushl	%esi
3440	pushl	%edi
3441	movl	20(%esp),%edi
3442	movl	24(%esp),%esi
3443	movl	28(%esp),%ecx
/* eax = flag word at state+20. */
3444	movl	20(%edi),%eax
/* Only whole 16-byte blocks are processed. */
3445	andl	$-16,%ecx
3446	jz	.L020nodata
/* Short input and accumulator not yet in 26-bit limbs: use the scalar code. */
3447	cmpl	$64,%ecx
3448	jae	.L021enter_avx2
3449	testl	%eax,%eax
3450	jz	.Lenter_blocks
3451.L021enter_avx2:
3452	vzeroupper
/* Position-independent way of getting the address of .Lconst_sse2 in ebx. */
3453	call	.L022pic_point
3454.L022pic_point:
3455	popl	%ebx
3456	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
3457	testl	%eax,%eax
3458	jnz	.L023base2_26
/* First use of the vector path: build the table, then re-slice the
   accumulator from bit offsets 0, 26, 52, 78, 104 using unaligned loads at
   byte offsets 0, 3, 6, 9, 13 followed by shifts of 0, 2, 4, 6, 0. */
3459	call	_poly1305_init_avx2
3460	movl	(%edi),%eax
3461	movl	3(%edi),%ecx
3462	movl	6(%edi),%edx
3463	movl	9(%edi),%esi
3464	movl	13(%edi),%ebp
3465	shrl	$2,%ecx
3466	andl	$67108863,%eax
3467	shrl	$4,%edx
3468	andl	$67108863,%ecx
3469	shrl	$6,%esi
3470	andl	$67108863,%edx
3471	movl	%eax,(%edi)
3472	movl	%ecx,4(%edi)
3473	movl	%edx,8(%edi)
3474	movl	%esi,12(%edi)
3475	movl	%ebp,16(%edi)
/* Mark the accumulator as being in 26-bit limbs. */
3476	movl	$1,20(%edi)
/* Reload input pointer and length, which were used as scratch above.
   NOTE(review): the length is reloaded without re-applying the andl $-16
   done at entry; presumably callers always pass a multiple of 16 -- confirm
   against the callers. */
3477	movl	24(%esp),%esi
3478	movl	28(%esp),%ecx
3479.L023base2_26:
/* eax = pad selector; ebp remembers esp while the aligned frame is in use. */
3480	movl	32(%esp),%eax
3481	movl	%esp,%ebp
3482	subl	$448,%esp
3483	andl	$-512,%esp
/* Expand the table: each 16-byte entry is spread over 32 bytes by the
   vpermq 0x40 / vpshufd 0xc8 pair so that the even dwords all hold key^4
   (read by the loop at displacement d) and the odd dwords hold key^4,
   key^3, key^2, key^1 (read by the tail at displacement d+4).  Entries
   land at -128..128(%edx) in the order limb0..limb4, 5*limb1..5*limb4. */
3484	vmovdqu	48(%edi),%xmm0
3485	leal	288(%esp),%edx
3486	vmovdqu	64(%edi),%xmm1
3487	vmovdqu	80(%edi),%xmm2
3488	vmovdqu	96(%edi),%xmm3
3489	vmovdqu	112(%edi),%xmm4
/* edi now points at the table; the accumulator is at -48..-32(%edi). */
3490	leal	48(%edi),%edi
3491	vpermq	$64,%ymm0,%ymm0
3492	vpermq	$64,%ymm1,%ymm1
3493	vpermq	$64,%ymm2,%ymm2
3494	vpermq	$64,%ymm3,%ymm3
3495	vpermq	$64,%ymm4,%ymm4
3496	vpshufd	$200,%ymm0,%ymm0
3497	vpshufd	$200,%ymm1,%ymm1
3498	vpshufd	$200,%ymm2,%ymm2
3499	vpshufd	$200,%ymm3,%ymm3
3500	vpshufd	$200,%ymm4,%ymm4
3501	vmovdqa	%ymm0,-128(%edx)
3502	vmovdqu	80(%edi),%xmm0
3503	vmovdqa	%ymm1,-96(%edx)
3504	vmovdqu	96(%edi),%xmm1
3505	vmovdqa	%ymm2,-64(%edx)
3506	vmovdqu	112(%edi),%xmm2
3507	vmovdqa	%ymm3,-32(%edx)
3508	vmovdqu	128(%edi),%xmm3
3509	vmovdqa	%ymm4,(%edx)
3510	vpermq	$64,%ymm0,%ymm0
3511	vpermq	$64,%ymm1,%ymm1
3512	vpermq	$64,%ymm2,%ymm2
3513	vpermq	$64,%ymm3,%ymm3
3514	vpshufd	$200,%ymm0,%ymm0
3515	vpshufd	$200,%ymm1,%ymm1
3516	vpshufd	$200,%ymm2,%ymm2
3517	vpshufd	$200,%ymm3,%ymm3
/* Interleaved with the last table stores: load the five accumulator limbs
   into lane 0 of xmm0..xmm4 (vmovd clears the rest of each register). */
3518	vmovdqa	%ymm0,32(%edx)
3519	vmovd	-48(%edi),%xmm0
3520	vmovdqa	%ymm1,64(%edx)
3521	vmovd	-44(%edi),%xmm1
3522	vmovdqa	%ymm2,96(%edx)
3523	vmovd	-40(%edi),%xmm2
3524	vmovdqa	%ymm3,128(%edx)
3525	vmovd	-36(%edi),%xmm3
3526	vmovd	-32(%edi),%xmm4
/* ymm7 = limb mask in every lane; eax = -(pad selector). */
3527	vmovdqa	64(%ebx),%ymm7
3528	negl	%eax
/* Length a multiple of 64: go straight to the four-block path. */
3529	testl	$63,%ecx
3530	jz	.L024even
/* Otherwise peel off the leading 1..3 blocks: ecx keeps the multiple-of-64
   part, edx the remainder (16, 32 or 48 when the length is a multiple of
   16).  The flags from cmpl $32 are consumed by both jb and je because the
   vector load in between leaves them untouched. */
3531	movl	%ecx,%edx
3532	andl	$-64,%ecx
3533	andl	$63,%edx
3534	vmovdqu	(%esi),%xmm5
3535	cmpl	$32,%edx
3536	jb	.L025one
3537	vmovdqu	16(%esi),%xmm6
3538	je	.L026two
/* Three blocks: third block goes into the upper half of ymm5.  ebx and edx
   are offset by 8 so that three lanes get the pad constant and the powers
   key^3, key^2, key^1. */
3539	vinserti128	$1,32(%esi),%ymm5,%ymm5
3540	leal	48(%esi),%esi
3541	leal	8(%ebx),%ebx
3542	leal	296(%esp),%edx
3543	jmp	.L027tail
3544.L026two:
/* Two blocks: offset 16 selects two pad lanes and the powers key^2, key^1. */
3545	leal	32(%esi),%esi
3546	leal	16(%ebx),%ebx
3547	leal	304(%esp),%edx
3548	jmp	.L027tail
3549.L025one:
/* One block: ymm6 is cleared, and the pad offset is 32 + 8*eax, i.e. 24 (one
   pad lane) when the pad selector is 1 and 32 (the all-zero row) when it is
   0.  This is the only path that consults the pad selector. */
3550	leal	16(%esi),%esi
3551	vpxor	%ymm6,%ymm6,%ymm6
3552	leal	32(%ebx,%eax,8),%ebx
3553	leal	312(%esp),%edx
3554	jmp	.L027tail
3555.align	32
3556.L024even:
/* Load four blocks: blocks 0 and 2 in ymm5, blocks 1 and 3 in ymm6. */
3557	vmovdqu	(%esi),%xmm5
3558	vmovdqu	16(%esi),%xmm6
3559	vinserti128	$1,32(%esi),%ymm5,%ymm5
3560	vinserti128	$1,48(%esi),%ymm6,%ymm6
3561	leal	64(%esi),%esi
/* If these are the last 64 bytes they are handled by the tail. */
3562	subl	$64,%ecx
3563	jz	.L027tail
3564.L028loop:
/* Spill accumulator limbs 0..2 to free registers, and split the freshly
   loaded blocks into 26-bit limbs: ymm5 = limb 0, ymm6 = limb 1,
   ymm2 = limb 2, ymm0 = limb 3, ymm1 = limb 4 (with the pad bit ORed in
   from (%ebx)). */
3565	vmovdqa	%ymm2,64(%esp)
3566	vpsrldq	$6,%ymm5,%ymm2
3567	vmovdqa	%ymm0,(%esp)
3568	vpsrldq	$6,%ymm6,%ymm0
3569	vmovdqa	%ymm1,32(%esp)
3570	vpunpckhqdq	%ymm6,%ymm5,%ymm1
3571	vpunpcklqdq	%ymm6,%ymm5,%ymm5
3572	vpunpcklqdq	%ymm0,%ymm2,%ymm2
3573	vpsrlq	$30,%ymm2,%ymm0
3574	vpsrlq	$4,%ymm2,%ymm2
3575	vpsrlq	$26,%ymm5,%ymm6
3576	vpsrlq	$40,%ymm1,%ymm1
3577	vpand	%ymm7,%ymm2,%ymm2
3578	vpand	%ymm7,%ymm5,%ymm5
3579	vpand	%ymm7,%ymm6,%ymm6
3580	vpand	%ymm7,%ymm0,%ymm0
3581	vpor	(%ebx),%ymm1,%ymm1
/* Add the accumulator to the input limbs. */
3582	vpaddq	64(%esp),%ymm2,%ymm2
3583	vpaddq	(%esp),%ymm5,%ymm5
3584	vpaddq	32(%esp),%ymm6,%ymm6
3585	vpaddq	%ymm3,%ymm0,%ymm0
3586	vpaddq	%ymm4,%ymm1,%ymm1
/* 5x5 limb multiplication by the table column; results accumulate in
   ymm0..ymm4 (limbs 0..4).  Limbs 1, 3 and 4 of the sum are parked at
   32, 96 and 128(%esp) until their rows are processed.  Row for limb 2: */
3587	vpmuludq	-96(%edx),%ymm2,%ymm3
3588	vmovdqa	%ymm6,32(%esp)
3589	vpmuludq	-64(%edx),%ymm2,%ymm4
3590	vmovdqa	%ymm0,96(%esp)
3591	vpmuludq	96(%edx),%ymm2,%ymm0
3592	vmovdqa	%ymm1,128(%esp)
3593	vpmuludq	128(%edx),%ymm2,%ymm1
3594	vpmuludq	-128(%edx),%ymm2,%ymm2
/* Row for limb 0 (ymm5). */
3595	vpmuludq	-32(%edx),%ymm5,%ymm7
3596	vpaddq	%ymm7,%ymm3,%ymm3
3597	vpmuludq	(%edx),%ymm5,%ymm6
3598	vpaddq	%ymm6,%ymm4,%ymm4
3599	vpmuludq	-128(%edx),%ymm5,%ymm7
3600	vpaddq	%ymm7,%ymm0,%ymm0
3601	vmovdqa	32(%esp),%ymm7
3602	vpmuludq	-96(%edx),%ymm5,%ymm6
3603	vpaddq	%ymm6,%ymm1,%ymm1
3604	vpmuludq	-64(%edx),%ymm5,%ymm5
3605	vpaddq	%ymm5,%ymm2,%ymm2
/* Row for limb 1 (ymm7, reloaded from 32(%esp)). */
3606	vpmuludq	-64(%edx),%ymm7,%ymm6
3607	vpaddq	%ymm6,%ymm3,%ymm3
3608	vpmuludq	-32(%edx),%ymm7,%ymm5
3609	vpaddq	%ymm5,%ymm4,%ymm4
3610	vpmuludq	128(%edx),%ymm7,%ymm6
3611	vpaddq	%ymm6,%ymm0,%ymm0
3612	vmovdqa	96(%esp),%ymm6
3613	vpmuludq	-128(%edx),%ymm7,%ymm5
3614	vpaddq	%ymm5,%ymm1,%ymm1
3615	vpmuludq	-96(%edx),%ymm7,%ymm7
3616	vpaddq	%ymm7,%ymm2,%ymm2
/* Row for limb 3 (ymm6, reloaded from 96(%esp)). */
3617	vpmuludq	-128(%edx),%ymm6,%ymm5
3618	vpaddq	%ymm5,%ymm3,%ymm3
3619	vpmuludq	-96(%edx),%ymm6,%ymm7
3620	vpaddq	%ymm7,%ymm4,%ymm4
3621	vpmuludq	64(%edx),%ymm6,%ymm5
3622	vpaddq	%ymm5,%ymm0,%ymm0
3623	vmovdqa	128(%esp),%ymm5
3624	vpmuludq	96(%edx),%ymm6,%ymm7
3625	vpaddq	%ymm7,%ymm1,%ymm1
3626	vpmuludq	128(%edx),%ymm6,%ymm6
3627	vpaddq	%ymm6,%ymm2,%ymm2
/* Row for limb 4 (ymm5, reloaded from 128(%esp)); the limb mask is reloaded
   into ymm7 along the way. */
3628	vpmuludq	128(%edx),%ymm5,%ymm7
3629	vpaddq	%ymm7,%ymm3,%ymm3
3630	vpmuludq	32(%edx),%ymm5,%ymm6
3631	vpaddq	%ymm6,%ymm0,%ymm0
3632	vpmuludq	-128(%edx),%ymm5,%ymm7
3633	vpaddq	%ymm7,%ymm4,%ymm4
3634	vmovdqa	64(%ebx),%ymm7
3635	vpmuludq	64(%edx),%ymm5,%ymm6
3636	vpaddq	%ymm6,%ymm1,%ymm1
3637	vpmuludq	96(%edx),%ymm5,%ymm5
3638	vpaddq	%ymm5,%ymm2,%ymm2
/* Carry propagation back to 26-bit limbs; the carry out of limb 4 is added
   to limb 0 once, shifted left by 2 and added again (times 5 in total). */
3639	vpsrlq	$26,%ymm3,%ymm5
3640	vpand	%ymm7,%ymm3,%ymm3
3641	vpsrlq	$26,%ymm0,%ymm6
3642	vpand	%ymm7,%ymm0,%ymm0
3643	vpaddq	%ymm5,%ymm4,%ymm4
3644	vpaddq	%ymm6,%ymm1,%ymm1
3645	vpsrlq	$26,%ymm4,%ymm5
3646	vpand	%ymm7,%ymm4,%ymm4
3647	vpsrlq	$26,%ymm1,%ymm6
3648	vpand	%ymm7,%ymm1,%ymm1
3649	vpaddq	%ymm6,%ymm2,%ymm2
3650	vpaddq	%ymm5,%ymm0,%ymm0
3651	vpsllq	$2,%ymm5,%ymm5
3652	vpsrlq	$26,%ymm2,%ymm6
3653	vpand	%ymm7,%ymm2,%ymm2
3654	vpaddq	%ymm5,%ymm0,%ymm0
3655	vpaddq	%ymm6,%ymm3,%ymm3
3656	vpsrlq	$26,%ymm3,%ymm6
3657	vpsrlq	$26,%ymm0,%ymm5
3658	vpand	%ymm7,%ymm0,%ymm0
3659	vpand	%ymm7,%ymm3,%ymm3
3660	vpaddq	%ymm5,%ymm1,%ymm1
3661	vpaddq	%ymm6,%ymm4,%ymm4
/* Fetch the next four blocks; loop while more than these remain. */
3662	vmovdqu	(%esi),%xmm5
3663	vmovdqu	16(%esi),%xmm6
3664	vinserti128	$1,32(%esi),%ymm5,%ymm5
3665	vinserti128	$1,48(%esi),%ymm6,%ymm6
3666	leal	64(%esi),%esi
3667	subl	$64,%ecx
3668	jnz	.L028loop
3669.L027tail:
/* Same limb split and accumulator add as in the loop. */
3670	vmovdqa	%ymm2,64(%esp)
3671	vpsrldq	$6,%ymm5,%ymm2
3672	vmovdqa	%ymm0,(%esp)
3673	vpsrldq	$6,%ymm6,%ymm0
3674	vmovdqa	%ymm1,32(%esp)
3675	vpunpckhqdq	%ymm6,%ymm5,%ymm1
3676	vpunpcklqdq	%ymm6,%ymm5,%ymm5
3677	vpunpcklqdq	%ymm0,%ymm2,%ymm2
3678	vpsrlq	$30,%ymm2,%ymm0
3679	vpsrlq	$4,%ymm2,%ymm2
3680	vpsrlq	$26,%ymm5,%ymm6
3681	vpsrlq	$40,%ymm1,%ymm1
3682	vpand	%ymm7,%ymm2,%ymm2
3683	vpand	%ymm7,%ymm5,%ymm5
3684	vpand	%ymm7,%ymm6,%ymm6
3685	vpand	%ymm7,%ymm0,%ymm0
3686	vpor	(%ebx),%ymm1,%ymm1
/* Undo the 8/16/24 byte offset the partial-block paths added to ebx; the
   constant table is 64-byte aligned. */
3687	andl	$-64,%ebx
3688	vpaddq	64(%esp),%ymm2,%ymm2
3689	vpaddq	(%esp),%ymm5,%ymm5
3690	vpaddq	32(%esp),%ymm6,%ymm6
3691	vpaddq	%ymm3,%ymm0,%ymm0
3692	vpaddq	%ymm4,%ymm1,%ymm1
/* Same multiplication schedule as the loop, but every table displacement is
   4 bytes larger, which selects the odd dwords of each entry: a different
   key power per lane, so the lanes can be summed afterwards. */
3693	vpmuludq	-92(%edx),%ymm2,%ymm3
3694	vmovdqa	%ymm6,32(%esp)
3695	vpmuludq	-60(%edx),%ymm2,%ymm4
3696	vmovdqa	%ymm0,96(%esp)
3697	vpmuludq	100(%edx),%ymm2,%ymm0
3698	vmovdqa	%ymm1,128(%esp)
3699	vpmuludq	132(%edx),%ymm2,%ymm1
3700	vpmuludq	-124(%edx),%ymm2,%ymm2
3701	vpmuludq	-28(%edx),%ymm5,%ymm7
3702	vpaddq	%ymm7,%ymm3,%ymm3
3703	vpmuludq	4(%edx),%ymm5,%ymm6
3704	vpaddq	%ymm6,%ymm4,%ymm4
3705	vpmuludq	-124(%edx),%ymm5,%ymm7
3706	vpaddq	%ymm7,%ymm0,%ymm0
3707	vmovdqa	32(%esp),%ymm7
3708	vpmuludq	-92(%edx),%ymm5,%ymm6
3709	vpaddq	%ymm6,%ymm1,%ymm1
3710	vpmuludq	-60(%edx),%ymm5,%ymm5
3711	vpaddq	%ymm5,%ymm2,%ymm2
3712	vpmuludq	-60(%edx),%ymm7,%ymm6
3713	vpaddq	%ymm6,%ymm3,%ymm3
3714	vpmuludq	-28(%edx),%ymm7,%ymm5
3715	vpaddq	%ymm5,%ymm4,%ymm4
3716	vpmuludq	132(%edx),%ymm7,%ymm6
3717	vpaddq	%ymm6,%ymm0,%ymm0
3718	vmovdqa	96(%esp),%ymm6
3719	vpmuludq	-124(%edx),%ymm7,%ymm5
3720	vpaddq	%ymm5,%ymm1,%ymm1
3721	vpmuludq	-92(%edx),%ymm7,%ymm7
3722	vpaddq	%ymm7,%ymm2,%ymm2
3723	vpmuludq	-124(%edx),%ymm6,%ymm5
3724	vpaddq	%ymm5,%ymm3,%ymm3
3725	vpmuludq	-92(%edx),%ymm6,%ymm7
3726	vpaddq	%ymm7,%ymm4,%ymm4
3727	vpmuludq	68(%edx),%ymm6,%ymm5
3728	vpaddq	%ymm5,%ymm0,%ymm0
3729	vmovdqa	128(%esp),%ymm5
3730	vpmuludq	100(%edx),%ymm6,%ymm7
3731	vpaddq	%ymm7,%ymm1,%ymm1
3732	vpmuludq	132(%edx),%ymm6,%ymm6
3733	vpaddq	%ymm6,%ymm2,%ymm2
3734	vpmuludq	132(%edx),%ymm5,%ymm7
3735	vpaddq	%ymm7,%ymm3,%ymm3
3736	vpmuludq	36(%edx),%ymm5,%ymm6
3737	vpaddq	%ymm6,%ymm0,%ymm0
3738	vpmuludq	-124(%edx),%ymm5,%ymm7
3739	vpaddq	%ymm7,%ymm4,%ymm4
3740	vmovdqa	64(%ebx),%ymm7
3741	vpmuludq	68(%edx),%ymm5,%ymm6
3742	vpaddq	%ymm6,%ymm1,%ymm1
3743	vpmuludq	100(%edx),%ymm5,%ymm5
3744	vpaddq	%ymm5,%ymm2,%ymm2
/* Horizontal sum of the four 64-bit lanes of every limb: first fold the
   upper quadword of each 128-bit half onto the lower one, then fold the
   upper half onto the lower half (vpermq 0x02 brings quadword 2 down). */
3745	vpsrldq	$8,%ymm4,%ymm5
3746	vpsrldq	$8,%ymm3,%ymm6
3747	vpaddq	%ymm5,%ymm4,%ymm4
3748	vpsrldq	$8,%ymm0,%ymm5
3749	vpaddq	%ymm6,%ymm3,%ymm3
3750	vpsrldq	$8,%ymm1,%ymm6
3751	vpaddq	%ymm5,%ymm0,%ymm0
3752	vpsrldq	$8,%ymm2,%ymm5
3753	vpaddq	%ymm6,%ymm1,%ymm1
3754	vpermq	$2,%ymm4,%ymm6
3755	vpaddq	%ymm5,%ymm2,%ymm2
3756	vpermq	$2,%ymm3,%ymm5
3757	vpaddq	%ymm6,%ymm4,%ymm4
3758	vpermq	$2,%ymm0,%ymm6
3759	vpaddq	%ymm5,%ymm3,%ymm3
3760	vpermq	$2,%ymm1,%ymm5
3761	vpaddq	%ymm6,%ymm0,%ymm0
3762	vpermq	$2,%ymm2,%ymm6
3763	vpaddq	%ymm5,%ymm1,%ymm1
3764	vpaddq	%ymm6,%ymm2,%ymm2
/* Carry propagation, identical to the one in the loop. */
3765	vpsrlq	$26,%ymm3,%ymm5
3766	vpand	%ymm7,%ymm3,%ymm3
3767	vpsrlq	$26,%ymm0,%ymm6
3768	vpand	%ymm7,%ymm0,%ymm0
3769	vpaddq	%ymm5,%ymm4,%ymm4
3770	vpaddq	%ymm6,%ymm1,%ymm1
3771	vpsrlq	$26,%ymm4,%ymm5
3772	vpand	%ymm7,%ymm4,%ymm4
3773	vpsrlq	$26,%ymm1,%ymm6
3774	vpand	%ymm7,%ymm1,%ymm1
3775	vpaddq	%ymm6,%ymm2,%ymm2
3776	vpaddq	%ymm5,%ymm0,%ymm0
3777	vpsllq	$2,%ymm5,%ymm5
3778	vpsrlq	$26,%ymm2,%ymm6
3779	vpand	%ymm7,%ymm2,%ymm2
3780	vpaddq	%ymm5,%ymm0,%ymm0
3781	vpaddq	%ymm6,%ymm3,%ymm3
3782	vpsrlq	$26,%ymm3,%ymm6
3783	vpsrlq	$26,%ymm0,%ymm5
3784	vpand	%ymm7,%ymm0,%ymm0
3785	vpand	%ymm7,%ymm3,%ymm3
3786	vpaddq	%ymm5,%ymm1,%ymm1
3787	vpaddq	%ymm6,%ymm4,%ymm4
/* ecx is non-zero only when this tail pass absorbed the leading partial
   group; in that case keep just lane 0 of the accumulator (the VEX-encoded
   xmm shuffle clears the upper half), reset edx to the unshifted table base
   and continue with the full four-block groups. */
3788	cmpl	$0,%ecx
3789	je	.L029done
3790	vpshufd	$252,%xmm0,%xmm0
3791	leal	288(%esp),%edx
3792	vpshufd	$252,%xmm1,%xmm1
3793	vpshufd	$252,%xmm2,%xmm2
3794	vpshufd	$252,%xmm3,%xmm3
3795	vpshufd	$252,%xmm4,%xmm4
3796	jmp	.L024even
3797.align	16
3798.L029done:
/* Write the five accumulator limbs back to state+0..16 (edi was advanced by
   48 earlier), leave AVX state clean and drop the aligned frame. */
3799	vmovd	%xmm0,-48(%edi)
3800	vmovd	%xmm1,-44(%edi)
3801	vmovd	%xmm2,-40(%edi)
3802	vmovd	%xmm3,-36(%edi)
3803	vmovd	%xmm4,-32(%edi)
3804	vzeroupper
3805	movl	%ebp,%esp
3806.L020nodata:
3807	popl	%edi
3808	popl	%esi
3809	popl	%ebx
3810	popl	%ebp
3811	ret
3812.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * .Lconst_sse2 -- constants shared by the SSE2 and AVX2 routines, addressed
 * relative to %ebx.  The 64-byte alignment is relied upon by
 * _poly1305_blocks_avx2, which recovers the table base with
 * "andl $-64,%ebx" after offsetting %ebx for partial vectors.
 */
3813.align	64
3814.Lconst_sse2:
/* Offset 0: 1<<24 in the low dword of each quadword.  ORed into the top limb
   of every input block; that limb starts at bit 104, so this sets bit 128. */
3815.long	16777216,0,16777216,0,16777216,0,16777216,0
/* Offset 32: zeros.  Reading the pad constant at offset 8, 16 or 24 pulls in
   3, 2 or 1 pad lanes followed by zeros from this row; offset 32 yields no
   pad at all. */
3816.long	0,0,0,0,0,0,0,0
/* Offset 64: 2^26-1, the 26-bit limb mask, in the low dword of each
   quadword. */
3817.long	67108863,0,67108863,0,67108863,0,67108863,0
/* Offset 96: 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc -- the same
   values poly1305_init applies to the key words with andl.  No use of this
   row is visible in this part of the file. */
3818.long	268435455,268435452,268435452,268435452
/* ASCII, NUL-terminated:
   "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" */
3819.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
3820.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
3821.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
3822.byte	114,103,62,0
3823.align	4
/* 16-byte, 4-byte-aligned common symbol; poly1305_init reads dwords 0 and 2
   of it to choose between the scalar, SSE2 and AVX2 routines. */
3824.comm	OPENSSL_ia32cap_P,16,4
3825#endif
3826