xref: /freebsd/sys/crypto/openssl/i386/poly1305-x86.S (revision 2a63c3be158216222d89a073dcbd6a72ee4aab5a)
1/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
2#ifdef PIC
3.text
4.align	64
/*
 * poly1305_init -- exported entry point taking three stack arguments.
 *   arg0 (20(%esp) after the four pushes) -> %edi: state buffer.  Its first
 *        24 bytes are zeroed and the masked key words go to offsets 24..39.
 *   arg1 -> %esi: pointer to 16 key bytes, or NULL.
 *   arg2 -> %ebp: two-slot table that receives the addresses of the block
 *        routine (slot 0) and the emit routine (slot 1) chosen for this CPU.
 * Returns %eax = 0 when arg1 is NULL (only the zeroing is done), else 1.
 * %ebp, %ebx, %esi and %edi are saved on entry and restored on exit.
 */
5.globl	poly1305_init
6.type	poly1305_init,@function
7.align	16
8poly1305_init:
9.L_poly1305_init_begin:
10	pushl	%ebp
11	pushl	%ebx
12	pushl	%esi
13	pushl	%edi
14	movl	20(%esp),%edi
15	movl	24(%esp),%esi
16	movl	28(%esp),%ebp
/* Zero state bytes 0..23; %eax = 0 doubles as the return value on the NULL-key path. */
17	xorl	%eax,%eax
18	movl	%eax,(%edi)
19	movl	%eax,4(%edi)
20	movl	%eax,8(%edi)
21	movl	%eax,12(%edi)
22	movl	%eax,16(%edi)
23	movl	%eax,20(%edi)
24	cmpl	$0,%esi
25	je	.L000nokey
/* call/pop yields the run-time address of .L001pic_point in %ebx, so every symbol below is addressed position-independently. */
26	call	.L001pic_point
27.L001pic_point:
28	popl	%ebx
/* Default choice: the integer-only routines. */
29	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
30	leal	poly1305_emit-.L001pic_point(%ebx),%edx
31	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
/*
 * 83886080 = 0x05000000: bits 24 and 26 of the first capability word must
 * both be set, otherwise the defaults are kept (branch target "no_sse2").
 */
32	movl	(%edi),%ecx
33	andl	$83886080,%ecx
34	cmpl	$83886080,%ecx
35	jne	.L002no_sse2
36	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
37	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
/* Bit 5 of the capability word at offset 8 upgrades only the block routine to the AVX2 one; the emit routine stays the SSE2 one. */
38	movl	8(%edi),%ecx
39	testl	$32,%ecx
40	jz	.L002no_sse2
41	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
42.L002no_sse2:
/* %edi was reused for the capability vector; reload arg0 and publish the two routine addresses. */
43	movl	20(%esp),%edi
44	movl	%eax,(%ebp)
45	movl	%edx,4(%ebp)
/*
 * Load the four key words and mask them: word 0 with 0x0fffffff, words 1-3
 * with 0x0ffffffc (the Poly1305 clamp of r), then store at 24..39 of the state.
 */
46	movl	(%esi),%eax
47	movl	4(%esi),%ebx
48	movl	8(%esi),%ecx
49	movl	12(%esi),%edx
50	andl	$268435455,%eax
51	andl	$268435452,%ebx
52	andl	$268435452,%ecx
53	andl	$268435452,%edx
54	movl	%eax,24(%edi)
55	movl	%ebx,28(%edi)
56	movl	%ecx,32(%edi)
57	movl	%edx,36(%edi)
/* Return 1: a key was installed and the routine table was filled in. */
58	movl	$1,%eax
59.L000nokey:
60	popl	%edi
61	popl	%esi
62	popl	%ebx
63	popl	%ebp
64	ret
65.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks -- integer-only block routine, four stack arguments.
 *   arg0 -> %edi: state; accumulator words h0..h4 at 0..19, key words
 *        r0..r3 at 24..39.
 *   arg1 -> %esi: input pointer.
 *   arg2 -> %ecx: input length in bytes, rounded down to a multiple of 16.
 *   arg3: word added, with carry, on top of every 16-byte block (read as
 *        96(%esp) inside the loop).
 * .Lenter_blocks is also a jump target for _poly1305_blocks_sse2, which
 * arrives with the same four registers pushed and %edi/%esi/%ecx loaded.
 *
 * Frame after `subl $64,%esp`:
 *    0,12,16(%esp)  h0, h3, h4 of the current block (after adding input)
 *   20,24,28(%esp)  low three result words of the product
 *   36..48(%esp)    r0..r3
 *   52..60(%esp)    s1..s3, s_i = r_i + (r_i >> 2); this is 5*r_i/4 because
 *                   poly1305_init cleared the low two bits of r1..r3
 *   84(%esp)        arg0, 92(%esp) end-of-input pointer (overwrites arg2),
 *   96(%esp)        arg3
 */
66.globl	poly1305_blocks
67.type	poly1305_blocks,@function
68.align	16
69poly1305_blocks:
70.L_poly1305_blocks_begin:
71	pushl	%ebp
72	pushl	%ebx
73	pushl	%esi
74	pushl	%edi
75	movl	20(%esp),%edi
76	movl	24(%esp),%esi
77	movl	28(%esp),%ecx
78.Lenter_blocks:
/*
 * Round the length down to whole 16-byte blocks.  The mask must be -16:
 * with -15 (0xfffffff1) bit 0 of an odd length survives, the end pointer
 * computed below is then never reached by the 16-byte stride, and the
 * `jne` loop exit cannot fire.  -16 also matches _poly1305_blocks_sse2.
 * This file is generated from poly1305-x86.pl, so the generator needs the
 * same change.
 */
79	andl	$-16,%ecx
80	jz	.L003nodata
81	subl	$64,%esp
82	movl	24(%edi),%eax
83	movl	28(%edi),%ebx
/* %ebp = end of input; it is parked in the arg2 slot, and %ebp then becomes the input cursor. */
84	leal	(%esi,%ecx,1),%ebp
85	movl	32(%edi),%ecx
86	movl	36(%edi),%edx
87	movl	%ebp,92(%esp)
88	movl	%esi,%ebp
/* Spill r0..r3 and derive s1..s3 = r_i + (r_i >> 2). */
89	movl	%eax,36(%esp)
90	movl	%ebx,%eax
91	shrl	$2,%eax
92	movl	%ebx,40(%esp)
93	addl	%ebx,%eax
94	movl	%ecx,%ebx
95	shrl	$2,%ebx
96	movl	%ecx,44(%esp)
97	addl	%ecx,%ebx
98	movl	%edx,%ecx
99	shrl	$2,%ecx
100	movl	%edx,48(%esp)
101	addl	%edx,%ecx
102	movl	%eax,52(%esp)
103	movl	%ebx,56(%esp)
104	movl	%ecx,60(%esp)
/* Load the accumulator: h0..h4 live in %eax,%ebx,%ecx,%esi,%edi across iterations. */
105	movl	(%edi),%eax
106	movl	4(%edi),%ebx
107	movl	8(%edi),%ecx
108	movl	12(%edi),%esi
109	movl	16(%edi),%edi
110	jmp	.L004loop
111.align	32
112.L004loop:
/* h += next 16 input bytes; arg3 is added into h4 together with the carry. */
113	addl	(%ebp),%eax
114	adcl	4(%ebp),%ebx
115	adcl	8(%ebp),%ecx
116	adcl	12(%ebp),%esi
117	leal	16(%ebp),%ebp
118	adcl	96(%esp),%edi
119	movl	%eax,(%esp)
120	movl	%esi,12(%esp)
/* Column 0: h0*r0 + h1*s3 + h2*s2 + h3*s1, accumulated as %esi:%edi (high:low). */
121	mull	36(%esp)
122	movl	%edi,16(%esp)
123	movl	%eax,%edi
124	movl	%ebx,%eax
125	movl	%edx,%esi
126	mull	60(%esp)
127	addl	%eax,%edi
128	movl	%ecx,%eax
129	adcl	%edx,%esi
130	mull	56(%esp)
131	addl	%eax,%edi
132	movl	12(%esp),%eax
133	adcl	%edx,%esi
134	mull	52(%esp)
135	addl	%eax,%edi
136	movl	(%esp),%eax
137	adcl	%edx,%esi
/* Column 1: carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 (h4 term via 32-bit imull); low word of column 0 is saved at 20(%esp). */
138	mull	40(%esp)
139	movl	%edi,20(%esp)
140	xorl	%edi,%edi
141	addl	%eax,%esi
142	movl	%ebx,%eax
143	adcl	%edx,%edi
144	mull	36(%esp)
145	addl	%eax,%esi
146	movl	%ecx,%eax
147	adcl	%edx,%edi
148	mull	60(%esp)
149	addl	%eax,%esi
150	movl	12(%esp),%eax
151	adcl	%edx,%edi
152	mull	56(%esp)
153	addl	%eax,%esi
154	movl	16(%esp),%eax
155	adcl	%edx,%edi
156	imull	52(%esp),%eax
157	addl	%eax,%esi
158	movl	(%esp),%eax
159	adcl	$0,%edi
/* Column 2: carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2; low word of column 1 is saved at 24(%esp). */
160	mull	44(%esp)
161	movl	%esi,24(%esp)
162	xorl	%esi,%esi
163	addl	%eax,%edi
164	movl	%ebx,%eax
165	adcl	%edx,%esi
166	mull	40(%esp)
167	addl	%eax,%edi
168	movl	%ecx,%eax
169	adcl	%edx,%esi
170	mull	36(%esp)
171	addl	%eax,%edi
172	movl	12(%esp),%eax
173	adcl	%edx,%esi
174	mull	60(%esp)
175	addl	%eax,%edi
176	movl	16(%esp),%eax
177	adcl	%edx,%esi
178	imull	56(%esp),%eax
179	addl	%eax,%edi
180	movl	(%esp),%eax
181	adcl	$0,%esi
/* Column 3: carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3; low word of column 2 is saved at 28(%esp). */
182	mull	48(%esp)
183	movl	%edi,28(%esp)
184	xorl	%edi,%edi
185	addl	%eax,%esi
186	movl	%ebx,%eax
187	adcl	%edx,%edi
188	mull	44(%esp)
189	addl	%eax,%esi
190	movl	%ecx,%eax
191	adcl	%edx,%edi
192	mull	40(%esp)
193	addl	%eax,%esi
194	movl	12(%esp),%eax
195	adcl	%edx,%edi
196	mull	36(%esp)
197	addl	%eax,%esi
198	movl	16(%esp),%ecx
199	adcl	%edx,%edi
200	movl	%ecx,%edx
201	imull	60(%esp),%ecx
202	addl	%ecx,%esi
203	movl	20(%esp),%eax
204	adcl	$0,%edi
/* Column 4: carry + h4*r0 ends up in %edx; reload the low result words into %eax,%ebx,%ecx. */
205	imull	36(%esp),%edx
206	addl	%edi,%edx
207	movl	24(%esp),%ebx
208	movl	28(%esp),%ecx
/* Reduce: keep the low two bits of the top word as the new h4, and fold (top >> 2) * 5 back into h0 with carry propagation. */
209	movl	%edx,%edi
210	shrl	$2,%edx
211	andl	$3,%edi
212	leal	(%edx,%edx,4),%edx
213	addl	%edx,%eax
214	adcl	$0,%ebx
215	adcl	$0,%ecx
216	adcl	$0,%esi
217	adcl	$0,%edi
/* Loop until the cursor reaches the end pointer stored at 92(%esp). */
218	cmpl	92(%esp),%ebp
219	jne	.L004loop
/* Fetch arg0 (84(%esp) while the frame is still allocated) and write h0..h4 back. */
220	movl	84(%esp),%edx
221	addl	$64,%esp
222	movl	%eax,(%edx)
223	movl	%ebx,4(%edx)
224	movl	%ecx,8(%edx)
225	movl	%esi,12(%edx)
226	movl	%edi,16(%edx)
227.L003nodata:
228	popl	%edi
229	popl	%esi
230	popl	%ebx
231	popl	%ebp
232	ret
233.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit -- integer-only finalisation, three stack arguments.
 *   arg0 -> %ebp: state; accumulator words h0..h4 are read from 0..19.
 *   arg1 -> %edi: 16-byte output buffer (also used as scratch).
 *   arg2 -> %ebp (loaded later): pointer to four words added to the result.
 * .Lenter_emit is also a jump target for _poly1305_emit_sse2, which arrives
 * with the same registers pushed and %ebp already holding arg0.
 * The state is only read; the carry out of the final 128-bit add is dropped.
 */
234.globl	poly1305_emit
235.type	poly1305_emit,@function
236.align	16
237poly1305_emit:
238.L_poly1305_emit_begin:
239	pushl	%ebp
240	pushl	%ebx
241	pushl	%esi
242	pushl	%edi
243	movl	20(%esp),%ebp
244.Lenter_emit:
245	movl	24(%esp),%edi
246	movl	(%ebp),%eax
247	movl	4(%ebp),%ebx
248	movl	8(%ebp),%ecx
249	movl	12(%ebp),%edx
250	movl	16(%ebp),%esi
/* Compute h + 5 across all five words. */
251	addl	$5,%eax
252	adcl	$0,%ebx
253	adcl	$0,%ecx
254	adcl	$0,%edx
255	adcl	$0,%esi
/*
 * %esi = -((h4 + carry) >> 2): an all-ones select mask when the shifted
 * value is 1, zero when it is 0 (assumes the top word is small enough for
 * the shift to yield only 0 or 1; that is not checked here).
 */
256	shrl	$2,%esi
257	negl	%esi
/* Park (h + 5) & mask in the output buffer. */
258	andl	%esi,%eax
259	andl	%esi,%ebx
260	andl	%esi,%ecx
261	andl	%esi,%edx
262	movl	%eax,(%edi)
263	movl	%ebx,4(%edi)
264	movl	%ecx,8(%edi)
265	movl	%edx,12(%edi)
/* Invert the mask and combine: the result is h + 5 where the mask was set, otherwise the original h (branch-free select). */
266	notl	%esi
267	movl	(%ebp),%eax
268	movl	4(%ebp),%ebx
269	movl	8(%ebp),%ecx
270	movl	12(%ebp),%edx
271	movl	28(%esp),%ebp
272	andl	%esi,%eax
273	andl	%esi,%ebx
274	andl	%esi,%ecx
275	andl	%esi,%edx
276	orl	(%edi),%eax
277	orl	4(%edi),%ebx
278	orl	8(%edi),%ecx
279	orl	12(%edi),%edx
/* Add the four words at arg2 and store the 16-byte result. */
280	addl	(%ebp),%eax
281	adcl	4(%ebp),%ebx
282	adcl	8(%ebp),%ecx
283	adcl	12(%ebp),%edx
284	movl	%eax,(%edi)
285	movl	%ebx,4(%edi)
286	movl	%ecx,8(%edi)
287	movl	%edx,12(%edi)
288	popl	%edi
289	popl	%esi
290	popl	%ebx
291	popl	%ebp
292	ret
293.size	poly1305_emit,.-.L_poly1305_emit_begin
294.align	32
/*
 * _poly1305_init_sse2 -- file-local helper (no .globl), reached by `call`
 * from _poly1305_blocks_sse2.  Inputs are passed in registers, not on the
 * stack.
 *   In:  %edi = state pointer; 16 bytes are read from 24(%edi).
 *        %ebx = base of a constant table; the limb mask is read from
 *               64(%ebx) (table contents are outside this view; the 26-bit
 *               shifts suggest the mask is 0x3ffffff -- TODO confirm).
 *   Out: nine 16-byte vectors written to 48..191(%edi): five limb vectors
 *        followed by four vectors holding 5 * limbs 1..4.  %edi is advanced
 *        by 48 and restored before returning.
 *   Clobbers %ebp (holds the caller's %esp while a 16-byte-aligned scratch
 *   frame is in use), %ecx, %edx, %xmm0-%xmm7 and flags.
 *
 * The multiply loop runs twice.  Following the punpcklqdq / psllq / por /
 * pshufd $141 sequence, each stored limb vector appears to hold the dwords
 * {r^4, r^3, r^2, r^1} in that order -- NOTE(review): derived by reading
 * the shuffles, confirm against the generator.
 */
295.type	_poly1305_init_sse2,@function
296.align	16
297_poly1305_init_sse2:
298	movdqu	24(%edi),%xmm4
299	leal	48(%edi),%edi
300	movl	%esp,%ebp
301	subl	$224,%esp
302	andl	$-16,%esp
/* movq loads only the low 64 bits of the mask, so the upper lane of every masked limb is zero. */
303	movq	64(%ebx),%xmm7
/* Split the 128-bit value into five limbs starting at bits 0, 26, 52, 78 and 104. */
304	movdqa	%xmm4,%xmm0
305	movdqa	%xmm4,%xmm1
306	movdqa	%xmm4,%xmm2
307	pand	%xmm7,%xmm0
308	psrlq	$26,%xmm1
309	psrldq	$6,%xmm2
310	pand	%xmm7,%xmm1
311	movdqa	%xmm2,%xmm3
312	psrlq	$4,%xmm2
313	psrlq	$30,%xmm3
314	pand	%xmm7,%xmm2
315	pand	%xmm7,%xmm3
316	psrldq	$13,%xmm4
/* %edx -> second operand area (144..223(%esp)); %ecx = number of multiply passes. */
317	leal	144(%esp),%edx
318	movl	$2,%ecx
319.L005square:
/* First operand: limbs at 0..64(%esp), and 5 * limbs 1..4 (x*4 + x) at 80..128(%esp). */
320	movdqa	%xmm0,(%esp)
321	movdqa	%xmm1,16(%esp)
322	movdqa	%xmm2,32(%esp)
323	movdqa	%xmm3,48(%esp)
324	movdqa	%xmm4,64(%esp)
325	movdqa	%xmm1,%xmm6
326	movdqa	%xmm2,%xmm5
327	pslld	$2,%xmm6
328	pslld	$2,%xmm5
329	paddd	%xmm1,%xmm6
330	paddd	%xmm2,%xmm5
331	movdqa	%xmm6,80(%esp)
332	movdqa	%xmm5,96(%esp)
333	movdqa	%xmm3,%xmm6
334	movdqa	%xmm4,%xmm5
335	pslld	$2,%xmm6
336	pslld	$2,%xmm5
337	paddd	%xmm3,%xmm6
338	paddd	%xmm4,%xmm5
339	movdqa	%xmm6,112(%esp)
340	movdqa	%xmm5,128(%esp)
/* Second operand: the low qword of each limb broadcast to both lanes (pshufd $68), stored at 0..64(%edx). */
341	pshufd	$68,%xmm0,%xmm6
342	movdqa	%xmm1,%xmm5
343	pshufd	$68,%xmm1,%xmm1
344	pshufd	$68,%xmm2,%xmm2
345	pshufd	$68,%xmm3,%xmm3
346	pshufd	$68,%xmm4,%xmm4
347	movdqa	%xmm6,(%edx)
348	movdqa	%xmm1,16(%edx)
349	movdqa	%xmm2,32(%edx)
350	movdqa	%xmm3,48(%edx)
351	movdqa	%xmm4,64(%edx)
/*
 * Five-limb schoolbook product accumulated into %xmm0..%xmm4.  Partial
 * products that would land above limb 4 use the pre-multiplied 5*x copies
 * (80..128(%esp)) and are added to the low limbs instead.
 */
352	pmuludq	%xmm0,%xmm4
353	pmuludq	%xmm0,%xmm3
354	pmuludq	%xmm0,%xmm2
355	pmuludq	%xmm0,%xmm1
356	pmuludq	%xmm6,%xmm0
357	movdqa	%xmm5,%xmm6
358	pmuludq	48(%edx),%xmm5
359	movdqa	%xmm6,%xmm7
360	pmuludq	32(%edx),%xmm6
361	paddq	%xmm5,%xmm4
362	movdqa	%xmm7,%xmm5
363	pmuludq	16(%edx),%xmm7
364	paddq	%xmm6,%xmm3
365	movdqa	80(%esp),%xmm6
366	pmuludq	(%edx),%xmm5
367	paddq	%xmm7,%xmm2
368	pmuludq	64(%edx),%xmm6
369	movdqa	32(%esp),%xmm7
370	paddq	%xmm5,%xmm1
371	movdqa	%xmm7,%xmm5
372	pmuludq	32(%edx),%xmm7
373	paddq	%xmm6,%xmm0
374	movdqa	%xmm5,%xmm6
375	pmuludq	16(%edx),%xmm5
376	paddq	%xmm7,%xmm4
377	movdqa	96(%esp),%xmm7
378	pmuludq	(%edx),%xmm6
379	paddq	%xmm5,%xmm3
380	movdqa	%xmm7,%xmm5
381	pmuludq	64(%edx),%xmm7
382	paddq	%xmm6,%xmm2
383	pmuludq	48(%edx),%xmm5
384	movdqa	48(%esp),%xmm6
385	paddq	%xmm7,%xmm1
386	movdqa	%xmm6,%xmm7
387	pmuludq	16(%edx),%xmm6
388	paddq	%xmm5,%xmm0
389	movdqa	112(%esp),%xmm5
390	pmuludq	(%edx),%xmm7
391	paddq	%xmm6,%xmm4
392	movdqa	%xmm5,%xmm6
393	pmuludq	64(%edx),%xmm5
394	paddq	%xmm7,%xmm3
395	movdqa	%xmm6,%xmm7
396	pmuludq	48(%edx),%xmm6
397	paddq	%xmm5,%xmm2
398	pmuludq	32(%edx),%xmm7
399	movdqa	64(%esp),%xmm5
400	paddq	%xmm6,%xmm1
401	movdqa	128(%esp),%xmm6
402	pmuludq	(%edx),%xmm5
403	paddq	%xmm7,%xmm0
404	movdqa	%xmm6,%xmm7
405	pmuludq	64(%edx),%xmm6
406	paddq	%xmm5,%xmm4
407	movdqa	%xmm7,%xmm5
408	pmuludq	16(%edx),%xmm7
409	paddq	%xmm6,%xmm3
410	movdqa	%xmm5,%xmm6
411	pmuludq	32(%edx),%xmm5
412	paddq	%xmm7,%xmm0
413	pmuludq	48(%edx),%xmm6
414	movdqa	64(%ebx),%xmm7
415	paddq	%xmm5,%xmm1
416	paddq	%xmm6,%xmm2
/*
 * Carry propagation: each limb is masked with %xmm7 and its bits above 26
 * are added to the next limb.  The excess of limb 4 is added to limb 0
 * once directly and once shifted left by 2, i.e. multiplied by 5.
 */
417	movdqa	%xmm3,%xmm5
418	pand	%xmm7,%xmm3
419	psrlq	$26,%xmm5
420	paddq	%xmm4,%xmm5
421	movdqa	%xmm0,%xmm6
422	pand	%xmm7,%xmm0
423	psrlq	$26,%xmm6
424	movdqa	%xmm5,%xmm4
425	paddq	%xmm1,%xmm6
426	psrlq	$26,%xmm5
427	pand	%xmm7,%xmm4
428	movdqa	%xmm6,%xmm1
429	psrlq	$26,%xmm6
430	paddd	%xmm5,%xmm0
431	psllq	$2,%xmm5
432	paddq	%xmm2,%xmm6
433	paddq	%xmm0,%xmm5
434	pand	%xmm7,%xmm1
435	movdqa	%xmm6,%xmm2
436	psrlq	$26,%xmm6
437	pand	%xmm7,%xmm2
438	paddd	%xmm3,%xmm6
439	movdqa	%xmm5,%xmm0
440	psrlq	$26,%xmm5
441	movdqa	%xmm6,%xmm3
442	psrlq	$26,%xmm6
443	pand	%xmm7,%xmm0
444	paddd	%xmm5,%xmm1
445	pand	%xmm7,%xmm3
446	paddd	%xmm6,%xmm4
447	decl	%ecx
448	jz	.L006square_break
/* After the first pass: keep the new product in the low qword and place the previous operand (low qword of the saved limbs) in the high qword, then multiply again. */
449	punpcklqdq	(%esp),%xmm0
450	punpcklqdq	16(%esp),%xmm1
451	punpcklqdq	32(%esp),%xmm2
452	punpcklqdq	48(%esp),%xmm3
453	punpcklqdq	64(%esp),%xmm4
454	jmp	.L005square
455.L006square_break:
/* Interleave: second-pass results go to the high dword of each qword, the pass-two operand (saved at 0..64(%esp)) stays in the low dword; pshufd $141 then reorders the dwords as {1,3,0,2}. */
456	psllq	$32,%xmm0
457	psllq	$32,%xmm1
458	psllq	$32,%xmm2
459	psllq	$32,%xmm3
460	psllq	$32,%xmm4
461	por	(%esp),%xmm0
462	por	16(%esp),%xmm1
463	por	32(%esp),%xmm2
464	por	48(%esp),%xmm3
465	por	64(%esp),%xmm4
466	pshufd	$141,%xmm0,%xmm0
467	pshufd	$141,%xmm1,%xmm1
468	pshufd	$141,%xmm2,%xmm2
469	pshufd	$141,%xmm3,%xmm3
470	pshufd	$141,%xmm4,%xmm4
/* Store the five limb vectors (unaligned stores: the state buffer's alignment is not assumed). */
471	movdqu	%xmm0,(%edi)
472	movdqu	%xmm1,16(%edi)
473	movdqu	%xmm2,32(%edi)
474	movdqu	%xmm3,48(%edi)
475	movdqu	%xmm4,64(%edi)
/* Store 5 * limbs 1..4 right after them. */
476	movdqa	%xmm1,%xmm6
477	movdqa	%xmm2,%xmm5
478	pslld	$2,%xmm6
479	pslld	$2,%xmm5
480	paddd	%xmm1,%xmm6
481	paddd	%xmm2,%xmm5
482	movdqu	%xmm6,80(%edi)
483	movdqu	%xmm5,96(%edi)
484	movdqa	%xmm3,%xmm6
485	movdqa	%xmm4,%xmm5
486	pslld	$2,%xmm6
487	pslld	$2,%xmm5
488	paddd	%xmm3,%xmm6
489	paddd	%xmm4,%xmm5
490	movdqu	%xmm6,112(%edi)
491	movdqu	%xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 applied to %edi on entry. */
492	movl	%ebp,%esp
493	leal	-48(%edi),%edi
494	ret
495.size	_poly1305_init_sse2,.-_poly1305_init_sse2
496.align	32
/*
 * _poly1305_blocks_sse2 -- SSE2 block routine; same four stack arguments as
 * poly1305_blocks (arg0 state, arg1 input, arg2 length, arg3 pad word).
 * poly1305_init stores its address in the routine table when the SSE2
 * capability test passes.
 *
 * State word 20(%edi) is a representation flag: zero means the accumulator
 * at 0..19 is still five 32-bit words (label "base2_32"), non-zero means it
 * already holds five 26-bit limbs (label "base2_26").
 *
 * Flow:
 *   - length is rounded down to a multiple of 16; zero -> return.
 *   - fewer than 64 bytes and flag still zero -> jump into the integer
 *     routine at .Lenter_blocks.
 *   - otherwise, on first use, build the table via _poly1305_init_sse2,
 *     convert the accumulator to 26-bit limbs and set the flag to 1.
 *   - an odd block count is handled by one single-block multiply, the rest
 *     is processed two blocks per vector.
 *
 * Frame: %ebp = caller's %esp, 528 bytes below it aligned to 16.
 *   %edx = 384(%esp): table copies; 0..128(%edx) hold the low qword of each
 *          of the nine table vectors broadcast to both lanes, and
 *          -144..-16(%edx) hold the high qword broadcast likewise.
 *   %eax = 160(%esp): spill area for the current input limbs.
 *   80..144(%esp): accumulator limbs between iterations.
 *   %ebx = address of .Lconst_sse2; (%ebx) is OR-ed into limb 4 of paired
 *          blocks and 64(%ebx) is the limb mask (table contents are not
 *          visible in this view).
 * Several conditional jumps and cmov instructions below consume flags set
 * many instructions earlier; the intervening SSE, mov and lea instructions
 * do not modify EFLAGS, so instruction order here must not be disturbed.
 */
497.type	_poly1305_blocks_sse2,@function
498.align	16
499_poly1305_blocks_sse2:
500	pushl	%ebp
501	pushl	%ebx
502	pushl	%esi
503	pushl	%edi
504	movl	20(%esp),%edi
505	movl	24(%esp),%esi
506	movl	28(%esp),%ecx
/* %eax = representation flag. */
507	movl	20(%edi),%eax
508	andl	$-16,%ecx
509	jz	.L007nodata
510	cmpl	$64,%ecx
511	jae	.L008enter_sse2
/* Short input and the state was never converted: let the integer code handle it. */
512	testl	%eax,%eax
513	jz	.Lenter_blocks
514.align	16
515.L008enter_sse2:
/* Position-independent address of the constant table. */
516	call	.L009pic_point
517.L009pic_point:
518	popl	%ebx
519	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
520	testl	%eax,%eax
521	jnz	.L010base2_26
/*
 * First vector call: build the table, then convert the accumulator.
 * Overlapping loads at byte offsets 0, 3, 6, 9 and 13 followed by shifts of
 * 0, 2, 4, 6 and 0 bits pick out the fields starting at bits 0, 26, 52, 78
 * and 104; the first three are masked with 67108863 = 0x3ffffff.
 */
522	call	_poly1305_init_sse2
523	movl	(%edi),%eax
524	movl	3(%edi),%ecx
525	movl	6(%edi),%edx
526	movl	9(%edi),%esi
527	movl	13(%edi),%ebp
528	movl	$1,20(%edi)
529	shrl	$2,%ecx
530	andl	$67108863,%eax
531	shrl	$4,%edx
532	andl	$67108863,%ecx
533	shrl	$6,%esi
534	andl	$67108863,%edx
535	movd	%eax,%xmm0
536	movd	%ecx,%xmm1
537	movd	%edx,%xmm2
538	movd	%esi,%xmm3
539	movd	%ebp,%xmm4
/* %esi and %ecx were used as scratch; reload arg1 and arg2. */
540	movl	24(%esp),%esi
541	movl	28(%esp),%ecx
542	jmp	.L011base2_32
543.align	16
544.L010base2_26:
/* The accumulator is already in limb form: load the limbs and the mask. */
545	movd	(%edi),%xmm0
546	movd	4(%edi),%xmm1
547	movd	8(%edi),%xmm2
548	movd	12(%edi),%xmm3
549	movd	16(%edi),%xmm4
550	movdqa	64(%ebx),%xmm7
551.L011base2_32:
/* %eax = arg3 << 24, so it lands at bit 128 when added to limb 4 (which starts at bit 104). */
552	movl	32(%esp),%eax
553	movl	%esp,%ebp
554	subl	$528,%esp
555	andl	$-16,%esp
556	leal	48(%edi),%edi
557	shll	$24,%eax
/* The length is a multiple of 16; bit 4 set means an odd number of blocks. */
558	testl	$31,%ecx
559	jz	.L012even
/* Odd block: split 16 input bytes into limbs and add them, plus the pad word, to the accumulator. */
560	movdqu	(%esi),%xmm6
561	leal	16(%esi),%esi
562	movdqa	%xmm6,%xmm5
563	pand	%xmm7,%xmm6
564	paddd	%xmm6,%xmm0
565	movdqa	%xmm5,%xmm6
566	psrlq	$26,%xmm5
567	psrldq	$6,%xmm6
568	pand	%xmm7,%xmm5
569	paddd	%xmm5,%xmm1
570	movdqa	%xmm6,%xmm5
571	psrlq	$4,%xmm6
572	pand	%xmm7,%xmm6
573	paddd	%xmm6,%xmm2
574	movdqa	%xmm5,%xmm6
575	psrlq	$30,%xmm5
576	pand	%xmm7,%xmm5
577	psrldq	$7,%xmm6
578	paddd	%xmm5,%xmm3
579	movd	%eax,%xmm5
580	paddd	%xmm6,%xmm4
581	movd	12(%edi),%xmm6
582	paddd	%xmm5,%xmm4
/* Multiply by the last dword (offset 12 within each 16-byte table vector) of every table entry. */
583	movdqa	%xmm0,(%esp)
584	movdqa	%xmm1,16(%esp)
585	movdqa	%xmm2,32(%esp)
586	movdqa	%xmm3,48(%esp)
587	movdqa	%xmm4,64(%esp)
588	pmuludq	%xmm6,%xmm0
589	pmuludq	%xmm6,%xmm1
590	pmuludq	%xmm6,%xmm2
591	movd	28(%edi),%xmm5
592	pmuludq	%xmm6,%xmm3
593	pmuludq	%xmm6,%xmm4
594	movdqa	%xmm5,%xmm6
595	pmuludq	48(%esp),%xmm5
596	movdqa	%xmm6,%xmm7
597	pmuludq	32(%esp),%xmm6
598	paddq	%xmm5,%xmm4
599	movdqa	%xmm7,%xmm5
600	pmuludq	16(%esp),%xmm7
601	paddq	%xmm6,%xmm3
602	movd	92(%edi),%xmm6
603	pmuludq	(%esp),%xmm5
604	paddq	%xmm7,%xmm2
605	pmuludq	64(%esp),%xmm6
606	movd	44(%edi),%xmm7
607	paddq	%xmm5,%xmm1
608	movdqa	%xmm7,%xmm5
609	pmuludq	32(%esp),%xmm7
610	paddq	%xmm6,%xmm0
611	movdqa	%xmm5,%xmm6
612	pmuludq	16(%esp),%xmm5
613	paddq	%xmm7,%xmm4
614	movd	108(%edi),%xmm7
615	pmuludq	(%esp),%xmm6
616	paddq	%xmm5,%xmm3
617	movdqa	%xmm7,%xmm5
618	pmuludq	64(%esp),%xmm7
619	paddq	%xmm6,%xmm2
620	pmuludq	48(%esp),%xmm5
621	movd	60(%edi),%xmm6
622	paddq	%xmm7,%xmm1
623	movdqa	%xmm6,%xmm7
624	pmuludq	16(%esp),%xmm6
625	paddq	%xmm5,%xmm0
626	movd	124(%edi),%xmm5
627	pmuludq	(%esp),%xmm7
628	paddq	%xmm6,%xmm4
629	movdqa	%xmm5,%xmm6
630	pmuludq	64(%esp),%xmm5
631	paddq	%xmm7,%xmm3
632	movdqa	%xmm6,%xmm7
633	pmuludq	48(%esp),%xmm6
634	paddq	%xmm5,%xmm2
635	pmuludq	32(%esp),%xmm7
636	movd	76(%edi),%xmm5
637	paddq	%xmm6,%xmm1
638	movd	140(%edi),%xmm6
639	pmuludq	(%esp),%xmm5
640	paddq	%xmm7,%xmm0
641	movdqa	%xmm6,%xmm7
642	pmuludq	64(%esp),%xmm6
643	paddq	%xmm5,%xmm4
644	movdqa	%xmm7,%xmm5
645	pmuludq	16(%esp),%xmm7
646	paddq	%xmm6,%xmm3
647	movdqa	%xmm5,%xmm6
648	pmuludq	32(%esp),%xmm5
649	paddq	%xmm7,%xmm0
650	pmuludq	48(%esp),%xmm6
651	movdqa	64(%ebx),%xmm7
652	paddq	%xmm5,%xmm1
653	paddq	%xmm6,%xmm2
/* Carry propagation between limbs; the excess of limb 4 is folded into limb 0 times 5 (x + (x << 2)). */
654	movdqa	%xmm3,%xmm5
655	pand	%xmm7,%xmm3
656	psrlq	$26,%xmm5
657	paddq	%xmm4,%xmm5
658	movdqa	%xmm0,%xmm6
659	pand	%xmm7,%xmm0
660	psrlq	$26,%xmm6
661	movdqa	%xmm5,%xmm4
662	paddq	%xmm1,%xmm6
663	psrlq	$26,%xmm5
664	pand	%xmm7,%xmm4
665	movdqa	%xmm6,%xmm1
666	psrlq	$26,%xmm6
667	paddd	%xmm5,%xmm0
668	psllq	$2,%xmm5
669	paddq	%xmm2,%xmm6
670	paddq	%xmm0,%xmm5
671	pand	%xmm7,%xmm1
672	movdqa	%xmm6,%xmm2
673	psrlq	$26,%xmm6
674	pand	%xmm7,%xmm2
675	paddd	%xmm3,%xmm6
676	movdqa	%xmm5,%xmm0
677	psrlq	$26,%xmm5
678	movdqa	%xmm6,%xmm3
679	psrlq	$26,%xmm6
680	pand	%xmm7,%xmm0
681	paddd	%xmm5,%xmm1
682	pand	%xmm7,%xmm3
683	paddd	%xmm6,%xmm4
/* One block consumed; if nothing is left, go straight to the store. */
684	subl	$16,%ecx
685	jz	.L013done
686.L012even:
/*
 * Copy the table into the frame.  The flags produced by `subl $64,%ecx`
 * are consumed by the cmovb three instructions later and again by the
 * `jbe` at the end of this section.  cmovb steps %esi back by 32 when fewer
 * than 64 bytes remain, so the loads at 32/48(%esi) below stay inside the
 * input.
 */
687	leal	384(%esp),%edx
688	leal	-32(%esi),%eax
689	subl	$64,%ecx
690	movdqu	(%edi),%xmm5
691	pshufd	$68,%xmm5,%xmm6
692	cmovbl	%eax,%esi
693	pshufd	$238,%xmm5,%xmm5
694	movdqa	%xmm6,(%edx)
695	leal	160(%esp),%eax
696	movdqu	16(%edi),%xmm6
697	movdqa	%xmm5,-144(%edx)
698	pshufd	$68,%xmm6,%xmm5
699	pshufd	$238,%xmm6,%xmm6
700	movdqa	%xmm5,16(%edx)
701	movdqu	32(%edi),%xmm5
702	movdqa	%xmm6,-128(%edx)
703	pshufd	$68,%xmm5,%xmm6
704	pshufd	$238,%xmm5,%xmm5
705	movdqa	%xmm6,32(%edx)
706	movdqu	48(%edi),%xmm6
707	movdqa	%xmm5,-112(%edx)
708	pshufd	$68,%xmm6,%xmm5
709	pshufd	$238,%xmm6,%xmm6
710	movdqa	%xmm5,48(%edx)
711	movdqu	64(%edi),%xmm5
712	movdqa	%xmm6,-96(%edx)
713	pshufd	$68,%xmm5,%xmm6
714	pshufd	$238,%xmm5,%xmm5
715	movdqa	%xmm6,64(%edx)
716	movdqu	80(%edi),%xmm6
717	movdqa	%xmm5,-80(%edx)
718	pshufd	$68,%xmm6,%xmm5
719	pshufd	$238,%xmm6,%xmm6
720	movdqa	%xmm5,80(%edx)
721	movdqu	96(%edi),%xmm5
722	movdqa	%xmm6,-64(%edx)
723	pshufd	$68,%xmm5,%xmm6
724	pshufd	$238,%xmm5,%xmm5
725	movdqa	%xmm6,96(%edx)
726	movdqu	112(%edi),%xmm6
727	movdqa	%xmm5,-48(%edx)
728	pshufd	$68,%xmm6,%xmm5
729	pshufd	$238,%xmm6,%xmm6
730	movdqa	%xmm5,112(%edx)
731	movdqu	128(%edi),%xmm5
732	movdqa	%xmm6,-32(%edx)
733	pshufd	$68,%xmm5,%xmm6
734	pshufd	$238,%xmm5,%xmm5
735	movdqa	%xmm6,128(%edx)
736	movdqa	%xmm5,-16(%edx)
/* Load two input blocks, park accumulator limbs 2..4, and split the pair into five limb vectors (one block per 64-bit lane). */
737	movdqu	32(%esi),%xmm5
738	movdqu	48(%esi),%xmm6
739	leal	32(%esi),%esi
740	movdqa	%xmm2,112(%esp)
741	movdqa	%xmm3,128(%esp)
742	movdqa	%xmm4,144(%esp)
743	movdqa	%xmm5,%xmm2
744	movdqa	%xmm6,%xmm3
745	psrldq	$6,%xmm2
746	psrldq	$6,%xmm3
747	movdqa	%xmm5,%xmm4
748	punpcklqdq	%xmm3,%xmm2
749	punpckhqdq	%xmm6,%xmm4
750	punpcklqdq	%xmm6,%xmm5
751	movdqa	%xmm2,%xmm3
752	psrlq	$4,%xmm2
753	psrlq	$30,%xmm3
754	movdqa	%xmm5,%xmm6
755	psrlq	$40,%xmm4
756	psrlq	$26,%xmm6
757	pand	%xmm7,%xmm5
758	pand	%xmm7,%xmm6
759	pand	%xmm7,%xmm2
760	pand	%xmm7,%xmm3
761	por	(%ebx),%xmm4
762	movdqa	%xmm0,80(%esp)
763	movdqa	%xmm1,96(%esp)
/* Flags still come from `subl $64,%ecx` above: at most 64 bytes were left, so skip the main loop. */
764	jbe	.L014skip_loop
765	jmp	.L015loop
766.align	32
767.L015loop:
/*
 * Main loop, 64 bytes per iteration.  First half: multiply the pair held
 * in %xmm5,%xmm6,%xmm2,%xmm3,%xmm4 by the table copy at -144..-16(%edx).
 */
768	movdqa	-144(%edx),%xmm7
769	movdqa	%xmm6,16(%eax)
770	movdqa	%xmm2,32(%eax)
771	movdqa	%xmm3,48(%eax)
772	movdqa	%xmm4,64(%eax)
773	movdqa	%xmm5,%xmm1
774	pmuludq	%xmm7,%xmm5
775	movdqa	%xmm6,%xmm0
776	pmuludq	%xmm7,%xmm6
777	pmuludq	%xmm7,%xmm2
778	pmuludq	%xmm7,%xmm3
779	pmuludq	%xmm7,%xmm4
780	pmuludq	-16(%edx),%xmm0
781	movdqa	%xmm1,%xmm7
782	pmuludq	-128(%edx),%xmm1
783	paddq	%xmm5,%xmm0
784	movdqa	%xmm7,%xmm5
785	pmuludq	-112(%edx),%xmm7
786	paddq	%xmm6,%xmm1
787	movdqa	%xmm5,%xmm6
788	pmuludq	-96(%edx),%xmm5
789	paddq	%xmm7,%xmm2
790	movdqa	16(%eax),%xmm7
791	pmuludq	-80(%edx),%xmm6
792	paddq	%xmm5,%xmm3
793	movdqa	%xmm7,%xmm5
794	pmuludq	-128(%edx),%xmm7
795	paddq	%xmm6,%xmm4
796	movdqa	%xmm5,%xmm6
797	pmuludq	-112(%edx),%xmm5
798	paddq	%xmm7,%xmm2
799	movdqa	32(%eax),%xmm7
800	pmuludq	-96(%edx),%xmm6
801	paddq	%xmm5,%xmm3
802	movdqa	%xmm7,%xmm5
803	pmuludq	-32(%edx),%xmm7
804	paddq	%xmm6,%xmm4
805	movdqa	%xmm5,%xmm6
806	pmuludq	-16(%edx),%xmm5
807	paddq	%xmm7,%xmm0
808	movdqa	%xmm6,%xmm7
809	pmuludq	-128(%edx),%xmm6
810	paddq	%xmm5,%xmm1
811	movdqa	48(%eax),%xmm5
812	pmuludq	-112(%edx),%xmm7
813	paddq	%xmm6,%xmm3
814	movdqa	%xmm5,%xmm6
815	pmuludq	-48(%edx),%xmm5
816	paddq	%xmm7,%xmm4
817	movdqa	%xmm6,%xmm7
818	pmuludq	-32(%edx),%xmm6
819	paddq	%xmm5,%xmm0
820	movdqa	%xmm7,%xmm5
821	pmuludq	-16(%edx),%xmm7
822	paddq	%xmm6,%xmm1
823	movdqa	64(%eax),%xmm6
824	pmuludq	-128(%edx),%xmm5
825	paddq	%xmm7,%xmm2
826	movdqa	%xmm6,%xmm7
827	pmuludq	-16(%edx),%xmm6
828	paddq	%xmm5,%xmm4
829	movdqa	%xmm7,%xmm5
830	pmuludq	-64(%edx),%xmm7
831	paddq	%xmm6,%xmm3
832	movdqa	%xmm5,%xmm6
833	pmuludq	-48(%edx),%xmm5
834	paddq	%xmm7,%xmm0
835	movdqa	64(%ebx),%xmm7
836	pmuludq	-32(%edx),%xmm6
837	paddq	%xmm5,%xmm1
838	paddq	%xmm6,%xmm2
/* Second half: load the two blocks that precede the pair just multiplied, split them into limbs, add the saved accumulator limbs, and multiply by the table copy at 0..128(%edx). */
839	movdqu	-32(%esi),%xmm5
840	movdqu	-16(%esi),%xmm6
841	leal	32(%esi),%esi
842	movdqa	%xmm2,32(%esp)
843	movdqa	%xmm3,48(%esp)
844	movdqa	%xmm4,64(%esp)
845	movdqa	%xmm5,%xmm2
846	movdqa	%xmm6,%xmm3
847	psrldq	$6,%xmm2
848	psrldq	$6,%xmm3
849	movdqa	%xmm5,%xmm4
850	punpcklqdq	%xmm3,%xmm2
851	punpckhqdq	%xmm6,%xmm4
852	punpcklqdq	%xmm6,%xmm5
853	movdqa	%xmm2,%xmm3
854	psrlq	$4,%xmm2
855	psrlq	$30,%xmm3
856	movdqa	%xmm5,%xmm6
857	psrlq	$40,%xmm4
858	psrlq	$26,%xmm6
859	pand	%xmm7,%xmm5
860	pand	%xmm7,%xmm6
861	pand	%xmm7,%xmm2
862	pand	%xmm7,%xmm3
863	por	(%ebx),%xmm4
/* Loop counter update; its flags feed the cmovb below and the `ja` that closes the loop. */
864	leal	-32(%esi),%eax
865	subl	$64,%ecx
866	paddd	80(%esp),%xmm5
867	paddd	96(%esp),%xmm6
868	paddd	112(%esp),%xmm2
869	paddd	128(%esp),%xmm3
870	paddd	144(%esp),%xmm4
871	cmovbl	%eax,%esi
872	leal	160(%esp),%eax
873	movdqa	(%edx),%xmm7
874	movdqa	%xmm1,16(%esp)
875	movdqa	%xmm6,16(%eax)
876	movdqa	%xmm2,32(%eax)
877	movdqa	%xmm3,48(%eax)
878	movdqa	%xmm4,64(%eax)
879	movdqa	%xmm5,%xmm1
880	pmuludq	%xmm7,%xmm5
881	paddq	%xmm0,%xmm5
882	movdqa	%xmm6,%xmm0
883	pmuludq	%xmm7,%xmm6
884	pmuludq	%xmm7,%xmm2
885	pmuludq	%xmm7,%xmm3
886	pmuludq	%xmm7,%xmm4
887	paddq	16(%esp),%xmm6
888	paddq	32(%esp),%xmm2
889	paddq	48(%esp),%xmm3
890	paddq	64(%esp),%xmm4
891	pmuludq	128(%edx),%xmm0
892	movdqa	%xmm1,%xmm7
893	pmuludq	16(%edx),%xmm1
894	paddq	%xmm5,%xmm0
895	movdqa	%xmm7,%xmm5
896	pmuludq	32(%edx),%xmm7
897	paddq	%xmm6,%xmm1
898	movdqa	%xmm5,%xmm6
899	pmuludq	48(%edx),%xmm5
900	paddq	%xmm7,%xmm2
901	movdqa	16(%eax),%xmm7
902	pmuludq	64(%edx),%xmm6
903	paddq	%xmm5,%xmm3
904	movdqa	%xmm7,%xmm5
905	pmuludq	16(%edx),%xmm7
906	paddq	%xmm6,%xmm4
907	movdqa	%xmm5,%xmm6
908	pmuludq	32(%edx),%xmm5
909	paddq	%xmm7,%xmm2
910	movdqa	32(%eax),%xmm7
911	pmuludq	48(%edx),%xmm6
912	paddq	%xmm5,%xmm3
913	movdqa	%xmm7,%xmm5
914	pmuludq	112(%edx),%xmm7
915	paddq	%xmm6,%xmm4
916	movdqa	%xmm5,%xmm6
917	pmuludq	128(%edx),%xmm5
918	paddq	%xmm7,%xmm0
919	movdqa	%xmm6,%xmm7
920	pmuludq	16(%edx),%xmm6
921	paddq	%xmm5,%xmm1
922	movdqa	48(%eax),%xmm5
923	pmuludq	32(%edx),%xmm7
924	paddq	%xmm6,%xmm3
925	movdqa	%xmm5,%xmm6
926	pmuludq	96(%edx),%xmm5
927	paddq	%xmm7,%xmm4
928	movdqa	%xmm6,%xmm7
929	pmuludq	112(%edx),%xmm6
930	paddq	%xmm5,%xmm0
931	movdqa	%xmm7,%xmm5
932	pmuludq	128(%edx),%xmm7
933	paddq	%xmm6,%xmm1
934	movdqa	64(%eax),%xmm6
935	pmuludq	16(%edx),%xmm5
936	paddq	%xmm7,%xmm2
937	movdqa	%xmm6,%xmm7
938	pmuludq	128(%edx),%xmm6
939	paddq	%xmm5,%xmm4
940	movdqa	%xmm7,%xmm5
941	pmuludq	80(%edx),%xmm7
942	paddq	%xmm6,%xmm3
943	movdqa	%xmm5,%xmm6
944	pmuludq	96(%edx),%xmm5
945	paddq	%xmm7,%xmm0
946	movdqa	64(%ebx),%xmm7
947	pmuludq	112(%edx),%xmm6
948	paddq	%xmm5,%xmm1
949	paddq	%xmm6,%xmm2
/* Carry propagation (same sequence as after the odd block). */
950	movdqa	%xmm3,%xmm5
951	pand	%xmm7,%xmm3
952	psrlq	$26,%xmm5
953	paddq	%xmm4,%xmm5
954	movdqa	%xmm0,%xmm6
955	pand	%xmm7,%xmm0
956	psrlq	$26,%xmm6
957	movdqa	%xmm5,%xmm4
958	paddq	%xmm1,%xmm6
959	psrlq	$26,%xmm5
960	pand	%xmm7,%xmm4
961	movdqa	%xmm6,%xmm1
962	psrlq	$26,%xmm6
963	paddd	%xmm5,%xmm0
964	psllq	$2,%xmm5
965	paddq	%xmm2,%xmm6
966	paddq	%xmm0,%xmm5
967	pand	%xmm7,%xmm1
968	movdqa	%xmm6,%xmm2
969	psrlq	$26,%xmm6
970	pand	%xmm7,%xmm2
971	paddd	%xmm3,%xmm6
972	movdqa	%xmm5,%xmm0
973	psrlq	$26,%xmm5
974	movdqa	%xmm6,%xmm3
975	psrlq	$26,%xmm6
976	pand	%xmm7,%xmm0
977	paddd	%xmm5,%xmm1
978	pand	%xmm7,%xmm3
979	paddd	%xmm6,%xmm4
/* Pre-load and split the next pair, park the accumulator, and loop while the `subl $64,%ecx` above left more than 64 bytes. */
980	movdqu	32(%esi),%xmm5
981	movdqu	48(%esi),%xmm6
982	leal	32(%esi),%esi
983	movdqa	%xmm2,112(%esp)
984	movdqa	%xmm3,128(%esp)
985	movdqa	%xmm4,144(%esp)
986	movdqa	%xmm5,%xmm2
987	movdqa	%xmm6,%xmm3
988	psrldq	$6,%xmm2
989	psrldq	$6,%xmm3
990	movdqa	%xmm5,%xmm4
991	punpcklqdq	%xmm3,%xmm2
992	punpckhqdq	%xmm6,%xmm4
993	punpcklqdq	%xmm6,%xmm5
994	movdqa	%xmm2,%xmm3
995	psrlq	$4,%xmm2
996	psrlq	$30,%xmm3
997	movdqa	%xmm5,%xmm6
998	psrlq	$40,%xmm4
999	psrlq	$26,%xmm6
1000	pand	%xmm7,%xmm5
1001	pand	%xmm7,%xmm6
1002	pand	%xmm7,%xmm2
1003	pand	%xmm7,%xmm3
1004	por	(%ebx),%xmm4
1005	movdqa	%xmm0,80(%esp)
1006	movdqa	%xmm1,96(%esp)
1007	ja	.L015loop
1008.L014skip_loop:
/*
 * Tail.  `addl $32,%ecx` gives zero when exactly 32 bytes were left on
 * entry to the paired code (short tail: the accumulator is added to the
 * pair already loaded) and non-zero when 64 bytes were left (long tail:
 * a second pair follows).  The same flags are tested again by the `jz`
 * before .L017short_tail.  pshufd $16 places dwords 0 and 1 of a table
 * copy in the two multiply lanes, so the lanes use different multipliers.
 */
1009	pshufd	$16,-144(%edx),%xmm7
1010	addl	$32,%ecx
1011	jnz	.L016long_tail
1012	paddd	%xmm0,%xmm5
1013	paddd	%xmm1,%xmm6
1014	paddd	112(%esp),%xmm2
1015	paddd	128(%esp),%xmm3
1016	paddd	144(%esp),%xmm4
1017.L016long_tail:
1018	movdqa	%xmm5,(%eax)
1019	movdqa	%xmm6,16(%eax)
1020	movdqa	%xmm2,32(%eax)
1021	movdqa	%xmm3,48(%eax)
1022	movdqa	%xmm4,64(%eax)
1023	pmuludq	%xmm7,%xmm5
1024	pmuludq	%xmm7,%xmm6
1025	pmuludq	%xmm7,%xmm2
1026	movdqa	%xmm5,%xmm0
1027	pshufd	$16,-128(%edx),%xmm5
1028	pmuludq	%xmm7,%xmm3
1029	movdqa	%xmm6,%xmm1
1030	pmuludq	%xmm7,%xmm4
1031	movdqa	%xmm5,%xmm6
1032	pmuludq	48(%eax),%xmm5
1033	movdqa	%xmm6,%xmm7
1034	pmuludq	32(%eax),%xmm6
1035	paddq	%xmm5,%xmm4
1036	movdqa	%xmm7,%xmm5
1037	pmuludq	16(%eax),%xmm7
1038	paddq	%xmm6,%xmm3
1039	pshufd	$16,-64(%edx),%xmm6
1040	pmuludq	(%eax),%xmm5
1041	paddq	%xmm7,%xmm2
1042	pmuludq	64(%eax),%xmm6
1043	pshufd	$16,-112(%edx),%xmm7
1044	paddq	%xmm5,%xmm1
1045	movdqa	%xmm7,%xmm5
1046	pmuludq	32(%eax),%xmm7
1047	paddq	%xmm6,%xmm0
1048	movdqa	%xmm5,%xmm6
1049	pmuludq	16(%eax),%xmm5
1050	paddq	%xmm7,%xmm4
1051	pshufd	$16,-48(%edx),%xmm7
1052	pmuludq	(%eax),%xmm6
1053	paddq	%xmm5,%xmm3
1054	movdqa	%xmm7,%xmm5
1055	pmuludq	64(%eax),%xmm7
1056	paddq	%xmm6,%xmm2
1057	pmuludq	48(%eax),%xmm5
1058	pshufd	$16,-96(%edx),%xmm6
1059	paddq	%xmm7,%xmm1
1060	movdqa	%xmm6,%xmm7
1061	pmuludq	16(%eax),%xmm6
1062	paddq	%xmm5,%xmm0
1063	pshufd	$16,-32(%edx),%xmm5
1064	pmuludq	(%eax),%xmm7
1065	paddq	%xmm6,%xmm4
1066	movdqa	%xmm5,%xmm6
1067	pmuludq	64(%eax),%xmm5
1068	paddq	%xmm7,%xmm3
1069	movdqa	%xmm6,%xmm7
1070	pmuludq	48(%eax),%xmm6
1071	paddq	%xmm5,%xmm2
1072	pmuludq	32(%eax),%xmm7
1073	pshufd	$16,-80(%edx),%xmm5
1074	paddq	%xmm6,%xmm1
1075	pshufd	$16,-16(%edx),%xmm6
1076	pmuludq	(%eax),%xmm5
1077	paddq	%xmm7,%xmm0
1078	movdqa	%xmm6,%xmm7
1079	pmuludq	64(%eax),%xmm6
1080	paddq	%xmm5,%xmm4
1081	movdqa	%xmm7,%xmm5
1082	pmuludq	16(%eax),%xmm7
1083	paddq	%xmm6,%xmm3
1084	movdqa	%xmm5,%xmm6
1085	pmuludq	32(%eax),%xmm5
1086	paddq	%xmm7,%xmm0
1087	pmuludq	48(%eax),%xmm6
1088	movdqa	64(%ebx),%xmm7
1089	paddq	%xmm5,%xmm1
1090	paddq	%xmm6,%xmm2
/* Still the flags of `addl $32,%ecx`: zero means no second pair. */
1091	jz	.L017short_tail
/* Long tail only: load the earlier pair, add the saved accumulator, multiply by dwords 0/1 of the table copy at 0..128(%edx) and accumulate into the same sums. */
1092	movdqu	-32(%esi),%xmm5
1093	movdqu	-16(%esi),%xmm6
1094	leal	32(%esi),%esi
1095	movdqa	%xmm2,32(%esp)
1096	movdqa	%xmm3,48(%esp)
1097	movdqa	%xmm4,64(%esp)
1098	movdqa	%xmm5,%xmm2
1099	movdqa	%xmm6,%xmm3
1100	psrldq	$6,%xmm2
1101	psrldq	$6,%xmm3
1102	movdqa	%xmm5,%xmm4
1103	punpcklqdq	%xmm3,%xmm2
1104	punpckhqdq	%xmm6,%xmm4
1105	punpcklqdq	%xmm6,%xmm5
1106	movdqa	%xmm2,%xmm3
1107	psrlq	$4,%xmm2
1108	psrlq	$30,%xmm3
1109	movdqa	%xmm5,%xmm6
1110	psrlq	$40,%xmm4
1111	psrlq	$26,%xmm6
1112	pand	%xmm7,%xmm5
1113	pand	%xmm7,%xmm6
1114	pand	%xmm7,%xmm2
1115	pand	%xmm7,%xmm3
1116	por	(%ebx),%xmm4
1117	pshufd	$16,(%edx),%xmm7
1118	paddd	80(%esp),%xmm5
1119	paddd	96(%esp),%xmm6
1120	paddd	112(%esp),%xmm2
1121	paddd	128(%esp),%xmm3
1122	paddd	144(%esp),%xmm4
1123	movdqa	%xmm5,(%esp)
1124	pmuludq	%xmm7,%xmm5
1125	movdqa	%xmm6,16(%esp)
1126	pmuludq	%xmm7,%xmm6
1127	paddq	%xmm5,%xmm0
1128	movdqa	%xmm2,%xmm5
1129	pmuludq	%xmm7,%xmm2
1130	paddq	%xmm6,%xmm1
1131	movdqa	%xmm3,%xmm6
1132	pmuludq	%xmm7,%xmm3
1133	paddq	32(%esp),%xmm2
1134	movdqa	%xmm5,32(%esp)
1135	pshufd	$16,16(%edx),%xmm5
1136	paddq	48(%esp),%xmm3
1137	movdqa	%xmm6,48(%esp)
1138	movdqa	%xmm4,%xmm6
1139	pmuludq	%xmm7,%xmm4
1140	paddq	64(%esp),%xmm4
1141	movdqa	%xmm6,64(%esp)
1142	movdqa	%xmm5,%xmm6
1143	pmuludq	48(%esp),%xmm5
1144	movdqa	%xmm6,%xmm7
1145	pmuludq	32(%esp),%xmm6
1146	paddq	%xmm5,%xmm4
1147	movdqa	%xmm7,%xmm5
1148	pmuludq	16(%esp),%xmm7
1149	paddq	%xmm6,%xmm3
1150	pshufd	$16,80(%edx),%xmm6
1151	pmuludq	(%esp),%xmm5
1152	paddq	%xmm7,%xmm2
1153	pmuludq	64(%esp),%xmm6
1154	pshufd	$16,32(%edx),%xmm7
1155	paddq	%xmm5,%xmm1
1156	movdqa	%xmm7,%xmm5
1157	pmuludq	32(%esp),%xmm7
1158	paddq	%xmm6,%xmm0
1159	movdqa	%xmm5,%xmm6
1160	pmuludq	16(%esp),%xmm5
1161	paddq	%xmm7,%xmm4
1162	pshufd	$16,96(%edx),%xmm7
1163	pmuludq	(%esp),%xmm6
1164	paddq	%xmm5,%xmm3
1165	movdqa	%xmm7,%xmm5
1166	pmuludq	64(%esp),%xmm7
1167	paddq	%xmm6,%xmm2
1168	pmuludq	48(%esp),%xmm5
1169	pshufd	$16,48(%edx),%xmm6
1170	paddq	%xmm7,%xmm1
1171	movdqa	%xmm6,%xmm7
1172	pmuludq	16(%esp),%xmm6
1173	paddq	%xmm5,%xmm0
1174	pshufd	$16,112(%edx),%xmm5
1175	pmuludq	(%esp),%xmm7
1176	paddq	%xmm6,%xmm4
1177	movdqa	%xmm5,%xmm6
1178	pmuludq	64(%esp),%xmm5
1179	paddq	%xmm7,%xmm3
1180	movdqa	%xmm6,%xmm7
1181	pmuludq	48(%esp),%xmm6
1182	paddq	%xmm5,%xmm2
1183	pmuludq	32(%esp),%xmm7
1184	pshufd	$16,64(%edx),%xmm5
1185	paddq	%xmm6,%xmm1
1186	pshufd	$16,128(%edx),%xmm6
1187	pmuludq	(%esp),%xmm5
1188	paddq	%xmm7,%xmm0
1189	movdqa	%xmm6,%xmm7
1190	pmuludq	64(%esp),%xmm6
1191	paddq	%xmm5,%xmm4
1192	movdqa	%xmm7,%xmm5
1193	pmuludq	16(%esp),%xmm7
1194	paddq	%xmm6,%xmm3
1195	movdqa	%xmm5,%xmm6
1196	pmuludq	32(%esp),%xmm5
1197	paddq	%xmm7,%xmm0
1198	pmuludq	48(%esp),%xmm6
1199	movdqa	64(%ebx),%xmm7
1200	paddq	%xmm5,%xmm1
1201	paddq	%xmm6,%xmm2
1202.L017short_tail:
/* Horizontal add: pshufd $78 swaps the two qwords, so each sum's low lane becomes lane0 + lane1; then the usual carry propagation. */
1203	pshufd	$78,%xmm4,%xmm6
1204	pshufd	$78,%xmm3,%xmm5
1205	paddq	%xmm6,%xmm4
1206	paddq	%xmm5,%xmm3
1207	pshufd	$78,%xmm0,%xmm6
1208	pshufd	$78,%xmm1,%xmm5
1209	paddq	%xmm6,%xmm0
1210	paddq	%xmm5,%xmm1
1211	pshufd	$78,%xmm2,%xmm6
1212	movdqa	%xmm3,%xmm5
1213	pand	%xmm7,%xmm3
1214	psrlq	$26,%xmm5
1215	paddq	%xmm6,%xmm2
1216	paddq	%xmm4,%xmm5
1217	movdqa	%xmm0,%xmm6
1218	pand	%xmm7,%xmm0
1219	psrlq	$26,%xmm6
1220	movdqa	%xmm5,%xmm4
1221	paddq	%xmm1,%xmm6
1222	psrlq	$26,%xmm5
1223	pand	%xmm7,%xmm4
1224	movdqa	%xmm6,%xmm1
1225	psrlq	$26,%xmm6
1226	paddd	%xmm5,%xmm0
1227	psllq	$2,%xmm5
1228	paddq	%xmm2,%xmm6
1229	paddq	%xmm0,%xmm5
1230	pand	%xmm7,%xmm1
1231	movdqa	%xmm6,%xmm2
1232	psrlq	$26,%xmm6
1233	pand	%xmm7,%xmm2
1234	paddd	%xmm3,%xmm6
1235	movdqa	%xmm5,%xmm0
1236	psrlq	$26,%xmm5
1237	movdqa	%xmm6,%xmm3
1238	psrlq	$26,%xmm6
1239	pand	%xmm7,%xmm0
1240	paddd	%xmm5,%xmm1
1241	pand	%xmm7,%xmm3
1242	paddd	%xmm6,%xmm4
1243.L013done:
/* Write the five limbs (low dword of each register) back to state offsets 0..16; %edi was advanced by 48 earlier. Then drop the frame. */
1244	movd	%xmm0,-48(%edi)
1245	movd	%xmm1,-44(%edi)
1246	movd	%xmm2,-40(%edi)
1247	movd	%xmm3,-36(%edi)
1248	movd	%xmm4,-32(%edi)
1249	movl	%ebp,%esp
1250.L007nodata:
1251	popl	%edi
1252	popl	%esi
1253	popl	%ebx
1254	popl	%ebp
1255	ret
1256.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
1257.align	32
/*
 * _poly1305_emit_sse2 -- finalisation counterpart of _poly1305_blocks_sse2;
 * same three stack arguments as poly1305_emit (arg0 state, arg1 16-byte
 * output, arg2 pointer to four words added at the end).
 * If the representation flag at 20(state) is zero the accumulator is still
 * in 32-bit words and control jumps into poly1305_emit at .Lenter_emit.
 * Otherwise the five 26-bit limbs are first packed back into 32-bit words.
 * %xmm0-%xmm3 are used as scratch to keep a copy of the packed value.
 */
1258.type	_poly1305_emit_sse2,@function
1259.align	16
1260_poly1305_emit_sse2:
1261	pushl	%ebp
1262	pushl	%ebx
1263	pushl	%esi
1264	pushl	%edi
1265	movl	20(%esp),%ebp
1266	cmpl	$0,20(%ebp)
1267	je	.Lenter_emit
1268	movl	(%ebp),%eax
1269	movl	4(%ebp),%edi
1270	movl	8(%ebp),%ecx
1271	movl	12(%ebp),%edx
1272	movl	16(%ebp),%esi
/*
 * Pack limbs l0..l4 (at bits 0, 26, 52, 78, 104) into words:
 *   w0 = l0 + (l1 << 26), w1 = (l1 >> 6) + (l2 << 20),
 *   w2 = (l2 >> 12) + (l3 << 14), w3 = (l3 >> 18) + (l4 << 8),
 *   w4 = l4 >> 24, with carries propagated upward at each step.
 */
1273	movl	%edi,%ebx
1274	shll	$26,%edi
1275	shrl	$6,%ebx
1276	addl	%edi,%eax
1277	movl	%ecx,%edi
1278	adcl	$0,%ebx
1279	shll	$20,%edi
1280	shrl	$12,%ecx
1281	addl	%edi,%ebx
1282	movl	%edx,%edi
1283	adcl	$0,%ecx
1284	shll	$14,%edi
1285	shrl	$18,%edx
1286	addl	%edi,%ecx
1287	movl	%esi,%edi
1288	adcl	$0,%edx
1289	shll	$8,%edi
1290	shrl	$24,%esi
1291	addl	%edi,%edx
1292	adcl	$0,%esi
/* Partial reduction: keep the low two bits of the top word and fold (top >> 2) * 5 into the low word; arg1 and arg2 are loaded in between. */
1293	movl	%esi,%edi
1294	andl	$3,%esi
1295	shrl	$2,%edi
1296	leal	(%edi,%edi,4),%ebp
1297	movl	24(%esp),%edi
1298	addl	%ebp,%eax
1299	movl	28(%esp),%ebp
1300	adcl	$0,%ebx
1301	adcl	$0,%ecx
1302	adcl	$0,%edx
1303	adcl	$0,%esi
/* Save h in %xmm0-%xmm3 while computing h + 5 in the integer registers (movd does not touch the carry flag). */
1304	movd	%eax,%xmm0
1305	addl	$5,%eax
1306	movd	%ebx,%xmm1
1307	adcl	$0,%ebx
1308	movd	%ecx,%xmm2
1309	adcl	$0,%ecx
1310	movd	%edx,%xmm3
1311	adcl	$0,%edx
1312	adcl	$0,%esi
/* %esi = -((top + carry) >> 2): select mask, all-ones when the shifted value is 1 (assumes the shift yields only 0 or 1; not checked here). */
1313	shrl	$2,%esi
1314	negl	%esi
1315	andl	%esi,%eax
1316	andl	%esi,%ebx
1317	andl	%esi,%ecx
1318	andl	%esi,%edx
/* Park (h + 5) & mask in the output buffer and bring h back from the xmm copies. */
1319	movl	%eax,(%edi)
1320	movd	%xmm0,%eax
1321	movl	%ebx,4(%edi)
1322	movd	%xmm1,%ebx
1323	movl	%ecx,8(%edi)
1324	movd	%xmm2,%ecx
1325	movl	%edx,12(%edi)
1326	movd	%xmm3,%edx
/* Branch-free select between h and h + 5. */
1327	notl	%esi
1328	andl	%esi,%eax
1329	andl	%esi,%ebx
1330	orl	(%edi),%eax
1331	andl	%esi,%ecx
1332	orl	4(%edi),%ebx
1333	andl	%esi,%edx
1334	orl	8(%edi),%ecx
1335	orl	12(%edi),%edx
/* Add the four words at arg2 and store; the interleaved movl stores leave the carry chain intact. */
1336	addl	(%ebp),%eax
1337	adcl	4(%ebp),%ebx
1338	movl	%eax,(%edi)
1339	adcl	8(%ebp),%ecx
1340	movl	%ebx,4(%edi)
1341	adcl	12(%ebp),%edx
1342	movl	%ecx,8(%edi)
1343	movl	%edx,12(%edi)
1344	popl	%edi
1345	popl	%esi
1346	popl	%ebx
1347	popl	%ebp
1348	ret
1349.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2: internal helper reached by `call` from
 * _poly1305_blocks_avx2.  It takes its inputs in registers, not on the stack.
 *   In:   %edi = context pointer; the 16 bytes at 24(%edi) are the base value
 *                (the slot poly1305_init fills with the masked key words)
 *         %ebx = address of .Lconst_sse2; 64(%ebx) supplies the limb mask
 *   Out:  144 bytes of table written to offsets 48..191 of the context,
 *         %edi restored to its entry value
 *   Uses: %ecx, %edx, %ebp (holds the caller's %esp), %xmm0-%xmm7, flags
 */
1350.align	32
1351.type	_poly1305_init_avx2,@function
1352.align	16
1353_poly1305_init_avx2:
	/* xmm4 = 128-bit base value; %edi is advanced to the table area */
1354	vmovdqu	24(%edi),%xmm4
1355	leal	48(%edi),%edi
	/* Carve a 16-byte aligned scratch frame; %ebp remembers the old %esp */
1356	movl	%esp,%ebp
1357	subl	$224,%esp
1358	andl	$-16,%esp
	/*
	 * Split the value into five limbs: the low quadwords of xmm0..xmm4
	 * receive bits 0-25, 26-51, 52-77, 78-103 and 104-127.  xmm7 is the
	 * 0x3ffffff mask from the constant table.
	 */
1359	vmovdqa	64(%ebx),%xmm7
1360	vpand	%xmm7,%xmm4,%xmm0
1361	vpsrlq	$26,%xmm4,%xmm1
1362	vpsrldq	$6,%xmm4,%xmm3
1363	vpand	%xmm7,%xmm1,%xmm1
1364	vpsrlq	$4,%xmm3,%xmm2
1365	vpsrlq	$30,%xmm3,%xmm3
1366	vpand	%xmm7,%xmm2,%xmm2
1367	vpand	%xmm7,%xmm3,%xmm3
1368	vpsrldq	$13,%xmm4,%xmm4
	/* %edx = second scratch area (broadcast copies); %ecx = pass counter */
1369	leal	144(%esp),%edx
1370	movl	$2,%ecx
	/*
	 * Each pass multiplies the limb vectors in xmm0..xmm4 by a copy of
	 * their own low quadword broadcast to both quadwords.
	 */
1371.L018square:
	/* Save the current limbs at 0..64(%esp) */
1372	vmovdqa	%xmm0,(%esp)
1373	vmovdqa	%xmm1,16(%esp)
1374	vmovdqa	%xmm2,32(%esp)
1375	vmovdqa	%xmm3,48(%esp)
1376	vmovdqa	%xmm4,64(%esp)
	/* 5*limb1..5*limb4 (x*4 + x) go to 80..128(%esp) */
1377	vpslld	$2,%xmm1,%xmm6
1378	vpslld	$2,%xmm2,%xmm5
1379	vpaddd	%xmm1,%xmm6,%xmm6
1380	vpaddd	%xmm2,%xmm5,%xmm5
1381	vmovdqa	%xmm6,80(%esp)
1382	vmovdqa	%xmm5,96(%esp)
1383	vpslld	$2,%xmm3,%xmm6
1384	vpslld	$2,%xmm4,%xmm5
1385	vpaddd	%xmm3,%xmm6,%xmm6
1386	vpaddd	%xmm4,%xmm5,%xmm5
1387	vmovdqa	%xmm6,112(%esp)
1388	vmovdqa	%xmm5,128(%esp)
	/* vpshufd $68 duplicates the low quadword; copies are kept at 0..64(%edx) */
1389	vpshufd	$68,%xmm0,%xmm5
1390	vmovdqa	%xmm1,%xmm6
1391	vpshufd	$68,%xmm1,%xmm1
1392	vpshufd	$68,%xmm2,%xmm2
1393	vpshufd	$68,%xmm3,%xmm3
1394	vpshufd	$68,%xmm4,%xmm4
1395	vmovdqa	%xmm5,(%edx)
1396	vmovdqa	%xmm1,16(%edx)
1397	vmovdqa	%xmm2,32(%edx)
1398	vmovdqa	%xmm3,48(%edx)
1399	vmovdqa	%xmm4,64(%edx)
	/*
	 * Limb-by-limb products accumulated as 64-bit sums in xmm0..xmm4.
	 * Terms that would land above limb 4 use the 5x copies instead,
	 * which is consistent with arithmetic modulo 2^130-5.
	 */
1400	vpmuludq	%xmm0,%xmm4,%xmm4
1401	vpmuludq	%xmm0,%xmm3,%xmm3
1402	vpmuludq	%xmm0,%xmm2,%xmm2
1403	vpmuludq	%xmm0,%xmm1,%xmm1
1404	vpmuludq	%xmm0,%xmm5,%xmm0
1405	vpmuludq	48(%edx),%xmm6,%xmm5
1406	vpaddq	%xmm5,%xmm4,%xmm4
1407	vpmuludq	32(%edx),%xmm6,%xmm7
1408	vpaddq	%xmm7,%xmm3,%xmm3
1409	vpmuludq	16(%edx),%xmm6,%xmm5
1410	vpaddq	%xmm5,%xmm2,%xmm2
1411	vmovdqa	80(%esp),%xmm7
1412	vpmuludq	(%edx),%xmm6,%xmm6
1413	vpaddq	%xmm6,%xmm1,%xmm1
1414	vmovdqa	32(%esp),%xmm5
1415	vpmuludq	64(%edx),%xmm7,%xmm7
1416	vpaddq	%xmm7,%xmm0,%xmm0
1417	vpmuludq	32(%edx),%xmm5,%xmm6
1418	vpaddq	%xmm6,%xmm4,%xmm4
1419	vpmuludq	16(%edx),%xmm5,%xmm7
1420	vpaddq	%xmm7,%xmm3,%xmm3
1421	vmovdqa	96(%esp),%xmm6
1422	vpmuludq	(%edx),%xmm5,%xmm5
1423	vpaddq	%xmm5,%xmm2,%xmm2
1424	vpmuludq	64(%edx),%xmm6,%xmm7
1425	vpaddq	%xmm7,%xmm1,%xmm1
1426	vmovdqa	48(%esp),%xmm5
1427	vpmuludq	48(%edx),%xmm6,%xmm6
1428	vpaddq	%xmm6,%xmm0,%xmm0
1429	vpmuludq	16(%edx),%xmm5,%xmm7
1430	vpaddq	%xmm7,%xmm4,%xmm4
1431	vmovdqa	112(%esp),%xmm6
1432	vpmuludq	(%edx),%xmm5,%xmm5
1433	vpaddq	%xmm5,%xmm3,%xmm3
1434	vpmuludq	64(%edx),%xmm6,%xmm7
1435	vpaddq	%xmm7,%xmm2,%xmm2
1436	vpmuludq	48(%edx),%xmm6,%xmm5
1437	vpaddq	%xmm5,%xmm1,%xmm1
1438	vmovdqa	64(%esp),%xmm7
1439	vpmuludq	32(%edx),%xmm6,%xmm6
1440	vpaddq	%xmm6,%xmm0,%xmm0
1441	vmovdqa	128(%esp),%xmm5
1442	vpmuludq	(%edx),%xmm7,%xmm7
1443	vpaddq	%xmm7,%xmm4,%xmm4
1444	vpmuludq	64(%edx),%xmm5,%xmm6
1445	vpaddq	%xmm6,%xmm3,%xmm3
1446	vpmuludq	16(%edx),%xmm5,%xmm7
1447	vpaddq	%xmm7,%xmm0,%xmm0
1448	vpmuludq	32(%edx),%xmm5,%xmm6
1449	vpaddq	%xmm6,%xmm1,%xmm1
	/* xmm7 was used as a temporary above: reload the 26-bit mask */
1450	vmovdqa	64(%ebx),%xmm7
1451	vpmuludq	48(%edx),%xmm5,%xmm5
1452	vpaddq	%xmm5,%xmm2,%xmm2
	/*
	 * Carry propagation: each limb is masked back to 26 bits and its
	 * overflow added to the next limb.  The carry out of limb 4 is added
	 * to limb 0 once directly and once shifted left by 2, i.e. times 5.
	 */
1453	vpsrlq	$26,%xmm3,%xmm5
1454	vpand	%xmm7,%xmm3,%xmm3
1455	vpsrlq	$26,%xmm0,%xmm6
1456	vpand	%xmm7,%xmm0,%xmm0
1457	vpaddq	%xmm5,%xmm4,%xmm4
1458	vpaddq	%xmm6,%xmm1,%xmm1
1459	vpsrlq	$26,%xmm4,%xmm5
1460	vpand	%xmm7,%xmm4,%xmm4
1461	vpsrlq	$26,%xmm1,%xmm6
1462	vpand	%xmm7,%xmm1,%xmm1
1463	vpaddq	%xmm6,%xmm2,%xmm2
1464	vpaddd	%xmm5,%xmm0,%xmm0
1465	vpsllq	$2,%xmm5,%xmm5
1466	vpsrlq	$26,%xmm2,%xmm6
1467	vpand	%xmm7,%xmm2,%xmm2
1468	vpaddd	%xmm5,%xmm0,%xmm0
1469	vpaddd	%xmm6,%xmm3,%xmm3
1470	vpsrlq	$26,%xmm3,%xmm6
1471	vpsrlq	$26,%xmm0,%xmm5
1472	vpand	%xmm7,%xmm0,%xmm0
1473	vpand	%xmm7,%xmm3,%xmm3
1474	vpaddd	%xmm5,%xmm1,%xmm1
1475	vpaddd	%xmm6,%xmm4,%xmm4
1476	decl	%ecx
1477	jz	.L019square_break
	/*
	 * After the first pass: keep the new product in the low quadword and
	 * pair it with the previous limbs (low quadword of the saved copy) in
	 * the high quadword, then run the multiplication once more.
	 */
1478	vpunpcklqdq	(%esp),%xmm0,%xmm0
1479	vpunpcklqdq	16(%esp),%xmm1,%xmm1
1480	vpunpcklqdq	32(%esp),%xmm2,%xmm2
1481	vpunpcklqdq	48(%esp),%xmm3,%xmm3
1482	vpunpcklqdq	64(%esp),%xmm4,%xmm4
1483	jmp	.L018square
1484.L019square_break:
	/*
	 * Move the second-pass products into the upper dword of each quadword,
	 * OR in the second-pass operands saved at 0..64(%esp), and reorder the
	 * dwords with vpshufd $141.  Following the shuffles, with r the base
	 * value, each stored limb vector holds the limbs of r^4, r^3, r^2, r
	 * in increasing address order.
	 */
1485	vpsllq	$32,%xmm0,%xmm0
1486	vpsllq	$32,%xmm1,%xmm1
1487	vpsllq	$32,%xmm2,%xmm2
1488	vpsllq	$32,%xmm3,%xmm3
1489	vpsllq	$32,%xmm4,%xmm4
1490	vpor	(%esp),%xmm0,%xmm0
1491	vpor	16(%esp),%xmm1,%xmm1
1492	vpor	32(%esp),%xmm2,%xmm2
1493	vpor	48(%esp),%xmm3,%xmm3
1494	vpor	64(%esp),%xmm4,%xmm4
1495	vpshufd	$141,%xmm0,%xmm0
1496	vpshufd	$141,%xmm1,%xmm1
1497	vpshufd	$141,%xmm2,%xmm2
1498	vpshufd	$141,%xmm3,%xmm3
1499	vpshufd	$141,%xmm4,%xmm4
	/* Table entries for limbs 0..4 */
1500	vmovdqu	%xmm0,(%edi)
1501	vmovdqu	%xmm1,16(%edi)
1502	vmovdqu	%xmm2,32(%edi)
1503	vmovdqu	%xmm3,48(%edi)
1504	vmovdqu	%xmm4,64(%edi)
	/* Followed by 5x copies of limbs 1..4 */
1505	vpslld	$2,%xmm1,%xmm6
1506	vpslld	$2,%xmm2,%xmm5
1507	vpaddd	%xmm1,%xmm6,%xmm6
1508	vpaddd	%xmm2,%xmm5,%xmm5
1509	vmovdqu	%xmm6,80(%edi)
1510	vmovdqu	%xmm5,96(%edi)
1511	vpslld	$2,%xmm3,%xmm6
1512	vpslld	$2,%xmm4,%xmm5
1513	vpaddd	%xmm3,%xmm6,%xmm6
1514	vpaddd	%xmm4,%xmm5,%xmm5
1515	vmovdqu	%xmm6,112(%edi)
1516	vmovdqu	%xmm5,128(%edi)
	/* Drop the scratch frame and hand back the original context pointer */
1517	movl	%ebp,%esp
1518	leal	-48(%edi),%edi
1519	ret
1520.size	_poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2: AVX2 block routine.  poly1305_init stores its
 * address in the caller's function table when bit 5 of the third
 * OPENSSL_ia32cap_P word is set.
 * Stack arguments, read after the four register pushes:
 *   20(%esp) = context      24(%esp) = input pointer
 *   28(%esp) = length       32(%esp) = fourth argument (pad-bit selector)
 * Context fields touched here: 0..16 accumulator, 20 "accumulator is in
 * 26-bit limbs" flag, 48.. table written by _poly1305_init_avx2.
 * Saves and restores %ebp, %ebx, %esi, %edi; uses %ymm0-%ymm7.
 */
1521.align	32
1522.type	_poly1305_blocks_avx2,@function
1523.align	16
1524_poly1305_blocks_avx2:
1525	pushl	%ebp
1526	pushl	%ebx
1527	pushl	%esi
1528	pushl	%edi
1529	movl	20(%esp),%edi
1530	movl	24(%esp),%esi
1531	movl	28(%esp),%ecx
	/* %eax = limb-format flag; length is rounded down to whole 16-byte blocks */
1532	movl	20(%edi),%eax
1533	andl	$-16,%ecx
1534	jz	.L020nodata
	/*
	 * Fewer than 64 bytes and the accumulator not yet converted: hand over
	 * to the scalar loop in poly1305_blocks (same pushes, same registers).
	 */
1535	cmpl	$64,%ecx
1536	jae	.L021enter_avx2
1537	testl	%eax,%eax
1538	jz	.Lenter_blocks
1539.L021enter_avx2:
1540	vzeroupper
	/* call/pop yields the current address; %ebx = address of .Lconst_sse2 */
1541	call	.L022pic_point
1542.L022pic_point:
1543	popl	%ebx
1544	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
1545	testl	%eax,%eax
1546	jnz	.L023base2_26
	/* First SIMD use of this context: build the table at 48(%edi) */
1547	call	_poly1305_init_avx2
	/*
	 * Convert the accumulator to five 26-bit limbs: loads at byte offsets
	 * 0,3,6,9,13 shifted right by 0,2,4,6,0 start at bit 0,26,52,78,104.
	 * The first three are masked; the last two are stored as loaded.
	 */
1548	movl	(%edi),%eax
1549	movl	3(%edi),%ecx
1550	movl	6(%edi),%edx
1551	movl	9(%edi),%esi
1552	movl	13(%edi),%ebp
1553	shrl	$2,%ecx
1554	andl	$67108863,%eax
1555	shrl	$4,%edx
1556	andl	$67108863,%ecx
1557	shrl	$6,%esi
1558	andl	$67108863,%edx
1559	movl	%eax,(%edi)
1560	movl	%ecx,4(%edi)
1561	movl	%edx,8(%edi)
1562	movl	%esi,12(%edi)
1563	movl	%ebp,16(%edi)
	/* Mark the context as converted, then reload the clobbered arguments */
1564	movl	$1,20(%edi)
1565	movl	24(%esp),%esi
	/*
	 * NOTE(review): the length is reloaded without the -16 mask applied
	 * above; this looks like it relies on callers passing a multiple of
	 * 16 - confirm against the callers.
	 */
1566	movl	28(%esp),%ecx
1567.L023base2_26:
	/* %eax = fourth argument; %ebp keeps the old %esp across the frame */
1568	movl	32(%esp),%eax
1569	movl	%esp,%ebp
1570	subl	$448,%esp
1571	andl	$-512,%esp
	/*
	 * Copy the nine 16-byte table entries from the context into 32-byte
	 * slots at -128..128(%edx) (%edx = frame+288), rearranged with
	 * vpermq $64 / vpshufd $200.  %edi is advanced by 48 part-way through,
	 * so the second group of loads uses displacements relative to ctx+48.
	 */
1572	vmovdqu	48(%edi),%xmm0
1573	leal	288(%esp),%edx
1574	vmovdqu	64(%edi),%xmm1
1575	vmovdqu	80(%edi),%xmm2
1576	vmovdqu	96(%edi),%xmm3
1577	vmovdqu	112(%edi),%xmm4
1578	leal	48(%edi),%edi
1579	vpermq	$64,%ymm0,%ymm0
1580	vpermq	$64,%ymm1,%ymm1
1581	vpermq	$64,%ymm2,%ymm2
1582	vpermq	$64,%ymm3,%ymm3
1583	vpermq	$64,%ymm4,%ymm4
1584	vpshufd	$200,%ymm0,%ymm0
1585	vpshufd	$200,%ymm1,%ymm1
1586	vpshufd	$200,%ymm2,%ymm2
1587	vpshufd	$200,%ymm3,%ymm3
1588	vpshufd	$200,%ymm4,%ymm4
1589	vmovdqa	%ymm0,-128(%edx)
1590	vmovdqu	80(%edi),%xmm0
1591	vmovdqa	%ymm1,-96(%edx)
1592	vmovdqu	96(%edi),%xmm1
1593	vmovdqa	%ymm2,-64(%edx)
1594	vmovdqu	112(%edi),%xmm2
1595	vmovdqa	%ymm3,-32(%edx)
1596	vmovdqu	128(%edi),%xmm3
1597	vmovdqa	%ymm4,(%edx)
1598	vpermq	$64,%ymm0,%ymm0
1599	vpermq	$64,%ymm1,%ymm1
1600	vpermq	$64,%ymm2,%ymm2
1601	vpermq	$64,%ymm3,%ymm3
1602	vpshufd	$200,%ymm0,%ymm0
1603	vpshufd	$200,%ymm1,%ymm1
1604	vpshufd	$200,%ymm2,%ymm2
1605	vpshufd	$200,%ymm3,%ymm3
	/* The five accumulator limbs (ctx+0..16) are loaded into xmm0..xmm4 as the table stores complete */
1606	vmovdqa	%ymm0,32(%edx)
1607	vmovd	-48(%edi),%xmm0
1608	vmovdqa	%ymm1,64(%edx)
1609	vmovd	-44(%edi),%xmm1
1610	vmovdqa	%ymm2,96(%edx)
1611	vmovd	-40(%edi),%xmm2
1612	vmovdqa	%ymm3,128(%edx)
1613	vmovd	-36(%edi),%xmm3
1614	vmovd	-32(%edi),%xmm4
	/* ymm7 = 0x3ffffff limb mask in every quadword */
1615	vmovdqa	64(%ebx),%ymm7
	/* %eax = -(fourth argument); used below to index the pad-bit row */
1616	negl	%eax
	/* Length a multiple of 64: straight to the 4-block path */
1617	testl	$63,%ecx
1618	jz	.L024even
	/*
	 * Otherwise take the remainder (%edx = len mod 64) first, as one, two
	 * or three blocks.  Each case advances %esi, offsets %ebx into the
	 * constant table so that only some lanes receive the 1<<24 pad bit,
	 * and picks a table base for the tail code.
	 */
1619	movl	%ecx,%edx
1620	andl	$-64,%ecx
1621	andl	$63,%edx
1622	vmovdqu	(%esi),%xmm5
1623	cmpl	$32,%edx
1624	jb	.L025one
1625	vmovdqu	16(%esi),%xmm6
1626	je	.L026two
	/* Three blocks */
1627	vinserti128	$1,32(%esi),%ymm5,%ymm5
1628	leal	48(%esi),%esi
1629	leal	8(%ebx),%ebx
1630	leal	296(%esp),%edx
1631	jmp	.L027tail
1632.L026two:
1633	leal	32(%esi),%esi
1634	leal	16(%ebx),%ebx
1635	leal	304(%esp),%edx
1636	jmp	.L027tail
1637.L025one:
	/* One block: offset 24 or 32 into the table, so the pad bit is present only when the fourth argument is 1 */
1638	leal	16(%esi),%esi
1639	vpxor	%ymm6,%ymm6,%ymm6
1640	leal	32(%ebx,%eax,8),%ebx
1641	leal	312(%esp),%edx
1642	jmp	.L027tail
1643.align	32
1644.L024even:
	/* Load four 16-byte blocks into ymm5/ymm6; %ecx counts remaining bytes */
1645	vmovdqu	(%esi),%xmm5
1646	vmovdqu	16(%esi),%xmm6
1647	vinserti128	$1,32(%esi),%ymm5,%ymm5
1648	vinserti128	$1,48(%esi),%ymm6,%ymm6
1649	leal	64(%esi),%esi
1650	subl	$64,%ecx
1651	jz	.L027tail
	/*
	 * Main loop, 64 bytes per iteration: split the input into 26-bit
	 * limbs, add them to the accumulator lanes, multiply by the table
	 * entries, propagate carries, then fetch the next 64 bytes.
	 */
1652.L028loop:
	/* Spill accumulator limbs 0..2 so ymm0..ymm2 can be used for the split */
1653	vmovdqa	%ymm2,64(%esp)
1654	vpsrldq	$6,%ymm5,%ymm2
1655	vmovdqa	%ymm0,(%esp)
1656	vpsrldq	$6,%ymm6,%ymm0
1657	vmovdqa	%ymm1,32(%esp)
1658	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1659	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1660	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1661	vpsrlq	$30,%ymm2,%ymm0
1662	vpsrlq	$4,%ymm2,%ymm2
1663	vpsrlq	$26,%ymm5,%ymm6
1664	vpsrlq	$40,%ymm1,%ymm1
1665	vpand	%ymm7,%ymm2,%ymm2
1666	vpand	%ymm7,%ymm5,%ymm5
1667	vpand	%ymm7,%ymm6,%ymm6
1668	vpand	%ymm7,%ymm0,%ymm0
	/* Top limb starts at bit 104, so 1<<24 from the table sets bit 128 */
1669	vpor	(%ebx),%ymm1,%ymm1
	/* accumulator += input limbs */
1670	vpaddq	64(%esp),%ymm2,%ymm2
1671	vpaddq	(%esp),%ymm5,%ymm5
1672	vpaddq	32(%esp),%ymm6,%ymm6
1673	vpaddq	%ymm3,%ymm0,%ymm0
1674	vpaddq	%ymm4,%ymm1,%ymm1
	/* Multiply by the table; sums collect in ymm0..ymm4 */
1675	vpmuludq	-96(%edx),%ymm2,%ymm3
1676	vmovdqa	%ymm6,32(%esp)
1677	vpmuludq	-64(%edx),%ymm2,%ymm4
1678	vmovdqa	%ymm0,96(%esp)
1679	vpmuludq	96(%edx),%ymm2,%ymm0
1680	vmovdqa	%ymm1,128(%esp)
1681	vpmuludq	128(%edx),%ymm2,%ymm1
1682	vpmuludq	-128(%edx),%ymm2,%ymm2
1683	vpmuludq	-32(%edx),%ymm5,%ymm7
1684	vpaddq	%ymm7,%ymm3,%ymm3
1685	vpmuludq	(%edx),%ymm5,%ymm6
1686	vpaddq	%ymm6,%ymm4,%ymm4
1687	vpmuludq	-128(%edx),%ymm5,%ymm7
1688	vpaddq	%ymm7,%ymm0,%ymm0
1689	vmovdqa	32(%esp),%ymm7
1690	vpmuludq	-96(%edx),%ymm5,%ymm6
1691	vpaddq	%ymm6,%ymm1,%ymm1
1692	vpmuludq	-64(%edx),%ymm5,%ymm5
1693	vpaddq	%ymm5,%ymm2,%ymm2
1694	vpmuludq	-64(%edx),%ymm7,%ymm6
1695	vpaddq	%ymm6,%ymm3,%ymm3
1696	vpmuludq	-32(%edx),%ymm7,%ymm5
1697	vpaddq	%ymm5,%ymm4,%ymm4
1698	vpmuludq	128(%edx),%ymm7,%ymm6
1699	vpaddq	%ymm6,%ymm0,%ymm0
1700	vmovdqa	96(%esp),%ymm6
1701	vpmuludq	-128(%edx),%ymm7,%ymm5
1702	vpaddq	%ymm5,%ymm1,%ymm1
1703	vpmuludq	-96(%edx),%ymm7,%ymm7
1704	vpaddq	%ymm7,%ymm2,%ymm2
1705	vpmuludq	-128(%edx),%ymm6,%ymm5
1706	vpaddq	%ymm5,%ymm3,%ymm3
1707	vpmuludq	-96(%edx),%ymm6,%ymm7
1708	vpaddq	%ymm7,%ymm4,%ymm4
1709	vpmuludq	64(%edx),%ymm6,%ymm5
1710	vpaddq	%ymm5,%ymm0,%ymm0
1711	vmovdqa	128(%esp),%ymm5
1712	vpmuludq	96(%edx),%ymm6,%ymm7
1713	vpaddq	%ymm7,%ymm1,%ymm1
1714	vpmuludq	128(%edx),%ymm6,%ymm6
1715	vpaddq	%ymm6,%ymm2,%ymm2
1716	vpmuludq	128(%edx),%ymm5,%ymm7
1717	vpaddq	%ymm7,%ymm3,%ymm3
1718	vpmuludq	32(%edx),%ymm5,%ymm6
1719	vpaddq	%ymm6,%ymm0,%ymm0
1720	vpmuludq	-128(%edx),%ymm5,%ymm7
1721	vpaddq	%ymm7,%ymm4,%ymm4
	/* ymm7 served as a temporary: reload the limb mask */
1722	vmovdqa	64(%ebx),%ymm7
1723	vpmuludq	64(%edx),%ymm5,%ymm6
1724	vpaddq	%ymm6,%ymm1,%ymm1
1725	vpmuludq	96(%edx),%ymm5,%ymm5
1726	vpaddq	%ymm5,%ymm2,%ymm2
	/* Carry propagation; the carry out of limb 4 re-enters limb 0 times 5 */
1727	vpsrlq	$26,%ymm3,%ymm5
1728	vpand	%ymm7,%ymm3,%ymm3
1729	vpsrlq	$26,%ymm0,%ymm6
1730	vpand	%ymm7,%ymm0,%ymm0
1731	vpaddq	%ymm5,%ymm4,%ymm4
1732	vpaddq	%ymm6,%ymm1,%ymm1
1733	vpsrlq	$26,%ymm4,%ymm5
1734	vpand	%ymm7,%ymm4,%ymm4
1735	vpsrlq	$26,%ymm1,%ymm6
1736	vpand	%ymm7,%ymm1,%ymm1
1737	vpaddq	%ymm6,%ymm2,%ymm2
1738	vpaddq	%ymm5,%ymm0,%ymm0
1739	vpsllq	$2,%ymm5,%ymm5
1740	vpsrlq	$26,%ymm2,%ymm6
1741	vpand	%ymm7,%ymm2,%ymm2
1742	vpaddq	%ymm5,%ymm0,%ymm0
1743	vpaddq	%ymm6,%ymm3,%ymm3
1744	vpsrlq	$26,%ymm3,%ymm6
1745	vpsrlq	$26,%ymm0,%ymm5
1746	vpand	%ymm7,%ymm0,%ymm0
1747	vpand	%ymm7,%ymm3,%ymm3
1748	vpaddq	%ymm5,%ymm1,%ymm1
1749	vpaddq	%ymm6,%ymm4,%ymm4
	/* Fetch the next 64 bytes; loop while input remains */
1750	vmovdqu	(%esi),%xmm5
1751	vmovdqu	16(%esi),%xmm6
1752	vinserti128	$1,32(%esi),%ymm5,%ymm5
1753	vinserti128	$1,48(%esi),%ymm6,%ymm6
1754	leal	64(%esi),%esi
1755	subl	$64,%ecx
1756	jnz	.L028loop
	/*
	 * Tail: the same split/add/multiply sequence for the last group, but
	 * every table displacement is 4 bytes larger than in the loop, so a
	 * different dword of each slot is used (presumably the lower powers
	 * needed to finish - TODO confirm against the table layout).  The
	 * lanes are then summed horizontally into one accumulator.
	 */
1757.L027tail:
1758	vmovdqa	%ymm2,64(%esp)
1759	vpsrldq	$6,%ymm5,%ymm2
1760	vmovdqa	%ymm0,(%esp)
1761	vpsrldq	$6,%ymm6,%ymm0
1762	vmovdqa	%ymm1,32(%esp)
1763	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1764	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1765	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1766	vpsrlq	$30,%ymm2,%ymm0
1767	vpsrlq	$4,%ymm2,%ymm2
1768	vpsrlq	$26,%ymm5,%ymm6
1769	vpsrlq	$40,%ymm1,%ymm1
1770	vpand	%ymm7,%ymm2,%ymm2
1771	vpand	%ymm7,%ymm5,%ymm5
1772	vpand	%ymm7,%ymm6,%ymm6
1773	vpand	%ymm7,%ymm0,%ymm0
1774	vpor	(%ebx),%ymm1,%ymm1
	/* Undo the 8/16/24/32-byte offset: the table is 64-byte aligned */
1775	andl	$-64,%ebx
1776	vpaddq	64(%esp),%ymm2,%ymm2
1777	vpaddq	(%esp),%ymm5,%ymm5
1778	vpaddq	32(%esp),%ymm6,%ymm6
1779	vpaddq	%ymm3,%ymm0,%ymm0
1780	vpaddq	%ymm4,%ymm1,%ymm1
1781	vpmuludq	-92(%edx),%ymm2,%ymm3
1782	vmovdqa	%ymm6,32(%esp)
1783	vpmuludq	-60(%edx),%ymm2,%ymm4
1784	vmovdqa	%ymm0,96(%esp)
1785	vpmuludq	100(%edx),%ymm2,%ymm0
1786	vmovdqa	%ymm1,128(%esp)
1787	vpmuludq	132(%edx),%ymm2,%ymm1
1788	vpmuludq	-124(%edx),%ymm2,%ymm2
1789	vpmuludq	-28(%edx),%ymm5,%ymm7
1790	vpaddq	%ymm7,%ymm3,%ymm3
1791	vpmuludq	4(%edx),%ymm5,%ymm6
1792	vpaddq	%ymm6,%ymm4,%ymm4
1793	vpmuludq	-124(%edx),%ymm5,%ymm7
1794	vpaddq	%ymm7,%ymm0,%ymm0
1795	vmovdqa	32(%esp),%ymm7
1796	vpmuludq	-92(%edx),%ymm5,%ymm6
1797	vpaddq	%ymm6,%ymm1,%ymm1
1798	vpmuludq	-60(%edx),%ymm5,%ymm5
1799	vpaddq	%ymm5,%ymm2,%ymm2
1800	vpmuludq	-60(%edx),%ymm7,%ymm6
1801	vpaddq	%ymm6,%ymm3,%ymm3
1802	vpmuludq	-28(%edx),%ymm7,%ymm5
1803	vpaddq	%ymm5,%ymm4,%ymm4
1804	vpmuludq	132(%edx),%ymm7,%ymm6
1805	vpaddq	%ymm6,%ymm0,%ymm0
1806	vmovdqa	96(%esp),%ymm6
1807	vpmuludq	-124(%edx),%ymm7,%ymm5
1808	vpaddq	%ymm5,%ymm1,%ymm1
1809	vpmuludq	-92(%edx),%ymm7,%ymm7
1810	vpaddq	%ymm7,%ymm2,%ymm2
1811	vpmuludq	-124(%edx),%ymm6,%ymm5
1812	vpaddq	%ymm5,%ymm3,%ymm3
1813	vpmuludq	-92(%edx),%ymm6,%ymm7
1814	vpaddq	%ymm7,%ymm4,%ymm4
1815	vpmuludq	68(%edx),%ymm6,%ymm5
1816	vpaddq	%ymm5,%ymm0,%ymm0
1817	vmovdqa	128(%esp),%ymm5
1818	vpmuludq	100(%edx),%ymm6,%ymm7
1819	vpaddq	%ymm7,%ymm1,%ymm1
1820	vpmuludq	132(%edx),%ymm6,%ymm6
1821	vpaddq	%ymm6,%ymm2,%ymm2
1822	vpmuludq	132(%edx),%ymm5,%ymm7
1823	vpaddq	%ymm7,%ymm3,%ymm3
1824	vpmuludq	36(%edx),%ymm5,%ymm6
1825	vpaddq	%ymm6,%ymm0,%ymm0
1826	vpmuludq	-124(%edx),%ymm5,%ymm7
1827	vpaddq	%ymm7,%ymm4,%ymm4
1828	vmovdqa	64(%ebx),%ymm7
1829	vpmuludq	68(%edx),%ymm5,%ymm6
1830	vpaddq	%ymm6,%ymm1,%ymm1
1831	vpmuludq	100(%edx),%ymm5,%ymm5
1832	vpaddq	%ymm5,%ymm2,%ymm2
	/* Horizontal sum: add the upper quadword of each 128-bit half, then the upper half (vpermq $2) */
1833	vpsrldq	$8,%ymm4,%ymm5
1834	vpsrldq	$8,%ymm3,%ymm6
1835	vpaddq	%ymm5,%ymm4,%ymm4
1836	vpsrldq	$8,%ymm0,%ymm5
1837	vpaddq	%ymm6,%ymm3,%ymm3
1838	vpsrldq	$8,%ymm1,%ymm6
1839	vpaddq	%ymm5,%ymm0,%ymm0
1840	vpsrldq	$8,%ymm2,%ymm5
1841	vpaddq	%ymm6,%ymm1,%ymm1
1842	vpermq	$2,%ymm4,%ymm6
1843	vpaddq	%ymm5,%ymm2,%ymm2
1844	vpermq	$2,%ymm3,%ymm5
1845	vpaddq	%ymm6,%ymm4,%ymm4
1846	vpermq	$2,%ymm0,%ymm6
1847	vpaddq	%ymm5,%ymm3,%ymm3
1848	vpermq	$2,%ymm1,%ymm5
1849	vpaddq	%ymm6,%ymm0,%ymm0
1850	vpermq	$2,%ymm2,%ymm6
1851	vpaddq	%ymm5,%ymm1,%ymm1
1852	vpaddq	%ymm6,%ymm2,%ymm2
	/* Same carry propagation as in the loop */
1853	vpsrlq	$26,%ymm3,%ymm5
1854	vpand	%ymm7,%ymm3,%ymm3
1855	vpsrlq	$26,%ymm0,%ymm6
1856	vpand	%ymm7,%ymm0,%ymm0
1857	vpaddq	%ymm5,%ymm4,%ymm4
1858	vpaddq	%ymm6,%ymm1,%ymm1
1859	vpsrlq	$26,%ymm4,%ymm5
1860	vpand	%ymm7,%ymm4,%ymm4
1861	vpsrlq	$26,%ymm1,%ymm6
1862	vpand	%ymm7,%ymm1,%ymm1
1863	vpaddq	%ymm6,%ymm2,%ymm2
1864	vpaddq	%ymm5,%ymm0,%ymm0
1865	vpsllq	$2,%ymm5,%ymm5
1866	vpsrlq	$26,%ymm2,%ymm6
1867	vpand	%ymm7,%ymm2,%ymm2
1868	vpaddq	%ymm5,%ymm0,%ymm0
1869	vpaddq	%ymm6,%ymm3,%ymm3
1870	vpsrlq	$26,%ymm3,%ymm6
1871	vpsrlq	$26,%ymm0,%ymm5
1872	vpand	%ymm7,%ymm0,%ymm0
1873	vpand	%ymm7,%ymm3,%ymm3
1874	vpaddq	%ymm5,%ymm1,%ymm1
1875	vpaddq	%ymm6,%ymm4,%ymm4
	/*
	 * %ecx still non-zero means the tail handled only the odd remainder:
	 * reshuffle the accumulator, reset the table base and continue with
	 * the 64-byte path.
	 */
1876	cmpl	$0,%ecx
1877	je	.L029done
1878	vpshufd	$252,%xmm0,%xmm0
1879	leal	288(%esp),%edx
1880	vpshufd	$252,%xmm1,%xmm1
1881	vpshufd	$252,%xmm2,%xmm2
1882	vpshufd	$252,%xmm3,%xmm3
1883	vpshufd	$252,%xmm4,%xmm4
1884	jmp	.L024even
1885.align	16
1886.L029done:
	/* Store the five limbs back to ctx+0..16 (%edi is ctx+48 here) */
1887	vmovd	%xmm0,-48(%edi)
1888	vmovd	%xmm1,-44(%edi)
1889	vmovd	%xmm2,-40(%edi)
1890	vmovd	%xmm3,-36(%edi)
1891	vmovd	%xmm4,-32(%edi)
1892	vzeroupper
1893	movl	%ebp,%esp
1894.L020nodata:
1895	popl	%edi
1896	popl	%esi
1897	popl	%ebx
1898	popl	%ebp
1899	ret
1900.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant table shared by the SIMD routines, which address it through
 * %ebx.  64-byte aligned; the AVX2 tail code relies on that when it
 * clears the low six bits of %ebx to get back to the table start.
 */
1901.align	64
1902.Lconst_sse2:
/* Offset 0: 1<<24 in every quadword - ORed into the top limb of each input block */
1903.long	16777216,0,16777216,0,16777216,0,16777216,0
/* Offset 32: zeros - reached by the offset reads at 8/16/24/32(%ebx), which leave some or all lanes without the pad bit */
1904.long	0,0,0,0,0,0,0,0
/* Offset 64: 0x3ffffff in every quadword - the 26-bit limb mask */
1905.long	67108863,0,67108863,0,67108863,0,67108863,0
/* Offset 96: 0x0fffffff,0x0ffffffc x3 - the same values poly1305_init uses as immediates to mask the key; no reference to this row is visible in this part of the file */
1906.long	268435455,268435452,268435452,268435452
/* ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" plus NUL */
1907.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1908.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1909.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1910.byte	114,103,62,0
1911.align	4
/* 16-byte, 4-byte aligned common symbol; poly1305_init reads its first and third dwords */
1912.comm	OPENSSL_ia32cap_P,16,4
1913#else
/*
 * poly1305_init (non-PIC build of the routine).
 * Stack arguments, read after the four register pushes:
 *   20(%esp) = context, 24(%esp) = key pointer (may be NULL),
 *   28(%esp) = pointer to two dwords that receive function addresses
 * Zeroes context bytes 0..23.  With a non-NULL key it also stores the
 * addresses of the block and emit routines at 0 and 4 of the third
 * argument, stores the masked first 16 key bytes at 24..36 of the
 * context, and returns 1 in %eax.  With a NULL key it returns 0.
 */
1914.text
1915.align	64
1916.globl	poly1305_init
1917.type	poly1305_init,@function
1918.align	16
1919poly1305_init:
1920.L_poly1305_init_begin:
1921	pushl	%ebp
1922	pushl	%ebx
1923	pushl	%esi
1924	pushl	%edi
1925	movl	20(%esp),%edi
1926	movl	24(%esp),%esi
1927	movl	28(%esp),%ebp
	/* Clear the accumulator words and the flag word at offset 20; %eax = 0 doubles as the NULL-key return value */
1928	xorl	%eax,%eax
1929	movl	%eax,(%edi)
1930	movl	%eax,4(%edi)
1931	movl	%eax,8(%edi)
1932	movl	%eax,12(%edi)
1933	movl	%eax,16(%edi)
1934	movl	%eax,20(%edi)
1935	cmpl	$0,%esi
1936	je	.L000nokey
	/* call/pop yields the current address, used to form the routine addresses */
1937	call	.L001pic_point
1938.L001pic_point:
1939	popl	%ebx
	/* Default: the scalar routines */
1940	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
1941	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	/* The capability vector is addressed absolutely in this (non-PIC) branch */
1942	leal	OPENSSL_ia32cap_P,%edi
	/* Bits 24 and 26 of the first word must both be set (presumably the FXSR and SSE2 CPUID bits, going by the label name) */
1943	movl	(%edi),%ecx
1944	andl	$83886080,%ecx
1945	cmpl	$83886080,%ecx
1946	jne	.L002no_sse2
1947	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
1948	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	/* Bit 5 of the third word upgrades the block routine only (presumably AVX2, going by the routine name) */
1949	movl	8(%edi),%ecx
1950	testl	$32,%ecx
1951	jz	.L002no_sse2
1952	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
1953.L002no_sse2:
	/* %edi was reused above: reload the context pointer */
1954	movl	20(%esp),%edi
	/* Publish the selected routine addresses */
1955	movl	%eax,(%ebp)
1956	movl	%edx,4(%ebp)
	/* Mask the four key words with 0x0fffffff / 0x0ffffffc and store them at 24..36 of the context */
1957	movl	(%esi),%eax
1958	movl	4(%esi),%ebx
1959	movl	8(%esi),%ecx
1960	movl	12(%esi),%edx
1961	andl	$268435455,%eax
1962	andl	$268435452,%ebx
1963	andl	$268435452,%ecx
1964	andl	$268435452,%edx
1965	movl	%eax,24(%edi)
1966	movl	%ebx,28(%edi)
1967	movl	%ecx,32(%edi)
1968	movl	%edx,36(%edi)
1969	movl	$1,%eax
1970.L000nokey:
1971	popl	%edi
1972	popl	%esi
1973	popl	%ebx
1974	popl	%ebp
1975	ret
1976.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks: scalar block routine working on 32-bit words.
 * Stack arguments, read after the four register pushes:
 *   20(%esp) = context: accumulator h0..h4 at 0..16, key words r0..r3 at 24..36
 *   24(%esp) = input pointer
 *   28(%esp) = length in bytes; rounded down to a multiple of 16 here
 *   32(%esp) = pad value, added with carry into h4 for every block
 * Saves and restores %ebp, %ebx, %esi, %edi; uses %eax, %ecx, %edx, flags.
 * .Lenter_blocks is also a jump target of the SIMD block routines, which
 * arrive with the same four registers pushed and %edi/%esi/%ecx loaded.
 *
 * Frame after `subl $64,%esp`:
 *    0,12,16(%esp)  h0, h3, h4 of the current block
 *   20..28(%esp)    low three words of the product
 *   36..48(%esp)    r0..r3
 *   52..60(%esp)    s1..s3 = r + (r >> 2)
 *   84(%esp)        context argument
 *   92(%esp)        end-of-input pointer (overwrites the length argument)
 *   96(%esp)        pad argument
 */
1977.globl	poly1305_blocks
1978.type	poly1305_blocks,@function
1979.align	16
1980poly1305_blocks:
1981.L_poly1305_blocks_begin:
1982	pushl	%ebp
1983	pushl	%ebx
1984	pushl	%esi
1985	pushl	%edi
1986	movl	20(%esp),%edi
1987	movl	24(%esp),%esi
1988	movl	28(%esp),%ecx
1989.Lenter_blocks:
	/*
	 * Round the length down to whole 16-byte blocks.  The mask has to be
	 * -16: a mask of -15 leaves bit 0 set, so an odd length would pass the
	 * zero test and yield an end pointer that the 16-byte stride of the
	 * loop below never equals.
	 */
1990	andl	$-16,%ecx
1991	jz	.L003nodata
1992	subl	$64,%esp
1993	movl	24(%edi),%eax
1994	movl	28(%edi),%ebx
	/* %ebp = end of input, parked at 92(%esp); %ebp then becomes the read cursor */
1995	leal	(%esi,%ecx,1),%ebp
1996	movl	32(%edi),%ecx
1997	movl	36(%edi),%edx
1998	movl	%ebp,92(%esp)
1999	movl	%esi,%ebp
	/* Spill r0..r3 and s1..s3 = r + (r >> 2); that equals 5*r/4 when the low two bits of r are clear, as the 0x0ffffffc key masks in poly1305_init arrange */
2000	movl	%eax,36(%esp)
2001	movl	%ebx,%eax
2002	shrl	$2,%eax
2003	movl	%ebx,40(%esp)
2004	addl	%ebx,%eax
2005	movl	%ecx,%ebx
2006	shrl	$2,%ebx
2007	movl	%ecx,44(%esp)
2008	addl	%ecx,%ebx
2009	movl	%edx,%ecx
2010	shrl	$2,%ecx
2011	movl	%edx,48(%esp)
2012	addl	%edx,%ecx
2013	movl	%eax,52(%esp)
2014	movl	%ebx,56(%esp)
2015	movl	%ecx,60(%esp)
	/* Accumulator into %eax,%ebx,%ecx,%esi,%edi (the context pointer is re-read from 84(%esp) at the end) */
2016	movl	(%edi),%eax
2017	movl	4(%edi),%ebx
2018	movl	8(%edi),%ecx
2019	movl	12(%edi),%esi
2020	movl	16(%edi),%edi
2021	jmp	.L004loop
2022.align	32
2023.L004loop:
	/* h += 16 input bytes, plus the pad argument into the fifth word */
2024	addl	(%ebp),%eax
2025	adcl	4(%ebp),%ebx
2026	adcl	8(%ebp),%ecx
2027	adcl	12(%ebp),%esi
2028	leal	16(%ebp),%ebp
2029	adcl	96(%esp),%edi
2030	movl	%eax,(%esp)
2031	movl	%esi,12(%esp)
	/* Word 0 of the product: h0*r0 + h1*s3 + h2*s2 + h3*s1 */
2032	mull	36(%esp)
2033	movl	%edi,16(%esp)
2034	movl	%eax,%edi
2035	movl	%ebx,%eax
2036	movl	%edx,%esi
2037	mull	60(%esp)
2038	addl	%eax,%edi
2039	movl	%ecx,%eax
2040	adcl	%edx,%esi
2041	mull	56(%esp)
2042	addl	%eax,%edi
2043	movl	12(%esp),%eax
2044	adcl	%edx,%esi
2045	mull	52(%esp)
2046	addl	%eax,%edi
2047	movl	(%esp),%eax
2048	adcl	%edx,%esi
	/* Word 1: h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 */
2049	mull	40(%esp)
2050	movl	%edi,20(%esp)
2051	xorl	%edi,%edi
2052	addl	%eax,%esi
2053	movl	%ebx,%eax
2054	adcl	%edx,%edi
2055	mull	36(%esp)
2056	addl	%eax,%esi
2057	movl	%ecx,%eax
2058	adcl	%edx,%edi
2059	mull	60(%esp)
2060	addl	%eax,%esi
2061	movl	12(%esp),%eax
2062	adcl	%edx,%edi
2063	mull	56(%esp)
2064	addl	%eax,%esi
2065	movl	16(%esp),%eax
2066	adcl	%edx,%edi
2067	imull	52(%esp),%eax
2068	addl	%eax,%esi
2069	movl	(%esp),%eax
2070	adcl	$0,%edi
	/* Word 2: h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2 */
2071	mull	44(%esp)
2072	movl	%esi,24(%esp)
2073	xorl	%esi,%esi
2074	addl	%eax,%edi
2075	movl	%ebx,%eax
2076	adcl	%edx,%esi
2077	mull	40(%esp)
2078	addl	%eax,%edi
2079	movl	%ecx,%eax
2080	adcl	%edx,%esi
2081	mull	36(%esp)
2082	addl	%eax,%edi
2083	movl	12(%esp),%eax
2084	adcl	%edx,%esi
2085	mull	60(%esp)
2086	addl	%eax,%edi
2087	movl	16(%esp),%eax
2088	adcl	%edx,%esi
2089	imull	56(%esp),%eax
2090	addl	%eax,%edi
2091	movl	(%esp),%eax
2092	adcl	$0,%esi
	/* Word 3: h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3; word 4: carry + h4*r0 */
2093	mull	48(%esp)
2094	movl	%edi,28(%esp)
2095	xorl	%edi,%edi
2096	addl	%eax,%esi
2097	movl	%ebx,%eax
2098	adcl	%edx,%edi
2099	mull	44(%esp)
2100	addl	%eax,%esi
2101	movl	%ecx,%eax
2102	adcl	%edx,%edi
2103	mull	40(%esp)
2104	addl	%eax,%esi
2105	movl	12(%esp),%eax
2106	adcl	%edx,%edi
2107	mull	36(%esp)
2108	addl	%eax,%esi
2109	movl	16(%esp),%ecx
2110	adcl	%edx,%edi
2111	movl	%ecx,%edx
2112	imull	60(%esp),%ecx
2113	addl	%ecx,%esi
2114	movl	20(%esp),%eax
2115	adcl	$0,%edi
2116	imull	36(%esp),%edx
2117	addl	%edi,%edx
2118	movl	24(%esp),%ebx
2119	movl	28(%esp),%ecx
	/* Partial reduction: keep the low two bits of word 4 as the new h4 and add 5 * (word4 >> 2) back into h0 with carry */
2120	movl	%edx,%edi
2121	shrl	$2,%edx
2122	andl	$3,%edi
2123	leal	(%edx,%edx,4),%edx
2124	addl	%edx,%eax
2125	adcl	$0,%ebx
2126	adcl	$0,%ecx
2127	adcl	$0,%esi
2128	adcl	$0,%edi
	/* Continue until the cursor reaches the end pointer */
2129	cmpl	92(%esp),%ebp
2130	jne	.L004loop
	/* Write the accumulator back through the context argument */
2131	movl	84(%esp),%edx
2132	addl	$64,%esp
2133	movl	%eax,(%edx)
2134	movl	%ebx,4(%edx)
2135	movl	%ecx,8(%edx)
2136	movl	%esi,12(%edx)
2137	movl	%edi,16(%edx)
2138.L003nodata:
2139	popl	%edi
2140	popl	%esi
2141	popl	%ebx
2142	popl	%ebp
2143	ret
2144.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit: scalar finalisation.
 * Stack arguments, read after the four register pushes:
 *   20(%esp) = context (accumulator words at 0..16)
 *   24(%esp) = 16-byte output buffer
 *   28(%esp) = pointer to four dwords added to the result
 * Computes h and h+5, selects h+5 when that sum reaches bit 130 and h
 * otherwise (branch-free, via a mask), adds the four dwords with carry
 * and stores the low 128 bits to the output buffer.
 */
2145.globl	poly1305_emit
2146.type	poly1305_emit,@function
2147.align	16
2148poly1305_emit:
2149.L_poly1305_emit_begin:
2150	pushl	%ebp
2151	pushl	%ebx
2152	pushl	%esi
2153	pushl	%edi
2154	movl	20(%esp),%ebp
	/* Secondary entry label; its users lie outside this part of the file (presumably an emit variant that arrives with %ebp already set - TODO confirm) */
2155.Lenter_emit:
2156	movl	24(%esp),%edi
2157	movl	(%ebp),%eax
2158	movl	4(%ebp),%ebx
2159	movl	8(%ebp),%ecx
2160	movl	12(%ebp),%edx
2161	movl	16(%ebp),%esi
	/* h + 5 across all five words */
2162	addl	$5,%eax
2163	adcl	$0,%ebx
2164	adcl	$0,%ecx
2165	adcl	$0,%edx
2166	adcl	$0,%esi
	/* %esi = -(fifth word >> 2): all ones when h+5 reached bit 130, else 0 (assumes the shifted value is 0 or 1) */
2167	shrl	$2,%esi
2168	negl	%esi
	/* (h+5) & mask is parked in the output buffer */
2169	andl	%esi,%eax
2170	andl	%esi,%ebx
2171	andl	%esi,%ecx
2172	andl	%esi,%edx
2173	movl	%eax,(%edi)
2174	movl	%ebx,4(%edi)
2175	movl	%ecx,8(%edi)
2176	movl	%edx,12(%edi)
	/* h & ~mask, merged with the parked value */
2177	notl	%esi
2178	movl	(%ebp),%eax
2179	movl	4(%ebp),%ebx
2180	movl	8(%ebp),%ecx
2181	movl	12(%ebp),%edx
	/* %ebp switches from the context to the third argument */
2182	movl	28(%esp),%ebp
2183	andl	%esi,%eax
2184	andl	%esi,%ebx
2185	andl	%esi,%ecx
2186	andl	%esi,%edx
2187	orl	(%edi),%eax
2188	orl	4(%edi),%ebx
2189	orl	8(%edi),%ecx
2190	orl	12(%edi),%edx
	/* Add the four dwords of the third argument; the final carry is dropped */
2191	addl	(%ebp),%eax
2192	adcl	4(%ebp),%ebx
2193	adcl	8(%ebp),%ecx
2194	adcl	12(%ebp),%edx
2195	movl	%eax,(%edi)
2196	movl	%ebx,4(%edi)
2197	movl	%ecx,8(%edi)
2198	movl	%edx,12(%edi)
2199	popl	%edi
2200	popl	%esi
2201	popl	%ebx
2202	popl	%ebp
2203	ret
2204.size	poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2: internal helper reached by `call` from
 * _poly1305_blocks_sse2.  It takes its inputs in registers, not on the stack.
 *   In:   %edi = context pointer; the 16 bytes at 24(%edi) are the base value
 *                (the slot poly1305_init fills with the masked key words)
 *         %ebx = address of .Lconst_sse2; 64(%ebx) supplies the limb mask
 *   Out:  144 bytes of table written to offsets 48..191 of the context,
 *         %edi restored to its entry value
 *   Uses: %ecx, %edx, %ebp (holds the caller's %esp), %xmm0-%xmm7, flags
 */
2205.align	32
2206.type	_poly1305_init_sse2,@function
2207.align	16
2208_poly1305_init_sse2:
	/* xmm4 = 128-bit base value; %edi is advanced to the table area */
2209	movdqu	24(%edi),%xmm4
2210	leal	48(%edi),%edi
	/* Carve a 16-byte aligned scratch frame; %ebp remembers the old %esp */
2211	movl	%esp,%ebp
2212	subl	$224,%esp
2213	andl	$-16,%esp
	/*
	 * Split the value into five limbs: the low quadwords of xmm0..xmm4
	 * receive bits 0-25, 26-51, 52-77, 78-103 and 104-127.  movq loads
	 * the 0x3ffffff mask into the low quadword only (upper half zero).
	 */
2214	movq	64(%ebx),%xmm7
2215	movdqa	%xmm4,%xmm0
2216	movdqa	%xmm4,%xmm1
2217	movdqa	%xmm4,%xmm2
2218	pand	%xmm7,%xmm0
2219	psrlq	$26,%xmm1
2220	psrldq	$6,%xmm2
2221	pand	%xmm7,%xmm1
2222	movdqa	%xmm2,%xmm3
2223	psrlq	$4,%xmm2
2224	psrlq	$30,%xmm3
2225	pand	%xmm7,%xmm2
2226	pand	%xmm7,%xmm3
2227	psrldq	$13,%xmm4
	/* %edx = second scratch area (broadcast copies); %ecx = pass counter */
2228	leal	144(%esp),%edx
2229	movl	$2,%ecx
	/*
	 * Each pass multiplies the limb vectors in xmm0..xmm4 by a copy of
	 * their own low quadword broadcast to both quadwords.
	 */
2230.L005square:
	/* Save the current limbs at 0..64(%esp) */
2231	movdqa	%xmm0,(%esp)
2232	movdqa	%xmm1,16(%esp)
2233	movdqa	%xmm2,32(%esp)
2234	movdqa	%xmm3,48(%esp)
2235	movdqa	%xmm4,64(%esp)
	/* 5*limb1..5*limb4 (x*4 + x) go to 80..128(%esp) */
2236	movdqa	%xmm1,%xmm6
2237	movdqa	%xmm2,%xmm5
2238	pslld	$2,%xmm6
2239	pslld	$2,%xmm5
2240	paddd	%xmm1,%xmm6
2241	paddd	%xmm2,%xmm5
2242	movdqa	%xmm6,80(%esp)
2243	movdqa	%xmm5,96(%esp)
2244	movdqa	%xmm3,%xmm6
2245	movdqa	%xmm4,%xmm5
2246	pslld	$2,%xmm6
2247	pslld	$2,%xmm5
2248	paddd	%xmm3,%xmm6
2249	paddd	%xmm4,%xmm5
2250	movdqa	%xmm6,112(%esp)
2251	movdqa	%xmm5,128(%esp)
	/* pshufd $68 duplicates the low quadword; copies are kept at 0..64(%edx) */
2252	pshufd	$68,%xmm0,%xmm6
2253	movdqa	%xmm1,%xmm5
2254	pshufd	$68,%xmm1,%xmm1
2255	pshufd	$68,%xmm2,%xmm2
2256	pshufd	$68,%xmm3,%xmm3
2257	pshufd	$68,%xmm4,%xmm4
2258	movdqa	%xmm6,(%edx)
2259	movdqa	%xmm1,16(%edx)
2260	movdqa	%xmm2,32(%edx)
2261	movdqa	%xmm3,48(%edx)
2262	movdqa	%xmm4,64(%edx)
	/*
	 * Limb-by-limb products accumulated as 64-bit sums in xmm0..xmm4.
	 * Terms that would land above limb 4 use the 5x copies instead,
	 * which is consistent with arithmetic modulo 2^130-5.  The two-operand
	 * SSE2 forms need the extra register copies seen here.
	 */
2263	pmuludq	%xmm0,%xmm4
2264	pmuludq	%xmm0,%xmm3
2265	pmuludq	%xmm0,%xmm2
2266	pmuludq	%xmm0,%xmm1
2267	pmuludq	%xmm6,%xmm0
2268	movdqa	%xmm5,%xmm6
2269	pmuludq	48(%edx),%xmm5
2270	movdqa	%xmm6,%xmm7
2271	pmuludq	32(%edx),%xmm6
2272	paddq	%xmm5,%xmm4
2273	movdqa	%xmm7,%xmm5
2274	pmuludq	16(%edx),%xmm7
2275	paddq	%xmm6,%xmm3
2276	movdqa	80(%esp),%xmm6
2277	pmuludq	(%edx),%xmm5
2278	paddq	%xmm7,%xmm2
2279	pmuludq	64(%edx),%xmm6
2280	movdqa	32(%esp),%xmm7
2281	paddq	%xmm5,%xmm1
2282	movdqa	%xmm7,%xmm5
2283	pmuludq	32(%edx),%xmm7
2284	paddq	%xmm6,%xmm0
2285	movdqa	%xmm5,%xmm6
2286	pmuludq	16(%edx),%xmm5
2287	paddq	%xmm7,%xmm4
2288	movdqa	96(%esp),%xmm7
2289	pmuludq	(%edx),%xmm6
2290	paddq	%xmm5,%xmm3
2291	movdqa	%xmm7,%xmm5
2292	pmuludq	64(%edx),%xmm7
2293	paddq	%xmm6,%xmm2
2294	pmuludq	48(%edx),%xmm5
2295	movdqa	48(%esp),%xmm6
2296	paddq	%xmm7,%xmm1
2297	movdqa	%xmm6,%xmm7
2298	pmuludq	16(%edx),%xmm6
2299	paddq	%xmm5,%xmm0
2300	movdqa	112(%esp),%xmm5
2301	pmuludq	(%edx),%xmm7
2302	paddq	%xmm6,%xmm4
2303	movdqa	%xmm5,%xmm6
2304	pmuludq	64(%edx),%xmm5
2305	paddq	%xmm7,%xmm3
2306	movdqa	%xmm6,%xmm7
2307	pmuludq	48(%edx),%xmm6
2308	paddq	%xmm5,%xmm2
2309	pmuludq	32(%edx),%xmm7
2310	movdqa	64(%esp),%xmm5
2311	paddq	%xmm6,%xmm1
2312	movdqa	128(%esp),%xmm6
2313	pmuludq	(%edx),%xmm5
2314	paddq	%xmm7,%xmm0
2315	movdqa	%xmm6,%xmm7
2316	pmuludq	64(%edx),%xmm6
2317	paddq	%xmm5,%xmm4
2318	movdqa	%xmm7,%xmm5
2319	pmuludq	16(%edx),%xmm7
2320	paddq	%xmm6,%xmm3
2321	movdqa	%xmm5,%xmm6
2322	pmuludq	32(%edx),%xmm5
2323	paddq	%xmm7,%xmm0
2324	pmuludq	48(%edx),%xmm6
	/* xmm7 served as a temporary: reload the full 128-bit limb mask */
2325	movdqa	64(%ebx),%xmm7
2326	paddq	%xmm5,%xmm1
2327	paddq	%xmm6,%xmm2
	/*
	 * Carry propagation: each limb is masked back to 26 bits and its
	 * overflow added to the next limb.  The carry out of limb 4 is added
	 * to limb 0 once directly and once shifted left by 2, i.e. times 5.
	 */
2328	movdqa	%xmm3,%xmm5
2329	pand	%xmm7,%xmm3
2330	psrlq	$26,%xmm5
2331	paddq	%xmm4,%xmm5
2332	movdqa	%xmm0,%xmm6
2333	pand	%xmm7,%xmm0
2334	psrlq	$26,%xmm6
2335	movdqa	%xmm5,%xmm4
2336	paddq	%xmm1,%xmm6
2337	psrlq	$26,%xmm5
2338	pand	%xmm7,%xmm4
2339	movdqa	%xmm6,%xmm1
2340	psrlq	$26,%xmm6
2341	paddd	%xmm5,%xmm0
2342	psllq	$2,%xmm5
2343	paddq	%xmm2,%xmm6
2344	paddq	%xmm0,%xmm5
2345	pand	%xmm7,%xmm1
2346	movdqa	%xmm6,%xmm2
2347	psrlq	$26,%xmm6
2348	pand	%xmm7,%xmm2
2349	paddd	%xmm3,%xmm6
2350	movdqa	%xmm5,%xmm0
2351	psrlq	$26,%xmm5
2352	movdqa	%xmm6,%xmm3
2353	psrlq	$26,%xmm6
2354	pand	%xmm7,%xmm0
2355	paddd	%xmm5,%xmm1
2356	pand	%xmm7,%xmm3
2357	paddd	%xmm6,%xmm4
2358	decl	%ecx
2359	jz	.L006square_break
	/*
	 * After the first pass: keep the new product in the low quadword and
	 * pair it with the previous limbs (low quadword of the saved copy) in
	 * the high quadword, then run the multiplication once more.
	 */
2360	punpcklqdq	(%esp),%xmm0
2361	punpcklqdq	16(%esp),%xmm1
2362	punpcklqdq	32(%esp),%xmm2
2363	punpcklqdq	48(%esp),%xmm3
2364	punpcklqdq	64(%esp),%xmm4
2365	jmp	.L005square
2366.L006square_break:
	/*
	 * Move the second-pass products into the upper dword of each quadword,
	 * OR in the second-pass operands saved at 0..64(%esp), and reorder the
	 * dwords with pshufd $141.  Following the shuffles, with r the base
	 * value, each stored limb vector holds the limbs of r^4, r^3, r^2, r
	 * in increasing address order.
	 */
2367	psllq	$32,%xmm0
2368	psllq	$32,%xmm1
2369	psllq	$32,%xmm2
2370	psllq	$32,%xmm3
2371	psllq	$32,%xmm4
2372	por	(%esp),%xmm0
2373	por	16(%esp),%xmm1
2374	por	32(%esp),%xmm2
2375	por	48(%esp),%xmm3
2376	por	64(%esp),%xmm4
2377	pshufd	$141,%xmm0,%xmm0
2378	pshufd	$141,%xmm1,%xmm1
2379	pshufd	$141,%xmm2,%xmm2
2380	pshufd	$141,%xmm3,%xmm3
2381	pshufd	$141,%xmm4,%xmm4
	/* Table entries for limbs 0..4 */
2382	movdqu	%xmm0,(%edi)
2383	movdqu	%xmm1,16(%edi)
2384	movdqu	%xmm2,32(%edi)
2385	movdqu	%xmm3,48(%edi)
2386	movdqu	%xmm4,64(%edi)
	/* Followed by 5x copies of limbs 1..4 */
2387	movdqa	%xmm1,%xmm6
2388	movdqa	%xmm2,%xmm5
2389	pslld	$2,%xmm6
2390	pslld	$2,%xmm5
2391	paddd	%xmm1,%xmm6
2392	paddd	%xmm2,%xmm5
2393	movdqu	%xmm6,80(%edi)
2394	movdqu	%xmm5,96(%edi)
2395	movdqa	%xmm3,%xmm6
2396	movdqa	%xmm4,%xmm5
2397	pslld	$2,%xmm6
2398	pslld	$2,%xmm5
2399	paddd	%xmm3,%xmm6
2400	paddd	%xmm4,%xmm5
2401	movdqu	%xmm6,112(%edi)
2402	movdqu	%xmm5,128(%edi)
	/* Drop the scratch frame and hand back the original context pointer */
2403	movl	%ebp,%esp
2404	leal	-48(%edi),%edi
2405	ret
2406.size	_poly1305_init_sse2,.-_poly1305_init_sse2
2407.align	32
2408.type	_poly1305_blocks_sse2,@function
2409.align	16
2410_poly1305_blocks_sse2:
2411	pushl	%ebp
2412	pushl	%ebx
2413	pushl	%esi
2414	pushl	%edi
2415	movl	20(%esp),%edi
2416	movl	24(%esp),%esi
2417	movl	28(%esp),%ecx
2418	movl	20(%edi),%eax
2419	andl	$-16,%ecx
2420	jz	.L007nodata
2421	cmpl	$64,%ecx
2422	jae	.L008enter_sse2
2423	testl	%eax,%eax
2424	jz	.Lenter_blocks
2425.align	16
2426.L008enter_sse2:
2427	call	.L009pic_point
2428.L009pic_point:
2429	popl	%ebx
2430	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
2431	testl	%eax,%eax
2432	jnz	.L010base2_26
2433	call	_poly1305_init_sse2
2434	movl	(%edi),%eax
2435	movl	3(%edi),%ecx
2436	movl	6(%edi),%edx
2437	movl	9(%edi),%esi
2438	movl	13(%edi),%ebp
2439	movl	$1,20(%edi)
2440	shrl	$2,%ecx
2441	andl	$67108863,%eax
2442	shrl	$4,%edx
2443	andl	$67108863,%ecx
2444	shrl	$6,%esi
2445	andl	$67108863,%edx
2446	movd	%eax,%xmm0
2447	movd	%ecx,%xmm1
2448	movd	%edx,%xmm2
2449	movd	%esi,%xmm3
2450	movd	%ebp,%xmm4
2451	movl	24(%esp),%esi
2452	movl	28(%esp),%ecx
2453	jmp	.L011base2_32
2454.align	16
2455.L010base2_26:
2456	movd	(%edi),%xmm0
2457	movd	4(%edi),%xmm1
2458	movd	8(%edi),%xmm2
2459	movd	12(%edi),%xmm3
2460	movd	16(%edi),%xmm4
2461	movdqa	64(%ebx),%xmm7
2462.L011base2_32:
2463	movl	32(%esp),%eax
2464	movl	%esp,%ebp
2465	subl	$528,%esp
2466	andl	$-16,%esp
2467	leal	48(%edi),%edi
2468	shll	$24,%eax
2469	testl	$31,%ecx
2470	jz	.L012even
2471	movdqu	(%esi),%xmm6
2472	leal	16(%esi),%esi
2473	movdqa	%xmm6,%xmm5
2474	pand	%xmm7,%xmm6
2475	paddd	%xmm6,%xmm0
2476	movdqa	%xmm5,%xmm6
2477	psrlq	$26,%xmm5
2478	psrldq	$6,%xmm6
2479	pand	%xmm7,%xmm5
2480	paddd	%xmm5,%xmm1
2481	movdqa	%xmm6,%xmm5
2482	psrlq	$4,%xmm6
2483	pand	%xmm7,%xmm6
2484	paddd	%xmm6,%xmm2
2485	movdqa	%xmm5,%xmm6
2486	psrlq	$30,%xmm5
2487	pand	%xmm7,%xmm5
2488	psrldq	$7,%xmm6
2489	paddd	%xmm5,%xmm3
2490	movd	%eax,%xmm5
2491	paddd	%xmm6,%xmm4
2492	movd	12(%edi),%xmm6
2493	paddd	%xmm5,%xmm4
2494	movdqa	%xmm0,(%esp)
2495	movdqa	%xmm1,16(%esp)
2496	movdqa	%xmm2,32(%esp)
2497	movdqa	%xmm3,48(%esp)
2498	movdqa	%xmm4,64(%esp)
2499	pmuludq	%xmm6,%xmm0
2500	pmuludq	%xmm6,%xmm1
2501	pmuludq	%xmm6,%xmm2
2502	movd	28(%edi),%xmm5
2503	pmuludq	%xmm6,%xmm3
2504	pmuludq	%xmm6,%xmm4
2505	movdqa	%xmm5,%xmm6
2506	pmuludq	48(%esp),%xmm5
2507	movdqa	%xmm6,%xmm7
2508	pmuludq	32(%esp),%xmm6
2509	paddq	%xmm5,%xmm4
2510	movdqa	%xmm7,%xmm5
2511	pmuludq	16(%esp),%xmm7
2512	paddq	%xmm6,%xmm3
2513	movd	92(%edi),%xmm6
2514	pmuludq	(%esp),%xmm5
2515	paddq	%xmm7,%xmm2
2516	pmuludq	64(%esp),%xmm6
2517	movd	44(%edi),%xmm7
2518	paddq	%xmm5,%xmm1
2519	movdqa	%xmm7,%xmm5
2520	pmuludq	32(%esp),%xmm7
2521	paddq	%xmm6,%xmm0
2522	movdqa	%xmm5,%xmm6
2523	pmuludq	16(%esp),%xmm5
2524	paddq	%xmm7,%xmm4
2525	movd	108(%edi),%xmm7
2526	pmuludq	(%esp),%xmm6
2527	paddq	%xmm5,%xmm3
2528	movdqa	%xmm7,%xmm5
2529	pmuludq	64(%esp),%xmm7
2530	paddq	%xmm6,%xmm2
2531	pmuludq	48(%esp),%xmm5
2532	movd	60(%edi),%xmm6
2533	paddq	%xmm7,%xmm1
2534	movdqa	%xmm6,%xmm7
2535	pmuludq	16(%esp),%xmm6
2536	paddq	%xmm5,%xmm0
2537	movd	124(%edi),%xmm5
2538	pmuludq	(%esp),%xmm7
2539	paddq	%xmm6,%xmm4
2540	movdqa	%xmm5,%xmm6
2541	pmuludq	64(%esp),%xmm5
2542	paddq	%xmm7,%xmm3
2543	movdqa	%xmm6,%xmm7
2544	pmuludq	48(%esp),%xmm6
2545	paddq	%xmm5,%xmm2
2546	pmuludq	32(%esp),%xmm7
2547	movd	76(%edi),%xmm5
2548	paddq	%xmm6,%xmm1
2549	movd	140(%edi),%xmm6
2550	pmuludq	(%esp),%xmm5
2551	paddq	%xmm7,%xmm0
2552	movdqa	%xmm6,%xmm7
2553	pmuludq	64(%esp),%xmm6
2554	paddq	%xmm5,%xmm4
2555	movdqa	%xmm7,%xmm5
2556	pmuludq	16(%esp),%xmm7
2557	paddq	%xmm6,%xmm3
2558	movdqa	%xmm5,%xmm6
2559	pmuludq	32(%esp),%xmm5
2560	paddq	%xmm7,%xmm0
2561	pmuludq	48(%esp),%xmm6
2562	movdqa	64(%ebx),%xmm7
2563	paddq	%xmm5,%xmm1
2564	paddq	%xmm6,%xmm2
2565	movdqa	%xmm3,%xmm5
2566	pand	%xmm7,%xmm3
2567	psrlq	$26,%xmm5
2568	paddq	%xmm4,%xmm5
2569	movdqa	%xmm0,%xmm6
2570	pand	%xmm7,%xmm0
2571	psrlq	$26,%xmm6
2572	movdqa	%xmm5,%xmm4
2573	paddq	%xmm1,%xmm6
2574	psrlq	$26,%xmm5
2575	pand	%xmm7,%xmm4
2576	movdqa	%xmm6,%xmm1
2577	psrlq	$26,%xmm6
2578	paddd	%xmm5,%xmm0
2579	psllq	$2,%xmm5
2580	paddq	%xmm2,%xmm6
2581	paddq	%xmm0,%xmm5
2582	pand	%xmm7,%xmm1
2583	movdqa	%xmm6,%xmm2
2584	psrlq	$26,%xmm6
2585	pand	%xmm7,%xmm2
2586	paddd	%xmm3,%xmm6
2587	movdqa	%xmm5,%xmm0
2588	psrlq	$26,%xmm5
2589	movdqa	%xmm6,%xmm3
2590	psrlq	$26,%xmm6
2591	pand	%xmm7,%xmm0
2592	paddd	%xmm5,%xmm1
2593	pand	%xmm7,%xmm3
2594	paddd	%xmm6,%xmm4
2595	subl	$16,%ecx
2596	jz	.L013done
2597.L012even:
2598	leal	384(%esp),%edx
2599	leal	-32(%esi),%eax
2600	subl	$64,%ecx
2601	movdqu	(%edi),%xmm5
2602	pshufd	$68,%xmm5,%xmm6
2603	cmovbl	%eax,%esi
2604	pshufd	$238,%xmm5,%xmm5
2605	movdqa	%xmm6,(%edx)
2606	leal	160(%esp),%eax
2607	movdqu	16(%edi),%xmm6
2608	movdqa	%xmm5,-144(%edx)
2609	pshufd	$68,%xmm6,%xmm5
2610	pshufd	$238,%xmm6,%xmm6
2611	movdqa	%xmm5,16(%edx)
2612	movdqu	32(%edi),%xmm5
2613	movdqa	%xmm6,-128(%edx)
2614	pshufd	$68,%xmm5,%xmm6
2615	pshufd	$238,%xmm5,%xmm5
2616	movdqa	%xmm6,32(%edx)
2617	movdqu	48(%edi),%xmm6
2618	movdqa	%xmm5,-112(%edx)
2619	pshufd	$68,%xmm6,%xmm5
2620	pshufd	$238,%xmm6,%xmm6
2621	movdqa	%xmm5,48(%edx)
2622	movdqu	64(%edi),%xmm5
2623	movdqa	%xmm6,-96(%edx)
2624	pshufd	$68,%xmm5,%xmm6
2625	pshufd	$238,%xmm5,%xmm5
2626	movdqa	%xmm6,64(%edx)
2627	movdqu	80(%edi),%xmm6
2628	movdqa	%xmm5,-80(%edx)
2629	pshufd	$68,%xmm6,%xmm5
2630	pshufd	$238,%xmm6,%xmm6
2631	movdqa	%xmm5,80(%edx)
2632	movdqu	96(%edi),%xmm5
2633	movdqa	%xmm6,-64(%edx)
2634	pshufd	$68,%xmm5,%xmm6
2635	pshufd	$238,%xmm5,%xmm5
2636	movdqa	%xmm6,96(%edx)
2637	movdqu	112(%edi),%xmm6
2638	movdqa	%xmm5,-48(%edx)
2639	pshufd	$68,%xmm6,%xmm5
2640	pshufd	$238,%xmm6,%xmm6
2641	movdqa	%xmm5,112(%edx)
2642	movdqu	128(%edi),%xmm5
2643	movdqa	%xmm6,-32(%edx)
2644	pshufd	$68,%xmm5,%xmm6
2645	pshufd	$238,%xmm5,%xmm5
2646	movdqa	%xmm6,128(%edx)
2647	movdqa	%xmm5,-16(%edx)
2648	movdqu	32(%esi),%xmm5
2649	movdqu	48(%esi),%xmm6
2650	leal	32(%esi),%esi
2651	movdqa	%xmm2,112(%esp)
2652	movdqa	%xmm3,128(%esp)
2653	movdqa	%xmm4,144(%esp)
2654	movdqa	%xmm5,%xmm2
2655	movdqa	%xmm6,%xmm3
2656	psrldq	$6,%xmm2
2657	psrldq	$6,%xmm3
2658	movdqa	%xmm5,%xmm4
2659	punpcklqdq	%xmm3,%xmm2
2660	punpckhqdq	%xmm6,%xmm4
2661	punpcklqdq	%xmm6,%xmm5
2662	movdqa	%xmm2,%xmm3
2663	psrlq	$4,%xmm2
2664	psrlq	$30,%xmm3
2665	movdqa	%xmm5,%xmm6
2666	psrlq	$40,%xmm4
2667	psrlq	$26,%xmm6
2668	pand	%xmm7,%xmm5
2669	pand	%xmm7,%xmm6
2670	pand	%xmm7,%xmm2
2671	pand	%xmm7,%xmm3
2672	por	(%ebx),%xmm4
2673	movdqa	%xmm0,80(%esp)
2674	movdqa	%xmm1,96(%esp)
2675	jbe	.L014skip_loop
2676	jmp	.L015loop
2677.align	32
2678.L015loop:
2679	movdqa	-144(%edx),%xmm7
2680	movdqa	%xmm6,16(%eax)
2681	movdqa	%xmm2,32(%eax)
2682	movdqa	%xmm3,48(%eax)
2683	movdqa	%xmm4,64(%eax)
2684	movdqa	%xmm5,%xmm1
2685	pmuludq	%xmm7,%xmm5
2686	movdqa	%xmm6,%xmm0
2687	pmuludq	%xmm7,%xmm6
2688	pmuludq	%xmm7,%xmm2
2689	pmuludq	%xmm7,%xmm3
2690	pmuludq	%xmm7,%xmm4
2691	pmuludq	-16(%edx),%xmm0
2692	movdqa	%xmm1,%xmm7
2693	pmuludq	-128(%edx),%xmm1
2694	paddq	%xmm5,%xmm0
2695	movdqa	%xmm7,%xmm5
2696	pmuludq	-112(%edx),%xmm7
2697	paddq	%xmm6,%xmm1
2698	movdqa	%xmm5,%xmm6
2699	pmuludq	-96(%edx),%xmm5
2700	paddq	%xmm7,%xmm2
2701	movdqa	16(%eax),%xmm7
2702	pmuludq	-80(%edx),%xmm6
2703	paddq	%xmm5,%xmm3
2704	movdqa	%xmm7,%xmm5
2705	pmuludq	-128(%edx),%xmm7
2706	paddq	%xmm6,%xmm4
2707	movdqa	%xmm5,%xmm6
2708	pmuludq	-112(%edx),%xmm5
2709	paddq	%xmm7,%xmm2
2710	movdqa	32(%eax),%xmm7
2711	pmuludq	-96(%edx),%xmm6
2712	paddq	%xmm5,%xmm3
2713	movdqa	%xmm7,%xmm5
2714	pmuludq	-32(%edx),%xmm7
2715	paddq	%xmm6,%xmm4
2716	movdqa	%xmm5,%xmm6
2717	pmuludq	-16(%edx),%xmm5
2718	paddq	%xmm7,%xmm0
2719	movdqa	%xmm6,%xmm7
2720	pmuludq	-128(%edx),%xmm6
2721	paddq	%xmm5,%xmm1
2722	movdqa	48(%eax),%xmm5
2723	pmuludq	-112(%edx),%xmm7
2724	paddq	%xmm6,%xmm3
2725	movdqa	%xmm5,%xmm6
2726	pmuludq	-48(%edx),%xmm5
2727	paddq	%xmm7,%xmm4
2728	movdqa	%xmm6,%xmm7
2729	pmuludq	-32(%edx),%xmm6
2730	paddq	%xmm5,%xmm0
2731	movdqa	%xmm7,%xmm5
2732	pmuludq	-16(%edx),%xmm7
2733	paddq	%xmm6,%xmm1
2734	movdqa	64(%eax),%xmm6
2735	pmuludq	-128(%edx),%xmm5
2736	paddq	%xmm7,%xmm2
2737	movdqa	%xmm6,%xmm7
2738	pmuludq	-16(%edx),%xmm6
2739	paddq	%xmm5,%xmm4
2740	movdqa	%xmm7,%xmm5
2741	pmuludq	-64(%edx),%xmm7
2742	paddq	%xmm6,%xmm3
2743	movdqa	%xmm5,%xmm6
2744	pmuludq	-48(%edx),%xmm5
2745	paddq	%xmm7,%xmm0
2746	movdqa	64(%ebx),%xmm7
2747	pmuludq	-32(%edx),%xmm6
2748	paddq	%xmm5,%xmm1
2749	paddq	%xmm6,%xmm2
2750	movdqu	-32(%esi),%xmm5
2751	movdqu	-16(%esi),%xmm6
2752	leal	32(%esi),%esi
2753	movdqa	%xmm2,32(%esp)
2754	movdqa	%xmm3,48(%esp)
2755	movdqa	%xmm4,64(%esp)
2756	movdqa	%xmm5,%xmm2
2757	movdqa	%xmm6,%xmm3
2758	psrldq	$6,%xmm2
2759	psrldq	$6,%xmm3
2760	movdqa	%xmm5,%xmm4
2761	punpcklqdq	%xmm3,%xmm2
2762	punpckhqdq	%xmm6,%xmm4
2763	punpcklqdq	%xmm6,%xmm5
2764	movdqa	%xmm2,%xmm3
2765	psrlq	$4,%xmm2
2766	psrlq	$30,%xmm3
2767	movdqa	%xmm5,%xmm6
2768	psrlq	$40,%xmm4
2769	psrlq	$26,%xmm6
2770	pand	%xmm7,%xmm5
2771	pand	%xmm7,%xmm6
2772	pand	%xmm7,%xmm2
2773	pand	%xmm7,%xmm3
2774	por	(%ebx),%xmm4
2775	leal	-32(%esi),%eax
2776	subl	$64,%ecx
2777	paddd	80(%esp),%xmm5
2778	paddd	96(%esp),%xmm6
2779	paddd	112(%esp),%xmm2
2780	paddd	128(%esp),%xmm3
2781	paddd	144(%esp),%xmm4
2782	cmovbl	%eax,%esi
2783	leal	160(%esp),%eax
2784	movdqa	(%edx),%xmm7
2785	movdqa	%xmm1,16(%esp)
2786	movdqa	%xmm6,16(%eax)
2787	movdqa	%xmm2,32(%eax)
2788	movdqa	%xmm3,48(%eax)
2789	movdqa	%xmm4,64(%eax)
2790	movdqa	%xmm5,%xmm1
2791	pmuludq	%xmm7,%xmm5
2792	paddq	%xmm0,%xmm5
2793	movdqa	%xmm6,%xmm0
2794	pmuludq	%xmm7,%xmm6
2795	pmuludq	%xmm7,%xmm2
2796	pmuludq	%xmm7,%xmm3
2797	pmuludq	%xmm7,%xmm4
2798	paddq	16(%esp),%xmm6
2799	paddq	32(%esp),%xmm2
2800	paddq	48(%esp),%xmm3
2801	paddq	64(%esp),%xmm4
2802	pmuludq	128(%edx),%xmm0
2803	movdqa	%xmm1,%xmm7
2804	pmuludq	16(%edx),%xmm1
2805	paddq	%xmm5,%xmm0
2806	movdqa	%xmm7,%xmm5
2807	pmuludq	32(%edx),%xmm7
2808	paddq	%xmm6,%xmm1
2809	movdqa	%xmm5,%xmm6
2810	pmuludq	48(%edx),%xmm5
2811	paddq	%xmm7,%xmm2
2812	movdqa	16(%eax),%xmm7
2813	pmuludq	64(%edx),%xmm6
2814	paddq	%xmm5,%xmm3
2815	movdqa	%xmm7,%xmm5
2816	pmuludq	16(%edx),%xmm7
2817	paddq	%xmm6,%xmm4
2818	movdqa	%xmm5,%xmm6
2819	pmuludq	32(%edx),%xmm5
2820	paddq	%xmm7,%xmm2
2821	movdqa	32(%eax),%xmm7
2822	pmuludq	48(%edx),%xmm6
2823	paddq	%xmm5,%xmm3
2824	movdqa	%xmm7,%xmm5
2825	pmuludq	112(%edx),%xmm7
2826	paddq	%xmm6,%xmm4
2827	movdqa	%xmm5,%xmm6
2828	pmuludq	128(%edx),%xmm5
2829	paddq	%xmm7,%xmm0
2830	movdqa	%xmm6,%xmm7
2831	pmuludq	16(%edx),%xmm6
2832	paddq	%xmm5,%xmm1
2833	movdqa	48(%eax),%xmm5
2834	pmuludq	32(%edx),%xmm7
2835	paddq	%xmm6,%xmm3
2836	movdqa	%xmm5,%xmm6
2837	pmuludq	96(%edx),%xmm5
2838	paddq	%xmm7,%xmm4
2839	movdqa	%xmm6,%xmm7
2840	pmuludq	112(%edx),%xmm6
2841	paddq	%xmm5,%xmm0
2842	movdqa	%xmm7,%xmm5
2843	pmuludq	128(%edx),%xmm7
2844	paddq	%xmm6,%xmm1
2845	movdqa	64(%eax),%xmm6
2846	pmuludq	16(%edx),%xmm5
2847	paddq	%xmm7,%xmm2
2848	movdqa	%xmm6,%xmm7
2849	pmuludq	128(%edx),%xmm6
2850	paddq	%xmm5,%xmm4
2851	movdqa	%xmm7,%xmm5
2852	pmuludq	80(%edx),%xmm7
2853	paddq	%xmm6,%xmm3
2854	movdqa	%xmm5,%xmm6
2855	pmuludq	96(%edx),%xmm5
2856	paddq	%xmm7,%xmm0
2857	movdqa	64(%ebx),%xmm7
2858	pmuludq	112(%edx),%xmm6
2859	paddq	%xmm5,%xmm1
2860	paddq	%xmm6,%xmm2
2861	movdqa	%xmm3,%xmm5
2862	pand	%xmm7,%xmm3
2863	psrlq	$26,%xmm5
2864	paddq	%xmm4,%xmm5
2865	movdqa	%xmm0,%xmm6
2866	pand	%xmm7,%xmm0
2867	psrlq	$26,%xmm6
2868	movdqa	%xmm5,%xmm4
2869	paddq	%xmm1,%xmm6
2870	psrlq	$26,%xmm5
2871	pand	%xmm7,%xmm4
2872	movdqa	%xmm6,%xmm1
2873	psrlq	$26,%xmm6
2874	paddd	%xmm5,%xmm0
2875	psllq	$2,%xmm5
2876	paddq	%xmm2,%xmm6
2877	paddq	%xmm0,%xmm5
2878	pand	%xmm7,%xmm1
2879	movdqa	%xmm6,%xmm2
2880	psrlq	$26,%xmm6
2881	pand	%xmm7,%xmm2
2882	paddd	%xmm3,%xmm6
2883	movdqa	%xmm5,%xmm0
2884	psrlq	$26,%xmm5
2885	movdqa	%xmm6,%xmm3
2886	psrlq	$26,%xmm6
2887	pand	%xmm7,%xmm0
2888	paddd	%xmm5,%xmm1
2889	pand	%xmm7,%xmm3
2890	paddd	%xmm6,%xmm4
2891	movdqu	32(%esi),%xmm5
2892	movdqu	48(%esi),%xmm6
2893	leal	32(%esi),%esi
2894	movdqa	%xmm2,112(%esp)
2895	movdqa	%xmm3,128(%esp)
2896	movdqa	%xmm4,144(%esp)
2897	movdqa	%xmm5,%xmm2
2898	movdqa	%xmm6,%xmm3
2899	psrldq	$6,%xmm2
2900	psrldq	$6,%xmm3
2901	movdqa	%xmm5,%xmm4
2902	punpcklqdq	%xmm3,%xmm2
2903	punpckhqdq	%xmm6,%xmm4
2904	punpcklqdq	%xmm6,%xmm5
2905	movdqa	%xmm2,%xmm3
2906	psrlq	$4,%xmm2
2907	psrlq	$30,%xmm3
2908	movdqa	%xmm5,%xmm6
2909	psrlq	$40,%xmm4
2910	psrlq	$26,%xmm6
2911	pand	%xmm7,%xmm5
2912	pand	%xmm7,%xmm6
2913	pand	%xmm7,%xmm2
2914	pand	%xmm7,%xmm3
2915	por	(%ebx),%xmm4
2916	movdqa	%xmm0,80(%esp)
2917	movdqa	%xmm1,96(%esp)
2918	ja	.L015loop
2919.L014skip_loop:
2920	pshufd	$16,-144(%edx),%xmm7
2921	addl	$32,%ecx
2922	jnz	.L016long_tail
2923	paddd	%xmm0,%xmm5
2924	paddd	%xmm1,%xmm6
2925	paddd	112(%esp),%xmm2
2926	paddd	128(%esp),%xmm3
2927	paddd	144(%esp),%xmm4
2928.L016long_tail:
2929	movdqa	%xmm5,(%eax)
2930	movdqa	%xmm6,16(%eax)
2931	movdqa	%xmm2,32(%eax)
2932	movdqa	%xmm3,48(%eax)
2933	movdqa	%xmm4,64(%eax)
2934	pmuludq	%xmm7,%xmm5
2935	pmuludq	%xmm7,%xmm6
2936	pmuludq	%xmm7,%xmm2
2937	movdqa	%xmm5,%xmm0
2938	pshufd	$16,-128(%edx),%xmm5
2939	pmuludq	%xmm7,%xmm3
2940	movdqa	%xmm6,%xmm1
2941	pmuludq	%xmm7,%xmm4
2942	movdqa	%xmm5,%xmm6
2943	pmuludq	48(%eax),%xmm5
2944	movdqa	%xmm6,%xmm7
2945	pmuludq	32(%eax),%xmm6
2946	paddq	%xmm5,%xmm4
2947	movdqa	%xmm7,%xmm5
2948	pmuludq	16(%eax),%xmm7
2949	paddq	%xmm6,%xmm3
2950	pshufd	$16,-64(%edx),%xmm6
2951	pmuludq	(%eax),%xmm5
2952	paddq	%xmm7,%xmm2
2953	pmuludq	64(%eax),%xmm6
2954	pshufd	$16,-112(%edx),%xmm7
2955	paddq	%xmm5,%xmm1
2956	movdqa	%xmm7,%xmm5
2957	pmuludq	32(%eax),%xmm7
2958	paddq	%xmm6,%xmm0
2959	movdqa	%xmm5,%xmm6
2960	pmuludq	16(%eax),%xmm5
2961	paddq	%xmm7,%xmm4
2962	pshufd	$16,-48(%edx),%xmm7
2963	pmuludq	(%eax),%xmm6
2964	paddq	%xmm5,%xmm3
2965	movdqa	%xmm7,%xmm5
2966	pmuludq	64(%eax),%xmm7
2967	paddq	%xmm6,%xmm2
2968	pmuludq	48(%eax),%xmm5
2969	pshufd	$16,-96(%edx),%xmm6
2970	paddq	%xmm7,%xmm1
2971	movdqa	%xmm6,%xmm7
2972	pmuludq	16(%eax),%xmm6
2973	paddq	%xmm5,%xmm0
2974	pshufd	$16,-32(%edx),%xmm5
2975	pmuludq	(%eax),%xmm7
2976	paddq	%xmm6,%xmm4
2977	movdqa	%xmm5,%xmm6
2978	pmuludq	64(%eax),%xmm5
2979	paddq	%xmm7,%xmm3
2980	movdqa	%xmm6,%xmm7
2981	pmuludq	48(%eax),%xmm6
2982	paddq	%xmm5,%xmm2
2983	pmuludq	32(%eax),%xmm7
2984	pshufd	$16,-80(%edx),%xmm5
2985	paddq	%xmm6,%xmm1
2986	pshufd	$16,-16(%edx),%xmm6
2987	pmuludq	(%eax),%xmm5
2988	paddq	%xmm7,%xmm0
2989	movdqa	%xmm6,%xmm7
2990	pmuludq	64(%eax),%xmm6
2991	paddq	%xmm5,%xmm4
2992	movdqa	%xmm7,%xmm5
2993	pmuludq	16(%eax),%xmm7
2994	paddq	%xmm6,%xmm3
2995	movdqa	%xmm5,%xmm6
2996	pmuludq	32(%eax),%xmm5
2997	paddq	%xmm7,%xmm0
2998	pmuludq	48(%eax),%xmm6
2999	movdqa	64(%ebx),%xmm7
3000	paddq	%xmm5,%xmm1
3001	paddq	%xmm6,%xmm2
3002	jz	.L017short_tail
3003	movdqu	-32(%esi),%xmm5
3004	movdqu	-16(%esi),%xmm6
3005	leal	32(%esi),%esi
3006	movdqa	%xmm2,32(%esp)
3007	movdqa	%xmm3,48(%esp)
3008	movdqa	%xmm4,64(%esp)
3009	movdqa	%xmm5,%xmm2
3010	movdqa	%xmm6,%xmm3
3011	psrldq	$6,%xmm2
3012	psrldq	$6,%xmm3
3013	movdqa	%xmm5,%xmm4
3014	punpcklqdq	%xmm3,%xmm2
3015	punpckhqdq	%xmm6,%xmm4
3016	punpcklqdq	%xmm6,%xmm5
3017	movdqa	%xmm2,%xmm3
3018	psrlq	$4,%xmm2
3019	psrlq	$30,%xmm3
3020	movdqa	%xmm5,%xmm6
3021	psrlq	$40,%xmm4
3022	psrlq	$26,%xmm6
3023	pand	%xmm7,%xmm5
3024	pand	%xmm7,%xmm6
3025	pand	%xmm7,%xmm2
3026	pand	%xmm7,%xmm3
3027	por	(%ebx),%xmm4
3028	pshufd	$16,(%edx),%xmm7
3029	paddd	80(%esp),%xmm5
3030	paddd	96(%esp),%xmm6
3031	paddd	112(%esp),%xmm2
3032	paddd	128(%esp),%xmm3
3033	paddd	144(%esp),%xmm4
3034	movdqa	%xmm5,(%esp)
3035	pmuludq	%xmm7,%xmm5
3036	movdqa	%xmm6,16(%esp)
3037	pmuludq	%xmm7,%xmm6
3038	paddq	%xmm5,%xmm0
3039	movdqa	%xmm2,%xmm5
3040	pmuludq	%xmm7,%xmm2
3041	paddq	%xmm6,%xmm1
3042	movdqa	%xmm3,%xmm6
3043	pmuludq	%xmm7,%xmm3
3044	paddq	32(%esp),%xmm2
3045	movdqa	%xmm5,32(%esp)
3046	pshufd	$16,16(%edx),%xmm5
3047	paddq	48(%esp),%xmm3
3048	movdqa	%xmm6,48(%esp)
3049	movdqa	%xmm4,%xmm6
3050	pmuludq	%xmm7,%xmm4
3051	paddq	64(%esp),%xmm4
3052	movdqa	%xmm6,64(%esp)
3053	movdqa	%xmm5,%xmm6
3054	pmuludq	48(%esp),%xmm5
3055	movdqa	%xmm6,%xmm7
3056	pmuludq	32(%esp),%xmm6
3057	paddq	%xmm5,%xmm4
3058	movdqa	%xmm7,%xmm5
3059	pmuludq	16(%esp),%xmm7
3060	paddq	%xmm6,%xmm3
3061	pshufd	$16,80(%edx),%xmm6
3062	pmuludq	(%esp),%xmm5
3063	paddq	%xmm7,%xmm2
3064	pmuludq	64(%esp),%xmm6
3065	pshufd	$16,32(%edx),%xmm7
3066	paddq	%xmm5,%xmm1
3067	movdqa	%xmm7,%xmm5
3068	pmuludq	32(%esp),%xmm7
3069	paddq	%xmm6,%xmm0
3070	movdqa	%xmm5,%xmm6
3071	pmuludq	16(%esp),%xmm5
3072	paddq	%xmm7,%xmm4
3073	pshufd	$16,96(%edx),%xmm7
3074	pmuludq	(%esp),%xmm6
3075	paddq	%xmm5,%xmm3
3076	movdqa	%xmm7,%xmm5
3077	pmuludq	64(%esp),%xmm7
3078	paddq	%xmm6,%xmm2
3079	pmuludq	48(%esp),%xmm5
3080	pshufd	$16,48(%edx),%xmm6
3081	paddq	%xmm7,%xmm1
3082	movdqa	%xmm6,%xmm7
3083	pmuludq	16(%esp),%xmm6
3084	paddq	%xmm5,%xmm0
3085	pshufd	$16,112(%edx),%xmm5
3086	pmuludq	(%esp),%xmm7
3087	paddq	%xmm6,%xmm4
3088	movdqa	%xmm5,%xmm6
3089	pmuludq	64(%esp),%xmm5
3090	paddq	%xmm7,%xmm3
3091	movdqa	%xmm6,%xmm7
3092	pmuludq	48(%esp),%xmm6
3093	paddq	%xmm5,%xmm2
3094	pmuludq	32(%esp),%xmm7
3095	pshufd	$16,64(%edx),%xmm5
3096	paddq	%xmm6,%xmm1
3097	pshufd	$16,128(%edx),%xmm6
3098	pmuludq	(%esp),%xmm5
3099	paddq	%xmm7,%xmm0
3100	movdqa	%xmm6,%xmm7
3101	pmuludq	64(%esp),%xmm6
3102	paddq	%xmm5,%xmm4
3103	movdqa	%xmm7,%xmm5
3104	pmuludq	16(%esp),%xmm7
3105	paddq	%xmm6,%xmm3
3106	movdqa	%xmm5,%xmm6
3107	pmuludq	32(%esp),%xmm5
3108	paddq	%xmm7,%xmm0
3109	pmuludq	48(%esp),%xmm6
3110	movdqa	64(%ebx),%xmm7
3111	paddq	%xmm5,%xmm1
3112	paddq	%xmm6,%xmm2
3113.L017short_tail:
3114	pshufd	$78,%xmm4,%xmm6
3115	pshufd	$78,%xmm3,%xmm5
3116	paddq	%xmm6,%xmm4
3117	paddq	%xmm5,%xmm3
3118	pshufd	$78,%xmm0,%xmm6
3119	pshufd	$78,%xmm1,%xmm5
3120	paddq	%xmm6,%xmm0
3121	paddq	%xmm5,%xmm1
3122	pshufd	$78,%xmm2,%xmm6
3123	movdqa	%xmm3,%xmm5
3124	pand	%xmm7,%xmm3
3125	psrlq	$26,%xmm5
3126	paddq	%xmm6,%xmm2
3127	paddq	%xmm4,%xmm5
3128	movdqa	%xmm0,%xmm6
3129	pand	%xmm7,%xmm0
3130	psrlq	$26,%xmm6
3131	movdqa	%xmm5,%xmm4
3132	paddq	%xmm1,%xmm6
3133	psrlq	$26,%xmm5
3134	pand	%xmm7,%xmm4
3135	movdqa	%xmm6,%xmm1
3136	psrlq	$26,%xmm6
3137	paddd	%xmm5,%xmm0
3138	psllq	$2,%xmm5
3139	paddq	%xmm2,%xmm6
3140	paddq	%xmm0,%xmm5
3141	pand	%xmm7,%xmm1
3142	movdqa	%xmm6,%xmm2
3143	psrlq	$26,%xmm6
3144	pand	%xmm7,%xmm2
3145	paddd	%xmm3,%xmm6
3146	movdqa	%xmm5,%xmm0
3147	psrlq	$26,%xmm5
3148	movdqa	%xmm6,%xmm3
3149	psrlq	$26,%xmm6
3150	pand	%xmm7,%xmm0
3151	paddd	%xmm5,%xmm1
3152	pand	%xmm7,%xmm3
3153	paddd	%xmm6,%xmm4
3154.L013done:
3155	movd	%xmm0,-48(%edi)
3156	movd	%xmm1,-44(%edi)
3157	movd	%xmm2,-40(%edi)
3158	movd	%xmm3,-36(%edi)
3159	movd	%xmm4,-32(%edi)
3160	movl	%ebp,%esp
3161.L007nodata:
3162	popl	%edi
3163	popl	%esi
3164	popl	%ebx
3165	popl	%ebp
3166	ret
3167.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2 -- final tag computation for the vectorised code path.
 *
 * Arguments are taken from the stack (after the four register pushes):
 *   20(%esp)  pointer to the hash state (five 32-bit words at offsets 0..16,
 *             plus a flag word at offset 20)
 *   24(%esp)  pointer to a 16-byte output buffer
 *   28(%esp)  pointer to 16 bytes that are added to the result
 *             (presumably the nonce / "s" half of the key -- confirm in caller)
 *
 * If the flag word at offset 20 is zero the routine jumps to .Lenter_emit,
 * a label defined outside this block (presumably the scalar emit body --
 * confirm).  Otherwise the five words are treated as 26-bit limbs, packed
 * into 4x32 bits plus overflow, reduced modulo 2^130-5 and the sum with the
 * third argument is written to the output buffer.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; uses %eax, %ecx, %edx,
 * %xmm0-%xmm3 as scratch.
 */
3168.align	32
3169.type	_poly1305_emit_sse2,@function
3170.align	16
3171_poly1305_emit_sse2:
3172	pushl	%ebp
3173	pushl	%ebx
3174	pushl	%esi
3175	pushl	%edi
3176	movl	20(%esp),%ebp
/* flag word == 0: take the path at .Lenter_emit instead */
3177	cmpl	$0,20(%ebp)
3178	je	.Lenter_emit
/* load the five limbs h0..h4 into eax, edi, ecx, edx, esi */
3179	movl	(%ebp),%eax
3180	movl	4(%ebp),%edi
3181	movl	8(%ebp),%ecx
3182	movl	12(%ebp),%edx
3183	movl	16(%ebp),%esi
/*
 * Pack the limbs (26 bits apart) into 32-bit words eax:ebx:ecx:edx with the
 * bits above 2^128 left in esi.  Each limb is split with a shl/shr pair
 * (26/6, 20/12, 14/18, 8/24) and the carry of every add is folded into the
 * next word; the interleaved movl instructions do not alter the carry flag.
 */
3184	movl	%edi,%ebx
3185	shll	$26,%edi
3186	shrl	$6,%ebx
3187	addl	%edi,%eax
3188	movl	%ecx,%edi
3189	adcl	$0,%ebx
3190	shll	$20,%edi
3191	shrl	$12,%ecx
3192	addl	%edi,%ebx
3193	movl	%edx,%edi
3194	adcl	$0,%ecx
3195	shll	$14,%edi
3196	shrl	$18,%edx
3197	addl	%edi,%ecx
3198	movl	%esi,%edi
3199	adcl	$0,%edx
3200	shll	$8,%edi
3201	shrl	$24,%esi
3202	addl	%edi,%edx
3203	adcl	$0,%esi
/*
 * Partial reduction: the bits of esi above bit 1 (i.e. at 2^130 and up) are
 * multiplied by 5 (lea computes x + 4x) and added back into the low word.
 * The two remaining argument pointers are fetched in between.
 */
3204	movl	%esi,%edi
3205	andl	$3,%esi
3206	shrl	$2,%edi
3207	leal	(%edi,%edi,4),%ebp
3208	movl	24(%esp),%edi
3209	addl	%ebp,%eax
3210	movl	28(%esp),%ebp
3211	adcl	$0,%ebx
3212	adcl	$0,%ecx
3213	adcl	$0,%edx
3214	adcl	$0,%esi
/*
 * Keep a copy of h in xmm0..xmm3 while computing h + 5 in the integer
 * registers; the carry that reaches bit 2 of esi tells whether h + 5
 * overflowed 2^130.
 */
3215	movd	%eax,%xmm0
3216	addl	$5,%eax
3217	movd	%ebx,%xmm1
3218	adcl	$0,%ebx
3219	movd	%ecx,%xmm2
3220	adcl	$0,%ecx
3221	movd	%edx,%xmm3
3222	adcl	$0,%edx
3223	adcl	$0,%esi
/* esi becomes an all-ones mask when bit 2 was set, otherwise zero */
3224	shrl	$2,%esi
3225	negl	%esi
/*
 * Branch-free select: (h + 5) & mask is parked in the output buffer, the
 * saved h is reloaded from xmm0..xmm3, masked with ~mask and OR-ed with the
 * parked value.
 */
3226	andl	%esi,%eax
3227	andl	%esi,%ebx
3228	andl	%esi,%ecx
3229	andl	%esi,%edx
3230	movl	%eax,(%edi)
3231	movd	%xmm0,%eax
3232	movl	%ebx,4(%edi)
3233	movd	%xmm1,%ebx
3234	movl	%ecx,8(%edi)
3235	movd	%xmm2,%ecx
3236	movl	%edx,12(%edi)
3237	movd	%xmm3,%edx
3238	notl	%esi
3239	andl	%esi,%eax
3240	andl	%esi,%ebx
3241	orl	(%edi),%eax
3242	andl	%esi,%ecx
3243	orl	4(%edi),%ebx
3244	andl	%esi,%edx
3245	orl	8(%edi),%ecx
3246	orl	12(%edi),%edx
/*
 * Add the 16 bytes at the third argument with carry propagation (the final
 * carry out is discarded) and store the 16-byte result.
 */
3247	addl	(%ebp),%eax
3248	adcl	4(%ebp),%ebx
3249	movl	%eax,(%edi)
3250	adcl	8(%ebp),%ecx
3251	movl	%ebx,4(%edi)
3252	adcl	12(%ebp),%edx
3253	movl	%ecx,8(%edi)
3254	movl	%edx,12(%edi)
3255	popl	%edi
3256	popl	%esi
3257	popl	%ebx
3258	popl	%ebp
3259	ret
3260.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 -- internal helper, NOT a C-callable function.
 *
 * Register interface (as used by the call site in _poly1305_blocks_avx2):
 *   in   %edi  pointer to the Poly1305 state; 16 bytes are read from offset 24
 *              (poly1305_init stores the masked key words at 24..36)
 *   in   %ebx  address of .Lconst_sse2; the 26-bit mask is read from 64(%ebx)
 *   out  nine 16-byte vectors written at state offsets 48..176
 *        (five limb vectors followed by four "limb * 5" vectors)
 *   %edi is advanced by 48 internally and restored before returning.
 *   Clobbers %ecx, %edx, %ebp (used to save %esp), %xmm0-%xmm7 and flags.
 *
 * A 224-byte, 16-byte-aligned scratch frame is carved from the stack:
 *   0..64(%esp)    current multiplier limbs 0..4
 *   80..128(%esp)  limbs 1..4 multiplied by 5
 *   (%edx)=144(%esp).. copies of the limbs with the low quadword duplicated
 *
 * The multiply/reduce body at .L018square runs twice (ecx = 2).
 * NOTE(review): following the lane arithmetic, the table written at the end
 * appears to hold the key powers r^4, r^3, r^2, r per limb -- confirm against
 * the generator script before relying on this.
 */
3261.align	32
3262.type	_poly1305_init_avx2,@function
3263.align	16
3264_poly1305_init_avx2:
3265	vmovdqu	24(%edi),%xmm4
3266	leal	48(%edi),%edi
3267	movl	%esp,%ebp
3268	subl	$224,%esp
3269	andl	$-16,%esp
/*
 * Split the 128-bit value into five 26-bit limbs (xmm0..xmm4), each in the
 * low part of the register: bit offsets 0, 26, 52 (6 bytes + 4), 78
 * (6 bytes + 30) and 104 (13 bytes).  xmm7 = 26-bit mask.
 */
3270	vmovdqa	64(%ebx),%xmm7
3271	vpand	%xmm7,%xmm4,%xmm0
3272	vpsrlq	$26,%xmm4,%xmm1
3273	vpsrldq	$6,%xmm4,%xmm3
3274	vpand	%xmm7,%xmm1,%xmm1
3275	vpsrlq	$4,%xmm3,%xmm2
3276	vpsrlq	$30,%xmm3,%xmm3
3277	vpand	%xmm7,%xmm2,%xmm2
3278	vpand	%xmm7,%xmm3,%xmm3
3279	vpsrldq	$13,%xmm4,%xmm4
3280	leal	144(%esp),%edx
3281	movl	$2,%ecx
3282.L018square:
/* save the current limbs */
3283	vmovdqa	%xmm0,(%esp)
3284	vmovdqa	%xmm1,16(%esp)
3285	vmovdqa	%xmm2,32(%esp)
3286	vmovdqa	%xmm3,48(%esp)
3287	vmovdqa	%xmm4,64(%esp)
/* limbs 1..4 times 5, computed as (x << 2) + x */
3288	vpslld	$2,%xmm1,%xmm6
3289	vpslld	$2,%xmm2,%xmm5
3290	vpaddd	%xmm1,%xmm6,%xmm6
3291	vpaddd	%xmm2,%xmm5,%xmm5
3292	vmovdqa	%xmm6,80(%esp)
3293	vmovdqa	%xmm5,96(%esp)
3294	vpslld	$2,%xmm3,%xmm6
3295	vpslld	$2,%xmm4,%xmm5
3296	vpaddd	%xmm3,%xmm6,%xmm6
3297	vpaddd	%xmm4,%xmm5,%xmm5
3298	vmovdqa	%xmm6,112(%esp)
3299	vmovdqa	%xmm5,128(%esp)
/*
 * Duplicate the low quadword of every limb into both halves (selector 68)
 * and store the copies at (%edx); xmm6 keeps the un-duplicated limb 1.
 */
3300	vpshufd	$68,%xmm0,%xmm5
3301	vmovdqa	%xmm1,%xmm6
3302	vpshufd	$68,%xmm1,%xmm1
3303	vpshufd	$68,%xmm2,%xmm2
3304	vpshufd	$68,%xmm3,%xmm3
3305	vpshufd	$68,%xmm4,%xmm4
3306	vmovdqa	%xmm5,(%edx)
3307	vmovdqa	%xmm1,16(%edx)
3308	vmovdqa	%xmm2,32(%edx)
3309	vmovdqa	%xmm3,48(%edx)
3310	vmovdqa	%xmm4,64(%edx)
/*
 * Schoolbook 5x5 limb multiplication; partial products are accumulated as
 * 64-bit sums in xmm0..xmm4.  Products that would land above limb 4 use the
 * pre-multiplied "* 5" values from 80..128(%esp).
 */
3311	vpmuludq	%xmm0,%xmm4,%xmm4
3312	vpmuludq	%xmm0,%xmm3,%xmm3
3313	vpmuludq	%xmm0,%xmm2,%xmm2
3314	vpmuludq	%xmm0,%xmm1,%xmm1
3315	vpmuludq	%xmm0,%xmm5,%xmm0
3316	vpmuludq	48(%edx),%xmm6,%xmm5
3317	vpaddq	%xmm5,%xmm4,%xmm4
3318	vpmuludq	32(%edx),%xmm6,%xmm7
3319	vpaddq	%xmm7,%xmm3,%xmm3
3320	vpmuludq	16(%edx),%xmm6,%xmm5
3321	vpaddq	%xmm5,%xmm2,%xmm2
3322	vmovdqa	80(%esp),%xmm7
3323	vpmuludq	(%edx),%xmm6,%xmm6
3324	vpaddq	%xmm6,%xmm1,%xmm1
3325	vmovdqa	32(%esp),%xmm5
3326	vpmuludq	64(%edx),%xmm7,%xmm7
3327	vpaddq	%xmm7,%xmm0,%xmm0
3328	vpmuludq	32(%edx),%xmm5,%xmm6
3329	vpaddq	%xmm6,%xmm4,%xmm4
3330	vpmuludq	16(%edx),%xmm5,%xmm7
3331	vpaddq	%xmm7,%xmm3,%xmm3
3332	vmovdqa	96(%esp),%xmm6
3333	vpmuludq	(%edx),%xmm5,%xmm5
3334	vpaddq	%xmm5,%xmm2,%xmm2
3335	vpmuludq	64(%edx),%xmm6,%xmm7
3336	vpaddq	%xmm7,%xmm1,%xmm1
3337	vmovdqa	48(%esp),%xmm5
3338	vpmuludq	48(%edx),%xmm6,%xmm6
3339	vpaddq	%xmm6,%xmm0,%xmm0
3340	vpmuludq	16(%edx),%xmm5,%xmm7
3341	vpaddq	%xmm7,%xmm4,%xmm4
3342	vmovdqa	112(%esp),%xmm6
3343	vpmuludq	(%edx),%xmm5,%xmm5
3344	vpaddq	%xmm5,%xmm3,%xmm3
3345	vpmuludq	64(%edx),%xmm6,%xmm7
3346	vpaddq	%xmm7,%xmm2,%xmm2
3347	vpmuludq	48(%edx),%xmm6,%xmm5
3348	vpaddq	%xmm5,%xmm1,%xmm1
3349	vmovdqa	64(%esp),%xmm7
3350	vpmuludq	32(%edx),%xmm6,%xmm6
3351	vpaddq	%xmm6,%xmm0,%xmm0
3352	vmovdqa	128(%esp),%xmm5
3353	vpmuludq	(%edx),%xmm7,%xmm7
3354	vpaddq	%xmm7,%xmm4,%xmm4
3355	vpmuludq	64(%edx),%xmm5,%xmm6
3356	vpaddq	%xmm6,%xmm3,%xmm3
3357	vpmuludq	16(%edx),%xmm5,%xmm7
3358	vpaddq	%xmm7,%xmm0,%xmm0
3359	vpmuludq	32(%edx),%xmm5,%xmm6
3360	vpaddq	%xmm6,%xmm1,%xmm1
/* reload the 26-bit mask (xmm7 was used as scratch above) */
3361	vmovdqa	64(%ebx),%xmm7
3362	vpmuludq	48(%edx),%xmm5,%xmm5
3363	vpaddq	%xmm5,%xmm2,%xmm2
/*
 * Carry propagation back to 26-bit limbs.  Two chains are interleaved
 * (3 -> 4 and 0 -> 1 -> 2 -> 3 -> 4); the carry out of limb 4 re-enters
 * limb 0 multiplied by 5 (added once, then once more shifted left by 2).
 */
3364	vpsrlq	$26,%xmm3,%xmm5
3365	vpand	%xmm7,%xmm3,%xmm3
3366	vpsrlq	$26,%xmm0,%xmm6
3367	vpand	%xmm7,%xmm0,%xmm0
3368	vpaddq	%xmm5,%xmm4,%xmm4
3369	vpaddq	%xmm6,%xmm1,%xmm1
3370	vpsrlq	$26,%xmm4,%xmm5
3371	vpand	%xmm7,%xmm4,%xmm4
3372	vpsrlq	$26,%xmm1,%xmm6
3373	vpand	%xmm7,%xmm1,%xmm1
3374	vpaddq	%xmm6,%xmm2,%xmm2
3375	vpaddd	%xmm5,%xmm0,%xmm0
3376	vpsllq	$2,%xmm5,%xmm5
3377	vpsrlq	$26,%xmm2,%xmm6
3378	vpand	%xmm7,%xmm2,%xmm2
3379	vpaddd	%xmm5,%xmm0,%xmm0
3380	vpaddd	%xmm6,%xmm3,%xmm3
3381	vpsrlq	$26,%xmm3,%xmm6
3382	vpsrlq	$26,%xmm0,%xmm5
3383	vpand	%xmm7,%xmm0,%xmm0
3384	vpand	%xmm7,%xmm3,%xmm3
3385	vpaddd	%xmm5,%xmm1,%xmm1
3386	vpaddd	%xmm6,%xmm4,%xmm4
3387	decl	%ecx
3388	jz	.L019square_break
/*
 * First pass only: place the new result in the low quadword and the
 * previously saved limbs in the high quadword, then run the body again.
 */
3389	vpunpcklqdq	(%esp),%xmm0,%xmm0
3390	vpunpcklqdq	16(%esp),%xmm1,%xmm1
3391	vpunpcklqdq	32(%esp),%xmm2,%xmm2
3392	vpunpcklqdq	48(%esp),%xmm3,%xmm3
3393	vpunpcklqdq	64(%esp),%xmm4,%xmm4
3394	jmp	.L018square
3395.L019square_break:
/*
 * Merge the second-pass results (moved to the odd dwords) with the limbs
 * saved on the stack (even dwords), then reorder the four dwords with
 * selector 141, i.e. output dwords = input dwords 1, 3, 0, 2.
 */
3396	vpsllq	$32,%xmm0,%xmm0
3397	vpsllq	$32,%xmm1,%xmm1
3398	vpsllq	$32,%xmm2,%xmm2
3399	vpsllq	$32,%xmm3,%xmm3
3400	vpsllq	$32,%xmm4,%xmm4
3401	vpor	(%esp),%xmm0,%xmm0
3402	vpor	16(%esp),%xmm1,%xmm1
3403	vpor	32(%esp),%xmm2,%xmm2
3404	vpor	48(%esp),%xmm3,%xmm3
3405	vpor	64(%esp),%xmm4,%xmm4
3406	vpshufd	$141,%xmm0,%xmm0
3407	vpshufd	$141,%xmm1,%xmm1
3408	vpshufd	$141,%xmm2,%xmm2
3409	vpshufd	$141,%xmm3,%xmm3
3410	vpshufd	$141,%xmm4,%xmm4
/* store the five limb vectors (edi already points 48 bytes into the state) */
3411	vmovdqu	%xmm0,(%edi)
3412	vmovdqu	%xmm1,16(%edi)
3413	vmovdqu	%xmm2,32(%edi)
3414	vmovdqu	%xmm3,48(%edi)
3415	vmovdqu	%xmm4,64(%edi)
/* followed by limbs 1..4 multiplied by 5 */
3416	vpslld	$2,%xmm1,%xmm6
3417	vpslld	$2,%xmm2,%xmm5
3418	vpaddd	%xmm1,%xmm6,%xmm6
3419	vpaddd	%xmm2,%xmm5,%xmm5
3420	vmovdqu	%xmm6,80(%edi)
3421	vmovdqu	%xmm5,96(%edi)
3422	vpslld	$2,%xmm3,%xmm6
3423	vpslld	$2,%xmm4,%xmm5
3424	vpaddd	%xmm3,%xmm6,%xmm6
3425	vpaddd	%xmm4,%xmm5,%xmm5
3426	vmovdqu	%xmm6,112(%edi)
3427	vmovdqu	%xmm5,128(%edi)
/* drop the scratch frame and undo the +48 adjustment of edi */
3428	movl	%ebp,%esp
3429	leal	-48(%edi),%edi
3430	ret
3431.size	_poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2 -- absorb input blocks using 256-bit AVX2 arithmetic.
 *
 * Stack arguments (offsets valid after the four register pushes):
 *   20(%esp)  pointer to the Poly1305 state
 *   24(%esp)  pointer to the input data
 *   28(%esp)  input length in bytes (rounded down to a multiple of 16 here)
 *   32(%esp)  fourth argument; it is negated and used to pick the pad-bit
 *             vector for a single trailing block (presumably the 0/1
 *             "padbit" flag -- confirm against the C caller)
 *
 * State fields touched: words 0..16 hold the hash, word 20 is a flag that is
 * set to 1 once the hash has been converted to 26-bit limbs, and the key
 * power table produced by _poly1305_init_avx2 lives at offsets 48..191.
 *
 * Control flow summary:
 *   - length == 0                        -> return
 *   - length < 64 and flag == 0          -> jump to .Lenter_blocks (scalar code
 *                                           inside poly1305_blocks)
 *   - otherwise run the AVX2 path: optional one-time init + conversion, then
 *     the leading (length mod 64) bytes via .L027tail, then 64 bytes per
 *     iteration via .L024even/.L028loop, finishing in .L027tail.
 *
 * Saves/restores %ebp, %ebx, %esi, %edi; %ebp doubles as the saved %esp while
 * the 512-byte-aligned scratch frame is active.  vzeroupper is executed on
 * entry to and on exit from the AVX2 path.
 */
3432.align	32
3433.type	_poly1305_blocks_avx2,@function
3434.align	16
3435_poly1305_blocks_avx2:
3436	pushl	%ebp
3437	pushl	%ebx
3438	pushl	%esi
3439	pushl	%edi
3440	movl	20(%esp),%edi
3441	movl	24(%esp),%esi
3442	movl	28(%esp),%ecx
/* eax = state flag word; ecx = length with the low four bits cleared */
3443	movl	20(%edi),%eax
3444	andl	$-16,%ecx
3445	jz	.L020nodata
/* short input while the state is still unconverted: use the scalar code */
3446	cmpl	$64,%ecx
3447	jae	.L021enter_avx2
3448	testl	%eax,%eax
3449	jz	.Lenter_blocks
3450.L021enter_avx2:
3451	vzeroupper
/* position-independent way of obtaining the address of .Lconst_sse2 */
3452	call	.L022pic_point
3453.L022pic_point:
3454	popl	%ebx
3455	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
3456	testl	%eax,%eax
3457	jnz	.L023base2_26
/* first use: build the key power table (expects edi = state, ebx = consts) */
3458	call	_poly1305_init_avx2
/*
 * Convert the hash words at 0..16 into five 26-bit limbs using overlapping
 * unaligned loads at byte offsets 0, 3, 6, 9 and 13 followed by shifts of
 * 0, 2, 4, 6 and 0 bits; the first three are masked with 2^26-1, the last
 * two are stored as shifted/loaded.
 */
3459	movl	(%edi),%eax
3460	movl	3(%edi),%ecx
3461	movl	6(%edi),%edx
3462	movl	9(%edi),%esi
3463	movl	13(%edi),%ebp
3464	shrl	$2,%ecx
3465	andl	$67108863,%eax
3466	shrl	$4,%edx
3467	andl	$67108863,%ecx
3468	shrl	$6,%esi
3469	andl	$67108863,%edx
3470	movl	%eax,(%edi)
3471	movl	%ecx,4(%edi)
3472	movl	%edx,8(%edi)
3473	movl	%esi,12(%edi)
3474	movl	%ebp,16(%edi)
/* mark the state as converted */
3475	movl	$1,20(%edi)
/*
 * esi and ecx were used as scratch above; reload them from the stack.
 * NOTE(review): ecx is reloaded without re-applying the "& -16" mask used on
 * entry; harmless as long as callers only pass multiples of 16 -- confirm.
 */
3476	movl	24(%esp),%esi
3477	movl	28(%esp),%ecx
3478.L023base2_26:
/* eax = fourth argument; set up the scratch frame, edx = frame + 288 */
3479	movl	32(%esp),%eax
3480	movl	%esp,%ebp
3481	subl	$448,%esp
3482	andl	$-512,%esp
/*
 * Load the nine 16-byte table entries from state offsets 48..176, widen each
 * to 256 bits with vpermq/vpshufd and store them as 32-byte rows at
 * -128(%edx) .. 128(%edx).  edi is advanced by 48 on the way, so the hash
 * words are addressed as -48..-32(%edi) from here on.
 */
3483	vmovdqu	48(%edi),%xmm0
3484	leal	288(%esp),%edx
3485	vmovdqu	64(%edi),%xmm1
3486	vmovdqu	80(%edi),%xmm2
3487	vmovdqu	96(%edi),%xmm3
3488	vmovdqu	112(%edi),%xmm4
3489	leal	48(%edi),%edi
3490	vpermq	$64,%ymm0,%ymm0
3491	vpermq	$64,%ymm1,%ymm1
3492	vpermq	$64,%ymm2,%ymm2
3493	vpermq	$64,%ymm3,%ymm3
3494	vpermq	$64,%ymm4,%ymm4
3495	vpshufd	$200,%ymm0,%ymm0
3496	vpshufd	$200,%ymm1,%ymm1
3497	vpshufd	$200,%ymm2,%ymm2
3498	vpshufd	$200,%ymm3,%ymm3
3499	vpshufd	$200,%ymm4,%ymm4
3500	vmovdqa	%ymm0,-128(%edx)
3501	vmovdqu	80(%edi),%xmm0
3502	vmovdqa	%ymm1,-96(%edx)
3503	vmovdqu	96(%edi),%xmm1
3504	vmovdqa	%ymm2,-64(%edx)
3505	vmovdqu	112(%edi),%xmm2
3506	vmovdqa	%ymm3,-32(%edx)
3507	vmovdqu	128(%edi),%xmm3
3508	vmovdqa	%ymm4,(%edx)
3509	vpermq	$64,%ymm0,%ymm0
3510	vpermq	$64,%ymm1,%ymm1
3511	vpermq	$64,%ymm2,%ymm2
3512	vpermq	$64,%ymm3,%ymm3
3513	vpshufd	$200,%ymm0,%ymm0
3514	vpshufd	$200,%ymm1,%ymm1
3515	vpshufd	$200,%ymm2,%ymm2
3516	vpshufd	$200,%ymm3,%ymm3
/* finish storing the table while loading the hash limbs into xmm0..xmm4 */
3517	vmovdqa	%ymm0,32(%edx)
3518	vmovd	-48(%edi),%xmm0
3519	vmovdqa	%ymm1,64(%edx)
3520	vmovd	-44(%edi),%xmm1
3521	vmovdqa	%ymm2,96(%edx)
3522	vmovd	-40(%edi),%xmm2
3523	vmovdqa	%ymm3,128(%edx)
3524	vmovd	-36(%edi),%xmm3
3525	vmovd	-32(%edi),%xmm4
/* ymm7 = 26-bit limb mask; eax = -(fourth argument) */
3526	vmovdqa	64(%ebx),%ymm7
3527	negl	%eax
/* length a multiple of 64: go straight to the 4-block path */
3528	testl	$63,%ecx
3529	jz	.L024even
/*
 * Otherwise handle the leading (length mod 64) bytes first: edx = remainder,
 * ecx = remaining multiple of 64.  Depending on whether one, two or three
 * blocks are present, ebx is offset into the constant table (so that the
 * pad-bit vector read at (%ebx) covers only the populated lanes) and edx is
 * offset into the power table by 24, 16 or 8 bytes.
 */
3530	movl	%ecx,%edx
3531	andl	$-64,%ecx
3532	andl	$63,%edx
3533	vmovdqu	(%esi),%xmm5
3534	cmpl	$32,%edx
3535	jb	.L025one
3536	vmovdqu	16(%esi),%xmm6
3537	je	.L026two
/* three blocks */
3538	vinserti128	$1,32(%esi),%ymm5,%ymm5
3539	leal	48(%esi),%esi
3540	leal	8(%ebx),%ebx
3541	leal	296(%esp),%edx
3542	jmp	.L027tail
3543.L026two:
3544	leal	32(%esi),%esi
3545	leal	16(%ebx),%ebx
3546	leal	304(%esp),%edx
3547	jmp	.L027tail
3548.L025one:
/*
 * one block: second input vector is zero; ebx = consts + 32 + 8 * eax, where
 * eax is the negated fourth argument
 */
3549	leal	16(%esi),%esi
3550	vpxor	%ymm6,%ymm6,%ymm6
3551	leal	32(%ebx,%eax,8),%ebx
3552	leal	312(%esp),%edx
3553	jmp	.L027tail
3554.align	32
3555.L024even:
/*
 * Load 64 bytes: ymm5 = blocks 0 and 2, ymm6 = blocks 1 and 3 (low/high
 * 128-bit halves).  If these are the last 64 bytes go to the tail.
 */
3556	vmovdqu	(%esi),%xmm5
3557	vmovdqu	16(%esi),%xmm6
3558	vinserti128	$1,32(%esi),%ymm5,%ymm5
3559	vinserti128	$1,48(%esi),%ymm6,%ymm6
3560	leal	64(%esi),%esi
3561	subl	$64,%ecx
3562	jz	.L027tail
3563.L028loop:
/*
 * Main loop.  On entry ymm0..ymm4 hold the accumulated hash limbs and
 * ymm5/ymm6 the raw input.  Limbs 0..2 of the hash are spilled to
 * 0/32/64(%esp) while the input is split into 26-bit limbs:
 *   ymm5 = limb 0, ymm6 = limb 1, ymm2 = limb 2, ymm0 = limb 3, ymm1 = limb 4.
 */
3564	vmovdqa	%ymm2,64(%esp)
3565	vpsrldq	$6,%ymm5,%ymm2
3566	vmovdqa	%ymm0,(%esp)
3567	vpsrldq	$6,%ymm6,%ymm0
3568	vmovdqa	%ymm1,32(%esp)
3569	vpunpckhqdq	%ymm6,%ymm5,%ymm1
3570	vpunpcklqdq	%ymm6,%ymm5,%ymm5
3571	vpunpcklqdq	%ymm0,%ymm2,%ymm2
3572	vpsrlq	$30,%ymm2,%ymm0
3573	vpsrlq	$4,%ymm2,%ymm2
3574	vpsrlq	$26,%ymm5,%ymm6
3575	vpsrlq	$40,%ymm1,%ymm1
3576	vpand	%ymm7,%ymm2,%ymm2
3577	vpand	%ymm7,%ymm5,%ymm5
3578	vpand	%ymm7,%ymm6,%ymm6
3579	vpand	%ymm7,%ymm0,%ymm0
/* OR the constant at (%ebx) (value 1<<24 per quadword) into limb 4 */
3580	vpor	(%ebx),%ymm1,%ymm1
/* add the accumulated hash limbs to the input limbs */
3581	vpaddq	64(%esp),%ymm2,%ymm2
3582	vpaddq	(%esp),%ymm5,%ymm5
3583	vpaddq	32(%esp),%ymm6,%ymm6
3584	vpaddq	%ymm3,%ymm0,%ymm0
3585	vpaddq	%ymm4,%ymm1,%ymm1
/*
 * Multiply by the key power table rows at -128..128(%edx).  The 64-bit
 * partial sums are built in ymm0..ymm4; limbs 1, 3 and 4 of the multiplicand
 * are parked at 32/96/128(%esp) and reloaded when needed.
 */
3586	vpmuludq	-96(%edx),%ymm2,%ymm3
3587	vmovdqa	%ymm6,32(%esp)
3588	vpmuludq	-64(%edx),%ymm2,%ymm4
3589	vmovdqa	%ymm0,96(%esp)
3590	vpmuludq	96(%edx),%ymm2,%ymm0
3591	vmovdqa	%ymm1,128(%esp)
3592	vpmuludq	128(%edx),%ymm2,%ymm1
3593	vpmuludq	-128(%edx),%ymm2,%ymm2
3594	vpmuludq	-32(%edx),%ymm5,%ymm7
3595	vpaddq	%ymm7,%ymm3,%ymm3
3596	vpmuludq	(%edx),%ymm5,%ymm6
3597	vpaddq	%ymm6,%ymm4,%ymm4
3598	vpmuludq	-128(%edx),%ymm5,%ymm7
3599	vpaddq	%ymm7,%ymm0,%ymm0
3600	vmovdqa	32(%esp),%ymm7
3601	vpmuludq	-96(%edx),%ymm5,%ymm6
3602	vpaddq	%ymm6,%ymm1,%ymm1
3603	vpmuludq	-64(%edx),%ymm5,%ymm5
3604	vpaddq	%ymm5,%ymm2,%ymm2
3605	vpmuludq	-64(%edx),%ymm7,%ymm6
3606	vpaddq	%ymm6,%ymm3,%ymm3
3607	vpmuludq	-32(%edx),%ymm7,%ymm5
3608	vpaddq	%ymm5,%ymm4,%ymm4
3609	vpmuludq	128(%edx),%ymm7,%ymm6
3610	vpaddq	%ymm6,%ymm0,%ymm0
3611	vmovdqa	96(%esp),%ymm6
3612	vpmuludq	-128(%edx),%ymm7,%ymm5
3613	vpaddq	%ymm5,%ymm1,%ymm1
3614	vpmuludq	-96(%edx),%ymm7,%ymm7
3615	vpaddq	%ymm7,%ymm2,%ymm2
3616	vpmuludq	-128(%edx),%ymm6,%ymm5
3617	vpaddq	%ymm5,%ymm3,%ymm3
3618	vpmuludq	-96(%edx),%ymm6,%ymm7
3619	vpaddq	%ymm7,%ymm4,%ymm4
3620	vpmuludq	64(%edx),%ymm6,%ymm5
3621	vpaddq	%ymm5,%ymm0,%ymm0
3622	vmovdqa	128(%esp),%ymm5
3623	vpmuludq	96(%edx),%ymm6,%ymm7
3624	vpaddq	%ymm7,%ymm1,%ymm1
3625	vpmuludq	128(%edx),%ymm6,%ymm6
3626	vpaddq	%ymm6,%ymm2,%ymm2
3627	vpmuludq	128(%edx),%ymm5,%ymm7
3628	vpaddq	%ymm7,%ymm3,%ymm3
3629	vpmuludq	32(%edx),%ymm5,%ymm6
3630	vpaddq	%ymm6,%ymm0,%ymm0
3631	vpmuludq	-128(%edx),%ymm5,%ymm7
3632	vpaddq	%ymm7,%ymm4,%ymm4
/* restore the 26-bit mask in ymm7 */
3633	vmovdqa	64(%ebx),%ymm7
3634	vpmuludq	64(%edx),%ymm5,%ymm6
3635	vpaddq	%ymm6,%ymm1,%ymm1
3636	vpmuludq	96(%edx),%ymm5,%ymm5
3637	vpaddq	%ymm5,%ymm2,%ymm2
/*
 * Carry propagation back to 26-bit limbs; the carry out of limb 4 is added
 * to limb 0 five times (once directly, once shifted left by 2).
 */
3638	vpsrlq	$26,%ymm3,%ymm5
3639	vpand	%ymm7,%ymm3,%ymm3
3640	vpsrlq	$26,%ymm0,%ymm6
3641	vpand	%ymm7,%ymm0,%ymm0
3642	vpaddq	%ymm5,%ymm4,%ymm4
3643	vpaddq	%ymm6,%ymm1,%ymm1
3644	vpsrlq	$26,%ymm4,%ymm5
3645	vpand	%ymm7,%ymm4,%ymm4
3646	vpsrlq	$26,%ymm1,%ymm6
3647	vpand	%ymm7,%ymm1,%ymm1
3648	vpaddq	%ymm6,%ymm2,%ymm2
3649	vpaddq	%ymm5,%ymm0,%ymm0
3650	vpsllq	$2,%ymm5,%ymm5
3651	vpsrlq	$26,%ymm2,%ymm6
3652	vpand	%ymm7,%ymm2,%ymm2
3653	vpaddq	%ymm5,%ymm0,%ymm0
3654	vpaddq	%ymm6,%ymm3,%ymm3
3655	vpsrlq	$26,%ymm3,%ymm6
3656	vpsrlq	$26,%ymm0,%ymm5
3657	vpand	%ymm7,%ymm0,%ymm0
3658	vpand	%ymm7,%ymm3,%ymm3
3659	vpaddq	%ymm5,%ymm1,%ymm1
3660	vpaddq	%ymm6,%ymm4,%ymm4
/* fetch the next 64 bytes and loop while more than these remain */
3661	vmovdqu	(%esi),%xmm5
3662	vmovdqu	16(%esi),%xmm6
3663	vinserti128	$1,32(%esi),%ymm5,%ymm5
3664	vinserti128	$1,48(%esi),%ymm6,%ymm6
3665	leal	64(%esi),%esi
3666	subl	$64,%ecx
3667	jnz	.L028loop
3668.L027tail:
/*
 * Tail: same limb split, hash addition and multiplication as the loop body,
 * but the table rows are read 4 bytes further on (-124, -92, ... instead of
 * -128, -96, ...), and the lanes are summed horizontally afterwards.
 */
3669	vmovdqa	%ymm2,64(%esp)
3670	vpsrldq	$6,%ymm5,%ymm2
3671	vmovdqa	%ymm0,(%esp)
3672	vpsrldq	$6,%ymm6,%ymm0
3673	vmovdqa	%ymm1,32(%esp)
3674	vpunpckhqdq	%ymm6,%ymm5,%ymm1
3675	vpunpcklqdq	%ymm6,%ymm5,%ymm5
3676	vpunpcklqdq	%ymm0,%ymm2,%ymm2
3677	vpsrlq	$30,%ymm2,%ymm0
3678	vpsrlq	$4,%ymm2,%ymm2
3679	vpsrlq	$26,%ymm5,%ymm6
3680	vpsrlq	$40,%ymm1,%ymm1
3681	vpand	%ymm7,%ymm2,%ymm2
3682	vpand	%ymm7,%ymm5,%ymm5
3683	vpand	%ymm7,%ymm6,%ymm6
3684	vpand	%ymm7,%ymm0,%ymm0
3685	vpor	(%ebx),%ymm1,%ymm1
/*
 * undo any offset applied to ebx for a partial group: the constant table is
 * 64-byte aligned, so clearing the low six bits restores its base address
 */
3686	andl	$-64,%ebx
3687	vpaddq	64(%esp),%ymm2,%ymm2
3688	vpaddq	(%esp),%ymm5,%ymm5
3689	vpaddq	32(%esp),%ymm6,%ymm6
3690	vpaddq	%ymm3,%ymm0,%ymm0
3691	vpaddq	%ymm4,%ymm1,%ymm1
3692	vpmuludq	-92(%edx),%ymm2,%ymm3
3693	vmovdqa	%ymm6,32(%esp)
3694	vpmuludq	-60(%edx),%ymm2,%ymm4
3695	vmovdqa	%ymm0,96(%esp)
3696	vpmuludq	100(%edx),%ymm2,%ymm0
3697	vmovdqa	%ymm1,128(%esp)
3698	vpmuludq	132(%edx),%ymm2,%ymm1
3699	vpmuludq	-124(%edx),%ymm2,%ymm2
3700	vpmuludq	-28(%edx),%ymm5,%ymm7
3701	vpaddq	%ymm7,%ymm3,%ymm3
3702	vpmuludq	4(%edx),%ymm5,%ymm6
3703	vpaddq	%ymm6,%ymm4,%ymm4
3704	vpmuludq	-124(%edx),%ymm5,%ymm7
3705	vpaddq	%ymm7,%ymm0,%ymm0
3706	vmovdqa	32(%esp),%ymm7
3707	vpmuludq	-92(%edx),%ymm5,%ymm6
3708	vpaddq	%ymm6,%ymm1,%ymm1
3709	vpmuludq	-60(%edx),%ymm5,%ymm5
3710	vpaddq	%ymm5,%ymm2,%ymm2
3711	vpmuludq	-60(%edx),%ymm7,%ymm6
3712	vpaddq	%ymm6,%ymm3,%ymm3
3713	vpmuludq	-28(%edx),%ymm7,%ymm5
3714	vpaddq	%ymm5,%ymm4,%ymm4
3715	vpmuludq	132(%edx),%ymm7,%ymm6
3716	vpaddq	%ymm6,%ymm0,%ymm0
3717	vmovdqa	96(%esp),%ymm6
3718	vpmuludq	-124(%edx),%ymm7,%ymm5
3719	vpaddq	%ymm5,%ymm1,%ymm1
3720	vpmuludq	-92(%edx),%ymm7,%ymm7
3721	vpaddq	%ymm7,%ymm2,%ymm2
3722	vpmuludq	-124(%edx),%ymm6,%ymm5
3723	vpaddq	%ymm5,%ymm3,%ymm3
3724	vpmuludq	-92(%edx),%ymm6,%ymm7
3725	vpaddq	%ymm7,%ymm4,%ymm4
3726	vpmuludq	68(%edx),%ymm6,%ymm5
3727	vpaddq	%ymm5,%ymm0,%ymm0
3728	vmovdqa	128(%esp),%ymm5
3729	vpmuludq	100(%edx),%ymm6,%ymm7
3730	vpaddq	%ymm7,%ymm1,%ymm1
3731	vpmuludq	132(%edx),%ymm6,%ymm6
3732	vpaddq	%ymm6,%ymm2,%ymm2
3733	vpmuludq	132(%edx),%ymm5,%ymm7
3734	vpaddq	%ymm7,%ymm3,%ymm3
3735	vpmuludq	36(%edx),%ymm5,%ymm6
3736	vpaddq	%ymm6,%ymm0,%ymm0
3737	vpmuludq	-124(%edx),%ymm5,%ymm7
3738	vpaddq	%ymm7,%ymm4,%ymm4
3739	vmovdqa	64(%ebx),%ymm7
3740	vpmuludq	68(%edx),%ymm5,%ymm6
3741	vpaddq	%ymm6,%ymm1,%ymm1
3742	vpmuludq	100(%edx),%ymm5,%ymm5
3743	vpaddq	%ymm5,%ymm2,%ymm2
/*
 * Horizontal sum: first add the upper quadword of each 128-bit half to the
 * lower one (byte shift by 8), then add the upper half to the lower half
 * (vpermq selector 2 brings quadword 2 down to position 0).
 */
3744	vpsrldq	$8,%ymm4,%ymm5
3745	vpsrldq	$8,%ymm3,%ymm6
3746	vpaddq	%ymm5,%ymm4,%ymm4
3747	vpsrldq	$8,%ymm0,%ymm5
3748	vpaddq	%ymm6,%ymm3,%ymm3
3749	vpsrldq	$8,%ymm1,%ymm6
3750	vpaddq	%ymm5,%ymm0,%ymm0
3751	vpsrldq	$8,%ymm2,%ymm5
3752	vpaddq	%ymm6,%ymm1,%ymm1
3753	vpermq	$2,%ymm4,%ymm6
3754	vpaddq	%ymm5,%ymm2,%ymm2
3755	vpermq	$2,%ymm3,%ymm5
3756	vpaddq	%ymm6,%ymm4,%ymm4
3757	vpermq	$2,%ymm0,%ymm6
3758	vpaddq	%ymm5,%ymm3,%ymm3
3759	vpermq	$2,%ymm1,%ymm5
3760	vpaddq	%ymm6,%ymm0,%ymm0
3761	vpermq	$2,%ymm2,%ymm6
3762	vpaddq	%ymm5,%ymm1,%ymm1
3763	vpaddq	%ymm6,%ymm2,%ymm2
/* carry propagation, identical in shape to the one in the main loop */
3764	vpsrlq	$26,%ymm3,%ymm5
3765	vpand	%ymm7,%ymm3,%ymm3
3766	vpsrlq	$26,%ymm0,%ymm6
3767	vpand	%ymm7,%ymm0,%ymm0
3768	vpaddq	%ymm5,%ymm4,%ymm4
3769	vpaddq	%ymm6,%ymm1,%ymm1
3770	vpsrlq	$26,%ymm4,%ymm5
3771	vpand	%ymm7,%ymm4,%ymm4
3772	vpsrlq	$26,%ymm1,%ymm6
3773	vpand	%ymm7,%ymm1,%ymm1
3774	vpaddq	%ymm6,%ymm2,%ymm2
3775	vpaddq	%ymm5,%ymm0,%ymm0
3776	vpsllq	$2,%ymm5,%ymm5
3777	vpsrlq	$26,%ymm2,%ymm6
3778	vpand	%ymm7,%ymm2,%ymm2
3779	vpaddq	%ymm5,%ymm0,%ymm0
3780	vpaddq	%ymm6,%ymm3,%ymm3
3781	vpsrlq	$26,%ymm3,%ymm6
3782	vpsrlq	$26,%ymm0,%ymm5
3783	vpand	%ymm7,%ymm0,%ymm0
3784	vpand	%ymm7,%ymm3,%ymm3
3785	vpaddq	%ymm5,%ymm1,%ymm1
3786	vpaddq	%ymm6,%ymm4,%ymm4
/*
 * ecx != 0 means the tail above only consumed the leading partial group and
 * whole 64-byte groups are still pending: rearrange the low-lane dwords
 * (selector 252; the xmm form also clears the upper 128 bits), reset edx to
 * the unshifted table base and continue with the 4-block path.
 */
3787	cmpl	$0,%ecx
3788	je	.L029done
3789	vpshufd	$252,%xmm0,%xmm0
3790	leal	288(%esp),%edx
3791	vpshufd	$252,%xmm1,%xmm1
3792	vpshufd	$252,%xmm2,%xmm2
3793	vpshufd	$252,%xmm3,%xmm3
3794	vpshufd	$252,%xmm4,%xmm4
3795	jmp	.L024even
3796.align	16
3797.L029done:
/* write the five hash limbs back to state words 0..16 (edi is state + 48) */
3798	vmovd	%xmm0,-48(%edi)
3799	vmovd	%xmm1,-44(%edi)
3800	vmovd	%xmm2,-40(%edi)
3801	vmovd	%xmm3,-36(%edi)
3802	vmovd	%xmm4,-32(%edi)
3803	vzeroupper
/* release the aligned scratch frame */
3804	movl	%ebp,%esp
3805.L020nodata:
3806	popl	%edi
3807	popl	%esi
3808	popl	%ebx
3809	popl	%ebp
3810	ret
3811.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * .Lconst_sse2 -- 64-byte-aligned constant table shared by the SSE2 and AVX2
 * code; the vector routines keep its address in %ebx.
 *
 *   offset  0: 1<<24 in the low dword of each quadword; OR-ed into the top
 *              limb of every input block via "(%ebx)"
 *   offset 32: all zeros; the AVX2 tail code offsets %ebx by 8/16/24/32 so
 *              that a 32-byte read at (%ebx) picks up zeros for unused lanes
 *   offset 64: 2^26-1 in the low dword of each quadword; the limb mask
 *              loaded from "64(%ebx)"
 *   offset 96: 0x0fffffff, 0x0ffffffc (x3) -- the same values poly1305_init
 *              uses as immediates to mask the key words; not referenced by
 *              the code visible around this table (TODO confirm users)
 */
3812.align	64
3813.Lconst_sse2:
3814.long	16777216,0,16777216,0,16777216,0,16777216,0
3815.long	0,0,0,0,0,0,0,0
3816.long	67108863,0,67108863,0,67108863,0,67108863,0
3817.long	268435455,268435452,268435452,268435452
/* NUL-terminated ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" */
3818.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
3819.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
3820.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
3821.byte	114,103,62,0
3822.align	4
/*
 * Common symbol (16 bytes, 4-byte alignment); poly1305_init reads words at
 * offsets 0 and 8 of it to choose between the scalar, SSE2 and AVX2 routines.
 */
3823.comm	OPENSSL_ia32cap_P,16,4
3824#endif
3825