/* Source: freebsd/sys/crypto/openssl/i386/poly1305-x86.S (revision 5e3190f700637fcfc1a52daeaa4a031fdd2557c7) */
/* Do not modify. This file is auto-generated from poly1305-x86.pl. */
#ifdef PIC
.text
.align	64
/*
 * poly1305_init
 *
 * i386 stack-argument entry point (arguments are read from 20/24/28(%esp)
 * after the four register pushes):
 *   arg0  context block; bytes 0..23 are cleared, bytes 24..39 receive
 *         the clamped key words
 *   arg1  pointer to 16 key bytes, or NULL
 *   arg2  two-entry table that receives the addresses of the "blocks" and
 *         "emit" routines selected for this CPU
 *
 * Returns (in %eax) 0 when arg1 is NULL (only the context is cleared and
 * arg2 is left untouched), 1 otherwise.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx.
 */
.globl	poly1305_init
.type	poly1305_init,@function
.align	16
poly1305_init:
.L_poly1305_init_begin:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi			/* arg0: context */
	movl	24(%esp),%esi			/* arg1: key (may be NULL) */
	movl	28(%esp),%ebp			/* arg2: function table */
	xorl	%eax,%eax			/* also the return value on the no-key path */
	movl	%eax,(%edi)
	movl	%eax,4(%edi)
	movl	%eax,8(%edi)
	movl	%eax,12(%edi)
	movl	%eax,16(%edi)
	movl	%eax,20(%edi)
	cmpl	$0,%esi
	je	.L000nokey
	/* PIC: obtain our own address so symbols can be addressed relative to it */
	call	.L001pic_point
.L001pic_point:
	popl	%ebx
	/* default: the scalar implementations */
	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
	leal	poly1305_emit-.L001pic_point(%ebx),%edx
	leal	OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi
	movl	(%edi),%ecx
	/* 83886080 = 0x05000000: bits 24 and 26 of capability word 0 must both be set */
	andl	$83886080,%ecx
	cmpl	$83886080,%ecx
	jne	.L002no_sse2
	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
	movl	8(%edi),%ecx
	/* bit 5 of capability word 2 upgrades only the "blocks" routine */
	testl	$32,%ecx
	jz	.L002no_sse2
	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
.L002no_sse2:
	movl	20(%esp),%edi			/* %edi was reused above; reload the context */
	movl	%eax,(%ebp)			/* table[0] = blocks routine */
	movl	%edx,4(%ebp)			/* table[1] = emit routine */
	movl	(%esi),%eax
	movl	4(%esi),%ebx
	movl	8(%esi),%ecx
	movl	12(%esi),%edx
	/* clamp the key words: 0x0fffffff for word 0, 0x0ffffffc for words 1..3 */
	andl	$268435455,%eax
	andl	$268435452,%ebx
	andl	$268435452,%ecx
	andl	$268435452,%edx
	movl	%eax,24(%edi)
	movl	%ebx,28(%edi)
	movl	%ecx,32(%edi)
	movl	%edx,36(%edi)
	movl	$1,%eax
.L000nokey:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks
 *
 * Scalar (base 2^32) block function.  Stack arguments:
 *   arg0  context: accumulator words h0..h4 at offsets 0..16, clamped key
 *         words r0..r3 at offsets 24..36
 *   arg1  input pointer
 *   arg2  input length in bytes; rounded DOWN to a multiple of 16 here
 *   arg3  value added (with carry) into h4 for every block
 *
 * For each 16-byte block: h += block (+ arg3 in the fifth word), then
 * h = h * r with the part above bit 130 folded back in multiplied by 5.
 *
 * .Lenter_blocks is also entered from _poly1305_blocks_sse2 with
 * %edi/%esi/%ecx already loaded and the same four registers pushed.
 *
 * Frame after "subl $64,%esp":
 *    0(%esp) h0      12(%esp) h3      16(%esp) h4
 *   20..28(%esp)     low three result words d0..d2
 *   36..48(%esp)     r0..r3
 *   52..60(%esp)     s1..s3 where s = r + (r >> 2)
 *   84(%esp)         arg0 (context)
 *   92(%esp)         end-of-input pointer (overwrites the arg2 slot)
 *   96(%esp)         arg3
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx.
 */
.globl	poly1305_blocks
.type	poly1305_blocks,@function
.align	16
poly1305_blocks:
.L_poly1305_blocks_begin:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
.Lenter_blocks:
	/*
	 * Round the length down to whole 16-byte blocks.  The mask must clear
	 * bit 0 as well (-16, not -15): the loop below advances in steps of 16
	 * and terminates on pointer equality, so an odd residual length would
	 * never match the end pointer.  The same change should be made in the
	 * poly1305-x86.pl generator.
	 */
	andl	$-16,%ecx
	jz	.L003nodata
	subl	$64,%esp
	movl	24(%edi),%eax			/* r0 */
	movl	28(%edi),%ebx			/* r1 */
	leal	(%esi,%ecx,1),%ebp		/* end = inp + len */
	movl	32(%edi),%ecx			/* r2 */
	movl	36(%edi),%edx			/* r3 */
	movl	%ebp,92(%esp)
	movl	%esi,%ebp			/* %ebp = running input pointer */
	movl	%eax,36(%esp)
	movl	%ebx,%eax
	shrl	$2,%eax
	movl	%ebx,40(%esp)
	addl	%ebx,%eax			/* s1 = r1 + (r1 >> 2) */
	movl	%ecx,%ebx
	shrl	$2,%ebx
	movl	%ecx,44(%esp)
	addl	%ecx,%ebx			/* s2 = r2 + (r2 >> 2) */
	movl	%edx,%ecx
	shrl	$2,%ecx
	movl	%edx,48(%esp)
	addl	%edx,%ecx			/* s3 = r3 + (r3 >> 2) */
	movl	%eax,52(%esp)
	movl	%ebx,56(%esp)
	movl	%ecx,60(%esp)
	/* load h0..h4 into %eax, %ebx, %ecx, %esi, %edi */
	movl	(%edi),%eax
	movl	4(%edi),%ebx
	movl	8(%edi),%ecx
	movl	12(%edi),%esi
	movl	16(%edi),%edi
	jmp	.L004loop
.align	32
.L004loop:
	/* h += next 16 input bytes; arg3 goes into the fifth word */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%esi
	leal	16(%ebp),%ebp
	adcl	96(%esp),%edi
	movl	%eax,(%esp)
	movl	%esi,12(%esp)
	/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1 */
	mull	36(%esp)
	movl	%edi,16(%esp)
	movl	%eax,%edi
	movl	%ebx,%eax
	movl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	56(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	52(%esp)
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	%edx,%esi
	/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 */
	mull	40(%esp)
	movl	%edi,20(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	60(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	56(%esp)
	addl	%eax,%esi
	movl	16(%esp),%eax
	adcl	%edx,%edi
	imull	52(%esp),%eax
	addl	%eax,%esi
	movl	(%esp),%eax
	adcl	$0,%edi
	/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2 */
	mull	44(%esp)
	movl	%esi,24(%esp)
	xorl	%esi,%esi
	addl	%eax,%edi
	movl	%ebx,%eax
	adcl	%edx,%esi
	mull	40(%esp)
	addl	%eax,%edi
	movl	%ecx,%eax
	adcl	%edx,%esi
	mull	36(%esp)
	addl	%eax,%edi
	movl	12(%esp),%eax
	adcl	%edx,%esi
	mull	60(%esp)
	addl	%eax,%edi
	movl	16(%esp),%eax
	adcl	%edx,%esi
	imull	56(%esp),%eax
	addl	%eax,%edi
	movl	(%esp),%eax
	adcl	$0,%esi
	/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3 */
	mull	48(%esp)
	movl	%edi,28(%esp)
	xorl	%edi,%edi
	addl	%eax,%esi
	movl	%ebx,%eax
	adcl	%edx,%edi
	mull	44(%esp)
	addl	%eax,%esi
	movl	%ecx,%eax
	adcl	%edx,%edi
	mull	40(%esp)
	addl	%eax,%esi
	movl	12(%esp),%eax
	adcl	%edx,%edi
	mull	36(%esp)
	addl	%eax,%esi
	movl	16(%esp),%ecx
	adcl	%edx,%edi
	movl	%ecx,%edx
	imull	60(%esp),%ecx
	addl	%ecx,%esi
	movl	20(%esp),%eax
	adcl	$0,%edi
	/* top word = carry + h4*r0 */
	imull	36(%esp),%edx
	addl	%edi,%edx
	movl	24(%esp),%ebx
	movl	28(%esp),%ecx
	/* keep the low two bits as the new h4; fold (top >> 2) * 5 into h0 */
	movl	%edx,%edi
	shrl	$2,%edx
	andl	$3,%edi
	leal	(%edx,%edx,4),%edx
	addl	%edx,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%esi
	adcl	$0,%edi
	cmpl	92(%esp),%ebp
	jne	.L004loop
	movl	84(%esp),%edx			/* context pointer (arg0) */
	addl	$64,%esp
	movl	%eax,(%edx)
	movl	%ebx,4(%edx)
	movl	%ecx,8(%edx)
	movl	%esi,12(%edx)
	movl	%edi,16(%edx)
.L003nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit
 *
 * Scalar (base 2^32) finalisation.  Stack arguments:
 *   arg0  context: accumulator words h0..h4 at offsets 0..16
 *   arg1  16-byte output buffer
 *   arg2  pointer to four 32-bit words added to the result
 *
 * Computes h + 5; if that sum reaches bit 130 (bit 2 of the fifth word)
 * the low 128 bits of h + 5 are selected, otherwise the low 128 bits of h.
 * The selection is done with masks, not branches.  The selected value plus
 * the four words at arg2 (128-bit add, final carry discarded) is written
 * to arg1.  The output buffer is used as scratch for the candidate value
 * before the final store.
 *
 * .Lenter_emit is also entered from _poly1305_emit_sse2 with %ebp already
 * holding arg0 and the same four registers pushed.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx.
 */
.globl	poly1305_emit
.type	poly1305_emit,@function
.align	16
poly1305_emit:
.L_poly1305_emit_begin:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
.Lenter_emit:
	movl	24(%esp),%edi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* candidate = h + 5 */
	addl	$5,%eax
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = all-ones if the sum reached bit 130, else zero */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	/* inverted mask keeps the original h when the candidate was rejected */
	notl	%esi
	movl	(%ebp),%eax
	movl	4(%ebp),%ebx
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	28(%esp),%ebp			/* arg2: words to add */
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	orl	(%edi),%eax
	orl	4(%edi),%ebx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	adcl	8(%ebp),%ecx
	adcl	12(%ebp),%edx
	movl	%eax,(%edi)
	movl	%ebx,4(%edi)
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	poly1305_emit,.-.L_poly1305_emit_begin
.align	32
/*
 * _poly1305_init_sse2  (file-local helper, register calling convention)
 *
 * In:   %edi  context pointer
 *       %ebx  pointer to a constant table; the 26-bit limb mask is read
 *             from 64(%ebx)
 * Out:  %edi  unchanged (advanced by 48 internally and restored on exit)
 *       %xmm7 holds the mask loaded from 64(%ebx)
 *
 * Splits the 16 bytes at context+24 into five 26-bit limbs and multiplies
 * twice (loop counter 2) with a lazy carry reduction after each pass.  The
 * final shuffle stores, for each limb, four dwords at context+48+16*limb;
 * following the lane merges below these are the limb of r^4, r^3, r^2 and
 * r, in that order.  Rows at context+128 .. context+176 hold 5x limbs 1..4
 * (x + (x << 2)).
 *
 * Uses a private 224-byte, 16-byte-aligned stack frame; %ebp carries the
 * caller's %esp across it.  Clobbers %ebp, %ecx, %edx, %xmm0-%xmm7, flags.
 */
.type	_poly1305_init_sse2,@function
.align	16
_poly1305_init_sse2:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	movdqu	24(%edi),%xmm4
	leal	48(%edi),%edi
	movl	%esp,%ebp
	subl	$224,%esp
	andl	$-16,%esp
	movq	64(%ebx),%xmm7
	/* split the 128-bit value into limbs at bit offsets 0, 26, 52, 78, 104 */
	movdqa	%xmm4,%xmm0
	movdqa	%xmm4,%xmm1
	movdqa	%xmm4,%xmm2
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm1
	psrldq	$6,%xmm2
	pand	%xmm7,%xmm1
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	psrldq	$13,%xmm4
	leal	144(%esp),%edx
	movl	$2,%ecx
.L005square:
	/* save the current limbs at 0..64(%esp) and 5x limbs 1..4 at 80..128(%esp) */
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqa	%xmm6,80(%esp)
	movdqa	%xmm5,96(%esp)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqa	%xmm6,112(%esp)
	movdqa	%xmm5,128(%esp)
	/* broadcast the low qword of every limb: the multiplier for this pass */
	pshufd	$68,%xmm0,%xmm6
	movdqa	%xmm1,%xmm5
	pshufd	$68,%xmm1,%xmm1
	pshufd	$68,%xmm2,%xmm2
	pshufd	$68,%xmm3,%xmm3
	pshufd	$68,%xmm4,%xmm4
	movdqa	%xmm6,(%edx)
	movdqa	%xmm1,16(%edx)
	movdqa	%xmm2,32(%edx)
	movdqa	%xmm3,48(%edx)
	movdqa	%xmm4,64(%edx)
	/* 5x5 limb product; the 5x rows supply the wrapped-around terms */
	pmuludq	%xmm0,%xmm4
	pmuludq	%xmm0,%xmm3
	pmuludq	%xmm0,%xmm2
	pmuludq	%xmm0,%xmm1
	pmuludq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	80(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%edx),%xmm6
	movdqa	32(%esp),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	96(%esp),%xmm7
	pmuludq	(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%edx),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%edx),%xmm5
	movdqa	48(%esp),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	112(%esp),%xmm5
	pmuludq	(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%edx),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%edx),%xmm7
	movdqa	64(%esp),%xmm5
	paddq	%xmm6,%xmm1
	movdqa	128(%esp),%xmm6
	pmuludq	(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%edx),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* lazy reduction: push carries limb to limb; the carry out of limb 4 */
	/* re-enters limb 0 multiplied by 5 (x + (x << 2)) */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	decl	%ecx
	jz	.L006square_break
	/* second pass: low qword = fresh product, high qword = previous value */
	punpcklqdq	(%esp),%xmm0
	punpcklqdq	16(%esp),%xmm1
	punpcklqdq	32(%esp),%xmm2
	punpcklqdq	48(%esp),%xmm3
	punpcklqdq	64(%esp),%xmm4
	jmp	.L005square
.L006square_break:
	/* interleave the new products (upper dwords) with the saved inputs */
	psllq	$32,%xmm0
	psllq	$32,%xmm1
	psllq	$32,%xmm2
	psllq	$32,%xmm3
	psllq	$32,%xmm4
	por	(%esp),%xmm0
	por	16(%esp),%xmm1
	por	32(%esp),%xmm2
	por	48(%esp),%xmm3
	por	64(%esp),%xmm4
	/* 141 = 0x8d: reorder dwords to (src1, src3, src0, src2) */
	pshufd	$141,%xmm0,%xmm0
	pshufd	$141,%xmm1,%xmm1
	pshufd	$141,%xmm2,%xmm2
	pshufd	$141,%xmm3,%xmm3
	pshufd	$141,%xmm4,%xmm4
	movdqu	%xmm0,(%edi)
	movdqu	%xmm1,16(%edi)
	movdqu	%xmm2,32(%edi)
	movdqu	%xmm3,48(%edi)
	movdqu	%xmm4,64(%edi)
	/* rows 5..8: 5x limbs 1..4 */
	movdqa	%xmm1,%xmm6
	movdqa	%xmm2,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm1,%xmm6
	paddd	%xmm2,%xmm5
	movdqu	%xmm6,80(%edi)
	movdqu	%xmm5,96(%edi)
	movdqa	%xmm3,%xmm6
	movdqa	%xmm4,%xmm5
	pslld	$2,%xmm6
	pslld	$2,%xmm5
	paddd	%xmm3,%xmm6
	paddd	%xmm4,%xmm5
	movdqu	%xmm6,112(%edi)
	movdqu	%xmm5,128(%edi)
	movl	%ebp,%esp			/* drop the aligned frame */
	leal	-48(%edi),%edi			/* restore the context pointer */
	ret
.size	_poly1305_init_sse2,.-_poly1305_init_sse2
.align	32
/*
 * _poly1305_blocks_sse2
 *
 * SSE2 block function; same stack arguments as poly1305_blocks:
 *   arg0 context, arg1 input, arg2 length (rounded down to a multiple of
 *   16), arg3 pad value (shifted left by 24 before being added to the top
 *   limb of an odd leading block).
 *
 * The dword at context+20 flags whether the accumulator at context+0..16
 * is already stored as five 26-bit limbs (non-zero) or still as base 2^32
 * words (zero).
 *   - length < 64 and flag == 0: jump to the scalar code at .Lenter_blocks.
 *   - flag == 0 otherwise: call _poly1305_init_sse2 to build the power
 *     table at context+48, convert the accumulator to 26-bit limbs and set
 *     the flag to 1.
 * On exit through .L013done the accumulator is written back as five
 * 26-bit limbs.
 *
 * %ebx points at .Lconst_sse2 (defined outside this block): 64(%ebx) is
 * used as the 26-bit limb mask; (%ebx) is OR-ed into the top limb of every
 * block pair -- presumably the 2^128 pad bit, TODO confirm against the
 * constant table.
 *
 * Frame: 528 bytes, 16-byte aligned, caller's %esp kept in %ebp.
 *   %edx = %esp+384; rows (%edx)..128(%edx) and -144(%edx)..-16(%edx)
 *   hold the table rows with their low / high dword pairs duplicated
 *   (per the table layout: r^4,r^3 and r^2,r respectively).
 *   %eax = %esp+160; scratch for the current input limbs.
 *   80..144(%esp) carry the accumulator limbs between iterations.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx,
 * %xmm0-%xmm7.
 */
.type	_poly1305_blocks_sse2,@function
.align	16
_poly1305_blocks_sse2:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx
	movl	20(%edi),%eax			/* base 2^26 flag */
	andl	$-16,%ecx
	jz	.L007nodata
	cmpl	$64,%ecx
	jae	.L008enter_sse2
	testl	%eax,%eax
	jz	.Lenter_blocks			/* short input, still base 2^32: scalar path */
.align	16
.L008enter_sse2:
	call	.L009pic_point
.L009pic_point:
	popl	%ebx
	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
	testl	%eax,%eax
	jnz	.L010base2_26
	call	_poly1305_init_sse2
	/* convert the accumulator to limbs at bit offsets 0, 26, 52, 78, 104 */
	/* using unaligned dword loads at byte offsets 0, 3, 6, 9, 13 */
	movl	(%edi),%eax
	movl	3(%edi),%ecx
	movl	6(%edi),%edx
	movl	9(%edi),%esi
	movl	13(%edi),%ebp
	movl	$1,20(%edi)
	shrl	$2,%ecx
	andl	$67108863,%eax			/* 0x3ffffff */
	shrl	$4,%edx
	andl	$67108863,%ecx
	shrl	$6,%esi
	andl	$67108863,%edx
	movd	%eax,%xmm0
	movd	%ecx,%xmm1
	movd	%edx,%xmm2
	movd	%esi,%xmm3
	movd	%ebp,%xmm4
	movl	24(%esp),%esi			/* reload input pointer and length */
	movl	28(%esp),%ecx
	jmp	.L011base2_32
.align	16
.L010base2_26:
	movd	(%edi),%xmm0
	movd	4(%edi),%xmm1
	movd	8(%edi),%xmm2
	movd	12(%edi),%xmm3
	movd	16(%edi),%xmm4
	movdqa	64(%ebx),%xmm7
.L011base2_32:
	movl	32(%esp),%eax			/* arg3 */
	movl	%esp,%ebp
	subl	$528,%esp
	andl	$-16,%esp
	leal	48(%edi),%edi			/* %edi -> power table */
	shll	$24,%eax
	testl	$31,%ecx
	jz	.L012even
	/* odd number of blocks: absorb one block and multiply by the dword-3 */
	/* table entries so the remainder is a whole number of block pairs */
	movdqu	(%esi),%xmm6
	leal	16(%esi),%esi
	movdqa	%xmm6,%xmm5
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	psrlq	$26,%xmm5
	psrldq	$6,%xmm6
	pand	%xmm7,%xmm5
	paddd	%xmm5,%xmm1
	movdqa	%xmm6,%xmm5
	psrlq	$4,%xmm6
	pand	%xmm7,%xmm6
	paddd	%xmm6,%xmm2
	movdqa	%xmm5,%xmm6
	psrlq	$30,%xmm5
	pand	%xmm7,%xmm5
	psrldq	$7,%xmm6
	paddd	%xmm5,%xmm3
	movd	%eax,%xmm5
	paddd	%xmm6,%xmm4
	movd	12(%edi),%xmm6
	paddd	%xmm5,%xmm4
	movdqa	%xmm0,(%esp)
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	pmuludq	%xmm6,%xmm0
	pmuludq	%xmm6,%xmm1
	pmuludq	%xmm6,%xmm2
	movd	28(%edi),%xmm5
	pmuludq	%xmm6,%xmm3
	pmuludq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movd	92(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	movd	44(%edi),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	movd	108(%edi),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	movd	60(%edi),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	movd	124(%edi),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	movd	76(%edi),%xmm5
	paddq	%xmm6,%xmm1
	movd	140(%edi),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* lazy reduction (carry out of limb 4 re-enters limb 0 times 5) */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	subl	$16,%ecx
	jz	.L013done
.L012even:
	leal	384(%esp),%edx
	leal	-32(%esi),%eax
	subl	$64,%ecx			/* flags consumed by cmovb here and jbe below */
	/* unpack the nine table rows: low dword pair -> (%edx)+, high pair -> -144(%edx)+ */
	movdqu	(%edi),%xmm5
	pshufd	$68,%xmm5,%xmm6
	cmovbl	%eax,%esi			/* fewer than 64 bytes left: back the pointer up by 32 */
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,(%edx)
	leal	160(%esp),%eax
	movdqu	16(%edi),%xmm6
	movdqa	%xmm5,-144(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,16(%edx)
	movdqu	32(%edi),%xmm5
	movdqa	%xmm6,-128(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,32(%edx)
	movdqu	48(%edi),%xmm6
	movdqa	%xmm5,-112(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,48(%edx)
	movdqu	64(%edi),%xmm5
	movdqa	%xmm6,-96(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,64(%edx)
	movdqu	80(%edi),%xmm6
	movdqa	%xmm5,-80(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,80(%edx)
	movdqu	96(%edi),%xmm5
	movdqa	%xmm6,-64(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,96(%edx)
	movdqu	112(%edi),%xmm6
	movdqa	%xmm5,-48(%edx)
	pshufd	$68,%xmm6,%xmm5
	pshufd	$238,%xmm6,%xmm6
	movdqa	%xmm5,112(%edx)
	movdqu	128(%edi),%xmm5
	movdqa	%xmm6,-32(%edx)
	pshufd	$68,%xmm5,%xmm6
	pshufd	$238,%xmm5,%xmm5
	movdqa	%xmm6,128(%edx)
	movdqa	%xmm5,-16(%edx)
	/* load two blocks and split them into 26-bit limbs, one block per lane */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	jbe	.L014skip_loop			/* flags still from "subl $64,%ecx" (SSE/lea/mov leave them intact) */
	jmp	.L015loop
.align	32
.L015loop:
	/* main loop, 64 bytes per iteration: first multiply the block pair */
	/* loaded above by the -144(%edx) rows */
	movdqa	-144(%edx),%xmm7
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	pmuludq	-16(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	-128(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	-96(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	-80(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-128(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-112(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	-96(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	-32(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	-16(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	-128(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	-112(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	-16(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	-128(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	-16(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	-64(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	-48(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	-32(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* load the block pair 32 bytes behind the pointer and split it into limbs */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	leal	-32(%esi),%eax
	subl	$64,%ecx			/* flags consumed by cmovb below and "ja" at loop end */
	/* add the running accumulator to this block pair */
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	cmovbl	%eax,%esi
	leal	160(%esp),%eax
	/* multiply (accumulator + block pair) by the (%edx) rows and add the */
	/* partial products computed above */
	movdqa	(%edx),%xmm7
	movdqa	%xmm1,16(%esp)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	movdqa	%xmm5,%xmm1
	pmuludq	%xmm7,%xmm5
	paddq	%xmm0,%xmm5
	movdqa	%xmm6,%xmm0
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	pmuludq	%xmm7,%xmm3
	pmuludq	%xmm7,%xmm4
	paddq	16(%esp),%xmm6
	paddq	32(%esp),%xmm2
	paddq	48(%esp),%xmm3
	paddq	64(%esp),%xmm4
	pmuludq	128(%edx),%xmm0
	movdqa	%xmm1,%xmm7
	pmuludq	16(%edx),%xmm1
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	%xmm5,%xmm6
	pmuludq	48(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	16(%eax),%xmm7
	pmuludq	64(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	16(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	32(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	32(%eax),%xmm7
	pmuludq	48(%edx),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	112(%edx),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	128(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	16(%edx),%xmm6
	paddq	%xmm5,%xmm1
	movdqa	48(%eax),%xmm5
	pmuludq	32(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm4
	movdqa	%xmm6,%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm7,%xmm5
	pmuludq	128(%edx),%xmm7
	paddq	%xmm6,%xmm1
	movdqa	64(%eax),%xmm6
	pmuludq	16(%edx),%xmm5
	paddq	%xmm7,%xmm2
	movdqa	%xmm6,%xmm7
	pmuludq	128(%edx),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	80(%edx),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	96(%edx),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	64(%ebx),%xmm7
	pmuludq	112(%edx),%xmm6
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	/* lazy reduction */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
	/* load and split the block pair for the next iteration / tail */
	movdqu	32(%esi),%xmm5
	movdqu	48(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,112(%esp)
	movdqa	%xmm3,128(%esp)
	movdqa	%xmm4,144(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	movdqa	%xmm0,80(%esp)
	movdqa	%xmm1,96(%esp)
	ja	.L015loop			/* flags from the "subl $64,%ecx" above */
.L014skip_loop:
	/* tail: per-lane multipliers taken from dwords 0 and 1 of each row */
	pshufd	$16,-144(%edx),%xmm7
	addl	$32,%ecx
	jnz	.L016long_tail
	/* exactly one block pair left: fold the accumulator into it */
	paddd	%xmm0,%xmm5
	paddd	%xmm1,%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
.L016long_tail:
	movdqa	%xmm5,(%eax)
	movdqa	%xmm6,16(%eax)
	movdqa	%xmm2,32(%eax)
	movdqa	%xmm3,48(%eax)
	movdqa	%xmm4,64(%eax)
	pmuludq	%xmm7,%xmm5
	pmuludq	%xmm7,%xmm6
	pmuludq	%xmm7,%xmm2
	movdqa	%xmm5,%xmm0
	pshufd	$16,-128(%edx),%xmm5
	pmuludq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm1
	pmuludq	%xmm7,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	48(%eax),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,-64(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%eax),%xmm6
	pshufd	$16,-112(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%eax),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%eax),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,-48(%edx),%xmm7
	pmuludq	(%eax),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%eax),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%eax),%xmm5
	pshufd	$16,-96(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%eax),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,-32(%edx),%xmm5
	pmuludq	(%eax),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%eax),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%eax),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%eax),%xmm7
	pshufd	$16,-80(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,-16(%edx),%xmm6
	pmuludq	(%eax),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%eax),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%eax),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%eax),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%eax),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
	jz	.L017short_tail			/* ZF still from "addl $32,%ecx" (no flag writers since) */
	/* two block pairs were pending: process the earlier pair together */
	/* with the accumulator, using the (%edx) rows */
	movdqu	-32(%esi),%xmm5
	movdqu	-16(%esi),%xmm6
	leal	32(%esi),%esi
	movdqa	%xmm2,32(%esp)
	movdqa	%xmm3,48(%esp)
	movdqa	%xmm4,64(%esp)
	movdqa	%xmm5,%xmm2
	movdqa	%xmm6,%xmm3
	psrldq	$6,%xmm2
	psrldq	$6,%xmm3
	movdqa	%xmm5,%xmm4
	punpcklqdq	%xmm3,%xmm2
	punpckhqdq	%xmm6,%xmm4
	punpcklqdq	%xmm6,%xmm5
	movdqa	%xmm2,%xmm3
	psrlq	$4,%xmm2
	psrlq	$30,%xmm3
	movdqa	%xmm5,%xmm6
	psrlq	$40,%xmm4
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm5
	pand	%xmm7,%xmm6
	pand	%xmm7,%xmm2
	pand	%xmm7,%xmm3
	por	(%ebx),%xmm4
	pshufd	$16,(%edx),%xmm7
	paddd	80(%esp),%xmm5
	paddd	96(%esp),%xmm6
	paddd	112(%esp),%xmm2
	paddd	128(%esp),%xmm3
	paddd	144(%esp),%xmm4
	movdqa	%xmm5,(%esp)
	pmuludq	%xmm7,%xmm5
	movdqa	%xmm6,16(%esp)
	pmuludq	%xmm7,%xmm6
	paddq	%xmm5,%xmm0
	movdqa	%xmm2,%xmm5
	pmuludq	%xmm7,%xmm2
	paddq	%xmm6,%xmm1
	movdqa	%xmm3,%xmm6
	pmuludq	%xmm7,%xmm3
	paddq	32(%esp),%xmm2
	movdqa	%xmm5,32(%esp)
	pshufd	$16,16(%edx),%xmm5
	paddq	48(%esp),%xmm3
	movdqa	%xmm6,48(%esp)
	movdqa	%xmm4,%xmm6
	pmuludq	%xmm7,%xmm4
	paddq	64(%esp),%xmm4
	movdqa	%xmm6,64(%esp)
	movdqa	%xmm5,%xmm6
	pmuludq	48(%esp),%xmm5
	movdqa	%xmm6,%xmm7
	pmuludq	32(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	pshufd	$16,80(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm2
	pmuludq	64(%esp),%xmm6
	pshufd	$16,32(%edx),%xmm7
	paddq	%xmm5,%xmm1
	movdqa	%xmm7,%xmm5
	pmuludq	32(%esp),%xmm7
	paddq	%xmm6,%xmm0
	movdqa	%xmm5,%xmm6
	pmuludq	16(%esp),%xmm5
	paddq	%xmm7,%xmm4
	pshufd	$16,96(%edx),%xmm7
	pmuludq	(%esp),%xmm6
	paddq	%xmm5,%xmm3
	movdqa	%xmm7,%xmm5
	pmuludq	64(%esp),%xmm7
	paddq	%xmm6,%xmm2
	pmuludq	48(%esp),%xmm5
	pshufd	$16,48(%edx),%xmm6
	paddq	%xmm7,%xmm1
	movdqa	%xmm6,%xmm7
	pmuludq	16(%esp),%xmm6
	paddq	%xmm5,%xmm0
	pshufd	$16,112(%edx),%xmm5
	pmuludq	(%esp),%xmm7
	paddq	%xmm6,%xmm4
	movdqa	%xmm5,%xmm6
	pmuludq	64(%esp),%xmm5
	paddq	%xmm7,%xmm3
	movdqa	%xmm6,%xmm7
	pmuludq	48(%esp),%xmm6
	paddq	%xmm5,%xmm2
	pmuludq	32(%esp),%xmm7
	pshufd	$16,64(%edx),%xmm5
	paddq	%xmm6,%xmm1
	pshufd	$16,128(%edx),%xmm6
	pmuludq	(%esp),%xmm5
	paddq	%xmm7,%xmm0
	movdqa	%xmm6,%xmm7
	pmuludq	64(%esp),%xmm6
	paddq	%xmm5,%xmm4
	movdqa	%xmm7,%xmm5
	pmuludq	16(%esp),%xmm7
	paddq	%xmm6,%xmm3
	movdqa	%xmm5,%xmm6
	pmuludq	32(%esp),%xmm5
	paddq	%xmm7,%xmm0
	pmuludq	48(%esp),%xmm6
	movdqa	64(%ebx),%xmm7
	paddq	%xmm5,%xmm1
	paddq	%xmm6,%xmm2
.L017short_tail:
	/* horizontal add: 78 = 0x4e swaps the qword halves, so each limb */
	/* becomes the sum of its two lanes */
	pshufd	$78,%xmm4,%xmm6
	pshufd	$78,%xmm3,%xmm5
	paddq	%xmm6,%xmm4
	paddq	%xmm5,%xmm3
	pshufd	$78,%xmm0,%xmm6
	pshufd	$78,%xmm1,%xmm5
	paddq	%xmm6,%xmm0
	paddq	%xmm5,%xmm1
	pshufd	$78,%xmm2,%xmm6
	/* final lazy reduction */
	movdqa	%xmm3,%xmm5
	pand	%xmm7,%xmm3
	psrlq	$26,%xmm5
	paddq	%xmm6,%xmm2
	paddq	%xmm4,%xmm5
	movdqa	%xmm0,%xmm6
	pand	%xmm7,%xmm0
	psrlq	$26,%xmm6
	movdqa	%xmm5,%xmm4
	paddq	%xmm1,%xmm6
	psrlq	$26,%xmm5
	pand	%xmm7,%xmm4
	movdqa	%xmm6,%xmm1
	psrlq	$26,%xmm6
	paddd	%xmm5,%xmm0
	psllq	$2,%xmm5
	paddq	%xmm2,%xmm6
	paddq	%xmm0,%xmm5
	pand	%xmm7,%xmm1
	movdqa	%xmm6,%xmm2
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm2
	paddd	%xmm3,%xmm6
	movdqa	%xmm5,%xmm0
	psrlq	$26,%xmm5
	movdqa	%xmm6,%xmm3
	psrlq	$26,%xmm6
	pand	%xmm7,%xmm0
	paddd	%xmm5,%xmm1
	pand	%xmm7,%xmm3
	paddd	%xmm6,%xmm4
.L013done:
	/* %edi is context+48 here: store the five limbs at context+0..16 */
	movd	%xmm0,-48(%edi)
	movd	%xmm1,-44(%edi)
	movd	%xmm2,-40(%edi)
	movd	%xmm3,-36(%edi)
	movd	%xmm4,-32(%edi)
	movl	%ebp,%esp
.L007nodata:
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
.align	32
/*
 * _poly1305_emit_sse2
 *
 * Finalisation for a context that may hold its accumulator as five 26-bit
 * limbs.  Same stack arguments as poly1305_emit:
 *   arg0 context, arg1 16-byte output buffer, arg2 four words to add.
 *
 * If the dword at context+20 is zero the accumulator is still in base 2^32
 * and control jumps to .Lenter_emit in poly1305_emit (%ebp = context, same
 * four registers pushed).  Otherwise the limbs are packed into base 2^32
 * words with carry propagation, the part above bit 130 is folded back in
 * multiplied by 5, and the same branch-free "h or h+5" selection and
 * 128-bit addition as poly1305_emit is performed.
 *
 * Saves and restores %ebp, %ebx, %esi, %edi; clobbers %eax, %ecx, %edx,
 * %xmm0-%xmm3 (used only to park the four packed words).
 */
.type	_poly1305_emit_sse2,@function
.align	16
_poly1305_emit_sse2:
	#ifdef __CET__

.byte	243,15,30,251
	#endif

	pushl	%ebp
	pushl	%ebx
	pushl	%esi
	pushl	%edi
	movl	20(%esp),%ebp
	cmpl	$0,20(%ebp)
	je	.Lenter_emit
	movl	(%ebp),%eax
	movl	4(%ebp),%edi
	movl	8(%ebp),%ecx
	movl	12(%ebp),%edx
	movl	16(%ebp),%esi
	/* word0 = limb0 + (limb1 << 26) */
	movl	%edi,%ebx
	shll	$26,%edi
	shrl	$6,%ebx
	addl	%edi,%eax
	/* word1 = (limb1 >> 6) + carry + (limb2 << 20) */
	movl	%ecx,%edi
	adcl	$0,%ebx
	shll	$20,%edi
	shrl	$12,%ecx
	addl	%edi,%ebx
	/* word2 = (limb2 >> 12) + carry + (limb3 << 14) */
	movl	%edx,%edi
	adcl	$0,%ecx
	shll	$14,%edi
	shrl	$18,%edx
	addl	%edi,%ecx
	/* word3 = (limb3 >> 18) + carry + (limb4 << 8); word4 = (limb4 >> 24) + carry */
	movl	%esi,%edi
	adcl	$0,%edx
	shll	$8,%edi
	shrl	$24,%esi
	addl	%edi,%edx
	adcl	$0,%esi
	/* keep the low two bits of word4; fold (word4 >> 2) * 5 into word0 */
	movl	%esi,%edi
	andl	$3,%esi
	shrl	$2,%edi
	leal	(%edi,%edi,4),%ebp
	movl	24(%esp),%edi			/* arg1: output buffer */
	addl	%ebp,%eax
	movl	28(%esp),%ebp			/* arg2: words to add */
	adcl	$0,%ebx
	adcl	$0,%ecx
	adcl	$0,%edx
	adcl	$0,%esi
	/* park h in %xmm0..%xmm3 while the candidate h + 5 is computed */
	movd	%eax,%xmm0
	addl	$5,%eax
	movd	%ebx,%xmm1
	adcl	$0,%ebx
	movd	%ecx,%xmm2
	adcl	$0,%ecx
	movd	%edx,%xmm3
	adcl	$0,%edx
	adcl	$0,%esi
	/* %esi = all-ones if h + 5 reached bit 130, else zero */
	shrl	$2,%esi
	negl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	andl	%esi,%ecx
	andl	%esi,%edx
	movl	%eax,(%edi)
	movd	%xmm0,%eax
	movl	%ebx,4(%edi)
	movd	%xmm1,%ebx
	movl	%ecx,8(%edi)
	movd	%xmm2,%ecx
	movl	%edx,12(%edi)
	movd	%xmm3,%edx
	/* inverted mask keeps h when the candidate was rejected */
	notl	%esi
	andl	%esi,%eax
	andl	%esi,%ebx
	orl	(%edi),%eax
	andl	%esi,%ecx
	orl	4(%edi),%ebx
	andl	%esi,%edx
	orl	8(%edi),%ecx
	orl	12(%edi),%edx
	/* 128-bit add of the four words at arg2; "movl" does not disturb the carry chain */
	addl	(%ebp),%eax
	adcl	4(%ebp),%ebx
	movl	%eax,(%edi)
	adcl	8(%ebp),%ecx
	movl	%ebx,4(%edi)
	adcl	12(%ebp),%edx
	movl	%ecx,8(%edi)
	movl	%edx,12(%edi)
	popl	%edi
	popl	%esi
	popl	%ebx
	popl	%ebp
	ret
.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 -- file-local helper (no .globl) reached by a plain
 * "call" from _poly1305_blocks_avx2.  It uses a private register convention
 * rather than the C one:
 *   in:   %edi = context base; the 16 bytes at 24(%edi) are the multiplier
 *                (poly1305_init stores the clamped key words there)
 *         %ebx = address of .Lconst_sse2; 64(%ebx) holds 2^26-1 per 64-bit lane
 *   out:  144 bytes written at 48(%edi)..191(%edi); %edi itself is restored
 *   uses: %ecx, %edx, %ebp, %xmm0-%xmm7 and the flags, none of them saved.
 * A scratch frame (224 bytes, then rounded down to 16-byte alignment) is taken
 * below %esp and released through %ebp before the return.
 */
1380.align	32
1381.type	_poly1305_init_avx2,@function
1382.align	16
1383_poly1305_init_avx2:
/* The bytes below encode endbr32; they are assembled only under __CET__. */
1384	#ifdef __CET__
1385
1386.byte	243,15,30,251
1387	#endif
1388
/* Load the 128-bit value, point %edi at the output table, build the frame. */
1389	vmovdqu	24(%edi),%xmm4
1390	leal	48(%edi),%edi
1391	movl	%esp,%ebp
1392	subl	$224,%esp
1393	andl	$-16,%esp
/*
 * Split the value into five 26-bit limbs: bits 0, 26, 52 (byte shift 6 plus
 * 4 bits), 78 (byte shift 6 plus 30 bits) and 104 (byte shift 13).
 * Limbs end up in %xmm0..%xmm4; %xmm7 is the 26-bit mask.
 */
1394	vmovdqa	64(%ebx),%xmm7
1395	vpand	%xmm7,%xmm4,%xmm0
1396	vpsrlq	$26,%xmm4,%xmm1
1397	vpsrldq	$6,%xmm4,%xmm3
1398	vpand	%xmm7,%xmm1,%xmm1
1399	vpsrlq	$4,%xmm3,%xmm2
1400	vpsrlq	$30,%xmm3,%xmm3
1401	vpand	%xmm7,%xmm2,%xmm2
1402	vpand	%xmm7,%xmm3,%xmm3
1403	vpsrldq	$13,%xmm4,%xmm4
/* %edx addresses the second operand area; %ecx counts the two passes. */
1404	leal	144(%esp),%edx
1405	movl	$2,%ecx
1406.L018square:
/* Park the five limbs at 0..64(%esp) ... */
1407	vmovdqa	%xmm0,(%esp)
1408	vmovdqa	%xmm1,16(%esp)
1409	vmovdqa	%xmm2,32(%esp)
1410	vmovdqa	%xmm3,48(%esp)
1411	vmovdqa	%xmm4,64(%esp)
/* ... and 5*limb1..5*limb4, formed as (x<<2)+x, at 80..128(%esp). */
1412	vpslld	$2,%xmm1,%xmm6
1413	vpslld	$2,%xmm2,%xmm5
1414	vpaddd	%xmm1,%xmm6,%xmm6
1415	vpaddd	%xmm2,%xmm5,%xmm5
1416	vmovdqa	%xmm6,80(%esp)
1417	vmovdqa	%xmm5,96(%esp)
1418	vpslld	$2,%xmm3,%xmm6
1419	vpslld	$2,%xmm4,%xmm5
1420	vpaddd	%xmm3,%xmm6,%xmm6
1421	vpaddd	%xmm4,%xmm5,%xmm5
1422	vmovdqa	%xmm6,112(%esp)
1423	vmovdqa	%xmm5,128(%esp)
/*
 * Second operand: the low 64-bit lane of every limb duplicated into both
 * lanes (shuffle $68) and stored at 0..64(%edx).  %xmm6 keeps the original
 * limb 1 because %xmm1 is about to be overwritten.
 */
1424	vpshufd	$68,%xmm0,%xmm5
1425	vmovdqa	%xmm1,%xmm6
1426	vpshufd	$68,%xmm1,%xmm1
1427	vpshufd	$68,%xmm2,%xmm2
1428	vpshufd	$68,%xmm3,%xmm3
1429	vpshufd	$68,%xmm4,%xmm4
1430	vmovdqa	%xmm5,(%edx)
1431	vmovdqa	%xmm1,16(%edx)
1432	vmovdqa	%xmm2,32(%edx)
1433	vmovdqa	%xmm3,48(%edx)
1434	vmovdqa	%xmm4,64(%edx)
/*
 * Limb-by-limb product, accumulated as 64-bit sums in %xmm0..%xmm4.
 * Terms whose limb indices sum to 5 or more use the 5x copies from
 * 80..128(%esp), which folds them back into the low limbs.
 */
1435	vpmuludq	%xmm0,%xmm4,%xmm4
1436	vpmuludq	%xmm0,%xmm3,%xmm3
1437	vpmuludq	%xmm0,%xmm2,%xmm2
1438	vpmuludq	%xmm0,%xmm1,%xmm1
1439	vpmuludq	%xmm0,%xmm5,%xmm0
1440	vpmuludq	48(%edx),%xmm6,%xmm5
1441	vpaddq	%xmm5,%xmm4,%xmm4
1442	vpmuludq	32(%edx),%xmm6,%xmm7
1443	vpaddq	%xmm7,%xmm3,%xmm3
1444	vpmuludq	16(%edx),%xmm6,%xmm5
1445	vpaddq	%xmm5,%xmm2,%xmm2
1446	vmovdqa	80(%esp),%xmm7
1447	vpmuludq	(%edx),%xmm6,%xmm6
1448	vpaddq	%xmm6,%xmm1,%xmm1
1449	vmovdqa	32(%esp),%xmm5
1450	vpmuludq	64(%edx),%xmm7,%xmm7
1451	vpaddq	%xmm7,%xmm0,%xmm0
1452	vpmuludq	32(%edx),%xmm5,%xmm6
1453	vpaddq	%xmm6,%xmm4,%xmm4
1454	vpmuludq	16(%edx),%xmm5,%xmm7
1455	vpaddq	%xmm7,%xmm3,%xmm3
1456	vmovdqa	96(%esp),%xmm6
1457	vpmuludq	(%edx),%xmm5,%xmm5
1458	vpaddq	%xmm5,%xmm2,%xmm2
1459	vpmuludq	64(%edx),%xmm6,%xmm7
1460	vpaddq	%xmm7,%xmm1,%xmm1
1461	vmovdqa	48(%esp),%xmm5
1462	vpmuludq	48(%edx),%xmm6,%xmm6
1463	vpaddq	%xmm6,%xmm0,%xmm0
1464	vpmuludq	16(%edx),%xmm5,%xmm7
1465	vpaddq	%xmm7,%xmm4,%xmm4
1466	vmovdqa	112(%esp),%xmm6
1467	vpmuludq	(%edx),%xmm5,%xmm5
1468	vpaddq	%xmm5,%xmm3,%xmm3
1469	vpmuludq	64(%edx),%xmm6,%xmm7
1470	vpaddq	%xmm7,%xmm2,%xmm2
1471	vpmuludq	48(%edx),%xmm6,%xmm5
1472	vpaddq	%xmm5,%xmm1,%xmm1
1473	vmovdqa	64(%esp),%xmm7
1474	vpmuludq	32(%edx),%xmm6,%xmm6
1475	vpaddq	%xmm6,%xmm0,%xmm0
1476	vmovdqa	128(%esp),%xmm5
1477	vpmuludq	(%edx),%xmm7,%xmm7
1478	vpaddq	%xmm7,%xmm4,%xmm4
1479	vpmuludq	64(%edx),%xmm5,%xmm6
1480	vpaddq	%xmm6,%xmm3,%xmm3
1481	vpmuludq	16(%edx),%xmm5,%xmm7
1482	vpaddq	%xmm7,%xmm0,%xmm0
1483	vpmuludq	32(%edx),%xmm5,%xmm6
1484	vpaddq	%xmm6,%xmm1,%xmm1
1485	vmovdqa	64(%ebx),%xmm7
1486	vpmuludq	48(%edx),%xmm5,%xmm5
1487	vpaddq	%xmm5,%xmm2,%xmm2
/*
 * Carry propagation back to 26-bit limbs (mask reloaded into %xmm7 above).
 * The carry leaving limb 4 re-enters limb 0 five times over: it is added
 * once, shifted left by 2 and added again.
 */
1488	vpsrlq	$26,%xmm3,%xmm5
1489	vpand	%xmm7,%xmm3,%xmm3
1490	vpsrlq	$26,%xmm0,%xmm6
1491	vpand	%xmm7,%xmm0,%xmm0
1492	vpaddq	%xmm5,%xmm4,%xmm4
1493	vpaddq	%xmm6,%xmm1,%xmm1
1494	vpsrlq	$26,%xmm4,%xmm5
1495	vpand	%xmm7,%xmm4,%xmm4
1496	vpsrlq	$26,%xmm1,%xmm6
1497	vpand	%xmm7,%xmm1,%xmm1
1498	vpaddq	%xmm6,%xmm2,%xmm2
1499	vpaddd	%xmm5,%xmm0,%xmm0
1500	vpsllq	$2,%xmm5,%xmm5
1501	vpsrlq	$26,%xmm2,%xmm6
1502	vpand	%xmm7,%xmm2,%xmm2
1503	vpaddd	%xmm5,%xmm0,%xmm0
1504	vpaddd	%xmm6,%xmm3,%xmm3
1505	vpsrlq	$26,%xmm3,%xmm6
1506	vpsrlq	$26,%xmm0,%xmm5
1507	vpand	%xmm7,%xmm0,%xmm0
1508	vpand	%xmm7,%xmm3,%xmm3
1509	vpaddd	%xmm5,%xmm1,%xmm1
1510	vpaddd	%xmm6,%xmm4,%xmm4
/*
 * After the first pass, pair the fresh result (low lane) with the saved
 * operand (moved to the high lane) and multiply once more.  The second pass
 * therefore yields two further products at once.
 */
1511	decl	%ecx
1512	jz	.L019square_break
1513	vpunpcklqdq	(%esp),%xmm0,%xmm0
1514	vpunpcklqdq	16(%esp),%xmm1,%xmm1
1515	vpunpcklqdq	32(%esp),%xmm2,%xmm2
1516	vpunpcklqdq	48(%esp),%xmm3,%xmm3
1517	vpunpcklqdq	64(%esp),%xmm4,%xmm4
1518	jmp	.L018square
1519.L019square_break:
/*
 * Interleave the second-pass results (moved to the odd dwords) with the
 * operands saved at 0..64(%esp), then reorder the four dwords with shuffle
 * $141.  Tracing the lanes, each row then reads, from dword 0 upward, the
 * 4th, 3rd, 2nd and 1st power of the input value for that limb.
 */
1520	vpsllq	$32,%xmm0,%xmm0
1521	vpsllq	$32,%xmm1,%xmm1
1522	vpsllq	$32,%xmm2,%xmm2
1523	vpsllq	$32,%xmm3,%xmm3
1524	vpsllq	$32,%xmm4,%xmm4
1525	vpor	(%esp),%xmm0,%xmm0
1526	vpor	16(%esp),%xmm1,%xmm1
1527	vpor	32(%esp),%xmm2,%xmm2
1528	vpor	48(%esp),%xmm3,%xmm3
1529	vpor	64(%esp),%xmm4,%xmm4
1530	vpshufd	$141,%xmm0,%xmm0
1531	vpshufd	$141,%xmm1,%xmm1
1532	vpshufd	$141,%xmm2,%xmm2
1533	vpshufd	$141,%xmm3,%xmm3
1534	vpshufd	$141,%xmm4,%xmm4
/* Rows 0..4 of the table: limbs 0..4 (unaligned stores into the context). */
1535	vmovdqu	%xmm0,(%edi)
1536	vmovdqu	%xmm1,16(%edi)
1537	vmovdqu	%xmm2,32(%edi)
1538	vmovdqu	%xmm3,48(%edi)
1539	vmovdqu	%xmm4,64(%edi)
/* Rows 5..8 of the table: 5*limb1 .. 5*limb4. */
1540	vpslld	$2,%xmm1,%xmm6
1541	vpslld	$2,%xmm2,%xmm5
1542	vpaddd	%xmm1,%xmm6,%xmm6
1543	vpaddd	%xmm2,%xmm5,%xmm5
1544	vmovdqu	%xmm6,80(%edi)
1545	vmovdqu	%xmm5,96(%edi)
1546	vpslld	$2,%xmm3,%xmm6
1547	vpslld	$2,%xmm4,%xmm5
1548	vpaddd	%xmm3,%xmm6,%xmm6
1549	vpaddd	%xmm4,%xmm5,%xmm5
1550	vmovdqu	%xmm6,112(%edi)
1551	vmovdqu	%xmm5,128(%edi)
/* Drop the scratch frame and hand %edi back pointing at the context base. */
1552	movl	%ebp,%esp
1553	leal	-48(%edi),%edi
1554	ret
1555.size	_poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2 -- AVX2 block function; poly1305_init stores its
 * address in the caller's function table when the capability test passes.
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp)  context pointer
 *   24(%esp)  input pointer
 *   28(%esp)  length in bytes (rounded down to a multiple of 16 here)
 *   32(%esp)  4th argument: negated and used to pick the pad-bit row for a
 *             single leftover block (see .L025one)
 * Context fields touched: 0..16 hold the accumulator, 20 is a flag that is
 * non-zero once the accumulator is kept as five 26-bit limbs, 48..191 hold
 * the table written by _poly1305_init_avx2.
 * %ebp, %ebx, %esi and %edi are saved and restored; vzeroupper is issued on
 * entry to and exit from the vector path.
 */
1556.align	32
1557.type	_poly1305_blocks_avx2,@function
1558.align	16
1559_poly1305_blocks_avx2:
/* The bytes below encode endbr32; they are assembled only under __CET__. */
1560	#ifdef __CET__
1561
1562.byte	243,15,30,251
1563	#endif
1564
1565	pushl	%ebp
1566	pushl	%ebx
1567	pushl	%esi
1568	pushl	%edi
1569	movl	20(%esp),%edi
1570	movl	24(%esp),%esi
1571	movl	28(%esp),%ecx
1572	movl	20(%edi),%eax
/*
 * Dispatch: nothing to do for fewer than 16 bytes.  Inputs shorter than 64
 * bytes whose accumulator has not been converted yet (flag == 0) are handed
 * to the scalar code at .Lenter_blocks, which expects exactly this register
 * and stack state.
 */
1573	andl	$-16,%ecx
1574	jz	.L020nodata
1575	cmpl	$64,%ecx
1576	jae	.L021enter_avx2
1577	testl	%eax,%eax
1578	jz	.Lenter_blocks
1579.L021enter_avx2:
1580	vzeroupper
/* call/pop yields the current address; %ebx becomes &.Lconst_sse2. */
1581	call	.L022pic_point
1582.L022pic_point:
1583	popl	%ebx
1584	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
1585	testl	%eax,%eax
1586	jnz	.L023base2_26
/*
 * First use: build the table, then re-slice the 130-bit accumulator from
 * 32-bit words into 26-bit limbs.  The loads at byte offsets 0, 3, 6, 9 and
 * 13 combined with the shifts 0, 2, 4, 6 and 0 start each limb at bit
 * 0, 26, 52, 78 and 104.
 */
1587	call	_poly1305_init_avx2
1588	movl	(%edi),%eax
1589	movl	3(%edi),%ecx
1590	movl	6(%edi),%edx
1591	movl	9(%edi),%esi
1592	movl	13(%edi),%ebp
1593	shrl	$2,%ecx
1594	andl	$67108863,%eax
1595	shrl	$4,%edx
1596	andl	$67108863,%ecx
1597	shrl	$6,%esi
1598	andl	$67108863,%edx
1599	movl	%eax,(%edi)
1600	movl	%ecx,4(%edi)
1601	movl	%edx,8(%edi)
1602	movl	%esi,12(%edi)
1603	movl	%ebp,16(%edi)
1604	movl	$1,20(%edi)
/*
 * Reload the input pointer and length, both used as scratch above.
 * NOTE(review): the reloaded length is not masked with -16 again, so this
 * path appears to rely on the caller passing a multiple of 16 -- confirm.
 */
1605	movl	24(%esp),%esi
1606	movl	28(%esp),%ecx
1607.L023base2_26:
/* %eax = 4th argument; %ebp keeps the old %esp; new frame is 512-aligned. */
1608	movl	32(%esp),%eax
1609	movl	%esp,%ebp
1610	subl	$448,%esp
1611	andl	$-512,%esp
/*
 * Copy the nine table rows from the context onto the frame.  Each 16-byte
 * row is spread across a 256-bit register (vpermq $64, then vpshufd $200)
 * and stored at -128(%edx) .. 128(%edx) in 32-byte steps, with
 * %edx = %esp+288.  %edi is advanced by 48 midway, so the second group of
 * loads (80..128(%edi)) reads the four 5x rows.
 */
1612	vmovdqu	48(%edi),%xmm0
1613	leal	288(%esp),%edx
1614	vmovdqu	64(%edi),%xmm1
1615	vmovdqu	80(%edi),%xmm2
1616	vmovdqu	96(%edi),%xmm3
1617	vmovdqu	112(%edi),%xmm4
1618	leal	48(%edi),%edi
1619	vpermq	$64,%ymm0,%ymm0
1620	vpermq	$64,%ymm1,%ymm1
1621	vpermq	$64,%ymm2,%ymm2
1622	vpermq	$64,%ymm3,%ymm3
1623	vpermq	$64,%ymm4,%ymm4
1624	vpshufd	$200,%ymm0,%ymm0
1625	vpshufd	$200,%ymm1,%ymm1
1626	vpshufd	$200,%ymm2,%ymm2
1627	vpshufd	$200,%ymm3,%ymm3
1628	vpshufd	$200,%ymm4,%ymm4
1629	vmovdqa	%ymm0,-128(%edx)
1630	vmovdqu	80(%edi),%xmm0
1631	vmovdqa	%ymm1,-96(%edx)
1632	vmovdqu	96(%edi),%xmm1
1633	vmovdqa	%ymm2,-64(%edx)
1634	vmovdqu	112(%edi),%xmm2
1635	vmovdqa	%ymm3,-32(%edx)
1636	vmovdqu	128(%edi),%xmm3
1637	vmovdqa	%ymm4,(%edx)
1638	vpermq	$64,%ymm0,%ymm0
1639	vpermq	$64,%ymm1,%ymm1
1640	vpermq	$64,%ymm2,%ymm2
1641	vpermq	$64,%ymm3,%ymm3
1642	vpshufd	$200,%ymm0,%ymm0
1643	vpshufd	$200,%ymm1,%ymm1
1644	vpshufd	$200,%ymm2,%ymm2
1645	vpshufd	$200,%ymm3,%ymm3
/*
 * Finish storing the 5x rows while loading the five accumulator limbs
 * (context offsets 0..16, i.e. -48..-32 from the advanced %edi) into
 * %xmm0..%xmm4.  %ymm7 = 26-bit mask.
 */
1646	vmovdqa	%ymm0,32(%edx)
1647	vmovd	-48(%edi),%xmm0
1648	vmovdqa	%ymm1,64(%edx)
1649	vmovd	-44(%edi),%xmm1
1650	vmovdqa	%ymm2,96(%edx)
1651	vmovd	-40(%edi),%xmm2
1652	vmovdqa	%ymm3,128(%edx)
1653	vmovd	-36(%edi),%xmm3
1654	vmovd	-32(%edi),%xmm4
1655	vmovdqa	64(%ebx),%ymm7
1656	negl	%eax
/*
 * A length that is not a multiple of 64 leaves 1..3 blocks, which are
 * processed first through the tail code.  For each case %ebx is biased into
 * the constant table so that only the lanes carrying data receive the pad
 * bit, and %edx is biased by 8, 16 or 24 bytes, which shifts the dwords
 * read from each table row.
 */
1657	testl	$63,%ecx
1658	jz	.L024even
1659	movl	%ecx,%edx
1660	andl	$-64,%ecx
1661	andl	$63,%edx
1662	vmovdqu	(%esi),%xmm5
1663	cmpl	$32,%edx
1664	jb	.L025one
/* The vector load does not alter the flags set by the cmpl above. */
1665	vmovdqu	16(%esi),%xmm6
1666	je	.L026two
/* Three leftover blocks. */
1667	vinserti128	$1,32(%esi),%ymm5,%ymm5
1668	leal	48(%esi),%esi
1669	leal	8(%ebx),%ebx
1670	leal	296(%esp),%edx
1671	jmp	.L027tail
1672.L026two:
/* Two leftover blocks. */
1673	leal	32(%esi),%esi
1674	leal	16(%ebx),%ebx
1675	leal	304(%esp),%edx
1676	jmp	.L027tail
1677.L025one:
/*
 * One leftover block.  %eax is minus the 4th argument, so the pad constant
 * is read from 24(%ebx) when that argument is 1 and from the all-zero row at
 * 32(%ebx) when it is 0.
 */
1678	leal	16(%esi),%esi
1679	vpxor	%ymm6,%ymm6,%ymm6
1680	leal	32(%ebx,%eax,8),%ebx
1681	leal	312(%esp),%edx
1682	jmp	.L027tail
1683.align	32
1684.L024even:
/* Load four blocks (64 bytes); go straight to the tail if they are the last. */
1685	vmovdqu	(%esi),%xmm5
1686	vmovdqu	16(%esi),%xmm6
1687	vinserti128	$1,32(%esi),%ymm5,%ymm5
1688	vinserti128	$1,48(%esi),%ymm6,%ymm6
1689	leal	64(%esi),%esi
1690	subl	$64,%ecx
1691	jz	.L027tail
1692.L028loop:
/*
 * Main loop, 64 bytes per iteration.  The accumulator limbs are parked on
 * the frame while the input in %ymm5/%ymm6 is cut into 26-bit limbs.  The
 * top limb (high qword >> 40) is ORed with the pad-bit row at (%ebx), then
 * the input limbs are added to the accumulator.
 */
1693	vmovdqa	%ymm2,64(%esp)
1694	vpsrldq	$6,%ymm5,%ymm2
1695	vmovdqa	%ymm0,(%esp)
1696	vpsrldq	$6,%ymm6,%ymm0
1697	vmovdqa	%ymm1,32(%esp)
1698	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1699	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1700	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1701	vpsrlq	$30,%ymm2,%ymm0
1702	vpsrlq	$4,%ymm2,%ymm2
1703	vpsrlq	$26,%ymm5,%ymm6
1704	vpsrlq	$40,%ymm1,%ymm1
1705	vpand	%ymm7,%ymm2,%ymm2
1706	vpand	%ymm7,%ymm5,%ymm5
1707	vpand	%ymm7,%ymm6,%ymm6
1708	vpand	%ymm7,%ymm0,%ymm0
1709	vpor	(%ebx),%ymm1,%ymm1
1710	vpaddq	64(%esp),%ymm2,%ymm2
1711	vpaddq	(%esp),%ymm5,%ymm5
1712	vpaddq	32(%esp),%ymm6,%ymm6
1713	vpaddq	%ymm3,%ymm0,%ymm0
1714	vpaddq	%ymm4,%ymm1,%ymm1
/*
 * Multiply the five limbs by the table rows addressed from %edx and
 * accumulate 64-bit column sums in %ymm0..%ymm4.  Limbs that do not fit in
 * registers are spilled to 32/96/128(%esp) and reloaded as needed.
 */
1715	vpmuludq	-96(%edx),%ymm2,%ymm3
1716	vmovdqa	%ymm6,32(%esp)
1717	vpmuludq	-64(%edx),%ymm2,%ymm4
1718	vmovdqa	%ymm0,96(%esp)
1719	vpmuludq	96(%edx),%ymm2,%ymm0
1720	vmovdqa	%ymm1,128(%esp)
1721	vpmuludq	128(%edx),%ymm2,%ymm1
1722	vpmuludq	-128(%edx),%ymm2,%ymm2
1723	vpmuludq	-32(%edx),%ymm5,%ymm7
1724	vpaddq	%ymm7,%ymm3,%ymm3
1725	vpmuludq	(%edx),%ymm5,%ymm6
1726	vpaddq	%ymm6,%ymm4,%ymm4
1727	vpmuludq	-128(%edx),%ymm5,%ymm7
1728	vpaddq	%ymm7,%ymm0,%ymm0
1729	vmovdqa	32(%esp),%ymm7
1730	vpmuludq	-96(%edx),%ymm5,%ymm6
1731	vpaddq	%ymm6,%ymm1,%ymm1
1732	vpmuludq	-64(%edx),%ymm5,%ymm5
1733	vpaddq	%ymm5,%ymm2,%ymm2
1734	vpmuludq	-64(%edx),%ymm7,%ymm6
1735	vpaddq	%ymm6,%ymm3,%ymm3
1736	vpmuludq	-32(%edx),%ymm7,%ymm5
1737	vpaddq	%ymm5,%ymm4,%ymm4
1738	vpmuludq	128(%edx),%ymm7,%ymm6
1739	vpaddq	%ymm6,%ymm0,%ymm0
1740	vmovdqa	96(%esp),%ymm6
1741	vpmuludq	-128(%edx),%ymm7,%ymm5
1742	vpaddq	%ymm5,%ymm1,%ymm1
1743	vpmuludq	-96(%edx),%ymm7,%ymm7
1744	vpaddq	%ymm7,%ymm2,%ymm2
1745	vpmuludq	-128(%edx),%ymm6,%ymm5
1746	vpaddq	%ymm5,%ymm3,%ymm3
1747	vpmuludq	-96(%edx),%ymm6,%ymm7
1748	vpaddq	%ymm7,%ymm4,%ymm4
1749	vpmuludq	64(%edx),%ymm6,%ymm5
1750	vpaddq	%ymm5,%ymm0,%ymm0
1751	vmovdqa	128(%esp),%ymm5
1752	vpmuludq	96(%edx),%ymm6,%ymm7
1753	vpaddq	%ymm7,%ymm1,%ymm1
1754	vpmuludq	128(%edx),%ymm6,%ymm6
1755	vpaddq	%ymm6,%ymm2,%ymm2
1756	vpmuludq	128(%edx),%ymm5,%ymm7
1757	vpaddq	%ymm7,%ymm3,%ymm3
1758	vpmuludq	32(%edx),%ymm5,%ymm6
1759	vpaddq	%ymm6,%ymm0,%ymm0
1760	vpmuludq	-128(%edx),%ymm5,%ymm7
1761	vpaddq	%ymm7,%ymm4,%ymm4
1762	vmovdqa	64(%ebx),%ymm7
1763	vpmuludq	64(%edx),%ymm5,%ymm6
1764	vpaddq	%ymm6,%ymm1,%ymm1
1765	vpmuludq	96(%edx),%ymm5,%ymm5
1766	vpaddq	%ymm5,%ymm2,%ymm2
/*
 * Carry propagation back to 26-bit limbs.  One carry is added to %ymm0 once
 * directly and once shifted left by 2, i.e. multiplied by 5.
 */
1767	vpsrlq	$26,%ymm3,%ymm5
1768	vpand	%ymm7,%ymm3,%ymm3
1769	vpsrlq	$26,%ymm0,%ymm6
1770	vpand	%ymm7,%ymm0,%ymm0
1771	vpaddq	%ymm5,%ymm4,%ymm4
1772	vpaddq	%ymm6,%ymm1,%ymm1
1773	vpsrlq	$26,%ymm4,%ymm5
1774	vpand	%ymm7,%ymm4,%ymm4
1775	vpsrlq	$26,%ymm1,%ymm6
1776	vpand	%ymm7,%ymm1,%ymm1
1777	vpaddq	%ymm6,%ymm2,%ymm2
1778	vpaddq	%ymm5,%ymm0,%ymm0
1779	vpsllq	$2,%ymm5,%ymm5
1780	vpsrlq	$26,%ymm2,%ymm6
1781	vpand	%ymm7,%ymm2,%ymm2
1782	vpaddq	%ymm5,%ymm0,%ymm0
1783	vpaddq	%ymm6,%ymm3,%ymm3
1784	vpsrlq	$26,%ymm3,%ymm6
1785	vpsrlq	$26,%ymm0,%ymm5
1786	vpand	%ymm7,%ymm0,%ymm0
1787	vpand	%ymm7,%ymm3,%ymm3
1788	vpaddq	%ymm5,%ymm1,%ymm1
1789	vpaddq	%ymm6,%ymm4,%ymm4
/* Fetch the next 64 bytes; the last group is consumed by the tail below. */
1790	vmovdqu	(%esi),%xmm5
1791	vmovdqu	16(%esi),%xmm6
1792	vinserti128	$1,32(%esi),%ymm5,%ymm5
1793	vinserti128	$1,48(%esi),%ymm6,%ymm6
1794	leal	64(%esi),%esi
1795	subl	$64,%ecx
1796	jnz	.L028loop
1797.L027tail:
/*
 * Tail: the same split/add/multiply as the loop body, except that every
 * table operand is read 4 bytes further on (-124 instead of -128, and so on)
 * and %edx may carry the leftover bias chosen above.
 */
1798	vmovdqa	%ymm2,64(%esp)
1799	vpsrldq	$6,%ymm5,%ymm2
1800	vmovdqa	%ymm0,(%esp)
1801	vpsrldq	$6,%ymm6,%ymm0
1802	vmovdqa	%ymm1,32(%esp)
1803	vpunpckhqdq	%ymm6,%ymm5,%ymm1
1804	vpunpcklqdq	%ymm6,%ymm5,%ymm5
1805	vpunpcklqdq	%ymm0,%ymm2,%ymm2
1806	vpsrlq	$30,%ymm2,%ymm0
1807	vpsrlq	$4,%ymm2,%ymm2
1808	vpsrlq	$26,%ymm5,%ymm6
1809	vpsrlq	$40,%ymm1,%ymm1
1810	vpand	%ymm7,%ymm2,%ymm2
1811	vpand	%ymm7,%ymm5,%ymm5
1812	vpand	%ymm7,%ymm6,%ymm6
1813	vpand	%ymm7,%ymm0,%ymm0
1814	vpor	(%ebx),%ymm1,%ymm1
/*
 * Undo any leftover bias on %ebx: the table is 64-byte aligned and every
 * bias is below 64, so clearing the low six bits restores the table base.
 */
1815	andl	$-64,%ebx
1816	vpaddq	64(%esp),%ymm2,%ymm2
1817	vpaddq	(%esp),%ymm5,%ymm5
1818	vpaddq	32(%esp),%ymm6,%ymm6
1819	vpaddq	%ymm3,%ymm0,%ymm0
1820	vpaddq	%ymm4,%ymm1,%ymm1
1821	vpmuludq	-92(%edx),%ymm2,%ymm3
1822	vmovdqa	%ymm6,32(%esp)
1823	vpmuludq	-60(%edx),%ymm2,%ymm4
1824	vmovdqa	%ymm0,96(%esp)
1825	vpmuludq	100(%edx),%ymm2,%ymm0
1826	vmovdqa	%ymm1,128(%esp)
1827	vpmuludq	132(%edx),%ymm2,%ymm1
1828	vpmuludq	-124(%edx),%ymm2,%ymm2
1829	vpmuludq	-28(%edx),%ymm5,%ymm7
1830	vpaddq	%ymm7,%ymm3,%ymm3
1831	vpmuludq	4(%edx),%ymm5,%ymm6
1832	vpaddq	%ymm6,%ymm4,%ymm4
1833	vpmuludq	-124(%edx),%ymm5,%ymm7
1834	vpaddq	%ymm7,%ymm0,%ymm0
1835	vmovdqa	32(%esp),%ymm7
1836	vpmuludq	-92(%edx),%ymm5,%ymm6
1837	vpaddq	%ymm6,%ymm1,%ymm1
1838	vpmuludq	-60(%edx),%ymm5,%ymm5
1839	vpaddq	%ymm5,%ymm2,%ymm2
1840	vpmuludq	-60(%edx),%ymm7,%ymm6
1841	vpaddq	%ymm6,%ymm3,%ymm3
1842	vpmuludq	-28(%edx),%ymm7,%ymm5
1843	vpaddq	%ymm5,%ymm4,%ymm4
1844	vpmuludq	132(%edx),%ymm7,%ymm6
1845	vpaddq	%ymm6,%ymm0,%ymm0
1846	vmovdqa	96(%esp),%ymm6
1847	vpmuludq	-124(%edx),%ymm7,%ymm5
1848	vpaddq	%ymm5,%ymm1,%ymm1
1849	vpmuludq	-92(%edx),%ymm7,%ymm7
1850	vpaddq	%ymm7,%ymm2,%ymm2
1851	vpmuludq	-124(%edx),%ymm6,%ymm5
1852	vpaddq	%ymm5,%ymm3,%ymm3
1853	vpmuludq	-92(%edx),%ymm6,%ymm7
1854	vpaddq	%ymm7,%ymm4,%ymm4
1855	vpmuludq	68(%edx),%ymm6,%ymm5
1856	vpaddq	%ymm5,%ymm0,%ymm0
1857	vmovdqa	128(%esp),%ymm5
1858	vpmuludq	100(%edx),%ymm6,%ymm7
1859	vpaddq	%ymm7,%ymm1,%ymm1
1860	vpmuludq	132(%edx),%ymm6,%ymm6
1861	vpaddq	%ymm6,%ymm2,%ymm2
1862	vpmuludq	132(%edx),%ymm5,%ymm7
1863	vpaddq	%ymm7,%ymm3,%ymm3
1864	vpmuludq	36(%edx),%ymm5,%ymm6
1865	vpaddq	%ymm6,%ymm0,%ymm0
1866	vpmuludq	-124(%edx),%ymm5,%ymm7
1867	vpaddq	%ymm7,%ymm4,%ymm4
1868	vmovdqa	64(%ebx),%ymm7
1869	vpmuludq	68(%edx),%ymm5,%ymm6
1870	vpaddq	%ymm6,%ymm1,%ymm1
1871	vpmuludq	100(%edx),%ymm5,%ymm5
1872	vpaddq	%ymm5,%ymm2,%ymm2
/*
 * Horizontal sum: fold the four 64-bit lanes of every limb into lane 0,
 * first within each 128-bit half (vpsrldq $8), then across the halves
 * (vpermq $2).
 */
1873	vpsrldq	$8,%ymm4,%ymm5
1874	vpsrldq	$8,%ymm3,%ymm6
1875	vpaddq	%ymm5,%ymm4,%ymm4
1876	vpsrldq	$8,%ymm0,%ymm5
1877	vpaddq	%ymm6,%ymm3,%ymm3
1878	vpsrldq	$8,%ymm1,%ymm6
1879	vpaddq	%ymm5,%ymm0,%ymm0
1880	vpsrldq	$8,%ymm2,%ymm5
1881	vpaddq	%ymm6,%ymm1,%ymm1
1882	vpermq	$2,%ymm4,%ymm6
1883	vpaddq	%ymm5,%ymm2,%ymm2
1884	vpermq	$2,%ymm3,%ymm5
1885	vpaddq	%ymm6,%ymm4,%ymm4
1886	vpermq	$2,%ymm0,%ymm6
1887	vpaddq	%ymm5,%ymm3,%ymm3
1888	vpermq	$2,%ymm1,%ymm5
1889	vpaddq	%ymm6,%ymm0,%ymm0
1890	vpermq	$2,%ymm2,%ymm6
1891	vpaddq	%ymm5,%ymm1,%ymm1
1892	vpaddq	%ymm6,%ymm2,%ymm2
/* Carry propagation, same sequence as at the end of the loop body. */
1893	vpsrlq	$26,%ymm3,%ymm5
1894	vpand	%ymm7,%ymm3,%ymm3
1895	vpsrlq	$26,%ymm0,%ymm6
1896	vpand	%ymm7,%ymm0,%ymm0
1897	vpaddq	%ymm5,%ymm4,%ymm4
1898	vpaddq	%ymm6,%ymm1,%ymm1
1899	vpsrlq	$26,%ymm4,%ymm5
1900	vpand	%ymm7,%ymm4,%ymm4
1901	vpsrlq	$26,%ymm1,%ymm6
1902	vpand	%ymm7,%ymm1,%ymm1
1903	vpaddq	%ymm6,%ymm2,%ymm2
1904	vpaddq	%ymm5,%ymm0,%ymm0
1905	vpsllq	$2,%ymm5,%ymm5
1906	vpsrlq	$26,%ymm2,%ymm6
1907	vpand	%ymm7,%ymm2,%ymm2
1908	vpaddq	%ymm5,%ymm0,%ymm0
1909	vpaddq	%ymm6,%ymm3,%ymm3
1910	vpsrlq	$26,%ymm3,%ymm6
1911	vpsrlq	$26,%ymm0,%ymm5
1912	vpand	%ymm7,%ymm0,%ymm0
1913	vpand	%ymm7,%ymm3,%ymm3
1914	vpaddq	%ymm5,%ymm1,%ymm1
1915	vpaddq	%ymm6,%ymm4,%ymm4
/*
 * %ecx still holds the count of full 64-byte groups.  If any remain (the
 * tail was entered for leftover blocks), keep only dword 0 of each limb in
 * the low lane pair, reset %edx to the unbiased table address and continue
 * at .L024even.
 */
1916	cmpl	$0,%ecx
1917	je	.L029done
1918	vpshufd	$252,%xmm0,%xmm0
1919	leal	288(%esp),%edx
1920	vpshufd	$252,%xmm1,%xmm1
1921	vpshufd	$252,%xmm2,%xmm2
1922	vpshufd	$252,%xmm3,%xmm3
1923	vpshufd	$252,%xmm4,%xmm4
1924	jmp	.L024even
1925.align	16
1926.L029done:
/* Write the five limbs back to context offsets 0..16 and drop the frame. */
1927	vmovd	%xmm0,-48(%edi)
1928	vmovd	%xmm1,-44(%edi)
1929	vmovd	%xmm2,-40(%edi)
1930	vmovd	%xmm3,-36(%edi)
1931	vmovd	%xmm4,-32(%edi)
1932	vzeroupper
1933	movl	%ebp,%esp
1934.L020nodata:
1935	popl	%edi
1936	popl	%esi
1937	popl	%ebx
1938	popl	%ebp
1939	ret
1940.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * .Lconst_sse2 -- constant table shared by the vector code, which addresses
 * it by byte offset from %ebx.  It is 64-byte aligned; the AVX2 tail relies
 * on that when it clears the low six bits of %ebx to get back to the base.
 *   offset  0: 1<<24 in each of four 64-bit lanes, ORed into the top limb
 *              of every input block (pad bit)
 *   offset 32: 32 zero bytes; offsets 8, 16 and 24 therefore give rows with
 *              the pad bit in only the first three, two or one lanes
 *   offset 64: 2^26-1 in each 64-bit lane (limb mask)
 *   offset 96: 0x0fffffff, 0x0ffffffc x3 -- the same values poly1305_init
 *              uses as immediates when clamping the key; no reference to
 *              this row is visible in this part of the file
 */
1941.align	64
1942.Lconst_sse2:
1943.long	16777216,0,16777216,0,16777216,0,16777216,0
1944.long	0,0,0,0,0,0,0,0
1945.long	67108863,0,67108863,0,67108863,0,67108863,0
1946.long	268435455,268435452,268435452,268435452
/* ASCII, NUL-terminated: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>" */
1947.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
1948.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
1949.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
1950.byte	114,103,62,0
1951.align	4
/* 16-byte, 4-byte-aligned common symbol; poly1305_init reads words 0 and 2. */
1952.comm	OPENSSL_ia32cap_P,16,4
1953
/*
 * ELF note in section .note.gnu.property, laid out as
 *   name size, descriptor size, note type, name, descriptor.
 * The sizes are computed from the numeric local labels, so they follow the
 * data automatically.  The note type 5 and the property type 0xc0000002 are
 * presumably NT_GNU_PROPERTY_TYPE_0 and GNU_PROPERTY_X86_FEATURE_1_AND, with
 * the value 3 marking the object as IBT- and SHSTK-compatible -- confirm
 * against the x86 psABI before relying on this.
 */
1954	.section ".note.gnu.property", "a"
1955	.p2align 2
/* name size = distance from label 0 to label 1 (the string "GNU" plus NUL) */
1956	.long 1f - 0f
/* descriptor size = distance from label 1 to label 4 */
1957	.long 4f - 1f
/* note type */
1958	.long 5
19590:
1960	.asciz "GNU"
19611:
1962	.p2align 2
/* property type, then property data size (label 2 to label 3), then data */
1963	.long 0xc0000002
1964	.long 3f - 2f
19652:
1966	.long 3
19673:
1968	.p2align 2
19694:
1970#else
/*
 * poly1305_init -- exported entry point, non-PIC build of the file.
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp)  context pointer
 *   24(%esp)  key pointer, may be NULL
 *   28(%esp)  pointer to a two-slot table that receives the addresses of
 *             the block and emit routines chosen for this CPU
 * Returns in %eax: 0 when the key pointer is NULL (only the first 24 context
 * bytes are cleared in that case), 1 otherwise.
 * %ebp, %ebx, %esi and %edi are saved and restored.
 */
1971.text
1972.align	64
1973.globl	poly1305_init
1974.type	poly1305_init,@function
1975.align	16
1976poly1305_init:
1977.L_poly1305_init_begin:
/* The bytes below encode endbr32; they are assembled only under __CET__. */
1978	#ifdef __CET__
1979
1980.byte	243,15,30,251
1981	#endif
1982
1983	pushl	%ebp
1984	pushl	%ebx
1985	pushl	%esi
1986	pushl	%edi
1987	movl	20(%esp),%edi
1988	movl	24(%esp),%esi
1989	movl	28(%esp),%ebp
/* Clear context bytes 0..23; %eax stays 0 and is the return value for NULL. */
1990	xorl	%eax,%eax
1991	movl	%eax,(%edi)
1992	movl	%eax,4(%edi)
1993	movl	%eax,8(%edi)
1994	movl	%eax,12(%edi)
1995	movl	%eax,16(%edi)
1996	movl	%eax,20(%edi)
1997	cmpl	$0,%esi
1998	je	.L000nokey
/*
 * call/pop yields the current address, from which the routine addresses are
 * formed.  The defaults are the scalar poly1305_blocks and poly1305_emit.
 */
1999	call	.L001pic_point
2000.L001pic_point:
2001	popl	%ebx
2002	leal	poly1305_blocks-.L001pic_point(%ebx),%eax
2003	leal	poly1305_emit-.L001pic_point(%ebx),%edx
/* In this build the capability vector is addressed absolutely. */
2004	leal	OPENSSL_ia32cap_P,%edi
/*
 * 83886080 = 0x05000000, i.e. bits 24 and 26 of capability word 0; both must
 * be set to select the SSE2 routines.  (Assuming word 0 mirrors CPUID.1:EDX,
 * these are FXSR and SSE2 -- TODO confirm.)
 */
2005	movl	(%edi),%ecx
2006	andl	$83886080,%ecx
2007	cmpl	$83886080,%ecx
2008	jne	.L002no_sse2
2009	leal	_poly1305_blocks_sse2-.L001pic_point(%ebx),%eax
2010	leal	_poly1305_emit_sse2-.L001pic_point(%ebx),%edx
/*
 * Bit 5 of the word at offset 8 additionally selects the AVX2 block routine;
 * the emit routine stays the SSE2 one.
 */
2011	movl	8(%edi),%ecx
2012	testl	$32,%ecx
2013	jz	.L002no_sse2
2014	leal	_poly1305_blocks_avx2-.L001pic_point(%ebx),%eax
2015.L002no_sse2:
/* Reload the context pointer (%edi was reused) and publish both addresses. */
2016	movl	20(%esp),%edi
2017	movl	%eax,(%ebp)
2018	movl	%edx,4(%ebp)
/*
 * Load the first 16 key bytes, clamp them (0x0fffffff for word 0,
 * 0x0ffffffc for words 1..3) and store them at context offsets 24..36.
 */
2019	movl	(%esi),%eax
2020	movl	4(%esi),%ebx
2021	movl	8(%esi),%ecx
2022	movl	12(%esi),%edx
2023	andl	$268435455,%eax
2024	andl	$268435452,%ebx
2025	andl	$268435452,%ecx
2026	andl	$268435452,%edx
2027	movl	%eax,24(%edi)
2028	movl	%ebx,28(%edi)
2029	movl	%ecx,32(%edi)
2030	movl	%edx,36(%edi)
2031	movl	$1,%eax
2032.L000nokey:
2033	popl	%edi
2034	popl	%esi
2035	popl	%ebx
2036	popl	%ebp
2037	ret
2038.size	poly1305_init,.-.L_poly1305_init_begin
/*
 * poly1305_blocks -- exported scalar block function (accumulator kept as
 * five 32-bit words).
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp)  context pointer: words 0..16 are the accumulator h0..h4,
 *             words 24..36 the clamped key r0..r3
 *   24(%esp)  input pointer
 *   28(%esp)  length in bytes; rounded down to a multiple of 16
 *   32(%esp)  4th argument, added into the top accumulator word with every
 *             block
 * .Lenter_blocks is a second entry used by the vector block routines, which
 * jump here with %edi, %esi and %ecx loaded and the same four registers
 * pushed.
 * %ebp, %ebx, %esi and %edi are saved and restored.
 *
 * The length mask is -16, matching the vector routines.  The previous mask
 * of -15 left bit 0 of the length in place; the end pointer computed from
 * such a length is never hit by the 16-byte steps of the loop, whose exit
 * test is an equality compare.
 */
2039.globl	poly1305_blocks
2040.type	poly1305_blocks,@function
2041.align	16
2042poly1305_blocks:
2043.L_poly1305_blocks_begin:
/* The bytes below encode endbr32; they are assembled only under __CET__. */
2044	#ifdef __CET__
2045
2046.byte	243,15,30,251
2047	#endif
2048
2049	pushl	%ebp
2050	pushl	%ebx
2051	pushl	%esi
2052	pushl	%edi
2053	movl	20(%esp),%edi
2054	movl	24(%esp),%esi
2055	movl	28(%esp),%ecx
2056.Lenter_blocks:
/* Whole 16-byte blocks only; return at once when there are none. */
2057	andl	$-16,%ecx
2058	jz	.L003nodata
/*
 * 64-byte frame.  From here the arguments sit at 84(%esp) (context),
 * 88 (input), 92 (length) and 96 (4th argument).  The length slot is
 * reused for the end-of-input pointer.
 *   36..48(%esp)  r0..r3
 *   52..60(%esp)  s1..s3 = r + (r >> 2) for r1..r3
 *    0..16(%esp)  spilled accumulator words
 *   20..28(%esp)  low three words of the product
 */
2059	subl	$64,%esp
2060	movl	24(%edi),%eax
2061	movl	28(%edi),%ebx
2062	leal	(%esi,%ecx,1),%ebp
2063	movl	32(%edi),%ecx
2064	movl	36(%edi),%edx
2065	movl	%ebp,92(%esp)
2066	movl	%esi,%ebp
2067	movl	%eax,36(%esp)
2068	movl	%ebx,%eax
2069	shrl	$2,%eax
2070	movl	%ebx,40(%esp)
2071	addl	%ebx,%eax
2072	movl	%ecx,%ebx
2073	shrl	$2,%ebx
2074	movl	%ecx,44(%esp)
2075	addl	%ecx,%ebx
2076	movl	%edx,%ecx
2077	shrl	$2,%ecx
2078	movl	%edx,48(%esp)
2079	addl	%edx,%ecx
2080	movl	%eax,52(%esp)
2081	movl	%ebx,56(%esp)
2082	movl	%ecx,60(%esp)
/* Accumulator: h0=%eax h1=%ebx h2=%ecx h3=%esi h4=%edi; %ebp = input cursor. */
2083	movl	(%edi),%eax
2084	movl	4(%edi),%ebx
2085	movl	8(%edi),%ecx
2086	movl	12(%edi),%esi
2087	movl	16(%edi),%edi
2088	jmp	.L004loop
2089.align	32
2090.L004loop:
/* h += next 16 input bytes, with the 4th argument added into h4. */
2091	addl	(%ebp),%eax
2092	adcl	4(%ebp),%ebx
2093	adcl	8(%ebp),%ecx
2094	adcl	12(%ebp),%esi
2095	leal	16(%ebp),%ebp
2096	adcl	96(%esp),%edi
2097	movl	%eax,(%esp)
2098	movl	%esi,12(%esp)
/* d0 = h0*r0 + h1*s3 + h2*s2 + h3*s1 (low word kept, high word carried on) */
2099	mull	36(%esp)
2100	movl	%edi,16(%esp)
2101	movl	%eax,%edi
2102	movl	%ebx,%eax
2103	movl	%edx,%esi
2104	mull	60(%esp)
2105	addl	%eax,%edi
2106	movl	%ecx,%eax
2107	adcl	%edx,%esi
2108	mull	56(%esp)
2109	addl	%eax,%edi
2110	movl	12(%esp),%eax
2111	adcl	%edx,%esi
2112	mull	52(%esp)
2113	addl	%eax,%edi
2114	movl	(%esp),%eax
2115	adcl	%edx,%esi
/* d1 = carry + h0*r1 + h1*r0 + h2*s3 + h3*s2 + h4*s1 */
2116	mull	40(%esp)
2117	movl	%edi,20(%esp)
2118	xorl	%edi,%edi
2119	addl	%eax,%esi
2120	movl	%ebx,%eax
2121	adcl	%edx,%edi
2122	mull	36(%esp)
2123	addl	%eax,%esi
2124	movl	%ecx,%eax
2125	adcl	%edx,%edi
2126	mull	60(%esp)
2127	addl	%eax,%esi
2128	movl	12(%esp),%eax
2129	adcl	%edx,%edi
2130	mull	56(%esp)
2131	addl	%eax,%esi
2132	movl	16(%esp),%eax
2133	adcl	%edx,%edi
2134	imull	52(%esp),%eax
2135	addl	%eax,%esi
2136	movl	(%esp),%eax
2137	adcl	$0,%edi
/* d2 = carry + h0*r2 + h1*r1 + h2*r0 + h3*s3 + h4*s2 */
2138	mull	44(%esp)
2139	movl	%esi,24(%esp)
2140	xorl	%esi,%esi
2141	addl	%eax,%edi
2142	movl	%ebx,%eax
2143	adcl	%edx,%esi
2144	mull	40(%esp)
2145	addl	%eax,%edi
2146	movl	%ecx,%eax
2147	adcl	%edx,%esi
2148	mull	36(%esp)
2149	addl	%eax,%edi
2150	movl	12(%esp),%eax
2151	adcl	%edx,%esi
2152	mull	60(%esp)
2153	addl	%eax,%edi
2154	movl	16(%esp),%eax
2155	adcl	%edx,%esi
2156	imull	56(%esp),%eax
2157	addl	%eax,%edi
2158	movl	(%esp),%eax
2159	adcl	$0,%esi
/* d3 = carry + h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*s3 */
2160	mull	48(%esp)
2161	movl	%edi,28(%esp)
2162	xorl	%edi,%edi
2163	addl	%eax,%esi
2164	movl	%ebx,%eax
2165	adcl	%edx,%edi
2166	mull	44(%esp)
2167	addl	%eax,%esi
2168	movl	%ecx,%eax
2169	adcl	%edx,%edi
2170	mull	40(%esp)
2171	addl	%eax,%esi
2172	movl	12(%esp),%eax
2173	adcl	%edx,%edi
2174	mull	36(%esp)
2175	addl	%eax,%esi
2176	movl	16(%esp),%ecx
2177	adcl	%edx,%edi
2178	movl	%ecx,%edx
2179	imull	60(%esp),%ecx
2180	addl	%ecx,%esi
2181	movl	20(%esp),%eax
2182	adcl	$0,%edi
/* d4 = carry + h4*r0 */
2183	imull	36(%esp),%edx
2184	addl	%edi,%edx
2185	movl	24(%esp),%ebx
2186	movl	28(%esp),%ecx
/*
 * Partial reduction: keep the low two bits of d4 as the new h4, multiply
 * the rest (d4 >> 2) by 5 and add it into h0, rippling the carry upward.
 */
2187	movl	%edx,%edi
2188	shrl	$2,%edx
2189	andl	$3,%edi
2190	leal	(%edx,%edx,4),%edx
2191	addl	%edx,%eax
2192	adcl	$0,%ebx
2193	adcl	$0,%ecx
2194	adcl	$0,%esi
2195	adcl	$0,%edi
/* Stop when the cursor reaches the end pointer stored in the length slot. */
2196	cmpl	92(%esp),%ebp
2197	jne	.L004loop
/* Reload the context pointer from its argument slot and store h0..h4. */
2198	movl	84(%esp),%edx
2199	addl	$64,%esp
2200	movl	%eax,(%edx)
2201	movl	%ebx,4(%edx)
2202	movl	%ecx,8(%edx)
2203	movl	%esi,12(%edx)
2204	movl	%edi,16(%edx)
2205.L003nodata:
2206	popl	%edi
2207	popl	%esi
2208	popl	%ebx
2209	popl	%ebp
2210	ret
2211.size	poly1305_blocks,.-.L_poly1305_blocks_begin
/*
 * poly1305_emit -- exported scalar finalisation routine.
 * Stack arguments (offsets are after the four pushes below):
 *   20(%esp)  context pointer; words 0..16 are the accumulator h0..h4
 *   24(%esp)  output buffer, 16 bytes are written
 *   28(%esp)  pointer to four 32-bit words that are added to the result
 * .Lenter_emit is a second entry that expects %ebp to hold the context
 * pointer already; no user of it is visible in this part of the file.
 * %ebp, %ebx, %esi and %edi are saved and restored.
 */
2212.globl	poly1305_emit
2213.type	poly1305_emit,@function
2214.align	16
2215poly1305_emit:
2216.L_poly1305_emit_begin:
/* The bytes below encode endbr32; they are assembled only under __CET__. */
2217	#ifdef __CET__
2218
2219.byte	243,15,30,251
2220	#endif
2221
2222	pushl	%ebp
2223	pushl	%ebx
2224	pushl	%esi
2225	pushl	%edi
2226	movl	20(%esp),%ebp
2227.Lenter_emit:
2228	movl	24(%esp),%edi
2229	movl	(%ebp),%eax
2230	movl	4(%ebp),%ebx
2231	movl	8(%ebp),%ecx
2232	movl	12(%ebp),%edx
2233	movl	16(%ebp),%esi
/* Compute h + 5 across all five words. */
2234	addl	$5,%eax
2235	adcl	$0,%ebx
2236	adcl	$0,%ecx
2237	adcl	$0,%edx
2238	adcl	$0,%esi
/*
 * Bit 2 of the top word is bit 130 of the sum.  Shifting it down and
 * negating turns it into a mask: all ones if h + 5 reached 2^130, else zero.
 */
2239	shrl	$2,%esi
2240	negl	%esi
/* Masked h + 5 is parked in the output buffer for the moment. */
2241	andl	%esi,%eax
2242	andl	%esi,%ebx
2243	andl	%esi,%ecx
2244	andl	%esi,%edx
2245	movl	%eax,(%edi)
2246	movl	%ebx,4(%edi)
2247	movl	%ecx,8(%edi)
2248	movl	%edx,12(%edi)
/* Inverted mask selects the original h; %ebp is then reused for argument 3. */
2249	notl	%esi
2250	movl	(%ebp),%eax
2251	movl	4(%ebp),%ebx
2252	movl	8(%ebp),%ecx
2253	movl	12(%ebp),%edx
2254	movl	28(%esp),%ebp
2255	andl	%esi,%eax
2256	andl	%esi,%ebx
2257	andl	%esi,%ecx
2258	andl	%esi,%edx
/* Branch-free select: exactly one of the two masked values is non-zero. */
2259	orl	(%edi),%eax
2260	orl	4(%edi),%ebx
2261	orl	8(%edi),%ecx
2262	orl	12(%edi),%edx
/* Add the four words from argument 3; the carry out of bit 127 is dropped. */
2263	addl	(%ebp),%eax
2264	adcl	4(%ebp),%ebx
2265	adcl	8(%ebp),%ecx
2266	adcl	12(%ebp),%edx
2267	movl	%eax,(%edi)
2268	movl	%ebx,4(%edi)
2269	movl	%ecx,8(%edi)
2270	movl	%edx,12(%edi)
2271	popl	%edi
2272	popl	%esi
2273	popl	%ebx
2274	popl	%ebp
2275	ret
2276.size	poly1305_emit,.-.L_poly1305_emit_begin
/*
 * _poly1305_init_sse2 -- file-local helper (no .globl) reached by a plain
 * "call" from _poly1305_blocks_sse2.  It uses a private register convention
 * rather than the C one:
 *   in:   %edi = context base; the 16 bytes at 24(%edi) are the multiplier
 *                (poly1305_init stores the clamped key words there)
 *         %ebx = address of .Lconst_sse2; 64(%ebx) holds 2^26-1 per 64-bit lane
 *   out:  144 bytes written at 48(%edi)..191(%edi); %edi itself is restored
 *   uses: %ecx, %edx, %ebp, %xmm0-%xmm7 and the flags, none of them saved.
 * A scratch frame (224 bytes, then rounded down to 16-byte alignment) is taken
 * below %esp and released through %ebp before the return.
 */
2277.align	32
2278.type	_poly1305_init_sse2,@function
2279.align	16
2280_poly1305_init_sse2:
/* The bytes below encode endbr32; they are assembled only under __CET__. */
2281	#ifdef __CET__
2282
2283.byte	243,15,30,251
2284	#endif
2285
/* Load the 128-bit value, point %edi at the output table, build the frame. */
2286	movdqu	24(%edi),%xmm4
2287	leal	48(%edi),%edi
2288	movl	%esp,%ebp
2289	subl	$224,%esp
2290	andl	$-16,%esp
/*
 * Split the value into five 26-bit limbs starting at bits 0, 26, 52, 78 and
 * 104.  movq loads the mask into the low lane only, so the high lanes of
 * limbs 0..3 are zero here.
 */
2291	movq	64(%ebx),%xmm7
2292	movdqa	%xmm4,%xmm0
2293	movdqa	%xmm4,%xmm1
2294	movdqa	%xmm4,%xmm2
2295	pand	%xmm7,%xmm0
2296	psrlq	$26,%xmm1
2297	psrldq	$6,%xmm2
2298	pand	%xmm7,%xmm1
2299	movdqa	%xmm2,%xmm3
2300	psrlq	$4,%xmm2
2301	psrlq	$30,%xmm3
2302	pand	%xmm7,%xmm2
2303	pand	%xmm7,%xmm3
2304	psrldq	$13,%xmm4
/* %edx addresses the second operand area; %ecx counts the two passes. */
2305	leal	144(%esp),%edx
2306	movl	$2,%ecx
2307.L005square:
/* Park the five limbs at 0..64(%esp) ... */
2308	movdqa	%xmm0,(%esp)
2309	movdqa	%xmm1,16(%esp)
2310	movdqa	%xmm2,32(%esp)
2311	movdqa	%xmm3,48(%esp)
2312	movdqa	%xmm4,64(%esp)
/* ... and 5*limb1..5*limb4, formed as (x<<2)+x, at 80..128(%esp). */
2313	movdqa	%xmm1,%xmm6
2314	movdqa	%xmm2,%xmm5
2315	pslld	$2,%xmm6
2316	pslld	$2,%xmm5
2317	paddd	%xmm1,%xmm6
2318	paddd	%xmm2,%xmm5
2319	movdqa	%xmm6,80(%esp)
2320	movdqa	%xmm5,96(%esp)
2321	movdqa	%xmm3,%xmm6
2322	movdqa	%xmm4,%xmm5
2323	pslld	$2,%xmm6
2324	pslld	$2,%xmm5
2325	paddd	%xmm3,%xmm6
2326	paddd	%xmm4,%xmm5
2327	movdqa	%xmm6,112(%esp)
2328	movdqa	%xmm5,128(%esp)
/*
 * Second operand: the low 64-bit lane of every limb duplicated into both
 * lanes (shuffle $68) and stored at 0..64(%edx).  %xmm5 keeps the original
 * limb 1 because %xmm1 is about to be overwritten.
 */
2329	pshufd	$68,%xmm0,%xmm6
2330	movdqa	%xmm1,%xmm5
2331	pshufd	$68,%xmm1,%xmm1
2332	pshufd	$68,%xmm2,%xmm2
2333	pshufd	$68,%xmm3,%xmm3
2334	pshufd	$68,%xmm4,%xmm4
2335	movdqa	%xmm6,(%edx)
2336	movdqa	%xmm1,16(%edx)
2337	movdqa	%xmm2,32(%edx)
2338	movdqa	%xmm3,48(%edx)
2339	movdqa	%xmm4,64(%edx)
/*
 * Limb-by-limb product, accumulated as 64-bit sums in %xmm0..%xmm4.  The
 * two-operand pmuludq overwrites its destination, hence the register copies
 * before each multiply.  Terms whose limb indices sum to 5 or more use the
 * 5x copies from 80..128(%esp), which folds them back into the low limbs.
 */
2340	pmuludq	%xmm0,%xmm4
2341	pmuludq	%xmm0,%xmm3
2342	pmuludq	%xmm0,%xmm2
2343	pmuludq	%xmm0,%xmm1
2344	pmuludq	%xmm6,%xmm0
2345	movdqa	%xmm5,%xmm6
2346	pmuludq	48(%edx),%xmm5
2347	movdqa	%xmm6,%xmm7
2348	pmuludq	32(%edx),%xmm6
2349	paddq	%xmm5,%xmm4
2350	movdqa	%xmm7,%xmm5
2351	pmuludq	16(%edx),%xmm7
2352	paddq	%xmm6,%xmm3
2353	movdqa	80(%esp),%xmm6
2354	pmuludq	(%edx),%xmm5
2355	paddq	%xmm7,%xmm2
2356	pmuludq	64(%edx),%xmm6
2357	movdqa	32(%esp),%xmm7
2358	paddq	%xmm5,%xmm1
2359	movdqa	%xmm7,%xmm5
2360	pmuludq	32(%edx),%xmm7
2361	paddq	%xmm6,%xmm0
2362	movdqa	%xmm5,%xmm6
2363	pmuludq	16(%edx),%xmm5
2364	paddq	%xmm7,%xmm4
2365	movdqa	96(%esp),%xmm7
2366	pmuludq	(%edx),%xmm6
2367	paddq	%xmm5,%xmm3
2368	movdqa	%xmm7,%xmm5
2369	pmuludq	64(%edx),%xmm7
2370	paddq	%xmm6,%xmm2
2371	pmuludq	48(%edx),%xmm5
2372	movdqa	48(%esp),%xmm6
2373	paddq	%xmm7,%xmm1
2374	movdqa	%xmm6,%xmm7
2375	pmuludq	16(%edx),%xmm6
2376	paddq	%xmm5,%xmm0
2377	movdqa	112(%esp),%xmm5
2378	pmuludq	(%edx),%xmm7
2379	paddq	%xmm6,%xmm4
2380	movdqa	%xmm5,%xmm6
2381	pmuludq	64(%edx),%xmm5
2382	paddq	%xmm7,%xmm3
2383	movdqa	%xmm6,%xmm7
2384	pmuludq	48(%edx),%xmm6
2385	paddq	%xmm5,%xmm2
2386	pmuludq	32(%edx),%xmm7
2387	movdqa	64(%esp),%xmm5
2388	paddq	%xmm6,%xmm1
2389	movdqa	128(%esp),%xmm6
2390	pmuludq	(%edx),%xmm5
2391	paddq	%xmm7,%xmm0
2392	movdqa	%xmm6,%xmm7
2393	pmuludq	64(%edx),%xmm6
2394	paddq	%xmm5,%xmm4
2395	movdqa	%xmm7,%xmm5
2396	pmuludq	16(%edx),%xmm7
2397	paddq	%xmm6,%xmm3
2398	movdqa	%xmm5,%xmm6
2399	pmuludq	32(%edx),%xmm5
2400	paddq	%xmm7,%xmm0
2401	pmuludq	48(%edx),%xmm6
2402	movdqa	64(%ebx),%xmm7
2403	paddq	%xmm5,%xmm1
2404	paddq	%xmm6,%xmm2
/*
 * Carry propagation back to 26-bit limbs (full-width mask reloaded into
 * %xmm7 above).  The carry leaving limb 4 re-enters limb 0 five times over:
 * it is added once, shifted left by 2 and added again.
 */
2405	movdqa	%xmm3,%xmm5
2406	pand	%xmm7,%xmm3
2407	psrlq	$26,%xmm5
2408	paddq	%xmm4,%xmm5
2409	movdqa	%xmm0,%xmm6
2410	pand	%xmm7,%xmm0
2411	psrlq	$26,%xmm6
2412	movdqa	%xmm5,%xmm4
2413	paddq	%xmm1,%xmm6
2414	psrlq	$26,%xmm5
2415	pand	%xmm7,%xmm4
2416	movdqa	%xmm6,%xmm1
2417	psrlq	$26,%xmm6
2418	paddd	%xmm5,%xmm0
2419	psllq	$2,%xmm5
2420	paddq	%xmm2,%xmm6
2421	paddq	%xmm0,%xmm5
2422	pand	%xmm7,%xmm1
2423	movdqa	%xmm6,%xmm2
2424	psrlq	$26,%xmm6
2425	pand	%xmm7,%xmm2
2426	paddd	%xmm3,%xmm6
2427	movdqa	%xmm5,%xmm0
2428	psrlq	$26,%xmm5
2429	movdqa	%xmm6,%xmm3
2430	psrlq	$26,%xmm6
2431	pand	%xmm7,%xmm0
2432	paddd	%xmm5,%xmm1
2433	pand	%xmm7,%xmm3
2434	paddd	%xmm6,%xmm4
/*
 * After the first pass, pair the fresh result (low lane) with the saved
 * operand (moved to the high lane) and multiply once more.  The second pass
 * therefore yields two further products at once.
 */
2435	decl	%ecx
2436	jz	.L006square_break
2437	punpcklqdq	(%esp),%xmm0
2438	punpcklqdq	16(%esp),%xmm1
2439	punpcklqdq	32(%esp),%xmm2
2440	punpcklqdq	48(%esp),%xmm3
2441	punpcklqdq	64(%esp),%xmm4
2442	jmp	.L005square
2443.L006square_break:
/*
 * Interleave the second-pass results (moved to the odd dwords) with the
 * operands saved at 0..64(%esp), then reorder the four dwords with shuffle
 * $141.  Tracing the lanes, each row then reads, from dword 0 upward, the
 * 4th, 3rd, 2nd and 1st power of the input value for that limb.
 */
2444	psllq	$32,%xmm0
2445	psllq	$32,%xmm1
2446	psllq	$32,%xmm2
2447	psllq	$32,%xmm3
2448	psllq	$32,%xmm4
2449	por	(%esp),%xmm0
2450	por	16(%esp),%xmm1
2451	por	32(%esp),%xmm2
2452	por	48(%esp),%xmm3
2453	por	64(%esp),%xmm4
2454	pshufd	$141,%xmm0,%xmm0
2455	pshufd	$141,%xmm1,%xmm1
2456	pshufd	$141,%xmm2,%xmm2
2457	pshufd	$141,%xmm3,%xmm3
2458	pshufd	$141,%xmm4,%xmm4
/* Rows 0..4 of the table: limbs 0..4 (unaligned stores into the context). */
2459	movdqu	%xmm0,(%edi)
2460	movdqu	%xmm1,16(%edi)
2461	movdqu	%xmm2,32(%edi)
2462	movdqu	%xmm3,48(%edi)
2463	movdqu	%xmm4,64(%edi)
/* Rows 5..8 of the table: 5*limb1 .. 5*limb4. */
2464	movdqa	%xmm1,%xmm6
2465	movdqa	%xmm2,%xmm5
2466	pslld	$2,%xmm6
2467	pslld	$2,%xmm5
2468	paddd	%xmm1,%xmm6
2469	paddd	%xmm2,%xmm5
2470	movdqu	%xmm6,80(%edi)
2471	movdqu	%xmm5,96(%edi)
2472	movdqa	%xmm3,%xmm6
2473	movdqa	%xmm4,%xmm5
2474	pslld	$2,%xmm6
2475	pslld	$2,%xmm5
2476	paddd	%xmm3,%xmm6
2477	paddd	%xmm4,%xmm5
2478	movdqu	%xmm6,112(%edi)
2479	movdqu	%xmm5,128(%edi)
/* Drop the scratch frame and hand %edi back pointing at the context base. */
2480	movl	%ebp,%esp
2481	leal	-48(%edi),%edi
2482	ret
2483.size	_poly1305_init_sse2,.-_poly1305_init_sse2
2484.align	32
2485.type	_poly1305_blocks_sse2,@function
2486.align	16
2487_poly1305_blocks_sse2:
2488	#ifdef __CET__
2489
2490.byte	243,15,30,251
2491	#endif
2492
2493	pushl	%ebp
2494	pushl	%ebx
2495	pushl	%esi
2496	pushl	%edi
2497	movl	20(%esp),%edi
2498	movl	24(%esp),%esi
2499	movl	28(%esp),%ecx
2500	movl	20(%edi),%eax
2501	andl	$-16,%ecx
2502	jz	.L007nodata
2503	cmpl	$64,%ecx
2504	jae	.L008enter_sse2
2505	testl	%eax,%eax
2506	jz	.Lenter_blocks
2507.align	16
2508.L008enter_sse2:
2509	call	.L009pic_point
2510.L009pic_point:
2511	popl	%ebx
2512	leal	.Lconst_sse2-.L009pic_point(%ebx),%ebx
2513	testl	%eax,%eax
2514	jnz	.L010base2_26
2515	call	_poly1305_init_sse2
2516	movl	(%edi),%eax
2517	movl	3(%edi),%ecx
2518	movl	6(%edi),%edx
2519	movl	9(%edi),%esi
2520	movl	13(%edi),%ebp
2521	movl	$1,20(%edi)
2522	shrl	$2,%ecx
2523	andl	$67108863,%eax
2524	shrl	$4,%edx
2525	andl	$67108863,%ecx
2526	shrl	$6,%esi
2527	andl	$67108863,%edx
2528	movd	%eax,%xmm0
2529	movd	%ecx,%xmm1
2530	movd	%edx,%xmm2
2531	movd	%esi,%xmm3
2532	movd	%ebp,%xmm4
2533	movl	24(%esp),%esi
2534	movl	28(%esp),%ecx
2535	jmp	.L011base2_32
2536.align	16
2537.L010base2_26:
2538	movd	(%edi),%xmm0
2539	movd	4(%edi),%xmm1
2540	movd	8(%edi),%xmm2
2541	movd	12(%edi),%xmm3
2542	movd	16(%edi),%xmm4
2543	movdqa	64(%ebx),%xmm7
2544.L011base2_32:
2545	movl	32(%esp),%eax
2546	movl	%esp,%ebp
2547	subl	$528,%esp
2548	andl	$-16,%esp
2549	leal	48(%edi),%edi
2550	shll	$24,%eax
2551	testl	$31,%ecx
2552	jz	.L012even
2553	movdqu	(%esi),%xmm6
2554	leal	16(%esi),%esi
2555	movdqa	%xmm6,%xmm5
2556	pand	%xmm7,%xmm6
2557	paddd	%xmm6,%xmm0
2558	movdqa	%xmm5,%xmm6
2559	psrlq	$26,%xmm5
2560	psrldq	$6,%xmm6
2561	pand	%xmm7,%xmm5
2562	paddd	%xmm5,%xmm1
2563	movdqa	%xmm6,%xmm5
2564	psrlq	$4,%xmm6
2565	pand	%xmm7,%xmm6
2566	paddd	%xmm6,%xmm2
2567	movdqa	%xmm5,%xmm6
2568	psrlq	$30,%xmm5
2569	pand	%xmm7,%xmm5
2570	psrldq	$7,%xmm6
2571	paddd	%xmm5,%xmm3
2572	movd	%eax,%xmm5
2573	paddd	%xmm6,%xmm4
2574	movd	12(%edi),%xmm6
2575	paddd	%xmm5,%xmm4
2576	movdqa	%xmm0,(%esp)
2577	movdqa	%xmm1,16(%esp)
2578	movdqa	%xmm2,32(%esp)
2579	movdqa	%xmm3,48(%esp)
2580	movdqa	%xmm4,64(%esp)
2581	pmuludq	%xmm6,%xmm0
2582	pmuludq	%xmm6,%xmm1
2583	pmuludq	%xmm6,%xmm2
2584	movd	28(%edi),%xmm5
2585	pmuludq	%xmm6,%xmm3
2586	pmuludq	%xmm6,%xmm4
2587	movdqa	%xmm5,%xmm6
2588	pmuludq	48(%esp),%xmm5
2589	movdqa	%xmm6,%xmm7
2590	pmuludq	32(%esp),%xmm6
2591	paddq	%xmm5,%xmm4
2592	movdqa	%xmm7,%xmm5
2593	pmuludq	16(%esp),%xmm7
2594	paddq	%xmm6,%xmm3
2595	movd	92(%edi),%xmm6
2596	pmuludq	(%esp),%xmm5
2597	paddq	%xmm7,%xmm2
2598	pmuludq	64(%esp),%xmm6
2599	movd	44(%edi),%xmm7
2600	paddq	%xmm5,%xmm1
2601	movdqa	%xmm7,%xmm5
2602	pmuludq	32(%esp),%xmm7
2603	paddq	%xmm6,%xmm0
2604	movdqa	%xmm5,%xmm6
2605	pmuludq	16(%esp),%xmm5
2606	paddq	%xmm7,%xmm4
2607	movd	108(%edi),%xmm7
2608	pmuludq	(%esp),%xmm6
2609	paddq	%xmm5,%xmm3
2610	movdqa	%xmm7,%xmm5
2611	pmuludq	64(%esp),%xmm7
2612	paddq	%xmm6,%xmm2
2613	pmuludq	48(%esp),%xmm5
2614	movd	60(%edi),%xmm6
2615	paddq	%xmm7,%xmm1
2616	movdqa	%xmm6,%xmm7
2617	pmuludq	16(%esp),%xmm6
2618	paddq	%xmm5,%xmm0
2619	movd	124(%edi),%xmm5
2620	pmuludq	(%esp),%xmm7
2621	paddq	%xmm6,%xmm4
2622	movdqa	%xmm5,%xmm6
2623	pmuludq	64(%esp),%xmm5
2624	paddq	%xmm7,%xmm3
2625	movdqa	%xmm6,%xmm7
2626	pmuludq	48(%esp),%xmm6
2627	paddq	%xmm5,%xmm2
2628	pmuludq	32(%esp),%xmm7
2629	movd	76(%edi),%xmm5
2630	paddq	%xmm6,%xmm1
2631	movd	140(%edi),%xmm6
2632	pmuludq	(%esp),%xmm5
2633	paddq	%xmm7,%xmm0
2634	movdqa	%xmm6,%xmm7
2635	pmuludq	64(%esp),%xmm6
2636	paddq	%xmm5,%xmm4
2637	movdqa	%xmm7,%xmm5
2638	pmuludq	16(%esp),%xmm7
2639	paddq	%xmm6,%xmm3
2640	movdqa	%xmm5,%xmm6
2641	pmuludq	32(%esp),%xmm5
2642	paddq	%xmm7,%xmm0
2643	pmuludq	48(%esp),%xmm6
2644	movdqa	64(%ebx),%xmm7
2645	paddq	%xmm5,%xmm1
2646	paddq	%xmm6,%xmm2
2647	movdqa	%xmm3,%xmm5
2648	pand	%xmm7,%xmm3
2649	psrlq	$26,%xmm5
2650	paddq	%xmm4,%xmm5
2651	movdqa	%xmm0,%xmm6
2652	pand	%xmm7,%xmm0
2653	psrlq	$26,%xmm6
2654	movdqa	%xmm5,%xmm4
2655	paddq	%xmm1,%xmm6
2656	psrlq	$26,%xmm5
2657	pand	%xmm7,%xmm4
2658	movdqa	%xmm6,%xmm1
2659	psrlq	$26,%xmm6
2660	paddd	%xmm5,%xmm0
2661	psllq	$2,%xmm5
2662	paddq	%xmm2,%xmm6
2663	paddq	%xmm0,%xmm5
2664	pand	%xmm7,%xmm1
2665	movdqa	%xmm6,%xmm2
2666	psrlq	$26,%xmm6
2667	pand	%xmm7,%xmm2
2668	paddd	%xmm3,%xmm6
2669	movdqa	%xmm5,%xmm0
2670	psrlq	$26,%xmm5
2671	movdqa	%xmm6,%xmm3
2672	psrlq	$26,%xmm6
2673	pand	%xmm7,%xmm0
2674	paddd	%xmm5,%xmm1
2675	pand	%xmm7,%xmm3
2676	paddd	%xmm6,%xmm4
2677	subl	$16,%ecx
2678	jz	.L013done
2679.L012even:
2680	leal	384(%esp),%edx
2681	leal	-32(%esi),%eax
2682	subl	$64,%ecx
2683	movdqu	(%edi),%xmm5
2684	pshufd	$68,%xmm5,%xmm6
2685	cmovbl	%eax,%esi
2686	pshufd	$238,%xmm5,%xmm5
2687	movdqa	%xmm6,(%edx)
2688	leal	160(%esp),%eax
2689	movdqu	16(%edi),%xmm6
2690	movdqa	%xmm5,-144(%edx)
2691	pshufd	$68,%xmm6,%xmm5
2692	pshufd	$238,%xmm6,%xmm6
2693	movdqa	%xmm5,16(%edx)
2694	movdqu	32(%edi),%xmm5
2695	movdqa	%xmm6,-128(%edx)
2696	pshufd	$68,%xmm5,%xmm6
2697	pshufd	$238,%xmm5,%xmm5
2698	movdqa	%xmm6,32(%edx)
2699	movdqu	48(%edi),%xmm6
2700	movdqa	%xmm5,-112(%edx)
2701	pshufd	$68,%xmm6,%xmm5
2702	pshufd	$238,%xmm6,%xmm6
2703	movdqa	%xmm5,48(%edx)
2704	movdqu	64(%edi),%xmm5
2705	movdqa	%xmm6,-96(%edx)
2706	pshufd	$68,%xmm5,%xmm6
2707	pshufd	$238,%xmm5,%xmm5
2708	movdqa	%xmm6,64(%edx)
2709	movdqu	80(%edi),%xmm6
2710	movdqa	%xmm5,-80(%edx)
2711	pshufd	$68,%xmm6,%xmm5
2712	pshufd	$238,%xmm6,%xmm6
2713	movdqa	%xmm5,80(%edx)
2714	movdqu	96(%edi),%xmm5
2715	movdqa	%xmm6,-64(%edx)
2716	pshufd	$68,%xmm5,%xmm6
2717	pshufd	$238,%xmm5,%xmm5
2718	movdqa	%xmm6,96(%edx)
2719	movdqu	112(%edi),%xmm6
2720	movdqa	%xmm5,-48(%edx)
2721	pshufd	$68,%xmm6,%xmm5
2722	pshufd	$238,%xmm6,%xmm6
2723	movdqa	%xmm5,112(%edx)
2724	movdqu	128(%edi),%xmm5
2725	movdqa	%xmm6,-32(%edx)
2726	pshufd	$68,%xmm5,%xmm6
2727	pshufd	$238,%xmm5,%xmm5
2728	movdqa	%xmm6,128(%edx)
2729	movdqa	%xmm5,-16(%edx)
2730	movdqu	32(%esi),%xmm5
2731	movdqu	48(%esi),%xmm6
2732	leal	32(%esi),%esi
2733	movdqa	%xmm2,112(%esp)
2734	movdqa	%xmm3,128(%esp)
2735	movdqa	%xmm4,144(%esp)
2736	movdqa	%xmm5,%xmm2
2737	movdqa	%xmm6,%xmm3
2738	psrldq	$6,%xmm2
2739	psrldq	$6,%xmm3
2740	movdqa	%xmm5,%xmm4
2741	punpcklqdq	%xmm3,%xmm2
2742	punpckhqdq	%xmm6,%xmm4
2743	punpcklqdq	%xmm6,%xmm5
2744	movdqa	%xmm2,%xmm3
2745	psrlq	$4,%xmm2
2746	psrlq	$30,%xmm3
2747	movdqa	%xmm5,%xmm6
2748	psrlq	$40,%xmm4
2749	psrlq	$26,%xmm6
2750	pand	%xmm7,%xmm5
2751	pand	%xmm7,%xmm6
2752	pand	%xmm7,%xmm2
2753	pand	%xmm7,%xmm3
2754	por	(%ebx),%xmm4
2755	movdqa	%xmm0,80(%esp)
2756	movdqa	%xmm1,96(%esp)
2757	jbe	.L014skip_loop
2758	jmp	.L015loop
2759.align	32
2760.L015loop:
2761	movdqa	-144(%edx),%xmm7
2762	movdqa	%xmm6,16(%eax)
2763	movdqa	%xmm2,32(%eax)
2764	movdqa	%xmm3,48(%eax)
2765	movdqa	%xmm4,64(%eax)
2766	movdqa	%xmm5,%xmm1
2767	pmuludq	%xmm7,%xmm5
2768	movdqa	%xmm6,%xmm0
2769	pmuludq	%xmm7,%xmm6
2770	pmuludq	%xmm7,%xmm2
2771	pmuludq	%xmm7,%xmm3
2772	pmuludq	%xmm7,%xmm4
2773	pmuludq	-16(%edx),%xmm0
2774	movdqa	%xmm1,%xmm7
2775	pmuludq	-128(%edx),%xmm1
2776	paddq	%xmm5,%xmm0
2777	movdqa	%xmm7,%xmm5
2778	pmuludq	-112(%edx),%xmm7
2779	paddq	%xmm6,%xmm1
2780	movdqa	%xmm5,%xmm6
2781	pmuludq	-96(%edx),%xmm5
2782	paddq	%xmm7,%xmm2
2783	movdqa	16(%eax),%xmm7
2784	pmuludq	-80(%edx),%xmm6
2785	paddq	%xmm5,%xmm3
2786	movdqa	%xmm7,%xmm5
2787	pmuludq	-128(%edx),%xmm7
2788	paddq	%xmm6,%xmm4
2789	movdqa	%xmm5,%xmm6
2790	pmuludq	-112(%edx),%xmm5
2791	paddq	%xmm7,%xmm2
2792	movdqa	32(%eax),%xmm7
2793	pmuludq	-96(%edx),%xmm6
2794	paddq	%xmm5,%xmm3
2795	movdqa	%xmm7,%xmm5
2796	pmuludq	-32(%edx),%xmm7
2797	paddq	%xmm6,%xmm4
2798	movdqa	%xmm5,%xmm6
2799	pmuludq	-16(%edx),%xmm5
2800	paddq	%xmm7,%xmm0
2801	movdqa	%xmm6,%xmm7
2802	pmuludq	-128(%edx),%xmm6
2803	paddq	%xmm5,%xmm1
2804	movdqa	48(%eax),%xmm5
2805	pmuludq	-112(%edx),%xmm7
2806	paddq	%xmm6,%xmm3
2807	movdqa	%xmm5,%xmm6
2808	pmuludq	-48(%edx),%xmm5
2809	paddq	%xmm7,%xmm4
2810	movdqa	%xmm6,%xmm7
2811	pmuludq	-32(%edx),%xmm6
2812	paddq	%xmm5,%xmm0
2813	movdqa	%xmm7,%xmm5
2814	pmuludq	-16(%edx),%xmm7
2815	paddq	%xmm6,%xmm1
2816	movdqa	64(%eax),%xmm6
2817	pmuludq	-128(%edx),%xmm5
2818	paddq	%xmm7,%xmm2
2819	movdqa	%xmm6,%xmm7
2820	pmuludq	-16(%edx),%xmm6
2821	paddq	%xmm5,%xmm4
2822	movdqa	%xmm7,%xmm5
2823	pmuludq	-64(%edx),%xmm7
2824	paddq	%xmm6,%xmm3
2825	movdqa	%xmm5,%xmm6
2826	pmuludq	-48(%edx),%xmm5
2827	paddq	%xmm7,%xmm0
2828	movdqa	64(%ebx),%xmm7
2829	pmuludq	-32(%edx),%xmm6
2830	paddq	%xmm5,%xmm1
2831	paddq	%xmm6,%xmm2
2832	movdqu	-32(%esi),%xmm5
2833	movdqu	-16(%esi),%xmm6
2834	leal	32(%esi),%esi
2835	movdqa	%xmm2,32(%esp)
2836	movdqa	%xmm3,48(%esp)
2837	movdqa	%xmm4,64(%esp)
2838	movdqa	%xmm5,%xmm2
2839	movdqa	%xmm6,%xmm3
2840	psrldq	$6,%xmm2
2841	psrldq	$6,%xmm3
2842	movdqa	%xmm5,%xmm4
2843	punpcklqdq	%xmm3,%xmm2
2844	punpckhqdq	%xmm6,%xmm4
2845	punpcklqdq	%xmm6,%xmm5
2846	movdqa	%xmm2,%xmm3
2847	psrlq	$4,%xmm2
2848	psrlq	$30,%xmm3
2849	movdqa	%xmm5,%xmm6
2850	psrlq	$40,%xmm4
2851	psrlq	$26,%xmm6
2852	pand	%xmm7,%xmm5
2853	pand	%xmm7,%xmm6
2854	pand	%xmm7,%xmm2
2855	pand	%xmm7,%xmm3
2856	por	(%ebx),%xmm4
2857	leal	-32(%esi),%eax
2858	subl	$64,%ecx
2859	paddd	80(%esp),%xmm5
2860	paddd	96(%esp),%xmm6
2861	paddd	112(%esp),%xmm2
2862	paddd	128(%esp),%xmm3
2863	paddd	144(%esp),%xmm4
2864	cmovbl	%eax,%esi
2865	leal	160(%esp),%eax
2866	movdqa	(%edx),%xmm7
2867	movdqa	%xmm1,16(%esp)
2868	movdqa	%xmm6,16(%eax)
2869	movdqa	%xmm2,32(%eax)
2870	movdqa	%xmm3,48(%eax)
2871	movdqa	%xmm4,64(%eax)
2872	movdqa	%xmm5,%xmm1
2873	pmuludq	%xmm7,%xmm5
2874	paddq	%xmm0,%xmm5
2875	movdqa	%xmm6,%xmm0
2876	pmuludq	%xmm7,%xmm6
2877	pmuludq	%xmm7,%xmm2
2878	pmuludq	%xmm7,%xmm3
2879	pmuludq	%xmm7,%xmm4
2880	paddq	16(%esp),%xmm6
2881	paddq	32(%esp),%xmm2
2882	paddq	48(%esp),%xmm3
2883	paddq	64(%esp),%xmm4
2884	pmuludq	128(%edx),%xmm0
2885	movdqa	%xmm1,%xmm7
2886	pmuludq	16(%edx),%xmm1
2887	paddq	%xmm5,%xmm0
2888	movdqa	%xmm7,%xmm5
2889	pmuludq	32(%edx),%xmm7
2890	paddq	%xmm6,%xmm1
2891	movdqa	%xmm5,%xmm6
2892	pmuludq	48(%edx),%xmm5
2893	paddq	%xmm7,%xmm2
2894	movdqa	16(%eax),%xmm7
2895	pmuludq	64(%edx),%xmm6
2896	paddq	%xmm5,%xmm3
2897	movdqa	%xmm7,%xmm5
2898	pmuludq	16(%edx),%xmm7
2899	paddq	%xmm6,%xmm4
2900	movdqa	%xmm5,%xmm6
2901	pmuludq	32(%edx),%xmm5
2902	paddq	%xmm7,%xmm2
2903	movdqa	32(%eax),%xmm7
2904	pmuludq	48(%edx),%xmm6
2905	paddq	%xmm5,%xmm3
2906	movdqa	%xmm7,%xmm5
2907	pmuludq	112(%edx),%xmm7
2908	paddq	%xmm6,%xmm4
2909	movdqa	%xmm5,%xmm6
2910	pmuludq	128(%edx),%xmm5
2911	paddq	%xmm7,%xmm0
2912	movdqa	%xmm6,%xmm7
2913	pmuludq	16(%edx),%xmm6
2914	paddq	%xmm5,%xmm1
2915	movdqa	48(%eax),%xmm5
2916	pmuludq	32(%edx),%xmm7
2917	paddq	%xmm6,%xmm3
2918	movdqa	%xmm5,%xmm6
2919	pmuludq	96(%edx),%xmm5
2920	paddq	%xmm7,%xmm4
2921	movdqa	%xmm6,%xmm7
2922	pmuludq	112(%edx),%xmm6
2923	paddq	%xmm5,%xmm0
2924	movdqa	%xmm7,%xmm5
2925	pmuludq	128(%edx),%xmm7
2926	paddq	%xmm6,%xmm1
2927	movdqa	64(%eax),%xmm6
2928	pmuludq	16(%edx),%xmm5
2929	paddq	%xmm7,%xmm2
2930	movdqa	%xmm6,%xmm7
2931	pmuludq	128(%edx),%xmm6
2932	paddq	%xmm5,%xmm4
2933	movdqa	%xmm7,%xmm5
2934	pmuludq	80(%edx),%xmm7
2935	paddq	%xmm6,%xmm3
2936	movdqa	%xmm5,%xmm6
2937	pmuludq	96(%edx),%xmm5
2938	paddq	%xmm7,%xmm0
2939	movdqa	64(%ebx),%xmm7
2940	pmuludq	112(%edx),%xmm6
2941	paddq	%xmm5,%xmm1
2942	paddq	%xmm6,%xmm2
2943	movdqa	%xmm3,%xmm5
2944	pand	%xmm7,%xmm3
2945	psrlq	$26,%xmm5
2946	paddq	%xmm4,%xmm5
2947	movdqa	%xmm0,%xmm6
2948	pand	%xmm7,%xmm0
2949	psrlq	$26,%xmm6
2950	movdqa	%xmm5,%xmm4
2951	paddq	%xmm1,%xmm6
2952	psrlq	$26,%xmm5
2953	pand	%xmm7,%xmm4
2954	movdqa	%xmm6,%xmm1
2955	psrlq	$26,%xmm6
2956	paddd	%xmm5,%xmm0
2957	psllq	$2,%xmm5
2958	paddq	%xmm2,%xmm6
2959	paddq	%xmm0,%xmm5
2960	pand	%xmm7,%xmm1
2961	movdqa	%xmm6,%xmm2
2962	psrlq	$26,%xmm6
2963	pand	%xmm7,%xmm2
2964	paddd	%xmm3,%xmm6
2965	movdqa	%xmm5,%xmm0
2966	psrlq	$26,%xmm5
2967	movdqa	%xmm6,%xmm3
2968	psrlq	$26,%xmm6
2969	pand	%xmm7,%xmm0
2970	paddd	%xmm5,%xmm1
2971	pand	%xmm7,%xmm3
2972	paddd	%xmm6,%xmm4
2973	movdqu	32(%esi),%xmm5
2974	movdqu	48(%esi),%xmm6
2975	leal	32(%esi),%esi
2976	movdqa	%xmm2,112(%esp)
2977	movdqa	%xmm3,128(%esp)
2978	movdqa	%xmm4,144(%esp)
2979	movdqa	%xmm5,%xmm2
2980	movdqa	%xmm6,%xmm3
2981	psrldq	$6,%xmm2
2982	psrldq	$6,%xmm3
2983	movdqa	%xmm5,%xmm4
2984	punpcklqdq	%xmm3,%xmm2
2985	punpckhqdq	%xmm6,%xmm4
2986	punpcklqdq	%xmm6,%xmm5
2987	movdqa	%xmm2,%xmm3
2988	psrlq	$4,%xmm2
2989	psrlq	$30,%xmm3
2990	movdqa	%xmm5,%xmm6
2991	psrlq	$40,%xmm4
2992	psrlq	$26,%xmm6
2993	pand	%xmm7,%xmm5
2994	pand	%xmm7,%xmm6
2995	pand	%xmm7,%xmm2
2996	pand	%xmm7,%xmm3
2997	por	(%ebx),%xmm4
2998	movdqa	%xmm0,80(%esp)
2999	movdqa	%xmm1,96(%esp)
3000	ja	.L015loop
3001.L014skip_loop:
3002	pshufd	$16,-144(%edx),%xmm7
3003	addl	$32,%ecx
3004	jnz	.L016long_tail
3005	paddd	%xmm0,%xmm5
3006	paddd	%xmm1,%xmm6
3007	paddd	112(%esp),%xmm2
3008	paddd	128(%esp),%xmm3
3009	paddd	144(%esp),%xmm4
3010.L016long_tail:
3011	movdqa	%xmm5,(%eax)
3012	movdqa	%xmm6,16(%eax)
3013	movdqa	%xmm2,32(%eax)
3014	movdqa	%xmm3,48(%eax)
3015	movdqa	%xmm4,64(%eax)
3016	pmuludq	%xmm7,%xmm5
3017	pmuludq	%xmm7,%xmm6
3018	pmuludq	%xmm7,%xmm2
3019	movdqa	%xmm5,%xmm0
3020	pshufd	$16,-128(%edx),%xmm5
3021	pmuludq	%xmm7,%xmm3
3022	movdqa	%xmm6,%xmm1
3023	pmuludq	%xmm7,%xmm4
3024	movdqa	%xmm5,%xmm6
3025	pmuludq	48(%eax),%xmm5
3026	movdqa	%xmm6,%xmm7
3027	pmuludq	32(%eax),%xmm6
3028	paddq	%xmm5,%xmm4
3029	movdqa	%xmm7,%xmm5
3030	pmuludq	16(%eax),%xmm7
3031	paddq	%xmm6,%xmm3
3032	pshufd	$16,-64(%edx),%xmm6
3033	pmuludq	(%eax),%xmm5
3034	paddq	%xmm7,%xmm2
3035	pmuludq	64(%eax),%xmm6
3036	pshufd	$16,-112(%edx),%xmm7
3037	paddq	%xmm5,%xmm1
3038	movdqa	%xmm7,%xmm5
3039	pmuludq	32(%eax),%xmm7
3040	paddq	%xmm6,%xmm0
3041	movdqa	%xmm5,%xmm6
3042	pmuludq	16(%eax),%xmm5
3043	paddq	%xmm7,%xmm4
3044	pshufd	$16,-48(%edx),%xmm7
3045	pmuludq	(%eax),%xmm6
3046	paddq	%xmm5,%xmm3
3047	movdqa	%xmm7,%xmm5
3048	pmuludq	64(%eax),%xmm7
3049	paddq	%xmm6,%xmm2
3050	pmuludq	48(%eax),%xmm5
3051	pshufd	$16,-96(%edx),%xmm6
3052	paddq	%xmm7,%xmm1
3053	movdqa	%xmm6,%xmm7
3054	pmuludq	16(%eax),%xmm6
3055	paddq	%xmm5,%xmm0
3056	pshufd	$16,-32(%edx),%xmm5
3057	pmuludq	(%eax),%xmm7
3058	paddq	%xmm6,%xmm4
3059	movdqa	%xmm5,%xmm6
3060	pmuludq	64(%eax),%xmm5
3061	paddq	%xmm7,%xmm3
3062	movdqa	%xmm6,%xmm7
3063	pmuludq	48(%eax),%xmm6
3064	paddq	%xmm5,%xmm2
3065	pmuludq	32(%eax),%xmm7
3066	pshufd	$16,-80(%edx),%xmm5
3067	paddq	%xmm6,%xmm1
3068	pshufd	$16,-16(%edx),%xmm6
3069	pmuludq	(%eax),%xmm5
3070	paddq	%xmm7,%xmm0
3071	movdqa	%xmm6,%xmm7
3072	pmuludq	64(%eax),%xmm6
3073	paddq	%xmm5,%xmm4
3074	movdqa	%xmm7,%xmm5
3075	pmuludq	16(%eax),%xmm7
3076	paddq	%xmm6,%xmm3
3077	movdqa	%xmm5,%xmm6
3078	pmuludq	32(%eax),%xmm5
3079	paddq	%xmm7,%xmm0
3080	pmuludq	48(%eax),%xmm6
3081	movdqa	64(%ebx),%xmm7
3082	paddq	%xmm5,%xmm1
3083	paddq	%xmm6,%xmm2
3084	jz	.L017short_tail
3085	movdqu	-32(%esi),%xmm5
3086	movdqu	-16(%esi),%xmm6
3087	leal	32(%esi),%esi
3088	movdqa	%xmm2,32(%esp)
3089	movdqa	%xmm3,48(%esp)
3090	movdqa	%xmm4,64(%esp)
3091	movdqa	%xmm5,%xmm2
3092	movdqa	%xmm6,%xmm3
3093	psrldq	$6,%xmm2
3094	psrldq	$6,%xmm3
3095	movdqa	%xmm5,%xmm4
3096	punpcklqdq	%xmm3,%xmm2
3097	punpckhqdq	%xmm6,%xmm4
3098	punpcklqdq	%xmm6,%xmm5
3099	movdqa	%xmm2,%xmm3
3100	psrlq	$4,%xmm2
3101	psrlq	$30,%xmm3
3102	movdqa	%xmm5,%xmm6
3103	psrlq	$40,%xmm4
3104	psrlq	$26,%xmm6
3105	pand	%xmm7,%xmm5
3106	pand	%xmm7,%xmm6
3107	pand	%xmm7,%xmm2
3108	pand	%xmm7,%xmm3
3109	por	(%ebx),%xmm4
3110	pshufd	$16,(%edx),%xmm7
3111	paddd	80(%esp),%xmm5
3112	paddd	96(%esp),%xmm6
3113	paddd	112(%esp),%xmm2
3114	paddd	128(%esp),%xmm3
3115	paddd	144(%esp),%xmm4
3116	movdqa	%xmm5,(%esp)
3117	pmuludq	%xmm7,%xmm5
3118	movdqa	%xmm6,16(%esp)
3119	pmuludq	%xmm7,%xmm6
3120	paddq	%xmm5,%xmm0
3121	movdqa	%xmm2,%xmm5
3122	pmuludq	%xmm7,%xmm2
3123	paddq	%xmm6,%xmm1
3124	movdqa	%xmm3,%xmm6
3125	pmuludq	%xmm7,%xmm3
3126	paddq	32(%esp),%xmm2
3127	movdqa	%xmm5,32(%esp)
3128	pshufd	$16,16(%edx),%xmm5
3129	paddq	48(%esp),%xmm3
3130	movdqa	%xmm6,48(%esp)
3131	movdqa	%xmm4,%xmm6
3132	pmuludq	%xmm7,%xmm4
3133	paddq	64(%esp),%xmm4
3134	movdqa	%xmm6,64(%esp)
3135	movdqa	%xmm5,%xmm6
3136	pmuludq	48(%esp),%xmm5
3137	movdqa	%xmm6,%xmm7
3138	pmuludq	32(%esp),%xmm6
3139	paddq	%xmm5,%xmm4
3140	movdqa	%xmm7,%xmm5
3141	pmuludq	16(%esp),%xmm7
3142	paddq	%xmm6,%xmm3
3143	pshufd	$16,80(%edx),%xmm6
3144	pmuludq	(%esp),%xmm5
3145	paddq	%xmm7,%xmm2
3146	pmuludq	64(%esp),%xmm6
3147	pshufd	$16,32(%edx),%xmm7
3148	paddq	%xmm5,%xmm1
3149	movdqa	%xmm7,%xmm5
3150	pmuludq	32(%esp),%xmm7
3151	paddq	%xmm6,%xmm0
3152	movdqa	%xmm5,%xmm6
3153	pmuludq	16(%esp),%xmm5
3154	paddq	%xmm7,%xmm4
3155	pshufd	$16,96(%edx),%xmm7
3156	pmuludq	(%esp),%xmm6
3157	paddq	%xmm5,%xmm3
3158	movdqa	%xmm7,%xmm5
3159	pmuludq	64(%esp),%xmm7
3160	paddq	%xmm6,%xmm2
3161	pmuludq	48(%esp),%xmm5
3162	pshufd	$16,48(%edx),%xmm6
3163	paddq	%xmm7,%xmm1
3164	movdqa	%xmm6,%xmm7
3165	pmuludq	16(%esp),%xmm6
3166	paddq	%xmm5,%xmm0
3167	pshufd	$16,112(%edx),%xmm5
3168	pmuludq	(%esp),%xmm7
3169	paddq	%xmm6,%xmm4
3170	movdqa	%xmm5,%xmm6
3171	pmuludq	64(%esp),%xmm5
3172	paddq	%xmm7,%xmm3
3173	movdqa	%xmm6,%xmm7
3174	pmuludq	48(%esp),%xmm6
3175	paddq	%xmm5,%xmm2
3176	pmuludq	32(%esp),%xmm7
3177	pshufd	$16,64(%edx),%xmm5
3178	paddq	%xmm6,%xmm1
3179	pshufd	$16,128(%edx),%xmm6
3180	pmuludq	(%esp),%xmm5
3181	paddq	%xmm7,%xmm0
3182	movdqa	%xmm6,%xmm7
3183	pmuludq	64(%esp),%xmm6
3184	paddq	%xmm5,%xmm4
3185	movdqa	%xmm7,%xmm5
3186	pmuludq	16(%esp),%xmm7
3187	paddq	%xmm6,%xmm3
3188	movdqa	%xmm5,%xmm6
3189	pmuludq	32(%esp),%xmm5
3190	paddq	%xmm7,%xmm0
3191	pmuludq	48(%esp),%xmm6
3192	movdqa	64(%ebx),%xmm7
3193	paddq	%xmm5,%xmm1
3194	paddq	%xmm6,%xmm2
3195.L017short_tail:
3196	pshufd	$78,%xmm4,%xmm6
3197	pshufd	$78,%xmm3,%xmm5
3198	paddq	%xmm6,%xmm4
3199	paddq	%xmm5,%xmm3
3200	pshufd	$78,%xmm0,%xmm6
3201	pshufd	$78,%xmm1,%xmm5
3202	paddq	%xmm6,%xmm0
3203	paddq	%xmm5,%xmm1
3204	pshufd	$78,%xmm2,%xmm6
3205	movdqa	%xmm3,%xmm5
3206	pand	%xmm7,%xmm3
3207	psrlq	$26,%xmm5
3208	paddq	%xmm6,%xmm2
3209	paddq	%xmm4,%xmm5
3210	movdqa	%xmm0,%xmm6
3211	pand	%xmm7,%xmm0
3212	psrlq	$26,%xmm6
3213	movdqa	%xmm5,%xmm4
3214	paddq	%xmm1,%xmm6
3215	psrlq	$26,%xmm5
3216	pand	%xmm7,%xmm4
3217	movdqa	%xmm6,%xmm1
3218	psrlq	$26,%xmm6
3219	paddd	%xmm5,%xmm0
3220	psllq	$2,%xmm5
3221	paddq	%xmm2,%xmm6
3222	paddq	%xmm0,%xmm5
3223	pand	%xmm7,%xmm1
3224	movdqa	%xmm6,%xmm2
3225	psrlq	$26,%xmm6
3226	pand	%xmm7,%xmm2
3227	paddd	%xmm3,%xmm6
3228	movdqa	%xmm5,%xmm0
3229	psrlq	$26,%xmm5
3230	movdqa	%xmm6,%xmm3
3231	psrlq	$26,%xmm6
3232	pand	%xmm7,%xmm0
3233	paddd	%xmm5,%xmm1
3234	pand	%xmm7,%xmm3
3235	paddd	%xmm6,%xmm4
3236.L013done:
3237	movd	%xmm0,-48(%edi)
3238	movd	%xmm1,-44(%edi)
3239	movd	%xmm2,-40(%edi)
3240	movd	%xmm3,-36(%edi)
3241	movd	%xmm4,-32(%edi)
3242	movl	%ebp,%esp
3243.L007nodata:
3244	popl	%edi
3245	popl	%esi
3246	popl	%ebx
3247	popl	%ebp
3248	ret
3249.size	_poly1305_blocks_sse2,.-_poly1305_blocks_sse2
/*
 * _poly1305_emit_sse2 -- file-local routine, arguments passed on the stack.
 *   arg1 (20(%esp) after the four pushes): pointer to the hash state; five
 *        dwords at offsets 0..16 plus a flag dword at offset 20
 *   arg2 (24(%esp)): pointer to the 16-byte output
 *   arg3 (28(%esp)): pointer to four dwords that are added to the result
 *        (presumably the nonce half of the key -- confirm against callers)
 * poly1305_init stores this routine's address into its function table when
 * the capability word selects the SSE2 path.
 * Saves and restores %ebp, %ebx, %esi, %edi; %xmm0-%xmm3 are scratch.
 */
3250.align	32
3251.type	_poly1305_emit_sse2,@function
3252.align	16
3253_poly1305_emit_sse2:
3254	#ifdef __CET__
3255
/* Bytes F3 0F 1E FB encode ENDBR32 (indirect-branch landing pad). */
3256.byte	243,15,30,251
3257	#endif
3258
3259	pushl	%ebp
3260	pushl	%ebx
3261	pushl	%esi
3262	pushl	%edi
3263	movl	20(%esp),%ebp
/*
 * The dword at offset 20 is the "hash is in base 2^26" flag
 * (_poly1305_blocks_avx2 in this file sets it to 1 after converting).
 * When it is zero, continue at .Lenter_emit, a label outside this routine,
 * with the same four registers already pushed.
 */
3264	cmpl	$0,20(%ebp)
3265	je	.Lenter_emit
/*
 * Pack the five 26-bit limbs h0..h4 into 32-bit words:
 *   %eax = h0 | h1<<26, %ebx = h1>>6 | h2<<20, %ecx = h2>>12 | h3<<14,
 *   %edx = h3>>18 | h4<<8, %esi = h4>>24, with carries propagated by adcl.
 * The movl instructions between each addl and its adcl leave flags intact.
 */
3266	movl	(%ebp),%eax
3267	movl	4(%ebp),%edi
3268	movl	8(%ebp),%ecx
3269	movl	12(%ebp),%edx
3270	movl	16(%ebp),%esi
3271	movl	%edi,%ebx
3272	shll	$26,%edi
3273	shrl	$6,%ebx
3274	addl	%edi,%eax
3275	movl	%ecx,%edi
3276	adcl	$0,%ebx
3277	shll	$20,%edi
3278	shrl	$12,%ecx
3279	addl	%edi,%ebx
3280	movl	%edx,%edi
3281	adcl	$0,%ecx
3282	shll	$14,%edi
3283	shrl	$18,%edx
3284	addl	%edi,%ecx
3285	movl	%esi,%edi
3286	adcl	$0,%edx
3287	shll	$8,%edi
3288	shrl	$24,%esi
3289	addl	%edi,%edx
3290	adcl	$0,%esi
/*
 * Fold everything above bit 130 back into the low word: keep the low two
 * bits of the top word in %esi and add 5 * (top >> 2) to %eax, rippling the
 * carry upward. The output and arg3 pointers are loaded in between because
 * %edi and %ebp are free at this point.
 */
3291	movl	%esi,%edi
3292	andl	$3,%esi
3293	shrl	$2,%edi
3294	leal	(%edi,%edi,4),%ebp
3295	movl	24(%esp),%edi
3296	addl	%ebp,%eax
3297	movl	28(%esp),%ebp
3298	adcl	$0,%ebx
3299	adcl	$0,%ecx
3300	adcl	$0,%edx
3301	adcl	$0,%esi
/*
 * Compute h + 5 in the integer registers while keeping a copy of h in
 * %xmm0-%xmm3 (movd does not touch the carry flag). After the chain,
 * %esi >> 2 is 1 when the addition carried into bit 130; negl turns that
 * into an all-ones or all-zeros mask.
 */
3302	movd	%eax,%xmm0
3303	addl	$5,%eax
3304	movd	%ebx,%xmm1
3305	adcl	$0,%ebx
3306	movd	%ecx,%xmm2
3307	adcl	$0,%ecx
3308	movd	%edx,%xmm3
3309	adcl	$0,%edx
3310	adcl	$0,%esi
3311	shrl	$2,%esi
3312	negl	%esi
/*
 * Branch-free select: store (h + 5) & mask in the output buffer, reload h
 * from the xmm copies, and OR in h & ~mask. The result is h + 5 when the
 * carry occurred and h otherwise.
 */
3313	andl	%esi,%eax
3314	andl	%esi,%ebx
3315	andl	%esi,%ecx
3316	andl	%esi,%edx
3317	movl	%eax,(%edi)
3318	movd	%xmm0,%eax
3319	movl	%ebx,4(%edi)
3320	movd	%xmm1,%ebx
3321	movl	%ecx,8(%edi)
3322	movd	%xmm2,%ecx
3323	movl	%edx,12(%edi)
3324	movd	%xmm3,%edx
3325	notl	%esi
3326	andl	%esi,%eax
3327	andl	%esi,%ebx
3328	orl	(%edi),%eax
3329	andl	%esi,%ecx
3330	orl	4(%edi),%ebx
3331	andl	%esi,%edx
3332	orl	8(%edi),%ecx
3333	orl	12(%edi),%edx
/*
 * Add the four dwords at arg3 as one 128-bit addition (the final carry is
 * dropped) and write the 16-byte result to the output buffer.
 */
3334	addl	(%ebp),%eax
3335	adcl	4(%ebp),%ebx
3336	movl	%eax,(%edi)
3337	adcl	8(%ebp),%ecx
3338	movl	%ebx,4(%edi)
3339	adcl	12(%ebp),%edx
3340	movl	%ecx,8(%edi)
3341	movl	%edx,12(%edi)
3342	popl	%edi
3343	popl	%esi
3344	popl	%ebx
3345	popl	%ebp
3346	ret
3347.size	_poly1305_emit_sse2,.-_poly1305_emit_sse2
/*
 * _poly1305_init_avx2 -- internal helper with a register calling convention;
 * called from _poly1305_blocks_avx2.
 *   In:  %edi = context pointer. The 16 bytes at 24(%edi) are read as the
 *               key r (poly1305_init stores the masked key words there).
 *        %ebx = address of .Lconst_sse2; 64(%ebx) is the 2^26-1 limb mask.
 *   Out: key-power table written to offsets 48..191 of the context:
 *        limbs 0..4 at 48,64,80,96,112 and 5*limb1..5*limb4 at
 *        128,144,160,176. Each 16-byte row holds that limb of
 *        r^4, r^3, r^2, r^1 in dwords 0..3.
 *        %edi is returned unchanged.
 *   Clobbers: %ecx, %edx, %ebp (holds the caller's %esp meanwhile),
 *        %xmm0-%xmm7, flags.
 *   Stack: 224 bytes of scratch, aligned down to 16 bytes.
 */
3348.align	32
3349.type	_poly1305_init_avx2,@function
3350.align	16
3351_poly1305_init_avx2:
3352	#ifdef __CET__
3353
/* Bytes F3 0F 1E FB encode ENDBR32 (indirect-branch landing pad). */
3354.byte	243,15,30,251
3355	#endif
3356
3357	vmovdqu	24(%edi),%xmm4
3358	leal	48(%edi),%edi
3359	movl	%esp,%ebp
3360	subl	$224,%esp
3361	andl	$-16,%esp
/*
 * Split r into five 26-bit limbs held in the low qword of %xmm0..%xmm4
 * (bit offsets 0, 26, 52, 78, 104). The byte shifts by 6 and 13 supply the
 * bits that lie at or above bit 48 and bit 104 respectively.
 */
3362	vmovdqa	64(%ebx),%xmm7
3363	vpand	%xmm7,%xmm4,%xmm0
3364	vpsrlq	$26,%xmm4,%xmm1
3365	vpsrldq	$6,%xmm4,%xmm3
3366	vpand	%xmm7,%xmm1,%xmm1
3367	vpsrlq	$4,%xmm3,%xmm2
3368	vpsrlq	$30,%xmm3,%xmm3
3369	vpand	%xmm7,%xmm2,%xmm2
3370	vpand	%xmm7,%xmm3,%xmm3
3371	vpsrldq	$13,%xmm4,%xmm4
3372	leal	144(%esp),%edx
/* Two passes: the first yields r^2, the second yields r^4 and r^3. */
3373	movl	$2,%ecx
3374.L018square:
/*
 * Save the current limbs at 0..64(%esp) and five times limbs 1..4 at
 * 80..128(%esp) (x*4 + x); the 5x values implement the wrap-around of
 * product terms at or above limb 5.
 */
3375	vmovdqa	%xmm0,(%esp)
3376	vmovdqa	%xmm1,16(%esp)
3377	vmovdqa	%xmm2,32(%esp)
3378	vmovdqa	%xmm3,48(%esp)
3379	vmovdqa	%xmm4,64(%esp)
3380	vpslld	$2,%xmm1,%xmm6
3381	vpslld	$2,%xmm2,%xmm5
3382	vpaddd	%xmm1,%xmm6,%xmm6
3383	vpaddd	%xmm2,%xmm5,%xmm5
3384	vmovdqa	%xmm6,80(%esp)
3385	vmovdqa	%xmm5,96(%esp)
3386	vpslld	$2,%xmm3,%xmm6
3387	vpslld	$2,%xmm4,%xmm5
3388	vpaddd	%xmm3,%xmm6,%xmm6
3389	vpaddd	%xmm4,%xmm5,%xmm5
3390	vmovdqa	%xmm6,112(%esp)
3391	vmovdqa	%xmm5,128(%esp)
/*
 * Broadcast the low qword of each limb to both qword lanes (shuffle $68)
 * and keep the broadcast copies at 0..64(%edx) as the common multiplier.
 * %xmm6 keeps the un-broadcast limb 1.
 */
3392	vpshufd	$68,%xmm0,%xmm5
3393	vmovdqa	%xmm1,%xmm6
3394	vpshufd	$68,%xmm1,%xmm1
3395	vpshufd	$68,%xmm2,%xmm2
3396	vpshufd	$68,%xmm3,%xmm3
3397	vpshufd	$68,%xmm4,%xmm4
3398	vmovdqa	%xmm5,(%edx)
3399	vmovdqa	%xmm1,16(%edx)
3400	vmovdqa	%xmm2,32(%edx)
3401	vmovdqa	%xmm3,48(%edx)
3402	vmovdqa	%xmm4,64(%edx)
/*
 * 5x5 limb product accumulated into %xmm0..%xmm4 (result limbs 0..4):
 * the saved limbs at n(%esp) and their 5x forms are multiplied by the
 * broadcast limbs at n(%edx).
 */
3403	vpmuludq	%xmm0,%xmm4,%xmm4
3404	vpmuludq	%xmm0,%xmm3,%xmm3
3405	vpmuludq	%xmm0,%xmm2,%xmm2
3406	vpmuludq	%xmm0,%xmm1,%xmm1
3407	vpmuludq	%xmm0,%xmm5,%xmm0
3408	vpmuludq	48(%edx),%xmm6,%xmm5
3409	vpaddq	%xmm5,%xmm4,%xmm4
3410	vpmuludq	32(%edx),%xmm6,%xmm7
3411	vpaddq	%xmm7,%xmm3,%xmm3
3412	vpmuludq	16(%edx),%xmm6,%xmm5
3413	vpaddq	%xmm5,%xmm2,%xmm2
3414	vmovdqa	80(%esp),%xmm7
3415	vpmuludq	(%edx),%xmm6,%xmm6
3416	vpaddq	%xmm6,%xmm1,%xmm1
3417	vmovdqa	32(%esp),%xmm5
3418	vpmuludq	64(%edx),%xmm7,%xmm7
3419	vpaddq	%xmm7,%xmm0,%xmm0
3420	vpmuludq	32(%edx),%xmm5,%xmm6
3421	vpaddq	%xmm6,%xmm4,%xmm4
3422	vpmuludq	16(%edx),%xmm5,%xmm7
3423	vpaddq	%xmm7,%xmm3,%xmm3
3424	vmovdqa	96(%esp),%xmm6
3425	vpmuludq	(%edx),%xmm5,%xmm5
3426	vpaddq	%xmm5,%xmm2,%xmm2
3427	vpmuludq	64(%edx),%xmm6,%xmm7
3428	vpaddq	%xmm7,%xmm1,%xmm1
3429	vmovdqa	48(%esp),%xmm5
3430	vpmuludq	48(%edx),%xmm6,%xmm6
3431	vpaddq	%xmm6,%xmm0,%xmm0
3432	vpmuludq	16(%edx),%xmm5,%xmm7
3433	vpaddq	%xmm7,%xmm4,%xmm4
3434	vmovdqa	112(%esp),%xmm6
3435	vpmuludq	(%edx),%xmm5,%xmm5
3436	vpaddq	%xmm5,%xmm3,%xmm3
3437	vpmuludq	64(%edx),%xmm6,%xmm7
3438	vpaddq	%xmm7,%xmm2,%xmm2
3439	vpmuludq	48(%edx),%xmm6,%xmm5
3440	vpaddq	%xmm5,%xmm1,%xmm1
3441	vmovdqa	64(%esp),%xmm7
3442	vpmuludq	32(%edx),%xmm6,%xmm6
3443	vpaddq	%xmm6,%xmm0,%xmm0
3444	vmovdqa	128(%esp),%xmm5
3445	vpmuludq	(%edx),%xmm7,%xmm7
3446	vpaddq	%xmm7,%xmm4,%xmm4
3447	vpmuludq	64(%edx),%xmm5,%xmm6
3448	vpaddq	%xmm6,%xmm3,%xmm3
3449	vpmuludq	16(%edx),%xmm5,%xmm7
3450	vpaddq	%xmm7,%xmm0,%xmm0
3451	vpmuludq	32(%edx),%xmm5,%xmm6
3452	vpaddq	%xmm6,%xmm1,%xmm1
3453	vmovdqa	64(%ebx),%xmm7
3454	vpmuludq	48(%edx),%xmm5,%xmm5
3455	vpaddq	%xmm5,%xmm2,%xmm2
/*
 * Carry propagation back to roughly 26 bits per limb, in the order
 * 3->4, 0->1, 4->0 (times 5, added as carry + carry<<2), 1->2, 2->3,
 * 0->1, 3->4. Limbs 1 and 4 receive a final carry without being masked.
 */
3456	vpsrlq	$26,%xmm3,%xmm5
3457	vpand	%xmm7,%xmm3,%xmm3
3458	vpsrlq	$26,%xmm0,%xmm6
3459	vpand	%xmm7,%xmm0,%xmm0
3460	vpaddq	%xmm5,%xmm4,%xmm4
3461	vpaddq	%xmm6,%xmm1,%xmm1
3462	vpsrlq	$26,%xmm4,%xmm5
3463	vpand	%xmm7,%xmm4,%xmm4
3464	vpsrlq	$26,%xmm1,%xmm6
3465	vpand	%xmm7,%xmm1,%xmm1
3466	vpaddq	%xmm6,%xmm2,%xmm2
3467	vpaddd	%xmm5,%xmm0,%xmm0
3468	vpsllq	$2,%xmm5,%xmm5
3469	vpsrlq	$26,%xmm2,%xmm6
3470	vpand	%xmm7,%xmm2,%xmm2
3471	vpaddd	%xmm5,%xmm0,%xmm0
3472	vpaddd	%xmm6,%xmm3,%xmm3
3473	vpsrlq	$26,%xmm3,%xmm6
3474	vpsrlq	$26,%xmm0,%xmm5
3475	vpand	%xmm7,%xmm0,%xmm0
3476	vpand	%xmm7,%xmm3,%xmm3
3477	vpaddd	%xmm5,%xmm1,%xmm1
3478	vpaddd	%xmm6,%xmm4,%xmm4
3479	decl	%ecx
3480	jz	.L019square_break
/*
 * After the first pass: pair the new result (low qword) with the limbs
 * saved on the stack (moved to the high qword), so the second pass
 * multiplies both by the new low-qword value at once.
 */
3481	vpunpcklqdq	(%esp),%xmm0,%xmm0
3482	vpunpcklqdq	16(%esp),%xmm1,%xmm1
3483	vpunpcklqdq	32(%esp),%xmm2,%xmm2
3484	vpunpcklqdq	48(%esp),%xmm3,%xmm3
3485	vpunpcklqdq	64(%esp),%xmm4,%xmm4
3486	jmp	.L018square
3487.L019square_break:
/*
 * Interleave the second-pass results (shifted into the odd dwords) with
 * the second-pass inputs still on the stack, then reorder with shuffle
 * $141 so each row reads r^4, r^3, r^2, r^1 from dword 0 to dword 3.
 */
3488	vpsllq	$32,%xmm0,%xmm0
3489	vpsllq	$32,%xmm1,%xmm1
3490	vpsllq	$32,%xmm2,%xmm2
3491	vpsllq	$32,%xmm3,%xmm3
3492	vpsllq	$32,%xmm4,%xmm4
3493	vpor	(%esp),%xmm0,%xmm0
3494	vpor	16(%esp),%xmm1,%xmm1
3495	vpor	32(%esp),%xmm2,%xmm2
3496	vpor	48(%esp),%xmm3,%xmm3
3497	vpor	64(%esp),%xmm4,%xmm4
3498	vpshufd	$141,%xmm0,%xmm0
3499	vpshufd	$141,%xmm1,%xmm1
3500	vpshufd	$141,%xmm2,%xmm2
3501	vpshufd	$141,%xmm3,%xmm3
3502	vpshufd	$141,%xmm4,%xmm4
/* Store the five limb rows, then the 5x rows for limbs 1..4. */
3503	vmovdqu	%xmm0,(%edi)
3504	vmovdqu	%xmm1,16(%edi)
3505	vmovdqu	%xmm2,32(%edi)
3506	vmovdqu	%xmm3,48(%edi)
3507	vmovdqu	%xmm4,64(%edi)
3508	vpslld	$2,%xmm1,%xmm6
3509	vpslld	$2,%xmm2,%xmm5
3510	vpaddd	%xmm1,%xmm6,%xmm6
3511	vpaddd	%xmm2,%xmm5,%xmm5
3512	vmovdqu	%xmm6,80(%edi)
3513	vmovdqu	%xmm5,96(%edi)
3514	vpslld	$2,%xmm3,%xmm6
3515	vpslld	$2,%xmm4,%xmm5
3516	vpaddd	%xmm3,%xmm6,%xmm6
3517	vpaddd	%xmm4,%xmm5,%xmm5
3518	vmovdqu	%xmm6,112(%edi)
3519	vmovdqu	%xmm5,128(%edi)
/* Drop the scratch frame and undo the +48 bias on the context pointer. */
3520	movl	%ebp,%esp
3521	leal	-48(%edi),%edi
3522	ret
3523.size	_poly1305_init_avx2,.-_poly1305_init_avx2
/*
 * _poly1305_blocks_avx2 -- file-local routine, arguments passed on the stack.
 *   arg1 (20(%esp) after the four pushes): context pointer. Hash limbs are
 *        at offsets 0..16, the base-2^26 flag at 20, the key at 24 and the
 *        key-power table at 48..191.
 *   arg2 (24(%esp)): input pointer
 *   arg3 (28(%esp)): length in bytes; rounded down to a multiple of 16
 *   arg4 (32(%esp)): presumably the pad-bit flag (0 or 1) -- confirm against
 *        callers. It is consulted only on the single-trailing-block path.
 * poly1305_init stores this routine's address into its function table when
 * the capability word selects the AVX2 path.
 * Processes 64 bytes (four 16-byte blocks) per loop iteration, one block per
 * 64-bit lane. Saves and restores %ebp, %ebx, %esi, %edi and issues
 * vzeroupper on entry to and exit from the vector path.
 */
3524.align	32
3525.type	_poly1305_blocks_avx2,@function
3526.align	16
3527_poly1305_blocks_avx2:
3528	#ifdef __CET__
3529
/* Bytes F3 0F 1E FB encode ENDBR32 (indirect-branch landing pad). */
3530.byte	243,15,30,251
3531	#endif
3532
3533	pushl	%ebp
3534	pushl	%ebx
3535	pushl	%esi
3536	pushl	%edi
3537	movl	20(%esp),%edi
3538	movl	24(%esp),%esi
3539	movl	28(%esp),%ecx
3540	movl	20(%edi),%eax
3541	andl	$-16,%ecx
3542	jz	.L020nodata
/*
 * Inputs shorter than 64 bytes whose hash is not yet in base 2^26 are
 * handed to the scalar code: .Lenter_blocks lies inside poly1305_blocks,
 * just after an identical prologue and identical argument loads.
 */
3543	cmpl	$64,%ecx
3544	jae	.L021enter_avx2
3545	testl	%eax,%eax
3546	jz	.Lenter_blocks
3547.L021enter_avx2:
3548	vzeroupper
/* call/pop yields the current address; %ebx becomes &.Lconst_sse2. */
3549	call	.L022pic_point
3550.L022pic_point:
3551	popl	%ebx
3552	leal	.Lconst_sse2-.L022pic_point(%ebx),%ebx
3553	testl	%eax,%eax
3554	jnz	.L023base2_26
/*
 * First use of the vector path: build the key-power table, then convert the
 * hash at offsets 0..16 into five 26-bit limbs. The overlapping loads at
 * byte offsets 3, 6, 9 and 13 are shifted by 2, 4, 6 and 0 bits so each limb
 * starts at bit 0. The loads clobber %esi, %ecx and %ebp, so arg2 and arg3
 * are reloaded afterwards.
 */
3555	call	_poly1305_init_avx2
3556	movl	(%edi),%eax
3557	movl	3(%edi),%ecx
3558	movl	6(%edi),%edx
3559	movl	9(%edi),%esi
3560	movl	13(%edi),%ebp
3561	shrl	$2,%ecx
3562	andl	$67108863,%eax
3563	shrl	$4,%edx
3564	andl	$67108863,%ecx
3565	shrl	$6,%esi
3566	andl	$67108863,%edx
3567	movl	%eax,(%edi)
3568	movl	%ecx,4(%edi)
3569	movl	%edx,8(%edi)
3570	movl	%esi,12(%edi)
3571	movl	%ebp,16(%edi)
3572	movl	$1,20(%edi)
3573	movl	24(%esp),%esi
3574	movl	28(%esp),%ecx
3575.L023base2_26:
/*
 * Frame: %ebp keeps the original %esp; %esp is moved down by at least 448
 * bytes and aligned to 512. Offsets 0..159(%esp) are scratch and
 * 160..447(%esp) hold the expanded table, addressed through
 * %edx = 288(%esp) with displacements -128..+128.
 */
3576	movl	32(%esp),%eax
3577	movl	%esp,%ebp
3578	subl	$448,%esp
3579	andl	$-512,%esp
3580	vmovdqu	48(%edi),%xmm0
3581	leal	288(%esp),%edx
3582	vmovdqu	64(%edi),%xmm1
3583	vmovdqu	80(%edi),%xmm2
3584	vmovdqu	96(%edi),%xmm3
3585	vmovdqu	112(%edi),%xmm4
3586	leal	48(%edi),%edi
/*
 * Expand each 16-byte row (r^4, r^3, r^2, r^1) to 32 bytes. After
 * vpermq $64 and vpshufd $200 the even dwords are all r^4 and the odd
 * dwords are r^4, r^3, r^2, r^1. vpmuludq reads only the even dwords, so a
 * displacement of +0 multiplies every lane by r^4 and a displacement of +4
 * multiplies the lanes by descending powers.
 */
3587	vpermq	$64,%ymm0,%ymm0
3588	vpermq	$64,%ymm1,%ymm1
3589	vpermq	$64,%ymm2,%ymm2
3590	vpermq	$64,%ymm3,%ymm3
3591	vpermq	$64,%ymm4,%ymm4
3592	vpshufd	$200,%ymm0,%ymm0
3593	vpshufd	$200,%ymm1,%ymm1
3594	vpshufd	$200,%ymm2,%ymm2
3595	vpshufd	$200,%ymm3,%ymm3
3596	vpshufd	$200,%ymm4,%ymm4
3597	vmovdqa	%ymm0,-128(%edx)
3598	vmovdqu	80(%edi),%xmm0
3599	vmovdqa	%ymm1,-96(%edx)
3600	vmovdqu	96(%edi),%xmm1
3601	vmovdqa	%ymm2,-64(%edx)
3602	vmovdqu	112(%edi),%xmm2
3603	vmovdqa	%ymm3,-32(%edx)
3604	vmovdqu	128(%edi),%xmm3
3605	vmovdqa	%ymm4,(%edx)
3606	vpermq	$64,%ymm0,%ymm0
3607	vpermq	$64,%ymm1,%ymm1
3608	vpermq	$64,%ymm2,%ymm2
3609	vpermq	$64,%ymm3,%ymm3
3610	vpshufd	$200,%ymm0,%ymm0
3611	vpshufd	$200,%ymm1,%ymm1
3612	vpshufd	$200,%ymm2,%ymm2
3613	vpshufd	$200,%ymm3,%ymm3
/*
 * Store the four 5x rows and load the current hash limbs h0..h4 into the
 * low dword of %xmm0..%xmm4 (vmovd clears the rest of each register).
 * %edi is biased by +48 here, so -48..-32(%edi) are context offsets 0..16.
 */
3614	vmovdqa	%ymm0,32(%edx)
3615	vmovd	-48(%edi),%xmm0
3616	vmovdqa	%ymm1,64(%edx)
3617	vmovd	-44(%edi),%xmm1
3618	vmovdqa	%ymm2,96(%edx)
3619	vmovd	-40(%edi),%xmm2
3620	vmovdqa	%ymm3,128(%edx)
3621	vmovd	-36(%edi),%xmm3
3622	vmovd	-32(%edi),%xmm4
3623	vmovdqa	64(%ebx),%ymm7
3624	negl	%eax
/*
 * If the length is not a multiple of 64, handle the 1..3 leftover blocks
 * first through the tail code. %ecx keeps the multiple-of-64 remainder,
 * %ebx is advanced into the constant table so that (%ebx) supplies the
 * 2^24 bit only to the lanes that hold a block, and %edx is biased by
 * 8, 16 or 24 so the tail reads lower key powers for the occupied lanes.
 */
3625	testl	$63,%ecx
3626	jz	.L024even
3627	movl	%ecx,%edx
3628	andl	$-64,%ecx
3629	andl	$63,%edx
3630	vmovdqu	(%esi),%xmm5
3631	cmpl	$32,%edx
3632	jb	.L025one
3633	vmovdqu	16(%esi),%xmm6
3634	je	.L026two
/* Three leftover blocks. */
3635	vinserti128	$1,32(%esi),%ymm5,%ymm5
3636	leal	48(%esi),%esi
3637	leal	8(%ebx),%ebx
3638	leal	296(%esp),%edx
3639	jmp	.L027tail
3640.L026two:
3641	leal	32(%esi),%esi
3642	leal	16(%ebx),%ebx
3643	leal	304(%esp),%edx
3644	jmp	.L027tail
3645.L025one:
/*
 * One leftover block. %eax holds -arg4, so the table offset is 32 - 8*arg4:
 * with arg4 = 1 the single lane gets the 2^24 bit, with arg4 = 0 the read
 * falls on the all-zero row.
 */
3646	leal	16(%esi),%esi
3647	vpxor	%ymm6,%ymm6,%ymm6
3648	leal	32(%ebx,%eax,8),%ebx
3649	leal	312(%esp),%edx
3650	jmp	.L027tail
3651.align	32
3652.L024even:
/*
 * Load four blocks: %ymm5 = blocks 0 and 2, %ymm6 = blocks 1 and 3. If
 * these are the last 64 bytes, go straight to the tail.
 */
3653	vmovdqu	(%esi),%xmm5
3654	vmovdqu	16(%esi),%xmm6
3655	vinserti128	$1,32(%esi),%ymm5,%ymm5
3656	vinserti128	$1,48(%esi),%ymm6,%ymm6
3657	leal	64(%esi),%esi
3658	subl	$64,%ecx
3659	jz	.L027tail
3660.L028loop:
/*
 * Spill h0..h2, then split the four input blocks into 26-bit limbs:
 * %ymm5 = limb 0, %ymm6 = limb 1, %ymm2 = limb 2, %ymm0 = limb 3,
 * %ymm1 = limb 4 with the 2^24 bit from (%ebx) ORed in.
 */
3661	vmovdqa	%ymm2,64(%esp)
3662	vpsrldq	$6,%ymm5,%ymm2
3663	vmovdqa	%ymm0,(%esp)
3664	vpsrldq	$6,%ymm6,%ymm0
3665	vmovdqa	%ymm1,32(%esp)
3666	vpunpckhqdq	%ymm6,%ymm5,%ymm1
3667	vpunpcklqdq	%ymm6,%ymm5,%ymm5
3668	vpunpcklqdq	%ymm0,%ymm2,%ymm2
3669	vpsrlq	$30,%ymm2,%ymm0
3670	vpsrlq	$4,%ymm2,%ymm2
3671	vpsrlq	$26,%ymm5,%ymm6
3672	vpsrlq	$40,%ymm1,%ymm1
3673	vpand	%ymm7,%ymm2,%ymm2
3674	vpand	%ymm7,%ymm5,%ymm5
3675	vpand	%ymm7,%ymm6,%ymm6
3676	vpand	%ymm7,%ymm0,%ymm0
3677	vpor	(%ebx),%ymm1,%ymm1
/* Add the running hash to the new input limbs. */
3678	vpaddq	64(%esp),%ymm2,%ymm2
3679	vpaddq	(%esp),%ymm5,%ymm5
3680	vpaddq	32(%esp),%ymm6,%ymm6
3681	vpaddq	%ymm3,%ymm0,%ymm0
3682	vpaddq	%ymm4,%ymm1,%ymm1
/*
 * Multiply by r^4 in every lane. Table rows relative to %edx:
 * -128,-96,-64,-32,0 are key limbs 0..4 and 32,64,96,128 are 5x limbs 1..4.
 * The products accumulate into %ymm0..%ymm4 as result limbs 0..4.
 */
3683	vpmuludq	-96(%edx),%ymm2,%ymm3
3684	vmovdqa	%ymm6,32(%esp)
3685	vpmuludq	-64(%edx),%ymm2,%ymm4
3686	vmovdqa	%ymm0,96(%esp)
3687	vpmuludq	96(%edx),%ymm2,%ymm0
3688	vmovdqa	%ymm1,128(%esp)
3689	vpmuludq	128(%edx),%ymm2,%ymm1
3690	vpmuludq	-128(%edx),%ymm2,%ymm2
3691	vpmuludq	-32(%edx),%ymm5,%ymm7
3692	vpaddq	%ymm7,%ymm3,%ymm3
3693	vpmuludq	(%edx),%ymm5,%ymm6
3694	vpaddq	%ymm6,%ymm4,%ymm4
3695	vpmuludq	-128(%edx),%ymm5,%ymm7
3696	vpaddq	%ymm7,%ymm0,%ymm0
3697	vmovdqa	32(%esp),%ymm7
3698	vpmuludq	-96(%edx),%ymm5,%ymm6
3699	vpaddq	%ymm6,%ymm1,%ymm1
3700	vpmuludq	-64(%edx),%ymm5,%ymm5
3701	vpaddq	%ymm5,%ymm2,%ymm2
3702	vpmuludq	-64(%edx),%ymm7,%ymm6
3703	vpaddq	%ymm6,%ymm3,%ymm3
3704	vpmuludq	-32(%edx),%ymm7,%ymm5
3705	vpaddq	%ymm5,%ymm4,%ymm4
3706	vpmuludq	128(%edx),%ymm7,%ymm6
3707	vpaddq	%ymm6,%ymm0,%ymm0
3708	vmovdqa	96(%esp),%ymm6
3709	vpmuludq	-128(%edx),%ymm7,%ymm5
3710	vpaddq	%ymm5,%ymm1,%ymm1
3711	vpmuludq	-96(%edx),%ymm7,%ymm7
3712	vpaddq	%ymm7,%ymm2,%ymm2
3713	vpmuludq	-128(%edx),%ymm6,%ymm5
3714	vpaddq	%ymm5,%ymm3,%ymm3
3715	vpmuludq	-96(%edx),%ymm6,%ymm7
3716	vpaddq	%ymm7,%ymm4,%ymm4
3717	vpmuludq	64(%edx),%ymm6,%ymm5
3718	vpaddq	%ymm5,%ymm0,%ymm0
3719	vmovdqa	128(%esp),%ymm5
3720	vpmuludq	96(%edx),%ymm6,%ymm7
3721	vpaddq	%ymm7,%ymm1,%ymm1
3722	vpmuludq	128(%edx),%ymm6,%ymm6
3723	vpaddq	%ymm6,%ymm2,%ymm2
3724	vpmuludq	128(%edx),%ymm5,%ymm7
3725	vpaddq	%ymm7,%ymm3,%ymm3
3726	vpmuludq	32(%edx),%ymm5,%ymm6
3727	vpaddq	%ymm6,%ymm0,%ymm0
3728	vpmuludq	-128(%edx),%ymm5,%ymm7
3729	vpaddq	%ymm7,%ymm4,%ymm4
3730	vmovdqa	64(%ebx),%ymm7
3731	vpmuludq	64(%edx),%ymm5,%ymm6
3732	vpaddq	%ymm6,%ymm1,%ymm1
3733	vpmuludq	96(%edx),%ymm5,%ymm5
3734	vpaddq	%ymm5,%ymm2,%ymm2
/*
 * Carry propagation per lane: 3->4, 0->1, 4->0 (times 5, added as
 * carry + carry<<2), 1->2, 2->3, 0->1, 3->4.
 */
3735	vpsrlq	$26,%ymm3,%ymm5
3736	vpand	%ymm7,%ymm3,%ymm3
3737	vpsrlq	$26,%ymm0,%ymm6
3738	vpand	%ymm7,%ymm0,%ymm0
3739	vpaddq	%ymm5,%ymm4,%ymm4
3740	vpaddq	%ymm6,%ymm1,%ymm1
3741	vpsrlq	$26,%ymm4,%ymm5
3742	vpand	%ymm7,%ymm4,%ymm4
3743	vpsrlq	$26,%ymm1,%ymm6
3744	vpand	%ymm7,%ymm1,%ymm1
3745	vpaddq	%ymm6,%ymm2,%ymm2
3746	vpaddq	%ymm5,%ymm0,%ymm0
3747	vpsllq	$2,%ymm5,%ymm5
3748	vpsrlq	$26,%ymm2,%ymm6
3749	vpand	%ymm7,%ymm2,%ymm2
3750	vpaddq	%ymm5,%ymm0,%ymm0
3751	vpaddq	%ymm6,%ymm3,%ymm3
3752	vpsrlq	$26,%ymm3,%ymm6
3753	vpsrlq	$26,%ymm0,%ymm5
3754	vpand	%ymm7,%ymm0,%ymm0
3755	vpand	%ymm7,%ymm3,%ymm3
3756	vpaddq	%ymm5,%ymm1,%ymm1
3757	vpaddq	%ymm6,%ymm4,%ymm4
/* Fetch the next four blocks; loop while more than 64 bytes remained. */
3758	vmovdqu	(%esi),%xmm5
3759	vmovdqu	16(%esi),%xmm6
3760	vinserti128	$1,32(%esi),%ymm5,%ymm5
3761	vinserti128	$1,48(%esi),%ymm6,%ymm6
3762	leal	64(%esi),%esi
3763	subl	$64,%ecx
3764	jnz	.L028loop
3765.L027tail:
/*
 * Tail: same limb split and hash addition as the loop, but every table
 * displacement is 4 bytes higher, so the lanes are multiplied by
 * descending key powers instead of r^4 throughout.
 */
3766	vmovdqa	%ymm2,64(%esp)
3767	vpsrldq	$6,%ymm5,%ymm2
3768	vmovdqa	%ymm0,(%esp)
3769	vpsrldq	$6,%ymm6,%ymm0
3770	vmovdqa	%ymm1,32(%esp)
3771	vpunpckhqdq	%ymm6,%ymm5,%ymm1
3772	vpunpcklqdq	%ymm6,%ymm5,%ymm5
3773	vpunpcklqdq	%ymm0,%ymm2,%ymm2
3774	vpsrlq	$30,%ymm2,%ymm0
3775	vpsrlq	$4,%ymm2,%ymm2
3776	vpsrlq	$26,%ymm5,%ymm6
3777	vpsrlq	$40,%ymm1,%ymm1
3778	vpand	%ymm7,%ymm2,%ymm2
3779	vpand	%ymm7,%ymm5,%ymm5
3780	vpand	%ymm7,%ymm6,%ymm6
3781	vpand	%ymm7,%ymm0,%ymm0
3782	vpor	(%ebx),%ymm1,%ymm1
/*
 * Undo any 8/16/24/32-byte bias on %ebx: the table is 64-byte aligned, so
 * rounding down restores its base for the 64(%ebx) mask reload below.
 */
3783	andl	$-64,%ebx
3784	vpaddq	64(%esp),%ymm2,%ymm2
3785	vpaddq	(%esp),%ymm5,%ymm5
3786	vpaddq	32(%esp),%ymm6,%ymm6
3787	vpaddq	%ymm3,%ymm0,%ymm0
3788	vpaddq	%ymm4,%ymm1,%ymm1
3789	vpmuludq	-92(%edx),%ymm2,%ymm3
3790	vmovdqa	%ymm6,32(%esp)
3791	vpmuludq	-60(%edx),%ymm2,%ymm4
3792	vmovdqa	%ymm0,96(%esp)
3793	vpmuludq	100(%edx),%ymm2,%ymm0
3794	vmovdqa	%ymm1,128(%esp)
3795	vpmuludq	132(%edx),%ymm2,%ymm1
3796	vpmuludq	-124(%edx),%ymm2,%ymm2
3797	vpmuludq	-28(%edx),%ymm5,%ymm7
3798	vpaddq	%ymm7,%ymm3,%ymm3
3799	vpmuludq	4(%edx),%ymm5,%ymm6
3800	vpaddq	%ymm6,%ymm4,%ymm4
3801	vpmuludq	-124(%edx),%ymm5,%ymm7
3802	vpaddq	%ymm7,%ymm0,%ymm0
3803	vmovdqa	32(%esp),%ymm7
3804	vpmuludq	-92(%edx),%ymm5,%ymm6
3805	vpaddq	%ymm6,%ymm1,%ymm1
3806	vpmuludq	-60(%edx),%ymm5,%ymm5
3807	vpaddq	%ymm5,%ymm2,%ymm2
3808	vpmuludq	-60(%edx),%ymm7,%ymm6
3809	vpaddq	%ymm6,%ymm3,%ymm3
3810	vpmuludq	-28(%edx),%ymm7,%ymm5
3811	vpaddq	%ymm5,%ymm4,%ymm4
3812	vpmuludq	132(%edx),%ymm7,%ymm6
3813	vpaddq	%ymm6,%ymm0,%ymm0
3814	vmovdqa	96(%esp),%ymm6
3815	vpmuludq	-124(%edx),%ymm7,%ymm5
3816	vpaddq	%ymm5,%ymm1,%ymm1
3817	vpmuludq	-92(%edx),%ymm7,%ymm7
3818	vpaddq	%ymm7,%ymm2,%ymm2
3819	vpmuludq	-124(%edx),%ymm6,%ymm5
3820	vpaddq	%ymm5,%ymm3,%ymm3
3821	vpmuludq	-92(%edx),%ymm6,%ymm7
3822	vpaddq	%ymm7,%ymm4,%ymm4
3823	vpmuludq	68(%edx),%ymm6,%ymm5
3824	vpaddq	%ymm5,%ymm0,%ymm0
3825	vmovdqa	128(%esp),%ymm5
3826	vpmuludq	100(%edx),%ymm6,%ymm7
3827	vpaddq	%ymm7,%ymm1,%ymm1
3828	vpmuludq	132(%edx),%ymm6,%ymm6
3829	vpaddq	%ymm6,%ymm2,%ymm2
3830	vpmuludq	132(%edx),%ymm5,%ymm7
3831	vpaddq	%ymm7,%ymm3,%ymm3
3832	vpmuludq	36(%edx),%ymm5,%ymm6
3833	vpaddq	%ymm6,%ymm0,%ymm0
3834	vpmuludq	-124(%edx),%ymm5,%ymm7
3835	vpaddq	%ymm7,%ymm4,%ymm4
3836	vmovdqa	64(%ebx),%ymm7
3837	vpmuludq	68(%edx),%ymm5,%ymm6
3838	vpaddq	%ymm6,%ymm1,%ymm1
3839	vpmuludq	100(%edx),%ymm5,%ymm5
3840	vpaddq	%ymm5,%ymm2,%ymm2
/*
 * Horizontal sum: add the upper qword of each 128-bit half to the lower
 * one (vpsrldq $8), then add qword 2 into qword 0 (vpermq $2), leaving the
 * total of all four lanes in the low qword of each limb register.
 */
3841	vpsrldq	$8,%ymm4,%ymm5
3842	vpsrldq	$8,%ymm3,%ymm6
3843	vpaddq	%ymm5,%ymm4,%ymm4
3844	vpsrldq	$8,%ymm0,%ymm5
3845	vpaddq	%ymm6,%ymm3,%ymm3
3846	vpsrldq	$8,%ymm1,%ymm6
3847	vpaddq	%ymm5,%ymm0,%ymm0
3848	vpsrldq	$8,%ymm2,%ymm5
3849	vpaddq	%ymm6,%ymm1,%ymm1
3850	vpermq	$2,%ymm4,%ymm6
3851	vpaddq	%ymm5,%ymm2,%ymm2
3852	vpermq	$2,%ymm3,%ymm5
3853	vpaddq	%ymm6,%ymm4,%ymm4
3854	vpermq	$2,%ymm0,%ymm6
3855	vpaddq	%ymm5,%ymm3,%ymm3
3856	vpermq	$2,%ymm1,%ymm5
3857	vpaddq	%ymm6,%ymm0,%ymm0
3858	vpermq	$2,%ymm2,%ymm6
3859	vpaddq	%ymm5,%ymm1,%ymm1
3860	vpaddq	%ymm6,%ymm2,%ymm2
/* Same carry propagation as in the loop. */
3861	vpsrlq	$26,%ymm3,%ymm5
3862	vpand	%ymm7,%ymm3,%ymm3
3863	vpsrlq	$26,%ymm0,%ymm6
3864	vpand	%ymm7,%ymm0,%ymm0
3865	vpaddq	%ymm5,%ymm4,%ymm4
3866	vpaddq	%ymm6,%ymm1,%ymm1
3867	vpsrlq	$26,%ymm4,%ymm5
3868	vpand	%ymm7,%ymm4,%ymm4
3869	vpsrlq	$26,%ymm1,%ymm6
3870	vpand	%ymm7,%ymm1,%ymm1
3871	vpaddq	%ymm6,%ymm2,%ymm2
3872	vpaddq	%ymm5,%ymm0,%ymm0
3873	vpsllq	$2,%ymm5,%ymm5
3874	vpsrlq	$26,%ymm2,%ymm6
3875	vpand	%ymm7,%ymm2,%ymm2
3876	vpaddq	%ymm5,%ymm0,%ymm0
3877	vpaddq	%ymm6,%ymm3,%ymm3
3878	vpsrlq	$26,%ymm3,%ymm6
3879	vpsrlq	$26,%ymm0,%ymm5
3880	vpand	%ymm7,%ymm0,%ymm0
3881	vpand	%ymm7,%ymm3,%ymm3
3882	vpaddq	%ymm5,%ymm1,%ymm1
3883	vpaddq	%ymm6,%ymm4,%ymm4
/*
 * If 64-byte groups remain (the leftover blocks were handled first), keep
 * only the low dword of each limb (shuffle $252 with a VEX-encoded xmm
 * destination clears the rest), reset %edx to the unbiased table pointer
 * and continue with the aligned path.
 */
3884	cmpl	$0,%ecx
3885	je	.L029done
3886	vpshufd	$252,%xmm0,%xmm0
3887	leal	288(%esp),%edx
3888	vpshufd	$252,%xmm1,%xmm1
3889	vpshufd	$252,%xmm2,%xmm2
3890	vpshufd	$252,%xmm3,%xmm3
3891	vpshufd	$252,%xmm4,%xmm4
3892	jmp	.L024even
3893.align	16
3894.L029done:
/* Write h0..h4 back to context offsets 0..16 and release the frame. */
3895	vmovd	%xmm0,-48(%edi)
3896	vmovd	%xmm1,-44(%edi)
3897	vmovd	%xmm2,-40(%edi)
3898	vmovd	%xmm3,-36(%edi)
3899	vmovd	%xmm4,-32(%edi)
3900	vzeroupper
3901	movl	%ebp,%esp
3902.L020nodata:
3903	popl	%edi
3904	popl	%esi
3905	popl	%ebx
3906	popl	%ebp
3907	ret
3908.size	_poly1305_blocks_avx2,.-_poly1305_blocks_avx2
/*
 * Constant table shared by the vector code paths, 64-byte aligned.
 * _poly1305_blocks_avx2 relies on that alignment: it biases its table
 * pointer by up to 32 bytes and recovers the base with "andl $-64".
 *   offset  0: 2^24 in the low dword of four qwords, ORed into limb 4 of
 *              each input block
 *   offset 32: zeros; reads that start between offset 0 and offset 32
 *              therefore set the 2^24 bit in only some lanes
 *   offset 64: 2^26-1 limb mask in the low dword of four qwords
 *   offset 96: 0x0fffffff, 0x0ffffffc x3 -- the same values poly1305_init
 *              uses as immediates to mask the key (no reference to this
 *              row is visible in this part of the file)
 */
3909.align	64
3910.Lconst_sse2:
3911.long	16777216,0,16777216,0,16777216,0,16777216,0
3912.long	0,0,0,0,0,0,0,0
3913.long	67108863,0,67108863,0,67108863,0,67108863,0
3914.long	268435455,268435452,268435452,268435452
/* ASCII: "Poly1305 for x86, CRYPTOGAMS by <appro@openssl.org>", NUL-terminated. */
3915.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54
3916.byte	44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32
3917.byte	60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111
3918.byte	114,103,62,0
3919.align	4
/* Common symbol: 16 bytes, 4-byte aligned; poly1305_init reads it to pick a code path. */
3920.comm	OPENSSL_ia32cap_P,16,4
3921
/*
 * GNU property note. Layout is the standard ELF note header followed by
 * one property record:
 *   namesz = size of "GNU\0", descsz = size of the descriptor, type = 5
 *   (NT_GNU_PROPERTY_TYPE_0); then property type 0xc0000002
 *   (GNU_PROPERTY_X86_FEATURE_1_AND), its data size, and the value 3
 *   (IBT | SHSTK), padded to 4 bytes.
 * The symbolic names come from the x86 psABI, not from this file.
 */
3922	.section ".note.gnu.property", "a"
3923	.p2align 2
3924	.long 1f - 0f
3925	.long 4f - 1f
3926	.long 5
39270:
3928	.asciz "GNU"
39291:
3930	.p2align 2
3931	.long 0xc0000002
3932	.long 3f - 2f
39332:
3934	.long 3
39353:
3936	.p2align 2
39374:
3938#endif
3939