/* xref: /freebsd/sys/crypto/openssl/amd64/poly1305-x86_64.S (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf) */
1/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
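/*
 * Poly1305 MAC for x86_64 (CRYPTOGAMS / OpenSSL perlasm output).
 * Provides a scalar base 2^64 implementation plus AVX and AVX2 code
 * paths that keep the accumulator as five 26-bit limbs.  poly1305_init
 * selects the blocks/emit routines at run time from OPENSSL_ia32cap_P.
 */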
2.text
3
4
5
6.globl	poly1305_init
7.hidden	poly1305_init
8.globl	poly1305_blocks
9.hidden	poly1305_blocks
10.globl	poly1305_emit
11.hidden	poly1305_emit
12
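# poly1305_init(ctx=%rdi, key=%rsi, func_table=%rdx): zeroes the
# accumulator, clamps the first 16 key bytes into r
# (r &= 0x0ffffffc0ffffffc0ffffffc0fffffff) and stores pointers to the
# blocks/emit routines at %rdx -- the AVX versions when the AVX feature
# bit is set in OPENSSL_ia32cap_P, the AVX2 blocks routine when the AVX2
# bit is set.  Returns 1 when a key was installed, 0 otherwise.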
13.type	poly1305_init,@function
14.align	32
15poly1305_init:
16.cfi_startproc
17	xorq	%rax,%rax
18	movq	%rax,0(%rdi)
19	movq	%rax,8(%rdi)
20	movq	%rax,16(%rdi)
21
22	cmpq	$0,%rsi
23	je	.Lno_key
24
25	leaq	poly1305_blocks(%rip),%r10
26	leaq	poly1305_emit(%rip),%r11
27	movq	OPENSSL_ia32cap_P+4(%rip),%r9
28	leaq	poly1305_blocks_avx(%rip),%rax
29	leaq	poly1305_emit_avx(%rip),%rcx
30	btq	$28,%r9
31	cmovcq	%rax,%r10
32	cmovcq	%rcx,%r11
33	leaq	poly1305_blocks_avx2(%rip),%rax
34	btq	$37,%r9
35	cmovcq	%rax,%r10
36	movq	$0x0ffffffc0fffffff,%rax
37	movq	$0x0ffffffc0ffffffc,%rcx
38	andq	0(%rsi),%rax
39	andq	8(%rsi),%rcx
40	movq	%rax,24(%rdi)
41	movq	%rcx,32(%rdi)
42	movq	%r10,0(%rdx)
43	movq	%r11,8(%rdx)
44	movl	$1,%eax
45.Lno_key:
46	.byte	0xf3,0xc3
47.cfi_endproc
48.size	poly1305_init,.-poly1305_init
49
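# poly1305_blocks(ctx=%rdi, inp=%rsi, len=%rdx, padbit=%rcx): scalar
# base 2^64 path.  For each 16-byte block it computes
# h = (h + block + padbit*2^128) * r mod 2^130 - 5, keeping the 130-bit
# accumulator in %r14/%rbx/%rbp, with r0 in %r11, r1 in %r12 and
# r1+(r1>>2) in %r13.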
50.type	poly1305_blocks,@function
51.align	32
52poly1305_blocks:
53.cfi_startproc
54.Lblocks:
55	shrq	$4,%rdx
56	jz	.Lno_data
57
58	pushq	%rbx
59.cfi_adjust_cfa_offset	8
60.cfi_offset	%rbx,-16
61	pushq	%rbp
62.cfi_adjust_cfa_offset	8
63.cfi_offset	%rbp,-24
64	pushq	%r12
65.cfi_adjust_cfa_offset	8
66.cfi_offset	%r12,-32
67	pushq	%r13
68.cfi_adjust_cfa_offset	8
69.cfi_offset	%r13,-40
70	pushq	%r14
71.cfi_adjust_cfa_offset	8
72.cfi_offset	%r14,-48
73	pushq	%r15
74.cfi_adjust_cfa_offset	8
75.cfi_offset	%r15,-56
76.Lblocks_body:
77
78	movq	%rdx,%r15
79
80	movq	24(%rdi),%r11
81	movq	32(%rdi),%r13
82
83	movq	0(%rdi),%r14
84	movq	8(%rdi),%rbx
85	movq	16(%rdi),%rbp
86
87	movq	%r13,%r12
88	shrq	$2,%r13
89	movq	%r12,%rax
90	addq	%r12,%r13
91	jmp	.Loop
92
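# One multiply-and-reduce iteration: h is multiplied by r with schoolbook
# 64x64 multiplies, using r1+(r1>>2) (%r13) so the limb that crosses
# 2^130 is folded back in times 5; the trailing mask/shift sequence is
# the partial reduction mod 2^130 - 5.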
93.align	32
94.Loop:
95	addq	0(%rsi),%r14
96	adcq	8(%rsi),%rbx
97	leaq	16(%rsi),%rsi
98	adcq	%rcx,%rbp
99	mulq	%r14
100	movq	%rax,%r9
101	movq	%r11,%rax
102	movq	%rdx,%r10
103
104	mulq	%r14
105	movq	%rax,%r14
106	movq	%r11,%rax
107	movq	%rdx,%r8
108
109	mulq	%rbx
110	addq	%rax,%r9
111	movq	%r13,%rax
112	adcq	%rdx,%r10
113
114	mulq	%rbx
115	movq	%rbp,%rbx
116	addq	%rax,%r14
117	adcq	%rdx,%r8
118
119	imulq	%r13,%rbx
120	addq	%rbx,%r9
121	movq	%r8,%rbx
122	adcq	$0,%r10
123
124	imulq	%r11,%rbp
125	addq	%r9,%rbx
126	movq	$-4,%rax
127	adcq	%rbp,%r10
128
129	andq	%r10,%rax
130	movq	%r10,%rbp
131	shrq	$2,%r10
132	andq	$3,%rbp
133	addq	%r10,%rax
134	addq	%rax,%r14
135	adcq	$0,%rbx
136	adcq	$0,%rbp
137	movq	%r12,%rax
138	decq	%r15
139	jnz	.Loop
140
141	movq	%r14,0(%rdi)
142	movq	%rbx,8(%rdi)
143	movq	%rbp,16(%rdi)
144
145	movq	0(%rsp),%r15
146.cfi_restore	%r15
147	movq	8(%rsp),%r14
148.cfi_restore	%r14
149	movq	16(%rsp),%r13
150.cfi_restore	%r13
151	movq	24(%rsp),%r12
152.cfi_restore	%r12
153	movq	32(%rsp),%rbp
154.cfi_restore	%rbp
155	movq	40(%rsp),%rbx
156.cfi_restore	%rbx
157	leaq	48(%rsp),%rsp
158.cfi_adjust_cfa_offset	-48
159.Lno_data:
160.Lblocks_epilogue:
161	.byte	0xf3,0xc3
162.cfi_endproc
163.size	poly1305_blocks,.-poly1305_blocks
164
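# poly1305_emit(ctx=%rdi, mac=%rsi, nonce=%rdx): final reduction of h
# modulo 2^130 - 5 (add 5 and keep the sum only if it carried out of
# bit 130), then add the 128-bit nonce and store the 16-byte tag.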
165.type	poly1305_emit,@function
166.align	32
167poly1305_emit:
168.cfi_startproc
169.Lemit:
170	movq	0(%rdi),%r8
171	movq	8(%rdi),%r9
172	movq	16(%rdi),%r10
173
174	movq	%r8,%rax
175	addq	$5,%r8
176	movq	%r9,%rcx
177	adcq	$0,%r9
178	adcq	$0,%r10
179	shrq	$2,%r10
180	cmovnzq	%r8,%rax
181	cmovnzq	%r9,%rcx
182
183	addq	0(%rdx),%rax
184	adcq	8(%rdx),%rcx
185	movq	%rax,0(%rsi)
186	movq	%rcx,8(%rsi)
187
188	.byte	0xf3,0xc3
189.cfi_endproc
190.size	poly1305_emit,.-poly1305_emit
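# __poly1305_block: one multiply-and-reduce step shared by the scalar
# and the AVX setup code.  Expects the accumulator in %r14/%rbx/%rbp,
# r0 in %r11, r1+(r1>>2) in %r13 and r1 in %rax; leaves the partially
# reduced product in the same accumulator registers.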
191.type	__poly1305_block,@function
192.align	32
193__poly1305_block:
194.cfi_startproc
195	mulq	%r14
196	movq	%rax,%r9
197	movq	%r11,%rax
198	movq	%rdx,%r10
199
200	mulq	%r14
201	movq	%rax,%r14
202	movq	%r11,%rax
203	movq	%rdx,%r8
204
205	mulq	%rbx
206	addq	%rax,%r9
207	movq	%r13,%rax
208	adcq	%rdx,%r10
209
210	mulq	%rbx
211	movq	%rbp,%rbx
212	addq	%rax,%r14
213	adcq	%rdx,%r8
214
215	imulq	%r13,%rbx
216	addq	%rbx,%r9
217	movq	%r8,%rbx
218	adcq	$0,%r10
219
220	imulq	%r11,%rbp
221	addq	%r9,%rbx
222	movq	$-4,%rax
223	adcq	%rbp,%r10
224
225	andq	%r10,%rax
226	movq	%r10,%rbp
227	shrq	$2,%r10
228	andq	$3,%rbp
229	addq	%r10,%rax
230	addq	%rax,%r14
231	adcq	$0,%rbx
232	adcq	$0,%rbp
233	.byte	0xf3,0xc3
234.cfi_endproc
235.size	__poly1305_block,.-__poly1305_block
236
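# __poly1305_init_avx: lazily builds the table used by the vector code.
# Starting from h = r it calls __poly1305_block three times to obtain
# r^2, r^3 and r^4, and stores r through r^4 split into five 26-bit
# limbs (together with the *5 multiples needed for reduction) right
# after the scalar context.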
237.type	__poly1305_init_avx,@function
238.align	32
239__poly1305_init_avx:
240.cfi_startproc
241	movq	%r11,%r14
242	movq	%r12,%rbx
243	xorq	%rbp,%rbp
244
245	leaq	48+64(%rdi),%rdi
246
247	movq	%r12,%rax
248	call	__poly1305_block
249
250	movl	$0x3ffffff,%eax
251	movl	$0x3ffffff,%edx
252	movq	%r14,%r8
253	andl	%r14d,%eax
254	movq	%r11,%r9
255	andl	%r11d,%edx
256	movl	%eax,-64(%rdi)
257	shrq	$26,%r8
258	movl	%edx,-60(%rdi)
259	shrq	$26,%r9
260
261	movl	$0x3ffffff,%eax
262	movl	$0x3ffffff,%edx
263	andl	%r8d,%eax
264	andl	%r9d,%edx
265	movl	%eax,-48(%rdi)
266	leal	(%rax,%rax,4),%eax
267	movl	%edx,-44(%rdi)
268	leal	(%rdx,%rdx,4),%edx
269	movl	%eax,-32(%rdi)
270	shrq	$26,%r8
271	movl	%edx,-28(%rdi)
272	shrq	$26,%r9
273
274	movq	%rbx,%rax
275	movq	%r12,%rdx
276	shlq	$12,%rax
277	shlq	$12,%rdx
278	orq	%r8,%rax
279	orq	%r9,%rdx
280	andl	$0x3ffffff,%eax
281	andl	$0x3ffffff,%edx
282	movl	%eax,-16(%rdi)
283	leal	(%rax,%rax,4),%eax
284	movl	%edx,-12(%rdi)
285	leal	(%rdx,%rdx,4),%edx
286	movl	%eax,0(%rdi)
287	movq	%rbx,%r8
288	movl	%edx,4(%rdi)
289	movq	%r12,%r9
290
291	movl	$0x3ffffff,%eax
292	movl	$0x3ffffff,%edx
293	shrq	$14,%r8
294	shrq	$14,%r9
295	andl	%r8d,%eax
296	andl	%r9d,%edx
297	movl	%eax,16(%rdi)
298	leal	(%rax,%rax,4),%eax
299	movl	%edx,20(%rdi)
300	leal	(%rdx,%rdx,4),%edx
301	movl	%eax,32(%rdi)
302	shrq	$26,%r8
303	movl	%edx,36(%rdi)
304	shrq	$26,%r9
305
306	movq	%rbp,%rax
307	shlq	$24,%rax
308	orq	%rax,%r8
309	movl	%r8d,48(%rdi)
310	leaq	(%r8,%r8,4),%r8
311	movl	%r9d,52(%rdi)
312	leaq	(%r9,%r9,4),%r9
313	movl	%r8d,64(%rdi)
314	movl	%r9d,68(%rdi)
315
316	movq	%r12,%rax
317	call	__poly1305_block
318
319	movl	$0x3ffffff,%eax
320	movq	%r14,%r8
321	andl	%r14d,%eax
322	shrq	$26,%r8
323	movl	%eax,-52(%rdi)
324
325	movl	$0x3ffffff,%edx
326	andl	%r8d,%edx
327	movl	%edx,-36(%rdi)
328	leal	(%rdx,%rdx,4),%edx
329	shrq	$26,%r8
330	movl	%edx,-20(%rdi)
331
332	movq	%rbx,%rax
333	shlq	$12,%rax
334	orq	%r8,%rax
335	andl	$0x3ffffff,%eax
336	movl	%eax,-4(%rdi)
337	leal	(%rax,%rax,4),%eax
338	movq	%rbx,%r8
339	movl	%eax,12(%rdi)
340
341	movl	$0x3ffffff,%edx
342	shrq	$14,%r8
343	andl	%r8d,%edx
344	movl	%edx,28(%rdi)
345	leal	(%rdx,%rdx,4),%edx
346	shrq	$26,%r8
347	movl	%edx,44(%rdi)
348
349	movq	%rbp,%rax
350	shlq	$24,%rax
351	orq	%rax,%r8
352	movl	%r8d,60(%rdi)
353	leaq	(%r8,%r8,4),%r8
354	movl	%r8d,76(%rdi)
355
356	movq	%r12,%rax
357	call	__poly1305_block
358
359	movl	$0x3ffffff,%eax
360	movq	%r14,%r8
361	andl	%r14d,%eax
362	shrq	$26,%r8
363	movl	%eax,-56(%rdi)
364
365	movl	$0x3ffffff,%edx
366	andl	%r8d,%edx
367	movl	%edx,-40(%rdi)
368	leal	(%rdx,%rdx,4),%edx
369	shrq	$26,%r8
370	movl	%edx,-24(%rdi)
371
372	movq	%rbx,%rax
373	shlq	$12,%rax
374	orq	%r8,%rax
375	andl	$0x3ffffff,%eax
376	movl	%eax,-8(%rdi)
377	leal	(%rax,%rax,4),%eax
378	movq	%rbx,%r8
379	movl	%eax,8(%rdi)
380
381	movl	$0x3ffffff,%edx
382	shrq	$14,%r8
383	andl	%r8d,%edx
384	movl	%edx,24(%rdi)
385	leal	(%rdx,%rdx,4),%edx
386	shrq	$26,%r8
387	movl	%edx,40(%rdi)
388
389	movq	%rbp,%rax
390	shlq	$24,%rax
391	orq	%rax,%r8
392	movl	%r8d,56(%rdi)
393	leaq	(%r8,%r8,4),%r8
394	movl	%r8d,72(%rdi)
395
396	leaq	-48-64(%rdi),%rdi
397	.byte	0xf3,0xc3
398.cfi_endproc
399.size	__poly1305_init_avx,.-__poly1305_init_avx
400
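# poly1305_blocks_avx: AVX path.  Inputs shorter than 128 bytes whose
# state is still in base 2^64 go to the scalar .Lblocks code; otherwise
# the accumulator is kept (or converted) to five 26-bit limbs, with
# ctx+20 flagging base 2^26 mode, and the main loop absorbs 64 bytes per
# iteration, two blocks per 128-bit vector.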
401.type	poly1305_blocks_avx,@function
402.align	32
403poly1305_blocks_avx:
404.cfi_startproc
405	movl	20(%rdi),%r8d
406	cmpq	$128,%rdx
407	jae	.Lblocks_avx
408	testl	%r8d,%r8d
409	jz	.Lblocks
410
411.Lblocks_avx:
412	andq	$-16,%rdx
413	jz	.Lno_data_avx
414
415	vzeroupper
416
417	testl	%r8d,%r8d
418	jz	.Lbase2_64_avx
419
420	testq	$31,%rdx
421	jz	.Leven_avx
422
423	pushq	%rbx
424.cfi_adjust_cfa_offset	8
425.cfi_offset	%rbx,-16
426	pushq	%rbp
427.cfi_adjust_cfa_offset	8
428.cfi_offset	%rbp,-24
429	pushq	%r12
430.cfi_adjust_cfa_offset	8
431.cfi_offset	%r12,-32
432	pushq	%r13
433.cfi_adjust_cfa_offset	8
434.cfi_offset	%r13,-40
435	pushq	%r14
436.cfi_adjust_cfa_offset	8
437.cfi_offset	%r14,-48
438	pushq	%r15
439.cfi_adjust_cfa_offset	8
440.cfi_offset	%r15,-56
441.Lblocks_avx_body:
442
443	movq	%rdx,%r15
444
445	movq	0(%rdi),%r8
446	movq	8(%rdi),%r9
447	movl	16(%rdi),%ebp
448
449	movq	24(%rdi),%r11
450	movq	32(%rdi),%r13
451
452
453	movl	%r8d,%r14d
454	andq	$-2147483648,%r8
455	movq	%r9,%r12
456	movl	%r9d,%ebx
457	andq	$-2147483648,%r9
458
459	shrq	$6,%r8
460	shlq	$52,%r12
461	addq	%r8,%r14
462	shrq	$12,%rbx
463	shrq	$18,%r9
464	addq	%r12,%r14
465	adcq	%r9,%rbx
466
467	movq	%rbp,%r8
468	shlq	$40,%r8
469	shrq	$24,%rbp
470	addq	%r8,%rbx
471	adcq	$0,%rbp
472
473	movq	$-4,%r9
474	movq	%rbp,%r8
475	andq	%rbp,%r9
476	shrq	$2,%r8
477	andq	$3,%rbp
478	addq	%r9,%r8
479	addq	%r8,%r14
480	adcq	$0,%rbx
481	adcq	$0,%rbp
482
483	movq	%r13,%r12
484	movq	%r13,%rax
485	shrq	$2,%r13
486	addq	%r12,%r13
487
488	addq	0(%rsi),%r14
489	adcq	8(%rsi),%rbx
490	leaq	16(%rsi),%rsi
491	adcq	%rcx,%rbp
492
493	call	__poly1305_block
494
495	testq	%rcx,%rcx
496	jz	.Lstore_base2_64_avx
497
498
499	movq	%r14,%rax
500	movq	%r14,%rdx
501	shrq	$52,%r14
502	movq	%rbx,%r11
503	movq	%rbx,%r12
504	shrq	$26,%rdx
505	andq	$0x3ffffff,%rax
506	shlq	$12,%r11
507	andq	$0x3ffffff,%rdx
508	shrq	$14,%rbx
509	orq	%r11,%r14
510	shlq	$24,%rbp
511	andq	$0x3ffffff,%r14
512	shrq	$40,%r12
513	andq	$0x3ffffff,%rbx
514	orq	%r12,%rbp
515
516	subq	$16,%r15
517	jz	.Lstore_base2_26_avx
518
519	vmovd	%eax,%xmm0
520	vmovd	%edx,%xmm1
521	vmovd	%r14d,%xmm2
522	vmovd	%ebx,%xmm3
523	vmovd	%ebp,%xmm4
524	jmp	.Lproceed_avx
525
526.align	32
527.Lstore_base2_64_avx:
528	movq	%r14,0(%rdi)
529	movq	%rbx,8(%rdi)
530	movq	%rbp,16(%rdi)
531	jmp	.Ldone_avx
532
533.align	16
534.Lstore_base2_26_avx:
535	movl	%eax,0(%rdi)
536	movl	%edx,4(%rdi)
537	movl	%r14d,8(%rdi)
538	movl	%ebx,12(%rdi)
539	movl	%ebp,16(%rdi)
540.align	16
541.Ldone_avx:
542	movq	0(%rsp),%r15
543.cfi_restore	%r15
544	movq	8(%rsp),%r14
545.cfi_restore	%r14
546	movq	16(%rsp),%r13
547.cfi_restore	%r13
548	movq	24(%rsp),%r12
549.cfi_restore	%r12
550	movq	32(%rsp),%rbp
551.cfi_restore	%rbp
552	movq	40(%rsp),%rbx
553.cfi_restore	%rbx
554	leaq	48(%rsp),%rsp
555.cfi_adjust_cfa_offset	-48
556.Lno_data_avx:
557.Lblocks_avx_epilogue:
558	.byte	0xf3,0xc3
559.cfi_endproc
560
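# Entered while the accumulator is still in base 2^64: run at most one
# block through the scalar code, convert h to five 26-bit limbs, build
# the power-of-r table, then continue in the vector loop.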
561.align	32
562.Lbase2_64_avx:
563.cfi_startproc
564	pushq	%rbx
565.cfi_adjust_cfa_offset	8
566.cfi_offset	%rbx,-16
567	pushq	%rbp
568.cfi_adjust_cfa_offset	8
569.cfi_offset	%rbp,-24
570	pushq	%r12
571.cfi_adjust_cfa_offset	8
572.cfi_offset	%r12,-32
573	pushq	%r13
574.cfi_adjust_cfa_offset	8
575.cfi_offset	%r13,-40
576	pushq	%r14
577.cfi_adjust_cfa_offset	8
578.cfi_offset	%r14,-48
579	pushq	%r15
580.cfi_adjust_cfa_offset	8
581.cfi_offset	%r15,-56
582.Lbase2_64_avx_body:
583
584	movq	%rdx,%r15
585
586	movq	24(%rdi),%r11
587	movq	32(%rdi),%r13
588
589	movq	0(%rdi),%r14
590	movq	8(%rdi),%rbx
591	movl	16(%rdi),%ebp
592
593	movq	%r13,%r12
594	movq	%r13,%rax
595	shrq	$2,%r13
596	addq	%r12,%r13
597
598	testq	$31,%rdx
599	jz	.Linit_avx
600
601	addq	0(%rsi),%r14
602	adcq	8(%rsi),%rbx
603	leaq	16(%rsi),%rsi
604	adcq	%rcx,%rbp
605	subq	$16,%r15
606
607	call	__poly1305_block
608
609.Linit_avx:
610
611	movq	%r14,%rax
612	movq	%r14,%rdx
613	shrq	$52,%r14
614	movq	%rbx,%r8
615	movq	%rbx,%r9
616	shrq	$26,%rdx
617	andq	$0x3ffffff,%rax
618	shlq	$12,%r8
619	andq	$0x3ffffff,%rdx
620	shrq	$14,%rbx
621	orq	%r8,%r14
622	shlq	$24,%rbp
623	andq	$0x3ffffff,%r14
624	shrq	$40,%r9
625	andq	$0x3ffffff,%rbx
626	orq	%r9,%rbp
627
628	vmovd	%eax,%xmm0
629	vmovd	%edx,%xmm1
630	vmovd	%r14d,%xmm2
631	vmovd	%ebx,%xmm3
632	vmovd	%ebp,%xmm4
633	movl	$1,20(%rdi)
634
635	call	__poly1305_init_avx
636
637.Lproceed_avx:
638	movq	%r15,%rdx
639
640	movq	0(%rsp),%r15
641.cfi_restore	%r15
642	movq	8(%rsp),%r14
643.cfi_restore	%r14
644	movq	16(%rsp),%r13
645.cfi_restore	%r13
646	movq	24(%rsp),%r12
647.cfi_restore	%r12
648	movq	32(%rsp),%rbp
649.cfi_restore	%rbp
650	movq	40(%rsp),%rbx
651.cfi_restore	%rbx
652	leaq	48(%rsp),%rax
653	leaq	48(%rsp),%rsp
654.cfi_adjust_cfa_offset	-48
655.Lbase2_64_avx_epilogue:
656	jmp	.Ldo_avx
657.cfi_endproc
658
659.align	32
660.Leven_avx:
661.cfi_startproc
662	vmovd	0(%rdi),%xmm0
663	vmovd	4(%rdi),%xmm1
664	vmovd	8(%rdi),%xmm2
665	vmovd	12(%rdi),%xmm3
666	vmovd	16(%rdi),%xmm4
667
668.Ldo_avx:
669	leaq	-88(%rsp),%r11
670.cfi_def_cfa	%r11,0x60
671	subq	$0x178,%rsp
672	subq	$64,%rdx
673	leaq	-32(%rsi),%rax
674	cmovcq	%rax,%rsi
675
676	vmovdqu	48(%rdi),%xmm14
677	leaq	112(%rdi),%rdi
678	leaq	.Lconst(%rip),%rcx
679
680
681
682	vmovdqu	32(%rsi),%xmm5
683	vmovdqu	48(%rsi),%xmm6
684	vmovdqa	64(%rcx),%xmm15
685
686	vpsrldq	$6,%xmm5,%xmm7
687	vpsrldq	$6,%xmm6,%xmm8
688	vpunpckhqdq	%xmm6,%xmm5,%xmm9
689	vpunpcklqdq	%xmm6,%xmm5,%xmm5
690	vpunpcklqdq	%xmm8,%xmm7,%xmm8
691
692	vpsrlq	$40,%xmm9,%xmm9
693	vpsrlq	$26,%xmm5,%xmm6
694	vpand	%xmm15,%xmm5,%xmm5
695	vpsrlq	$4,%xmm8,%xmm7
696	vpand	%xmm15,%xmm6,%xmm6
697	vpsrlq	$30,%xmm8,%xmm8
698	vpand	%xmm15,%xmm7,%xmm7
699	vpand	%xmm15,%xmm8,%xmm8
700	vpor	32(%rcx),%xmm9,%xmm9
701
702	jbe	.Lskip_loop_avx
703
704
705	vmovdqu	-48(%rdi),%xmm11
706	vmovdqu	-32(%rdi),%xmm12
707	vpshufd	$0xEE,%xmm14,%xmm13
708	vpshufd	$0x44,%xmm14,%xmm10
709	vmovdqa	%xmm13,-144(%r11)
710	vmovdqa	%xmm10,0(%rsp)
711	vpshufd	$0xEE,%xmm11,%xmm14
712	vmovdqu	-16(%rdi),%xmm10
713	vpshufd	$0x44,%xmm11,%xmm11
714	vmovdqa	%xmm14,-128(%r11)
715	vmovdqa	%xmm11,16(%rsp)
716	vpshufd	$0xEE,%xmm12,%xmm13
717	vmovdqu	0(%rdi),%xmm11
718	vpshufd	$0x44,%xmm12,%xmm12
719	vmovdqa	%xmm13,-112(%r11)
720	vmovdqa	%xmm12,32(%rsp)
721	vpshufd	$0xEE,%xmm10,%xmm14
722	vmovdqu	16(%rdi),%xmm12
723	vpshufd	$0x44,%xmm10,%xmm10
724	vmovdqa	%xmm14,-96(%r11)
725	vmovdqa	%xmm10,48(%rsp)
726	vpshufd	$0xEE,%xmm11,%xmm13
727	vmovdqu	32(%rdi),%xmm10
728	vpshufd	$0x44,%xmm11,%xmm11
729	vmovdqa	%xmm13,-80(%r11)
730	vmovdqa	%xmm11,64(%rsp)
731	vpshufd	$0xEE,%xmm12,%xmm14
732	vmovdqu	48(%rdi),%xmm11
733	vpshufd	$0x44,%xmm12,%xmm12
734	vmovdqa	%xmm14,-64(%r11)
735	vmovdqa	%xmm12,80(%rsp)
736	vpshufd	$0xEE,%xmm10,%xmm13
737	vmovdqu	64(%rdi),%xmm12
738	vpshufd	$0x44,%xmm10,%xmm10
739	vmovdqa	%xmm13,-48(%r11)
740	vmovdqa	%xmm10,96(%rsp)
741	vpshufd	$0xEE,%xmm11,%xmm14
742	vpshufd	$0x44,%xmm11,%xmm11
743	vmovdqa	%xmm14,-32(%r11)
744	vmovdqa	%xmm11,112(%rsp)
745	vpshufd	$0xEE,%xmm12,%xmm13
746	vmovdqa	0(%rsp),%xmm14
747	vpshufd	$0x44,%xmm12,%xmm12
748	vmovdqa	%xmm13,-16(%r11)
749	vmovdqa	%xmm12,128(%rsp)
750
751	jmp	.Loop_avx
752
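# Main AVX loop: multiply the accumulated limbs by the precomputed
# powers of r while the next 64 bytes of input are loaded and split into
# 26-bit limbs, then perform the lazy carry propagation across the five
# limb lanes.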
753.align	32
754.Loop_avx:
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775	vpmuludq	%xmm5,%xmm14,%xmm10
776	vpmuludq	%xmm6,%xmm14,%xmm11
777	vmovdqa	%xmm2,32(%r11)
778	vpmuludq	%xmm7,%xmm14,%xmm12
779	vmovdqa	16(%rsp),%xmm2
780	vpmuludq	%xmm8,%xmm14,%xmm13
781	vpmuludq	%xmm9,%xmm14,%xmm14
782
783	vmovdqa	%xmm0,0(%r11)
784	vpmuludq	32(%rsp),%xmm9,%xmm0
785	vmovdqa	%xmm1,16(%r11)
786	vpmuludq	%xmm8,%xmm2,%xmm1
787	vpaddq	%xmm0,%xmm10,%xmm10
788	vpaddq	%xmm1,%xmm14,%xmm14
789	vmovdqa	%xmm3,48(%r11)
790	vpmuludq	%xmm7,%xmm2,%xmm0
791	vpmuludq	%xmm6,%xmm2,%xmm1
792	vpaddq	%xmm0,%xmm13,%xmm13
793	vmovdqa	48(%rsp),%xmm3
794	vpaddq	%xmm1,%xmm12,%xmm12
795	vmovdqa	%xmm4,64(%r11)
796	vpmuludq	%xmm5,%xmm2,%xmm2
797	vpmuludq	%xmm7,%xmm3,%xmm0
798	vpaddq	%xmm2,%xmm11,%xmm11
799
800	vmovdqa	64(%rsp),%xmm4
801	vpaddq	%xmm0,%xmm14,%xmm14
802	vpmuludq	%xmm6,%xmm3,%xmm1
803	vpmuludq	%xmm5,%xmm3,%xmm3
804	vpaddq	%xmm1,%xmm13,%xmm13
805	vmovdqa	80(%rsp),%xmm2
806	vpaddq	%xmm3,%xmm12,%xmm12
807	vpmuludq	%xmm9,%xmm4,%xmm0
808	vpmuludq	%xmm8,%xmm4,%xmm4
809	vpaddq	%xmm0,%xmm11,%xmm11
810	vmovdqa	96(%rsp),%xmm3
811	vpaddq	%xmm4,%xmm10,%xmm10
812
813	vmovdqa	128(%rsp),%xmm4
814	vpmuludq	%xmm6,%xmm2,%xmm1
815	vpmuludq	%xmm5,%xmm2,%xmm2
816	vpaddq	%xmm1,%xmm14,%xmm14
817	vpaddq	%xmm2,%xmm13,%xmm13
818	vpmuludq	%xmm9,%xmm3,%xmm0
819	vpmuludq	%xmm8,%xmm3,%xmm1
820	vpaddq	%xmm0,%xmm12,%xmm12
821	vmovdqu	0(%rsi),%xmm0
822	vpaddq	%xmm1,%xmm11,%xmm11
823	vpmuludq	%xmm7,%xmm3,%xmm3
824	vpmuludq	%xmm7,%xmm4,%xmm7
825	vpaddq	%xmm3,%xmm10,%xmm10
826
827	vmovdqu	16(%rsi),%xmm1
828	vpaddq	%xmm7,%xmm11,%xmm11
829	vpmuludq	%xmm8,%xmm4,%xmm8
830	vpmuludq	%xmm9,%xmm4,%xmm9
831	vpsrldq	$6,%xmm0,%xmm2
832	vpaddq	%xmm8,%xmm12,%xmm12
833	vpaddq	%xmm9,%xmm13,%xmm13
834	vpsrldq	$6,%xmm1,%xmm3
835	vpmuludq	112(%rsp),%xmm5,%xmm9
836	vpmuludq	%xmm6,%xmm4,%xmm5
837	vpunpckhqdq	%xmm1,%xmm0,%xmm4
838	vpaddq	%xmm9,%xmm14,%xmm14
839	vmovdqa	-144(%r11),%xmm9
840	vpaddq	%xmm5,%xmm10,%xmm10
841
842	vpunpcklqdq	%xmm1,%xmm0,%xmm0
843	vpunpcklqdq	%xmm3,%xmm2,%xmm3
844
845
846	vpsrldq	$5,%xmm4,%xmm4
847	vpsrlq	$26,%xmm0,%xmm1
848	vpand	%xmm15,%xmm0,%xmm0
849	vpsrlq	$4,%xmm3,%xmm2
850	vpand	%xmm15,%xmm1,%xmm1
851	vpand	0(%rcx),%xmm4,%xmm4
852	vpsrlq	$30,%xmm3,%xmm3
853	vpand	%xmm15,%xmm2,%xmm2
854	vpand	%xmm15,%xmm3,%xmm3
855	vpor	32(%rcx),%xmm4,%xmm4
856
857	vpaddq	0(%r11),%xmm0,%xmm0
858	vpaddq	16(%r11),%xmm1,%xmm1
859	vpaddq	32(%r11),%xmm2,%xmm2
860	vpaddq	48(%r11),%xmm3,%xmm3
861	vpaddq	64(%r11),%xmm4,%xmm4
862
863	leaq	32(%rsi),%rax
864	leaq	64(%rsi),%rsi
865	subq	$64,%rdx
866	cmovcq	%rax,%rsi
867
868
869
870
871
872
873
874
875
876
877	vpmuludq	%xmm0,%xmm9,%xmm5
878	vpmuludq	%xmm1,%xmm9,%xmm6
879	vpaddq	%xmm5,%xmm10,%xmm10
880	vpaddq	%xmm6,%xmm11,%xmm11
881	vmovdqa	-128(%r11),%xmm7
882	vpmuludq	%xmm2,%xmm9,%xmm5
883	vpmuludq	%xmm3,%xmm9,%xmm6
884	vpaddq	%xmm5,%xmm12,%xmm12
885	vpaddq	%xmm6,%xmm13,%xmm13
886	vpmuludq	%xmm4,%xmm9,%xmm9
887	vpmuludq	-112(%r11),%xmm4,%xmm5
888	vpaddq	%xmm9,%xmm14,%xmm14
889
890	vpaddq	%xmm5,%xmm10,%xmm10
891	vpmuludq	%xmm2,%xmm7,%xmm6
892	vpmuludq	%xmm3,%xmm7,%xmm5
893	vpaddq	%xmm6,%xmm13,%xmm13
894	vmovdqa	-96(%r11),%xmm8
895	vpaddq	%xmm5,%xmm14,%xmm14
896	vpmuludq	%xmm1,%xmm7,%xmm6
897	vpmuludq	%xmm0,%xmm7,%xmm7
898	vpaddq	%xmm6,%xmm12,%xmm12
899	vpaddq	%xmm7,%xmm11,%xmm11
900
901	vmovdqa	-80(%r11),%xmm9
902	vpmuludq	%xmm2,%xmm8,%xmm5
903	vpmuludq	%xmm1,%xmm8,%xmm6
904	vpaddq	%xmm5,%xmm14,%xmm14
905	vpaddq	%xmm6,%xmm13,%xmm13
906	vmovdqa	-64(%r11),%xmm7
907	vpmuludq	%xmm0,%xmm8,%xmm8
908	vpmuludq	%xmm4,%xmm9,%xmm5
909	vpaddq	%xmm8,%xmm12,%xmm12
910	vpaddq	%xmm5,%xmm11,%xmm11
911	vmovdqa	-48(%r11),%xmm8
912	vpmuludq	%xmm3,%xmm9,%xmm9
913	vpmuludq	%xmm1,%xmm7,%xmm6
914	vpaddq	%xmm9,%xmm10,%xmm10
915
916	vmovdqa	-16(%r11),%xmm9
917	vpaddq	%xmm6,%xmm14,%xmm14
918	vpmuludq	%xmm0,%xmm7,%xmm7
919	vpmuludq	%xmm4,%xmm8,%xmm5
920	vpaddq	%xmm7,%xmm13,%xmm13
921	vpaddq	%xmm5,%xmm12,%xmm12
922	vmovdqu	32(%rsi),%xmm5
923	vpmuludq	%xmm3,%xmm8,%xmm7
924	vpmuludq	%xmm2,%xmm8,%xmm8
925	vpaddq	%xmm7,%xmm11,%xmm11
926	vmovdqu	48(%rsi),%xmm6
927	vpaddq	%xmm8,%xmm10,%xmm10
928
929	vpmuludq	%xmm2,%xmm9,%xmm2
930	vpmuludq	%xmm3,%xmm9,%xmm3
931	vpsrldq	$6,%xmm5,%xmm7
932	vpaddq	%xmm2,%xmm11,%xmm11
933	vpmuludq	%xmm4,%xmm9,%xmm4
934	vpsrldq	$6,%xmm6,%xmm8
935	vpaddq	%xmm3,%xmm12,%xmm2
936	vpaddq	%xmm4,%xmm13,%xmm3
937	vpmuludq	-32(%r11),%xmm0,%xmm4
938	vpmuludq	%xmm1,%xmm9,%xmm0
939	vpunpckhqdq	%xmm6,%xmm5,%xmm9
940	vpaddq	%xmm4,%xmm14,%xmm4
941	vpaddq	%xmm0,%xmm10,%xmm0
942
943	vpunpcklqdq	%xmm6,%xmm5,%xmm5
944	vpunpcklqdq	%xmm8,%xmm7,%xmm8
945
946
947	vpsrldq	$5,%xmm9,%xmm9
948	vpsrlq	$26,%xmm5,%xmm6
949	vmovdqa	0(%rsp),%xmm14
950	vpand	%xmm15,%xmm5,%xmm5
951	vpsrlq	$4,%xmm8,%xmm7
952	vpand	%xmm15,%xmm6,%xmm6
953	vpand	0(%rcx),%xmm9,%xmm9
954	vpsrlq	$30,%xmm8,%xmm8
955	vpand	%xmm15,%xmm7,%xmm7
956	vpand	%xmm15,%xmm8,%xmm8
957	vpor	32(%rcx),%xmm9,%xmm9
958
959
960
961
962
963	vpsrlq	$26,%xmm3,%xmm13
964	vpand	%xmm15,%xmm3,%xmm3
965	vpaddq	%xmm13,%xmm4,%xmm4
966
967	vpsrlq	$26,%xmm0,%xmm10
968	vpand	%xmm15,%xmm0,%xmm0
969	vpaddq	%xmm10,%xmm11,%xmm1
970
971	vpsrlq	$26,%xmm4,%xmm10
972	vpand	%xmm15,%xmm4,%xmm4
973
974	vpsrlq	$26,%xmm1,%xmm11
975	vpand	%xmm15,%xmm1,%xmm1
976	vpaddq	%xmm11,%xmm2,%xmm2
977
978	vpaddq	%xmm10,%xmm0,%xmm0
979	vpsllq	$2,%xmm10,%xmm10
980	vpaddq	%xmm10,%xmm0,%xmm0
981
982	vpsrlq	$26,%xmm2,%xmm12
983	vpand	%xmm15,%xmm2,%xmm2
984	vpaddq	%xmm12,%xmm3,%xmm3
985
986	vpsrlq	$26,%xmm0,%xmm10
987	vpand	%xmm15,%xmm0,%xmm0
988	vpaddq	%xmm10,%xmm1,%xmm1
989
990	vpsrlq	$26,%xmm3,%xmm13
991	vpand	%xmm15,%xmm3,%xmm3
992	vpaddq	%xmm13,%xmm4,%xmm4
993
994	ja	.Loop_avx
995
996.Lskip_loop_avx:
997
998
999
1000	vpshufd	$0x10,%xmm14,%xmm14
1001	addq	$32,%rdx
1002	jnz	.Long_tail_avx
1003
1004	vpaddq	%xmm2,%xmm7,%xmm7
1005	vpaddq	%xmm0,%xmm5,%xmm5
1006	vpaddq	%xmm1,%xmm6,%xmm6
1007	vpaddq	%xmm3,%xmm8,%xmm8
1008	vpaddq	%xmm4,%xmm9,%xmm9
1009
1010.Long_tail_avx:
1011	vmovdqa	%xmm2,32(%r11)
1012	vmovdqa	%xmm0,0(%r11)
1013	vmovdqa	%xmm1,16(%r11)
1014	vmovdqa	%xmm3,48(%r11)
1015	vmovdqa	%xmm4,64(%r11)
1016
1017
1018
1019
1020
1021
1022
1023	vpmuludq	%xmm7,%xmm14,%xmm12
1024	vpmuludq	%xmm5,%xmm14,%xmm10
1025	vpshufd	$0x10,-48(%rdi),%xmm2
1026	vpmuludq	%xmm6,%xmm14,%xmm11
1027	vpmuludq	%xmm8,%xmm14,%xmm13
1028	vpmuludq	%xmm9,%xmm14,%xmm14
1029
1030	vpmuludq	%xmm8,%xmm2,%xmm0
1031	vpaddq	%xmm0,%xmm14,%xmm14
1032	vpshufd	$0x10,-32(%rdi),%xmm3
1033	vpmuludq	%xmm7,%xmm2,%xmm1
1034	vpaddq	%xmm1,%xmm13,%xmm13
1035	vpshufd	$0x10,-16(%rdi),%xmm4
1036	vpmuludq	%xmm6,%xmm2,%xmm0
1037	vpaddq	%xmm0,%xmm12,%xmm12
1038	vpmuludq	%xmm5,%xmm2,%xmm2
1039	vpaddq	%xmm2,%xmm11,%xmm11
1040	vpmuludq	%xmm9,%xmm3,%xmm3
1041	vpaddq	%xmm3,%xmm10,%xmm10
1042
1043	vpshufd	$0x10,0(%rdi),%xmm2
1044	vpmuludq	%xmm7,%xmm4,%xmm1
1045	vpaddq	%xmm1,%xmm14,%xmm14
1046	vpmuludq	%xmm6,%xmm4,%xmm0
1047	vpaddq	%xmm0,%xmm13,%xmm13
1048	vpshufd	$0x10,16(%rdi),%xmm3
1049	vpmuludq	%xmm5,%xmm4,%xmm4
1050	vpaddq	%xmm4,%xmm12,%xmm12
1051	vpmuludq	%xmm9,%xmm2,%xmm1
1052	vpaddq	%xmm1,%xmm11,%xmm11
1053	vpshufd	$0x10,32(%rdi),%xmm4
1054	vpmuludq	%xmm8,%xmm2,%xmm2
1055	vpaddq	%xmm2,%xmm10,%xmm10
1056
1057	vpmuludq	%xmm6,%xmm3,%xmm0
1058	vpaddq	%xmm0,%xmm14,%xmm14
1059	vpmuludq	%xmm5,%xmm3,%xmm3
1060	vpaddq	%xmm3,%xmm13,%xmm13
1061	vpshufd	$0x10,48(%rdi),%xmm2
1062	vpmuludq	%xmm9,%xmm4,%xmm1
1063	vpaddq	%xmm1,%xmm12,%xmm12
1064	vpshufd	$0x10,64(%rdi),%xmm3
1065	vpmuludq	%xmm8,%xmm4,%xmm0
1066	vpaddq	%xmm0,%xmm11,%xmm11
1067	vpmuludq	%xmm7,%xmm4,%xmm4
1068	vpaddq	%xmm4,%xmm10,%xmm10
1069
1070	vpmuludq	%xmm5,%xmm2,%xmm2
1071	vpaddq	%xmm2,%xmm14,%xmm14
1072	vpmuludq	%xmm9,%xmm3,%xmm1
1073	vpaddq	%xmm1,%xmm13,%xmm13
1074	vpmuludq	%xmm8,%xmm3,%xmm0
1075	vpaddq	%xmm0,%xmm12,%xmm12
1076	vpmuludq	%xmm7,%xmm3,%xmm1
1077	vpaddq	%xmm1,%xmm11,%xmm11
1078	vpmuludq	%xmm6,%xmm3,%xmm3
1079	vpaddq	%xmm3,%xmm10,%xmm10
1080
1081	jz	.Lshort_tail_avx
1082
1083	vmovdqu	0(%rsi),%xmm0
1084	vmovdqu	16(%rsi),%xmm1
1085
1086	vpsrldq	$6,%xmm0,%xmm2
1087	vpsrldq	$6,%xmm1,%xmm3
1088	vpunpckhqdq	%xmm1,%xmm0,%xmm4
1089	vpunpcklqdq	%xmm1,%xmm0,%xmm0
1090	vpunpcklqdq	%xmm3,%xmm2,%xmm3
1091
1092	vpsrlq	$40,%xmm4,%xmm4
1093	vpsrlq	$26,%xmm0,%xmm1
1094	vpand	%xmm15,%xmm0,%xmm0
1095	vpsrlq	$4,%xmm3,%xmm2
1096	vpand	%xmm15,%xmm1,%xmm1
1097	vpsrlq	$30,%xmm3,%xmm3
1098	vpand	%xmm15,%xmm2,%xmm2
1099	vpand	%xmm15,%xmm3,%xmm3
1100	vpor	32(%rcx),%xmm4,%xmm4
1101
1102	vpshufd	$0x32,-64(%rdi),%xmm9
1103	vpaddq	0(%r11),%xmm0,%xmm0
1104	vpaddq	16(%r11),%xmm1,%xmm1
1105	vpaddq	32(%r11),%xmm2,%xmm2
1106	vpaddq	48(%r11),%xmm3,%xmm3
1107	vpaddq	64(%r11),%xmm4,%xmm4
1108
1109
1110
1111
1112	vpmuludq	%xmm0,%xmm9,%xmm5
1113	vpaddq	%xmm5,%xmm10,%xmm10
1114	vpmuludq	%xmm1,%xmm9,%xmm6
1115	vpaddq	%xmm6,%xmm11,%xmm11
1116	vpmuludq	%xmm2,%xmm9,%xmm5
1117	vpaddq	%xmm5,%xmm12,%xmm12
1118	vpshufd	$0x32,-48(%rdi),%xmm7
1119	vpmuludq	%xmm3,%xmm9,%xmm6
1120	vpaddq	%xmm6,%xmm13,%xmm13
1121	vpmuludq	%xmm4,%xmm9,%xmm9
1122	vpaddq	%xmm9,%xmm14,%xmm14
1123
1124	vpmuludq	%xmm3,%xmm7,%xmm5
1125	vpaddq	%xmm5,%xmm14,%xmm14
1126	vpshufd	$0x32,-32(%rdi),%xmm8
1127	vpmuludq	%xmm2,%xmm7,%xmm6
1128	vpaddq	%xmm6,%xmm13,%xmm13
1129	vpshufd	$0x32,-16(%rdi),%xmm9
1130	vpmuludq	%xmm1,%xmm7,%xmm5
1131	vpaddq	%xmm5,%xmm12,%xmm12
1132	vpmuludq	%xmm0,%xmm7,%xmm7
1133	vpaddq	%xmm7,%xmm11,%xmm11
1134	vpmuludq	%xmm4,%xmm8,%xmm8
1135	vpaddq	%xmm8,%xmm10,%xmm10
1136
1137	vpshufd	$0x32,0(%rdi),%xmm7
1138	vpmuludq	%xmm2,%xmm9,%xmm6
1139	vpaddq	%xmm6,%xmm14,%xmm14
1140	vpmuludq	%xmm1,%xmm9,%xmm5
1141	vpaddq	%xmm5,%xmm13,%xmm13
1142	vpshufd	$0x32,16(%rdi),%xmm8
1143	vpmuludq	%xmm0,%xmm9,%xmm9
1144	vpaddq	%xmm9,%xmm12,%xmm12
1145	vpmuludq	%xmm4,%xmm7,%xmm6
1146	vpaddq	%xmm6,%xmm11,%xmm11
1147	vpshufd	$0x32,32(%rdi),%xmm9
1148	vpmuludq	%xmm3,%xmm7,%xmm7
1149	vpaddq	%xmm7,%xmm10,%xmm10
1150
1151	vpmuludq	%xmm1,%xmm8,%xmm5
1152	vpaddq	%xmm5,%xmm14,%xmm14
1153	vpmuludq	%xmm0,%xmm8,%xmm8
1154	vpaddq	%xmm8,%xmm13,%xmm13
1155	vpshufd	$0x32,48(%rdi),%xmm7
1156	vpmuludq	%xmm4,%xmm9,%xmm6
1157	vpaddq	%xmm6,%xmm12,%xmm12
1158	vpshufd	$0x32,64(%rdi),%xmm8
1159	vpmuludq	%xmm3,%xmm9,%xmm5
1160	vpaddq	%xmm5,%xmm11,%xmm11
1161	vpmuludq	%xmm2,%xmm9,%xmm9
1162	vpaddq	%xmm9,%xmm10,%xmm10
1163
1164	vpmuludq	%xmm0,%xmm7,%xmm7
1165	vpaddq	%xmm7,%xmm14,%xmm14
1166	vpmuludq	%xmm4,%xmm8,%xmm6
1167	vpaddq	%xmm6,%xmm13,%xmm13
1168	vpmuludq	%xmm3,%xmm8,%xmm5
1169	vpaddq	%xmm5,%xmm12,%xmm12
1170	vpmuludq	%xmm2,%xmm8,%xmm6
1171	vpaddq	%xmm6,%xmm11,%xmm11
1172	vpmuludq	%xmm1,%xmm8,%xmm8
1173	vpaddq	%xmm8,%xmm10,%xmm10
1174
1175.Lshort_tail_avx:
1176
1177
1178
1179	vpsrldq	$8,%xmm14,%xmm9
1180	vpsrldq	$8,%xmm13,%xmm8
1181	vpsrldq	$8,%xmm11,%xmm6
1182	vpsrldq	$8,%xmm10,%xmm5
1183	vpsrldq	$8,%xmm12,%xmm7
1184	vpaddq	%xmm8,%xmm13,%xmm13
1185	vpaddq	%xmm9,%xmm14,%xmm14
1186	vpaddq	%xmm5,%xmm10,%xmm10
1187	vpaddq	%xmm6,%xmm11,%xmm11
1188	vpaddq	%xmm7,%xmm12,%xmm12
1189
1190
1191
1192
1193	vpsrlq	$26,%xmm13,%xmm3
1194	vpand	%xmm15,%xmm13,%xmm13
1195	vpaddq	%xmm3,%xmm14,%xmm14
1196
1197	vpsrlq	$26,%xmm10,%xmm0
1198	vpand	%xmm15,%xmm10,%xmm10
1199	vpaddq	%xmm0,%xmm11,%xmm11
1200
1201	vpsrlq	$26,%xmm14,%xmm4
1202	vpand	%xmm15,%xmm14,%xmm14
1203
1204	vpsrlq	$26,%xmm11,%xmm1
1205	vpand	%xmm15,%xmm11,%xmm11
1206	vpaddq	%xmm1,%xmm12,%xmm12
1207
1208	vpaddq	%xmm4,%xmm10,%xmm10
1209	vpsllq	$2,%xmm4,%xmm4
1210	vpaddq	%xmm4,%xmm10,%xmm10
1211
1212	vpsrlq	$26,%xmm12,%xmm2
1213	vpand	%xmm15,%xmm12,%xmm12
1214	vpaddq	%xmm2,%xmm13,%xmm13
1215
1216	vpsrlq	$26,%xmm10,%xmm0
1217	vpand	%xmm15,%xmm10,%xmm10
1218	vpaddq	%xmm0,%xmm11,%xmm11
1219
1220	vpsrlq	$26,%xmm13,%xmm3
1221	vpand	%xmm15,%xmm13,%xmm13
1222	vpaddq	%xmm3,%xmm14,%xmm14
1223
1224	vmovd	%xmm10,-112(%rdi)
1225	vmovd	%xmm11,-108(%rdi)
1226	vmovd	%xmm12,-104(%rdi)
1227	vmovd	%xmm13,-100(%rdi)
1228	vmovd	%xmm14,-96(%rdi)
1229	leaq	88(%r11),%rsp
1230.cfi_def_cfa	%rsp,8
1231	vzeroupper
1232	.byte	0xf3,0xc3
1233.cfi_endproc
1234.size	poly1305_blocks_avx,.-poly1305_blocks_avx
1235
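# poly1305_emit_avx(ctx=%rdi, mac=%rsi, nonce=%rdx): if the state is
# still in base 2^64 this is plain .Lemit; otherwise the five 26-bit
# limbs are recombined into 64-bit words before the final reduction and
# nonce addition.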
1236.type	poly1305_emit_avx,@function
1237.align	32
1238poly1305_emit_avx:
1239.cfi_startproc
1240	cmpl	$0,20(%rdi)
1241	je	.Lemit
1242
1243	movl	0(%rdi),%eax
1244	movl	4(%rdi),%ecx
1245	movl	8(%rdi),%r8d
1246	movl	12(%rdi),%r11d
1247	movl	16(%rdi),%r10d
1248
1249	shlq	$26,%rcx
1250	movq	%r8,%r9
1251	shlq	$52,%r8
1252	addq	%rcx,%rax
1253	shrq	$12,%r9
1254	addq	%rax,%r8
1255	adcq	$0,%r9
1256
1257	shlq	$14,%r11
1258	movq	%r10,%rax
1259	shrq	$24,%r10
1260	addq	%r11,%r9
1261	shlq	$40,%rax
1262	addq	%rax,%r9
1263	adcq	$0,%r10
1264
1265	movq	%r10,%rax
1266	movq	%r10,%rcx
1267	andq	$3,%r10
1268	shrq	$2,%rax
1269	andq	$-4,%rcx
1270	addq	%rcx,%rax
1271	addq	%rax,%r8
1272	adcq	$0,%r9
1273	adcq	$0,%r10
1274
1275	movq	%r8,%rax
1276	addq	$5,%r8
1277	movq	%r9,%rcx
1278	adcq	$0,%r9
1279	adcq	$0,%r10
1280	shrq	$2,%r10
1281	cmovnzq	%r8,%rax
1282	cmovnzq	%r9,%rcx
1283
1284	addq	0(%rdx),%rax
1285	adcq	8(%rdx),%rcx
1286	movq	%rax,0(%rsi)
1287	movq	%rcx,8(%rsi)
1288
1289	.byte	0xf3,0xc3
1290.cfi_endproc
1291.size	poly1305_emit_avx,.-poly1305_emit_avx
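# poly1305_blocks_avx2: AVX2 path, same base 2^26 representation as the
# AVX code but with 256-bit vectors, absorbing four blocks per
# .Loop_avx2 iteration.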
1292.type	poly1305_blocks_avx2,@function
1293.align	32
1294poly1305_blocks_avx2:
1295.cfi_startproc
1296	movl	20(%rdi),%r8d
1297	cmpq	$128,%rdx
1298	jae	.Lblocks_avx2
1299	testl	%r8d,%r8d
1300	jz	.Lblocks
1301
1302.Lblocks_avx2:
1303	andq	$-16,%rdx
1304	jz	.Lno_data_avx2
1305
1306	vzeroupper
1307
1308	testl	%r8d,%r8d
1309	jz	.Lbase2_64_avx2
1310
1311	testq	$63,%rdx
1312	jz	.Leven_avx2
1313
1314	pushq	%rbx
1315.cfi_adjust_cfa_offset	8
1316.cfi_offset	%rbx,-16
1317	pushq	%rbp
1318.cfi_adjust_cfa_offset	8
1319.cfi_offset	%rbp,-24
1320	pushq	%r12
1321.cfi_adjust_cfa_offset	8
1322.cfi_offset	%r12,-32
1323	pushq	%r13
1324.cfi_adjust_cfa_offset	8
1325.cfi_offset	%r13,-40
1326	pushq	%r14
1327.cfi_adjust_cfa_offset	8
1328.cfi_offset	%r14,-48
1329	pushq	%r15
1330.cfi_adjust_cfa_offset	8
1331.cfi_offset	%r15,-56
1332.Lblocks_avx2_body:
1333
1334	movq	%rdx,%r15
1335
1336	movq	0(%rdi),%r8
1337	movq	8(%rdi),%r9
1338	movl	16(%rdi),%ebp
1339
1340	movq	24(%rdi),%r11
1341	movq	32(%rdi),%r13
1342
1343
1344	movl	%r8d,%r14d
1345	andq	$-2147483648,%r8
1346	movq	%r9,%r12
1347	movl	%r9d,%ebx
1348	andq	$-2147483648,%r9
1349
1350	shrq	$6,%r8
1351	shlq	$52,%r12
1352	addq	%r8,%r14
1353	shrq	$12,%rbx
1354	shrq	$18,%r9
1355	addq	%r12,%r14
1356	adcq	%r9,%rbx
1357
1358	movq	%rbp,%r8
1359	shlq	$40,%r8
1360	shrq	$24,%rbp
1361	addq	%r8,%rbx
1362	adcq	$0,%rbp
1363
1364	movq	$-4,%r9
1365	movq	%rbp,%r8
1366	andq	%rbp,%r9
1367	shrq	$2,%r8
1368	andq	$3,%rbp
1369	addq	%r9,%r8
1370	addq	%r8,%r14
1371	adcq	$0,%rbx
1372	adcq	$0,%rbp
1373
1374	movq	%r13,%r12
1375	movq	%r13,%rax
1376	shrq	$2,%r13
1377	addq	%r12,%r13
1378
1379.Lbase2_26_pre_avx2:
1380	addq	0(%rsi),%r14
1381	adcq	8(%rsi),%rbx
1382	leaq	16(%rsi),%rsi
1383	adcq	%rcx,%rbp
1384	subq	$16,%r15
1385
1386	call	__poly1305_block
1387	movq	%r12,%rax
1388
1389	testq	$63,%r15
1390	jnz	.Lbase2_26_pre_avx2
1391
1392	testq	%rcx,%rcx
1393	jz	.Lstore_base2_64_avx2
1394
1395
1396	movq	%r14,%rax
1397	movq	%r14,%rdx
1398	shrq	$52,%r14
1399	movq	%rbx,%r11
1400	movq	%rbx,%r12
1401	shrq	$26,%rdx
1402	andq	$0x3ffffff,%rax
1403	shlq	$12,%r11
1404	andq	$0x3ffffff,%rdx
1405	shrq	$14,%rbx
1406	orq	%r11,%r14
1407	shlq	$24,%rbp
1408	andq	$0x3ffffff,%r14
1409	shrq	$40,%r12
1410	andq	$0x3ffffff,%rbx
1411	orq	%r12,%rbp
1412
1413	testq	%r15,%r15
1414	jz	.Lstore_base2_26_avx2
1415
1416	vmovd	%eax,%xmm0
1417	vmovd	%edx,%xmm1
1418	vmovd	%r14d,%xmm2
1419	vmovd	%ebx,%xmm3
1420	vmovd	%ebp,%xmm4
1421	jmp	.Lproceed_avx2
1422
1423.align	32
1424.Lstore_base2_64_avx2:
1425	movq	%r14,0(%rdi)
1426	movq	%rbx,8(%rdi)
1427	movq	%rbp,16(%rdi)
1428	jmp	.Ldone_avx2
1429
1430.align	16
1431.Lstore_base2_26_avx2:
1432	movl	%eax,0(%rdi)
1433	movl	%edx,4(%rdi)
1434	movl	%r14d,8(%rdi)
1435	movl	%ebx,12(%rdi)
1436	movl	%ebp,16(%rdi)
1437.align	16
1438.Ldone_avx2:
1439	movq	0(%rsp),%r15
1440.cfi_restore	%r15
1441	movq	8(%rsp),%r14
1442.cfi_restore	%r14
1443	movq	16(%rsp),%r13
1444.cfi_restore	%r13
1445	movq	24(%rsp),%r12
1446.cfi_restore	%r12
1447	movq	32(%rsp),%rbp
1448.cfi_restore	%rbp
1449	movq	40(%rsp),%rbx
1450.cfi_restore	%rbx
1451	leaq	48(%rsp),%rsp
1452.cfi_adjust_cfa_offset	-48
1453.Lno_data_avx2:
1454.Lblocks_avx2_epilogue:
1455	.byte	0xf3,0xc3
1456.cfi_endproc
1457
1458.align	32
1459.Lbase2_64_avx2:
1460.cfi_startproc
1461	pushq	%rbx
1462.cfi_adjust_cfa_offset	8
1463.cfi_offset	%rbx,-16
1464	pushq	%rbp
1465.cfi_adjust_cfa_offset	8
1466.cfi_offset	%rbp,-24
1467	pushq	%r12
1468.cfi_adjust_cfa_offset	8
1469.cfi_offset	%r12,-32
1470	pushq	%r13
1471.cfi_adjust_cfa_offset	8
1472.cfi_offset	%r13,-40
1473	pushq	%r14
1474.cfi_adjust_cfa_offset	8
1475.cfi_offset	%r14,-48
1476	pushq	%r15
1477.cfi_adjust_cfa_offset	8
1478.cfi_offset	%r15,-56
1479.Lbase2_64_avx2_body:
1480
1481	movq	%rdx,%r15
1482
1483	movq	24(%rdi),%r11
1484	movq	32(%rdi),%r13
1485
1486	movq	0(%rdi),%r14
1487	movq	8(%rdi),%rbx
1488	movl	16(%rdi),%ebp
1489
1490	movq	%r13,%r12
1491	movq	%r13,%rax
1492	shrq	$2,%r13
1493	addq	%r12,%r13
1494
1495	testq	$63,%rdx
1496	jz	.Linit_avx2
1497
1498.Lbase2_64_pre_avx2:
1499	addq	0(%rsi),%r14
1500	adcq	8(%rsi),%rbx
1501	leaq	16(%rsi),%rsi
1502	adcq	%rcx,%rbp
1503	subq	$16,%r15
1504
1505	call	__poly1305_block
1506	movq	%r12,%rax
1507
1508	testq	$63,%r15
1509	jnz	.Lbase2_64_pre_avx2
1510
1511.Linit_avx2:
1512
1513	movq	%r14,%rax
1514	movq	%r14,%rdx
1515	shrq	$52,%r14
1516	movq	%rbx,%r8
1517	movq	%rbx,%r9
1518	shrq	$26,%rdx
1519	andq	$0x3ffffff,%rax
1520	shlq	$12,%r8
1521	andq	$0x3ffffff,%rdx
1522	shrq	$14,%rbx
1523	orq	%r8,%r14
1524	shlq	$24,%rbp
1525	andq	$0x3ffffff,%r14
1526	shrq	$40,%r9
1527	andq	$0x3ffffff,%rbx
1528	orq	%r9,%rbp
1529
1530	vmovd	%eax,%xmm0
1531	vmovd	%edx,%xmm1
1532	vmovd	%r14d,%xmm2
1533	vmovd	%ebx,%xmm3
1534	vmovd	%ebp,%xmm4
1535	movl	$1,20(%rdi)
1536
1537	call	__poly1305_init_avx
1538
1539.Lproceed_avx2:
1540	movq	%r15,%rdx
1541	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1542	movl	$3221291008,%r11d
1543
1544	movq	0(%rsp),%r15
1545.cfi_restore	%r15
1546	movq	8(%rsp),%r14
1547.cfi_restore	%r14
1548	movq	16(%rsp),%r13
1549.cfi_restore	%r13
1550	movq	24(%rsp),%r12
1551.cfi_restore	%r12
1552	movq	32(%rsp),%rbp
1553.cfi_restore	%rbp
1554	movq	40(%rsp),%rbx
1555.cfi_restore	%rbx
1556	leaq	48(%rsp),%rax
1557	leaq	48(%rsp),%rsp
1558.cfi_adjust_cfa_offset	-48
1559.Lbase2_64_avx2_epilogue:
1560	jmp	.Ldo_avx2
1561.cfi_endproc
1562
1563.align	32
1564.Leven_avx2:
1565.cfi_startproc
1566	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1567	vmovd	0(%rdi),%xmm0
1568	vmovd	4(%rdi),%xmm1
1569	vmovd	8(%rdi),%xmm2
1570	vmovd	12(%rdi),%xmm3
1571	vmovd	16(%rdi),%xmm4
1572
1573.Ldo_avx2:
1574	leaq	-8(%rsp),%r11
1575.cfi_def_cfa	%r11,16
1576	subq	$0x128,%rsp
1577	leaq	.Lconst(%rip),%rcx
1578	leaq	48+64(%rdi),%rdi
1579	vmovdqa	96(%rcx),%ymm7
1580
1581
1582	vmovdqu	-64(%rdi),%xmm9
1583	andq	$-512,%rsp
1584	vmovdqu	-48(%rdi),%xmm10
1585	vmovdqu	-32(%rdi),%xmm6
1586	vmovdqu	-16(%rdi),%xmm11
1587	vmovdqu	0(%rdi),%xmm12
1588	vmovdqu	16(%rdi),%xmm13
1589	leaq	144(%rsp),%rax
1590	vmovdqu	32(%rdi),%xmm14
1591	vpermd	%ymm9,%ymm7,%ymm9
1592	vmovdqu	48(%rdi),%xmm15
1593	vpermd	%ymm10,%ymm7,%ymm10
1594	vmovdqu	64(%rdi),%xmm5
1595	vpermd	%ymm6,%ymm7,%ymm6
1596	vmovdqa	%ymm9,0(%rsp)
1597	vpermd	%ymm11,%ymm7,%ymm11
1598	vmovdqa	%ymm10,32-144(%rax)
1599	vpermd	%ymm12,%ymm7,%ymm12
1600	vmovdqa	%ymm6,64-144(%rax)
1601	vpermd	%ymm13,%ymm7,%ymm13
1602	vmovdqa	%ymm11,96-144(%rax)
1603	vpermd	%ymm14,%ymm7,%ymm14
1604	vmovdqa	%ymm12,128-144(%rax)
1605	vpermd	%ymm15,%ymm7,%ymm15
1606	vmovdqa	%ymm13,160-144(%rax)
1607	vpermd	%ymm5,%ymm7,%ymm5
1608	vmovdqa	%ymm14,192-144(%rax)
1609	vmovdqa	%ymm15,224-144(%rax)
1610	vmovdqa	%ymm5,256-144(%rax)
1611	vmovdqa	64(%rcx),%ymm5
1612
1613
1614
1615	vmovdqu	0(%rsi),%xmm7
1616	vmovdqu	16(%rsi),%xmm8
1617	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1618	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1619	leaq	64(%rsi),%rsi
1620
1621	vpsrldq	$6,%ymm7,%ymm9
1622	vpsrldq	$6,%ymm8,%ymm10
1623	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1624	vpunpcklqdq	%ymm10,%ymm9,%ymm9
1625	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1626
1627	vpsrlq	$30,%ymm9,%ymm10
1628	vpsrlq	$4,%ymm9,%ymm9
1629	vpsrlq	$26,%ymm7,%ymm8
1630	vpsrlq	$40,%ymm6,%ymm6
1631	vpand	%ymm5,%ymm9,%ymm9
1632	vpand	%ymm5,%ymm7,%ymm7
1633	vpand	%ymm5,%ymm8,%ymm8
1634	vpand	%ymm5,%ymm10,%ymm10
1635	vpor	32(%rcx),%ymm6,%ymm6
1636
1637	vpaddq	%ymm2,%ymm9,%ymm2
1638	subq	$64,%rdx
1639	jz	.Ltail_avx2
1640	jmp	.Loop_avx2
1641
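# Main AVX2 loop: multiply the running four-block accumulator by the
# precomputed powers of r while the next 64 bytes are loaded and split
# into 26-bit limbs, then perform the lazy carry propagation.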
1642.align	32
1643.Loop_avx2:
1644
1645
1646
1647
1648
1649
1650
1651
1652	vpaddq	%ymm0,%ymm7,%ymm0
1653	vmovdqa	0(%rsp),%ymm7
1654	vpaddq	%ymm1,%ymm8,%ymm1
1655	vmovdqa	32(%rsp),%ymm8
1656	vpaddq	%ymm3,%ymm10,%ymm3
1657	vmovdqa	96(%rsp),%ymm9
1658	vpaddq	%ymm4,%ymm6,%ymm4
1659	vmovdqa	48(%rax),%ymm10
1660	vmovdqa	112(%rax),%ymm5
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677	vpmuludq	%ymm2,%ymm7,%ymm13
1678	vpmuludq	%ymm2,%ymm8,%ymm14
1679	vpmuludq	%ymm2,%ymm9,%ymm15
1680	vpmuludq	%ymm2,%ymm10,%ymm11
1681	vpmuludq	%ymm2,%ymm5,%ymm12
1682
1683	vpmuludq	%ymm0,%ymm8,%ymm6
1684	vpmuludq	%ymm1,%ymm8,%ymm2
1685	vpaddq	%ymm6,%ymm12,%ymm12
1686	vpaddq	%ymm2,%ymm13,%ymm13
1687	vpmuludq	%ymm3,%ymm8,%ymm6
1688	vpmuludq	64(%rsp),%ymm4,%ymm2
1689	vpaddq	%ymm6,%ymm15,%ymm15
1690	vpaddq	%ymm2,%ymm11,%ymm11
1691	vmovdqa	-16(%rax),%ymm8
1692
1693	vpmuludq	%ymm0,%ymm7,%ymm6
1694	vpmuludq	%ymm1,%ymm7,%ymm2
1695	vpaddq	%ymm6,%ymm11,%ymm11
1696	vpaddq	%ymm2,%ymm12,%ymm12
1697	vpmuludq	%ymm3,%ymm7,%ymm6
1698	vpmuludq	%ymm4,%ymm7,%ymm2
1699	vmovdqu	0(%rsi),%xmm7
1700	vpaddq	%ymm6,%ymm14,%ymm14
1701	vpaddq	%ymm2,%ymm15,%ymm15
1702	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1703
1704	vpmuludq	%ymm3,%ymm8,%ymm6
1705	vpmuludq	%ymm4,%ymm8,%ymm2
1706	vmovdqu	16(%rsi),%xmm8
1707	vpaddq	%ymm6,%ymm11,%ymm11
1708	vpaddq	%ymm2,%ymm12,%ymm12
1709	vmovdqa	16(%rax),%ymm2
1710	vpmuludq	%ymm1,%ymm9,%ymm6
1711	vpmuludq	%ymm0,%ymm9,%ymm9
1712	vpaddq	%ymm6,%ymm14,%ymm14
1713	vpaddq	%ymm9,%ymm13,%ymm13
1714	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1715	leaq	64(%rsi),%rsi
1716
1717	vpmuludq	%ymm1,%ymm2,%ymm6
1718	vpmuludq	%ymm0,%ymm2,%ymm2
1719	vpsrldq	$6,%ymm7,%ymm9
1720	vpaddq	%ymm6,%ymm15,%ymm15
1721	vpaddq	%ymm2,%ymm14,%ymm14
1722	vpmuludq	%ymm3,%ymm10,%ymm6
1723	vpmuludq	%ymm4,%ymm10,%ymm2
1724	vpsrldq	$6,%ymm8,%ymm10
1725	vpaddq	%ymm6,%ymm12,%ymm12
1726	vpaddq	%ymm2,%ymm13,%ymm13
1727	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1728
1729	vpmuludq	%ymm3,%ymm5,%ymm3
1730	vpmuludq	%ymm4,%ymm5,%ymm4
1731	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1732	vpaddq	%ymm3,%ymm13,%ymm2
1733	vpaddq	%ymm4,%ymm14,%ymm3
1734	vpunpcklqdq	%ymm10,%ymm9,%ymm10
1735	vpmuludq	80(%rax),%ymm0,%ymm4
1736	vpmuludq	%ymm1,%ymm5,%ymm0
1737	vmovdqa	64(%rcx),%ymm5
1738	vpaddq	%ymm4,%ymm15,%ymm4
1739	vpaddq	%ymm0,%ymm11,%ymm0
1740
1741
1742
1743
1744	vpsrlq	$26,%ymm3,%ymm14
1745	vpand	%ymm5,%ymm3,%ymm3
1746	vpaddq	%ymm14,%ymm4,%ymm4
1747
1748	vpsrlq	$26,%ymm0,%ymm11
1749	vpand	%ymm5,%ymm0,%ymm0
1750	vpaddq	%ymm11,%ymm12,%ymm1
1751
1752	vpsrlq	$26,%ymm4,%ymm15
1753	vpand	%ymm5,%ymm4,%ymm4
1754
1755	vpsrlq	$4,%ymm10,%ymm9
1756
1757	vpsrlq	$26,%ymm1,%ymm12
1758	vpand	%ymm5,%ymm1,%ymm1
1759	vpaddq	%ymm12,%ymm2,%ymm2
1760
1761	vpaddq	%ymm15,%ymm0,%ymm0
1762	vpsllq	$2,%ymm15,%ymm15
1763	vpaddq	%ymm15,%ymm0,%ymm0
1764
1765	vpand	%ymm5,%ymm9,%ymm9
1766	vpsrlq	$26,%ymm7,%ymm8
1767
1768	vpsrlq	$26,%ymm2,%ymm13
1769	vpand	%ymm5,%ymm2,%ymm2
1770	vpaddq	%ymm13,%ymm3,%ymm3
1771
1772	vpaddq	%ymm9,%ymm2,%ymm2
1773	vpsrlq	$30,%ymm10,%ymm10
1774
1775	vpsrlq	$26,%ymm0,%ymm11
1776	vpand	%ymm5,%ymm0,%ymm0
1777	vpaddq	%ymm11,%ymm1,%ymm1
1778
1779	vpsrlq	$40,%ymm6,%ymm6
1780
1781	vpsrlq	$26,%ymm3,%ymm14
1782	vpand	%ymm5,%ymm3,%ymm3
1783	vpaddq	%ymm14,%ymm4,%ymm4
1784
1785	vpand	%ymm5,%ymm7,%ymm7
1786	vpand	%ymm5,%ymm8,%ymm8
1787	vpand	%ymm5,%ymm10,%ymm10
1788	vpor	32(%rcx),%ymm6,%ymm6
1789
1790	subq	$64,%rdx
1791	jnz	.Loop_avx2
1792
1793.byte	0x66,0x90
1794.Ltail_avx2:
1795
1796
1797
1798
1799
1800
1801
1802	vpaddq	%ymm0,%ymm7,%ymm0
1803	vmovdqu	4(%rsp),%ymm7
1804	vpaddq	%ymm1,%ymm8,%ymm1
1805	vmovdqu	36(%rsp),%ymm8
1806	vpaddq	%ymm3,%ymm10,%ymm3
1807	vmovdqu	100(%rsp),%ymm9
1808	vpaddq	%ymm4,%ymm6,%ymm4
1809	vmovdqu	52(%rax),%ymm10
1810	vmovdqu	116(%rax),%ymm5
1811
1812	vpmuludq	%ymm2,%ymm7,%ymm13
1813	vpmuludq	%ymm2,%ymm8,%ymm14
1814	vpmuludq	%ymm2,%ymm9,%ymm15
1815	vpmuludq	%ymm2,%ymm10,%ymm11
1816	vpmuludq	%ymm2,%ymm5,%ymm12
1817
1818	vpmuludq	%ymm0,%ymm8,%ymm6
1819	vpmuludq	%ymm1,%ymm8,%ymm2
1820	vpaddq	%ymm6,%ymm12,%ymm12
1821	vpaddq	%ymm2,%ymm13,%ymm13
1822	vpmuludq	%ymm3,%ymm8,%ymm6
1823	vpmuludq	68(%rsp),%ymm4,%ymm2
1824	vpaddq	%ymm6,%ymm15,%ymm15
1825	vpaddq	%ymm2,%ymm11,%ymm11
1826
1827	vpmuludq	%ymm0,%ymm7,%ymm6
1828	vpmuludq	%ymm1,%ymm7,%ymm2
1829	vpaddq	%ymm6,%ymm11,%ymm11
1830	vmovdqu	-12(%rax),%ymm8
1831	vpaddq	%ymm2,%ymm12,%ymm12
1832	vpmuludq	%ymm3,%ymm7,%ymm6
1833	vpmuludq	%ymm4,%ymm7,%ymm2
1834	vpaddq	%ymm6,%ymm14,%ymm14
1835	vpaddq	%ymm2,%ymm15,%ymm15
1836
1837	vpmuludq	%ymm3,%ymm8,%ymm6
1838	vpmuludq	%ymm4,%ymm8,%ymm2
1839	vpaddq	%ymm6,%ymm11,%ymm11
1840	vpaddq	%ymm2,%ymm12,%ymm12
1841	vmovdqu	20(%rax),%ymm2
1842	vpmuludq	%ymm1,%ymm9,%ymm6
1843	vpmuludq	%ymm0,%ymm9,%ymm9
1844	vpaddq	%ymm6,%ymm14,%ymm14
1845	vpaddq	%ymm9,%ymm13,%ymm13
1846
1847	vpmuludq	%ymm1,%ymm2,%ymm6
1848	vpmuludq	%ymm0,%ymm2,%ymm2
1849	vpaddq	%ymm6,%ymm15,%ymm15
1850	vpaddq	%ymm2,%ymm14,%ymm14
1851	vpmuludq	%ymm3,%ymm10,%ymm6
1852	vpmuludq	%ymm4,%ymm10,%ymm2
1853	vpaddq	%ymm6,%ymm12,%ymm12
1854	vpaddq	%ymm2,%ymm13,%ymm13
1855
1856	vpmuludq	%ymm3,%ymm5,%ymm3
1857	vpmuludq	%ymm4,%ymm5,%ymm4
1858	vpaddq	%ymm3,%ymm13,%ymm2
1859	vpaddq	%ymm4,%ymm14,%ymm3
1860	vpmuludq	84(%rax),%ymm0,%ymm4
1861	vpmuludq	%ymm1,%ymm5,%ymm0
1862	vmovdqa	64(%rcx),%ymm5
1863	vpaddq	%ymm4,%ymm15,%ymm4
1864	vpaddq	%ymm0,%ymm11,%ymm0
1865
1866
1867
1868
1869	vpsrldq	$8,%ymm12,%ymm8
1870	vpsrldq	$8,%ymm2,%ymm9
1871	vpsrldq	$8,%ymm3,%ymm10
1872	vpsrldq	$8,%ymm4,%ymm6
1873	vpsrldq	$8,%ymm0,%ymm7
1874	vpaddq	%ymm8,%ymm12,%ymm12
1875	vpaddq	%ymm9,%ymm2,%ymm2
1876	vpaddq	%ymm10,%ymm3,%ymm3
1877	vpaddq	%ymm6,%ymm4,%ymm4
1878	vpaddq	%ymm7,%ymm0,%ymm0
1879
1880	vpermq	$0x2,%ymm3,%ymm10
1881	vpermq	$0x2,%ymm4,%ymm6
1882	vpermq	$0x2,%ymm0,%ymm7
1883	vpermq	$0x2,%ymm12,%ymm8
1884	vpermq	$0x2,%ymm2,%ymm9
1885	vpaddq	%ymm10,%ymm3,%ymm3
1886	vpaddq	%ymm6,%ymm4,%ymm4
1887	vpaddq	%ymm7,%ymm0,%ymm0
1888	vpaddq	%ymm8,%ymm12,%ymm12
1889	vpaddq	%ymm9,%ymm2,%ymm2
1890
1891
1892
1893
1894	vpsrlq	$26,%ymm3,%ymm14
1895	vpand	%ymm5,%ymm3,%ymm3
1896	vpaddq	%ymm14,%ymm4,%ymm4
1897
1898	vpsrlq	$26,%ymm0,%ymm11
1899	vpand	%ymm5,%ymm0,%ymm0
1900	vpaddq	%ymm11,%ymm12,%ymm1
1901
1902	vpsrlq	$26,%ymm4,%ymm15
1903	vpand	%ymm5,%ymm4,%ymm4
1904
1905	vpsrlq	$26,%ymm1,%ymm12
1906	vpand	%ymm5,%ymm1,%ymm1
1907	vpaddq	%ymm12,%ymm2,%ymm2
1908
1909	vpaddq	%ymm15,%ymm0,%ymm0
1910	vpsllq	$2,%ymm15,%ymm15
1911	vpaddq	%ymm15,%ymm0,%ymm0
1912
1913	vpsrlq	$26,%ymm2,%ymm13
1914	vpand	%ymm5,%ymm2,%ymm2
1915	vpaddq	%ymm13,%ymm3,%ymm3
1916
1917	vpsrlq	$26,%ymm0,%ymm11
1918	vpand	%ymm5,%ymm0,%ymm0
1919	vpaddq	%ymm11,%ymm1,%ymm1
1920
1921	vpsrlq	$26,%ymm3,%ymm14
1922	vpand	%ymm5,%ymm3,%ymm3
1923	vpaddq	%ymm14,%ymm4,%ymm4
1924
1925	vmovd	%xmm0,-112(%rdi)
1926	vmovd	%xmm1,-108(%rdi)
1927	vmovd	%xmm2,-104(%rdi)
1928	vmovd	%xmm3,-100(%rdi)
1929	vmovd	%xmm4,-96(%rdi)
1930	leaq	8(%r11),%rsp
1931.cfi_def_cfa	%rsp,8
1932	vzeroupper
1933	.byte	0xf3,0xc3
1934.cfi_endproc
1935.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
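# Constant pool: 24-bit and 26-bit lane masks, the 2^24 constant that
# injects the padbit (bit 128) into the top 26-bit limb, vpermd patterns
# for the AVX2/AVX-512 table layout, and masks/shift tables for the
# base 2^44 representation.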
1936.align	64
1937.Lconst:
1938.Lmask24:
1939.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
1940.L129:
1941.long	16777216,0,16777216,0,16777216,0,16777216,0
1942.Lmask26:
1943.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
1944.Lpermd_avx2:
1945.long	2,2,2,3,2,0,2,1
1946.Lpermd_avx512:
1947.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
1948
1949.L2_44_inp_permd:
1950.long	0,1,1,2,2,3,7,7
1951.L2_44_inp_shift:
1952.quad	0,12,24,64
1953.L2_44_mask:
1954.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
1955.L2_44_shift_rgt:
1956.quad	44,44,42,64
1957.L2_44_shift_lft:
1958.quad	8,8,10,64
1959
1960.align	64
1961.Lx_mask44:
1962.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1963.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
1964.Lx_mask42:
1965.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1966.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
1967.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
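# xor128_encrypt_n_pad(out=%rdi, inp=%rsi, otp=%rdx, len=%rcx): XORs the
# input against the pad/key-stream buffer at %rdx, writes the result to
# out, copies the ciphertext back over that buffer and zero-pads it to a
# 16-byte boundary, returning the updated buffer pointer (presumably so
# the ChaCha20-Poly1305 code can MAC the padded ciphertext).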
1968.align	16
1969.globl	xor128_encrypt_n_pad
1970.type	xor128_encrypt_n_pad,@function
1971.align	16
1972xor128_encrypt_n_pad:
1973.cfi_startproc
1974	subq	%rdx,%rsi
1975	subq	%rdx,%rdi
1976	movq	%rcx,%r10
1977	shrq	$4,%rcx
1978	jz	.Ltail_enc
1979	nop
1980.Loop_enc_xmm:
1981	movdqu	(%rsi,%rdx,1),%xmm0
1982	pxor	(%rdx),%xmm0
1983	movdqu	%xmm0,(%rdi,%rdx,1)
1984	movdqa	%xmm0,(%rdx)
1985	leaq	16(%rdx),%rdx
1986	decq	%rcx
1987	jnz	.Loop_enc_xmm
1988
1989	andq	$15,%r10
1990	jz	.Ldone_enc
1991
1992.Ltail_enc:
1993	movq	$16,%rcx
1994	subq	%r10,%rcx
1995	xorl	%eax,%eax
1996.Loop_enc_byte:
1997	movb	(%rsi,%rdx,1),%al
1998	xorb	(%rdx),%al
1999	movb	%al,(%rdi,%rdx,1)
2000	movb	%al,(%rdx)
2001	leaq	1(%rdx),%rdx
2002	decq	%r10
2003	jnz	.Loop_enc_byte
2004
2005	xorl	%eax,%eax
2006.Loop_enc_pad:
2007	movb	%al,(%rdx)
2008	leaq	1(%rdx),%rdx
2009	decq	%rcx
2010	jnz	.Loop_enc_pad
2011
2012.Ldone_enc:
2013	movq	%rdx,%rax
2014	.byte	0xf3,0xc3
2015.cfi_endproc
2016.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
2017
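# xor128_decrypt_n_pad: as above, but the incoming ciphertext (rather
# than the XOR result) is what gets copied into the pad buffer, so the
# MAC is still computed over the ciphertext.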
2018.globl	xor128_decrypt_n_pad
2019.type	xor128_decrypt_n_pad,@function
2020.align	16
2021xor128_decrypt_n_pad:
2022.cfi_startproc
2023	subq	%rdx,%rsi
2024	subq	%rdx,%rdi
2025	movq	%rcx,%r10
2026	shrq	$4,%rcx
2027	jz	.Ltail_dec
2028	nop
2029.Loop_dec_xmm:
2030	movdqu	(%rsi,%rdx,1),%xmm0
2031	movdqa	(%rdx),%xmm1
2032	pxor	%xmm0,%xmm1
2033	movdqu	%xmm1,(%rdi,%rdx,1)
2034	movdqa	%xmm0,(%rdx)
2035	leaq	16(%rdx),%rdx
2036	decq	%rcx
2037	jnz	.Loop_dec_xmm
2038
2039	pxor	%xmm1,%xmm1
2040	andq	$15,%r10
2041	jz	.Ldone_dec
2042
2043.Ltail_dec:
2044	movq	$16,%rcx
2045	subq	%r10,%rcx
2046	xorl	%eax,%eax
2047	xorq	%r11,%r11
2048.Loop_dec_byte:
2049	movb	(%rsi,%rdx,1),%r11b
2050	movb	(%rdx),%al
2051	xorb	%r11b,%al
2052	movb	%al,(%rdi,%rdx,1)
2053	movb	%r11b,(%rdx)
2054	leaq	1(%rdx),%rdx
2055	decq	%r10
2056	jnz	.Loop_dec_byte
2057
2058	xorl	%eax,%eax
2059.Loop_dec_pad:
2060	movb	%al,(%rdx)
2061	leaq	1(%rdx),%rdx
2062	decq	%rcx
2063	jnz	.Loop_dec_pad
2064
2065.Ldone_dec:
2066	movq	%rdx,%rax
2067	.byte	0xf3,0xc3
2068.cfi_endproc
2069.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
2070