xref: /freebsd/sys/crypto/openssl/amd64/poly1305-x86_64.S (revision 90b5fc95832da64a5f56295e687379732c33718f)
1/* $FreeBSD$ */
2/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
3.text
4
5
6
7.globl	poly1305_init
8.hidden	poly1305_init
9.globl	poly1305_blocks
10.hidden	poly1305_blocks
11.globl	poly1305_emit
12.hidden	poly1305_emit
13
/*
 * poly1305_init: reset the state and, if a key is given, store the masked
 * key words and hand back the block/emit routines to use.
 *
 * In:   %rdi = state, %rsi = key pointer (may be 0),
 *       %rdx = two-slot function-pointer table to fill
 * Out:  %eax = 1 when a key was installed, 0 when %rsi was 0
 * Writes: 0..23(%rdi) zeroed; 24(%rdi), 32(%rdi) = masked key words;
 *         0(%rdx) = blocks routine, 8(%rdx) = emit routine
 * Clobbers: %rcx, %r9, %r10, %r11, flags
 */
14.type	poly1305_init,@function
15.align	32
16poly1305_init:
17.cfi_startproc
/* Zero the first 24 bytes of the state; %rax stays 0 for the no-key return. */
18	xorq	%rax,%rax
19	movq	%rax,0(%rdi)
20	movq	%rax,8(%rdi)
21	movq	%rax,16(%rdi)
22
/* No key pointer: return immediately (nothing else is written). */
23	cmpq	$0,%rsi
24	je	.Lno_key
25
/* Start with the scalar routines, then upgrade based on capability bits. */
26	leaq	poly1305_blocks(%rip),%r10
27	leaq	poly1305_emit(%rip),%r11
28	movq	OPENSSL_ia32cap_P+4(%rip),%r9
29	leaq	poly1305_blocks_avx(%rip),%rax
30	leaq	poly1305_emit_avx(%rip),%rcx
/*
 * Bit 28 of the qword at OPENSSL_ia32cap_P+4 selects both *_avx routines.
 * NOTE(review): presumably the AVX feature flag - confirm against the
 * OPENSSL_ia32cap layout.
 */
31	btq	$28,%r9
32	cmovcq	%rax,%r10
33	cmovcq	%rcx,%r11
/*
 * Bit 37 replaces only the blocks routine with the _avx2 one; the emit
 * pointer chosen above is kept. NOTE(review): presumably the AVX2 flag.
 */
34	leaq	poly1305_blocks_avx2(%rip),%rax
35	btq	$37,%r9
36	cmovcq	%rax,%r10
/*
 * Mask the first 16 key bytes: the constants clear the top four bits of
 * every 32-bit word and the low two bits of words 1..3 (the Poly1305 key
 * clamp).
 */
37	movq	$0x0ffffffc0fffffff,%rax
38	movq	$0x0ffffffc0ffffffc,%rcx
39	andq	0(%rsi),%rax
40	andq	8(%rsi),%rcx
41	movq	%rax,24(%rdi)
42	movq	%rcx,32(%rdi)
/* Publish the selected routines and report success. */
43	movq	%r10,0(%rdx)
44	movq	%r11,8(%rdx)
45	movl	$1,%eax
46.Lno_key:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
47	.byte	0xf3,0xc3
48.cfi_endproc
49.size	poly1305_init,.-poly1305_init
50
/*
 * poly1305_blocks: absorb whole 16-byte blocks into the accumulator using
 * 64-bit integer arithmetic.
 *
 * In:   %rdi = state (accumulator words at 0/8/16, key words at 24/32)
 *       %rsi = input, %rdx = length in bytes (len >> 4 blocks are consumed)
 *       %rcx = value added (with carry) into the top accumulator word for
 *              every block; presumably the 2^128 pad bit - confirm with caller
 * Out:  accumulator at 0/8/16(%rdi) updated; nothing is returned in %rax
 * Clobbers: %rax, %rdx, %rsi, %r8-%r11, flags.  %rbx, %rbp, %r12-%r15 are
 *           pushed on entry and reloaded before returning.
 *
 * .Lblocks is also the fallback target used by the AVX entry points.
 */
51.type	poly1305_blocks,@function
52.align	32
53poly1305_blocks:
54.cfi_startproc
55.Lblocks:
/* Convert the byte count to a block count; nothing to do for < 16 bytes. */
56	shrq	$4,%rdx
57	jz	.Lno_data
58
59	pushq	%rbx
60.cfi_adjust_cfa_offset	8
61.cfi_offset	%rbx,-16
62	pushq	%rbp
63.cfi_adjust_cfa_offset	8
64.cfi_offset	%rbp,-24
65	pushq	%r12
66.cfi_adjust_cfa_offset	8
67.cfi_offset	%r12,-32
68	pushq	%r13
69.cfi_adjust_cfa_offset	8
70.cfi_offset	%r13,-40
71	pushq	%r14
72.cfi_adjust_cfa_offset	8
73.cfi_offset	%r14,-48
74	pushq	%r15
75.cfi_adjust_cfa_offset	8
76.cfi_offset	%r15,-56
77.Lblocks_body:
78
/* %r15 = blocks remaining (%rdx is needed as the mulq high-half output). */
79	movq	%rdx,%r15
80
/* %r11 = r0, %r13 = r1 (key words). */
81	movq	24(%rdi),%r11
82	movq	32(%rdi),%r13
83
/* %r14:%rbx:%rbp = h0:h1:h2 (accumulator). */
84	movq	0(%rdi),%r14
85	movq	8(%rdi),%rbx
86	movq	16(%rdi),%rbp
87
/* %r12 = r1, %r13 = s1 = r1 + (r1 >> 2); %rax = r1 is the first multiplier. */
88	movq	%r13,%r12
89	shrq	$2,%r13
90	movq	%r12,%rax
91	addq	%r12,%r13
92	jmp	.Loop
93
94.align	32
95.Loop:
/* h += next 16 input bytes; %rcx goes into the top word with the carry. */
96	addq	0(%rsi),%r14
97	adcq	8(%rsi),%rbx
98	leaq	16(%rsi),%rsi
99	adcq	%rcx,%rbp
/*
 * h *= r.  With %rax = r1 on entry the products below accumulate
 *   d0 (%r14, high half in %r8) = h0*r0 + h1*s1
 *   d1 (%r9,  high half in %r10) = h0*r1 + h1*r0 + h2*s1
 *   d2 (folded into %r10)        = h2*r0
 */
100	mulq	%r14
101	movq	%rax,%r9
102	movq	%r11,%rax
103	movq	%rdx,%r10
104
105	mulq	%r14
106	movq	%rax,%r14
107	movq	%r11,%rax
108	movq	%rdx,%r8
109
110	mulq	%rbx
111	addq	%rax,%r9
112	movq	%r13,%rax
113	adcq	%rdx,%r10
114
115	mulq	%rbx
116	movq	%rbp,%rbx
117	addq	%rax,%r14
118	adcq	%rdx,%r8
119
120	imulq	%r13,%rbx
121	addq	%rbx,%r9
122	movq	%r8,%rbx
123	adcq	$0,%r10
124
125	imulq	%r11,%rbp
126	addq	%r9,%rbx
127	movq	$-4,%rax
128	adcq	%rbp,%r10
129
/*
 * Partial reduction: keep the low two bits of the top word as the new h2
 * and add (top & -4) + (top >> 2), i.e. 5 * (top >> 2), back into h0 with
 * carry propagation.
 */
130	andq	%r10,%rax
131	movq	%r10,%rbp
132	shrq	$2,%r10
133	andq	$3,%rbp
134	addq	%r10,%rax
135	addq	%rax,%r14
136	adcq	$0,%rbx
137	adcq	$0,%rbp
/* Reload %rax = r1 for the next iteration's first mulq. */
138	movq	%r12,%rax
139	decq	%r15
140	jnz	.Loop
141
/* Write the accumulator back. */
142	movq	%r14,0(%rdi)
143	movq	%rbx,8(%rdi)
144	movq	%rbp,16(%rdi)
145
/* Reload the saved registers from the stack, then drop the 48-byte frame. */
146	movq	0(%rsp),%r15
147.cfi_restore	%r15
148	movq	8(%rsp),%r14
149.cfi_restore	%r14
150	movq	16(%rsp),%r13
151.cfi_restore	%r13
152	movq	24(%rsp),%r12
153.cfi_restore	%r12
154	movq	32(%rsp),%rbp
155.cfi_restore	%rbp
156	movq	40(%rsp),%rbx
157.cfi_restore	%rbx
158	leaq	48(%rsp),%rsp
159.cfi_adjust_cfa_offset	-48
160.Lno_data:
161.Lblocks_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
162	.byte	0xf3,0xc3
163.cfi_endproc
164.size	poly1305_blocks,.-poly1305_blocks
165
/*
 * poly1305_emit: finish the reduction of the base 2^64 accumulator, add a
 * 128-bit value and write the 16-byte result.
 *
 * In:   %rdi = state (accumulator words at 0/8/16)
 *       %rsi = 16-byte output buffer
 *       %rdx = pointer to two 64-bit words added to the result
 *              (presumably the nonce/"s" half of the key - confirm with caller)
 * Clobbers: %rax, %rcx, %r8, %r9, %r10, flags
 *
 * .Lemit is also the fallback target of poly1305_emit_avx.
 */
166.type	poly1305_emit,@function
167.align	32
168poly1305_emit:
169.cfi_startproc
170.Lemit:
171	movq	0(%rdi),%r8
172	movq	8(%rdi),%r9
173	movq	16(%rdi),%r10
174
/*
 * Compute h + 5 in %r8:%r9:%r10 while keeping the original low 128 bits in
 * %rax:%rcx.  If h + 5 has any bit at or above 2^130 (top word >> 2 is
 * non-zero) the low 128 bits of h + 5 are selected instead.
 */
175	movq	%r8,%rax
176	addq	$5,%r8
177	movq	%r9,%rcx
178	adcq	$0,%r9
179	adcq	$0,%r10
180	shrq	$2,%r10
181	cmovnzq	%r8,%rax
182	cmovnzq	%r9,%rcx
183
/* Add the 128-bit value at (%rdx) (carry out is discarded) and store. */
184	addq	0(%rdx),%rax
185	adcq	8(%rdx),%rcx
186	movq	%rax,0(%rsi)
187	movq	%rcx,8(%rsi)
188
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
189	.byte	0xf3,0xc3
190.cfi_endproc
191.size	poly1305_emit,.-poly1305_emit
/*
 * __poly1305_block: internal helper, one multiply-and-reduce step h = h * r.
 * Same instruction sequence as the loop body of poly1305_blocks, without
 * the input add.  Uses a private register contract, not the C ABI.
 *
 * In:   %rax = r1, %r11 = r0, %r13 = s1 = r1 + (r1 >> 2)
 *       %r14:%rbx:%rbp = h0:h1:h2
 * Out:  %r14:%rbx:%rbp = product, partially reduced (top word keeps only
 *       its low two bits plus a possible carry)
 * Clobbers: %rax, %rdx, %r8, %r9, %r10, flags.  %r11, %r12, %r13 untouched.
 */
192.type	__poly1305_block,@function
193.align	32
194__poly1305_block:
195.cfi_startproc
/* %r9:%r10 = h0*r1 */
196	mulq	%r14
197	movq	%rax,%r9
198	movq	%r11,%rax
199	movq	%rdx,%r10
200
/* %r14:%r8 = h0*r0 */
201	mulq	%r14
202	movq	%rax,%r14
203	movq	%r11,%rax
204	movq	%rdx,%r8
205
/* %r9:%r10 += h1*r0 */
206	mulq	%rbx
207	addq	%rax,%r9
208	movq	%r13,%rax
209	adcq	%rdx,%r10
210
/* %r14:%r8 += h1*s1; %rbx is then reused for h2 */
211	mulq	%rbx
212	movq	%rbp,%rbx
213	addq	%rax,%r14
214	adcq	%rdx,%r8
215
/* %r9:%r10 += h2*s1 */
216	imulq	%r13,%rbx
217	addq	%rbx,%r9
218	movq	%r8,%rbx
219	adcq	$0,%r10
220
/* new h1 = high(d0) + low(d1); %r10 = high(d1) + h2*r0 + carry */
221	imulq	%r11,%rbp
222	addq	%r9,%rbx
223	movq	$-4,%rax
224	adcq	%rbp,%r10
225
/* Fold 5 * (top >> 2) into h0 and keep (top & 3) as h2. */
226	andq	%r10,%rax
227	movq	%r10,%rbp
228	shrq	$2,%r10
229	andq	$3,%rbp
230	addq	%r10,%rax
231	addq	%rax,%r14
232	adcq	$0,%rbx
233	adcq	$0,%rbp
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
234	.byte	0xf3,0xc3
235.cfi_endproc
236.size	__poly1305_block,.-__poly1305_block
237
/*
 * __poly1305_init_avx: internal helper that fills the table of key powers
 * r, r^2, r^3, r^4 split into 26-bit limbs, for the vector code paths.
 *
 * In:   %rdi = state, %r11 = r0, %r12 = r1, %r13 = s1 = r1 + (r1 >> 2)
 * Out:  bytes 48..191 of the state written; %rdi restored on return
 * Clobbers: %rax, %rdx, %rbx, %rbp, %r8, %r9, %r10, %r14, flags
 *
 * Table layout, relative to the biased pointer %rdi + 112 used below.
 * Each 16-byte row holds one limb for four powers, in 32-bit columns
 *   +0 = r^2, +4 = r, +8 = r^4, +12 = r^3
 * Rows: -64 limb0, -48 limb1, -32 5*limb1, -16 limb2, 0 5*limb2,
 *       16 limb3, 32 5*limb3, 48 limb4, 64 5*limb4
 */
238.type	__poly1305_init_avx,@function
239.align	32
240__poly1305_init_avx:
241.cfi_startproc
/* h = r (top word 0), so the first multiply yields r^2. */
242	movq	%r11,%r14
243	movq	%r12,%rbx
244	xorq	%rbp,%rbp
245
/* Bias %rdi by 112 so every table row is reachable with a small offset. */
246	leaq	48+64(%rdi),%rdi
247
248	movq	%r12,%rax
249	call	__poly1305_block
250
/*
 * Split r^2 (%r14:%rbx:%rbp, via %eax/%r8) and r (%r11:%r12, via %edx/%r9)
 * in parallel into 26-bit limbs; columns +0 and +4.
 */
/* limb 0 = bits 0..25 */
251	movl	$0x3ffffff,%eax
252	movl	$0x3ffffff,%edx
253	movq	%r14,%r8
254	andl	%r14d,%eax
255	movq	%r11,%r9
256	andl	%r11d,%edx
257	movl	%eax,-64(%rdi)
258	shrq	$26,%r8
259	movl	%edx,-60(%rdi)
260	shrq	$26,%r9
261
/* limb 1 = bits 26..51, and 5*limb1 via lea x+4x */
262	movl	$0x3ffffff,%eax
263	movl	$0x3ffffff,%edx
264	andl	%r8d,%eax
265	andl	%r9d,%edx
266	movl	%eax,-48(%rdi)
267	leal	(%rax,%rax,4),%eax
268	movl	%edx,-44(%rdi)
269	leal	(%rdx,%rdx,4),%edx
270	movl	%eax,-32(%rdi)
271	shrq	$26,%r8
272	movl	%edx,-28(%rdi)
273	shrq	$26,%r9
274
/* limb 2 = top 12 bits of word 0 joined with the low 14 bits of word 1 */
275	movq	%rbx,%rax
276	movq	%r12,%rdx
277	shlq	$12,%rax
278	shlq	$12,%rdx
279	orq	%r8,%rax
280	orq	%r9,%rdx
281	andl	$0x3ffffff,%eax
282	andl	$0x3ffffff,%edx
283	movl	%eax,-16(%rdi)
284	leal	(%rax,%rax,4),%eax
285	movl	%edx,-12(%rdi)
286	leal	(%rdx,%rdx,4),%edx
287	movl	%eax,0(%rdi)
288	movq	%rbx,%r8
289	movl	%edx,4(%rdi)
290	movq	%r12,%r9
291
/* limb 3 = bits 14..39 of word 1 */
292	movl	$0x3ffffff,%eax
293	movl	$0x3ffffff,%edx
294	shrq	$14,%r8
295	shrq	$14,%r9
296	andl	%r8d,%eax
297	andl	%r9d,%edx
298	movl	%eax,16(%rdi)
299	leal	(%rax,%rax,4),%eax
300	movl	%edx,20(%rdi)
301	leal	(%rdx,%rdx,4),%edx
302	movl	%eax,32(%rdi)
303	shrq	$26,%r8
304	movl	%edx,36(%rdi)
305	shrq	$26,%r9
306
/*
 * limb 4 = bits 40..63 of word 1; for r^2 the top word (%rbp) is OR'ed in
 * at bit 24.  No mask is applied to this limb.
 */
307	movq	%rbp,%rax
308	shlq	$24,%rax
309	orq	%rax,%r8
310	movl	%r8d,48(%rdi)
311	leaq	(%r8,%r8,4),%r8
312	movl	%r9d,52(%rdi)
313	leaq	(%r9,%r9,4),%r9
314	movl	%r8d,64(%rdi)
315	movl	%r9d,68(%rdi)
316
/* h = r^2 * r = r^3; its limbs go to column +12. */
317	movq	%r12,%rax
318	call	__poly1305_block
319
320	movl	$0x3ffffff,%eax
321	movq	%r14,%r8
322	andl	%r14d,%eax
323	shrq	$26,%r8
324	movl	%eax,-52(%rdi)
325
326	movl	$0x3ffffff,%edx
327	andl	%r8d,%edx
328	movl	%edx,-36(%rdi)
329	leal	(%rdx,%rdx,4),%edx
330	shrq	$26,%r8
331	movl	%edx,-20(%rdi)
332
333	movq	%rbx,%rax
334	shlq	$12,%rax
335	orq	%r8,%rax
336	andl	$0x3ffffff,%eax
337	movl	%eax,-4(%rdi)
338	leal	(%rax,%rax,4),%eax
339	movq	%rbx,%r8
340	movl	%eax,12(%rdi)
341
342	movl	$0x3ffffff,%edx
343	shrq	$14,%r8
344	andl	%r8d,%edx
345	movl	%edx,28(%rdi)
346	leal	(%rdx,%rdx,4),%edx
347	shrq	$26,%r8
348	movl	%edx,44(%rdi)
349
350	movq	%rbp,%rax
351	shlq	$24,%rax
352	orq	%rax,%r8
353	movl	%r8d,60(%rdi)
354	leaq	(%r8,%r8,4),%r8
355	movl	%r8d,76(%rdi)
356
/* h = r^3 * r = r^4; its limbs go to column +8. */
357	movq	%r12,%rax
358	call	__poly1305_block
359
360	movl	$0x3ffffff,%eax
361	movq	%r14,%r8
362	andl	%r14d,%eax
363	shrq	$26,%r8
364	movl	%eax,-56(%rdi)
365
366	movl	$0x3ffffff,%edx
367	andl	%r8d,%edx
368	movl	%edx,-40(%rdi)
369	leal	(%rdx,%rdx,4),%edx
370	shrq	$26,%r8
371	movl	%edx,-24(%rdi)
372
373	movq	%rbx,%rax
374	shlq	$12,%rax
375	orq	%r8,%rax
376	andl	$0x3ffffff,%eax
377	movl	%eax,-8(%rdi)
378	leal	(%rax,%rax,4),%eax
379	movq	%rbx,%r8
380	movl	%eax,8(%rdi)
381
382	movl	$0x3ffffff,%edx
383	shrq	$14,%r8
384	andl	%r8d,%edx
385	movl	%edx,24(%rdi)
386	leal	(%rdx,%rdx,4),%edx
387	shrq	$26,%r8
388	movl	%edx,40(%rdi)
389
390	movq	%rbp,%rax
391	shlq	$24,%rax
392	orq	%rax,%r8
393	movl	%r8d,56(%rdi)
394	leaq	(%r8,%r8,4),%r8
395	movl	%r8d,72(%rdi)
396
/* Undo the +112 bias so the caller sees its original state pointer. */
397	leaq	-48-64(%rdi),%rdi
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
398	.byte	0xf3,0xc3
399.cfi_endproc
400.size	__poly1305_init_avx,.-__poly1305_init_avx
401
/*
 * poly1305_blocks_avx: AVX block routine - entry, dispatch, and the path
 * that absorbs one odd block with scalar code while the accumulator is
 * already stored as five 26-bit limbs.
 *
 * In:   %rdi = state, %rsi = input, %rdx = length in bytes,
 *       %rcx = value added into the top accumulator word per block
 *              (presumably the 2^128 pad bit - confirm with caller)
 * State: 20(%rdi) is a flag dword; non-zero means 0..19(%rdi) holds the
 *        accumulator as five 32-bit limb slots (base 2^26) instead of
 *        three 64-bit words.  It is written as 1 at .Lbase2_64_avx.
 *
 * Dispatch:
 *   len < 128 and flag == 0         -> scalar .Lblocks
 *   len & -16 == 0                  -> return
 *   flag == 0                       -> .Lbase2_64_avx (convert + build table)
 *   len a multiple of 32            -> .Leven_avx (straight to vector code)
 *   otherwise                       -> fall through: one scalar block first
 */
402.type	poly1305_blocks_avx,@function
403.align	32
404poly1305_blocks_avx:
405.cfi_startproc
406	movl	20(%rdi),%r8d
407	cmpq	$128,%rdx
408	jae	.Lblocks_avx
409	testl	%r8d,%r8d
410	jz	.Lblocks
411
412.Lblocks_avx:
/* Round the length down to whole 16-byte blocks. */
413	andq	$-16,%rdx
414	jz	.Lno_data_avx
415
416	vzeroupper
417
418	testl	%r8d,%r8d
419	jz	.Lbase2_64_avx
420
421	testq	$31,%rdx
422	jz	.Leven_avx
423
424	pushq	%rbx
425.cfi_adjust_cfa_offset	8
426.cfi_offset	%rbx,-16
427	pushq	%rbp
428.cfi_adjust_cfa_offset	8
429.cfi_offset	%rbp,-24
430	pushq	%r12
431.cfi_adjust_cfa_offset	8
432.cfi_offset	%r12,-32
433	pushq	%r13
434.cfi_adjust_cfa_offset	8
435.cfi_offset	%r13,-40
436	pushq	%r14
437.cfi_adjust_cfa_offset	8
438.cfi_offset	%r14,-48
439	pushq	%r15
440.cfi_adjust_cfa_offset	8
441.cfi_offset	%r15,-56
442.Lblocks_avx_body:
443
/* %r15 = bytes remaining. */
444	movq	%rdx,%r15
445
/* %r8 = limb slots 0,1; %r9 = limb slots 2,3; %ebp = limb slot 4. */
446	movq	0(%rdi),%r8
447	movq	8(%rdi),%r9
448	movl	16(%rdi),%ebp
449
/* %r11 = r0, %r13 = r1 (key words). */
450	movq	24(%rdi),%r11
451	movq	32(%rdi),%r13
452
453
/*
 * Recombine the limb slots into %r14:%rbx:%rbp = h0:h1:h2, limb i being
 * weighted by 2^(26*i): slot 1 lands at bit 26, slot 2 at bit 52 (split
 * across h0/h1), slot 3 at bit 14 of h1, slot 4 at bit 40 of h1 with its
 * upper part going to h2.
 */
454	movl	%r8d,%r14d
455	andq	$-2147483648,%r8
456	movq	%r9,%r12
457	movl	%r9d,%ebx
458	andq	$-2147483648,%r9
459
460	shrq	$6,%r8
461	shlq	$52,%r12
462	addq	%r8,%r14
463	shrq	$12,%rbx
464	shrq	$18,%r9
465	addq	%r12,%r14
466	adcq	%r9,%rbx
467
468	movq	%rbp,%r8
469	shlq	$40,%r8
470	shrq	$24,%rbp
471	addq	%r8,%rbx
472	adcq	$0,%rbp
473
/* Partial reduction: h2 keeps its low two bits, 5 * (h2 >> 2) goes to h0. */
474	movq	$-4,%r9
475	movq	%rbp,%r8
476	andq	%rbp,%r9
477	shrq	$2,%r8
478	andq	$3,%rbp
479	addq	%r9,%r8
480	addq	%r8,%r14
481	adcq	$0,%rbx
482	adcq	$0,%rbp
483
/* %r12 = %rax = r1, %r13 = s1 = r1 + (r1 >> 2): __poly1305_block inputs. */
484	movq	%r13,%r12
485	movq	%r13,%rax
486	shrq	$2,%r13
487	addq	%r12,%r13
488
/* h += one 16-byte block (+ %rcx in the top word), then h *= r. */
489	addq	0(%rsi),%r14
490	adcq	8(%rsi),%rbx
491	leaq	16(%rsi),%rsi
492	adcq	%rcx,%rbp
493
494	call	__poly1305_block
495
/* %rcx == 0: leave the result in base 2^64 form (see the store below). */
496	testq	%rcx,%rcx
497	jz	.Lstore_base2_64_avx
498
499
/*
 * Split h back into five 26-bit limbs:
 *   %rax = limb0, %rdx = limb1, %r14 = limb2, %rbx = limb3, %rbp = limb4
 */
500	movq	%r14,%rax
501	movq	%r14,%rdx
502	shrq	$52,%r14
503	movq	%rbx,%r11
504	movq	%rbx,%r12
505	shrq	$26,%rdx
506	andq	$0x3ffffff,%rax
507	shlq	$12,%r11
508	andq	$0x3ffffff,%rdx
509	shrq	$14,%rbx
510	orq	%r11,%r14
511	shlq	$24,%rbp
512	andq	$0x3ffffff,%r14
513	shrq	$40,%r12
514	andq	$0x3ffffff,%rbx
515	orq	%r12,%rbp
516
/* That block was all there was: store the limbs and return. */
517	subq	$16,%r15
518	jz	.Lstore_base2_26_avx
519
/* More data: hand the limbs to the vector code in %xmm0-%xmm4. */
520	vmovd	%eax,%xmm0
521	vmovd	%edx,%xmm1
522	vmovd	%r14d,%xmm2
523	vmovd	%ebx,%xmm3
524	vmovd	%ebp,%xmm4
525	jmp	.Lproceed_avx
526
527.align	32
528.Lstore_base2_64_avx:
/*
 * Three 64-bit stores; the one at 16(%rdi) covers bytes 16..23 and so
 * also overwrites the flag dword at 20(%rdi) with the upper half of %rbp.
 */
529	movq	%r14,0(%rdi)
530	movq	%rbx,8(%rdi)
531	movq	%rbp,16(%rdi)
532	jmp	.Ldone_avx
533
534.align	16
535.Lstore_base2_26_avx:
/* Five 32-bit limb stores; the flag at 20(%rdi) is left untouched. */
536	movl	%eax,0(%rdi)
537	movl	%edx,4(%rdi)
538	movl	%r14d,8(%rdi)
539	movl	%ebx,12(%rdi)
540	movl	%ebp,16(%rdi)
541.align	16
542.Ldone_avx:
/* Reload the saved registers from the stack, then drop the 48-byte frame. */
543	movq	0(%rsp),%r15
544.cfi_restore	%r15
545	movq	8(%rsp),%r14
546.cfi_restore	%r14
547	movq	16(%rsp),%r13
548.cfi_restore	%r13
549	movq	24(%rsp),%r12
550.cfi_restore	%r12
551	movq	32(%rsp),%rbp
552.cfi_restore	%rbp
553	movq	40(%rsp),%rbx
554.cfi_restore	%rbx
555	leaq	48(%rsp),%rsp
556.cfi_adjust_cfa_offset	-48
557.Lno_data_avx:
558.Lblocks_avx_epilogue:
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
559	.byte	0xf3,0xc3
560.cfi_endproc
561
/*
 * .Lbase2_64_avx: part of poly1305_blocks_avx, reached when the flag at
 * 20(%rdi) is zero, i.e. the accumulator is still three 64-bit words.
 * Optionally absorbs one block with scalar code (when the length is not a
 * multiple of 32), converts the accumulator to five 26-bit limbs in
 * %xmm0-%xmm4, sets the flag, builds the key-power table and continues at
 * .Ldo_avx with %rdx = bytes remaining.
 *
 * In:   %rdi = state, %rsi = input, %rdx = length (multiple of 16, non-zero),
 *       %rcx = value added into the top accumulator word per block
 */
562.align	32
563.Lbase2_64_avx:
564.cfi_startproc
565	pushq	%rbx
566.cfi_adjust_cfa_offset	8
567.cfi_offset	%rbx,-16
568	pushq	%rbp
569.cfi_adjust_cfa_offset	8
570.cfi_offset	%rbp,-24
571	pushq	%r12
572.cfi_adjust_cfa_offset	8
573.cfi_offset	%r12,-32
574	pushq	%r13
575.cfi_adjust_cfa_offset	8
576.cfi_offset	%r13,-40
577	pushq	%r14
578.cfi_adjust_cfa_offset	8
579.cfi_offset	%r14,-48
580	pushq	%r15
581.cfi_adjust_cfa_offset	8
582.cfi_offset	%r15,-56
583.Lbase2_64_avx_body:
584
/* %r15 = bytes remaining. */
585	movq	%rdx,%r15
586
/* %r11 = r0, %r13 = r1 (key words). */
587	movq	24(%rdi),%r11
588	movq	32(%rdi),%r13
589
/* %r14:%rbx:%rbp = h0:h1:h2; only the low 32 bits of the top word are read. */
590	movq	0(%rdi),%r14
591	movq	8(%rdi),%rbx
592	movl	16(%rdi),%ebp
593
/* %r12 = %rax = r1, %r13 = s1 = r1 + (r1 >> 2): __poly1305_block inputs. */
594	movq	%r13,%r12
595	movq	%r13,%rax
596	shrq	$2,%r13
597	addq	%r12,%r13
598
/* Length already a multiple of 32: no scalar block needed. */
599	testq	$31,%rdx
600	jz	.Linit_avx
601
/* Absorb one block so that the remainder is a multiple of 32 bytes. */
602	addq	0(%rsi),%r14
603	adcq	8(%rsi),%rbx
604	leaq	16(%rsi),%rsi
605	adcq	%rcx,%rbp
606	subq	$16,%r15
607
608	call	__poly1305_block
609
610.Linit_avx:
611
/*
 * Split h into five 26-bit limbs:
 *   %rax = limb0, %rdx = limb1, %r14 = limb2, %rbx = limb3, %rbp = limb4
 */
612	movq	%r14,%rax
613	movq	%r14,%rdx
614	shrq	$52,%r14
615	movq	%rbx,%r8
616	movq	%rbx,%r9
617	shrq	$26,%rdx
618	andq	$0x3ffffff,%rax
619	shlq	$12,%r8
620	andq	$0x3ffffff,%rdx
621	shrq	$14,%rbx
622	orq	%r8,%r14
623	shlq	$24,%rbp
624	andq	$0x3ffffff,%r14
625	shrq	$40,%r9
626	andq	$0x3ffffff,%rbx
627	orq	%r9,%rbp
628
629	vmovd	%eax,%xmm0
630	vmovd	%edx,%xmm1
631	vmovd	%r14d,%xmm2
632	vmovd	%ebx,%xmm3
633	vmovd	%ebp,%xmm4
/* Mark the accumulator as being in base 2^26 form from now on. */
634	movl	$1,20(%rdi)
635
/* Build the r..r^4 limb table; uses %r11 = r0, %r12 = r1, %r13 = s1. */
636	call	__poly1305_init_avx
637
638.Lproceed_avx:
/* Also entered from the odd-block path of poly1305_blocks_avx. */
639	movq	%r15,%rdx
640
/* Reload the saved registers from the stack, then drop the 48-byte frame. */
641	movq	0(%rsp),%r15
642.cfi_restore	%r15
643	movq	8(%rsp),%r14
644.cfi_restore	%r14
645	movq	16(%rsp),%r13
646.cfi_restore	%r13
647	movq	24(%rsp),%r12
648.cfi_restore	%r12
649	movq	32(%rsp),%rbp
650.cfi_restore	%rbp
651	movq	40(%rsp),%rbx
652.cfi_restore	%rbx
653	leaq	48(%rsp),%rax
654	leaq	48(%rsp),%rsp
655.cfi_adjust_cfa_offset	-48
656.Lbase2_64_avx_epilogue:
657	jmp	.Ldo_avx
658.cfi_endproc
659
/*
 * .Leven_avx / .Ldo_avx: vector core of poly1305_blocks_avx.
 *
 * In at .Ldo_avx:
 *   %rdi = state, %rsi = input, %rdx = bytes remaining
 *   %xmm0-%xmm4 = accumulator limbs h0..h4 (26-bit each, low dword)
 * The key-power table is read from the state (rows relative to %rdi + 112;
 * per row the 32-bit columns are r^2, r, r^4, r^3, as written by
 * __poly1305_init_avx).
 * Out: the five accumulator limbs are stored back to 0..19 of the state.
 *
 * Constants come from .Lconst, which is not visible in this part of the
 * file: 64(%rcx) is used as the AND mask after every 26-bit shift
 * (presumably 0x3ffffff per 64-bit lane), 32(%rcx) is OR'ed into the top
 * limb of every loaded block (presumably the 2^128 pad bit at bit 24) and
 * 0(%rcx) is an AND mask applied to the top limb inside the loop
 * (presumably 24 bits wide).  TODO confirm against .Lconst.
 *
 * Branches below rely on flags set by the preceding subq/addq on %rdx;
 * the intervening lea/cmov and vector instructions do not modify flags.
 */
660.align	32
661.Leven_avx:
662.cfi_startproc
/* Accumulator is already stored as limbs: load them directly. */
663	vmovd	0(%rdi),%xmm0
664	vmovd	4(%rdi),%xmm1
665	vmovd	8(%rdi),%xmm2
666	vmovd	12(%rdi),%xmm3
667	vmovd	16(%rdi),%xmm4
668
669.Ldo_avx:
/*
 * %r11 = entry %rsp - 88 is the base for the spill slots; %rsp drops by
 * 0x178.  %rdx -= 64; on borrow (fewer than 64 bytes left) %rsi is moved
 * back by 32 so that the 32(%rsi)/48(%rsi) loads below read the first two
 * blocks instead of blocks that do not exist.
 */
670	leaq	-88(%rsp),%r11
671.cfi_def_cfa	%r11,0x60
672	subq	$0x178,%rsp
673	subq	$64,%rdx
674	leaq	-32(%rsi),%rax
675	cmovcq	%rax,%rsi
676
/* %xmm14 = limb-0 row of the table; %rdi biased by 112 like the table writer. */
677	vmovdqu	48(%rdi),%xmm14
678	leaq	112(%rdi),%rdi
679	leaq	.Lconst(%rip),%rcx
680
681
682
/*
 * Load two 16-byte blocks and split them into five limbs each, one block
 * per 64-bit lane: %xmm5..%xmm9 = limbs 0..4.
 */
683	vmovdqu	32(%rsi),%xmm5
684	vmovdqu	48(%rsi),%xmm6
685	vmovdqa	64(%rcx),%xmm15
686
687	vpsrldq	$6,%xmm5,%xmm7
688	vpsrldq	$6,%xmm6,%xmm8
689	vpunpckhqdq	%xmm6,%xmm5,%xmm9
690	vpunpcklqdq	%xmm6,%xmm5,%xmm5
691	vpunpcklqdq	%xmm8,%xmm7,%xmm8
692
693	vpsrlq	$40,%xmm9,%xmm9
694	vpsrlq	$26,%xmm5,%xmm6
695	vpand	%xmm15,%xmm5,%xmm5
696	vpsrlq	$4,%xmm8,%xmm7
697	vpand	%xmm15,%xmm6,%xmm6
698	vpsrlq	$30,%xmm8,%xmm8
699	vpand	%xmm15,%xmm7,%xmm7
700	vpand	%xmm15,%xmm8,%xmm8
701	vpor	32(%rcx),%xmm9,%xmm9
702
/* Flags still come from "subq $64,%rdx": 64 bytes or fewer -> no main loop. */
703	jbe	.Lskip_loop_avx
704
705
/*
 * Spill the table rows for the main loop.  For each row, shuffle 0xEE
 * duplicates columns 2,3 (r^4 ends up in the low dword of both lanes) and
 * goes to a negative offset from %r11; shuffle 0x44 duplicates columns 0,1
 * (r^2 in the low dword of both lanes) and goes to a positive offset from
 * %rsp.  vpmuludq only reads the low dword of each lane.
 */
706	vmovdqu	-48(%rdi),%xmm11
707	vmovdqu	-32(%rdi),%xmm12
708	vpshufd	$0xEE,%xmm14,%xmm13
709	vpshufd	$0x44,%xmm14,%xmm10
710	vmovdqa	%xmm13,-144(%r11)
711	vmovdqa	%xmm10,0(%rsp)
712	vpshufd	$0xEE,%xmm11,%xmm14
713	vmovdqu	-16(%rdi),%xmm10
714	vpshufd	$0x44,%xmm11,%xmm11
715	vmovdqa	%xmm14,-128(%r11)
716	vmovdqa	%xmm11,16(%rsp)
717	vpshufd	$0xEE,%xmm12,%xmm13
718	vmovdqu	0(%rdi),%xmm11
719	vpshufd	$0x44,%xmm12,%xmm12
720	vmovdqa	%xmm13,-112(%r11)
721	vmovdqa	%xmm12,32(%rsp)
722	vpshufd	$0xEE,%xmm10,%xmm14
723	vmovdqu	16(%rdi),%xmm12
724	vpshufd	$0x44,%xmm10,%xmm10
725	vmovdqa	%xmm14,-96(%r11)
726	vmovdqa	%xmm10,48(%rsp)
727	vpshufd	$0xEE,%xmm11,%xmm13
728	vmovdqu	32(%rdi),%xmm10
729	vpshufd	$0x44,%xmm11,%xmm11
730	vmovdqa	%xmm13,-80(%r11)
731	vmovdqa	%xmm11,64(%rsp)
732	vpshufd	$0xEE,%xmm12,%xmm14
733	vmovdqu	48(%rdi),%xmm11
734	vpshufd	$0x44,%xmm12,%xmm12
735	vmovdqa	%xmm14,-64(%r11)
736	vmovdqa	%xmm12,80(%rsp)
737	vpshufd	$0xEE,%xmm10,%xmm13
738	vmovdqu	64(%rdi),%xmm12
739	vpshufd	$0x44,%xmm10,%xmm10
740	vmovdqa	%xmm13,-48(%r11)
741	vmovdqa	%xmm10,96(%rsp)
742	vpshufd	$0xEE,%xmm11,%xmm14
743	vpshufd	$0x44,%xmm11,%xmm11
744	vmovdqa	%xmm14,-32(%r11)
745	vmovdqa	%xmm11,112(%rsp)
746	vpshufd	$0xEE,%xmm12,%xmm13
747	vmovdqa	0(%rsp),%xmm14
748	vpshufd	$0x44,%xmm12,%xmm12
749	vmovdqa	%xmm13,-16(%r11)
750	vmovdqa	%xmm12,128(%rsp)
751
752	jmp	.Loop_avx
753
754.align	32
755.Loop_avx:
756
757
758
759
760
/*
 * Main loop, 64 input bytes per iteration.
 * Phase 1: multiply the two blocks held in %xmm5-%xmm9 by the %rsp rows
 * (r^2), accumulating limb sums in %xmm10-%xmm14.  The running accumulator
 * %xmm0-%xmm4 is parked at 0..64(%r11) meanwhile.
 */
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776	vpmuludq	%xmm5,%xmm14,%xmm10
777	vpmuludq	%xmm6,%xmm14,%xmm11
778	vmovdqa	%xmm2,32(%r11)
779	vpmuludq	%xmm7,%xmm14,%xmm12
780	vmovdqa	16(%rsp),%xmm2
781	vpmuludq	%xmm8,%xmm14,%xmm13
782	vpmuludq	%xmm9,%xmm14,%xmm14
783
784	vmovdqa	%xmm0,0(%r11)
785	vpmuludq	32(%rsp),%xmm9,%xmm0
786	vmovdqa	%xmm1,16(%r11)
787	vpmuludq	%xmm8,%xmm2,%xmm1
788	vpaddq	%xmm0,%xmm10,%xmm10
789	vpaddq	%xmm1,%xmm14,%xmm14
790	vmovdqa	%xmm3,48(%r11)
791	vpmuludq	%xmm7,%xmm2,%xmm0
792	vpmuludq	%xmm6,%xmm2,%xmm1
793	vpaddq	%xmm0,%xmm13,%xmm13
794	vmovdqa	48(%rsp),%xmm3
795	vpaddq	%xmm1,%xmm12,%xmm12
796	vmovdqa	%xmm4,64(%r11)
797	vpmuludq	%xmm5,%xmm2,%xmm2
798	vpmuludq	%xmm7,%xmm3,%xmm0
799	vpaddq	%xmm2,%xmm11,%xmm11
800
801	vmovdqa	64(%rsp),%xmm4
802	vpaddq	%xmm0,%xmm14,%xmm14
803	vpmuludq	%xmm6,%xmm3,%xmm1
804	vpmuludq	%xmm5,%xmm3,%xmm3
805	vpaddq	%xmm1,%xmm13,%xmm13
806	vmovdqa	80(%rsp),%xmm2
807	vpaddq	%xmm3,%xmm12,%xmm12
808	vpmuludq	%xmm9,%xmm4,%xmm0
809	vpmuludq	%xmm8,%xmm4,%xmm4
810	vpaddq	%xmm0,%xmm11,%xmm11
811	vmovdqa	96(%rsp),%xmm3
812	vpaddq	%xmm4,%xmm10,%xmm10
813
814	vmovdqa	128(%rsp),%xmm4
815	vpmuludq	%xmm6,%xmm2,%xmm1
816	vpmuludq	%xmm5,%xmm2,%xmm2
817	vpaddq	%xmm1,%xmm14,%xmm14
818	vpaddq	%xmm2,%xmm13,%xmm13
819	vpmuludq	%xmm9,%xmm3,%xmm0
820	vpmuludq	%xmm8,%xmm3,%xmm1
821	vpaddq	%xmm0,%xmm12,%xmm12
/* The first two blocks of this 64-byte group are loaded while phase 1 ends. */
822	vmovdqu	0(%rsi),%xmm0
823	vpaddq	%xmm1,%xmm11,%xmm11
824	vpmuludq	%xmm7,%xmm3,%xmm3
825	vpmuludq	%xmm7,%xmm4,%xmm7
826	vpaddq	%xmm3,%xmm10,%xmm10
827
828	vmovdqu	16(%rsi),%xmm1
829	vpaddq	%xmm7,%xmm11,%xmm11
830	vpmuludq	%xmm8,%xmm4,%xmm8
831	vpmuludq	%xmm9,%xmm4,%xmm9
832	vpsrldq	$6,%xmm0,%xmm2
833	vpaddq	%xmm8,%xmm12,%xmm12
834	vpaddq	%xmm9,%xmm13,%xmm13
835	vpsrldq	$6,%xmm1,%xmm3
836	vpmuludq	112(%rsp),%xmm5,%xmm9
837	vpmuludq	%xmm6,%xmm4,%xmm5
838	vpunpckhqdq	%xmm1,%xmm0,%xmm4
839	vpaddq	%xmm9,%xmm14,%xmm14
840	vmovdqa	-144(%r11),%xmm9
841	vpaddq	%xmm5,%xmm10,%xmm10
842
843	vpunpcklqdq	%xmm1,%xmm0,%xmm0
844	vpunpcklqdq	%xmm3,%xmm2,%xmm3
845
846
/* Split those two blocks into limbs %xmm0..%xmm4. */
847	vpsrldq	$5,%xmm4,%xmm4
848	vpsrlq	$26,%xmm0,%xmm1
849	vpand	%xmm15,%xmm0,%xmm0
850	vpsrlq	$4,%xmm3,%xmm2
851	vpand	%xmm15,%xmm1,%xmm1
852	vpand	0(%rcx),%xmm4,%xmm4
853	vpsrlq	$30,%xmm3,%xmm3
854	vpand	%xmm15,%xmm2,%xmm2
855	vpand	%xmm15,%xmm3,%xmm3
856	vpor	32(%rcx),%xmm4,%xmm4
857
/* Add the parked accumulator to the freshly loaded blocks. */
858	vpaddq	0(%r11),%xmm0,%xmm0
859	vpaddq	16(%r11),%xmm1,%xmm1
860	vpaddq	32(%r11),%xmm2,%xmm2
861	vpaddq	48(%r11),%xmm3,%xmm3
862	vpaddq	64(%r11),%xmm4,%xmm4
863
/*
 * Advance by 64 bytes; if fewer than 64 remain afterwards (borrow), only
 * advance by 32 so the look-ahead loads at 32/48(%rsi) stay inside the
 * input.  The flags from this subq also drive "ja .Loop_avx" below.
 */
864	leaq	32(%rsi),%rax
865	leaq	64(%rsi),%rsi
866	subq	$64,%rdx
867	cmovcq	%rax,%rsi
868
869
870
871
/*
 * Phase 2: multiply (accumulator + first two blocks) in %xmm0-%xmm4 by the
 * %r11 rows (r^4) and add into %xmm10-%xmm14.
 */
872
873
874
875
876
877
878	vpmuludq	%xmm0,%xmm9,%xmm5
879	vpmuludq	%xmm1,%xmm9,%xmm6
880	vpaddq	%xmm5,%xmm10,%xmm10
881	vpaddq	%xmm6,%xmm11,%xmm11
882	vmovdqa	-128(%r11),%xmm7
883	vpmuludq	%xmm2,%xmm9,%xmm5
884	vpmuludq	%xmm3,%xmm9,%xmm6
885	vpaddq	%xmm5,%xmm12,%xmm12
886	vpaddq	%xmm6,%xmm13,%xmm13
887	vpmuludq	%xmm4,%xmm9,%xmm9
888	vpmuludq	-112(%r11),%xmm4,%xmm5
889	vpaddq	%xmm9,%xmm14,%xmm14
890
891	vpaddq	%xmm5,%xmm10,%xmm10
892	vpmuludq	%xmm2,%xmm7,%xmm6
893	vpmuludq	%xmm3,%xmm7,%xmm5
894	vpaddq	%xmm6,%xmm13,%xmm13
895	vmovdqa	-96(%r11),%xmm8
896	vpaddq	%xmm5,%xmm14,%xmm14
897	vpmuludq	%xmm1,%xmm7,%xmm6
898	vpmuludq	%xmm0,%xmm7,%xmm7
899	vpaddq	%xmm6,%xmm12,%xmm12
900	vpaddq	%xmm7,%xmm11,%xmm11
901
902	vmovdqa	-80(%r11),%xmm9
903	vpmuludq	%xmm2,%xmm8,%xmm5
904	vpmuludq	%xmm1,%xmm8,%xmm6
905	vpaddq	%xmm5,%xmm14,%xmm14
906	vpaddq	%xmm6,%xmm13,%xmm13
907	vmovdqa	-64(%r11),%xmm7
908	vpmuludq	%xmm0,%xmm8,%xmm8
909	vpmuludq	%xmm4,%xmm9,%xmm5
910	vpaddq	%xmm8,%xmm12,%xmm12
911	vpaddq	%xmm5,%xmm11,%xmm11
912	vmovdqa	-48(%r11),%xmm8
913	vpmuludq	%xmm3,%xmm9,%xmm9
914	vpmuludq	%xmm1,%xmm7,%xmm6
915	vpaddq	%xmm9,%xmm10,%xmm10
916
917	vmovdqa	-16(%r11),%xmm9
918	vpaddq	%xmm6,%xmm14,%xmm14
919	vpmuludq	%xmm0,%xmm7,%xmm7
920	vpmuludq	%xmm4,%xmm8,%xmm5
921	vpaddq	%xmm7,%xmm13,%xmm13
922	vpaddq	%xmm5,%xmm12,%xmm12
/* Look-ahead: load the next iteration's (or the tail's) last two blocks. */
923	vmovdqu	32(%rsi),%xmm5
924	vpmuludq	%xmm3,%xmm8,%xmm7
925	vpmuludq	%xmm2,%xmm8,%xmm8
926	vpaddq	%xmm7,%xmm11,%xmm11
927	vmovdqu	48(%rsi),%xmm6
928	vpaddq	%xmm8,%xmm10,%xmm10
929
930	vpmuludq	%xmm2,%xmm9,%xmm2
931	vpmuludq	%xmm3,%xmm9,%xmm3
932	vpsrldq	$6,%xmm5,%xmm7
933	vpaddq	%xmm2,%xmm11,%xmm11
934	vpmuludq	%xmm4,%xmm9,%xmm4
935	vpsrldq	$6,%xmm6,%xmm8
/* The limb sums move to their accumulator registers: %xmm0,2,3,4 here, %xmm1 below. */
936	vpaddq	%xmm3,%xmm12,%xmm2
937	vpaddq	%xmm4,%xmm13,%xmm3
938	vpmuludq	-32(%r11),%xmm0,%xmm4
939	vpmuludq	%xmm1,%xmm9,%xmm0
940	vpunpckhqdq	%xmm6,%xmm5,%xmm9
941	vpaddq	%xmm4,%xmm14,%xmm4
942	vpaddq	%xmm0,%xmm10,%xmm0
943
944	vpunpcklqdq	%xmm6,%xmm5,%xmm5
945	vpunpcklqdq	%xmm8,%xmm7,%xmm8
946
947
/* Split the look-ahead blocks into limbs %xmm5..%xmm9; reload row 0 into %xmm14. */
948	vpsrldq	$5,%xmm9,%xmm9
949	vpsrlq	$26,%xmm5,%xmm6
950	vmovdqa	0(%rsp),%xmm14
951	vpand	%xmm15,%xmm5,%xmm5
952	vpsrlq	$4,%xmm8,%xmm7
953	vpand	%xmm15,%xmm6,%xmm6
954	vpand	0(%rcx),%xmm9,%xmm9
955	vpsrlq	$30,%xmm8,%xmm8
956	vpand	%xmm15,%xmm7,%xmm7
957	vpand	%xmm15,%xmm8,%xmm8
958	vpor	32(%rcx),%xmm9,%xmm9
959
960
/*
 * Carry propagation between limbs (26 bits each):
 * h3->h4, h0->h1, h4->h0 (times 5, as c + 4c), h1->h2, h2->h3, h0->h1, h3->h4.
 */
961
962
963
964	vpsrlq	$26,%xmm3,%xmm13
965	vpand	%xmm15,%xmm3,%xmm3
966	vpaddq	%xmm13,%xmm4,%xmm4
967
968	vpsrlq	$26,%xmm0,%xmm10
969	vpand	%xmm15,%xmm0,%xmm0
970	vpaddq	%xmm10,%xmm11,%xmm1
971
972	vpsrlq	$26,%xmm4,%xmm10
973	vpand	%xmm15,%xmm4,%xmm4
974
975	vpsrlq	$26,%xmm1,%xmm11
976	vpand	%xmm15,%xmm1,%xmm1
977	vpaddq	%xmm11,%xmm2,%xmm2
978
979	vpaddq	%xmm10,%xmm0,%xmm0
980	vpsllq	$2,%xmm10,%xmm10
981	vpaddq	%xmm10,%xmm0,%xmm0
982
983	vpsrlq	$26,%xmm2,%xmm12
984	vpand	%xmm15,%xmm2,%xmm2
985	vpaddq	%xmm12,%xmm3,%xmm3
986
987	vpsrlq	$26,%xmm0,%xmm10
988	vpand	%xmm15,%xmm0,%xmm0
989	vpaddq	%xmm10,%xmm1,%xmm1
990
991	vpsrlq	$26,%xmm3,%xmm13
992	vpand	%xmm15,%xmm3,%xmm3
993	vpaddq	%xmm13,%xmm4,%xmm4
994
/* Flags from "subq $64,%rdx" above: loop while more than 64 bytes were left. */
995	ja	.Loop_avx
996
997.Lskip_loop_avx:
998
999
/*
 * Tail.  Shuffle 0x10 puts table column 0 (r^2) in the low dword of lane 0
 * and column 1 (r) in the low dword of lane 1.  %rdx + 32 == 0 means only
 * the two blocks already held in %xmm5-%xmm9 remain, so the accumulator is
 * added to them before the multiply.
 */
1000
1001	vpshufd	$0x10,%xmm14,%xmm14
1002	addq	$32,%rdx
1003	jnz	.Long_tail_avx
1004
1005	vpaddq	%xmm2,%xmm7,%xmm7
1006	vpaddq	%xmm0,%xmm5,%xmm5
1007	vpaddq	%xmm1,%xmm6,%xmm6
1008	vpaddq	%xmm3,%xmm8,%xmm8
1009	vpaddq	%xmm4,%xmm9,%xmm9
1010
1011.Long_tail_avx:
/* Park the accumulator; it is only needed again on the four-block tail path. */
1012	vmovdqa	%xmm2,32(%r11)
1013	vmovdqa	%xmm0,0(%r11)
1014	vmovdqa	%xmm1,16(%r11)
1015	vmovdqa	%xmm3,48(%r11)
1016	vmovdqa	%xmm4,64(%r11)
1017
1018
1019
/* Multiply the last two blocks (%xmm5-%xmm9) by r^2 : r, sums in %xmm10-%xmm14. */
1020
1021
1022
1023
1024	vpmuludq	%xmm7,%xmm14,%xmm12
1025	vpmuludq	%xmm5,%xmm14,%xmm10
1026	vpshufd	$0x10,-48(%rdi),%xmm2
1027	vpmuludq	%xmm6,%xmm14,%xmm11
1028	vpmuludq	%xmm8,%xmm14,%xmm13
1029	vpmuludq	%xmm9,%xmm14,%xmm14
1030
1031	vpmuludq	%xmm8,%xmm2,%xmm0
1032	vpaddq	%xmm0,%xmm14,%xmm14
1033	vpshufd	$0x10,-32(%rdi),%xmm3
1034	vpmuludq	%xmm7,%xmm2,%xmm1
1035	vpaddq	%xmm1,%xmm13,%xmm13
1036	vpshufd	$0x10,-16(%rdi),%xmm4
1037	vpmuludq	%xmm6,%xmm2,%xmm0
1038	vpaddq	%xmm0,%xmm12,%xmm12
1039	vpmuludq	%xmm5,%xmm2,%xmm2
1040	vpaddq	%xmm2,%xmm11,%xmm11
1041	vpmuludq	%xmm9,%xmm3,%xmm3
1042	vpaddq	%xmm3,%xmm10,%xmm10
1043
1044	vpshufd	$0x10,0(%rdi),%xmm2
1045	vpmuludq	%xmm7,%xmm4,%xmm1
1046	vpaddq	%xmm1,%xmm14,%xmm14
1047	vpmuludq	%xmm6,%xmm4,%xmm0
1048	vpaddq	%xmm0,%xmm13,%xmm13
1049	vpshufd	$0x10,16(%rdi),%xmm3
1050	vpmuludq	%xmm5,%xmm4,%xmm4
1051	vpaddq	%xmm4,%xmm12,%xmm12
1052	vpmuludq	%xmm9,%xmm2,%xmm1
1053	vpaddq	%xmm1,%xmm11,%xmm11
1054	vpshufd	$0x10,32(%rdi),%xmm4
1055	vpmuludq	%xmm8,%xmm2,%xmm2
1056	vpaddq	%xmm2,%xmm10,%xmm10
1057
1058	vpmuludq	%xmm6,%xmm3,%xmm0
1059	vpaddq	%xmm0,%xmm14,%xmm14
1060	vpmuludq	%xmm5,%xmm3,%xmm3
1061	vpaddq	%xmm3,%xmm13,%xmm13
1062	vpshufd	$0x10,48(%rdi),%xmm2
1063	vpmuludq	%xmm9,%xmm4,%xmm1
1064	vpaddq	%xmm1,%xmm12,%xmm12
1065	vpshufd	$0x10,64(%rdi),%xmm3
1066	vpmuludq	%xmm8,%xmm4,%xmm0
1067	vpaddq	%xmm0,%xmm11,%xmm11
1068	vpmuludq	%xmm7,%xmm4,%xmm4
1069	vpaddq	%xmm4,%xmm10,%xmm10
1070
1071	vpmuludq	%xmm5,%xmm2,%xmm2
1072	vpaddq	%xmm2,%xmm14,%xmm14
1073	vpmuludq	%xmm9,%xmm3,%xmm1
1074	vpaddq	%xmm1,%xmm13,%xmm13
1075	vpmuludq	%xmm8,%xmm3,%xmm0
1076	vpaddq	%xmm0,%xmm12,%xmm12
1077	vpmuludq	%xmm7,%xmm3,%xmm1
1078	vpaddq	%xmm1,%xmm11,%xmm11
1079	vpmuludq	%xmm6,%xmm3,%xmm3
1080	vpaddq	%xmm3,%xmm10,%xmm10
1081
/* Flags still from "addq $32,%rdx": zero means there were only two blocks. */
1082	jz	.Lshort_tail_avx
1083
/* Four-block tail: load and split the first two blocks. */
1084	vmovdqu	0(%rsi),%xmm0
1085	vmovdqu	16(%rsi),%xmm1
1086
1087	vpsrldq	$6,%xmm0,%xmm2
1088	vpsrldq	$6,%xmm1,%xmm3
1089	vpunpckhqdq	%xmm1,%xmm0,%xmm4
1090	vpunpcklqdq	%xmm1,%xmm0,%xmm0
1091	vpunpcklqdq	%xmm3,%xmm2,%xmm3
1092
1093	vpsrlq	$40,%xmm4,%xmm4
1094	vpsrlq	$26,%xmm0,%xmm1
1095	vpand	%xmm15,%xmm0,%xmm0
1096	vpsrlq	$4,%xmm3,%xmm2
1097	vpand	%xmm15,%xmm1,%xmm1
1098	vpsrlq	$30,%xmm3,%xmm3
1099	vpand	%xmm15,%xmm2,%xmm2
1100	vpand	%xmm15,%xmm3,%xmm3
1101	vpor	32(%rcx),%xmm4,%xmm4
1102
/*
 * Shuffle 0x32 puts table column 2 (r^4) in the low dword of lane 0 and
 * column 3 (r^3) in the low dword of lane 1.  Add the parked accumulator,
 * then multiply by r^4 : r^3 and add into the running sums.
 */
1103	vpshufd	$0x32,-64(%rdi),%xmm9
1104	vpaddq	0(%r11),%xmm0,%xmm0
1105	vpaddq	16(%r11),%xmm1,%xmm1
1106	vpaddq	32(%r11),%xmm2,%xmm2
1107	vpaddq	48(%r11),%xmm3,%xmm3
1108	vpaddq	64(%r11),%xmm4,%xmm4
1109
1110
1111
1112
1113	vpmuludq	%xmm0,%xmm9,%xmm5
1114	vpaddq	%xmm5,%xmm10,%xmm10
1115	vpmuludq	%xmm1,%xmm9,%xmm6
1116	vpaddq	%xmm6,%xmm11,%xmm11
1117	vpmuludq	%xmm2,%xmm9,%xmm5
1118	vpaddq	%xmm5,%xmm12,%xmm12
1119	vpshufd	$0x32,-48(%rdi),%xmm7
1120	vpmuludq	%xmm3,%xmm9,%xmm6
1121	vpaddq	%xmm6,%xmm13,%xmm13
1122	vpmuludq	%xmm4,%xmm9,%xmm9
1123	vpaddq	%xmm9,%xmm14,%xmm14
1124
1125	vpmuludq	%xmm3,%xmm7,%xmm5
1126	vpaddq	%xmm5,%xmm14,%xmm14
1127	vpshufd	$0x32,-32(%rdi),%xmm8
1128	vpmuludq	%xmm2,%xmm7,%xmm6
1129	vpaddq	%xmm6,%xmm13,%xmm13
1130	vpshufd	$0x32,-16(%rdi),%xmm9
1131	vpmuludq	%xmm1,%xmm7,%xmm5
1132	vpaddq	%xmm5,%xmm12,%xmm12
1133	vpmuludq	%xmm0,%xmm7,%xmm7
1134	vpaddq	%xmm7,%xmm11,%xmm11
1135	vpmuludq	%xmm4,%xmm8,%xmm8
1136	vpaddq	%xmm8,%xmm10,%xmm10
1137
1138	vpshufd	$0x32,0(%rdi),%xmm7
1139	vpmuludq	%xmm2,%xmm9,%xmm6
1140	vpaddq	%xmm6,%xmm14,%xmm14
1141	vpmuludq	%xmm1,%xmm9,%xmm5
1142	vpaddq	%xmm5,%xmm13,%xmm13
1143	vpshufd	$0x32,16(%rdi),%xmm8
1144	vpmuludq	%xmm0,%xmm9,%xmm9
1145	vpaddq	%xmm9,%xmm12,%xmm12
1146	vpmuludq	%xmm4,%xmm7,%xmm6
1147	vpaddq	%xmm6,%xmm11,%xmm11
1148	vpshufd	$0x32,32(%rdi),%xmm9
1149	vpmuludq	%xmm3,%xmm7,%xmm7
1150	vpaddq	%xmm7,%xmm10,%xmm10
1151
1152	vpmuludq	%xmm1,%xmm8,%xmm5
1153	vpaddq	%xmm5,%xmm14,%xmm14
1154	vpmuludq	%xmm0,%xmm8,%xmm8
1155	vpaddq	%xmm8,%xmm13,%xmm13
1156	vpshufd	$0x32,48(%rdi),%xmm7
1157	vpmuludq	%xmm4,%xmm9,%xmm6
1158	vpaddq	%xmm6,%xmm12,%xmm12
1159	vpshufd	$0x32,64(%rdi),%xmm8
1160	vpmuludq	%xmm3,%xmm9,%xmm5
1161	vpaddq	%xmm5,%xmm11,%xmm11
1162	vpmuludq	%xmm2,%xmm9,%xmm9
1163	vpaddq	%xmm9,%xmm10,%xmm10
1164
1165	vpmuludq	%xmm0,%xmm7,%xmm7
1166	vpaddq	%xmm7,%xmm14,%xmm14
1167	vpmuludq	%xmm4,%xmm8,%xmm6
1168	vpaddq	%xmm6,%xmm13,%xmm13
1169	vpmuludq	%xmm3,%xmm8,%xmm5
1170	vpaddq	%xmm5,%xmm12,%xmm12
1171	vpmuludq	%xmm2,%xmm8,%xmm6
1172	vpaddq	%xmm6,%xmm11,%xmm11
1173	vpmuludq	%xmm1,%xmm8,%xmm8
1174	vpaddq	%xmm8,%xmm10,%xmm10
1175
1176.Lshort_tail_avx:
1177
/* Horizontal add: fold the upper 64-bit lane of every limb sum into the lower. */
1178
1179
1180	vpsrldq	$8,%xmm14,%xmm9
1181	vpsrldq	$8,%xmm13,%xmm8
1182	vpsrldq	$8,%xmm11,%xmm6
1183	vpsrldq	$8,%xmm10,%xmm5
1184	vpsrldq	$8,%xmm12,%xmm7
1185	vpaddq	%xmm8,%xmm13,%xmm13
1186	vpaddq	%xmm9,%xmm14,%xmm14
1187	vpaddq	%xmm5,%xmm10,%xmm10
1188	vpaddq	%xmm6,%xmm11,%xmm11
1189	vpaddq	%xmm7,%xmm12,%xmm12
1190
1191
/* Same carry chain as in the main loop, on %xmm10..%xmm14 = h0..h4. */
1192
1193
1194	vpsrlq	$26,%xmm13,%xmm3
1195	vpand	%xmm15,%xmm13,%xmm13
1196	vpaddq	%xmm3,%xmm14,%xmm14
1197
1198	vpsrlq	$26,%xmm10,%xmm0
1199	vpand	%xmm15,%xmm10,%xmm10
1200	vpaddq	%xmm0,%xmm11,%xmm11
1201
1202	vpsrlq	$26,%xmm14,%xmm4
1203	vpand	%xmm15,%xmm14,%xmm14
1204
1205	vpsrlq	$26,%xmm11,%xmm1
1206	vpand	%xmm15,%xmm11,%xmm11
1207	vpaddq	%xmm1,%xmm12,%xmm12
1208
1209	vpaddq	%xmm4,%xmm10,%xmm10
1210	vpsllq	$2,%xmm4,%xmm4
1211	vpaddq	%xmm4,%xmm10,%xmm10
1212
1213	vpsrlq	$26,%xmm12,%xmm2
1214	vpand	%xmm15,%xmm12,%xmm12
1215	vpaddq	%xmm2,%xmm13,%xmm13
1216
1217	vpsrlq	$26,%xmm10,%xmm0
1218	vpand	%xmm15,%xmm10,%xmm10
1219	vpaddq	%xmm0,%xmm11,%xmm11
1220
1221	vpsrlq	$26,%xmm13,%xmm3
1222	vpand	%xmm15,%xmm13,%xmm13
1223	vpaddq	%xmm3,%xmm14,%xmm14
1224
/*
 * Store the five limbs at offsets 0..16 of the state (%rdi is still biased
 * by 112), restore %rsp from the frame base and clear the upper vector
 * halves before returning.
 */
1225	vmovd	%xmm10,-112(%rdi)
1226	vmovd	%xmm11,-108(%rdi)
1227	vmovd	%xmm12,-104(%rdi)
1228	vmovd	%xmm13,-100(%rdi)
1229	vmovd	%xmm14,-96(%rdi)
1230	leaq	88(%r11),%rsp
1231.cfi_def_cfa	%rsp,8
1232	vzeroupper
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
1233	.byte	0xf3,0xc3
1234.cfi_endproc
1235.size	poly1305_blocks_avx,.-poly1305_blocks_avx
1236
/*
 * poly1305_emit_avx: emit routine paired with the AVX block routine.
 * When the flag at 20(%rdi) is zero the accumulator is in base 2^64 form
 * and the scalar .Lemit path is used.  Otherwise the five 32-bit limb
 * slots are recombined first and the same final steps are applied.
 *
 * In:   %rdi = state, %rsi = 16-byte output buffer,
 *       %rdx = pointer to two 64-bit words added to the result
 *              (presumably the nonce/"s" half of the key - confirm with caller)
 * Clobbers: %rax, %rcx, %r8, %r9, %r10, %r11, flags
 */
1237.type	poly1305_emit_avx,@function
1238.align	32
1239poly1305_emit_avx:
1240.cfi_startproc
1241	cmpl	$0,20(%rdi)
1242	je	.Lemit
1243
/* Limb slots 0..4; 32-bit loads zero-extend into the 64-bit registers. */
1244	movl	0(%rdi),%eax
1245	movl	4(%rdi),%ecx
1246	movl	8(%rdi),%r8d
1247	movl	12(%rdi),%r11d
1248	movl	16(%rdi),%r10d
1249
/* h0 (%r8) = limb0 + limb1<<26 + limb2<<52; the overflow of limb2 starts h1. */
1250	shlq	$26,%rcx
1251	movq	%r8,%r9
1252	shlq	$52,%r8
1253	addq	%rcx,%rax
1254	shrq	$12,%r9
1255	addq	%rax,%r8
1256	adcq	$0,%r9
1257
/* h1 (%r9) += limb3<<14 + limb4<<40; h2 (%r10) = limb4>>24 plus carry. */
1258	shlq	$14,%r11
1259	movq	%r10,%rax
1260	shrq	$24,%r10
1261	addq	%r11,%r9
1262	shlq	$40,%rax
1263	addq	%rax,%r9
1264	adcq	$0,%r10
1265
/* Partial reduction: h2 keeps its low two bits, 5 * (h2 >> 2) goes to h0. */
1266	movq	%r10,%rax
1267	movq	%r10,%rcx
1268	andq	$3,%r10
1269	shrq	$2,%rax
1270	andq	$-4,%rcx
1271	addq	%rcx,%rax
1272	addq	%rax,%r8
1273	adcq	$0,%r9
1274	adcq	$0,%r10
1275
/*
 * Compute h + 5; if it reaches bit 130 (top word >> 2 is non-zero) take
 * the low 128 bits of h + 5, otherwise keep the low 128 bits of h.
 */
1276	movq	%r8,%rax
1277	addq	$5,%r8
1278	movq	%r9,%rcx
1279	adcq	$0,%r9
1280	adcq	$0,%r10
1281	shrq	$2,%r10
1282	cmovnzq	%r8,%rax
1283	cmovnzq	%r9,%rcx
1284
/* Add the 128-bit value at (%rdx) (carry out is discarded) and store. */
1285	addq	0(%rdx),%rax
1286	adcq	8(%rdx),%rcx
1287	movq	%rax,0(%rsi)
1288	movq	%rcx,8(%rsi)
1289
/* 0xf3,0xc3 is the two-byte "rep ret" encoding. */
1290	.byte	0xf3,0xc3
1291.cfi_endproc
1292.size	poly1305_emit_avx,.-poly1305_emit_avx
1293.type	poly1305_blocks_avx2,@function
1294.align	32
1295poly1305_blocks_avx2:
1296.cfi_startproc
1297	movl	20(%rdi),%r8d
1298	cmpq	$128,%rdx
1299	jae	.Lblocks_avx2
1300	testl	%r8d,%r8d
1301	jz	.Lblocks
1302
1303.Lblocks_avx2:
1304	andq	$-16,%rdx
1305	jz	.Lno_data_avx2
1306
1307	vzeroupper
1308
1309	testl	%r8d,%r8d
1310	jz	.Lbase2_64_avx2
1311
1312	testq	$63,%rdx
1313	jz	.Leven_avx2
1314
1315	pushq	%rbx
1316.cfi_adjust_cfa_offset	8
1317.cfi_offset	%rbx,-16
1318	pushq	%rbp
1319.cfi_adjust_cfa_offset	8
1320.cfi_offset	%rbp,-24
1321	pushq	%r12
1322.cfi_adjust_cfa_offset	8
1323.cfi_offset	%r12,-32
1324	pushq	%r13
1325.cfi_adjust_cfa_offset	8
1326.cfi_offset	%r13,-40
1327	pushq	%r14
1328.cfi_adjust_cfa_offset	8
1329.cfi_offset	%r14,-48
1330	pushq	%r15
1331.cfi_adjust_cfa_offset	8
1332.cfi_offset	%r15,-56
1333.Lblocks_avx2_body:
1334
1335	movq	%rdx,%r15
1336
1337	movq	0(%rdi),%r8
1338	movq	8(%rdi),%r9
1339	movl	16(%rdi),%ebp
1340
1341	movq	24(%rdi),%r11
1342	movq	32(%rdi),%r13
1343
1344
1345	movl	%r8d,%r14d
1346	andq	$-2147483648,%r8
1347	movq	%r9,%r12
1348	movl	%r9d,%ebx
1349	andq	$-2147483648,%r9
1350
1351	shrq	$6,%r8
1352	shlq	$52,%r12
1353	addq	%r8,%r14
1354	shrq	$12,%rbx
1355	shrq	$18,%r9
1356	addq	%r12,%r14
1357	adcq	%r9,%rbx
1358
1359	movq	%rbp,%r8
1360	shlq	$40,%r8
1361	shrq	$24,%rbp
1362	addq	%r8,%rbx
1363	adcq	$0,%rbp
1364
1365	movq	$-4,%r9
1366	movq	%rbp,%r8
1367	andq	%rbp,%r9
1368	shrq	$2,%r8
1369	andq	$3,%rbp
1370	addq	%r9,%r8
1371	addq	%r8,%r14
1372	adcq	$0,%rbx
1373	adcq	$0,%rbp
1374
1375	movq	%r13,%r12
1376	movq	%r13,%rax
1377	shrq	$2,%r13
1378	addq	%r12,%r13
1379
1380.Lbase2_26_pre_avx2:
1381	addq	0(%rsi),%r14
1382	adcq	8(%rsi),%rbx
1383	leaq	16(%rsi),%rsi
1384	adcq	%rcx,%rbp
1385	subq	$16,%r15
1386
1387	call	__poly1305_block
1388	movq	%r12,%rax
1389
1390	testq	$63,%r15
1391	jnz	.Lbase2_26_pre_avx2
1392
1393	testq	%rcx,%rcx
1394	jz	.Lstore_base2_64_avx2
1395
1396
1397	movq	%r14,%rax
1398	movq	%r14,%rdx
1399	shrq	$52,%r14
1400	movq	%rbx,%r11
1401	movq	%rbx,%r12
1402	shrq	$26,%rdx
1403	andq	$0x3ffffff,%rax
1404	shlq	$12,%r11
1405	andq	$0x3ffffff,%rdx
1406	shrq	$14,%rbx
1407	orq	%r11,%r14
1408	shlq	$24,%rbp
1409	andq	$0x3ffffff,%r14
1410	shrq	$40,%r12
1411	andq	$0x3ffffff,%rbx
1412	orq	%r12,%rbp
1413
1414	testq	%r15,%r15
1415	jz	.Lstore_base2_26_avx2
1416
1417	vmovd	%eax,%xmm0
1418	vmovd	%edx,%xmm1
1419	vmovd	%r14d,%xmm2
1420	vmovd	%ebx,%xmm3
1421	vmovd	%ebp,%xmm4
1422	jmp	.Lproceed_avx2
1423
1424.align	32
1425.Lstore_base2_64_avx2:
1426	movq	%r14,0(%rdi)
1427	movq	%rbx,8(%rdi)
1428	movq	%rbp,16(%rdi)
1429	jmp	.Ldone_avx2
1430
1431.align	16
1432.Lstore_base2_26_avx2:
1433	movl	%eax,0(%rdi)
1434	movl	%edx,4(%rdi)
1435	movl	%r14d,8(%rdi)
1436	movl	%ebx,12(%rdi)
1437	movl	%ebp,16(%rdi)
1438.align	16
1439.Ldone_avx2:
1440	movq	0(%rsp),%r15
1441.cfi_restore	%r15
1442	movq	8(%rsp),%r14
1443.cfi_restore	%r14
1444	movq	16(%rsp),%r13
1445.cfi_restore	%r13
1446	movq	24(%rsp),%r12
1447.cfi_restore	%r12
1448	movq	32(%rsp),%rbp
1449.cfi_restore	%rbp
1450	movq	40(%rsp),%rbx
1451.cfi_restore	%rbx
1452	leaq	48(%rsp),%rsp
1453.cfi_adjust_cfa_offset	-48
1454.Lno_data_avx2:
1455.Lblocks_avx2_epilogue:
1456	.byte	0xf3,0xc3
1457.cfi_endproc
1458
1459.align	32
1460.Lbase2_64_avx2:
1461.cfi_startproc
1462	pushq	%rbx
1463.cfi_adjust_cfa_offset	8
1464.cfi_offset	%rbx,-16
1465	pushq	%rbp
1466.cfi_adjust_cfa_offset	8
1467.cfi_offset	%rbp,-24
1468	pushq	%r12
1469.cfi_adjust_cfa_offset	8
1470.cfi_offset	%r12,-32
1471	pushq	%r13
1472.cfi_adjust_cfa_offset	8
1473.cfi_offset	%r13,-40
1474	pushq	%r14
1475.cfi_adjust_cfa_offset	8
1476.cfi_offset	%r14,-48
1477	pushq	%r15
1478.cfi_adjust_cfa_offset	8
1479.cfi_offset	%r15,-56
1480.Lbase2_64_avx2_body:
1481
1482	movq	%rdx,%r15
1483
1484	movq	24(%rdi),%r11
1485	movq	32(%rdi),%r13
1486
1487	movq	0(%rdi),%r14
1488	movq	8(%rdi),%rbx
1489	movl	16(%rdi),%ebp
1490
1491	movq	%r13,%r12
1492	movq	%r13,%rax
1493	shrq	$2,%r13
1494	addq	%r12,%r13
1495
1496	testq	$63,%rdx
1497	jz	.Linit_avx2
1498
1499.Lbase2_64_pre_avx2:
1500	addq	0(%rsi),%r14
1501	adcq	8(%rsi),%rbx
1502	leaq	16(%rsi),%rsi
1503	adcq	%rcx,%rbp
1504	subq	$16,%r15
1505
1506	call	__poly1305_block
1507	movq	%r12,%rax
1508
1509	testq	$63,%r15
1510	jnz	.Lbase2_64_pre_avx2
1511
1512.Linit_avx2:
1513
1514	movq	%r14,%rax
1515	movq	%r14,%rdx
1516	shrq	$52,%r14
1517	movq	%rbx,%r8
1518	movq	%rbx,%r9
1519	shrq	$26,%rdx
1520	andq	$0x3ffffff,%rax
1521	shlq	$12,%r8
1522	andq	$0x3ffffff,%rdx
1523	shrq	$14,%rbx
1524	orq	%r8,%r14
1525	shlq	$24,%rbp
1526	andq	$0x3ffffff,%r14
1527	shrq	$40,%r9
1528	andq	$0x3ffffff,%rbx
1529	orq	%r9,%rbp
1530
1531	vmovd	%eax,%xmm0
1532	vmovd	%edx,%xmm1
1533	vmovd	%r14d,%xmm2
1534	vmovd	%ebx,%xmm3
1535	vmovd	%ebp,%xmm4
1536	movl	$1,20(%rdi)
1537
1538	call	__poly1305_init_avx
1539
1540.Lproceed_avx2:
1541	movq	%r15,%rdx
1542	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1543	movl	$3221291008,%r11d
1544
1545	movq	0(%rsp),%r15
1546.cfi_restore	%r15
1547	movq	8(%rsp),%r14
1548.cfi_restore	%r14
1549	movq	16(%rsp),%r13
1550.cfi_restore	%r13
1551	movq	24(%rsp),%r12
1552.cfi_restore	%r12
1553	movq	32(%rsp),%rbp
1554.cfi_restore	%rbp
1555	movq	40(%rsp),%rbx
1556.cfi_restore	%rbx
1557	leaq	48(%rsp),%rax
1558	leaq	48(%rsp),%rsp
1559.cfi_adjust_cfa_offset	-48
1560.Lbase2_64_avx2_epilogue:
1561	jmp	.Ldo_avx2
1562.cfi_endproc
1563
1564.align	32
1565.Leven_avx2:
1566.cfi_startproc
1567	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1568	vmovd	0(%rdi),%xmm0
1569	vmovd	4(%rdi),%xmm1
1570	vmovd	8(%rdi),%xmm2
1571	vmovd	12(%rdi),%xmm3
1572	vmovd	16(%rdi),%xmm4
1573
1574.Ldo_avx2:
1575	leaq	-8(%rsp),%r11
1576.cfi_def_cfa	%r11,16
1577	subq	$0x128,%rsp
1578	leaq	.Lconst(%rip),%rcx
1579	leaq	48+64(%rdi),%rdi
1580	vmovdqa	96(%rcx),%ymm7
1581
1582
1583	vmovdqu	-64(%rdi),%xmm9
1584	andq	$-512,%rsp
1585	vmovdqu	-48(%rdi),%xmm10
1586	vmovdqu	-32(%rdi),%xmm6
1587	vmovdqu	-16(%rdi),%xmm11
1588	vmovdqu	0(%rdi),%xmm12
1589	vmovdqu	16(%rdi),%xmm13
1590	leaq	144(%rsp),%rax
1591	vmovdqu	32(%rdi),%xmm14
1592	vpermd	%ymm9,%ymm7,%ymm9
1593	vmovdqu	48(%rdi),%xmm15
1594	vpermd	%ymm10,%ymm7,%ymm10
1595	vmovdqu	64(%rdi),%xmm5
1596	vpermd	%ymm6,%ymm7,%ymm6
1597	vmovdqa	%ymm9,0(%rsp)
1598	vpermd	%ymm11,%ymm7,%ymm11
1599	vmovdqa	%ymm10,32-144(%rax)
1600	vpermd	%ymm12,%ymm7,%ymm12
1601	vmovdqa	%ymm6,64-144(%rax)
1602	vpermd	%ymm13,%ymm7,%ymm13
1603	vmovdqa	%ymm11,96-144(%rax)
1604	vpermd	%ymm14,%ymm7,%ymm14
1605	vmovdqa	%ymm12,128-144(%rax)
1606	vpermd	%ymm15,%ymm7,%ymm15
1607	vmovdqa	%ymm13,160-144(%rax)
1608	vpermd	%ymm5,%ymm7,%ymm5
1609	vmovdqa	%ymm14,192-144(%rax)
1610	vmovdqa	%ymm15,224-144(%rax)
1611	vmovdqa	%ymm5,256-144(%rax)
1612	vmovdqa	64(%rcx),%ymm5
1613
1614
1615
1616	vmovdqu	0(%rsi),%xmm7
1617	vmovdqu	16(%rsi),%xmm8
1618	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1619	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1620	leaq	64(%rsi),%rsi
1621
1622	vpsrldq	$6,%ymm7,%ymm9
1623	vpsrldq	$6,%ymm8,%ymm10
1624	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1625	vpunpcklqdq	%ymm10,%ymm9,%ymm9
1626	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1627
1628	vpsrlq	$30,%ymm9,%ymm10
1629	vpsrlq	$4,%ymm9,%ymm9
1630	vpsrlq	$26,%ymm7,%ymm8
1631	vpsrlq	$40,%ymm6,%ymm6
1632	vpand	%ymm5,%ymm9,%ymm9
1633	vpand	%ymm5,%ymm7,%ymm7
1634	vpand	%ymm5,%ymm8,%ymm8
1635	vpand	%ymm5,%ymm10,%ymm10
1636	vpor	32(%rcx),%ymm6,%ymm6
1637
1638	vpaddq	%ymm2,%ymm9,%ymm2
1639	subq	$64,%rdx
1640	jz	.Ltail_avx2
1641	jmp	.Loop_avx2
1642
1643.align	32
1644.Loop_avx2:
1645
1646
1647
1648
1649
1650
1651
1652
1653	vpaddq	%ymm0,%ymm7,%ymm0
1654	vmovdqa	0(%rsp),%ymm7
1655	vpaddq	%ymm1,%ymm8,%ymm1
1656	vmovdqa	32(%rsp),%ymm8
1657	vpaddq	%ymm3,%ymm10,%ymm3
1658	vmovdqa	96(%rsp),%ymm9
1659	vpaddq	%ymm4,%ymm6,%ymm4
1660	vmovdqa	48(%rax),%ymm10
1661	vmovdqa	112(%rax),%ymm5
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678	vpmuludq	%ymm2,%ymm7,%ymm13
1679	vpmuludq	%ymm2,%ymm8,%ymm14
1680	vpmuludq	%ymm2,%ymm9,%ymm15
1681	vpmuludq	%ymm2,%ymm10,%ymm11
1682	vpmuludq	%ymm2,%ymm5,%ymm12
1683
1684	vpmuludq	%ymm0,%ymm8,%ymm6
1685	vpmuludq	%ymm1,%ymm8,%ymm2
1686	vpaddq	%ymm6,%ymm12,%ymm12
1687	vpaddq	%ymm2,%ymm13,%ymm13
1688	vpmuludq	%ymm3,%ymm8,%ymm6
1689	vpmuludq	64(%rsp),%ymm4,%ymm2
1690	vpaddq	%ymm6,%ymm15,%ymm15
1691	vpaddq	%ymm2,%ymm11,%ymm11
1692	vmovdqa	-16(%rax),%ymm8
1693
1694	vpmuludq	%ymm0,%ymm7,%ymm6
1695	vpmuludq	%ymm1,%ymm7,%ymm2
1696	vpaddq	%ymm6,%ymm11,%ymm11
1697	vpaddq	%ymm2,%ymm12,%ymm12
1698	vpmuludq	%ymm3,%ymm7,%ymm6
1699	vpmuludq	%ymm4,%ymm7,%ymm2
1700	vmovdqu	0(%rsi),%xmm7
1701	vpaddq	%ymm6,%ymm14,%ymm14
1702	vpaddq	%ymm2,%ymm15,%ymm15
1703	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1704
1705	vpmuludq	%ymm3,%ymm8,%ymm6
1706	vpmuludq	%ymm4,%ymm8,%ymm2
1707	vmovdqu	16(%rsi),%xmm8
1708	vpaddq	%ymm6,%ymm11,%ymm11
1709	vpaddq	%ymm2,%ymm12,%ymm12
1710	vmovdqa	16(%rax),%ymm2
1711	vpmuludq	%ymm1,%ymm9,%ymm6
1712	vpmuludq	%ymm0,%ymm9,%ymm9
1713	vpaddq	%ymm6,%ymm14,%ymm14
1714	vpaddq	%ymm9,%ymm13,%ymm13
1715	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1716	leaq	64(%rsi),%rsi
1717
1718	vpmuludq	%ymm1,%ymm2,%ymm6
1719	vpmuludq	%ymm0,%ymm2,%ymm2
1720	vpsrldq	$6,%ymm7,%ymm9
1721	vpaddq	%ymm6,%ymm15,%ymm15
1722	vpaddq	%ymm2,%ymm14,%ymm14
1723	vpmuludq	%ymm3,%ymm10,%ymm6
1724	vpmuludq	%ymm4,%ymm10,%ymm2
1725	vpsrldq	$6,%ymm8,%ymm10
1726	vpaddq	%ymm6,%ymm12,%ymm12
1727	vpaddq	%ymm2,%ymm13,%ymm13
1728	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1729
1730	vpmuludq	%ymm3,%ymm5,%ymm3
1731	vpmuludq	%ymm4,%ymm5,%ymm4
1732	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1733	vpaddq	%ymm3,%ymm13,%ymm2
1734	vpaddq	%ymm4,%ymm14,%ymm3
1735	vpunpcklqdq	%ymm10,%ymm9,%ymm10
1736	vpmuludq	80(%rax),%ymm0,%ymm4
1737	vpmuludq	%ymm1,%ymm5,%ymm0
1738	vmovdqa	64(%rcx),%ymm5
1739	vpaddq	%ymm4,%ymm15,%ymm4
1740	vpaddq	%ymm0,%ymm11,%ymm0
1741
1742
1743
1744
1745	vpsrlq	$26,%ymm3,%ymm14
1746	vpand	%ymm5,%ymm3,%ymm3
1747	vpaddq	%ymm14,%ymm4,%ymm4
1748
1749	vpsrlq	$26,%ymm0,%ymm11
1750	vpand	%ymm5,%ymm0,%ymm0
1751	vpaddq	%ymm11,%ymm12,%ymm1
1752
1753	vpsrlq	$26,%ymm4,%ymm15
1754	vpand	%ymm5,%ymm4,%ymm4
1755
1756	vpsrlq	$4,%ymm10,%ymm9
1757
1758	vpsrlq	$26,%ymm1,%ymm12
1759	vpand	%ymm5,%ymm1,%ymm1
1760	vpaddq	%ymm12,%ymm2,%ymm2
1761
1762	vpaddq	%ymm15,%ymm0,%ymm0
1763	vpsllq	$2,%ymm15,%ymm15
1764	vpaddq	%ymm15,%ymm0,%ymm0
1765
1766	vpand	%ymm5,%ymm9,%ymm9
1767	vpsrlq	$26,%ymm7,%ymm8
1768
1769	vpsrlq	$26,%ymm2,%ymm13
1770	vpand	%ymm5,%ymm2,%ymm2
1771	vpaddq	%ymm13,%ymm3,%ymm3
1772
1773	vpaddq	%ymm9,%ymm2,%ymm2
1774	vpsrlq	$30,%ymm10,%ymm10
1775
1776	vpsrlq	$26,%ymm0,%ymm11
1777	vpand	%ymm5,%ymm0,%ymm0
1778	vpaddq	%ymm11,%ymm1,%ymm1
1779
1780	vpsrlq	$40,%ymm6,%ymm6
1781
1782	vpsrlq	$26,%ymm3,%ymm14
1783	vpand	%ymm5,%ymm3,%ymm3
1784	vpaddq	%ymm14,%ymm4,%ymm4
1785
1786	vpand	%ymm5,%ymm7,%ymm7
1787	vpand	%ymm5,%ymm8,%ymm8
1788	vpand	%ymm5,%ymm10,%ymm10
1789	vpor	32(%rcx),%ymm6,%ymm6
1790
1791	subq	$64,%rdx
1792	jnz	.Loop_avx2
1793
1794.byte	0x66,0x90
1795.Ltail_avx2:
1796
1797
1798
1799
1800
1801
1802
1803	vpaddq	%ymm0,%ymm7,%ymm0
1804	vmovdqu	4(%rsp),%ymm7
1805	vpaddq	%ymm1,%ymm8,%ymm1
1806	vmovdqu	36(%rsp),%ymm8
1807	vpaddq	%ymm3,%ymm10,%ymm3
1808	vmovdqu	100(%rsp),%ymm9
1809	vpaddq	%ymm4,%ymm6,%ymm4
1810	vmovdqu	52(%rax),%ymm10
1811	vmovdqu	116(%rax),%ymm5
1812
1813	vpmuludq	%ymm2,%ymm7,%ymm13
1814	vpmuludq	%ymm2,%ymm8,%ymm14
1815	vpmuludq	%ymm2,%ymm9,%ymm15
1816	vpmuludq	%ymm2,%ymm10,%ymm11
1817	vpmuludq	%ymm2,%ymm5,%ymm12
1818
1819	vpmuludq	%ymm0,%ymm8,%ymm6
1820	vpmuludq	%ymm1,%ymm8,%ymm2
1821	vpaddq	%ymm6,%ymm12,%ymm12
1822	vpaddq	%ymm2,%ymm13,%ymm13
1823	vpmuludq	%ymm3,%ymm8,%ymm6
1824	vpmuludq	68(%rsp),%ymm4,%ymm2
1825	vpaddq	%ymm6,%ymm15,%ymm15
1826	vpaddq	%ymm2,%ymm11,%ymm11
1827
1828	vpmuludq	%ymm0,%ymm7,%ymm6
1829	vpmuludq	%ymm1,%ymm7,%ymm2
1830	vpaddq	%ymm6,%ymm11,%ymm11
1831	vmovdqu	-12(%rax),%ymm8
1832	vpaddq	%ymm2,%ymm12,%ymm12
1833	vpmuludq	%ymm3,%ymm7,%ymm6
1834	vpmuludq	%ymm4,%ymm7,%ymm2
1835	vpaddq	%ymm6,%ymm14,%ymm14
1836	vpaddq	%ymm2,%ymm15,%ymm15
1837
1838	vpmuludq	%ymm3,%ymm8,%ymm6
1839	vpmuludq	%ymm4,%ymm8,%ymm2
1840	vpaddq	%ymm6,%ymm11,%ymm11
1841	vpaddq	%ymm2,%ymm12,%ymm12
1842	vmovdqu	20(%rax),%ymm2
1843	vpmuludq	%ymm1,%ymm9,%ymm6
1844	vpmuludq	%ymm0,%ymm9,%ymm9
1845	vpaddq	%ymm6,%ymm14,%ymm14
1846	vpaddq	%ymm9,%ymm13,%ymm13
1847
1848	vpmuludq	%ymm1,%ymm2,%ymm6
1849	vpmuludq	%ymm0,%ymm2,%ymm2
1850	vpaddq	%ymm6,%ymm15,%ymm15
1851	vpaddq	%ymm2,%ymm14,%ymm14
1852	vpmuludq	%ymm3,%ymm10,%ymm6
1853	vpmuludq	%ymm4,%ymm10,%ymm2
1854	vpaddq	%ymm6,%ymm12,%ymm12
1855	vpaddq	%ymm2,%ymm13,%ymm13
1856
1857	vpmuludq	%ymm3,%ymm5,%ymm3
1858	vpmuludq	%ymm4,%ymm5,%ymm4
1859	vpaddq	%ymm3,%ymm13,%ymm2
1860	vpaddq	%ymm4,%ymm14,%ymm3
1861	vpmuludq	84(%rax),%ymm0,%ymm4
1862	vpmuludq	%ymm1,%ymm5,%ymm0
1863	vmovdqa	64(%rcx),%ymm5
1864	vpaddq	%ymm4,%ymm15,%ymm4
1865	vpaddq	%ymm0,%ymm11,%ymm0
1866
1867
1868
1869
1870	vpsrldq	$8,%ymm12,%ymm8
1871	vpsrldq	$8,%ymm2,%ymm9
1872	vpsrldq	$8,%ymm3,%ymm10
1873	vpsrldq	$8,%ymm4,%ymm6
1874	vpsrldq	$8,%ymm0,%ymm7
1875	vpaddq	%ymm8,%ymm12,%ymm12
1876	vpaddq	%ymm9,%ymm2,%ymm2
1877	vpaddq	%ymm10,%ymm3,%ymm3
1878	vpaddq	%ymm6,%ymm4,%ymm4
1879	vpaddq	%ymm7,%ymm0,%ymm0
1880
1881	vpermq	$0x2,%ymm3,%ymm10
1882	vpermq	$0x2,%ymm4,%ymm6
1883	vpermq	$0x2,%ymm0,%ymm7
1884	vpermq	$0x2,%ymm12,%ymm8
1885	vpermq	$0x2,%ymm2,%ymm9
1886	vpaddq	%ymm10,%ymm3,%ymm3
1887	vpaddq	%ymm6,%ymm4,%ymm4
1888	vpaddq	%ymm7,%ymm0,%ymm0
1889	vpaddq	%ymm8,%ymm12,%ymm12
1890	vpaddq	%ymm9,%ymm2,%ymm2
1891
1892
1893
1894
1895	vpsrlq	$26,%ymm3,%ymm14
1896	vpand	%ymm5,%ymm3,%ymm3
1897	vpaddq	%ymm14,%ymm4,%ymm4
1898
1899	vpsrlq	$26,%ymm0,%ymm11
1900	vpand	%ymm5,%ymm0,%ymm0
1901	vpaddq	%ymm11,%ymm12,%ymm1
1902
1903	vpsrlq	$26,%ymm4,%ymm15
1904	vpand	%ymm5,%ymm4,%ymm4
1905
1906	vpsrlq	$26,%ymm1,%ymm12
1907	vpand	%ymm5,%ymm1,%ymm1
1908	vpaddq	%ymm12,%ymm2,%ymm2
1909
1910	vpaddq	%ymm15,%ymm0,%ymm0
1911	vpsllq	$2,%ymm15,%ymm15
1912	vpaddq	%ymm15,%ymm0,%ymm0
1913
1914	vpsrlq	$26,%ymm2,%ymm13
1915	vpand	%ymm5,%ymm2,%ymm2
1916	vpaddq	%ymm13,%ymm3,%ymm3
1917
1918	vpsrlq	$26,%ymm0,%ymm11
1919	vpand	%ymm5,%ymm0,%ymm0
1920	vpaddq	%ymm11,%ymm1,%ymm1
1921
1922	vpsrlq	$26,%ymm3,%ymm14
1923	vpand	%ymm5,%ymm3,%ymm3
1924	vpaddq	%ymm14,%ymm4,%ymm4
1925
1926	vmovd	%xmm0,-112(%rdi)
1927	vmovd	%xmm1,-108(%rdi)
1928	vmovd	%xmm2,-104(%rdi)
1929	vmovd	%xmm3,-100(%rdi)
1930	vmovd	%xmm4,-96(%rdi)
1931	leaq	8(%r11),%rsp
1932.cfi_def_cfa	%rsp,8
1933	vzeroupper
1934	.byte	0xf3,0xc3
1935.cfi_endproc
1936.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
.align	64
/*
 * Constant pool.  Code loads .Lconst into a register and addresses the
 * entries below by byte offset, so the size and order of every entry up
 * to the second .align must not change.  Entries made of 32-bit pairs
 * "value,0" are written as one little-endian .quad per 64-bit lane.
 */
.Lconst:
.Lmask24:				/* .Lconst+0: low 24 bits of each 64-bit lane */
.quad	0x0000000000ffffff,0x0000000000ffffff
.quad	0x0000000000ffffff,0x0000000000ffffff
.L129:					/* .Lconst+32: 1<<24 per lane; the AVX2 code ORs it into the top limb of each input block */
.quad	0x0000000001000000,0x0000000001000000
.quad	0x0000000001000000,0x0000000001000000
.Lmask26:				/* .Lconst+64: low 26 bits per lane; the AVX2 code uses it as the limb mask */
.quad	0x0000000003ffffff,0x0000000003ffffff
.quad	0x0000000003ffffff,0x0000000003ffffff
.Lpermd_avx2:				/* .Lconst+96: dword index vector the AVX2 code feeds to vpermd */
.long	2,2,2,3
.long	2,0,2,1
.Lpermd_avx512:				/* .Lconst+128: 16 dword indices; no user in the reviewed part of the file */
.long	0,0,0,1
.long	0,2,0,3
.long	0,4,0,5
.long	0,6,0,7

/*
 * The .L2_44_* and .Lx_mask* tables hold 44/44/42-bit masks and shift
 * counts.  NOTE(review): presumably for a base 2^44 code path; nothing in
 * the reviewed part of the file references them.
 */
.L2_44_inp_permd:
.long	0,1,1,2
.long	2,3,7,7
.L2_44_inp_shift:
.quad	0,12
.quad	24,64
.L2_44_mask:
.quad	0x00000fffffffffff,0x00000fffffffffff
.quad	0x000003ffffffffff,-1
.L2_44_shift_rgt:
.quad	44,44
.quad	42,64
.L2_44_shift_lft:
.quad	8,8
.quad	10,64

.align	64
.Lx_mask44:				/* eight lanes, low 44 bits set */
.rept	8
.quad	0x00000fffffffffff
.endr
.Lx_mask42:				/* eight lanes, low 42 bits set */
.rept	8
.quad	0x000003ffffffffff
.endr
/* NUL-terminated ident string, same bytes as the original .byte list. */
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>"
1969.align	16
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
/*
 * xor128_encrypt_n_pad
 *
 * C-equivalent view (System V AMD64):
 *	void *xor128_encrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
 *
 * In:	%rdi = out	destination for the XOR result
 *	%rsi = inp	source bytes
 *	%rdx = otp	pad buffer, read and overwritten in place; it must be
 *			16-byte aligned when len >= 16 (movdqa and the pxor
 *			memory operand fault otherwise)
 *	%rcx = len	byte count
 *
 * For each i < len: out[i] = inp[i] ^ otp[i], and otp[i] is replaced by
 * that same result byte.  If len is not a multiple of 16, otp is then
 * zero-filled up to the next multiple of 16.
 *
 * Out:	%rax = otp advanced past the last byte written to it.
 * Clobbers: %rcx, %rdx, %rsi, %rdi, %r10, %xmm0, flags.
 *
 * len == 0 returns otp unchanged and touches no memory.  The short-input
 * path used to enter the byte loop with a zero counter, which wrapped and
 * ran off the end of all three buffers.
 */
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		/* rsi = inp - otp, so (%rsi,%rdx) walks inp */
	subq	%rdx,%rdi		/* rdi = out - otp, so (%rdi,%rdx) walks out */
	movq	%rcx,%r10		/* keep len for the remainder computation */
	shrq	$4,%rcx			/* rcx = number of whole 16-byte blocks */
	jz	.Lrem_enc		/* none: go to the shared remainder check */
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0
	pxor	(%rdx),%xmm0
	movdqu	%xmm0,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)		/* otp block <- result block */
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

.Lrem_enc:
	andq	$15,%r10		/* r10 = len % 16 */
	jz	.Ldone_enc		/* nothing left, also covers len == 0 */

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx		/* rcx = pad bytes needed, 1..15 */
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al
	xorb	(%rdx),%al
	movb	%al,(%rdi,%rdx,1)
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax		/* pad byte value = 0 */
.Loop_enc_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax		/* return the advanced otp pointer */
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
2018
.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
/*
 * xor128_decrypt_n_pad
 *
 * C-equivalent view (System V AMD64):
 *	void *xor128_decrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
 *
 * In:	%rdi = out	destination for the XOR result
 *	%rsi = inp	source bytes
 *	%rdx = otp	pad buffer, read and overwritten in place; it must be
 *			16-byte aligned when len >= 16 (movdqa faults otherwise)
 *	%rcx = len	byte count
 *
 * For each i < len: out[i] = inp[i] ^ otp[i], and otp[i] is replaced by
 * the INPUT byte inp[i].  If len is not a multiple of 16, otp is then
 * zero-filled up to the next multiple of 16.
 *
 * Out:	%rax = otp advanced past the last byte written to it.
 * Clobbers: %rcx, %rdx, %rsi, %rdi, %r10, %r11, %xmm0, %xmm1, flags.
 *
 * len == 0 returns otp unchanged and touches no memory.  The short-input
 * path used to enter the byte loop with a zero counter, which wrapped and
 * ran off the end of all three buffers.
 */
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		/* rsi = inp - otp, so (%rsi,%rdx) walks inp */
	subq	%rdx,%rdi		/* rdi = out - otp, so (%rdi,%rdx) walks out */
	movq	%rcx,%r10		/* keep len for the remainder computation */
	shrq	$4,%rcx			/* rcx = number of whole 16-byte blocks */
	jz	.Lrem_dec		/* none: go to the shared remainder check */
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0	/* xmm0 = input block */
	movdqa	(%rdx),%xmm1
	pxor	%xmm0,%xmm1		/* xmm1 = input ^ otp */
	movdqu	%xmm1,(%rdi,%rdx,1)
	movdqa	%xmm0,(%rdx)		/* otp block <- input block */
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1		/* clear the register that held the last output block */
.Lrem_dec:
	andq	$15,%r10		/* r10 = len % 16 */
	jz	.Ldone_dec		/* nothing left, also covers len == 0 */

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx		/* rcx = pad bytes needed, 1..15 */
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b	/* r11b = input byte */
	movb	(%rdx),%al
	xorb	%r11b,%al
	movb	%al,(%rdi,%rdx,1)
	movb	%r11b,(%rdx)		/* otp byte <- input byte */
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax		/* pad byte value = 0 */
.Loop_dec_pad:
	movb	%al,(%rdx)
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax		/* return the advanced otp pointer */
	.byte	0xf3,0xc3		/* rep ret */
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
2071