xref: /freebsd/sys/crypto/openssl/amd64/poly1305-x86_64.S (revision 24e4dcf4ba5e9dedcf89efd358ea3e1fe5867020)
1/* Do not modify. This file is auto-generated from poly1305-x86_64.pl. */
2.text
3
4
5
6.globl	poly1305_init
7.hidden	poly1305_init
8.globl	poly1305_blocks
9.hidden	poly1305_blocks
10.globl	poly1305_emit
11.hidden	poly1305_emit
12
# int poly1305_init(void *ctx /*rdi*/, const u8 key[32] /*rsi*/, void *func[2] /*rdx*/)
# SysV AMD64.  Zeroes the 130-bit accumulator h (three 64-bit words at
# ctx+0/8/16).  With a NULL key, returns 0.  Otherwise it:
#  - selects blocks/emit implementations from the OPENSSL_ia32cap_P
#    capability vector and stores the two pointers at func[0]/func[1];
#  - clamps the first 16 key bytes into r (the 0x0ffffffc0ffffffc /
#    0x0ffffffc0fffffff masks are the standard Poly1305 "clamp") and
#    stores r at ctx+24/ctx+32;
#  - returns 1.
13.type	poly1305_init,@function
14.align	32
15poly1305_init:
16.cfi_startproc
17	xorq	%rax,%rax
18	movq	%rax,0(%rdi)	# h0 = 0
19	movq	%rax,8(%rdi)	# h1 = 0
20	movq	%rax,16(%rdi)	# h2 = 0
21
22	cmpq	$0,%rsi
23	je	.Lno_key	# NULL key: just clear h, return 0
24
	# Default to the scalar implementations, then upgrade based on
	# CPU features.  r9 = 64 bits of OPENSSL_ia32cap_P starting at
	# offset 4 (ECX feature dword + extended-feature dword).
25	leaq	poly1305_blocks(%rip),%r10
26	leaq	poly1305_emit(%rip),%r11
27	movq	OPENSSL_ia32cap_P+4(%rip),%r9
28	leaq	poly1305_blocks_avx(%rip),%rax
29	leaq	poly1305_emit_avx(%rip),%rcx
30	btq	$28,%r9	# bit 28: AVX (per OpenSSL ia32cap layout)
31	cmovcq	%rax,%r10
32	cmovcq	%rcx,%r11
33	leaq	poly1305_blocks_avx2(%rip),%rax
34	btq	$37,%r9	# bit 37: AVX2 (per OpenSSL ia32cap layout)
35	cmovcq	%rax,%r10
	# Clamp the key into r = r0:r1 and store in the context.
36	movq	$0x0ffffffc0fffffff,%rax
37	movq	$0x0ffffffc0ffffffc,%rcx
38	andq	0(%rsi),%rax
39	andq	8(%rsi),%rcx
40	movq	%rax,24(%rdi)	# r0
41	movq	%rcx,32(%rdi)	# r1
42	movq	%r10,0(%rdx)	# func[0] = blocks
43	movq	%r11,8(%rdx)	# func[1] = emit
44	movl	$1,%eax
45.Lno_key:
46	.byte	0xf3,0xc3	# repz ret
47.cfi_endproc
48.size	poly1305_init,.-poly1305_init
49
# void poly1305_blocks(void *ctx=rdi, const u8 *inp=rsi, size_t len=rdx,
#                      u32 padbit=rcx)
# Scalar base-2^64 implementation: for each full 16-byte block,
#   h = (h + block + padbit*2^128) * r  (mod 2^130-5)
# h is carried in r14:rbx:rbp (h0:h1:h2), r = r11 (r0) and r12 (r1);
# r13 is precomputed as r1 + (r1>>2) = s1, folding the *5 of the
# modular reduction into the multiplier.  Partial reduction each round:
# everything above bit 130 times 4+1 is added back into the low limbs.
50.type	poly1305_blocks,@function
51.align	32
52poly1305_blocks:
53.cfi_startproc
54.byte	243,15,30,250
55.Lblocks:
56	shrq	$4,%rdx	# rdx = number of whole 16-byte blocks
57	jz	.Lno_data
58
59	pushq	%rbx
60.cfi_adjust_cfa_offset	8
61.cfi_offset	%rbx,-16
62	pushq	%rbp
63.cfi_adjust_cfa_offset	8
64.cfi_offset	%rbp,-24
65	pushq	%r12
66.cfi_adjust_cfa_offset	8
67.cfi_offset	%r12,-32
68	pushq	%r13
69.cfi_adjust_cfa_offset	8
70.cfi_offset	%r13,-40
71	pushq	%r14
72.cfi_adjust_cfa_offset	8
73.cfi_offset	%r14,-48
74	pushq	%r15
75.cfi_adjust_cfa_offset	8
76.cfi_offset	%r15,-56
77.Lblocks_body:
78
79	movq	%rdx,%r15	# r15 = block counter
80
81	movq	24(%rdi),%r11	# r0
82	movq	32(%rdi),%r13	# r1
83
84	movq	0(%rdi),%r14	# h0
85	movq	8(%rdi),%rbx	# h1
86	movq	16(%rdi),%rbp	# h2
87
	# r12 = r1, r13 = r1 + (r1>>2) = s1; rax = r1 for the first mulq.
88	movq	%r13,%r12
89	shrq	$2,%r13
90	movq	%r12,%rax
91	addq	%r12,%r13
92	jmp	.Loop
93
94.align	32
95.Loop:
	# h += block; padbit (rcx) goes into the top limb.
96	addq	0(%rsi),%r14
97	adcq	8(%rsi),%rbx
98	leaq	16(%rsi),%rsi
99	adcq	%rcx,%rbp
	# Schoolbook h*r with the s1 folding; same sequence as
	# __poly1305_block below.  rax holds r1 here.
100	mulq	%r14	# r1*h0
101	movq	%rax,%r9
102	movq	%r11,%rax
103	movq	%rdx,%r10
104
105	mulq	%r14	# r0*h0
106	movq	%rax,%r14
107	movq	%r11,%rax
108	movq	%rdx,%r8
109
110	mulq	%rbx	# r0*h1
111	addq	%rax,%r9
112	movq	%r13,%rax
113	adcq	%rdx,%r10
114
115	mulq	%rbx	# s1*h1
116	movq	%rbp,%rbx
117	addq	%rax,%r14
118	adcq	%rdx,%r8
119
120	imulq	%r13,%rbx	# s1*h2
121	addq	%rbx,%r9
122	movq	%r8,%rbx
123	adcq	$0,%r10
124
125	imulq	%r11,%rbp	# r0*h2
126	addq	%r9,%rbx
127	movq	$-4,%rax
128	adcq	%rbp,%r10
129
	# Partial reduction mod 2^130-5: bits >=130 (r10 minus low 2 bits)
	# are multiplied by 5 (x + x>>2 on the -4 mask) and folded back in.
130	andq	%r10,%rax
131	movq	%r10,%rbp
132	shrq	$2,%r10
133	andq	$3,%rbp	# h2 keeps only bits 128..129
134	addq	%r10,%rax
135	addq	%rax,%r14
136	adcq	$0,%rbx
137	adcq	$0,%rbp
138	movq	%r12,%rax	# reload r1 for next iteration
139	decq	%r15
140	jnz	.Loop
141
142	movq	%r14,0(%rdi)	# store h back
143	movq	%rbx,8(%rdi)
144	movq	%rbp,16(%rdi)
145
146	movq	0(%rsp),%r15
147.cfi_restore	%r15
148	movq	8(%rsp),%r14
149.cfi_restore	%r14
150	movq	16(%rsp),%r13
151.cfi_restore	%r13
152	movq	24(%rsp),%r12
153.cfi_restore	%r12
154	movq	32(%rsp),%rbp
155.cfi_restore	%rbp
156	movq	40(%rsp),%rbx
157.cfi_restore	%rbx
158	leaq	48(%rsp),%rsp
159.cfi_adjust_cfa_offset	-48
160.Lno_data:
161.Lblocks_epilogue:
162	.byte	0xf3,0xc3	# repz ret
163.cfi_endproc
164.size	poly1305_blocks,.-poly1305_blocks
165
# void poly1305_emit(void *ctx=rdi, u8 mac[16]=rsi, const u32 nonce[4]=rdx)
# Final step, h assumed in base 2^64 (three words at ctx+0/8/16):
# compute h mod 2^130-5 by trial-adding 5 and testing bit 130, then add
# the 128-bit nonce and store the 16-byte tag (little-endian).
166.type	poly1305_emit,@function
167.align	32
168poly1305_emit:
169.cfi_startproc
170.byte	243,15,30,250
171.Lemit:
172	movq	0(%rdi),%r8	# h0
173	movq	8(%rdi),%r9	# h1
174	movq	16(%rdi),%r10	# h2
175
	# t = h + 5; if t >= 2^130 (bit 130 set after shr $2 of h2,
	# i.e. ZF clear) then h mod p = t, else h itself.
176	movq	%r8,%rax
177	addq	$5,%r8
178	movq	%r9,%rcx
179	adcq	$0,%r9
180	adcq	$0,%r10
181	shrq	$2,%r10	# ZF clear <=> h+5 carried past bit 129
182	cmovnzq	%r8,%rax
183	cmovnzq	%r9,%rcx
184
	# tag = (h mod p) + nonce, truncated to 128 bits.
185	addq	0(%rdx),%rax
186	adcq	8(%rdx),%rcx
187	movq	%rax,0(%rsi)
188	movq	%rcx,8(%rsi)
189
190	.byte	0xf3,0xc3	# repz ret
191.cfi_endproc
192.size	poly1305_emit,.-poly1305_emit
# Internal helper: one Poly1305 multiply h = h * r (mod 2^130-5),
# identical to one iteration of the scalar .Loop above (without the
# message add).  Register contract:
#   In:  r14:rbx:rbp = h0:h1:h2, r11 = r0, r13 = s1 = r1 + (r1>>2),
#        rax = r1 (caller loads it, typically from r12)
#   Out: r14:rbx:rbp = partially-reduced product
#   Clobbers: rax, rdx, r8, r9, r10
193.type	__poly1305_block,@function
194.align	32
195__poly1305_block:
196.cfi_startproc
197	mulq	%r14	# r1*h0
198	movq	%rax,%r9
199	movq	%r11,%rax
200	movq	%rdx,%r10
201
202	mulq	%r14	# r0*h0
203	movq	%rax,%r14
204	movq	%r11,%rax
205	movq	%rdx,%r8
206
207	mulq	%rbx	# r0*h1
208	addq	%rax,%r9
209	movq	%r13,%rax
210	adcq	%rdx,%r10
211
212	mulq	%rbx	# s1*h1
213	movq	%rbp,%rbx
214	addq	%rax,%r14
215	adcq	%rdx,%r8
216
217	imulq	%r13,%rbx	# s1*h2
218	addq	%rbx,%r9
219	movq	%r8,%rbx
220	adcq	$0,%r10
221
222	imulq	%r11,%rbp	# r0*h2
223	addq	%r9,%rbx
224	movq	$-4,%rax
225	adcq	%rbp,%r10
226
	# Fold bits >=130 back in multiplied by 5 (mask&-4 plus its >>2).
227	andq	%r10,%rax
228	movq	%r10,%rbp
229	shrq	$2,%r10
230	andq	$3,%rbp
231	addq	%r10,%rax
232	addq	%rax,%r14
233	adcq	$0,%rbx
234	adcq	$0,%rbp
235	.byte	0xf3,0xc3	# repz ret
236.cfi_endproc
237.size	__poly1305_block,.-__poly1305_block
238
# Internal helper: build the key-power table used by the AVX paths.
# Entry contract (set up by callers): r11 = r0, r12 = r1, rax = r1,
# r13 = s1.  Starting from h = r, it calls __poly1305_block three times
# to obtain r^2, r^3, r^4, converting each power to five 26-bit limbs
# and storing them (together with 5*limb copies for the upper limbs,
# via the lea (x,x,4) idiom) interleaved in the table at ctx+48..,
# laid out for the vector loads in poly1305_blocks_avx.  rdi is
# restored before returning.  Clobbers: rax, rdx, r8, r9, r14, rbx, rbp.
239.type	__poly1305_init_avx,@function
240.align	32
241__poly1305_init_avx:
242.cfi_startproc
243	movq	%r11,%r14	# h = r (h0=r0, h1=r1, h2=0)
244	movq	%r12,%rbx
245	xorq	%rbp,%rbp
246
247	leaq	48+64(%rdi),%rdi	# rdi -> middle of the power table
248
249	movq	%r12,%rax
250	call	__poly1305_block	# h = r^2
251
	# Split r (r14 here holds the running value) and r^2 (r11:r12)
	# into 26-bit limbs; store limb and, where needed, 5*limb.
252	movl	$0x3ffffff,%eax
253	movl	$0x3ffffff,%edx
254	movq	%r14,%r8
255	andl	%r14d,%eax
256	movq	%r11,%r9
257	andl	%r11d,%edx
258	movl	%eax,-64(%rdi)
259	shrq	$26,%r8
260	movl	%edx,-60(%rdi)
261	shrq	$26,%r9
262
263	movl	$0x3ffffff,%eax
264	movl	$0x3ffffff,%edx
265	andl	%r8d,%eax
266	andl	%r9d,%edx
267	movl	%eax,-48(%rdi)
268	leal	(%rax,%rax,4),%eax	# 5*limb for reduced products
269	movl	%edx,-44(%rdi)
270	leal	(%rdx,%rdx,4),%edx
271	movl	%eax,-32(%rdi)
272	shrq	$26,%r8
273	movl	%edx,-28(%rdi)
274	shrq	$26,%r9
275
276	movq	%rbx,%rax
277	movq	%r12,%rdx
278	shlq	$12,%rax
279	shlq	$12,%rdx
280	orq	%r8,%rax
281	orq	%r9,%rdx
282	andl	$0x3ffffff,%eax
283	andl	$0x3ffffff,%edx
284	movl	%eax,-16(%rdi)
285	leal	(%rax,%rax,4),%eax
286	movl	%edx,-12(%rdi)
287	leal	(%rdx,%rdx,4),%edx
288	movl	%eax,0(%rdi)
289	movq	%rbx,%r8
290	movl	%edx,4(%rdi)
291	movq	%r12,%r9
292
293	movl	$0x3ffffff,%eax
294	movl	$0x3ffffff,%edx
295	shrq	$14,%r8
296	shrq	$14,%r9
297	andl	%r8d,%eax
298	andl	%r9d,%edx
299	movl	%eax,16(%rdi)
300	leal	(%rax,%rax,4),%eax
301	movl	%edx,20(%rdi)
302	leal	(%rdx,%rdx,4),%edx
303	movl	%eax,32(%rdi)
304	shrq	$26,%r8
305	movl	%edx,36(%rdi)
306	shrq	$26,%r9
307
308	movq	%rbp,%rax
309	shlq	$24,%rax
310	orq	%rax,%r8	# top limb includes h2 bits
311	movl	%r8d,48(%rdi)
312	leaq	(%r8,%r8,4),%r8
313	movl	%r9d,52(%rdi)
314	leaq	(%r9,%r9,4),%r9
315	movl	%r8d,64(%rdi)
316	movl	%r9d,68(%rdi)
317
318	movq	%r12,%rax
319	call	__poly1305_block	# h = r^3
320
	# Store r^3 limbs (and 5*limb copies) into the odd table slots.
321	movl	$0x3ffffff,%eax
322	movq	%r14,%r8
323	andl	%r14d,%eax
324	shrq	$26,%r8
325	movl	%eax,-52(%rdi)
326
327	movl	$0x3ffffff,%edx
328	andl	%r8d,%edx
329	movl	%edx,-36(%rdi)
330	leal	(%rdx,%rdx,4),%edx
331	shrq	$26,%r8
332	movl	%edx,-20(%rdi)
333
334	movq	%rbx,%rax
335	shlq	$12,%rax
336	orq	%r8,%rax
337	andl	$0x3ffffff,%eax
338	movl	%eax,-4(%rdi)
339	leal	(%rax,%rax,4),%eax
340	movq	%rbx,%r8
341	movl	%eax,12(%rdi)
342
343	movl	$0x3ffffff,%edx
344	shrq	$14,%r8
345	andl	%r8d,%edx
346	movl	%edx,28(%rdi)
347	leal	(%rdx,%rdx,4),%edx
348	shrq	$26,%r8
349	movl	%edx,44(%rdi)
350
351	movq	%rbp,%rax
352	shlq	$24,%rax
353	orq	%rax,%r8
354	movl	%r8d,60(%rdi)
355	leaq	(%r8,%r8,4),%r8
356	movl	%r8d,76(%rdi)
357
358	movq	%r12,%rax
359	call	__poly1305_block	# h = r^4
360
	# Store r^4 limbs (and 5*limb copies).
361	movl	$0x3ffffff,%eax
362	movq	%r14,%r8
363	andl	%r14d,%eax
364	shrq	$26,%r8
365	movl	%eax,-56(%rdi)
366
367	movl	$0x3ffffff,%edx
368	andl	%r8d,%edx
369	movl	%edx,-40(%rdi)
370	leal	(%rdx,%rdx,4),%edx
371	shrq	$26,%r8
372	movl	%edx,-24(%rdi)
373
374	movq	%rbx,%rax
375	shlq	$12,%rax
376	orq	%r8,%rax
377	andl	$0x3ffffff,%eax
378	movl	%eax,-8(%rdi)
379	leal	(%rax,%rax,4),%eax
380	movq	%rbx,%r8
381	movl	%eax,8(%rdi)
382
383	movl	$0x3ffffff,%edx
384	shrq	$14,%r8
385	andl	%r8d,%edx
386	movl	%edx,24(%rdi)
387	leal	(%rdx,%rdx,4),%edx
388	shrq	$26,%r8
389	movl	%edx,40(%rdi)
390
391	movq	%rbp,%rax
392	shlq	$24,%rax
393	orq	%rax,%r8
394	movl	%r8d,56(%rdi)
395	leaq	(%r8,%r8,4),%r8
396	movl	%r8d,72(%rdi)
397
398	leaq	-48-64(%rdi),%rdi	# restore caller's ctx pointer
399	.byte	0xf3,0xc3	# repz ret
400.cfi_endproc
401.size	__poly1305_init_avx,.-__poly1305_init_avx
402
# void poly1305_blocks_avx(ctx=rdi, inp=rsi, len=rdx, padbit=rcx)
# AVX (128-bit SIMD) implementation.  ctx+20 is a flag: 0 means h is in
# base 2^64 (scalar format), nonzero means h is five 26-bit limbs.
# Short inputs (<128 bytes) still in base 2^64 fall back to the scalar
# .Lblocks.  Otherwise:
#  - .Lbase2_64_avx: first vector use — optionally absorb one scalar
#    block to make the block count even, build the r-power table
#    (__poly1305_init_avx), convert h to 2^26 limbs, set the flag.
#  - .Lblocks_avx body: h already converted but odd block count —
#    absorb one scalar block, re-split h into 26-bit limbs.
#  - .Ldo_avx / .Loop_avx: main SIMD loop, 64 bytes per iteration with
#    lazy carry propagation; .Long_tail_avx/.Lshort_tail_avx handle the
#    remaining 16/32 bytes and the final horizontal add + full carry.
# Vector registers are clobbered; vzeroupper is issued around AVX use
# per the SysV AVX/SSE transition convention.
403.type	poly1305_blocks_avx,@function
404.align	32
405poly1305_blocks_avx:
406.cfi_startproc
407.byte	243,15,30,250
408	movl	20(%rdi),%r8d	# r8d = "h is base 2^26" flag
409	cmpq	$128,%rdx
410	jae	.Lblocks_avx
411	testl	%r8d,%r8d
412	jz	.Lblocks	# short + still base 2^64: scalar path
413
414.Lblocks_avx:
415	andq	$-16,%rdx	# whole blocks only
416	jz	.Lno_data_avx
417
418	vzeroupper
419
420	testl	%r8d,%r8d
421	jz	.Lbase2_64_avx	# first vector use: must init table
422
423	testq	$31,%rdx
424	jz	.Leven_avx	# even number of blocks: straight to SIMD
425
	# Odd block count with h in base 2^26: do one scalar block first.
425	pushq	%rbx
426.cfi_adjust_cfa_offset	8
427.cfi_offset	%rbx,-16
428	pushq	%rbp
429.cfi_adjust_cfa_offset	8
430.cfi_offset	%rbp,-24
431	pushq	%r12
432.cfi_adjust_cfa_offset	8
433.cfi_offset	%r12,-32
434	pushq	%r13
435.cfi_adjust_cfa_offset	8
436.cfi_offset	%r13,-40
437	pushq	%r14
438.cfi_adjust_cfa_offset	8
439.cfi_offset	%r14,-48
440	pushq	%r15
441.cfi_adjust_cfa_offset	8
442.cfi_offset	%r15,-56
443.Lblocks_avx_body:
444
445	movq	%rdx,%r15	# r15 = remaining byte count
446
447	movq	0(%rdi),%r8
448	movq	8(%rdi),%r9
449	movl	16(%rdi),%ebp
450
451	movq	24(%rdi),%r11	# r0
452	movq	32(%rdi),%r13	# r1
453
454
	# Recombine five 26-bit limbs (stored as 32-bit words) into
	# base 2^64 h = r14:rbx:rbp.
455	movl	%r8d,%r14d
456	andq	$-2147483648,%r8
457	movq	%r9,%r12
458	movl	%r9d,%ebx
459	andq	$-2147483648,%r9
460
461	shrq	$6,%r8
462	shlq	$52,%r12
463	addq	%r8,%r14
464	shrq	$12,%rbx
465	shrq	$18,%r9
466	addq	%r12,%r14
467	adcq	%r9,%rbx
468
469	movq	%rbp,%r8
470	shlq	$40,%r8
471	shrq	$24,%rbp
472	addq	%r8,%rbx
473	adcq	$0,%rbp
474
	# Reduce any overflow above bit 130 (limbs were not canonical).
475	movq	$-4,%r9
476	movq	%rbp,%r8
477	andq	%rbp,%r9
478	shrq	$2,%r8
479	andq	$3,%rbp
480	addq	%r9,%r8
481	addq	%r8,%r14
482	adcq	$0,%rbx
483	adcq	$0,%rbp
484
485	movq	%r13,%r12	# r12 = r1, r13 = s1, rax = r1
486	movq	%r13,%rax
487	shrq	$2,%r13
488	addq	%r12,%r13
489
	# One scalar block to even up the count.
490	addq	0(%rsi),%r14
491	adcq	8(%rsi),%rbx
492	leaq	16(%rsi),%rsi
493	adcq	%rcx,%rbp
494
495	call	__poly1305_block
496
497	testq	%rcx,%rcx	# padbit==0 means final (partial) call
498	jz	.Lstore_base2_64_avx
499
	# Split h back into five 26-bit limbs: rax,rdx,r14,rbx,rbp.
500
501	movq	%r14,%rax
502	movq	%r14,%rdx
503	shrq	$52,%r14
504	movq	%rbx,%r11
505	movq	%rbx,%r12
506	shrq	$26,%rdx
507	andq	$0x3ffffff,%rax
508	shlq	$12,%r11
509	andq	$0x3ffffff,%rdx
510	shrq	$14,%rbx
511	orq	%r11,%r14
512	shlq	$24,%rbp
513	andq	$0x3ffffff,%r14
514	shrq	$40,%r12
515	andq	$0x3ffffff,%rbx
516	orq	%r12,%rbp
517
518	subq	$16,%r15
519	jz	.Lstore_base2_26_avx	# that was the only block
520
521	vmovd	%eax,%xmm0
522	vmovd	%edx,%xmm1
523	vmovd	%r14d,%xmm2
524	vmovd	%ebx,%xmm3
525	vmovd	%ebp,%xmm4
526	jmp	.Lproceed_avx	# continue in the SIMD loop
527
528.align	32
529.Lstore_base2_64_avx:
530	movq	%r14,0(%rdi)
531	movq	%rbx,8(%rdi)
532	movq	%rbp,16(%rdi)
533	jmp	.Ldone_avx
534
535.align	16
536.Lstore_base2_26_avx:
537	movl	%eax,0(%rdi)
538	movl	%edx,4(%rdi)
539	movl	%r14d,8(%rdi)
540	movl	%ebx,12(%rdi)
541	movl	%ebp,16(%rdi)
542.align	16
543.Ldone_avx:
544	movq	0(%rsp),%r15
545.cfi_restore	%r15
546	movq	8(%rsp),%r14
547.cfi_restore	%r14
548	movq	16(%rsp),%r13
549.cfi_restore	%r13
550	movq	24(%rsp),%r12
551.cfi_restore	%r12
552	movq	32(%rsp),%rbp
553.cfi_restore	%rbp
554	movq	40(%rsp),%rbx
555.cfi_restore	%rbx
556	leaq	48(%rsp),%rsp
557.cfi_adjust_cfa_offset	-48
558.Lno_data_avx:
559.Lblocks_avx_epilogue:
560	.byte	0xf3,0xc3	# repz ret
561.cfi_endproc
562
	# First vector call: h still in base 2^64 and the r-power table
	# has not been built yet.
563.align	32
564.Lbase2_64_avx:
565.cfi_startproc
566	pushq	%rbx
567.cfi_adjust_cfa_offset	8
568.cfi_offset	%rbx,-16
569	pushq	%rbp
570.cfi_adjust_cfa_offset	8
571.cfi_offset	%rbp,-24
572	pushq	%r12
573.cfi_adjust_cfa_offset	8
574.cfi_offset	%r12,-32
575	pushq	%r13
576.cfi_adjust_cfa_offset	8
577.cfi_offset	%r13,-40
578	pushq	%r14
579.cfi_adjust_cfa_offset	8
580.cfi_offset	%r14,-48
581	pushq	%r15
582.cfi_adjust_cfa_offset	8
583.cfi_offset	%r15,-56
584.Lbase2_64_avx_body:
585
586	movq	%rdx,%r15
587
588	movq	24(%rdi),%r11	# r0
589	movq	32(%rdi),%r13	# r1
590
591	movq	0(%rdi),%r14	# h in base 2^64
592	movq	8(%rdi),%rbx
593	movl	16(%rdi),%ebp
594
595	movq	%r13,%r12	# s1 = r1 + r1>>2
596	movq	%r13,%rax
597	shrq	$2,%r13
598	addq	%r12,%r13
599
600	testq	$31,%rdx
601	jz	.Linit_avx	# already even: no scalar pre-block
602
603	addq	0(%rsi),%r14	# absorb one scalar block
604	adcq	8(%rsi),%rbx
605	leaq	16(%rsi),%rsi
606	adcq	%rcx,%rbp
607	subq	$16,%r15
608
609	call	__poly1305_block
610
611.Linit_avx:
	# Convert h to five 26-bit limbs and build the power table.
612
613	movq	%r14,%rax
614	movq	%r14,%rdx
615	shrq	$52,%r14
616	movq	%rbx,%r8
617	movq	%rbx,%r9
618	shrq	$26,%rdx
619	andq	$0x3ffffff,%rax
620	shlq	$12,%r8
621	andq	$0x3ffffff,%rdx
622	shrq	$14,%rbx
623	orq	%r8,%r14
624	shlq	$24,%rbp
625	andq	$0x3ffffff,%r14
626	shrq	$40,%r9
627	andq	$0x3ffffff,%rbx
628	orq	%r9,%rbp
629
630	vmovd	%eax,%xmm0
631	vmovd	%edx,%xmm1
632	vmovd	%r14d,%xmm2
633	vmovd	%ebx,%xmm3
634	vmovd	%ebp,%xmm4
635	movl	$1,20(%rdi)	# mark h as base 2^26 from now on
636
637	call	__poly1305_init_avx
638
639.Lproceed_avx:
640	movq	%r15,%rdx	# restore remaining length
641
642	movq	0(%rsp),%r15
643.cfi_restore	%r15
644	movq	8(%rsp),%r14
645.cfi_restore	%r14
646	movq	16(%rsp),%r13
647.cfi_restore	%r13
648	movq	24(%rsp),%r12
649.cfi_restore	%r12
650	movq	32(%rsp),%rbp
651.cfi_restore	%rbp
652	movq	40(%rsp),%rbx
653.cfi_restore	%rbx
654	leaq	48(%rsp),%rax
655	leaq	48(%rsp),%rsp
656.cfi_adjust_cfa_offset	-48
657.Lbase2_64_avx_epilogue:
658	jmp	.Ldo_avx
659.cfi_endproc
660
661.align	32
662.Leven_avx:
663.cfi_startproc
	# h already in five 26-bit limbs: load straight into xmm0..xmm4.
664	vmovd	0(%rdi),%xmm0
665	vmovd	4(%rdi),%xmm1
666	vmovd	8(%rdi),%xmm2
667	vmovd	12(%rdi),%xmm3
668	vmovd	16(%rdi),%xmm4
669
670.Ldo_avx:
	# r11 anchors the caller frame (CFA); 0x178 bytes of scratch below.
671	leaq	-88(%rsp),%r11
672.cfi_def_cfa	%r11,0x60
673	subq	$0x178,%rsp
674	subq	$64,%rdx
675	leaq	-32(%rsi),%rax
676	cmovcq	%rax,%rsi	# <64 bytes: rewind so tail reload works
677
678	vmovdqu	48(%rdi),%xmm14	# first r-power vector
679	leaq	112(%rdi),%rdi	# rdi -> into the power table
680	leaq	.Lconst(%rip),%rcx	# masks/pad constants
681
682
683
	# Load and split the next 32 bytes into 26-bit limb vectors
	# xmm5..xmm9 (two blocks side by side); 32(%rcx) or's in the
	# 2^128 pad bit.
684	vmovdqu	32(%rsi),%xmm5
685	vmovdqu	48(%rsi),%xmm6
686	vmovdqa	64(%rcx),%xmm15	# xmm15 = 26-bit mask
687
688	vpsrldq	$6,%xmm5,%xmm7
689	vpsrldq	$6,%xmm6,%xmm8
690	vpunpckhqdq	%xmm6,%xmm5,%xmm9
691	vpunpcklqdq	%xmm6,%xmm5,%xmm5
692	vpunpcklqdq	%xmm8,%xmm7,%xmm8
693
694	vpsrlq	$40,%xmm9,%xmm9
695	vpsrlq	$26,%xmm5,%xmm6
696	vpand	%xmm15,%xmm5,%xmm5
697	vpsrlq	$4,%xmm8,%xmm7
698	vpand	%xmm15,%xmm6,%xmm6
699	vpsrlq	$30,%xmm8,%xmm8
700	vpand	%xmm15,%xmm7,%xmm7
701	vpand	%xmm15,%xmm8,%xmm8
702	vpor	32(%rcx),%xmm9,%xmm9
703
704	jbe	.Lskip_loop_avx	# not enough input for the main loop
705
	# Spill the r-power table, duplicated/lane-shuffled, to the stack
	# (low halves at (%rsp), high halves below r11) for the loop.
706
707	vmovdqu	-48(%rdi),%xmm11
708	vmovdqu	-32(%rdi),%xmm12
709	vpshufd	$0xEE,%xmm14,%xmm13
710	vpshufd	$0x44,%xmm14,%xmm10
711	vmovdqa	%xmm13,-144(%r11)
712	vmovdqa	%xmm10,0(%rsp)
713	vpshufd	$0xEE,%xmm11,%xmm14
714	vmovdqu	-16(%rdi),%xmm10
715	vpshufd	$0x44,%xmm11,%xmm11
716	vmovdqa	%xmm14,-128(%r11)
717	vmovdqa	%xmm11,16(%rsp)
718	vpshufd	$0xEE,%xmm12,%xmm13
719	vmovdqu	0(%rdi),%xmm11
720	vpshufd	$0x44,%xmm12,%xmm12
721	vmovdqa	%xmm13,-112(%r11)
722	vmovdqa	%xmm12,32(%rsp)
723	vpshufd	$0xEE,%xmm10,%xmm14
724	vmovdqu	16(%rdi),%xmm12
725	vpshufd	$0x44,%xmm10,%xmm10
726	vmovdqa	%xmm14,-96(%r11)
727	vmovdqa	%xmm10,48(%rsp)
728	vpshufd	$0xEE,%xmm11,%xmm13
729	vmovdqu	32(%rdi),%xmm10
730	vpshufd	$0x44,%xmm11,%xmm11
731	vmovdqa	%xmm13,-80(%r11)
732	vmovdqa	%xmm11,64(%rsp)
733	vpshufd	$0xEE,%xmm12,%xmm14
734	vmovdqu	48(%rdi),%xmm11
735	vpshufd	$0x44,%xmm12,%xmm12
736	vmovdqa	%xmm14,-64(%r11)
737	vmovdqa	%xmm12,80(%rsp)
738	vpshufd	$0xEE,%xmm10,%xmm13
739	vmovdqu	64(%rdi),%xmm12
740	vpshufd	$0x44,%xmm10,%xmm10
741	vmovdqa	%xmm13,-48(%r11)
742	vmovdqa	%xmm10,96(%rsp)
743	vpshufd	$0xEE,%xmm11,%xmm14
744	vpshufd	$0x44,%xmm11,%xmm11
745	vmovdqa	%xmm14,-32(%r11)
746	vmovdqa	%xmm11,112(%rsp)
747	vpshufd	$0xEE,%xmm12,%xmm13
748	vmovdqa	0(%rsp),%xmm14
749	vpshufd	$0x44,%xmm12,%xmm12
750	vmovdqa	%xmm13,-16(%r11)
751	vmovdqa	%xmm12,128(%rsp)
752
753	jmp	.Loop_avx
754
	# Main loop: consumes 64 bytes of input per iteration, two
	# 2-block halves multiplied against the stacked r powers, with
	# lazy (deferred) carry propagation at the end of each round.
755.align	32
756.Loop_avx:
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
	# Multiply current h+message limbs (xmm5..9) by r powers; spill
	# the h limbs (xmm0..4) so their registers can be reused.
777	vpmuludq	%xmm5,%xmm14,%xmm10
778	vpmuludq	%xmm6,%xmm14,%xmm11
779	vmovdqa	%xmm2,32(%r11)
780	vpmuludq	%xmm7,%xmm14,%xmm12
781	vmovdqa	16(%rsp),%xmm2
782	vpmuludq	%xmm8,%xmm14,%xmm13
783	vpmuludq	%xmm9,%xmm14,%xmm14
784
785	vmovdqa	%xmm0,0(%r11)
786	vpmuludq	32(%rsp),%xmm9,%xmm0
787	vmovdqa	%xmm1,16(%r11)
788	vpmuludq	%xmm8,%xmm2,%xmm1
789	vpaddq	%xmm0,%xmm10,%xmm10
790	vpaddq	%xmm1,%xmm14,%xmm14
791	vmovdqa	%xmm3,48(%r11)
792	vpmuludq	%xmm7,%xmm2,%xmm0
793	vpmuludq	%xmm6,%xmm2,%xmm1
794	vpaddq	%xmm0,%xmm13,%xmm13
795	vmovdqa	48(%rsp),%xmm3
796	vpaddq	%xmm1,%xmm12,%xmm12
797	vmovdqa	%xmm4,64(%r11)
798	vpmuludq	%xmm5,%xmm2,%xmm2
799	vpmuludq	%xmm7,%xmm3,%xmm0
800	vpaddq	%xmm2,%xmm11,%xmm11
801
802	vmovdqa	64(%rsp),%xmm4
803	vpaddq	%xmm0,%xmm14,%xmm14
804	vpmuludq	%xmm6,%xmm3,%xmm1
805	vpmuludq	%xmm5,%xmm3,%xmm3
806	vpaddq	%xmm1,%xmm13,%xmm13
807	vmovdqa	80(%rsp),%xmm2
808	vpaddq	%xmm3,%xmm12,%xmm12
809	vpmuludq	%xmm9,%xmm4,%xmm0
810	vpmuludq	%xmm8,%xmm4,%xmm4
811	vpaddq	%xmm0,%xmm11,%xmm11
812	vmovdqa	96(%rsp),%xmm3
813	vpaddq	%xmm4,%xmm10,%xmm10
814
815	vmovdqa	128(%rsp),%xmm4
816	vpmuludq	%xmm6,%xmm2,%xmm1
817	vpmuludq	%xmm5,%xmm2,%xmm2
818	vpaddq	%xmm1,%xmm14,%xmm14
819	vpaddq	%xmm2,%xmm13,%xmm13
820	vpmuludq	%xmm9,%xmm3,%xmm0
821	vpmuludq	%xmm8,%xmm3,%xmm1
822	vpaddq	%xmm0,%xmm12,%xmm12
823	vmovdqu	0(%rsi),%xmm0	# start loading next 32 bytes
824	vpaddq	%xmm1,%xmm11,%xmm11
825	vpmuludq	%xmm7,%xmm3,%xmm3
826	vpmuludq	%xmm7,%xmm4,%xmm7
827	vpaddq	%xmm3,%xmm10,%xmm10
828
829	vmovdqu	16(%rsi),%xmm1
830	vpaddq	%xmm7,%xmm11,%xmm11
831	vpmuludq	%xmm8,%xmm4,%xmm8
832	vpmuludq	%xmm9,%xmm4,%xmm9
833	vpsrldq	$6,%xmm0,%xmm2
834	vpaddq	%xmm8,%xmm12,%xmm12
835	vpaddq	%xmm9,%xmm13,%xmm13
836	vpsrldq	$6,%xmm1,%xmm3
837	vpmuludq	112(%rsp),%xmm5,%xmm9
838	vpmuludq	%xmm6,%xmm4,%xmm5
839	vpunpckhqdq	%xmm1,%xmm0,%xmm4
840	vpaddq	%xmm9,%xmm14,%xmm14
841	vmovdqa	-144(%r11),%xmm9
842	vpaddq	%xmm5,%xmm10,%xmm10
843
844	vpunpcklqdq	%xmm1,%xmm0,%xmm0
845	vpunpcklqdq	%xmm3,%xmm2,%xmm3
846
	# Split the freshly loaded 32 bytes into 26-bit limbs and add
	# the previously accumulated h (spilled at (%r11)).
847
848	vpsrldq	$5,%xmm4,%xmm4
849	vpsrlq	$26,%xmm0,%xmm1
850	vpand	%xmm15,%xmm0,%xmm0
851	vpsrlq	$4,%xmm3,%xmm2
852	vpand	%xmm15,%xmm1,%xmm1
853	vpand	0(%rcx),%xmm4,%xmm4
854	vpsrlq	$30,%xmm3,%xmm3
855	vpand	%xmm15,%xmm2,%xmm2
856	vpand	%xmm15,%xmm3,%xmm3
857	vpor	32(%rcx),%xmm4,%xmm4	# pad bit
858
859	vpaddq	0(%r11),%xmm0,%xmm0
860	vpaddq	16(%r11),%xmm1,%xmm1
861	vpaddq	32(%r11),%xmm2,%xmm2
862	vpaddq	48(%r11),%xmm3,%xmm3
863	vpaddq	64(%r11),%xmm4,%xmm4
864
865	leaq	32(%rsi),%rax
866	leaq	64(%rsi),%rsi
867	subq	$64,%rdx
868	cmovcq	%rax,%rsi	# last round: step only 32 for the tail
869
	# Second half of the multiply: (h+inp) limbs times the high-lane
	# r powers spilled below r11.
870
871
872
873
874
875
876
877
878
879	vpmuludq	%xmm0,%xmm9,%xmm5
880	vpmuludq	%xmm1,%xmm9,%xmm6
881	vpaddq	%xmm5,%xmm10,%xmm10
882	vpaddq	%xmm6,%xmm11,%xmm11
883	vmovdqa	-128(%r11),%xmm7
884	vpmuludq	%xmm2,%xmm9,%xmm5
885	vpmuludq	%xmm3,%xmm9,%xmm6
886	vpaddq	%xmm5,%xmm12,%xmm12
887	vpaddq	%xmm6,%xmm13,%xmm13
888	vpmuludq	%xmm4,%xmm9,%xmm9
889	vpmuludq	-112(%r11),%xmm4,%xmm5
890	vpaddq	%xmm9,%xmm14,%xmm14
891
892	vpaddq	%xmm5,%xmm10,%xmm10
893	vpmuludq	%xmm2,%xmm7,%xmm6
894	vpmuludq	%xmm3,%xmm7,%xmm5
895	vpaddq	%xmm6,%xmm13,%xmm13
896	vmovdqa	-96(%r11),%xmm8
897	vpaddq	%xmm5,%xmm14,%xmm14
898	vpmuludq	%xmm1,%xmm7,%xmm6
899	vpmuludq	%xmm0,%xmm7,%xmm7
900	vpaddq	%xmm6,%xmm12,%xmm12
901	vpaddq	%xmm7,%xmm11,%xmm11
902
903	vmovdqa	-80(%r11),%xmm9
904	vpmuludq	%xmm2,%xmm8,%xmm5
905	vpmuludq	%xmm1,%xmm8,%xmm6
906	vpaddq	%xmm5,%xmm14,%xmm14
907	vpaddq	%xmm6,%xmm13,%xmm13
908	vmovdqa	-64(%r11),%xmm7
909	vpmuludq	%xmm0,%xmm8,%xmm8
910	vpmuludq	%xmm4,%xmm9,%xmm5
911	vpaddq	%xmm8,%xmm12,%xmm12
912	vpaddq	%xmm5,%xmm11,%xmm11
913	vmovdqa	-48(%r11),%xmm8
914	vpmuludq	%xmm3,%xmm9,%xmm9
915	vpmuludq	%xmm1,%xmm7,%xmm6
916	vpaddq	%xmm9,%xmm10,%xmm10
917
918	vmovdqa	-16(%r11),%xmm9
919	vpaddq	%xmm6,%xmm14,%xmm14
920	vpmuludq	%xmm0,%xmm7,%xmm7
921	vpmuludq	%xmm4,%xmm8,%xmm5
922	vpaddq	%xmm7,%xmm13,%xmm13
923	vpaddq	%xmm5,%xmm12,%xmm12
924	vmovdqu	32(%rsi),%xmm5	# prefetch next message half
925	vpmuludq	%xmm3,%xmm8,%xmm7
926	vpmuludq	%xmm2,%xmm8,%xmm8
927	vpaddq	%xmm7,%xmm11,%xmm11
928	vmovdqu	48(%rsi),%xmm6
929	vpaddq	%xmm8,%xmm10,%xmm10
930
931	vpmuludq	%xmm2,%xmm9,%xmm2
932	vpmuludq	%xmm3,%xmm9,%xmm3
933	vpsrldq	$6,%xmm5,%xmm7
934	vpaddq	%xmm2,%xmm11,%xmm11
935	vpmuludq	%xmm4,%xmm9,%xmm4
936	vpsrldq	$6,%xmm6,%xmm8
937	vpaddq	%xmm3,%xmm12,%xmm2
938	vpaddq	%xmm4,%xmm13,%xmm3
939	vpmuludq	-32(%r11),%xmm0,%xmm4
940	vpmuludq	%xmm1,%xmm9,%xmm0
941	vpunpckhqdq	%xmm6,%xmm5,%xmm9
942	vpaddq	%xmm4,%xmm14,%xmm4
943	vpaddq	%xmm0,%xmm10,%xmm0
944
945	vpunpcklqdq	%xmm6,%xmm5,%xmm5
946	vpunpcklqdq	%xmm8,%xmm7,%xmm8
947
	# Split the prefetched half into limbs for the next iteration.
948
949	vpsrldq	$5,%xmm9,%xmm9
950	vpsrlq	$26,%xmm5,%xmm6
951	vmovdqa	0(%rsp),%xmm14
952	vpand	%xmm15,%xmm5,%xmm5
953	vpsrlq	$4,%xmm8,%xmm7
954	vpand	%xmm15,%xmm6,%xmm6
955	vpand	0(%rcx),%xmm9,%xmm9
956	vpsrlq	$30,%xmm8,%xmm8
957	vpand	%xmm15,%xmm7,%xmm7
958	vpand	%xmm15,%xmm8,%xmm8
959	vpor	32(%rcx),%xmm9,%xmm9
960
	# Lazy carry propagation over the five limb accumulators; the
	# limb-4 overflow re-enters limb 0 multiplied by 5 (<<2 + add).
961
962
963
964
965	vpsrlq	$26,%xmm3,%xmm13
966	vpand	%xmm15,%xmm3,%xmm3
967	vpaddq	%xmm13,%xmm4,%xmm4
968
969	vpsrlq	$26,%xmm0,%xmm10
970	vpand	%xmm15,%xmm0,%xmm0
971	vpaddq	%xmm10,%xmm11,%xmm1
972
973	vpsrlq	$26,%xmm4,%xmm10
974	vpand	%xmm15,%xmm4,%xmm4
975
976	vpsrlq	$26,%xmm1,%xmm11
977	vpand	%xmm15,%xmm1,%xmm1
978	vpaddq	%xmm11,%xmm2,%xmm2
979
980	vpaddq	%xmm10,%xmm0,%xmm0
981	vpsllq	$2,%xmm10,%xmm10
982	vpaddq	%xmm10,%xmm0,%xmm0	# += overflow*5
983
984	vpsrlq	$26,%xmm2,%xmm12
985	vpand	%xmm15,%xmm2,%xmm2
986	vpaddq	%xmm12,%xmm3,%xmm3
987
988	vpsrlq	$26,%xmm0,%xmm10
989	vpand	%xmm15,%xmm0,%xmm0
990	vpaddq	%xmm10,%xmm1,%xmm1
991
992	vpsrlq	$26,%xmm3,%xmm13
993	vpand	%xmm15,%xmm3,%xmm3
994	vpaddq	%xmm13,%xmm4,%xmm4
995
996	ja	.Loop_avx
997
998.Lskip_loop_avx:
	# Tail: at most 32 bytes left.  rdx+32 == 0 means exactly one
	# 2-block group remains (short tail); nonzero means two (long).
999
1000
1001
1002	vpshufd	$0x10,%xmm14,%xmm14
1003	addq	$32,%rdx
1004	jnz	.Long_tail_avx
1005
1006	vpaddq	%xmm2,%xmm7,%xmm7
1007	vpaddq	%xmm0,%xmm5,%xmm5
1008	vpaddq	%xmm1,%xmm6,%xmm6
1009	vpaddq	%xmm3,%xmm8,%xmm8
1010	vpaddq	%xmm4,%xmm9,%xmm9
1011
1012.Long_tail_avx:
1013	vmovdqa	%xmm2,32(%r11)	# spill h while multiplying
1014	vmovdqa	%xmm0,0(%r11)
1015	vmovdqa	%xmm1,16(%r11)
1016	vmovdqa	%xmm3,48(%r11)
1017	vmovdqa	%xmm4,64(%r11)
1018
	# Multiply the tail limbs by the appropriate r powers, loaded
	# with vpshufd $0x10 from the table.
1019
1020
1021
1022
1023
1024
1025	vpmuludq	%xmm7,%xmm14,%xmm12
1026	vpmuludq	%xmm5,%xmm14,%xmm10
1027	vpshufd	$0x10,-48(%rdi),%xmm2
1028	vpmuludq	%xmm6,%xmm14,%xmm11
1029	vpmuludq	%xmm8,%xmm14,%xmm13
1030	vpmuludq	%xmm9,%xmm14,%xmm14
1031
1032	vpmuludq	%xmm8,%xmm2,%xmm0
1033	vpaddq	%xmm0,%xmm14,%xmm14
1034	vpshufd	$0x10,-32(%rdi),%xmm3
1035	vpmuludq	%xmm7,%xmm2,%xmm1
1036	vpaddq	%xmm1,%xmm13,%xmm13
1037	vpshufd	$0x10,-16(%rdi),%xmm4
1038	vpmuludq	%xmm6,%xmm2,%xmm0
1039	vpaddq	%xmm0,%xmm12,%xmm12
1040	vpmuludq	%xmm5,%xmm2,%xmm2
1041	vpaddq	%xmm2,%xmm11,%xmm11
1042	vpmuludq	%xmm9,%xmm3,%xmm3
1043	vpaddq	%xmm3,%xmm10,%xmm10
1044
1045	vpshufd	$0x10,0(%rdi),%xmm2
1046	vpmuludq	%xmm7,%xmm4,%xmm1
1047	vpaddq	%xmm1,%xmm14,%xmm14
1048	vpmuludq	%xmm6,%xmm4,%xmm0
1049	vpaddq	%xmm0,%xmm13,%xmm13
1050	vpshufd	$0x10,16(%rdi),%xmm3
1051	vpmuludq	%xmm5,%xmm4,%xmm4
1052	vpaddq	%xmm4,%xmm12,%xmm12
1053	vpmuludq	%xmm9,%xmm2,%xmm1
1054	vpaddq	%xmm1,%xmm11,%xmm11
1055	vpshufd	$0x10,32(%rdi),%xmm4
1056	vpmuludq	%xmm8,%xmm2,%xmm2
1057	vpaddq	%xmm2,%xmm10,%xmm10
1058
1059	vpmuludq	%xmm6,%xmm3,%xmm0
1060	vpaddq	%xmm0,%xmm14,%xmm14
1061	vpmuludq	%xmm5,%xmm3,%xmm3
1062	vpaddq	%xmm3,%xmm13,%xmm13
1063	vpshufd	$0x10,48(%rdi),%xmm2
1064	vpmuludq	%xmm9,%xmm4,%xmm1
1065	vpaddq	%xmm1,%xmm12,%xmm12
1066	vpshufd	$0x10,64(%rdi),%xmm3
1067	vpmuludq	%xmm8,%xmm4,%xmm0
1068	vpaddq	%xmm0,%xmm11,%xmm11
1069	vpmuludq	%xmm7,%xmm4,%xmm4
1070	vpaddq	%xmm4,%xmm10,%xmm10
1071
1072	vpmuludq	%xmm5,%xmm2,%xmm2
1073	vpaddq	%xmm2,%xmm14,%xmm14
1074	vpmuludq	%xmm9,%xmm3,%xmm1
1075	vpaddq	%xmm1,%xmm13,%xmm13
1076	vpmuludq	%xmm8,%xmm3,%xmm0
1077	vpaddq	%xmm0,%xmm12,%xmm12
1078	vpmuludq	%xmm7,%xmm3,%xmm1
1079	vpaddq	%xmm1,%xmm11,%xmm11
1080	vpmuludq	%xmm6,%xmm3,%xmm3
1081	vpaddq	%xmm3,%xmm10,%xmm10
1082
1083	jz	.Lshort_tail_avx
1084
	# Long tail: absorb and multiply the final 32 bytes with r^2/r.
1085	vmovdqu	0(%rsi),%xmm0
1086	vmovdqu	16(%rsi),%xmm1
1087
1088	vpsrldq	$6,%xmm0,%xmm2
1089	vpsrldq	$6,%xmm1,%xmm3
1090	vpunpckhqdq	%xmm1,%xmm0,%xmm4
1091	vpunpcklqdq	%xmm1,%xmm0,%xmm0
1092	vpunpcklqdq	%xmm3,%xmm2,%xmm3
1093
1094	vpsrlq	$40,%xmm4,%xmm4
1095	vpsrlq	$26,%xmm0,%xmm1
1096	vpand	%xmm15,%xmm0,%xmm0
1097	vpsrlq	$4,%xmm3,%xmm2
1098	vpand	%xmm15,%xmm1,%xmm1
1099	vpsrlq	$30,%xmm3,%xmm3
1100	vpand	%xmm15,%xmm2,%xmm2
1101	vpand	%xmm15,%xmm3,%xmm3
1102	vpor	32(%rcx),%xmm4,%xmm4
1103
1104	vpshufd	$0x32,-64(%rdi),%xmm9
1105	vpaddq	0(%r11),%xmm0,%xmm0
1106	vpaddq	16(%r11),%xmm1,%xmm1
1107	vpaddq	32(%r11),%xmm2,%xmm2
1108	vpaddq	48(%r11),%xmm3,%xmm3
1109	vpaddq	64(%r11),%xmm4,%xmm4
1110
1111
1112
1113
1114	vpmuludq	%xmm0,%xmm9,%xmm5
1115	vpaddq	%xmm5,%xmm10,%xmm10
1116	vpmuludq	%xmm1,%xmm9,%xmm6
1117	vpaddq	%xmm6,%xmm11,%xmm11
1118	vpmuludq	%xmm2,%xmm9,%xmm5
1119	vpaddq	%xmm5,%xmm12,%xmm12
1120	vpshufd	$0x32,-48(%rdi),%xmm7
1121	vpmuludq	%xmm3,%xmm9,%xmm6
1122	vpaddq	%xmm6,%xmm13,%xmm13
1123	vpmuludq	%xmm4,%xmm9,%xmm9
1124	vpaddq	%xmm9,%xmm14,%xmm14
1125
1126	vpmuludq	%xmm3,%xmm7,%xmm5
1127	vpaddq	%xmm5,%xmm14,%xmm14
1128	vpshufd	$0x32,-32(%rdi),%xmm8
1129	vpmuludq	%xmm2,%xmm7,%xmm6
1130	vpaddq	%xmm6,%xmm13,%xmm13
1131	vpshufd	$0x32,-16(%rdi),%xmm9
1132	vpmuludq	%xmm1,%xmm7,%xmm5
1133	vpaddq	%xmm5,%xmm12,%xmm12
1134	vpmuludq	%xmm0,%xmm7,%xmm7
1135	vpaddq	%xmm7,%xmm11,%xmm11
1136	vpmuludq	%xmm4,%xmm8,%xmm8
1137	vpaddq	%xmm8,%xmm10,%xmm10
1138
1139	vpshufd	$0x32,0(%rdi),%xmm7
1140	vpmuludq	%xmm2,%xmm9,%xmm6
1141	vpaddq	%xmm6,%xmm14,%xmm14
1142	vpmuludq	%xmm1,%xmm9,%xmm5
1143	vpaddq	%xmm5,%xmm13,%xmm13
1144	vpshufd	$0x32,16(%rdi),%xmm8
1145	vpmuludq	%xmm0,%xmm9,%xmm9
1146	vpaddq	%xmm9,%xmm12,%xmm12
1147	vpmuludq	%xmm4,%xmm7,%xmm6
1148	vpaddq	%xmm6,%xmm11,%xmm11
1149	vpshufd	$0x32,32(%rdi),%xmm9
1150	vpmuludq	%xmm3,%xmm7,%xmm7
1151	vpaddq	%xmm7,%xmm10,%xmm10
1152
1153	vpmuludq	%xmm1,%xmm8,%xmm5
1154	vpaddq	%xmm5,%xmm14,%xmm14
1155	vpmuludq	%xmm0,%xmm8,%xmm8
1156	vpaddq	%xmm8,%xmm13,%xmm13
1157	vpshufd	$0x32,48(%rdi),%xmm7
1158	vpmuludq	%xmm4,%xmm9,%xmm6
1159	vpaddq	%xmm6,%xmm12,%xmm12
1160	vpshufd	$0x32,64(%rdi),%xmm8
1161	vpmuludq	%xmm3,%xmm9,%xmm5
1162	vpaddq	%xmm5,%xmm11,%xmm11
1163	vpmuludq	%xmm2,%xmm9,%xmm9
1164	vpaddq	%xmm9,%xmm10,%xmm10
1165
1166	vpmuludq	%xmm0,%xmm7,%xmm7
1167	vpaddq	%xmm7,%xmm14,%xmm14
1168	vpmuludq	%xmm4,%xmm8,%xmm6
1169	vpaddq	%xmm6,%xmm13,%xmm13
1170	vpmuludq	%xmm3,%xmm8,%xmm5
1171	vpaddq	%xmm5,%xmm12,%xmm12
1172	vpmuludq	%xmm2,%xmm8,%xmm6
1173	vpaddq	%xmm6,%xmm11,%xmm11
1174	vpmuludq	%xmm1,%xmm8,%xmm8
1175	vpaddq	%xmm8,%xmm10,%xmm10
1176
1177.Lshort_tail_avx:
	# Horizontal add of the two 64-bit lanes, then a full carry pass.
1178
1179
1180
1181	vpsrldq	$8,%xmm14,%xmm9
1182	vpsrldq	$8,%xmm13,%xmm8
1183	vpsrldq	$8,%xmm11,%xmm6
1184	vpsrldq	$8,%xmm10,%xmm5
1185	vpsrldq	$8,%xmm12,%xmm7
1186	vpaddq	%xmm8,%xmm13,%xmm13
1187	vpaddq	%xmm9,%xmm14,%xmm14
1188	vpaddq	%xmm5,%xmm10,%xmm10
1189	vpaddq	%xmm6,%xmm11,%xmm11
1190	vpaddq	%xmm7,%xmm12,%xmm12
1191
	# Carry propagation (limb-4 overflow *5 back into limb 0).
1192
1193
1194
1195	vpsrlq	$26,%xmm13,%xmm3
1196	vpand	%xmm15,%xmm13,%xmm13
1197	vpaddq	%xmm3,%xmm14,%xmm14
1198
1199	vpsrlq	$26,%xmm10,%xmm0
1200	vpand	%xmm15,%xmm10,%xmm10
1201	vpaddq	%xmm0,%xmm11,%xmm11
1202
1203	vpsrlq	$26,%xmm14,%xmm4
1204	vpand	%xmm15,%xmm14,%xmm14
1205
1206	vpsrlq	$26,%xmm11,%xmm1
1207	vpand	%xmm15,%xmm11,%xmm11
1208	vpaddq	%xmm1,%xmm12,%xmm12
1209
1210	vpaddq	%xmm4,%xmm10,%xmm10
1211	vpsllq	$2,%xmm4,%xmm4
1212	vpaddq	%xmm4,%xmm10,%xmm10
1213
1214	vpsrlq	$26,%xmm12,%xmm2
1215	vpand	%xmm15,%xmm12,%xmm12
1216	vpaddq	%xmm2,%xmm13,%xmm13
1217
1218	vpsrlq	$26,%xmm10,%xmm0
1219	vpand	%xmm15,%xmm10,%xmm10
1220	vpaddq	%xmm0,%xmm11,%xmm11
1221
1222	vpsrlq	$26,%xmm13,%xmm3
1223	vpand	%xmm15,%xmm13,%xmm13
1224	vpaddq	%xmm3,%xmm14,%xmm14
1225
	# Store h as five 26-bit limbs (rdi was advanced by 112 above,
	# hence the negative offsets back to ctx+0..16).
1226	vmovd	%xmm10,-112(%rdi)
1227	vmovd	%xmm11,-108(%rdi)
1228	vmovd	%xmm12,-104(%rdi)
1229	vmovd	%xmm13,-100(%rdi)
1230	vmovd	%xmm14,-96(%rdi)
1231	leaq	88(%r11),%rsp	# unwind the 0x178-byte scratch frame
1232.cfi_def_cfa	%rsp,8
1233	vzeroupper
1234	.byte	0xf3,0xc3	# repz ret
1235.cfi_endproc
1236.size	poly1305_blocks_avx,.-poly1305_blocks_avx
1238
# void poly1305_emit_avx(void *ctx=rdi, u8 mac[16]=rsi, const u32 nonce[4]=rdx)
# Emit for the AVX paths.  If ctx+20 is 0, h is still in base 2^64 and
# this tail-jumps to the scalar .Lemit.  Otherwise it recombines the
# five 26-bit limbs into base 2^64, performs a partial reduction of
# bits >=130, then the same conditional-subtract + nonce add as
# poly1305_emit.
1239.type	poly1305_emit_avx,@function
1240.align	32
1241poly1305_emit_avx:
1242.cfi_startproc
1243.byte	243,15,30,250
1244	cmpl	$0,20(%rdi)	# still base 2^64?
1245	je	.Lemit
1246
1247	movl	0(%rdi),%eax	# five 26-bit limbs
1248	movl	4(%rdi),%ecx
1249	movl	8(%rdi),%r8d
1250	movl	12(%rdi),%r11d
1251	movl	16(%rdi),%r10d
1252
	# Recombine limbs: r8:r9 = low 128 bits, r10 = bits 128+.
1253	shlq	$26,%rcx
1254	movq	%r8,%r9
1255	shlq	$52,%r8
1256	addq	%rcx,%rax
1257	shrq	$12,%r9
1258	addq	%rax,%r8
1259	adcq	$0,%r9
1260
1261	shlq	$14,%r11
1262	movq	%r10,%rax
1263	shrq	$24,%r10
1264	addq	%r11,%r9
1265	shlq	$40,%rax
1266	addq	%rax,%r9
1267	adcq	$0,%r10
1268
	# Partial reduction: bits >=130 times 5 folded back in.
1269	movq	%r10,%rax
1270	movq	%r10,%rcx
1271	andq	$3,%r10
1272	shrq	$2,%rax
1273	andq	$-4,%rcx
1274	addq	%rcx,%rax
1275	addq	%rax,%r8
1276	adcq	$0,%r9
1277	adcq	$0,%r10
1278
	# Conditional subtract of 2^130-5 (add 5, test bit 130), as in
	# the scalar emit.
1279	movq	%r8,%rax
1280	addq	$5,%r8
1281	movq	%r9,%rcx
1282	adcq	$0,%r9
1283	adcq	$0,%r10
1284	shrq	$2,%r10
1285	cmovnzq	%r8,%rax
1286	cmovnzq	%r9,%rcx
1287
1288	addq	0(%rdx),%rax	# tag = h + nonce (mod 2^128)
1289	adcq	8(%rdx),%rcx
1290	movq	%rax,0(%rsi)
1291	movq	%rcx,8(%rsi)
1292
1293	.byte	0xf3,0xc3	# repz ret
1294.cfi_endproc
1295.size	poly1305_emit_avx,.-poly1305_emit_avx
1296.type	poly1305_blocks_avx2,@function
1297.align	32
1298poly1305_blocks_avx2:
1299.cfi_startproc
1300.byte	243,15,30,250
1301	movl	20(%rdi),%r8d
1302	cmpq	$128,%rdx
1303	jae	.Lblocks_avx2
1304	testl	%r8d,%r8d
1305	jz	.Lblocks
1306
1307.Lblocks_avx2:
1308	andq	$-16,%rdx
1309	jz	.Lno_data_avx2
1310
1311	vzeroupper
1312
1313	testl	%r8d,%r8d
1314	jz	.Lbase2_64_avx2
1315
1316	testq	$63,%rdx
1317	jz	.Leven_avx2
1318
1319	pushq	%rbx
1320.cfi_adjust_cfa_offset	8
1321.cfi_offset	%rbx,-16
1322	pushq	%rbp
1323.cfi_adjust_cfa_offset	8
1324.cfi_offset	%rbp,-24
1325	pushq	%r12
1326.cfi_adjust_cfa_offset	8
1327.cfi_offset	%r12,-32
1328	pushq	%r13
1329.cfi_adjust_cfa_offset	8
1330.cfi_offset	%r13,-40
1331	pushq	%r14
1332.cfi_adjust_cfa_offset	8
1333.cfi_offset	%r14,-48
1334	pushq	%r15
1335.cfi_adjust_cfa_offset	8
1336.cfi_offset	%r15,-56
1337.Lblocks_avx2_body:
1338
1339	movq	%rdx,%r15
1340
1341	movq	0(%rdi),%r8
1342	movq	8(%rdi),%r9
1343	movl	16(%rdi),%ebp
1344
1345	movq	24(%rdi),%r11
1346	movq	32(%rdi),%r13
1347
1348
1349	movl	%r8d,%r14d
1350	andq	$-2147483648,%r8
1351	movq	%r9,%r12
1352	movl	%r9d,%ebx
1353	andq	$-2147483648,%r9
1354
1355	shrq	$6,%r8
1356	shlq	$52,%r12
1357	addq	%r8,%r14
1358	shrq	$12,%rbx
1359	shrq	$18,%r9
1360	addq	%r12,%r14
1361	adcq	%r9,%rbx
1362
1363	movq	%rbp,%r8
1364	shlq	$40,%r8
1365	shrq	$24,%rbp
1366	addq	%r8,%rbx
1367	adcq	$0,%rbp
1368
1369	movq	$-4,%r9
1370	movq	%rbp,%r8
1371	andq	%rbp,%r9
1372	shrq	$2,%r8
1373	andq	$3,%rbp
1374	addq	%r9,%r8
1375	addq	%r8,%r14
1376	adcq	$0,%rbx
1377	adcq	$0,%rbp
1378
1379	movq	%r13,%r12
1380	movq	%r13,%rax
1381	shrq	$2,%r13
1382	addq	%r12,%r13
1383
1384.Lbase2_26_pre_avx2:
1385	addq	0(%rsi),%r14
1386	adcq	8(%rsi),%rbx
1387	leaq	16(%rsi),%rsi
1388	adcq	%rcx,%rbp
1389	subq	$16,%r15
1390
1391	call	__poly1305_block
1392	movq	%r12,%rax
1393
1394	testq	$63,%r15
1395	jnz	.Lbase2_26_pre_avx2
1396
1397	testq	%rcx,%rcx
1398	jz	.Lstore_base2_64_avx2
1399
1400
1401	movq	%r14,%rax
1402	movq	%r14,%rdx
1403	shrq	$52,%r14
1404	movq	%rbx,%r11
1405	movq	%rbx,%r12
1406	shrq	$26,%rdx
1407	andq	$0x3ffffff,%rax
1408	shlq	$12,%r11
1409	andq	$0x3ffffff,%rdx
1410	shrq	$14,%rbx
1411	orq	%r11,%r14
1412	shlq	$24,%rbp
1413	andq	$0x3ffffff,%r14
1414	shrq	$40,%r12
1415	andq	$0x3ffffff,%rbx
1416	orq	%r12,%rbp
1417
1418	testq	%r15,%r15
1419	jz	.Lstore_base2_26_avx2
1420
1421	vmovd	%eax,%xmm0
1422	vmovd	%edx,%xmm1
1423	vmovd	%r14d,%xmm2
1424	vmovd	%ebx,%xmm3
1425	vmovd	%ebp,%xmm4
1426	jmp	.Lproceed_avx2
1427
1428.align	32
1429.Lstore_base2_64_avx2:
1430	movq	%r14,0(%rdi)
1431	movq	%rbx,8(%rdi)
1432	movq	%rbp,16(%rdi)
1433	jmp	.Ldone_avx2
1434
1435.align	16
1436.Lstore_base2_26_avx2:
1437	movl	%eax,0(%rdi)
1438	movl	%edx,4(%rdi)
1439	movl	%r14d,8(%rdi)
1440	movl	%ebx,12(%rdi)
1441	movl	%ebp,16(%rdi)
1442.align	16
1443.Ldone_avx2:
1444	movq	0(%rsp),%r15
1445.cfi_restore	%r15
1446	movq	8(%rsp),%r14
1447.cfi_restore	%r14
1448	movq	16(%rsp),%r13
1449.cfi_restore	%r13
1450	movq	24(%rsp),%r12
1451.cfi_restore	%r12
1452	movq	32(%rsp),%rbp
1453.cfi_restore	%rbp
1454	movq	40(%rsp),%rbx
1455.cfi_restore	%rbx
1456	leaq	48(%rsp),%rsp
1457.cfi_adjust_cfa_offset	-48
1458.Lno_data_avx2:
1459.Lblocks_avx2_epilogue:
1460	.byte	0xf3,0xc3
1461.cfi_endproc
1462
1463.align	32
1464.Lbase2_64_avx2:
1465.cfi_startproc
1466	pushq	%rbx
1467.cfi_adjust_cfa_offset	8
1468.cfi_offset	%rbx,-16
1469	pushq	%rbp
1470.cfi_adjust_cfa_offset	8
1471.cfi_offset	%rbp,-24
1472	pushq	%r12
1473.cfi_adjust_cfa_offset	8
1474.cfi_offset	%r12,-32
1475	pushq	%r13
1476.cfi_adjust_cfa_offset	8
1477.cfi_offset	%r13,-40
1478	pushq	%r14
1479.cfi_adjust_cfa_offset	8
1480.cfi_offset	%r14,-48
1481	pushq	%r15
1482.cfi_adjust_cfa_offset	8
1483.cfi_offset	%r15,-56
1484.Lbase2_64_avx2_body:
1485
1486	movq	%rdx,%r15
1487
1488	movq	24(%rdi),%r11
1489	movq	32(%rdi),%r13
1490
1491	movq	0(%rdi),%r14
1492	movq	8(%rdi),%rbx
1493	movl	16(%rdi),%ebp
1494
1495	movq	%r13,%r12
1496	movq	%r13,%rax
1497	shrq	$2,%r13
1498	addq	%r12,%r13
1499
1500	testq	$63,%rdx
1501	jz	.Linit_avx2
1502
1503.Lbase2_64_pre_avx2:
1504	addq	0(%rsi),%r14
1505	adcq	8(%rsi),%rbx
1506	leaq	16(%rsi),%rsi
1507	adcq	%rcx,%rbp
1508	subq	$16,%r15
1509
1510	call	__poly1305_block
1511	movq	%r12,%rax
1512
1513	testq	$63,%r15
1514	jnz	.Lbase2_64_pre_avx2
1515
1516.Linit_avx2:
1517
1518	movq	%r14,%rax
1519	movq	%r14,%rdx
1520	shrq	$52,%r14
1521	movq	%rbx,%r8
1522	movq	%rbx,%r9
1523	shrq	$26,%rdx
1524	andq	$0x3ffffff,%rax
1525	shlq	$12,%r8
1526	andq	$0x3ffffff,%rdx
1527	shrq	$14,%rbx
1528	orq	%r8,%r14
1529	shlq	$24,%rbp
1530	andq	$0x3ffffff,%r14
1531	shrq	$40,%r9
1532	andq	$0x3ffffff,%rbx
1533	orq	%r9,%rbp
1534
1535	vmovd	%eax,%xmm0
1536	vmovd	%edx,%xmm1
1537	vmovd	%r14d,%xmm2
1538	vmovd	%ebx,%xmm3
1539	vmovd	%ebp,%xmm4
1540	movl	$1,20(%rdi)
1541
1542	call	__poly1305_init_avx
1543
1544.Lproceed_avx2:
1545	movq	%r15,%rdx
1546	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1547	movl	$3221291008,%r11d
1548
1549	movq	0(%rsp),%r15
1550.cfi_restore	%r15
1551	movq	8(%rsp),%r14
1552.cfi_restore	%r14
1553	movq	16(%rsp),%r13
1554.cfi_restore	%r13
1555	movq	24(%rsp),%r12
1556.cfi_restore	%r12
1557	movq	32(%rsp),%rbp
1558.cfi_restore	%rbp
1559	movq	40(%rsp),%rbx
1560.cfi_restore	%rbx
1561	leaq	48(%rsp),%rax
1562	leaq	48(%rsp),%rsp
1563.cfi_adjust_cfa_offset	-48
1564.Lbase2_64_avx2_epilogue:
1565	jmp	.Ldo_avx2
1566.cfi_endproc
1567
1568.align	32
1569.Leven_avx2:
1570.cfi_startproc
1571	movl	OPENSSL_ia32cap_P+8(%rip),%r10d
1572	vmovd	0(%rdi),%xmm0
1573	vmovd	4(%rdi),%xmm1
1574	vmovd	8(%rdi),%xmm2
1575	vmovd	12(%rdi),%xmm3
1576	vmovd	16(%rdi),%xmm4
1577
1578.Ldo_avx2:
1579	leaq	-8(%rsp),%r11
1580.cfi_def_cfa	%r11,16
1581	subq	$0x128,%rsp
1582	leaq	.Lconst(%rip),%rcx
1583	leaq	48+64(%rdi),%rdi
1584	vmovdqa	96(%rcx),%ymm7
1585
1586
1587	vmovdqu	-64(%rdi),%xmm9
1588	andq	$-512,%rsp
1589	vmovdqu	-48(%rdi),%xmm10
1590	vmovdqu	-32(%rdi),%xmm6
1591	vmovdqu	-16(%rdi),%xmm11
1592	vmovdqu	0(%rdi),%xmm12
1593	vmovdqu	16(%rdi),%xmm13
1594	leaq	144(%rsp),%rax
1595	vmovdqu	32(%rdi),%xmm14
1596	vpermd	%ymm9,%ymm7,%ymm9
1597	vmovdqu	48(%rdi),%xmm15
1598	vpermd	%ymm10,%ymm7,%ymm10
1599	vmovdqu	64(%rdi),%xmm5
1600	vpermd	%ymm6,%ymm7,%ymm6
1601	vmovdqa	%ymm9,0(%rsp)
1602	vpermd	%ymm11,%ymm7,%ymm11
1603	vmovdqa	%ymm10,32-144(%rax)
1604	vpermd	%ymm12,%ymm7,%ymm12
1605	vmovdqa	%ymm6,64-144(%rax)
1606	vpermd	%ymm13,%ymm7,%ymm13
1607	vmovdqa	%ymm11,96-144(%rax)
1608	vpermd	%ymm14,%ymm7,%ymm14
1609	vmovdqa	%ymm12,128-144(%rax)
1610	vpermd	%ymm15,%ymm7,%ymm15
1611	vmovdqa	%ymm13,160-144(%rax)
1612	vpermd	%ymm5,%ymm7,%ymm5
1613	vmovdqa	%ymm14,192-144(%rax)
1614	vmovdqa	%ymm15,224-144(%rax)
1615	vmovdqa	%ymm5,256-144(%rax)
1616	vmovdqa	64(%rcx),%ymm5
1617
1618
1619
1620	vmovdqu	0(%rsi),%xmm7
1621	vmovdqu	16(%rsi),%xmm8
1622	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1623	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1624	leaq	64(%rsi),%rsi
1625
1626	vpsrldq	$6,%ymm7,%ymm9
1627	vpsrldq	$6,%ymm8,%ymm10
1628	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1629	vpunpcklqdq	%ymm10,%ymm9,%ymm9
1630	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1631
1632	vpsrlq	$30,%ymm9,%ymm10
1633	vpsrlq	$4,%ymm9,%ymm9
1634	vpsrlq	$26,%ymm7,%ymm8
1635	vpsrlq	$40,%ymm6,%ymm6
1636	vpand	%ymm5,%ymm9,%ymm9
1637	vpand	%ymm5,%ymm7,%ymm7
1638	vpand	%ymm5,%ymm8,%ymm8
1639	vpand	%ymm5,%ymm10,%ymm10
1640	vpor	32(%rcx),%ymm6,%ymm6
1641
1642	vpaddq	%ymm2,%ymm9,%ymm2
1643	subq	$64,%rdx
1644	jz	.Ltail_avx2
1645	jmp	.Loop_avx2
1646
1647.align	32
1648.Loop_avx2:
1649
1650
1651
1652
1653
1654
1655
1656
1657	vpaddq	%ymm0,%ymm7,%ymm0
1658	vmovdqa	0(%rsp),%ymm7
1659	vpaddq	%ymm1,%ymm8,%ymm1
1660	vmovdqa	32(%rsp),%ymm8
1661	vpaddq	%ymm3,%ymm10,%ymm3
1662	vmovdqa	96(%rsp),%ymm9
1663	vpaddq	%ymm4,%ymm6,%ymm4
1664	vmovdqa	48(%rax),%ymm10
1665	vmovdqa	112(%rax),%ymm5
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682	vpmuludq	%ymm2,%ymm7,%ymm13
1683	vpmuludq	%ymm2,%ymm8,%ymm14
1684	vpmuludq	%ymm2,%ymm9,%ymm15
1685	vpmuludq	%ymm2,%ymm10,%ymm11
1686	vpmuludq	%ymm2,%ymm5,%ymm12
1687
1688	vpmuludq	%ymm0,%ymm8,%ymm6
1689	vpmuludq	%ymm1,%ymm8,%ymm2
1690	vpaddq	%ymm6,%ymm12,%ymm12
1691	vpaddq	%ymm2,%ymm13,%ymm13
1692	vpmuludq	%ymm3,%ymm8,%ymm6
1693	vpmuludq	64(%rsp),%ymm4,%ymm2
1694	vpaddq	%ymm6,%ymm15,%ymm15
1695	vpaddq	%ymm2,%ymm11,%ymm11
1696	vmovdqa	-16(%rax),%ymm8
1697
1698	vpmuludq	%ymm0,%ymm7,%ymm6
1699	vpmuludq	%ymm1,%ymm7,%ymm2
1700	vpaddq	%ymm6,%ymm11,%ymm11
1701	vpaddq	%ymm2,%ymm12,%ymm12
1702	vpmuludq	%ymm3,%ymm7,%ymm6
1703	vpmuludq	%ymm4,%ymm7,%ymm2
1704	vmovdqu	0(%rsi),%xmm7
1705	vpaddq	%ymm6,%ymm14,%ymm14
1706	vpaddq	%ymm2,%ymm15,%ymm15
1707	vinserti128	$1,32(%rsi),%ymm7,%ymm7
1708
1709	vpmuludq	%ymm3,%ymm8,%ymm6
1710	vpmuludq	%ymm4,%ymm8,%ymm2
1711	vmovdqu	16(%rsi),%xmm8
1712	vpaddq	%ymm6,%ymm11,%ymm11
1713	vpaddq	%ymm2,%ymm12,%ymm12
1714	vmovdqa	16(%rax),%ymm2
1715	vpmuludq	%ymm1,%ymm9,%ymm6
1716	vpmuludq	%ymm0,%ymm9,%ymm9
1717	vpaddq	%ymm6,%ymm14,%ymm14
1718	vpaddq	%ymm9,%ymm13,%ymm13
1719	vinserti128	$1,48(%rsi),%ymm8,%ymm8
1720	leaq	64(%rsi),%rsi
1721
1722	vpmuludq	%ymm1,%ymm2,%ymm6
1723	vpmuludq	%ymm0,%ymm2,%ymm2
1724	vpsrldq	$6,%ymm7,%ymm9
1725	vpaddq	%ymm6,%ymm15,%ymm15
1726	vpaddq	%ymm2,%ymm14,%ymm14
1727	vpmuludq	%ymm3,%ymm10,%ymm6
1728	vpmuludq	%ymm4,%ymm10,%ymm2
1729	vpsrldq	$6,%ymm8,%ymm10
1730	vpaddq	%ymm6,%ymm12,%ymm12
1731	vpaddq	%ymm2,%ymm13,%ymm13
1732	vpunpckhqdq	%ymm8,%ymm7,%ymm6
1733
1734	vpmuludq	%ymm3,%ymm5,%ymm3
1735	vpmuludq	%ymm4,%ymm5,%ymm4
1736	vpunpcklqdq	%ymm8,%ymm7,%ymm7
1737	vpaddq	%ymm3,%ymm13,%ymm2
1738	vpaddq	%ymm4,%ymm14,%ymm3
1739	vpunpcklqdq	%ymm10,%ymm9,%ymm10
1740	vpmuludq	80(%rax),%ymm0,%ymm4
1741	vpmuludq	%ymm1,%ymm5,%ymm0
1742	vmovdqa	64(%rcx),%ymm5
1743	vpaddq	%ymm4,%ymm15,%ymm4
1744	vpaddq	%ymm0,%ymm11,%ymm0
1745
1746
1747
1748
1749	vpsrlq	$26,%ymm3,%ymm14
1750	vpand	%ymm5,%ymm3,%ymm3
1751	vpaddq	%ymm14,%ymm4,%ymm4
1752
1753	vpsrlq	$26,%ymm0,%ymm11
1754	vpand	%ymm5,%ymm0,%ymm0
1755	vpaddq	%ymm11,%ymm12,%ymm1
1756
1757	vpsrlq	$26,%ymm4,%ymm15
1758	vpand	%ymm5,%ymm4,%ymm4
1759
1760	vpsrlq	$4,%ymm10,%ymm9
1761
1762	vpsrlq	$26,%ymm1,%ymm12
1763	vpand	%ymm5,%ymm1,%ymm1
1764	vpaddq	%ymm12,%ymm2,%ymm2
1765
1766	vpaddq	%ymm15,%ymm0,%ymm0
1767	vpsllq	$2,%ymm15,%ymm15
1768	vpaddq	%ymm15,%ymm0,%ymm0
1769
1770	vpand	%ymm5,%ymm9,%ymm9
1771	vpsrlq	$26,%ymm7,%ymm8
1772
1773	vpsrlq	$26,%ymm2,%ymm13
1774	vpand	%ymm5,%ymm2,%ymm2
1775	vpaddq	%ymm13,%ymm3,%ymm3
1776
1777	vpaddq	%ymm9,%ymm2,%ymm2
1778	vpsrlq	$30,%ymm10,%ymm10
1779
1780	vpsrlq	$26,%ymm0,%ymm11
1781	vpand	%ymm5,%ymm0,%ymm0
1782	vpaddq	%ymm11,%ymm1,%ymm1
1783
1784	vpsrlq	$40,%ymm6,%ymm6
1785
1786	vpsrlq	$26,%ymm3,%ymm14
1787	vpand	%ymm5,%ymm3,%ymm3
1788	vpaddq	%ymm14,%ymm4,%ymm4
1789
1790	vpand	%ymm5,%ymm7,%ymm7
1791	vpand	%ymm5,%ymm8,%ymm8
1792	vpand	%ymm5,%ymm10,%ymm10
1793	vpor	32(%rcx),%ymm6,%ymm6
1794
1795	subq	$64,%rdx
1796	jnz	.Loop_avx2
1797
1798.byte	0x66,0x90
1799.Ltail_avx2:
1800
1801
1802
1803
1804
1805
1806
1807	vpaddq	%ymm0,%ymm7,%ymm0
1808	vmovdqu	4(%rsp),%ymm7
1809	vpaddq	%ymm1,%ymm8,%ymm1
1810	vmovdqu	36(%rsp),%ymm8
1811	vpaddq	%ymm3,%ymm10,%ymm3
1812	vmovdqu	100(%rsp),%ymm9
1813	vpaddq	%ymm4,%ymm6,%ymm4
1814	vmovdqu	52(%rax),%ymm10
1815	vmovdqu	116(%rax),%ymm5
1816
1817	vpmuludq	%ymm2,%ymm7,%ymm13
1818	vpmuludq	%ymm2,%ymm8,%ymm14
1819	vpmuludq	%ymm2,%ymm9,%ymm15
1820	vpmuludq	%ymm2,%ymm10,%ymm11
1821	vpmuludq	%ymm2,%ymm5,%ymm12
1822
1823	vpmuludq	%ymm0,%ymm8,%ymm6
1824	vpmuludq	%ymm1,%ymm8,%ymm2
1825	vpaddq	%ymm6,%ymm12,%ymm12
1826	vpaddq	%ymm2,%ymm13,%ymm13
1827	vpmuludq	%ymm3,%ymm8,%ymm6
1828	vpmuludq	68(%rsp),%ymm4,%ymm2
1829	vpaddq	%ymm6,%ymm15,%ymm15
1830	vpaddq	%ymm2,%ymm11,%ymm11
1831
1832	vpmuludq	%ymm0,%ymm7,%ymm6
1833	vpmuludq	%ymm1,%ymm7,%ymm2
1834	vpaddq	%ymm6,%ymm11,%ymm11
1835	vmovdqu	-12(%rax),%ymm8
1836	vpaddq	%ymm2,%ymm12,%ymm12
1837	vpmuludq	%ymm3,%ymm7,%ymm6
1838	vpmuludq	%ymm4,%ymm7,%ymm2
1839	vpaddq	%ymm6,%ymm14,%ymm14
1840	vpaddq	%ymm2,%ymm15,%ymm15
1841
1842	vpmuludq	%ymm3,%ymm8,%ymm6
1843	vpmuludq	%ymm4,%ymm8,%ymm2
1844	vpaddq	%ymm6,%ymm11,%ymm11
1845	vpaddq	%ymm2,%ymm12,%ymm12
1846	vmovdqu	20(%rax),%ymm2
1847	vpmuludq	%ymm1,%ymm9,%ymm6
1848	vpmuludq	%ymm0,%ymm9,%ymm9
1849	vpaddq	%ymm6,%ymm14,%ymm14
1850	vpaddq	%ymm9,%ymm13,%ymm13
1851
1852	vpmuludq	%ymm1,%ymm2,%ymm6
1853	vpmuludq	%ymm0,%ymm2,%ymm2
1854	vpaddq	%ymm6,%ymm15,%ymm15
1855	vpaddq	%ymm2,%ymm14,%ymm14
1856	vpmuludq	%ymm3,%ymm10,%ymm6
1857	vpmuludq	%ymm4,%ymm10,%ymm2
1858	vpaddq	%ymm6,%ymm12,%ymm12
1859	vpaddq	%ymm2,%ymm13,%ymm13
1860
1861	vpmuludq	%ymm3,%ymm5,%ymm3
1862	vpmuludq	%ymm4,%ymm5,%ymm4
1863	vpaddq	%ymm3,%ymm13,%ymm2
1864	vpaddq	%ymm4,%ymm14,%ymm3
1865	vpmuludq	84(%rax),%ymm0,%ymm4
1866	vpmuludq	%ymm1,%ymm5,%ymm0
1867	vmovdqa	64(%rcx),%ymm5
1868	vpaddq	%ymm4,%ymm15,%ymm4
1869	vpaddq	%ymm0,%ymm11,%ymm0
1870
1871
1872
1873
1874	vpsrldq	$8,%ymm12,%ymm8
1875	vpsrldq	$8,%ymm2,%ymm9
1876	vpsrldq	$8,%ymm3,%ymm10
1877	vpsrldq	$8,%ymm4,%ymm6
1878	vpsrldq	$8,%ymm0,%ymm7
1879	vpaddq	%ymm8,%ymm12,%ymm12
1880	vpaddq	%ymm9,%ymm2,%ymm2
1881	vpaddq	%ymm10,%ymm3,%ymm3
1882	vpaddq	%ymm6,%ymm4,%ymm4
1883	vpaddq	%ymm7,%ymm0,%ymm0
1884
1885	vpermq	$0x2,%ymm3,%ymm10
1886	vpermq	$0x2,%ymm4,%ymm6
1887	vpermq	$0x2,%ymm0,%ymm7
1888	vpermq	$0x2,%ymm12,%ymm8
1889	vpermq	$0x2,%ymm2,%ymm9
1890	vpaddq	%ymm10,%ymm3,%ymm3
1891	vpaddq	%ymm6,%ymm4,%ymm4
1892	vpaddq	%ymm7,%ymm0,%ymm0
1893	vpaddq	%ymm8,%ymm12,%ymm12
1894	vpaddq	%ymm9,%ymm2,%ymm2
1895
1896
1897
1898
1899	vpsrlq	$26,%ymm3,%ymm14
1900	vpand	%ymm5,%ymm3,%ymm3
1901	vpaddq	%ymm14,%ymm4,%ymm4
1902
1903	vpsrlq	$26,%ymm0,%ymm11
1904	vpand	%ymm5,%ymm0,%ymm0
1905	vpaddq	%ymm11,%ymm12,%ymm1
1906
1907	vpsrlq	$26,%ymm4,%ymm15
1908	vpand	%ymm5,%ymm4,%ymm4
1909
1910	vpsrlq	$26,%ymm1,%ymm12
1911	vpand	%ymm5,%ymm1,%ymm1
1912	vpaddq	%ymm12,%ymm2,%ymm2
1913
1914	vpaddq	%ymm15,%ymm0,%ymm0
1915	vpsllq	$2,%ymm15,%ymm15
1916	vpaddq	%ymm15,%ymm0,%ymm0
1917
1918	vpsrlq	$26,%ymm2,%ymm13
1919	vpand	%ymm5,%ymm2,%ymm2
1920	vpaddq	%ymm13,%ymm3,%ymm3
1921
1922	vpsrlq	$26,%ymm0,%ymm11
1923	vpand	%ymm5,%ymm0,%ymm0
1924	vpaddq	%ymm11,%ymm1,%ymm1
1925
1926	vpsrlq	$26,%ymm3,%ymm14
1927	vpand	%ymm5,%ymm3,%ymm3
1928	vpaddq	%ymm14,%ymm4,%ymm4
1929
1930	vmovd	%xmm0,-112(%rdi)
1931	vmovd	%xmm1,-108(%rdi)
1932	vmovd	%xmm2,-104(%rdi)
1933	vmovd	%xmm3,-100(%rdi)
1934	vmovd	%xmm4,-96(%rdi)
1935	leaq	8(%r11),%rsp
1936.cfi_def_cfa	%rsp,8
1937	vzeroupper
1938	.byte	0xf3,0xc3
1939.cfi_endproc
1940.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
.section	.rodata
.align	64
# Shared constant pool for the vectorized Poly1305 paths.  The AVX2 code
# above addresses it as fixed offsets from %rcx after
# "leaq .Lconst(%rip),%rcx": +32 = .L129, +64 = .Lmask26, +96 = .Lpermd_avx2.
.Lconst:
# +0: 2^24-1 in the low half of each 64-bit lane.
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
# +32: 16777216 = 2^24 per 64-bit lane; ORed into the top base-2^26 limb of
# each 16-byte message block ("vpor 32(%rcx),%ymm6,%ymm6" in the AVX2 loop)
# to implement the Poly1305 high pad bit.
.L129:
.long	16777216,0,16777216,0,16777216,0,16777216,0
# +64: 2^26-1 per 64-bit lane — the limb mask for the base-2^26
# lazy-reduction steps ("vmovdqa 64(%rcx),%ymm5" then vpand).
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
# +96: vpermd lane selector used when loading the precomputed key-power
# table in the AVX2 path ("vmovdqa 96(%rcx),%ymm7" then vpermd).
.Lpermd_avx2:
.long	2,2,2,3,2,0,2,1
# vpermd selector for the AVX-512 path (that code is outside this chunk).
.Lpermd_avx512:
.long	0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7

# The .L2_44_* constants below support a base-2^44 representation
# (44+44+42-bit limbs, per the masks).  The code consuming them is not
# visible in this chunk — presumably the emit/IFMA paths; confirm against
# the generating poly1305-x86_64.pl.
.L2_44_inp_permd:
.long	0,1,1,2,2,3,7,7
.L2_44_inp_shift:
.quad	0,12,24,64
.L2_44_mask:
.quad	0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff
.L2_44_shift_rgt:
.quad	44,44,42,64
.L2_44_shift_lft:
.quad	8,8,10,64

.align	64
# 2^44-1 broadcast across 8 lanes.
.Lx_mask44:
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
.quad	0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff
# 2^42-1 broadcast across 8 lanes.
.Lx_mask42:
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.quad	0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff
.previous
# ASCII banner: "Poly1305 for x86_64, CRYPTOGAMS by <appro@openssl.org>\0"
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
1975.align	16
#-----------------------------------------------------------------------
# xor128_encrypt_n_pad
# ABI: SysV AMD64.  Register roles (from usage; confirm the C prototype
# against the caller — presumably out=rdi, inp=rsi, otp=rdx, len=rcx):
#   rdi = output buffer, rsi = input, rdx = pad/scratch buffer that is
#   both read (keystream) and overwritten, rcx = len in bytes.
# Computes out[i] = inp[i] ^ pad[i] for i < len, writes the resulting
# ciphertext back into the pad buffer as well, then zero-pads the pad
# buffer up to the next 16-byte boundary.  Returns (rax) the advanced
# pad pointer.  The pad buffer must be 16-byte aligned (movdqa/pxor-mem).
# NOTE(review): len == 0 reaches .Ltail_enc with r10 == 0 and would
# underflow the byte-loop counter; callers must pass len >= 1 — confirm.
# Clobbers: rax, rcx, rdx, rsi, rdi, r10, xmm0, flags.
#-----------------------------------------------------------------------
.globl	xor128_encrypt_n_pad
.type	xor128_encrypt_n_pad,@function
.align	16
xor128_encrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		# bias: (rsi + rdx) = current input byte
	subq	%rdx,%rdi		# bias: (rdi + rdx) = current output byte
	movq	%rcx,%r10		# r10 = len, kept for the tail (len & 15)
	shrq	$4,%rcx			# rcx = count of whole 16-byte blocks
	jz	.Ltail_enc
	nop
.Loop_enc_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0	# load 16 input bytes (unaligned OK)
	pxor	(%rdx),%xmm0		# xor with aligned pad block
	movdqu	%xmm0,(%rdi,%rdx,1)	# store ciphertext to output
	movdqa	%xmm0,(%rdx)		# overwrite the pad with the ciphertext
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_xmm

	andq	$15,%r10		# leftover byte count
	jz	.Ldone_enc		# exact multiple of 16: no tail, no padding

.Ltail_enc:
	movq	$16,%rcx
	subq	%r10,%rcx		# rcx = 16 - (len & 15) = zero-pad count
	xorl	%eax,%eax
.Loop_enc_byte:
	movb	(%rsi,%rdx,1),%al	# al = input byte
	xorb	(%rdx),%al		# xor with pad byte
	movb	%al,(%rdi,%rdx,1)	# store ciphertext byte to output
	movb	%al,(%rdx)		# and back into the pad buffer
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_enc_byte

	xorl	%eax,%eax
.Loop_enc_pad:
	movb	%al,(%rdx)		# zero-fill pad buffer to a 16-byte boundary
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_enc_pad

.Ldone_enc:
	movq	%rdx,%rax		# return the advanced pad pointer
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
2024
#-----------------------------------------------------------------------
# xor128_decrypt_n_pad
# ABI: SysV AMD64.  Register roles mirror xor128_encrypt_n_pad (from
# usage; confirm the C prototype against the caller):
#   rdi = output (plaintext), rsi = input (ciphertext), rdx = pad/scratch
#   buffer, rcx = len in bytes.
# Computes out[i] = inp[i] ^ pad[i] for i < len; unlike the encrypt
# variant, it writes the *ciphertext* (input) back into the pad buffer,
# then zero-pads that buffer up to the next 16-byte boundary.  Returns
# (rax) the advanced pad pointer.  Pad buffer must be 16-byte aligned
# (movdqa).
# NOTE(review): len == 0 reaches .Ltail_dec with r10 == 0 and would
# underflow the byte-loop counter; callers must pass len >= 1 — confirm.
# Clobbers: rax, rcx, rdx, rsi, rdi, r10, r11, xmm0, xmm1, flags.
#-----------------------------------------------------------------------
.globl	xor128_decrypt_n_pad
.type	xor128_decrypt_n_pad,@function
.align	16
xor128_decrypt_n_pad:
.cfi_startproc
	subq	%rdx,%rsi		# bias: (rsi + rdx) = current input byte
	subq	%rdx,%rdi		# bias: (rdi + rdx) = current output byte
	movq	%rcx,%r10		# r10 = len, kept for the tail (len & 15)
	shrq	$4,%rcx			# rcx = count of whole 16-byte blocks
	jz	.Ltail_dec
	nop
.Loop_dec_xmm:
	movdqu	(%rsi,%rdx,1),%xmm0	# xmm0 = 16 ciphertext bytes (unaligned OK)
	movdqa	(%rdx),%xmm1		# xmm1 = pad block (aligned)
	pxor	%xmm0,%xmm1		# xmm1 = plaintext
	movdqu	%xmm1,(%rdi,%rdx,1)	# store plaintext to output
	movdqa	%xmm0,(%rdx)		# overwrite the pad with the ciphertext
	leaq	16(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_xmm

	pxor	%xmm1,%xmm1		# scrub plaintext remnants from xmm1
	andq	$15,%r10		# leftover byte count
	jz	.Ldone_dec		# exact multiple of 16: no tail, no padding

.Ltail_dec:
	movq	$16,%rcx
	subq	%r10,%rcx		# rcx = 16 - (len & 15) = zero-pad count
	xorl	%eax,%eax
	xorq	%r11,%r11
.Loop_dec_byte:
	movb	(%rsi,%rdx,1),%r11b	# r11b = ciphertext byte
	movb	(%rdx),%al		# al = pad byte
	xorb	%r11b,%al		# al = plaintext byte
	movb	%al,(%rdi,%rdx,1)	# store plaintext to output
	movb	%r11b,(%rdx)		# ciphertext back into the pad buffer
	leaq	1(%rdx),%rdx
	decq	%r10
	jnz	.Loop_dec_byte

	xorl	%eax,%eax
.Loop_dec_pad:
	movb	%al,(%rdx)		# zero-fill pad buffer to a 16-byte boundary
	leaq	1(%rdx),%rdx
	decq	%rcx
	jnz	.Loop_dec_pad

.Ldone_dec:
	movq	%rdx,%rax		# return the advanced pad pointer
	.byte	0xf3,0xc3		# rep ret
.cfi_endproc
.size	xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
	# GNU property note marking this object as CET-compatible, matching
	# the endbr64 (.byte 243,15,30,250) emitted at function entries.
	.section ".note.gnu.property", "a"
	.p2align 3
	# ELF note header: n_namesz, n_descsz, n_type
	# (5 = NT_GNU_PROPERTY_TYPE_0).
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	# Property type 0xc0000002 = GNU_PROPERTY_X86_FEATURE_1_AND,
	# followed by its payload size.
	.long 0xc0000002
	.long 3f - 2f
2:
	# Payload 3 = bit0 | bit1 = IBT | SHSTK.
	.long 3
3:
	.p2align 3
4:
2098