/* xref: /freebsd/sys/crypto/openssl/amd64/chacha-x86_64.S (revision edf8578117e8844e02c0121147f45e4609b30680) */
/* Do not modify. This file is auto-generated from chacha-x86_64.pl. */
.text


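/*
 * Constant pool. .Lsigma is the ChaCha20 constant "expand 32-byte k"
 * (followed by the CRYPTOGAMS attribution string); .Lrot16 and .Lrot24
 * are pshufb masks rotating each 32-bit lane left by 16 and by 8 bits
 * (the latter equivalently a right rotation by 24, hence the label).
 * The remaining vectors increment the block counter across 1, 4 and 8
 * parallel lanes; a few of them appear unused by the code paths in
 * this particular build.
 */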
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
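/*
 * void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
 *                     size_t len, const unsigned int key[8],
 *                     const unsigned int counter[4]);
 * out=%rdi, inp=%rsi, len=%rdx, key=%rcx, counter=%r8 (SysV ABI).
 * The entry point dispatches on OPENSSL_ia32cap_P: mask $512 tests the
 * SSSE3 feature bit; without it, the scalar integer path below is used.
 */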
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

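/*
 * Scalar round loop: each pass performs one column round and one
 * diagonal round, i.e. two sets of four quarter-rounds
 * (a += b; d ^= a; d <<<= 16; c += d; b ^= c; b <<<= 12;
 *  a += b; d ^= a; d <<<= 8;  c += d; b ^= c; b <<<= 7),
 * so ten passes give ChaCha20's 20 rounds. With too few general
 * registers for all 16 state words, x8/x9 live in %esi/%edi (seeded
 * from %xmm2 by the raw-encoded movq %xmm2,%rsi above) while x10/x11
 * are swapped through 32(%rsp)-44(%rsp) between half-rounds.
 */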
.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

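/*
 * Partial final block: the keystream words are spilled to the stack
 * and XORed into the output one byte at a time.
 */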
.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	testl	$2048,%r10d
	jnz	.LChaCha20_4xop
	cmpq	$128,%rdx
	je	.LChaCha20_128
	ja	.LChaCha20_4x

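/*
 * Dispatch above: mask $2048 is the capability bit OpenSSL repurposes
 * for AMD XOP, routing to ChaCha20_4xop; exactly 128 bytes goes to
 * ChaCha20_128 and anything longer to ChaCha20_4x. What follows is the
 * one-block (64 bytes per iteration) SSSE3 loop for short inputs.
 */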
.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

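/*
 * The .byte sequences below are pshufb instructions emitted as raw
 * opcodes by the perlasm generator for the benefit of older
 * assemblers: 102,15,56,0,222 is pshufb %xmm6,%xmm3 (the <<<16 mask)
 * and 102,15,56,0,223 is pshufb %xmm7,%xmm3 (the <<<8 mask). The
 * rotations by 12 and 7 are not byte-aligned and use pslld/psrld/por.
 */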
.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
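/*
 * ChaCha20_128: fixed-size variant for inputs of exactly two 64-byte
 * blocks, keeping both block states in registers and interleaving
 * their quarter-rounds so the two dependency chains overlap.
 */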
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_128

.align	32
.Loop_128:
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
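/*
 * ChaCha20_4x: four blocks in flight. The state is transposed so each
 * xmm register holds one state word across four independent blocks
 * (hence the pshufd broadcasts at entry and the punpck transposition
 * after the rounds), yielding 256 bytes of keystream per iteration.
 * Bit 5 of the third capability word (mask $32 after the shrq) selects
 * the AVX2 ChaCha20_8x instead; the $71303168/$4194304 test appears to
 * detect MOVBE-without-XSAVE (Silvermont-class Atom) cores, which go
 * back to the one-block loop for inputs of 192 bytes or less.
 */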
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
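/*
 * ChaCha20_4xop: the same four-way layout as ChaCha20_4x, but on AMD
 * XOP hardware every rotation is a single vprotd, emitted as raw
 * opcode bytes (143,232,120,194,modrm,imm); the two-shift-plus-or
 * sequences and the pshufb masks disappear.
 */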
.type	ChaCha20_4xop,@function
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x140+8,%rsp
	vzeroupper

	vmovdqa	.Lsigma(%rip),%xmm11
	vmovdqu	(%rcx),%xmm3
	vmovdqu	16(%rcx),%xmm15
	vmovdqu	(%r8),%xmm7
	leaq	256(%rsp),%rcx

	vpshufd	$0x00,%xmm11,%xmm8
	vpshufd	$0x55,%xmm11,%xmm9
	vmovdqa	%xmm8,64(%rsp)
	vpshufd	$0xaa,%xmm11,%xmm10
	vmovdqa	%xmm9,80(%rsp)
	vpshufd	$0xff,%xmm11,%xmm11
	vmovdqa	%xmm10,96(%rsp)
	vmovdqa	%xmm11,112(%rsp)

	vpshufd	$0x00,%xmm3,%xmm0
	vpshufd	$0x55,%xmm3,%xmm1
	vmovdqa	%xmm0,128-256(%rcx)
	vpshufd	$0xaa,%xmm3,%xmm2
	vmovdqa	%xmm1,144-256(%rcx)
	vpshufd	$0xff,%xmm3,%xmm3
	vmovdqa	%xmm2,160-256(%rcx)
	vmovdqa	%xmm3,176-256(%rcx)

	vpshufd	$0x00,%xmm15,%xmm12
	vpshufd	$0x55,%xmm15,%xmm13
	vmovdqa	%xmm12,192-256(%rcx)
	vpshufd	$0xaa,%xmm15,%xmm14
	vmovdqa	%xmm13,208-256(%rcx)
	vpshufd	$0xff,%xmm15,%xmm15
	vmovdqa	%xmm14,224-256(%rcx)
	vmovdqa	%xmm15,240-256(%rcx)

	vpshufd	$0x00,%xmm7,%xmm4
	vpshufd	$0x55,%xmm7,%xmm5
	vpaddd	.Linc(%rip),%xmm4,%xmm4
	vpshufd	$0xaa,%xmm7,%xmm6
	vmovdqa	%xmm5,272-256(%rcx)
	vpshufd	$0xff,%xmm7,%xmm7
	vmovdqa	%xmm6,288-256(%rcx)
	vmovdqa	%xmm7,304-256(%rcx)

	jmp	.Loop_enter4xop

.align	32
.Loop_outer4xop:
	vmovdqa	64(%rsp),%xmm8
	vmovdqa	80(%rsp),%xmm9
	vmovdqa	96(%rsp),%xmm10
	vmovdqa	112(%rsp),%xmm11
	vmovdqa	128-256(%rcx),%xmm0
	vmovdqa	144-256(%rcx),%xmm1
	vmovdqa	160-256(%rcx),%xmm2
	vmovdqa	176-256(%rcx),%xmm3
	vmovdqa	192-256(%rcx),%xmm12
	vmovdqa	208-256(%rcx),%xmm13
	vmovdqa	224-256(%rcx),%xmm14
	vmovdqa	240-256(%rcx),%xmm15
	vmovdqa	256-256(%rcx),%xmm4
	vmovdqa	272-256(%rcx),%xmm5
	vmovdqa	288-256(%rcx),%xmm6
	vmovdqa	304-256(%rcx),%xmm7
	vpaddd	.Lfour(%rip),%xmm4,%xmm4

.Loop_enter4xop:
	movl	$10,%eax
	vmovdqa	%xmm4,256-256(%rcx)
	jmp	.Loop4xop

.align	32
.Loop4xop:
	vpaddd	%xmm0,%xmm8,%xmm8
	vpaddd	%xmm1,%xmm9,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
.byte	143,232,120,194,255,16
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,12
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
	vpaddd	%xmm8,%xmm0,%xmm8
	vpaddd	%xmm9,%xmm1,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
.byte	143,232,120,194,255,8
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,7
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
	vpaddd	%xmm1,%xmm8,%xmm8
	vpaddd	%xmm2,%xmm9,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,16
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
.byte	143,232,120,194,192,12
	vpaddd	%xmm8,%xmm1,%xmm8
	vpaddd	%xmm9,%xmm2,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,8
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
.byte	143,232,120,194,192,7
	decl	%eax
	jnz	.Loop4xop

	vpaddd	64(%rsp),%xmm8,%xmm8
	vpaddd	80(%rsp),%xmm9,%xmm9
	vpaddd	96(%rsp),%xmm10,%xmm10
	vpaddd	112(%rsp),%xmm11,%xmm11

	vmovdqa	%xmm14,32(%rsp)
	vmovdqa	%xmm15,48(%rsp)

	vpunpckldq	%xmm9,%xmm8,%xmm14
	vpunpckldq	%xmm11,%xmm10,%xmm15
	vpunpckhdq	%xmm9,%xmm8,%xmm8
	vpunpckhdq	%xmm11,%xmm10,%xmm10
	vpunpcklqdq	%xmm15,%xmm14,%xmm9
	vpunpckhqdq	%xmm15,%xmm14,%xmm14
	vpunpcklqdq	%xmm10,%xmm8,%xmm11
	vpunpckhqdq	%xmm10,%xmm8,%xmm8
	vpaddd	128-256(%rcx),%xmm0,%xmm0
	vpaddd	144-256(%rcx),%xmm1,%xmm1
	vpaddd	160-256(%rcx),%xmm2,%xmm2
	vpaddd	176-256(%rcx),%xmm3,%xmm3

	vmovdqa	%xmm9,0(%rsp)
	vmovdqa	%xmm14,16(%rsp)
	vmovdqa	32(%rsp),%xmm9
	vmovdqa	48(%rsp),%xmm14

	vpunpckldq	%xmm1,%xmm0,%xmm10
	vpunpckldq	%xmm3,%xmm2,%xmm15
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm15,%xmm10,%xmm1
	vpunpckhqdq	%xmm15,%xmm10,%xmm10
	vpunpcklqdq	%xmm2,%xmm0,%xmm3
	vpunpckhqdq	%xmm2,%xmm0,%xmm0
	vpaddd	192-256(%rcx),%xmm12,%xmm12
	vpaddd	208-256(%rcx),%xmm13,%xmm13
	vpaddd	224-256(%rcx),%xmm9,%xmm9
	vpaddd	240-256(%rcx),%xmm14,%xmm14

	vpunpckldq	%xmm13,%xmm12,%xmm2
	vpunpckldq	%xmm14,%xmm9,%xmm15
	vpunpckhdq	%xmm13,%xmm12,%xmm12
	vpunpckhdq	%xmm14,%xmm9,%xmm9
	vpunpcklqdq	%xmm15,%xmm2,%xmm13
	vpunpckhqdq	%xmm15,%xmm2,%xmm2
	vpunpcklqdq	%xmm9,%xmm12,%xmm14
	vpunpckhqdq	%xmm9,%xmm12,%xmm12
	vpaddd	256-256(%rcx),%xmm4,%xmm4
	vpaddd	272-256(%rcx),%xmm5,%xmm5
	vpaddd	288-256(%rcx),%xmm6,%xmm6
	vpaddd	304-256(%rcx),%xmm7,%xmm7

	vpunpckldq	%xmm5,%xmm4,%xmm9
	vpunpckldq	%xmm7,%xmm6,%xmm15
	vpunpckhdq	%xmm5,%xmm4,%xmm4
	vpunpckhdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm15,%xmm9,%xmm5
	vpunpckhqdq	%xmm15,%xmm9,%xmm9
	vpunpcklqdq	%xmm6,%xmm4,%xmm7
	vpunpckhqdq	%xmm6,%xmm4,%xmm4
	vmovdqa	0(%rsp),%xmm6
	vmovdqa	16(%rsp),%xmm15

	cmpq	$256,%rdx
	jb	.Ltail4xop

	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7
	vpxor	64(%rsi),%xmm8,%xmm8
	vpxor	80(%rsi),%xmm0,%xmm0
	vpxor	96(%rsi),%xmm12,%xmm12
	vpxor	112(%rsi),%xmm4,%xmm4
	leaq	128(%rsi),%rsi

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	vmovdqu	%xmm8,64(%rdi)
	vmovdqu	%xmm0,80(%rdi)
	vmovdqu	%xmm12,96(%rdi)
	vmovdqu	%xmm4,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4xop

	jmp	.Ldone4xop

.align	32
.Ltail4xop:
	cmpq	$192,%rdx
	jae	.L192_or_more4xop
	cmpq	$128,%rdx
	jae	.L128_or_more4xop
	cmpq	$64,%rdx
	jae	.L64_or_more4xop

	xorq	%r10,%r10
	vmovdqa	%xmm6,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm13,32(%rsp)
	vmovdqa	%xmm5,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L64_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm15,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm10,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm2,32(%rsp)
	subq	$64,%rdx
	vmovdqa	%xmm9,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L128_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	je	.Ldone4xop

	leaq	128(%rsi),%rsi
	vmovdqa	%xmm11,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm3,16(%rsp)
	leaq	128(%rdi),%rdi
	vmovdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	vmovdqa	%xmm7,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L192_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm8,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm0,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm12,32(%rsp)
	subq	$192,%rdx
	vmovdqa	%xmm4,48(%rsp)

.Loop_tail4xop:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4xop

.Ldone4xop:
	vzeroupper
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
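/*
 * ChaCha20_8x: AVX2 analogue of ChaCha20_4x with eight blocks in
 * flight, one state word per 256-bit ymm register and 512 bytes of
 * keystream per iteration. The stack is realigned to 32 bytes because
 * the transposition spills ymm registers with aligned vmovdqa.
 */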
.type	ChaCha20_8x,@function
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper










	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
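/*
 * GNU property note declaring GNU_PROPERTY_X86_FEATURE_1_AND = 3,
 * i.e. marking the object as compatible with Intel CET (IBT and
 * shadow stack).
 */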
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4: