/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from chacha-x86_64.pl. */
.text



.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
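
/*
 * .Lsigma is the ChaCha20 constant "expand 32-byte k" (NUL-terminated),
 * followed by the CRYPTOGAMS attribution string "ChaCha20 for x86_64,
 * CRYPTOGAMS by <appro@openssl.org>".  .Lrot16 and .Lrot24 are pshufb
 * masks that rotate every 32-bit lane left by 16 and by 8 bits (right
 * by 24), the two byte-aligned rotations of the quarter round.
 *
 * Entry point, per the OpenSSL prototype
 *	ChaCha20_ctr32(out, inp, len, key[8], counter[4]):
 * SysV AMD64 ABI, so %rdi = out, %rsi = inp, %rdx = len, %rcx = key,
 * %r8 = counter.  The integer-only code below is the fallback; bit 9
 * (SSSE3) of OPENSSL_ia32cap_P word 1 selects the SIMD paths.
 */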
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:

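/*
 * Stack frame: 16(%rsp)-63(%rsp) hold a copy of the 32-byte key and the
 * 16-byte counter/nonce block, 64(%rsp), 72(%rsp) and 80(%rsp) park len,
 * inp and out across the rounds, and 0(%rsp)-15(%rsp) are scratch for
 * the tail path.
 */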
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

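/*
 * Each .Loop iteration is one double round (a column round followed by
 * a diagonal round), so the count of 10 in %ebp yields ChaCha20's 20
 * rounds.  Every quarter round is the add/xor/rotate ladder with rotate
 * counts 16, 12, 8 and 7.  The 16 state words do not fit in the free
 * GPRs, so two pairs take turns on the stack at 32(%rsp)-44(%rsp) and
 * are swapped against %esi/%edi mid-round; the .byte 102,72,15,126,214
 * above encodes movq %xmm2,%rsi.
 */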
.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

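/*
 * .Ltail: fewer than 64 bytes remain, so spill the key-stream block to
 * the stack and XOR it into the input one byte at a time.
 */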
.align	16
.Ltail:
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
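
/*
 * ChaCha20_ssse3: one 64-byte block per iteration, the whole state in
 * %xmm0-%xmm3 (one row per register).  The .byte sequences
 * 102,15,56,0,222 and 102,15,56,0,223 encode pshufb %xmm6,%xmm3 and
 * pshufb %xmm7,%xmm3, i.e. the .Lrot16/.Lrot24 rotations; the 12- and
 * 7-bit rotations use pslld/psrld/por.  pshufd rotates the rows between
 * the column and diagonal rounds.
 */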
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	testl	$2048,%r10d
	jnz	.LChaCha20_4xop
	cmpq	$128,%rdx
	je	.LChaCha20_128
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
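
/*
 * ChaCha20_128: the len == 128 special case, two blocks computed in
 * parallel.  Block 0 lives in %xmm8/%xmm9/%xmm2/%xmm3 and block 1 in
 * %xmm10/%xmm11/%xmm0/%xmm1, with block 1's counter pre-incremented by
 * .Lone.
 */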
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_128

.align	32
.Loop_128:
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
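
/*
 * ChaCha20_4x: four blocks per iteration with the state transposed so
 * that each %xmm register carries one state word from all four blocks;
 * .Linc seeds the four counters.  Bit 5 of OPENSSL_ia32cap_P word 2
 * (AVX2) re-dispatches to ChaCha20_8x, and for runs of at most 192
 * bytes the 0x04400000/0x00400000 test (MOVBE set, XSAVE clear in
 * word 1) appears to single out Atom-class cores, which prefer the
 * one-block code.  The punpck{l,h}{dq,qdq} ladders afterwards transpose
 * the lanes back into contiguous 64-byte key-stream blocks.
 */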
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
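
/*
 * ChaCha20_4xop: the same four-block layout for AMD XOP.  The .byte
 * sequences 143,232,120,194,<modrm>,<imm> encode XOP vprotd $imm, a
 * true dword rotate, which replaces both the pshufb masks and the
 * shift/or pairs and shortens the quarter round noticeably.
 */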
.type	ChaCha20_4xop,@function
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x140+8,%rsp
	vzeroupper

	vmovdqa	.Lsigma(%rip),%xmm11
	vmovdqu	(%rcx),%xmm3
	vmovdqu	16(%rcx),%xmm15
	vmovdqu	(%r8),%xmm7
	leaq	256(%rsp),%rcx

	vpshufd	$0x00,%xmm11,%xmm8
	vpshufd	$0x55,%xmm11,%xmm9
	vmovdqa	%xmm8,64(%rsp)
	vpshufd	$0xaa,%xmm11,%xmm10
	vmovdqa	%xmm9,80(%rsp)
	vpshufd	$0xff,%xmm11,%xmm11
	vmovdqa	%xmm10,96(%rsp)
	vmovdqa	%xmm11,112(%rsp)

	vpshufd	$0x00,%xmm3,%xmm0
	vpshufd	$0x55,%xmm3,%xmm1
	vmovdqa	%xmm0,128-256(%rcx)
	vpshufd	$0xaa,%xmm3,%xmm2
	vmovdqa	%xmm1,144-256(%rcx)
	vpshufd	$0xff,%xmm3,%xmm3
	vmovdqa	%xmm2,160-256(%rcx)
	vmovdqa	%xmm3,176-256(%rcx)

	vpshufd	$0x00,%xmm15,%xmm12
	vpshufd	$0x55,%xmm15,%xmm13
	vmovdqa	%xmm12,192-256(%rcx)
	vpshufd	$0xaa,%xmm15,%xmm14
	vmovdqa	%xmm13,208-256(%rcx)
	vpshufd	$0xff,%xmm15,%xmm15
	vmovdqa	%xmm14,224-256(%rcx)
	vmovdqa	%xmm15,240-256(%rcx)

	vpshufd	$0x00,%xmm7,%xmm4
	vpshufd	$0x55,%xmm7,%xmm5
	vpaddd	.Linc(%rip),%xmm4,%xmm4
	vpshufd	$0xaa,%xmm7,%xmm6
	vmovdqa	%xmm5,272-256(%rcx)
	vpshufd	$0xff,%xmm7,%xmm7
	vmovdqa	%xmm6,288-256(%rcx)
	vmovdqa	%xmm7,304-256(%rcx)

	jmp	.Loop_enter4xop

.align	32
.Loop_outer4xop:
	vmovdqa	64(%rsp),%xmm8
	vmovdqa	80(%rsp),%xmm9
	vmovdqa	96(%rsp),%xmm10
	vmovdqa	112(%rsp),%xmm11
	vmovdqa	128-256(%rcx),%xmm0
	vmovdqa	144-256(%rcx),%xmm1
	vmovdqa	160-256(%rcx),%xmm2
	vmovdqa	176-256(%rcx),%xmm3
	vmovdqa	192-256(%rcx),%xmm12
	vmovdqa	208-256(%rcx),%xmm13
	vmovdqa	224-256(%rcx),%xmm14
	vmovdqa	240-256(%rcx),%xmm15
	vmovdqa	256-256(%rcx),%xmm4
	vmovdqa	272-256(%rcx),%xmm5
	vmovdqa	288-256(%rcx),%xmm6
	vmovdqa	304-256(%rcx),%xmm7
	vpaddd	.Lfour(%rip),%xmm4,%xmm4

.Loop_enter4xop:
	movl	$10,%eax
	vmovdqa	%xmm4,256-256(%rcx)
	jmp	.Loop4xop

.align	32
.Loop4xop:
	vpaddd	%xmm0,%xmm8,%xmm8
	vpaddd	%xmm1,%xmm9,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
.byte	143,232,120,194,255,16
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,12
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
	vpaddd	%xmm8,%xmm0,%xmm8
	vpaddd	%xmm9,%xmm1,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
.byte	143,232,120,194,255,8
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,7
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
	vpaddd	%xmm1,%xmm8,%xmm8
	vpaddd	%xmm2,%xmm9,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,16
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
.byte	143,232,120,194,192,12
	vpaddd	%xmm8,%xmm1,%xmm8
	vpaddd	%xmm9,%xmm2,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,8
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
.byte	143,232,120,194,192,7
	decl	%eax
	jnz	.Loop4xop

	vpaddd	64(%rsp),%xmm8,%xmm8
	vpaddd	80(%rsp),%xmm9,%xmm9
	vpaddd	96(%rsp),%xmm10,%xmm10
	vpaddd	112(%rsp),%xmm11,%xmm11

	vmovdqa	%xmm14,32(%rsp)
	vmovdqa	%xmm15,48(%rsp)

	vpunpckldq	%xmm9,%xmm8,%xmm14
	vpunpckldq	%xmm11,%xmm10,%xmm15
	vpunpckhdq	%xmm9,%xmm8,%xmm8
	vpunpckhdq	%xmm11,%xmm10,%xmm10
	vpunpcklqdq	%xmm15,%xmm14,%xmm9
	vpunpckhqdq	%xmm15,%xmm14,%xmm14
	vpunpcklqdq	%xmm10,%xmm8,%xmm11
	vpunpckhqdq	%xmm10,%xmm8,%xmm8
	vpaddd	128-256(%rcx),%xmm0,%xmm0
	vpaddd	144-256(%rcx),%xmm1,%xmm1
	vpaddd	160-256(%rcx),%xmm2,%xmm2
	vpaddd	176-256(%rcx),%xmm3,%xmm3

	vmovdqa	%xmm9,0(%rsp)
	vmovdqa	%xmm14,16(%rsp)
	vmovdqa	32(%rsp),%xmm9
	vmovdqa	48(%rsp),%xmm14

	vpunpckldq	%xmm1,%xmm0,%xmm10
	vpunpckldq	%xmm3,%xmm2,%xmm15
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm15,%xmm10,%xmm1
	vpunpckhqdq	%xmm15,%xmm10,%xmm10
	vpunpcklqdq	%xmm2,%xmm0,%xmm3
	vpunpckhqdq	%xmm2,%xmm0,%xmm0
	vpaddd	192-256(%rcx),%xmm12,%xmm12
	vpaddd	208-256(%rcx),%xmm13,%xmm13
	vpaddd	224-256(%rcx),%xmm9,%xmm9
	vpaddd	240-256(%rcx),%xmm14,%xmm14

	vpunpckldq	%xmm13,%xmm12,%xmm2
	vpunpckldq	%xmm14,%xmm9,%xmm15
	vpunpckhdq	%xmm13,%xmm12,%xmm12
	vpunpckhdq	%xmm14,%xmm9,%xmm9
	vpunpcklqdq	%xmm15,%xmm2,%xmm13
	vpunpckhqdq	%xmm15,%xmm2,%xmm2
	vpunpcklqdq	%xmm9,%xmm12,%xmm14
	vpunpckhqdq	%xmm9,%xmm12,%xmm12
	vpaddd	256-256(%rcx),%xmm4,%xmm4
	vpaddd	272-256(%rcx),%xmm5,%xmm5
	vpaddd	288-256(%rcx),%xmm6,%xmm6
	vpaddd	304-256(%rcx),%xmm7,%xmm7

	vpunpckldq	%xmm5,%xmm4,%xmm9
	vpunpckldq	%xmm7,%xmm6,%xmm15
	vpunpckhdq	%xmm5,%xmm4,%xmm4
	vpunpckhdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm15,%xmm9,%xmm5
	vpunpckhqdq	%xmm15,%xmm9,%xmm9
	vpunpcklqdq	%xmm6,%xmm4,%xmm7
	vpunpckhqdq	%xmm6,%xmm4,%xmm4
	vmovdqa	0(%rsp),%xmm6
	vmovdqa	16(%rsp),%xmm15

	cmpq	$256,%rdx
	jb	.Ltail4xop

	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7
	vpxor	64(%rsi),%xmm8,%xmm8
	vpxor	80(%rsi),%xmm0,%xmm0
	vpxor	96(%rsi),%xmm12,%xmm12
	vpxor	112(%rsi),%xmm4,%xmm4
	leaq	128(%rsi),%rsi

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	vmovdqu	%xmm8,64(%rdi)
	vmovdqu	%xmm0,80(%rdi)
	vmovdqu	%xmm12,96(%rdi)
	vmovdqu	%xmm4,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4xop

	jmp	.Ldone4xop

.align	32
.Ltail4xop:
	cmpq	$192,%rdx
	jae	.L192_or_more4xop
	cmpq	$128,%rdx
	jae	.L128_or_more4xop
	cmpq	$64,%rdx
	jae	.L64_or_more4xop

	xorq	%r10,%r10
	vmovdqa	%xmm6,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm13,32(%rsp)
	vmovdqa	%xmm5,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L64_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm15,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm10,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm2,32(%rsp)
	subq	$64,%rdx
	vmovdqa	%xmm9,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L128_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	je	.Ldone4xop

	leaq	128(%rsi),%rsi
	vmovdqa	%xmm11,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm3,16(%rsp)
	leaq	128(%rdi),%rdi
	vmovdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	vmovdqa	%xmm7,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L192_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm8,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm0,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm12,32(%rsp)
	subq	$192,%rdx
	vmovdqa	%xmm4,48(%rsp)

.Loop_tail4xop:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4xop

.Ldone4xop:
	vzeroupper
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
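
/*
 * ChaCha20_8x: the AVX2 path, eight blocks per iteration with each
 * %ymm register carrying one state word from all eight blocks.  .Lincy
 * seeds the counters (in 0,2,4,6,1,3,5,7 lane order) and .Leight
 * advances them; vpshufb with the broadcast .Lrot16/.Lrot24 masks does
 * the byte-aligned rotations.  The stack is 32-byte aligned for the
 * vmovdqa spill slots, and the vpunpck/vperm2i128 ladders de-interleave
 * the key stream at the end.
 */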
.type	ChaCha20_8x,@function
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper









	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

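/*
 * Tail: whole 64-byte pieces of the final batch are XORed directly;
 * the key stream for the last partial piece is spilled to 0(%rsp) and
 * 32(%rsp) and combined byte by byte in .Loop_tail8x.
 */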
.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x