/* Do not modify. This file is auto-generated from chacha-x86_64.pl. */
.text



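# Read-only tables: counter increments for the 1x/4x/8x code paths,
# pshufb masks (.Lrot16/.Lrot24) implementing 16- and 8-bit left
# rotates of each 32-bit lane, and the ChaCha "expand 32-byte k"
# sigma constant plus the CRYPTOGAMS attribution string (.Lsigma).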
.section	.rodata
.align	64
.Lzero:
.long	0,0,0,0
.Lone:
.long	1,0,0,0
.Linc:
.long	0,1,2,3
.Lfour:
.long	4,4,4,4
.Lincy:
.long	0,2,4,6,1,3,5,7
.Leight:
.long	8,8,8,8,8,8,8,8
.Lrot16:
.byte	0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd
.Lrot24:
.byte	0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe
.Ltwoy:
.long	2,0,0,0, 2,0,0,0
.align	64
.Lzeroz:
.long	0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
.Lfourz:
.long	4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
.Lincz:
.long	0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
.Lsixteen:
.long	16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
.Lsigma:
.byte	101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.previous
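# void ChaCha20_ctr32(unsigned char *out,           /* %rdi */
#                     const unsigned char *inp,     /* %rsi */
#                     size_t len,                   /* %rdx */
#                     const unsigned int key[8],    /* %rcx */
#                     const unsigned int counter[4]);  /* %r8 */
# Integer-only reference path; dispatches to the SSSE3 code below
# when OPENSSL_ia32cap_P reports SSSE3 (CPUID.1:ECX bit 9, mask 512).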
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,@function
.align	64
ChaCha20_ctr32:
.cfi_startproc
	cmpq	$0,%rdx
	je	.Lno_data
	movq	OPENSSL_ia32cap_P+4(%rip),%r10
	testl	$512,%r10d
	jnz	.LChaCha20_ssse3

	pushq	%rbx
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbx,-16
	pushq	%rbp
.cfi_adjust_cfa_offset	8
.cfi_offset	%rbp,-24
	pushq	%r12
.cfi_adjust_cfa_offset	8
.cfi_offset	%r12,-32
	pushq	%r13
.cfi_adjust_cfa_offset	8
.cfi_offset	%r13,-40
	pushq	%r14
.cfi_adjust_cfa_offset	8
.cfi_offset	%r14,-48
	pushq	%r15
.cfi_adjust_cfa_offset	8
.cfi_offset	%r15,-56
	subq	$64+24,%rsp
.cfi_adjust_cfa_offset	64+24
.Lctr32_body:


	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm4


	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	%rdx,%rbp
	jmp	.Loop_outer

.align	32
.Loop_outer:
	movl	$0x61707865,%eax
	movl	$0x3320646e,%ebx
	movl	$0x79622d32,%ecx
	movl	$0x6b206574,%edx
	movl	16(%rsp),%r8d
	movl	20(%rsp),%r9d
	movl	24(%rsp),%r10d
	movl	28(%rsp),%r11d
	movd	%xmm3,%r12d
	movl	52(%rsp),%r13d
	movl	56(%rsp),%r14d
	movl	60(%rsp),%r15d

	movq	%rbp,64+0(%rsp)
	movl	$10,%ebp
	movq	%rsi,64+8(%rsp)
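# Hand-assembled movq %xmm2,%rsi (66 REX.W 0F 7E /r): pulls key words
# 4-5 (state[8..9]) into %rsi so they can be split across %esi/%edi.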
.byte	102,72,15,126,214
	movq	%rdi,64+16(%rsp)
	movq	%rsi,%rdi
	shrq	$32,%rdi
	jmp	.Loop

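# Ten iterations of the double round (a column round followed by a
# diagonal round) = 20 ChaCha rounds. All 16 state words do not fit
# in GPRs: x8..x11 take turns in %esi/%edi, spilling through
# 32..44(%rsp); rotates use plain roll.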
.align	32
.Loop:
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$16,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$16,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$12,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$12,%r9d
	addl	%r8d,%eax
	xorl	%eax,%r12d
	roll	$8,%r12d
	addl	%r9d,%ebx
	xorl	%ebx,%r13d
	roll	$8,%r13d
	addl	%r12d,%esi
	xorl	%esi,%r8d
	roll	$7,%r8d
	addl	%r13d,%edi
	xorl	%edi,%r9d
	roll	$7,%r9d
	movl	%esi,32(%rsp)
	movl	%edi,36(%rsp)
	movl	40(%rsp),%esi
	movl	44(%rsp),%edi
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$16,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$16,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$12,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$12,%r11d
	addl	%r10d,%ecx
	xorl	%ecx,%r14d
	roll	$8,%r14d
	addl	%r11d,%edx
	xorl	%edx,%r15d
	roll	$8,%r15d
	addl	%r14d,%esi
	xorl	%esi,%r10d
	roll	$7,%r10d
	addl	%r15d,%edi
	xorl	%edi,%r11d
	roll	$7,%r11d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$16,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$16,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$12,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$12,%r10d
	addl	%r9d,%eax
	xorl	%eax,%r15d
	roll	$8,%r15d
	addl	%r10d,%ebx
	xorl	%ebx,%r12d
	roll	$8,%r12d
	addl	%r15d,%esi
	xorl	%esi,%r9d
	roll	$7,%r9d
	addl	%r12d,%edi
	xorl	%edi,%r10d
	roll	$7,%r10d
	movl	%esi,40(%rsp)
	movl	%edi,44(%rsp)
	movl	32(%rsp),%esi
	movl	36(%rsp),%edi
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$16,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$16,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$12,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$12,%r8d
	addl	%r11d,%ecx
	xorl	%ecx,%r13d
	roll	$8,%r13d
	addl	%r8d,%edx
	xorl	%edx,%r14d
	roll	$8,%r14d
	addl	%r13d,%esi
	xorl	%esi,%r11d
	roll	$7,%r11d
	addl	%r14d,%edi
	xorl	%edi,%r8d
	roll	$7,%r8d
	decl	%ebp
	jnz	.Loop
	movl	%edi,36(%rsp)
	movl	%esi,32(%rsp)
	movq	64(%rsp),%rbp
	movdqa	%xmm2,%xmm1
	movq	64+8(%rsp),%rsi
	paddd	%xmm4,%xmm3
	movq	64+16(%rsp),%rdi

	addl	$0x61707865,%eax
	addl	$0x3320646e,%ebx
	addl	$0x79622d32,%ecx
	addl	$0x6b206574,%edx
	addl	16(%rsp),%r8d
	addl	20(%rsp),%r9d
	addl	24(%rsp),%r10d
	addl	28(%rsp),%r11d
	addl	48(%rsp),%r12d
	addl	52(%rsp),%r13d
	addl	56(%rsp),%r14d
	addl	60(%rsp),%r15d
	paddd	32(%rsp),%xmm1

	cmpq	$64,%rbp
	jb	.Ltail

	xorl	0(%rsi),%eax
	xorl	4(%rsi),%ebx
	xorl	8(%rsi),%ecx
	xorl	12(%rsi),%edx
	xorl	16(%rsi),%r8d
	xorl	20(%rsi),%r9d
	xorl	24(%rsi),%r10d
	xorl	28(%rsi),%r11d
	movdqu	32(%rsi),%xmm0
	xorl	48(%rsi),%r12d
	xorl	52(%rsi),%r13d
	xorl	56(%rsi),%r14d
	xorl	60(%rsi),%r15d
	leaq	64(%rsi),%rsi
	pxor	%xmm1,%xmm0

	movdqa	%xmm2,32(%rsp)
	movd	%xmm3,48(%rsp)

	movl	%eax,0(%rdi)
	movl	%ebx,4(%rdi)
	movl	%ecx,8(%rdi)
	movl	%edx,12(%rdi)
	movl	%r8d,16(%rdi)
	movl	%r9d,20(%rdi)
	movl	%r10d,24(%rdi)
	movl	%r11d,28(%rdi)
	movdqu	%xmm0,32(%rdi)
	movl	%r12d,48(%rdi)
	movl	%r13d,52(%rdi)
	movl	%r14d,56(%rdi)
	movl	%r15d,60(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rbp
	jnz	.Loop_outer

	jmp	.Ldone

.align	16
.Ltail:
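# Final partial block: the last keystream block is spilled to the
# stack and XORed into the output one byte at a time.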
	movl	%eax,0(%rsp)
	movl	%ebx,4(%rsp)
	xorq	%rbx,%rbx
	movl	%ecx,8(%rsp)
	movl	%edx,12(%rsp)
	movl	%r8d,16(%rsp)
	movl	%r9d,20(%rsp)
	movl	%r10d,24(%rsp)
	movl	%r11d,28(%rsp)
	movdqa	%xmm1,32(%rsp)
	movl	%r12d,48(%rsp)
	movl	%r13d,52(%rsp)
	movl	%r14d,56(%rsp)
	movl	%r15d,60(%rsp)

.Loop_tail:
	movzbl	(%rsi,%rbx,1),%eax
	movzbl	(%rsp,%rbx,1),%edx
	leaq	1(%rbx),%rbx
	xorl	%edx,%eax
	movb	%al,-1(%rdi,%rbx,1)
	decq	%rbp
	jnz	.Loop_tail

.Ldone:
	leaq	64+24+48(%rsp),%rsi
.cfi_def_cfa	%rsi,8
	movq	-48(%rsi),%r15
.cfi_restore	%r15
	movq	-40(%rsi),%r14
.cfi_restore	%r14
	movq	-32(%rsi),%r13
.cfi_restore	%r13
	movq	-24(%rsi),%r12
.cfi_restore	%r12
	movq	-16(%rsi),%rbp
.cfi_restore	%rbp
	movq	-8(%rsi),%rbx
.cfi_restore	%rbx
	leaq	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lno_data:
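# 0xf3,0xc3 is "rep ret": a return padded with a rep prefix,
# historically used to sidestep a branch-prediction penalty on
# older AMD cores.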
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
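# SSSE3 one-block path: the state lives in %xmm0-%xmm3 as four row
# vectors; 16- and 8-bit rotates use pshufb with .Lrot16/.Lrot24 and
# the diagonal rounds are formed by pshufd row rotations. Entered for
# short inputs; larger inputs fan out to the 2x/4x/8x/XOP paths
# (XOP keyed off cap word 1 bit 11, AVX2 off cap word 2 bit 5).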
.type	ChaCha20_ssse3,@function
.align	32
ChaCha20_ssse3:
.cfi_startproc
.LChaCha20_ssse3:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	testl	$2048,%r10d
	jnz	.LChaCha20_4xop
	cmpq	$128,%rdx
	je	.LChaCha20_128
	ja	.LChaCha20_4x

.Ldo_sse3_after_all:
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm0
	movdqu	(%rcx),%xmm1
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_ssse3

.align	32
.Loop_outer_ssse3:
	movdqa	.Lone(%rip),%xmm3
	movdqa	0(%rsp),%xmm0
	movdqa	16(%rsp),%xmm1
	movdqa	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	movq	$10,%r8
	movdqa	%xmm3,48(%rsp)
	jmp	.Loop_ssse3

.align	32
.Loop_ssse3:
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
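# .byte 102,15,56,0,222/223 encode pshufb %xmm6,%xmm3 and
# pshufb %xmm7,%xmm3 (66 0F 38 00 /r): byte-shuffle rotates of each
# dword lane by 16 (.Lrot16 mask) or 8 (.Lrot24 mask) bits.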
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm1,%xmm1
	pshufd	$147,%xmm3,%xmm3
	nop
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,222
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$20,%xmm1
	pslld	$12,%xmm4
	por	%xmm4,%xmm1
	paddd	%xmm1,%xmm0
	pxor	%xmm0,%xmm3
.byte	102,15,56,0,223
	paddd	%xmm3,%xmm2
	pxor	%xmm2,%xmm1
	movdqa	%xmm1,%xmm4
	psrld	$25,%xmm1
	pslld	$7,%xmm4
	por	%xmm4,%xmm1
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm1,%xmm1
	pshufd	$57,%xmm3,%xmm3
	decq	%r8
	jnz	.Loop_ssse3
	paddd	0(%rsp),%xmm0
	paddd	16(%rsp),%xmm1
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3

	cmpq	$64,%rdx
	jb	.Ltail_ssse3

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm1
	movdqu	48(%rsi),%xmm5
	leaq	64(%rsi),%rsi
	pxor	%xmm4,%xmm2
	pxor	%xmm5,%xmm3

	movdqu	%xmm0,0(%rdi)
	movdqu	%xmm1,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	leaq	64(%rdi),%rdi

	subq	$64,%rdx
	jnz	.Loop_outer_ssse3

	jmp	.Ldone_ssse3

.align	16
.Ltail_ssse3:
	movdqa	%xmm0,0(%rsp)
	movdqa	%xmm1,16(%rsp)
	movdqa	%xmm2,32(%rsp)
	movdqa	%xmm3,48(%rsp)
	xorq	%r8,%r8

.Loop_tail_ssse3:
	movzbl	(%rsi,%r8,1),%eax
	movzbl	(%rsp,%r8,1),%ecx
	leaq	1(%r8),%r8
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r8,1)
	decq	%rdx
	jnz	.Loop_tail_ssse3

.Ldone_ssse3:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lssse3_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_ssse3,.-ChaCha20_ssse3
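# Two-block SSSE3 path, taken only when len == 128: runs two
# interleaved copies of the one-block round function.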
.type	ChaCha20_128,@function
.align	32
ChaCha20_128:
.cfi_startproc
.LChaCha20_128:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$64+8,%rsp
	movdqa	.Lsigma(%rip),%xmm8
	movdqu	(%rcx),%xmm9
	movdqu	16(%rcx),%xmm2
	movdqu	(%r8),%xmm3
	movdqa	.Lone(%rip),%xmm1
	movdqa	.Lrot16(%rip),%xmm6
	movdqa	.Lrot24(%rip),%xmm7

	movdqa	%xmm8,%xmm10
	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,%xmm11
	movdqa	%xmm9,16(%rsp)
	movdqa	%xmm2,%xmm0
	movdqa	%xmm2,32(%rsp)
	paddd	%xmm3,%xmm1
	movdqa	%xmm3,48(%rsp)
	movq	$10,%r8
	jmp	.Loop_128

.align	32
.Loop_128:
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$57,%xmm9,%xmm9
	pshufd	$147,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$57,%xmm11,%xmm11
	pshufd	$147,%xmm1,%xmm1
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,222
.byte	102,15,56,0,206
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$20,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$12,%xmm4
	psrld	$20,%xmm11
	por	%xmm4,%xmm9
	pslld	$12,%xmm5
	por	%xmm5,%xmm11
	paddd	%xmm9,%xmm8
	pxor	%xmm8,%xmm3
	paddd	%xmm11,%xmm10
	pxor	%xmm10,%xmm1
.byte	102,15,56,0,223
.byte	102,15,56,0,207
	paddd	%xmm3,%xmm2
	paddd	%xmm1,%xmm0
	pxor	%xmm2,%xmm9
	pxor	%xmm0,%xmm11
	movdqa	%xmm9,%xmm4
	psrld	$25,%xmm9
	movdqa	%xmm11,%xmm5
	pslld	$7,%xmm4
	psrld	$25,%xmm11
	por	%xmm4,%xmm9
	pslld	$7,%xmm5
	por	%xmm5,%xmm11
	pshufd	$78,%xmm2,%xmm2
	pshufd	$147,%xmm9,%xmm9
	pshufd	$57,%xmm3,%xmm3
	pshufd	$78,%xmm0,%xmm0
	pshufd	$147,%xmm11,%xmm11
	pshufd	$57,%xmm1,%xmm1
	decq	%r8
	jnz	.Loop_128
	paddd	0(%rsp),%xmm8
	paddd	16(%rsp),%xmm9
	paddd	32(%rsp),%xmm2
	paddd	48(%rsp),%xmm3
	paddd	.Lone(%rip),%xmm1
	paddd	0(%rsp),%xmm10
	paddd	16(%rsp),%xmm11
	paddd	32(%rsp),%xmm0
	paddd	48(%rsp),%xmm1

	movdqu	0(%rsi),%xmm4
	movdqu	16(%rsi),%xmm5
	pxor	%xmm4,%xmm8
	movdqu	32(%rsi),%xmm4
	pxor	%xmm5,%xmm9
	movdqu	48(%rsi),%xmm5
	pxor	%xmm4,%xmm2
	movdqu	64(%rsi),%xmm4
	pxor	%xmm5,%xmm3
	movdqu	80(%rsi),%xmm5
	pxor	%xmm4,%xmm10
	movdqu	96(%rsi),%xmm4
	pxor	%xmm5,%xmm11
	movdqu	112(%rsi),%xmm5
	pxor	%xmm4,%xmm0
	pxor	%xmm5,%xmm1

	movdqu	%xmm8,0(%rdi)
	movdqu	%xmm9,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm3,48(%rdi)
	movdqu	%xmm10,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm0,96(%rdi)
	movdqu	%xmm1,112(%rdi)
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
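# Four-block SSSE3 path. The 4x16 state matrix is kept transposed:
# each %xmm register holds one state word across four blocks, with
# .Linc staggering the four block counters; the punpck* ladders after
# the rounds transpose the keystream back into byte order.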
.type	ChaCha20_4x,@function
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
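# Prefer the 8x AVX2 path when available (cap word 2 bit 5,
# CPUID.7:EBX.AVX2). For <=192 bytes, cap word 1 masked with
# 0x4400000 == 0x400000 (MOVBE set, XSAVE clear) likely flags
# Silvermont-class Atoms, where the one-block path is faster.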
	movq	%r10,%r11
	shrq	$32,%r10
	testq	$32,%r10
	jnz	.LChaCha20_8x
	cmpq	$192,%rdx
	ja	.Lproceed4x

	andq	$71303168,%r11
	cmpq	$4194304,%r11
	je	.Ldo_sse3_after_all

.Lproceed4x:
	subq	$0x140+8,%rsp
	movdqa	.Lsigma(%rip),%xmm11
	movdqu	(%rcx),%xmm15
	movdqu	16(%rcx),%xmm7
	movdqu	(%r8),%xmm3
	leaq	256(%rsp),%rcx
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	pshufd	$0x00,%xmm11,%xmm8
	pshufd	$0x55,%xmm11,%xmm9
	movdqa	%xmm8,64(%rsp)
	pshufd	$0xaa,%xmm11,%xmm10
	movdqa	%xmm9,80(%rsp)
	pshufd	$0xff,%xmm11,%xmm11
	movdqa	%xmm10,96(%rsp)
	movdqa	%xmm11,112(%rsp)

	pshufd	$0x00,%xmm15,%xmm12
	pshufd	$0x55,%xmm15,%xmm13
	movdqa	%xmm12,128-256(%rcx)
	pshufd	$0xaa,%xmm15,%xmm14
	movdqa	%xmm13,144-256(%rcx)
	pshufd	$0xff,%xmm15,%xmm15
	movdqa	%xmm14,160-256(%rcx)
	movdqa	%xmm15,176-256(%rcx)

	pshufd	$0x00,%xmm7,%xmm4
	pshufd	$0x55,%xmm7,%xmm5
	movdqa	%xmm4,192-256(%rcx)
	pshufd	$0xaa,%xmm7,%xmm6
	movdqa	%xmm5,208-256(%rcx)
	pshufd	$0xff,%xmm7,%xmm7
	movdqa	%xmm6,224-256(%rcx)
	movdqa	%xmm7,240-256(%rcx)

	pshufd	$0x00,%xmm3,%xmm0
	pshufd	$0x55,%xmm3,%xmm1
	paddd	.Linc(%rip),%xmm0
	pshufd	$0xaa,%xmm3,%xmm2
	movdqa	%xmm1,272-256(%rcx)
	pshufd	$0xff,%xmm3,%xmm3
	movdqa	%xmm2,288-256(%rcx)
	movdqa	%xmm3,304-256(%rcx)

	jmp	.Loop_enter4x

.align	32
.Loop_outer4x:
	movdqa	64(%rsp),%xmm8
	movdqa	80(%rsp),%xmm9
	movdqa	96(%rsp),%xmm10
	movdqa	112(%rsp),%xmm11
	movdqa	128-256(%rcx),%xmm12
	movdqa	144-256(%rcx),%xmm13
	movdqa	160-256(%rcx),%xmm14
	movdqa	176-256(%rcx),%xmm15
	movdqa	192-256(%rcx),%xmm4
	movdqa	208-256(%rcx),%xmm5
	movdqa	224-256(%rcx),%xmm6
	movdqa	240-256(%rcx),%xmm7
	movdqa	256-256(%rcx),%xmm0
	movdqa	272-256(%rcx),%xmm1
	movdqa	288-256(%rcx),%xmm2
	movdqa	304-256(%rcx),%xmm3
	paddd	.Lfour(%rip),%xmm0

.Loop_enter4x:
	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm7,48(%rsp)
	movdqa	(%r10),%xmm7
	movl	$10,%eax
	movdqa	%xmm0,256-256(%rcx)
	jmp	.Loop4x

.align	32
.Loop4x:
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,199
.byte	102,15,56,0,207
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm6
	pslld	$12,%xmm12
	psrld	$20,%xmm6
	movdqa	%xmm13,%xmm7
	pslld	$12,%xmm13
	por	%xmm6,%xmm12
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm13
	paddd	%xmm12,%xmm8
	paddd	%xmm13,%xmm9
	pxor	%xmm8,%xmm0
	pxor	%xmm9,%xmm1
.byte	102,15,56,0,198
.byte	102,15,56,0,206
	paddd	%xmm0,%xmm4
	paddd	%xmm1,%xmm5
	pxor	%xmm4,%xmm12
	pxor	%xmm5,%xmm13
	movdqa	%xmm12,%xmm7
	pslld	$7,%xmm12
	psrld	$25,%xmm7
	movdqa	%xmm13,%xmm6
	pslld	$7,%xmm13
	por	%xmm7,%xmm12
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm13
	movdqa	%xmm4,0(%rsp)
	movdqa	%xmm5,16(%rsp)
	movdqa	32(%rsp),%xmm4
	movdqa	48(%rsp),%xmm5
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,215
.byte	102,15,56,0,223
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm6
	pslld	$12,%xmm14
	psrld	$20,%xmm6
	movdqa	%xmm15,%xmm7
	pslld	$12,%xmm15
	por	%xmm6,%xmm14
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm15
	paddd	%xmm14,%xmm10
	paddd	%xmm15,%xmm11
	pxor	%xmm10,%xmm2
	pxor	%xmm11,%xmm3
.byte	102,15,56,0,214
.byte	102,15,56,0,222
	paddd	%xmm2,%xmm4
	paddd	%xmm3,%xmm5
	pxor	%xmm4,%xmm14
	pxor	%xmm5,%xmm15
	movdqa	%xmm14,%xmm7
	pslld	$7,%xmm14
	psrld	$25,%xmm7
	movdqa	%xmm15,%xmm6
	pslld	$7,%xmm15
	por	%xmm7,%xmm14
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm15
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,223
.byte	102,15,56,0,199
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm6
	pslld	$12,%xmm13
	psrld	$20,%xmm6
	movdqa	%xmm14,%xmm7
	pslld	$12,%xmm14
	por	%xmm6,%xmm13
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm14
	paddd	%xmm13,%xmm8
	paddd	%xmm14,%xmm9
	pxor	%xmm8,%xmm3
	pxor	%xmm9,%xmm0
.byte	102,15,56,0,222
.byte	102,15,56,0,198
	paddd	%xmm3,%xmm4
	paddd	%xmm0,%xmm5
	pxor	%xmm4,%xmm13
	pxor	%xmm5,%xmm14
	movdqa	%xmm13,%xmm7
	pslld	$7,%xmm13
	psrld	$25,%xmm7
	movdqa	%xmm14,%xmm6
	pslld	$7,%xmm14
	por	%xmm7,%xmm13
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm14
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm5,48(%rsp)
	movdqa	0(%rsp),%xmm4
	movdqa	16(%rsp),%xmm5
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,207
.byte	102,15,56,0,215
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm6
	pslld	$12,%xmm15
	psrld	$20,%xmm6
	movdqa	%xmm12,%xmm7
	pslld	$12,%xmm12
	por	%xmm6,%xmm15
	psrld	$20,%xmm7
	movdqa	(%r11),%xmm6
	por	%xmm7,%xmm12
	paddd	%xmm15,%xmm10
	paddd	%xmm12,%xmm11
	pxor	%xmm10,%xmm1
	pxor	%xmm11,%xmm2
.byte	102,15,56,0,206
.byte	102,15,56,0,214
	paddd	%xmm1,%xmm4
	paddd	%xmm2,%xmm5
	pxor	%xmm4,%xmm15
	pxor	%xmm5,%xmm12
	movdqa	%xmm15,%xmm7
	pslld	$7,%xmm15
	psrld	$25,%xmm7
	movdqa	%xmm12,%xmm6
	pslld	$7,%xmm12
	por	%xmm7,%xmm15
	psrld	$25,%xmm6
	movdqa	(%r10),%xmm7
	por	%xmm6,%xmm12
	decl	%eax
	jnz	.Loop4x

	paddd	64(%rsp),%xmm8
	paddd	80(%rsp),%xmm9
	paddd	96(%rsp),%xmm10
	paddd	112(%rsp),%xmm11

	movdqa	%xmm8,%xmm6
	punpckldq	%xmm9,%xmm8
	movdqa	%xmm10,%xmm7
	punpckldq	%xmm11,%xmm10
	punpckhdq	%xmm9,%xmm6
	punpckhdq	%xmm11,%xmm7
	movdqa	%xmm8,%xmm9
	punpcklqdq	%xmm10,%xmm8
	movdqa	%xmm6,%xmm11
	punpcklqdq	%xmm7,%xmm6
	punpckhqdq	%xmm10,%xmm9
	punpckhqdq	%xmm7,%xmm11
	paddd	128-256(%rcx),%xmm12
	paddd	144-256(%rcx),%xmm13
	paddd	160-256(%rcx),%xmm14
	paddd	176-256(%rcx),%xmm15

	movdqa	%xmm8,0(%rsp)
	movdqa	%xmm9,16(%rsp)
	movdqa	32(%rsp),%xmm8
	movdqa	48(%rsp),%xmm9

	movdqa	%xmm12,%xmm10
	punpckldq	%xmm13,%xmm12
	movdqa	%xmm14,%xmm7
	punpckldq	%xmm15,%xmm14
	punpckhdq	%xmm13,%xmm10
	punpckhdq	%xmm15,%xmm7
	movdqa	%xmm12,%xmm13
	punpcklqdq	%xmm14,%xmm12
	movdqa	%xmm10,%xmm15
	punpcklqdq	%xmm7,%xmm10
	punpckhqdq	%xmm14,%xmm13
	punpckhqdq	%xmm7,%xmm15
	paddd	192-256(%rcx),%xmm4
	paddd	208-256(%rcx),%xmm5
	paddd	224-256(%rcx),%xmm8
	paddd	240-256(%rcx),%xmm9

	movdqa	%xmm6,32(%rsp)
	movdqa	%xmm11,48(%rsp)

	movdqa	%xmm4,%xmm14
	punpckldq	%xmm5,%xmm4
	movdqa	%xmm8,%xmm7
	punpckldq	%xmm9,%xmm8
	punpckhdq	%xmm5,%xmm14
	punpckhdq	%xmm9,%xmm7
	movdqa	%xmm4,%xmm5
	punpcklqdq	%xmm8,%xmm4
	movdqa	%xmm14,%xmm9
	punpcklqdq	%xmm7,%xmm14
	punpckhqdq	%xmm8,%xmm5
	punpckhqdq	%xmm7,%xmm9
	paddd	256-256(%rcx),%xmm0
	paddd	272-256(%rcx),%xmm1
	paddd	288-256(%rcx),%xmm2
	paddd	304-256(%rcx),%xmm3

	movdqa	%xmm0,%xmm8
	punpckldq	%xmm1,%xmm0
	movdqa	%xmm2,%xmm7
	punpckldq	%xmm3,%xmm2
	punpckhdq	%xmm1,%xmm8
	punpckhdq	%xmm3,%xmm7
	movdqa	%xmm0,%xmm1
	punpcklqdq	%xmm2,%xmm0
	movdqa	%xmm8,%xmm3
	punpcklqdq	%xmm7,%xmm8
	punpckhqdq	%xmm2,%xmm1
	punpckhqdq	%xmm7,%xmm3
	cmpq	$256,%rdx
	jb	.Ltail4x

	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	48(%rsp),%xmm6
	pxor	%xmm15,%xmm11
	pxor	%xmm9,%xmm2
	pxor	%xmm3,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4x

	jmp	.Ldone4x

.Ltail4x:
	cmpq	$192,%rdx
	jae	.L192_or_more4x
	cmpq	$128,%rdx
	jae	.L128_or_more4x
	cmpq	$64,%rdx
	jae	.L64_or_more4x


	xorq	%r10,%r10

	movdqa	%xmm12,16(%rsp)
	movdqa	%xmm4,32(%rsp)
	movdqa	%xmm0,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L64_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	16(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm13,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm5,32(%rsp)
	subq	$64,%rdx
	movdqa	%xmm1,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L128_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7
	movdqu	%xmm6,64(%rdi)
	movdqu	%xmm11,80(%rdi)
	movdqu	%xmm2,96(%rdi)
	movdqu	%xmm7,112(%rdi)
	je	.Ldone4x

	movdqa	32(%rsp),%xmm6
	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm10,16(%rsp)
	leaq	128(%rdi),%rdi
	movdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	movdqa	%xmm8,48(%rsp)
	jmp	.Loop_tail4x

.align	32
.L192_or_more4x:
	movdqu	0(%rsi),%xmm6
	movdqu	16(%rsi),%xmm11
	movdqu	32(%rsi),%xmm2
	movdqu	48(%rsi),%xmm7
	pxor	0(%rsp),%xmm6
	pxor	%xmm12,%xmm11
	pxor	%xmm4,%xmm2
	pxor	%xmm0,%xmm7

	movdqu	%xmm6,0(%rdi)
	movdqu	64(%rsi),%xmm6
	movdqu	%xmm11,16(%rdi)
	movdqu	80(%rsi),%xmm11
	movdqu	%xmm2,32(%rdi)
	movdqu	96(%rsi),%xmm2
	movdqu	%xmm7,48(%rdi)
	movdqu	112(%rsi),%xmm7
	leaq	128(%rsi),%rsi
	pxor	16(%rsp),%xmm6
	pxor	%xmm13,%xmm11
	pxor	%xmm5,%xmm2
	pxor	%xmm1,%xmm7

	movdqu	%xmm6,64(%rdi)
	movdqu	0(%rsi),%xmm6
	movdqu	%xmm11,80(%rdi)
	movdqu	16(%rsi),%xmm11
	movdqu	%xmm2,96(%rdi)
	movdqu	32(%rsi),%xmm2
	movdqu	%xmm7,112(%rdi)
	leaq	128(%rdi),%rdi
	movdqu	48(%rsi),%xmm7
	pxor	32(%rsp),%xmm6
	pxor	%xmm10,%xmm11
	pxor	%xmm14,%xmm2
	pxor	%xmm8,%xmm7
	movdqu	%xmm6,0(%rdi)
	movdqu	%xmm11,16(%rdi)
	movdqu	%xmm2,32(%rdi)
	movdqu	%xmm7,48(%rdi)
	je	.Ldone4x

	movdqa	48(%rsp),%xmm6
	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	movdqa	%xmm6,0(%rsp)
	movdqa	%xmm15,16(%rsp)
	leaq	64(%rdi),%rdi
	movdqa	%xmm9,32(%rsp)
	subq	$192,%rdx
	movdqa	%xmm3,48(%rsp)

.Loop_tail4x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4x

.Ldone4x:
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
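# AMD XOP path: same four-block transposed layout as ChaCha20_4x, but
# the .byte 143,232,120,194,... sequences encode vprotd, XOP's native
# dword rotate, so no shift/shift/or or pshufb sequences are needed.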
.type	ChaCha20_4xop,@function
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x140+8,%rsp
	vzeroupper

	vmovdqa	.Lsigma(%rip),%xmm11
	vmovdqu	(%rcx),%xmm3
	vmovdqu	16(%rcx),%xmm15
	vmovdqu	(%r8),%xmm7
	leaq	256(%rsp),%rcx

	vpshufd	$0x00,%xmm11,%xmm8
	vpshufd	$0x55,%xmm11,%xmm9
	vmovdqa	%xmm8,64(%rsp)
	vpshufd	$0xaa,%xmm11,%xmm10
	vmovdqa	%xmm9,80(%rsp)
	vpshufd	$0xff,%xmm11,%xmm11
	vmovdqa	%xmm10,96(%rsp)
	vmovdqa	%xmm11,112(%rsp)

	vpshufd	$0x00,%xmm3,%xmm0
	vpshufd	$0x55,%xmm3,%xmm1
	vmovdqa	%xmm0,128-256(%rcx)
	vpshufd	$0xaa,%xmm3,%xmm2
	vmovdqa	%xmm1,144-256(%rcx)
	vpshufd	$0xff,%xmm3,%xmm3
	vmovdqa	%xmm2,160-256(%rcx)
	vmovdqa	%xmm3,176-256(%rcx)

	vpshufd	$0x00,%xmm15,%xmm12
	vpshufd	$0x55,%xmm15,%xmm13
	vmovdqa	%xmm12,192-256(%rcx)
	vpshufd	$0xaa,%xmm15,%xmm14
	vmovdqa	%xmm13,208-256(%rcx)
	vpshufd	$0xff,%xmm15,%xmm15
	vmovdqa	%xmm14,224-256(%rcx)
	vmovdqa	%xmm15,240-256(%rcx)

	vpshufd	$0x00,%xmm7,%xmm4
	vpshufd	$0x55,%xmm7,%xmm5
	vpaddd	.Linc(%rip),%xmm4,%xmm4
	vpshufd	$0xaa,%xmm7,%xmm6
	vmovdqa	%xmm5,272-256(%rcx)
	vpshufd	$0xff,%xmm7,%xmm7
	vmovdqa	%xmm6,288-256(%rcx)
	vmovdqa	%xmm7,304-256(%rcx)

	jmp	.Loop_enter4xop

.align	32
.Loop_outer4xop:
	vmovdqa	64(%rsp),%xmm8
	vmovdqa	80(%rsp),%xmm9
	vmovdqa	96(%rsp),%xmm10
	vmovdqa	112(%rsp),%xmm11
	vmovdqa	128-256(%rcx),%xmm0
	vmovdqa	144-256(%rcx),%xmm1
	vmovdqa	160-256(%rcx),%xmm2
	vmovdqa	176-256(%rcx),%xmm3
	vmovdqa	192-256(%rcx),%xmm12
	vmovdqa	208-256(%rcx),%xmm13
	vmovdqa	224-256(%rcx),%xmm14
	vmovdqa	240-256(%rcx),%xmm15
	vmovdqa	256-256(%rcx),%xmm4
	vmovdqa	272-256(%rcx),%xmm5
	vmovdqa	288-256(%rcx),%xmm6
	vmovdqa	304-256(%rcx),%xmm7
	vpaddd	.Lfour(%rip),%xmm4,%xmm4

.Loop_enter4xop:
	movl	$10,%eax
	vmovdqa	%xmm4,256-256(%rcx)
	jmp	.Loop4xop

.align	32
.Loop4xop:
	vpaddd	%xmm0,%xmm8,%xmm8
	vpaddd	%xmm1,%xmm9,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
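# vprotd: the trailing immediate in each .byte run (16/12/8/7) is the
# per-dword left-rotate count of the quarter round.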
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
.byte	143,232,120,194,255,16
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,12
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
	vpaddd	%xmm8,%xmm0,%xmm8
	vpaddd	%xmm9,%xmm1,%xmm9
	vpaddd	%xmm2,%xmm10,%xmm10
	vpaddd	%xmm3,%xmm11,%xmm11
	vpxor	%xmm4,%xmm8,%xmm4
	vpxor	%xmm5,%xmm9,%xmm5
	vpxor	%xmm6,%xmm10,%xmm6
	vpxor	%xmm7,%xmm11,%xmm7
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
.byte	143,232,120,194,255,8
	vpaddd	%xmm4,%xmm12,%xmm12
	vpaddd	%xmm5,%xmm13,%xmm13
	vpaddd	%xmm6,%xmm14,%xmm14
	vpaddd	%xmm7,%xmm15,%xmm15
	vpxor	%xmm0,%xmm12,%xmm0
	vpxor	%xmm1,%xmm13,%xmm1
	vpxor	%xmm14,%xmm2,%xmm2
	vpxor	%xmm15,%xmm3,%xmm3
.byte	143,232,120,194,192,7
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
	vpaddd	%xmm1,%xmm8,%xmm8
	vpaddd	%xmm2,%xmm9,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,16
.byte	143,232,120,194,228,16
.byte	143,232,120,194,237,16
.byte	143,232,120,194,246,16
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,12
.byte	143,232,120,194,210,12
.byte	143,232,120,194,219,12
.byte	143,232,120,194,192,12
	vpaddd	%xmm8,%xmm1,%xmm8
	vpaddd	%xmm9,%xmm2,%xmm9
	vpaddd	%xmm3,%xmm10,%xmm10
	vpaddd	%xmm0,%xmm11,%xmm11
	vpxor	%xmm7,%xmm8,%xmm7
	vpxor	%xmm4,%xmm9,%xmm4
	vpxor	%xmm5,%xmm10,%xmm5
	vpxor	%xmm6,%xmm11,%xmm6
.byte	143,232,120,194,255,8
.byte	143,232,120,194,228,8
.byte	143,232,120,194,237,8
.byte	143,232,120,194,246,8
	vpaddd	%xmm7,%xmm14,%xmm14
	vpaddd	%xmm4,%xmm15,%xmm15
	vpaddd	%xmm5,%xmm12,%xmm12
	vpaddd	%xmm6,%xmm13,%xmm13
	vpxor	%xmm1,%xmm14,%xmm1
	vpxor	%xmm2,%xmm15,%xmm2
	vpxor	%xmm12,%xmm3,%xmm3
	vpxor	%xmm13,%xmm0,%xmm0
.byte	143,232,120,194,201,7
.byte	143,232,120,194,210,7
.byte	143,232,120,194,219,7
.byte	143,232,120,194,192,7
	decl	%eax
	jnz	.Loop4xop

	vpaddd	64(%rsp),%xmm8,%xmm8
	vpaddd	80(%rsp),%xmm9,%xmm9
	vpaddd	96(%rsp),%xmm10,%xmm10
	vpaddd	112(%rsp),%xmm11,%xmm11

	vmovdqa	%xmm14,32(%rsp)
	vmovdqa	%xmm15,48(%rsp)

	vpunpckldq	%xmm9,%xmm8,%xmm14
	vpunpckldq	%xmm11,%xmm10,%xmm15
	vpunpckhdq	%xmm9,%xmm8,%xmm8
	vpunpckhdq	%xmm11,%xmm10,%xmm10
	vpunpcklqdq	%xmm15,%xmm14,%xmm9
	vpunpckhqdq	%xmm15,%xmm14,%xmm14
	vpunpcklqdq	%xmm10,%xmm8,%xmm11
	vpunpckhqdq	%xmm10,%xmm8,%xmm8
	vpaddd	128-256(%rcx),%xmm0,%xmm0
	vpaddd	144-256(%rcx),%xmm1,%xmm1
	vpaddd	160-256(%rcx),%xmm2,%xmm2
	vpaddd	176-256(%rcx),%xmm3,%xmm3

	vmovdqa	%xmm9,0(%rsp)
	vmovdqa	%xmm14,16(%rsp)
	vmovdqa	32(%rsp),%xmm9
	vmovdqa	48(%rsp),%xmm14

	vpunpckldq	%xmm1,%xmm0,%xmm10
	vpunpckldq	%xmm3,%xmm2,%xmm15
	vpunpckhdq	%xmm1,%xmm0,%xmm0
	vpunpckhdq	%xmm3,%xmm2,%xmm2
	vpunpcklqdq	%xmm15,%xmm10,%xmm1
	vpunpckhqdq	%xmm15,%xmm10,%xmm10
	vpunpcklqdq	%xmm2,%xmm0,%xmm3
	vpunpckhqdq	%xmm2,%xmm0,%xmm0
	vpaddd	192-256(%rcx),%xmm12,%xmm12
	vpaddd	208-256(%rcx),%xmm13,%xmm13
	vpaddd	224-256(%rcx),%xmm9,%xmm9
	vpaddd	240-256(%rcx),%xmm14,%xmm14

	vpunpckldq	%xmm13,%xmm12,%xmm2
	vpunpckldq	%xmm14,%xmm9,%xmm15
	vpunpckhdq	%xmm13,%xmm12,%xmm12
	vpunpckhdq	%xmm14,%xmm9,%xmm9
	vpunpcklqdq	%xmm15,%xmm2,%xmm13
	vpunpckhqdq	%xmm15,%xmm2,%xmm2
	vpunpcklqdq	%xmm9,%xmm12,%xmm14
	vpunpckhqdq	%xmm9,%xmm12,%xmm12
	vpaddd	256-256(%rcx),%xmm4,%xmm4
	vpaddd	272-256(%rcx),%xmm5,%xmm5
	vpaddd	288-256(%rcx),%xmm6,%xmm6
	vpaddd	304-256(%rcx),%xmm7,%xmm7

	vpunpckldq	%xmm5,%xmm4,%xmm9
	vpunpckldq	%xmm7,%xmm6,%xmm15
	vpunpckhdq	%xmm5,%xmm4,%xmm4
	vpunpckhdq	%xmm7,%xmm6,%xmm6
	vpunpcklqdq	%xmm15,%xmm9,%xmm5
	vpunpckhqdq	%xmm15,%xmm9,%xmm9
	vpunpcklqdq	%xmm6,%xmm4,%xmm7
	vpunpckhqdq	%xmm6,%xmm4,%xmm4
	vmovdqa	0(%rsp),%xmm6
	vmovdqa	16(%rsp),%xmm15

	cmpq	$256,%rdx
	jb	.Ltail4xop

	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7
	vpxor	64(%rsi),%xmm8,%xmm8
	vpxor	80(%rsi),%xmm0,%xmm0
	vpxor	96(%rsi),%xmm12,%xmm12
	vpxor	112(%rsi),%xmm4,%xmm4
	leaq	128(%rsi),%rsi

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	vmovdqu	%xmm8,64(%rdi)
	vmovdqu	%xmm0,80(%rdi)
	vmovdqu	%xmm12,96(%rdi)
	vmovdqu	%xmm4,112(%rdi)
	leaq	128(%rdi),%rdi

	subq	$256,%rdx
	jnz	.Loop_outer4xop

	jmp	.Ldone4xop

.align	32
.Ltail4xop:
	cmpq	$192,%rdx
	jae	.L192_or_more4xop
	cmpq	$128,%rdx
	jae	.L128_or_more4xop
	cmpq	$64,%rdx
	jae	.L64_or_more4xop

	xorq	%r10,%r10
	vmovdqa	%xmm6,0(%rsp)
	vmovdqa	%xmm1,16(%rsp)
	vmovdqa	%xmm13,32(%rsp)
	vmovdqa	%xmm5,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L64_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm15,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm10,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm2,32(%rsp)
	subq	$64,%rdx
	vmovdqa	%xmm9,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L128_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	je	.Ldone4xop

	leaq	128(%rsi),%rsi
	vmovdqa	%xmm11,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm3,16(%rsp)
	leaq	128(%rdi),%rdi
	vmovdqa	%xmm14,32(%rsp)
	subq	$128,%rdx
	vmovdqa	%xmm7,48(%rsp)
	jmp	.Loop_tail4xop

.align	32
.L192_or_more4xop:
	vpxor	0(%rsi),%xmm6,%xmm6
	vpxor	16(%rsi),%xmm1,%xmm1
	vpxor	32(%rsi),%xmm13,%xmm13
	vpxor	48(%rsi),%xmm5,%xmm5
	vpxor	64(%rsi),%xmm15,%xmm15
	vpxor	80(%rsi),%xmm10,%xmm10
	vpxor	96(%rsi),%xmm2,%xmm2
	vpxor	112(%rsi),%xmm9,%xmm9
	leaq	128(%rsi),%rsi
	vpxor	0(%rsi),%xmm11,%xmm11
	vpxor	16(%rsi),%xmm3,%xmm3
	vpxor	32(%rsi),%xmm14,%xmm14
	vpxor	48(%rsi),%xmm7,%xmm7

	vmovdqu	%xmm6,0(%rdi)
	vmovdqu	%xmm1,16(%rdi)
	vmovdqu	%xmm13,32(%rdi)
	vmovdqu	%xmm5,48(%rdi)
	vmovdqu	%xmm15,64(%rdi)
	vmovdqu	%xmm10,80(%rdi)
	vmovdqu	%xmm2,96(%rdi)
	vmovdqu	%xmm9,112(%rdi)
	leaq	128(%rdi),%rdi
	vmovdqu	%xmm11,0(%rdi)
	vmovdqu	%xmm3,16(%rdi)
	vmovdqu	%xmm14,32(%rdi)
	vmovdqu	%xmm7,48(%rdi)
	je	.Ldone4xop

	leaq	64(%rsi),%rsi
	vmovdqa	%xmm8,0(%rsp)
	xorq	%r10,%r10
	vmovdqa	%xmm0,16(%rsp)
	leaq	64(%rdi),%rdi
	vmovdqa	%xmm12,32(%rsp)
	subq	$192,%rdx
	vmovdqa	%xmm4,48(%rsp)

.Loop_tail4xop:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail4xop

.Ldone4xop:
	vzeroupper
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
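# AVX2 path: eight blocks per iteration in %ymm registers, two blocks
# per 128-bit lane. .Lincy staggers the eight counters and .Leight
# advances them; the vperm2i128 ladders reassemble whole 64-byte
# blocks before the XOR with the input.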
.type	ChaCha20_8x,@function
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	movq	%rsp,%r9
.cfi_def_cfa_register	%r9
	subq	$0x280+8,%rsp
	andq	$-32,%rsp
	vzeroupper










	vbroadcasti128	.Lsigma(%rip),%ymm11
	vbroadcasti128	(%rcx),%ymm3
	vbroadcasti128	16(%rcx),%ymm15
	vbroadcasti128	(%r8),%ymm7
	leaq	256(%rsp),%rcx
	leaq	512(%rsp),%rax
	leaq	.Lrot16(%rip),%r10
	leaq	.Lrot24(%rip),%r11

	vpshufd	$0x00,%ymm11,%ymm8
	vpshufd	$0x55,%ymm11,%ymm9
	vmovdqa	%ymm8,128-256(%rcx)
	vpshufd	$0xaa,%ymm11,%ymm10
	vmovdqa	%ymm9,160-256(%rcx)
	vpshufd	$0xff,%ymm11,%ymm11
	vmovdqa	%ymm10,192-256(%rcx)
	vmovdqa	%ymm11,224-256(%rcx)

	vpshufd	$0x00,%ymm3,%ymm0
	vpshufd	$0x55,%ymm3,%ymm1
	vmovdqa	%ymm0,256-256(%rcx)
	vpshufd	$0xaa,%ymm3,%ymm2
	vmovdqa	%ymm1,288-256(%rcx)
	vpshufd	$0xff,%ymm3,%ymm3
	vmovdqa	%ymm2,320-256(%rcx)
	vmovdqa	%ymm3,352-256(%rcx)

	vpshufd	$0x00,%ymm15,%ymm12
	vpshufd	$0x55,%ymm15,%ymm13
	vmovdqa	%ymm12,384-512(%rax)
	vpshufd	$0xaa,%ymm15,%ymm14
	vmovdqa	%ymm13,416-512(%rax)
	vpshufd	$0xff,%ymm15,%ymm15
	vmovdqa	%ymm14,448-512(%rax)
	vmovdqa	%ymm15,480-512(%rax)

	vpshufd	$0x00,%ymm7,%ymm4
	vpshufd	$0x55,%ymm7,%ymm5
	vpaddd	.Lincy(%rip),%ymm4,%ymm4
	vpshufd	$0xaa,%ymm7,%ymm6
	vmovdqa	%ymm5,544-512(%rax)
	vpshufd	$0xff,%ymm7,%ymm7
	vmovdqa	%ymm6,576-512(%rax)
	vmovdqa	%ymm7,608-512(%rax)

	jmp	.Loop_enter8x

.align	32
.Loop_outer8x:
	vmovdqa	128-256(%rcx),%ymm8
	vmovdqa	160-256(%rcx),%ymm9
	vmovdqa	192-256(%rcx),%ymm10
	vmovdqa	224-256(%rcx),%ymm11
	vmovdqa	256-256(%rcx),%ymm0
	vmovdqa	288-256(%rcx),%ymm1
	vmovdqa	320-256(%rcx),%ymm2
	vmovdqa	352-256(%rcx),%ymm3
	vmovdqa	384-512(%rax),%ymm12
	vmovdqa	416-512(%rax),%ymm13
	vmovdqa	448-512(%rax),%ymm14
	vmovdqa	480-512(%rax),%ymm15
	vmovdqa	512-512(%rax),%ymm4
	vmovdqa	544-512(%rax),%ymm5
	vmovdqa	576-512(%rax),%ymm6
	vmovdqa	608-512(%rax),%ymm7
	vpaddd	.Leight(%rip),%ymm4,%ymm4

.Loop_enter8x:
	vmovdqa	%ymm14,64(%rsp)
	vmovdqa	%ymm15,96(%rsp)
	vbroadcasti128	(%r10),%ymm15
	vmovdqa	%ymm4,512-512(%rax)
	movl	$10,%eax
	jmp	.Loop8x

.align	32
.Loop8x:
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$12,%ymm0,%ymm14
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$12,%ymm1,%ymm15
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vpaddd	%ymm0,%ymm8,%ymm8
	vpxor	%ymm4,%ymm8,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm1,%ymm9,%ymm9
	vpxor	%ymm5,%ymm9,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm4,%ymm12,%ymm12
	vpxor	%ymm0,%ymm12,%ymm0
	vpslld	$7,%ymm0,%ymm15
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm5,%ymm13,%ymm13
	vpxor	%ymm1,%ymm13,%ymm1
	vpslld	$7,%ymm1,%ymm14
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vmovdqa	%ymm12,0(%rsp)
	vmovdqa	%ymm13,32(%rsp)
	vmovdqa	64(%rsp),%ymm12
	vmovdqa	96(%rsp),%ymm13
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$12,%ymm2,%ymm14
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$12,%ymm3,%ymm15
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vpaddd	%ymm2,%ymm10,%ymm10
	vpxor	%ymm6,%ymm10,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm3,%ymm11,%ymm11
	vpxor	%ymm7,%ymm11,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm6,%ymm12,%ymm12
	vpxor	%ymm2,%ymm12,%ymm2
	vpslld	$7,%ymm2,%ymm15
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm7,%ymm13,%ymm13
	vpxor	%ymm3,%ymm13,%ymm3
	vpslld	$7,%ymm3,%ymm14
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm15,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm15,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$12,%ymm1,%ymm14
	vpsrld	$20,%ymm1,%ymm1
	vpor	%ymm1,%ymm14,%ymm1
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$12,%ymm2,%ymm15
	vpsrld	$20,%ymm2,%ymm2
	vpor	%ymm2,%ymm15,%ymm2
	vpaddd	%ymm1,%ymm8,%ymm8
	vpxor	%ymm7,%ymm8,%ymm7
	vpshufb	%ymm14,%ymm7,%ymm7
	vpaddd	%ymm2,%ymm9,%ymm9
	vpxor	%ymm4,%ymm9,%ymm4
	vpshufb	%ymm14,%ymm4,%ymm4
	vpaddd	%ymm7,%ymm12,%ymm12
	vpxor	%ymm1,%ymm12,%ymm1
	vpslld	$7,%ymm1,%ymm15
	vpsrld	$25,%ymm1,%ymm1
	vpor	%ymm1,%ymm15,%ymm1
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm4,%ymm13,%ymm13
	vpxor	%ymm2,%ymm13,%ymm2
	vpslld	$7,%ymm2,%ymm14
	vpsrld	$25,%ymm2,%ymm2
	vpor	%ymm2,%ymm14,%ymm2
	vmovdqa	%ymm12,64(%rsp)
	vmovdqa	%ymm13,96(%rsp)
	vmovdqa	0(%rsp),%ymm12
	vmovdqa	32(%rsp),%ymm13
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm15,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm15,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$12,%ymm3,%ymm14
	vpsrld	$20,%ymm3,%ymm3
	vpor	%ymm3,%ymm14,%ymm3
	vbroadcasti128	(%r11),%ymm14
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$12,%ymm0,%ymm15
	vpsrld	$20,%ymm0,%ymm0
	vpor	%ymm0,%ymm15,%ymm0
	vpaddd	%ymm3,%ymm10,%ymm10
	vpxor	%ymm5,%ymm10,%ymm5
	vpshufb	%ymm14,%ymm5,%ymm5
	vpaddd	%ymm0,%ymm11,%ymm11
	vpxor	%ymm6,%ymm11,%ymm6
	vpshufb	%ymm14,%ymm6,%ymm6
	vpaddd	%ymm5,%ymm12,%ymm12
	vpxor	%ymm3,%ymm12,%ymm3
	vpslld	$7,%ymm3,%ymm15
	vpsrld	$25,%ymm3,%ymm3
	vpor	%ymm3,%ymm15,%ymm3
	vbroadcasti128	(%r10),%ymm15
	vpaddd	%ymm6,%ymm13,%ymm13
	vpxor	%ymm0,%ymm13,%ymm0
	vpslld	$7,%ymm0,%ymm14
	vpsrld	$25,%ymm0,%ymm0
	vpor	%ymm0,%ymm14,%ymm0
	decl	%eax
	jnz	.Loop8x

	leaq	512(%rsp),%rax
	vpaddd	128-256(%rcx),%ymm8,%ymm8
	vpaddd	160-256(%rcx),%ymm9,%ymm9
	vpaddd	192-256(%rcx),%ymm10,%ymm10
	vpaddd	224-256(%rcx),%ymm11,%ymm11

	vpunpckldq	%ymm9,%ymm8,%ymm14
	vpunpckldq	%ymm11,%ymm10,%ymm15
	vpunpckhdq	%ymm9,%ymm8,%ymm8
	vpunpckhdq	%ymm11,%ymm10,%ymm10
	vpunpcklqdq	%ymm15,%ymm14,%ymm9
	vpunpckhqdq	%ymm15,%ymm14,%ymm14
	vpunpcklqdq	%ymm10,%ymm8,%ymm11
	vpunpckhqdq	%ymm10,%ymm8,%ymm8
	vpaddd	256-256(%rcx),%ymm0,%ymm0
	vpaddd	288-256(%rcx),%ymm1,%ymm1
	vpaddd	320-256(%rcx),%ymm2,%ymm2
	vpaddd	352-256(%rcx),%ymm3,%ymm3

	vpunpckldq	%ymm1,%ymm0,%ymm10
	vpunpckldq	%ymm3,%ymm2,%ymm15
	vpunpckhdq	%ymm1,%ymm0,%ymm0
	vpunpckhdq	%ymm3,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm10,%ymm1
	vpunpckhqdq	%ymm15,%ymm10,%ymm10
	vpunpcklqdq	%ymm2,%ymm0,%ymm3
	vpunpckhqdq	%ymm2,%ymm0,%ymm0
	vperm2i128	$0x20,%ymm1,%ymm9,%ymm15
	vperm2i128	$0x31,%ymm1,%ymm9,%ymm1
	vperm2i128	$0x20,%ymm10,%ymm14,%ymm9
	vperm2i128	$0x31,%ymm10,%ymm14,%ymm10
	vperm2i128	$0x20,%ymm3,%ymm11,%ymm14
	vperm2i128	$0x31,%ymm3,%ymm11,%ymm3
	vperm2i128	$0x20,%ymm0,%ymm8,%ymm11
	vperm2i128	$0x31,%ymm0,%ymm8,%ymm0
	vmovdqa	%ymm15,0(%rsp)
	vmovdqa	%ymm9,32(%rsp)
	vmovdqa	64(%rsp),%ymm15
	vmovdqa	96(%rsp),%ymm9

	vpaddd	384-512(%rax),%ymm12,%ymm12
	vpaddd	416-512(%rax),%ymm13,%ymm13
	vpaddd	448-512(%rax),%ymm15,%ymm15
	vpaddd	480-512(%rax),%ymm9,%ymm9

	vpunpckldq	%ymm13,%ymm12,%ymm2
	vpunpckldq	%ymm9,%ymm15,%ymm8
	vpunpckhdq	%ymm13,%ymm12,%ymm12
	vpunpckhdq	%ymm9,%ymm15,%ymm15
	vpunpcklqdq	%ymm8,%ymm2,%ymm13
	vpunpckhqdq	%ymm8,%ymm2,%ymm2
	vpunpcklqdq	%ymm15,%ymm12,%ymm9
	vpunpckhqdq	%ymm15,%ymm12,%ymm12
	vpaddd	512-512(%rax),%ymm4,%ymm4
	vpaddd	544-512(%rax),%ymm5,%ymm5
	vpaddd	576-512(%rax),%ymm6,%ymm6
	vpaddd	608-512(%rax),%ymm7,%ymm7

	vpunpckldq	%ymm5,%ymm4,%ymm15
	vpunpckldq	%ymm7,%ymm6,%ymm8
	vpunpckhdq	%ymm5,%ymm4,%ymm4
	vpunpckhdq	%ymm7,%ymm6,%ymm6
	vpunpcklqdq	%ymm8,%ymm15,%ymm5
	vpunpckhqdq	%ymm8,%ymm15,%ymm15
	vpunpcklqdq	%ymm6,%ymm4,%ymm7
	vpunpckhqdq	%ymm6,%ymm4,%ymm4
	vperm2i128	$0x20,%ymm5,%ymm13,%ymm8
	vperm2i128	$0x31,%ymm5,%ymm13,%ymm5
	vperm2i128	$0x20,%ymm15,%ymm2,%ymm13
	vperm2i128	$0x31,%ymm15,%ymm2,%ymm15
	vperm2i128	$0x20,%ymm7,%ymm9,%ymm2
	vperm2i128	$0x31,%ymm7,%ymm9,%ymm7
	vperm2i128	$0x20,%ymm4,%ymm12,%ymm9
	vperm2i128	$0x31,%ymm4,%ymm12,%ymm4
	vmovdqa	0(%rsp),%ymm6
	vmovdqa	32(%rsp),%ymm12

	cmpq	$512,%rdx
	jb	.Ltail8x

	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm12,%ymm12
	vpxor	32(%rsi),%ymm13,%ymm13
	vpxor	64(%rsi),%ymm10,%ymm10
	vpxor	96(%rsi),%ymm15,%ymm15
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm12,0(%rdi)
	vmovdqu	%ymm13,32(%rdi)
	vmovdqu	%ymm10,64(%rdi)
	vmovdqu	%ymm15,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm14,%ymm14
	vpxor	32(%rsi),%ymm2,%ymm2
	vpxor	64(%rsi),%ymm3,%ymm3
	vpxor	96(%rsi),%ymm7,%ymm7
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm14,0(%rdi)
	vmovdqu	%ymm2,32(%rdi)
	vmovdqu	%ymm3,64(%rdi)
	vmovdqu	%ymm7,96(%rdi)
	leaq	128(%rdi),%rdi

	vpxor	0(%rsi),%ymm11,%ymm11
	vpxor	32(%rsi),%ymm9,%ymm9
	vpxor	64(%rsi),%ymm0,%ymm0
	vpxor	96(%rsi),%ymm4,%ymm4
	leaq	128(%rsi),%rsi
	vmovdqu	%ymm11,0(%rdi)
	vmovdqu	%ymm9,32(%rdi)
	vmovdqu	%ymm0,64(%rdi)
	vmovdqu	%ymm4,96(%rdi)
	leaq	128(%rdi),%rdi

	subq	$512,%rdx
	jnz	.Loop_outer8x

	jmp	.Ldone8x

.Ltail8x:
	cmpq	$448,%rdx
	jae	.L448_or_more8x
	cmpq	$384,%rdx
	jae	.L384_or_more8x
	cmpq	$320,%rdx
	jae	.L320_or_more8x
	cmpq	$256,%rdx
	jae	.L256_or_more8x
	cmpq	$192,%rdx
	jae	.L192_or_more8x
	cmpq	$128,%rdx
	jae	.L128_or_more8x
	cmpq	$64,%rdx
	jae	.L64_or_more8x

	xorq	%r10,%r10
	vmovdqa	%ymm6,0(%rsp)
	vmovdqa	%ymm8,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L64_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	je	.Ldone8x

	leaq	64(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm1,0(%rsp)
	leaq	64(%rdi),%rdi
	subq	$64,%rdx
	vmovdqa	%ymm5,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L128_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	je	.Ldone8x

	leaq	128(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm12,0(%rsp)
	leaq	128(%rdi),%rdi
	subq	$128,%rdx
	vmovdqa	%ymm13,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L192_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	je	.Ldone8x

	leaq	192(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm10,0(%rsp)
	leaq	192(%rdi),%rdi
	subq	$192,%rdx
	vmovdqa	%ymm15,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L256_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	je	.Ldone8x

	leaq	256(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm14,0(%rsp)
	leaq	256(%rdi),%rdi
	subq	$256,%rdx
	vmovdqa	%ymm2,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L320_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	je	.Ldone8x

	leaq	320(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm3,0(%rsp)
	leaq	320(%rdi),%rdi
	subq	$320,%rdx
	vmovdqa	%ymm7,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L384_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	je	.Ldone8x

	leaq	384(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm11,0(%rsp)
	leaq	384(%rdi),%rdi
	subq	$384,%rdx
	vmovdqa	%ymm9,32(%rsp)
	jmp	.Loop_tail8x

.align	32
.L448_or_more8x:
	vpxor	0(%rsi),%ymm6,%ymm6
	vpxor	32(%rsi),%ymm8,%ymm8
	vpxor	64(%rsi),%ymm1,%ymm1
	vpxor	96(%rsi),%ymm5,%ymm5
	vpxor	128(%rsi),%ymm12,%ymm12
	vpxor	160(%rsi),%ymm13,%ymm13
	vpxor	192(%rsi),%ymm10,%ymm10
	vpxor	224(%rsi),%ymm15,%ymm15
	vpxor	256(%rsi),%ymm14,%ymm14
	vpxor	288(%rsi),%ymm2,%ymm2
	vpxor	320(%rsi),%ymm3,%ymm3
	vpxor	352(%rsi),%ymm7,%ymm7
	vpxor	384(%rsi),%ymm11,%ymm11
	vpxor	416(%rsi),%ymm9,%ymm9
	vmovdqu	%ymm6,0(%rdi)
	vmovdqu	%ymm8,32(%rdi)
	vmovdqu	%ymm1,64(%rdi)
	vmovdqu	%ymm5,96(%rdi)
	vmovdqu	%ymm12,128(%rdi)
	vmovdqu	%ymm13,160(%rdi)
	vmovdqu	%ymm10,192(%rdi)
	vmovdqu	%ymm15,224(%rdi)
	vmovdqu	%ymm14,256(%rdi)
	vmovdqu	%ymm2,288(%rdi)
	vmovdqu	%ymm3,320(%rdi)
	vmovdqu	%ymm7,352(%rdi)
	vmovdqu	%ymm11,384(%rdi)
	vmovdqu	%ymm9,416(%rdi)
	je	.Ldone8x

	leaq	448(%rsi),%rsi
	xorq	%r10,%r10
	vmovdqa	%ymm0,0(%rsp)
	leaq	448(%rdi),%rdi
	subq	$448,%rdx
	vmovdqa	%ymm4,32(%rsp)

.Loop_tail8x:
	movzbl	(%rsi,%r10,1),%eax
	movzbl	(%rsp,%r10,1),%ecx
	leaq	1(%r10),%r10
	xorl	%ecx,%eax
	movb	%al,-1(%rdi,%r10,1)
	decq	%rdx
	jnz	.Loop_tail8x

.Ldone8x:
	vzeroall
	leaq	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L8x_epilogue:
	.byte	0xf3,0xc3
.cfi_endproc
.size	ChaCha20_8x,.-ChaCha20_8x
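# GNU property note: type 0xc0000002 (GNU_PROPERTY_X86_FEATURE_1_AND)
# with value 3 marks the object as compatible with IBT and SHSTK (CET).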
	.section ".note.gnu.property", "a"
	.p2align 3
	.long 1f - 0f
	.long 4f - 1f
	.long 5
0:
	# "GNU" encoded with .byte, since .asciz isn't supported
	# on Solaris.
	.byte 0x47
	.byte 0x4e
	.byte 0x55
	.byte 0
1:
	.p2align 3
	.long 0xc0000002
	.long 3f - 2f
2:
	.long 3
3:
	.p2align 3
4:
