/* xref: /linux/arch/x86/crypto/chacha-avx2-x86_64.S (revision 2b64b2ed277ff23e785fbdb65098ee7e1252d64f) */
/*
 * ChaCha 256-bit cipher algorithm, x64 AVX2 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */
11
12#include <linux/linkage.h>
13
/*
 * 32-byte constants for the AVX2 ChaCha routines. Each table lives in its
 * own mergeable read-only section and is 32-byte aligned, so it can be
 * loaded with vmovdqa or used directly as an aligned memory operand.
 * The first .octa of a table is the low 128-bit lane of the YMM register,
 * the second .octa is the high lane.
 */

/* vpshufb mask: rotate every 32-bit word left by 8 bits */
.section	.rodata.cst32.ROT8, "aM", @progbits, 32
.align 32
ROT8:
	.octa 0x0e0d0c0f0a09080b0605040702010003
	.octa 0x0e0d0c0f0a09080b0605040702010003

/* vpshufb mask: rotate every 32-bit word left by 16 bits */
.section	.rodata.cst32.ROT16, "aM", @progbits, 32
.align 32
ROT16:
	.octa 0x0d0c0f0e09080b0a0504070601000302
	.octa 0x0d0c0f0e09080b0a0504070601000302

/* dwords 0..7: per-lane addend for word 12 in the eight-block routine */
.section	.rodata.cst32.CTRINC, "aM", @progbits, 32
.align 32
CTRINC:
	.octa 0x00000003000000020000000100000000
	.octa 0x00000007000000060000000500000004

/* adds 0 to dword 0 of the low lane and 1 to dword 0 of the high lane */
.section	.rodata.cst32.CTR2BL, "aM", @progbits, 32
.align 32
CTR2BL:
	.octa 0x00000000000000000000000000000000
	.octa 0x00000000000000000000000000000001

/* adds 2 to dword 0 of the low lane and 3 to dword 0 of the high lane */
.section	.rodata.cst32.CTR4BL, "aM", @progbits, 32
.align 32
CTR4BL:
	.octa 0x00000000000000000000000000000002
	.octa 0x00000000000000000000000000000003
38
39.text
40
/*
 * chacha_2block_xor_avx2 - XOR up to two blocks (128 bytes) of ChaCha
 * keystream into a data buffer.
 *
 * In:
 *   %rdi: Input state matrix, s (read as four 16-byte rows)
 *   %rsi: up to 2 data blocks output, o
 *   %rdx: up to 2 data blocks input, i
 *   %rcx: input/output length in bytes, treated as unsigned
 *   %r8d: nrounds; reduced by 2 per double round and the loop only exits
 *         when it reaches exactly zero, so it must be a non-zero even value
 *
 * The state matrix is loaded twice across four AVX registers (one block per
 * 128-bit lane, the high lane with its block counter word advanced by one
 * via CTR2BL). Matrix operations are done on four words of each matrix in
 * parallel, which requires shuffling to rearrange the words after each
 * round.
 *
 * The data is handled in 16-byte chunks. Complete chunks are XORed directly
 * from/to the caller buffers. A trailing partial chunk is bounced through a
 * 16-byte aligned stack buffer so that no byte beyond the given length is
 * read or written. At most 0x80 bytes are processed, even for a larger
 * length.
 *
 * Register roles:
 *   %ymm0-%ymm3   working state rows x0..x3
 *   %ymm4, %ymm5  ROT8 / ROT16 byte shuffle masks
 *   %ymm6, %ymm7  scratch
 *   %ymm8-%ymm11  copy of the initial state rows s0..s3 (counter applied)
 *   %rax          length in bytes
 *
 * Clobbers: %rax, %r8, %ymm0-%ymm11, flags; the partial chunk path also
 * clobbers %rcx, %rsi, %rdi, %r9, %r10 and %r11.
 */
ENTRY(chacha_2block_xor_avx2)
	vzeroupper

	# load every 16-byte state row into both 128-bit lanes
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	# advance the block counter of the high lane (second block) by one
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3

	# keep the initial state for the final feed-forward addition
	vmovdqa		%ymm0,%ymm8
	vmovdqa		%ymm1,%ymm9
	vmovdqa		%ymm2,%ymm10
	vmovdqa		%ymm3,%ymm11

	vmovdqa		ROT8(%rip),%ymm4
	vmovdqa		ROT16(%rip),%ymm5

	# rax = length; rcx is needed later as rep movsb count
	mov		%rcx,%rax

.Ldoubleround:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm5,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm6
	vpslld		$12,%ymm6,%ymm6
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm6,%ymm1,%ymm1

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm4,%ymm3,%ymm3

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm7
	vpslld		$7,%ymm7,%ymm7
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm7,%ymm1,%ymm1

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3

	sub		$2,%r8d
	jnz		.Ldoubleround

	# Output stage. Before every length check xmm7 holds the keystream
	# of the 16-byte chunk about to be written, which is what the
	# partial chunk path relies on. The length is an unsigned byte
	# count, hence the unsigned jb.

	# o0 = i0 ^ (x0 + s0)
	vpaddd		%ymm8,%ymm0,%ymm7
	cmp		$0x10,%rax
	jb		.Lxorpart2
	vpxor		0x00(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x00(%rsi)
	# keep the high lane (second block) for later
	vextracti128	$1,%ymm7,%xmm0
	# o1 = i1 ^ (x1 + s1)
	vpaddd		%ymm9,%ymm1,%ymm7
	cmp		$0x20,%rax
	jb		.Lxorpart2
	vpxor		0x10(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x10(%rsi)
	vextracti128	$1,%ymm7,%xmm1
	# o2 = i2 ^ (x2 + s2)
	vpaddd		%ymm10,%ymm2,%ymm7
	cmp		$0x30,%rax
	jb		.Lxorpart2
	vpxor		0x20(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x20(%rsi)
	vextracti128	$1,%ymm7,%xmm2
	# o3 = i3 ^ (x3 + s3)
	vpaddd		%ymm11,%ymm3,%ymm7
	cmp		$0x40,%rax
	jb		.Lxorpart2
	vpxor		0x30(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x30(%rsi)
	vextracti128	$1,%ymm7,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm7
	cmp		$0x50,%rax
	jb		.Lxorpart2
	vpxor		0x40(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x40(%rsi)

	vmovdqa		%xmm1,%xmm7
	cmp		$0x60,%rax
	jb		.Lxorpart2
	vpxor		0x50(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x50(%rsi)

	vmovdqa		%xmm2,%xmm7
	cmp		$0x70,%rax
	jb		.Lxorpart2
	vpxor		0x60(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x60(%rsi)

	vmovdqa		%xmm3,%xmm7
	cmp		$0x80,%rax
	jb		.Lxorpart2
	vpxor		0x70(%rdx),%xmm7,%xmm6
	vmovdqu		%xmm6,0x70(%rsi)

.Ldone2:
	vzeroupper
	ret

.Lxorpart2:
	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes, nothing to do when it is zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone2
	# rax = offset of the partial chunk
	and		$~0x0f,%rax

	# rsi is needed as rep movsb source, keep the output pointer in r11
	mov		%rsi,%r11

	# carve an aligned bounce buffer out of the stack, r10 remembers
	# the original stack pointer (plus 8)
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the bounce buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the keystream chunk
	vpxor		0x00(%rsp),%xmm7,%xmm7
	vmovdqa		%xmm7,0x00(%rsp)

	# copy only the valid bytes to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# restore the original stack pointer
	lea		-8(%r10),%rsp
	jmp		.Ldone2

ENDPROC(chacha_2block_xor_avx2)
232
/*
 * chacha_4block_xor_avx2 - XOR up to four blocks (256 bytes) of ChaCha
 * keystream into a data buffer.
 *
 * In:
 *   %rdi: Input state matrix, s (read as four 16-byte rows)
 *   %rsi: up to 4 data blocks output, o
 *   %rdx: up to 4 data blocks input, i
 *   %rcx: input/output length in bytes, treated as unsigned
 *   %r8d: nrounds; reduced by 2 per double round and the loop only exits
 *         when it reaches exactly zero, so it must be a non-zero even value
 *
 * The state matrix is loaded four times across eight AVX registers. Matrix
 * operations are performed on four words in two matrices in parallel,
 * sequentially to the operations on the four words of the other two
 * matrices. The required word shuffling has a rather high latency, so the
 * arithmetic on two matrix-pairs can be done without much slowdown.
 *
 * The data is handled in 16-byte chunks. Complete chunks are XORed directly
 * from/to the caller buffers. A trailing partial chunk is bounced through a
 * 16-byte aligned stack buffer so that no byte beyond the given length is
 * read or written. At most 0x100 bytes are processed, even for a larger
 * length.
 *
 * Register roles:
 *   %ymm0-%ymm3    working rows x0..x3 of blocks 0 (low lane), 1 (high lane)
 *   %ymm4-%ymm7    working rows x0..x3 of blocks 2 (low lane), 3 (high lane)
 *   %ymm8, %ymm9   ROT8 / ROT16 byte shuffle masks (ymm9 is reused as a
 *                  temporary in the output stage)
 *   %ymm10         scratch
 *   %ymm11-%ymm13  initial state rows s0..s2
 *   %ymm14, %ymm15 initial row s3 with the CTR2BL / CTR4BL counter offsets
 *   %rax           length in bytes
 *
 * Clobbers: %rax, %r8, %ymm0-%ymm15, flags; the partial chunk path also
 * clobbers %rcx, %rsi, %rdi, %r9, %r10 and %r11.
 */
ENTRY(chacha_4block_xor_avx2)
	vzeroupper

	# load every 16-byte state row into both 128-bit lanes
	vbroadcasti128	0x00(%rdi),%ymm0
	vbroadcasti128	0x10(%rdi),%ymm1
	vbroadcasti128	0x20(%rdi),%ymm2
	vbroadcasti128	0x30(%rdi),%ymm3

	# second register set for blocks 2 and 3
	vmovdqa		%ymm0,%ymm4
	vmovdqa		%ymm1,%ymm5
	vmovdqa		%ymm2,%ymm6
	vmovdqa		%ymm3,%ymm7

	# block counter offsets: 0/1 for the first set, 2/3 for the second
	vpaddd		CTR2BL(%rip),%ymm3,%ymm3
	vpaddd		CTR4BL(%rip),%ymm7,%ymm7

	# keep the initial state for the final feed-forward addition
	vmovdqa		%ymm0,%ymm11
	vmovdqa		%ymm1,%ymm12
	vmovdqa		%ymm2,%ymm13
	vmovdqa		%ymm3,%ymm14
	vmovdqa		%ymm7,%ymm15

	vmovdqa		ROT8(%rip),%ymm8
	vmovdqa		ROT16(%rip),%ymm9

	# rax = length; rcx is needed later as rep movsb count
	mov		%rcx,%rax

.Ldoubleround4:

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm1,%ymm1
	vpshufd		$0x39,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm3,%ymm3
	vpshufd		$0x93,%ymm7,%ymm7

	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm9,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm9,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$12,%ymm10,%ymm10
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vpaddd		%ymm1,%ymm0,%ymm0
	vpxor		%ymm0,%ymm3,%ymm3
	vpshufb		%ymm8,%ymm3,%ymm3

	vpaddd		%ymm5,%ymm4,%ymm4
	vpxor		%ymm4,%ymm7,%ymm7
	vpshufb		%ymm8,%ymm7,%ymm7

	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vpaddd		%ymm3,%ymm2,%ymm2
	vpxor		%ymm2,%ymm1,%ymm1
	vmovdqa		%ymm1,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm1,%ymm1
	vpor		%ymm10,%ymm1,%ymm1

	vpaddd		%ymm7,%ymm6,%ymm6
	vpxor		%ymm6,%ymm5,%ymm5
	vmovdqa		%ymm5,%ymm10
	vpslld		$7,%ymm10,%ymm10
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm10,%ymm5,%ymm5

	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vpshufd		$0x93,%ymm1,%ymm1
	vpshufd		$0x93,%ymm5,%ymm5
	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vpshufd		$0x4e,%ymm2,%ymm2
	vpshufd		$0x4e,%ymm6,%ymm6
	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vpshufd		$0x39,%ymm3,%ymm3
	vpshufd		$0x39,%ymm7,%ymm7

	sub		$2,%r8d
	jnz		.Ldoubleround4

	# Output stage. Before every length check xmm10 holds the keystream
	# of the 16-byte chunk about to be written, which is what the
	# partial chunk path relies on. The length is an unsigned byte
	# count, hence the unsigned jb. xmm9 is a temporary from here on.

	# o0 = i0 ^ (x0 + s0), first block
	vpaddd		%ymm11,%ymm0,%ymm10
	cmp		$0x10,%rax
	jb		.Lxorpart4
	vpxor		0x00(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x00(%rsi)
	# keep the high lane (second block) for later
	vextracti128	$1,%ymm10,%xmm0
	# o1 = i1 ^ (x1 + s1), first block
	vpaddd		%ymm12,%ymm1,%ymm10
	cmp		$0x20,%rax
	jb		.Lxorpart4
	vpxor		0x10(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x10(%rsi)
	vextracti128	$1,%ymm10,%xmm1
	# o2 = i2 ^ (x2 + s2), first block
	vpaddd		%ymm13,%ymm2,%ymm10
	cmp		$0x30,%rax
	jb		.Lxorpart4
	vpxor		0x20(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x20(%rsi)
	vextracti128	$1,%ymm10,%xmm2
	# o3 = i3 ^ (x3 + s3), first block
	vpaddd		%ymm14,%ymm3,%ymm10
	cmp		$0x40,%rax
	jb		.Lxorpart4
	vpxor		0x30(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x30(%rsi)
	vextracti128	$1,%ymm10,%xmm3

	# xor and write second block
	vmovdqa		%xmm0,%xmm10
	cmp		$0x50,%rax
	jb		.Lxorpart4
	vpxor		0x40(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x40(%rsi)

	vmovdqa		%xmm1,%xmm10
	cmp		$0x60,%rax
	jb		.Lxorpart4
	vpxor		0x50(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x50(%rsi)

	vmovdqa		%xmm2,%xmm10
	cmp		$0x70,%rax
	jb		.Lxorpart4
	vpxor		0x60(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x60(%rsi)

	vmovdqa		%xmm3,%xmm10
	cmp		$0x80,%rax
	jb		.Lxorpart4
	vpxor		0x70(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x70(%rsi)

	# o0 = i0 ^ (x0 + s0), third block
	vpaddd		%ymm11,%ymm4,%ymm10
	cmp		$0x90,%rax
	jb		.Lxorpart4
	vpxor		0x80(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x80(%rsi)
	# keep the high lane (fourth block) for later
	vextracti128	$1,%ymm10,%xmm4
	# o1 = i1 ^ (x1 + s1), third block
	vpaddd		%ymm12,%ymm5,%ymm10
	cmp		$0xa0,%rax
	jb		.Lxorpart4
	vpxor		0x90(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0x90(%rsi)
	vextracti128	$1,%ymm10,%xmm5
	# o2 = i2 ^ (x2 + s2), third block
	vpaddd		%ymm13,%ymm6,%ymm10
	cmp		$0xb0,%rax
	jb		.Lxorpart4
	vpxor		0xa0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xa0(%rsi)
	vextracti128	$1,%ymm10,%xmm6
	# o3 = i3 ^ (x3 + s3), third block
	vpaddd		%ymm15,%ymm7,%ymm10
	cmp		$0xc0,%rax
	jb		.Lxorpart4
	vpxor		0xb0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xb0(%rsi)
	vextracti128	$1,%ymm10,%xmm7

	# xor and write fourth block
	vmovdqa		%xmm4,%xmm10
	cmp		$0xd0,%rax
	jb		.Lxorpart4
	vpxor		0xc0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xc0(%rsi)

	vmovdqa		%xmm5,%xmm10
	cmp		$0xe0,%rax
	jb		.Lxorpart4
	vpxor		0xd0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xd0(%rsi)

	vmovdqa		%xmm6,%xmm10
	cmp		$0xf0,%rax
	jb		.Lxorpart4
	vpxor		0xe0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xe0(%rsi)

	vmovdqa		%xmm7,%xmm10
	cmp		$0x100,%rax
	jb		.Lxorpart4
	vpxor		0xf0(%rdx),%xmm10,%xmm9
	vmovdqu		%xmm9,0xf0(%rsi)

.Ldone4:
	vzeroupper
	ret

.Lxorpart4:
	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes, nothing to do when it is zero
	mov		%rax,%r9
	and		$0x0f,%r9
	jz		.Ldone4
	# rax = offset of the partial chunk
	and		$~0x0f,%rax

	# rsi is needed as rep movsb source, keep the output pointer in r11
	mov		%rsi,%r11

	# carve an aligned bounce buffer out of the stack, r10 remembers
	# the original stack pointer (plus 8)
	lea		8(%rsp),%r10
	sub		$0x10,%rsp
	and		$~31,%rsp

	# copy the trailing input bytes into the bounce buffer
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the keystream chunk
	vpxor		0x00(%rsp),%xmm10,%xmm10
	vmovdqa		%xmm10,0x00(%rsp)

	# copy only the valid bytes to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	# restore the original stack pointer
	lea		-8(%r10),%rsp
	jmp		.Ldone4

ENDPROC(chacha_4block_xor_avx2)
537
/*
 * chacha_8block_xor_avx2 - XOR up to eight blocks (512 bytes) of ChaCha
 * keystream into a data buffer.
 *
 * In:
 *   %rdi: Input state matrix, s (read as sixteen 32-bit words)
 *   %rsi: up to 8 data blocks output, o
 *   %rdx: up to 8 data blocks input, i
 *   %rcx: input/output length in bytes, treated as unsigned
 *   %r8d: nrounds; reduced by 2 per double round and the loop only exits
 *         when it reaches exactly zero, so it must be a non-zero even value
 *
 * This function encrypts eight consecutive ChaCha blocks by loading
 * the state matrix in AVX registers eight times. As we need some
 * scratch registers, we save the first four registers on the stack. The
 * algorithm performs each operation on the corresponding word of each
 * state matrix, hence requires no word shuffling. For final XORing step
 * we transpose the matrix by interleaving 32-, 64- and then 128-bit
 * words, which allows us to do XOR in AVX registers. 8/16-bit word
 * rotation is done with the slightly better performing byte shuffling,
 * 7/12-bit word rotation uses traditional shift+OR.
 *
 * The data is handled in 32-byte chunks. Complete chunks are XORed directly
 * from/to the caller buffers. A trailing partial chunk is bounced through
 * the first 32 bytes of the stack frame so that no byte beyond the given
 * length is read or written. At most 0x200 bytes are processed, even for a
 * larger length.
 *
 * Register roles during the rounds:
 *   0x00/0x20/0x40/0x60(%rsp)  x0..x3
 *   %ymm0          scratch
 *   %ymm1          CTRINC (per-lane counter offsets 0..7)
 *   %ymm2, %ymm3   ROT8 / ROT16 byte shuffle masks
 *   %ymm4-%ymm15   x4..x15
 *   %rax           length in bytes
 *   %r10           original stack pointer plus 8
 *
 * Clobbers: %rax, %r8, %r10, %ymm0-%ymm15, flags; the partial chunk path
 * also clobbers %rcx, %rsi, %rdi, %r9 and %r11.
 */
ENTRY(chacha_8block_xor_avx2)
	vzeroupper
	# reserve 4 * 32 bytes of stack, 32-byte aligned
	lea		8(%rsp),%r10
	and		$~31, %rsp
	sub		$0x80, %rsp
	# rax = length; rcx is needed later as rep movsb count
	mov		%rcx,%rax

	# x0..15[0-7] = s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpbroadcastd	0x04(%rdi),%ymm1
	vpbroadcastd	0x08(%rdi),%ymm2
	vpbroadcastd	0x0c(%rdi),%ymm3
	vpbroadcastd	0x10(%rdi),%ymm4
	vpbroadcastd	0x14(%rdi),%ymm5
	vpbroadcastd	0x18(%rdi),%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm7
	vpbroadcastd	0x20(%rdi),%ymm8
	vpbroadcastd	0x24(%rdi),%ymm9
	vpbroadcastd	0x28(%rdi),%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm11
	vpbroadcastd	0x30(%rdi),%ymm12
	vpbroadcastd	0x34(%rdi),%ymm13
	vpbroadcastd	0x38(%rdi),%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm15
	# x0..3 on stack
	vmovdqa		%ymm0,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm3,0x60(%rsp)

	vmovdqa		CTRINC(%rip),%ymm1
	vmovdqa		ROT8(%rip),%ymm2
	vmovdqa		ROT16(%rip),%ymm3

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12

.Ldoubleround8:
	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14
	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15

	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	vpaddd		%ymm12,%ymm8,%ymm8
	vpxor		%ymm8,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4
	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	vpaddd		%ymm13,%ymm9,%ymm9
	vpxor		%ymm9,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	vpaddd		%ymm14,%ymm10,%ymm10
	vpxor		%ymm10,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vpaddd		%ymm15,%ymm11,%ymm11
	vpxor		%ymm11,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7

	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm3,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm3,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm3,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm3,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$12,%ymm5,%ymm0
	vpsrld		$20,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$12,%ymm6,%ymm0
	vpsrld		$20,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$12,%ymm7,%ymm0
	vpsrld		$20,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$12,%ymm4,%ymm0
	vpsrld		$20,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	vpaddd		0x00(%rsp),%ymm5,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpxor		%ymm0,%ymm15,%ymm15
	vpshufb		%ymm2,%ymm15,%ymm15
	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	vpaddd		0x20(%rsp),%ymm6,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpxor		%ymm0,%ymm12,%ymm12
	vpshufb		%ymm2,%ymm12,%ymm12
	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	vpaddd		0x40(%rsp),%ymm7,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpxor		%ymm0,%ymm13,%ymm13
	vpshufb		%ymm2,%ymm13,%ymm13
	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vpaddd		0x60(%rsp),%ymm4,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpxor		%ymm0,%ymm14,%ymm14
	vpshufb		%ymm2,%ymm14,%ymm14

	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	vpaddd		%ymm15,%ymm10,%ymm10
	vpxor		%ymm10,%ymm5,%ymm5
	vpslld		$7,%ymm5,%ymm0
	vpsrld		$25,%ymm5,%ymm5
	vpor		%ymm0,%ymm5,%ymm5
	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	vpaddd		%ymm12,%ymm11,%ymm11
	vpxor		%ymm11,%ymm6,%ymm6
	vpslld		$7,%ymm6,%ymm0
	vpsrld		$25,%ymm6,%ymm6
	vpor		%ymm0,%ymm6,%ymm6
	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	vpaddd		%ymm13,%ymm8,%ymm8
	vpxor		%ymm8,%ymm7,%ymm7
	vpslld		$7,%ymm7,%ymm0
	vpsrld		$25,%ymm7,%ymm7
	vpor		%ymm0,%ymm7,%ymm7
	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vpaddd		%ymm14,%ymm9,%ymm9
	vpxor		%ymm9,%ymm4,%ymm4
	vpslld		$7,%ymm4,%ymm0
	vpsrld		$25,%ymm4,%ymm4
	vpor		%ymm0,%ymm4,%ymm4

	sub		$2,%r8d
	jnz		.Ldoubleround8

	# x0..15[0-7] += s[0..15]
	vpbroadcastd	0x00(%rdi),%ymm0
	vpaddd		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)
	vpbroadcastd	0x04(%rdi),%ymm0
	vpaddd		0x20(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x20(%rsp)
	vpbroadcastd	0x08(%rdi),%ymm0
	vpaddd		0x40(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x40(%rsp)
	vpbroadcastd	0x0c(%rdi),%ymm0
	vpaddd		0x60(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x60(%rsp)
	vpbroadcastd	0x10(%rdi),%ymm0
	vpaddd		%ymm0,%ymm4,%ymm4
	vpbroadcastd	0x14(%rdi),%ymm0
	vpaddd		%ymm0,%ymm5,%ymm5
	vpbroadcastd	0x18(%rdi),%ymm0
	vpaddd		%ymm0,%ymm6,%ymm6
	vpbroadcastd	0x1c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm7,%ymm7
	vpbroadcastd	0x20(%rdi),%ymm0
	vpaddd		%ymm0,%ymm8,%ymm8
	vpbroadcastd	0x24(%rdi),%ymm0
	vpaddd		%ymm0,%ymm9,%ymm9
	vpbroadcastd	0x28(%rdi),%ymm0
	vpaddd		%ymm0,%ymm10,%ymm10
	vpbroadcastd	0x2c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm11,%ymm11
	vpbroadcastd	0x30(%rdi),%ymm0
	vpaddd		%ymm0,%ymm12,%ymm12
	vpbroadcastd	0x34(%rdi),%ymm0
	vpaddd		%ymm0,%ymm13,%ymm13
	vpbroadcastd	0x38(%rdi),%ymm0
	vpaddd		%ymm0,%ymm14,%ymm14
	vpbroadcastd	0x3c(%rdi),%ymm0
	vpaddd		%ymm0,%ymm15,%ymm15

	# x12 += counter values 0-7
	vpaddd		%ymm1,%ymm12,%ymm12

	# interleave 32-bit words in state n, n+1
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x20(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x00(%rsp)
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		0x40(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm1
	vpunpckldq	%ymm1,%ymm0,%ymm2
	vpunpckhdq	%ymm1,%ymm0,%ymm1
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		%ymm1,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpckldq	%ymm5,%ymm0,%ymm4
	vpunpckhdq	%ymm5,%ymm0,%ymm5
	vmovdqa		%ymm6,%ymm0
	vpunpckldq	%ymm7,%ymm0,%ymm6
	vpunpckhdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpckldq	%ymm9,%ymm0,%ymm8
	vpunpckhdq	%ymm9,%ymm0,%ymm9
	vmovdqa		%ymm10,%ymm0
	vpunpckldq	%ymm11,%ymm0,%ymm10
	vpunpckhdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpckldq	%ymm13,%ymm0,%ymm12
	vpunpckhdq	%ymm13,%ymm0,%ymm13
	vmovdqa		%ymm14,%ymm0
	vpunpckldq	%ymm15,%ymm0,%ymm14
	vpunpckhdq	%ymm15,%ymm0,%ymm15

	# interleave 64-bit words in state n, n+2
	vmovdqa		0x00(%rsp),%ymm0
	vmovdqa		0x40(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x00(%rsp)
	vmovdqa		%ymm2,0x40(%rsp)
	vmovdqa		0x20(%rsp),%ymm0
	vmovdqa		0x60(%rsp),%ymm2
	vpunpcklqdq	%ymm2,%ymm0,%ymm1
	vpunpckhqdq	%ymm2,%ymm0,%ymm2
	vmovdqa		%ymm1,0x20(%rsp)
	vmovdqa		%ymm2,0x60(%rsp)
	vmovdqa		%ymm4,%ymm0
	vpunpcklqdq	%ymm6,%ymm0,%ymm4
	vpunpckhqdq	%ymm6,%ymm0,%ymm6
	vmovdqa		%ymm5,%ymm0
	vpunpcklqdq	%ymm7,%ymm0,%ymm5
	vpunpckhqdq	%ymm7,%ymm0,%ymm7
	vmovdqa		%ymm8,%ymm0
	vpunpcklqdq	%ymm10,%ymm0,%ymm8
	vpunpckhqdq	%ymm10,%ymm0,%ymm10
	vmovdqa		%ymm9,%ymm0
	vpunpcklqdq	%ymm11,%ymm0,%ymm9
	vpunpckhqdq	%ymm11,%ymm0,%ymm11
	vmovdqa		%ymm12,%ymm0
	vpunpcklqdq	%ymm14,%ymm0,%ymm12
	vpunpckhqdq	%ymm14,%ymm0,%ymm14
	vmovdqa		%ymm13,%ymm0
	vpunpcklqdq	%ymm15,%ymm0,%ymm13
	vpunpckhqdq	%ymm15,%ymm0,%ymm15

	# Output stage. Before every length check ymm0 holds the keystream
	# of the 32-byte chunk about to be written, which is what the
	# partial chunk path relies on. The length is an unsigned byte
	# count, hence the unsigned jb.

	# interleave 128-bit words in state n, n+4
	# xor/write first four blocks
	vmovdqa		0x00(%rsp),%ymm1
	vperm2i128	$0x20,%ymm4,%ymm1,%ymm0
	cmp		$0x0020,%rax
	jb		.Lxorpart8
	vpxor		0x0000(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0000(%rsi)
	# high lanes are kept in ymm4 for the remaining blocks
	vperm2i128	$0x31,%ymm4,%ymm1,%ymm4

	vperm2i128	$0x20,%ymm12,%ymm8,%ymm0
	cmp		$0x0040,%rax
	jb		.Lxorpart8
	vpxor		0x0020(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0020(%rsi)
	vperm2i128	$0x31,%ymm12,%ymm8,%ymm12

	vmovdqa		0x40(%rsp),%ymm1
	vperm2i128	$0x20,%ymm6,%ymm1,%ymm0
	cmp		$0x0060,%rax
	jb		.Lxorpart8
	vpxor		0x0040(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0040(%rsi)
	vperm2i128	$0x31,%ymm6,%ymm1,%ymm6

	vperm2i128	$0x20,%ymm14,%ymm10,%ymm0
	cmp		$0x0080,%rax
	jb		.Lxorpart8
	vpxor		0x0060(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0060(%rsi)
	vperm2i128	$0x31,%ymm14,%ymm10,%ymm14

	vmovdqa		0x20(%rsp),%ymm1
	vperm2i128	$0x20,%ymm5,%ymm1,%ymm0
	cmp		$0x00a0,%rax
	jb		.Lxorpart8
	vpxor		0x0080(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0080(%rsi)
	vperm2i128	$0x31,%ymm5,%ymm1,%ymm5

	vperm2i128	$0x20,%ymm13,%ymm9,%ymm0
	cmp		$0x00c0,%rax
	jb		.Lxorpart8
	vpxor		0x00a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00a0(%rsi)
	vperm2i128	$0x31,%ymm13,%ymm9,%ymm13

	vmovdqa		0x60(%rsp),%ymm1
	vperm2i128	$0x20,%ymm7,%ymm1,%ymm0
	cmp		$0x00e0,%rax
	jb		.Lxorpart8
	vpxor		0x00c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00c0(%rsi)
	vperm2i128	$0x31,%ymm7,%ymm1,%ymm7

	vperm2i128	$0x20,%ymm15,%ymm11,%ymm0
	cmp		$0x0100,%rax
	jb		.Lxorpart8
	vpxor		0x00e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x00e0(%rsi)
	vperm2i128	$0x31,%ymm15,%ymm11,%ymm15

	# xor remaining blocks, write to output
	vmovdqa		%ymm4,%ymm0
	cmp		$0x0120,%rax
	jb		.Lxorpart8
	vpxor		0x0100(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0100(%rsi)

	vmovdqa		%ymm12,%ymm0
	cmp		$0x0140,%rax
	jb		.Lxorpart8
	vpxor		0x0120(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0120(%rsi)

	vmovdqa		%ymm6,%ymm0
	cmp		$0x0160,%rax
	jb		.Lxorpart8
	vpxor		0x0140(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0140(%rsi)

	vmovdqa		%ymm14,%ymm0
	cmp		$0x0180,%rax
	jb		.Lxorpart8
	vpxor		0x0160(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0160(%rsi)

	vmovdqa		%ymm5,%ymm0
	cmp		$0x01a0,%rax
	jb		.Lxorpart8
	vpxor		0x0180(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x0180(%rsi)

	vmovdqa		%ymm13,%ymm0
	cmp		$0x01c0,%rax
	jb		.Lxorpart8
	vpxor		0x01a0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01a0(%rsi)

	vmovdqa		%ymm7,%ymm0
	cmp		$0x01e0,%rax
	jb		.Lxorpart8
	vpxor		0x01c0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01c0(%rsi)

	vmovdqa		%ymm15,%ymm0
	cmp		$0x0200,%rax
	jb		.Lxorpart8
	vpxor		0x01e0(%rdx),%ymm0,%ymm0
	vmovdqu		%ymm0,0x01e0(%rsi)

.Ldone8:
	vzeroupper
	# drop the aligned frame, r10 holds the original rsp plus 8
	lea		-8(%r10),%rsp
	ret

.Lxorpart8:
	# xor remaining bytes from partial register into output
	# r9 = number of trailing bytes, nothing to do when it is zero
	mov		%rax,%r9
	and		$0x1f,%r9
	jz		.Ldone8
	# rax = offset of the partial chunk
	and		$~0x1f,%rax

	# rsi is needed as rep movsb source, keep the output pointer in r11
	mov		%rsi,%r11

	# copy the trailing input bytes into the first 32 bytes of the
	# frame; the state words stored there are no longer needed
	lea		(%rdx,%rax),%rsi
	mov		%rsp,%rdi
	mov		%r9,%rcx
	rep movsb

	# xor the whole buffer with the keystream chunk
	vpxor		0x00(%rsp),%ymm0,%ymm0
	vmovdqa		%ymm0,0x00(%rsp)

	# copy only the valid bytes to the output
	mov		%rsp,%rsi
	lea		(%r11,%rax),%rdi
	mov		%r9,%rcx
	rep movsb

	jmp		.Ldone8

ENDPROC(chacha_8block_xor_avx2)
1026