xref: /linux/arch/x86/crypto/chacha-ssse3-x86_64.S (revision 2b64b2ed277ff23e785fbdb65098ee7e1252d64f)
1/*
2 * ChaCha 256-bit cipher algorithm, x64 SSSE3 functions
3 *
4 * Copyright (C) 2015 Martin Willi
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11
12#include <linux/linkage.h>
13#include <asm/frame.h>
14
15.section	.rodata.cst16.ROT8, "aM", @progbits, 16
16.align 16
17ROT8:	.octa 0x0e0d0c0f0a09080b0605040702010003
18.section	.rodata.cst16.ROT16, "aM", @progbits, 16
19.align 16
20ROT16:	.octa 0x0d0c0f0e09080b0a0504070601000302
21.section	.rodata.cst16.CTRINC, "aM", @progbits, 16
22.align 16
23CTRINC:	.octa 0x00000003000000020000000100000000
24
25.text
26
27/*
28 * chacha_permute - permute one block
29 *
30 * Permute one 64-byte block where the state matrix is in %xmm0-%xmm3.  This
31 * function performs matrix operations on four words in parallel, but requires
32 * shuffling to rearrange the words after each round.  8/16-bit word rotation is
33 * done with the slightly better performing SSSE3 byte shuffling, 7/12-bit word
34 * rotation uses traditional shift+OR.
35 *
36 * The round count is given in %r8d.
 * Each pass of the loop below performs two rounds and subtracts 2 from %r8d;
 * the loop exits only when the count reaches exactly zero, so the caller must
 * pass a non-zero, even round count.
37 *
 * In/out: %xmm0-%xmm3 hold the four state rows and are permuted in place.
 * The original state is NOT added back here; a caller that needs that step
 * has to keep its own copy and do the addition after the call.
 *
38 * Clobbers: %r8d, %xmm4-%xmm7
 * (the arithmetic flags are modified as well, by the loop counter update)
39 */
40chacha_permute:
41
	# Load the pshufb masks once; %xmm4 (rotate by 8) and %xmm5 (rotate by
	# 16) stay constant for the whole loop.
42	movdqa		ROT8(%rip),%xmm4
43	movdqa		ROT16(%rip),%xmm5
44
45.Ldoubleround:
	# First round of the pair.  Lane j of %xmm0-%xmm3 forms one column, so
	# all four column quarter-rounds are computed at once.  %xmm6/%xmm7 are
	# scratch for the shift+OR rotations.
46	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
47	paddd		%xmm1,%xmm0
48	pxor		%xmm0,%xmm3
49	pshufb		%xmm5,%xmm3
50
51	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
52	paddd		%xmm3,%xmm2
53	pxor		%xmm2,%xmm1
54	movdqa		%xmm1,%xmm6
55	pslld		$12,%xmm6
56	psrld		$20,%xmm1
57	por		%xmm6,%xmm1
58
59	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
60	paddd		%xmm1,%xmm0
61	pxor		%xmm0,%xmm3
62	pshufb		%xmm4,%xmm3
63
64	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
65	paddd		%xmm3,%xmm2
66	pxor		%xmm2,%xmm1
67	movdqa		%xmm1,%xmm7
68	pslld		$7,%xmm7
69	psrld		$25,%xmm1
70	por		%xmm7,%xmm1
71
	# Rotate rows 1, 2 and 3 by one, two and three lanes respectively so
	# that the words of each diagonal end up in the same lane; the second
	# round can then reuse the exact same instruction sequence.
72	# x1 = shuffle32(x1, MASK(0, 3, 2, 1))
73	pshufd		$0x39,%xmm1,%xmm1
74	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
75	pshufd		$0x4e,%xmm2,%xmm2
76	# x3 = shuffle32(x3, MASK(2, 1, 0, 3))
77	pshufd		$0x93,%xmm3,%xmm3
78
	# Second round of the pair, operating on the diagonals.
79	# x0 += x1, x3 = rotl32(x3 ^ x0, 16)
80	paddd		%xmm1,%xmm0
81	pxor		%xmm0,%xmm3
82	pshufb		%xmm5,%xmm3
83
84	# x2 += x3, x1 = rotl32(x1 ^ x2, 12)
85	paddd		%xmm3,%xmm2
86	pxor		%xmm2,%xmm1
87	movdqa		%xmm1,%xmm6
88	pslld		$12,%xmm6
89	psrld		$20,%xmm1
90	por		%xmm6,%xmm1
91
92	# x0 += x1, x3 = rotl32(x3 ^ x0, 8)
93	paddd		%xmm1,%xmm0
94	pxor		%xmm0,%xmm3
95	pshufb		%xmm4,%xmm3
96
97	# x2 += x3, x1 = rotl32(x1 ^ x2, 7)
98	paddd		%xmm3,%xmm2
99	pxor		%xmm2,%xmm1
100	movdqa		%xmm1,%xmm7
101	pslld		$7,%xmm7
102	psrld		$25,%xmm1
103	por		%xmm7,%xmm1
104
	# Undo the lane rotation (rows 1 and 3 use the inverse immediates of
	# the shuffle above) so the rows are back in their original layout.
105	# x1 = shuffle32(x1, MASK(2, 1, 0, 3))
106	pshufd		$0x93,%xmm1,%xmm1
107	# x2 = shuffle32(x2, MASK(1, 0, 3, 2))
108	pshufd		$0x4e,%xmm2,%xmm2
109	# x3 = shuffle32(x3, MASK(0, 3, 2, 1))
110	pshufd		$0x39,%xmm3,%xmm3
111
	# Two rounds done; repeat until the round counter hits zero.
112	sub		$2,%r8d
113	jnz		.Ldoubleround
114
115	ret
116ENDPROC(chacha_permute)
117
118ENTRY(chacha_block_xor_ssse3)
119	# %rdi: Input state matrix, s
120	# %rsi: up to 1 data block output, o
121	# %rdx: up to 1 data block input, i
122	# %rcx: input/output length in bytes
123	# %r8d: nrounds
	#
	# Computes one 64-byte keystream block from s (permutation followed by
	# adding the original state back) and XORs it with i into o, 16 bytes
	# at a time, stopping after the requested length (at most 64 bytes are
	# processed).  A trailing chunk shorter than 16 bytes is handled via a
	# bounce buffer on the stack in .Lxorpart.
	#
	# s is only read here; it is loaded with movdqa and therefore has to be
	# 16-byte aligned.  i and o are accessed with movdqu / rep movsb and
	# may be unaligned.
124	FRAME_BEGIN
125
126	# x0..3 = s0..3
127	movdqa		0x00(%rdi),%xmm0
128	movdqa		0x10(%rdi),%xmm1
129	movdqa		0x20(%rdi),%xmm2
130	movdqa		0x30(%rdi),%xmm3
	# Keep an unpermuted copy of the state in %xmm8-%xmm11 for the
	# additions after the permutation.
131	movdqa		%xmm0,%xmm8
132	movdqa		%xmm1,%xmm9
133	movdqa		%xmm2,%xmm10
134	movdqa		%xmm3,%xmm11
135
	# Track the length in %rax; %rcx is needed as the rep movsb counter in
	# .Lxorpart.
136	mov		%rcx,%rax
137	call		chacha_permute
138
	# For every 16-byte row: form the keystream row, and if the requested
	# length does not cover the whole row, branch to .Lxorpart with that
	# keystream row in %xmm0.
	# NOTE(review): jl is a signed comparison on the full 64-bit %rax; this
	# presumably relies on the length being a small non-negative value with
	# clean upper bits - confirm against the C prototype and callers.
139	# o0 = i0 ^ (x0 + s0)
140	paddd		%xmm8,%xmm0
141	cmp		$0x10,%rax
142	jl		.Lxorpart
143	movdqu		0x00(%rdx),%xmm4
144	pxor		%xmm4,%xmm0
145	movdqu		%xmm0,0x00(%rsi)
146	# o1 = i1 ^ (x1 + s1)
147	paddd		%xmm9,%xmm1
	# copy the keystream row to %xmm0, where .Lxorpart expects it
148	movdqa		%xmm1,%xmm0
149	cmp		$0x20,%rax
150	jl		.Lxorpart
151	movdqu		0x10(%rdx),%xmm0
152	pxor		%xmm1,%xmm0
153	movdqu		%xmm0,0x10(%rsi)
154	# o2 = i2 ^ (x2 + s2)
155	paddd		%xmm10,%xmm2
156	movdqa		%xmm2,%xmm0
157	cmp		$0x30,%rax
158	jl		.Lxorpart
159	movdqu		0x20(%rdx),%xmm0
160	pxor		%xmm2,%xmm0
161	movdqu		%xmm0,0x20(%rsi)
162	# o3 = i3 ^ (x3 + s3)
163	paddd		%xmm11,%xmm3
164	movdqa		%xmm3,%xmm0
165	cmp		$0x40,%rax
166	jl		.Lxorpart
167	movdqu		0x30(%rdx),%xmm0
168	pxor		%xmm3,%xmm0
169	movdqu		%xmm0,0x30(%rsi)
170
171.Ldone:
172	FRAME_END
173	ret
174
175.Lxorpart:
176	# xor remaining bytes from partial register into output
	# On entry: %rax = total length, %xmm0 = keystream for the partial row.
	# %r9 = number of trailing bytes (length mod 16); nothing to do if 0.
177	mov		%rax,%r9
178	and		$0x0f,%r9
179	jz		.Ldone
	# %rax = byte offset of the partial row (length rounded down to 16)
180	and		$~0x0f,%rax
181
	# rep movsb needs %rsi/%rdi, so park the output pointer in %r11.
182	mov		%rsi,%r11
183
	# Remember the current stack pointer (biased by 8) in %r10, then carve
	# out an aligned 16-byte bounce buffer at (%rsp).  The alignment is
	# required by the pxor memory operand and the movdqa store below.
184	lea		8(%rsp),%r10
185	sub		$0x10,%rsp
186	and		$~31,%rsp
187
	# Copy the %r9 trailing input bytes into the bounce buffer.
188	lea		(%rdx,%rax),%rsi
189	mov		%rsp,%rdi
190	mov		%r9,%rcx
191	rep movsb
192
	# XOR the whole 16-byte buffer with the keystream; only the first %r9
	# bytes of the result are meaningful and only those are copied out.
193	pxor		0x00(%rsp),%xmm0
194	movdqa		%xmm0,0x00(%rsp)
195
	# Copy the %r9 result bytes to the output at the same offset.
196	mov		%rsp,%rsi
197	lea		(%r11,%rax),%rdi
198	mov		%r9,%rcx
199	rep movsb
200
	# Restore the stack pointer saved above and take the common exit.
201	lea		-8(%r10),%rsp
202	jmp		.Ldone
203
204ENDPROC(chacha_block_xor_ssse3)
205
206ENTRY(hchacha_block_ssse3)
207	# %rdi: Input state matrix, s
208	# %rsi: output (8 32-bit words)
209	# %edx: nrounds
	#
	# Runs the ChaCha permutation on s and stores rows 0 and 3 of the
	# permuted state (32 bytes) to the output.  Unlike
	# chacha_block_xor_ssse3, the original state is not added back.
	# s is loaded with movdqa and must be 16-byte aligned; the output is
	# written with movdqu and may be unaligned.
210	FRAME_BEGIN
211
	# x0..3 = s0..3
212	movdqa		0x00(%rdi),%xmm0
213	movdqa		0x10(%rdi),%xmm1
214	movdqa		0x20(%rdi),%xmm2
215	movdqa		0x30(%rdi),%xmm3
216
	# chacha_permute takes its round count in %r8d.
217	mov		%edx,%r8d
218	call		chacha_permute
219
	# output words 0-3 = row 0, output words 4-7 = row 3
220	movdqu		%xmm0,0x00(%rsi)
221	movdqu		%xmm3,0x10(%rsi)
222
223	FRAME_END
224	ret
225ENDPROC(hchacha_block_ssse3)
226
227ENTRY(chacha_4block_xor_ssse3)
228	# %rdi: Input state matrix, s
229	# %rsi: up to 4 data blocks output, o
230	# %rdx: up to 4 data blocks input, i
231	# %rcx: input/output length in bytes
232	# %r8d: nrounds
233
234	# This function encrypts four consecutive ChaCha blocks by loading the
235	# state matrix in SSE registers four times. As we need some scratch
236	# registers, we save the first four registers on the stack. The
237	# algorithm performs each operation on the corresponding word of each
238	# state matrix, hence requires no word shuffling. For the final XORing step
239	# we transpose the matrix by interleaving 32- and then 64-bit words,
240	# which allows us to do XOR in SSE registers. 8/16-bit word rotation is
241	# done with the slightly better performing SSSE3 byte shuffling,
242	# 7/12-bit word rotation uses traditional shift+OR.
	#
	# Register/stack roles inside the round loop:
	#   0x00-0x3f(%rsp)  x0..x3 (one 16-byte vector each)
	#   %xmm4-%xmm15     x4..x15
	#   %xmm0            scratch
	#   %xmm1            CTRINC (per-lane counter offsets 0,1,2,3)
	#   %xmm2 / %xmm3    ROT8 / ROT16 pshufb masks
	# Lane n of every vector belongs to block n.
	#
	# nrounds is consumed two at a time and the loop exits only at exactly
	# zero, so it must be a non-zero even number.
243
	# Save the entry stack pointer (biased by 8) in %r10 and carve out a
	# 64-byte aligned scratch area; %rsp is restored from %r10 at .Ldone4.
244	lea		8(%rsp),%r10
245	sub		$0x80,%rsp
246	and		$~63,%rsp
	# Track the length in %rax; %rcx is needed as the rep movsb counter in
	# .Lxorpart4.
247	mov		%rcx,%rax
248
	# Each movq fetches two adjacent state words; pshufd $0x00 / $0x55 then
	# broadcast the first / second of them to all four lanes.
249	# x0..15[0-3] = s0..3[0..3]
250	movq		0x00(%rdi),%xmm1
251	pshufd		$0x00,%xmm1,%xmm0
252	pshufd		$0x55,%xmm1,%xmm1
253	movq		0x08(%rdi),%xmm3
254	pshufd		$0x00,%xmm3,%xmm2
255	pshufd		$0x55,%xmm3,%xmm3
256	movq		0x10(%rdi),%xmm5
257	pshufd		$0x00,%xmm5,%xmm4
258	pshufd		$0x55,%xmm5,%xmm5
259	movq		0x18(%rdi),%xmm7
260	pshufd		$0x00,%xmm7,%xmm6
261	pshufd		$0x55,%xmm7,%xmm7
262	movq		0x20(%rdi),%xmm9
263	pshufd		$0x00,%xmm9,%xmm8
264	pshufd		$0x55,%xmm9,%xmm9
265	movq		0x28(%rdi),%xmm11
266	pshufd		$0x00,%xmm11,%xmm10
267	pshufd		$0x55,%xmm11,%xmm11
268	movq		0x30(%rdi),%xmm13
269	pshufd		$0x00,%xmm13,%xmm12
270	pshufd		$0x55,%xmm13,%xmm13
271	movq		0x38(%rdi),%xmm15
272	pshufd		$0x00,%xmm15,%xmm14
273	pshufd		$0x55,%xmm15,%xmm15
274	# x0..3 on stack
275	movdqa		%xmm0,0x00(%rsp)
276	movdqa		%xmm1,0x10(%rsp)
277	movdqa		%xmm2,0x20(%rsp)
278	movdqa		%xmm3,0x30(%rsp)
279
	# %xmm1-%xmm3 are free now that x0..x3 live on the stack; load the
	# constants that stay resident during the round loop.
280	movdqa		CTRINC(%rip),%xmm1
281	movdqa		ROT8(%rip),%xmm2
282	movdqa		ROT16(%rip),%xmm3
283
	# Lane n gets counter + n, i.e. four consecutive block counters.
284	# x12 += counter values 0-3
285	paddd		%xmm1,%xmm12
286
287.Ldoubleround4:
	# Column round: quarter-rounds on (x0,x4,x8,x12), (x1,x5,x9,x13),
	# (x2,x6,x10,x14) and (x3,x7,x11,x15).
288	# x0 += x4, x12 = rotl32(x12 ^ x0, 16)
289	movdqa		0x00(%rsp),%xmm0
290	paddd		%xmm4,%xmm0
291	movdqa		%xmm0,0x00(%rsp)
292	pxor		%xmm0,%xmm12
293	pshufb		%xmm3,%xmm12
294	# x1 += x5, x13 = rotl32(x13 ^ x1, 16)
295	movdqa		0x10(%rsp),%xmm0
296	paddd		%xmm5,%xmm0
297	movdqa		%xmm0,0x10(%rsp)
298	pxor		%xmm0,%xmm13
299	pshufb		%xmm3,%xmm13
300	# x2 += x6, x14 = rotl32(x14 ^ x2, 16)
301	movdqa		0x20(%rsp),%xmm0
302	paddd		%xmm6,%xmm0
303	movdqa		%xmm0,0x20(%rsp)
304	pxor		%xmm0,%xmm14
305	pshufb		%xmm3,%xmm14
306	# x3 += x7, x15 = rotl32(x15 ^ x3, 16)
307	movdqa		0x30(%rsp),%xmm0
308	paddd		%xmm7,%xmm0
309	movdqa		%xmm0,0x30(%rsp)
310	pxor		%xmm0,%xmm15
311	pshufb		%xmm3,%xmm15
312
313	# x8 += x12, x4 = rotl32(x4 ^ x8, 12)
314	paddd		%xmm12,%xmm8
315	pxor		%xmm8,%xmm4
316	movdqa		%xmm4,%xmm0
317	pslld		$12,%xmm0
318	psrld		$20,%xmm4
319	por		%xmm0,%xmm4
320	# x9 += x13, x5 = rotl32(x5 ^ x9, 12)
321	paddd		%xmm13,%xmm9
322	pxor		%xmm9,%xmm5
323	movdqa		%xmm5,%xmm0
324	pslld		$12,%xmm0
325	psrld		$20,%xmm5
326	por		%xmm0,%xmm5
327	# x10 += x14, x6 = rotl32(x6 ^ x10, 12)
328	paddd		%xmm14,%xmm10
329	pxor		%xmm10,%xmm6
330	movdqa		%xmm6,%xmm0
331	pslld		$12,%xmm0
332	psrld		$20,%xmm6
333	por		%xmm0,%xmm6
334	# x11 += x15, x7 = rotl32(x7 ^ x11, 12)
335	paddd		%xmm15,%xmm11
336	pxor		%xmm11,%xmm7
337	movdqa		%xmm7,%xmm0
338	pslld		$12,%xmm0
339	psrld		$20,%xmm7
340	por		%xmm0,%xmm7
341
342	# x0 += x4, x12 = rotl32(x12 ^ x0, 8)
343	movdqa		0x00(%rsp),%xmm0
344	paddd		%xmm4,%xmm0
345	movdqa		%xmm0,0x00(%rsp)
346	pxor		%xmm0,%xmm12
347	pshufb		%xmm2,%xmm12
348	# x1 += x5, x13 = rotl32(x13 ^ x1, 8)
349	movdqa		0x10(%rsp),%xmm0
350	paddd		%xmm5,%xmm0
351	movdqa		%xmm0,0x10(%rsp)
352	pxor		%xmm0,%xmm13
353	pshufb		%xmm2,%xmm13
354	# x2 += x6, x14 = rotl32(x14 ^ x2, 8)
355	movdqa		0x20(%rsp),%xmm0
356	paddd		%xmm6,%xmm0
357	movdqa		%xmm0,0x20(%rsp)
358	pxor		%xmm0,%xmm14
359	pshufb		%xmm2,%xmm14
360	# x3 += x7, x15 = rotl32(x15 ^ x3, 8)
361	movdqa		0x30(%rsp),%xmm0
362	paddd		%xmm7,%xmm0
363	movdqa		%xmm0,0x30(%rsp)
364	pxor		%xmm0,%xmm15
365	pshufb		%xmm2,%xmm15
366
367	# x8 += x12, x4 = rotl32(x4 ^ x8, 7)
368	paddd		%xmm12,%xmm8
369	pxor		%xmm8,%xmm4
370	movdqa		%xmm4,%xmm0
371	pslld		$7,%xmm0
372	psrld		$25,%xmm4
373	por		%xmm0,%xmm4
374	# x9 += x13, x5 = rotl32(x5 ^ x9, 7)
375	paddd		%xmm13,%xmm9
376	pxor		%xmm9,%xmm5
377	movdqa		%xmm5,%xmm0
378	pslld		$7,%xmm0
379	psrld		$25,%xmm5
380	por		%xmm0,%xmm5
381	# x10 += x14, x6 = rotl32(x6 ^ x10, 7)
382	paddd		%xmm14,%xmm10
383	pxor		%xmm10,%xmm6
384	movdqa		%xmm6,%xmm0
385	pslld		$7,%xmm0
386	psrld		$25,%xmm6
387	por		%xmm0,%xmm6
388	# x11 += x15, x7 = rotl32(x7 ^ x11, 7)
389	paddd		%xmm15,%xmm11
390	pxor		%xmm11,%xmm7
391	movdqa		%xmm7,%xmm0
392	pslld		$7,%xmm0
393	psrld		$25,%xmm7
394	por		%xmm0,%xmm7
395
	# Diagonal round: quarter-rounds on (x0,x5,x10,x15), (x1,x6,x11,x12),
	# (x2,x7,x8,x13) and (x3,x4,x9,x14).  Only the register pairing
	# changes; no data shuffling is needed in this layout.
396	# x0 += x5, x15 = rotl32(x15 ^ x0, 16)
397	movdqa		0x00(%rsp),%xmm0
398	paddd		%xmm5,%xmm0
399	movdqa		%xmm0,0x00(%rsp)
400	pxor		%xmm0,%xmm15
401	pshufb		%xmm3,%xmm15
402	# x1 += x6, x12 = rotl32(x12 ^ x1, 16)
403	movdqa		0x10(%rsp),%xmm0
404	paddd		%xmm6,%xmm0
405	movdqa		%xmm0,0x10(%rsp)
406	pxor		%xmm0,%xmm12
407	pshufb		%xmm3,%xmm12
408	# x2 += x7, x13 = rotl32(x13 ^ x2, 16)
409	movdqa		0x20(%rsp),%xmm0
410	paddd		%xmm7,%xmm0
411	movdqa		%xmm0,0x20(%rsp)
412	pxor		%xmm0,%xmm13
413	pshufb		%xmm3,%xmm13
414	# x3 += x4, x14 = rotl32(x14 ^ x3, 16)
415	movdqa		0x30(%rsp),%xmm0
416	paddd		%xmm4,%xmm0
417	movdqa		%xmm0,0x30(%rsp)
418	pxor		%xmm0,%xmm14
419	pshufb		%xmm3,%xmm14
420
421	# x10 += x15, x5 = rotl32(x5 ^ x10, 12)
422	paddd		%xmm15,%xmm10
423	pxor		%xmm10,%xmm5
424	movdqa		%xmm5,%xmm0
425	pslld		$12,%xmm0
426	psrld		$20,%xmm5
427	por		%xmm0,%xmm5
428	# x11 += x12, x6 = rotl32(x6 ^ x11, 12)
429	paddd		%xmm12,%xmm11
430	pxor		%xmm11,%xmm6
431	movdqa		%xmm6,%xmm0
432	pslld		$12,%xmm0
433	psrld		$20,%xmm6
434	por		%xmm0,%xmm6
435	# x8 += x13, x7 = rotl32(x7 ^ x8, 12)
436	paddd		%xmm13,%xmm8
437	pxor		%xmm8,%xmm7
438	movdqa		%xmm7,%xmm0
439	pslld		$12,%xmm0
440	psrld		$20,%xmm7
441	por		%xmm0,%xmm7
442	# x9 += x14, x4 = rotl32(x4 ^ x9, 12)
443	paddd		%xmm14,%xmm9
444	pxor		%xmm9,%xmm4
445	movdqa		%xmm4,%xmm0
446	pslld		$12,%xmm0
447	psrld		$20,%xmm4
448	por		%xmm0,%xmm4
449
450	# x0 += x5, x15 = rotl32(x15 ^ x0, 8)
451	movdqa		0x00(%rsp),%xmm0
452	paddd		%xmm5,%xmm0
453	movdqa		%xmm0,0x00(%rsp)
454	pxor		%xmm0,%xmm15
455	pshufb		%xmm2,%xmm15
456	# x1 += x6, x12 = rotl32(x12 ^ x1, 8)
457	movdqa		0x10(%rsp),%xmm0
458	paddd		%xmm6,%xmm0
459	movdqa		%xmm0,0x10(%rsp)
460	pxor		%xmm0,%xmm12
461	pshufb		%xmm2,%xmm12
462	# x2 += x7, x13 = rotl32(x13 ^ x2, 8)
463	movdqa		0x20(%rsp),%xmm0
464	paddd		%xmm7,%xmm0
465	movdqa		%xmm0,0x20(%rsp)
466	pxor		%xmm0,%xmm13
467	pshufb		%xmm2,%xmm13
468	# x3 += x4, x14 = rotl32(x14 ^ x3, 8)
469	movdqa		0x30(%rsp),%xmm0
470	paddd		%xmm4,%xmm0
471	movdqa		%xmm0,0x30(%rsp)
472	pxor		%xmm0,%xmm14
473	pshufb		%xmm2,%xmm14
474
475	# x10 += x15, x5 = rotl32(x5 ^ x10, 7)
476	paddd		%xmm15,%xmm10
477	pxor		%xmm10,%xmm5
478	movdqa		%xmm5,%xmm0
479	pslld		$7,%xmm0
480	psrld		$25,%xmm5
481	por		%xmm0,%xmm5
482	# x11 += x12, x6 = rotl32(x6 ^ x11, 7)
483	paddd		%xmm12,%xmm11
484	pxor		%xmm11,%xmm6
485	movdqa		%xmm6,%xmm0
486	pslld		$7,%xmm0
487	psrld		$25,%xmm6
488	por		%xmm0,%xmm6
489	# x8 += x13, x7 = rotl32(x7 ^ x8, 7)
490	paddd		%xmm13,%xmm8
491	pxor		%xmm8,%xmm7
492	movdqa		%xmm7,%xmm0
493	pslld		$7,%xmm0
494	psrld		$25,%xmm7
495	por		%xmm0,%xmm7
496	# x9 += x14, x4 = rotl32(x4 ^ x9, 7)
497	paddd		%xmm14,%xmm9
498	pxor		%xmm9,%xmm4
499	movdqa		%xmm4,%xmm0
500	pslld		$7,%xmm0
501	psrld		$25,%xmm4
502	por		%xmm0,%xmm4
503
	# Two rounds done; repeat until the round counter hits zero.
504	sub		$2,%r8d
505	jnz		.Ldoubleround4
506
	# Add the original state back.  The state words are re-broadcast from
	# memory using %xmm2/%xmm3 as scratch; the rotation masks they held
	# are no longer needed after the loop.
507	# x0[0-3] += s0[0]
508	# x1[0-3] += s0[1]
509	movq		0x00(%rdi),%xmm3
510	pshufd		$0x00,%xmm3,%xmm2
511	pshufd		$0x55,%xmm3,%xmm3
512	paddd		0x00(%rsp),%xmm2
513	movdqa		%xmm2,0x00(%rsp)
514	paddd		0x10(%rsp),%xmm3
515	movdqa		%xmm3,0x10(%rsp)
516	# x2[0-3] += s0[2]
517	# x3[0-3] += s0[3]
518	movq		0x08(%rdi),%xmm3
519	pshufd		$0x00,%xmm3,%xmm2
520	pshufd		$0x55,%xmm3,%xmm3
521	paddd		0x20(%rsp),%xmm2
522	movdqa		%xmm2,0x20(%rsp)
523	paddd		0x30(%rsp),%xmm3
524	movdqa		%xmm3,0x30(%rsp)
525
526	# x4[0-3] += s1[0]
527	# x5[0-3] += s1[1]
528	movq		0x10(%rdi),%xmm3
529	pshufd		$0x00,%xmm3,%xmm2
530	pshufd		$0x55,%xmm3,%xmm3
531	paddd		%xmm2,%xmm4
532	paddd		%xmm3,%xmm5
533	# x6[0-3] += s1[2]
534	# x7[0-3] += s1[3]
535	movq		0x18(%rdi),%xmm3
536	pshufd		$0x00,%xmm3,%xmm2
537	pshufd		$0x55,%xmm3,%xmm3
538	paddd		%xmm2,%xmm6
539	paddd		%xmm3,%xmm7
540
541	# x8[0-3] += s2[0]
542	# x9[0-3] += s2[1]
543	movq		0x20(%rdi),%xmm3
544	pshufd		$0x00,%xmm3,%xmm2
545	pshufd		$0x55,%xmm3,%xmm3
546	paddd		%xmm2,%xmm8
547	paddd		%xmm3,%xmm9
548	# x10[0-3] += s2[2]
549	# x11[0-3] += s2[3]
550	movq		0x28(%rdi),%xmm3
551	pshufd		$0x00,%xmm3,%xmm2
552	pshufd		$0x55,%xmm3,%xmm3
553	paddd		%xmm2,%xmm10
554	paddd		%xmm3,%xmm11
555
556	# x12[0-3] += s3[0]
557	# x13[0-3] += s3[1]
558	movq		0x30(%rdi),%xmm3
559	pshufd		$0x00,%xmm3,%xmm2
560	pshufd		$0x55,%xmm3,%xmm3
561	paddd		%xmm2,%xmm12
562	paddd		%xmm3,%xmm13
563	# x14[0-3] += s3[2]
564	# x15[0-3] += s3[3]
565	movq		0x38(%rdi),%xmm3
566	pshufd		$0x00,%xmm3,%xmm2
567	pshufd		$0x55,%xmm3,%xmm3
568	paddd		%xmm2,%xmm14
569	paddd		%xmm3,%xmm15
570
	# The s3[0] broadcast above only re-added the base counter; the
	# per-lane offsets (still in %xmm1) were part of each block's input
	# state as well, so add them again.
571	# x12 += counter values 0-3
572	paddd		%xmm1,%xmm12
573
	# Transpose, step 1.  For each pair (xn, xn+1) the low interleave
	# (words of blocks 0 and 1) replaces xn and the high interleave (words
	# of blocks 2 and 3) replaces xn+1.  %xmm0-%xmm2 are scratch from here
	# on (CTRINC in %xmm1 is no longer needed).
574	# interleave 32-bit words in state n, n+1
575	movdqa		0x00(%rsp),%xmm0
576	movdqa		0x10(%rsp),%xmm1
577	movdqa		%xmm0,%xmm2
578	punpckldq	%xmm1,%xmm2
579	punpckhdq	%xmm1,%xmm0
580	movdqa		%xmm2,0x00(%rsp)
581	movdqa		%xmm0,0x10(%rsp)
582	movdqa		0x20(%rsp),%xmm0
583	movdqa		0x30(%rsp),%xmm1
584	movdqa		%xmm0,%xmm2
585	punpckldq	%xmm1,%xmm2
586	punpckhdq	%xmm1,%xmm0
587	movdqa		%xmm2,0x20(%rsp)
588	movdqa		%xmm0,0x30(%rsp)
589	movdqa		%xmm4,%xmm0
590	punpckldq	%xmm5,%xmm4
591	punpckhdq	%xmm5,%xmm0
592	movdqa		%xmm0,%xmm5
593	movdqa		%xmm6,%xmm0
594	punpckldq	%xmm7,%xmm6
595	punpckhdq	%xmm7,%xmm0
596	movdqa		%xmm0,%xmm7
597	movdqa		%xmm8,%xmm0
598	punpckldq	%xmm9,%xmm8
599	punpckhdq	%xmm9,%xmm0
600	movdqa		%xmm0,%xmm9
601	movdqa		%xmm10,%xmm0
602	punpckldq	%xmm11,%xmm10
603	punpckhdq	%xmm11,%xmm0
604	movdqa		%xmm0,%xmm11
605	movdqa		%xmm12,%xmm0
606	punpckldq	%xmm13,%xmm12
607	punpckhdq	%xmm13,%xmm0
608	movdqa		%xmm0,%xmm13
609	movdqa		%xmm14,%xmm0
610	punpckldq	%xmm15,%xmm14
611	punpckhdq	%xmm15,%xmm0
612	movdqa		%xmm0,%xmm15
613
	# Transpose, step 2.  Combining slot n with slot n+2 yields complete
	# 16-byte rows of a single block.
614	# interleave 64-bit words in state n, n+2
615	movdqa		0x00(%rsp),%xmm0
616	movdqa		0x20(%rsp),%xmm1
617	movdqa		%xmm0,%xmm2
618	punpcklqdq	%xmm1,%xmm2
619	punpckhqdq	%xmm1,%xmm0
620	movdqa		%xmm2,0x00(%rsp)
621	movdqa		%xmm0,0x20(%rsp)
622	movdqa		0x10(%rsp),%xmm0
623	movdqa		0x30(%rsp),%xmm1
624	movdqa		%xmm0,%xmm2
625	punpcklqdq	%xmm1,%xmm2
626	punpckhqdq	%xmm1,%xmm0
627	movdqa		%xmm2,0x10(%rsp)
628	movdqa		%xmm0,0x30(%rsp)
629	movdqa		%xmm4,%xmm0
630	punpcklqdq	%xmm6,%xmm4
631	punpckhqdq	%xmm6,%xmm0
632	movdqa		%xmm0,%xmm6
633	movdqa		%xmm5,%xmm0
634	punpcklqdq	%xmm7,%xmm5
635	punpckhqdq	%xmm7,%xmm0
636	movdqa		%xmm0,%xmm7
637	movdqa		%xmm8,%xmm0
638	punpcklqdq	%xmm10,%xmm8
639	punpckhqdq	%xmm10,%xmm0
640	movdqa		%xmm0,%xmm10
641	movdqa		%xmm9,%xmm0
642	punpcklqdq	%xmm11,%xmm9
643	punpckhqdq	%xmm11,%xmm0
644	movdqa		%xmm0,%xmm11
645	movdqa		%xmm12,%xmm0
646	punpcklqdq	%xmm14,%xmm12
647	punpckhqdq	%xmm14,%xmm0
648	movdqa		%xmm0,%xmm14
649	movdqa		%xmm13,%xmm0
650	punpcklqdq	%xmm15,%xmm13
651	punpckhqdq	%xmm15,%xmm0
652	movdqa		%xmm0,%xmm15
653
	# Layout after the transpose (keystream rows 0..3 of each block):
	#   block 0: 0x00(%rsp), %xmm4, %xmm8,  %xmm12
	#   block 1: 0x20(%rsp), %xmm6, %xmm10, %xmm14
	#   block 2: 0x10(%rsp), %xmm5, %xmm9,  %xmm13
	#   block 3: 0x30(%rsp), %xmm7, %xmm11, %xmm15
	# Each 16-byte step below puts the keystream row in %xmm0 first and
	# branches to .Lxorpart4 if the length ends inside that row.
	# NOTE(review): jl is a signed comparison on the full 64-bit %rax; this
	# presumably relies on the length being a small non-negative value with
	# clean upper bits - confirm against the C prototype and callers.
654	# xor with corresponding input, write to output
	# block 0, output bytes 0x00-0x3f
655	movdqa		0x00(%rsp),%xmm0
656	cmp		$0x10,%rax
657	jl		.Lxorpart4
658	movdqu		0x00(%rdx),%xmm1
659	pxor		%xmm1,%xmm0
660	movdqu		%xmm0,0x00(%rsi)
661
662	movdqu		%xmm4,%xmm0
663	cmp		$0x20,%rax
664	jl		.Lxorpart4
665	movdqu		0x10(%rdx),%xmm1
666	pxor		%xmm1,%xmm0
667	movdqu		%xmm0,0x10(%rsi)
668
669	movdqu		%xmm8,%xmm0
670	cmp		$0x30,%rax
671	jl		.Lxorpart4
672	movdqu		0x20(%rdx),%xmm1
673	pxor		%xmm1,%xmm0
674	movdqu		%xmm0,0x20(%rsi)
675
676	movdqu		%xmm12,%xmm0
677	cmp		$0x40,%rax
678	jl		.Lxorpart4
679	movdqu		0x30(%rdx),%xmm1
680	pxor		%xmm1,%xmm0
681	movdqu		%xmm0,0x30(%rsi)
682
	# block 1, output bytes 0x40-0x7f
683	movdqa		0x20(%rsp),%xmm0
684	cmp		$0x50,%rax
685	jl		.Lxorpart4
686	movdqu		0x40(%rdx),%xmm1
687	pxor		%xmm1,%xmm0
688	movdqu		%xmm0,0x40(%rsi)
689
690	movdqu		%xmm6,%xmm0
691	cmp		$0x60,%rax
692	jl		.Lxorpart4
693	movdqu		0x50(%rdx),%xmm1
694	pxor		%xmm1,%xmm0
695	movdqu		%xmm0,0x50(%rsi)
696
697	movdqu		%xmm10,%xmm0
698	cmp		$0x70,%rax
699	jl		.Lxorpart4
700	movdqu		0x60(%rdx),%xmm1
701	pxor		%xmm1,%xmm0
702	movdqu		%xmm0,0x60(%rsi)
703
704	movdqu		%xmm14,%xmm0
705	cmp		$0x80,%rax
706	jl		.Lxorpart4
707	movdqu		0x70(%rdx),%xmm1
708	pxor		%xmm1,%xmm0
709	movdqu		%xmm0,0x70(%rsi)
710
	# block 2, output bytes 0x80-0xbf
711	movdqa		0x10(%rsp),%xmm0
712	cmp		$0x90,%rax
713	jl		.Lxorpart4
714	movdqu		0x80(%rdx),%xmm1
715	pxor		%xmm1,%xmm0
716	movdqu		%xmm0,0x80(%rsi)
717
718	movdqu		%xmm5,%xmm0
719	cmp		$0xa0,%rax
720	jl		.Lxorpart4
721	movdqu		0x90(%rdx),%xmm1
722	pxor		%xmm1,%xmm0
723	movdqu		%xmm0,0x90(%rsi)
724
725	movdqu		%xmm9,%xmm0
726	cmp		$0xb0,%rax
727	jl		.Lxorpart4
728	movdqu		0xa0(%rdx),%xmm1
729	pxor		%xmm1,%xmm0
730	movdqu		%xmm0,0xa0(%rsi)
731
732	movdqu		%xmm13,%xmm0
733	cmp		$0xc0,%rax
734	jl		.Lxorpart4
735	movdqu		0xb0(%rdx),%xmm1
736	pxor		%xmm1,%xmm0
737	movdqu		%xmm0,0xb0(%rsi)
738
	# block 3, output bytes 0xc0-0xff
739	movdqa		0x30(%rsp),%xmm0
740	cmp		$0xd0,%rax
741	jl		.Lxorpart4
742	movdqu		0xc0(%rdx),%xmm1
743	pxor		%xmm1,%xmm0
744	movdqu		%xmm0,0xc0(%rsi)
745
746	movdqu		%xmm7,%xmm0
747	cmp		$0xe0,%rax
748	jl		.Lxorpart4
749	movdqu		0xd0(%rdx),%xmm1
750	pxor		%xmm1,%xmm0
751	movdqu		%xmm0,0xd0(%rsi)
752
753	movdqu		%xmm11,%xmm0
754	cmp		$0xf0,%rax
755	jl		.Lxorpart4
756	movdqu		0xe0(%rdx),%xmm1
757	pxor		%xmm1,%xmm0
758	movdqu		%xmm0,0xe0(%rsi)
759
760	movdqu		%xmm15,%xmm0
761	cmp		$0x100,%rax
762	jl		.Lxorpart4
763	movdqu		0xf0(%rdx),%xmm1
764	pxor		%xmm1,%xmm0
765	movdqu		%xmm0,0xf0(%rsi)
766
767.Ldone4:
	# Restore the stack pointer saved in %r10 at function entry.
768	lea		-8(%r10),%rsp
769	ret
770
771.Lxorpart4:
772	# xor remaining bytes from partial register into output
	# On entry: %rax = total length, %xmm0 = keystream for the partial row.
	# %r9 = number of trailing bytes (length mod 16); nothing to do if 0.
773	mov		%rax,%r9
774	and		$0x0f,%r9
775	jz		.Ldone4
	# %rax = byte offset of the partial row (length rounded down to 16)
776	and		$~0x0f,%rax
777
	# rep movsb needs %rsi/%rdi, so park the output pointer in %r11.
778	mov		%rsi,%r11
779
	# Copy the %r9 trailing input bytes to the aligned scratch area at
	# (%rsp).  Its old contents are not read again: the needed keystream
	# row is already in %xmm0 and no further rows are processed.
780	lea		(%rdx,%rax),%rsi
781	mov		%rsp,%rdi
782	mov		%r9,%rcx
783	rep movsb
784
	# XOR the whole 16-byte buffer with the keystream; only the first %r9
	# bytes of the result are meaningful and only those are copied out.
785	pxor		0x00(%rsp),%xmm0
786	movdqa		%xmm0,0x00(%rsp)
787
	# Copy the %r9 result bytes to the output at the same offset.
788	mov		%rsp,%rsi
789	lea		(%r11,%rax),%rdi
790	mov		%r9,%rcx
791	rep movsb
792
793	jmp		.Ldone4
794
795ENDPROC(chacha_4block_xor_ssse3)
796