xref: /linux/arch/arm64/crypto/chacha-neon-core.S (revision c4bbe83d27c2446a033cc0381c3fb6be5e8c41c7)
1/*
2 * ChaCha/XChaCha NEON helper functions
3 *
4 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * Originally based on:
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
12 *
13 * Copyright (C) 2015 Martin Willi
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 */
20
21#include <linux/linkage.h>
22#include <asm/assembler.h>
23#include <asm/cache.h>
24
25	.text
26	.align		6
27
28/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers v0-v3 (one row of four 32-bit words per register).  It performs
 * matrix operations on four words in parallel, but requires shuffling to
 * rearrange the words after each round.
 *
 * The round count is given in w3.  Every pass of .Ldoubleround performs two
 * rounds and subtracts two from w3; the loop is left only when the count
 * reaches exactly zero, so w3 is expected to be a non-zero multiple of two.
 *
 * The 32-bit rotations are implemented as follows:
 *   by 16:       rev32 on 16-bit elements
 *   by 12 and 7: shl into the destination followed by sri, which inserts the
 *                bits that were shifted out (v4 holds the unrotated value)
 *   by 8:        tbl with the ROT8 byte permutation, which is kept in v12
 *
 * Clobbers: w3, x10, v4, v12 and the condition flags
 */
39SYM_FUNC_START_LOCAL(chacha_permute)
40
	// v12 = byte permutation used by tbl for the rotations by 8 bits
41	adr_l		x10, ROT8
42	ld1		{v12.4s}, [x10]
43
44.Ldoubleround:
	// First round: the quarter rounds work on the columns of the matrix.
45	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
46	add		v0.4s, v0.4s, v1.4s
47	eor		v3.16b, v3.16b, v0.16b
48	rev32		v3.8h, v3.8h
49
50	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
51	add		v2.4s, v2.4s, v3.4s
52	eor		v4.16b, v1.16b, v2.16b
53	shl		v1.4s, v4.4s, #12
54	sri		v1.4s, v4.4s, #20
55
56	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
57	add		v0.4s, v0.4s, v1.4s
58	eor		v3.16b, v3.16b, v0.16b
59	tbl		v3.16b, {v3.16b}, v12.16b
60
61	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
62	add		v2.4s, v2.4s, v3.4s
63	eor		v4.16b, v1.16b, v2.16b
64	shl		v1.4s, v4.4s, #7
65	sri		v1.4s, v4.4s, #25
66
	// Rotate rows 1-3 by one, two and three words so that the second round
	// can reuse the same column-wise code on the diagonals.
67	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
68	ext		v1.16b, v1.16b, v1.16b, #4
69	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
70	ext		v2.16b, v2.16b, v2.16b, #8
71	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
72	ext		v3.16b, v3.16b, v3.16b, #12
73
	// Second round: same instruction sequence, now on the rotated rows.
74	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
75	add		v0.4s, v0.4s, v1.4s
76	eor		v3.16b, v3.16b, v0.16b
77	rev32		v3.8h, v3.8h
78
79	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
80	add		v2.4s, v2.4s, v3.4s
81	eor		v4.16b, v1.16b, v2.16b
82	shl		v1.4s, v4.4s, #12
83	sri		v1.4s, v4.4s, #20
84
85	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
86	add		v0.4s, v0.4s, v1.4s
87	eor		v3.16b, v3.16b, v0.16b
88	tbl		v3.16b, {v3.16b}, v12.16b
89
90	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
91	add		v2.4s, v2.4s, v3.4s
92	eor		v4.16b, v1.16b, v2.16b
93	shl		v1.4s, v4.4s, #7
94	sri		v1.4s, v4.4s, #25
95
	// Undo the row rotation (4 + 12, 8 + 8 and 12 + 4 bytes each add up to
	// a full 16-byte turn), restoring the original word order.
96	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
97	ext		v1.16b, v1.16b, v1.16b, #12
98	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
99	ext		v2.16b, v2.16b, v2.16b, #8
100	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
101	ext		v3.16b, v3.16b, v3.16b, #4
102
	// two rounds done; loop until the remaining count is exactly zero
103	subs		w3, w3, #2
104	b.ne		.Ldoubleround
105
106	ret
107SYM_FUNC_END(chacha_permute)
108
SYM_FUNC_START(chacha_block_xor_neon)
	/*
	 * Process a single 64-byte block: o = i ^ (permute(s) + s)
	 *
	 * x0: state matrix s (only read)
	 * x1: destination o, 64 bytes
	 * x2: source i, 64 bytes
	 * w3: number of rounds, handed to chacha_permute unchanged
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	// v0-v3: working copy that gets permuted, v8-v11: pristine copy of s
	ld1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x0]
	ld1		{v8.4s, v9.4s, v10.4s, v11.4s}, [x0]

	bl		chacha_permute

	// chacha_permute clobbers v4, so the source is fetched afterwards
	ld1		{v4.16b, v5.16b, v6.16b, v7.16b}, [x2]

	// feed-forward: x0..3 += s0..3
	add		v0.4s, v0.4s, v8.4s
	add		v1.4s, v1.4s, v9.4s
	add		v2.4s, v2.4s, v10.4s
	add		v3.4s, v3.4s, v11.4s

	// o0..3 = i0..3 ^ x0..3
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	eor		v3.16b, v3.16b, v7.16b

	st1		{v0.16b, v1.16b, v2.16b, v3.16b}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(chacha_block_xor_neon)
147
SYM_FUNC_START(hchacha_block_neon)
	/*
	 * x0: state matrix s (only read)
	 * x1: result buffer, 8 32-bit words
	 * w2: number of rounds
	 */
	stp		x29, x30, [sp, #-16]!
	mov		x29, sp

	mov		w3, w2			// chacha_permute expects the round count in w3
	ld1		{v0.4s, v1.4s, v2.4s, v3.4s}, [x0]

	bl		chacha_permute

	// The result is rows 0 and 3 of the permuted state; the original
	// state is not added back in here.
	st1		{v0.4s}, [x1], #16
	st1		{v3.4s}, [x1]

	ldp		x29, x30, [sp], #16
	ret
SYM_FUNC_END(hchacha_block_neon)
167
	// Aliases for the general purpose registers in which
	// chacha_4block_xor_neon keeps the 16 state words (a0 = word 0 ...
	// a15 = word 15) of the one block it computes with scalar instructions.
	// w18 is left out (presumably because x18 is the platform register -
	// confirm), so a6-a15 land in w19-w28.
168	a0		.req	w12
169	a1		.req	w13
170	a2		.req	w14
171	a3		.req	w15
172	a4		.req	w16
173	a5		.req	w17
174	a6		.req	w19
175	a7		.req	w20
176	a8		.req	w21
177	a9		.req	w22
178	a10		.req	w23
179	a11		.req	w24
180	a12		.req	w25
181	a13		.req	w26
182	a14		.req	w27
183	a15		.req	w28
184
185	.align		6
/*
 * chacha_4block_xor_neon - XOR up to 320 bytes with ChaCha keystream
 *
 * Four keystream blocks are computed in the NEON registers, one block per
 * 32-bit lane, using the block counters s[12] + 1 ... s[12] + 4 (see CTRINC).
 * A fifth block, using the unmodified counter s[12], is computed at the same
 * time in the general purpose registers a0-a15.  The scalar block covers
 * bytes 0-63 of the in/output, the NEON blocks cover bytes 64-319.
 *
 * The scalar block is always read and written in full.  For byte counts
 * below 320 the .Lt128/.Lt192/.Lt256/.Lt320 paths handle the tail: the last
 * 64 bytes of input are loaded as one unit (overlapping the preceding
 * block), combined with a suitably shifted keystream and stored so that the
 * output ends exactly at the byte count.
 * NOTE(review): this appears to rely on the byte count being at least 64
 * (the tail pointer is 'input + byte count - 64') - confirm with the caller.
 *
 * a6-a15 live in x19-x28; frame_push/frame_pop presumably save and restore
 * them together with x29/x30 - confirm against <asm/assembler.h>.
 * The round count in w3 is consumed two rounds at a time and the loop ends
 * only when it reaches exactly zero.
 */
186SYM_FUNC_START(chacha_4block_xor_neon)
187	frame_push	10
188
189	// x0: Input state matrix, s
190	// x1: data blocks output, o (byte count bytes)
191	// x2: data blocks input, i (byte count bytes)
192	// w3: nrounds
193	// x4: byte count
194
	// x10 = &.Lpermute[byte count % 64]; only the tail paths at the end of
	// this function load from it
195	adr_l		x10, .Lpermute
196	and		x5, x4, #63
197	add		x10, x10, x5
198
199	//
200	// This function encrypts four consecutive ChaCha blocks by loading
201	// the state matrix in NEON registers four times. The algorithm performs
202	// each operation on the corresponding word of each state matrix, hence
203	// requires no word shuffling. For final XORing step we transpose the
204	// matrix by interleaving 32- and then 64-bit words, which allows us to
205	// do XOR in NEON registers.
206	//
207	// At the same time, a fifth block is encrypted in parallel using
208	// scalar registers
209	//
	// v30 = CTRINC, v31 = ROT8 (the two tables are adjacent in .rodata)
210	adr_l		x9, CTRINC		// ... and ROT8
211	ld1		{v30.4s-v31.4s}, [x9]
212
213	// x0..15[0-3] = s0..3[0..3]
214	add		x8, x0, #16
215	ld4r		{ v0.4s- v3.4s}, [x0]
216	ld4r		{ v4.4s- v7.4s}, [x8], #16
217	ld4r		{ v8.4s-v11.4s}, [x8], #16
218	ld4r		{v12.4s-v15.4s}, [x8]
219
	// scalar copy of the state; a12 is taken before the per-lane counter
	// increments are applied, so the scalar block uses the original counter
220	mov		a0, v0.s[0]
221	mov		a1, v1.s[0]
222	mov		a2, v2.s[0]
223	mov		a3, v3.s[0]
224	mov		a4, v4.s[0]
225	mov		a5, v5.s[0]
226	mov		a6, v6.s[0]
227	mov		a7, v7.s[0]
228	mov		a8, v8.s[0]
229	mov		a9, v9.s[0]
230	mov		a10, v10.s[0]
231	mov		a11, v11.s[0]
232	mov		a12, v12.s[0]
233	mov		a13, v13.s[0]
234	mov		a14, v14.s[0]
235	mov		a15, v15.s[0]
236
237	// x12 += counter values 1-4
238	add		v12.4s, v12.4s, v30.4s
239
	// In the loop below the indented instructions are the scalar block; they
	// mirror the NEON instruction they follow.  'ror #n' on the scalar side
	// is rotl32 by 32 - n (16, 20, 24, 25 -> rotl 16, 12, 8, 7).
240.Ldoubleround4:
241	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
242	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
243	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
244	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
245	add		v0.4s, v0.4s, v4.4s
246	  add		a0, a0, a4
247	add		v1.4s, v1.4s, v5.4s
248	  add		a1, a1, a5
249	add		v2.4s, v2.4s, v6.4s
250	  add		a2, a2, a6
251	add		v3.4s, v3.4s, v7.4s
252	  add		a3, a3, a7
253
254	eor		v12.16b, v12.16b, v0.16b
255	  eor		a12, a12, a0
256	eor		v13.16b, v13.16b, v1.16b
257	  eor		a13, a13, a1
258	eor		v14.16b, v14.16b, v2.16b
259	  eor		a14, a14, a2
260	eor		v15.16b, v15.16b, v3.16b
261	  eor		a15, a15, a3
262
263	rev32		v12.8h, v12.8h
264	  ror		a12, a12, #16
265	rev32		v13.8h, v13.8h
266	  ror		a13, a13, #16
267	rev32		v14.8h, v14.8h
268	  ror		a14, a14, #16
269	rev32		v15.8h, v15.8h
270	  ror		a15, a15, #16
271
272	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
273	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
274	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
275	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
276	add		v8.4s, v8.4s, v12.4s
277	  add		a8, a8, a12
278	add		v9.4s, v9.4s, v13.4s
279	  add		a9, a9, a13
280	add		v10.4s, v10.4s, v14.4s
281	  add		a10, a10, a14
282	add		v11.4s, v11.4s, v15.4s
283	  add		a11, a11, a15
284
285	eor		v16.16b, v4.16b, v8.16b
286	  eor		a4, a4, a8
287	eor		v17.16b, v5.16b, v9.16b
288	  eor		a5, a5, a9
289	eor		v18.16b, v6.16b, v10.16b
290	  eor		a6, a6, a10
291	eor		v19.16b, v7.16b, v11.16b
292	  eor		a7, a7, a11
293
294	shl		v4.4s, v16.4s, #12
295	shl		v5.4s, v17.4s, #12
296	shl		v6.4s, v18.4s, #12
297	shl		v7.4s, v19.4s, #12
298
299	sri		v4.4s, v16.4s, #20
300	  ror		a4, a4, #20
301	sri		v5.4s, v17.4s, #20
302	  ror		a5, a5, #20
303	sri		v6.4s, v18.4s, #20
304	  ror		a6, a6, #20
305	sri		v7.4s, v19.4s, #20
306	  ror		a7, a7, #20
307
308	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
309	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
310	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
311	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
312	add		v0.4s, v0.4s, v4.4s
313	  add		a0, a0, a4
314	add		v1.4s, v1.4s, v5.4s
315	  add		a1, a1, a5
316	add		v2.4s, v2.4s, v6.4s
317	  add		a2, a2, a6
318	add		v3.4s, v3.4s, v7.4s
319	  add		a3, a3, a7
320
321	eor		v12.16b, v12.16b, v0.16b
322	  eor		a12, a12, a0
323	eor		v13.16b, v13.16b, v1.16b
324	  eor		a13, a13, a1
325	eor		v14.16b, v14.16b, v2.16b
326	  eor		a14, a14, a2
327	eor		v15.16b, v15.16b, v3.16b
328	  eor		a15, a15, a3
329
330	tbl		v12.16b, {v12.16b}, v31.16b
331	  ror		a12, a12, #24
332	tbl		v13.16b, {v13.16b}, v31.16b
333	  ror		a13, a13, #24
334	tbl		v14.16b, {v14.16b}, v31.16b
335	  ror		a14, a14, #24
336	tbl		v15.16b, {v15.16b}, v31.16b
337	  ror		a15, a15, #24
338
339	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
340	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
341	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
342	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
343	add		v8.4s, v8.4s, v12.4s
344	  add		a8, a8, a12
345	add		v9.4s, v9.4s, v13.4s
346	  add		a9, a9, a13
347	add		v10.4s, v10.4s, v14.4s
348	  add		a10, a10, a14
349	add		v11.4s, v11.4s, v15.4s
350	  add		a11, a11, a15
351
352	eor		v16.16b, v4.16b, v8.16b
353	  eor		a4, a4, a8
354	eor		v17.16b, v5.16b, v9.16b
355	  eor		a5, a5, a9
356	eor		v18.16b, v6.16b, v10.16b
357	  eor		a6, a6, a10
358	eor		v19.16b, v7.16b, v11.16b
359	  eor		a7, a7, a11
360
361	shl		v4.4s, v16.4s, #7
362	shl		v5.4s, v17.4s, #7
363	shl		v6.4s, v18.4s, #7
364	shl		v7.4s, v19.4s, #7
365
366	sri		v4.4s, v16.4s, #25
367	  ror		a4, a4, #25
368	sri		v5.4s, v17.4s, #25
369	  ror		a5, a5, #25
370	sri		v6.4s, v18.4s, #25
371	 ror		a6, a6, #25
372	sri		v7.4s, v19.4s, #25
373	  ror		a7, a7, #25
374
	// Second (diagonal) round: instead of shuffling words, the register
	// pairings themselves are rotated.
375	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
376	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
377	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
378	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
379	add		v0.4s, v0.4s, v5.4s
380	  add		a0, a0, a5
381	add		v1.4s, v1.4s, v6.4s
382	  add		a1, a1, a6
383	add		v2.4s, v2.4s, v7.4s
384	  add		a2, a2, a7
385	add		v3.4s, v3.4s, v4.4s
386	  add		a3, a3, a4
387
388	eor		v15.16b, v15.16b, v0.16b
389	  eor		a15, a15, a0
390	eor		v12.16b, v12.16b, v1.16b
391	  eor		a12, a12, a1
392	eor		v13.16b, v13.16b, v2.16b
393	  eor		a13, a13, a2
394	eor		v14.16b, v14.16b, v3.16b
395	  eor		a14, a14, a3
396
397	rev32		v15.8h, v15.8h
398	  ror		a15, a15, #16
399	rev32		v12.8h, v12.8h
400	  ror		a12, a12, #16
401	rev32		v13.8h, v13.8h
402	  ror		a13, a13, #16
403	rev32		v14.8h, v14.8h
404	  ror		a14, a14, #16
405
406	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
407	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
408	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
409	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
410	add		v10.4s, v10.4s, v15.4s
411	  add		a10, a10, a15
412	add		v11.4s, v11.4s, v12.4s
413	  add		a11, a11, a12
414	add		v8.4s, v8.4s, v13.4s
415	  add		a8, a8, a13
416	add		v9.4s, v9.4s, v14.4s
417	  add		a9, a9, a14
418
419	eor		v16.16b, v5.16b, v10.16b
420	  eor		a5, a5, a10
421	eor		v17.16b, v6.16b, v11.16b
422	  eor		a6, a6, a11
423	eor		v18.16b, v7.16b, v8.16b
424	  eor		a7, a7, a8
425	eor		v19.16b, v4.16b, v9.16b
426	  eor		a4, a4, a9
427
428	shl		v5.4s, v16.4s, #12
429	shl		v6.4s, v17.4s, #12
430	shl		v7.4s, v18.4s, #12
431	shl		v4.4s, v19.4s, #12
432
433	sri		v5.4s, v16.4s, #20
434	  ror		a5, a5, #20
435	sri		v6.4s, v17.4s, #20
436	  ror		a6, a6, #20
437	sri		v7.4s, v18.4s, #20
438	  ror		a7, a7, #20
439	sri		v4.4s, v19.4s, #20
440	  ror		a4, a4, #20
441
442	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
443	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
444	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
445	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
446	add		v0.4s, v0.4s, v5.4s
447	  add		a0, a0, a5
448	add		v1.4s, v1.4s, v6.4s
449	  add		a1, a1, a6
450	add		v2.4s, v2.4s, v7.4s
451	  add		a2, a2, a7
452	add		v3.4s, v3.4s, v4.4s
453	  add		a3, a3, a4
454
455	eor		v15.16b, v15.16b, v0.16b
456	  eor		a15, a15, a0
457	eor		v12.16b, v12.16b, v1.16b
458	  eor		a12, a12, a1
459	eor		v13.16b, v13.16b, v2.16b
460	  eor		a13, a13, a2
461	eor		v14.16b, v14.16b, v3.16b
462	  eor		a14, a14, a3
463
464	tbl		v15.16b, {v15.16b}, v31.16b
465	  ror		a15, a15, #24
466	tbl		v12.16b, {v12.16b}, v31.16b
467	  ror		a12, a12, #24
468	tbl		v13.16b, {v13.16b}, v31.16b
469	  ror		a13, a13, #24
470	tbl		v14.16b, {v14.16b}, v31.16b
471	  ror		a14, a14, #24
472
473	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
474	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
475	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
476	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
477	add		v10.4s, v10.4s, v15.4s
478	  add		a10, a10, a15
479	add		v11.4s, v11.4s, v12.4s
480	  add		a11, a11, a12
481	add		v8.4s, v8.4s, v13.4s
482	  add		a8, a8, a13
483	add		v9.4s, v9.4s, v14.4s
484	  add		a9, a9, a14
485
486	eor		v16.16b, v5.16b, v10.16b
487	  eor		a5, a5, a10
488	eor		v17.16b, v6.16b, v11.16b
489	  eor		a6, a6, a11
490	eor		v18.16b, v7.16b, v8.16b
491	  eor		a7, a7, a8
492	eor		v19.16b, v4.16b, v9.16b
493	  eor		a4, a4, a9
494
495	shl		v5.4s, v16.4s, #7
496	shl		v6.4s, v17.4s, #7
497	shl		v7.4s, v18.4s, #7
498	shl		v4.4s, v19.4s, #7
499
500	sri		v5.4s, v16.4s, #25
501	  ror		a5, a5, #25
502	sri		v6.4s, v17.4s, #25
503	  ror		a6, a6, #25
504	sri		v7.4s, v18.4s, #25
505	  ror		a7, a7, #25
506	sri		v4.4s, v19.4s, #25
507	  ror		a4, a4, #25
508
509	subs		w3, w3, #2
510	b.ne		.Ldoubleround4
511
	// Feed-forward: reload the original state (splatted across the lanes)
	// and add it to the permuted state, for both the NEON and scalar blocks.
512	ld4r		{v16.4s-v19.4s}, [x0], #16
513	ld4r		{v20.4s-v23.4s}, [x0], #16
514
	// v30 still holds CTRINC here; together with the s3[0] addition further
	// down this re-creates the per-lane counters used as input above.
515	// x12 += counter values 1-4
516	add		v12.4s, v12.4s, v30.4s
517
518	// x0[0-3] += s0[0]
519	// x1[0-3] += s0[1]
520	// x2[0-3] += s0[2]
521	// x3[0-3] += s0[3]
522	add		v0.4s, v0.4s, v16.4s
523	  mov		w6, v16.s[0]
524	  mov		w7, v17.s[0]
525	add		v1.4s, v1.4s, v17.4s
526	  mov		w8, v18.s[0]
527	  mov		w9, v19.s[0]
528	add		v2.4s, v2.4s, v18.4s
529	  add		a0, a0, w6
530	  add		a1, a1, w7
531	add		v3.4s, v3.4s, v19.4s
532	  add		a2, a2, w8
533	  add		a3, a3, w9
534CPU_BE(	  rev		a0, a0		)
535CPU_BE(	  rev		a1, a1		)
536CPU_BE(	  rev		a2, a2		)
537CPU_BE(	  rev		a3, a3		)
538
	// from here on v30/v31 no longer hold CTRINC/ROT8
539	ld4r		{v24.4s-v27.4s}, [x0], #16
540	ld4r		{v28.4s-v31.4s}, [x0]
541
542	// x4[0-3] += s1[0]
543	// x5[0-3] += s1[1]
544	// x6[0-3] += s1[2]
545	// x7[0-3] += s1[3]
546	add		v4.4s, v4.4s, v20.4s
547	  mov		w6, v20.s[0]
548	  mov		w7, v21.s[0]
549	add		v5.4s, v5.4s, v21.4s
550	  mov		w8, v22.s[0]
551	  mov		w9, v23.s[0]
552	add		v6.4s, v6.4s, v22.4s
553	  add		a4, a4, w6
554	  add		a5, a5, w7
555	add		v7.4s, v7.4s, v23.4s
556	  add		a6, a6, w8
557	  add		a7, a7, w9
558CPU_BE(	  rev		a4, a4		)
559CPU_BE(	  rev		a5, a5		)
560CPU_BE(	  rev		a6, a6		)
561CPU_BE(	  rev		a7, a7		)
562
563	// x8[0-3] += s2[0]
564	// x9[0-3] += s2[1]
565	// x10[0-3] += s2[2]
566	// x11[0-3] += s2[3]
567	add		v8.4s, v8.4s, v24.4s
568	  mov		w6, v24.s[0]
569	  mov		w7, v25.s[0]
570	add		v9.4s, v9.4s, v25.4s
571	  mov		w8, v26.s[0]
572	  mov		w9, v27.s[0]
573	add		v10.4s, v10.4s, v26.4s
574	  add		a8, a8, w6
575	  add		a9, a9, w7
576	add		v11.4s, v11.4s, v27.4s
577	  add		a10, a10, w8
578	  add		a11, a11, w9
579CPU_BE(	  rev		a8, a8		)
580CPU_BE(	  rev		a9, a9		)
581CPU_BE(	  rev		a10, a10	)
582CPU_BE(	  rev		a11, a11	)
583
584	// x12[0-3] += s3[0]
585	// x13[0-3] += s3[1]
586	// x14[0-3] += s3[2]
587	// x15[0-3] += s3[3]
588	add		v12.4s, v12.4s, v28.4s
589	  mov		w6, v28.s[0]
590	  mov		w7, v29.s[0]
591	add		v13.4s, v13.4s, v29.4s
592	  mov		w8, v30.s[0]
593	  mov		w9, v31.s[0]
594	add		v14.4s, v14.4s, v30.4s
595	  add		a12, a12, w6
596	  add		a13, a13, w7
597	add		v15.4s, v15.4s, v31.4s
598	  add		a14, a14, w8
599	  add		a15, a15, w9
600CPU_BE(	  rev		a12, a12	)
601CPU_BE(	  rev		a13, a13	)
602CPU_BE(	  rev		a14, a14	)
603CPU_BE(	  rev		a15, a15	)
604
	// Transpose the NEON state so that each group of four registers holds
	// one complete block.  Interleaved with it, the scalar block is XORed
	// with bytes 0-63 of the input (x2 is advanced by 64 by the first ldp,
	// the remaining loads use negative offsets from the new value).
605	// interleave 32-bit words in state n, n+1
606	  ldp		w6, w7, [x2], #64
607	zip1		v16.4s, v0.4s, v1.4s
608	  ldp		w8, w9, [x2, #-56]
609	  eor		a0, a0, w6
610	zip2		v17.4s, v0.4s, v1.4s
611	  eor		a1, a1, w7
612	zip1		v18.4s, v2.4s, v3.4s
613	  eor		a2, a2, w8
614	zip2		v19.4s, v2.4s, v3.4s
615	  eor		a3, a3, w9
616	  ldp		w6, w7, [x2, #-48]
617	zip1		v20.4s, v4.4s, v5.4s
618	  ldp		w8, w9, [x2, #-40]
619	  eor		a4, a4, w6
620	zip2		v21.4s, v4.4s, v5.4s
621	  eor		a5, a5, w7
622	zip1		v22.4s, v6.4s, v7.4s
623	  eor		a6, a6, w8
624	zip2		v23.4s, v6.4s, v7.4s
625	  eor		a7, a7, w9
626	  ldp		w6, w7, [x2, #-32]
627	zip1		v24.4s, v8.4s, v9.4s
628	  ldp		w8, w9, [x2, #-24]
629	  eor		a8, a8, w6
630	zip2		v25.4s, v8.4s, v9.4s
631	  eor		a9, a9, w7
632	zip1		v26.4s, v10.4s, v11.4s
633	  eor		a10, a10, w8
634	zip2		v27.4s, v10.4s, v11.4s
635	  eor		a11, a11, w9
636	  ldp		w6, w7, [x2, #-16]
637	zip1		v28.4s, v12.4s, v13.4s
638	  ldp		w8, w9, [x2, #-8]
639	  eor		a12, a12, w6
640	zip2		v29.4s, v12.4s, v13.4s
641	  eor		a13, a13, w7
642	zip1		v30.4s, v14.4s, v15.4s
643	  eor		a14, a14, w8
644	zip2		v31.4s, v14.4s, v15.4s
645	  eor		a15, a15, w9
646
	// x2 already points 64 bytes into the input, so x3 = input + byte
	// count - 64, i.e. the final 64 bytes of input.  w3 (nrounds) is dead
	// by now, which is why x3 can be reused.
647	add		x3, x2, x4
648	sub		x3, x3, #128		// start of last block
649
	// x5/x6/x7/x8 = byte count - 128/192/256/320: negative when the
	// first/second/third/fourth NEON block is not fully covered.  After each
	// test x2 is redirected to x3 once fewer than 64 bytes remain, so the
	// 64-byte loads below never run past the end of the input.
650	subs		x5, x4, #128
651	csel		x2, x2, x3, ge
652
	// The stp instructions write the scalar block to bytes 0-63 of the
	// output; x1 is advanced by 64 by the first one.
653	// interleave 64-bit words in state n, n+2
654	zip1		v0.2d, v16.2d, v18.2d
655	zip2		v4.2d, v16.2d, v18.2d
656	  stp		a0, a1, [x1], #64
657	zip1		v8.2d, v17.2d, v19.2d
658	zip2		v12.2d, v17.2d, v19.2d
659	  stp		a2, a3, [x1, #-56]
660
661	subs		x6, x4, #192
662	ld1		{v16.16b-v19.16b}, [x2], #64
663	csel		x2, x2, x3, ge
664
665	zip1		v1.2d, v20.2d, v22.2d
666	zip2		v5.2d, v20.2d, v22.2d
667	  stp		a4, a5, [x1, #-48]
668	zip1		v9.2d, v21.2d, v23.2d
669	zip2		v13.2d, v21.2d, v23.2d
670	  stp		a6, a7, [x1, #-40]
671
672	subs		x7, x4, #256
673	ld1		{v20.16b-v23.16b}, [x2], #64
674	csel		x2, x2, x3, ge
675
676	zip1		v2.2d, v24.2d, v26.2d
677	zip2		v6.2d, v24.2d, v26.2d
678	  stp		a8, a9, [x1, #-32]
679	zip1		v10.2d, v25.2d, v27.2d
680	zip2		v14.2d, v25.2d, v27.2d
681	  stp		a10, a11, [x1, #-24]
682
683	subs		x8, x4, #320
684	ld1		{v24.16b-v27.16b}, [x2], #64
685	csel		x2, x2, x3, ge
686
687	zip1		v3.2d, v28.2d, v30.2d
688	zip2		v7.2d, v28.2d, v30.2d
689	  stp		a12, a13, [x1, #-16]
690	zip1		v11.2d, v29.2d, v31.2d
691	zip2		v15.2d, v29.2d, v31.2d
692	  stp		a14, a15, [x1, #-8]
693
	// Keystream is now in v0-v3 (bytes 64-127), v4-v7 (128-191),
	// v8-v11 (192-255) and v12-v15 (256-319).  Each tbnz below leaves the
	// straight-line path at the first NEON block that is incomplete.
694	tbnz		x5, #63, .Lt128
695	ld1		{v28.16b-v31.16b}, [x2]
696
697	// xor with corresponding input, write to output
698	eor		v16.16b, v16.16b, v0.16b
699	eor		v17.16b, v17.16b, v1.16b
700	eor		v18.16b, v18.16b, v2.16b
701	eor		v19.16b, v19.16b, v3.16b
702
703	tbnz		x6, #63, .Lt192
704
705	eor		v20.16b, v20.16b, v4.16b
706	eor		v21.16b, v21.16b, v5.16b
707	eor		v22.16b, v22.16b, v6.16b
708	eor		v23.16b, v23.16b, v7.16b
709
710	st1		{v16.16b-v19.16b}, [x1], #64
711	tbnz		x7, #63, .Lt256
712
713	eor		v24.16b, v24.16b, v8.16b
714	eor		v25.16b, v25.16b, v9.16b
715	eor		v26.16b, v26.16b, v10.16b
716	eor		v27.16b, v27.16b, v11.16b
717
718	st1		{v20.16b-v23.16b}, [x1], #64
719	tbnz		x8, #63, .Lt320
720
721	eor		v28.16b, v28.16b, v12.16b
722	eor		v29.16b, v29.16b, v13.16b
723	eor		v30.16b, v30.16b, v14.16b
724	eor		v31.16b, v31.16b, v15.16b
725
726	st1		{v24.16b-v27.16b}, [x1], #64
727	st1		{v28.16b-v31.16b}, [x1]
728
729.Lout:	frame_pop
730	ret
731
	// Tail paths.  With n = byte count % 64, the 64 bytes at x10 are the
	// values n - 64 ... n - 1.  Used as tbl indexes into a 64-byte keystream
	// block they yield 64 - n zero bytes (the indexes are out of range for
	// the table) followed by keystream bytes 0 ... n - 1, which is the
	// alignment of the input block that was loaded from x3.  The leading
	// bytes of the resulting store overlap the preceding block, which is
	// therefore stored last.
732	// fewer than 192 bytes of in/output
	// Here x1 = output + 64, v16-v19 = finished output for bytes 64-127,
	// v20-v23 = last 64 bytes of input, v4-v7 = keystream for the tail.
733.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
734	ld1		{v28.16b-v31.16b}, [x10]
735	add		x5, x5, x1
736	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
737	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
738	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
739	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
740
	// shared with .Lt128: x5 = address of the last 64 bytes of output
7410:	eor		v20.16b, v20.16b, v28.16b
742	eor		v21.16b, v21.16b, v29.16b
743	eor		v22.16b, v22.16b, v30.16b
744	eor		v23.16b, v23.16b, v31.16b
745	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
7461:	st1		{v16.16b-v19.16b}, [x1]
747	b		.Lout
748
749	// fewer than 128 bytes of in/output
	// The tail uses the keystream of the first NEON block (v0-v3).  x1 is
	// moved back to the start of the output and the block that the scalar
	// code stored there is read back into v16-v19, so that the code at 0:
	// can store it again after the overlapping store.
750.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
751	add		x5, x5, x1
752	sub		x1, x1, #64
753	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
754	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
755	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
756	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
757	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
758	b		0b
759
760	// fewer than 256 bytes of in/output
	// Here x1 = output + 128, v20-v23 = finished output for bytes 128-191,
	// v28-v31 = last 64 bytes of input, v8-v11 = keystream for the tail.
	// v4-v7 and v0-v3 are free by now and serve as scratch registers.
761.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
762	ld1		{v4.16b-v7.16b}, [x10]
763	add		x6, x6, x1
764	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
765	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
766	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
767	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
768
769	eor		v28.16b, v28.16b, v0.16b
770	eor		v29.16b, v29.16b, v1.16b
771	eor		v30.16b, v30.16b, v2.16b
772	eor		v31.16b, v31.16b, v3.16b
773	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
7742:	st1		{v20.16b-v23.16b}, [x1]
775	b		.Lout
776
777	// fewer than 320 bytes of in/output
	// Here x1 = output + 192, v24-v27 = finished output for bytes 192-255,
	// v28-v31 = last 64 bytes of input, v12-v15 = keystream for the tail.
778.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
779	ld1		{v4.16b-v7.16b}, [x10]
780	add		x7, x7, x1
781	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
782	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
783	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
784	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
785
786	eor		v28.16b, v28.16b, v0.16b
787	eor		v29.16b, v29.16b, v1.16b
788	eor		v30.16b, v30.16b, v2.16b
789	eor		v31.16b, v31.16b, v3.16b
790	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
7913:	st1		{v24.16b-v27.16b}, [x1]
792	b		.Lout
793SYM_FUNC_END(chacha_4block_xor_neon)
794
795	.section	".rodata", "a", %progbits
796	.align		L1_CACHE_SHIFT
	// .Lpermute: 128 bytes holding the values -64 ... 63.  The tail paths of
	// chacha_4block_xor_neon load 64 bytes starting at offset
	// (byte count % 64) and use them as tbl indexes to shift a keystream
	// block; the negative entries are out-of-range indexes.
797.Lpermute:
798	.set		.Li, 0
799	.rept		128
800	.byte		(.Li - 64)
801	.set		.Li, .Li + 1
802	.endr
803
	// CTRINC: per-lane block counter increments for the four NEON blocks.
	// ROT8:   tbl byte indexes implementing rotl32(x, 8) on each word.
	// Keep the two adjacent and in this order: chacha_4block_xor_neon
	// fetches both with a single 32-byte load starting at CTRINC.
804CTRINC:	.word		1, 2, 3, 4
805ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
806