xref: /linux/lib/crypto/arm64/chacha-neon-core.S (revision c4dde411bc366f568dbe33366253bbfea049e8ea)
1/*
2 * ChaCha/HChaCha NEON helper functions
3 *
4 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * Originally based on:
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
12 *
13 * Copyright (C) 2015 Martin Willi
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or
18 * (at your option) any later version.
19 */
20
21#include <linux/linkage.h>
22#include <asm/assembler.h>
23#include <asm/cache.h>
24
25	.text
26	.align		6
27
28/*
29 * chacha_permute - permute one block
30 *
31 * Permute one 64-byte block where the state matrix is stored in the four NEON
32 * registers v0-v3.  It performs matrix operations on four words in parallel,
33 * but requires shuffling to rearrange the words after each round.
34 *
35 * The round count is given in w3.
36 *
37 * Clobbers: w3, x10, v4, v12
38 */
39SYM_FUNC_START_LOCAL(chacha_permute)
	// In:  v0-v3 = the four rows of the 4x4 word state, w3 = round count.
	// Out: v0-v3 = the permuted rows.  The original state is not added
	//      back here.
	//
	// Every pass through .Ldoubleround performs two rounds and the loop
	// only exits when w3 reaches exactly zero, so w3 must be a positive,
	// even number on entry.  The condition flags are modified by the subs
	// at the bottom of the loop.
40
41	adr_l		x10, ROT8
42	ld1		{v12.4s}, [x10]
	// v12 = ROT8 byte-index table; tbl with it rotates each 32-bit word
	// left by 8 bits.
43
44.Ldoubleround:
	// ---- first round of the pair ----
45	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
46	add		v0.4s, v0.4s, v1.4s
47	eor		v3.16b, v3.16b, v0.16b
	// rev32 on halfwords swaps the two 16-bit halves of every word,
	// which is a rotate by 16.
48	rev32		v3.8h, v3.8h
49
50	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
51	add		v2.4s, v2.4s, v3.4s
52	eor		v4.16b, v1.16b, v2.16b
	// v4 is scratch: shl writes (v4 << 12), sri then inserts (v4 >> 20)
	// into the low 12 bits, giving the rotate by 12.
53	shl		v1.4s, v4.4s, #12
54	sri		v1.4s, v4.4s, #20
55
56	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
57	add		v0.4s, v0.4s, v1.4s
58	eor		v3.16b, v3.16b, v0.16b
59	tbl		v3.16b, {v3.16b}, v12.16b
60
61	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
62	add		v2.4s, v2.4s, v3.4s
63	eor		v4.16b, v1.16b, v2.16b
64	shl		v1.4s, v4.4s, #7
65	sri		v1.4s, v4.4s, #25
66
	// Rotate rows 1, 2 and 3 by one, two and three words so that the
	// second round of the pair works on what were the diagonals.
67	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
68	ext		v1.16b, v1.16b, v1.16b, #4
69	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
70	ext		v2.16b, v2.16b, v2.16b, #8
71	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
72	ext		v3.16b, v3.16b, v3.16b, #12
73
	// ---- second round of the pair (same sequence as above) ----
74	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
75	add		v0.4s, v0.4s, v1.4s
76	eor		v3.16b, v3.16b, v0.16b
77	rev32		v3.8h, v3.8h
78
79	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
80	add		v2.4s, v2.4s, v3.4s
81	eor		v4.16b, v1.16b, v2.16b
82	shl		v1.4s, v4.4s, #12
83	sri		v1.4s, v4.4s, #20
84
85	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
86	add		v0.4s, v0.4s, v1.4s
87	eor		v3.16b, v3.16b, v0.16b
88	tbl		v3.16b, {v3.16b}, v12.16b
89
90	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
91	add		v2.4s, v2.4s, v3.4s
92	eor		v4.16b, v1.16b, v2.16b
93	shl		v1.4s, v4.4s, #7
94	sri		v1.4s, v4.4s, #25
95
	// Undo the row rotation applied before the second round.
96	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
97	ext		v1.16b, v1.16b, v1.16b, #12
98	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
99	ext		v2.16b, v2.16b, v2.16b, #8
100	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
101	ext		v3.16b, v3.16b, v3.16b, #4
102
	// Two rounds were consumed by this iteration.
103	subs		w3, w3, #2
104	b.ne		.Ldoubleround
105
106	ret
107SYM_FUNC_END(chacha_permute)
108
109SYM_FUNC_START(chacha_block_xor_neon)
	// Encrypt one 64-byte block: o = i ^ (permute(s) + s).
	// The state at x0 is only read; nothing is written back to it.
110	// x0: Input state matrix, s
111	// x1: 1 data block output, o
112	// x2: 1 data block input, i
113	// w3: nrounds
114
	// The bl below overwrites x30, so save the frame record first.
115	stp		x29, x30, [sp, #-16]!
116	mov		x29, sp
117
118	// x0..3 = s0..3
119	ld1		{v0.4s-v3.4s}, [x0]
	// Second copy of the state, kept for the final addition.  v8-v11 are
	// not in chacha_permute's clobber list (w3, x10, v4, v12).
120	ld1		{v8.4s-v11.4s}, [x0]
121
	// w3 already holds nrounds, which is where chacha_permute expects it.
122	bl		chacha_permute
123
	// The input block is loaded only after the call because
	// chacha_permute uses v4 as scratch.
124	ld1		{v4.16b-v7.16b}, [x2]
125
126	// o0 = i0 ^ (x0 + s0)
127	add		v0.4s, v0.4s, v8.4s
128	eor		v0.16b, v0.16b, v4.16b
129
130	// o1 = i1 ^ (x1 + s1)
131	add		v1.4s, v1.4s, v9.4s
132	eor		v1.16b, v1.16b, v5.16b
133
134	// o2 = i2 ^ (x2 + s2)
135	add		v2.4s, v2.4s, v10.4s
136	eor		v2.16b, v2.16b, v6.16b
137
138	// o3 = i3 ^ (x3 + s3)
139	add		v3.4s, v3.4s, v11.4s
140	eor		v3.16b, v3.16b, v7.16b
141
	// All 64 input bytes were loaded before this store.
142	st1		{v0.16b-v3.16b}, [x1]
143
144	ldp		x29, x30, [sp], #16
145	ret
146SYM_FUNC_END(chacha_block_xor_neon)
147
148SYM_FUNC_START(hchacha_block_neon)
	// Run the permutation on the state at x0 and store rows 0 and 3 of
	// the result (8 words in total) at x1.  Unlike chacha_block_xor_neon,
	// the original state is not added to the permuted state.
149	// x0: Input state matrix, s
150	// x1: output (8 32-bit words)
151	// w2: nrounds
152
	// The bl below overwrites x30, so save the frame record first.
153	stp		x29, x30, [sp, #-16]!
154	mov		x29, sp
155
156	ld1		{v0.4s-v3.4s}, [x0]
157
	// chacha_permute takes its round count in w3.
158	mov		w3, w2
159	bl		chacha_permute
160
	// Row 0 goes to out[0..3]; the post-increment advances x1 by 16 so
	// that row 3 lands in out[4..7].
161	st1		{v0.4s}, [x1], #16
162	st1		{v3.4s}, [x1]
163
164	ldp		x29, x30, [sp], #16
165	ret
166SYM_FUNC_END(hchacha_block_neon)
167
	// Aliases for the 16 state words of the block that
	// chacha_4block_xor_neon computes in general-purpose registers:
	// aN holds state word N.  The mapping runs w12-w17 and then w19-w28;
	// w18 is skipped (presumably because x18 is the platform register -
	// confirm).  a6-a15 live in x19-x28, which AAPCS64 treats as
	// callee-saved, so any function using them has to preserve them.
168	a0		.req	w12
169	a1		.req	w13
170	a2		.req	w14
171	a3		.req	w15
172	a4		.req	w16
173	a5		.req	w17
174	a6		.req	w19
175	a7		.req	w20
176	a8		.req	w21
177	a9		.req	w22
178	a10		.req	w23
179	a11		.req	w24
180	a12		.req	w25
181	a13		.req	w26
182	a14		.req	w27
183	a15		.req	w28
184
185	.align		6
186SYM_FUNC_START(chacha_4block_xor_neon)
	// Produces up to five consecutive 64-byte blocks (320 bytes) of
	// output.  Block 0 is computed in the scalar registers a0-a15 using
	// the counter word exactly as stored in the state; blocks 1-4 are
	// computed in the four lanes of v0-v15 with the counter word
	// incremented by 1, 2, 3 and 4.  The state at x0 is only read.
	//
	// NOTE(review): frame_push/frame_pop are macros defined outside this
	// file; presumably "frame_push 10" saves x29/x30 plus the ten
	// registers x19-x28 that back a6-a15 - confirm in asm/assembler.h.
	//
	// NOTE(review): the scalar block always reads and writes a full 64
	// bytes, and the tail handling addresses the last 64 input bytes as
	// i + count - 64, so callers presumably guarantee
	// 64 <= count <= 320 - confirm against the C glue code.
187	frame_push	10
188
189	// x0: Input state matrix, s
190	// x1: 4 data blocks output, o
191	// x2: 4 data blocks input, i
192	// w3: nrounds
193	// x4: byte count
194
	// x10 = .Lpermute + (count % 64): start of the 64 tbl indices that
	// position the keystream of the final, partial block (used by the
	// .Lt128/.Lt192/.Lt256/.Lt320 paths at the end of the function).
195	adr_l		x10, .Lpermute
196	and		x5, x4, #63
197	add		x10, x10, x5
198
199	//
200	// This function encrypts four consecutive ChaCha blocks by loading
201	// the state matrix in NEON registers four times. The algorithm performs
202	// each operation on the corresponding word of each state matrix, hence
203	// requires no word shuffling. For final XORing step we transpose the
204	// matrix by interleaving 32- and then 64-bit words, which allows us to
205	// do XOR in NEON registers.
206	//
207	// At the same time, a fifth block is encrypted in parallel using
208	// scalar registers
209	//
	// One 32-byte load fetches both tables: v30 = CTRINC, v31 = ROT8.
	// This relies on ROT8 immediately following CTRINC in .rodata.
210	adr_l		x9, CTRINC		// ... and ROT8
211	ld1		{v30.4s-v31.4s}, [x9]
212
	// ld4r reads four consecutive words and replicates each one across
	// all lanes of its own register, so vN = state word N in every lane.
213	// x0..15[0-3] = s0..3[0..3]
214	add		x8, x0, #16
215	ld4r		{ v0.4s- v3.4s}, [x0]
216	ld4r		{ v4.4s- v7.4s}, [x8], #16
217	ld4r		{ v8.4s-v11.4s}, [x8], #16
218	ld4r		{v12.4s-v15.4s}, [x8]
219
	// Seed the scalar block from lane 0.  This happens before the NEON
	// counters are offset below, so the scalar block keeps the counter
	// value as stored in the state.
220	mov		a0, v0.s[0]
221	mov		a1, v1.s[0]
222	mov		a2, v2.s[0]
223	mov		a3, v3.s[0]
224	mov		a4, v4.s[0]
225	mov		a5, v5.s[0]
226	mov		a6, v6.s[0]
227	mov		a7, v7.s[0]
228	mov		a8, v8.s[0]
229	mov		a9, v9.s[0]
230	mov		a10, v10.s[0]
231	mov		a11, v11.s[0]
232	mov		a12, v12.s[0]
233	mov		a13, v13.s[0]
234	mov		a14, v14.s[0]
235	mov		a15, v15.s[0]
236
237	// x12 += counter values 1-4
238	add		v12.4s, v12.4s, v30.4s
239
	// Layout convention used from here on: NEON instructions sit at the
	// normal indentation, and the instructions of the scalar block are
	// indented by two extra spaces and interleaved with them.  The scalar
	// rotates are written as rotate-right: ror #16, #20, #24 and #25 are
	// rotl32 by 16, 12, 8 and 7.  On the NEON side the rotates use rev32
	// (16), shl+sri with v16-v19 as scratch (12 and 7) and tbl with the
	// ROT8 table in v31 (8).  Each iteration performs two rounds.
240.Ldoubleround4:
	// ---- column round ----
241	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
242	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
243	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
244	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
245	add		v0.4s, v0.4s, v4.4s
246	  add		a0, a0, a4
247	add		v1.4s, v1.4s, v5.4s
248	  add		a1, a1, a5
249	add		v2.4s, v2.4s, v6.4s
250	  add		a2, a2, a6
251	add		v3.4s, v3.4s, v7.4s
252	  add		a3, a3, a7
253
254	eor		v12.16b, v12.16b, v0.16b
255	  eor		a12, a12, a0
256	eor		v13.16b, v13.16b, v1.16b
257	  eor		a13, a13, a1
258	eor		v14.16b, v14.16b, v2.16b
259	  eor		a14, a14, a2
260	eor		v15.16b, v15.16b, v3.16b
261	  eor		a15, a15, a3
262
263	rev32		v12.8h, v12.8h
264	  ror		a12, a12, #16
265	rev32		v13.8h, v13.8h
266	  ror		a13, a13, #16
267	rev32		v14.8h, v14.8h
268	  ror		a14, a14, #16
269	rev32		v15.8h, v15.8h
270	  ror		a15, a15, #16
271
272	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
273	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
274	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
275	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
276	add		v8.4s, v8.4s, v12.4s
277	  add		a8, a8, a12
278	add		v9.4s, v9.4s, v13.4s
279	  add		a9, a9, a13
280	add		v10.4s, v10.4s, v14.4s
281	  add		a10, a10, a14
282	add		v11.4s, v11.4s, v15.4s
283	  add		a11, a11, a15
284
285	eor		v16.16b, v4.16b, v8.16b
286	  eor		a4, a4, a8
287	eor		v17.16b, v5.16b, v9.16b
288	  eor		a5, a5, a9
289	eor		v18.16b, v6.16b, v10.16b
290	  eor		a6, a6, a10
291	eor		v19.16b, v7.16b, v11.16b
292	  eor		a7, a7, a11
293
294	shl		v4.4s, v16.4s, #12
295	shl		v5.4s, v17.4s, #12
296	shl		v6.4s, v18.4s, #12
297	shl		v7.4s, v19.4s, #12
298
299	sri		v4.4s, v16.4s, #20
300	  ror		a4, a4, #20
301	sri		v5.4s, v17.4s, #20
302	  ror		a5, a5, #20
303	sri		v6.4s, v18.4s, #20
304	  ror		a6, a6, #20
305	sri		v7.4s, v19.4s, #20
306	  ror		a7, a7, #20
307
308	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
309	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
310	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
311	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
312	add		v0.4s, v0.4s, v4.4s
313	  add		a0, a0, a4
314	add		v1.4s, v1.4s, v5.4s
315	  add		a1, a1, a5
316	add		v2.4s, v2.4s, v6.4s
317	  add		a2, a2, a6
318	add		v3.4s, v3.4s, v7.4s
319	  add		a3, a3, a7
320
321	eor		v12.16b, v12.16b, v0.16b
322	  eor		a12, a12, a0
323	eor		v13.16b, v13.16b, v1.16b
324	  eor		a13, a13, a1
325	eor		v14.16b, v14.16b, v2.16b
326	  eor		a14, a14, a2
327	eor		v15.16b, v15.16b, v3.16b
328	  eor		a15, a15, a3
329
330	tbl		v12.16b, {v12.16b}, v31.16b
331	  ror		a12, a12, #24
332	tbl		v13.16b, {v13.16b}, v31.16b
333	  ror		a13, a13, #24
334	tbl		v14.16b, {v14.16b}, v31.16b
335	  ror		a14, a14, #24
336	tbl		v15.16b, {v15.16b}, v31.16b
337	  ror		a15, a15, #24
338
339	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
340	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
341	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
342	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
343	add		v8.4s, v8.4s, v12.4s
344	  add		a8, a8, a12
345	add		v9.4s, v9.4s, v13.4s
346	  add		a9, a9, a13
347	add		v10.4s, v10.4s, v14.4s
348	  add		a10, a10, a14
349	add		v11.4s, v11.4s, v15.4s
350	  add		a11, a11, a15
351
352	eor		v16.16b, v4.16b, v8.16b
353	  eor		a4, a4, a8
354	eor		v17.16b, v5.16b, v9.16b
355	  eor		a5, a5, a9
356	eor		v18.16b, v6.16b, v10.16b
357	  eor		a6, a6, a10
358	eor		v19.16b, v7.16b, v11.16b
359	  eor		a7, a7, a11
360
361	shl		v4.4s, v16.4s, #7
362	shl		v5.4s, v17.4s, #7
363	shl		v6.4s, v18.4s, #7
364	shl		v7.4s, v19.4s, #7
365
366	sri		v4.4s, v16.4s, #25
367	  ror		a4, a4, #25
368	sri		v5.4s, v17.4s, #25
369	  ror		a5, a5, #25
370	sri		v6.4s, v18.4s, #25
371	 ror		a6, a6, #25
372	sri		v7.4s, v19.4s, #25
373	  ror		a7, a7, #25
374
	// ---- diagonal round ----
	// Because every state word has its own register, the diagonals are
	// selected simply by pairing different registers; no data is moved.
375	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
376	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
377	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
378	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
379	add		v0.4s, v0.4s, v5.4s
380	  add		a0, a0, a5
381	add		v1.4s, v1.4s, v6.4s
382	  add		a1, a1, a6
383	add		v2.4s, v2.4s, v7.4s
384	  add		a2, a2, a7
385	add		v3.4s, v3.4s, v4.4s
386	  add		a3, a3, a4
387
388	eor		v15.16b, v15.16b, v0.16b
389	  eor		a15, a15, a0
390	eor		v12.16b, v12.16b, v1.16b
391	  eor		a12, a12, a1
392	eor		v13.16b, v13.16b, v2.16b
393	  eor		a13, a13, a2
394	eor		v14.16b, v14.16b, v3.16b
395	  eor		a14, a14, a3
396
397	rev32		v15.8h, v15.8h
398	  ror		a15, a15, #16
399	rev32		v12.8h, v12.8h
400	  ror		a12, a12, #16
401	rev32		v13.8h, v13.8h
402	  ror		a13, a13, #16
403	rev32		v14.8h, v14.8h
404	  ror		a14, a14, #16
405
406	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
407	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
408	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
409	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
410	add		v10.4s, v10.4s, v15.4s
411	  add		a10, a10, a15
412	add		v11.4s, v11.4s, v12.4s
413	  add		a11, a11, a12
414	add		v8.4s, v8.4s, v13.4s
415	  add		a8, a8, a13
416	add		v9.4s, v9.4s, v14.4s
417	  add		a9, a9, a14
418
419	eor		v16.16b, v5.16b, v10.16b
420	  eor		a5, a5, a10
421	eor		v17.16b, v6.16b, v11.16b
422	  eor		a6, a6, a11
423	eor		v18.16b, v7.16b, v8.16b
424	  eor		a7, a7, a8
425	eor		v19.16b, v4.16b, v9.16b
426	  eor		a4, a4, a9
427
428	shl		v5.4s, v16.4s, #12
429	shl		v6.4s, v17.4s, #12
430	shl		v7.4s, v18.4s, #12
431	shl		v4.4s, v19.4s, #12
432
433	sri		v5.4s, v16.4s, #20
434	  ror		a5, a5, #20
435	sri		v6.4s, v17.4s, #20
436	  ror		a6, a6, #20
437	sri		v7.4s, v18.4s, #20
438	  ror		a7, a7, #20
439	sri		v4.4s, v19.4s, #20
440	  ror		a4, a4, #20
441
442	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
443	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
444	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
445	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
446	add		v0.4s, v0.4s, v5.4s
447	  add		a0, a0, a5
448	add		v1.4s, v1.4s, v6.4s
449	  add		a1, a1, a6
450	add		v2.4s, v2.4s, v7.4s
451	  add		a2, a2, a7
452	add		v3.4s, v3.4s, v4.4s
453	  add		a3, a3, a4
454
455	eor		v15.16b, v15.16b, v0.16b
456	  eor		a15, a15, a0
457	eor		v12.16b, v12.16b, v1.16b
458	  eor		a12, a12, a1
459	eor		v13.16b, v13.16b, v2.16b
460	  eor		a13, a13, a2
461	eor		v14.16b, v14.16b, v3.16b
462	  eor		a14, a14, a3
463
464	tbl		v15.16b, {v15.16b}, v31.16b
465	  ror		a15, a15, #24
466	tbl		v12.16b, {v12.16b}, v31.16b
467	  ror		a12, a12, #24
468	tbl		v13.16b, {v13.16b}, v31.16b
469	  ror		a13, a13, #24
470	tbl		v14.16b, {v14.16b}, v31.16b
471	  ror		a14, a14, #24
472
473	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
474	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
475	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
476	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
477	add		v10.4s, v10.4s, v15.4s
478	  add		a10, a10, a15
479	add		v11.4s, v11.4s, v12.4s
480	  add		a11, a11, a12
481	add		v8.4s, v8.4s, v13.4s
482	  add		a8, a8, a13
483	add		v9.4s, v9.4s, v14.4s
484	  add		a9, a9, a14
485
486	eor		v16.16b, v5.16b, v10.16b
487	  eor		a5, a5, a10
488	eor		v17.16b, v6.16b, v11.16b
489	  eor		a6, a6, a11
490	eor		v18.16b, v7.16b, v8.16b
491	  eor		a7, a7, a8
492	eor		v19.16b, v4.16b, v9.16b
493	  eor		a4, a4, a9
494
495	shl		v5.4s, v16.4s, #7
496	shl		v6.4s, v17.4s, #7
497	shl		v7.4s, v18.4s, #7
498	shl		v4.4s, v19.4s, #7
499
500	sri		v5.4s, v16.4s, #25
501	  ror		a5, a5, #25
502	sri		v6.4s, v17.4s, #25
503	  ror		a6, a6, #25
504	sri		v7.4s, v18.4s, #25
505	  ror		a7, a7, #25
506	sri		v4.4s, v19.4s, #25
507	  ror		a4, a4, #25
508
	// Two rounds done; w3 must reach exactly zero for the loop to exit,
	// so nrounds has to be a positive even number.
509	subs		w3, w3, #2
510	b.ne		.Ldoubleround4
511
	// Feed-forward: add the original state to the permuted state.  The
	// state is re-read from memory with ld4r (x0 is advanced as it goes),
	// and lane 0 of each broadcast register supplies the scalar block.
512	ld4r		{v16.4s-v19.4s}, [x0], #16
513	ld4r		{v20.4s-v23.4s}, [x0], #16
514
	// v30 still holds CTRINC here, so this re-applies the per-lane
	// offsets that were added to x12 before the loop; the broadcast s3[0]
	// added further down does not contain them.
515	// x12 += counter values 1-4
516	add		v12.4s, v12.4s, v30.4s
517
518	// x0[0-3] += s0[0]
519	// x1[0-3] += s0[1]
520	// x2[0-3] += s0[2]
521	// x3[0-3] += s0[3]
522	add		v0.4s, v0.4s, v16.4s
523	  mov		w6, v16.s[0]
524	  mov		w7, v17.s[0]
525	add		v1.4s, v1.4s, v17.4s
526	  mov		w8, v18.s[0]
527	  mov		w9, v19.s[0]
528	add		v2.4s, v2.4s, v18.4s
529	  add		a0, a0, w6
530	  add		a1, a1, w7
531	add		v3.4s, v3.4s, v19.4s
532	  add		a2, a2, w8
533	  add		a3, a3, w9
534
	// This overwrites v30/v31 (CTRINC/ROT8); neither table is needed
	// after this point.
535	ld4r		{v24.4s-v27.4s}, [x0], #16
536	ld4r		{v28.4s-v31.4s}, [x0]
537
538	// x4[0-3] += s1[0]
539	// x5[0-3] += s1[1]
540	// x6[0-3] += s1[2]
541	// x7[0-3] += s1[3]
542	add		v4.4s, v4.4s, v20.4s
543	  mov		w6, v20.s[0]
544	  mov		w7, v21.s[0]
545	add		v5.4s, v5.4s, v21.4s
546	  mov		w8, v22.s[0]
547	  mov		w9, v23.s[0]
548	add		v6.4s, v6.4s, v22.4s
549	  add		a4, a4, w6
550	  add		a5, a5, w7
551	add		v7.4s, v7.4s, v23.4s
552	  add		a6, a6, w8
553	  add		a7, a7, w9
554
555	// x8[0-3] += s2[0]
556	// x9[0-3] += s2[1]
557	// x10[0-3] += s2[2]
558	// x11[0-3] += s2[3]
559	add		v8.4s, v8.4s, v24.4s
560	  mov		w6, v24.s[0]
561	  mov		w7, v25.s[0]
562	add		v9.4s, v9.4s, v25.4s
563	  mov		w8, v26.s[0]
564	  mov		w9, v27.s[0]
565	add		v10.4s, v10.4s, v26.4s
566	  add		a8, a8, w6
567	  add		a9, a9, w7
568	add		v11.4s, v11.4s, v27.4s
569	  add		a10, a10, w8
570	  add		a11, a11, w9
571
572	// x12[0-3] += s3[0]
573	// x13[0-3] += s3[1]
574	// x14[0-3] += s3[2]
575	// x15[0-3] += s3[3]
576	add		v12.4s, v12.4s, v28.4s
577	  mov		w6, v28.s[0]
578	  mov		w7, v29.s[0]
579	add		v13.4s, v13.4s, v29.4s
580	  mov		w8, v30.s[0]
581	  mov		w9, v31.s[0]
582	add		v14.4s, v14.4s, v30.4s
583	  add		a12, a12, w6
584	  add		a13, a13, w7
585	add		v15.4s, v15.4s, v31.4s
586	  add		a14, a14, w8
587	  add		a15, a15, w9
588
	// The NEON side starts transposing (zip1/zip2 on words) while the
	// scalar side XORs its keystream with the first 64 input bytes.  The
	// first ldp advances x2 by 64; the remaining scalar loads use
	// negative offsets from the advanced pointer.
589	// interleave 32-bit words in state n, n+1
590	  ldp		w6, w7, [x2], #64
591	zip1		v16.4s, v0.4s, v1.4s
592	  ldp		w8, w9, [x2, #-56]
593	  eor		a0, a0, w6
594	zip2		v17.4s, v0.4s, v1.4s
595	  eor		a1, a1, w7
596	zip1		v18.4s, v2.4s, v3.4s
597	  eor		a2, a2, w8
598	zip2		v19.4s, v2.4s, v3.4s
599	  eor		a3, a3, w9
600	  ldp		w6, w7, [x2, #-48]
601	zip1		v20.4s, v4.4s, v5.4s
602	  ldp		w8, w9, [x2, #-40]
603	  eor		a4, a4, w6
604	zip2		v21.4s, v4.4s, v5.4s
605	  eor		a5, a5, w7
606	zip1		v22.4s, v6.4s, v7.4s
607	  eor		a6, a6, w8
608	zip2		v23.4s, v6.4s, v7.4s
609	  eor		a7, a7, w9
610	  ldp		w6, w7, [x2, #-32]
611	zip1		v24.4s, v8.4s, v9.4s
612	  ldp		w8, w9, [x2, #-24]
613	  eor		a8, a8, w6
614	zip2		v25.4s, v8.4s, v9.4s
615	  eor		a9, a9, w7
616	zip1		v26.4s, v10.4s, v11.4s
617	  eor		a10, a10, w8
618	zip2		v27.4s, v10.4s, v11.4s
619	  eor		a11, a11, w9
620	  ldp		w6, w7, [x2, #-16]
621	zip1		v28.4s, v12.4s, v13.4s
622	  ldp		w8, w9, [x2, #-8]
623	  eor		a12, a12, w6
624	zip2		v29.4s, v12.4s, v13.4s
625	  eor		a13, a13, w7
626	zip1		v30.4s, v14.4s, v15.4s
627	  eor		a14, a14, w8
628	zip2		v31.4s, v14.4s, v15.4s
629	  eor		a15, a15, w9
630
	// x2 already points 64 bytes into the input, so x3 = i + count - 64,
	// the address of the final 64 bytes of the input.  w3 (nrounds) is
	// zero by now, so x3 is free to reuse.
631	add		x3, x2, x4
632	sub		x3, x3, #128		// start of last block
633
	// x5, x6, x7 and x8 = count - 128, 192, 256 and 320.  A negative
	// value means the corresponding NEON block is partial or absent.  In
	// that case x2 is redirected to x3, so the following 64-byte load
	// re-reads the end of the input instead of running past it.
634	subs		x5, x4, #128
635	csel		x2, x2, x3, ge
636
	// After the 64-bit interleave, v0-v3 hold the keystream of the lane 0
	// block, v4-v7 lane 1, v8-v11 lane 2 and v12-v15 lane 3.  The scalar
	// block is stored to the first 64 output bytes along the way (x1 is
	// advanced by 64 by the first stp).
637	// interleave 64-bit words in state n, n+2
638	zip1		v0.2d, v16.2d, v18.2d
639	zip2		v4.2d, v16.2d, v18.2d
640	  stp		a0, a1, [x1], #64
641	zip1		v8.2d, v17.2d, v19.2d
642	zip2		v12.2d, v17.2d, v19.2d
643	  stp		a2, a3, [x1, #-56]
644
645	subs		x6, x4, #192
646	ld1		{v16.16b-v19.16b}, [x2], #64
647	csel		x2, x2, x3, ge
648
649	zip1		v1.2d, v20.2d, v22.2d
650	zip2		v5.2d, v20.2d, v22.2d
651	  stp		a4, a5, [x1, #-48]
652	zip1		v9.2d, v21.2d, v23.2d
653	zip2		v13.2d, v21.2d, v23.2d
654	  stp		a6, a7, [x1, #-40]
655
656	subs		x7, x4, #256
657	ld1		{v20.16b-v23.16b}, [x2], #64
658	csel		x2, x2, x3, ge
659
660	zip1		v2.2d, v24.2d, v26.2d
661	zip2		v6.2d, v24.2d, v26.2d
662	  stp		a8, a9, [x1, #-32]
663	zip1		v10.2d, v25.2d, v27.2d
664	zip2		v14.2d, v25.2d, v27.2d
665	  stp		a10, a11, [x1, #-24]
666
667	subs		x8, x4, #320
668	ld1		{v24.16b-v27.16b}, [x2], #64
669	csel		x2, x2, x3, ge
670
671	zip1		v3.2d, v28.2d, v30.2d
672	zip2		v7.2d, v28.2d, v30.2d
673	  stp		a12, a13, [x1, #-16]
674	zip1		v11.2d, v29.2d, v31.2d
675	zip2		v15.2d, v29.2d, v31.2d
676	  stp		a14, a15, [x1, #-8]
677
	// Bit 63 is the sign bit: branch if count < 128.
678	tbnz		x5, #63, .Lt128
679	ld1		{v28.16b-v31.16b}, [x2]
680
681	// xor with corresponding input, write to output
682	eor		v16.16b, v16.16b, v0.16b
683	eor		v17.16b, v17.16b, v1.16b
684	eor		v18.16b, v18.16b, v2.16b
685	eor		v19.16b, v19.16b, v3.16b
686
	// count < 192
687	tbnz		x6, #63, .Lt192
688
689	eor		v20.16b, v20.16b, v4.16b
690	eor		v21.16b, v21.16b, v5.16b
691	eor		v22.16b, v22.16b, v6.16b
692	eor		v23.16b, v23.16b, v7.16b
693
694	st1		{v16.16b-v19.16b}, [x1], #64
	// count < 256
695	tbnz		x7, #63, .Lt256
696
697	eor		v24.16b, v24.16b, v8.16b
698	eor		v25.16b, v25.16b, v9.16b
699	eor		v26.16b, v26.16b, v10.16b
700	eor		v27.16b, v27.16b, v11.16b
701
702	st1		{v20.16b-v23.16b}, [x1], #64
	// count < 320
703	tbnz		x8, #63, .Lt320
704
705	eor		v28.16b, v28.16b, v12.16b
706	eor		v29.16b, v29.16b, v13.16b
707	eor		v30.16b, v30.16b, v14.16b
708	eor		v31.16b, v31.16b, v15.16b
709
710	st1		{v24.16b-v27.16b}, [x1], #64
711	st1		{v28.16b-v31.16b}, [x1]
712
713.Lout:	frame_pop
714	ret
715
	// Tail handling, common scheme for all .Lt* paths: the last 64 input
	// bytes were loaded from x3 = i + count - 64.  The 64 indices at x10
	// are (count % 64) - 64 ... (count % 64) - 1, so the 4-register tbl
	// moves the first (count % 64) keystream bytes of the partial block to
	// the end of the vector and yields zero for the out-of-range indices
	// in front of them.  The XORed result is stored at o + count - 64,
	// which overlaps the preceding block; that block is stored afterwards
	// so that its bytes end up with the correct values.
716	// fewer than 192 bytes of in/output
717.Lt192:	cbz		x5, 1f				// exactly 128 bytes?
718	ld1		{v28.16b-v31.16b}, [x10]
	// x5 = (count - 128) + (o + 64) = o + count - 64
719	add		x5, x5, x1
720	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
721	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
722	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
723	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
724
	// Shared with .Lt128: v20-v23 = last 64 input bytes, v28-v31 =
	// shifted keystream, v16-v19 = complete preceding output block.
7250:	eor		v20.16b, v20.16b, v28.16b
726	eor		v21.16b, v21.16b, v29.16b
727	eor		v22.16b, v22.16b, v30.16b
728	eor		v23.16b, v23.16b, v31.16b
729	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
7301:	st1		{v16.16b-v19.16b}, [x1]
731	b		.Lout
732
733	// fewer than 128 bytes of in/output
734.Lt128:	ld1		{v28.16b-v31.16b}, [x10]
	// x5 = (count - 128) + (o + 64) = o + count - 64; x1 is moved back
	// to o, the start of the scalar block's output.
735	add		x5, x5, x1
736	sub		x1, x1, #64
	// The partial block here is the lane 0 block (v0-v3).
737	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
738	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
739	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
740	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
	// The scalar block's output exists only in memory (it was written by
	// the stp instructions above), so it is read back into v16-v19 in
	// order to be re-stored after the overlapping store at 0: below.
741	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
742	b		0b
743
744	// fewer than 256 bytes of in/output
745.Lt256:	cbz		x6, 2f				// exactly 192 bytes?
	// v4-v7 are free here (their keystream was already consumed), so they
	// take the tbl indices; v28-v31 hold the last 64 input bytes.
746	ld1		{v4.16b-v7.16b}, [x10]
	// x6 = (count - 192) + (o + 128) = o + count - 64
747	add		x6, x6, x1
748	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
749	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
750	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
751	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
752
753	eor		v28.16b, v28.16b, v0.16b
754	eor		v29.16b, v29.16b, v1.16b
755	eor		v30.16b, v30.16b, v2.16b
756	eor		v31.16b, v31.16b, v3.16b
757	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
7582:	st1		{v20.16b-v23.16b}, [x1]
759	b		.Lout
760
761	// fewer than 320 bytes of in/output
762.Lt320:	cbz		x7, 3f				// exactly 256 bytes?
763	ld1		{v4.16b-v7.16b}, [x10]
	// x7 = (count - 256) + (o + 192) = o + count - 64
764	add		x7, x7, x1
765	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
766	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
767	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
768	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
769
770	eor		v28.16b, v28.16b, v0.16b
771	eor		v29.16b, v29.16b, v1.16b
772	eor		v30.16b, v30.16b, v2.16b
773	eor		v31.16b, v31.16b, v3.16b
774	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
7753:	st1		{v24.16b-v27.16b}, [x1]
776	b		.Lout
777SYM_FUNC_END(chacha_4block_xor_neon)
778
779	.section	".rodata", "a", %progbits
780	.align		L1_CACHE_SHIFT
	// .Lpermute: 128 bytes holding the values -64, -63, ..., 63 (each
	// truncated to a byte).  chacha_4block_xor_neon loads 64 of them
	// starting at offset (byte count % 64) and uses them as tbl indices
	// into a 64-byte keystream block; the negative entries are out of
	// range for the table and therefore produce zero bytes.
781.Lpermute:
782	.set		.Li, 0
783	.rept		128
784	.byte		(.Li - 64)
785	.set		.Li, .Li + 1
786	.endr
787
	// CTRINC and ROT8 must stay adjacent and in this order:
	// chacha_4block_xor_neon fetches both with a single 32-byte load
	// from CTRINC, while chacha_permute loads ROT8 on its own.
	//
	// CTRINC: per-lane offsets added to the counter word of the four
	//         NEON blocks.
	// ROT8:   tbl byte indices; within each 32-bit word the result bytes
	//         are taken from source bytes 3, 0, 1, 2 (lowest address
	//         first), i.e. each little-endian word is rotated left by 8.
788CTRINC:	.word		1, 2, 3, 4
789ROT8:	.word		0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
790
790