/*
 * ChaCha/HChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

 /*
  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
  *
  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
  * (c)  vrev32.16			(16-bit rotations only)
  * (d)  vtbl.8 + vtbl.8		(rotations by multiples of 8 bits only,
  *					 needs index vector)
  *
  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
  * cycles of (b) on both Cortex-A7 and Cortex-A53.
  *
  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
  * and doesn't need a temporary register.
  *
  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
  * is twice as fast as (a), even when doing (a) on multiple registers
  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
  * parallelizes better when temporary registers are scarce.
  *
  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
  * (a), so the need to load the rotation table actually makes the vtbl method
  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
  * seems to be a good compromise to get a more significant speed boost on some
  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
  */
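 /*
  * For reference, the round code below implements the 12-bit rotation with
  * approach (a) roughly as follows, using q4 as the temporary:
  *
  *	vshl.u32	q1, q4, #12		// q1 = q4 << 12
  *	vsri.u32	q1, q4, #20		// q1 |= q4 >> 20
  *
  * and the 8-bit rotation with approach (d), where the index vector d10
  * holds .Lrol8_table (3 0 1 2 7 4 5 6).  That index rotates each 32-bit
  * lane left by 8 bits by moving every byte up one position, with the top
  * byte wrapping around to the bottom:
  *
  *	vtbl.8		d6, {d6}, d10
  *	vtbl.8		d7, {d7}, d10
  */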

#include <linux/linkage.h>
#include <asm/cache.h>

	.text
	.fpu		neon
	.align		5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
chacha_permute:

	adr		ip, .Lrol8_table
	vld1.8		{d10}, [ip, :64]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

	subs		r3, r3, #2
	bne		.Ldoubleround

	bx		lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
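	//
	// From C, the register assignment above corresponds roughly to the
	// following prototype (shown for illustration only; the exact types
	// are whatever the C glue code declares):
	//
	//	void chacha_block_xor_neon(const u32 state[16], u8 *dst,
	//				   const u8 *src, int nrounds);
	//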
	push		{lr}

	// x0..3 = s0..3
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	vmov		q8, q0
	vmov		q9, q1
	vmov		q10, q2
	vmov		q11, q3

	bl		chacha_permute

	add		ip, r2, #0x20
	vld1.8		{q4-q5}, [r2]
	vld1.8		{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor		q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor		q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor		q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor		q3, q3, q7

	add		ip, r1, #0x20
	vst1.8		{q0-q1}, [r1]
	vst1.8		{q2-q3}, [ip]

	pop		{pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
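	//
	// Roughly equivalent C prototype (for illustration only):
	//
	//	void hchacha_block_neon(const u32 state[16], u32 out[8],
	//				int nrounds);
	//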
	push		{lr}

	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

	mov		r3, r2
	bl		chacha_permute

	vst1.32		{q0}, [r1]!
	vst1.32		{q3}, [r1]

	pop		{pc}
ENDPROC(hchacha_block_neon)

	.align		4
.Lctrinc:	.word	0, 1, 2, 3
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6

	.align		5
ENTRY(chacha_4block_xor_neon)
	push		{r4, lr}
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes
	mov		sp, ip

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds
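	// The byte count is passed as a fifth argument on the stack, per the
	// AAPCS; it is picked up further down via 'ldr r4, [r4, #8]'.
	//
	// Roughly equivalent C prototype (for illustration only):
	//
	//	void chacha_4block_xor_neon(const u32 state[16], u8 *dst,
	//				    const u8 *src, int nrounds,
	//				    unsigned int nbytes);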

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix in NEON registers four times. The algorithm performs
	// each operation on the corresponding word of each state matrix, hence
	// requires no word shuffling. The words are re-interleaved before the
	// final addition of the original state and the XORing step.
	//
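	// After the vdup sequence below the state is therefore held
	// transposed: register qN contains state word N, one lane per block
	// (a sketch of the layout; only x12 differs between the blocks once
	// the counter increments have been added):
	//
	//	q0  = { x0[blk0],  x0[blk1],  x0[blk2],  x0[blk3]  }
	//	...
	//	q15 = { x15[blk0], x15[blk1], x15[blk2], x15[blk3] }
	//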

	// x0..15[0-3] = s0..15[0-3]
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	adr		lr, .Lctrinc
	vdup.32		q15, d7[1]
	vdup.32		q14, d7[0]
	vld1.32		{q4}, [lr, :128]
	vdup.32		q13, d6[1]
	vdup.32		q12, d6[0]
	vdup.32		q11, d5[1]
	vdup.32		q10, d5[0]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
	vdup.32		q9, d4[1]
	vdup.32		q8, d4[0]
	vdup.32		q7, d3[1]
	vdup.32		q6, d3[0]
	vdup.32		q5, d2[1]
	vdup.32		q4, d2[0]
	vdup.32		q3, d1[1]
	vdup.32		q2, d1[0]
	vdup.32		q1, d0[1]
	vdup.32		q0, d0[0]

	adr		ip, .Lrol8_table
	b		1f

.Ldoubleround4:
	vld1.32		{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs		r3, r3, #2
	bne		.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	  vadd.u32	q12, q8			// x12 += counter values 0-3
	vswp		d1, d4
	vswp		d3, d6
	  vld1.32	{q8-q9}, [r0]!		// load s0..7
	vswp		d9, d12
	vswp		d11, d14

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp		q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	veor		q8, q8, q0
	veor		q9, q9, q1
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32		{q8-q9}, [sp, :256]
	  mov		sp, r4		// restore original stack pointer
	  ldr		r4, [r4, #8]	// load number of bytes
	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
	  vld1.32	{q0-q1}, [r0]	// load s8..15
	vswp		d25, d28
	vswp		d27, d30
	vswp		d17, d20
	vswp		d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q8,  q8,  q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9,  q9,  q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #96
	veor		q0, q0, q8
	veor		q1, q1, q12
	ble		.Lle96
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q2
	veor		q1, q1, q6
	ble		.Lle128
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q10
	veor		q1, q1, q14
	ble		.Lle160
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q4
	veor		q1, q1, q5
	ble		.Lle192
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q9
	veor		q1, q1, q13
	ble		.Lle224
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q3
	veor		q1, q1, q7
	blt		.Llt256
.Lout:
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]
	veor		q0, q0, q11
	veor		q1, q1, q15
	vst1.8		{q0-q1}, [r1]

	pop		{r4, pc}

.Lle192:
	vmov		q4, q9
	vmov		q5, q13

.Lle160:
	// nothing to do

.Lfinalblock:
	// Process the final block if processing less than 4 full blocks.
	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
	// previous 32 byte output block that still needs to be written at
	// [r1] in q0-q1.
	beq		.Lfullblock

.Lpartialblock:
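	// At this point r4 holds the number of remaining bytes minus 32, a
	// value in the range [-31, -1].  The pointer arithmetic below rewinds
	// r2 to the last 32 bytes of the input, points lr at a window into
	// the .Lpermute table so that the vtbl instructions can rotate the
	// keystream into place, and computes in r4 an output address at which
	// a single 32-byte store ends exactly at the end of the output buffer
	// (the "overlapping stores" below).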
	adr		lr, .Lpermute + 32
	add		r2, r2, r4
	add		lr, lr, r4
	add		r4, r4, r1

	vld1.8		{q2-q3}, [lr]
	vld1.8		{q6-q7}, [r2]

	add		r4, r4, #32

	vtbl.8		d4, {q4-q5}, d4
	vtbl.8		d5, {q4-q5}, d5
	vtbl.8		d6, {q4-q5}, d6
	vtbl.8		d7, {q4-q5}, d7

	veor		q6, q6, q2
	veor		q7, q7, q3

	vst1.8		{q6-q7}, [r4]	// overlapping stores
	vst1.8		{q0-q1}, [r1]
	pop		{r4, pc}

.Lfullblock:
	vmov		q11, q4
	vmov		q15, q5
	b		.Lout
.Lle96:
	vmov		q4, q2
	vmov		q5, q6
	b		.Lfinalblock
.Lle128:
	vmov		q4, q10
	vmov		q5, q14
	b		.Lfinalblock
.Lle224:
	vmov		q4, q3
	vmov		q5, q7
	b		.Lfinalblock
.Llt256:
	vmov		q4, q11
	vmov		q5, q15
	b		.Lpartialblock
ENDPROC(chacha_4block_xor_neon)

	.align		L1_CACHE_SHIFT
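/*
 * Byte indices 0-31, twice over.  .Lpartialblock loads a 32-byte window from
 * this table at a variable offset and uses it with vtbl to rotate the
 * keystream bytes so that the final partial chunk of data can be handled
 * with a single overlapping 32-byte store.
 */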
.Lpermute:
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f