/*
 * ChaCha/XChaCha NEON helper functions
 *
 * Copyright (C) 2016 Linaro, Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * Based on:
 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
 *
 * Copyright (C) 2015 Martin Willi
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 */

 /*
  * NEON doesn't have a rotate instruction.  The alternatives are, more or less:
  *
  * (a)  vshl.u32 + vsri.u32		(needs temporary register)
  * (b)  vshl.u32 + vshr.u32 + vorr	(needs temporary register)
  * (c)  vrev32.16			(16-bit rotations only)
  * (d)  vtbl.8 + vtbl.8		(rotations by multiples of 8 bits only,
  *					 needs index vector)
  *
  * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
  * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
  * cycles of (b) on both Cortex-A7 and Cortex-A53.
  *
  * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
  * and doesn't need a temporary register.
  *
  * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
  * is twice as fast as (a), even when doing (a) on multiple registers
  * simultaneously to eliminate the stall between vshl and vsri.  Also, it
  * parallelizes better when temporary registers are scarce.
  *
  * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
  * (a), so the need to load the rotation table actually makes the vtbl method
  * slightly slower overall on that CPU (~1.3% slower ChaCha20).  Still, it
  * seems to be a good compromise to get a more significant speed boost on some
  * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
  */
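 /*
  * As a concrete sketch of the idioms chosen above (the register assignments
  * are only illustrative and simply mirror chacha_permute below): rotating
  * every 32-bit lane left by 12, with the pre-XORed value held in the scratch
  * register q4, is
  *
  *	vshl.u32	q1, q4, #12	// q1  = q4 << 12
  *	vsri.u32	q1, q4, #20	// q1 |= q4 >> (32 - 12)
  *
  * and rotating left by 8, with the .Lrol8_table index vector loaded in d10,
  * is
  *
  *	vtbl.8		d6, {d6}, d10	// permute bytes of the low half
  *	vtbl.8		d7, {d7}, d10	// permute bytes of the high half
  */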

#include <linux/linkage.h>
#include <asm/cache.h>

	.text
	.fpu		neon
	.align		5

/*
 * chacha_permute - permute one block
 *
 * Permute one 64-byte block where the state matrix is stored in the four NEON
 * registers q0-q3.  It performs matrix operations on four words in parallel,
 * but requires shuffling to rearrange the words after each round.
 *
 * The round count is given in r3.
 *
 * Clobbers: r3, ip, q4-q5
 */
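/*
 * For reference, each double round below is the standard ChaCha quarter-round
 * (RFC 7539) applied first down the columns and then down the diagonals of
 * the 4x4 state; per 32-bit word:
 *
 *	a += b; d ^= a; d = rotl32(d, 16);
 *	c += d; b ^= c; b = rotl32(b, 12);
 *	a += b; d ^= a; d = rotl32(d, 8);
 *	c += d; b ^= c; b = rotl32(b, 7);
 *
 * Here q0..q3 hold rows a..d (four words each), so one quarter-round covers
 * all four columns at once, and the vext.8 shuffles rotate rows b, c and d
 * into (and back out of) diagonal position.
 */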
chacha_permute:

	adr		ip, .Lrol8_table
	vld1.8		{d10}, [ip, :64]

.Ldoubleround:
	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
	vext.8		q1, q1, q1, #4
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
	vext.8		q3, q3, q3, #12

	// x0 += x1, x3 = rotl32(x3 ^ x0, 16)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vrev32.16	q3, q3

	// x2 += x3, x1 = rotl32(x1 ^ x2, 12)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #12
	vsri.u32	q1, q4, #20

	// x0 += x1, x3 = rotl32(x3 ^ x0, 8)
	vadd.i32	q0, q0, q1
	veor		q3, q3, q0
	vtbl.8		d6, {d6}, d10
	vtbl.8		d7, {d7}, d10

	// x2 += x3, x1 = rotl32(x1 ^ x2, 7)
	vadd.i32	q2, q2, q3
	veor		q4, q1, q2
	vshl.u32	q1, q4, #7
	vsri.u32	q1, q4, #25

	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
	vext.8		q1, q1, q1, #12
	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
	vext.8		q2, q2, q2, #8
	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
	vext.8		q3, q3, q3, #4

	subs		r3, r3, #2
	bne		.Ldoubleround

	bx		lr
ENDPROC(chacha_permute)

ENTRY(chacha_block_xor_neon)
	// r0: Input state matrix, s
	// r1: 1 data block output, o
	// r2: 1 data block input, i
	// r3: nrounds
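	//
	// Equivalent C prototype, as seen from the glue code (roughly):
	//
	//	void chacha_block_xor_neon(const u32 *state, u8 *dst,
	//				   const u8 *src, int nrounds);
	//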
	push		{lr}

	// x0..3 = s0..3
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	vmov		q8, q0
	vmov		q9, q1
	vmov		q10, q2
	vmov		q11, q3

	bl		chacha_permute

	add		ip, r2, #0x20
	vld1.8		{q4-q5}, [r2]
	vld1.8		{q6-q7}, [ip]

	// o0 = i0 ^ (x0 + s0)
	vadd.i32	q0, q0, q8
	veor		q0, q0, q4

	// o1 = i1 ^ (x1 + s1)
	vadd.i32	q1, q1, q9
	veor		q1, q1, q5

	// o2 = i2 ^ (x2 + s2)
	vadd.i32	q2, q2, q10
	veor		q2, q2, q6

	// o3 = i3 ^ (x3 + s3)
	vadd.i32	q3, q3, q11
	veor		q3, q3, q7

	add		ip, r1, #0x20
	vst1.8		{q0-q1}, [r1]
	vst1.8		{q2-q3}, [ip]

	pop		{pc}
ENDPROC(chacha_block_xor_neon)

ENTRY(hchacha_block_neon)
	// r0: Input state matrix, s
	// r1: output (8 32-bit words)
	// r2: nrounds
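	//
	// Equivalent C prototype (roughly):
	//
	//	void hchacha_block_neon(const u32 *state, u32 *out, int nrounds);
	//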
	push		{lr}

	vld1.32		{q0-q1}, [r0]!
	vld1.32		{q2-q3}, [r0]

	mov		r3, r2
	bl		chacha_permute

	vst1.32		{q0}, [r1]!
	vst1.32		{q3}, [r1]

	pop		{pc}
ENDPROC(hchacha_block_neon)

	.align		4
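// Counter offsets 0-3: block n of a 4-block batch uses (block counter + n).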
.Lctrinc:	.word	0, 1, 2, 3
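// vtbl.8 byte indices that rotate each 32-bit word left by 8 bits.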
.Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6

	.align		5
ENTRY(chacha_4block_xor_neon)
	push		{r4, lr}
	mov		r4, sp			// preserve the stack pointer
	sub		ip, sp, #0x20		// allocate a 32 byte buffer
	bic		ip, ip, #0x1f		// aligned to 32 bytes
	mov		sp, ip

	// r0: Input state matrix, s
	// r1: 4 data blocks output, o
	// r2: 4 data blocks input, i
	// r3: nrounds
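	//
	// A fifth argument, the number of bytes to write to the output, is
	// passed on the stack and reloaded into r4 further down.  The
	// equivalent C prototype is roughly:
	//
	//	void chacha_4block_xor_neon(const u32 *state, u8 *dst,
	//				    const u8 *src, int nrounds,
	//				    unsigned int nbytes);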

	//
	// This function encrypts four consecutive ChaCha blocks by loading
	// the state matrix into NEON registers four times.  The algorithm
	// performs each operation on the corresponding word of each state
	// matrix, and hence requires no word shuffling.  The words are
	// re-interleaved before the final addition of the original state and
	// the XORing step.
	//

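	//
	// During the rounds, word n of the per-block state is kept in qn:
	// q0-q7 hold x0..x7 and q10-q15 hold x10..x15, while x8..x9 are
	// spilled to the 32-byte stack buffer whenever q8-q9 are needed as
	// scratch (NEON has only 16 q registers).
	//
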
	// x0..15[0-3] = s0..15[0-3]
	add		ip, r0, #0x20
	vld1.32		{q0-q1}, [r0]
	vld1.32		{q2-q3}, [ip]

	adr		lr, .Lctrinc
	vdup.32		q15, d7[1]
	vdup.32		q14, d7[0]
	vld1.32		{q4}, [lr, :128]
	vdup.32		q13, d6[1]
	vdup.32		q12, d6[0]
	vdup.32		q11, d5[1]
	vdup.32		q10, d5[0]
	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
	vdup.32		q9, d4[1]
	vdup.32		q8, d4[0]
	vdup.32		q7, d3[1]
	vdup.32		q6, d3[0]
	vdup.32		q5, d2[1]
	vdup.32		q4, d2[0]
	vdup.32		q3, d1[1]
	vdup.32		q2, d1[0]
	vdup.32		q1, d0[1]
	vdup.32		q0, d0[0]

	adr		ip, .Lrol8_table
	b		1f

.Ldoubleround4:
	vld1.32		{q8-q9}, [sp, :256]
1:
	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 16)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 16)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 16)
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14
	vrev32.16	q15, q15

	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 12)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 12)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 12)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #12
	vshl.u32	q5, q9, #12
	vsri.u32	q4, q8, #20
	vsri.u32	q5, q9, #20

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #12
	vshl.u32	q7, q9, #12
	vsri.u32	q6, q8, #20
	vsri.u32	q7, q9, #20

	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
	// x1 += x5, x13 = rotl32(x13 ^ x1, 8)
	// x2 += x6, x14 = rotl32(x14 ^ x2, 8)
	// x3 += x7, x15 = rotl32(x15 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q4
	vadd.i32	q1, q1, q5
	vadd.i32	q2, q2, q6
	vadd.i32	q3, q3, q7

	veor		q12, q12, q0
	veor		q13, q13, q1
	veor		q14, q14, q2
	veor		q15, q15, q3

	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16
	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
	// x9 += x13, x5 = rotl32(x5 ^ x9, 7)
	// x10 += x14, x6 = rotl32(x6 ^ x10, 7)
	// x11 += x15, x7 = rotl32(x7 ^ x11, 7)
	vadd.i32	q8, q8, q12
	vadd.i32	q9, q9, q13
	vadd.i32	q10, q10, q14
	vadd.i32	q11, q11, q15

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q4, q8
	veor		q9, q5, q9
	vshl.u32	q4, q8, #7
	vshl.u32	q5, q9, #7
	vsri.u32	q4, q8, #25
	vsri.u32	q5, q9, #25

	veor		q8, q6, q10
	veor		q9, q7, q11
	vshl.u32	q6, q8, #7
	vshl.u32	q7, q9, #7
	vsri.u32	q6, q8, #25
	vsri.u32	q7, q9, #25

	vld1.32		{q8-q9}, [sp, :256]

	// x0 += x5, x15 = rotl32(x15 ^ x0, 16)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 16)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 16)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vrev32.16	q15, q15
	vrev32.16	q12, q12
	vrev32.16	q13, q13
	vrev32.16	q14, q14

	// x10 += x15, x5 = rotl32(x5 ^ x10, 12)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 12)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 12)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #12
	vshl.u32	q4, q9, #12
	vsri.u32	q7, q8, #20
	vsri.u32	q4, q9, #20

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #12
	vshl.u32	q6, q9, #12
	vsri.u32	q5, q8, #20
	vsri.u32	q6, q9, #20

	// x0 += x5, x15 = rotl32(x15 ^ x0, 8)
	// x1 += x6, x12 = rotl32(x12 ^ x1, 8)
	// x2 += x7, x13 = rotl32(x13 ^ x2, 8)
	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
	vld1.8		{d16}, [ip, :64]
	vadd.i32	q0, q0, q5
	vadd.i32	q1, q1, q6
	vadd.i32	q2, q2, q7
	vadd.i32	q3, q3, q4

	veor		q15, q15, q0
	veor		q12, q12, q1
	veor		q13, q13, q2
	veor		q14, q14, q3

	vtbl.8		d30, {d30}, d16
	vtbl.8		d31, {d31}, d16
	vtbl.8		d24, {d24}, d16
	vtbl.8		d25, {d25}, d16
	vtbl.8		d26, {d26}, d16
	vtbl.8		d27, {d27}, d16
	vtbl.8		d28, {d28}, d16
	vtbl.8		d29, {d29}, d16

	vld1.32		{q8-q9}, [sp, :256]

	// x10 += x15, x5 = rotl32(x5 ^ x10, 7)
	// x11 += x12, x6 = rotl32(x6 ^ x11, 7)
	// x8 += x13, x7 = rotl32(x7 ^ x8, 7)
	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
	vadd.i32	q10, q10, q15
	vadd.i32	q11, q11, q12
	vadd.i32	q8, q8, q13
	vadd.i32	q9, q9, q14

	vst1.32		{q8-q9}, [sp, :256]

	veor		q8, q7, q8
	veor		q9, q4, q9
	vshl.u32	q7, q8, #7
	vshl.u32	q4, q9, #7
	vsri.u32	q7, q8, #25
	vsri.u32	q4, q9, #25

	veor		q8, q5, q10
	veor		q9, q6, q11
	vshl.u32	q5, q8, #7
	vshl.u32	q6, q9, #7
	vsri.u32	q5, q8, #25
	vsri.u32	q6, q9, #25

	subs		r3, r3, #2
	bne		.Ldoubleround4

	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
	// x8..9[0-3] are on the stack.

	// Re-interleave the words in the first two rows of each block (x0..7).
	// Also add the counter values 0-3 to x12[0-3].
	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
	  vadd.u32	q12, q8			// x12 += counter values 0-3
	vswp		d1, d4
	vswp		d3, d6
	  vld1.32	{q8-q9}, [r0]!		// load s0..7
	vswp		d9, d12
	vswp		d11, d14

	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
	// after XORing the first 32 bytes.
	vswp		q1, q4

	// First two rows of each block are (q0 q1) (q2 q6) (q4 q5) (q3 q7)

	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
	vadd.u32	q0, q0, q8
	vadd.u32	q2, q2, q8
	vadd.u32	q4, q4, q8
	vadd.u32	q3, q3, q8

	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
	vadd.u32	q1, q1, q9
	vadd.u32	q6, q6, q9
	vadd.u32	q5, q5, q9
	vadd.u32	q7, q7, q9

	// XOR first 32 bytes using keystream from first two rows of first block
	vld1.8		{q8-q9}, [r2]!
	veor		q8, q8, q0
	veor		q9, q9, q1
	vst1.8		{q8-q9}, [r1]!

	// Re-interleave the words in the last two rows of each block (x8..15).
	vld1.32		{q8-q9}, [sp, :256]
	  mov		sp, r4		// restore original stack pointer
	  ldr		r4, [r4, #8]	// load number of bytes
	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
	  vld1.32	{q0-q1}, [r0]	// load s8..15
	vswp		d25, d28
	vswp		d27, d30
	vswp		d17, d20
	vswp		d19, d22

	// Last two rows of each block are (q8 q12) (q10 q14) (q9 q13) (q11 q15)

	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
	vadd.u32	q8,  q8,  q0
	vadd.u32	q10, q10, q0
	vadd.u32	q9,  q9,  q0
	vadd.u32	q11, q11, q0

	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
	vadd.u32	q12, q12, q1
	vadd.u32	q14, q14, q1
	vadd.u32	q13, q13, q1
	vadd.u32	q15, q15, q1

	// XOR the rest of the data with the keystream

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #96
	veor		q0, q0, q8
	veor		q1, q1, q12
	ble		.Lle96
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q2
	veor		q1, q1, q6
	ble		.Lle128
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q10
	veor		q1, q1, q14
	ble		.Lle160
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q4
	veor		q1, q1, q5
	ble		.Lle192
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q9
	veor		q1, q1, q13
	ble		.Lle224
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]!
	subs		r4, r4, #32
	veor		q0, q0, q3
	veor		q1, q1, q7
	blt		.Llt256
.Lout:
	vst1.8		{q0-q1}, [r1]!

	vld1.8		{q0-q1}, [r2]
	veor		q0, q0, q11
	veor		q1, q1, q15
	vst1.8		{q0-q1}, [r1]

	pop		{r4, pc}

.Lle192:
	vmov		q4, q9
	vmov		q5, q13

.Lle160:
	// nothing to do

.Lfinalblock:
	// Process the final block if processing less than 4 full blocks.
	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
	// previous 32-byte output block that still needs to be written at
	// [r1] in q0-q1.
	beq		.Lfullblock

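	// Handle a trailing partial chunk: r4 is the (negative) number of
	// bytes by which the data falls short of a full 32-byte chunk.  A
	// window into the double-length .Lpermute table rotates the 32 bytes
	// of keystream in q4-q5 so that they line up with the last 32 bytes
	// of the input, which are XORed and written with an overlapping store
	// that ends exactly at the end of the output buffer.  Bytes clobbered
	// by the overlap are rewritten by the final store of q0-q1 at [r1].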
.Lpartialblock:
	adr		lr, .Lpermute + 32
	add		r2, r2, r4
	add		lr, lr, r4
	add		r4, r4, r1

	vld1.8		{q2-q3}, [lr]
	vld1.8		{q6-q7}, [r2]

	add		r4, r4, #32

	vtbl.8		d4, {q4-q5}, d4
	vtbl.8		d5, {q4-q5}, d5
	vtbl.8		d6, {q4-q5}, d6
	vtbl.8		d7, {q4-q5}, d7

	veor		q6, q6, q2
	veor		q7, q7, q3

	vst1.8		{q6-q7}, [r4]	// overlapping stores
	vst1.8		{q0-q1}, [r1]
	pop		{r4, pc}

.Lfullblock:
	vmov		q11, q4
	vmov		q15, q5
	b		.Lout
.Lle96:
	vmov		q4, q2
	vmov		q5, q6
	b		.Lfinalblock
.Lle128:
	vmov		q4, q10
	vmov		q5, q14
	b		.Lfinalblock
.Lle224:
	vmov		q4, q3
	vmov		q5, q7
	b		.Lfinalblock
.Llt256:
	vmov		q4, q11
	vmov		q5, q15
	b		.Lpartialblock
ENDPROC(chacha_4block_xor_neon)

	.align		L1_CACHE_SHIFT
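// Byte indices 0-31 repeated twice: .Lpartialblock reads a 32-byte window of
// this table and uses it with vtbl.8 to rotate the final 32 bytes of
// keystream into place.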
.Lpermute:
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f