chacha-neon-core.S - OpenGrok cross reference for /linux/lib/crypto/arm/chacha-neon-core.S

Lines Matching +full:0 +full:- +full:32
11  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
26   * (c)  vrev32.16			(16-bit rotations only)
30   * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
31   * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
32   * cycles of (b) on both Cortex-A7 and Cortex-A53.
34   * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
37   * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
42   * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
46   * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
57  * chacha_permute - permute one block
59  * Permute one 64-byte block where the state matrix is stored in the four NEON
60  * registers q0-q3.  It performs matrix operations on four words in parallel,
65  * Clobbers: r3, ip, q4-q5
96 	// x1 = shuffle32(x1, MASK(0, 3, 2, 1))
98 	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
100 	// x3 = shuffle32(x3, MASK(2, 1, 0, 3))
126 	// x1 = shuffle32(x1, MASK(2, 1, 0, 3))
128 	// x2 = shuffle32(x2, MASK(1, 0, 3, 2))
130 	// x3 = shuffle32(x3, MASK(0, 3, 2, 1))
147 	add		ip, r0, #0x20
148 	vld1.32		{q0-q1}, [r0]
149 	vld1.32		{q2-q3}, [ip]
158 	add		ip, r2, #0x20
159 	vld1.8		{q4-q5}, [r2]
160 	vld1.8		{q6-q7}, [ip]
178 	add		ip, r1, #0x20
179 	vst1.8		{q0-q1}, [r1]
180 	vst1.8		{q2-q3}, [ip]
187 	// r1: output (8 32-bit words)
191 	vld1.32		{q0-q1}, [r0]!
192 	vld1.32		{q2-q3}, [r0]
197 	vst1.32		{q0}, [r1]!
198 	vst1.32		{q3}, [r1]
204 .Lctrinc:	.word	0, 1, 2, 3
205 .Lrol8_table:	.byte	3, 0, 1, 2, 7, 4, 5, 6
211 	sub		ip, sp, #0x20		// allocate a 32 byte buffer
212 	bic		ip, ip, #0x1f		// aligned to 32 bytes
224 	// requires no word shuffling. The words are re-interleaved before the
228 	// x0..15[0-3] = s0..15[0-3]
229 	add		ip, r0, #0x20
230 	vld1.32		{q0-q1}, [r0]
231 	vld1.32		{q2-q3}, [ip]
234 	vdup.32		q15, d7[1]
235 	vdup.32		q14, d7[0]
236 	vld1.32		{q4}, [lr, :128]
237 	vdup.32		q13, d6[1]
238 	vdup.32		q12, d6[0]
239 	vdup.32		q11, d5[1]
240 	vdup.32		q10, d5[0]
241 	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
242 	vdup.32		q9, d4[1]
243 	vdup.32		q8, d4[0]
244 	vdup.32		q7, d3[1]
245 	vdup.32		q6, d3[0]
246 	vdup.32		q5, d2[1]
247 	vdup.32		q4, d2[0]
248 	vdup.32		q3, d1[1]
249 	vdup.32		q2, d1[0]
250 	vdup.32		q1, d0[1]
251 	vdup.32		q0, d0[0]
257 	vld1.32		{q8-q9}, [sp, :256]
287 	vst1.32		{q8-q9}, [sp, :256]
327 	vld1.32		{q8-q9}, [sp, :256]
338 	vst1.32		{q8-q9}, [sp, :256]
354 	vld1.32		{q8-q9}, [sp, :256]
384 	vst1.32		{q8-q9}, [sp, :256]
424 	vld1.32		{q8-q9}, [sp, :256]
435 	vst1.32		{q8-q9}, [sp, :256]
454 	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
455 	// x8..9[0-3] are on the stack.
457 	// Re-interleave the words in the first two rows of each block (x0..7).
458 	// Also add the counter values 0-3 to x12[0-3].
459 	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
460 	vzip.32		q0, q1			// => (0 1 0 1) (0 1 0 1)
461 	vzip.32		q2, q3			// => (2 3 2 3) (2 3 2 3)
462 	vzip.32		q4, q5			// => (4 5 4 5) (4 5 4 5)
463 	vzip.32		q6, q7			// => (6 7 6 7) (6 7 6 7)
464 	  vadd.u32	q12, q8			// x12 += counter values 0-3
467 	  vld1.32	{q8-q9}, [r0]!		// load s0..7
471 	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
472 	// after XORing the first 32 bytes.
477 	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
483 	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
489 	// XOR first 32 bytes using keystream from first two rows of first block
490 	vld1.8		{q8-q9}, [r2]!
493 	vst1.8		{q8-q9}, [r1]!
495 	// Re-interleave the words in the last two rows of each block (x8..15).
496 	vld1.32		{q8-q9}, [sp, :256]
499 	vzip.32		q12, q13	// => (12 13 12 13) (12 13 12 13)
500 	vzip.32		q14, q15	// => (14 15 14 15) (14 15 14 15)
501 	vzip.32		q8, q9		// => (8 9 8 9) (8 9 8 9)
502 	vzip.32		q10, q11	// => (10 11 10 11) (10 11 10 11)
503 	  vld1.32	{q0-q1}, [r0]	// load s8..15
511 	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
517 	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
525 	vld1.8		{q0-q1}, [r2]!
530 	vst1.8		{q0-q1}, [r1]!
532 	vld1.8		{q0-q1}, [r2]!
533 	subs		r4, r4, #32
537 	vst1.8		{q0-q1}, [r1]!
539 	vld1.8		{q0-q1}, [r2]!
540 	subs		r4, r4, #32
544 	vst1.8		{q0-q1}, [r1]!
546 	vld1.8		{q0-q1}, [r2]!
547 	subs		r4, r4, #32
551 	vst1.8		{q0-q1}, [r1]!
553 	vld1.8		{q0-q1}, [r2]!
554 	subs		r4, r4, #32
558 	vst1.8		{q0-q1}, [r1]!
560 	vld1.8		{q0-q1}, [r2]!
561 	subs		r4, r4, #32
566 	vst1.8		{q0-q1}, [r1]!
568 	vld1.8		{q0-q1}, [r2]
571 	vst1.8		{q0-q1}, [r1]
584 	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
585 	// previous 32 byte output block that still needs to be written at
586 	// [r1] in q0-q1.
590 	adr		lr, .Lpermute + 32
595 	vld1.8		{q2-q3}, [lr]
596 	vld1.8		{q6-q7}, [r2]
598 	add		r4, r4, #32
600 	vtbl.8		d4, {q4-q5}, d4
601 	vtbl.8		d5, {q4-q5}, d5
602 	vtbl.8		d6, {q4-q5}, d6
603 	vtbl.8		d7, {q4-q5}, d7
608 	vst1.8		{q6-q7}, [r4]	// overlapping stores
609 	vst1.8		{q0-q1}, [r1]
636 	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
637 	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
638 	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
639 	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
640 	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
641 	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
642 	.byte		0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
643 	.byte		0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f