chacha-neon-core.S - OpenGrok cross reference for /linux/lib/crypto/arm/chacha-neon-core.S

Lines Matching +full:cortex +full:- +full:x4
11  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
26   * (c)  vrev32.16			(16-bit rotations only)
30   * ChaCha has 16, 12, 8, and 7-bit rotations.  For the 12 and 7-bit rotations,
31   * the only choices are (a) and (b).  We use (a) since it takes two-thirds the
32   * cycles of (b) on both Cortex-A7 and Cortex-A53.
34   * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
37   * For the 8-bit rotation, we use vtbl.8 + vtbl.8.  On Cortex-A7, this sequence
42   * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
46   * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
57  * chacha_permute - permute one block
59  * Permute one 64-byte block where the state matrix is stored in the four NEON
60  * registers q0-q3.  It performs matrix operations on four words in parallel,
65  * Clobbers: r3, ip, q4-q5
148 	vld1.32		{q0-q1}, [r0]
149 	vld1.32		{q2-q3}, [ip]
159 	vld1.8		{q4-q5}, [r2]
160 	vld1.8		{q6-q7}, [ip]
179 	vst1.8		{q0-q1}, [r1]
180 	vst1.8		{q2-q3}, [ip]
187 	// r1: output (8 32-bit words)
191 	vld1.32		{q0-q1}, [r0]!
192 	vld1.32		{q2-q3}, [r0]
224 	// requires no word shuffling. The words are re-interleaved before the
228 	// x0..15[0-3] = s0..15[0-3]
230 	vld1.32		{q0-q1}, [r0]
231 	vld1.32		{q2-q3}, [ip]
241 	vadd.u32	q12, q12, q4		// x12 += counter values 0-3
257 	vld1.32		{q8-q9}, [sp, :256]
259 	// x0 += x4, x12 = rotl32(x12 ^ x0, 16)
278 	// x8 += x12, x4 = rotl32(x4 ^ x8, 12)
287 	vst1.32		{q8-q9}, [sp, :256]
303 	// x0 += x4, x12 = rotl32(x12 ^ x0, 8)
327 	vld1.32		{q8-q9}, [sp, :256]
329 	// x8 += x12, x4 = rotl32(x4 ^ x8, 7)
338 	vst1.32		{q8-q9}, [sp, :256]
354 	vld1.32		{q8-q9}, [sp, :256]
359 	// x3 += x4, x14 = rotl32(x14 ^ x3, 16)
378 	// x9 += x14, x4 = rotl32(x4 ^ x9, 12)
384 	vst1.32		{q8-q9}, [sp, :256]
403 	// x3 += x4, x14 = rotl32(x14 ^ x3, 8)
424 	vld1.32		{q8-q9}, [sp, :256]
429 	// x9 += x14, x4 = rotl32(x4 ^ x9, 7)
435 	vst1.32		{q8-q9}, [sp, :256]
454 	// x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
455 	// x8..9[0-3] are on the stack.
457 	// Re-interleave the words in the first two rows of each block (x0..7).
458 	// Also add the counter values 0-3 to x12[0-3].
459 	  vld1.32	{q8}, [lr, :128]	// load counter values 0-3
464 	  vadd.u32	q12, q8			// x12 += counter values 0-3
467 	  vld1.32	{q8-q9}, [r0]!		// load s0..7
471 	// Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
477 	// x0..3[0-3] += s0..3[0-3]	(add orig state to 1st row of each block)
483 	// x4..7[0-3] += s4..7[0-3]	(add orig state to 2nd row of each block)
490 	vld1.8		{q8-q9}, [r2]!
493 	vst1.8		{q8-q9}, [r1]!
495 	// Re-interleave the words in the last two rows of each block (x8..15).
496 	vld1.32		{q8-q9}, [sp, :256]
503 	  vld1.32	{q0-q1}, [r0]	// load s8..15
511 	// x8..11[0-3] += s8..11[0-3]	(add orig state to 3rd row of each block)
517 	// x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
525 	vld1.8		{q0-q1}, [r2]!
530 	vst1.8		{q0-q1}, [r1]!
532 	vld1.8		{q0-q1}, [r2]!
537 	vst1.8		{q0-q1}, [r1]!
539 	vld1.8		{q0-q1}, [r2]!
544 	vst1.8		{q0-q1}, [r1]!
546 	vld1.8		{q0-q1}, [r2]!
551 	vst1.8		{q0-q1}, [r1]!
553 	vld1.8		{q0-q1}, [r2]!
558 	vst1.8		{q0-q1}, [r1]!
560 	vld1.8		{q0-q1}, [r2]!
566 	vst1.8		{q0-q1}, [r1]!
568 	vld1.8		{q0-q1}, [r2]
571 	vst1.8		{q0-q1}, [r1]
584 	// Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
586 	// [r1] in q0-q1.
595 	vld1.8		{q2-q3}, [lr]
596 	vld1.8		{q6-q7}, [r2]
600 	vtbl.8		d4, {q4-q5}, d4
601 	vtbl.8		d5, {q4-q5}, d5
602 	vtbl.8		d6, {q4-q5}, d6
603 	vtbl.8		d7, {q4-q5}, d7
608 	vst1.8		{q6-q7}, [r4]	// overlapping stores
609 	vst1.8		{q0-q1}, [r1]