Lines Matching +full:0 +full:- +full:32

11  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSE3 functions
26 * (c) vrev32.16 (16-bit rotations only)
30 * ChaCha has 16, 12, 8, and 7-bit rotations. For the 12 and 7-bit rotations,
31 * the only choices are (a) and (b). We use (a) since it takes two-thirds the
32 * cycles of (b) on both Cortex-A7 and Cortex-A53.
34 * For the 16-bit rotation, we use vrev32.16 since it's consistently fastest
37 * For the 8-bit rotation, we use vtbl.8 + vtbl.8. On Cortex-A7, this sequence
42 * A disadvantage is that on Cortex-A53, the vtbl sequence is the same speed as
46 * CPUs, e.g. ~4.8% faster ChaCha20 on Cortex-A7.
57 * chacha_permute - permute one block
59 * Permute one 64-byte block where the state matrix is stored in the four NEON
60 * registers q0-q3. It performs matrix operations on four words in parallel,
65 * Clobbers: r3, ip, q4-q5
96 // x1 = shuffle32(x1, MASK(0, 3, 2, 1))
98 // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
100 // x3 = shuffle32(x3, MASK(2, 1, 0, 3))
126 // x1 = shuffle32(x1, MASK(2, 1, 0, 3))
128 // x2 = shuffle32(x2, MASK(1, 0, 3, 2))
130 // x3 = shuffle32(x3, MASK(0, 3, 2, 1))
147 add ip, r0, #0x20
148 vld1.32 {q0-q1}, [r0]
149 vld1.32 {q2-q3}, [ip]
158 add ip, r2, #0x20
159 vld1.8 {q4-q5}, [r2]
160 vld1.8 {q6-q7}, [ip]
178 add ip, r1, #0x20
179 vst1.8 {q0-q1}, [r1]
180 vst1.8 {q2-q3}, [ip]
187 // r1: output (8 32-bit words)
191 vld1.32 {q0-q1}, [r0]!
192 vld1.32 {q2-q3}, [r0]
197 vst1.32 {q0}, [r1]!
198 vst1.32 {q3}, [r1]
204 .Lctrinc: .word 0, 1, 2, 3
205 .Lrol8_table: .byte 3, 0, 1, 2, 7, 4, 5, 6
211 sub ip, sp, #0x20 // allocate a 32 byte buffer
212 bic ip, ip, #0x1f // aligned to 32 bytes
224 // requires no word shuffling. The words are re-interleaved before the
228 // x0..15[0-3] = s0..15[0-3]
229 add ip, r0, #0x20
230 vld1.32 {q0-q1}, [r0]
231 vld1.32 {q2-q3}, [ip]
234 vdup.32 q15, d7[1]
235 vdup.32 q14, d7[0]
236 vld1.32 {q4}, [lr, :128]
237 vdup.32 q13, d6[1]
238 vdup.32 q12, d6[0]
239 vdup.32 q11, d5[1]
240 vdup.32 q10, d5[0]
241 vadd.u32 q12, q12, q4 // x12 += counter values 0-3
242 vdup.32 q9, d4[1]
243 vdup.32 q8, d4[0]
244 vdup.32 q7, d3[1]
245 vdup.32 q6, d3[0]
246 vdup.32 q5, d2[1]
247 vdup.32 q4, d2[0]
248 vdup.32 q3, d1[1]
249 vdup.32 q2, d1[0]
250 vdup.32 q1, d0[1]
251 vdup.32 q0, d0[0]
257 vld1.32 {q8-q9}, [sp, :256]
287 vst1.32 {q8-q9}, [sp, :256]
327 vld1.32 {q8-q9}, [sp, :256]
338 vst1.32 {q8-q9}, [sp, :256]
354 vld1.32 {q8-q9}, [sp, :256]
384 vst1.32 {q8-q9}, [sp, :256]
424 vld1.32 {q8-q9}, [sp, :256]
435 vst1.32 {q8-q9}, [sp, :256]
454 // x0..7[0-3] are in q0-q7, x10..15[0-3] are in q10-q15.
455 // x8..9[0-3] are on the stack.
457 // Re-interleave the words in the first two rows of each block (x0..7).
458 // Also add the counter values 0-3 to x12[0-3].
459 vld1.32 {q8}, [lr, :128] // load counter values 0-3
460 vzip.32 q0, q1 // => (0 1 0 1) (0 1 0 1)
461 vzip.32 q2, q3 // => (2 3 2 3) (2 3 2 3)
462 vzip.32 q4, q5 // => (4 5 4 5) (4 5 4 5)
463 vzip.32 q6, q7 // => (6 7 6 7) (6 7 6 7)
464 vadd.u32 q12, q8 // x12 += counter values 0-3
467 vld1.32 {q8-q9}, [r0]! // load s0..7
471 // Swap q1 and q4 so that we'll free up consecutive registers (q0-q1)
472 // after XORing the first 32 bytes.
477 // x0..3[0-3] += s0..3[0-3] (add orig state to 1st row of each block)
483 // x4..7[0-3] += s4..7[0-3] (add orig state to 2nd row of each block)
489 // XOR first 32 bytes using keystream from first two rows of first block
490 vld1.8 {q8-q9}, [r2]!
493 vst1.8 {q8-q9}, [r1]!
495 // Re-interleave the words in the last two rows of each block (x8..15).
496 vld1.32 {q8-q9}, [sp, :256]
499 vzip.32 q12, q13 // => (12 13 12 13) (12 13 12 13)
500 vzip.32 q14, q15 // => (14 15 14 15) (14 15 14 15)
501 vzip.32 q8, q9 // => (8 9 8 9) (8 9 8 9)
502 vzip.32 q10, q11 // => (10 11 10 11) (10 11 10 11)
503 vld1.32 {q0-q1}, [r0] // load s8..15
511 // x8..11[0-3] += s8..11[0-3] (add orig state to 3rd row of each block)
517 // x12..15[0-3] += s12..15[0-3] (add orig state to 4th row of each block)
525 vld1.8 {q0-q1}, [r2]!
530 vst1.8 {q0-q1}, [r1]!
532 vld1.8 {q0-q1}, [r2]!
533 subs r4, r4, #32
537 vst1.8 {q0-q1}, [r1]!
539 vld1.8 {q0-q1}, [r2]!
540 subs r4, r4, #32
544 vst1.8 {q0-q1}, [r1]!
546 vld1.8 {q0-q1}, [r2]!
547 subs r4, r4, #32
551 vst1.8 {q0-q1}, [r1]!
553 vld1.8 {q0-q1}, [r2]!
554 subs r4, r4, #32
558 vst1.8 {q0-q1}, [r1]!
560 vld1.8 {q0-q1}, [r2]!
561 subs r4, r4, #32
566 vst1.8 {q0-q1}, [r1]!
568 vld1.8 {q0-q1}, [r2]
571 vst1.8 {q0-q1}, [r1]
584 // Entered with 32 bytes of ChaCha cipher stream in q4-q5, and the
585 // previous 32 byte output block that still needs to be written at
586 // [r1] in q0-q1.
590 adr lr, .Lpermute + 32
595 vld1.8 {q2-q3}, [lr]
596 vld1.8 {q6-q7}, [r2]
598 add r4, r4, #32
600 vtbl.8 d4, {q4-q5}, d4
601 vtbl.8 d5, {q4-q5}, d5
602 vtbl.8 d6, {q4-q5}, d6
603 vtbl.8 d7, {q4-q5}, d7
608 vst1.8 {q6-q7}, [r4] // overlapping stores
609 vst1.8 {q0-q1}, [r1]
636 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
637 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
638 .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
639 .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f
640 .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
641 .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
642 .byte 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17
643 .byte 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f