Lines Matching +full:xor +full:- +full:v2
4 * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
29 * chacha_permute - permute one block
31 * Permute one 64-byte block where the state matrix is stored in the four NEON
32 * registers v0-v3. It performs matrix operations on four words in parallel,
51 add v2.4s, v2.4s, v3.4s
52 eor v4.16b, v1.16b, v2.16b
62 add v2.4s, v2.4s, v3.4s
63 eor v4.16b, v1.16b, v2.16b
70 ext v2.16b, v2.16b, v2.16b, #8
80 add v2.4s, v2.4s, v3.4s
81 eor v4.16b, v1.16b, v2.16b
91 add v2.4s, v2.4s, v3.4s
92 eor v4.16b, v1.16b, v2.16b
99 ext v2.16b, v2.16b, v2.16b, #8
115 stp x29, x30, [sp, #-16]!
119 ld1 {v0.4s-v3.4s}, [x0]
120 ld1 {v8.4s-v11.4s}, [x0]
124 ld1 {v4.16b-v7.16b}, [x2]
135 add v2.4s, v2.4s, v10.4s
136 eor v2.16b, v2.16b, v6.16b
142 st1 {v0.16b-v3.16b}, [x1]
150 // x1: output (8 32-bit words)
153 stp x29, x30, [sp, #-16]!
156 ld1 {v0.4s-v3.4s}, [x0]
204 // matrix by interleaving 32- and then 64-bit words, which allows us to
205 // do XOR in NEON registers.
211 ld1 {v30.4s-v31.4s}, [x9]
213 // x0..15[0-3] = s0..3[0..3]
215 ld4r { v0.4s- v3.4s}, [x0]
216 ld4r { v4.4s- v7.4s}, [x8], #16
217 ld4r { v8.4s-v11.4s}, [x8], #16
218 ld4r {v12.4s-v15.4s}, [x8]
222 mov a2, v2.s[0]
237 // x12 += counter values 1-4
249 add v2.4s, v2.4s, v6.4s
258 eor v14.16b, v14.16b, v2.16b
316 add v2.4s, v2.4s, v6.4s
325 eor v14.16b, v14.16b, v2.16b
383 add v2.4s, v2.4s, v7.4s
392 eor v13.16b, v13.16b, v2.16b
450 add v2.4s, v2.4s, v7.4s
459 eor v13.16b, v13.16b, v2.16b
512 ld4r {v16.4s-v19.4s}, [x0], #16
513 ld4r {v20.4s-v23.4s}, [x0], #16
515 // x12 += counter values 0-3
518 // x0[0-3] += s0[0]
519 // x1[0-3] += s0[1]
520 // x2[0-3] += s0[2]
521 // x3[0-3] += s0[3]
528 add v2.4s, v2.4s, v18.4s
539 ld4r {v24.4s-v27.4s}, [x0], #16
540 ld4r {v28.4s-v31.4s}, [x0]
542 // x4[0-3] += s1[0]
543 // x5[0-3] += s1[1]
544 // x6[0-3] += s1[2]
545 // x7[0-3] += s1[3]
563 // x8[0-3] += s2[0]
564 // x9[0-3] += s2[1]
565 // x10[0-3] += s2[2]
566 // x11[0-3] += s2[3]
584 // x12[0-3] += s3[0]
585 // x13[0-3] += s3[1]
586 // x14[0-3] += s3[2]
587 // x15[0-3] += s3[3]
605 // interleave 32-bit words in state n, n+1
608 ldp w8, w9, [x2, #-56]
612 zip1 v18.4s, v2.4s, v3.4s
614 zip2 v19.4s, v2.4s, v3.4s
616 ldp w6, w7, [x2, #-48]
618 ldp w8, w9, [x2, #-40]
626 ldp w6, w7, [x2, #-32]
628 ldp w8, w9, [x2, #-24]
636 ldp w6, w7, [x2, #-16]
638 ldp w8, w9, [x2, #-8]
653 // interleave 64-bit words in state n, n+2
659 stp a2, a3, [x1, #-56]
662 ld1 {v16.16b-v19.16b}, [x2], #64
667 stp a4, a5, [x1, #-48]
670 stp a6, a7, [x1, #-40]
673 ld1 {v20.16b-v23.16b}, [x2], #64
676 zip1 v2.2d, v24.2d, v26.2d
678 stp a8, a9, [x1, #-32]
681 stp a10, a11, [x1, #-24]
684 ld1 {v24.16b-v27.16b}, [x2], #64
689 stp a12, a13, [x1, #-16]
692 stp a14, a15, [x1, #-8]
695 ld1 {v28.16b-v31.16b}, [x2]
697 // xor with corresponding input, write to output
700 eor v18.16b, v18.16b, v2.16b
710 st1 {v16.16b-v19.16b}, [x1], #64
718 st1 {v20.16b-v23.16b}, [x1], #64
726 st1 {v24.16b-v27.16b}, [x1], #64
727 st1 {v28.16b-v31.16b}, [x1]
734 ld1 {v28.16b-v31.16b}, [x10]
736 tbl v28.16b, {v4.16b-v7.16b}, v28.16b
737 tbl v29.16b, {v4.16b-v7.16b}, v29.16b
738 tbl v30.16b, {v4.16b-v7.16b}, v30.16b
739 tbl v31.16b, {v4.16b-v7.16b}, v31.16b
745 st1 {v20.16b-v23.16b}, [x5] // overlapping stores
746 1: st1 {v16.16b-v19.16b}, [x1]
750 .Lt128: ld1 {v28.16b-v31.16b}, [x10]
753 tbl v28.16b, {v0.16b-v3.16b}, v28.16b
754 tbl v29.16b, {v0.16b-v3.16b}, v29.16b
755 tbl v30.16b, {v0.16b-v3.16b}, v30.16b
756 tbl v31.16b, {v0.16b-v3.16b}, v31.16b
757 ld1 {v16.16b-v19.16b}, [x1] // reload first output block
762 ld1 {v4.16b-v7.16b}, [x10]
764 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
765 tbl v1.16b, {v8.16b-v11.16b}, v5.16b
766 tbl v2.16b, {v8.16b-v11.16b}, v6.16b
767 tbl v3.16b, {v8.16b-v11.16b}, v7.16b
771 eor v30.16b, v30.16b, v2.16b
773 st1 {v28.16b-v31.16b}, [x6] // overlapping stores
774 2: st1 {v20.16b-v23.16b}, [x1]
779 ld1 {v4.16b-v7.16b}, [x10]
781 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
782 tbl v1.16b, {v12.16b-v15.16b}, v5.16b
783 tbl v2.16b, {v12.16b-v15.16b}, v6.16b
784 tbl v3.16b, {v12.16b-v15.16b}, v7.16b
788 eor v30.16b, v30.16b, v2.16b
790 st1 {v28.16b-v31.16b}, [x7] // overlapping stores
791 3: st1 {v24.16b-v27.16b}, [x1]
800 .byte (.Li - 64)