Lines Matching +full:- +full:b

4  * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
11 * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
29 * chacha_permute - permute one block
31 * Permute one 64-byte block where the state matrix is stored in the four NEON
32 * registers v0-v3. It performs matrix operations on four words in parallel,
47 eor v3.16b, v3.16b, v0.16b
52 eor v4.16b, v1.16b, v2.16b
58 eor v3.16b, v3.16b, v0.16b
59 tbl v3.16b, {v3.16b}, v12.16b
63 eor v4.16b, v1.16b, v2.16b
68 ext v1.16b, v1.16b, v1.16b, #4
70 ext v2.16b, v2.16b, v2.16b, #8
72 ext v3.16b, v3.16b, v3.16b, #12
76 eor v3.16b, v3.16b, v0.16b
81 eor v4.16b, v1.16b, v2.16b
87 eor v3.16b, v3.16b, v0.16b
88 tbl v3.16b, {v3.16b}, v12.16b
92 eor v4.16b, v1.16b, v2.16b
97 ext v1.16b, v1.16b, v1.16b, #12
99 ext v2.16b, v2.16b, v2.16b, #8
101 ext v3.16b, v3.16b, v3.16b, #4
104 b.ne .Ldoubleround
115 stp x29, x30, [sp, #-16]!
119 ld1 {v0.4s-v3.4s}, [x0]
120 ld1 {v8.4s-v11.4s}, [x0]
124 ld1 {v4.16b-v7.16b}, [x2]
128 eor v0.16b, v0.16b, v4.16b
132 eor v1.16b, v1.16b, v5.16b
136 eor v2.16b, v2.16b, v6.16b
140 eor v3.16b, v3.16b, v7.16b
142 st1 {v0.16b-v3.16b}, [x1]
150 // x1: output (8 32-bit words)
153 stp x29, x30, [sp, #-16]!
156 ld1 {v0.4s-v3.4s}, [x0]
204 // matrix by interleaving 32- and then 64-bit words, which allows us to
211 ld1 {v30.4s-v31.4s}, [x9]
213 // x0..15[0-3] = s0..3[0..3]
215 ld4r { v0.4s- v3.4s}, [x0]
216 ld4r { v4.4s- v7.4s}, [x8], #16
217 ld4r { v8.4s-v11.4s}, [x8], #16
218 ld4r {v12.4s-v15.4s}, [x8]
237 // x12 += counter values 1-4
254 eor v12.16b, v12.16b, v0.16b
256 eor v13.16b, v13.16b, v1.16b
258 eor v14.16b, v14.16b, v2.16b
260 eor v15.16b, v15.16b, v3.16b
285 eor v16.16b, v4.16b, v8.16b
287 eor v17.16b, v5.16b, v9.16b
289 eor v18.16b, v6.16b, v10.16b
291 eor v19.16b, v7.16b, v11.16b
321 eor v12.16b, v12.16b, v0.16b
323 eor v13.16b, v13.16b, v1.16b
325 eor v14.16b, v14.16b, v2.16b
327 eor v15.16b, v15.16b, v3.16b
330 tbl v12.16b, {v12.16b}, v31.16b
332 tbl v13.16b, {v13.16b}, v31.16b
334 tbl v14.16b, {v14.16b}, v31.16b
336 tbl v15.16b, {v15.16b}, v31.16b
352 eor v16.16b, v4.16b, v8.16b
354 eor v17.16b, v5.16b, v9.16b
356 eor v18.16b, v6.16b, v10.16b
358 eor v19.16b, v7.16b, v11.16b
388 eor v15.16b, v15.16b, v0.16b
390 eor v12.16b, v12.16b, v1.16b
392 eor v13.16b, v13.16b, v2.16b
394 eor v14.16b, v14.16b, v3.16b
419 eor v16.16b, v5.16b, v10.16b
421 eor v17.16b, v6.16b, v11.16b
423 eor v18.16b, v7.16b, v8.16b
425 eor v19.16b, v4.16b, v9.16b
455 eor v15.16b, v15.16b, v0.16b
457 eor v12.16b, v12.16b, v1.16b
459 eor v13.16b, v13.16b, v2.16b
461 eor v14.16b, v14.16b, v3.16b
464 tbl v15.16b, {v15.16b}, v31.16b
466 tbl v12.16b, {v12.16b}, v31.16b
468 tbl v13.16b, {v13.16b}, v31.16b
470 tbl v14.16b, {v14.16b}, v31.16b
486 eor v16.16b, v5.16b, v10.16b
488 eor v17.16b, v6.16b, v11.16b
490 eor v18.16b, v7.16b, v8.16b
492 eor v19.16b, v4.16b, v9.16b
510 b.ne .Ldoubleround4
512 ld4r {v16.4s-v19.4s}, [x0], #16
513 ld4r {v20.4s-v23.4s}, [x0], #16
515 // x12 += counter values 0-3
518 // x0[0-3] += s0[0]
519 // x1[0-3] += s0[1]
520 // x2[0-3] += s0[2]
521 // x3[0-3] += s0[3]
539 ld4r {v24.4s-v27.4s}, [x0], #16
540 ld4r {v28.4s-v31.4s}, [x0]
542 // x4[0-3] += s1[0]
543 // x5[0-3] += s1[1]
544 // x6[0-3] += s1[2]
545 // x7[0-3] += s1[3]
563 // x8[0-3] += s2[0]
564 // x9[0-3] += s2[1]
565 // x10[0-3] += s2[2]
566 // x11[0-3] += s2[3]
584 // x12[0-3] += s3[0]
585 // x13[0-3] += s3[1]
586 // x14[0-3] += s3[2]
587 // x15[0-3] += s3[3]
605 // interleave 32-bit words in state n, n+1
608 ldp w8, w9, [x2, #-56]
616 ldp w6, w7, [x2, #-48]
618 ldp w8, w9, [x2, #-40]
626 ldp w6, w7, [x2, #-32]
628 ldp w8, w9, [x2, #-24]
636 ldp w6, w7, [x2, #-16]
638 ldp w8, w9, [x2, #-8]
653 // interleave 64-bit words in state n, n+2
659 stp a2, a3, [x1, #-56]
662 ld1 {v16.16b-v19.16b}, [x2], #64
667 stp a4, a5, [x1, #-48]
670 stp a6, a7, [x1, #-40]
673 ld1 {v20.16b-v23.16b}, [x2], #64
678 stp a8, a9, [x1, #-32]
681 stp a10, a11, [x1, #-24]
684 ld1 {v24.16b-v27.16b}, [x2], #64
689 stp a12, a13, [x1, #-16]
692 stp a14, a15, [x1, #-8]
695 ld1 {v28.16b-v31.16b}, [x2]
698 eor v16.16b, v16.16b, v0.16b
699 eor v17.16b, v17.16b, v1.16b
700 eor v18.16b, v18.16b, v2.16b
701 eor v19.16b, v19.16b, v3.16b
705 eor v20.16b, v20.16b, v4.16b
706 eor v21.16b, v21.16b, v5.16b
707 eor v22.16b, v22.16b, v6.16b
708 eor v23.16b, v23.16b, v7.16b
710 st1 {v16.16b-v19.16b}, [x1], #64
713 eor v24.16b, v24.16b, v8.16b
714 eor v25.16b, v25.16b, v9.16b
715 eor v26.16b, v26.16b, v10.16b
716 eor v27.16b, v27.16b, v11.16b
718 st1 {v20.16b-v23.16b}, [x1], #64
721 eor v28.16b, v28.16b, v12.16b
722 eor v29.16b, v29.16b, v13.16b
723 eor v30.16b, v30.16b, v14.16b
724 eor v31.16b, v31.16b, v15.16b
726 st1 {v24.16b-v27.16b}, [x1], #64
727 st1 {v28.16b-v31.16b}, [x1]
734 ld1 {v28.16b-v31.16b}, [x10]
736 tbl v28.16b, {v4.16b-v7.16b}, v28.16b
737 tbl v29.16b, {v4.16b-v7.16b}, v29.16b
738 tbl v30.16b, {v4.16b-v7.16b}, v30.16b
739 tbl v31.16b, {v4.16b-v7.16b}, v31.16b
741 0: eor v20.16b, v20.16b, v28.16b
742 eor v21.16b, v21.16b, v29.16b
743 eor v22.16b, v22.16b, v30.16b
744 eor v23.16b, v23.16b, v31.16b
745 st1 {v20.16b-v23.16b}, [x5] // overlapping stores
746 1: st1 {v16.16b-v19.16b}, [x1]
747 b .Lout
750 .Lt128: ld1 {v28.16b-v31.16b}, [x10]
753 tbl v28.16b, {v0.16b-v3.16b}, v28.16b
754 tbl v29.16b, {v0.16b-v3.16b}, v29.16b
755 tbl v30.16b, {v0.16b-v3.16b}, v30.16b
756 tbl v31.16b, {v0.16b-v3.16b}, v31.16b
757 ld1 {v16.16b-v19.16b}, [x1] // reload first output block
758 b 0b
762 ld1 {v4.16b-v7.16b}, [x10]
764 tbl v0.16b, {v8.16b-v11.16b}, v4.16b
765 tbl v1.16b, {v8.16b-v11.16b}, v5.16b
766 tbl v2.16b, {v8.16b-v11.16b}, v6.16b
767 tbl v3.16b, {v8.16b-v11.16b}, v7.16b
769 eor v28.16b, v28.16b, v0.16b
770 eor v29.16b, v29.16b, v1.16b
771 eor v30.16b, v30.16b, v2.16b
772 eor v31.16b, v31.16b, v3.16b
773 st1 {v28.16b-v31.16b}, [x6] // overlapping stores
774 2: st1 {v20.16b-v23.16b}, [x1]
775 b .Lout
779 ld1 {v4.16b-v7.16b}, [x10]
781 tbl v0.16b, {v12.16b-v15.16b}, v4.16b
782 tbl v1.16b, {v12.16b-v15.16b}, v5.16b
783 tbl v2.16b, {v12.16b-v15.16b}, v6.16b
784 tbl v3.16b, {v12.16b-v15.16b}, v7.16b
786 eor v28.16b, v28.16b, v0.16b
787 eor v29.16b, v29.16b, v1.16b
788 eor v30.16b, v30.16b, v2.16b
789 eor v31.16b, v31.16b, v3.16b
790 st1 {v28.16b-v31.16b}, [x7] // overlapping stores
791 3: st1 {v24.16b-v27.16b}, [x1]
792 b .Lout
800 .byte (.Li - 64)