chacha-neon-core.S - OpenGrok cross reference for /linux/lib/crypto/arm64/chacha-neon-core.S

Lines Matching +full:xor +full:- +full:v2
4  * Copyright (C) 2016-2018 Linaro, Ltd. <ard.biesheuvel@linaro.org>
11  * ChaCha20 256-bit cipher algorithm, RFC7539, x64 SSSE3 functions
29  * chacha_permute - permute one block
31  * Permute one 64-byte block where the state matrix is stored in the four NEON
32  * registers v0-v3.  It performs matrix operations on four words in parallel,
51 	add		v2.4s, v2.4s, v3.4s
52 	eor		v4.16b, v1.16b, v2.16b
62 	add		v2.4s, v2.4s, v3.4s
63 	eor		v4.16b, v1.16b, v2.16b
70 	ext		v2.16b, v2.16b, v2.16b, #8
80 	add		v2.4s, v2.4s, v3.4s
81 	eor		v4.16b, v1.16b, v2.16b
91 	add		v2.4s, v2.4s, v3.4s
92 	eor		v4.16b, v1.16b, v2.16b
99 	ext		v2.16b, v2.16b, v2.16b, #8
115 	stp		x29, x30, [sp, #-16]!
119 	ld1		{v0.4s-v3.4s}, [x0]
120 	ld1		{v8.4s-v11.4s}, [x0]
124 	ld1		{v4.16b-v7.16b}, [x2]
135 	add		v2.4s, v2.4s, v10.4s
136 	eor		v2.16b, v2.16b, v6.16b
142 	st1		{v0.16b-v3.16b}, [x1]
150 	// x1: output (8 32-bit words)
153 	stp		x29, x30, [sp, #-16]!
156 	ld1		{v0.4s-v3.4s}, [x0]
204 	// matrix by interleaving 32- and then 64-bit words, which allows us to
205 	// do XOR in NEON registers.
211 	ld1		{v30.4s-v31.4s}, [x9]
213 	// x0..15[0-3] = s0..3[0..3]
215 	ld4r		{ v0.4s- v3.4s}, [x0]
216 	ld4r		{ v4.4s- v7.4s}, [x8], #16
217 	ld4r		{ v8.4s-v11.4s}, [x8], #16
218 	ld4r		{v12.4s-v15.4s}, [x8]
222 	mov		a2, v2.s[0]
237 	// x12 += counter values 1-4
249 	add		v2.4s, v2.4s, v6.4s
258 	eor		v14.16b, v14.16b, v2.16b
316 	add		v2.4s, v2.4s, v6.4s
325 	eor		v14.16b, v14.16b, v2.16b
383 	add		v2.4s, v2.4s, v7.4s
392 	eor		v13.16b, v13.16b, v2.16b
450 	add		v2.4s, v2.4s, v7.4s
459 	eor		v13.16b, v13.16b, v2.16b
512 	ld4r		{v16.4s-v19.4s}, [x0], #16
513 	ld4r		{v20.4s-v23.4s}, [x0], #16
515 	// x12 += counter values 0-3
518 	// x0[0-3] += s0[0]
519 	// x1[0-3] += s0[1]
520 	// x2[0-3] += s0[2]
521 	// x3[0-3] += s0[3]
528 	add		v2.4s, v2.4s, v18.4s
539 	ld4r		{v24.4s-v27.4s}, [x0], #16
540 	ld4r		{v28.4s-v31.4s}, [x0]
542 	// x4[0-3] += s1[0]
543 	// x5[0-3] += s1[1]
544 	// x6[0-3] += s1[2]
545 	// x7[0-3] += s1[3]
563 	// x8[0-3] += s2[0]
564 	// x9[0-3] += s2[1]
565 	// x10[0-3] += s2[2]
566 	// x11[0-3] += s2[3]
584 	// x12[0-3] += s3[0]
585 	// x13[0-3] += s3[1]
586 	// x14[0-3] += s3[2]
587 	// x15[0-3] += s3[3]
605 	// interleave 32-bit words in state n, n+1
608 	  ldp		w8, w9, [x2, #-56]
612 	zip1		v18.4s, v2.4s, v3.4s
614 	zip2		v19.4s, v2.4s, v3.4s
616 	  ldp		w6, w7, [x2, #-48]
618 	  ldp		w8, w9, [x2, #-40]
626 	  ldp		w6, w7, [x2, #-32]
628 	  ldp		w8, w9, [x2, #-24]
636 	  ldp		w6, w7, [x2, #-16]
638 	  ldp		w8, w9, [x2, #-8]
653 	// interleave 64-bit words in state n, n+2
659 	  stp		a2, a3, [x1, #-56]
662 	ld1		{v16.16b-v19.16b}, [x2], #64
667 	  stp		a4, a5, [x1, #-48]
670 	  stp		a6, a7, [x1, #-40]
673 	ld1		{v20.16b-v23.16b}, [x2], #64
676 	zip1		v2.2d, v24.2d, v26.2d
678 	  stp		a8, a9, [x1, #-32]
681 	  stp		a10, a11, [x1, #-24]
684 	ld1		{v24.16b-v27.16b}, [x2], #64
689 	  stp		a12, a13, [x1, #-16]
692 	  stp		a14, a15, [x1, #-8]
695 	ld1		{v28.16b-v31.16b}, [x2]
697 	// xor with corresponding input, write to output
700 	eor		v18.16b, v18.16b, v2.16b
710 	st1		{v16.16b-v19.16b}, [x1], #64
718 	st1		{v20.16b-v23.16b}, [x1], #64
726 	st1		{v24.16b-v27.16b}, [x1], #64
727 	st1		{v28.16b-v31.16b}, [x1]
734 	ld1		{v28.16b-v31.16b}, [x10]
736 	tbl		v28.16b, {v4.16b-v7.16b}, v28.16b
737 	tbl		v29.16b, {v4.16b-v7.16b}, v29.16b
738 	tbl		v30.16b, {v4.16b-v7.16b}, v30.16b
739 	tbl		v31.16b, {v4.16b-v7.16b}, v31.16b
745 	st1		{v20.16b-v23.16b}, [x5]		// overlapping stores
746 1:	st1		{v16.16b-v19.16b}, [x1]
750 .Lt128:	ld1		{v28.16b-v31.16b}, [x10]
753 	tbl		v28.16b, {v0.16b-v3.16b}, v28.16b
754 	tbl		v29.16b, {v0.16b-v3.16b}, v29.16b
755 	tbl		v30.16b, {v0.16b-v3.16b}, v30.16b
756 	tbl		v31.16b, {v0.16b-v3.16b}, v31.16b
757 	ld1		{v16.16b-v19.16b}, [x1]		// reload first output block
762 	ld1		{v4.16b-v7.16b}, [x10]
764 	tbl		v0.16b, {v8.16b-v11.16b}, v4.16b
765 	tbl		v1.16b, {v8.16b-v11.16b}, v5.16b
766 	tbl		v2.16b, {v8.16b-v11.16b}, v6.16b
767 	tbl		v3.16b, {v8.16b-v11.16b}, v7.16b
771 	eor		v30.16b, v30.16b, v2.16b
773 	st1		{v28.16b-v31.16b}, [x6]		// overlapping stores
774 2:	st1		{v20.16b-v23.16b}, [x1]
779 	ld1		{v4.16b-v7.16b}, [x10]
781 	tbl		v0.16b, {v12.16b-v15.16b}, v4.16b
782 	tbl		v1.16b, {v12.16b-v15.16b}, v5.16b
783 	tbl		v2.16b, {v12.16b-v15.16b}, v6.16b
784 	tbl		v3.16b, {v12.16b-v15.16b}, v7.16b
788 	eor		v30.16b, v30.16b, v2.16b
790 	st1		{v28.16b-v31.16b}, [x7]		// overlapping stores
791 3:	st1		{v24.16b-v27.16b}, [x1]
800 	.byte		(.Li - 64)