sha256-ce.S - OpenGrok cross reference for /linux/lib/crypto/arm64/sha256-ce.S

Lines Matching +full:- +full:4
1 /* SPDX-License-Identifier: GPL-2.0-only */
3  * sha256-ce.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
12 	.arch		armv8-a+crypto
32 	add		t1.4s, v\s0\().4s, \rc\().4s
33 	sha256h		dg0q, dg1q, t0.4s
34 	sha256h2	dg1q, dg2q, t0.4s
37 	add		t0.4s, v\s0\().4s, \rc\().4s
39 	sha256h		dg0q, dg1q, t1.4s
40 	sha256h2	dg1q, dg2q, t1.4s
45 	sha256su0	v\s0\().4s, v\s1\().4s
47 	sha256su1	v\s0\().4s, v\s2\().4s, v\s3\().4s
51 	 * The SHA-256 round constants
54 	.align		4
75 	ld1		{ v0.4s- v3.4s}, [\tmp], #64
76 	ld1		{ v4.4s- v7.4s}, [\tmp], #64
77 	ld1		{ v8.4s-v11.4s}, [\tmp], #64
78 	ld1		{v12.4s-v15.4s}, [\tmp]
91 	ld1		{dgav.4s, dgbv.4s}, [x0]
94 0:	ld1		{v16.4s-v19.4s}, [x1], #64
102 	add		t0.4s, v16.4s, v0.4s
127 	add		dgav.4s, dgav.4s, dg0v.4s
128 	add		dgbv.4s, dgbv.4s, dg1v.4s
137 1:	st1		{dgav.4s, dgbv.4s}, [x0]
167 	// x8-x9 are used as temporaries.
169 	// v0-v15 are used to cache the SHA-256 round constants.
170 	// v16-v19 are used for the message schedule for the first message.
171 	// v20-v23 are used for the message schedule for the second message.
172 	// v24-v31 are used for the state and temporaries as given below.
193 	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
194 	// and m0_b contain the current 4 message schedule words for the first
198 	// this also computes 4 more message schedule words for each message.
199 	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
200 	// the first message, and likewise m1_b-m3_b for the second.  After
208 	add		t0_a\().4s, \m0_a\().4s, \k\().4s
209 	add		t0_b\().4s, \m0_b\().4s, \k\().4s
211 	sha256su0	\m0_a\().4s, \m1_a\().4s
212 	sha256su0	\m0_b\().4s, \m1_b\().4s
213 	sha256su1	\m0_a\().4s, \m2_a\().4s, \m3_a\().4s
214 	sha256su1	\m0_b\().4s, \m2_b\().4s, \m3_b\().4s
218 	sha256h		state0_a_q, state1_a_q, t0_a\().4s
219 	sha256h		state0_b_q, state1_b_q, t0_b\().4s
220 	sha256h2	state1_a_q, t1_a_q, t0_a\().4s
221 	sha256h2	state1_b_q, t1_b_q, t0_b\().4s
226 	do_4rounds_2x	\i + 4,  \k1,  v17, v18, v19, v16,  v21, v22, v23, v20
237 // This function computes the SHA-256 digests of two messages |data1| and
241 // The instructions for the two SHA-256 operations are interleaved.  On many
243 // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
250 	// Load the initial state from ctx->state.
251 	ld1		{state0_a.4s-state1_a.4s}, [ctx]
253 	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
254 	// bytes that are buffered in ctx->buf.  Also save it in a register with
261 	// x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
262 	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
263 	// just load 64 bytes from each of ctx->buf, data1, and data2
266 	ld1		{v16.16b-v19.16b}, [x9]
267 	st1		{v16.16b-v19.16b}, [sp]
269 	ld1		{v16.16b-v19.16b}, [data1], #64
271 	st1		{v16.16b-v19.16b}, [x9]
272 	ld1		{v16.4s-v19.4s}, [sp]
274 	ld1		{v20.16b-v23.16b}, [data2], #64
275 	st1		{v20.16b-v23.16b}, [x9]
276 	ld1		{v20.4s-v23.4s}, [sp]
292 	ld1		{v16.4s-v19.4s}, [data1], #64
293 	ld1		{v20.4s-v23.4s}, [data2], #64
307 	st1		{state0_a.4s-state1_b.4s}, [sp]
309 	// Do the SHA-256 rounds on each block.
316 	ld1		{v16.4s-v19.4s}, [sp]
317 	add		state0_a.4s, state0_a.4s, v16.4s
318 	add		state1_a.4s, state1_a.4s, v17.4s
319 	add		state0_b.4s, state0_b.4s, v18.4s
320 	add		state1_b.4s, state1_b.4s, v19.4s
328 	// final_step = 1: need to do count-only padding block
335 	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
338 	// and load from &sp[64 - len] to get the needed padding block.  This
340 	sub		w8, len, #64		// w8 = len - 64
341 	add		data1, data1, w8, sxtw	// data1 += len - 64
342 	add		data2, data2, w8, sxtw	// data2 += len - 64
351 	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
357 	mov		final_step, #2	// won't need count-only block
360 	mov		final_step, #1	// will need count-only block
362 	ld1		{v16.16b-v19.16b}, [data1]
363 	st1		{v16.16b-v19.16b}, [sp]
364 	ld1		{v16.4s-v19.4s}, [x9]
365 	ld1		{v20.16b-v23.16b}, [data2]
366 	st1		{v20.16b-v23.16b}, [sp]
367 	ld1		{v20.4s-v23.4s}, [x9]
378 	// Pre-swap the endianness of the words.
404 	st1		{state0_a.4s-state1_a.4s}, [out1]
405 	st1		{state0_b.4s-state1_b.4s}, [out2]