sha256-ce.S - OpenGrok cross reference for /linux/lib/crypto/arm64/sha256-ce.S

Lines Matching +full:sub +full:- +full:message
1 /* SPDX-License-Identifier: GPL-2.0-only */
3  * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
12 	.arch		armv8-a+crypto
51 	 * The SHA-256 round constants
75 	ld1		{ v0.4s- v3.4s}, [\tmp], #64
76 	ld1		{ v4.4s- v7.4s}, [\tmp], #64
77 	ld1		{ v8.4s-v11.4s}, [\tmp], #64
78 	ld1		{v12.4s-v15.4s}, [\tmp]
94 0:	ld1		{v16.4s-v19.4s}, [x1], #64
95 	sub		x2, x2, #1
167 	// x8-x9 are used as temporaries.
169 	// v0-v15 are used to cache the SHA-256 round constants.
170 	// v16-v19 are used for the message schedule for the first message.
171 	// v20-v23 are used for the message schedule for the second message.
172 	// v24-v31 are used for the state and temporaries as given below.
173 	// *_a are for the first message and *_b for the second.
193 	// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
194 	// and m0_b contain the current 4 message schedule words for the first
195 	// and second message respectively.
197 	// If not all the message schedule words have been computed yet, then
198 	// this also computes 4 more message schedule words for each message.
199 	// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
200 	// the first message, and likewise m1_b-m3_b for the second.  After
237 // This function computes the SHA-256 digests of two messages |data1| and
241 // The instructions for the two SHA-256 operations are interleaved.  On many
242 // CPUs, this is almost twice as fast as hashing each message individually due
243 // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
246 	sub		sp, sp, #128
250 	// Load the initial state from ctx->state.
251 	ld1		{state0_a.4s-state1_a.4s}, [ctx]
253 	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
254 	// bytes that are buffered in ctx->buf.  Also save it in a register with
261 	// x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
262 	// followed by the first 64 - x8 bytes of data.  Since len >= 64, we
263 	// just load 64 bytes from each of ctx->buf, data1, and data2
266 	ld1		{v16.16b-v19.16b}, [x9]
267 	st1		{v16.16b-v19.16b}, [sp]
269 	ld1		{v16.16b-v19.16b}, [data1], #64
271 	st1		{v16.16b-v19.16b}, [x9]
272 	ld1		{v16.4s-v19.4s}, [sp]
274 	ld1		{v20.16b-v23.16b}, [data2], #64
275 	st1		{v20.16b-v23.16b}, [x9]
276 	ld1		{v20.4s-v23.4s}, [sp]
278 	sub		len, len, #64
279 	sub		data1, data1, x8
280 	sub		data2, data2, x8
287 	sub		len, len, #64
292 	ld1		{v16.4s-v19.4s}, [data1], #64
293 	ld1		{v20.4s-v23.4s}, [data2], #64
307 	st1		{state0_a.4s-state1_b.4s}, [sp]
309 	// Do the SHA-256 rounds on each block.
316 	ld1		{v16.4s-v19.4s}, [sp]
323 	sub		len, len, #64
328 	// final_step = 1: need to do count-only padding block
335 	// Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
337 	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
338 	// and load from &sp[64 - len] to get the needed padding block.  This
340 	sub		w8, len, #64		// w8 = len - 64
341 	add		data1, data1, w8, sxtw	// data1 += len - 64
342 	add		data2, data2, w8, sxtw	// data2 += len - 64
351 	sub		x9, sp, w8, sxtw	// x9 = &sp[64 - len]
357 	mov		final_step, #2	// won't need count-only block
360 	mov		final_step, #1	// will need count-only block
362 	ld1		{v16.16b-v19.16b}, [data1]
363 	st1		{v16.16b-v19.16b}, [sp]
364 	ld1		{v16.4s-v19.4s}, [x9]
365 	ld1		{v20.16b-v23.16b}, [data2]
366 	st1		{v20.16b-v23.16b}, [sp]
367 	ld1		{v20.4s-v23.4s}, [x9]
373 	//	This is for a block-aligned message.
376 	//	This is for a message whose length mod 64 is >= 56.
378 	// Pre-swap the endianness of the words.
404 	st1		{state0_a.4s-state1_a.4s}, [out1]
405 	st1		{state0_b.4s-state1_b.4s}, [out2]