/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * sha2-ce-core.S - core SHA-224/SHA-256 transform using v8 Crypto Extensions
 */

        .arch           armv8-a+crypto
        mov             dg2v.16b, dg0v.16b
        /*
         * The SHA-256 round constants
         */
        ld1             { v0.4s- v3.4s}, [\tmp], #64
        ld1             { v4.4s- v7.4s}, [\tmp], #64
        ld1             { v8.4s-v11.4s}, [\tmp], #64
        ld1             {v12.4s-v15.4s}, [\tmp]
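        /*
         * Note: the 64 32-bit round constants come to exactly 256 bytes
         * (4 loads of 64 bytes above), which is why they fit precisely in
         * the sixteen 128-bit registers v0-v15.
         */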
0:      ld1             {v16.4s-v19.4s}, [x1], #64
        sub             x2, x2, #1

CPU_LE( rev32           v16.16b, v16.16b        )
CPU_LE( rev32           v17.16b, v17.16b        )
CPU_LE( rev32           v18.16b, v18.16b        )
CPU_LE( rev32           v19.16b, v19.16b        )
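        /*
         * The byte swaps above are needed because SHA-256 defines the
         * message block as sixteen big-endian 32-bit words, so on
         * little-endian CPUs each loaded word must be reversed first.
         */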
        mov             dg0v.16b, dgav.16b
        mov             dg1v.16b, dgbv.16b
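        /*
         * dgav/dgbv carry the running state across blocks; dg0v/dg1v are
         * the per-block working copies that the rounds below update.
         */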
        add_update      0,  v1, 16, 17, 18, 19
        add_update      1,  v2, 17, 18, 19, 16
        add_update      0,  v3, 18, 19, 16, 17
        add_update      1,  v4, 19, 16, 17, 18

        add_update      0,  v5, 16, 17, 18, 19
        add_update      1,  v6, 17, 18, 19, 16
        add_update      0,  v7, 18, 19, 16, 17
        add_update      1,  v8, 19, 16, 17, 18

        add_update      0,  v9, 16, 17, 18, 19
        add_update      1, v10, 17, 18, 19, 16
        add_update      0, v11, 18, 19, 16, 17
        add_update      1, v12, 19, 16, 17, 18

        add_only        0, v13, 17
        add_only        1, v14, 18
        add_only        0, v15, 19
        add_only        1
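        /*
         * Each macro invocation above covers four rounds, 64 in total.
         * add_update also extends the message schedule four words at a
         * time; the trailing add_only calls do not, since all 64 schedule
         * words have been computed by round 48.
         */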
        cond_yield      1f, x5, x6
1:      st1             {dgav.4s, dgbv.4s}, [x0]
        // x8-x9 are used as temporaries.
        //
        // v0-v15 are used to cache the SHA-256 round constants.
        // v16-v19 are used for the message schedule for the first message.
        // v20-v23 are used for the message schedule for the second message.
        // v24-v31 are used for the state and temporaries as given below.
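        // (Together that accounts for every SIMD register:
        // 16 + 4 + 4 + 8 = 32.)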
// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a
// and m0_b contain the current 4 message schedule words for the first
// and second message, respectively.
//
// If not all the message schedule words have been computed yet, then this
// also computes 4 more message schedule words for each message.
// m1_a-m3_a contain the next 3 groups of 4 message schedule words for
// the first message, and likewise m1_b-m3_b for the second.  After
// consuming the current value of m0_a, this macro computes the group
// after m3_a and writes it to m0_a, and likewise for *_b, so the caller
// must cycle through the registers accordingly.
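// For reference, each new schedule word follows the standard SHA-256
// recurrence, which sha256su0/sha256su1 evaluate four words at a time
// (sigma0/sigma1 being the usual small-sigma functions):
//
//	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]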
        // sha256h overwrites state0, so stash a copy for the sha256h2 half.
        mov             t1_a.16b, state0_a.16b
        mov             t1_b.16b, state0_b.16b
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial state
// |ctx|.  |len| must be at least 64.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
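// A sketch of the C prototype this routine presumably corresponds to (the
// exact type and parameter names are assumptions, not taken from this
// excerpt):
//
//	void sha256_ce_finup2x(const struct sha256_state *ctx,
//			       const u8 *data1, const u8 *data2, int len,
//			       u8 out1[SHA256_DIGEST_SIZE],
//			       u8 out2[SHA256_DIGEST_SIZE]);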
        // Load the initial state from ctx->state.
        ld1             {state0_a.4s-state1_a.4s}, [ctx]
        // Load ctx->bytecount.  Take it mod 64 to get the number of bytes
        // that are buffered in ctx->buf.  Also save it in a register with
        // len added to it.
        // x8 bytes (1 to 63) are currently buffered in ctx->buf.  Load them
        // followed by the first 64 - x8 bytes of data.  Since len >= 64, we
        // just load 64 bytes from each of ctx->buf, data1, and data2
        // unconditionally and rearrange the data as needed.
        ld1             {v16.16b-v19.16b}, [x9]
        st1             {v16.16b-v19.16b}, [sp]
        ld1             {v16.16b-v19.16b}, [data1], #64
        add             x9, sp, x8
        st1             {v16.16b-v19.16b}, [x9]
        ld1             {v16.4s-v19.4s}, [sp]
        ld1             {v20.16b-v23.16b}, [data2], #64
        st1             {v20.16b-v23.16b}, [x9]
        ld1             {v20.4s-v23.4s}, [sp]
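        // The two load/store sequences above are effectively this C
        // pseudo-code for each message (with `block` being the 64-byte
        // chunk that gets hashed next, for data1 and likewise data2):
        //
        //	memcpy(block, ctx->buf, x8);
        //	memcpy(block + x8, data1, 64 - x8);
        //
        // done branch-free via overlapping 64-byte copies through the
        // stack buffer.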
        // The second message starts from the same initial state as the
        // first, so just copy it.
        mov             state0_b.16b, state0_a.16b
        mov             state1_b.16b, state1_a.16b
        b               .Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
        sub             len, len, #64
        mov             state0_b.16b, state0_a.16b
        mov             state1_b.16b, state1_a.16b
.Lfinup2x_loop:
        // Load the next two data blocks.
        ld1             {v16.4s-v19.4s}, [data1], #64
        ld1             {v20.4s-v23.4s}, [data2], #64

.Lfinup2x_loop_have_data:
        // Convert the words of the data blocks from big endian.
CPU_LE( rev32           v16.16b, v16.16b        )
CPU_LE( rev32           v17.16b, v17.16b        )
CPU_LE( rev32           v18.16b, v18.16b        )
CPU_LE( rev32           v19.16b, v19.16b        )
CPU_LE( rev32           v20.16b, v20.16b        )
CPU_LE( rev32           v21.16b, v21.16b        )
CPU_LE( rev32           v22.16b, v22.16b        )
CPU_LE( rev32           v23.16b, v23.16b        )
        // Save the original state for each message.
        st1             {state0_a.4s-state1_b.4s}, [sp]
        // Do the SHA-256 rounds on each block.
        do_16rounds_2x  0,  v0, v1, v2, v3
        do_16rounds_2x  16, v4, v5, v6, v7
        do_16rounds_2x  32, v8, v9, v10, v11
        do_16rounds_2x  48, v12, v13, v14, v15
        // Add the original state back in for each message.
        ld1             {v16.4s-v19.4s}, [sp]
        // final_step = 2: all done
        // final_step = 1: need to do count-only padding block
        // final_step = 0: need to do the block with the 0x80 padding byte
        tbnz            final_step, #1, .Lfinup2x_done
        // Not block-aligned; 1 <= len <= 63 data bytes remain.  Pad the block.
        // To do this, write the padding starting with the 0x80 byte to
        // &sp[64].  Then for each message, copy the last 64 data bytes to sp
        // and load from &sp[64 - len] to get the needed padding block.  This
        // code relies on the data buffers being >= 64 bytes in length.
        sub             w8, len, #64            // w8 = len - 64
        add             data1, data1, w8, sxtw  // data1 += len - 64
        add             data2, data2, w8, sxtw  // data2 += len - 64
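        // In C pseudo-code, the trick looks roughly like this for one
        // message (and likewise for the other):
        //
        //	memcpy(sp, data1 + len - 64, 64); /* ends at the data's end */
        //	block = sp + (64 - len);          /* len tail bytes + pad   */
        //
        // The load from &sp[64 - len] picks up the final len data bytes
        // followed by the padding previously written at &sp[64].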
CPU_BE( movi            v16.16b, #0             )
CPU_BE( mov             v16.d[1], x9            )
        movi            v17.16b, #0
        sub             x9, sp, w8, sxtw        // x9 = &sp[64 - len]
        cmp             len, #56
        b.ge            1f              // will count spill into its own block?
        lsl             count, count, #3
CPU_LE( rev             count, count    )
        str             count, [x9, #56]
        mov             final_step, #2  // won't need count-only block
        b               2f
1:
        mov             final_step, #1  // will need count-only block
2:
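        // Why 56 is the cutoff: the padding needs 1 byte for 0x80 plus
        // 8 bytes for the big-endian bit count, so up to 55 data bytes
        // fit in the final block (55 + 1 + 8 = 64).  With 56 to 63 data
        // bytes the count no longer fits, and an extra count-only block
        // is required.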
        ld1             {v16.16b-v19.16b}, [data1]
        st1             {v16.16b-v19.16b}, [sp]
        ld1             {v16.4s-v19.4s}, [x9]
        ld1             {v20.16b-v23.16b}, [data2]
        st1             {v20.16b-v23.16b}, [sp]
        ld1             {v20.4s-v23.4s}, [x9]
        // Pre-swap the endianness of the words.

        b               1f
1:
        mov             v19.d[1], count
        // The count-only block is identical for both messages; copy it.
        mov             v20.16b, v16.16b
        mov             v21.16b, v17.16b
        mov             v22.16b, v18.16b
        mov             v23.16b, v19.16b
        // The digest words are defined big-endian, so byte-swap them on
        // little-endian CPUs before storing the two digests.
CPU_LE( rev32           state0_a.16b, state0_a.16b      )
CPU_LE( rev32           state1_a.16b, state1_a.16b      )
CPU_LE( rev32           state0_b.16b, state0_b.16b      )
CPU_LE( rev32           state1_b.16b, state1_b.16b      )
        st1             {state0_a.4s-state1_a.4s}, [out1]
        st1             {state0_b.4s-state1_b.4s}, [out2]