sha256-ni-asm.S - OpenGrok cross reference for /linux/lib/crypto/x86/sha256-ni-asm.S

Lines Matching +full:len +full:- +full:or +full:- +full:define
2  * Intel SHA Extensions optimized implementation of a SHA-256 update function
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
11  * This program is free software; you can redistribute it and/or modify
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
28  * Redistribution and use in source and binary forms, with or without
36  * 	  the documentation and/or other materials provided with the
39  * 	  contributors may be used to endorse or promote products derived
43  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
46  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
58 #define STATE_PTR	%rdi	/* 1st arg */
59 #define DATA_PTR	%rsi	/* 2nd arg */
60 #define NUM_BLKS	%rdx	/* 3rd arg */
62 #define SHA256CONSTANTS	%rax
64 #define MSG		%xmm0  /* sha256rnds2 implicit operand */
65 #define STATE0		%xmm1
66 #define STATE1		%xmm2
67 #define MSG0		%xmm3
68 #define MSG1		%xmm4
69 #define MSG2		%xmm5
70 #define MSG3		%xmm6
71 #define TMP		%xmm7
73 #define SHUF_MASK	%xmm8
75 #define ABEF_SAVE	%xmm9
76 #define CDGH_SAVE	%xmm10
83 	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
100  * Intel SHA Extensions optimized implementation of a SHA-256 block function
102  * This function takes a pointer to the current SHA-256 state, a pointer to the
103  * input data, and the number of 64-byte blocks to process.  Once all blocks
120 	 * DCBA, HGFE -> ABEF, CDGH
185 #define CTX		%rdi
186 #define DATA1		%rsi
187 #define DATA2		%rdx
188 #define LEN		%ecx
189 #define LEN8		%cl
190 #define LEN64		%rcx
191 #define OUT1		%r8
192 #define OUT2		%r9
195 #define SHA256CONSTANTS	%rax
196 #define COUNT		%r10
197 #define COUNT32		%r10d
198 #define FINAL_STEP	%r11d
202 #define MSG		%xmm0	// sha256rnds2 implicit operand
203 #define STATE0_A	%xmm1
204 #define STATE1_A	%xmm2
205 #define STATE0_B	%xmm3
206 #define STATE1_B	%xmm4
207 #define TMP_A		%xmm5
208 #define TMP_B		%xmm6
209 #define MSG0_A		%xmm7
210 #define MSG1_A		%xmm8
211 #define MSG2_A		%xmm9
212 #define MSG3_A		%xmm10
213 #define MSG0_B		%xmm11
214 #define MSG1_B		%xmm12
215 #define MSG2_B		%xmm13
216 #define MSG3_B		%xmm14
217 #define SHUF_MASK	%xmm15
219 #define OFFSETOF_STATE		0  // offsetof(struct __sha256_ctx, state)
220 #define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
221 #define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
223 // Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
228 // computes 4 more message schedule words for each message.  m1_a-m3_a contain
230 // likewise m1_b-m3_b for the second.  After consuming the current value of
236 	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
266 //			  const u8 *data1, const u8 *data2, int len,
270 // This function computes the SHA-256 digests of two messages |data1| and
271 // |data2| that are both |len| bytes long, starting from the initial context
272 // |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
274 // The instructions for the two SHA-256 operations are interleaved.  On many
276 // to taking better advantage of the CPU's SHA-256 and SIMD throughput.
279 	// Allocate 128 bytes of stack space, 16-byte aligned.
286 	// Load the shuffle mask for swapping the endianness of 32-bit words.
295 	// Load the initial state from ctx->state.
304 	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
305 	// bytes that are buffered in ctx->buf.  Also save it in a register with
306 	// LEN added to it.
307 	mov		LEN, LEN
313 	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
314 	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
315 	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
353 	sub		$64, %rbx 	// rbx = buffered - 64
354 	sub		%rbx, DATA1	// DATA1 += 64 - buffered
355 	sub		%rbx, DATA2	// DATA2 += 64 - buffered
356 	add		%ebx, LEN	// LEN += buffered - 64
362 	sub		$64, LEN
395 	// Do the SHA-256 rounds on each block.
413 	// Update LEN and loop back if more blocks remain.
414 	sub		$64, LEN
419 	// FINAL_STEP = 1: need to do count-only padding block
424 	add		$64, LEN
427 	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
430 	// and load from &sp[64 - LEN] to get the needed padding block.  This
433 	sub		LEN, %ebx		// ebx = 64 - LEN
434 	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
435 	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
443 	cmp		$56, LEN
448 	mov		$2, FINAL_STEP	// won't need count-only block
451 	mov		$1, FINAL_STEP	// will need count-only block
488 	// Pre-swap the endianness of the words.