/*
 * Intel SHA Extensions optimized implementation of a SHA-256 update function
 *
 * This file is provided under a dual BSD/GPLv2 license.  When using or
 * redistributing this file, you may do so under either license.
 *
 * GPL LICENSE SUMMARY
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of version 2 of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * Contact Information:
 * 	Sean Gulley <sean.m.gulley@intel.com>
 * 	Tim Chen <tim.c.chen@linux.intel.com>
 *
 * BSD LICENSE
 *
 * Copyright(c) 2015 Intel Corporation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 	* Redistributions of source code must retain the above copyright
 * 	  notice, this list of conditions and the following disclaimer.
 * 	* Redistributions in binary form must reproduce the above copyright
 * 	  notice, this list of conditions and the following disclaimer in
 * 	  the documentation and/or other materials provided with the
 * 	  distribution.
 * 	* Neither the name of Intel Corporation nor the names of its
 * 	  contributors may be used to endorse or promote products derived
 * 	  from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include <linux/linkage.h>

#define STATE_PTR	%rdi	/* 1st arg */
#define DATA_PTR	%rsi	/* 2nd arg */
#define NUM_BLKS	%rdx	/* 3rd arg */

#define SHA256CONSTANTS	%rax

#define MSG		%xmm0  /* sha256rnds2 implicit operand */
#define STATE0		%xmm1
#define STATE1		%xmm2
#define MSG0		%xmm3
#define MSG1		%xmm4
#define MSG2		%xmm5
#define MSG3		%xmm6
#define TMP		%xmm7

#define SHUF_MASK	%xmm8

#define ABEF_SAVE	%xmm9
#define CDGH_SAVE	%xmm10

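/*
 * Do 4 rounds of SHA-256, for rounds \i through \i+3.  \m0 must hold the
 * current group of 4 message schedule words; for \i < 16 it is first loaded
 * from DATA_PTR and byte-swapped.  While the rounds execute, the message
 * schedule is advanced with the sha256msg1/paddd/sha256msg2 sequence: the
 * group for rounds \i+4 is completed in \m1, and the computation of a later
 * group is started in \m3.  This is why the caller rotates the four MSG
 * registers between invocations.
 */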
.macro do_4rounds	i, m0, m1, m2, m3
.if \i < 16
	movdqu		\i*4(DATA_PTR), \m0
	pshufb		SHUF_MASK, \m0
.endif
	movdqa		(\i-32)*4(SHA256CONSTANTS), MSG
	paddd		\m0, MSG
	sha256rnds2	STATE0, STATE1
.if \i >= 12 && \i < 60
	movdqa		\m0, TMP
	palignr		$4, \m3, TMP
	paddd		TMP, \m1
	sha256msg2	\m0, \m1
.endif
	punpckhqdq	MSG, MSG
	sha256rnds2	STATE1, STATE0
.if \i >= 4 && \i < 52
	sha256msg1	\m0, \m3
.endif
.endm

/*
 * Intel SHA Extensions optimized implementation of a SHA-256 block function
 *
 * This function takes a pointer to the current SHA-256 state, a pointer to the
 * input data, and the number of 64-byte blocks to process.  Once all blocks
 * have been processed, the state is updated with the new state.  This function
 * only processes complete blocks.  State initialization, buffering of partial
 * blocks, and digest finalization are expected to be handled elsewhere.
 *
 * void sha256_ni_transform(struct sha256_block_state *state,
 *			    const u8 *data, size_t nblocks);
 */
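
/*
 * A minimal sketch of how glue code might call this function (illustrative
 * only; the static key name and the fallback are assumptions, not part of
 * this file).  The XMM register usage must be wrapped in a kernel FPU
 * section:
 *
 *	static void sha256_blocks(struct sha256_block_state *state,
 *				  const u8 *data, size_t nblocks)
 *	{
 *		if (static_branch_likely(&have_sha_ni)) {
 *			kernel_fpu_begin();
 *			sha256_ni_transform(state, data, nblocks);
 *			kernel_fpu_end();
 *		} else {
 *			sha256_blocks_generic(state, data, nblocks);
 *		}
 *	}
 */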
.text
SYM_FUNC_START(sha256_ni_transform)

	shl		$6, NUM_BLKS		/* convert to bytes */
	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */

	/*
	 * load initial hash values
	 * Need to reorder these appropriately
	 * DCBA, HGFE -> ABEF, CDGH
	 */
	movdqu		0*16(STATE_PTR), STATE0		/* DCBA */
	movdqu		1*16(STATE_PTR), STATE1		/* HGFE */

	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* FEBA */
	punpckhqdq	TMP, STATE1			/* DCHG */
	pshufd		$0x1B, STATE0, STATE0		/* ABEF */
	pshufd		$0xB1, STATE1, STATE1		/* CDGH */

	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
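	/*
	 * Bias the constants pointer by 32 dwords so that the (\i-32)*4
	 * displacements used in do_4rounds span -128..112 and therefore fit
	 * in one-byte displacements for all 64 rounds.
	 */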
	lea		K256+32*4(%rip), SHA256CONSTANTS

.Lloop0:
	/* Save hash values for addition after rounds */
	movdqa		STATE0, ABEF_SAVE
	movdqa		STATE1, CDGH_SAVE

.irp i, 0, 16, 32, 48
	do_4rounds	(\i + 0),  MSG0, MSG1, MSG2, MSG3
	do_4rounds	(\i + 4),  MSG1, MSG2, MSG3, MSG0
	do_4rounds	(\i + 8),  MSG2, MSG3, MSG0, MSG1
	do_4rounds	(\i + 12), MSG3, MSG0, MSG1, MSG2
.endr

	/* Add current hash values with previously saved */
	paddd		ABEF_SAVE, STATE0
	paddd		CDGH_SAVE, STATE1

	/* Increment data pointer and loop if more to process */
	add		$64, DATA_PTR
	cmp		NUM_BLKS, DATA_PTR
	jne		.Lloop0

	/* Write hash values back in the correct order */
	movdqa		STATE0, TMP
	punpcklqdq	STATE1, STATE0			/* GHEF */
	punpckhqdq	TMP, STATE1			/* ABCD */
	pshufd		$0xB1, STATE0, STATE0		/* HGFE */
	pshufd		$0x1B, STATE1, STATE1		/* DCBA */

	movdqu		STATE1, 0*16(STATE_PTR)
	movdqu		STATE0, 1*16(STATE_PTR)

	RET
SYM_FUNC_END(sha256_ni_transform)

#undef STATE_PTR
#undef DATA_PTR
#undef NUM_BLKS
#undef SHA256CONSTANTS
#undef MSG
#undef STATE0
#undef STATE1
#undef MSG0
#undef MSG1
#undef MSG2
#undef MSG3
#undef TMP
#undef SHUF_MASK
#undef ABEF_SAVE
#undef CDGH_SAVE

// parameters for sha256_ni_finup2x()
#define CTX		%rdi
#define DATA1		%rsi
#define DATA2		%rdx
#define LEN		%ecx
#define LEN8		%cl
#define LEN64		%rcx
#define OUT1		%r8
#define OUT2		%r9

// other scalar variables
#define SHA256CONSTANTS	%rax
#define COUNT		%r10
#define COUNT32		%r10d
#define FINAL_STEP	%r11d

// rbx is used as a temporary.

#define MSG		%xmm0	// sha256rnds2 implicit operand
#define STATE0_A	%xmm1
#define STATE1_A	%xmm2
#define STATE0_B	%xmm3
#define STATE1_B	%xmm4
#define TMP_A		%xmm5
#define TMP_B		%xmm6
#define MSG0_A		%xmm7
#define MSG1_A		%xmm8
#define MSG2_A		%xmm9
#define MSG3_A		%xmm10
#define MSG0_B		%xmm11
#define MSG1_B		%xmm12
#define MSG2_B		%xmm13
#define MSG3_B		%xmm14
#define SHUF_MASK	%xmm15

#define OFFSETOF_STATE		0  // offsetof(struct __sha256_ctx, state)
#define OFFSETOF_BYTECOUNT	32 // offsetof(struct __sha256_ctx, bytecount)
#define OFFSETOF_BUF		40 // offsetof(struct __sha256_ctx, buf)
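
// For reference, a sketch of the context layout these offsets assume (the
// real definition of struct __sha256_ctx lives elsewhere; this sketch is
// only an illustration consistent with the offsets above):
//
//	struct __sha256_ctx {
//		u32 state[8];	// offset 0
//		u64 bytecount;	// offset 32
//		u8 buf[64];	// offset 40
//	};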

// Do 4 rounds of SHA-256 for each of two messages (interleaved).  m0_a and m0_b
// contain the current 4 message schedule words for the first and second message
// respectively.
//
// If not all the message schedule words have been computed yet, then this also
// computes 4 more message schedule words for each message.  m1_a-m3_a contain
// the next 3 groups of 4 message schedule words for the first message, and
// likewise m1_b-m3_b for the second.  After consuming the current value of
// m0_a, this macro computes the group after m3_a and writes it to m0_a, and
// likewise for *_b.  This means that the next (m0_a, m1_a, m2_a, m3_a) is the
// current (m1_a, m2_a, m3_a, m0_a), and likewise for *_b, so the caller must
// cycle through the registers accordingly.
.macro	do_4rounds_2x	i, m0_a, m1_a, m2_a, m3_a,  m0_b, m1_b, m2_b, m3_b
	movdqa		(\i-32)*4(SHA256CONSTANTS), TMP_A
	movdqa		TMP_A, TMP_B
	paddd		\m0_a, TMP_A	// message schedule words + round constants
	paddd		\m0_b, TMP_B	// likewise for the second message
.if \i < 48
	sha256msg1	\m1_a, \m0_a
	sha256msg1	\m1_b, \m0_b
.endif
	movdqa		TMP_A, MSG
	sha256rnds2	STATE0_A, STATE1_A	// first 2 of the 4 rounds
	movdqa		TMP_B, MSG
	sha256rnds2	STATE0_B, STATE1_B
	pshufd		$0x0E, TMP_A, MSG	// move high qword down for the last 2 rounds
	sha256rnds2	STATE1_A, STATE0_A
	pshufd		$0x0E, TMP_B, MSG
	sha256rnds2	STATE1_B, STATE0_B
.if \i < 48
	movdqa		\m3_a, TMP_A
	movdqa		\m3_b, TMP_B
	palignr		$4, \m2_a, TMP_A
	palignr		$4, \m2_b, TMP_B
	paddd		TMP_A, \m0_a
	paddd		TMP_B, \m0_b
	sha256msg2	\m3_a, \m0_a
	sha256msg2	\m3_b, \m0_b
.endif
.endm

//
// void sha256_ni_finup2x(const struct __sha256_ctx *ctx,
//			  const u8 *data1, const u8 *data2, int len,
//			  u8 out1[SHA256_DIGEST_SIZE],
//			  u8 out2[SHA256_DIGEST_SIZE]);
//
// This function computes the SHA-256 digests of two messages |data1| and
// |data2| that are both |len| bytes long, starting from the initial context
// |ctx|.  |len| must be at least SHA256_BLOCK_SIZE.
//
// The instructions for the two SHA-256 operations are interleaved.  On many
// CPUs, this is almost twice as fast as hashing each message individually due
// to taking better advantage of the CPU's SHA-256 and SIMD throughput.
//
SYM_FUNC_START(sha256_ni_finup2x)
	// Allocate 128 bytes of stack space, 16-byte aligned.
	push		%rbx
	push		%rbp
	mov		%rsp, %rbp
	sub		$128, %rsp
	and		$~15, %rsp

	// Load the shuffle mask for swapping the endianness of 32-bit words.
	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK

	// Set up pointer to the round constants.
	lea		K256+32*4(%rip), SHA256CONSTANTS

	// Initially we're not processing the final blocks.
	xor		FINAL_STEP, FINAL_STEP

	// Load the initial state from ctx->state.
	movdqu		OFFSETOF_STATE+0*16(CTX), STATE0_A	// DCBA
	movdqu		OFFSETOF_STATE+1*16(CTX), STATE1_A	// HGFE
	movdqa		STATE0_A, TMP_A
	punpcklqdq	STATE1_A, STATE0_A			// FEBA
	punpckhqdq	TMP_A, STATE1_A				// DCHG
	pshufd		$0x1B, STATE0_A, STATE0_A		// ABEF
	pshufd		$0xB1, STATE1_A, STATE1_A		// CDGH

	// Load ctx->bytecount.  Take the mod 64 of it to get the number of
	// bytes that are buffered in ctx->buf.  Also save it in a register with
	// LEN added to it.
	mov		LEN, LEN	// zero-extend LEN to 64 bits
	mov		OFFSETOF_BYTECOUNT(CTX), %rbx
	lea		(%rbx, LEN64, 1), COUNT	// COUNT = total message length
	and		$63, %ebx
	jz		.Lfinup2x_enter_loop	// No bytes buffered?

	// %ebx bytes (1 to 63) are currently buffered in ctx->buf.  Load them
	// followed by the first 64 - %ebx bytes of data.  Since LEN >= 64, we
	// just load 64 bytes from each of ctx->buf, DATA1, and DATA2
	// unconditionally and rearrange the data as needed.
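	//
	// For example, if 20 bytes are buffered: ctx->buf is copied to
	// sp[0..63], the first 64 bytes of DATA1 are stored at sp[20..83],
	// and the stitched block sp[0..63] (20 buffered bytes followed by
	// the first 44 data bytes) is loaded back into MSG0_A-MSG3_A.  The
	// same is then done for DATA2.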

	movdqu		OFFSETOF_BUF+0*16(CTX), MSG0_A
	movdqu		OFFSETOF_BUF+1*16(CTX), MSG1_A
	movdqu		OFFSETOF_BUF+2*16(CTX), MSG2_A
	movdqu		OFFSETOF_BUF+3*16(CTX), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)

	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqu		MSG0_A, 0*16(%rsp,%rbx)
	movdqu		MSG1_A, 1*16(%rsp,%rbx)
	movdqu		MSG2_A, 2*16(%rsp,%rbx)
	movdqu		MSG3_A, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_A
	movdqa		1*16(%rsp), MSG1_A
	movdqa		2*16(%rsp), MSG2_A
	movdqa		3*16(%rsp), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqu		MSG0_B, 0*16(%rsp,%rbx)
	movdqu		MSG1_B, 1*16(%rsp,%rbx)
	movdqu		MSG2_B, 2*16(%rsp,%rbx)
	movdqu		MSG3_B, 3*16(%rsp,%rbx)
	movdqa		0*16(%rsp), MSG0_B
	movdqa		1*16(%rsp), MSG1_B
	movdqa		2*16(%rsp), MSG2_B
	movdqa		3*16(%rsp), MSG3_B

	sub		$64, %rbx	// rbx = buffered - 64
	sub		%rbx, DATA1	// DATA1 += 64 - buffered
	sub		%rbx, DATA2	// DATA2 += 64 - buffered
	add		%ebx, LEN	// LEN += buffered - 64
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
	jmp		.Lfinup2x_loop_have_data

.Lfinup2x_enter_loop:
	sub		$64, LEN	// account for the block about to be processed
	movdqa		STATE0_A, STATE0_B
	movdqa		STATE1_A, STATE1_B
.Lfinup2x_loop:
	// Load the next two data blocks.
	movdqu		0*16(DATA1), MSG0_A
	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA1), MSG1_A
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA1), MSG2_A
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA1), MSG3_A
	movdqu		3*16(DATA2), MSG3_B
	add		$64, DATA1
	add		$64, DATA2
.Lfinup2x_loop_have_data:
	// Convert the words of the data blocks from big endian.
	pshufb		SHUF_MASK, MSG0_A
	pshufb		SHUF_MASK, MSG0_B
	pshufb		SHUF_MASK, MSG1_A
	pshufb		SHUF_MASK, MSG1_B
	pshufb		SHUF_MASK, MSG2_A
	pshufb		SHUF_MASK, MSG2_B
	pshufb		SHUF_MASK, MSG3_A
	pshufb		SHUF_MASK, MSG3_B
.Lfinup2x_loop_have_bswapped_data:

	// Save the original state for each block.
	movdqa		STATE0_A, 0*16(%rsp)
	movdqa		STATE0_B, 1*16(%rsp)
	movdqa		STATE1_A, 2*16(%rsp)
	movdqa		STATE1_B, 3*16(%rsp)

	// Do the SHA-256 rounds on each block.
.irp i, 0, 16, 32, 48
	do_4rounds_2x	(\i + 0),  MSG0_A, MSG1_A, MSG2_A, MSG3_A, \
				   MSG0_B, MSG1_B, MSG2_B, MSG3_B
	do_4rounds_2x	(\i + 4),  MSG1_A, MSG2_A, MSG3_A, MSG0_A, \
				   MSG1_B, MSG2_B, MSG3_B, MSG0_B
	do_4rounds_2x	(\i + 8),  MSG2_A, MSG3_A, MSG0_A, MSG1_A, \
				   MSG2_B, MSG3_B, MSG0_B, MSG1_B
	do_4rounds_2x	(\i + 12), MSG3_A, MSG0_A, MSG1_A, MSG2_A, \
				   MSG3_B, MSG0_B, MSG1_B, MSG2_B
.endr

	// Add the original state for each block.
	paddd		0*16(%rsp), STATE0_A
	paddd		1*16(%rsp), STATE0_B
	paddd		2*16(%rsp), STATE1_A
	paddd		3*16(%rsp), STATE1_B

	// Update LEN and loop back if more blocks remain.
	sub		$64, LEN
	jge		.Lfinup2x_loop

	// Check if any final blocks need to be handled.
	// FINAL_STEP = 2: all done
	// FINAL_STEP = 1: need to do count-only padding block
	// FINAL_STEP = 0: need to do the block with 0x80 padding byte
	cmp		$1, FINAL_STEP
	jg		.Lfinup2x_done
	je		.Lfinup2x_finalize_countonly
	add		$64, LEN
	jz		.Lfinup2x_finalize_blockaligned

	// Not block-aligned; 1 <= LEN <= 63 data bytes remain.  Pad the block.
	// To do this, write the padding starting with the 0x80 byte to
	// &sp[64].  Then for each message, copy the last 64 data bytes to sp
	// and load from &sp[64 - LEN] to get the needed padding block.  This
	// code relies on the data buffers being >= 64 bytes in length.
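	//
	// A rough C model of this trick, for one message (sp is the 128-byte
	// stack buffer; put_unaligned_be64() stands in for the shl/bswap/mov
	// sequence below; names are illustrative only):
	//
	//	memset(&sp[64], 0, 64);
	//	sp[64] = 0x80;
	//	if (len < 56)
	//		put_unaligned_be64(count << 3, &sp[120 - len]);
	//	memcpy(sp, data - (64 - len), 64); /* last 64 message bytes */
	//	block = &sp[64 - len];	/* len data bytes, then padding */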
	mov		$64, %ebx
	sub		LEN, %ebx		// ebx = 64 - LEN
	sub		%rbx, DATA1		// DATA1 -= 64 - LEN
	sub		%rbx, DATA2		// DATA2 -= 64 - LEN
	mov		$0x80, FINAL_STEP	// using FINAL_STEP as a temporary
	movd		FINAL_STEP, MSG0_A
	pxor		MSG1_A, MSG1_A
	movdqa		MSG0_A, 4*16(%rsp)	// 0x80 padding byte at &sp[64], zeroes after
	movdqa		MSG1_A, 5*16(%rsp)
	movdqa		MSG1_A, 6*16(%rsp)
	movdqa		MSG1_A, 7*16(%rsp)
	cmp		$56, LEN
	jge		1f	// will COUNT spill into its own block?
	shl		$3, COUNT	// convert byte count to bit count
	bswap		COUNT
	mov		COUNT, 56(%rsp,%rbx)	// be64 count at &sp[120 - LEN]
	mov		$2, FINAL_STEP	// won't need count-only block
	jmp		2f
1:
	mov		$1, FINAL_STEP	// will need count-only block
2:
	movdqu		0*16(DATA1), MSG0_A
	movdqu		1*16(DATA1), MSG1_A
	movdqu		2*16(DATA1), MSG2_A
	movdqu		3*16(DATA1), MSG3_A
	movdqa		MSG0_A, 0*16(%rsp)
	movdqa		MSG1_A, 1*16(%rsp)
	movdqa		MSG2_A, 2*16(%rsp)
	movdqa		MSG3_A, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_A
	movdqu		1*16(%rsp,%rbx), MSG1_A
	movdqu		2*16(%rsp,%rbx), MSG2_A
	movdqu		3*16(%rsp,%rbx), MSG3_A

	movdqu		0*16(DATA2), MSG0_B
	movdqu		1*16(DATA2), MSG1_B
	movdqu		2*16(DATA2), MSG2_B
	movdqu		3*16(DATA2), MSG3_B
	movdqa		MSG0_B, 0*16(%rsp)
	movdqa		MSG1_B, 1*16(%rsp)
	movdqa		MSG2_B, 2*16(%rsp)
	movdqa		MSG3_B, 3*16(%rsp)
	movdqu		0*16(%rsp,%rbx), MSG0_B
	movdqu		1*16(%rsp,%rbx), MSG1_B
	movdqu		2*16(%rsp,%rbx), MSG2_B
	movdqu		3*16(%rsp,%rbx), MSG3_B
	jmp		.Lfinup2x_loop_have_data

	// Prepare a padding block, either:
	//
	//	{0x80, 0, 0, 0, ..., count (as __be64)}
	//	This is for a block aligned message.
	//
	//	{   0, 0, 0, 0, ..., count (as __be64)}
	//	This is for a message whose length mod 64 is >= 56.
	//
	// Pre-swap the endianness of the words.
.Lfinup2x_finalize_countonly:
	pxor		MSG0_A, MSG0_A
	jmp		1f

.Lfinup2x_finalize_blockaligned:
	mov		$0x80000000, %ebx	// 0x80 byte, pre-swapped
	movd		%ebx, MSG0_A
1:
	pxor		MSG1_A, MSG1_A
	pxor		MSG2_A, MSG2_A
	// ror $29 == shl $3 (bytes => bits) combined with swapping the two
	// 32-bit halves, i.e. the pre-swapped form of the __be64 bit count.
	ror		$29, COUNT
	movq		COUNT, MSG3_A
	pslldq		$8, MSG3_A	// place the count in the last 8 bytes
	movdqa		MSG0_A, MSG0_B
	pxor		MSG1_B, MSG1_B
	pxor		MSG2_B, MSG2_B
	movdqa		MSG3_A, MSG3_B
	mov		$2, FINAL_STEP
	jmp		.Lfinup2x_loop_have_bswapped_data

.Lfinup2x_done:
	// Write the two digests with all bytes in the correct order.
	movdqa		STATE0_A, TMP_A
	movdqa		STATE0_B, TMP_B
	punpcklqdq	STATE1_A, STATE0_A		// GHEF
	punpcklqdq	STATE1_B, STATE0_B
	punpckhqdq	TMP_A, STATE1_A			// ABCD
	punpckhqdq	TMP_B, STATE1_B
	pshufd		$0xB1, STATE0_A, STATE0_A	// HGFE
	pshufd		$0xB1, STATE0_B, STATE0_B
	pshufd		$0x1B, STATE1_A, STATE1_A	// DCBA
	pshufd		$0x1B, STATE1_B, STATE1_B
	pshufb		SHUF_MASK, STATE0_A
	pshufb		SHUF_MASK, STATE0_B
	pshufb		SHUF_MASK, STATE1_A
	pshufb		SHUF_MASK, STATE1_B
	movdqu		STATE0_A, 1*16(OUT1)
	movdqu		STATE0_B, 1*16(OUT2)
	movdqu		STATE1_A, 0*16(OUT1)
	movdqu		STATE1_B, 0*16(OUT2)

	mov		%rbp, %rsp
	pop		%rbp
	pop		%rbx
	RET
SYM_FUNC_END(sha256_ni_finup2x)

.section	.rodata.cst256.K256, "aM", @progbits, 256
.align 64
K256:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
.align 16
PSHUFFLE_BYTE_FLIP_MASK:
	.octa 0x0c0d0e0f08090a0b0405060700010203