xref: /illumos-gate/usr/src/common/crypto/sha2/amd64/sha256-ni.S (revision 5d9d9091f564c198a760790b0bfa72c44e17912b)
1/*
2 * Intel SHA Extensions optimized implementation of a SHA-256 update function
3 *
4 * This file is provided under a dual BSD/GPLv2 license.  When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2015 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * 	Sean Gulley <sean.m.gulley@intel.com>
22 * 	Tim Chen <tim.c.chen@linux.intel.com>
23 *
24 * BSD LICENSE
25 *
26 * Copyright(c) 2015 Intel Corporation.
27 *
28 * Redistribution and use in source and binary forms, with or without
29 * modification, are permitted provided that the following conditions
30 * are met:
31 *
32 * 	* Redistributions of source code must retain the above copyright
33 * 	  notice, this list of conditions and the following disclaimer.
34 * 	* Redistributions in binary form must reproduce the above copyright
35 * 	  notice, this list of conditions and the following disclaimer in
36 * 	  the documentation and/or other materials provided with the
37 * 	  distribution.
38 * 	* Neither the name of Intel Corporation nor the names of its
39 * 	  contributors may be used to endorse or promote products derived
40 * 	  from this software without specific prior written permission.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
43 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
44 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
45 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
46 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
47 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
48 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
49 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
50 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
51 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
52 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
53 *
54 */
55
56/*
57 * Copyright (c) 2018, Joyent, Inc.
58 */
59
60/*
61 * illumos uses this file under the terms of the BSD license.
62 *
63 * The following are a series of changes that we have made to this code:
64 *
65 *  o Changed the include to be sys/asm_linkage.h.
66 *  o Use the sys/asm_linkage.h prototypes for assembly functions.
67 *  o Renamed the function from sha256_ni_transform to SHA256TransformBlocks to
68 *    match the illumos name for the function.
69 *  o The illumos SHA256_CTX does not have the digest as the first member of its
70 *    context struct. As such, an offset has to be added to the digest argument
71 *    to make sure that we get to the actual digest.
72 *  o Update the function prototype block comment to reflect that we are
73 *    passing the context and not the direct digest.
74 */
75
76#include <sys/asm_linkage.h>
77
/*
 * Register aliases used throughout SHA256TransformBlocks.
 *
 * The first three name the registers that carry the function's arguments.
 * Note that the code modifies all three: DIGEST_PTR is advanced to the
 * digest inside the context, DATA_PTR walks the input, and NUM_BLKS is
 * turned into an end-of-data pointer.
 */
78#define DIGEST_PTR	%rdi	/* 1st arg */
79#define DATA_PTR	%rsi	/* 2nd arg */
80#define NUM_BLKS	%rdx	/* 3rd arg */
81
/* Holds the address of the K256 table for the whole function. */
82#define SHA256CONSTANTS	%rax
83
/*
 * MSG has to be %xmm0: sha256rnds2 reads its message-plus-constant words
 * from %xmm0 implicitly.  STATE0/STATE1 hold the working hash state in
 * ABEF/CDGH order.  MSGTMP0-3 hold the message schedule words and MSGTMP4
 * is scratch (palignr temporaries and the state reordering).
 */
84#define MSG		%xmm0
85#define STATE0		%xmm1
86#define STATE1		%xmm2
87#define MSGTMP0		%xmm3
88#define MSGTMP1		%xmm4
89#define MSGTMP2		%xmm5
90#define MSGTMP3		%xmm6
91#define MSGTMP4		%xmm7
92
/* Loaded once with PSHUFFLE_BYTE_FLIP_MASK and applied to every input load. */
93#define SHUF_MASK	%xmm8
94
/* Copies of STATE0/STATE1 taken at the top of each block iteration. */
95#define ABEF_SAVE	%xmm9
96#define CDGH_SAVE	%xmm10
97
98/*
99 * Intel SHA Extensions optimized implementation of a SHA-256 update function
100 *
101 * The function takes a pointer to the current hash values, a pointer to the
102 * input data, and a number of 64 byte blocks to process.  Once all blocks have
103 * been processed, the digest is updated with the resulting hash value.
104 * The function only processes complete blocks, there is no functionality to
105 * store partial blocks.  All message padding and hash value initialization must
106 * be done outside the update function.
107 *
108 * The indented lines in the loop are instructions related to rounds processing.
109 * The non-indented lines are instructions related to the message schedule.
110 *
111 * void SHA256TransformBlocks(SHA256_CTX *ctx, const void *data,
112 *     uint32_t numBlocks);
113 * ctx: pointer to the context; the digest starts 8 bytes into it
114 * data: pointer to input data
115 * numBlocks: Number of blocks to process
116 */
117
118.text
119.align 32
120ENTRY_NP(SHA256TransformBlocks)
/*
 * On entry DIGEST_PTR (%rdi) is the context pointer, DATA_PTR (%rsi) the
 * input data and NUM_BLKS (%rdx) the block count.  The routine uses no
 * stack; it writes %rax, the three argument registers, %xmm0-%xmm10 and
 * the flags.
 *
 * NOTE(review): the shift below operates on all 64 bits of %rdx, while the
 * prototype comment for this function declares numBlocks as uint32_t.
 * Confirm that the real prototype is 64 bits wide or that callers always
 * pass a value whose upper 32 bits are zero.
 */
121
122	shl		$6, NUM_BLKS		/* convert to bytes */
	/* A zero byte count returns at once, leaving the digest untouched. */
123	jz		.Ldone_hash
124	add		DATA_PTR, NUM_BLKS	/* pointer to end of data */
125
126	/*
127	 * load initial hash values
128	 * Need to reorder these appropriately
129	 * DCBA, HGFE -> ABEF, CDGH
130	 *
131	 * Offset DIGEST_PTR to account for the algorithm in the context.
	 * DIGEST_PTR keeps this adjusted value, so the stores at the end of
	 * the function write back to the same 32 bytes that are loaded here.
132	 */
133	addq		$8, DIGEST_PTR
134	movdqu		0*16(DIGEST_PTR), STATE0
135	movdqu		1*16(DIGEST_PTR), STATE1
136
137	pshufd		$0xB1, STATE0,  STATE0		/* CDAB */
138	pshufd		$0x1B, STATE1,  STATE1		/* EFGH */
139	movdqa		STATE0, MSGTMP4
140	palignr		$8, STATE1,  STATE0		/* ABEF */
141	pblendw		$0xF0, MSGTMP4, STATE1		/* CDGH */
142
	/* Loop invariants: the byte-flip mask and the base of the K256 table. */
143	movdqa		PSHUFFLE_BYTE_FLIP_MASK(%rip), SHUF_MASK
144	lea		K256(%rip), SHA256CONSTANTS
145
	/*
	 * One iteration per 64-byte block.  STATE0/STATE1 carry the running
	 * hash, in ABEF/CDGH order, from one iteration to the next.
	 */
146.Lloop0:
147	/* Save hash values for addition after rounds */
148	movdqa		STATE0, ABEF_SAVE
149	movdqa		STATE1, CDGH_SAVE
150
	/*
	 * Every group below performs four rounds.  MSG receives four message
	 * words plus the matching four K256 words.  sha256rnds2 performs two
	 * rounds using the low two words of MSG (%xmm0, an implicit operand);
	 * pshufd $0x0E then moves the upper two words down for the second
	 * sha256rnds2.
	 *
	 * In the first four groups the message words come straight from the
	 * input: an unaligned 16-byte load followed by pshufb with SHUF_MASK,
	 * which reverses the bytes of each 32-bit word.
	 */
151	/* Rounds 0-3 */
152	movdqu		0*16(DATA_PTR), MSG
153	pshufb		SHUF_MASK, MSG
154	movdqa		MSG, MSGTMP0
155		paddd		0*16(SHA256CONSTANTS), MSG
156		sha256rnds2	STATE0, STATE1
157		pshufd 		$0x0E, MSG, MSG
158		sha256rnds2	STATE1, STATE0
159
160	/* Rounds 4-7 */
161	movdqu		1*16(DATA_PTR), MSG
162	pshufb		SHUF_MASK, MSG
163	movdqa		MSG, MSGTMP1
164		paddd		1*16(SHA256CONSTANTS), MSG
165		sha256rnds2	STATE0, STATE1
166		pshufd 		$0x0E, MSG, MSG
167		sha256rnds2	STATE1, STATE0
168	sha256msg1	MSGTMP1, MSGTMP0
169
170	/* Rounds 8-11 */
171	movdqu		2*16(DATA_PTR), MSG
172	pshufb		SHUF_MASK, MSG
173	movdqa		MSG, MSGTMP2
174		paddd		2*16(SHA256CONSTANTS), MSG
175		sha256rnds2	STATE0, STATE1
176		pshufd 		$0x0E, MSG, MSG
177		sha256rnds2	STATE1, STATE0
178	sha256msg1	MSGTMP2, MSGTMP1
179
	/*
	 * From this group on, the message schedule is extended in registers.
	 * The non-indented movdqa/palignr/paddd/sha256msg2 sequence finishes
	 * the four words that the next group of rounds consumes, and
	 * sha256msg1 starts the words needed further ahead.  MSGTMP0-3 are
	 * reused in rotation; MSGTMP4 is scratch for the palignr result.
	 */
180	/* Rounds 12-15 */
181	movdqu		3*16(DATA_PTR), MSG
182	pshufb		SHUF_MASK, MSG
183	movdqa		MSG, MSGTMP3
184		paddd		3*16(SHA256CONSTANTS), MSG
185		sha256rnds2	STATE0, STATE1
186	movdqa		MSGTMP3, MSGTMP4
187	palignr		$4, MSGTMP2, MSGTMP4
188	paddd		MSGTMP4, MSGTMP0
189	sha256msg2	MSGTMP3, MSGTMP0
190		pshufd 		$0x0E, MSG, MSG
191		sha256rnds2	STATE1, STATE0
192	sha256msg1	MSGTMP3, MSGTMP2
193
194	/* Rounds 16-19 */
195	movdqa		MSGTMP0, MSG
196		paddd		4*16(SHA256CONSTANTS), MSG
197		sha256rnds2	STATE0, STATE1
198	movdqa		MSGTMP0, MSGTMP4
199	palignr		$4, MSGTMP3, MSGTMP4
200	paddd		MSGTMP4, MSGTMP1
201	sha256msg2	MSGTMP0, MSGTMP1
202		pshufd 		$0x0E, MSG, MSG
203		sha256rnds2	STATE1, STATE0
204	sha256msg1	MSGTMP0, MSGTMP3
205
206	/* Rounds 20-23 */
207	movdqa		MSGTMP1, MSG
208		paddd		5*16(SHA256CONSTANTS), MSG
209		sha256rnds2	STATE0, STATE1
210	movdqa		MSGTMP1, MSGTMP4
211	palignr		$4, MSGTMP0, MSGTMP4
212	paddd		MSGTMP4, MSGTMP2
213	sha256msg2	MSGTMP1, MSGTMP2
214		pshufd 		$0x0E, MSG, MSG
215		sha256rnds2	STATE1, STATE0
216	sha256msg1	MSGTMP1, MSGTMP0
217
218	/* Rounds 24-27 */
219	movdqa		MSGTMP2, MSG
220		paddd		6*16(SHA256CONSTANTS), MSG
221		sha256rnds2	STATE0, STATE1
222	movdqa		MSGTMP2, MSGTMP4
223	palignr		$4, MSGTMP1, MSGTMP4
224	paddd		MSGTMP4, MSGTMP3
225	sha256msg2	MSGTMP2, MSGTMP3
226		pshufd 		$0x0E, MSG, MSG
227		sha256rnds2	STATE1, STATE0
228	sha256msg1	MSGTMP2, MSGTMP1
229
230	/* Rounds 28-31 */
231	movdqa		MSGTMP3, MSG
232		paddd		7*16(SHA256CONSTANTS), MSG
233		sha256rnds2	STATE0, STATE1
234	movdqa		MSGTMP3, MSGTMP4
235	palignr		$4, MSGTMP2, MSGTMP4
236	paddd		MSGTMP4, MSGTMP0
237	sha256msg2	MSGTMP3, MSGTMP0
238		pshufd 		$0x0E, MSG, MSG
239		sha256rnds2	STATE1, STATE0
240	sha256msg1	MSGTMP3, MSGTMP2
241
242	/* Rounds 32-35 */
243	movdqa		MSGTMP0, MSG
244		paddd		8*16(SHA256CONSTANTS), MSG
245		sha256rnds2	STATE0, STATE1
246	movdqa		MSGTMP0, MSGTMP4
247	palignr		$4, MSGTMP3, MSGTMP4
248	paddd		MSGTMP4, MSGTMP1
249	sha256msg2	MSGTMP0, MSGTMP1
250		pshufd 		$0x0E, MSG, MSG
251		sha256rnds2	STATE1, STATE0
252	sha256msg1	MSGTMP0, MSGTMP3
253
254	/* Rounds 36-39 */
255	movdqa		MSGTMP1, MSG
256		paddd		9*16(SHA256CONSTANTS), MSG
257		sha256rnds2	STATE0, STATE1
258	movdqa		MSGTMP1, MSGTMP4
259	palignr		$4, MSGTMP0, MSGTMP4
260	paddd		MSGTMP4, MSGTMP2
261	sha256msg2	MSGTMP1, MSGTMP2
262		pshufd 		$0x0E, MSG, MSG
263		sha256rnds2	STATE1, STATE0
264	sha256msg1	MSGTMP1, MSGTMP0
265
266	/* Rounds 40-43 */
267	movdqa		MSGTMP2, MSG
268		paddd		10*16(SHA256CONSTANTS), MSG
269		sha256rnds2	STATE0, STATE1
270	movdqa		MSGTMP2, MSGTMP4
271	palignr		$4, MSGTMP1, MSGTMP4
272	paddd		MSGTMP4, MSGTMP3
273	sha256msg2	MSGTMP2, MSGTMP3
274		pshufd 		$0x0E, MSG, MSG
275		sha256rnds2	STATE1, STATE0
276	sha256msg1	MSGTMP2, MSGTMP1
277
278	/* Rounds 44-47 */
279	movdqa		MSGTMP3, MSG
280		paddd		11*16(SHA256CONSTANTS), MSG
281		sha256rnds2	STATE0, STATE1
282	movdqa		MSGTMP3, MSGTMP4
283	palignr		$4, MSGTMP2, MSGTMP4
284	paddd		MSGTMP4, MSGTMP0
285	sha256msg2	MSGTMP3, MSGTMP0
286		pshufd 		$0x0E, MSG, MSG
287		sha256rnds2	STATE1, STATE0
288	sha256msg1	MSGTMP3, MSGTMP2
289
290	/* Rounds 48-51 */
291	movdqa		MSGTMP0, MSG
292		paddd		12*16(SHA256CONSTANTS), MSG
293		sha256rnds2	STATE0, STATE1
294	movdqa		MSGTMP0, MSGTMP4
295	palignr		$4, MSGTMP3, MSGTMP4
296	paddd		MSGTMP4, MSGTMP1
297	sha256msg2	MSGTMP0, MSGTMP1
298		pshufd 		$0x0E, MSG, MSG
299		sha256rnds2	STATE1, STATE0
300	sha256msg1	MSGTMP0, MSGTMP3
301
	/*
	 * The remaining groups issue no sha256msg1: the two groups below only
	 * finish words already in progress, and the final group only consumes
	 * MSGTMP3.
	 */
302	/* Rounds 52-55 */
303	movdqa		MSGTMP1, MSG
304		paddd		13*16(SHA256CONSTANTS), MSG
305		sha256rnds2	STATE0, STATE1
306	movdqa		MSGTMP1, MSGTMP4
307	palignr		$4, MSGTMP0, MSGTMP4
308	paddd		MSGTMP4, MSGTMP2
309	sha256msg2	MSGTMP1, MSGTMP2
310		pshufd 		$0x0E, MSG, MSG
311		sha256rnds2	STATE1, STATE0
312
313	/* Rounds 56-59 */
314	movdqa		MSGTMP2, MSG
315		paddd		14*16(SHA256CONSTANTS), MSG
316		sha256rnds2	STATE0, STATE1
317	movdqa		MSGTMP2, MSGTMP4
318	palignr		$4, MSGTMP1, MSGTMP4
319	paddd		MSGTMP4, MSGTMP3
320	sha256msg2	MSGTMP2, MSGTMP3
321		pshufd 		$0x0E, MSG, MSG
322		sha256rnds2	STATE1, STATE0
323
324	/* Rounds 60-63 */
325	movdqa		MSGTMP3, MSG
326		paddd		15*16(SHA256CONSTANTS), MSG
327		sha256rnds2	STATE0, STATE1
328		pshufd 		$0x0E, MSG, MSG
329		sha256rnds2	STATE1, STATE0
330
331	/* Add current hash values with previously saved */
332	paddd		ABEF_SAVE, STATE0
333	paddd		CDGH_SAVE, STATE1
334
335	/* Increment data pointer and loop if more to process */
	/* NUM_BLKS holds the end-of-data pointer computed at entry. */
336	add		$64, DATA_PTR
337	cmp		NUM_BLKS, DATA_PTR
338	jne		.Lloop0
339
340	/* Write hash values back in the correct order */
	/* This undoes the ABEF/CDGH reordering applied when the state was loaded. */
341	pshufd		$0x1B, STATE0,  STATE0		/* FEBA */
342	pshufd		$0xB1, STATE1,  STATE1		/* DCHG */
343	movdqa		STATE0, MSGTMP4
344	pblendw		$0xF0, STATE1,  STATE0		/* DCBA */
345	palignr		$8, MSGTMP4, STATE1		/* HGFE */
346
347	movdqu		STATE0, 0*16(DIGEST_PTR)
348	movdqu		STATE1, 1*16(DIGEST_PTR)
349
350.Ldone_hash:
351
352	ret
353SET_SIZE(SHA256TransformBlocks)
354
/*
 * SHA-256 round constants: 64 32-bit words, laid out as 16 rows of four.
 * Each row is the 16-byte memory operand of one
 * `paddd N*16(SHA256CONSTANTS), MSG` in SHA256TransformBlocks, so row N
 * supplies the constants for rounds 4N to 4N+3.  A paddd memory operand
 * must be 16-byte aligned; the 64-byte alignment below satisfies that.
 * The table sits in a mergeable read-only section with a 256-byte entry
 * size.
 */
355.section	.rodata.cst256.K256, "aM", @progbits, 256
356.align 64
357K256:
358	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
359	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
360	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
361	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
362	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
363	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
364	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
365	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
366	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
367	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
368	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
369	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
370	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
371	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
372	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
373	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
374
/*
 * pshufb control mask that reverses the byte order within each of the four
 * 32-bit words of a 16-byte vector.  SHA256TransformBlocks loads it once
 * with movdqa, which needs the 16-byte alignment given here, and applies it
 * to every 16-byte chunk of input.  It is a single 16-byte entry in a
 * mergeable read-only section.
 */
375.section	.rodata.cst16.PSHUFFLE_BYTE_FLIP_MASK, "aM", @progbits, 16
376.align 16
377PSHUFFLE_BYTE_FLIP_MASK:
378	.octa 0x0c0d0e0f08090a0b0405060700010203
379