xref: /linux/lib/crypto/x86/sha1-avx2-asm.S (revision 54fd6bd42e7bd351802ff1d193a2e33e4bfb1836)
1/*
2 *	Implement fast SHA-1 with AVX2 instructions. (x86_64)
3 *
4 * This file is provided under a dual BSD/GPLv2 license.  When using or
5 * redistributing this file, you may do so under either license.
6 *
7 * GPL LICENSE SUMMARY
8 *
9 * Copyright(c) 2014 Intel Corporation.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of version 2 of the GNU General Public License as
13 * published by the Free Software Foundation.
14 *
15 * This program is distributed in the hope that it will be useful, but
16 * WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18 * General Public License for more details.
19 *
20 * Contact Information:
21 * Ilya Albrekht <ilya.albrekht@intel.com>
22 * Maxim Locktyukhin <maxim.locktyukhin@intel.com>
23 * Ronen Zohar <ronen.zohar@intel.com>
24 * Chandramouli Narayanan <mouli@linux.intel.com>
25 *
26 * BSD LICENSE
27 *
28 * Copyright(c) 2014 Intel Corporation.
29 *
30 * Redistribution and use in source and binary forms, with or without
31 * modification, are permitted provided that the following conditions
32 * are met:
33 *
34 * Redistributions of source code must retain the above copyright
35 * notice, this list of conditions and the following disclaimer.
36 * Redistributions in binary form must reproduce the above copyright
37 * notice, this list of conditions and the following disclaimer in
38 * the documentation and/or other materials provided with the
39 * distribution.
40 * Neither the name of Intel Corporation nor the names of its
41 * contributors may be used to endorse or promote products derived
42 * from this software without specific prior written permission.
43 *
44 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
45 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
46 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
47 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
48 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
49 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
50 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
51 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
52 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
53 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
54 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
55 *
56 */
57
58/*
59 * SHA-1 implementation with Intel(R) AVX2 instruction set extensions.
60 *
61 *This implementation is based on the previous SSSE3 release:
62 *Visit http://software.intel.com/en-us/articles/
63 *and refer to improving-the-performance-of-the-secure-hash-algorithm-1/
64 *
65 * void sha1_transform_avx2(struct sha1_block_state *state,
66 *			    const u8 *data, size_t nblocks);
67 */
68
69#include <linux/linkage.h>
70
71#define	CTX	%rdi	/* arg1 */
72#define BUF	%rsi	/* arg2 */
73#define CNT	%rdx	/* arg3 */
74
75#define	REG_A	%ecx
76#define	REG_B	%esi
77#define	REG_C	%edi
78#define	REG_D	%eax
79#define	REG_E	%edx
80#define	REG_TB	%ebx
81#define	REG_TA	%r12d
82#define	REG_RA	%rcx
83#define	REG_RB	%rsi
84#define	REG_RC	%rdi
85#define	REG_RD	%rax
86#define	REG_RE	%rdx
87#define	REG_RTA	%r12
88#define	REG_RTB	%rbx
89#define	REG_T1	%r11d
90#define	xmm_mov	vmovups
91#define	avx2_zeroupper	vzeroupper
92#define	RND_F1	1
93#define	RND_F2	2
94#define	RND_F3	3
95
96.macro REGALLOC
97	.set A, REG_A
98	.set B, REG_B
99	.set C, REG_C
100	.set D, REG_D
101	.set E, REG_E
102	.set TB, REG_TB
103	.set TA, REG_TA
104
105	.set RA, REG_RA
106	.set RB, REG_RB
107	.set RC, REG_RC
108	.set RD, REG_RD
109	.set RE, REG_RE
110
111	.set RTA, REG_RTA
112	.set RTB, REG_RTB
113
114	.set T1, REG_T1
115.endm
116
117#define HASH_PTR	%r9
118#define BLOCKS_CTR	%r8
119#define BUFFER_PTR	%r10
120#define BUFFER_PTR2	%r13
121
122#define PRECALC_BUF	%r14
123#define WK_BUF		%r15
124
125#define W_TMP		%xmm0
126#define WY_TMP		%ymm0
127#define WY_TMP2		%ymm9
128
129# AVX2 variables
130#define WY0		%ymm3
131#define WY4		%ymm5
132#define WY08		%ymm7
133#define WY12		%ymm8
134#define WY16		%ymm12
135#define WY20		%ymm13
136#define WY24		%ymm14
137#define WY28		%ymm15
138
139#define YMM_SHUFB_BSWAP	%ymm10
140
141/*
142 * Keep 2 iterations precalculated at a time:
143 *    - 80 DWORDs per iteration * 2
144 */
145#define W_SIZE		(80*2*2 +16)
146
147#define WK(t)	((((t) % 80) / 4)*32 + ( (t) % 4)*4 + ((t)/80)*16 )(WK_BUF)
148#define PRECALC_WK(t)	((t)*2*2)(PRECALC_BUF)
149
150
151.macro UPDATE_HASH  hash, val
152	add	\hash, \val
153	mov	\val, \hash
154.endm
155
156.macro PRECALC_RESET_WY
157	.set WY_00, WY0
158	.set WY_04, WY4
159	.set WY_08, WY08
160	.set WY_12, WY12
161	.set WY_16, WY16
162	.set WY_20, WY20
163	.set WY_24, WY24
164	.set WY_28, WY28
165	.set WY_32, WY_00
166.endm
167
168.macro PRECALC_ROTATE_WY
169	/* Rotate macros */
170	.set WY_32, WY_28
171	.set WY_28, WY_24
172	.set WY_24, WY_20
173	.set WY_20, WY_16
174	.set WY_16, WY_12
175	.set WY_12, WY_08
176	.set WY_08, WY_04
177	.set WY_04, WY_00
178	.set WY_00, WY_32
179
180	/* Define register aliases */
181	.set WY, WY_00
182	.set WY_minus_04, WY_04
183	.set WY_minus_08, WY_08
184	.set WY_minus_12, WY_12
185	.set WY_minus_16, WY_16
186	.set WY_minus_20, WY_20
187	.set WY_minus_24, WY_24
188	.set WY_minus_28, WY_28
189	.set WY_minus_32, WY
190.endm
191
192.macro PRECALC_00_15
193	.if (i == 0) # Initialize and rotate registers
194		PRECALC_RESET_WY
195		PRECALC_ROTATE_WY
196	.endif
197
198	/* message scheduling pre-compute for rounds 0-15 */
199	.if   ((i & 7) == 0)
200		/*
201		 * blended AVX2 and ALU instruction scheduling
202		 * 1 vector iteration per 8 rounds
203		 */
204		vmovdqu (i * 2)(BUFFER_PTR), W_TMP
205	.elseif ((i & 7) == 1)
206		vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\
207			 WY_TMP, WY_TMP
208	.elseif ((i & 7) == 2)
209		vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY
210	.elseif ((i & 7) == 4)
211		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
212	.elseif ((i & 7) == 7)
213		vmovdqu  WY_TMP, PRECALC_WK(i&~7)
214
215		PRECALC_ROTATE_WY
216	.endif
217.endm
218
219.macro PRECALC_16_31
220	/*
221	 * message scheduling pre-compute for rounds 16-31
222	 * calculating last 32 w[i] values in 8 XMM registers
223	 * pre-calculate K+w[i] values and store to mem
224	 * for later load by ALU add instruction
225	 *
226	 * "brute force" vectorization for rounds 16-31 only
227	 * due to w[i]->w[i-3] dependency
228	 */
229	.if   ((i & 7) == 0)
230		/*
231		 * blended AVX2 and ALU instruction scheduling
232		 * 1 vector iteration per 8 rounds
233		 */
234		/* w[i-14] */
235		vpalignr	$8, WY_minus_16, WY_minus_12, WY
236		vpsrldq	$4, WY_minus_04, WY_TMP               /* w[i-3] */
237	.elseif ((i & 7) == 1)
238		vpxor	WY_minus_08, WY, WY
239		vpxor	WY_minus_16, WY_TMP, WY_TMP
240	.elseif ((i & 7) == 2)
241		vpxor	WY_TMP, WY, WY
242		vpslldq	$12, WY, WY_TMP2
243	.elseif ((i & 7) == 3)
244		vpslld	$1, WY, WY_TMP
245		vpsrld	$31, WY, WY
246	.elseif ((i & 7) == 4)
247		vpor	WY, WY_TMP, WY_TMP
248		vpslld	$2, WY_TMP2, WY
249	.elseif ((i & 7) == 5)
250		vpsrld	$30, WY_TMP2, WY_TMP2
251		vpxor	WY, WY_TMP, WY_TMP
252	.elseif ((i & 7) == 7)
253		vpxor	WY_TMP2, WY_TMP, WY
254		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
255		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
256
257		PRECALC_ROTATE_WY
258	.endif
259.endm
260
261.macro PRECALC_32_79
262	/*
263	 * in SHA-1 specification:
264	 * w[i] = (w[i-3] ^ w[i-8]  ^ w[i-14] ^ w[i-16]) rol 1
265	 * instead we do equal:
266	 * w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
267	 * allows more efficient vectorization
268	 * since w[i]=>w[i-3] dependency is broken
269	 */
270
271	.if   ((i & 7) == 0)
272	/*
273	 * blended AVX2 and ALU instruction scheduling
274	 * 1 vector iteration per 8 rounds
275	 */
276		vpalignr	$8, WY_minus_08, WY_minus_04, WY_TMP
277	.elseif ((i & 7) == 1)
278		/* W is W_minus_32 before xor */
279		vpxor	WY_minus_28, WY, WY
280	.elseif ((i & 7) == 2)
281		vpxor	WY_minus_16, WY_TMP, WY_TMP
282	.elseif ((i & 7) == 3)
283		vpxor	WY_TMP, WY, WY
284	.elseif ((i & 7) == 4)
285		vpslld	$2, WY, WY_TMP
286	.elseif ((i & 7) == 5)
287		vpsrld	$30, WY, WY
288		vpor	WY, WY_TMP, WY
289	.elseif ((i & 7) == 7)
290		vpaddd  K_XMM + K_XMM_AR(%rip), WY, WY_TMP
291		vmovdqu	WY_TMP, PRECALC_WK(i&~7)
292
293		PRECALC_ROTATE_WY
294	.endif
295.endm
296
297.macro PRECALC r, s
298	.set i, \r
299
300	.if (i < 40)
301		.set K_XMM, 32*0
302	.elseif (i < 80)
303		.set K_XMM, 32*1
304	.elseif (i < 120)
305		.set K_XMM, 32*2
306	.else
307		.set K_XMM, 32*3
308	.endif
309
310	.if (i<32)
311		PRECALC_00_15	\s
312	.elseif (i<64)
313		PRECALC_16_31	\s
314	.elseif (i < 160)
315		PRECALC_32_79	\s
316	.endif
317.endm
318
319.macro ROTATE_STATE
320	.set T_REG, E
321	.set E, D
322	.set D, C
323	.set C, B
324	.set B, TB
325	.set TB, A
326	.set A, T_REG
327
328	.set T_REG, RE
329	.set RE, RD
330	.set RD, RC
331	.set RC, RB
332	.set RB, RTB
333	.set RTB, RA
334	.set RA, T_REG
335.endm
336
337/* Macro relies on saved ROUND_Fx */
338
339.macro RND_FUN f, r
340	.if (\f == RND_F1)
341		ROUND_F1	\r
342	.elseif (\f == RND_F2)
343		ROUND_F2	\r
344	.elseif (\f == RND_F3)
345		ROUND_F3	\r
346	.endif
347.endm
348
349.macro RR r
350	.set round_id, (\r % 80)
351
352	.if (round_id == 0)        /* Precalculate F for first round */
353		.set ROUND_FUNC, RND_F1
354		mov	B, TB
355
356		rorx	$(32-30), B, B    /* b>>>2 */
357		andn	D, TB, T1
358		and	C, TB
359		xor	T1, TB
360	.endif
361
362	RND_FUN ROUND_FUNC, \r
363	ROTATE_STATE
364
365	.if   (round_id == 18)
366		.set ROUND_FUNC, RND_F2
367	.elseif (round_id == 38)
368		.set ROUND_FUNC, RND_F3
369	.elseif (round_id == 58)
370		.set ROUND_FUNC, RND_F2
371	.endif
372
373	.set round_id, ( (\r+1) % 80)
374
375	RND_FUN ROUND_FUNC, (\r+1)
376	ROTATE_STATE
377.endm
378
379.macro ROUND_F1 r
380	add	WK(\r), E
381
382	andn	C, A, T1			/* ~b&d */
383	lea	(RE,RTB), E		/* Add F from the previous round */
384
385	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
386	rorx	$(32-30),A, TB		/* b>>>2 for next round */
387
388	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
389
390	/*
391	 * Calculate F for the next round
392	 * (b & c) ^ andn[b, d]
393	 */
394	and	B, A			/* b&c */
395	xor	T1, A			/* F1 = (b&c) ^ (~b&d) */
396
397	lea	(RE,RTA), E		/* E += A >>> 5 */
398.endm
399
400.macro ROUND_F2 r
401	add	WK(\r), E
402	lea	(RE,RTB), E		/* Add F from the previous round */
403
404	/* Calculate F for the next round */
405	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
406	.if ((round_id) < 79)
407		rorx	$(32-30), A, TB	/* b>>>2 for next round */
408	.endif
409	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
410
411	.if ((round_id) < 79)
412		xor	B, A
413	.endif
414
415	add	TA, E			/* E += A >>> 5 */
416
417	.if ((round_id) < 79)
418		xor	C, A
419	.endif
420.endm
421
422.macro ROUND_F3 r
423	add	WK(\r), E
424	PRECALC	(\r)			/* msg scheduling for next 2 blocks */
425
426	lea	(RE,RTB), E		/* Add F from the previous round */
427
428	mov	B, T1
429	or	A, T1
430
431	rorx	$(32-5), A, TA		/* T2 = A >>> 5 */
432	rorx	$(32-30), A, TB		/* b>>>2 for next round */
433
434	/* Calculate F for the next round
435	 * (b and c) or (d and (b or c))
436	 */
437	and	C, T1
438	and	B, A
439	or	T1, A
440
441	add	TA, E			/* E += A >>> 5 */
442
443.endm
444
445/* Add constant only if (%2 > %3) condition met (uses RTA as temp)
446 * %1 + %2 >= %3 ? %4 : 0
447 */
448.macro ADD_IF_GE a, b, c, d
449	mov     \a, RTA
450	add     $\d, RTA
451	cmp     $\c, \b
452	cmovge  RTA, \a
453.endm
454
455/*
456 * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining
457 */
458.macro SHA1_PIPELINED_MAIN_BODY
459
460	REGALLOC
461
462	mov	(HASH_PTR), A
463	mov	4(HASH_PTR), B
464	mov	8(HASH_PTR), C
465	mov	12(HASH_PTR), D
466	mov	16(HASH_PTR), E
467
468	mov	%rsp, PRECALC_BUF
469	lea	(2*4*80+32)(%rsp), WK_BUF
470
471	# Precalc WK for first 2 blocks
472	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64
473	.set i, 0
474	.rept    160
475		PRECALC i
476		.set i, i + 1
477	.endr
478
479	/* Go to next block if needed */
480	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128
481	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
482	xchg	WK_BUF, PRECALC_BUF
483
484	.align 32
485.L_loop:
486	/*
487	 * code loops through more than one block
488	 * we use K_BASE value as a signal of a last block,
489	 * it is set below by: cmovae BUFFER_PTR, K_BASE
490	 */
491	test BLOCKS_CTR, BLOCKS_CTR
492	jnz .L_begin
493	.align 32
494	jmp	.L_end
495	.align 32
496.L_begin:
497
498	/*
499	 * Do first block
500	 * rounds: 0,2,4,6,8
501	 */
502	.set j, 0
503	.rept 5
504		RR	j
505		.set j, j+2
506	.endr
507
508	/*
509	 * rounds:
510	 * 10,12,14,16,18
511	 * 20,22,24,26,28
512	 * 30,32,34,36,38
513	 * 40,42,44,46,48
514	 * 50,52,54,56,58
515	 */
516	.rept 25
517		RR	j
518		.set j, j+2
519	.endr
520
521	/* Update Counter */
522	sub $1, BLOCKS_CTR
523	/* Move to the next block only if needed*/
524	ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128
525	/*
526	 * rounds
527	 * 60,62,64,66,68
528	 * 70,72,74,76,78
529	 */
530	.rept 10
531		RR	j
532		.set j, j+2
533	.endr
534
535	UPDATE_HASH	(HASH_PTR), A
536	UPDATE_HASH	4(HASH_PTR), TB
537	UPDATE_HASH	8(HASH_PTR), C
538	UPDATE_HASH	12(HASH_PTR), D
539	UPDATE_HASH	16(HASH_PTR), E
540
541	test	BLOCKS_CTR, BLOCKS_CTR
542	jz	.L_loop
543
544	mov	TB, B
545
546	/* Process second block */
547	/*
548	 * rounds
549	 *  0+80, 2+80, 4+80, 6+80, 8+80
550	 * 10+80,12+80,14+80,16+80,18+80
551	 */
552
553	.set j, 0
554	.rept 10
555		RR	j+80
556		.set j, j+2
557	.endr
558
559	/*
560	 * rounds
561	 * 20+80,22+80,24+80,26+80,28+80
562	 * 30+80,32+80,34+80,36+80,38+80
563	 */
564	.rept 10
565		RR	j+80
566		.set j, j+2
567	.endr
568
569	/*
570	 * rounds
571	 * 40+80,42+80,44+80,46+80,48+80
572	 * 50+80,52+80,54+80,56+80,58+80
573	 */
574	.rept 10
575		RR	j+80
576		.set j, j+2
577	.endr
578
579	/* update counter */
580	sub     $1, BLOCKS_CTR
581	/* Move to the next block only if needed*/
582	ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128
583
584	/*
585	 * rounds
586	 * 60+80,62+80,64+80,66+80,68+80
587	 * 70+80,72+80,74+80,76+80,78+80
588	 */
589	.rept 10
590		RR	j+80
591		.set j, j+2
592	.endr
593
594	UPDATE_HASH	(HASH_PTR), A
595	UPDATE_HASH	4(HASH_PTR), TB
596	UPDATE_HASH	8(HASH_PTR), C
597	UPDATE_HASH	12(HASH_PTR), D
598	UPDATE_HASH	16(HASH_PTR), E
599
600	/* Reset state for AVX2 reg permutation */
601	mov	A, TA
602	mov	TB, A
603	mov	C, TB
604	mov	E, C
605	mov	D, B
606	mov	TA, D
607
608	REGALLOC
609
610	xchg	WK_BUF, PRECALC_BUF
611
612	jmp	.L_loop
613
614	.align 32
615.L_end:
616
617.endm
618/*
619 * macro implements SHA-1 function's body for several 64-byte blocks
620 * param: function's name
621 */
622.macro SHA1_VECTOR_ASM  name
623	SYM_FUNC_START(\name)
624
625	push	%rbx
626	push	%r12
627	push	%r13
628	push	%r14
629	push	%r15
630
631	RESERVE_STACK  = (W_SIZE*4 + 8+24)
632
633	/* Align stack */
634	push	%rbp
635	mov	%rsp, %rbp
636	and	$~(0x20-1), %rsp
637	sub	$RESERVE_STACK, %rsp
638
639	avx2_zeroupper
640
641	/* Setup initial values */
642	mov	CTX, HASH_PTR
643	mov	BUF, BUFFER_PTR
644
645	mov	BUF, BUFFER_PTR2
646	mov	CNT, BLOCKS_CTR
647
648	xmm_mov	BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP
649
650	SHA1_PIPELINED_MAIN_BODY
651
652	avx2_zeroupper
653
654	mov	%rbp, %rsp
655	pop	%rbp
656
657	pop	%r15
658	pop	%r14
659	pop	%r13
660	pop	%r12
661	pop	%rbx
662
663	RET
664
665	SYM_FUNC_END(\name)
666.endm
667
668.section .rodata
669
670#define K1 0x5a827999
671#define K2 0x6ed9eba1
672#define K3 0x8f1bbcdc
673#define K4 0xca62c1d6
674
675.align 128
676K_XMM_AR:
677	.long K1, K1, K1, K1
678	.long K1, K1, K1, K1
679	.long K2, K2, K2, K2
680	.long K2, K2, K2, K2
681	.long K3, K3, K3, K3
682	.long K3, K3, K3, K3
683	.long K4, K4, K4, K4
684	.long K4, K4, K4, K4
685
686BSWAP_SHUFB_CTL:
687	.long 0x00010203
688	.long 0x04050607
689	.long 0x08090a0b
690	.long 0x0c0d0e0f
691	.long 0x00010203
692	.long 0x04050607
693	.long 0x08090a0b
694	.long 0x0c0d0e0f
695.text
696
697SHA1_VECTOR_ASM     sha1_transform_avx2
698