xref: /freebsd/lib/libmd/amd64/sha1block.S (revision 207f3b2b25eaa0f9d32699e664b139e5e40e5450)
1/*-
2 * Copyright (c) 2013 The Go Authors. All rights reserved.
3 * Copyright (c) 2024 Robert Clausecker <fuz@freebsd.org>
4 *
5 * Adapted from Go's crypto/sha1/sha1block_amd64.s.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions are
9 * met:
10 *
11 *   * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 *   * Redistributions in binary form must reproduce the above
14 * copyright notice, this list of conditions and the following disclaimer
15 * in the documentation and/or other materials provided with the
16 * distribution.
17 *   * Neither the name of Google Inc. nor the names of its
18 * contributors may be used to endorse or promote products derived from
19 * this software without specific prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32 */
33
34#include <machine/asm.h>
35
36/*
37 * SHA-1 block routine. See sha1c.c for C equivalent.
38 *
39 * There are 80 rounds of 4 types:
40 *   - rounds 0-15 are type 1 and load data (round1 macro).
41 *   - rounds 16-19 are type 1 and do not load data (round1x macro).
42 *   - rounds 20-39 are type 2 and do not load data (round2 macro).
43 *   - rounds 40-59 are type 3 and do not load data (round3 macro).
44 *   - rounds 60-79 are type 4 and do not load data (round4 macro).
45 *
46 * Each round loads or shuffles the data, then computes a per-round
47 * function of b, c, d, and then mixes the result into and rotates the
48 * five registers a, b, c, d, e holding the intermediate results.
49 *
50 * The register rotation is implemented by rotating the arguments to
51 * the round macros instead of by explicit move instructions.
52 */
/*
 * Load message word \index from the input block at %rsi, convert it
 * from big-endian to host order, and store it into the 16-word W
 * buffer on the stack.  The word is left in %r10d for the mix macro.
 */
.macro	load		index
	mov		(\index)*4(%rsi), %r10d
	bswap		%r10d				// message words are big-endian
	mov		%r10d, (\index)*4(%rsp)
.endm
58
/*
 * Message schedule: w[i] = rol(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1).
 * The stack holds only the last 16 w values; the &0xf masks implement
 * the circular indexing.  The new word is left in %r10d for mix.
 */
.macro	shuffle		index
	mov		((\index   )&0xf)*4(%rsp), %r10d	// w[i - 16]
	xor		((\index- 3)&0xf)*4(%rsp), %r10d	// w[i -  3]
	xor		((\index- 8)&0xf)*4(%rsp), %r10d	// w[i -  8]
	xor		((\index-14)&0xf)*4(%rsp), %r10d	// w[i - 14]
	rol		$1, %r10d
	mov		%r10d, ((\index)&0xf)*4(%rsp)
.endm
67
/*
 * Round function for rounds 0-19 ("choose"):
 * F1(b, c, d) = (b & c) | (~b & d), computed branch-free as
 * ((c ^ d) & b) ^ d.  Result in %r9d; a and e are unused here.
 */
.macro	func1		a, b, c, d, e
	mov		\d, %r9d
	xor		\c, %r9d
	and		\b, %r9d
	xor		\d, %r9d
.endm
74
/*
 * Round function for rounds 20-39 ("parity"):
 * F2(b, c, d) = b ^ c ^ d.  Result in %r9d.
 */
.macro	func2		a, b, c, d, e
	mov		\b, %r9d
	xor		\c, %r9d
	xor		\d, %r9d
.endm
80
/*
 * Round function for rounds 40-59 ("majority"):
 * F3(b, c, d) = (b & c) | (b & d) | (c & d), computed as
 * (b & c) | ((b | c) & d).  Result in %r9d; clobbers %r8d.
 */
.macro	func3		a, b, c, d, e
	mov		\b, %r8d
	or		\c, %r8d
	and		\d, %r8d			// (b | c) & d
	mov		\b, %r9d
	and		\c, %r9d			// b & c
	or		%r8d, %r9d
.endm
89
/*
 * Round function for rounds 60-79: identical to F2 (parity).
 */
.macro	func4		a, b, c, d, e
	func2		\a, \b, \c, \d, \e
.endm
93
/*
 * Mix round function (%r9d) and message word (%r10d) into the state:
 *   e += rol(a, 5) + F(b, c, d) + w[i] + K;  b = rol(b, 30).
 * The a..e register rotation is done by the callers' argument order,
 * not by moves.  Clobbers %r8d.
 */
.macro	mix		a, b, c, d, e, const
	rol		$30, \b
	add		%r9d, \e			// e += F
	mov		\a, %r8d
	rol		$5, %r8d
	lea		\const(\e, %r10d, 1), \e	// e += w[i] + K
	add		%r8d, \e			// e += rol(a, 5)
.endm
102
/*
 * One full SHA-1 round each: fetch or schedule the message word,
 * evaluate the round function, then mix.  round1 loads data from the
 * input buffer (rounds 0-15); round1x and later use the schedule
 * recurrence.  The constants are the standard SHA-1 K values.
 */
.macro	round1		a, b, c, d, e, index
	load		\index
	func1		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x5a827999
.endm

.macro	round1x		a, b, c, d, e, index
	shuffle		\index
	func1		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x5a827999
.endm

.macro	round2		a, b, c, d, e, index
	shuffle		\index
	func2		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x6ed9eba1
.endm

.macro	round3		a, b, c, d, e, index
	shuffle		\index
	func3		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0x8f1bbcdc
.endm

.macro	round4		a, b, c, d, e, index
	shuffle		\index
	func4		\a, \b, \c, \d, \e
	mix		\a, \b, \c, \d, \e, 0xca62c1d6
.endm
132
	// sha1block(SHA1_CTX, buf, len)
/*
 * Scalar SHA-1 block function (SysV AMD64 ABI).
 * In:	%rdi = SHA1_CTX pointer (five consecutive 32-bit words h0..h4)
 *	%rsi = input buffer
 *	%rdx = length in bytes; rounded down to whole 64-byte blocks
 *
 * The five state words live in eax/ebx/ecx/edx/ebp during the rounds;
 * r11d-r15d hold a copy of the state from the top of each block for
 * the final feed-forward addition.  The 16-word circular W buffer
 * lives on the stack.  r8d-r10d are scratch for the round macros.
 */
ENTRY(_libmd_sha1block_scalar)
	push		%rbp
	push		%rbx
	push		%r12
	push		%r13
	push		%r14
	push		%r15
	push		%rdi			// rdi: SHA1_CTX
	sub		$64+8, %rsp		// 64 bytes for round keys
						// plus alignment

	mov		%rdi, %rbp
						// rsi: buf
	and		$~63, %rdx		// rdx: length rounded down to
						// a multiple of the block size
	lea		(%rsi, %rdx, 1), %rdi	// rdi: end pointer
	mov		(%rbp),	%eax		// c->h0
	mov		4(%rbp), %ebx		// c->h1
	mov		8(%rbp), %ecx		// c->h2
	mov		12(%rbp), %edx		// c->h3
	mov		16(%rbp), %ebp		// c->h4 (rbp no longer the ctx!)

	cmp		%rsi, %rdi		// any data to process?
	je		.Lend

.Lloop:	mov		%eax, %r11d		// save state for the final
	mov		%ebx, %r12d		// feed-forward addition
	mov		%ecx, %r13d
	mov		%edx, %r14d
	mov		%ebp, %r15d

	round1		%eax, %ebx, %ecx, %edx, %ebp,  0
	round1		%ebp, %eax, %ebx, %ecx, %edx,  1
	round1		%edx, %ebp, %eax, %ebx, %ecx,  2
	round1		%ecx, %edx, %ebp, %eax, %ebx,  3
	round1		%ebx, %ecx, %edx, %ebp, %eax,  4

	round1		%eax, %ebx, %ecx, %edx, %ebp,  5
	round1		%ebp, %eax, %ebx, %ecx, %edx,  6
	round1		%edx, %ebp, %eax, %ebx, %ecx,  7
	round1		%ecx, %edx, %ebp, %eax, %ebx,  8
	round1		%ebx, %ecx, %edx, %ebp, %eax,  9

	round1		%eax, %ebx, %ecx, %edx, %ebp, 10
	round1		%ebp, %eax, %ebx, %ecx, %edx, 11
	round1		%edx, %ebp, %eax, %ebx, %ecx, 12
	round1		%ecx, %edx, %ebp, %eax, %ebx, 13
	round1		%ebx, %ecx, %edx, %ebp, %eax, 14

	round1		%eax, %ebx, %ecx, %edx, %ebp, 15
	round1x		%ebp, %eax, %ebx, %ecx, %edx, 16
	round1x		%edx, %ebp, %eax, %ebx, %ecx, 17
	round1x		%ecx, %edx, %ebp, %eax, %ebx, 18
	round1x		%ebx, %ecx, %edx, %ebp, %eax, 19

	round2		%eax, %ebx, %ecx, %edx, %ebp, 20
	round2		%ebp, %eax, %ebx, %ecx, %edx, 21
	round2		%edx, %ebp, %eax, %ebx, %ecx, 22
	round2		%ecx, %edx, %ebp, %eax, %ebx, 23
	round2		%ebx, %ecx, %edx, %ebp, %eax, 24

	round2		%eax, %ebx, %ecx, %edx, %ebp, 25
	round2		%ebp, %eax, %ebx, %ecx, %edx, 26
	round2		%edx, %ebp, %eax, %ebx, %ecx, 27
	round2		%ecx, %edx, %ebp, %eax, %ebx, 28
	round2		%ebx, %ecx, %edx, %ebp, %eax, 29

	round2		%eax, %ebx, %ecx, %edx, %ebp, 30
	round2		%ebp, %eax, %ebx, %ecx, %edx, 31
	round2		%edx, %ebp, %eax, %ebx, %ecx, 32
	round2		%ecx, %edx, %ebp, %eax, %ebx, 33
	round2		%ebx, %ecx, %edx, %ebp, %eax, 34

	round2		%eax, %ebx, %ecx, %edx, %ebp, 35
	round2		%ebp, %eax, %ebx, %ecx, %edx, 36
	round2		%edx, %ebp, %eax, %ebx, %ecx, 37
	round2		%ecx, %edx, %ebp, %eax, %ebx, 38
	round2		%ebx, %ecx, %edx, %ebp, %eax, 39

	round3		%eax, %ebx, %ecx, %edx, %ebp, 40
	round3		%ebp, %eax, %ebx, %ecx, %edx, 41
	round3		%edx, %ebp, %eax, %ebx, %ecx, 42
	round3		%ecx, %edx, %ebp, %eax, %ebx, 43
	round3		%ebx, %ecx, %edx, %ebp, %eax, 44

	round3		%eax, %ebx, %ecx, %edx, %ebp, 45
	round3		%ebp, %eax, %ebx, %ecx, %edx, 46
	round3		%edx, %ebp, %eax, %ebx, %ecx, 47
	round3		%ecx, %edx, %ebp, %eax, %ebx, 48
	round3		%ebx, %ecx, %edx, %ebp, %eax, 49

	round3		%eax, %ebx, %ecx, %edx, %ebp, 50
	round3		%ebp, %eax, %ebx, %ecx, %edx, 51
	round3		%edx, %ebp, %eax, %ebx, %ecx, 52
	round3		%ecx, %edx, %ebp, %eax, %ebx, 53
	round3		%ebx, %ecx, %edx, %ebp, %eax, 54

	round3		%eax, %ebx, %ecx, %edx, %ebp, 55
	round3		%ebp, %eax, %ebx, %ecx, %edx, 56
	round3		%edx, %ebp, %eax, %ebx, %ecx, 57
	round3		%ecx, %edx, %ebp, %eax, %ebx, 58
	round3		%ebx, %ecx, %edx, %ebp, %eax, 59

	round4		%eax, %ebx, %ecx, %edx, %ebp, 60
	round4		%ebp, %eax, %ebx, %ecx, %edx, 61
	round4		%edx, %ebp, %eax, %ebx, %ecx, 62
	round4		%ecx, %edx, %ebp, %eax, %ebx, 63
	round4		%ebx, %ecx, %edx, %ebp, %eax, 64

	round4		%eax, %ebx, %ecx, %edx, %ebp, 65
	round4		%ebp, %eax, %ebx, %ecx, %edx, 66
	round4		%edx, %ebp, %eax, %ebx, %ecx, 67
	round4		%ecx, %edx, %ebp, %eax, %ebx, 68
	round4		%ebx, %ecx, %edx, %ebp, %eax, 69

	round4		%eax, %ebx, %ecx, %edx, %ebp, 70
	round4		%ebp, %eax, %ebx, %ecx, %edx, 71
	round4		%edx, %ebp, %eax, %ebx, %ecx, 72
	round4		%ecx, %edx, %ebp, %eax, %ebx, 73
	round4		%ebx, %ecx, %edx, %ebp, %eax, 74

	round4		%eax, %ebx, %ecx, %edx, %ebp, 75
	round4		%ebp, %eax, %ebx, %ecx, %edx, 76
	round4		%edx, %ebp, %eax, %ebx, %ecx, 77
	round4		%ecx, %edx, %ebp, %eax, %ebx, 78
	round4		%ebx, %ecx, %edx, %ebp, %eax, 79

	add		%r11d, %eax		// feed-forward: state += saved
	add		%r12d, %ebx
	add		%r13d, %ecx
	add		%r14d, %edx
	add		%r15d, %ebp

	add		$64, %rsi		// advance to the next block
	cmp		%rdi, %rsi
	jb		.Lloop

.Lend:	add		$64+8, %rsp
	pop		%rdi			// SHA1_CTX
	mov		%eax, (%rdi)		// write back the digest
	mov		%ebx, 4(%rdi)
	mov		%ecx, 8(%rdi)
	mov		%edx, 12(%rdi)
	mov		%ebp, 16(%rdi)

	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbx
	pop		%rbp
	ret
END(_libmd_sha1block_scalar)
286
287/*
288 * This is the implementation using AVX2, BMI1 and BMI2. It is based on:
289 * "SHA-1 implementation with Intel(R) AVX2 instruction set extensions"
290 * From http://software.intel.com/en-us/articles
291 * (look for improving-the-performance-of-the-secure-hash-algorithm-1)
292 * This implementation is 2x unrolled, and interleaves vector instructions,
293 * used to precompute W, with scalar computation of current round
294 * for optimal scheduling.
295 */
296
297	/* trivial helper macros */
/*
 * Add the five working registers into the digest words at (%r9)
 * (the feed-forward step) and write the updated digest back.
 * The updated values remain in the registers for the next block.
 */
.macro	update_hash	a, tb, c, d, e
	add		(%r9), \a
	mov		\a, (%r9)
	add		4(%r9), \tb
	mov		\tb, 4(%r9)
	add		8(%r9), \c
	mov		\c, 8(%r9)
	add		12(%r9), \d
	mov		\d, 12(%r9)
	add		16(%r9), \e
	mov		\e, 16(%r9)
.endm
310
311	/* help macros for recalc, which does precomputations */
/* load 16 message bytes of one block (see precalc00_15 comment: %r10) */
.macro	precalc0	offset
	vmovdqu		\offset(%r10), %xmm0
.endm

/* pair them with 16 bytes of the other block (%r13) in the high lane */
.macro	precalc1	offset
	vinserti128	$1, \offset(%r13), %ymm0, %ymm0
.endm

/* byte-swap both lanes to host order; %ymm10 holds the shuffle mask */
.macro	precalc2	yreg
	vpshufb		%ymm10, %ymm0, \yreg
.endm

/* add the round constant K (from the constants block at %r8) */
.macro	precalc4	yreg, k_offset
	vpaddd		\k_offset(%r8), \yreg, %ymm0
.endm

/* store W+K to the temp buffer (%r14) for the scalar round code */
.macro	precalc7	offset
	vmovdqu		%ymm0, (\offset)*2(%r14)
.endm
331
332/*
333 * Message scheduling pre-compute for rounds 0-15
334 * r13      is a pointer to the even 64-byte block
335 * r10      is a pointer to the odd 64-byte block
336 * r14      is a pointer to the temp buffer
337 * xmm0     is used as a temp register
338 * yreg     is clobbered as part of the computation
339 * offset   chooses a 16 byte chunk within a block
340 * r8       is a pointer to the constants block
341 * k_offset chooses K constants relevant to this round
342 * xmm10    holds the swap mask
343 */
344.macro	precalc00_15	offset, yreg
345	precalc0	\offset
346	precalc1	\offset
347	precalc2	\yreg
348	precalc4	\yreg, 0
349	precalc7	\offset
350.endm
351
352	/* helper macros for precalc16_31 */
/*
 * Steps of the vectorized w[i] recurrence for rounds 16-31; the step
 * numbering follows the original Intel implementation (there is no
 * precalc22).  %ymm0 and %ymm9 are temporaries throughout.
 */
.macro	precalc16	reg_sub16, reg_sub12, reg_sub4, reg
	vpalignr	$8, \reg_sub16, \reg_sub12, \reg	// w[i - 14]
	vpsrldq		$4, \reg_sub4, %ymm0			// w[i -  3]
.endm

.macro	precalc17	reg_sub16, reg_sub8, reg
	vpxor		\reg_sub8, \reg, \reg			// ^= w[i - 8]
	vpxor		\reg_sub16, %ymm0, %ymm0		// ^= w[i - 16]
.endm

.macro	precalc18	reg
	vpxor		%ymm0, \reg, \reg
	vpslldq		$12, \reg, %ymm9			// high words need an
								// extra rotation pass
.endm

.macro	precalc19	reg
	vpslld		$1, \reg, %ymm0				// rol(..., 1) as
	vpsrld		$31, \reg, \reg				// shl 1 | shr 31
	.endm

.macro	precalc20	reg
	vpor		\reg, %ymm0, %ymm0
	vpslld		$2, %ymm9, \reg				// rol(..., 2) of the
.endm

.macro	precalc21	reg
	vpsrld		$30, %ymm9, %ymm9			// carried-over words
	vpxor		\reg, %ymm0, %ymm0
.endm

/* finish w[i], then store W+K for the scalar rounds */
.macro	precalc23	reg, k_offset, offset
	vpxor		%ymm9, %ymm0, \reg
	vpaddd		\k_offset(%r8), \reg, %ymm0
	vmovdqu		%ymm0, (\offset)(%r14)
.endm
388
389/*
390 * Message scheduling pre-compute for rounds 16-31
391 * calculating last 32 w[i] values in 8 XMM registers
392 * pre-calculate K+w[i] values and store to mem
393 * for later load by ALU add instruction.
394 * "brute force" vectorization for rounds 16-31 only
395 * due to w[i]->w[i-3] dependency.
 * clobbers 5 input ymm registers REG_SUB*
397 * uses xmm0 and xmm9 as temp registers
398 * As always, r8 is a pointer to constants block
399 * and r14 is a pointer to temp buffer
400 */
/*
 * One group of four w[i] values for rounds 16-31 (see the block
 * comment above for register/pointer roles).
 */
.macro	precalc16_31	reg, reg_sub4, reg_sub8, reg_sub12, reg_sub16, k_offset, offset
	precalc16	\reg_sub16, \reg_sub12, \reg_sub4, \reg
	precalc17	\reg_sub16, \reg_sub8, \reg
	precalc18	\reg
	precalc19	\reg
	precalc20	\reg
	precalc21	\reg
	precalc23	\reg, \k_offset, \offset
.endm
410
411	/* helper macros for precalc_32_79 */
/*
 * Steps of the vectorized schedule for rounds 32-79, using the
 * equivalent recurrence w[i] = rol(w[i-6]^w[i-16]^w[i-28]^w[i-32], 2)
 * which has no within-vector dependency.  %ymm0 is a temporary.
 */
.macro	precalc32	reg_sub8, reg_sub4
	vpalignr	$8, \reg_sub8, \reg_sub4, %ymm0	// w[i - 6]
.endm

.macro	precalc33	reg_sub28, reg
	vpxor		\reg_sub28, \reg, \reg		// ^= w[i - 28]
.endm

.macro	precalc34	reg_sub16
	vpxor		\reg_sub16, %ymm0, %ymm0	// ^= w[i - 16]
.endm

.macro	precalc35	reg
	vpxor		%ymm0, \reg, \reg
.endm

.macro	precalc36	reg
	vpslld		$2, \reg, %ymm0			// rol(..., 2) as
.endm

.macro	precalc37	reg
	vpsrld		$30, \reg, \reg			// shl 2 | shr 30
	vpor		\reg, %ymm0, \reg
.endm

/* add K and store W+K for the scalar rounds */
.macro	precalc39	reg, k_offset, offset
	vpaddd		\k_offset(%r8), \reg, %ymm0
	vmovdqu		%ymm0, \offset(%r14)
.endm
441
/*
 * One group of eight w[i] values for rounds 32-79 (two blocks, four
 * words per lane); see the helper macros above for the recurrence.
 */
.macro	precalc32_79	reg, reg_sub4, reg_sub8, reg_sub16, reg_sub28, k_offset, offset
	precalc32	\reg_sub8, \reg_sub4
	precalc33	\reg_sub28, \reg
	precalc34	\reg_sub16
	precalc35	\reg
	precalc36	\reg
	precalc37	\reg
	precalc39	\reg, \k_offset, \offset
.endm
451
/*
 * Full 80-round message schedule for a pair of blocks: all W+K values
 * are computed in ymm registers and stored to the temp buffer at %r14
 * for later consumption by the scalar round code.  K offsets (arg 6)
 * step through the four round constants in the block at %r8.
 */
.macro	precalc
	precalc00_15	0x00, %ymm15
	precalc00_15	0x10, %ymm14
	precalc00_15	0x20, %ymm13
	precalc00_15	0x30, %ymm12
	precalc16_31	%ymm8,  %ymm12, %ymm13, %ymm14, %ymm15, 0x00, 0x080
	precalc16_31	%ymm7,  %ymm8,  %ymm12, %ymm13, %ymm14, 0x20, 0x0a0
	precalc16_31	%ymm5,  %ymm7,  %ymm8,  %ymm12, %ymm13, 0x20, 0x0c0
	precalc16_31	%ymm3,  %ymm5,  %ymm7,  %ymm8,  %ymm12, 0x20, 0x0e0
	precalc32_79	%ymm15, %ymm3,  %ymm5,  %ymm8,  %ymm14, 0x20, 0x100
	precalc32_79	%ymm14, %ymm15, %ymm3,  %ymm7,  %ymm13, 0x20, 0x120
	precalc32_79	%ymm13, %ymm14, %ymm15, %ymm5,  %ymm12, 0x40, 0x140
	precalc32_79	%ymm12, %ymm13, %ymm14, %ymm3,  %ymm8,  0x40, 0x160
	precalc32_79	%ymm8,  %ymm12, %ymm13, %ymm15, %ymm7,  0x40, 0x180
	precalc32_79	%ymm7,  %ymm8,  %ymm12, %ymm14, %ymm5,  0x40, 0x1a0
	precalc32_79	%ymm5,  %ymm7,  %ymm8,  %ymm13, %ymm3,  0x40, 0x1c0
	precalc32_79	%ymm3,  %ymm5,  %ymm7,  %ymm12, %ymm15, 0x60, 0x1e0
	precalc32_79	%ymm15, %ymm3,  %ymm5,  %ymm8,  %ymm14, 0x60, 0x200
	precalc32_79	%ymm14, %ymm15, %ymm3,  %ymm7,  %ymm13, 0x60, 0x220
	precalc32_79	%ymm13, %ymm14, %ymm15, %ymm5,  %ymm12, 0x60, 0x240
	precalc32_79	%ymm12, %ymm13, %ymm14, %ymm3,  %ymm8,  0x60, 0x260
.endm
474
475/*
476 * Macros calculating individual rounds have general form
477 * calc_round_pre + precalc_round + calc_round_post
478 * calc_round_{pre,post} macros follow
479 */
/*
 * First half of an F1 round: fold in the precomputed W+K word (from
 * the buffer at %r15) and the F value left by the previous round, and
 * start both rotations.  rorx $0x1b == rol 5, rorx $2 == rol 30.
 * Clobbers %ebp and %r12d.
 */
.macro	calc_f1_pre	offset, reg_a, reg_b, reg_c, reg_e
	add		\offset(%r15), \reg_e		// e += W[i] + K
	andn		\reg_c, \reg_a, %ebp		// ~b & d of the next round
	add		\reg_b, \reg_e			// add F from the previous round
	rorx		$0x1b, \reg_a, %r12d		// rol(a, 5)
	rorx		$2, \reg_a, \reg_b		// for the next round
.endm

/*
 * Calculate F for the next round
 */
.macro	calc_f1_post	reg_a, reg_b, reg_e
	and		\reg_b, \reg_a			// b & c
	xor		%ebp, \reg_a			// F1 = (b&c) ^ (~b&d)
	add		%r12d, \reg_e			// e += rol(a, 5)
.endm
496
497/*
498 * Registers are cyclically rotated:
499 * edx -> eax -> edi -> esi -> ebx -> ecx
500 */
/*
 * Rounds 0-18 of the first block of the pair.  Each calcN completes
 * round N (using the F value precomputed by the previous calc) and
 * precomputes F for round N+1; the interleaved precalc* steps run the
 * vector message schedule for the NEXT pair of blocks.  W+K offsets
 * advance by 4 within a 16-byte lane and by 0x20 between groups of
 * four rounds because the two blocks' values are interleaved per lane
 * (the 0x10-0x1c lane belongs to calc80 onward).
 */
.macro	calc0
	mov		%esi, %ebx			// precalculate first round
	rorx		$2, %esi, %esi			// rol(b, 30)
	andn		%eax, %ebx, %ebp		// ~b & d
	and		%edi, %ebx			// b & c
	xor		%ebp, %ebx			// F1 for round 0
	calc_f1_pre	0x0, %ecx, %ebx, %edi, %edx
	precalc0	0x80
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro	calc1
	calc_f1_pre	0x4, %edx, %ecx, %esi, %eax
	precalc1	0x80
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro	calc2
	calc_f1_pre	0x8, %eax, %edx, %ebx, %edi
	precalc2	%ymm15
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro	calc3
	calc_f1_pre	0xc, %edi, %eax, %ecx, %esi
	calc_f1_post	%edi, %edx, %esi
.endm

.macro	calc4
	calc_f1_pre	0x20, %esi, %edi, %edx, %ebx
	precalc4	%ymm15, 0x0
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro	calc5
	calc_f1_pre	0x24, %ebx, %esi, %eax, %ecx
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro	calc6
	calc_f1_pre	0x28, %ecx, %ebx, %edi, %edx
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro	calc7
	calc_f1_pre	0x2c, %edx, %ecx, %esi, %eax
	precalc7	0x0
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro	calc8
	calc_f1_pre	0x40, %eax, %edx, %ebx, %edi
	precalc0	0x90
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro	calc9
	calc_f1_pre	0x44, %edi, %eax, %ecx, %esi
	precalc1	0x90
	calc_f1_post	%edi, %edx, %esi
.endm

.macro	calc10
	calc_f1_pre	0x48, %esi, %edi, %edx, %ebx
	precalc2	%ymm14
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro	calc11
	calc_f1_pre	0x4c, %ebx, %esi, %eax, %ecx
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro	calc12
	calc_f1_pre	0x60, %ecx, %ebx, %edi, %edx
	precalc4	%ymm14, 0
	calc_f1_post	%ecx, %esi, %edx
.endm

.macro	calc13
	calc_f1_pre	0x64, %edx, %ecx, %esi, %eax
	calc_f1_post	%edx, %ebx, %eax
.endm

.macro	calc14
	calc_f1_pre	0x68, %eax, %edx, %ebx, %edi
	calc_f1_post	%eax, %ecx, %edi
.endm

.macro	calc15
	calc_f1_pre	0x6c, %edi, %eax, %ecx, %esi
	precalc7	0x10
	calc_f1_post	%edi, %edx, %esi
.endm

.macro	calc16
	calc_f1_pre	0x80, %esi, %edi, %edx, %ebx
	precalc0	0xa0
	calc_f1_post	%esi, %eax, %ebx
.endm

.macro	calc17
	calc_f1_pre	0x84, %ebx, %esi, %eax, %ecx
	precalc1	0xa0
	calc_f1_post	%ebx, %edi, %ecx
.endm

.macro	calc18
	calc_f1_pre	0x88, %ecx, %ebx, %edi, %edx
	precalc2	%ymm13
	calc_f1_post	%ecx, %esi, %edx
.endm
613
/*
 * F2 (parity) round halves, same pipeline shape as calc_f1_*:
 * pre folds in W+K and the previous round's F and starts the
 * rotations; post computes F2 = b ^ c ^ d for the next round.
 * Clobbers %r12d.
 */
.macro	calc_f2_pre	offset, reg_a, reg_b, reg_e
	add		\offset(%r15), \reg_e		// e += W[i] + K
	add		\reg_b, \reg_e			// add F from the previous round
	rorx		$0x1b, \reg_a, %r12d		// rol(a, 5)
	rorx		$2, \reg_a, \reg_b		// for next round
.endm

.macro	calc_f2_post	reg_a, reg_b, reg_c, reg_e
	xor		\reg_b, \reg_a
	add		%r12d, \reg_e			// e += rol(a, 5)
	xor		\reg_c, \reg_a			// F2 = b ^ c ^ d
.endm
626
/*
 * Rounds 19-38: each completes its round and precomputes F2 for the
 * next one (rounds 20-39 use F2); vector schedule for the next block
 * pair stays interleaved.
 */
.macro	calc19
	calc_f2_pre	0x8c, %edx, %ecx, %eax
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc20
	calc_f2_pre	0xa0, %eax, %edx, %edi
	precalc4	%ymm13, 0x0
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc21
	calc_f2_pre	0xa4, %edi, %eax, %esi
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc22
	calc_f2_pre	0xa8, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc23
	calc_f2_pre	0xac, %ebx, %esi, %ecx
	precalc7	0x20
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc24
	calc_f2_pre	0xc0, %ecx, %ebx, %edx
	precalc0	0xb0
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc25
	calc_f2_pre	0xc4, %edx, %ecx, %eax
	precalc1	0xb0
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc26
	calc_f2_pre	0xc8, %eax, %edx, %edi
	precalc2	%ymm12
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc27
	calc_f2_pre	0xcc, %edi, %eax, %esi
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc28
	calc_f2_pre	0xe0, %esi, %edi, %ebx
	precalc4	%ymm12, 0x0
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc29
	calc_f2_pre	0xe4, %ebx, %esi, %ecx
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc30
	calc_f2_pre	0xe8, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc31
	calc_f2_pre	0xec, %edx, %ecx, %eax
	precalc7	0x30
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc32
	calc_f2_pre	0x100, %eax, %edx, %edi
	precalc16	%ymm15, %ymm14, %ymm12, %ymm8
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc33
	calc_f2_pre	0x104, %edi, %eax, %esi
	precalc17	%ymm15, %ymm13, %ymm8
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc34
	calc_f2_pre	0x108, %esi, %edi, %ebx
	precalc18	%ymm8
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc35
	calc_f2_pre	0x10c, %ebx, %esi, %ecx
	precalc19	%ymm8
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc36
	calc_f2_pre	0x120, %ecx, %ebx, %edx
	precalc20	%ymm8
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc37
	calc_f2_pre	0x124, %edx, %ecx, %eax
	precalc21	%ymm8
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc38
	calc_f2_pre	0x128, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm
739
/*
 * F3 (majority) round halves.  pre only folds in W+K; post does the
 * rest: previous-round F, both rotations, and the next round's
 * F3 = (a&b) | ((a|b)&c).  reg_tb receives rol(a, 30) for the next
 * round.  Clobbers %ebp and %r12d.
 */
.macro	calc_f3_pre	offset, reg_e
	add		\offset(%r15), \reg_e		// e += W[i] + K
.endm

.macro	calc_f3_post	reg_a, reg_b, reg_c, reg_e, reg_tb
	add		\reg_tb, \reg_e		// add F from the previous round
	mov		\reg_b, %ebp
	or		\reg_a, %ebp
	rorx		$0x1b, \reg_a, %r12d	// rol(a, 5)
	rorx		$2, \reg_a, \reg_tb	// for the next round
	and		\reg_c, %ebp		// calculate F for the next round
	and		\reg_b, \reg_a
	or		%ebp, \reg_a		// F3 = (a&b) | ((a|b)&c)
	add		%r12d, \reg_e		// e += rol(a, 5)
.endm
755
/*
 * Rounds 39-58: each precomputes F3 for the following round (rounds
 * 40-59 use the majority function).  calc47 expands precalc23 inline
 * (reg = %ymm7, k_offset = 0x20, offset = 0xa0).
 */
.macro	calc39
	calc_f3_pre	0x12c, %esi
	precalc23	%ymm8, 0x0, 0x80
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc40
	calc_f3_pre	0x140, %ebx
	precalc16	%ymm14, %ymm13, %ymm8, %ymm7
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc41
	calc_f3_pre	0x144, %ecx
	precalc17	%ymm14, %ymm12, %ymm7
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc42
	calc_f3_pre	0x148, %edx
	precalc18	%ymm7
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc43
	calc_f3_pre	0x14c, %eax
	precalc19	%ymm7
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc44
	calc_f3_pre	0x160, %edi
	precalc20	%ymm7
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc45
	calc_f3_pre	0x164, %esi
	precalc21	%ymm7
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc46
	calc_f3_pre	0x168, %ebx
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc47
	calc_f3_pre	0x16c, %ecx
	vpxor		%ymm9, %ymm0, %ymm7	// precalc23 expanded inline
	vpaddd		0x20(%r8), %ymm7, %ymm0
	vmovdqu		%ymm0, 0xa0(%r14)
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc48
	calc_f3_pre	0x180, %edx
	precalc16	%ymm13, %ymm12, %ymm7, %ymm5
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc49
	calc_f3_pre	0x184, %eax
	precalc17	%ymm13, %ymm8, %ymm5
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc50
	calc_f3_pre	0x188, %edi
	precalc18	%ymm5
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc51
	calc_f3_pre	0x18c, %esi
	precalc19	%ymm5
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc52
	calc_f3_pre	0x1a0, %ebx
	precalc20	%ymm5
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc53
	calc_f3_pre	0x1a4, %ecx
	precalc21	%ymm5
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc54
	calc_f3_pre	0x1a8, %edx
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc55
	calc_f3_pre	0x1ac, %eax
	precalc23	%ymm5, 0x20, 0xc0
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc56
	calc_f3_pre	0x1c0, %edi
	precalc16	%ymm12, %ymm8, %ymm5, %ymm3
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc57
	calc_f3_pre	0x1c4, %esi
	precalc17	%ymm12, %ymm7, %ymm3
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc58
	calc_f3_pre	0x1c8, %ebx
	precalc18	%ymm3
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm
875
/*
 * Rounds 59-79: the F value precomputed here is for rounds 60-79,
 * which use F4 == F2 (parity) — hence the calc_f2_* macros.  calc79
 * is the final round of the block: it adds the last W+K and rotation
 * but precomputes no further F.
 */
.macro	calc59
	calc_f2_pre	0x1cc, %ebx, %esi, %ecx
	precalc19	%ymm3
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc60
	calc_f2_pre	0x1e0, %ecx, %ebx, %edx
	precalc20	%ymm3
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc61
	calc_f2_pre	0x1e4, %edx, %ecx, %eax
	precalc21	%ymm3
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc62
	calc_f2_pre	0x1e8, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc63
	calc_f2_pre	0x1ec, %edi, %eax, %esi
	precalc23	%ymm3, 0x20, 0xe0
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc64
	calc_f2_pre	0x200, %esi, %edi, %ebx
	precalc32	%ymm5, %ymm3
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc65
	calc_f2_pre	0x204, %ebx, %esi, %ecx
	precalc33	%ymm14, %ymm15
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc66
	calc_f2_pre	0x208, %ecx, %ebx, %edx
	precalc34	%ymm8
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc67
	calc_f2_pre	0x20c, %edx, %ecx, %eax
	precalc35	%ymm15
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc68
	calc_f2_pre	0x220, %eax, %edx, %edi
	precalc36	%ymm15
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc69
	calc_f2_pre	0x224, %edi, %eax, %esi
	precalc37	%ymm15
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc70
	calc_f2_pre	0x228, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc71
	calc_f2_pre	0x22c, %ebx, %esi, %ecx
	precalc39	%ymm15, 0x20, 0x100
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc72
	calc_f2_pre	0x240, %ecx, %ebx, %edx
	precalc32	%ymm3, %ymm15
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc73
	calc_f2_pre	0x244, %edx, %ecx, %eax
	precalc33	%ymm13, %ymm14
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc74
	calc_f2_pre	0x248, %eax, %edx, %edi
	precalc34	%ymm7
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc75
	calc_f2_pre	0x24c, %edi, %eax, %esi
	precalc35	%ymm14
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc76
	calc_f2_pre	0x260, %esi, %edi, %ebx
	precalc36	%ymm14
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc77
	calc_f2_pre	0x264, %ebx, %esi, %ecx
	precalc37	%ymm14
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc78
	calc_f2_pre	0x268, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc79
	add		0x26c(%r15), %eax	// final round: no next-round F
	add		%ecx, %eax
	rorx		$0x1b, %edx, %r12d
	precalc39	%ymm14, 0x20, 0x120
	add		%r12d, %eax
.endm
1000
1001/*
1002 * Similar to calc0
1003 */
1004.macro	calc80
1005	mov		%ecx, %edx			// precalculate first round
1006	rorx		$2, %ecx, %ecx
1007	andn		%esi, %edx, %ebp
1008	and		%ebx, %edx
1009	xor		%ebp, %edx
1010	calc_f1_pre	0x10, %eax, %edx, %ebx, %edi
1011	precalc32	%ymm15, %ymm14
1012	calc_f1_post	%eax, %ecx, %edi
1013.endm
1014
1015.macro	calc81
1016	calc_f1_pre	0x14, %edi, %eax, %ecx, %esi
1017	precalc33	%ymm12, %ymm13
1018	calc_f1_post	%edi, %edx, %esi
1019.endm
1020
1021.macro	calc82
1022	calc_f1_pre	0x18, %esi, %edi, %edx, %ebx
1023	precalc34	%ymm5
1024	calc_f1_post	%esi, %eax, %ebx
1025.endm
1026
1027.macro	calc83
1028	calc_f1_pre	0x1c, %ebx, %esi, %eax, %ecx
1029	precalc35	%ymm13
1030	calc_f1_post	%ebx, %edi, %ecx
1031.endm
1032
1033.macro	calc84
1034	calc_f1_pre	0x30, %ecx, %ebx, %edi, %edx
1035	precalc36	%ymm13
1036	calc_f1_post	%ecx, %esi, %edx
1037.endm
1038
1039.macro	calc85
1040	calc_f1_pre	0x34, %edx, %ecx, %esi, %eax
1041	precalc37	%ymm13
1042	calc_f1_post	%edx, %ebx, %eax
1043.endm
1044
1045.macro	calc86
1046	calc_f1_pre	0x38, %eax, %edx, %ebx, %edi
1047	calc_f1_post	%eax, %ecx, %edi
1048.endm
1049
1050.macro	calc87
1051	calc_f1_pre	0x3c, %edi, %eax, %ecx, %esi
1052	precalc39	%ymm13, 0x40, 0x140
1053	calc_f1_post	%edi, %edx, %esi
1054.endm
1055
1056.macro	calc88
1057	calc_f1_pre	0x50, %esi, %edi, %edx, %ebx
1058	precalc32	%ymm14, %ymm13
1059	calc_f1_post	%esi, %eax, %ebx
1060.endm
1061
1062.macro	calc89
1063	calc_f1_pre	0x54, %ebx, %esi, %eax, %ecx
1064	precalc33	%ymm8, %ymm12
1065	calc_f1_post	%ebx, %edi, %ecx
1066.endm
1067
1068.macro	calc90
1069	calc_f1_pre	0x58, %ecx, %ebx, %edi, %edx
1070	precalc34	%ymm3
1071	calc_f1_post	%ecx, %esi, %edx
1072.endm
1073
1074.macro	calc91
1075	calc_f1_pre	0x5c, %edx, %ecx, %esi, %eax
1076	precalc35	%ymm12
1077	calc_f1_post	%edx, %ebx, %eax
1078.endm
1079
1080.macro	calc92
1081	calc_f1_pre	0x70, %eax, %edx, %ebx, %edi
1082	precalc36	%ymm12
1083	calc_f1_post	%eax, %ecx, %edi
1084.endm
1085
1086.macro	calc93
1087	calc_f1_pre	0x74, %edi, %eax, %ecx, %esi
1088	precalc37	%ymm12
1089	calc_f1_post	%edi, %edx, %esi
1090.endm
1091
1092.macro	calc94
1093	calc_f1_pre	0x78, %esi, %edi, %edx, %ebx
1094	calc_f1_post	%esi, %eax, %ebx
1095.endm
1096
1097.macro	calc95
1098	calc_f1_pre	0x7c, %ebx, %esi, %eax, %ecx
1099	precalc39	%ymm12, 0x40, 0x160
1100	calc_f1_post	%ebx, %edi, %ecx
1101.endm
1102
1103.macro	calc96
1104	calc_f1_pre	0x90, %ecx, %ebx, %edi, %edx
1105	precalc32	%ymm13, %ymm12
1106	calc_f1_post	%ecx, %esi, %edx
1107.endm
1108
1109.macro	calc97
1110	calc_f1_pre	0x94, %edx, %ecx, %esi, %eax
1111	precalc33	%ymm7, %ymm8
1112	calc_f1_post	%edx, %ebx, %eax
1113.endm
1114
1115.macro	calc98
1116	calc_f1_pre	0x98, %eax, %edx, %ebx, %edi
1117	precalc34	%ymm15
1118	calc_f1_post	%eax, %ecx, %edi
1119.endm
1120
/*
 * Rounds 19-38 of the second block (F2/parity for rounds 20-39),
 * mirroring calc19-calc38 with the second block's W+K offsets.
 */
.macro	calc99
	calc_f2_pre	0x9c, %edi, %eax, %esi
	precalc35	%ymm8
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc100
	calc_f2_pre	0xb0, %esi, %edi, %ebx
	precalc36	%ymm8
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc101
	calc_f2_pre	0xb4, %ebx, %esi, %ecx
	precalc37	%ymm8
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc102
	calc_f2_pre	0xb8, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc103
	calc_f2_pre	0xbc, %edx, %ecx, %eax
	precalc39	%ymm8, 0x40, 0x180
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc104
	calc_f2_pre	0xd0, %eax, %edx, %edi
	precalc32	%ymm12, %ymm8
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc105
	calc_f2_pre	0xd4, %edi, %eax, %esi
	precalc33	%ymm5, %ymm7
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc106
	calc_f2_pre	0xd8, %esi, %edi, %ebx
	precalc34	%ymm14
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc107
	calc_f2_pre	0xdc, %ebx, %esi, %ecx
	precalc35	%ymm7
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc108
	calc_f2_pre	0xf0, %ecx, %ebx, %edx
	precalc36	%ymm7
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc109
	calc_f2_pre	0xf4, %edx, %ecx, %eax
	precalc37	%ymm7
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc110
	calc_f2_pre	0xf8, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc111
	calc_f2_pre	0xfc, %edi, %eax, %esi
	precalc39	%ymm7, 0x40, 0x1a0
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc112
	calc_f2_pre	0x110, %esi, %edi, %ebx
	precalc32	%ymm8, %ymm7
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc113
	calc_f2_pre	0x114, %ebx, %esi, %ecx
	precalc33	%ymm3, %ymm5
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc114
	calc_f2_pre	0x118, %ecx, %ebx, %edx
	precalc34	%ymm13
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc115
	calc_f2_pre	0x11c, %edx, %ecx, %eax
	precalc35	%ymm5
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc116
	calc_f2_pre	0x130, %eax, %edx, %edi
	precalc36	%ymm5
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc117
	calc_f2_pre	0x134, %edi, %eax, %esi
	precalc37	%ymm5
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc118
	calc_f2_pre	0x138, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm
1237
/*
 * Rounds 119--138: f3 rounds for the second block of the pair.
 * calc_f3_pre takes the WK scratch-area byte offset (relative to
 * %r15) and a scratch register; calc_f3_post takes the five
 * rotating state registers.  As above, precalc32--precalc39 steps
 * are interleaved to keep the vector message-schedule computation
 * for a future block pair flowing alongside the scalar rounds.
 */
.macro	calc119
	calc_f3_pre	0x13c, %ecx
	precalc39	%ymm5, 0x40, 0x1c0
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc120
	calc_f3_pre	0x150, %edx
	precalc32	%ymm7, %ymm5
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc121
	calc_f3_pre	0x154, %eax
	precalc33	%ymm15, %ymm3
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc122
	calc_f3_pre	0x158, %edi
	precalc34	%ymm12
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc123
	calc_f3_pre	0x15c, %esi
	precalc35	%ymm3
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc124
	calc_f3_pre	0x170, %ebx
	precalc36	%ymm3
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc125
	calc_f3_pre	0x174, %ecx
	precalc37	%ymm3
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc126
	calc_f3_pre	0x178, %edx
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc127
	calc_f3_pre	0x17c, %eax
	precalc39	%ymm3, 0x60, 0x1e0
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc128
	calc_f3_pre	0x190, %edi
	precalc32	%ymm5, %ymm3
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc129
	calc_f3_pre	0x194, %esi
	precalc33	%ymm14, %ymm15
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc130
	calc_f3_pre	0x198, %ebx
	precalc34	%ymm8
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc131
	calc_f3_pre	0x19c, %ecx
	precalc35	%ymm15
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc132
	calc_f3_pre	0x1b0, %edx
	precalc36	%ymm15
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm

.macro	calc133
	calc_f3_pre	0x1b4, %eax
	precalc37	%ymm15
	calc_f3_post	%edx, %ebx, %esi, %eax, %ecx
.endm

.macro	calc134
	calc_f3_pre	0x1b8, %edi
	calc_f3_post	%eax, %ecx, %ebx, %edi, %edx
.endm

.macro	calc135
	calc_f3_pre	0x1bc, %esi
	precalc39	%ymm15, 0x60, 0x200
	calc_f3_post	%edi, %edx, %ecx, %esi, %eax
.endm

.macro	calc136
	calc_f3_pre	0x1d0, %ebx
	precalc32	%ymm3, %ymm15
	calc_f3_post	%esi, %eax, %edx, %ebx, %edi
.endm

.macro	calc137
	calc_f3_pre	0x1d4, %ecx
	precalc33	%ymm13, %ymm14
	calc_f3_post	%ebx, %edi, %eax, %ecx, %esi
.endm

.macro	calc138
	calc_f3_pre	0x1d8, %edx
	precalc34	%ymm7
	calc_f3_post	%ecx, %esi, %edi, %edx, %ebx
.endm
1355
/*
 * Rounds 139--159: the closing f2 rounds of the second block.
 * Same structure as calc102--calc118: calc_f2_pre's first argument
 * is the byte offset of the precomputed W[i]+K word at %r15, and
 * precalc steps for a future block pair are interleaved.
 *
 * calc159 is the very last round before update_hash and is written
 * out inline: only the WK add, the a<<<5 rotate (rorx into %r12d),
 * and the accumulation into the new e (%esi) remain; the rest of
 * the usual calc_f2_post work is not needed.
 */
.macro	calc139
	calc_f2_pre	0x1dc, %edx, %ecx, %eax
	precalc35	%ymm14
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc140
	calc_f2_pre	0x1f0, %eax, %edx, %edi
	precalc36	%ymm14
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc141
	calc_f2_pre	0x1f4, %edi, %eax, %esi
	precalc37	%ymm14
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc142
	calc_f2_pre	0x1f8, %esi, %edi, %ebx
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc143
	calc_f2_pre	0x1fc, %ebx, %esi, %ecx
	precalc39	%ymm14, 0x60, 0x220
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc144
	calc_f2_pre	0x210, %ecx, %ebx, %edx
	precalc32	%ymm15, %ymm14
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc145
	calc_f2_pre	0x214, %edx, %ecx, %eax
	precalc33	%ymm12, %ymm13
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc146
	calc_f2_pre	0x218, %eax, %edx, %edi
	precalc34	%ymm5
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc147
	calc_f2_pre	0x21c, %edi, %eax, %esi
	precalc35	%ymm13
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc148
	calc_f2_pre	0x230, %esi, %edi, %ebx
	precalc36	%ymm13
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc149
	calc_f2_pre	0x234, %ebx, %esi, %ecx
	precalc37	%ymm13
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc150
	calc_f2_pre	0x238, %ecx, %ebx, %edx
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc151
	calc_f2_pre	0x23c, %edx, %ecx, %eax
	precalc39	%ymm13, 0x60, 0x240
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc152
	calc_f2_pre	0x250, %eax, %edx, %edi
	precalc32	%ymm14, %ymm13
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc153
	calc_f2_pre	0x254, %edi, %eax, %esi
	precalc33	%ymm8, %ymm12
	calc_f2_post	%edi, %edx, %ecx, %esi
.endm

.macro	calc154
	calc_f2_pre	0x258, %esi, %edi, %ebx
	precalc34	%ymm3
	calc_f2_post	%esi, %eax, %edx, %ebx
.endm

.macro	calc155
	calc_f2_pre	0x25c, %ebx, %esi, %ecx
	precalc35	%ymm12
	calc_f2_post	%ebx, %edi, %eax, %ecx
.endm

.macro	calc156
	calc_f2_pre	0x270, %ecx, %ebx, %edx
	precalc36	%ymm12
	calc_f2_post	%ecx, %esi, %edi, %edx
.endm

.macro	calc157
	calc_f2_pre	0x274, %edx, %ecx, %eax
	precalc37	%ymm12
	calc_f2_post	%edx, %ebx, %esi, %eax
.endm

.macro	calc158
	calc_f2_pre	0x278, %eax, %edx, %edi
	calc_f2_post	%eax, %ecx, %ebx, %edi
.endm

.macro	calc159
	add		0x27c(%r15), %esi	// e += W[79] + K
	add		%eax, %esi		// e += f2(b, c, d) (already in %eax)
	rorx		$0x1b, %edi, %r12d	// %r12d = a <<< 5
	precalc39	%ymm12, 0x60, 0x260
	add		%r12d, %esi		// e += a <<< 5
.endm
1480
	// sha1block(SHA1_CTX, buf, len)
	//
	// AVX2 two-block interleaved implementation: while the 80
	// scalar rounds for one pair of blocks run, the message
	// schedule W[i]+K ("WK") for the next pair is precomputed with
	// AVX2 into one of two stack scratch areas.
	//
	// Register use:
	//	%r8	address of k_xmm_ar (round constants); its value
	//		also doubles as the end-of-input sentinel stored
	//		into %r10/%r13 on the last iteration
	//	%r9	SHA1_CTX pointer
	//	%r10	current input block pointer
	//	%r11	one block past the end of the input
	//	%r13	pointer to the second block of the current pair
	//	%r14/%r15 the two WK scratch areas, exchanged after
	//		each pair of blocks
	//	the five hash words rotate through %eax, %ebx, %ecx,
	//	%edx, %esi, and %edi; %r12d is round scratch
ENTRY(_libmd_sha1block_avx2)
	push		%rbx
	push		%rbp
	push		%r12
	push		%r13
	push		%r14
	push		%r15
	sub		$1408+8, %rsp			// WK scratch + 16-byte alignment

	and		$~63, %rdx			// round len down to a block-size multiple
	lea		k_xmm_ar(%rip), %r8
	mov		%rdi, %r9
	mov		%rsi, %r10
	lea		64(%rsi), %r13			// second block of the first pair
	lea		64(%rsi, %rdx), %r11
	cmp		%r11, %r13			// second block past the end?
	cmovae		%r8, %r13			// then precalc from the constant table instead
	vmovdqu		bswap_shufb_ctl(%rip), %ymm10	// byte-swap shuffle mask

	mov		(%r9), %ecx			// load hash state h0--h4
	mov		4(%r9), %esi
	mov		8(%r9), %edi
	mov		12(%r9), %eax
	mov		16(%r9), %edx
	mov		%rsp, %r14			// first WK scratch area
	lea		2*4*80+32(%rsp), %r15		// second WK scratch area
	precalc						// precalc WK for first 2 blocks
	xchg		%r14, %r15

	// this is unrolled
.Loop:	cmp		%r8, %r10			// we use the value of R8 (set below)
							// as a signal of the last block
	jne		.Lbegin
	add		$1408+8, %rsp
	pop		%r15
	pop		%r14
	pop		%r13
	pop		%r12
	pop		%rbp
	pop		%rbx
	vzeroupper					// avoid AVX/SSE transition penalty in caller
	ret

.Lbegin:
	calc0
	calc1
	calc2
	calc3
	calc4
	calc5
	calc6
	calc7
	calc8
	calc9
	calc10
	calc11
	calc12
	calc13
	calc14
	calc15
	calc16
	calc17
	calc18
	calc19
	calc20
	calc21
	calc22
	calc23
	calc24
	calc25
	calc26
	calc27
	calc28
	calc29
	calc30
	calc31
	calc32
	calc33
	calc34
	calc35
	calc36
	calc37
	calc38
	calc39
	calc40
	calc41
	calc42
	calc43
	calc44
	calc45
	calc46
	calc47
	calc48
	calc49
	calc50
	calc51
	calc52
	calc53
	calc54
	calc55
	calc56
	calc57
	calc58
	calc59

	add		$128, %r10		// move to the next even-64-byte block
	cmp		%r11, %r10		// is the current block the last one?
	cmovae		%r8, %r10		// signal the last iteration smartly

	calc60
	calc61
	calc62
	calc63
	calc64
	calc65
	calc66
	calc67
	calc68
	calc69
	calc70
	calc71
	calc72
	calc73
	calc74
	calc75
	calc76
	calc77
	calc78
	calc79

	update_hash	%eax, %edx, %ebx, %esi, %edi
	cmp		%r8, %r10		// is the current block the last one?
	je		.Loop
	mov		%edx, %ecx		// permute state regs for the second block's rounds

	calc80
	calc81
	calc82
	calc83
	calc84
	calc85
	calc86
	calc87
	calc88
	calc89
	calc90
	calc91
	calc92
	calc93
	calc94
	calc95
	calc96
	calc97
	calc98
	calc99
	calc100
	calc101
	calc102
	calc103
	calc104
	calc105
	calc106
	calc107
	calc108
	calc109
	calc110
	calc111
	calc112
	calc113
	calc114
	calc115
	calc116
	calc117
	calc118
	calc119
	calc120
	calc121
	calc122
	calc123
	calc124
	calc125
	calc126
	calc127
	calc128
	calc129
	calc130
	calc131
	calc132
	calc133
	calc134
	calc135
	calc136
	calc137
	calc138
	calc139

	add		$128, %r13		// move to the next even-64-byte block
	cmp		%r11, %r13		// is the current block the last one?
	cmovae		%r8, %r10		// if so, signal the last iteration via %r10

	calc140
	calc141
	calc142
	calc143
	calc144
	calc145
	calc146
	calc147
	calc148
	calc149
	calc150
	calc151
	calc152
	calc153
	calc154
	calc155
	calc156
	calc157
	calc158
	calc159

	update_hash	%esi, %edi, %edx, %ecx, %ebx
	mov		%esi, %r12d		// reset state for AVX2 reg permutation
	mov		%edi, %esi
	mov		%edx, %edi
	mov		%ebx, %edx
	mov		%ecx, %eax
	mov		%r12d, %ecx
	xchg		%r14, %r15		// swap WK scratch areas for the next pair
	jmp		.Loop
END(_libmd_sha1block_avx2)
1713
	.section	.rodata
	.balign		32
	/*
	 * The four SHA-1 round constants K0--K3, each replicated eight
	 * times so that a full 32-byte YMM register can be loaded with
	 * a single broadcast constant per round group.  The address of
	 * this table is also reused as the end-of-input sentinel in
	 * the AVX2 routine above.
	 */
k_xmm_ar:
	.fill		8, 4, 0x5a827999
	.fill		8, 4, 0x6ed9eba1
	.fill		8, 4, 0x8f1bbcdc
	.fill		8, 4, 0xca62c1d6
	.size		k_xmm_ar, .-k_xmm_ar
1722
/*
 * vpshufb control mask that reverses the bytes within each 32-bit
 * lane, converting the big-endian message words to host byte order.
 * The 16-byte pattern is repeated twice to cover both 128-bit
 * halves of a YMM register (loaded into %ymm10 above).
 */
bswap_shufb_ctl:
	.4byte		0x00010203
	.4byte		0x04050607
	.4byte		0x08090a0b
	.4byte		0x0c0d0e0f
	.4byte		0x00010203
	.4byte		0x04050607
	.4byte		0x08090a0b
	.4byte		0x0c0d0e0f
	.size		bswap_shufb_ctl, .-bswap_shufb_ctl
1733
	/*
	 * SHA1 implementation using the Intel SHA extensions (SHANI).
	 *
	 * Implemented according to the Intel white paper
	 *
	 * S. Gulley, V. Gopal, K. Yap, W. Feghali, J. Guilford,
	 * G. Wolrich: "Intel SHA Extensions: new instruction supporting
	 * the Secure Hash Algorithm on Intel® architecture processors",
	 * July 2013.
	 */
	// sha1block(SHA1_CTX, buf, len)
	//
	// Register use: %rdi = SHA1_CTX, %rsi = current input block,
	// %rcx = end of input; %xmm6 = working ABCD (word-reversed),
	// %xmm5/%xmm7 = alternating E + round-input registers,
	// %xmm0--%xmm3 = message schedule, %xmm4 = byte-shuffle mask,
	// %xmm8/%xmm9 = ABCD/E saved at the top of each block.
ENTRY(_libmd_sha1block_shani)
	and		$~63, %rdx		// round length to block-size multiple
	lea		(%rsi, %rdx, 1), %rcx	// end pointer
	test		%rdx, %rdx		// nothing to do?
	je		1f			// if so, terminate immediately

	movdqu		(%rdi), %xmm6		// h0, h1, h2, h3
	pxor		%xmm7, %xmm7
	pshufd		$0x1b, %xmm6, %xmm6	// h3, h2, h1, h0
	pinsrd		$3, 16(%rdi), %xmm7	// h4 in the highest word of xmm7
	movdqu		shuf_mask(%rip), %xmm4	// message byte-shuffle mask

	// main loop
0:	movdqa		%xmm6, %xmm8		// stash ABCD
	movdqa		%xmm7, %xmm9		// stash E

	// rounds 0--3
	movdqu		0*16(%rsi), %xmm0	// load first message block
	pshufb		%xmm4, %xmm0		// and byte-swap
	paddd		%xmm0, %xmm7		// E += w[0]
	movdqa		%xmm6, %xmm5		// E' = A
	sha1rnds4	$0, %xmm7, %xmm6	// perform rounds 0--3

	// rounds 4--7
	movdqu		1*16(%rsi), %xmm1
	pshufb		%xmm4, %xmm1
	sha1nexte	%xmm1, %xmm5
	movdqa		%xmm6, %xmm7
	sha1rnds4	$0, %xmm5, %xmm6
	sha1msg1	%xmm1, %xmm0

	// rounds 8--11
	movdqu		2*16(%rsi), %xmm2
	pshufb		%xmm4, %xmm2
	sha1nexte	%xmm2, %xmm7
	movdqa		%xmm6, %xmm5
	sha1rnds4	$0, %xmm7, %xmm6
	sha1msg1	%xmm2, %xmm1
	pxor		%xmm2, %xmm0

	// One middle four-round group (rounds 12--67): fold \msg3 into
	// the next E (\e1), run four rounds with round-constant
	// selector \k, and advance the message schedule
	// (sha1msg2/sha1msg1/pxor) for the following groups.
.macro	midround	msg3, msg0, msg1, msg2, e1, e0, k
	sha1nexte	\msg3, \e1
	movdqa		%xmm6, \e0
	sha1msg2	\msg3, \msg0
	sha1rnds4	$\k, \e1, %xmm6
	sha1msg1	\msg3, \msg2
	pxor		\msg3, \msg1
.endm

	movdqu		3*16(%rsi), %xmm3	// load fourth message block
	pshufb		%xmm4, %xmm3

	add		$4*16, %rsi		// advance to the next block

	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 0	// 12--15
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 0	// 16--19
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 20--23
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 1	// 24--27
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 1	// 28--31
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 1	// 32--35
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 1	// 36--39
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 40--43
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 2	// 44--47
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 2	// 48--51
	midround	%xmm1, %xmm2, %xmm3, %xmm0, %xmm5, %xmm7, 2	// 52--55
	midround	%xmm2, %xmm3, %xmm0, %xmm1, %xmm7, %xmm5, 2	// 56--59
	midround	%xmm3, %xmm0, %xmm1, %xmm2, %xmm5, %xmm7, 3	// 60--63
	midround	%xmm0, %xmm1, %xmm2, %xmm3, %xmm7, %xmm5, 3	// 64--67

	// rounds 68--71
	sha1nexte	%xmm1, %xmm5
	movdqa		%xmm6, %xmm7
	sha1msg2	%xmm1, %xmm2
	sha1rnds4	$3, %xmm5, %xmm6
	pxor		%xmm1, %xmm3

	// rounds 72--75
	sha1nexte	%xmm2, %xmm7
	movdqa		%xmm6, %xmm5
	sha1msg2	%xmm2, %xmm3
	sha1rnds4	$3, %xmm7, %xmm6

	// rounds 76--79
	sha1nexte	%xmm3, %xmm5
	movdqa		%xmm6, %xmm7
	sha1rnds4	$3, %xmm5, %xmm6

	sha1nexte	%xmm9, %xmm7		// add saved E
	paddd		%xmm8, %xmm6		// add saved ABCD

	cmp		%rsi, %rcx		// end reached?
	jne		0b

	pshufd		$0x1b, %xmm6, %xmm6	// restore order of h0--h3
	movdqu		%xmm6, (%rdi)		// write h0--h3
	pextrd		$3, %xmm7, 16(%rdi)	// write h4
1:	ret
END(_libmd_sha1block_shani)
1843
	.section	.rodata
	.balign		16
	/*
	 * pshufb control mask that reverses all 16 bytes of each
	 * 16-byte message group: this both byte-swaps every 32-bit
	 * word to host order and reverses the word order to the
	 * layout the SHANI instructions operate on.
	 */
shuf_mask:
	.8byte		0x08090a0b0c0d0e0f
	.8byte		0x0001020304050607
	.size		shuf_mask, .-shuf_mask
1850
1851	.section .note.GNU-stack,"",%progbits
1852