
/* SPDX-License-Identifier: GPL-2.0-or-later */

/*
 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
 * SSE3 (SSSE3) instruction set extensions.
 *
 * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorith…
 */
/* we keep a window of 64 pre-calculated w[i]+K values in a circular buffer */
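A minimal C model of that window, assuming the 64-entry size the comment states; wk_store, wk_load, and WK_MASK are invented names, and the real code addresses the buffer through its WK() macro instead:

#include <stdint.h>

#define WK_MASK 63	/* 64-entry circular window */

static uint32_t wk_buf[WK_MASK + 1];

/* store a pre-calculated w[t]+K value, overwriting the slot from 64 rounds ago */
static inline void wk_store(int t, uint32_t w_plus_k)
{
	wk_buf[t & WK_MASK] = w_plus_k;
}

/* load it back when the scalar rounds catch up to round t */
static inline uint32_t wk_load(int t)
{
	return wk_buf[t & WK_MASK];
}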
/*
 * This macro implements the SHA-1 function's body for a single 64-byte block.
 */

/*
 * This macro implements 80 rounds of SHA-1 for one 64-byte block.
 */
	mov	12(HASH_PTR), D		# load the fourth state word, hash[3]
	RR F1,A,B,C,D,E,0
	RR F1,D,E,A,B,C,2
	RR F1,B,C,D,E,A,4
	RR F1,E,A,B,C,D,6
	RR F1,C,D,E,A,B,8

	RR F1,A,B,C,D,E,10
	RR F1,D,E,A,B,C,12
	RR F1,B,C,D,E,A,14
	RR F1,E,A,B,C,D,16
	RR F1,C,D,E,A,B,18

	RR F2,A,B,C,D,E,20
	RR F2,D,E,A,B,C,22
	RR F2,B,C,D,E,A,24
	RR F2,E,A,B,C,D,26
	RR F2,C,D,E,A,B,28

	RR F2,A,B,C,D,E,30
	RR F2,D,E,A,B,C,32
	RR F2,B,C,D,E,A,34
	RR F2,E,A,B,C,D,36
	RR F2,C,D,E,A,B,38

	RR F3,A,B,C,D,E,40
	RR F3,D,E,A,B,C,42
	RR F3,B,C,D,E,A,44
	RR F3,E,A,B,C,D,46
	RR F3,C,D,E,A,B,48

	RR F3,A,B,C,D,E,50
	RR F3,D,E,A,B,C,52
	RR F3,B,C,D,E,A,54
	RR F3,E,A,B,C,D,56
	RR F3,C,D,E,A,B,58

	add	$64, BUFFER_PTR		# move to the next 64-byte block

	RR F4,A,B,C,D,E,60
	RR F4,D,E,A,B,C,62
	RR F4,B,C,D,E,A,64
	RR F4,E,A,B,C,D,66
	RR F4,C,D,E,A,B,68

	RR F4,A,B,C,D,E,70
	RR F4,D,E,A,B,C,72
	RR F4,B,C,D,E,A,74
	RR F4,E,A,B,C,D,76
	RR F4,C,D,E,A,B,78
	UPDATE_HASH 12(HASH_PTR), D	# hash[3] += d
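Condensed into C, the 80 rounds plus the final UPDATE_HASH behave like the sketch below (my naming throughout; wk[] is assumed to already hold the w[i]+K values from the pre-compute):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

/* one 64-byte block: four groups of 20 rounds, then the state update */
static void sha1_block(uint32_t h[5], const uint32_t wk[80])
{
	uint32_t a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];

	for (int i = 0; i < 80; i++) {
		uint32_t f = (i < 20) ? (d ^ (b & (c ^ d)))	  /* F1 */
			   : (i < 40) ? (b ^ c ^ d)		  /* F2 */
			   : (i < 60) ? ((b & c) | (d & (b | c))) /* F3 */
			   :		(b ^ c ^ d);		  /* F4 */
		uint32_t t = rol32(a, 5) + f + e + wk[i];

		e = d; d = c; c = rol32(b, 30); b = a; a = t;
	}
	/* UPDATE_HASH: fold the working variables back into the state */
	h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
}

The register shuffle at the end of each C iteration is exactly what the asm avoids: the round listing above rotates the register names between RR calls instead of moving data.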
.set D, REG_D			# registers are referenced through renamable aliases
	mov	D, REG_D	# restore the (possibly swapped) alias to its home register
.macro F1 b, c, d
	mov	\c, T1
	SWAP_REG_NAMES \c, T1
	xor	\d, T1
	and	\b, T1
	xor	\d, T1			# T1 = (b & c) | (~b & d)
.endm
.macro F2 b, c, d
	mov	\d, T1
	SWAP_REG_NAMES \d, T1
	xor	\c, T1
	xor	\b, T1			# T1 = b ^ c ^ d
.endm
.macro F3 b, c, d			# majority: (b & c) | (d & (b | c))
	and	\d, T1
.macro F4 b, c, d
	F2	\b, \c, \d		# F4 is the same parity function as F2
.endm
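In scalar C, the four macros compute the standard SHA-1 round functions; a sketch with invented names:

#include <stdint.h>

static inline uint32_t f1(uint32_t b, uint32_t c, uint32_t d)
{
	return d ^ (b & (c ^ d));	/* choice: (b & c) | (~b & d) */
}

static inline uint32_t f2(uint32_t b, uint32_t c, uint32_t d)
{
	return b ^ c ^ d;		/* parity */
}

static inline uint32_t f3(uint32_t b, uint32_t c, uint32_t d)
{
	return (b & c) | (d & (b | c));	/* majority */
}

static inline uint32_t f4(uint32_t b, uint32_t c, uint32_t d)
{
	return f2(b, c, d);		/* F4 reuses F2, as the asm does */
}

Note that f1 uses the same xor/and/xor formulation as the F1 macro: it needs one temporary and no NOT instruction.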
/*
 * RR does two rounds of SHA-1 back to back with W[] pre-calc
 *   t1 = F(b, c, d);   e += w(i)
 *   e += t1;           b <<= 30;   d += w(i+1);
 *   t1 = F(a, b, c);
 *   d += t1;           a <<= 5;
 *   e += a;
 *   t1 = e;            a >>= 7;
 *   t1 <<= 5;
 *   d += t1;
 */
.macro RR F, a, b, c, d, e, round
	\F	\b, \c, \d		# t1 = F(b, c, d);
	add	WK(\round + 1), \d
	add	T1, \d			# d += t1, t1 = F(a, b, c)
	add	T1, \d			# d += t1, t1 = e rol 5
	# rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
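One RR invocation therefore performs two fused rounds; a C sketch, where f stands in for the \F argument and wk[] for the WK() buffer:

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

static void rr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d, uint32_t *e,
	       uint32_t (*f)(uint32_t, uint32_t, uint32_t),
	       const uint32_t *wk, int round)
{
	*e += wk[round] + f(*b, *c, *d) + rol32(*a, 5);		/* round i */
	*b  = rol32(*b, 30);
	*d += wk[round + 1] + f(*a, *b, *c) + rol32(*e, 5);	/* round i+1 */
	*a  = rol32(*a, 30);
}

The caller then rotates the arguments for the next invocation (a<=d, b<=e, c<=a, d<=b, e<=c), as in the round listing above, so no data ever moves between registers.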
	.set i, ((\r) % 80)		# pre-compute for the next iteration
	.elseif (i < 80)		// rounds 32-79
/* message scheduling pre-compute for rounds 0-15 */
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
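For rounds 0-15 there is no recurrence yet: the pre-compute only byte-swaps the big-endian message words and adds the round constant. A scalar sketch, with load_be32 and K1 as illustrative names:

#include <stdint.h>

#define K1 0x5a827999u		/* SHA-1 constant for rounds 0-19 */

static inline uint32_t load_be32(const uint8_t *p)
{
	return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
	       ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

static void precalc_00_15(const uint8_t block[64],
			  uint32_t w[16], uint32_t wk[16])
{
	for (int i = 0; i < 16; i++) {
		w[i]  = load_be32(block + 4 * i); /* byte swap (pshufb) */
		wk[i] = w[i] + K1;		  /* paddd; store to WK(i&~3) */
	}
}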
/* message scheduling pre-compute for rounds 16-31
 * - calculating last 32 w[i] values in 8 XMM registers
 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
 *
 * some "heavy-lifting" vectorization for rounds 16-31 due to the
 * w[i]->w[i-3] dependency, but it improves throughput for rounds 32-79
 */
# blended scheduling of the vector and scalar instruction streams, one 4-wide
# vector iteration per 4 scalar rounds
	.if ((i & 3) == 0)
		palignr	$8, W_minus_16, W	# w[i-14]
		psrldq	$4, W_TMP1		# w[i-3]
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
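What the 4-wide code above produces is the plain FIPS 180 recurrence; in scalar C (invented names, k being whichever round constant applies, with w[0..15] already filled in):

#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

static void precalc_16_31(uint32_t w[32], uint32_t wk[32], uint32_t k)
{
	for (int i = 16; i < 32; i++) {
		w[i]  = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);
		wk[i] = w[i] + k;	/* stored to WK(i & ~3) in groups of 4 */
	}
}

The w[i-3] term is the awkward part for vectorization: the last lane of each 4-wide group needs the first lane of the same group, which the vector code patches up before the store to WK().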
/* message scheduling pre-compute for rounds 32-79
 *
 * in the SHA-1 specification:        w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
 * instead we compute the equivalent: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
 * this allows more efficient vectorization, since the w[i]=>w[i-3]
 * dependency is broken
 */
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		movdqa	W_TMP1, WK(i&~3)
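Since rotation distributes over XOR, substituting the recurrence into itself cancels the cross terms and yields the rol 2 form for i >= 32. A self-checking C sketch (seed is any 16-word schedule prefix):

#include <assert.h>
#include <stdint.h>

static inline uint32_t rol32(uint32_t x, int n)
{
	return (x << n) | (x >> (32 - n));
}

static void check_equivalence(const uint32_t seed[16])
{
	uint32_t w[80];

	for (int i = 0; i < 16; i++)
		w[i] = seed[i];
	for (int i = 16; i < 80; i++)	/* specification form */
		w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);

	for (int i = 32; i < 80; i++)	/* derived form, valid once i >= 32 */
		assert(w[i] == rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));
}

The derived form reaches back no closer than w[i-6], so all four inputs of a 4-wide group come from earlier groups and the intra-group fix-up disappears.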
/* the AVX variants of the same pre-compute steps follow */
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		vmovdqa	W_TMP1, WK(i&~3)
	.if ((i & 3) == 0)
		vpalignr $8, W_minus_16, W_minus_12, W	# w[i-14]
		vpsrldq	$4, W_minus_04, W_TMP1		# w[i-3]
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		vmovdqu	W_TMP1, WK(i&~3)
	.if ((i & 3) == 0)
	.elseif ((i & 3) == 1)
	.elseif ((i & 3) == 2)
	.elseif ((i & 3) == 3)
		vmovdqu	W_TMP1, WK(i&~3)
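The AVX path computes the same schedule; the gain is the VEX three-operand encoding, which drops the register copies that two-operand SSSE3 instructions such as palignr force (note the vpalignr above reads W_minus_12 and W_minus_16 and writes W directly). An intrinsics sketch of that w[i-14] alignment step, with an invented function name:

#include <immintrin.h>

/* forms { w[i-14], w[i-13], w[i-12], w[i-11] }: the byte-wise
 * concatenation (w_m12 : w_m16) shifted right by 8 bytes */
static __m128i w_minus_14(__m128i w_m16, __m128i w_m12)
{
	return _mm_alignr_epi8(w_m12, w_m16, 8);
}

With SSSE3 the same step needs a movdqa of W_minus_12 into W before the destructive palignr; with AVX, vpalignr reads both sources and leaves them intact.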