Lines Matching +full:a +full:- +full:b

1 /* SPDX-License-Identifier: GPL-2.0-or-later */
3 * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental
17 …* http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorith…
61 /* we keep a window of 64 w[i]+K pre-calculated values in a circular buffer */
66 * This macro implements the SHA-1 function's body for a single 64-byte block
108 * This macro implements 80 rounds of SHA-1 for one 64-byte block
113 mov (HASH_PTR), A
114 mov 4(HASH_PTR), B
127 RR F1,A,B,C,D,E,0
128 RR F1,D,E,A,B,C,2
129 RR F1,B,C,D,E,A,4
130 RR F1,E,A,B,C,D,6
131 RR F1,C,D,E,A,B,8
133 RR F1,A,B,C,D,E,10
134 RR F1,D,E,A,B,C,12
135 RR F1,B,C,D,E,A,14
136 RR F1,E,A,B,C,D,16
137 RR F1,C,D,E,A,B,18
139 RR F2,A,B,C,D,E,20
140 RR F2,D,E,A,B,C,22
141 RR F2,B,C,D,E,A,24
142 RR F2,E,A,B,C,D,26
143 RR F2,C,D,E,A,B,28
145 RR F2,A,B,C,D,E,30
146 RR F2,D,E,A,B,C,32
147 RR F2,B,C,D,E,A,34
148 RR F2,E,A,B,C,D,36
149 RR F2,C,D,E,A,B,38
151 RR F3,A,B,C,D,E,40
152 RR F3,D,E,A,B,C,42
153 RR F3,B,C,D,E,A,44
154 RR F3,E,A,B,C,D,46
155 RR F3,C,D,E,A,B,48
157 RR F3,A,B,C,D,E,50
158 RR F3,D,E,A,B,C,52
159 RR F3,B,C,D,E,A,54
160 RR F3,E,A,B,C,D,56
161 RR F3,C,D,E,A,B,58
163 add $64, BUFFER_PTR # move to the next 64-byte block
167 RR F4,A,B,C,D,E,60
168 RR F4,D,E,A,B,C,62
169 RR F4,B,C,D,E,A,64
170 RR F4,E,A,B,C,D,66
171 RR F4,C,D,E,A,B,68
173 RR F4,A,B,C,D,E,70
174 RR F4,D,E,A,B,C,72
175 RR F4,B,C,D,E,A,74
176 RR F4,E,A,B,C,D,76
177 RR F4,C,D,E,A,B,78
179 UPDATE_HASH (HASH_PTR), A
180 UPDATE_HASH 4(HASH_PTR), B
187 jne 1b
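
For reference, a minimal scalar C model of what the 80-round macro above computes for each 64-byte block: the F1/F2/F3/F4 split every 20 rounds and the final UPDATE_HASH additions mirror the listing, while the function name and the plain w[80] schedule are mine (the assembly instead consumes pre-calculated w[i]+K values from its circular buffer).

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    /* One 64-byte block of standard SHA-1: rounds 0-19 use F1 (choose),
     * 20-39 F2 (parity), 40-59 F3 (majority), 60-79 F4 (parity again);
     * the working variables are added back into the hash state at the end,
     * which is what UPDATE_HASH does in the listing. */
    static void sha1_block_c(uint32_t h[5], const uint8_t blk[64])
    {
        static const uint32_t K[4] = { 0x5a827999, 0x6ed9eba1,
                                       0x8f1bbcdc, 0xca62c1d6 };
        uint32_t w[80], a = h[0], b = h[1], c = h[2], d = h[3], e = h[4];
        int i;

        for (i = 0; i < 16; i++)
            w[i] = ((uint32_t)blk[4*i] << 24) | ((uint32_t)blk[4*i+1] << 16) |
                   ((uint32_t)blk[4*i+2] << 8) | blk[4*i+3];
        for (i = 16; i < 80; i++)
            w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);

        for (i = 0; i < 80; i++) {
            uint32_t f, t;

            if (i < 20)
                f = (b & c) | (~b & d);            /* F1 */
            else if (i < 40)
                f = b ^ c ^ d;                     /* F2 */
            else if (i < 60)
                f = (b & c) | (b & d) | (c & d);   /* F3 */
            else
                f = b ^ c ^ d;                     /* F4 */

            t = rol32(a, 5) + f + e + K[i / 20] + w[i];
            e = d; d = c; c = rol32(b, 30); b = a; a = t;
        }

        h[0] += a; h[1] += b; h[2] += c; h[3] += d; h[4] += e;
    }
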
191 .set A, REG_A define
192 .set B, REG_B define
202 mov B, REG_B
204 mov A, REG_A
208 .macro SWAP_REG_NAMES a, b argument
209 .set _T, \a
210 .set \a, \b
211 .set \b, _T
214 .macro F1 b, c, d
218 and \b, T1
222 .macro F2 b, c, d
226 xor \b, T1
229 .macro F3 b, c, d
232 mov \b, T2
233 or \b, T1
239 .macro F4 b, c, d
240 F2 \b, \c, \d
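
The values these macros compute are the standard SHA-1 round functions. A C sketch follows; the exact instruction sequences are only partially visible in this match list, and the reduced forms in the comments are common equivalents, not necessarily the ones used here.

    #include <stdint.h>

    /* Standard SHA-1 round functions produced by the F1..F4 macros. */
    static uint32_t F1(uint32_t b, uint32_t c, uint32_t d)
    { return (b & c) | (~b & d); }            /* choose; == d ^ (b & (c ^ d)) */

    static uint32_t F2(uint32_t b, uint32_t c, uint32_t d)
    { return b ^ c ^ d; }                     /* parity */

    static uint32_t F3(uint32_t b, uint32_t c, uint32_t d)
    { return (b & c) | (b & d) | (c & d); }   /* majority; == (b & c) | (d & (b | c)) */

    static uint32_t F4(uint32_t b, uint32_t c, uint32_t d)
    { return F2(b, c, d); }                   /* same parity function, as the macro shows */
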
249 * RR does two rounds of SHA-1 back to back with W[] pre-calc
250 * t1 = F(b, c, d); e += w(i)
251 * e += t1; b <<= 30; d += w(i+1);
252 * t1 = F(a, b, c);
253 * d += t1; a <<= 5;
254 * e += a;
255 * t1 = e; a >>= 7;
259 .macro RR F, a, b, c, d, e, round
261 \F \b, \c, \d # t1 = F(b, c, d);
263 rol $30, \b
267 \F \a, \b, \c
269 rol $5, \a
270 add \a, \e
272 ror $7, \a # ((a <<r 5) >>r 7) => (a <<r 30)
280 # write: \a, \b
281 # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c
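
A scalar sketch of the two rounds one RR invocation performs and of the register-renaming convention described above. The assembly interleaves these operations differently for scheduling; wk[] stands for the pre-calculated w[i]+K buffer, and the names here are mine.

    #include <stdint.h>

    static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    /* Two SHA-1 rounds back to back: round i updates e and rotates b,
     * round i+1 updates d and rotates a.  Nothing is copied between the
     * five working variables; the caller just renames them for the next
     * pair (a<=d, b<=e, c<=a, d<=b, e<=c), exactly as the RR invocation
     * list above rotates its arguments. */
    static void rr(uint32_t (*F)(uint32_t, uint32_t, uint32_t),
                   uint32_t *a, uint32_t *b, uint32_t *c,
                   uint32_t *d, uint32_t *e,
                   const uint32_t *wk, int i)
    {
        *e += F(*b, *c, *d) + rol32(*a, 5) + wk[i];      /* round i   */
        *b  = rol32(*b, 30);
        *d += F(*a, *b, *c) + rol32(*e, 5) + wk[i + 1];  /* round i+1 */
        *a  = rol32(*a, 30);
    }
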
298 .set i, ((\r) % 80) # pre-compute for the next iteration
305 .elseif (i < 80) // rounds 32-79
346 /* message scheduling pre-compute for rounds 0-15 */
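
In scalar terms, the rounds 0-15 pre-compute just byte-swaps the 16 big-endian message words and folds in the first round constant, so each round only needs a single memory add. A sketch under that assumption (the vector code does this four words at a time; names are mine):

    #include <stdint.h>

    /* wk[] models the circular buffer of pre-calculated w[i]+K values;
     * 0x5a827999 is the SHA-1 constant for rounds 0-19. */
    static void w_precalc_00_15(const uint8_t blk[64], uint32_t w[16], uint32_t wk[16])
    {
        for (int i = 0; i < 16; i++) {
            w[i]  = ((uint32_t)blk[4*i] << 24) | ((uint32_t)blk[4*i+1] << 16) |
                    ((uint32_t)blk[4*i+2] << 8) | blk[4*i+3];
            wk[i] = w[i] + 0x5a827999;
        }
    }
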
361 /* message scheduling pre-compute for rounds 16-31
363 * - calculating last 32 w[i] values in 8 XMM registers
364 * - pre-calculate K+w[i] values and store to mem, for later load by ALU add
367 * some "heavy-lifting" vectorization is needed for rounds 16-31 due to the
368 * w[i]->w[i-3] dependency, but it pays off for rounds 32-79
371 # blended scheduling of vector and scalar instruction streams, one 4-wide
375 palignr $8, W_minus_16, W # w[i-14]
377 psrldq $4, W_TMP1 # w[i-3]
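
What the two extractions above produce, written with SSE intrinsics. Register naming follows the assembly's W_minus_NN convention, each register holding four consecutive w[] words; the function name is mine and the rest of the rounds 16-31 schedule is not shown in this match list.

    #include <tmmintrin.h>    /* SSSE3, for _mm_alignr_epi8; build with -mssse3 */

    static __m128i sched_inputs_16_31(__m128i W_minus_16, __m128i W_minus_12,
                                      __m128i W_minus_04, __m128i *w_i_minus_3)
    {
        /* palignr $8: concatenate and shift -> {w[i-14], w[i-13], w[i-12], w[i-11]} */
        __m128i w_i_minus_14 = _mm_alignr_epi8(W_minus_12, W_minus_16, 8);

        /* psrldq $4: shift right one word -> {w[i-3], w[i-2], w[i-1], 0};
         * the zeroed top lane is why the fourth word of each group needs a
         * separate fix-up later in the macro. */
        *w_i_minus_3 = _mm_srli_si128(W_minus_04, 4);

        return w_i_minus_14;
    }
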
402 /* message scheduling pre-compute for rounds 32-79
404 * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1
405 * instead we use the equivalent:  w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2
406 * which allows more efficient vectorization, since the w[i]=>w[i-3] dependency is broken
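
The equivalence is easy to verify: expanding w[i-3], w[i-8], w[i-14] and w[i-16] once more with the original recurrence makes six cross terms cancel, leaving the rol-2 form for i >= 32. A small self-contained check (illustrative only, not part of the file):

    #include <assert.h>
    #include <stdint.h>
    #include <stdlib.h>

    static uint32_t rol32(uint32_t x, int n) { return (x << n) | (x >> (32 - n)); }

    int main(void)
    {
        uint32_t w[80];

        for (int i = 0; i < 16; i++)
            w[i] = (uint32_t)rand();
        for (int i = 16; i < 80; i++)               /* specification form */
            w[i] = rol32(w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16], 1);

        for (int i = 32; i < 80; i++)               /* transformed form */
            assert(w[i] == rol32(w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32], 2));

        return 0;
    }
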
456 .macro xmm_mov a, b argument
457 movdqu \a,\b
498 vpalignr $8, W_minus_16, W_minus_12, W # w[i-14]
499 vpsrldq $4, W_minus_04, W_TMP1 # w[i-3]
542 .macro xmm_mov a, b argument
543 vmovdqu \a,\b