
#include <stdint.h>
#include <string.h>

#include "../onetimeauth_poly1305.h"
#include "crypto_verify_16.h"
#include "poly1305_sse2.h"
#include "private/common.h"
#include "private/sse2_64_32.h"
#include "utils.h"

#if defined(HAVE_TI_MODE) && defined(HAVE_EMMINTRIN_H)

# ifdef __GNUC__
# pragma GCC target("sse2")
# endif

# include <emmintrin.h>

typedef __m128i xmmi;

# if defined(_MSC_VER)
# define POLY1305_NOINLINE __declspec(noinline)
# elif defined(__clang__) || defined(__GNUC__)
# define POLY1305_NOINLINE __attribute__((noinline))
# else
# define POLY1305_NOINLINE
# endif

# define poly1305_block_size 32

enum poly1305_state_flags_t {
    poly1305_started       = 1,
    poly1305_final_shift8  = 4,
    poly1305_final_shift16 = 8,
    poly1305_final_r2_r    = 16, /* use [r^2,r] for the final block */
    poly1305_final_r_1     = 32  /* use [r,1] for the final block */
};

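/*
 * The accumulator H is kept in two interchangeable forms: three 44-bit
 * limbs (h[3]) for the scalar finalization code, and two interleaved sets
 * of five 26-bit limbs (hh[10]) for the SSE2 code, which runs two parallel
 * accumulators, one per 64-bit lane.
 */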
typedef struct poly1305_state_internal_t {
    union {
        uint64_t h[3];
        uint32_t hh[10];
    } H;                                       /* 40 bytes */
    uint32_t R[5];                             /* 20 bytes */
    uint32_t R2[5];                            /* 20 bytes */
    uint32_t R4[5];                            /* 20 bytes */
    uint64_t pad[2];                           /* 16 bytes */
    uint64_t flags;                            /* 8 bytes */
    unsigned long long leftover;               /* 8 bytes */
    unsigned char buffer[poly1305_block_size]; /* 32 bytes */
} poly1305_state_internal_t;                   /* 164 bytes total */

/*
 * _mm_loadl_epi64() is turned into a simple MOVQ. So, unaligned accesses are
 * totally fine, even though this intrinsic requires a __m128i* input.
 * This confuses dynamic analysis, so force alignment, only in debug mode.
 */
# ifdef DEBUG
static xmmi
_fakealign_mm_loadl_epi64(const void *m)
{
    xmmi tmp;
    memcpy(&tmp, m, 8);

    return _mm_loadl_epi64(&tmp);
}
# define _mm_loadl_epi64(X) _fakealign_mm_loadl_epi64(X)
#endif

/* copy 0-31 bytes */
static inline void
poly1305_block_copy31(unsigned char *dst, const unsigned char *src,
                      unsigned long long bytes)
{
    if (bytes & 16) {
        _mm_store_si128((xmmi *) (void *) dst,
                        _mm_loadu_si128((const xmmi *) (const void *) src));
        src += 16;
        dst += 16;
    }
    if (bytes & 8) {
        memcpy(dst, src, 8);
        src += 8;
        dst += 8;
    }
    if (bytes & 4) {
        memcpy(dst, src, 4);
        src += 4;
        dst += 4;
    }
    if (bytes & 2) {
        memcpy(dst, src, 2);
        src += 2;
        dst += 2;
    }
    if (bytes & 1) {
        *dst = *src;
    }
}

static POLY1305_NOINLINE void
poly1305_init_ext(poly1305_state_internal_t *st, const unsigned char key[32],
                  unsigned long long bytes)
{
    uint32_t *R;
    uint128_t d[3];
    uint64_t r0, r1, r2;
    uint64_t rt0, rt1, rt2, st2, c;
    uint64_t t0, t1;
    unsigned long long i;

    if (!bytes) {
        bytes = ~(unsigned long long) 0;
    }
    /* H = 0 */
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) &st->H.hh[8], _mm_setzero_si128());

    /* clamp key */
    memcpy(&t0, key, 8);
    memcpy(&t1, key + 8, 8);
    r0 = t0 & 0xffc0fffffff;
    t0 >>= 44;
    t0 |= t1 << 20;
    r1 = t0 & 0xfffffc0ffff;
    t1 >>= 24;
    r2 = t1 & 0x00ffffffc0f;

    /* r^1 */
    R    = st->R;
    R[0] = (uint32_t) (r0) & 0x3ffffff;
    R[1] = (uint32_t) ((r0 >> 26) | (r1 << 18)) & 0x3ffffff;
    R[2] = (uint32_t) ((r1 >> 8)) & 0x3ffffff;
    R[3] = (uint32_t) ((r1 >> 34) | (r2 << 10)) & 0x3ffffff;
    R[4] = (uint32_t) ((r2 >> 16));

    /* save pad */
    memcpy(&st->pad[0], key + 16, 8);
    memcpy(&st->pad[1], key + 24, 8);

    rt0 = r0;
    rt1 = r1;
    rt2 = r2;

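    /*
     * Precompute r^2 and r^4: the vectorized block function multiplies the
     * accumulator by [r^4,r^4] while folding in new blocks scaled by
     * [r^2,r^2], absorbing four 16-byte blocks per iteration. Short inputs
     * (bytes is the expected total length) skip the powers they will not
     * need.
     */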
    /* r^2, r^4 */
    for (i = 0; i < 2; i++) {
        if (i == 0) {
            R = st->R2;
            if (bytes <= 16) {
                break;
            }
        } else if (i == 1) {
            R = st->R4;
            if (bytes < 96) {
                break;
            }
        }
        st2 = rt2 * (5 << 2);

        d[0] = ((uint128_t) rt0 * rt0) + ((uint128_t) (rt1 * 2) * st2);
        d[1] = ((uint128_t) rt2 * st2) + ((uint128_t) (rt0 * 2) * rt1);
        d[2] = ((uint128_t) rt1 * rt1) + ((uint128_t) (rt2 * 2) * rt0);

        rt0 = (uint64_t) d[0] & 0xfffffffffff;
        c   = (uint64_t) (d[0] >> 44);
        d[1] += c;

        rt1 = (uint64_t) d[1] & 0xfffffffffff;
        c   = (uint64_t) (d[1] >> 44);
        d[2] += c;

        rt2 = (uint64_t) d[2] & 0x3ffffffffff;
        c   = (uint64_t) (d[2] >> 42);
        rt0 += c * 5;
        c   = (rt0 >> 44);
        rt0 = rt0 & 0xfffffffffff;
        rt1 += c;
        c   = (rt1 >> 44);
        rt1 = rt1 & 0xfffffffffff;
        rt2 += c; /* even if rt2 overflows, it will still fit in rp4 safely,
                     and is safe to multiply with */

        R[0] = (uint32_t) (rt0) & 0x3ffffff;
        R[1] = (uint32_t) ((rt0 >> 26) | (rt1 << 18)) & 0x3ffffff;
        R[2] = (uint32_t) ((rt1 >> 8)) & 0x3ffffff;
        R[3] = (uint32_t) ((rt1 >> 34) | (rt2 << 10)) & 0x3ffffff;
        R[4] = (uint32_t) ((rt2 >> 16));
    }
    st->flags    = 0;
    st->leftover = 0U;
}

static POLY1305_NOINLINE void
poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    CRYPTO_ALIGN(64)
    xmmi HIBIT =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(1 << 24), _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi MMASK = _mm_shuffle_epi32(_mm_cvtsi32_si128((1 << 26) - 1),
                                         _MM_SHUFFLE(1, 0, 1, 0));
    const xmmi FIVE =
        _mm_shuffle_epi32(_mm_cvtsi32_si128(5), _MM_SHUFFLE(1, 0, 1, 0));
    xmmi H0, H1, H2, H3, H4;
    xmmi T0, T1, T2, T3, T4, T5, T6, T7, T8;
    xmmi M0, M1, M2, M3, M4;
    xmmi M5, M6, M7, M8;
    xmmi C1, C2;
    xmmi R20, R21, R22, R23, R24, S21, S22, S23, S24;
    xmmi R40, R41, R42, R43, R44, S41, S42, S43, S44;

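    /*
     * HIBIT is the 2^128 bit Poly1305 appends to every full 16-byte block
     * (bit 24 of the fifth 26-bit limb, replicated in both 64-bit lanes).
     * For the final, padded blocks the shift flags clear it from one or both
     * lanes, since a partial block gets an explicit 0x01 padding byte
     * instead.
     */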
    if (st->flags & poly1305_final_shift8) {
        HIBIT = _mm_srli_si128(HIBIT, 8);
    }
    if (st->flags & poly1305_final_shift16) {
        HIBIT = _mm_setzero_si128();
    }
    if (!(st->flags & poly1305_started)) {
        /* H = [Mx,My] */
        T5 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
        T6 = _mm_unpacklo_epi64(
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
            _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
        H0 = _mm_and_si128(MMASK, T5);
        H1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
        H2 = _mm_and_si128(MMASK, T5);
        H3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
        H4 = _mm_srli_epi64(T6, 40);
        H4 = _mm_or_si128(H4, HIBIT);
        m += 32;
        bytes -= 32;
        st->flags |= poly1305_started;
    } else {
        T0 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[0]);
        T1 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[4]);
        T2 = _mm_loadu_si128((const xmmi *) (const void *) &st->H.hh[8]);
        H0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 0, 0));
        H1 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 2, 2));
        H2 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(1, 1, 0, 0));
        H3 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 3, 2, 2));
        H4 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(1, 1, 0, 0));
    }
    if (st->flags & (poly1305_final_r2_r | poly1305_final_r_1)) {
        if (st->flags & poly1305_final_r2_r) {
            /* use [r^2, r] */
            T2  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T3  = _mm_cvtsi32_si128(st->R[4]);
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
            T1  = _mm_cvtsi32_si128(st->R2[4]);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = _mm_unpacklo_epi64(T1, T3);
        } else {
            /* use [r^1, 1] */
            T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R[0]);
            T1  = _mm_cvtsi32_si128(st->R[4]);
            T2  = _mm_cvtsi32_si128(1);
            T4  = _mm_unpacklo_epi32(T0, T2);
            T5  = _mm_unpackhi_epi32(T0, T2);
            R24 = T1;
        }
        R20 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(1, 1, 0, 0));
        R21 = _mm_shuffle_epi32(T4, _MM_SHUFFLE(3, 3, 2, 2));
        R22 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(1, 1, 0, 0));
        R23 = _mm_shuffle_epi32(T5, _MM_SHUFFLE(3, 3, 2, 2));
    } else {
        /* use [r^2, r^2] */
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R2[0]);
        T1  = _mm_cvtsi32_si128(st->R2[4]);
        R20 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R21 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R22 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R23 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R24 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
    }
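    /* S2x = 5*R2x: products that overflow past 2^130 are folded back using
     * 2^130 = 5 (mod 2^130 - 5) */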
    S21 = _mm_mul_epu32(R21, FIVE);
    S22 = _mm_mul_epu32(R22, FIVE);
    S23 = _mm_mul_epu32(R23, FIVE);
    S24 = _mm_mul_epu32(R24, FIVE);

    if (bytes >= 64) {
        T0  = _mm_loadu_si128((const xmmi *) (const void *) &st->R4[0]);
        T1  = _mm_cvtsi32_si128(st->R4[4]);
        R40 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(0, 0, 0, 0));
        R41 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(1, 1, 1, 1));
        R42 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(2, 2, 2, 2));
        R43 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 3, 3, 3));
        R44 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(0, 0, 0, 0));
        S41 = _mm_mul_epu32(R41, FIVE);
        S42 = _mm_mul_epu32(R42, FIVE);
        S43 = _mm_mul_epu32(R43, FIVE);
        S44 = _mm_mul_epu32(R44, FIVE);

        while (bytes >= 64) {
            xmmi v00, v01, v02, v03, v04;
            xmmi v10, v11, v12, v13, v14;
            xmmi v20, v21, v22, v23, v24;
            xmmi v30, v31, v32, v33, v34;
            xmmi v40, v41, v42, v43, v44;
            xmmi T14, T15;

            /* H *= [r^4,r^4], preload [Mx,My] */
            T15 = S42;
            T0  = H4;
            T0  = _mm_mul_epu32(T0, S41);
            v01 = H3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S43;
            T1  = H4;
            T1  = _mm_mul_epu32(T1, T15);
            v11 = H3;
            v11 = _mm_mul_epu32(v11, T14);
            T2  = H4;
            T2  = _mm_mul_epu32(T2, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S44;
            v02 = H2;
            v02 = _mm_mul_epu32(v02, T14);
            T3  = H4;
            T3  = _mm_mul_epu32(T3, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = H1;
            v03 = _mm_mul_epu32(v03, T15);
            v12 = H2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R40;
            v21 = H3;
            v21 = _mm_mul_epu32(v21, T15);
            v31 = H3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            T4  = H4;
            T4  = _mm_mul_epu32(T4, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = H0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = H1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R41;
            v22 = H2;
            v22 = _mm_mul_epu32(v22, T14);
            v32 = H2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = H3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = H0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R42;
            T5  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 0)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 16)));
            v23 = H1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = H1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = H2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R43;
            T6  = _mm_unpacklo_epi64(
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 8)),
                _mm_loadl_epi64((const xmmi *) (const void *) (m + 24)));
            v24 = H0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = H0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            M0  = _mm_and_si128(MMASK, T5);
            v43 = H1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            M1  = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26));
            v44 = H0;
            v44 = _mm_mul_epu32(v44, R44);
            T2  = _mm_add_epi64(T2, v24);
            T5  = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12));
            T3  = _mm_add_epi64(T3, v34);
            M3  = _mm_and_si128(MMASK, _mm_srli_epi64(T6, 14));
            T4  = _mm_add_epi64(T4, v43);
            M2  = _mm_and_si128(MMASK, T5);
            T4  = _mm_add_epi64(T4, v44);
            M4  = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT);

            /* H += [Mx',My'] */
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 32));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 48));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M5 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M6 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M7 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M8 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M6 = _mm_slli_epi64(M6, 6);
            M7 = _mm_slli_epi64(M7, 12);
            M8 = _mm_slli_epi64(M8, 18);
            T0 = _mm_add_epi64(T0, M5);
            T1 = _mm_add_epi64(T1, M6);
            T2 = _mm_add_epi64(T2, M7);
            T3 = _mm_add_epi64(T3, M8);
            T4 = _mm_add_epi64(T4, HIBIT);

            /* H += [Mx,My]*[r^2,r^2] */
            T15 = S22;
            v00 = M4;
            v00 = _mm_mul_epu32(v00, S21);
            v01 = M3;
            v01 = _mm_mul_epu32(v01, T15);
            T14 = S23;
            v10 = M4;
            v10 = _mm_mul_epu32(v10, T15);
            v11 = M3;
            v11 = _mm_mul_epu32(v11, T14);
            T0  = _mm_add_epi64(T0, v00);
            v20 = M4;
            v20 = _mm_mul_epu32(v20, T14);
            T0  = _mm_add_epi64(T0, v01);
            T15 = S24;
            v02 = M2;
            v02 = _mm_mul_epu32(v02, T14);
            T1  = _mm_add_epi64(T1, v10);
            v30 = M4;
            v30 = _mm_mul_epu32(v30, T15);
            T1  = _mm_add_epi64(T1, v11);
            v03 = M1;
            v03 = _mm_mul_epu32(v03, T15);
            T2  = _mm_add_epi64(T2, v20);
            v12 = M2;
            v12 = _mm_mul_epu32(v12, T15);
            T0  = _mm_add_epi64(T0, v02);
            T14 = R20;
            v21 = M3;
            v21 = _mm_mul_epu32(v21, T15);
            T3  = _mm_add_epi64(T3, v30);
            v31 = M3;
            v31 = _mm_mul_epu32(v31, T14);
            T0  = _mm_add_epi64(T0, v03);
            v40 = M4;
            v40 = _mm_mul_epu32(v40, T14);
            T1  = _mm_add_epi64(T1, v12);
            v04 = M0;
            v04 = _mm_mul_epu32(v04, T14);
            T2  = _mm_add_epi64(T2, v21);
            v13 = M1;
            v13 = _mm_mul_epu32(v13, T14);
            T3  = _mm_add_epi64(T3, v31);
            T15 = R21;
            v22 = M2;
            v22 = _mm_mul_epu32(v22, T14);
            T4  = _mm_add_epi64(T4, v40);
            v32 = M2;
            v32 = _mm_mul_epu32(v32, T15);
            T0  = _mm_add_epi64(T0, v04);
            v41 = M3;
            v41 = _mm_mul_epu32(v41, T15);
            T1  = _mm_add_epi64(T1, v13);
            v14 = M0;
            v14 = _mm_mul_epu32(v14, T15);
            T2  = _mm_add_epi64(T2, v22);
            T14 = R22;
            v23 = M1;
            v23 = _mm_mul_epu32(v23, T15);
            T3  = _mm_add_epi64(T3, v32);
            v33 = M1;
            v33 = _mm_mul_epu32(v33, T14);
            T4  = _mm_add_epi64(T4, v41);
            v42 = M2;
            v42 = _mm_mul_epu32(v42, T14);
            T1  = _mm_add_epi64(T1, v14);
            T15 = R23;
            v24 = M0;
            v24 = _mm_mul_epu32(v24, T14);
            T2  = _mm_add_epi64(T2, v23);
            v34 = M0;
            v34 = _mm_mul_epu32(v34, T15);
            T3  = _mm_add_epi64(T3, v33);
            v43 = M1;
            v43 = _mm_mul_epu32(v43, T15);
            T4  = _mm_add_epi64(T4, v42);
            v44 = M0;
            v44 = _mm_mul_epu32(v44, R24);
            T2  = _mm_add_epi64(T2, v24);
            T3  = _mm_add_epi64(T3, v34);
            T4  = _mm_add_epi64(T4, v43);
            T4  = _mm_add_epi64(T4, v44);

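            /*
             * Lazy reduction: the carries below only bring each limb back
             * under roughly 26 bits; the exact reduction mod 2^130 - 5
             * happens once, at finalization.
             */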
            /* reduce */
            C1 = _mm_srli_epi64(T0, 26);
            C2 = _mm_srli_epi64(T3, 26);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_and_si128(T3, MMASK);
            T1 = _mm_add_epi64(T1, C1);
            T4 = _mm_add_epi64(T4, C2);
            C1 = _mm_srli_epi64(T1, 26);
            C2 = _mm_srli_epi64(T4, 26);
            T1 = _mm_and_si128(T1, MMASK);
            T4 = _mm_and_si128(T4, MMASK);
            T2 = _mm_add_epi64(T2, C1);
            T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
            C1 = _mm_srli_epi64(T2, 26);
            C2 = _mm_srli_epi64(T0, 26);
            T2 = _mm_and_si128(T2, MMASK);
            T0 = _mm_and_si128(T0, MMASK);
            T3 = _mm_add_epi64(T3, C1);
            T1 = _mm_add_epi64(T1, C2);
            C1 = _mm_srli_epi64(T3, 26);
            T3 = _mm_and_si128(T3, MMASK);
            T4 = _mm_add_epi64(T4, C1);

            /* Final: H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx',My']) */
            H0 = T0;
            H1 = T1;
            H2 = T2;
            H3 = T3;
            H4 = T4;

            m += 64;
            bytes -= 64;
        }
    }

    if (bytes >= 32) {
        xmmi v01, v02, v03, v04;
        xmmi v11, v12, v13, v14;
        xmmi v21, v22, v23, v24;
        xmmi v31, v32, v33, v34;
        xmmi v41, v42, v43, v44;
        xmmi T14, T15;

        /* H *= [r^2,r^2] */
        T15 = S22;
        T0  = H4;
        T0  = _mm_mul_epu32(T0, S21);
        v01 = H3;
        v01 = _mm_mul_epu32(v01, T15);
        T14 = S23;
        T1  = H4;
        T1  = _mm_mul_epu32(T1, T15);
        v11 = H3;
        v11 = _mm_mul_epu32(v11, T14);
        T2  = H4;
        T2  = _mm_mul_epu32(T2, T14);
        T0  = _mm_add_epi64(T0, v01);
        T15 = S24;
        v02 = H2;
        v02 = _mm_mul_epu32(v02, T14);
        T3  = H4;
        T3  = _mm_mul_epu32(T3, T15);
        T1  = _mm_add_epi64(T1, v11);
        v03 = H1;
        v03 = _mm_mul_epu32(v03, T15);
        v12 = H2;
        v12 = _mm_mul_epu32(v12, T15);
        T0  = _mm_add_epi64(T0, v02);
        T14 = R20;
        v21 = H3;
        v21 = _mm_mul_epu32(v21, T15);
        v31 = H3;
        v31 = _mm_mul_epu32(v31, T14);
        T0  = _mm_add_epi64(T0, v03);
        T4  = H4;
        T4  = _mm_mul_epu32(T4, T14);
        T1  = _mm_add_epi64(T1, v12);
        v04 = H0;
        v04 = _mm_mul_epu32(v04, T14);
        T2  = _mm_add_epi64(T2, v21);
        v13 = H1;
        v13 = _mm_mul_epu32(v13, T14);
        T3  = _mm_add_epi64(T3, v31);
        T15 = R21;
        v22 = H2;
        v22 = _mm_mul_epu32(v22, T14);
        v32 = H2;
        v32 = _mm_mul_epu32(v32, T15);
        T0  = _mm_add_epi64(T0, v04);
        v41 = H3;
        v41 = _mm_mul_epu32(v41, T15);
        T1  = _mm_add_epi64(T1, v13);
        v14 = H0;
        v14 = _mm_mul_epu32(v14, T15);
        T2  = _mm_add_epi64(T2, v22);
        T14 = R22;
        v23 = H1;
        v23 = _mm_mul_epu32(v23, T15);
        T3  = _mm_add_epi64(T3, v32);
        v33 = H1;
        v33 = _mm_mul_epu32(v33, T14);
        T4  = _mm_add_epi64(T4, v41);
        v42 = H2;
        v42 = _mm_mul_epu32(v42, T14);
        T1  = _mm_add_epi64(T1, v14);
        T15 = R23;
        v24 = H0;
        v24 = _mm_mul_epu32(v24, T14);
        T2  = _mm_add_epi64(T2, v23);
        v34 = H0;
        v34 = _mm_mul_epu32(v34, T15);
        T3  = _mm_add_epi64(T3, v33);
        v43 = H1;
        v43 = _mm_mul_epu32(v43, T15);
        T4  = _mm_add_epi64(T4, v42);
        v44 = H0;
        v44 = _mm_mul_epu32(v44, R24);
        T2  = _mm_add_epi64(T2, v24);
        T3  = _mm_add_epi64(T3, v34);
        T4  = _mm_add_epi64(T4, v43);
        T4  = _mm_add_epi64(T4, v44);

        /* H += [Mx,My] */
        if (m) {
            T5 = _mm_loadu_si128((const xmmi *) (const void *) (m + 0));
            T6 = _mm_loadu_si128((const xmmi *) (const void *) (m + 16));
            T7 = _mm_unpacklo_epi32(T5, T6);
            T8 = _mm_unpackhi_epi32(T5, T6);
            M0 = _mm_unpacklo_epi32(T7, _mm_setzero_si128());
            M1 = _mm_unpackhi_epi32(T7, _mm_setzero_si128());
            M2 = _mm_unpacklo_epi32(T8, _mm_setzero_si128());
            M3 = _mm_unpackhi_epi32(T8, _mm_setzero_si128());
            M1 = _mm_slli_epi64(M1, 6);
            M2 = _mm_slli_epi64(M2, 12);
            M3 = _mm_slli_epi64(M3, 18);
            T0 = _mm_add_epi64(T0, M0);
            T1 = _mm_add_epi64(T1, M1);
            T2 = _mm_add_epi64(T2, M2);
            T3 = _mm_add_epi64(T3, M3);
            T4 = _mm_add_epi64(T4, HIBIT);
        }

        /* reduce */
        C1 = _mm_srli_epi64(T0, 26);
        C2 = _mm_srli_epi64(T3, 26);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_and_si128(T3, MMASK);
        T1 = _mm_add_epi64(T1, C1);
        T4 = _mm_add_epi64(T4, C2);
        C1 = _mm_srli_epi64(T1, 26);
        C2 = _mm_srli_epi64(T4, 26);
        T1 = _mm_and_si128(T1, MMASK);
        T4 = _mm_and_si128(T4, MMASK);
        T2 = _mm_add_epi64(T2, C1);
        T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE));
        C1 = _mm_srli_epi64(T2, 26);
        C2 = _mm_srli_epi64(T0, 26);
        T2 = _mm_and_si128(T2, MMASK);
        T0 = _mm_and_si128(T0, MMASK);
        T3 = _mm_add_epi64(T3, C1);
        T1 = _mm_add_epi64(T1, C2);
        C1 = _mm_srli_epi64(T3, 26);
        T3 = _mm_and_si128(T3, MMASK);
        T4 = _mm_add_epi64(T4, C1);

        /* H = (H*[r^2,r^2] + [Mx,My]) */
        H0 = T0;
        H1 = T1;
        H2 = T2;
        H3 = T3;
        H4 = T4;
    }

    if (m) {
        T0 = _mm_shuffle_epi32(H0, _MM_SHUFFLE(0, 0, 2, 0));
        T1 = _mm_shuffle_epi32(H1, _MM_SHUFFLE(0, 0, 2, 0));
        T2 = _mm_shuffle_epi32(H2, _MM_SHUFFLE(0, 0, 2, 0));
        T3 = _mm_shuffle_epi32(H3, _MM_SHUFFLE(0, 0, 2, 0));
        T4 = _mm_shuffle_epi32(H4, _MM_SHUFFLE(0, 0, 2, 0));
        T0 = _mm_unpacklo_epi64(T0, T1);
        T1 = _mm_unpacklo_epi64(T2, T3);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[0], T0);
        _mm_storeu_si128((xmmi *) (void *) &st->H.hh[4], T1);
        _mm_storel_epi64((xmmi *) (void *) &st->H.hh[8], T4);
    } else {
        uint32_t t0, t1, t2, t3, t4, b;
        uint64_t h0, h1, h2, g0, g1, g2, c, nc;

        /* H = H[0]+H[1] */
        T0 = H0;
        T1 = H1;
        T2 = H2;
        T3 = H3;
        T4 = H4;

        T0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8));
        T1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8));
        T2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8));
        T3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8));
        T4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8));

        t0 = _mm_cvtsi128_si32(T0);
        b  = (t0 >> 26);
        t0 &= 0x3ffffff;
        t1 = _mm_cvtsi128_si32(T1) + b;
        b  = (t1 >> 26);
        t1 &= 0x3ffffff;
        t2 = _mm_cvtsi128_si32(T2) + b;
        b  = (t2 >> 26);
        t2 &= 0x3ffffff;
        t3 = _mm_cvtsi128_si32(T3) + b;
        b  = (t3 >> 26);
        t3 &= 0x3ffffff;
        t4 = _mm_cvtsi128_si32(T4) + b;

        /* everything except t4 is in range, so this is all safe */
        h0 = (((uint64_t) t0) | ((uint64_t) t1 << 26)) & 0xfffffffffffull;
        h1 = (((uint64_t) t1 >> 18) | ((uint64_t) t2 << 8) |
              ((uint64_t) t3 << 34)) &
             0xfffffffffffull;
        h2 = (((uint64_t) t3 >> 10) | ((uint64_t) t4 << 16));

        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;
        c = (h1 >> 44);
        h1 &= 0xfffffffffff;
        h2 += c;
        c = (h2 >> 42);
        h2 &= 0x3ffffffffff;
        h0 += c * 5;
        c = (h0 >> 44);
        h0 &= 0xfffffffffff;
        h1 += c;

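        /*
         * Constant-time final reduction: compute g = h - (2^130 - 5) and
         * select g when the subtraction did not borrow (i.e. h >= p),
         * otherwise keep h.
         */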
        g0 = h0 + 5;
        c  = (g0 >> 44);
        g0 &= 0xfffffffffff;
        g1 = h1 + c;
        c  = (g1 >> 44);
        g1 &= 0xfffffffffff;
        g2 = h2 + c - ((uint64_t) 1 << 42);

        c  = (g2 >> 63) - 1;
        nc = ~c;
        h0 = (h0 & nc) | (g0 & c);
        h1 = (h1 & nc) | (g1 & c);
        h2 = (h2 & nc) | (g2 & c);

        st->H.h[0] = h0;
        st->H.h[1] = h1;
        st->H.h[2] = h2;
    }
}

static void
poly1305_update(poly1305_state_internal_t *st, const unsigned char *m,
                unsigned long long bytes)
{
    unsigned long long i;

    /* handle leftover */
    if (st->leftover) {
        unsigned long long want = (poly1305_block_size - st->leftover);

        if (want > bytes) {
            want = bytes;
        }
        for (i = 0; i < want; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        bytes -= want;
        m += want;
        st->leftover += want;
        if (st->leftover < poly1305_block_size) {
            return;
        }
        poly1305_blocks(st, st->buffer, poly1305_block_size);
        st->leftover = 0;
    }

    /* process full blocks */
    if (bytes >= poly1305_block_size) {
        unsigned long long want = (bytes & ~(poly1305_block_size - 1));

        poly1305_blocks(st, m, want);
        m += want;
        bytes -= want;
    }

    /* store leftover */
    if (bytes) {
        for (i = 0; i < bytes; i++) {
            st->buffer[st->leftover + i] = m[i];
        }
        st->leftover += bytes;
    }
}

static POLY1305_NOINLINE void
poly1305_finish_ext(poly1305_state_internal_t *st, const unsigned char *m,
                    unsigned long long leftover, unsigned char mac[16])
{
    uint64_t h0, h1, h2;

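    /*
     * Pad the trailing 1..31 bytes into a zeroed 32-byte block. A partial
     * 16-byte block gets the 0x01 byte appended here; the shift flags tell
     * poly1305_blocks to drop the implicit 2^128 bit where it does not
     * apply.
     */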
    if (leftover) {
        CRYPTO_ALIGN(16) unsigned char final[32] = { 0 };

        poly1305_block_copy31(final, m, leftover);
        if (leftover != 16) {
            final[leftover] = 1;
        }
        st->flags |=
            (leftover >= 16) ? poly1305_final_shift8 : poly1305_final_shift16;
        poly1305_blocks(st, final, 32);
    }

    if (st->flags & poly1305_started) {
        /* finalize, H *= [r^2,r], or H *= [r,1] */
        if (!leftover || (leftover > 16)) {
            st->flags |= poly1305_final_r2_r;
        } else {
            st->flags |= poly1305_final_r_1;
        }
        poly1305_blocks(st, NULL, 32);
    }

    h0 = st->H.h[0];
    h1 = st->H.h[1];
    h2 = st->H.h[2];

    /* pad */
    h0 = ((h0) | (h1 << 44));
    h1 = ((h1 >> 20) | (h2 << 24));
#ifdef HAVE_AMD64_ASM
    __asm__ __volatile__(
        "addq %2, %0 ;\n"
        "adcq %3, %1 ;\n"
        : "+r"(h0), "+r"(h1)
        : "r"(st->pad[0]), "r"(st->pad[1])
        : "flags", "cc");
#else
    {
        uint128_t h;

        memcpy(&h, &st->pad[0], 16);
        h += ((uint128_t) h1 << 64) | h0;
        h0 = (uint64_t) h;
        h1 = (uint64_t) (h >> 64);
    }
#endif
    _mm_storeu_si128((xmmi *) (void *) st + 0, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 1, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 2, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 3, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 4, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 5, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 6, _mm_setzero_si128());
    _mm_storeu_si128((xmmi *) (void *) st + 7, _mm_setzero_si128());

    memcpy(&mac[0], &h0, 8);
    memcpy(&mac[8], &h1, 8);

    sodium_memzero((void *) st, sizeof *st);
}

static void
poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
{
    poly1305_finish_ext(st, st->buffer, st->leftover, mac);
}

static int
crypto_onetimeauth_poly1305_sse2_init(crypto_onetimeauth_poly1305_state *state,
                                      const unsigned char *key)
{
    COMPILER_ASSERT(sizeof(crypto_onetimeauth_poly1305_state) >=
                    sizeof(poly1305_state_internal_t));
    poly1305_init_ext((poly1305_state_internal_t *) (void *) state, key, 0U);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_update(
    crypto_onetimeauth_poly1305_state *state, const unsigned char *in,
    unsigned long long inlen)
{
    poly1305_update((poly1305_state_internal_t *) (void *) state, in, inlen);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_final(crypto_onetimeauth_poly1305_state *state,
                                       unsigned char *out)
{
    poly1305_finish((poly1305_state_internal_t *) (void *) state, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2(unsigned char *out, const unsigned char *m,
                                 unsigned long long inlen,
                                 const unsigned char *key)
{
    CRYPTO_ALIGN(64) poly1305_state_internal_t st;
    unsigned long long blocks;

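    /*
     * One-shot path: the total length is passed to init so that only the
     * required powers of r are computed, and whole 32-byte blocks are fed
     * to poly1305_blocks directly, bypassing the update buffer.
     */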
    poly1305_init_ext(&st, key, inlen);
    blocks = inlen & ~31;
    if (blocks > 0) {
        poly1305_blocks(&st, m, blocks);
        m += blocks;
        inlen -= blocks;
    }
    poly1305_finish_ext(&st, m, inlen, out);

    return 0;
}

static int
crypto_onetimeauth_poly1305_sse2_verify(const unsigned char *h,
                                        const unsigned char *in,
                                        unsigned long long inlen,
                                        const unsigned char *k)
{
    unsigned char correct[16];

    crypto_onetimeauth_poly1305_sse2(correct, in, inlen, k);

    return crypto_verify_16(h, correct);
}

struct crypto_onetimeauth_poly1305_implementation
    crypto_onetimeauth_poly1305_sse2_implementation = {
        SODIUM_C99(.onetimeauth =) crypto_onetimeauth_poly1305_sse2,
        SODIUM_C99(.onetimeauth_verify =)
            crypto_onetimeauth_poly1305_sse2_verify,
        SODIUM_C99(.onetimeauth_init =) crypto_onetimeauth_poly1305_sse2_init,
        SODIUM_C99(.onetimeauth_update =)
            crypto_onetimeauth_poly1305_sse2_update,
        SODIUM_C99(.onetimeauth_final =) crypto_onetimeauth_poly1305_sse2_final
    };

#endif