1 /*
2 poly1305 implementation using 64 bit * 64 bit = 128 bit multiplication
3 and 128 bit addition
4 */
5
6 #include "private/common.h"
7
/*
 * Thin wrappers around 128-bit arithmetic (uint128_t comes from
 * private/common.h).
 *
 *   MUL(out, x, y)  out = x * y, with x widened to 128 bits first so the
 *                   full 64x64 -> 128 bit product is kept
 *   ADD(out, in)    out += in   (128-bit accumulator += 128-bit value)
 *   ADDLO(out, in)  out += in   (128-bit accumulator += 64-bit value)
 *   SHR(in, shift)  (in >> shift), truncated to unsigned long long
 *   LO(in)          low 64 bits of in
 *
 * Every argument and every expansion is parenthesized so the macros keep
 * their meaning when handed a compound expression (e.g. MUL(d, a + b, c))
 * or when used inside a larger expression.
 */
#define MUL(out, x, y) ((out) = (uint128_t) (x) * (y))
#define ADD(out, in) ((out) += (in))
#define ADDLO(out, in) ((out) += (in))
#define SHR(in, shift) ((unsigned long long) ((in) >> (shift)))
#define LO(in) ((unsigned long long) (in))
13
/*
 * POLY1305_NOINLINE: function attribute asking the compiler not to inline
 * the function it decorates (used on poly1305_finish).
 * Expands to __declspec(noinline) for MSVC, __attribute__((noinline)) for
 * Clang/GCC, and to nothing for any other compiler.
 */
#if defined(_MSC_VER)
# define POLY1305_NOINLINE __declspec(noinline)
#elif defined(__clang__) || defined(__GNUC__)
# define POLY1305_NOINLINE __attribute__((noinline))
#else
# define POLY1305_NOINLINE
#endif
21
/* Number of message bytes consumed per iteration of poly1305_blocks(). */
#define poly1305_block_size 16
23
/*
 * Internal Poly1305 state for the 64-bit implementation.
 *
 * Sum of the member sizes (not counting any padding the compiler adds):
 */
/* 17 + sizeof(unsigned long long) + 8*sizeof(unsigned long long) */
typedef struct poly1305_state_internal_t {
    /* multiplier r: key[0..15] masked and split into three limbs by
     * poly1305_init() */
    unsigned long long r[3];
    /* accumulator h, three limbs; poly1305_blocks() keeps them masked to
     * 44, 44 and 42 bits (h[1] may carry one pending carry) */
    unsigned long long h[3];
    /* key[16..31] as two 64-bit words; added to h in poly1305_finish() */
    unsigned long long pad[2];
    /* number of bytes pending in `buffer`; poly1305_finish() pads and
     * processes them when non-zero.
     * NOTE(review): presumably maintained by an update routine outside
     * this section - confirm against the caller. */
    unsigned long long leftover;
    /* staging area for a partial block */
    unsigned char buffer[poly1305_block_size];
    /* set to 1 by poly1305_finish() before it processes the padded last
     * block; when non-zero poly1305_blocks() does not add the 2^128 bit */
    unsigned char final;
} poly1305_state_internal_t;
33
34 static void
poly1305_init(poly1305_state_internal_t * st,const unsigned char key[32])35 poly1305_init(poly1305_state_internal_t *st, const unsigned char key[32])
36 {
37 unsigned long long t0, t1;
38
39 /* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
40 t0 = LOAD64_LE(&key[0]);
41 t1 = LOAD64_LE(&key[8]);
42
43 /* wiped after finalization */
44 st->r[0] = (t0) &0xffc0fffffff;
45 st->r[1] = ((t0 >> 44) | (t1 << 20)) & 0xfffffc0ffff;
46 st->r[2] = ((t1 >> 24)) & 0x00ffffffc0f;
47
48 /* h = 0 */
49 st->h[0] = 0;
50 st->h[1] = 0;
51 st->h[2] = 0;
52
53 /* save pad for later */
54 st->pad[0] = LOAD64_LE(&key[16]);
55 st->pad[1] = LOAD64_LE(&key[24]);
56
57 st->leftover = 0;
58 st->final = 0;
59 }
60
61 static void
poly1305_blocks(poly1305_state_internal_t * st,const unsigned char * m,unsigned long long bytes)62 poly1305_blocks(poly1305_state_internal_t *st, const unsigned char *m,
63 unsigned long long bytes)
64 {
65 const unsigned long long hibit =
66 (st->final) ? 0ULL : (1ULL << 40); /* 1 << 128 */
67 unsigned long long r0, r1, r2;
68 unsigned long long s1, s2;
69 unsigned long long h0, h1, h2;
70 unsigned long long c;
71 uint128_t d0, d1, d2, d;
72
73 r0 = st->r[0];
74 r1 = st->r[1];
75 r2 = st->r[2];
76
77 h0 = st->h[0];
78 h1 = st->h[1];
79 h2 = st->h[2];
80
81 s1 = r1 * (5 << 2);
82 s2 = r2 * (5 << 2);
83
84 while (bytes >= poly1305_block_size) {
85 unsigned long long t0, t1;
86
87 /* h += m[i] */
88 t0 = LOAD64_LE(&m[0]);
89 t1 = LOAD64_LE(&m[8]);
90
91 h0 += ((t0) &0xfffffffffff);
92 h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff);
93 h2 += (((t1 >> 24)) & 0x3ffffffffff) | hibit;
94
95 /* h *= r */
96 MUL(d0, h0, r0);
97 MUL(d, h1, s2);
98 ADD(d0, d);
99 MUL(d, h2, s1);
100 ADD(d0, d);
101 MUL(d1, h0, r1);
102 MUL(d, h1, r0);
103 ADD(d1, d);
104 MUL(d, h2, s2);
105 ADD(d1, d);
106 MUL(d2, h0, r2);
107 MUL(d, h1, r1);
108 ADD(d2, d);
109 MUL(d, h2, r0);
110 ADD(d2, d);
111
112 /* (partial) h %= p */
113 c = SHR(d0, 44);
114 h0 = LO(d0) & 0xfffffffffff;
115 ADDLO(d1, c);
116 c = SHR(d1, 44);
117 h1 = LO(d1) & 0xfffffffffff;
118 ADDLO(d2, c);
119 c = SHR(d2, 42);
120 h2 = LO(d2) & 0x3ffffffffff;
121 h0 += c * 5;
122 c = (h0 >> 44);
123 h0 = h0 & 0xfffffffffff;
124 h1 += c;
125
126 m += poly1305_block_size;
127 bytes -= poly1305_block_size;
128 }
129
130 st->h[0] = h0;
131 st->h[1] = h1;
132 st->h[2] = h2;
133 }
134
135 static POLY1305_NOINLINE void
poly1305_finish(poly1305_state_internal_t * st,unsigned char mac[16])136 poly1305_finish(poly1305_state_internal_t *st, unsigned char mac[16])
137 {
138 unsigned long long h0, h1, h2, c;
139 unsigned long long g0, g1, g2;
140 unsigned long long t0, t1;
141
142 /* process the remaining block */
143 if (st->leftover) {
144 unsigned long long i = st->leftover;
145
146 st->buffer[i] = 1;
147
148 for (i = i + 1; i < poly1305_block_size; i++) {
149 st->buffer[i] = 0;
150 }
151 st->final = 1;
152 poly1305_blocks(st, st->buffer, poly1305_block_size);
153 }
154
155 /* fully carry h */
156 h0 = st->h[0];
157 h1 = st->h[1];
158 h2 = st->h[2];
159
160 c = (h1 >> 44);
161 h1 &= 0xfffffffffff;
162 h2 += c;
163 c = (h2 >> 42);
164 h2 &= 0x3ffffffffff;
165 h0 += c * 5;
166 c = (h0 >> 44);
167 h0 &= 0xfffffffffff;
168 h1 += c;
169 c = (h1 >> 44);
170 h1 &= 0xfffffffffff;
171 h2 += c;
172 c = (h2 >> 42);
173 h2 &= 0x3ffffffffff;
174 h0 += c * 5;
175 c = (h0 >> 44);
176 h0 &= 0xfffffffffff;
177 h1 += c;
178
179 /* compute h + -p */
180 g0 = h0 + 5;
181 c = (g0 >> 44);
182 g0 &= 0xfffffffffff;
183 g1 = h1 + c;
184 c = (g1 >> 44);
185 g1 &= 0xfffffffffff;
186 g2 = h2 + c - (1ULL << 42);
187
188 /* select h if h < p, or h + -p if h >= p */
189 c = (g2 >> ((sizeof(unsigned long long) * 8) - 1)) - 1;
190 g0 &= c;
191 g1 &= c;
192 g2 &= c;
193 c = ~c;
194 h0 = (h0 & c) | g0;
195 h1 = (h1 & c) | g1;
196 h2 = (h2 & c) | g2;
197
198 /* h = (h + pad) */
199 t0 = st->pad[0];
200 t1 = st->pad[1];
201
202 h0 += ((t0) &0xfffffffffff);
203 c = (h0 >> 44);
204 h0 &= 0xfffffffffff;
205 h1 += (((t0 >> 44) | (t1 << 20)) & 0xfffffffffff) + c;
206 c = (h1 >> 44);
207 h1 &= 0xfffffffffff;
208 h2 += (((t1 >> 24)) & 0x3ffffffffff) + c;
209 h2 &= 0x3ffffffffff;
210
211 /* mac = h % (2^128) */
212 h0 = ((h0) | (h1 << 44));
213 h1 = ((h1 >> 20) | (h2 << 24));
214
215 STORE64_LE(&mac[0], h0);
216 STORE64_LE(&mac[8], h1);
217
218 /* zero out the state */
219 sodium_memzero((void *) st, sizeof *st);
220 }
221