1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "inner.h" 26 27 /* 28 * Perform the inner processing of blocks for Poly1305. 29 */ 30 static void 31 poly1305_inner(uint32_t *a, const uint32_t *r, const void *data, size_t len) 32 { 33 /* 34 * Implementation notes: we split the 130-bit values into ten 35 * 13-bit words. This gives us some space for carries and allows 36 * using only 32x32->32 multiplications, which are way faster than 37 * 32x32->64 multiplications on the ARM Cortex-M0/M0+, and also 38 * help in making constant-time code on the Cortex-M3. 39 * 40 * Since we compute modulo 2^130-5, the "upper words" become 41 * low words with a factor of 5; that is, x*2^130 = x*5 mod p. 42 * This has already been integrated in the r[] array, which 43 * is extended to the 0..18 range. 44 * 45 * In each loop iteration, a[] and r[] words are 13-bit each, 46 * except a[1] which may use 14 bits. 47 */ 48 const unsigned char *buf; 49 50 buf = data; 51 while (len > 0) { 52 unsigned char tmp[16]; 53 uint32_t b[10]; 54 unsigned u, v; 55 uint32_t z, cc1, cc2; 56 57 /* 58 * If there is a partial block, right-pad it with zeros. 59 */ 60 if (len < 16) { 61 memset(tmp, 0, sizeof tmp); 62 memcpy(tmp, buf, len); 63 buf = tmp; 64 len = 16; 65 } 66 67 /* 68 * Decode next block and apply the "high bit"; that value 69 * is added to the accumulator. 70 */ 71 v = br_dec16le(buf); 72 a[0] += v & 0x01FFF; 73 v >>= 13; 74 v |= buf[2] << 3; 75 v |= buf[3] << 11; 76 a[1] += v & 0x01FFF; 77 v >>= 13; 78 v |= buf[4] << 6; 79 a[2] += v & 0x01FFF; 80 v >>= 13; 81 v |= buf[5] << 1; 82 v |= buf[6] << 9; 83 a[3] += v & 0x01FFF; 84 v >>= 13; 85 v |= buf[7] << 4; 86 v |= buf[8] << 12; 87 a[4] += v & 0x01FFF; 88 v >>= 13; 89 v |= buf[9] << 7; 90 a[5] += v & 0x01FFF; 91 v >>= 13; 92 v |= buf[10] << 2; 93 v |= buf[11] << 10; 94 a[6] += v & 0x01FFF; 95 v >>= 13; 96 v |= buf[12] << 5; 97 a[7] += v & 0x01FFF; 98 v = br_dec16le(buf + 13); 99 a[8] += v & 0x01FFF; 100 v >>= 13; 101 v |= buf[15] << 3; 102 a[9] += v | 0x00800; 103 104 /* 105 * At that point, all a[] values fit on 14 bits, while 106 * all r[] values fit on 13 bits. Thus products fit on 107 * 27 bits, and we can accumulate up to 31 of them in 108 * a 32-bit word and still have some room for carries. 109 */ 110 111 /* 112 * Now a[] contains words with values up to 14 bits each. 113 * We perform the multiplication with r[]. 114 * 115 * The extended words of r[] may be larger than 13 bits 116 * (they are 5 times a 13-bit word) so the full summation 117 * may yield values up to 46 times a 27-bit word, which 118 * does not fit on a 32-bit word. To avoid that issue, we 119 * must split the loop below in two, with a carry 120 * propagation operation in the middle. 121 */ 122 cc1 = 0; 123 for (u = 0; u < 10; u ++) { 124 uint32_t s; 125 126 s = cc1 127 + MUL15(a[0], r[u + 9 - 0]) 128 + MUL15(a[1], r[u + 9 - 1]) 129 + MUL15(a[2], r[u + 9 - 2]) 130 + MUL15(a[3], r[u + 9 - 3]) 131 + MUL15(a[4], r[u + 9 - 4]); 132 b[u] = s & 0x1FFF; 133 cc1 = s >> 13; 134 } 135 cc2 = 0; 136 for (u = 0; u < 10; u ++) { 137 uint32_t s; 138 139 s = b[u] + cc2 140 + MUL15(a[5], r[u + 9 - 5]) 141 + MUL15(a[6], r[u + 9 - 6]) 142 + MUL15(a[7], r[u + 9 - 7]) 143 + MUL15(a[8], r[u + 9 - 8]) 144 + MUL15(a[9], r[u + 9 - 9]); 145 b[u] = s & 0x1FFF; 146 cc2 = s >> 13; 147 } 148 memcpy(a, b, sizeof b); 149 150 /* 151 * The two carries "loop back" with a factor of 5. We 152 * propagate them into a[0] and a[1]. 153 */ 154 z = cc1 + cc2; 155 z += (z << 2) + a[0]; 156 a[0] = z & 0x1FFF; 157 a[1] += z >> 13; 158 159 buf += 16; 160 len -= 16; 161 } 162 } 163 164 /* see bearssl_block.h */ 165 void 166 br_poly1305_ctmul32_run(const void *key, const void *iv, 167 void *data, size_t len, const void *aad, size_t aad_len, 168 void *tag, br_chacha20_run ichacha, int encrypt) 169 { 170 unsigned char pkey[32], foot[16]; 171 uint32_t z, r[19], acc[10], cc, ctl; 172 int i; 173 174 /* 175 * Compute the MAC key. The 'r' value is the first 16 bytes of 176 * pkey[]. 177 */ 178 memset(pkey, 0, sizeof pkey); 179 ichacha(key, iv, 0, pkey, sizeof pkey); 180 181 /* 182 * If encrypting, ChaCha20 must run first, followed by Poly1305. 183 * When decrypting, the operations are reversed. 184 */ 185 if (encrypt) { 186 ichacha(key, iv, 1, data, len); 187 } 188 189 /* 190 * Run Poly1305. We must process the AAD, then ciphertext, then 191 * the footer (with the lengths). Note that the AAD and ciphertext 192 * are meant to be padded with zeros up to the next multiple of 16, 193 * and the length of the footer is 16 bytes as well. 194 */ 195 196 /* 197 * Decode the 'r' value into 13-bit words, with the "clamping" 198 * operation applied. 199 */ 200 z = br_dec32le(pkey) & 0x03FFFFFF; 201 r[9] = z & 0x1FFF; 202 r[10] = z >> 13; 203 z = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03; 204 r[11] = z & 0x1FFF; 205 r[12] = z >> 13; 206 z = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF; 207 r[13] = z & 0x1FFF; 208 r[14] = z >> 13; 209 z = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF; 210 r[15] = z & 0x1FFF; 211 r[16] = z >> 13; 212 z = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF; 213 r[17] = z & 0x1FFF; 214 r[18] = z >> 13; 215 216 /* 217 * Extend r[] with the 5x factor pre-applied. 218 */ 219 for (i = 0; i < 9; i ++) { 220 r[i] = MUL15(5, r[i + 10]); 221 } 222 223 /* 224 * Accumulator is 0. 225 */ 226 memset(acc, 0, sizeof acc); 227 228 /* 229 * Process the additional authenticated data, ciphertext, and 230 * footer in due order. 231 */ 232 br_enc64le(foot, (uint64_t)aad_len); 233 br_enc64le(foot + 8, (uint64_t)len); 234 poly1305_inner(acc, r, aad, aad_len); 235 poly1305_inner(acc, r, data, len); 236 poly1305_inner(acc, r, foot, sizeof foot); 237 238 /* 239 * Finalise modular reduction. This is done with carry propagation 240 * and applying the '2^130 = -5 mod p' rule. Note that the output 241 * of poly1035_inner() is already mostly reduced, since only 242 * acc[1] may be (very slightly) above 2^13. A single loop back 243 * to acc[1] will be enough to make the value fit in 130 bits. 244 */ 245 cc = 0; 246 for (i = 1; i < 10; i ++) { 247 z = acc[i] + cc; 248 acc[i] = z & 0x1FFF; 249 cc = z >> 13; 250 } 251 z = acc[0] + cc + (cc << 2); 252 acc[0] = z & 0x1FFF; 253 acc[1] += z >> 13; 254 255 /* 256 * We may still have a value in the 2^130-5..2^130-1 range, in 257 * which case we must reduce it again. The code below selects, 258 * in constant-time, between 'acc' and 'acc-p', 259 */ 260 ctl = GT(acc[0], 0x1FFA); 261 for (i = 1; i < 10; i ++) { 262 ctl &= EQ(acc[i], 0x1FFF); 263 } 264 acc[0] = MUX(ctl, acc[0] - 0x1FFB, acc[0]); 265 for (i = 1; i < 10; i ++) { 266 acc[i] &= ~(-ctl); 267 } 268 269 /* 270 * Convert back the accumulator to 32-bit words, and add the 271 * 's' value (second half of pkey[]). That addition is done 272 * modulo 2^128. 273 */ 274 z = acc[0] + (acc[1] << 13) + br_dec16le(pkey + 16); 275 br_enc16le((unsigned char *)tag, z & 0xFFFF); 276 z = (z >> 16) + (acc[2] << 10) + br_dec16le(pkey + 18); 277 br_enc16le((unsigned char *)tag + 2, z & 0xFFFF); 278 z = (z >> 16) + (acc[3] << 7) + br_dec16le(pkey + 20); 279 br_enc16le((unsigned char *)tag + 4, z & 0xFFFF); 280 z = (z >> 16) + (acc[4] << 4) + br_dec16le(pkey + 22); 281 br_enc16le((unsigned char *)tag + 6, z & 0xFFFF); 282 z = (z >> 16) + (acc[5] << 1) + (acc[6] << 14) + br_dec16le(pkey + 24); 283 br_enc16le((unsigned char *)tag + 8, z & 0xFFFF); 284 z = (z >> 16) + (acc[7] << 11) + br_dec16le(pkey + 26); 285 br_enc16le((unsigned char *)tag + 10, z & 0xFFFF); 286 z = (z >> 16) + (acc[8] << 8) + br_dec16le(pkey + 28); 287 br_enc16le((unsigned char *)tag + 12, z & 0xFFFF); 288 z = (z >> 16) + (acc[9] << 5) + br_dec16le(pkey + 30); 289 br_enc16le((unsigned char *)tag + 14, z & 0xFFFF); 290 291 /* 292 * If decrypting, then ChaCha20 runs _after_ Poly1305. 293 */ 294 if (!encrypt) { 295 ichacha(key, iv, 1, data, len); 296 } 297 } 298