1 /* 2 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "inner.h" 26 27 /* 28 * Perform the inner processing of blocks for Poly1305. The accumulator 29 * and the r key are provided as arrays of 26-bit words (these words 30 * are allowed to have an extra bit, i.e. use 27 bits). 31 * 32 * On output, all accumulator words fit on 26 bits, except acc[1], which 33 * may be slightly larger (but by a very small amount only). 34 */ 35 static void 36 poly1305_inner(uint32_t *acc, const uint32_t *r, const void *data, size_t len) 37 { 38 /* 39 * Implementation notes: we split the 130-bit values into five 40 * 26-bit words. This gives us some space for carries. 41 * 42 * This code is inspired from the public-domain code available 43 * on: 44 * https://github.com/floodyberry/poly1305-donna 45 * 46 * Since we compute modulo 2^130-5, the "upper words" become 47 * low words with a factor of 5; that is, x*2^130 = x*5 mod p. 48 */ 49 const unsigned char *buf; 50 uint32_t a0, a1, a2, a3, a4; 51 uint32_t r0, r1, r2, r3, r4; 52 uint32_t u1, u2, u3, u4; 53 54 r0 = r[0]; 55 r1 = r[1]; 56 r2 = r[2]; 57 r3 = r[3]; 58 r4 = r[4]; 59 60 u1 = r1 * 5; 61 u2 = r2 * 5; 62 u3 = r3 * 5; 63 u4 = r4 * 5; 64 65 a0 = acc[0]; 66 a1 = acc[1]; 67 a2 = acc[2]; 68 a3 = acc[3]; 69 a4 = acc[4]; 70 71 buf = data; 72 while (len > 0) { 73 uint64_t w0, w1, w2, w3, w4; 74 uint64_t c; 75 unsigned char tmp[16]; 76 77 /* 78 * If there is a partial block, right-pad it with zeros. 79 */ 80 if (len < 16) { 81 memset(tmp, 0, sizeof tmp); 82 memcpy(tmp, buf, len); 83 buf = tmp; 84 len = 16; 85 } 86 87 /* 88 * Decode next block and apply the "high bit"; that value 89 * is added to the accumulator. 90 */ 91 a0 += br_dec32le(buf) & 0x03FFFFFF; 92 a1 += (br_dec32le(buf + 3) >> 2) & 0x03FFFFFF; 93 a2 += (br_dec32le(buf + 6) >> 4) & 0x03FFFFFF; 94 a3 += (br_dec32le(buf + 9) >> 6) & 0x03FFFFFF; 95 a4 += (br_dec32le(buf + 12) >> 8) | 0x01000000; 96 97 /* 98 * Compute multiplication. 99 */ 100 #define M(x, y) ((uint64_t)(x) * (uint64_t)(y)) 101 102 w0 = M(a0, r0) + M(a1, u4) + M(a2, u3) + M(a3, u2) + M(a4, u1); 103 w1 = M(a0, r1) + M(a1, r0) + M(a2, u4) + M(a3, u3) + M(a4, u2); 104 w2 = M(a0, r2) + M(a1, r1) + M(a2, r0) + M(a3, u4) + M(a4, u3); 105 w3 = M(a0, r3) + M(a1, r2) + M(a2, r1) + M(a3, r0) + M(a4, u4); 106 w4 = M(a0, r4) + M(a1, r3) + M(a2, r2) + M(a3, r1) + M(a4, r0); 107 108 #undef M 109 /* 110 * Perform some (partial) modular reduction. This step is 111 * enough to keep values in ranges such that there won't 112 * be carry overflows. Most of the reduction was done in 113 * the multiplication step (by using the 'u*' values, and 114 * using the fact that 2^130 = -5 mod p); here we perform 115 * some carry propagation. 116 */ 117 c = w0 >> 26; 118 a0 = (uint32_t)w0 & 0x3FFFFFF; 119 w1 += c; 120 c = w1 >> 26; 121 a1 = (uint32_t)w1 & 0x3FFFFFF; 122 w2 += c; 123 c = w2 >> 26; 124 a2 = (uint32_t)w2 & 0x3FFFFFF; 125 w3 += c; 126 c = w3 >> 26; 127 a3 = (uint32_t)w3 & 0x3FFFFFF; 128 w4 += c; 129 c = w4 >> 26; 130 a4 = (uint32_t)w4 & 0x3FFFFFF; 131 a0 += (uint32_t)c * 5; 132 a1 += a0 >> 26; 133 a0 &= 0x3FFFFFF; 134 135 buf += 16; 136 len -= 16; 137 } 138 139 acc[0] = a0; 140 acc[1] = a1; 141 acc[2] = a2; 142 acc[3] = a3; 143 acc[4] = a4; 144 } 145 146 /* see bearssl_block.h */ 147 void 148 br_poly1305_ctmul_run(const void *key, const void *iv, 149 void *data, size_t len, const void *aad, size_t aad_len, 150 void *tag, br_chacha20_run ichacha, int encrypt) 151 { 152 unsigned char pkey[32], foot[16]; 153 uint32_t r[5], acc[5], cc, ctl, hi; 154 uint64_t w; 155 int i; 156 157 /* 158 * Compute the MAC key. The 'r' value is the first 16 bytes of 159 * pkey[]. 160 */ 161 memset(pkey, 0, sizeof pkey); 162 ichacha(key, iv, 0, pkey, sizeof pkey); 163 164 /* 165 * If encrypting, ChaCha20 must run first, followed by Poly1305. 166 * When decrypting, the operations are reversed. 167 */ 168 if (encrypt) { 169 ichacha(key, iv, 1, data, len); 170 } 171 172 /* 173 * Run Poly1305. We must process the AAD, then ciphertext, then 174 * the footer (with the lengths). Note that the AAD and ciphertext 175 * are meant to be padded with zeros up to the next multiple of 16, 176 * and the length of the footer is 16 bytes as well. 177 */ 178 179 /* 180 * Decode the 'r' value into 26-bit words, with the "clamping" 181 * operation applied. 182 */ 183 r[0] = br_dec32le(pkey) & 0x03FFFFFF; 184 r[1] = (br_dec32le(pkey + 3) >> 2) & 0x03FFFF03; 185 r[2] = (br_dec32le(pkey + 6) >> 4) & 0x03FFC0FF; 186 r[3] = (br_dec32le(pkey + 9) >> 6) & 0x03F03FFF; 187 r[4] = (br_dec32le(pkey + 12) >> 8) & 0x000FFFFF; 188 189 /* 190 * Accumulator is 0. 191 */ 192 memset(acc, 0, sizeof acc); 193 194 /* 195 * Process the additional authenticated data, ciphertext, and 196 * footer in due order. 197 */ 198 br_enc64le(foot, (uint64_t)aad_len); 199 br_enc64le(foot + 8, (uint64_t)len); 200 poly1305_inner(acc, r, aad, aad_len); 201 poly1305_inner(acc, r, data, len); 202 poly1305_inner(acc, r, foot, sizeof foot); 203 204 /* 205 * Finalise modular reduction. This is done with carry propagation 206 * and applying the '2^130 = -5 mod p' rule. Note that the output 207 * of poly1035_inner() is already mostly reduced, since only 208 * acc[1] may be (very slightly) above 2^26. A single loop back 209 * to acc[1] will be enough to make the value fit in 130 bits. 210 */ 211 cc = 0; 212 for (i = 1; i <= 6; i ++) { 213 int j; 214 215 j = (i >= 5) ? i - 5 : i; 216 acc[j] += cc; 217 cc = acc[j] >> 26; 218 acc[j] &= 0x03FFFFFF; 219 } 220 221 /* 222 * We may still have a value in the 2^130-5..2^130-1 range, in 223 * which case we must reduce it again. The code below selects, 224 * in constant-time, between 'acc' and 'acc-p', 225 */ 226 ctl = GT(acc[0], 0x03FFFFFA); 227 for (i = 1; i < 5; i ++) { 228 ctl &= EQ(acc[i], 0x03FFFFFF); 229 } 230 cc = 5; 231 for (i = 0; i < 5; i ++) { 232 uint32_t t; 233 234 t = (acc[i] + cc); 235 cc = t >> 26; 236 t &= 0x03FFFFFF; 237 acc[i] = MUX(ctl, t, acc[i]); 238 } 239 240 /* 241 * Convert back the accumulator to 32-bit words, and add the 242 * 's' value (second half of pkey[]). That addition is done 243 * modulo 2^128. 244 */ 245 w = (uint64_t)acc[0] + ((uint64_t)acc[1] << 26) + br_dec32le(pkey + 16); 246 br_enc32le((unsigned char *)tag, (uint32_t)w); 247 w = (w >> 32) + ((uint64_t)acc[2] << 20) + br_dec32le(pkey + 20); 248 br_enc32le((unsigned char *)tag + 4, (uint32_t)w); 249 w = (w >> 32) + ((uint64_t)acc[3] << 14) + br_dec32le(pkey + 24); 250 br_enc32le((unsigned char *)tag + 8, (uint32_t)w); 251 hi = (uint32_t)(w >> 32) + (acc[4] << 8) + br_dec32le(pkey + 28); 252 br_enc32le((unsigned char *)tag + 12, hi); 253 254 /* 255 * If decrypting, then ChaCha20 runs _after_ Poly1305. 256 */ 257 if (!encrypt) { 258 ichacha(key, iv, 1, data, len); 259 } 260 } 261