1 /* 2 * Copyright (c) 2016 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "inner.h" 26 27 /* 28 * During key schedule, we need to apply bit extraction PC-2 then permute 29 * things into our bitslice representation. PC-2 extracts 48 bits out 30 * of two 28-bit words (kl and kr), and we store these bits into two 31 * 32-bit words sk0 and sk1. 32 * 33 * -- bit 16+x of sk0 comes from bit QL0[x] of kl 34 * -- bit x of sk0 comes from bit QR0[x] of kr 35 * -- bit 16+x of sk1 comes from bit QL1[x] of kl 36 * -- bit x of sk1 comes from bit QR1[x] of kr 37 */ 38 39 static const unsigned char QL0[] = { 40 17, 4, 27, 23, 13, 22, 7, 18, 41 16, 24, 2, 20, 1, 8, 15, 26 42 }; 43 44 static const unsigned char QR0[] = { 45 25, 19, 9, 1, 5, 11, 23, 8, 46 17, 0, 22, 3, 6, 20, 27, 24 47 }; 48 49 static const unsigned char QL1[] = { 50 28, 28, 14, 11, 28, 28, 25, 0, 51 28, 28, 5, 9, 28, 28, 12, 21 52 }; 53 54 static const unsigned char QR1[] = { 55 28, 28, 15, 4, 28, 28, 26, 16, 56 28, 28, 12, 7, 28, 28, 10, 14 57 }; 58 59 /* 60 * 32-bit rotation. The C compiler is supposed to recognize it as a 61 * rotation and use the local architecture rotation opcode (if available). 62 */ 63 static inline uint32_t 64 rotl(uint32_t x, int n) 65 { 66 return (x << n) | (x >> (32 - n)); 67 } 68 69 /* 70 * Compute key schedule for 8 key bytes (produces 32 subkey words). 71 */ 72 static void 73 keysched_unit(uint32_t *skey, const void *key) 74 { 75 int i; 76 77 br_des_keysched_unit(skey, key); 78 79 /* 80 * Apply PC-2 + bitslicing. 81 */ 82 for (i = 0; i < 16; i ++) { 83 uint32_t kl, kr, sk0, sk1; 84 int j; 85 86 kl = skey[(i << 1) + 0]; 87 kr = skey[(i << 1) + 1]; 88 sk0 = 0; 89 sk1 = 0; 90 for (j = 0; j < 16; j ++) { 91 sk0 <<= 1; 92 sk1 <<= 1; 93 sk0 |= ((kl >> QL0[j]) & (uint32_t)1) << 16; 94 sk0 |= (kr >> QR0[j]) & (uint32_t)1; 95 sk1 |= ((kl >> QL1[j]) & (uint32_t)1) << 16; 96 sk1 |= (kr >> QR1[j]) & (uint32_t)1; 97 } 98 99 skey[(i << 1) + 0] = sk0; 100 skey[(i << 1) + 1] = sk1; 101 } 102 103 #if 0 104 /* 105 * Speed-optimized version for PC-2 + bitslicing. 106 * (Unused. Kept for reference only.) 107 */ 108 sk0 = kl & (uint32_t)0x00100000; 109 sk0 |= (kl & (uint32_t)0x08008000) << 2; 110 sk0 |= (kl & (uint32_t)0x00400000) << 4; 111 sk0 |= (kl & (uint32_t)0x00800000) << 5; 112 sk0 |= (kl & (uint32_t)0x00040000) << 6; 113 sk0 |= (kl & (uint32_t)0x00010000) << 7; 114 sk0 |= (kl & (uint32_t)0x00000100) << 10; 115 sk0 |= (kl & (uint32_t)0x00022000) << 14; 116 sk0 |= (kl & (uint32_t)0x00000082) << 18; 117 sk0 |= (kl & (uint32_t)0x00000004) << 19; 118 sk0 |= (kl & (uint32_t)0x04000000) >> 10; 119 sk0 |= (kl & (uint32_t)0x00000010) << 26; 120 sk0 |= (kl & (uint32_t)0x01000000) >> 2; 121 122 sk0 |= kr & (uint32_t)0x00000100; 123 sk0 |= (kr & (uint32_t)0x00000008) << 1; 124 sk0 |= (kr & (uint32_t)0x00000200) << 4; 125 sk0 |= rotl(kr & (uint32_t)0x08000021, 6); 126 sk0 |= (kr & (uint32_t)0x01000000) >> 24; 127 sk0 |= (kr & (uint32_t)0x00000002) << 11; 128 sk0 |= (kr & (uint32_t)0x00100000) >> 18; 129 sk0 |= (kr & (uint32_t)0x00400000) >> 17; 130 sk0 |= (kr & (uint32_t)0x00800000) >> 14; 131 sk0 |= (kr & (uint32_t)0x02020000) >> 10; 132 sk0 |= (kr & (uint32_t)0x00080000) >> 5; 133 sk0 |= (kr & (uint32_t)0x00000040) >> 3; 134 sk0 |= (kr & (uint32_t)0x00000800) >> 1; 135 136 sk1 = kl & (uint32_t)0x02000000; 137 sk1 |= (kl & (uint32_t)0x00001000) << 5; 138 sk1 |= (kl & (uint32_t)0x00000200) << 11; 139 sk1 |= (kl & (uint32_t)0x00004000) << 15; 140 sk1 |= (kl & (uint32_t)0x00000020) << 16; 141 sk1 |= (kl & (uint32_t)0x00000800) << 17; 142 sk1 |= (kl & (uint32_t)0x00000001) << 24; 143 sk1 |= (kl & (uint32_t)0x00200000) >> 5; 144 145 sk1 |= (kr & (uint32_t)0x00000010) << 8; 146 sk1 |= (kr & (uint32_t)0x04000000) >> 17; 147 sk1 |= (kr & (uint32_t)0x00004000) >> 14; 148 sk1 |= (kr & (uint32_t)0x00000400) >> 9; 149 sk1 |= (kr & (uint32_t)0x00010000) >> 8; 150 sk1 |= (kr & (uint32_t)0x00001000) >> 7; 151 sk1 |= (kr & (uint32_t)0x00000080) >> 3; 152 sk1 |= (kr & (uint32_t)0x00008000) >> 2; 153 #endif 154 } 155 156 /* see inner.h */ 157 unsigned 158 br_des_ct_keysched(uint32_t *skey, const void *key, size_t key_len) 159 { 160 switch (key_len) { 161 case 8: 162 keysched_unit(skey, key); 163 return 1; 164 case 16: 165 keysched_unit(skey, key); 166 keysched_unit(skey + 32, (const unsigned char *)key + 8); 167 br_des_rev_skey(skey + 32); 168 memcpy(skey + 64, skey, 32 * sizeof *skey); 169 return 3; 170 default: 171 keysched_unit(skey, key); 172 keysched_unit(skey + 32, (const unsigned char *)key + 8); 173 br_des_rev_skey(skey + 32); 174 keysched_unit(skey + 64, (const unsigned char *)key + 16); 175 return 3; 176 } 177 } 178 179 /* 180 * DES confusion function. This function performs expansion E (32 to 181 * 48 bits), XOR with subkey, S-boxes, and permutation P. 182 */ 183 static inline uint32_t 184 Fconf(uint32_t r0, const uint32_t *sk) 185 { 186 /* 187 * Each 6->4 S-box is virtually turned into four 6->1 boxes; we 188 * thus end up with 32 boxes that we call "T-boxes" here. We will 189 * evaluate them with bitslice code. 190 * 191 * Each T-box is a circuit of multiplexers (sort of) and thus 192 * takes 70 inputs: the 6 actual T-box inputs, and 64 constants 193 * that describe the T-box output for all combinations of the 194 * 6 inputs. With this model, all T-boxes are identical (with 195 * distinct inputs) and thus can be executed in parallel with 196 * bitslice code. 197 * 198 * T-boxes are numbered from 0 to 31, in least-to-most 199 * significant order. Thus, S-box S1 corresponds to T-boxes 31, 200 * 30, 29 and 28, in that order. T-box 'n' is computed with the 201 * bits at rank 'n' in the 32-bit words. 202 * 203 * Words x0 to x5 contain the T-box inputs 0 to 5. 204 */ 205 uint32_t x0, x1, x2, x3, x4, x5, z0; 206 uint32_t y0, y1, y2, y3, y4, y5, y6, y7, y8, y9; 207 uint32_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; 208 uint32_t y20, y21, y22, y23, y24, y25, y26, y27, y28, y29; 209 uint32_t y30; 210 211 /* 212 * Spread input bits over the 6 input words x*. 213 */ 214 x1 = r0 & (uint32_t)0x11111111; 215 x2 = (r0 >> 1) & (uint32_t)0x11111111; 216 x3 = (r0 >> 2) & (uint32_t)0x11111111; 217 x4 = (r0 >> 3) & (uint32_t)0x11111111; 218 x1 = (x1 << 4) - x1; 219 x2 = (x2 << 4) - x2; 220 x3 = (x3 << 4) - x3; 221 x4 = (x4 << 4) - x4; 222 x0 = (x4 << 4) | (x4 >> 28); 223 x5 = (x1 >> 4) | (x1 << 28); 224 225 /* 226 * XOR with the subkey for this round. 227 */ 228 x0 ^= sk[0]; 229 x1 ^= sk[1]; 230 x2 ^= sk[2]; 231 x3 ^= sk[3]; 232 x4 ^= sk[4]; 233 x5 ^= sk[5]; 234 235 /* 236 * The T-boxes are done in parallel, since they all use a 237 * "tree of multiplexer". We use "fake multiplexers": 238 * 239 * y = a ^ (x & b) 240 * 241 * computes y as either 'a' (if x == 0) or 'a ^ b' (if x == 1). 242 */ 243 y0 = (uint32_t)0xEFA72C4D ^ (x0 & (uint32_t)0xEC7AC69C); 244 y1 = (uint32_t)0xAEAAEDFF ^ (x0 & (uint32_t)0x500FB821); 245 y2 = (uint32_t)0x37396665 ^ (x0 & (uint32_t)0x40EFA809); 246 y3 = (uint32_t)0x68D7B833 ^ (x0 & (uint32_t)0xA5EC0B28); 247 y4 = (uint32_t)0xC9C755BB ^ (x0 & (uint32_t)0x252CF820); 248 y5 = (uint32_t)0x73FC3606 ^ (x0 & (uint32_t)0x40205801); 249 y6 = (uint32_t)0xA2A0A918 ^ (x0 & (uint32_t)0xE220F929); 250 y7 = (uint32_t)0x8222BD90 ^ (x0 & (uint32_t)0x44A3F9E1); 251 y8 = (uint32_t)0xD6B6AC77 ^ (x0 & (uint32_t)0x794F104A); 252 y9 = (uint32_t)0x3069300C ^ (x0 & (uint32_t)0x026F320B); 253 y10 = (uint32_t)0x6CE0D5CC ^ (x0 & (uint32_t)0x7640B01A); 254 y11 = (uint32_t)0x59A9A22D ^ (x0 & (uint32_t)0x238F1572); 255 y12 = (uint32_t)0xAC6D0BD4 ^ (x0 & (uint32_t)0x7A63C083); 256 y13 = (uint32_t)0x21C83200 ^ (x0 & (uint32_t)0x11CCA000); 257 y14 = (uint32_t)0xA0E62188 ^ (x0 & (uint32_t)0x202F69AA); 258 /* y15 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */ 259 y16 = (uint32_t)0xAF7D655A ^ (x0 & (uint32_t)0x51B33BE9); 260 y17 = (uint32_t)0xF0168AA3 ^ (x0 & (uint32_t)0x3B0FE8AE); 261 y18 = (uint32_t)0x90AA30C6 ^ (x0 & (uint32_t)0x90BF8816); 262 y19 = (uint32_t)0x5AB2750A ^ (x0 & (uint32_t)0x09E34F9B); 263 y20 = (uint32_t)0x5391BE65 ^ (x0 & (uint32_t)0x0103BE88); 264 y21 = (uint32_t)0x93372BAF ^ (x0 & (uint32_t)0x49AC8E25); 265 y22 = (uint32_t)0xF288210C ^ (x0 & (uint32_t)0x922C313D); 266 y23 = (uint32_t)0x920AF5C0 ^ (x0 & (uint32_t)0x70EF31B0); 267 y24 = (uint32_t)0x63D312C0 ^ (x0 & (uint32_t)0x6A707100); 268 y25 = (uint32_t)0x537B3006 ^ (x0 & (uint32_t)0xB97C9011); 269 y26 = (uint32_t)0xA2EFB0A5 ^ (x0 & (uint32_t)0xA320C959); 270 y27 = (uint32_t)0xBC8F96A5 ^ (x0 & (uint32_t)0x6EA0AB4A); 271 y28 = (uint32_t)0xFAD176A5 ^ (x0 & (uint32_t)0x6953DDF8); 272 y29 = (uint32_t)0x665A14A3 ^ (x0 & (uint32_t)0xF74F3E2B); 273 y30 = (uint32_t)0xF2EFF0CC ^ (x0 & (uint32_t)0xF0306CAD); 274 /* y31 = (uint32_t)0x00000000 ^ (x0 & (uint32_t)0x00000000); */ 275 276 y0 = y0 ^ (x1 & y1); 277 y1 = y2 ^ (x1 & y3); 278 y2 = y4 ^ (x1 & y5); 279 y3 = y6 ^ (x1 & y7); 280 y4 = y8 ^ (x1 & y9); 281 y5 = y10 ^ (x1 & y11); 282 y6 = y12 ^ (x1 & y13); 283 y7 = y14; /* was: y14 ^ (x1 & y15) */ 284 y8 = y16 ^ (x1 & y17); 285 y9 = y18 ^ (x1 & y19); 286 y10 = y20 ^ (x1 & y21); 287 y11 = y22 ^ (x1 & y23); 288 y12 = y24 ^ (x1 & y25); 289 y13 = y26 ^ (x1 & y27); 290 y14 = y28 ^ (x1 & y29); 291 y15 = y30; /* was: y30 ^ (x1 & y31) */ 292 293 y0 = y0 ^ (x2 & y1); 294 y1 = y2 ^ (x2 & y3); 295 y2 = y4 ^ (x2 & y5); 296 y3 = y6 ^ (x2 & y7); 297 y4 = y8 ^ (x2 & y9); 298 y5 = y10 ^ (x2 & y11); 299 y6 = y12 ^ (x2 & y13); 300 y7 = y14 ^ (x2 & y15); 301 302 y0 = y0 ^ (x3 & y1); 303 y1 = y2 ^ (x3 & y3); 304 y2 = y4 ^ (x3 & y5); 305 y3 = y6 ^ (x3 & y7); 306 307 y0 = y0 ^ (x4 & y1); 308 y1 = y2 ^ (x4 & y3); 309 310 y0 = y0 ^ (x5 & y1); 311 312 /* 313 * The P permutation: 314 * -- Each bit move is converted into a mask + left rotation. 315 * -- Rotations that use the same movement are coalesced together. 316 * -- Left and right shifts are used as alternatives to a rotation 317 * where appropriate (this will help architectures that do not have 318 * a rotation opcode). 319 */ 320 z0 = (y0 & (uint32_t)0x00000004) << 3; 321 z0 |= (y0 & (uint32_t)0x00004000) << 4; 322 z0 |= rotl(y0 & 0x12020120, 5); 323 z0 |= (y0 & (uint32_t)0x00100000) << 6; 324 z0 |= (y0 & (uint32_t)0x00008000) << 9; 325 z0 |= (y0 & (uint32_t)0x04000000) >> 22; 326 z0 |= (y0 & (uint32_t)0x00000001) << 11; 327 z0 |= rotl(y0 & 0x20000200, 12); 328 z0 |= (y0 & (uint32_t)0x00200000) >> 19; 329 z0 |= (y0 & (uint32_t)0x00000040) << 14; 330 z0 |= (y0 & (uint32_t)0x00010000) << 15; 331 z0 |= (y0 & (uint32_t)0x00000002) << 16; 332 z0 |= rotl(y0 & 0x40801800, 17); 333 z0 |= (y0 & (uint32_t)0x00080000) >> 13; 334 z0 |= (y0 & (uint32_t)0x00000010) << 21; 335 z0 |= (y0 & (uint32_t)0x01000000) >> 10; 336 z0 |= rotl(y0 & 0x88000008, 24); 337 z0 |= (y0 & (uint32_t)0x00000480) >> 7; 338 z0 |= (y0 & (uint32_t)0x00442000) >> 6; 339 return z0; 340 } 341 342 /* 343 * Process one block through 16 successive rounds, omitting the swap 344 * in the final round. 345 */ 346 static void 347 process_block_unit(uint32_t *pl, uint32_t *pr, const uint32_t *sk_exp) 348 { 349 int i; 350 uint32_t l, r; 351 352 l = *pl; 353 r = *pr; 354 for (i = 0; i < 16; i ++) { 355 uint32_t t; 356 357 t = l ^ Fconf(r, sk_exp); 358 l = r; 359 r = t; 360 sk_exp += 6; 361 } 362 *pl = r; 363 *pr = l; 364 } 365 366 /* see inner.h */ 367 void 368 br_des_ct_process_block(unsigned num_rounds, 369 const uint32_t *sk_exp, void *block) 370 { 371 unsigned char *buf; 372 uint32_t l, r; 373 374 buf = block; 375 l = br_dec32be(buf); 376 r = br_dec32be(buf + 4); 377 br_des_do_IP(&l, &r); 378 while (num_rounds -- > 0) { 379 process_block_unit(&l, &r, sk_exp); 380 sk_exp += 96; 381 } 382 br_des_do_invIP(&l, &r); 383 br_enc32be(buf, l); 384 br_enc32be(buf + 4, r); 385 } 386 387 /* see inner.h */ 388 void 389 br_des_ct_skey_expand(uint32_t *sk_exp, 390 unsigned num_rounds, const uint32_t *skey) 391 { 392 num_rounds <<= 4; 393 while (num_rounds -- > 0) { 394 uint32_t v, w0, w1, w2, w3; 395 396 v = *skey ++; 397 w0 = v & 0x11111111; 398 w1 = (v >> 1) & 0x11111111; 399 w2 = (v >> 2) & 0x11111111; 400 w3 = (v >> 3) & 0x11111111; 401 *sk_exp ++ = (w0 << 4) - w0; 402 *sk_exp ++ = (w1 << 4) - w1; 403 *sk_exp ++ = (w2 << 4) - w2; 404 *sk_exp ++ = (w3 << 4) - w3; 405 v = *skey ++; 406 w0 = v & 0x11111111; 407 w1 = (v >> 1) & 0x11111111; 408 *sk_exp ++ = (w0 << 4) - w0; 409 *sk_exp ++ = (w1 << 4) - w1; 410 } 411 } 412