/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 * $FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 *    Intel® Carry-Less Multiplication Instruction and its Usage for
 *    Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
        __m128i cmp;

        cmp = _mm_cmpeq_epi32(a, b);

        return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

        if (!ndx) {
                a = _mm_insert_epi32(a, b, 0);
                a = _mm_insert_epi32(a, b >> 32, 1);
        } else {
                a = _mm_insert_epi32(a, b, 2);
                a = _mm_insert_epi32(a, b >> 32, 3);
        }

        return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
        __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

        tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
        tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
        tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
        tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

        tmp4 = _mm_xor_si128(tmp4, tmp5);
        tmp5 = _mm_slli_si128(tmp4, 8);
        tmp4 = _mm_srli_si128(tmp4, 8);
        tmp3 = _mm_xor_si128(tmp3, tmp5);
        tmp6 = _mm_xor_si128(tmp6, tmp4);

        tmp7 = _mm_srli_epi32(tmp3, 31);
        tmp8 = _mm_srli_epi32(tmp6, 31);
        tmp3 = _mm_slli_epi32(tmp3, 1);
        tmp6 = _mm_slli_epi32(tmp6, 1);

        tmp9 = _mm_srli_si128(tmp7, 12);
        tmp8 = _mm_slli_si128(tmp8, 4);
        tmp7 = _mm_slli_si128(tmp7, 4);
        tmp3 = _mm_or_si128(tmp3, tmp7);
        tmp6 = _mm_or_si128(tmp6, tmp8);
        tmp6 = _mm_or_si128(tmp6, tmp9);

        tmp7 = _mm_slli_epi32(tmp3, 31);
        tmp8 = _mm_slli_epi32(tmp3, 30);
        tmp9 = _mm_slli_epi32(tmp3, 25);

        tmp7 = _mm_xor_si128(tmp7, tmp8);
        tmp7 = _mm_xor_si128(tmp7, tmp9);
        tmp8 = _mm_srli_si128(tmp7, 4);
        tmp7 = _mm_slli_si128(tmp7, 12);
        tmp3 = _mm_xor_si128(tmp3, tmp7);

        tmp2 = _mm_srli_epi32(tmp3, 1);
        tmp4 = _mm_srli_epi32(tmp3, 2);
        tmp5 = _mm_srli_epi32(tmp3, 7);
        tmp2 = _mm_xor_si128(tmp2, tmp4);
        tmp2 = _mm_xor_si128(tmp2, tmp5);
        tmp2 = _mm_xor_si128(tmp2, tmp8);
        tmp3 = _mm_xor_si128(tmp3, tmp2);
        tmp6 = _mm_xor_si128(tmp6, tmp3);

        *res = tmp6;
}
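
/*
 * Reference model (illustrative sketch only, kept under #if 0 and never
 * compiled; the function name is hypothetical): what one 64x64 -> 128 bit
 * carry-less multiply, as performed by _mm_clmulepi64_si128 above and in
 * reduce4() below, computes in plain C.
 */
#if 0
static void
clmul64_ref(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
{
        uint64_t l, h;
        int i;

        /* XOR (carry-less add) shifted copies of a for every set bit of b. */
        l = h = 0;
        for (i = 0; i < 64; i++) {
                if (b & (1ULL << i)) {
                        l ^= a << i;
                        h ^= i ? a >> (64 - i) : 0;
                }
        }
        *lo = l;
        *hi = h;
}
#endif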

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
        /*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
        __m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
            H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
        __m128i tmp0, tmp1, tmp2, tmp3;
        __m128i tmp4, tmp5, tmp6, tmp7;
        __m128i tmp8, tmp9;

        H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
        H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
        H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
        H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

        lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
        lo = _mm_xor_si128(lo, H3_X3_lo);
        lo = _mm_xor_si128(lo, H4_X4_lo);

        H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
        H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
        H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
        H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

        hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
        hi = _mm_xor_si128(hi, H3_X3_hi);
        hi = _mm_xor_si128(hi, H4_X4_hi);

        tmp0 = _mm_shuffle_epi32(H1, 78);
        tmp4 = _mm_shuffle_epi32(X1, 78);
        tmp0 = _mm_xor_si128(tmp0, H1);
        tmp4 = _mm_xor_si128(tmp4, X1);
        tmp1 = _mm_shuffle_epi32(H2, 78);
        tmp5 = _mm_shuffle_epi32(X2, 78);
        tmp1 = _mm_xor_si128(tmp1, H2);
        tmp5 = _mm_xor_si128(tmp5, X2);
        tmp2 = _mm_shuffle_epi32(H3, 78);
        tmp6 = _mm_shuffle_epi32(X3, 78);
        tmp2 = _mm_xor_si128(tmp2, H3);
        tmp6 = _mm_xor_si128(tmp6, X3);
        tmp3 = _mm_shuffle_epi32(H4, 78);
        tmp7 = _mm_shuffle_epi32(X4, 78);
        tmp3 = _mm_xor_si128(tmp3, H4);
        tmp7 = _mm_xor_si128(tmp7, X4);

        tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
        tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
        tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
        tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

        tmp0 = _mm_xor_si128(tmp0, lo);
        tmp0 = _mm_xor_si128(tmp0, hi);
        tmp0 = _mm_xor_si128(tmp1, tmp0);
        tmp0 = _mm_xor_si128(tmp2, tmp0);
        tmp0 = _mm_xor_si128(tmp3, tmp0);

        tmp4 = _mm_slli_si128(tmp0, 8);
        tmp0 = _mm_srli_si128(tmp0, 8);

        lo = _mm_xor_si128(tmp4, lo);
        hi = _mm_xor_si128(tmp0, hi);

        tmp3 = lo;
        tmp6 = hi;

        tmp7 = _mm_srli_epi32(tmp3, 31);
        tmp8 = _mm_srli_epi32(tmp6, 31);
        tmp3 = _mm_slli_epi32(tmp3, 1);
        tmp6 = _mm_slli_epi32(tmp6, 1);

        tmp9 = _mm_srli_si128(tmp7, 12);
        tmp8 = _mm_slli_si128(tmp8, 4);
        tmp7 = _mm_slli_si128(tmp7, 4);
        tmp3 = _mm_or_si128(tmp3, tmp7);
        tmp6 = _mm_or_si128(tmp6, tmp8);
        tmp6 = _mm_or_si128(tmp6, tmp9);

        tmp7 = _mm_slli_epi32(tmp3, 31);
        tmp8 = _mm_slli_epi32(tmp3, 30);
        tmp9 = _mm_slli_epi32(tmp3, 25);

        tmp7 = _mm_xor_si128(tmp7, tmp8);
        tmp7 = _mm_xor_si128(tmp7, tmp9);
        tmp8 = _mm_srli_si128(tmp7, 4);
        tmp7 = _mm_slli_si128(tmp7, 12);
        tmp3 = _mm_xor_si128(tmp3, tmp7);

        tmp2 = _mm_srli_epi32(tmp3, 1);
        tmp4 = _mm_srli_epi32(tmp3, 2);
        tmp5 = _mm_srli_epi32(tmp3, 7);
        tmp2 = _mm_xor_si128(tmp2, tmp4);
        tmp2 = _mm_xor_si128(tmp2, tmp5);
        tmp2 = _mm_xor_si128(tmp2, tmp8);
        tmp3 = _mm_xor_si128(tmp3, tmp2);
        tmp6 = _mm_xor_si128(tmp6, tmp3);

        *res = tmp6;
}
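
/*
 * Usage sketch (illustrative only, not compiled; B1..B4 are hypothetical
 * names for four consecutive byte-reflected GHASH blocks).  reduce4()
 * returns X1*H1 ^ X2*H2 ^ X3*H3 ^ X4*H4 in GF(2^128) with a single modular
 * reduction, so the callers below fold four blocks into the running digest
 * X as
 *
 *        X' = (X ^ B1)*H^4 ^ B2*H^3 ^ B3*H^2 ^ B4*H
 *
 * which matches what four sequential gfmul() steps would produce.
 */
#if 0
        B1 = _mm_xor_si128(X, B1);
        reduce4(H, H2, H3, H4, B4, B3, B2, B1, &X);
#endif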

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Reduction Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), i.e.
 * 2^36-32 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
        int i, j, k;
        __m128i tmp1, tmp2, tmp3, tmp4;
        __m128i tmp5, tmp6, tmp7, tmp8;
        __m128i H, H2, H3, H4, Y, T;
        __m128i *KEY = (__m128i*)key;
        __m128i ctr1, ctr2, ctr3, ctr4;
        __m128i ctr5, ctr6, ctr7, ctr8;
        __m128i last_block = _mm_setzero_si128();
        __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
        __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
        __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
            7);
        __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
            15);
        __m128i X = _mm_setzero_si128();

        if (ibytes == 96/8) {
                Y = _mm_loadu_si128((__m128i*)ivec);
                Y = _mm_insert_epi32(Y, 0x1000000, 3);
                /* Compute E[ZERO, KS] and E[Y0, KS] together */
                tmp1 = _mm_xor_si128(X, KEY[0]);
                tmp2 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

                H = _mm_aesenclast_si128(tmp1, KEY[nr]);
                T = _mm_aesenclast_si128(tmp2, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
        } else {
                tmp1 = _mm_xor_si128(X, KEY[0]);
                for (j=1; j <nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                H = _mm_aesenclast_si128(tmp1, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
                Y = _mm_setzero_si128();

                for (i=0; i < ibytes/16; i++) {
                        tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                if (ibytes%16) {
                        for (j=0; j < ibytes%16; j++)
                                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
                        tmp1 = last_block;
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
                tmp1 = _mm_insert_epi64(tmp1, 0, 1);

                Y = _mm_xor_si128(Y, tmp1);
                gfmul(Y, H, &Y);
                Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
                tmp1 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                T = _mm_aesenclast_si128(tmp1, KEY[nr]);
        }

        gfmul(H,H,&H2);
        gfmul(H,H2,&H3);
        gfmul(H,H3,&H4);
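
        /*
         * H^2, H^3 and H^4 are kept so that the GHASH passes below can fold
         * four blocks per reduce4() call; any tail that is not a multiple of
         * four blocks falls back to single-block gfmul() calls.
         */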
        for (i=0; i<abytes/16/4; i++) {
                tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
                tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
                tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
                tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
        }
        for (i=i*4; i<abytes/16; i++) {
                tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X,tmp1);
                gfmul(X, H, &X);
        }
        if (abytes%16) {
                last_block = _mm_setzero_si128();
                for (j=0; j<abytes%16; j++)
                        ((unsigned char*)&last_block)[j] = addt[i*16+j];
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X =_mm_xor_si128(X,tmp1);
                gfmul(X,H,&X);
        }

        ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
        ctr1 = _mm_add_epi64(ctr1, ONE);
        ctr2 = _mm_add_epi64(ctr1, ONE);
        ctr3 = _mm_add_epi64(ctr2, ONE);
        ctr4 = _mm_add_epi64(ctr3, ONE);
        ctr5 = _mm_add_epi64(ctr4, ONE);
        ctr6 = _mm_add_epi64(ctr5, ONE);
        ctr7 = _mm_add_epi64(ctr6, ONE);
        ctr8 = _mm_add_epi64(ctr7, ONE);
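
        /*
         * Bulk encrypt loop: eight counter blocks are run through AES side
         * by side (eight independent aesenc chains), then the eight
         * ciphertext blocks are folded into the GHASH state with two
         * reduce4() calls.
         */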
        for (i=0; i<nbytes/16/8; i++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
                tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
                tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
                tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
                tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
                tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
                tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

                ctr1 = _mm_add_epi64(ctr1, EIGHT);
                ctr2 = _mm_add_epi64(ctr2, EIGHT);
                ctr3 = _mm_add_epi64(ctr3, EIGHT);
                ctr4 = _mm_add_epi64(ctr4, EIGHT);
                ctr5 = _mm_add_epi64(ctr5, EIGHT);
                ctr6 = _mm_add_epi64(ctr6, EIGHT);
                ctr7 = _mm_add_epi64(ctr7, EIGHT);
                ctr8 = _mm_add_epi64(ctr8, EIGHT);

                tmp1 =_mm_xor_si128(tmp1, KEY[0]);
                tmp2 =_mm_xor_si128(tmp2, KEY[0]);
                tmp3 =_mm_xor_si128(tmp3, KEY[0]);
                tmp4 =_mm_xor_si128(tmp4, KEY[0]);
                tmp5 =_mm_xor_si128(tmp5, KEY[0]);
                tmp6 =_mm_xor_si128(tmp6, KEY[0]);
                tmp7 =_mm_xor_si128(tmp7, KEY[0]);
                tmp8 =_mm_xor_si128(tmp8, KEY[0]);

                for (j=1; j<nr; j++) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
                        tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
                        tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
                        tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
                        tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
                        tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
                        tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
                }
                tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
                tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
                tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
                tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
                tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
                tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
                tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

                tmp1 = _mm_xor_si128(tmp1,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
                tmp2 = _mm_xor_si128(tmp2,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
                tmp3 = _mm_xor_si128(tmp3,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
                tmp4 = _mm_xor_si128(tmp4,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
                tmp5 = _mm_xor_si128(tmp5,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
                tmp6 = _mm_xor_si128(tmp6,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
                tmp7 = _mm_xor_si128(tmp7,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
                tmp8 = _mm_xor_si128(tmp8,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));

                _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
                _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
                _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
                _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
                _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
                _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
                _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
                _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
                tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
                tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
                tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
                tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

                tmp5 = _mm_xor_si128(X, tmp5);
                reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
        }
        for (k=i*8; k<nbytes/16; k++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                ctr1 = _mm_add_epi64(ctr1, ONE);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
                _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X,H,&X);
        }
        // if one incomplete block remains
        if (nbytes%16) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
                last_block = tmp1;
                for (j=0; j<nbytes%16; j++)
                        out[k*16+j] = ((unsigned char*)&last_block)[j];
                for ((void)j; j<16; j++)
                        ((unsigned char*)&last_block)[j] = 0;
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X, H, &X);
        }
        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

        X = _mm_xor_si128(X, tmp1);
        gfmul(X,H,&X);
        X = _mm_shuffle_epi8(X, BSWAP_MASK);
        T = _mm_xor_si128(X, T);
        _mm_storeu_si128((__m128i*)tag, T);
}
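
/*
 * Usage sketch (illustrative only, not compiled; buffer names are
 * hypothetical): the caller passes the expanded AES round keys
 * (nr + 1 blocks of 16 bytes, nr = 10/12/14 for AES-128/192/256) and,
 * typically, a 96-bit IV.
 */
#if 0
        unsigned char tag[16];

        AES_GCM_encrypt(plaintext, ciphertext, aad, iv, tag,
            ptlen, aadlen, 96/8, round_keys, nr);
#endif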

/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
        int i, j, k;
        __m128i tmp1, tmp2, tmp3, tmp4;
        __m128i tmp5, tmp6, tmp7, tmp8;
        __m128i H, H2, H3, H4, Y, T;
        __m128i *KEY = (__m128i*)key;
        __m128i ctr1, ctr2, ctr3, ctr4;
        __m128i ctr5, ctr6, ctr7, ctr8;
        __m128i last_block = _mm_setzero_si128();
        __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
        __m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
        __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
            7);
        __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
            15);
        __m128i X = _mm_setzero_si128();

        if (ibytes == 96/8) {
                Y = _mm_loadu_si128((__m128i*)ivec);
                Y = _mm_insert_epi32(Y, 0x1000000, 3);
                /* Compute E[ZERO, KS] and E[Y0, KS] together */
                tmp1 = _mm_xor_si128(X, KEY[0]);
                tmp2 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

                H = _mm_aesenclast_si128(tmp1, KEY[nr]);
                T = _mm_aesenclast_si128(tmp2, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
        } else {
                tmp1 = _mm_xor_si128(X, KEY[0]);
                for (j=1; j <nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                H = _mm_aesenclast_si128(tmp1, KEY[nr]);

                H = _mm_shuffle_epi8(H, BSWAP_MASK);
                Y = _mm_setzero_si128();

                for (i=0; i < ibytes/16; i++) {
                        tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                if (ibytes%16) {
                        for (j=0; j < ibytes%16; j++)
                                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
                        tmp1 = last_block;
                        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                        Y = _mm_xor_si128(Y, tmp1);
                        gfmul(Y, H, &Y);
                }
                tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
                tmp1 = _mm_insert_epi64(tmp1, 0, 1);

                Y = _mm_xor_si128(Y, tmp1);
                gfmul(Y, H, &Y);
                Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
                tmp1 = _mm_xor_si128(Y, KEY[0]);
                for (j=1; j < nr; j++)
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                T = _mm_aesenclast_si128(tmp1, KEY[nr]);
        }

        gfmul(H,H,&H2);
        gfmul(H,H2,&H3);
        gfmul(H,H3,&H4);

        for (i=0; i<abytes/16/4; i++) {
                tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
                tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
                tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
                tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
        }
        for (i=i*4; i<abytes/16; i++) {
                tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X,tmp1);
                gfmul(X, H, &X);
        }
        if (abytes%16) {
                last_block = _mm_setzero_si128();
                for (j=0; j<abytes%16; j++)
                        ((unsigned char*)&last_block)[j] = addt[i*16+j];
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X =_mm_xor_si128(X,tmp1);
                gfmul(X,H,&X);
        }

        /* This is where we validate the cipher text before decrypt */
        for (i = 0; i<nbytes/16/4; i++) {
                tmp1 = _mm_loadu_si128(&((__m128i*)in)[i*4]);
                tmp2 = _mm_loadu_si128(&((__m128i*)in)[i*4+1]);
                tmp3 = _mm_loadu_si128(&((__m128i*)in)[i*4+2]);
                tmp4 = _mm_loadu_si128(&((__m128i*)in)[i*4+3]);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

                tmp1 = _mm_xor_si128(X, tmp1);

                reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
        }
        for (i = i*4; i<nbytes/16; i++) {
                tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X,H,&X);
        }
        if (nbytes%16) {
                last_block = _mm_setzero_si128();
                for (j=0; j<nbytes%16; j++)
                        ((unsigned char*)&last_block)[j] = in[i*16+j];
                tmp1 = last_block;
                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                X = _mm_xor_si128(X, tmp1);
                gfmul(X, H, &X);
        }

        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

        X = _mm_xor_si128(X, tmp1);
        gfmul(X,H,&X);
        X = _mm_shuffle_epi8(X, BSWAP_MASK);
        T = _mm_xor_si128(X, T);
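
        /*
         * T now holds the expected tag, computed over the AAD and the whole
         * ciphertext.  It is checked against the caller's tag before any
         * plaintext is written below.
         */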
        if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
                return 0; // authentication failed

        ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
        ctr1 = _mm_add_epi64(ctr1, ONE);
        ctr2 = _mm_add_epi64(ctr1, ONE);
        ctr3 = _mm_add_epi64(ctr2, ONE);
        ctr4 = _mm_add_epi64(ctr3, ONE);
        ctr5 = _mm_add_epi64(ctr4, ONE);
        ctr6 = _mm_add_epi64(ctr5, ONE);
        ctr7 = _mm_add_epi64(ctr6, ONE);
        ctr8 = _mm_add_epi64(ctr7, ONE);

        for (i=0; i<nbytes/16/8; i++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
                tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
                tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
                tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
                tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
                tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
                tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

                ctr1 = _mm_add_epi64(ctr1, EIGHT);
                ctr2 = _mm_add_epi64(ctr2, EIGHT);
                ctr3 = _mm_add_epi64(ctr3, EIGHT);
                ctr4 = _mm_add_epi64(ctr4, EIGHT);
                ctr5 = _mm_add_epi64(ctr5, EIGHT);
                ctr6 = _mm_add_epi64(ctr6, EIGHT);
                ctr7 = _mm_add_epi64(ctr7, EIGHT);
                ctr8 = _mm_add_epi64(ctr8, EIGHT);

                tmp1 =_mm_xor_si128(tmp1, KEY[0]);
                tmp2 =_mm_xor_si128(tmp2, KEY[0]);
                tmp3 =_mm_xor_si128(tmp3, KEY[0]);
                tmp4 =_mm_xor_si128(tmp4, KEY[0]);
                tmp5 =_mm_xor_si128(tmp5, KEY[0]);
                tmp6 =_mm_xor_si128(tmp6, KEY[0]);
                tmp7 =_mm_xor_si128(tmp7, KEY[0]);
                tmp8 =_mm_xor_si128(tmp8, KEY[0]);

                for (j=1; j<nr; j++) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
                        tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
                        tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
                        tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
                        tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
                        tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
                        tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
                }
                tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
                tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
                tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
                tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
                tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
                tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
                tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

                tmp1 = _mm_xor_si128(tmp1,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
                tmp2 = _mm_xor_si128(tmp2,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
                tmp3 = _mm_xor_si128(tmp3,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
                tmp4 = _mm_xor_si128(tmp4,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
                tmp5 = _mm_xor_si128(tmp5,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
                tmp6 = _mm_xor_si128(tmp6,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
                tmp7 = _mm_xor_si128(tmp7,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
                tmp8 = _mm_xor_si128(tmp8,
                    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));

                _mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
                _mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
                _mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
                _mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
                _mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
                _mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
                _mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
                _mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

                tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
                tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
                tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
                tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
                tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
                tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
                tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
                tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
        }
        for (k=i*8; k<nbytes/16; k++) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                ctr1 = _mm_add_epi64(ctr1, ONE);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
                _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
        }
        // if one incomplete block remains
        if (nbytes%16) {
                tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
                tmp1 = _mm_xor_si128(tmp1, KEY[0]);
                for (j=1; j<nr-1; j+=2) {
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
                        tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
                }
                tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
                tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
                tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
                last_block = tmp1;
                for (j=0; j<nbytes%16; j++)
                        out[k*16+j] = ((unsigned char*)&last_block)[j];
        }
        return 1; // returns 1 when successful
}
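
/*
 * Usage sketch (illustrative only, not compiled; buffer names are
 * hypothetical): AES_GCM_decrypt() verifies the tag over the AAD and the
 * ciphertext first and returns 0 without writing any plaintext when
 * verification fails.
 */
#if 0
        if (AES_GCM_decrypt(ciphertext, plaintext, aad, iv, tag,
            ctlen, aadlen, 96/8, round_keys, nr) == 0) {
                /* authentication failed; no plaintext was produced */
        }
#endif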