/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
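
/*
 * Illustrative sketch (hypothetical helper, not used elsewhere in this
 * file): the schoolbook combination of the four 64x64 carry-less partial
 * products that gfmul() starts from.  For _mm_clmulepi64_si128(), imm8
 * bit 0 selects the qword of the first operand and bit 4 the qword of
 * the second, so 0x00/0x10/0x01/0x11 give lo*lo, lo*hi, hi*lo and hi*hi.
 */
static inline void
clmul128(__m128i a, __m128i b, __m128i *prodlo, __m128i *prodhi)
{
	__m128i lo, mid, mid2, hi;

	lo   = _mm_clmulepi64_si128(a, b, 0x00);	/* a.lo * b.lo */
	mid  = _mm_clmulepi64_si128(a, b, 0x10);	/* a.lo * b.hi */
	mid2 = _mm_clmulepi64_si128(a, b, 0x01);	/* a.hi * b.lo */
	hi   = _mm_clmulepi64_si128(a, b, 0x11);	/* a.hi * b.hi */

	/* fold the two middle products and split them across the halves */
	mid = _mm_xor_si128(mid, mid2);
	*prodlo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
	*prodhi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));
}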

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
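
/*
 * Hypothetical cross-check, not used by the driver: the serial gfmul()
 * chain that one reduce4() call aggregates.  With H2 = H^2, H3 = H^3 and
 * H4 = H^4 (as computed below in AES_GCM_encrypt()/AES_GCM_decrypt()),
 *   reduce4(H, H2, H3, H4, d4, d3, d2, _mm_xor_si128(prev, d1), &res)
 * yields the same result as hashing the four blocks d1..d4 one at a time.
 */
static inline void
ghash4_serial(__m128i H, __m128i prev, __m128i d1, __m128i d2, __m128i d3,
    __m128i d4, __m128i *res)
{
	__m128i Y;

	Y = _mm_xor_si128(prev, d1);
	gfmul(Y, H, &Y);
	Y = _mm_xor_si128(Y, d2);
	gfmul(Y, H, &Y);
	Y = _mm_xor_si128(Y, d3);
	gfmul(Y, H, &Y);
	Y = _mm_xor_si128(Y, d4);
	gfmul(Y, H, &Y);

	*res = Y;
}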

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), i.e.
 * at most 2^32 - 2 full 16-byte blocks (2^36 - 32 bytes).
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);
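
	/*
	 * Note on the counters (editorial assumption derived from the code
	 * above): ctr1..ctr8 hold the counter block with each 64-bit half
	 * byte-reversed (BSWAP_EPI64), so the _mm_add_epi64() of ONE/EIGHT
	 * advances the big-endian 32-bit counter field of the GCM counter
	 * block; the two agree as long as that 32-bit field does not wrap
	 * within a single message.
	 */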
	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes % 16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}
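
/*
 * Usage sketch (hypothetical, not part of the driver interface): `key'
 * must point to the expanded AES round-key schedule, i.e. the nr + 1
 * 16-byte round keys addressed as KEY[0..nr] above, and `nr' is the AES
 * round count (10/12/14 for AES-128/-192/-256).  The schedule itself is
 * assumed to be produced elsewhere, e.g. by the aesni key-setup code; the
 * names below are illustrative only.
 */
static inline void
example_gcm_encrypt(const unsigned char *key_sched,	/* (nr + 1) * 16 bytes */
    const unsigned char iv[12], const unsigned char *aad, uint32_t aadlen,
    const unsigned char *pt, uint32_t ptlen, unsigned char *ct,
    unsigned char tag[16])
{

	/* 12-byte IV, AES-128 key schedule (nr = 10) */
	AES_GCM_encrypt(pt, ct, aad, iv, tag, ptlen, aadlen, 12, key_sched, 10);
}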

/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the ciphertext before decrypting */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0;	/* authentication failed */

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1;	/* successful */
}
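
/*
 * Usage sketch (hypothetical; same key-schedule assumptions as the encrypt
 * sketch above): AES_GCM_decrypt() authenticates the ciphertext before
 * decrypting and returns 1 on success or 0 on tag mismatch, in which case
 * `out' is not written.
 */
static inline int
example_gcm_decrypt(const unsigned char *key_sched,	/* (nr + 1) * 16 bytes */
    const unsigned char iv[12], const unsigned char *aad, uint32_t aadlen,
    const unsigned char *ct, uint32_t ctlen, unsigned char *pt,
    const unsigned char tag[16])
{

	/* returns 0 and leaves `pt' untouched if the tag does not verify */
	return AES_GCM_decrypt(ct, pt, aad, iv, tag, ctlen, aadlen, 12,
	    key_sched, 10);
}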