/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	$FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 *	Intel® Carry-Less Multiplication Instruction and its Usage for
 *	Computing the GCM Mode
 *
 * and as such are:
 *	Copyright © 2010 Intel Corporation.
 *	All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
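
/*
 * Illustrative sketch (not part of the original code, kept out of the
 * build with #if 0): gfmul() multiplies two 128-bit GF(2^128) elements
 * with PCLMULQDQ and reduces the result, so a single GHASH step can be
 * written as below.  The helper name, and the assumption that the state
 * and block are already in the byte-swapped (BSWAP_MASK) representation
 * used by the callers in this file, are mine.
 */
#if 0
static void
ghash_update_one(__m128i *state, __m128i h, __m128i block)
{

	/* GHASH step: X_i = (X_{i-1} ^ C_i) * H */
	*state = _mm_xor_si128(*state, block);
	gfmul(*state, h, state);
}
#endif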

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
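
/*
 * Illustrative sketch (not part of the original code, kept out of the
 * build with #if 0): with the argument order used by the callers below,
 * reduce4(H, H^2, H^3, H^4, C4, C3, C2, X ^ C1, &X) is algebraically
 * equivalent to folding four blocks into the GHASH state one at a time,
 * but performs only a single reduction.  The reference helper name
 * below is mine.
 */
#if 0
static void
ghash_update_four_reference(__m128i *state, __m128i h,
    __m128i c1, __m128i c2, __m128i c3, __m128i c4)
{

	/* X = ((((X^C1)*H ^ C2)*H ^ C3)*H ^ C4)*H */
	*state = _mm_xor_si128(*state, c1);
	gfmul(*state, h, state);
	*state = _mm_xor_si128(*state, c2);
	gfmul(*state, h, state);
	*state = _mm_xor_si128(*state, c3);
	gfmul(*state, h, state);
	*state = _mm_xor_si128(*state, c4);
	gfmul(*state, h, state);
}
#endif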

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Reduction Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	__m128i *KEY = (__m128i*)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((__m128i*)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
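	/* Hash any remaining complete AAD blocks one at a time. */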
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
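
		/*
		 * Byte-swap the eight ciphertext blocks just produced and
		 * fold them into the running GHASH state with two aggregated
		 * reductions of four blocks each.
		 */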
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}

/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	__m128i *KEY = (__m128i*)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((__m128i*)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the ciphertext before decrypting */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((__m128i*)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((__m128i*)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((__m128i*)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
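
	/*
	 * Byte-swap the final hash value, combine it with E(K, Y0) and
	 * compare the result against the supplied tag before any
	 * plaintext is produced.
	 */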
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((__m128i*)tag)))
		return 0; /* in case the authentication failed */

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1; /* when successful, return 1 */
}
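
/*
 * Illustrative sketch (not part of the original code, kept out of the
 * build with #if 0): how a caller might drive AES_GCM_encrypt() and
 * AES_GCM_decrypt().  The key schedule layout follows the code above
 * (KEY[0..nr] are the 16-byte AES round keys and nr is the number of
 * rounds, e.g. 10 for AES-128); expand_aes_key() is a hypothetical
 * helper standing in for whatever key expansion the caller uses.
 */
#if 0
static void
example(void)
{
	unsigned char user_key[16];
	unsigned char sched[(14 + 1) * 16];	/* room for up to AES-256 */
	unsigned char iv[12], tag[16];
	unsigned char pt[64], ct[64], aad[20];
	int nr;

	nr = expand_aes_key(user_key, sizeof(user_key), sched);	/* hypothetical */

	AES_GCM_encrypt(pt, ct, aad, iv, tag, sizeof(pt), sizeof(aad),
	    sizeof(iv), sched, nr);

	if (!AES_GCM_decrypt(ct, pt, aad, iv, tag, sizeof(ct), sizeof(aad),
	    sizeof(iv), sched, nr)) {
		/* Authentication failed; no plaintext was written. */
	}
}
#endif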