/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

/*
 * Figures 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
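/*
 * Illustrative sketch (not part of the original source): gfmul() is the
 * single-block GHASH step, computing the carry-less product of two 128-bit
 * field elements reduced modulo the GCM polynomial.  The compiled-out helper
 * below, with a hypothetical name, shows the usual way it is chained over a
 * buffer of blocks: Y = (Y ^ X[i]) * H.
 */
#if 0
static void
ghash_update_blocks(__m128i *Y, __m128i H, const __m128i *blocks, int nblocks)
{
	const __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,
	    12,13,14,15);
	__m128i x;
	int i;

	for (i = 0; i < nblocks; i++) {
		/* byte-swap each block into the representation gfmul expects */
		x = _mm_shuffle_epi8(_mm_loadu_si128(&blocks[i]), BSWAP_MASK);
		*Y = _mm_xor_si128(*Y, x);
		gfmul(*Y, H, Y);
	}
}
#endif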
/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
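/*
 * Illustrative note (not part of the original source): reduce4() evaluates
 * the aggregated form of four GHASH steps,
 *
 *   H1*X1 ^ H2*X2 ^ H3*X3 ^ H4*X4
 *
 * with a single reduction.  The callers below pass H1..H4 = H, H^2, H^3, H^4
 * and X1..X4 = C4, C3, C2, (X ^ C1), which equals the serial chain
 * ((((X ^ C1)*H ^ C2)*H ^ C3)*H ^ C4)*H.  A reference check built on gfmul()
 * might look like this hypothetical, compiled-out helper:
 */
#if 0
static void
reduce4_check(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	__m128i p1, p2, p3, p4;

	gfmul(X1, H1, &p1);
	gfmul(X2, H2, &p2);
	gfmul(X3, H3, &p3);
	gfmul(X4, H4, &p4);
	*res = _mm_xor_si128(_mm_xor_si128(p1, p2), _mm_xor_si128(p3, p4));
}
#endif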
/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 bits,
 * i.e. (2^39-256)/8 = 2^36-32 bytes.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);
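	/*
	 * H, H2, H3 and H4 are successive powers of the GHASH key used by
	 * reduce4().  The loops below fold the additional authenticated data
	 * (addt) into the running GHASH state X, four 16-byte blocks at a
	 * time where possible, then one block at a time, then a zero-padded
	 * partial block.
	 */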
	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);
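	/*
	 * Bulk encryption: each iteration of the loop below generates eight
	 * counter blocks, runs them through AES in parallel, XORs them with
	 * eight blocks of plaintext, and folds the resulting ciphertext into
	 * the GHASH state X with two reduce4() calls.
	 */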
	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}
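/*
 * Usage sketch (not part of the original source): AES_GCM_encrypt() expects
 * `key` to point at the expanded AES round-key schedule, i.e. (nr + 1)
 * 16-byte round keys with nr = 10/12/14 for AES-128/192/256, not at the raw
 * user key.  The key-expansion helper named below is hypothetical.
 */
#if 0
static void
example_seal(const unsigned char *userkey,	/* 16-byte AES-128 key */
    const unsigned char *iv,			/* 12-byte IV */
    const unsigned char *aad, uint32_t aadlen,
    const unsigned char *pt, unsigned char *ct, uint32_t ptlen,
    unsigned char tag[16])
{
	unsigned char sched[(10 + 1) * 16];

	expand_aes128_key(userkey, sched);	/* hypothetical helper */
	AES_GCM_encrypt(pt, ct, aad, iv, tag, ptlen, aadlen, 12, sched, 10);
}
#endif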
/* My modification of _encrypt to be _decrypt */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the ciphertext before decrypting */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0;	/* authentication failed */
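	/*
	 * The tag matched, so the ciphertext is authentic.  The remainder of
	 * the function runs the same CTR keystream generation as
	 * AES_GCM_encrypt() to recover the plaintext; no further GHASH
	 * updates are needed, since the hash was already computed over the
	 * ciphertext above.
	 */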
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
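	/*
	 * Decrypt any remaining whole blocks one at a time, then the final
	 * partial block, using the same single-block CTR path as the encrypt
	 * routine.
	 */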
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1;	/* authentication succeeded */
}
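/*
 * Usage sketch (not part of the original source): AES_GCM_decrypt() returns 1
 * and writes the plaintext only when the supplied tag authenticates the
 * ciphertext and AAD; on a tag mismatch it returns 0 without decrypting.  As
 * above, `sched` and `expand_aes128_key` are hypothetical stand-ins for the
 * caller's expanded AES key schedule.
 */
#if 0
static int
example_open(const unsigned char *userkey, const unsigned char *iv,
    const unsigned char *aad, uint32_t aadlen,
    const unsigned char *ct, unsigned char *pt, uint32_t ctlen,
    const unsigned char tag[16])
{
	unsigned char sched[(10 + 1) * 16];

	expand_aes128_key(userkey, sched);	/* hypothetical helper */
	return (AES_GCM_decrypt(ct, pt, aad, iv, tag, ctlen, aadlen, 12,
	    sched, 10));
}
#endif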