/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 *	$FreeBSD$
 *
 */

/*
 * Figure 5, 8 and 12 are copied from the Intel white paper:
 *	Intel® Carry-Less Multiplication Instruction and its Usage for
 *	Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#include <string.h>		/* memcpy() is used below */
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
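
/*
 * gfmul() above computes a single product in GF(2^128): four PCLMULQDQs
 * build the 256-bit carry-less product, the product is shifted left by one
 * bit (the operands are byte-reversed GHASH values, so the bit order is
 * reflected), and the result is reduced modulo the GCM polynomial
 * x^128 + x^7 + x^2 + x + 1.  reduce4() below amortizes that work over four
 * blocks: the four 128x128 products are formed Karatsuba-style (the
 * _mm_shuffle_epi32(x, 78) swaps the 64-bit halves so each middle term
 * costs one PCLMULQDQ instead of two), summed, and reduced once.
 */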

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}
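
/*
 * Four GHASH iterations can be folded into one reduce4() call:
 *
 *	((((X ^ C1)*H ^ C2)*H ^ C3)*H ^ C4)*H =
 *	    (X ^ C1)*H^4 ^ C2*H^3 ^ C3*H^2 ^ C4*H
 *
 * so the loops below pass H, H^2, H^3 and H^4 together with four data
 * blocks and XOR the running digest into the first block only.
 */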

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Reduction Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
/*
 * AES_GCM_encrypt: encrypt nbytes of in into out and authenticate abytes of
 * addt, producing the 16-byte tag.  key points at the expanded AES
 * encryption schedule (nr + 1 round keys of 16 bytes each) and nr is the
 * AES round count (10, 12 or 14).  ivec/ibytes give the IV; a 96-bit IV
 * takes the fast path below.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		/* 96-bit IV: Y0 is the IV with a 32-bit big-endian 1 appended. */
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E(K, 0) and E(K, Y0) together. */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* Other IV lengths: H = E(K, 0), then Y0 = GHASH(H, {}, IV). */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes % 16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* Append the lengths (in bits) of the AAD and the ciphertext. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}

/*
 * My modification of _encrypt to be _decrypt.  Returns 1 when the tag
 * verifies and the plaintext has been written to out, 0 (with out left
 * untouched) when authentication fails.
 */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E(K, 0) and E(K, Y0) together. */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /* Compute E(K, Y0) */
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the ciphertext before decrypting. */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Append the lengths (in bits) of the AAD and the ciphertext. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0; /* authentication failed */

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		tmp2 = _mm_xor_si128(tmp2, KEY[0]);
		tmp3 = _mm_xor_si128(tmp3, KEY[0]);
		tmp4 = _mm_xor_si128(tmp4, KEY[0]);
		tmp5 = _mm_xor_si128(tmp5, KEY[0]);
		tmp6 = _mm_xor_si128(tmp6, KEY[0]);
		tmp7 = _mm_xor_si128(tmp7, KEY[0]);
		tmp8 = _mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 = _mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 = _mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 = _mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 = _mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1; /* success */
}
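
/*
 * A minimal userland round-trip sketch of the two entry points above.  It is
 * guarded by AESNI_GHASH_EXAMPLE, a build flag assumed here purely for
 * illustration, and it supplies its own AES-128 key expansion (in the kernel
 * this file is driven by the aesni(4) driver, which provides the expanded
 * key schedule).  Compile with -maes -mpclmul -msse4.1.
 */
#if defined(AESNI_GHASH_EXAMPLE) && !defined(_KERNEL)
#include <stdio.h>

/* Expand the previous AES-128 round key; kg = _mm_aeskeygenassist_si128(). */
static __m128i
example_keyexp(__m128i key, __m128i kg)
{
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	return _mm_xor_si128(key, _mm_shuffle_epi32(kg, 0xff));
}

#define	EXPAND(i, rcon)							\
	ks[i] = example_keyexp(ks[i - 1],				\
	    _mm_aeskeygenassist_si128(ks[i - 1], rcon))

int
main(void)
{
	__m128i ks[11];				/* 11 round keys for AES-128 */
	unsigned char key[16] = "0123456789abcde";
	unsigned char iv[16] = "exampleiv012";	/* 12 bytes used; padded to 16
						   because the 96-bit path
						   loads a full vector */
	unsigned char aad[20] = "example aad";
	unsigned char pt[1000], ct[1000], dec[1000], tag[16];
	unsigned int i;

	for (i = 0; i < sizeof(pt); i++)
		pt[i] = (unsigned char)i;

	/* Expand the AES-128 encryption key schedule (local to this example). */
	ks[0] = _mm_loadu_si128((const __m128i *)key);
	EXPAND(1, 0x01); EXPAND(2, 0x02); EXPAND(3, 0x04); EXPAND(4, 0x08);
	EXPAND(5, 0x10); EXPAND(6, 0x20); EXPAND(7, 0x40); EXPAND(8, 0x80);
	EXPAND(9, 0x1b); EXPAND(10, 0x36);

	AES_GCM_encrypt(pt, ct, aad, iv, tag, sizeof(pt), sizeof(aad),
	    96/8, (const unsigned char *)ks, 10);

	if (!AES_GCM_decrypt(ct, dec, aad, iv, tag, sizeof(ct), sizeof(aad),
	    96/8, (const unsigned char *)ks, 10) ||
	    memcmp(pt, dec, sizeof(pt)) != 0) {
		printf("round trip failed\n");
		return (1);
	}
	printf("round trip ok\n");
	return (0);
}
#endif /* AESNI_GHASH_EXAMPLE && !_KERNEL */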