/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 */

/*
 * Figures 5, 8, and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
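/*
 * The PINSRQ instruction behind _mm_insert_epi64() is only encodable in
 * 64-bit mode, so for i386 builds provide an equivalent built from two
 * 32-bit inserts.
 */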
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
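/*
 * gfmul() computes the GHASH product of two 128-bit operands: a carry-less
 * multiplication followed by reduction modulo the GCM polynomial
 * x^128 + x^7 + x^2 + x + 1.  Operands are expected in the byte-swapped
 * form produced by BSWAP_MASK below.
 */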
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method */
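/*
 * reduce4() folds four GHASH input blocks at once: it forms
 * X1*H1 + X2*H2 + X3*H3 + X4*H4 (using Karatsuba for the middle products)
 * and then performs a single reduction modulo the GCM polynomial, which is
 * cheaper than four back-to-back gfmul() calls.
 */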
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^32-256*8*16 bytes.
 */
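/*
 * AES_GCM_encrypt() derives the hash key H = E_K(0) and the initial counter
 * block from ivec, CTR-encrypts nbytes of in into out eight blocks per
 * iteration, GHASHes the additional data and the ciphertext four blocks per
 * aggregated reduction, and stores the 16-byte tag.
 */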
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

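	/*
	 * Per the GCM spec, a 96-bit IV forms the pre-counter block
	 * directly as IV || 0^31 || 1; any other IV length is run through
	 * GHASH (including a final length block) to produce Y0.
	 */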
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	// If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes % 16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}

/* My modification of _encrypt to be _decrypt */
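/*
 * AES_GCM_decrypt() first recomputes the GHASH over the additional data and
 * the ciphertext and checks the resulting tag against the one supplied by
 * the caller; no plaintext is written unless that check passes.  Returns 1
 * on success, 0 on authentication failure.
 */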
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
    const unsigned char *addt, const unsigned char *ivec,
    const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
    const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0;	// authentication failed

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	// If one incomplete block remains
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1;	// success
}