xref: /freebsd/sys/crypto/aesni/aesni_ghash.c (revision 545ddfbe7d4fe8adfb862903b24eac1d5896c1ef)
1 /*-
2  * Copyright (c) 2014 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by John-Mark Gurney under
6  * the sponsorship of the FreeBSD Foundation and
7  * Rubicon Communications, LLC (Netgate).
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1.  Redistributions of source code must retain the above copyright
12  *     notice, this list of conditions and the following disclaimer.
13  * 2.  Redistributions in binary form must reproduce the above copyright
14  *     notice, this list of conditions and the following disclaimer in the
15  *     documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *
30  *	$FreeBSD$
31  *
32  */
33 
34 /*
35  * Figure 5, 8 and 12 are copied from the Intel white paper:
36  * Intel® Carry-Less Multiplication Instruction and its Usage for
37  * Computing the GCM Mode
38  *
39  * and as such are:
40  * Copyright © 2010 Intel Corporation.
41  * All rights reserved.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  *   * Redistributions of source code must retain the above copyright
47  *     notice, this list of conditions and the following disclaimer.
48  *   * Redistributions in binary form must reproduce the above copyright
49  *     notice, this list of conditions and the following disclaimer in the
50  *     documentation and/or other materials provided with the distribution.
51  *   * Neither the name of Intel Corporation nor the
52  *     names of its contributors may be used to endorse or promote products
53  *     derived from this software without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
56  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
57  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
58  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
59  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
60  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
61  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
62  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
63  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
64  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
65  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
66  */
67 
68 #ifdef _KERNEL
69 #include <crypto/aesni/aesni.h>
70 #else
71 #include <stdint.h>
72 #endif
73 
74 #include <wmmintrin.h>
75 #include <emmintrin.h>
76 #include <smmintrin.h>
77 
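/*
 * Compare two 128-bit values for equality; returns nonzero iff all bytes
 * match.  Used by AES_GCM_decrypt to check the computed tag.
 */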
78 static inline int
79 m128icmp(__m128i a, __m128i b)
80 {
81 	__m128i cmp;
82 
83 	cmp = _mm_cmpeq_epi32(a, b);
84 
85 	return _mm_movemask_epi8(cmp) == 0xffff;
86 }
87 
88 #ifdef __i386__
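/*
 * i386 has no native 64-bit element insert (PINSRQ needs 64-bit mode), so
 * emulate _mm_insert_epi64 with two 32-bit inserts.
 */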
89 static inline __m128i
90 _mm_insert_epi64(__m128i a, int64_t b, const int ndx)
91 {
92 
93 	if (!ndx) {
94 		a = _mm_insert_epi32(a, b, 0);
95 		a = _mm_insert_epi32(a, b >> 32, 1);
96 	} else {
97 		a = _mm_insert_epi32(a, b, 2);
98 		a = _mm_insert_epi32(a, b >> 32, 3);
99 	}
100 
101 	return a;
102 }
103 #endif
104 
105 /* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
106 
107 /* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
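/*
 * gfmul computes the GHASH product a*b in GF(2^128): a 128x128-bit
 * carry-less multiply, a one-bit left shift to account for the reflected
 * bit order, and reduction modulo x^128 + x^7 + x^2 + x + 1.
 */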
108 static void
109 gfmul(__m128i a, __m128i b, __m128i *res)
110 {
111 	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
112 
113 	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
114 	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
115 	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
116 	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
117 
118 	tmp4 = _mm_xor_si128(tmp4, tmp5);
119 	tmp5 = _mm_slli_si128(tmp4, 8);
120 	tmp4 = _mm_srli_si128(tmp4, 8);
121 	tmp3 = _mm_xor_si128(tmp3, tmp5);
122 	tmp6 = _mm_xor_si128(tmp6, tmp4);
123 
124 	tmp7 = _mm_srli_epi32(tmp3, 31);
125 	tmp8 = _mm_srli_epi32(tmp6, 31);
126 	tmp3 = _mm_slli_epi32(tmp3, 1);
127 	tmp6 = _mm_slli_epi32(tmp6, 1);
128 
129 	tmp9 = _mm_srli_si128(tmp7, 12);
130 	tmp8 = _mm_slli_si128(tmp8, 4);
131 	tmp7 = _mm_slli_si128(tmp7, 4);
132 	tmp3 = _mm_or_si128(tmp3, tmp7);
133 	tmp6 = _mm_or_si128(tmp6, tmp8);
134 	tmp6 = _mm_or_si128(tmp6, tmp9);
135 
136 	tmp7 = _mm_slli_epi32(tmp3, 31);
137 	tmp8 = _mm_slli_epi32(tmp3, 30);
138 	tmp9 = _mm_slli_epi32(tmp3, 25);
139 
140 	tmp7 = _mm_xor_si128(tmp7, tmp8);
141 	tmp7 = _mm_xor_si128(tmp7, tmp9);
142 	tmp8 = _mm_srli_si128(tmp7, 4);
143 	tmp7 = _mm_slli_si128(tmp7, 12);
144 	tmp3 = _mm_xor_si128(tmp3, tmp7);
145 
146 	tmp2 = _mm_srli_epi32(tmp3, 1);
147 	tmp4 = _mm_srli_epi32(tmp3, 2);
148 	tmp5 = _mm_srli_epi32(tmp3, 7);
149 	tmp2 = _mm_xor_si128(tmp2, tmp4);
150 	tmp2 = _mm_xor_si128(tmp2, tmp5);
151 	tmp2 = _mm_xor_si128(tmp2, tmp8);
152 	tmp3 = _mm_xor_si128(tmp3, tmp2);
153 	tmp6 = _mm_xor_si128(tmp6, tmp3);
154 
155 	*res = tmp6;
156 }
157 
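/*
 * Illustrative sketch (not used by the driver): the single-block GHASH
 * update performed by the scalar tail loops below, X_i = (X_{i-1} ^ C_i) * H.
 * The block is assumed to be byte-reflected (BSWAP_MASK) already, since
 * gfmul operates on reflected representations.
 */
static inline void
ghash_update_1(__m128i *X, __m128i H, __m128i block_be)
{

	*X = _mm_xor_si128(*X, block_be);
	gfmul(*X, H, X);
}
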
158 /*
159  * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
160  * Method */
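/*
 * reduce4 computes X1*H1 ^ X2*H2 ^ X3*H3 ^ X4*H4 over GF(2^128) with a
 * single reduction; the per-pair products use the Karatsuba trick.
 * Callers pass the running hash folded into the oldest block as X4, so
 * that it is multiplied by H4 = H^4.
 */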
161 static void
162 reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
163     __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
164 {
165 	/*algorithm by Krzysztof Jankowski, Pierre Laurent - Intel*/
166 	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
167 	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
168 	__m128i tmp0, tmp1, tmp2, tmp3;
169 	__m128i tmp4, tmp5, tmp6, tmp7;
170 	__m128i tmp8, tmp9;
171 
172 	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
173 	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
174 	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
175 	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);
176 
177 	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
178 	lo = _mm_xor_si128(lo, H3_X3_lo);
179 	lo = _mm_xor_si128(lo, H4_X4_lo);
180 
181 	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
182 	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
183 	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
184 	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);
185 
186 	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
187 	hi = _mm_xor_si128(hi, H3_X3_hi);
188 	hi = _mm_xor_si128(hi, H4_X4_hi);
189 
190 	tmp0 = _mm_shuffle_epi32(H1, 78);
191 	tmp4 = _mm_shuffle_epi32(X1, 78);
192 	tmp0 = _mm_xor_si128(tmp0, H1);
193 	tmp4 = _mm_xor_si128(tmp4, X1);
194 	tmp1 = _mm_shuffle_epi32(H2, 78);
195 	tmp5 = _mm_shuffle_epi32(X2, 78);
196 	tmp1 = _mm_xor_si128(tmp1, H2);
197 	tmp5 = _mm_xor_si128(tmp5, X2);
198 	tmp2 = _mm_shuffle_epi32(H3, 78);
199 	tmp6 = _mm_shuffle_epi32(X3, 78);
200 	tmp2 = _mm_xor_si128(tmp2, H3);
201 	tmp6 = _mm_xor_si128(tmp6, X3);
202 	tmp3 = _mm_shuffle_epi32(H4, 78);
203 	tmp7 = _mm_shuffle_epi32(X4, 78);
204 	tmp3 = _mm_xor_si128(tmp3, H4);
205 	tmp7 = _mm_xor_si128(tmp7, X4);
206 
207 	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
208 	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
209 	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
210 	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);
211 
212 	tmp0 = _mm_xor_si128(tmp0, lo);
213 	tmp0 = _mm_xor_si128(tmp0, hi);
214 	tmp0 = _mm_xor_si128(tmp1, tmp0);
215 	tmp0 = _mm_xor_si128(tmp2, tmp0);
216 	tmp0 = _mm_xor_si128(tmp3, tmp0);
217 
218 	tmp4 = _mm_slli_si128(tmp0, 8);
219 	tmp0 = _mm_srli_si128(tmp0, 8);
220 
221 	lo = _mm_xor_si128(tmp4, lo);
222 	hi = _mm_xor_si128(tmp0, hi);
223 
224 	tmp3 = lo;
225 	tmp6 = hi;
226 
227 	tmp7 = _mm_srli_epi32(tmp3, 31);
228 	tmp8 = _mm_srli_epi32(tmp6, 31);
229 	tmp3 = _mm_slli_epi32(tmp3, 1);
230 	tmp6 = _mm_slli_epi32(tmp6, 1);
231 
232 	tmp9 = _mm_srli_si128(tmp7, 12);
233 	tmp8 = _mm_slli_si128(tmp8, 4);
234 	tmp7 = _mm_slli_si128(tmp7, 4);
235 	tmp3 = _mm_or_si128(tmp3, tmp7);
236 	tmp6 = _mm_or_si128(tmp6, tmp8);
237 	tmp6 = _mm_or_si128(tmp6, tmp9);
238 
239 	tmp7 = _mm_slli_epi32(tmp3, 31);
240 	tmp8 = _mm_slli_epi32(tmp3, 30);
241 	tmp9 = _mm_slli_epi32(tmp3, 25);
242 
243 	tmp7 = _mm_xor_si128(tmp7, tmp8);
244 	tmp7 = _mm_xor_si128(tmp7, tmp9);
245 	tmp8 = _mm_srli_si128(tmp7, 4);
246 	tmp7 = _mm_slli_si128(tmp7, 12);
247 	tmp3 = _mm_xor_si128(tmp3, tmp7);
248 
249 	tmp2 = _mm_srli_epi32(tmp3, 1);
250 	tmp4 = _mm_srli_epi32(tmp3, 2);
251 	tmp5 = _mm_srli_epi32(tmp3, 7);
252 	tmp2 = _mm_xor_si128(tmp2, tmp4);
253 	tmp2 = _mm_xor_si128(tmp2, tmp5);
254 	tmp2 = _mm_xor_si128(tmp2, tmp8);
255 	tmp3 = _mm_xor_si128(tmp3, tmp2);
256 	tmp6 = _mm_xor_si128(tmp6, tmp3);
257 
258 	*res = tmp6;
259 }
260 
261 /*
262  * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
263  * Every Four Blocks
264  */
265 /*
266  * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39 - 256 bits, i.e.
267  * 2^36 - 32 bytes (just under 64 GiB).
268  */
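/*
 * Arguments, as used below: in/out hold nbytes of plaintext/ciphertext,
 * addt holds abytes of additional authenticated data, ivec holds ibytes of
 * IV, tag receives the 16-byte tag, key points to the expanded schedule of
 * nr + 1 round keys, and nr is the AES round count (10, 12 or 14).
 */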
269 void
270 AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
271 	const unsigned char *addt, const unsigned char *ivec,
272 	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
273 	const unsigned char *key, int nr)
274 {
275 	int i, j, k;
276 	__m128i tmp1, tmp2, tmp3, tmp4;
277 	__m128i tmp5, tmp6, tmp7, tmp8;
278 	__m128i H, H2, H3, H4, Y, T;
279 	__m128i *KEY = (__m128i*)key;
280 	__m128i ctr1, ctr2, ctr3, ctr4;
281 	__m128i ctr5, ctr6, ctr7, ctr8;
282 	__m128i last_block = _mm_setzero_si128();
283 	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
284 	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
285 	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
286 	    7);
287 	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
288 	    15);
289 	__m128i X = _mm_setzero_si128();
290 
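	/*
	 * Derive the hash key H = E(K, 0^128) and the pre-counter block Y0.
	 * A 96-bit IV forms Y0 = IV || 0^31 || 1 directly; any other length
	 * is absorbed through GHASH, including a final IV-length block.
	 */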
291 	if (ibytes == 96/8) {
292 		Y = _mm_loadu_si128((__m128i*)ivec);
293 		Y = _mm_insert_epi32(Y, 0x1000000, 3);
294 		/* Compute E(K, ZERO) (the hash key H) and E(K, Y0) together */
295 		tmp1 = _mm_xor_si128(X, KEY[0]);
296 		tmp2 = _mm_xor_si128(Y, KEY[0]);
297 		for (j=1; j < nr-1; j+=2) {
298 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
299 			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
300 
301 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
302 			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
303 		}
304 		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
305 		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
306 
307 		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
308 		T = _mm_aesenclast_si128(tmp2, KEY[nr]);
309 
310 		H = _mm_shuffle_epi8(H, BSWAP_MASK);
311 	} else {
312 		tmp1 = _mm_xor_si128(X, KEY[0]);
313 		for (j=1; j <nr; j++)
314 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
315 		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
316 
317 		H = _mm_shuffle_epi8(H, BSWAP_MASK);
318 		Y = _mm_setzero_si128();
319 
320 		for (i=0; i < ibytes/16; i++) {
321 			tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
322 			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
323 			Y = _mm_xor_si128(Y, tmp1);
324 			gfmul(Y, H, &Y);
325 		}
326 		if (ibytes%16) {
327 			for (j=0; j < ibytes%16; j++)
328 				((unsigned char*)&last_block)[j] = ivec[i*16+j];
329 			tmp1 = last_block;
330 			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
331 			Y = _mm_xor_si128(Y, tmp1);
332 			gfmul(Y, H, &Y);
333 		}
334 		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
335 		tmp1 = _mm_insert_epi64(tmp1, 0, 1);
336 
337 		Y = _mm_xor_si128(Y, tmp1);
338 		gfmul(Y, H, &Y);
339 		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
340 		tmp1 = _mm_xor_si128(Y, KEY[0]);
341 		for (j=1; j < nr; j++)
342 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
343 		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
344 	}
345 
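	/* Precompute H^2, H^3 and H^4 for the four-block aggregated reduction. */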
346 	gfmul(H,H,&H2);
347 	gfmul(H,H2,&H3);
348 	gfmul(H,H3,&H4);
349 
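	/*
	 * Fold the additional authenticated data into the GHASH: four
	 * blocks at a time, then remaining full blocks, then a zero-padded
	 * partial block.
	 */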
350 	for (i=0; i<abytes/16/4; i++) {
351 		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
352 		tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
353 		tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
354 		tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);
355 
356 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
357 		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
358 		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
359 		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
360 		tmp1 = _mm_xor_si128(X, tmp1);
361 
362 		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
363 	}
364 	for (i=i*4; i<abytes/16; i++) {
365 		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
366 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
367 		X = _mm_xor_si128(X,tmp1);
368 		gfmul(X, H, &X);
369 	}
370 	if (abytes%16) {
371 		last_block = _mm_setzero_si128();
372 		for (j=0; j<abytes%16; j++)
373 			((unsigned char*)&last_block)[j] = addt[i*16+j];
374 		tmp1 = last_block;
375 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
376 		X =_mm_xor_si128(X,tmp1);
377 		gfmul(X,H,&X);
378 	}
379 
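	/*
	 * Derive eight consecutive counter blocks from Y0 + 1; the bulk
	 * loop below advances each of them by eight per iteration.
	 */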
380 	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
381 	ctr1 = _mm_add_epi64(ctr1, ONE);
382 	ctr2 = _mm_add_epi64(ctr1, ONE);
383 	ctr3 = _mm_add_epi64(ctr2, ONE);
384 	ctr4 = _mm_add_epi64(ctr3, ONE);
385 	ctr5 = _mm_add_epi64(ctr4, ONE);
386 	ctr6 = _mm_add_epi64(ctr5, ONE);
387 	ctr7 = _mm_add_epi64(ctr6, ONE);
388 	ctr8 = _mm_add_epi64(ctr7, ONE);
389 
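	/*
	 * Bulk path: encrypt eight counter blocks per iteration, XOR with
	 * the plaintext, store the ciphertext, and fold it into the GHASH
	 * with two four-block aggregated reductions.
	 */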
390 	for (i=0; i<nbytes/16/8; i++) {
391 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
392 		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
393 		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
394 		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
395 		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
396 		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
397 		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
398 		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
399 
400 		ctr1 = _mm_add_epi64(ctr1, EIGHT);
401 		ctr2 = _mm_add_epi64(ctr2, EIGHT);
402 		ctr3 = _mm_add_epi64(ctr3, EIGHT);
403 		ctr4 = _mm_add_epi64(ctr4, EIGHT);
404 		ctr5 = _mm_add_epi64(ctr5, EIGHT);
405 		ctr6 = _mm_add_epi64(ctr6, EIGHT);
406 		ctr7 = _mm_add_epi64(ctr7, EIGHT);
407 		ctr8 = _mm_add_epi64(ctr8, EIGHT);
408 
409 		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
410 		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
411 		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
412 		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
413 		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
414 		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
415 		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
416 		tmp8 =_mm_xor_si128(tmp8, KEY[0]);
417 
418 		for (j=1; j<nr; j++) {
419 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
420 			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
421 			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
422 			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
423 			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
424 			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
425 			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
426 			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
427 		}
428 		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
429 		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
430 		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
431 		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
432 		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
433 		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
434 		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
435 		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
436 
437 		tmp1 = _mm_xor_si128(tmp1,
438 		    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
439 		tmp2 = _mm_xor_si128(tmp2,
440 		    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
441 		tmp3 = _mm_xor_si128(tmp3,
442 		    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
443 		tmp4 = _mm_xor_si128(tmp4,
444 		    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
445 		tmp5 = _mm_xor_si128(tmp5,
446 		    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
447 		tmp6 = _mm_xor_si128(tmp6,
448 		    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
449 		tmp7 = _mm_xor_si128(tmp7,
450 		    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
451 		tmp8 = _mm_xor_si128(tmp8,
452 		    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));
453 
454 		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
455 		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
456 		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
457 		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
458 		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
459 		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
460 		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
461 		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
462 
463 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
464 		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
465 		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
466 		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
467 		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
468 		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
469 		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
470 		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
471 
472 		tmp1 = _mm_xor_si128(X, tmp1);
473 
474 		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
475 
476 		tmp5 = _mm_xor_si128(X, tmp5);
477 		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
478 	}
479 	for (k=i*8; k<nbytes/16; k++) {
480 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
481 		ctr1 = _mm_add_epi64(ctr1, ONE);
482 		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
483 		for (j=1; j<nr-1; j+=2) {
484 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
485 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
486 		}
487 		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
488 		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
489 		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
490 		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
491 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
492 		X = _mm_xor_si128(X, tmp1);
493 		gfmul(X,H,&X);
494 	}
495 	/* Handle the final partial block, if any. */
496 	if (nbytes%16) {
497 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
498 		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
499 		for (j=1; j<nr-1; j+=2) {
500 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
501 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
502 		}
503 		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
504 		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
505 		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
506 		last_block = tmp1;
507 		for (j=0; j<nbytes%16; j++)
508 			out[k*16+j] = ((unsigned char*)&last_block)[j];
509 		for ((void)j; j<16; j++)
510 			((unsigned char*)&last_block)[j] = 0;
511 		tmp1 = last_block;
512 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
513 		X = _mm_xor_si128(X, tmp1);
514 		gfmul(X, H, &X);
515 	}
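	/*
	 * Append the GHASH lengths block (len(A) || len(C), in bits), finish
	 * the hash, and XOR with E(K, Y0) to produce the tag.
	 */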
516 	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
517 	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
518 
519 	X = _mm_xor_si128(X, tmp1);
520 	gfmul(X,H,&X);
521 	X = _mm_shuffle_epi8(X, BSWAP_MASK);
522 	T = _mm_xor_si128(X, T);
523 	_mm_storeu_si128((__m128i*)tag, T);
524 }
525 
526 /* AES_GCM_decrypt: a modification of AES_GCM_encrypt that verifies the tag before decrypting. */
527 int
528 AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
529 	const unsigned char *addt, const unsigned char *ivec,
530 	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
531 	const unsigned char *key, int nr)
532 {
533 	int i, j, k;
534 	__m128i tmp1, tmp2, tmp3, tmp4;
535 	__m128i tmp5, tmp6, tmp7, tmp8;
536 	__m128i H, H2, H3, H4, Y, T;
537 	__m128i *KEY = (__m128i*)key;
538 	__m128i ctr1, ctr2, ctr3, ctr4;
539 	__m128i ctr5, ctr6, ctr7, ctr8;
540 	__m128i last_block = _mm_setzero_si128();
541 	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
542 	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
543 	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
544 	    7);
545 	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
546 	    15);
547 	__m128i X = _mm_setzero_si128();
548 
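	/* Key/IV setup, AAD absorption and counter handling mirror AES_GCM_encrypt above. */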
549 	if (ibytes == 96/8) {
550 		Y = _mm_loadu_si128((__m128i*)ivec);
551 		Y = _mm_insert_epi32(Y, 0x1000000, 3);
552 		/* Compute E(K, ZERO) (the hash key H) and E(K, Y0) together */
553 		tmp1 = _mm_xor_si128(X, KEY[0]);
554 		tmp2 = _mm_xor_si128(Y, KEY[0]);
555 		for (j=1; j < nr-1; j+=2) {
556 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
557 			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
558 
559 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
560 			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
561 		}
562 		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
563 		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
564 
565 		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
566 		T = _mm_aesenclast_si128(tmp2, KEY[nr]);
567 
568 		H = _mm_shuffle_epi8(H, BSWAP_MASK);
569 	} else {
570 		tmp1 = _mm_xor_si128(X, KEY[0]);
571 		for (j=1; j <nr; j++)
572 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
573 		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
574 
575 		H = _mm_shuffle_epi8(H, BSWAP_MASK);
576 		Y = _mm_setzero_si128();
577 
578 		for (i=0; i < ibytes/16; i++) {
579 			tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
580 			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
581 			Y = _mm_xor_si128(Y, tmp1);
582 			gfmul(Y, H, &Y);
583 		}
584 		if (ibytes%16) {
585 			for (j=0; j < ibytes%16; j++)
586 				((unsigned char*)&last_block)[j] = ivec[i*16+j];
587 			tmp1 = last_block;
588 			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
589 			Y = _mm_xor_si128(Y, tmp1);
590 			gfmul(Y, H, &Y);
591 		}
592 		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
593 		tmp1 = _mm_insert_epi64(tmp1, 0, 1);
594 
595 		Y = _mm_xor_si128(Y, tmp1);
596 		gfmul(Y, H, &Y);
597 		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
598 		tmp1 = _mm_xor_si128(Y, KEY[0]);
599 		for (j=1; j < nr; j++)
600 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
601 		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
602 	}
603 
604 	gfmul(H,H,&H2);
605 	gfmul(H,H2,&H3);
606 	gfmul(H,H3,&H4);
607 
608 	for (i=0; i<abytes/16/4; i++) {
609 		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i*4]);
610 		tmp2 = _mm_loadu_si128(&((__m128i*)addt)[i*4+1]);
611 		tmp3 = _mm_loadu_si128(&((__m128i*)addt)[i*4+2]);
612 		tmp4 = _mm_loadu_si128(&((__m128i*)addt)[i*4+3]);
613 
614 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
615 		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
616 		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
617 		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
618 
619 		tmp1 = _mm_xor_si128(X, tmp1);
620 
621 		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
622 	}
623 	for (i=i*4; i<abytes/16; i++) {
624 		tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
625 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
626 		X = _mm_xor_si128(X,tmp1);
627 		gfmul(X, H, &X);
628 	}
629 	if (abytes%16) {
630 		last_block = _mm_setzero_si128();
631 		for (j=0; j<abytes%16; j++)
632 			((unsigned char*)&last_block)[j] = addt[i*16+j];
633 		tmp1 = last_block;
634 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
635 		X =_mm_xor_si128(X,tmp1);
636 		gfmul(X,H,&X);
637 	}
638 
639 	/* Authenticate the ciphertext (fold it into the GHASH) before decrypting. */
640 	for (i = 0; i<nbytes/16/4; i++) {
641 		tmp1 = _mm_loadu_si128(&((__m128i*)in)[i*4]);
642 		tmp2 = _mm_loadu_si128(&((__m128i*)in)[i*4+1]);
643 		tmp3 = _mm_loadu_si128(&((__m128i*)in)[i*4+2]);
644 		tmp4 = _mm_loadu_si128(&((__m128i*)in)[i*4+3]);
645 
646 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
647 		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
648 		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
649 		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
650 
651 		tmp1 = _mm_xor_si128(X, tmp1);
652 
653 		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
654 	}
655 	for (i = i*4; i<nbytes/16; i++) {
656 		tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
657 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
658 		X = _mm_xor_si128(X, tmp1);
659 		gfmul(X,H,&X);
660 	}
661 	if (nbytes%16) {
662 		last_block = _mm_setzero_si128();
663 		for (j=0; j<nbytes%16; j++)
664 			((unsigned char*)&last_block)[j] = in[i*16+j];
665 		tmp1 = last_block;
666 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
667 		X = _mm_xor_si128(X, tmp1);
668 		gfmul(X, H, &X);
669 	}
670 
671 	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
672 	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);
673 
674 	X = _mm_xor_si128(X, tmp1);
675 	gfmul(X,H,&X);
676 	X = _mm_shuffle_epi8(X, BSWAP_MASK);
677 	T = _mm_xor_si128(X, T);
678 
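	/*
	 * Verify the tag before producing any plaintext; decryption below
	 * runs only when the computed tag matches the one supplied.
	 */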
679 	if (!m128icmp(T, _mm_loadu_si128((__m128i*)tag)))
680 		return 0; /* authentication failed; no plaintext was written */
681 
682 	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
683 	ctr1 = _mm_add_epi64(ctr1, ONE);
684 	ctr2 = _mm_add_epi64(ctr1, ONE);
685 	ctr3 = _mm_add_epi64(ctr2, ONE);
686 	ctr4 = _mm_add_epi64(ctr3, ONE);
687 	ctr5 = _mm_add_epi64(ctr4, ONE);
688 	ctr6 = _mm_add_epi64(ctr5, ONE);
689 	ctr7 = _mm_add_epi64(ctr6, ONE);
690 	ctr8 = _mm_add_epi64(ctr7, ONE);
691 
692 	for (i=0; i<nbytes/16/8; i++) {
693 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
694 		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
695 		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
696 		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
697 		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
698 		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
699 		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
700 		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);
701 
702 		ctr1 = _mm_add_epi64(ctr1, EIGHT);
703 		ctr2 = _mm_add_epi64(ctr2, EIGHT);
704 		ctr3 = _mm_add_epi64(ctr3, EIGHT);
705 		ctr4 = _mm_add_epi64(ctr4, EIGHT);
706 		ctr5 = _mm_add_epi64(ctr5, EIGHT);
707 		ctr6 = _mm_add_epi64(ctr6, EIGHT);
708 		ctr7 = _mm_add_epi64(ctr7, EIGHT);
709 		ctr8 = _mm_add_epi64(ctr8, EIGHT);
710 
711 		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
712 		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
713 		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
714 		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
715 		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
716 		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
717 		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
718 		tmp8 =_mm_xor_si128(tmp8, KEY[0]);
719 
720 		for (j=1; j<nr; j++) {
721 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
722 			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
723 			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
724 			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
725 			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
726 			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
727 			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
728 			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
729 		}
730 		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
731 		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
732 		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
733 		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
734 		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
735 		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
736 		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
737 		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);
738 
739 		tmp1 = _mm_xor_si128(tmp1,
740 		    _mm_loadu_si128(&((__m128i*)in)[i*8+0]));
741 		tmp2 = _mm_xor_si128(tmp2,
742 		    _mm_loadu_si128(&((__m128i*)in)[i*8+1]));
743 		tmp3 = _mm_xor_si128(tmp3,
744 		    _mm_loadu_si128(&((__m128i*)in)[i*8+2]));
745 		tmp4 = _mm_xor_si128(tmp4,
746 		    _mm_loadu_si128(&((__m128i*)in)[i*8+3]));
747 		tmp5 = _mm_xor_si128(tmp5,
748 		    _mm_loadu_si128(&((__m128i*)in)[i*8+4]));
749 		tmp6 = _mm_xor_si128(tmp6,
750 		    _mm_loadu_si128(&((__m128i*)in)[i*8+5]));
751 		tmp7 = _mm_xor_si128(tmp7,
752 		    _mm_loadu_si128(&((__m128i*)in)[i*8+6]));
753 		tmp8 = _mm_xor_si128(tmp8,
754 		    _mm_loadu_si128(&((__m128i*)in)[i*8+7]));
755 
756 		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
757 		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
758 		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
759 		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
760 		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
761 		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
762 		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
763 		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
764 
765 		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
766 		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
767 		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
768 		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
769 		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
770 		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
771 		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
772 		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
773 	}
774 	for (k=i*8; k<nbytes/16; k++) {
775 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
776 		ctr1 = _mm_add_epi64(ctr1, ONE);
777 		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
778 		for (j=1; j<nr-1; j+=2) {
779 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
780 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
781 		}
782 		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
783 		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
784 		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
785 		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
786 	}
787 	/* Handle the final partial block, if any. */
788 	if (nbytes%16) {
789 		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
790 		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
791 		for (j=1; j<nr-1; j+=2) {
792 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
793 			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
794 		}
795 		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
796 		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
797 		tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
798 		last_block = tmp1;
799 		for (j=0; j<nbytes%16; j++)
800 			out[k*16+j] = ((unsigned char*)&last_block)[j];
801 	}
802 	return 1; /* success */
803 }
804