xref: /freebsd/sys/crypto/aesni/aesni_ghash.c (revision 271171e0d97b88ba2a7c3bf750c9672b484c1c13)
1 /*-
2  * Copyright (c) 2014 The FreeBSD Foundation
3  * All rights reserved.
4  *
5  * This software was developed by John-Mark Gurney under
6  * the sponsorship of the FreeBSD Foundation and
7  * Rubicon Communications, LLC (Netgate).
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1.  Redistributions of source code must retain the above copyright
12  *     notice, this list of conditions and the following disclaimer.
13  * 2.  Redistributions in binary form must reproduce the above copyright
14  *     notice, this list of conditions and the following disclaimer in the
15  *     documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *
30  *	$FreeBSD$
31  *
32  */
33 
34 /*
35  * Figure 5, 8 and 12 are copied from the Intel white paper:
36  * Intel® Carry-Less Multiplication Instruction and its Usage for
37  * Computing the GCM Mode
38  *
39  * and as such are:
40  * Copyright © 2010 Intel Corporation.
41  * All rights reserved.
42  *
43  * Redistribution and use in source and binary forms, with or without
44  * modification, are permitted provided that the following conditions
45  * are met:
46  *   * Redistributions of source code must retain the above copyright
47  *     notice, this list of conditions and the following disclaimer.
48  *   * Redistributions in binary form must reproduce the above copyright
49  *     notice, this list of conditions and the following disclaimer in the
50  *     documentation and/or other materials provided with the distribution.
51  *   * Neither the name of Intel Corporation nor the
52  *     names of its contributors may be used to endorse or promote products
53  *     derived from this software without specific prior written permission.
54  *
55  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
56  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
57  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
58  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
59  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
60  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
61  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
62  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
63  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
64  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
65  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
66  */
67 
#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#include <string.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>
78 
/*
 * Compare two 128-bit values for equality.
 *
 * Returns 1 when all four 32-bit lanes of a and b match and 0 otherwise.
 * The comparison is done lane-wise and the per-byte result is collapsed
 * into a 16-bit mask, so there is no data-dependent early exit.
 */
static inline int
m128icmp(__m128i a, __m128i b)
{
	/* One mask bit per byte; all sixteen are set only on a full match. */
	const int all_bytes_equal = 0xffff;
	int byte_mask;

	byte_mask = _mm_movemask_epi8(_mm_cmpeq_epi32(a, b));

	return (byte_mask == all_bytes_equal);
}
88 
#ifdef __i386__
/*
 * i386-only replacement for the 64-bit lane insert (presumably because
 * the compiler does not provide that intrinsic for 32-bit targets).
 *
 * Writes the 64-bit value b into 64-bit lane 0 of a when ndx is zero and
 * into lane 1 for any non-zero ndx, using two 32-bit lane inserts (low
 * half first, then high half).  Returns the updated vector.
 */
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (ndx) {
		/* Upper 64-bit lane == 32-bit lanes 2 and 3. */
		a = _mm_insert_epi32(a, b, 2);
		return _mm_insert_epi32(a, b >> 32, 3);
	}

	/* Lower 64-bit lane == 32-bit lanes 0 and 1. */
	a = _mm_insert_epi32(a, b, 0);
	return _mm_insert_epi32(a, b >> 32, 1);
}
#endif
105 
106 /* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */
107 
108 /* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
/*
 * Multiply a by b with carry-less multiplication, shift the 256-bit
 * product left by one bit and fold it back to 128 bits; the result is
 * stored in *res.  This is the single-block GHASH multiply from the
 * Intel white paper (Figure 5); callers pass byte-reflected operands.
 */
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i lo, hi, mid;
	__m128i carry_lo, carry_hi, carry_cross;
	__m128i fold, spill, red;

	/* Four 64x64 partial products: low, high and the two cross terms. */
	lo = _mm_clmulepi64_si128(a, b, 0x00);
	hi = _mm_clmulepi64_si128(a, b, 0x11);
	mid = _mm_xor_si128(_mm_clmulepi64_si128(a, b, 0x10),
	    _mm_clmulepi64_si128(a, b, 0x01));

	/* Split the cross term across the low and high 128-bit halves. */
	lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
	hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

	/*
	 * Shift the 256-bit value hi:lo left by one bit.  The per-lane
	 * shifts drop each lane's top bit, so those bits are collected
	 * first and re-inserted one 32-bit lane higher.
	 */
	carry_lo = _mm_srli_epi32(lo, 31);
	carry_hi = _mm_srli_epi32(hi, 31);
	lo = _mm_slli_epi32(lo, 1);
	hi = _mm_slli_epi32(hi, 1);

	/* Top bit of lo crosses over into the bottom of hi. */
	carry_cross = _mm_srli_si128(carry_lo, 12);
	lo = _mm_or_si128(lo, _mm_slli_si128(carry_lo, 4));
	hi = _mm_or_si128(hi, _mm_slli_si128(carry_hi, 4));
	hi = _mm_or_si128(hi, carry_cross);

	/* First reduction phase: left shifts by 31, 30 and 25. */
	fold = _mm_xor_si128(_mm_slli_epi32(lo, 31), _mm_slli_epi32(lo, 30));
	fold = _mm_xor_si128(fold, _mm_slli_epi32(lo, 25));
	spill = _mm_srli_si128(fold, 4);
	lo = _mm_xor_si128(lo, _mm_slli_si128(fold, 12));

	/* Second reduction phase: right shifts by 1, 2 and 7. */
	red = _mm_xor_si128(_mm_srli_epi32(lo, 1), _mm_srli_epi32(lo, 2));
	red = _mm_xor_si128(red, _mm_srli_epi32(lo, 7));
	red = _mm_xor_si128(red, spill);
	lo = _mm_xor_si128(lo, red);

	*res = _mm_xor_si128(hi, lo);
}
158 
159 /*
160  * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
161  * Method */
/*
 * Aggregated four-block multiply: computes the carry-less products
 * H1*X1, H2*X2, H3*X3 and H4*X4, XORs them together and performs a
 * single shift-and-reduce on the 256-bit sum.  The 128-bit result is
 * stored in *res.  The final shift/reduction steps are the same ones
 * gfmul() applies to a single product.
 */
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i lo, hi, mid;
	__m128i hsum, xsum;
	__m128i carry_lo, carry_hi, carry_cross;
	__m128i fold, spill, red;

	/* Sum of the low 64x64 products. */
	lo = _mm_xor_si128(_mm_clmulepi64_si128(H1, X1, 0x00),
	    _mm_clmulepi64_si128(H2, X2, 0x00));
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H3, X3, 0x00));
	lo = _mm_xor_si128(lo, _mm_clmulepi64_si128(H4, X4, 0x00));

	/* Sum of the high 64x64 products. */
	hi = _mm_xor_si128(_mm_clmulepi64_si128(H1, X1, 0x11),
	    _mm_clmulepi64_si128(H2, X2, 0x11));
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H3, X3, 0x11));
	hi = _mm_xor_si128(hi, _mm_clmulepi64_si128(H4, X4, 0x11));

	/*
	 * Middle term, Karatsuba style: for each pair multiply
	 * (high ^ low) of H by (high ^ low) of X, then cancel the low and
	 * high products.  Shuffle constant 78 swaps the two 64-bit halves,
	 * so the XOR leaves high ^ low in the low half.
	 */
	mid = _mm_xor_si128(lo, hi);

	hsum = _mm_xor_si128(_mm_shuffle_epi32(H1, 78), H1);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X1, 78), X1);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(hsum, xsum, 0x00));

	hsum = _mm_xor_si128(_mm_shuffle_epi32(H2, 78), H2);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X2, 78), X2);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(hsum, xsum, 0x00));

	hsum = _mm_xor_si128(_mm_shuffle_epi32(H3, 78), H3);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X3, 78), X3);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(hsum, xsum, 0x00));

	hsum = _mm_xor_si128(_mm_shuffle_epi32(H4, 78), H4);
	xsum = _mm_xor_si128(_mm_shuffle_epi32(X4, 78), X4);
	mid = _mm_xor_si128(mid, _mm_clmulepi64_si128(hsum, xsum, 0x00));

	/* Split the middle term across the low and high 128-bit halves. */
	lo = _mm_xor_si128(lo, _mm_slli_si128(mid, 8));
	hi = _mm_xor_si128(hi, _mm_srli_si128(mid, 8));

	/*
	 * Shift the 256-bit value hi:lo left by one bit, carrying each
	 * 32-bit lane's top bit into the next lane up.
	 */
	carry_lo = _mm_srli_epi32(lo, 31);
	carry_hi = _mm_srli_epi32(hi, 31);
	lo = _mm_slli_epi32(lo, 1);
	hi = _mm_slli_epi32(hi, 1);

	/* Top bit of lo crosses over into the bottom of hi. */
	carry_cross = _mm_srli_si128(carry_lo, 12);
	lo = _mm_or_si128(lo, _mm_slli_si128(carry_lo, 4));
	hi = _mm_or_si128(hi, _mm_slli_si128(carry_hi, 4));
	hi = _mm_or_si128(hi, carry_cross);

	/* First reduction phase: left shifts by 31, 30 and 25. */
	fold = _mm_xor_si128(_mm_slli_epi32(lo, 31), _mm_slli_epi32(lo, 30));
	fold = _mm_xor_si128(fold, _mm_slli_epi32(lo, 25));
	spill = _mm_srli_si128(fold, 4);
	lo = _mm_xor_si128(lo, _mm_slli_si128(fold, 12));

	/* Second reduction phase: right shifts by 1, 2 and 7. */
	red = _mm_xor_si128(_mm_srli_epi32(lo, 1), _mm_srli_epi32(lo, 2));
	red = _mm_xor_si128(red, _mm_srli_epi32(lo, 7));
	red = _mm_xor_si128(red, spill);
	lo = _mm_xor_si128(lo, red);

	*res = _mm_xor_si128(hi, lo);
}
261 
262 /*
263  * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
264  * Every Four Blocks
265  */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^36-32 bytes.  The nbytes argument is a uint32_t, so a single call
 * always stays below that limit.
 */
/*
 * AES-GCM encrypt.
 *
 *   in, out  plaintext input and ciphertext output, nbytes each
 *   addt     additional authenticated data, abytes long
 *   ivec     IV, ibytes long; a 12 byte IV takes the fast path, any
 *            other length is hashed with GHASH to derive the first
 *            counter block
 *   tag      receives the 16 byte authentication tag
 *   key      expanded AES encryption key schedule, nr + 1 round keys;
 *            it is dereferenced as __m128i, so it presumably has to be
 *            16 byte aligned
 *   nr       number of AES rounds; the two-rounds-per-iteration loops
 *            below assume it is even
 *
 * Buffers other than key are accessed with unaligned loads/stores or
 * byte copies and are never read or written beyond their stated length.
 */
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j ,k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	/*
	 * Counter increments.  They are applied to 32-bit lane 2, which
	 * holds the block counter after the BSWAP_EPI64 shuffle.
	 */
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	/* Byte-reverse each 64-bit half. */
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	/* Byte-reverse the whole 128-bit block. */
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	/* Lengths of the trailing partial blocks of the AAD and the text. */
	const uint32_t atail = abytes % 16;
	const uint32_t ntail = nbytes % 16;

	if (ibytes == 96/8) {
		/*
		 * Y0 = IV || 0^31 || 1.  Copy exactly 12 bytes so that
		 * nothing beyond the end of the IV is read.
		 */
		Y = _mm_setzero_si128();
		memcpy(&Y, ivec, 96/8);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0) */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* Y0 = GHASH(IV padded to a block || 0^64 || len(IV)) */
		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated reduction. */
	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	/* Hash the additional data: four blocks at a time, ... */
	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	/* ... then the remaining whole blocks one by one, ... */
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	/* ... then the zero padded partial block, if any. */
	if (atail) {
		last_block = _mm_setzero_si128();
		/*
		 * The offset is computed in uint32_t; an int "i*16" would
		 * overflow once abytes reaches 2 GiB.
		 */
		memcpy(&last_block, addt + (abytes - atail), atail);
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/*
	 * Set up eight consecutive counter blocks, starting at Y0 + 1.
	 * Only the low 32 bits of the counter block are incremented
	 * (inc32 in SP 800-38D), hence the 32-bit lane add: a 64-bit add
	 * would carry into the upper 96 bits when the counter wraps, which
	 * can happen when Y0 was derived from a non-96-bit IV.
	 */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi32(ctr1, ONE);
	ctr2 = _mm_add_epi32(ctr1, ONE);
	ctr3 = _mm_add_epi32(ctr2, ONE);
	ctr4 = _mm_add_epi32(ctr3, ONE);
	ctr5 = _mm_add_epi32(ctr4, ONE);
	ctr6 = _mm_add_epi32(ctr5, ONE);
	ctr7 = _mm_add_epi32(ctr6, ONE);
	ctr8 = _mm_add_epi32(ctr7, ONE);

	/* Encrypt and hash eight blocks per iteration. */
	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi32(ctr1, EIGHT);
		ctr2 = _mm_add_epi32(ctr2, EIGHT);
		ctr3 = _mm_add_epi32(ctr3, EIGHT);
		ctr4 = _mm_add_epi32(ctr4, EIGHT);
		ctr5 = _mm_add_epi32(ctr5, EIGHT);
		ctr6 = _mm_add_epi32(ctr6, EIGHT);
		ctr7 = _mm_add_epi32(ctr7, EIGHT);
		ctr8 = _mm_add_epi32(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		/* Fold the eight ciphertext blocks into the hash. */
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	/* Remaining whole blocks, one at a time. */
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi32(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	/* If one incomplete block remains */
	if (ntail) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		/*
		 * Copy only the bytes that exist; byte offsets are computed
		 * in uint32_t because an int "k*16" overflows once nbytes
		 * reaches 2 GiB.
		 */
		last_block = _mm_setzero_si128();
		memcpy(&last_block, in + (nbytes - ntail), ntail);
		last_block = _mm_xor_si128(last_block, tmp1);
		memcpy(out + (nbytes - ntail), &last_block, ntail);
		/* Clear the key stream bytes beyond the data before hashing. */
		for (j = (int)ntail; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
	/* Length block: len(C) in the low half, len(A) in the high half. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	/* tag = GHASH ^ E(K, Y0) */
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}
529 
530 /* My modification of _encrypt to be _decrypt */
/*
 * AES-GCM decrypt.
 *
 *   in, out  ciphertext input and plaintext output, nbytes each
 *   addt     additional authenticated data, abytes long
 *   ivec     IV, ibytes long; a 12 byte IV takes the fast path, any
 *            other length is hashed with GHASH to derive the first
 *            counter block
 *   tag      the 16 byte authentication tag to verify
 *   key      expanded AES encryption key schedule, nr + 1 round keys;
 *            it is dereferenced as __m128i, so it presumably has to be
 *            16 byte aligned
 *   nr       number of AES rounds; the two-rounds-per-iteration loops
 *            below assume it is even
 *
 * The tag is verified over the additional data and the ciphertext before
 * anything is decrypted.  Returns 1 on success.  Returns 0 if the tag
 * does not match, in which case out is left untouched.
 */
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j ,k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	/*
	 * Counter increments.  They are applied to 32-bit lane 2, which
	 * holds the block counter after the BSWAP_EPI64 shuffle.
	 */
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	/* Byte-reverse each 64-bit half. */
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	/* Byte-reverse the whole 128-bit block. */
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();
	/* Lengths of the trailing partial blocks of the AAD and the text. */
	const uint32_t atail = abytes % 16;
	const uint32_t ntail = nbytes % 16;

	if (ibytes == 96/8) {
		/*
		 * Y0 = IV || 0^31 || 1.  Copy exactly 12 bytes so that
		 * nothing beyond the end of the IV is read.
		 */
		Y = _mm_setzero_si128();
		memcpy(&Y, ivec, 96/8);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		/* H = E(K, 0) */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		/* Y0 = GHASH(IV padded to a block || 0^64 || len(IV)) */
		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	/* Powers of H for the four-block aggregated reduction. */
	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	/* Hash the additional data: four blocks at a time, ... */
	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	/* ... then the remaining whole blocks one by one, ... */
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	/* ... then the zero padded partial block, if any. */
	if (atail) {
		last_block = _mm_setzero_si128();
		/*
		 * The offset is computed in uint32_t; an int "i*16" would
		 * overflow once abytes reaches 2 GiB.
		 */
		memcpy(&last_block, addt + (abytes - atail), atail);
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (ntail) {
		last_block = _mm_setzero_si128();
		/* uint32_t offset; an int "i*16" overflows at 2 GiB. */
		memcpy(&last_block, in + (nbytes - ntail), ntail);
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

	/* Length block: len(C) in the low half, len(A) in the high half. */
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	/* expected tag = GHASH ^ E(K, Y0) */
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0; /* authentication failed; nothing was written */

	/*
	 * Set up eight consecutive counter blocks, starting at Y0 + 1.
	 * Only the low 32 bits of the counter block are incremented
	 * (inc32 in SP 800-38D), hence the 32-bit lane add: a 64-bit add
	 * would carry into the upper 96 bits when the counter wraps, which
	 * can happen when Y0 was derived from a non-96-bit IV.
	 */
	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi32(ctr1, ONE);
	ctr2 = _mm_add_epi32(ctr1, ONE);
	ctr3 = _mm_add_epi32(ctr2, ONE);
	ctr4 = _mm_add_epi32(ctr3, ONE);
	ctr5 = _mm_add_epi32(ctr4, ONE);
	ctr6 = _mm_add_epi32(ctr5, ONE);
	ctr7 = _mm_add_epi32(ctr6, ONE);
	ctr8 = _mm_add_epi32(ctr7, ONE);

	/* Decrypt eight blocks per iteration. */
	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi32(ctr1, EIGHT);
		ctr2 = _mm_add_epi32(ctr2, EIGHT);
		ctr3 = _mm_add_epi32(ctr3, EIGHT);
		ctr4 = _mm_add_epi32(ctr4, EIGHT);
		ctr5 = _mm_add_epi32(ctr5, EIGHT);
		ctr6 = _mm_add_epi32(ctr6, EIGHT);
		ctr7 = _mm_add_epi32(ctr7, EIGHT);
		ctr8 = _mm_add_epi32(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		/*
		 * The hash was already computed over the ciphertext above,
		 * so the plaintext is simply stored.
		 */
		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);
	}
	/* Remaining whole blocks, one at a time. */
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi32(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	/* If one incomplete block remains */
	if (ntail) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		/*
		 * Copy only the bytes that exist; byte offsets are computed
		 * in uint32_t because an int "k*16" overflows once nbytes
		 * reaches 2 GiB.
		 */
		last_block = _mm_setzero_si128();
		memcpy(&last_block, in + (nbytes - ntail), ntail);
		last_block = _mm_xor_si128(tmp1, last_block);
		memcpy(out + (nbytes - ntail), &last_block, ntail);
	}
	return 1; /* returns 1 when successful */
}
811