/*-
 * Copyright (c) 2014 The FreeBSD Foundation
 * All rights reserved.
 *
 * This software was developed by John-Mark Gurney under
 * the sponsorship of the FreeBSD Foundation and
 * Rubicon Communications, LLC (Netgate).
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1.  Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 * 2.  Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *
 */

/*
 * Figures 5, 8 and 12 are copied from the Intel white paper:
 * Intel® Carry-Less Multiplication Instruction and its Usage for
 * Computing the GCM Mode
 *
 * and as such are:
 * Copyright © 2010 Intel Corporation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   * Redistributions of source code must retain the above copyright
 *     notice, this list of conditions and the following disclaimer.
 *   * Redistributions in binary form must reproduce the above copyright
 *     notice, this list of conditions and the following disclaimer in the
 *     documentation and/or other materials provided with the distribution.
 *   * Neither the name of Intel Corporation nor the
 *     names of its contributors may be used to endorse or promote products
 *     derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef _KERNEL
#include <crypto/aesni/aesni.h>
#include <crypto/aesni/aesni_os.h>
#else
#include <stdint.h>
#endif

#include <wmmintrin.h>
#include <emmintrin.h>
#include <smmintrin.h>

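/*
 * Compare two 128-bit values for equality; returns non-zero when all
 * bytes match.  Used below to check the computed GCM tag against the
 * caller-supplied tag.
 */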
static inline int
m128icmp(__m128i a, __m128i b)
{
	__m128i cmp;

	cmp = _mm_cmpeq_epi32(a, b);

	return _mm_movemask_epi8(cmp) == 0xffff;
}

#ifdef __i386__
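/*
 * The 64-bit insert intrinsic is only available when targeting 64-bit
 * code, so provide an i386 replacement built from two 32-bit inserts.
 */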
static inline __m128i
_mm_insert_epi64(__m128i a, int64_t b, const int ndx)
{

	if (!ndx) {
		a = _mm_insert_epi32(a, b, 0);
		a = _mm_insert_epi32(a, b >> 32, 1);
	} else {
		a = _mm_insert_epi32(a, b, 2);
		a = _mm_insert_epi32(a, b >> 32, 3);
	}

	return a;
}
#endif

/* some code from carry-less-multiplication-instruction-in-gcm-mode-paper.pdf */

/* Figure 5. Code Sample - Performing Ghash Using Algorithms 1 and 5 (C) */
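/*
 * Carry-less multiply a and b and reduce the 256-bit product modulo the
 * GHASH polynomial x^128 + x^7 + x^2 + x + 1, storing the result in *res.
 * The shift-by-one steps account for GCM's bit-reflected representation;
 * callers pass operands that have already been shuffled with BSWAP_MASK.
 */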
static void
gfmul(__m128i a, __m128i b, __m128i *res)
{
	__m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

	tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
	tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
	tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
	tmp6 = _mm_clmulepi64_si128(a, b, 0x11);

	tmp4 = _mm_xor_si128(tmp4, tmp5);
	tmp5 = _mm_slli_si128(tmp4, 8);
	tmp4 = _mm_srli_si128(tmp4, 8);
	tmp3 = _mm_xor_si128(tmp3, tmp5);
	tmp6 = _mm_xor_si128(tmp6, tmp4);

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 8. Code Sample - Performing Ghash Using an Aggregated Reduction
 * Method */
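/*
 * Aggregated reduction: computes (H1 x X1) xor (H2 x X2) xor (H3 x X3) xor
 * (H4 x X4) in GF(2^128), performing all four carry-less multiplications
 * (with Karatsuba-style middle terms) before a single reduction step.
 */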
static void
reduce4(__m128i H1, __m128i H2, __m128i H3, __m128i H4,
    __m128i X1, __m128i X2, __m128i X3, __m128i X4, __m128i *res)
{
	/* algorithm by Krzysztof Jankowski, Pierre Laurent - Intel */
	__m128i H1_X1_lo, H1_X1_hi, H2_X2_lo, H2_X2_hi, H3_X3_lo,
	    H3_X3_hi, H4_X4_lo, H4_X4_hi, lo, hi;
	__m128i tmp0, tmp1, tmp2, tmp3;
	__m128i tmp4, tmp5, tmp6, tmp7;
	__m128i tmp8, tmp9;

	H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
	H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
	H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
	H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

	lo = _mm_xor_si128(H1_X1_lo, H2_X2_lo);
	lo = _mm_xor_si128(lo, H3_X3_lo);
	lo = _mm_xor_si128(lo, H4_X4_lo);

	H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
	H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
	H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
	H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

	hi = _mm_xor_si128(H1_X1_hi, H2_X2_hi);
	hi = _mm_xor_si128(hi, H3_X3_hi);
	hi = _mm_xor_si128(hi, H4_X4_hi);

	tmp0 = _mm_shuffle_epi32(H1, 78);
	tmp4 = _mm_shuffle_epi32(X1, 78);
	tmp0 = _mm_xor_si128(tmp0, H1);
	tmp4 = _mm_xor_si128(tmp4, X1);
	tmp1 = _mm_shuffle_epi32(H2, 78);
	tmp5 = _mm_shuffle_epi32(X2, 78);
	tmp1 = _mm_xor_si128(tmp1, H2);
	tmp5 = _mm_xor_si128(tmp5, X2);
	tmp2 = _mm_shuffle_epi32(H3, 78);
	tmp6 = _mm_shuffle_epi32(X3, 78);
	tmp2 = _mm_xor_si128(tmp2, H3);
	tmp6 = _mm_xor_si128(tmp6, X3);
	tmp3 = _mm_shuffle_epi32(H4, 78);
	tmp7 = _mm_shuffle_epi32(X4, 78);
	tmp3 = _mm_xor_si128(tmp3, H4);
	tmp7 = _mm_xor_si128(tmp7, X4);

	tmp0 = _mm_clmulepi64_si128(tmp0, tmp4, 0x00);
	tmp1 = _mm_clmulepi64_si128(tmp1, tmp5, 0x00);
	tmp2 = _mm_clmulepi64_si128(tmp2, tmp6, 0x00);
	tmp3 = _mm_clmulepi64_si128(tmp3, tmp7, 0x00);

	tmp0 = _mm_xor_si128(tmp0, lo);
	tmp0 = _mm_xor_si128(tmp0, hi);
	tmp0 = _mm_xor_si128(tmp1, tmp0);
	tmp0 = _mm_xor_si128(tmp2, tmp0);
	tmp0 = _mm_xor_si128(tmp3, tmp0);

	tmp4 = _mm_slli_si128(tmp0, 8);
	tmp0 = _mm_srli_si128(tmp0, 8);

	lo = _mm_xor_si128(tmp4, lo);
	hi = _mm_xor_si128(tmp0, hi);

	tmp3 = lo;
	tmp6 = hi;

	tmp7 = _mm_srli_epi32(tmp3, 31);
	tmp8 = _mm_srli_epi32(tmp6, 31);
	tmp3 = _mm_slli_epi32(tmp3, 1);
	tmp6 = _mm_slli_epi32(tmp6, 1);

	tmp9 = _mm_srli_si128(tmp7, 12);
	tmp8 = _mm_slli_si128(tmp8, 4);
	tmp7 = _mm_slli_si128(tmp7, 4);
	tmp3 = _mm_or_si128(tmp3, tmp7);
	tmp6 = _mm_or_si128(tmp6, tmp8);
	tmp6 = _mm_or_si128(tmp6, tmp9);

	tmp7 = _mm_slli_epi32(tmp3, 31);
	tmp8 = _mm_slli_epi32(tmp3, 30);
	tmp9 = _mm_slli_epi32(tmp3, 25);

	tmp7 = _mm_xor_si128(tmp7, tmp8);
	tmp7 = _mm_xor_si128(tmp7, tmp9);
	tmp8 = _mm_srli_si128(tmp7, 4);
	tmp7 = _mm_slli_si128(tmp7, 12);
	tmp3 = _mm_xor_si128(tmp3, tmp7);

	tmp2 = _mm_srli_epi32(tmp3, 1);
	tmp4 = _mm_srli_epi32(tmp3, 2);
	tmp5 = _mm_srli_epi32(tmp3, 7);
	tmp2 = _mm_xor_si128(tmp2, tmp4);
	tmp2 = _mm_xor_si128(tmp2, tmp5);
	tmp2 = _mm_xor_si128(tmp2, tmp8);
	tmp3 = _mm_xor_si128(tmp3, tmp2);
	tmp6 = _mm_xor_si128(tmp6, tmp3);

	*res = tmp6;
}

/*
 * Figure 12. AES-GCM: Processing Four Blocks in Parallel with Aggregated
 * Every Four Blocks
 */
/*
 * per NIST SP-800-38D, 5.2.1.1, len(p) <= 2^39-256 (in bits), or
 * 2^36-32 bytes (2^32-2 16-byte blocks).
 */
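/*
 * in/out: nbytes of plaintext/ciphertext; addt: abytes of additional
 * authenticated data; ivec: ibytes of IV; tag: receives the 16-byte
 * authentication tag; key: the expanded AES key schedule (nr + 1 round
 * keys); nr: number of AES rounds (10/12/14 for AES-128/192/256).
 */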
void
AES_GCM_encrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

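	/*
	 * Derive the hash key H = E(K, 0) and the first counter block.
	 * A 96-bit IV is used directly as Y0 = IV || 0^31 || 1; any other
	 * IV length is run through GHASH per the GCM specification.
	 */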
	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

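	/*
	 * Hash the additional authenticated data: four blocks at a time
	 * using the aggregated reduction, then single blocks, then any
	 * final partial block (zero-padded).
	 */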
	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

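	/*
	 * Main loop: encrypt eight counter blocks per iteration and fold
	 * the resulting ciphertext into the GHASH state in two groups of
	 * four.
	 */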
	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);

		tmp5 = _mm_xor_si128(X, tmp5);
		reduce4(H, H2, H3, H4, tmp8, tmp7, tmp6, tmp5, &X);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k],
		    nbytes % 16);
		last_block = _mm_xor_si128(last_block, tmp1);
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
		for ((void)j; j<16; j++)
			((unsigned char*)&last_block)[j] = 0;
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}
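	/*
	 * Append the bit lengths of the ciphertext and the AAD, finish
	 * GHASH and XOR with E(K, Y0) to form the tag.
	 */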
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);
	_mm_storeu_si128((__m128i*)tag, T);
}

/* My modification of _encrypt to be _decrypt */
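/*
 * Unlike the encrypt path, GHASH is computed over the ciphertext first and
 * the supplied tag is verified before any plaintext is written; returns 1
 * on success and 0 if authentication fails.
 */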
int
AES_GCM_decrypt(const unsigned char *in, unsigned char *out,
	const unsigned char *addt, const unsigned char *ivec,
	const unsigned char *tag, uint32_t nbytes, uint32_t abytes, int ibytes,
	const unsigned char *key, int nr)
{
	int i, j, k;
	__m128i tmp1, tmp2, tmp3, tmp4;
	__m128i tmp5, tmp6, tmp7, tmp8;
	__m128i H, H2, H3, H4, Y, T;
	const __m128i *KEY = (const __m128i *)key;
	__m128i ctr1, ctr2, ctr3, ctr4;
	__m128i ctr5, ctr6, ctr7, ctr8;
	__m128i last_block = _mm_setzero_si128();
	__m128i ONE = _mm_set_epi32(0, 1, 0, 0);
	__m128i EIGHT = _mm_set_epi32(0, 8, 0, 0);
	__m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,
	    7);
	__m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,
	    15);
	__m128i X = _mm_setzero_si128();

	if (ibytes == 96/8) {
		Y = _mm_loadu_si128((const __m128i *)ivec);
		Y = _mm_insert_epi32(Y, 0x1000000, 3);
		/* Compute E[ZERO, KS] and E[Y0, KS] together */
		tmp1 = _mm_xor_si128(X, KEY[0]);
		tmp2 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);

			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);

		H = _mm_aesenclast_si128(tmp1, KEY[nr]);
		T = _mm_aesenclast_si128(tmp2, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
	} else {
		tmp1 = _mm_xor_si128(X, KEY[0]);
		for (j=1; j <nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		H = _mm_aesenclast_si128(tmp1, KEY[nr]);

		H = _mm_shuffle_epi8(H, BSWAP_MASK);
		Y = _mm_setzero_si128();

		for (i=0; i < ibytes/16; i++) {
			tmp1 = _mm_loadu_si128(&((const __m128i *)ivec)[i]);
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		if (ibytes%16) {
			for (j=0; j < ibytes%16; j++)
				((unsigned char*)&last_block)[j] = ivec[i*16+j];
			tmp1 = last_block;
			tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
			Y = _mm_xor_si128(Y, tmp1);
			gfmul(Y, H, &Y);
		}
		tmp1 = _mm_insert_epi64(tmp1, (uint64_t)ibytes*8, 0);
		tmp1 = _mm_insert_epi64(tmp1, 0, 1);

		Y = _mm_xor_si128(Y, tmp1);
		gfmul(Y, H, &Y);
		Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/
		tmp1 = _mm_xor_si128(Y, KEY[0]);
		for (j=1; j < nr; j++)
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
		T = _mm_aesenclast_si128(tmp1, KEY[nr]);
	}

	gfmul(H,H,&H2);
	gfmul(H,H2,&H3);
	gfmul(H,H3,&H4);

	for (i=0; i<abytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)addt)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i=i*4; i<abytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)addt)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X,tmp1);
		gfmul(X, H, &X);
	}
	if (abytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<abytes%16; j++)
			((unsigned char*)&last_block)[j] = addt[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X =_mm_xor_si128(X,tmp1);
		gfmul(X,H,&X);
	}

	/* This is where we validate the cipher text before decrypt */
	for (i = 0; i<nbytes/16/4; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i*4]);
		tmp2 = _mm_loadu_si128(&((const __m128i *)in)[i*4+1]);
		tmp3 = _mm_loadu_si128(&((const __m128i *)in)[i*4+2]);
		tmp4 = _mm_loadu_si128(&((const __m128i *)in)[i*4+3]);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);

		tmp1 = _mm_xor_si128(X, tmp1);

		reduce4(H, H2, H3, H4, tmp4, tmp3, tmp2, tmp1, &X);
	}
	for (i = i*4; i<nbytes/16; i++) {
		tmp1 = _mm_loadu_si128(&((const __m128i *)in)[i]);
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X,H,&X);
	}
	if (nbytes%16) {
		last_block = _mm_setzero_si128();
		for (j=0; j<nbytes%16; j++)
			((unsigned char*)&last_block)[j] = in[i*16+j];
		tmp1 = last_block;
		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		X = _mm_xor_si128(X, tmp1);
		gfmul(X, H, &X);
	}

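	/*
	 * Append the bit lengths, finish GHASH and compare the resulting
	 * tag against the one supplied by the caller.
	 */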
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)nbytes*8, 0);
	tmp1 = _mm_insert_epi64(tmp1, (uint64_t)abytes*8, 1);

	X = _mm_xor_si128(X, tmp1);
	gfmul(X,H,&X);
	X = _mm_shuffle_epi8(X, BSWAP_MASK);
	T = _mm_xor_si128(X, T);

	if (!m128icmp(T, _mm_loadu_si128((const __m128i*)tag)))
		return 0; /* authentication failed */

	ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
	ctr1 = _mm_add_epi64(ctr1, ONE);
	ctr2 = _mm_add_epi64(ctr1, ONE);
	ctr3 = _mm_add_epi64(ctr2, ONE);
	ctr4 = _mm_add_epi64(ctr3, ONE);
	ctr5 = _mm_add_epi64(ctr4, ONE);
	ctr6 = _mm_add_epi64(ctr5, ONE);
	ctr7 = _mm_add_epi64(ctr6, ONE);
	ctr8 = _mm_add_epi64(ctr7, ONE);

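	/*
	 * Authentication succeeded: decrypt eight counter blocks per
	 * iteration, then the remaining whole blocks, then any partial
	 * block.
	 */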
	for (i=0; i<nbytes/16/8; i++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
		tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
		tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
		tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64);
		tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64);
		tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64);
		tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64);

		ctr1 = _mm_add_epi64(ctr1, EIGHT);
		ctr2 = _mm_add_epi64(ctr2, EIGHT);
		ctr3 = _mm_add_epi64(ctr3, EIGHT);
		ctr4 = _mm_add_epi64(ctr4, EIGHT);
		ctr5 = _mm_add_epi64(ctr5, EIGHT);
		ctr6 = _mm_add_epi64(ctr6, EIGHT);
		ctr7 = _mm_add_epi64(ctr7, EIGHT);
		ctr8 = _mm_add_epi64(ctr8, EIGHT);

		tmp1 =_mm_xor_si128(tmp1, KEY[0]);
		tmp2 =_mm_xor_si128(tmp2, KEY[0]);
		tmp3 =_mm_xor_si128(tmp3, KEY[0]);
		tmp4 =_mm_xor_si128(tmp4, KEY[0]);
		tmp5 =_mm_xor_si128(tmp5, KEY[0]);
		tmp6 =_mm_xor_si128(tmp6, KEY[0]);
		tmp7 =_mm_xor_si128(tmp7, KEY[0]);
		tmp8 =_mm_xor_si128(tmp8, KEY[0]);

		for (j=1; j<nr; j++) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
			tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
			tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
			tmp5 = _mm_aesenc_si128(tmp5, KEY[j]);
			tmp6 = _mm_aesenc_si128(tmp6, KEY[j]);
			tmp7 = _mm_aesenc_si128(tmp7, KEY[j]);
			tmp8 = _mm_aesenc_si128(tmp8, KEY[j]);
		}
		tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
		tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
		tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
		tmp5 =_mm_aesenclast_si128(tmp5, KEY[nr]);
		tmp6 =_mm_aesenclast_si128(tmp6, KEY[nr]);
		tmp7 =_mm_aesenclast_si128(tmp7, KEY[nr]);
		tmp8 =_mm_aesenclast_si128(tmp8, KEY[nr]);

		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+0]));
		tmp2 = _mm_xor_si128(tmp2,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+1]));
		tmp3 = _mm_xor_si128(tmp3,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+2]));
		tmp4 = _mm_xor_si128(tmp4,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+3]));
		tmp5 = _mm_xor_si128(tmp5,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+4]));
		tmp6 = _mm_xor_si128(tmp6,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+5]));
		tmp7 = _mm_xor_si128(tmp7,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+6]));
		tmp8 = _mm_xor_si128(tmp8,
		    _mm_loadu_si128(&((const __m128i *)in)[i*8+7]));

		_mm_storeu_si128(&((__m128i*)out)[i*8+0], tmp1);
		_mm_storeu_si128(&((__m128i*)out)[i*8+1], tmp2);
		_mm_storeu_si128(&((__m128i*)out)[i*8+2], tmp3);
		_mm_storeu_si128(&((__m128i*)out)[i*8+3], tmp4);
		_mm_storeu_si128(&((__m128i*)out)[i*8+4], tmp5);
		_mm_storeu_si128(&((__m128i*)out)[i*8+5], tmp6);
		_mm_storeu_si128(&((__m128i*)out)[i*8+6], tmp7);
		_mm_storeu_si128(&((__m128i*)out)[i*8+7], tmp8);

		tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
		tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
		tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
		tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
		tmp5 = _mm_shuffle_epi8(tmp5, BSWAP_MASK);
		tmp6 = _mm_shuffle_epi8(tmp6, BSWAP_MASK);
		tmp7 = _mm_shuffle_epi8(tmp7, BSWAP_MASK);
		tmp8 = _mm_shuffle_epi8(tmp8, BSWAP_MASK);
	}
	for (k=i*8; k<nbytes/16; k++) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		ctr1 = _mm_add_epi64(ctr1, ONE);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		tmp1 = _mm_xor_si128(tmp1,
		    _mm_loadu_si128(&((const __m128i *)in)[k]));
		_mm_storeu_si128(&((__m128i*)out)[k], tmp1);
	}
	/* If one incomplete block remains */
	if (nbytes%16) {
		tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
		tmp1 = _mm_xor_si128(tmp1, KEY[0]);
		for (j=1; j<nr-1; j+=2) {
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
			tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
		}
		tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
		tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
		last_block = _mm_setzero_si128();
		memcpy(&last_block, &((const __m128i *)in)[k], nbytes%16);
		tmp1 = _mm_xor_si128(tmp1, last_block);
		last_block = tmp1;
		for (j=0; j<nbytes%16; j++)
			out[k*16+j] = ((unsigned char*)&last_block)[j];
	}
	return 1; /* success */
}