/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}
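
/*
 * Usage sketch (explanatory addition, not part of this file): callers
 * obtain the function pointer at runtime and fall back to a portable
 * implementation when pclmulqdq is unavailable; br_ghash_ctmul is the
 * constant-time software GHASH declared in bearssl_hash.h.
 *
 *   br_ghash gh;
 *
 *   gh = br_ghash_pclmul_get();
 *   if (gh == 0) {
 *       gh = &br_ghash_ctmul;
 *   }
 *   gh(y, h, data, len);
 */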

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *   rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The left-shift is relatively
 * inexpensive, and it can be mutualised.
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */
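
/*
 * A small-scale illustration of that reversal property (explanatory
 * addition with hypothetical helper names, not used by the code below).
 * With an 8-bit carryless multiply producing a 15-bit result:
 *
 *   static unsigned
 *   clmul8(unsigned x, unsigned y)
 *   {
 *       unsigned z;
 *       int i;
 *
 *       z = 0;
 *       for (i = 0; i < 8; i ++) {
 *           z ^= (x << i) & -((y >> i) & 1);
 *       }
 *       return z;
 *   }
 *
 * then, with rev8() reversing 8 bits and rev15() reversing 15 bits,
 * one can check exhaustively that:
 *
 *   clmul8(rev8(x), rev8(y)) == rev15(clmul8(x, y))
 *
 * The 128-bit case relied upon here is the same identity, one size up.
 */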

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)
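
/*
 * Why the XOR-of-halves values exist (explanatory addition): the
 * 128x128 -> 256 carryless multiplications below use a Karatsuba-like
 * decomposition with three pclmulqdq instead of four. With x = x1:x0
 * and y = y1:y0 (64-bit halves), and all sums being XORs:
 *
 *   x*y = (x1*y1)*2^128
 *       + ((x1*y1) + (x0*y0) + (x0+x1)*(y0+y1))*2^64
 *       + (x0*y0)
 *
 * BK()/PBK() precompute the (x0+x1) operands for the middle term.
 */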

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)
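
/*
 * Why those shift counts (explanatory addition): the GHASH modulus is
 * the pentanomial X^128 + X^7 + X^2 + X + 1. In the bit-reversed
 * representation used here, reduction folds the two extra words back:
 * a word w is XORed as w ^ (w >> 1) ^ (w >> 2) ^ (w >> 7) into the
 * word two positions to its left, and as (w << 63) ^ (w << 62) ^
 * (w << 57) into the word just left of it. The shift counts {1, 2, 7}
 * and {64-1, 64-2, 64-7} come from the exponents 1, 2 and 7 of the
 * modulus; the macro applies this first to x3, then to the updated x2.
 */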

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)
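
/*
 * Squaring needs only two pclmulqdq (explanatory addition): in
 * GF(2)[X], (a1*2^64 + a0)^2 = (a1^2)*2^128 + (a0^2), since the cross
 * terms cancel in characteristic 2. The Karatsuba middle term is thus
 * zero, and SQUARE_F128 merely shuffles the two partial products into
 * the four 64-bit words before shifting and reducing.
 */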

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}
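
	/*
	 * Worked example (explanatory addition): for len = 150, num4 = 2,
	 * so 128 bytes are processed four blocks at a time from buf1; the
	 * remaining 22 bytes give num1 = 2, and tmp receives those 22
	 * bytes followed by 10 zeros, so the tail is handled as two full
	 * 16-byte blocks (zero-padding matches the GHASH convention).
	 */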

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y and h.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);
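
		/*
		 * Aggregated processing (explanatory addition): each
		 * iteration below computes, with a single shift and
		 * reduction:
		 *
		 *   y <- ((y ^ a0)*h^4) ^ (a1*h^3) ^ (a2*h^2) ^ (a3*h)
		 *
		 * which is equivalent to four chained y <- (y ^ a_i)*h
		 * steps, but amortises the reduction cost over four
		 * blocks.
		 */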
		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

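	/*
	 * Remaining blocks are processed one at a time (explanatory
	 * addition): this is the GHASH definition itself,
	 * y <- (y ^ a_i)*h, using the same three-pclmulqdq Karatsuba
	 * multiply, shift and reduction as above.
	 */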
	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif