/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

/*
 * This is the GHASH implementation that leverages the pclmulqdq opcode
 * (from the AES-NI instructions).
 */

#if BR_AES_X86NI

/*
 * Test CPU support for PCLMULQDQ.
 */
static inline int
pclmul_supported(void)
{
	/*
	 * Bit mask for features in ECX:
	 *    1   PCLMULQDQ support
	 */
	return br_cpuid(0, 0, 0x00000002, 0);
}

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return pclmul_supported() ? &br_ghash_pclmul : 0;
}

BR_TARGETS_X86_UP

/*
 * GHASH is defined over elements of GF(2^128) with "full little-endian"
 * representation: leftmost byte is least significant, and, within each
 * byte, leftmost _bit_ is least significant. The natural ordering in
 * x86 is "mixed little-endian": bytes are ordered from least to most
 * significant, but bits within a byte are in most-to-least significant
 * order. Going to full little-endian representation would require
 * reversing bits within each byte, which is doable but expensive.
 *
 * Instead, we go to full big-endian representation, by swapping bytes
 * around, which is done with a single _mm_shuffle_epi8() opcode (it
 * comes with SSSE3; all CPUs that offer pclmulqdq also have SSSE3). We
 * can use a full big-endian representation because in a carryless
 * multiplication, we have a nice bit reversal property:
 *
 *    rev_128(x) * rev_128(y) = rev_255(x * y)
 *
 * So by using full big-endian, we still get the right result, except
 * that it is right-shifted by 1 bit. The compensating left-shift is
 * relatively inexpensive, and it can be mutualised (performed once on
 * an aggregated product rather than after each multiplication).
 *
 * Since SSE2 opcodes do not have facilities for shifting full 128-bit
 * values with bit precision, we have to break down values into 64-bit
 * chunks. We number chunks from 0 to 3 in left to right order.
 */

/*
 * Byte-swap a complete 128-bit value. This normally uses
 * _mm_shuffle_epi8(), which gets translated to pshufb (an SSSE3 opcode).
 * However, this crashes old Clang versions, so, for Clang before 3.8,
 * we use an alternate (and less efficient) version.
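 *
 * The alternate version needs only SSE2: it first swaps the two bytes
 * within each 16-bit lane (the paired 8-bit shifts and the OR), then
 * reverses the four 16-bit lanes within each 64-bit half
 * (_mm_shufflelo_epi16() and _mm_shufflehi_epi16() with selector 0x1B),
 * and finally exchanges the two 64-bit halves (_mm_shuffle_epi32() with
 * selector 0x4E), which amounts to the same full 16-byte reversal that
 * pshufb performs in one instruction.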
 */
#if BR_CLANG && !BR_CLANG_3_8
#define BYTESWAP_DECL
#define BYTESWAP_PREP   (void)0
#define BYTESWAP(x)   do { \
		__m128i byteswap1, byteswap2; \
		byteswap1 = (x); \
		byteswap2 = _mm_srli_epi16(byteswap1, 8); \
		byteswap1 = _mm_slli_epi16(byteswap1, 8); \
		byteswap1 = _mm_or_si128(byteswap1, byteswap2); \
		byteswap1 = _mm_shufflelo_epi16(byteswap1, 0x1B); \
		byteswap1 = _mm_shufflehi_epi16(byteswap1, 0x1B); \
		(x) = _mm_shuffle_epi32(byteswap1, 0x4E); \
	} while (0)
#else
#define BYTESWAP_DECL   __m128i byteswap_index;
#define BYTESWAP_PREP   do { \
		byteswap_index = _mm_set_epi8( \
			0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); \
	} while (0)
#define BYTESWAP(x)   do { \
		(x) = _mm_shuffle_epi8((x), byteswap_index); \
	} while (0)
#endif

/*
 * Call pclmulqdq. Clang appears to have trouble with the intrinsic, so,
 * for that compiler, we use inline assembly. Inline assembly is
 * potentially a bit slower because the compiler does not understand
 * what the opcode does, and thus cannot optimize instruction
 * scheduling.
 *
 * We use a target of "sse2" only, so that Clang may still handle the
 * '__m128i' type and allocate SSE2 registers.
 */
#if BR_CLANG
BR_TARGET("sse2")
static inline __m128i
pclmulqdq00(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x00, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
BR_TARGET("sse2")
static inline __m128i
pclmulqdq11(__m128i x, __m128i y)
{
	__asm__ ("pclmulqdq $0x11, %1, %0" : "+x" (x) : "x" (y));
	return x;
}
#else
#define pclmulqdq00(x, y)   _mm_clmulepi64_si128(x, y, 0x00)
#define pclmulqdq11(x, y)   _mm_clmulepi64_si128(x, y, 0x11)
#endif

/*
 * From a 128-bit value kw, compute kx as the XOR of the two 64-bit
 * halves of kw (into the right half of kx; left half is unspecified).
 */
#define BK(kw, kx)   do { \
		kx = _mm_xor_si128(kw, _mm_shuffle_epi32(kw, 0x0E)); \
	} while (0)

/*
 * Combine two 64-bit values (k0:k1) into a 128-bit (kw) value and
 * the XOR of the two values (kx).
 */
#define PBK(k0, k1, kw, kx)   do { \
		kw = _mm_unpacklo_epi64(k1, k0); \
		kx = _mm_xor_si128(k0, k1); \
	} while (0)

/*
 * Left-shift by 1 bit a 256-bit value (in four 64-bit words).
 */
#define SL_256(x0, x1, x2, x3)   do { \
		x0 = _mm_or_si128( \
			_mm_slli_epi64(x0, 1), \
			_mm_srli_epi64(x1, 63)); \
		x1 = _mm_or_si128( \
			_mm_slli_epi64(x1, 1), \
			_mm_srli_epi64(x2, 63)); \
		x2 = _mm_or_si128( \
			_mm_slli_epi64(x2, 1), \
			_mm_srli_epi64(x3, 63)); \
		x3 = _mm_slli_epi64(x3, 1); \
	} while (0)

/*
 * Perform reduction in GF(2^128). The 256-bit value is in x0..x3;
 * result is written in x0..x1.
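 *
 * The shift counts below derive from the GHASH modulus
 * x^128 + x^7 + x^2 + x + 1: the low-degree terms 1, x, x^2 and x^7
 * translate into XORing each folded 64-bit word unshifted and
 * right-shifted by 1, 2 and 7, while the complementary left shifts by
 * 63, 62 and 57 propagate the bits that spill across a 64-bit word
 * boundary. Shifts go in this direction because of the reversed bit
 * order described at the top of this file.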
 */
#define REDUCE_F128(x0, x1, x2, x3)   do { \
		x1 = _mm_xor_si128( \
			x1, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x3, \
					_mm_srli_epi64(x3, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x3, 2), \
					_mm_srli_epi64(x3, 7)))); \
		x2 = _mm_xor_si128( \
			_mm_xor_si128( \
				x2, \
				_mm_slli_epi64(x3, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x3, 62), \
				_mm_slli_epi64(x3, 57))); \
		x0 = _mm_xor_si128( \
			x0, \
			_mm_xor_si128( \
				_mm_xor_si128( \
					x2, \
					_mm_srli_epi64(x2, 1)), \
				_mm_xor_si128( \
					_mm_srli_epi64(x2, 2), \
					_mm_srli_epi64(x2, 7)))); \
		x1 = _mm_xor_si128( \
			_mm_xor_si128( \
				x1, \
				_mm_slli_epi64(x2, 63)), \
			_mm_xor_si128( \
				_mm_slli_epi64(x2, 62), \
				_mm_slli_epi64(x2, 57))); \
	} while (0)

/*
 * Square value kw into (dw,dx).
 */
#define SQUARE_F128(kw, dw, dx)   do { \
		__m128i z0, z1, z2, z3; \
		z1 = pclmulqdq11(kw, kw); \
		z3 = pclmulqdq00(kw, kw); \
		z0 = _mm_shuffle_epi32(z1, 0x0E); \
		z2 = _mm_shuffle_epi32(z3, 0x0E); \
		SL_256(z0, z1, z2, z3); \
		REDUCE_F128(z0, z1, z2, z3); \
		PBK(z0, z1, dw, dx); \
	} while (0)

/* see bearssl_hash.h */
BR_TARGET("ssse3,pclmul")
void
br_ghash_pclmul(void *y, const void *h, const void *data, size_t len)
{
	const unsigned char *buf1, *buf2;
	unsigned char tmp[64];
	size_t num4, num1;
	__m128i yw, h1w, h1x;
	BYTESWAP_DECL

	/*
	 * We split data into two chunks. First chunk starts at buf1
	 * and contains num4 blocks of 64-byte values. Second chunk
	 * starts at buf2 and contains num1 blocks of 16-byte values.
	 * We want the first chunk to be as large as possible.
	 */
	buf1 = data;
	num4 = len >> 6;
	len &= 63;
	buf2 = buf1 + (num4 << 6);
	num1 = (len + 15) >> 4;
	if ((len & 15) != 0) {
		memcpy(tmp, buf2, len);
		memset(tmp + len, 0, (num1 << 4) - len);
		buf2 = tmp;
	}

	/*
	 * Preparatory step for endian conversions.
	 */
	BYTESWAP_PREP;

	/*
	 * Load y and h.
	 */
	yw = _mm_loadu_si128(y);
	h1w = _mm_loadu_si128(h);
	BYTESWAP(yw);
	BYTESWAP(h1w);
	BK(h1w, h1x);

	if (num4 > 0) {
		__m128i h2w, h2x, h3w, h3x, h4w, h4x;
		__m128i t0, t1, t2, t3;

		/*
		 * Compute h2 = h^2.
		 */
		SQUARE_F128(h1w, h2w, h2x);

		/*
		 * Compute h3 = h^3 = h*(h^2).
		 */
		t1 = pclmulqdq11(h1w, h2w);
		t3 = pclmulqdq00(h1w, h2w);
		t2 = _mm_xor_si128(pclmulqdq00(h1x, h2x),
			_mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		PBK(t0, t1, h3w, h3x);
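
		/*
		 * The sequence above is the generic multiplication
		 * pattern used throughout this function: the two outer
		 * 64x64 carryless products are computed directly with
		 * pclmulqdq, the middle term is recovered from the
		 * product of the XORed halves (the '*x' values built
		 * with BK()/PBK()) by XORing the outer products back
		 * out, then the three partial products are assembled
		 * into a 256-bit value, shifted left by one bit
		 * (SL_256) to compensate for the reversed
		 * representation, and reduced with REDUCE_F128().
		 */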

		/*
		 * Compute h4 = h^4 = (h^2)^2.
		 */
		SQUARE_F128(h2w, h4w, h4x);

		while (num4 -- > 0) {
			__m128i aw0, aw1, aw2, aw3;
			__m128i ax0, ax1, ax2, ax3;

			aw0 = _mm_loadu_si128((void *)(buf1 +  0));
			aw1 = _mm_loadu_si128((void *)(buf1 + 16));
			aw2 = _mm_loadu_si128((void *)(buf1 + 32));
			aw3 = _mm_loadu_si128((void *)(buf1 + 48));
			BYTESWAP(aw0);
			BYTESWAP(aw1);
			BYTESWAP(aw2);
			BYTESWAP(aw3);
			buf1 += 64;

			aw0 = _mm_xor_si128(aw0, yw);
			BK(aw1, ax1);
			BK(aw2, ax2);
			BK(aw3, ax3);
			BK(aw0, ax0);

			t1 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq11(aw0, h4w),
					pclmulqdq11(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq11(aw2, h2w),
					pclmulqdq11(aw3, h1w)));
			t3 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(aw0, h4w),
					pclmulqdq00(aw1, h3w)),
				_mm_xor_si128(
					pclmulqdq00(aw2, h2w),
					pclmulqdq00(aw3, h1w)));
			t2 = _mm_xor_si128(
				_mm_xor_si128(
					pclmulqdq00(ax0, h4x),
					pclmulqdq00(ax1, h3x)),
				_mm_xor_si128(
					pclmulqdq00(ax2, h2x),
					pclmulqdq00(ax3, h1x)));
			t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
			t0 = _mm_shuffle_epi32(t1, 0x0E);
			t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
			t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
			SL_256(t0, t1, t2, t3);
			REDUCE_F128(t0, t1, t2, t3);
			yw = _mm_unpacklo_epi64(t1, t0);
		}
	}

	while (num1 -- > 0) {
		__m128i aw, ax;
		__m128i t0, t1, t2, t3;

		aw = _mm_loadu_si128((void *)buf2);
		BYTESWAP(aw);
		buf2 += 16;

		aw = _mm_xor_si128(aw, yw);
		BK(aw, ax);

		t1 = pclmulqdq11(aw, h1w);
		t3 = pclmulqdq00(aw, h1w);
		t2 = pclmulqdq00(ax, h1x);
		t2 = _mm_xor_si128(t2, _mm_xor_si128(t1, t3));
		t0 = _mm_shuffle_epi32(t1, 0x0E);
		t1 = _mm_xor_si128(t1, _mm_shuffle_epi32(t2, 0x0E));
		t2 = _mm_xor_si128(t2, _mm_shuffle_epi32(t3, 0x0E));
		SL_256(t0, t1, t2, t3);
		REDUCE_F128(t0, t1, t2, t3);
		yw = _mm_unpacklo_epi64(t1, t0);
	}

	BYTESWAP(yw);
	_mm_storeu_si128(y, yw);
}

BR_TARGETS_X86_DOWN

#else

/* see bearssl_hash.h */
br_ghash
br_ghash_pclmul_get(void)
{
	return 0;
}

#endif
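
/*
 * Illustrative usage sketch (not part of the library): the pointer
 * returned by br_ghash_pclmul_get() follows the br_ghash calling
 * convention, with 'y' pointing to the 16-byte running state and 'h'
 * to the 16-byte hash key. The variables gcm_y, gcm_h, aad and aad_len
 * below are hypothetical; br_ghash_ctmul64 is one of the portable
 * fallback implementations.
 *
 *	br_ghash gh;
 *
 *	gh = br_ghash_pclmul_get();
 *	if (gh == 0) {
 *		gh = &br_ghash_ctmul64;
 *	}
 *	gh(gcm_y, gcm_h, aad, aad_len);
 */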