/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#define BR_ENABLE_INTRINSICS 1
#include "inner.h"

#if BR_AES_X86NI

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)
{
	return br_aes_x86ni_supported() ? &br_aes_x86ni_ctrcbc_vtable : NULL;
}

/* see bearssl_block.h */
void
br_aes_x86ni_ctrcbc_init(br_aes_x86ni_ctrcbc_keys *ctx,
	const void *key, size_t len)
{
	ctx->vtable = &br_aes_x86ni_ctrcbc_vtable;
	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
}

BR_TARGETS_X86_UP

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_ctr(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx0, ivx1, ivx2, ivx3;
	__m128i erev, zero, one, four, notthree;
	unsigned u;

	buf = data;
	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);
	four = _mm_set_epi64x(0, 4);
	notthree = _mm_sub_epi64(zero, four);

	/*
	 * Decode the counter in big-endian and pre-increment the other
	 * three counters.
	 */
	ivx0 = _mm_shuffle_epi8(_mm_loadu_si128((void *)ctr), erev);
	ivx1 = _mm_add_epi64(ivx0, one);
	ivx1 = _mm_sub_epi64(ivx1,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx1, zero), 8));
	ivx2 = _mm_add_epi64(ivx1, one);
	ivx2 = _mm_sub_epi64(ivx2,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx2, zero), 8));
	ivx3 = _mm_add_epi64(ivx2, one);
	ivx3 = _mm_sub_epi64(ivx3,
		_mm_slli_si128(_mm_cmpeq_epi64(ivx3, zero), 8));
	while (len > 0) {
		__m128i x0, x1, x2, x3;

		/*
		 * Load counter values; we need to byteswap them because
		 * the specification says that they use big-endian.
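		 * (erev reverses the 16 bytes of a 128-bit value:
		 * _mm_set_epi8() lists bytes from most significant to
		 * least, so the 0..15 pattern above maps output byte i
		 * to input byte 15-i under _mm_shuffle_epi8().)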
		 */
		x0 = _mm_shuffle_epi8(ivx0, erev);
		x1 = _mm_shuffle_epi8(ivx1, erev);
		x2 = _mm_shuffle_epi8(ivx2, erev);
		x3 = _mm_shuffle_epi8(ivx3, erev);

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x2 = _mm_xor_si128(x2, sk[0]);
		x3 = _mm_xor_si128(x3, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x2 = _mm_aesenc_si128(x2, sk[1]);
		x3 = _mm_aesenc_si128(x3, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x2 = _mm_aesenc_si128(x2, sk[2]);
		x3 = _mm_aesenc_si128(x3, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x2 = _mm_aesenc_si128(x2, sk[3]);
		x3 = _mm_aesenc_si128(x3, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x2 = _mm_aesenc_si128(x2, sk[4]);
		x3 = _mm_aesenc_si128(x3, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x2 = _mm_aesenc_si128(x2, sk[5]);
		x3 = _mm_aesenc_si128(x3, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x2 = _mm_aesenc_si128(x2, sk[6]);
		x3 = _mm_aesenc_si128(x3, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x2 = _mm_aesenc_si128(x2, sk[7]);
		x3 = _mm_aesenc_si128(x3, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x2 = _mm_aesenc_si128(x2, sk[8]);
		x3 = _mm_aesenc_si128(x3, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		x2 = _mm_aesenc_si128(x2, sk[9]);
		x3 = _mm_aesenc_si128(x3, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
			x2 = _mm_aesenclast_si128(x2, sk[10]);
			x3 = _mm_aesenclast_si128(x3, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
			x2 = _mm_aesenclast_si128(x2, sk[12]);
			x3 = _mm_aesenclast_si128(x3, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x2 = _mm_aesenc_si128(x2, sk[12]);
			x3 = _mm_aesenc_si128(x3, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x2 = _mm_aesenc_si128(x2, sk[13]);
			x3 = _mm_aesenc_si128(x3, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
			x2 = _mm_aesenclast_si128(x2, sk[14]);
			x3 = _mm_aesenclast_si128(x3, sk[14]);
		}
		if (len >= 64) {
			x0 = _mm_xor_si128(x0,
				_mm_loadu_si128((void *)(buf + 0)));
			x1 = _mm_xor_si128(x1,
				_mm_loadu_si128((void *)(buf + 16)));
			x2 = _mm_xor_si128(x2,
				_mm_loadu_si128((void *)(buf + 32)));
			x3 = _mm_xor_si128(x3,
				_mm_loadu_si128((void *)(buf + 48)));
			_mm_storeu_si128((void *)(buf + 0), x0);
			_mm_storeu_si128((void *)(buf + 16), x1);
			_mm_storeu_si128((void *)(buf + 32), x2);
			_mm_storeu_si128((void *)(buf + 48), x3);
			buf += 64;
			len -= 64;
		} else {
			unsigned char tmp[64];

			_mm_storeu_si128((void *)(tmp + 0), x0);
			_mm_storeu_si128((void *)(tmp + 16), x1);
			_mm_storeu_si128((void *)(tmp + 32), x2);
			_mm_storeu_si128((void *)(tmp + 48), x3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			switch (len) {
			case 16:
				ivx0 = ivx1;
				break;
			case 32:
				ivx0 = ivx2;
				break;
			case 48:
				ivx0 = ivx3;
				break;
			}
			break;
		}

		/*
		 * Add 4 to each counter value. For carry propagation
		 * into the upper 64-bit words, we would need to compare
		 * the results with 4, but SSE2+ has only _signed_
		 * comparisons. Instead, we mask out the low two bits,
		 * and check whether the remaining bits are zero.
		 */
		ivx0 = _mm_add_epi64(ivx0, four);
		ivx1 = _mm_add_epi64(ivx1, four);
		ivx2 = _mm_add_epi64(ivx2, four);
		ivx3 = _mm_add_epi64(ivx3, four);
		ivx0 = _mm_sub_epi64(ivx0,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx0, notthree), zero), 8));
		ivx1 = _mm_sub_epi64(ivx1,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx1, notthree), zero), 8));
		ivx2 = _mm_sub_epi64(ivx2,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx2, notthree), zero), 8));
		ivx3 = _mm_sub_epi64(ivx3,
			_mm_slli_si128(_mm_cmpeq_epi64(
				_mm_and_si128(ivx3, notthree), zero), 8));
	}

	/*
	 * Write back new counter value. The loop took care to put the
	 * right counter value in ivx0.
	 */
	_mm_storeu_si128((void *)ctr, _mm_shuffle_epi8(ivx0, erev));
}

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_mac(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *cbcmac, const void *data, size_t len)
{
	const unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15], ivx;
	unsigned u;

	buf = data;
	ivx = _mm_loadu_si128(cbcmac);
	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}
	while (len > 0) {
		__m128i x;

		x = _mm_xor_si128(_mm_loadu_si128((void *)buf), ivx);
		x = _mm_xor_si128(x, sk[0]);
		x = _mm_aesenc_si128(x, sk[1]);
		x = _mm_aesenc_si128(x, sk[2]);
		x = _mm_aesenc_si128(x, sk[3]);
		x = _mm_aesenc_si128(x, sk[4]);
		x = _mm_aesenc_si128(x, sk[5]);
		x = _mm_aesenc_si128(x, sk[6]);
		x = _mm_aesenc_si128(x, sk[7]);
		x = _mm_aesenc_si128(x, sk[8]);
		x = _mm_aesenc_si128(x, sk[9]);
		if (num_rounds == 10) {
			x = _mm_aesenclast_si128(x, sk[10]);
		} else if (num_rounds == 12) {
			x = _mm_aesenc_si128(x, sk[10]);
			x = _mm_aesenc_si128(x, sk[11]);
			x = _mm_aesenclast_si128(x, sk[12]);
		} else {
			x = _mm_aesenc_si128(x, sk[10]);
			x = _mm_aesenc_si128(x, sk[11]);
			x = _mm_aesenc_si128(x, sk[12]);
			x = _mm_aesenc_si128(x, sk[13]);
			x = _mm_aesenclast_si128(x, sk[14]);
		}
		ivx = x;
		buf += 16;
		len -= 16;
	}
	_mm_storeu_si128(cbcmac, ivx);
}

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_encrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx, cmx;
	__m128i erev, zero, one;
	unsigned u;
	int first_iter;

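	/*
	 * Scheduling note: the CTR encryption and the CBC-MAC run as
	 * two interleaved AESENC chains, which keeps the AES unit
	 * busier than a strictly sequential computation would. The
	 * MAC is therefore one block "late": cmx holds the previous
	 * MAC output XORed with the last ciphertext block, and goes
	 * through AES only during the next iteration; first_iter and
	 * the final extra encryption below account for that lag.
	 */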
	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	first_iter = 1;
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   block of data to encrypt
		 *   x0   counter (for CTR encryption)
		 *   x1   input for CBC-MAC
		 */
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = cmx;

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}

		x0 = _mm_xor_si128(x0, dx);
		if (first_iter) {
			cmx = _mm_xor_si128(cmx, x0);
			first_iter = 0;
		} else {
			cmx = _mm_xor_si128(x1, x0);
		}
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value.
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));

		/*
		 * If this was the last iteration, then compute the
		 * extra block encryption to complete CBC-MAC.
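		 * (At this point cmx is still the XOR of the previous
		 * MAC output with the final ciphertext block; one more
		 * AES encryption closes the CBC-MAC chain.)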
		 */
		if (len == 0) {
			cmx = _mm_xor_si128(cmx, sk[0]);
			cmx = _mm_aesenc_si128(cmx, sk[1]);
			cmx = _mm_aesenc_si128(cmx, sk[2]);
			cmx = _mm_aesenc_si128(cmx, sk[3]);
			cmx = _mm_aesenc_si128(cmx, sk[4]);
			cmx = _mm_aesenc_si128(cmx, sk[5]);
			cmx = _mm_aesenc_si128(cmx, sk[6]);
			cmx = _mm_aesenc_si128(cmx, sk[7]);
			cmx = _mm_aesenc_si128(cmx, sk[8]);
			cmx = _mm_aesenc_si128(cmx, sk[9]);
			if (num_rounds == 10) {
				cmx = _mm_aesenclast_si128(cmx, sk[10]);
			} else if (num_rounds == 12) {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenclast_si128(cmx, sk[12]);
			} else {
				cmx = _mm_aesenc_si128(cmx, sk[10]);
				cmx = _mm_aesenc_si128(cmx, sk[11]);
				cmx = _mm_aesenc_si128(cmx, sk[12]);
				cmx = _mm_aesenc_si128(cmx, sk[13]);
				cmx = _mm_aesenclast_si128(cmx, sk[14]);
			}
			break;
		}
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
void
br_aes_x86ni_ctrcbc_decrypt(const br_aes_x86ni_ctrcbc_keys *ctx,
	void *ctr, void *cbcmac, void *data, size_t len)
{
	unsigned char *buf;
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx, cmx;
	__m128i erev, zero, one;
	unsigned u;

	num_rounds = ctx->num_rounds;
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}

	/*
	 * Some SSE2 constants.
	 */
	erev = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
		8, 9, 10, 11, 12, 13, 14, 15);
	zero = _mm_setzero_si128();
	one = _mm_set_epi64x(0, 1);

	/*
	 * Decode the counter in big-endian.
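	 * As in the other functions, the counter is kept byteswapped
	 * (little-endian 64-bit lanes) between iterations so that it
	 * can be incremented with _mm_add_epi64(); erev converts back
	 * to big-endian order whenever the value is encrypted or
	 * written out.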
	 */
	ivx = _mm_shuffle_epi8(_mm_loadu_si128(ctr), erev);
	cmx = _mm_loadu_si128(cbcmac);

	buf = data;
	while (len > 0) {
		__m128i dx, x0, x1;

		/*
		 * Load initial values:
		 *   dx   encrypted block of data
		 *   x0   counter (for CTR encryption)
		 *   x1   input for CBC-MAC
		 */
		dx = _mm_loadu_si128((void *)buf);
		x0 = _mm_shuffle_epi8(ivx, erev);
		x1 = _mm_xor_si128(cmx, dx);

		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
		}
		x0 = _mm_xor_si128(x0, dx);
		cmx = x1;
		_mm_storeu_si128((void *)buf, x0);

		buf += 16;
		len -= 16;

		/*
		 * Increment the counter value.
		 */
		ivx = _mm_add_epi64(ivx, one);
		ivx = _mm_sub_epi64(ivx,
			_mm_slli_si128(_mm_cmpeq_epi64(ivx, zero), 8));
	}

	/*
	 * Write back new counter value and CBC-MAC value.
	 */
	_mm_storeu_si128(ctr, _mm_shuffle_epi8(ivx, erev));
	_mm_storeu_si128(cbcmac, cmx);
}

BR_TARGETS_X86_DOWN

/* see bearssl_block.h */
const br_block_ctrcbc_class br_aes_x86ni_ctrcbc_vtable = {
	sizeof(br_aes_x86ni_ctrcbc_keys),
	16,
	4,
	(void (*)(const br_block_ctrcbc_class **, const void *, size_t))
		&br_aes_x86ni_ctrcbc_init,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_encrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_decrypt,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, void *, size_t))
		&br_aes_x86ni_ctrcbc_ctr,
	(void (*)(const br_block_ctrcbc_class *const *,
		void *, const void *, size_t))
		&br_aes_x86ni_ctrcbc_mac
};

#else

/* see bearssl_block.h */
const br_block_ctrcbc_class *
br_aes_x86ni_ctrcbc_get_vtable(void)
{
	return NULL;
}

#endif
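/*
 * Illustrative usage sketch (not part of this file's code; variable
 * names are made up for the example). Callers normally go through the
 * vtable, so that another implementation can be selected at runtime
 * when AES-NI is not available:
 *
 *    br_aes_x86ni_ctrcbc_keys kc;
 *    const br_block_ctrcbc_class *vt;
 *
 *    vt = br_aes_x86ni_ctrcbc_get_vtable();
 *    if (vt != NULL) {
 *        vt->init(&kc.vtable, key, key_len);
 *        vt->encrypt(&kc.vtable, ctr, cbcmac, data, data_len);
 *    }
 *
 * Here ctr and cbcmac each point to 16-byte buffers that are updated
 * in place, and data_len is a multiple of 16 (the block size declared
 * in the vtable); see bearssl_block.h for the authoritative contract.
 */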