/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/*
 * AES/CTR implementation that uses the x86 AES-NI instructions
 * (via compiler intrinsics), processing four 16-byte blocks per
 * iteration to keep the AES pipeline busy. Everything below is
 * compiled only when BR_AES_X86NI support is detected; otherwise a
 * stub vtable getter returning NULL is provided at the bottom.
 */

#define BR_ENABLE_INTRINSICS   1
#include "inner.h"

#if BR_AES_X86NI

/* see bearssl_block.h */
const br_block_ctr_class *
br_aes_x86ni_ctr_get_vtable(void)
{
	/*
	 * The vtable is usable only if the CPU actually implements the
	 * AES-NI opcodes; this is a runtime check, since the binary may
	 * run on older hardware.
	 */
	return br_aes_x86ni_supported() ? &br_aes_x86ni_ctr_vtable : NULL;
}

/* see bearssl_block.h */
void
br_aes_x86ni_ctr_init(br_aes_x86ni_ctr_keys *ctx,
	const void *key, size_t len)
{
	ctx->vtable = &br_aes_x86ni_ctr_vtable;
	/*
	 * The key schedule returns the number of AES rounds (depends on
	 * the key length); it is remembered for use in ctr_run().
	 */
	ctx->num_rounds = br_aes_x86ni_keysched_enc(ctx->skey.skni, key, len);
}

BR_TARGETS_X86_UP

/* see bearssl_block.h */
BR_TARGET("sse2,sse4.1,aes")
uint32_t
br_aes_x86ni_ctr_run(const br_aes_x86ni_ctr_keys *ctx,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	unsigned char ivbuf[16];
	unsigned num_rounds;
	__m128i sk[15];
	__m128i ivx;
	unsigned u;

	buf = data;
	/*
	 * CTR block layout: 12-byte IV in bytes 0..11, then a 32-bit
	 * big-endian counter in bytes 12..15. Only the IV part is copied
	 * here; the last four bytes of ivbuf are left unset, but the
	 * corresponding 32-bit lane of ivx is overwritten by
	 * _mm_insert_epi32() before any use.
	 */
	memcpy(ivbuf, iv, 12);
	num_rounds = ctx->num_rounds;
	/* Load the num_rounds+1 round keys (16 bytes each) into registers. */
	for (u = 0; u <= num_rounds; u ++) {
		sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4)));
	}
	ivx = _mm_loadu_si128((void *)ivbuf);
	while (len > 0) {
		__m128i x0, x1, x2, x3;

		/*
		 * Build four consecutive counter blocks by inserting the
		 * byte-swapped (big-endian) counter into the top 32-bit
		 * lane, then encrypt all four in an interleaved fashion so
		 * that the four AES rounds proceed in parallel.
		 */
		x0 = _mm_insert_epi32(ivx, br_bswap32(cc + 0), 3);
		x1 = _mm_insert_epi32(ivx, br_bswap32(cc + 1), 3);
		x2 = _mm_insert_epi32(ivx, br_bswap32(cc + 2), 3);
		x3 = _mm_insert_epi32(ivx, br_bswap32(cc + 3), 3);
		/* Initial AddRoundKey. */
		x0 = _mm_xor_si128(x0, sk[0]);
		x1 = _mm_xor_si128(x1, sk[0]);
		x2 = _mm_xor_si128(x2, sk[0]);
		x3 = _mm_xor_si128(x3, sk[0]);
		/* Rounds 1..9, common to all key sizes. */
		x0 = _mm_aesenc_si128(x0, sk[1]);
		x1 = _mm_aesenc_si128(x1, sk[1]);
		x2 = _mm_aesenc_si128(x2, sk[1]);
		x3 = _mm_aesenc_si128(x3, sk[1]);
		x0 = _mm_aesenc_si128(x0, sk[2]);
		x1 = _mm_aesenc_si128(x1, sk[2]);
		x2 = _mm_aesenc_si128(x2, sk[2]);
		x3 = _mm_aesenc_si128(x3, sk[2]);
		x0 = _mm_aesenc_si128(x0, sk[3]);
		x1 = _mm_aesenc_si128(x1, sk[3]);
		x2 = _mm_aesenc_si128(x2, sk[3]);
		x3 = _mm_aesenc_si128(x3, sk[3]);
		x0 = _mm_aesenc_si128(x0, sk[4]);
		x1 = _mm_aesenc_si128(x1, sk[4]);
		x2 = _mm_aesenc_si128(x2, sk[4]);
		x3 = _mm_aesenc_si128(x3, sk[4]);
		x0 = _mm_aesenc_si128(x0, sk[5]);
		x1 = _mm_aesenc_si128(x1, sk[5]);
		x2 = _mm_aesenc_si128(x2, sk[5]);
		x3 = _mm_aesenc_si128(x3, sk[5]);
		x0 = _mm_aesenc_si128(x0, sk[6]);
		x1 = _mm_aesenc_si128(x1, sk[6]);
		x2 = _mm_aesenc_si128(x2, sk[6]);
		x3 = _mm_aesenc_si128(x3, sk[6]);
		x0 = _mm_aesenc_si128(x0, sk[7]);
		x1 = _mm_aesenc_si128(x1, sk[7]);
		x2 = _mm_aesenc_si128(x2, sk[7]);
		x3 = _mm_aesenc_si128(x3, sk[7]);
		x0 = _mm_aesenc_si128(x0, sk[8]);
		x1 = _mm_aesenc_si128(x1, sk[8]);
		x2 = _mm_aesenc_si128(x2, sk[8]);
		x3 = _mm_aesenc_si128(x3, sk[8]);
		x0 = _mm_aesenc_si128(x0, sk[9]);
		x1 = _mm_aesenc_si128(x1, sk[9]);
		x2 = _mm_aesenc_si128(x2, sk[9]);
		x3 = _mm_aesenc_si128(x3, sk[9]);
		/*
		 * Final rounds depend on the round count established by the
		 * key schedule: 10 rounds (AES-128), 12 rounds (AES-192),
		 * or 14 rounds (AES-256, the remaining case).
		 */
		if (num_rounds == 10) {
			x0 = _mm_aesenclast_si128(x0, sk[10]);
			x1 = _mm_aesenclast_si128(x1, sk[10]);
			x2 = _mm_aesenclast_si128(x2, sk[10]);
			x3 = _mm_aesenclast_si128(x3, sk[10]);
		} else if (num_rounds == 12) {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenclast_si128(x0, sk[12]);
			x1 = _mm_aesenclast_si128(x1, sk[12]);
			x2 = _mm_aesenclast_si128(x2, sk[12]);
			x3 = _mm_aesenclast_si128(x3, sk[12]);
		} else {
			x0 = _mm_aesenc_si128(x0, sk[10]);
			x1 = _mm_aesenc_si128(x1, sk[10]);
			x2 = _mm_aesenc_si128(x2, sk[10]);
			x3 = _mm_aesenc_si128(x3, sk[10]);
			x0 = _mm_aesenc_si128(x0, sk[11]);
			x1 = _mm_aesenc_si128(x1, sk[11]);
			x2 = _mm_aesenc_si128(x2, sk[11]);
			x3 = _mm_aesenc_si128(x3, sk[11]);
			x0 = _mm_aesenc_si128(x0, sk[12]);
			x1 = _mm_aesenc_si128(x1, sk[12]);
			x2 = _mm_aesenc_si128(x2, sk[12]);
			x3 = _mm_aesenc_si128(x3, sk[12]);
			x0 = _mm_aesenc_si128(x0, sk[13]);
			x1 = _mm_aesenc_si128(x1, sk[13]);
			x2 = _mm_aesenc_si128(x2, sk[13]);
			x3 = _mm_aesenc_si128(x3, sk[13]);
			x0 = _mm_aesenclast_si128(x0, sk[14]);
			x1 = _mm_aesenclast_si128(x1, sk[14]);
			x2 = _mm_aesenclast_si128(x2, sk[14]);
			x3 = _mm_aesenclast_si128(x3, sk[14]);
		}
		if (len >= 64) {
			/* Full 4-block chunk: XOR keystream into data in place. */
			x0 = _mm_xor_si128(x0,
				_mm_loadu_si128((void *)(buf + 0)));
			x1 = _mm_xor_si128(x1,
				_mm_loadu_si128((void *)(buf + 16)));
			x2 = _mm_xor_si128(x2,
				_mm_loadu_si128((void *)(buf + 32)));
			x3 = _mm_xor_si128(x3,
				_mm_loadu_si128((void *)(buf + 48)));
			_mm_storeu_si128((void *)(buf + 0), x0);
			_mm_storeu_si128((void *)(buf + 16), x1);
			_mm_storeu_si128((void *)(buf + 32), x2);
			_mm_storeu_si128((void *)(buf + 48), x3);
			buf += 64;
			len -= 64;
			cc += 4;
		} else {
			/*
			 * Final partial chunk (less than 64 bytes): spill the
			 * keystream to a stack buffer and XOR only the bytes
			 * that remain.
			 */
			unsigned char tmp[64];

			_mm_storeu_si128((void *)(tmp + 0), x0);
			_mm_storeu_si128((void *)(tmp + 16), x1);
			_mm_storeu_si128((void *)(tmp + 32), x2);
			_mm_storeu_si128((void *)(tmp + 48), x3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			/*
			 * Advance the counter by the number of complete
			 * 16-byte blocks consumed (len >> 4); a trailing
			 * partial block does not advance it.
			 */
			cc += (uint32_t)len >> 4;
			break;
		}
	}
	/* Return the counter value to use for a subsequent call. */
	return cc;
}

BR_TARGETS_X86_DOWN

/* see bearssl_block.h */
const br_block_ctr_class br_aes_x86ni_ctr_vtable = {
	sizeof(br_aes_x86ni_ctr_keys),
	16, /* block size in bytes; the 4 below is presumably its log2 —
	       matches the br_block_ctr_class field layout in bearssl_block.h */
	4,
	(void (*)(const br_block_ctr_class **, const void *, size_t))
	&br_aes_x86ni_ctr_init,
	(uint32_t (*)(const br_block_ctr_class *const *,
		const void *, uint32_t, void *, size_t))
	&br_aes_x86ni_ctr_run
};

#else

/* see bearssl_block.h */
const br_block_ctr_class *
br_aes_x86ni_ctr_get_vtable(void)
{
	/* AES-NI support not compiled in: no vtable available. */
	return NULL;
}

#endif