1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #define BR_ENABLE_INTRINSICS 1 26 #include "inner.h" 27 28 /* 29 * This code contains the AES key schedule implementation using the 30 * AES-NI opcodes. 31 */ 32 33 #if BR_AES_X86NI 34 35 /* see inner.h */ 36 int 37 br_aes_x86ni_supported(void) 38 { 39 /* 40 * Bit mask for features in ECX: 41 * 19 SSE4.1 (used for _mm_insert_epi32(), for AES-CTR) 42 * 25 AES-NI 43 */ 44 return br_cpuid(0, 0, 0x02080000, 0); 45 } 46 47 BR_TARGETS_X86_UP 48 49 BR_TARGET("sse2,aes") 50 static inline __m128i 51 expand_step128(__m128i k, __m128i k2) 52 { 53 k = _mm_xor_si128(k, _mm_slli_si128(k, 4)); 54 k = _mm_xor_si128(k, _mm_slli_si128(k, 4)); 55 k = _mm_xor_si128(k, _mm_slli_si128(k, 4)); 56 k2 = _mm_shuffle_epi32(k2, 0xFF); 57 return _mm_xor_si128(k, k2); 58 } 59 60 BR_TARGET("sse2,aes") 61 static inline void 62 expand_step192(__m128i *t1, __m128i *t2, __m128i *t3) 63 { 64 __m128i t4; 65 66 *t2 = _mm_shuffle_epi32(*t2, 0x55); 67 t4 = _mm_slli_si128(*t1, 0x4); 68 *t1 = _mm_xor_si128(*t1, t4); 69 t4 = _mm_slli_si128(t4, 0x4); 70 *t1 = _mm_xor_si128(*t1, t4); 71 t4 = _mm_slli_si128(t4, 0x4); 72 *t1 = _mm_xor_si128(*t1, t4); 73 *t1 = _mm_xor_si128(*t1, *t2); 74 *t2 = _mm_shuffle_epi32(*t1, 0xFF); 75 t4 = _mm_slli_si128(*t3, 0x4); 76 *t3 = _mm_xor_si128(*t3, t4); 77 *t3 = _mm_xor_si128(*t3, *t2); 78 } 79 80 BR_TARGET("sse2,aes") 81 static inline void 82 expand_step256_1(__m128i *t1, __m128i *t2) 83 { 84 __m128i t4; 85 86 *t2 = _mm_shuffle_epi32(*t2, 0xFF); 87 t4 = _mm_slli_si128(*t1, 0x4); 88 *t1 = _mm_xor_si128(*t1, t4); 89 t4 = _mm_slli_si128(t4, 0x4); 90 *t1 = _mm_xor_si128(*t1, t4); 91 t4 = _mm_slli_si128(t4, 0x4); 92 *t1 = _mm_xor_si128(*t1, t4); 93 *t1 = _mm_xor_si128(*t1, *t2); 94 } 95 96 BR_TARGET("sse2,aes") 97 static inline void 98 expand_step256_2(__m128i *t1, __m128i *t3) 99 { 100 __m128i t2, t4; 101 102 t4 = _mm_aeskeygenassist_si128(*t1, 0x0); 103 t2 = _mm_shuffle_epi32(t4, 0xAA); 104 t4 = _mm_slli_si128(*t3, 0x4); 105 *t3 = _mm_xor_si128(*t3, t4); 106 t4 = _mm_slli_si128(t4, 0x4); 107 *t3 = _mm_xor_si128(*t3, t4); 108 t4 = _mm_slli_si128(t4, 0x4); 109 *t3 = _mm_xor_si128(*t3, t4); 110 *t3 = _mm_xor_si128(*t3, t2); 111 } 112 113 /* 114 * Perform key schedule for AES, encryption direction. Subkeys are written 115 * in sk[], and the number of rounds is returned. Key length MUST be 16, 116 * 24 or 32 bytes. 117 */ 118 BR_TARGET("sse2,aes") 119 static unsigned 120 x86ni_keysched(__m128i *sk, const void *key, size_t len) 121 { 122 const unsigned char *kb; 123 124 #define KEXP128(k, i, rcon) do { \ 125 k = expand_step128(k, _mm_aeskeygenassist_si128(k, rcon)); \ 126 sk[i] = k; \ 127 } while (0) 128 129 #define KEXP192(i, rcon1, rcon2) do { \ 130 sk[(i) + 0] = t1; \ 131 sk[(i) + 1] = t3; \ 132 t2 = _mm_aeskeygenassist_si128(t3, rcon1); \ 133 expand_step192(&t1, &t2, &t3); \ 134 sk[(i) + 1] = _mm_castpd_si128(_mm_shuffle_pd( \ 135 _mm_castsi128_pd(sk[(i) + 1]), \ 136 _mm_castsi128_pd(t1), 0)); \ 137 sk[(i) + 2] = _mm_castpd_si128(_mm_shuffle_pd( \ 138 _mm_castsi128_pd(t1), \ 139 _mm_castsi128_pd(t3), 1)); \ 140 t2 = _mm_aeskeygenassist_si128(t3, rcon2); \ 141 expand_step192(&t1, &t2, &t3); \ 142 } while (0) 143 144 #define KEXP256(i, rcon) do { \ 145 sk[(i) + 0] = t3; \ 146 t2 = _mm_aeskeygenassist_si128(t3, rcon); \ 147 expand_step256_1(&t1, &t2); \ 148 sk[(i) + 1] = t1; \ 149 expand_step256_2(&t1, &t3); \ 150 } while (0) 151 152 kb = key; 153 switch (len) { 154 __m128i t1, t2, t3; 155 156 case 16: 157 t1 = _mm_loadu_si128((const void *)kb); 158 sk[0] = t1; 159 KEXP128(t1, 1, 0x01); 160 KEXP128(t1, 2, 0x02); 161 KEXP128(t1, 3, 0x04); 162 KEXP128(t1, 4, 0x08); 163 KEXP128(t1, 5, 0x10); 164 KEXP128(t1, 6, 0x20); 165 KEXP128(t1, 7, 0x40); 166 KEXP128(t1, 8, 0x80); 167 KEXP128(t1, 9, 0x1B); 168 KEXP128(t1, 10, 0x36); 169 return 10; 170 171 case 24: 172 t1 = _mm_loadu_si128((const void *)kb); 173 t3 = _mm_loadu_si128((const void *)(kb + 8)); 174 t3 = _mm_shuffle_epi32(t3, 0x4E); 175 KEXP192(0, 0x01, 0x02); 176 KEXP192(3, 0x04, 0x08); 177 KEXP192(6, 0x10, 0x20); 178 KEXP192(9, 0x40, 0x80); 179 sk[12] = t1; 180 return 12; 181 182 case 32: 183 t1 = _mm_loadu_si128((const void *)kb); 184 t3 = _mm_loadu_si128((const void *)(kb + 16)); 185 sk[0] = t1; 186 KEXP256( 1, 0x01); 187 KEXP256( 3, 0x02); 188 KEXP256( 5, 0x04); 189 KEXP256( 7, 0x08); 190 KEXP256( 9, 0x10); 191 KEXP256(11, 0x20); 192 sk[13] = t3; 193 t2 = _mm_aeskeygenassist_si128(t3, 0x40); 194 expand_step256_1(&t1, &t2); 195 sk[14] = t1; 196 return 14; 197 198 default: 199 return 0; 200 } 201 202 #undef KEXP128 203 #undef KEXP192 204 #undef KEXP256 205 } 206 207 /* see inner.h */ 208 BR_TARGET("sse2,aes") 209 unsigned 210 br_aes_x86ni_keysched_enc(unsigned char *skni, const void *key, size_t len) 211 { 212 __m128i sk[15]; 213 unsigned num_rounds; 214 215 num_rounds = x86ni_keysched(sk, key, len); 216 memcpy(skni, sk, (num_rounds + 1) << 4); 217 return num_rounds; 218 } 219 220 /* see inner.h */ 221 BR_TARGET("sse2,aes") 222 unsigned 223 br_aes_x86ni_keysched_dec(unsigned char *skni, const void *key, size_t len) 224 { 225 __m128i sk[15]; 226 unsigned u, num_rounds; 227 228 num_rounds = x86ni_keysched(sk, key, len); 229 _mm_storeu_si128((void *)skni, sk[num_rounds]); 230 for (u = 1; u < num_rounds; u ++) { 231 _mm_storeu_si128((void *)(skni + (u << 4)), 232 _mm_aesimc_si128(sk[num_rounds - u])); 233 } 234 _mm_storeu_si128((void *)(skni + (num_rounds << 4)), sk[0]); 235 return num_rounds; 236 } 237 238 BR_TARGETS_X86_DOWN 239 240 #endif 241