1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #define BR_ENABLE_INTRINSICS 1 26 #include "inner.h" 27 28 #if BR_AES_X86NI 29 30 /* see bearssl_block.h */ 31 const br_block_cbcdec_class * 32 br_aes_x86ni_cbcdec_get_vtable(void) 33 { 34 return br_aes_x86ni_supported() ? &br_aes_x86ni_cbcdec_vtable : NULL; 35 } 36 37 /* see bearssl_block.h */ 38 void 39 br_aes_x86ni_cbcdec_init(br_aes_x86ni_cbcdec_keys *ctx, 40 const void *key, size_t len) 41 { 42 ctx->vtable = &br_aes_x86ni_cbcdec_vtable; 43 ctx->num_rounds = br_aes_x86ni_keysched_dec(ctx->skey.skni, key, len); 44 } 45 46 BR_TARGETS_X86_UP 47 48 /* see bearssl_block.h */ 49 BR_TARGET("sse2,aes") 50 void 51 br_aes_x86ni_cbcdec_run(const br_aes_x86ni_cbcdec_keys *ctx, 52 void *iv, void *data, size_t len) 53 { 54 unsigned char *buf; 55 unsigned num_rounds; 56 __m128i sk[15], ivx; 57 unsigned u; 58 59 buf = data; 60 ivx = _mm_loadu_si128(iv); 61 num_rounds = ctx->num_rounds; 62 for (u = 0; u <= num_rounds; u ++) { 63 sk[u] = _mm_loadu_si128((void *)(ctx->skey.skni + (u << 4))); 64 } 65 while (len > 0) { 66 __m128i x0, x1, x2, x3, e0, e1, e2, e3; 67 68 x0 = _mm_loadu_si128((void *)(buf + 0)); 69 if (len >= 64) { 70 x1 = _mm_loadu_si128((void *)(buf + 16)); 71 x2 = _mm_loadu_si128((void *)(buf + 32)); 72 x3 = _mm_loadu_si128((void *)(buf + 48)); 73 } else { 74 x0 = _mm_loadu_si128((void *)(buf + 0)); 75 if (len >= 32) { 76 x1 = _mm_loadu_si128((void *)(buf + 16)); 77 if (len >= 48) { 78 x2 = _mm_loadu_si128( 79 (void *)(buf + 32)); 80 x3 = x2; 81 } else { 82 x2 = x0; 83 x3 = x1; 84 } 85 } else { 86 x1 = x0; 87 x2 = x0; 88 x3 = x0; 89 } 90 } 91 e0 = x0; 92 e1 = x1; 93 e2 = x2; 94 e3 = x3; 95 x0 = _mm_xor_si128(x0, sk[0]); 96 x1 = _mm_xor_si128(x1, sk[0]); 97 x2 = _mm_xor_si128(x2, sk[0]); 98 x3 = _mm_xor_si128(x3, sk[0]); 99 x0 = _mm_aesdec_si128(x0, sk[1]); 100 x1 = _mm_aesdec_si128(x1, sk[1]); 101 x2 = _mm_aesdec_si128(x2, sk[1]); 102 x3 = _mm_aesdec_si128(x3, sk[1]); 103 x0 = _mm_aesdec_si128(x0, sk[2]); 104 x1 = _mm_aesdec_si128(x1, sk[2]); 105 x2 = _mm_aesdec_si128(x2, sk[2]); 106 x3 = _mm_aesdec_si128(x3, sk[2]); 107 x0 = _mm_aesdec_si128(x0, sk[3]); 108 x1 = _mm_aesdec_si128(x1, sk[3]); 109 x2 = _mm_aesdec_si128(x2, sk[3]); 110 x3 = _mm_aesdec_si128(x3, sk[3]); 111 x0 = _mm_aesdec_si128(x0, sk[4]); 112 x1 = _mm_aesdec_si128(x1, sk[4]); 113 x2 = _mm_aesdec_si128(x2, sk[4]); 114 x3 = _mm_aesdec_si128(x3, sk[4]); 115 x0 = _mm_aesdec_si128(x0, sk[5]); 116 x1 = _mm_aesdec_si128(x1, sk[5]); 117 x2 = _mm_aesdec_si128(x2, sk[5]); 118 x3 = _mm_aesdec_si128(x3, sk[5]); 119 x0 = _mm_aesdec_si128(x0, sk[6]); 120 x1 = _mm_aesdec_si128(x1, sk[6]); 121 x2 = _mm_aesdec_si128(x2, sk[6]); 122 x3 = _mm_aesdec_si128(x3, sk[6]); 123 x0 = _mm_aesdec_si128(x0, sk[7]); 124 x1 = _mm_aesdec_si128(x1, sk[7]); 125 x2 = _mm_aesdec_si128(x2, sk[7]); 126 x3 = _mm_aesdec_si128(x3, sk[7]); 127 x0 = _mm_aesdec_si128(x0, sk[8]); 128 x1 = _mm_aesdec_si128(x1, sk[8]); 129 x2 = _mm_aesdec_si128(x2, sk[8]); 130 x3 = _mm_aesdec_si128(x3, sk[8]); 131 x0 = _mm_aesdec_si128(x0, sk[9]); 132 x1 = _mm_aesdec_si128(x1, sk[9]); 133 x2 = _mm_aesdec_si128(x2, sk[9]); 134 x3 = _mm_aesdec_si128(x3, sk[9]); 135 if (num_rounds == 10) { 136 x0 = _mm_aesdeclast_si128(x0, sk[10]); 137 x1 = _mm_aesdeclast_si128(x1, sk[10]); 138 x2 = _mm_aesdeclast_si128(x2, sk[10]); 139 x3 = _mm_aesdeclast_si128(x3, sk[10]); 140 } else if (num_rounds == 12) { 141 x0 = _mm_aesdec_si128(x0, sk[10]); 142 x1 = _mm_aesdec_si128(x1, sk[10]); 143 x2 = _mm_aesdec_si128(x2, sk[10]); 144 x3 = _mm_aesdec_si128(x3, sk[10]); 145 x0 = _mm_aesdec_si128(x0, sk[11]); 146 x1 = _mm_aesdec_si128(x1, sk[11]); 147 x2 = _mm_aesdec_si128(x2, sk[11]); 148 x3 = _mm_aesdec_si128(x3, sk[11]); 149 x0 = _mm_aesdeclast_si128(x0, sk[12]); 150 x1 = _mm_aesdeclast_si128(x1, sk[12]); 151 x2 = _mm_aesdeclast_si128(x2, sk[12]); 152 x3 = _mm_aesdeclast_si128(x3, sk[12]); 153 } else { 154 x0 = _mm_aesdec_si128(x0, sk[10]); 155 x1 = _mm_aesdec_si128(x1, sk[10]); 156 x2 = _mm_aesdec_si128(x2, sk[10]); 157 x3 = _mm_aesdec_si128(x3, sk[10]); 158 x0 = _mm_aesdec_si128(x0, sk[11]); 159 x1 = _mm_aesdec_si128(x1, sk[11]); 160 x2 = _mm_aesdec_si128(x2, sk[11]); 161 x3 = _mm_aesdec_si128(x3, sk[11]); 162 x0 = _mm_aesdec_si128(x0, sk[12]); 163 x1 = _mm_aesdec_si128(x1, sk[12]); 164 x2 = _mm_aesdec_si128(x2, sk[12]); 165 x3 = _mm_aesdec_si128(x3, sk[12]); 166 x0 = _mm_aesdec_si128(x0, sk[13]); 167 x1 = _mm_aesdec_si128(x1, sk[13]); 168 x2 = _mm_aesdec_si128(x2, sk[13]); 169 x3 = _mm_aesdec_si128(x3, sk[13]); 170 x0 = _mm_aesdeclast_si128(x0, sk[14]); 171 x1 = _mm_aesdeclast_si128(x1, sk[14]); 172 x2 = _mm_aesdeclast_si128(x2, sk[14]); 173 x3 = _mm_aesdeclast_si128(x3, sk[14]); 174 } 175 x0 = _mm_xor_si128(x0, ivx); 176 x1 = _mm_xor_si128(x1, e0); 177 x2 = _mm_xor_si128(x2, e1); 178 x3 = _mm_xor_si128(x3, e2); 179 ivx = e3; 180 _mm_storeu_si128((void *)(buf + 0), x0); 181 if (len >= 64) { 182 _mm_storeu_si128((void *)(buf + 16), x1); 183 _mm_storeu_si128((void *)(buf + 32), x2); 184 _mm_storeu_si128((void *)(buf + 48), x3); 185 buf += 64; 186 len -= 64; 187 } else { 188 if (len >= 32) { 189 _mm_storeu_si128((void *)(buf + 16), x1); 190 if (len >= 48) { 191 _mm_storeu_si128( 192 (void *)(buf + 32), x2); 193 } 194 } 195 break; 196 } 197 } 198 _mm_storeu_si128(iv, ivx); 199 } 200 201 BR_TARGETS_X86_DOWN 202 203 /* see bearssl_block.h */ 204 const br_block_cbcdec_class br_aes_x86ni_cbcdec_vtable = { 205 sizeof(br_aes_x86ni_cbcdec_keys), 206 16, 207 4, 208 (void (*)(const br_block_cbcdec_class **, const void *, size_t)) 209 &br_aes_x86ni_cbcdec_init, 210 (void (*)(const br_block_cbcdec_class *const *, void *, void *, size_t)) 211 &br_aes_x86ni_cbcdec_run 212 }; 213 214 #else 215 216 /* see bearssl_block.h */ 217 const br_block_cbcdec_class * 218 br_aes_x86ni_cbcdec_get_vtable(void) 219 { 220 return NULL; 221 } 222 223 #endif 224