1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #define BR_ENABLE_INTRINSICS 1 26 #include "inner.h" 27 28 #if BR_SSE2 29 30 /* 31 * This file contains a ChaCha20 implementation that leverages SSE2 32 * opcodes for better performance. 33 */ 34 35 /* see bearssl_block.h */ 36 br_chacha20_run 37 br_chacha20_sse2_get(void) 38 { 39 /* 40 * If using 64-bit mode, then SSE2 opcodes should be automatically 41 * available, since they are part of the ABI. 42 * 43 * In 32-bit mode, we use CPUID to detect the SSE2 feature. 44 */ 45 46 #if BR_amd64 47 return &br_chacha20_sse2_run; 48 #else 49 50 /* 51 * SSE2 support is indicated by bit 26 in EDX. 52 */ 53 if (br_cpuid(0, 0, 0, 0x04000000)) { 54 return &br_chacha20_sse2_run; 55 } else { 56 return 0; 57 } 58 #endif 59 } 60 61 BR_TARGETS_X86_UP 62 63 /* see bearssl_block.h */ 64 BR_TARGET("sse2") 65 uint32_t 66 br_chacha20_sse2_run(const void *key, 67 const void *iv, uint32_t cc, void *data, size_t len) 68 { 69 unsigned char *buf; 70 uint32_t ivtmp[4]; 71 __m128i kw0, kw1; 72 __m128i iw, cw; 73 __m128i one; 74 75 static const uint32_t CW[] = { 76 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 77 }; 78 79 buf = data; 80 kw0 = _mm_loadu_si128(key); 81 kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16)); 82 ivtmp[0] = cc; 83 memcpy(ivtmp + 1, iv, 12); 84 iw = _mm_loadu_si128((const void *)ivtmp); 85 cw = _mm_loadu_si128((const void *)CW); 86 one = _mm_set_epi32(0, 0, 0, 1); 87 88 while (len > 0) { 89 /* 90 * sj contains state words 4*j to 4*j+3. 91 */ 92 __m128i s0, s1, s2, s3; 93 int i; 94 95 s0 = cw; 96 s1 = kw0; 97 s2 = kw1; 98 s3 = iw; 99 for (i = 0; i < 10; i ++) { 100 /* 101 * Even round is straightforward application on 102 * the state words. 103 */ 104 s0 = _mm_add_epi32(s0, s1); 105 s3 = _mm_xor_si128(s3, s0); 106 s3 = _mm_or_si128( 107 _mm_slli_epi32(s3, 16), 108 _mm_srli_epi32(s3, 16)); 109 110 s2 = _mm_add_epi32(s2, s3); 111 s1 = _mm_xor_si128(s1, s2); 112 s1 = _mm_or_si128( 113 _mm_slli_epi32(s1, 12), 114 _mm_srli_epi32(s1, 20)); 115 116 s0 = _mm_add_epi32(s0, s1); 117 s3 = _mm_xor_si128(s3, s0); 118 s3 = _mm_or_si128( 119 _mm_slli_epi32(s3, 8), 120 _mm_srli_epi32(s3, 24)); 121 122 s2 = _mm_add_epi32(s2, s3); 123 s1 = _mm_xor_si128(s1, s2); 124 s1 = _mm_or_si128( 125 _mm_slli_epi32(s1, 7), 126 _mm_srli_epi32(s1, 25)); 127 128 /* 129 * For the odd round, we must rotate some state 130 * words so that the computations apply on the 131 * right combinations of words. 132 */ 133 s1 = _mm_shuffle_epi32(s1, 0x39); 134 s2 = _mm_shuffle_epi32(s2, 0x4E); 135 s3 = _mm_shuffle_epi32(s3, 0x93); 136 137 s0 = _mm_add_epi32(s0, s1); 138 s3 = _mm_xor_si128(s3, s0); 139 s3 = _mm_or_si128( 140 _mm_slli_epi32(s3, 16), 141 _mm_srli_epi32(s3, 16)); 142 143 s2 = _mm_add_epi32(s2, s3); 144 s1 = _mm_xor_si128(s1, s2); 145 s1 = _mm_or_si128( 146 _mm_slli_epi32(s1, 12), 147 _mm_srli_epi32(s1, 20)); 148 149 s0 = _mm_add_epi32(s0, s1); 150 s3 = _mm_xor_si128(s3, s0); 151 s3 = _mm_or_si128( 152 _mm_slli_epi32(s3, 8), 153 _mm_srli_epi32(s3, 24)); 154 155 s2 = _mm_add_epi32(s2, s3); 156 s1 = _mm_xor_si128(s1, s2); 157 s1 = _mm_or_si128( 158 _mm_slli_epi32(s1, 7), 159 _mm_srli_epi32(s1, 25)); 160 161 /* 162 * After the odd round, we rotate back the values 163 * to undo the rotate at the start of the odd round. 164 */ 165 s1 = _mm_shuffle_epi32(s1, 0x93); 166 s2 = _mm_shuffle_epi32(s2, 0x4E); 167 s3 = _mm_shuffle_epi32(s3, 0x39); 168 } 169 170 /* 171 * Addition with the initial state. 172 */ 173 s0 = _mm_add_epi32(s0, cw); 174 s1 = _mm_add_epi32(s1, kw0); 175 s2 = _mm_add_epi32(s2, kw1); 176 s3 = _mm_add_epi32(s3, iw); 177 178 /* 179 * Increment block counter. 180 */ 181 iw = _mm_add_epi32(iw, one); 182 183 /* 184 * XOR final state with the data. 185 */ 186 if (len < 64) { 187 unsigned char tmp[64]; 188 size_t u; 189 190 _mm_storeu_si128((void *)(tmp + 0), s0); 191 _mm_storeu_si128((void *)(tmp + 16), s1); 192 _mm_storeu_si128((void *)(tmp + 32), s2); 193 _mm_storeu_si128((void *)(tmp + 48), s3); 194 for (u = 0; u < len; u ++) { 195 buf[u] ^= tmp[u]; 196 } 197 break; 198 } else { 199 __m128i b0, b1, b2, b3; 200 201 b0 = _mm_loadu_si128((const void *)(buf + 0)); 202 b1 = _mm_loadu_si128((const void *)(buf + 16)); 203 b2 = _mm_loadu_si128((const void *)(buf + 32)); 204 b3 = _mm_loadu_si128((const void *)(buf + 48)); 205 b0 = _mm_xor_si128(b0, s0); 206 b1 = _mm_xor_si128(b1, s1); 207 b2 = _mm_xor_si128(b2, s2); 208 b3 = _mm_xor_si128(b3, s3); 209 _mm_storeu_si128((void *)(buf + 0), b0); 210 _mm_storeu_si128((void *)(buf + 16), b1); 211 _mm_storeu_si128((void *)(buf + 32), b2); 212 _mm_storeu_si128((void *)(buf + 48), b3); 213 buf += 64; 214 len -= 64; 215 } 216 } 217 218 /* 219 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to 220 * raw SSE2, thus we use _mm_extract_epi16(). 221 */ 222 return (uint32_t)_mm_extract_epi16(iw, 0) 223 | ((uint32_t)_mm_extract_epi16(iw, 1) << 16); 224 } 225 226 BR_TARGETS_X86_DOWN 227 228 #else 229 230 /* see bearssl_block.h */ 231 br_chacha20_run 232 br_chacha20_sse2_get(void) 233 { 234 return 0; 235 } 236 237 #endif 238