1 /***************************************************************************** 2 * * 3 * Copyright (c) 2012, Intel Corporation * 4 * * 5 * All rights reserved. * 6 * * 7 * Redistribution and use in source and binary forms, with or without * 8 * modification, are permitted provided that the following conditions are * 9 * met: * 10 * * 11 * * Redistributions of source code must retain the above copyright * 12 * notice, this list of conditions and the following disclaimer. * 13 * * 14 * * Redistributions in binary form must reproduce the above copyright * 15 * notice, this list of conditions and the following disclaimer in the * 16 * documentation and/or other materials provided with the * 17 * distribution. * 18 * * 19 * * Neither the name of the Intel Corporation nor the names of its * 20 * contributors may be used to endorse or promote products derived from * 21 * this software without specific prior written permission. * 22 * * 23 * * 24 * THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY * 25 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * 27 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR * 28 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * 29 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * 30 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * 31 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * 32 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * 33 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * 34 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * 35 * * 36 ****************************************************************************** 37 * Developers and authors: * 38 * Shay Gueron (1, 2), and Vlad Krasnov (1) * 39 * (1) Intel Corporation, Israel Development Center, Haifa, Israel * 40 * (2) University of Haifa, Israel * 41 *****************************************************************************/ 42 43 #include "rsaz_exp.h" 44 45 #ifdef RSAZ_ENABLED 46 47 /* 48 * See crypto/bn/asm/rsaz-avx2.pl for further details. 49 */ 50 void rsaz_1024_norm2red_avx2(void *red, const void *norm); 51 void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, 52 const void *n, BN_ULONG k); 53 void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k, 54 int cnt); 55 void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i); 56 void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i); 57 void rsaz_1024_red2norm_avx2(void *norm, const void *red); 58 59 #if defined(__GNUC__) 60 # define ALIGN64 __attribute__((aligned(64))) 61 #elif defined(_MSC_VER) 62 # define ALIGN64 __declspec(align(64)) 63 #elif defined(__SUNPRO_C) 64 # define ALIGN64 65 # pragma align 64(one,two80) 66 #else 67 /* not fatal, might hurt performance a little */ 68 # define ALIGN64 69 #endif 70 71 ALIGN64 static const BN_ULONG one[40] = { 72 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 73 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 74 }; 75 76 ALIGN64 static const BN_ULONG two80[40] = { 77 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 78 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 79 }; 80 81 void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], 82 const BN_ULONG base_norm[16], 83 const BN_ULONG exponent[16], 84 const BN_ULONG m_norm[16], const BN_ULONG RR[16], 85 BN_ULONG k0) 86 { 87 unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */ 88 unsigned char *p_str = storage + (64 - ((size_t)storage % 64)); 89 unsigned char *a_inv, *m, *result; 90 unsigned char *table_s = p_str + 320 * 3; 91 unsigned char *R2 = table_s; /* borrow */ 92 int index; 93 int wvalue; 94 95 if ((((size_t)p_str & 4095) + 320) >> 12) { 96 result = p_str; 97 a_inv = p_str + 320; 98 m = p_str + 320 * 2; /* should not cross page */ 99 } else { 100 m = p_str; /* should not cross page */ 101 result = p_str + 320; 102 a_inv = p_str + 320 * 2; 103 } 104 105 rsaz_1024_norm2red_avx2(m, m_norm); 106 rsaz_1024_norm2red_avx2(a_inv, base_norm); 107 rsaz_1024_norm2red_avx2(R2, RR); 108 109 rsaz_1024_mul_avx2(R2, R2, R2, m, k0); 110 rsaz_1024_mul_avx2(R2, R2, two80, m, k0); 111 112 /* table[0] = 1 */ 113 rsaz_1024_mul_avx2(result, R2, one, m, k0); 114 /* table[1] = a_inv^1 */ 115 rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); 116 117 rsaz_1024_scatter5_avx2(table_s, result, 0); 118 rsaz_1024_scatter5_avx2(table_s, a_inv, 1); 119 120 /* table[2] = a_inv^2 */ 121 rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); 122 rsaz_1024_scatter5_avx2(table_s, result, 2); 123 #if 0 124 /* this is almost 2x smaller and less than 1% slower */ 125 for (index = 3; index < 32; index++) { 126 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 127 rsaz_1024_scatter5_avx2(table_s, result, index); 128 } 129 #else 130 /* table[4] = a_inv^4 */ 131 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 132 rsaz_1024_scatter5_avx2(table_s, result, 4); 133 /* table[8] = a_inv^8 */ 134 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 135 rsaz_1024_scatter5_avx2(table_s, result, 8); 136 /* table[16] = a_inv^16 */ 137 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 138 rsaz_1024_scatter5_avx2(table_s, result, 16); 139 /* table[17] = a_inv^17 */ 140 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 141 rsaz_1024_scatter5_avx2(table_s, result, 17); 142 143 /* table[3] */ 144 rsaz_1024_gather5_avx2(result, table_s, 2); 145 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 146 rsaz_1024_scatter5_avx2(table_s, result, 3); 147 /* table[6] */ 148 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 149 rsaz_1024_scatter5_avx2(table_s, result, 6); 150 /* table[12] */ 151 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 152 rsaz_1024_scatter5_avx2(table_s, result, 12); 153 /* table[24] */ 154 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 155 rsaz_1024_scatter5_avx2(table_s, result, 24); 156 /* table[25] */ 157 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 158 rsaz_1024_scatter5_avx2(table_s, result, 25); 159 160 /* table[5] */ 161 rsaz_1024_gather5_avx2(result, table_s, 4); 162 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 163 rsaz_1024_scatter5_avx2(table_s, result, 5); 164 /* table[10] */ 165 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 166 rsaz_1024_scatter5_avx2(table_s, result, 10); 167 /* table[20] */ 168 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 169 rsaz_1024_scatter5_avx2(table_s, result, 20); 170 /* table[21] */ 171 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 172 rsaz_1024_scatter5_avx2(table_s, result, 21); 173 174 /* table[7] */ 175 rsaz_1024_gather5_avx2(result, table_s, 6); 176 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 177 rsaz_1024_scatter5_avx2(table_s, result, 7); 178 /* table[14] */ 179 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 180 rsaz_1024_scatter5_avx2(table_s, result, 14); 181 /* table[28] */ 182 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 183 rsaz_1024_scatter5_avx2(table_s, result, 28); 184 /* table[29] */ 185 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 186 rsaz_1024_scatter5_avx2(table_s, result, 29); 187 188 /* table[9] */ 189 rsaz_1024_gather5_avx2(result, table_s, 8); 190 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 191 rsaz_1024_scatter5_avx2(table_s, result, 9); 192 /* table[18] */ 193 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 194 rsaz_1024_scatter5_avx2(table_s, result, 18); 195 /* table[19] */ 196 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 197 rsaz_1024_scatter5_avx2(table_s, result, 19); 198 199 /* table[11] */ 200 rsaz_1024_gather5_avx2(result, table_s, 10); 201 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 202 rsaz_1024_scatter5_avx2(table_s, result, 11); 203 /* table[22] */ 204 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 205 rsaz_1024_scatter5_avx2(table_s, result, 22); 206 /* table[23] */ 207 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 208 rsaz_1024_scatter5_avx2(table_s, result, 23); 209 210 /* table[13] */ 211 rsaz_1024_gather5_avx2(result, table_s, 12); 212 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 213 rsaz_1024_scatter5_avx2(table_s, result, 13); 214 /* table[26] */ 215 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 216 rsaz_1024_scatter5_avx2(table_s, result, 26); 217 /* table[27] */ 218 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 219 rsaz_1024_scatter5_avx2(table_s, result, 27); 220 221 /* table[15] */ 222 rsaz_1024_gather5_avx2(result, table_s, 14); 223 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 224 rsaz_1024_scatter5_avx2(table_s, result, 15); 225 /* table[30] */ 226 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 227 rsaz_1024_scatter5_avx2(table_s, result, 30); 228 /* table[31] */ 229 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 230 rsaz_1024_scatter5_avx2(table_s, result, 31); 231 #endif 232 233 /* load first window */ 234 p_str = (unsigned char *)exponent; 235 wvalue = p_str[127] >> 3; 236 rsaz_1024_gather5_avx2(result, table_s, wvalue); 237 238 index = 1014; 239 240 while (index > -1) { /* loop for the remaining 127 windows */ 241 242 rsaz_1024_sqr_avx2(result, result, m, k0, 5); 243 244 wvalue = *((unsigned short *)&p_str[index / 8]); 245 wvalue = (wvalue >> (index % 8)) & 31; 246 index -= 5; 247 248 rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ 249 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 250 } 251 252 /* square four times */ 253 rsaz_1024_sqr_avx2(result, result, m, k0, 4); 254 255 wvalue = p_str[0] & 15; 256 257 rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ 258 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 259 260 /* from Montgomery */ 261 rsaz_1024_mul_avx2(result, result, one, m, k0); 262 263 rsaz_1024_red2norm_avx2(result_norm, result); 264 265 OPENSSL_cleanse(storage, sizeof(storage)); 266 } 267 268 /* 269 * See crypto/bn/rsaz-x86_64.pl for further details. 270 */ 271 void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n, 272 BN_ULONG k); 273 void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n, 274 BN_ULONG k, const void *tbl, unsigned int power); 275 void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl, 276 const void *n, BN_ULONG k, unsigned int power); 277 void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k); 278 void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k, 279 int cnt); 280 void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power); 281 void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power); 282 283 void RSAZ_512_mod_exp(BN_ULONG result[8], 284 const BN_ULONG base[8], const BN_ULONG exponent[8], 285 const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8]) 286 { 287 unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */ 288 unsigned char *table = storage + (64 - ((size_t)storage % 64)); 289 BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8); 290 BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8); 291 unsigned char *p_str = (unsigned char *)exponent; 292 int index; 293 unsigned int wvalue; 294 295 /* table[0] = 1_inv */ 296 temp[0] = 0 - m[0]; 297 temp[1] = ~m[1]; 298 temp[2] = ~m[2]; 299 temp[3] = ~m[3]; 300 temp[4] = ~m[4]; 301 temp[5] = ~m[5]; 302 temp[6] = ~m[6]; 303 temp[7] = ~m[7]; 304 rsaz_512_scatter4(table, temp, 0); 305 306 /* table [1] = a_inv^1 */ 307 rsaz_512_mul(a_inv, base, RR, m, k0); 308 rsaz_512_scatter4(table, a_inv, 1); 309 310 /* table [2] = a_inv^2 */ 311 rsaz_512_sqr(temp, a_inv, m, k0, 1); 312 rsaz_512_scatter4(table, temp, 2); 313 314 for (index = 3; index < 16; index++) 315 rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index); 316 317 /* load first window */ 318 wvalue = p_str[63]; 319 320 rsaz_512_gather4(temp, table, wvalue >> 4); 321 rsaz_512_sqr(temp, temp, m, k0, 4); 322 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf); 323 324 for (index = 62; index >= 0; index--) { 325 wvalue = p_str[index]; 326 327 rsaz_512_sqr(temp, temp, m, k0, 4); 328 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4); 329 330 rsaz_512_sqr(temp, temp, m, k0, 4); 331 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f); 332 } 333 334 /* from Montgomery */ 335 rsaz_512_mul_by_one(result, temp, m, k0); 336 337 OPENSSL_cleanse(storage, sizeof(storage)); 338 } 339 340 #else 341 342 # if defined(PEDANTIC) || defined(__DECC) || defined(__clang__) 343 static void *dummy = &dummy; 344 # endif 345 346 #endif 347