1 /* 2 * Copyright 2013-2022 The OpenSSL Project Authors. All Rights Reserved. 3 * Copyright (c) 2012, Intel Corporation. All Rights Reserved. 4 * 5 * Licensed under the OpenSSL license (the "License"). You may not use 6 * this file except in compliance with the License. You can obtain a copy 7 * in the file LICENSE in the source distribution or at 8 * https://www.openssl.org/source/license.html 9 * 10 * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) 11 * (1) Intel Corporation, Israel Development Center, Haifa, Israel 12 * (2) University of Haifa, Israel 13 */ 14 15 #include <openssl/opensslconf.h> 16 #include "rsaz_exp.h" 17 18 #ifndef RSAZ_ENABLED 19 NON_EMPTY_TRANSLATION_UNIT 20 #else 21 22 /* 23 * See crypto/bn/asm/rsaz-avx2.pl for further details. 24 */ 25 void rsaz_1024_norm2red_avx2(void *red, const void *norm); 26 void rsaz_1024_mul_avx2(void *ret, const void *a, const void *b, 27 const void *n, BN_ULONG k); 28 void rsaz_1024_sqr_avx2(void *ret, const void *a, const void *n, BN_ULONG k, 29 int cnt); 30 void rsaz_1024_scatter5_avx2(void *tbl, const void *val, int i); 31 void rsaz_1024_gather5_avx2(void *val, const void *tbl, int i); 32 void rsaz_1024_red2norm_avx2(void *norm, const void *red); 33 34 #if defined(__GNUC__) 35 # define ALIGN64 __attribute__((aligned(64))) 36 #elif defined(_MSC_VER) 37 # define ALIGN64 __declspec(align(64)) 38 #elif defined(__SUNPRO_C) 39 # define ALIGN64 40 # pragma align 64(one,two80) 41 #else 42 /* not fatal, might hurt performance a little */ 43 # define ALIGN64 44 #endif 45 46 ALIGN64 static const BN_ULONG one[40] = { 47 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 48 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 49 }; 50 51 ALIGN64 static const BN_ULONG two80[40] = { 52 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 54 }; 55 56 void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], 57 const BN_ULONG base_norm[16], 58 const BN_ULONG exponent[16], 59 const BN_ULONG m_norm[16], const BN_ULONG RR[16], 60 BN_ULONG k0) 61 { 62 unsigned char storage[320 * 3 + 32 * 9 * 16 + 64]; /* 5.5KB */ 63 unsigned char *p_str = storage + (64 - ((size_t)storage % 64)); 64 unsigned char *a_inv, *m, *result; 65 unsigned char *table_s = p_str + 320 * 3; 66 unsigned char *R2 = table_s; /* borrow */ 67 int index; 68 int wvalue; 69 BN_ULONG tmp[16]; 70 71 if ((((size_t)p_str & 4095) + 320) >> 12) { 72 result = p_str; 73 a_inv = p_str + 320; 74 m = p_str + 320 * 2; /* should not cross page */ 75 } else { 76 m = p_str; /* should not cross page */ 77 result = p_str + 320; 78 a_inv = p_str + 320 * 2; 79 } 80 81 rsaz_1024_norm2red_avx2(m, m_norm); 82 rsaz_1024_norm2red_avx2(a_inv, base_norm); 83 rsaz_1024_norm2red_avx2(R2, RR); 84 85 rsaz_1024_mul_avx2(R2, R2, R2, m, k0); 86 rsaz_1024_mul_avx2(R2, R2, two80, m, k0); 87 88 /* table[0] = 1 */ 89 rsaz_1024_mul_avx2(result, R2, one, m, k0); 90 /* table[1] = a_inv^1 */ 91 rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); 92 93 rsaz_1024_scatter5_avx2(table_s, result, 0); 94 rsaz_1024_scatter5_avx2(table_s, a_inv, 1); 95 96 /* table[2] = a_inv^2 */ 97 rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); 98 rsaz_1024_scatter5_avx2(table_s, result, 2); 99 #if 0 100 /* this is almost 2x smaller and less than 1% slower */ 101 for (index = 3; index < 32; index++) { 102 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 103 rsaz_1024_scatter5_avx2(table_s, result, index); 104 } 105 #else 106 /* table[4] = a_inv^4 */ 107 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 108 rsaz_1024_scatter5_avx2(table_s, result, 4); 109 /* table[8] = a_inv^8 */ 110 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 111 rsaz_1024_scatter5_avx2(table_s, result, 8); 112 /* table[16] = a_inv^16 */ 113 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 114 rsaz_1024_scatter5_avx2(table_s, result, 16); 115 /* table[17] = a_inv^17 */ 116 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 117 rsaz_1024_scatter5_avx2(table_s, result, 17); 118 119 /* table[3] */ 120 rsaz_1024_gather5_avx2(result, table_s, 2); 121 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 122 rsaz_1024_scatter5_avx2(table_s, result, 3); 123 /* table[6] */ 124 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 125 rsaz_1024_scatter5_avx2(table_s, result, 6); 126 /* table[12] */ 127 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 128 rsaz_1024_scatter5_avx2(table_s, result, 12); 129 /* table[24] */ 130 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 131 rsaz_1024_scatter5_avx2(table_s, result, 24); 132 /* table[25] */ 133 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 134 rsaz_1024_scatter5_avx2(table_s, result, 25); 135 136 /* table[5] */ 137 rsaz_1024_gather5_avx2(result, table_s, 4); 138 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 139 rsaz_1024_scatter5_avx2(table_s, result, 5); 140 /* table[10] */ 141 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 142 rsaz_1024_scatter5_avx2(table_s, result, 10); 143 /* table[20] */ 144 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 145 rsaz_1024_scatter5_avx2(table_s, result, 20); 146 /* table[21] */ 147 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 148 rsaz_1024_scatter5_avx2(table_s, result, 21); 149 150 /* table[7] */ 151 rsaz_1024_gather5_avx2(result, table_s, 6); 152 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 153 rsaz_1024_scatter5_avx2(table_s, result, 7); 154 /* table[14] */ 155 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 156 rsaz_1024_scatter5_avx2(table_s, result, 14); 157 /* table[28] */ 158 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 159 rsaz_1024_scatter5_avx2(table_s, result, 28); 160 /* table[29] */ 161 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 162 rsaz_1024_scatter5_avx2(table_s, result, 29); 163 164 /* table[9] */ 165 rsaz_1024_gather5_avx2(result, table_s, 8); 166 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 167 rsaz_1024_scatter5_avx2(table_s, result, 9); 168 /* table[18] */ 169 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 170 rsaz_1024_scatter5_avx2(table_s, result, 18); 171 /* table[19] */ 172 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 173 rsaz_1024_scatter5_avx2(table_s, result, 19); 174 175 /* table[11] */ 176 rsaz_1024_gather5_avx2(result, table_s, 10); 177 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 178 rsaz_1024_scatter5_avx2(table_s, result, 11); 179 /* table[22] */ 180 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 181 rsaz_1024_scatter5_avx2(table_s, result, 22); 182 /* table[23] */ 183 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 184 rsaz_1024_scatter5_avx2(table_s, result, 23); 185 186 /* table[13] */ 187 rsaz_1024_gather5_avx2(result, table_s, 12); 188 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 189 rsaz_1024_scatter5_avx2(table_s, result, 13); 190 /* table[26] */ 191 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 192 rsaz_1024_scatter5_avx2(table_s, result, 26); 193 /* table[27] */ 194 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 195 rsaz_1024_scatter5_avx2(table_s, result, 27); 196 197 /* table[15] */ 198 rsaz_1024_gather5_avx2(result, table_s, 14); 199 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 200 rsaz_1024_scatter5_avx2(table_s, result, 15); 201 /* table[30] */ 202 rsaz_1024_sqr_avx2(result, result, m, k0, 1); 203 rsaz_1024_scatter5_avx2(table_s, result, 30); 204 /* table[31] */ 205 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 206 rsaz_1024_scatter5_avx2(table_s, result, 31); 207 #endif 208 209 /* load first window */ 210 p_str = (unsigned char *)exponent; 211 wvalue = p_str[127] >> 3; 212 rsaz_1024_gather5_avx2(result, table_s, wvalue); 213 214 index = 1014; 215 216 while (index > -1) { /* loop for the remaining 127 windows */ 217 218 rsaz_1024_sqr_avx2(result, result, m, k0, 5); 219 220 wvalue = (p_str[(index / 8) + 1] << 8) | p_str[index / 8]; 221 wvalue = (wvalue >> (index % 8)) & 31; 222 index -= 5; 223 224 rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ 225 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 226 } 227 228 /* square four times */ 229 rsaz_1024_sqr_avx2(result, result, m, k0, 4); 230 231 wvalue = p_str[0] & 15; 232 233 rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); /* borrow a_inv */ 234 rsaz_1024_mul_avx2(result, result, a_inv, m, k0); 235 236 /* from Montgomery */ 237 rsaz_1024_mul_avx2(result, result, one, m, k0); 238 239 rsaz_1024_red2norm_avx2(result_norm, result); 240 241 bn_reduce_once_in_place(result_norm, /*carry=*/0, m_norm, tmp, 16); 242 243 OPENSSL_cleanse(storage, sizeof(storage)); 244 OPENSSL_cleanse(tmp, sizeof(tmp)); 245 } 246 247 /* 248 * See crypto/bn/rsaz-x86_64.pl for further details. 249 */ 250 void rsaz_512_mul(void *ret, const void *a, const void *b, const void *n, 251 BN_ULONG k); 252 void rsaz_512_mul_scatter4(void *ret, const void *a, const void *n, 253 BN_ULONG k, const void *tbl, unsigned int power); 254 void rsaz_512_mul_gather4(void *ret, const void *a, const void *tbl, 255 const void *n, BN_ULONG k, unsigned int power); 256 void rsaz_512_mul_by_one(void *ret, const void *a, const void *n, BN_ULONG k); 257 void rsaz_512_sqr(void *ret, const void *a, const void *n, BN_ULONG k, 258 int cnt); 259 void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power); 260 void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power); 261 262 void RSAZ_512_mod_exp(BN_ULONG result[8], 263 const BN_ULONG base[8], const BN_ULONG exponent[8], 264 const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8]) 265 { 266 unsigned char storage[16 * 8 * 8 + 64 * 2 + 64]; /* 1.2KB */ 267 unsigned char *table = storage + (64 - ((size_t)storage % 64)); 268 BN_ULONG *a_inv = (BN_ULONG *)(table + 16 * 8 * 8); 269 BN_ULONG *temp = (BN_ULONG *)(table + 16 * 8 * 8 + 8 * 8); 270 unsigned char *p_str = (unsigned char *)exponent; 271 int index; 272 unsigned int wvalue; 273 BN_ULONG tmp[8]; 274 275 /* table[0] = 1_inv */ 276 temp[0] = 0 - m[0]; 277 temp[1] = ~m[1]; 278 temp[2] = ~m[2]; 279 temp[3] = ~m[3]; 280 temp[4] = ~m[4]; 281 temp[5] = ~m[5]; 282 temp[6] = ~m[6]; 283 temp[7] = ~m[7]; 284 rsaz_512_scatter4(table, temp, 0); 285 286 /* table [1] = a_inv^1 */ 287 rsaz_512_mul(a_inv, base, RR, m, k0); 288 rsaz_512_scatter4(table, a_inv, 1); 289 290 /* table [2] = a_inv^2 */ 291 rsaz_512_sqr(temp, a_inv, m, k0, 1); 292 rsaz_512_scatter4(table, temp, 2); 293 294 for (index = 3; index < 16; index++) 295 rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index); 296 297 /* load first window */ 298 wvalue = p_str[63]; 299 300 rsaz_512_gather4(temp, table, wvalue >> 4); 301 rsaz_512_sqr(temp, temp, m, k0, 4); 302 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0xf); 303 304 for (index = 62; index >= 0; index--) { 305 wvalue = p_str[index]; 306 307 rsaz_512_sqr(temp, temp, m, k0, 4); 308 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue >> 4); 309 310 rsaz_512_sqr(temp, temp, m, k0, 4); 311 rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue & 0x0f); 312 } 313 314 /* from Montgomery */ 315 rsaz_512_mul_by_one(result, temp, m, k0); 316 317 bn_reduce_once_in_place(result, /*carry=*/0, m, tmp, 8); 318 319 OPENSSL_cleanse(storage, sizeof(storage)); 320 OPENSSL_cleanse(tmp, sizeof(tmp)); 321 } 322 323 #endif 324