xref: /freebsd/crypto/openssl/crypto/bn/rsaz_exp_x2.c (revision e7be843b4a162e68651d3911f0357ed464915629)
1 /*
2  * Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
3  * Copyright (c) 2020-2021, Intel Corporation. All Rights Reserved.
4  *
5  * Licensed under the Apache License 2.0 (the "License").  You may not use
6  * this file except in compliance with the License.  You can obtain a copy
7  * in the file LICENSE in the source distribution or at
8  * https://www.openssl.org/source/license.html
9  *
10  *
11  * Originally written by Sergey Kirillov and Andrey Matyukov.
12  * Special thanks to Ilya Albrekht for his valuable hints.
13  * Intel Corporation
14  *
15  */
16 
17 #include <openssl/opensslconf.h>
18 #include <openssl/crypto.h>
19 #include "rsaz_exp.h"
20 
21 #ifndef RSAZ_ENABLED
22 NON_EMPTY_TRANSLATION_UNIT
23 #else
24 # include <assert.h>
25 # include <string.h>
26 
27 # define ALIGN_OF(ptr, boundary) \
28     ((unsigned char *)(ptr) + (boundary - (((size_t)(ptr)) & (boundary - 1))))
29 
30 /* Internal radix */
31 # define DIGIT_SIZE (52)
32 /* 52-bit mask */
33 # define DIGIT_MASK ((uint64_t)0xFFFFFFFFFFFFF)
34 
35 # define BITS2WORD8_SIZE(x)  (((x) + 7) >> 3)
36 # define BITS2WORD64_SIZE(x) (((x) + 63) >> 6)
37 
38 /* Number of registers required to hold |digits_num| amount of qword digits */
39 # define NUMBER_OF_REGISTERS(digits_num, register_size)            \
40     (((digits_num) * 64 + (register_size) - 1) / (register_size))
41 
42 static ossl_inline uint64_t get_digit(const uint8_t *in, int in_len);
43 static ossl_inline void put_digit(uint8_t *out, int out_len, uint64_t digit);
44 static void to_words52(BN_ULONG *out, int out_len, const BN_ULONG *in,
45                        int in_bitsize);
46 static void from_words52(BN_ULONG *bn_out, int out_bitsize, const BN_ULONG *in);
47 static ossl_inline void set_bit(BN_ULONG *a, int idx);
48 
49 /* Number of |digit_size|-bit digits in |bitsize|-bit value */
50 static ossl_inline int number_of_digits(int bitsize, int digit_size)
51 {
52     return (bitsize + digit_size - 1) / digit_size;
53 }
54 
55 /*
56  * For details of the methods declared below please refer to
57  *    crypto/bn/asm/rsaz-avx512.pl
58  *
59  * Naming conventions:
60  *  amm = Almost Montgomery Multiplication
61  *  ams = Almost Montgomery Squaring
62  *  52xZZ - data represented as array of ZZ digits in 52-bit radix
63  *  _x1_/_x2_ - 1 or 2 independent inputs/outputs
64  *  _ifma256 - uses 256-bit wide IFMA ISA (AVX512_IFMA256)
65  *  _avxifma256 - uses 256-bit wide AVXIFMA ISA (AVX_IFMA256)
66  */
67 
68 void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
69                                    const BN_ULONG *b, const BN_ULONG *m,
70                                    BN_ULONG k0);
71 void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
72                                    const BN_ULONG *b, const BN_ULONG *m,
73                                    const BN_ULONG k0[2]);
74 void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
75                                        const BN_ULONG *red_table,
76                                        int red_table_idx1, int red_table_idx2);
77 
78 void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
79                                    const BN_ULONG *b, const BN_ULONG *m,
80                                    BN_ULONG k0);
81 void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
82                                    const BN_ULONG *b, const BN_ULONG *m,
83                                    const BN_ULONG k0[2]);
84 void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
85                                        const BN_ULONG *red_table,
86                                        int red_table_idx1, int red_table_idx2);
87 
88 void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
89                                    const BN_ULONG *b, const BN_ULONG *m,
90                                    BN_ULONG k0);
91 void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
92                                    const BN_ULONG *b, const BN_ULONG *m,
93                                    const BN_ULONG k0[2]);
94 void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y,
95                                        const BN_ULONG *red_table,
96                                        int red_table_idx1, int red_table_idx2);
97 
98 void ossl_rsaz_amm52x20_x1_avxifma256(BN_ULONG *res, const BN_ULONG *a,
99                                       const BN_ULONG *b, const BN_ULONG *m,
100                                       BN_ULONG k0);
101 void ossl_rsaz_amm52x20_x2_avxifma256(BN_ULONG *out, const BN_ULONG *a,
102                                       const BN_ULONG *b, const BN_ULONG *m,
103                                       const BN_ULONG k0[2]);
104 void ossl_extract_multiplier_2x20_win5_avx(BN_ULONG *red_Y,
105                                            const BN_ULONG *red_table,
106                                            int red_table_idx1,
107                                            int red_table_idx2);
108 
109 void ossl_rsaz_amm52x30_x1_avxifma256(BN_ULONG *res, const BN_ULONG *a,
110                                       const BN_ULONG *b, const BN_ULONG *m,
111                                       BN_ULONG k0);
112 void ossl_rsaz_amm52x30_x2_avxifma256(BN_ULONG *out, const BN_ULONG *a,
113                                       const BN_ULONG *b, const BN_ULONG *m,
114                                       const BN_ULONG k0[2]);
115 void ossl_extract_multiplier_2x30_win5_avx(BN_ULONG *red_Y,
116                                            const BN_ULONG *red_table,
117                                            int red_table_idx1,
118                                            int red_table_idx2);
119 
120 void ossl_rsaz_amm52x40_x1_avxifma256(BN_ULONG *res, const BN_ULONG *a,
121                                       const BN_ULONG *b, const BN_ULONG *m,
122                                       BN_ULONG k0);
123 void ossl_rsaz_amm52x40_x2_avxifma256(BN_ULONG *out, const BN_ULONG *a,
124                                       const BN_ULONG *b, const BN_ULONG *m,
125                                       const BN_ULONG k0[2]);
126 void ossl_extract_multiplier_2x40_win5_avx(BN_ULONG *red_Y,
127                                            const BN_ULONG *red_table,
128                                            int red_table_idx1,
129                                            int red_table_idx2);
130 
131 typedef void (*AMM)(BN_ULONG *res, const BN_ULONG *a, const BN_ULONG *b,
132                     const BN_ULONG *m, BN_ULONG k0);
133 
134 static AMM ossl_rsaz_amm52_x1[] = {
135     ossl_rsaz_amm52x20_x1_avxifma256, ossl_rsaz_amm52x20_x1_ifma256,
136     ossl_rsaz_amm52x30_x1_avxifma256, ossl_rsaz_amm52x30_x1_ifma256,
137     ossl_rsaz_amm52x40_x1_avxifma256, ossl_rsaz_amm52x40_x1_ifma256,
138 };
139 
140 typedef void (*DAMM)(BN_ULONG *res, const BN_ULONG *a, const BN_ULONG *b,
141                      const BN_ULONG *m, const BN_ULONG k0[2]);
142 
143 static DAMM ossl_rsaz_amm52_x2[] = {
144     ossl_rsaz_amm52x20_x2_avxifma256, ossl_rsaz_amm52x20_x2_ifma256,
145     ossl_rsaz_amm52x30_x2_avxifma256, ossl_rsaz_amm52x30_x2_ifma256,
146     ossl_rsaz_amm52x40_x2_avxifma256, ossl_rsaz_amm52x40_x2_ifma256,
147 };
148 
149 typedef void (*DEXTRACT)(BN_ULONG *res, const BN_ULONG *red_table,
150                          int red_table_idx, int tbl_idx);
151 
152 static DEXTRACT ossl_extract_multiplier_win5[] = {
153     ossl_extract_multiplier_2x20_win5_avx, ossl_extract_multiplier_2x20_win5,
154     ossl_extract_multiplier_2x30_win5_avx, ossl_extract_multiplier_2x30_win5,
155     ossl_extract_multiplier_2x40_win5_avx, ossl_extract_multiplier_2x40_win5,
156 };
157 
158 static int RSAZ_mod_exp_x2_ifma256(BN_ULONG *res, const BN_ULONG *base,
159                                    const BN_ULONG *exp[2], const BN_ULONG *m,
160                                    const BN_ULONG *rr, const BN_ULONG k0[2],
161                                    int modulus_bitsize);
162 
163 /*
164  * Dual Montgomery modular exponentiation using prime moduli of the
165  * same bit size, optimized with AVX512 ISA or AVXIFMA ISA.
166  *
167  * Input and output parameters for each exponentiation are independent and
168  * denoted here by index |i|, i = 1..2.
169  *
170  * Input and output are all in regular 2^64 radix.
171  *
172  * Each moduli shall be |factor_size| bit size.
173  *
174  * Supported cases:
175  *   - 2x1024
176  *   - 2x1536
177  *   - 2x2048
178  *
179  *  [out] res|i|      - result of modular exponentiation: array of qword values
180  *                      in regular (2^64) radix. Size of array shall be enough
181  *                      to hold |factor_size| bits.
182  *  [in]  base|i|     - base
183  *  [in]  exp|i|      - exponent
184  *  [in]  m|i|        - moduli
185  *  [in]  rr|i|       - Montgomery parameter RR = R^2 mod m|i|
186  *  [in]  k0_|i|      - Montgomery parameter k0 = -1/m|i| mod 2^64
187  *  [in]  factor_size - moduli bit size
188  *
189  * \return 0 in case of failure,
190  *         1 in case of success.
191  */
192 int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1,
193                                 const BN_ULONG *base1,
194                                 const BN_ULONG *exp1,
195                                 const BN_ULONG *m1,
196                                 const BN_ULONG *rr1,
197                                 BN_ULONG k0_1,
198                                 BN_ULONG *res2,
199                                 const BN_ULONG *base2,
200                                 const BN_ULONG *exp2,
201                                 const BN_ULONG *m2,
202                                 const BN_ULONG *rr2,
203                                 BN_ULONG k0_2,
204                                 int factor_size)
205 {
206     int ret = 0;
207 
208     /*
209      * Number of word-size (BN_ULONG) digits to store exponent in redundant
210      * representation.
211      */
212     int exp_digits = number_of_digits(factor_size + 2, DIGIT_SIZE);
213     int coeff_pow = 4 * (DIGIT_SIZE * exp_digits - factor_size);
214 
215     /*  Number of YMM registers required to store exponent's digits */
216     int ymm_regs_num = NUMBER_OF_REGISTERS(exp_digits, 256 /* ymm bit size */);
217     /* Capacity of the register set (in qwords) to store exponent */
218     int regs_capacity = ymm_regs_num * 4;
219 
220     BN_ULONG *base1_red, *m1_red, *rr1_red;
221     BN_ULONG *base2_red, *m2_red, *rr2_red;
222     BN_ULONG *coeff_red;
223     BN_ULONG *storage = NULL;
224     BN_ULONG *storage_aligned = NULL;
225     int storage_len_bytes = 7 * regs_capacity * sizeof(BN_ULONG)
226                            + 64 /* alignment */;
227 
228     const BN_ULONG *exp[2] = {0};
229     BN_ULONG k0[2] = {0};
230     /* AMM = Almost Montgomery Multiplication */
231     AMM amm = NULL;
232     int avx512ifma = !!ossl_rsaz_avx512ifma_eligible();
233 
234     if (factor_size != 1024 && factor_size != 1536 && factor_size != 2048)
235         goto err;
236 
237     amm = ossl_rsaz_amm52_x1[(factor_size / 512 - 2) * 2 + avx512ifma];
238 
239     storage = (BN_ULONG *)OPENSSL_malloc(storage_len_bytes);
240     if (storage == NULL)
241         goto err;
242     storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64);
243 
244     /* Memory layout for red(undant) representations */
245     base1_red = storage_aligned;
246     base2_red = storage_aligned + 1 * regs_capacity;
247     m1_red    = storage_aligned + 2 * regs_capacity;
248     m2_red    = storage_aligned + 3 * regs_capacity;
249     rr1_red   = storage_aligned + 4 * regs_capacity;
250     rr2_red   = storage_aligned + 5 * regs_capacity;
251     coeff_red = storage_aligned + 6 * regs_capacity;
252 
253     /* Convert base_i, m_i, rr_i, from regular to 52-bit radix */
254     to_words52(base1_red, regs_capacity, base1, factor_size);
255     to_words52(base2_red, regs_capacity, base2, factor_size);
256     to_words52(m1_red,    regs_capacity, m1,    factor_size);
257     to_words52(m2_red,    regs_capacity, m2,    factor_size);
258     to_words52(rr1_red,   regs_capacity, rr1,   factor_size);
259     to_words52(rr2_red,   regs_capacity, rr2,   factor_size);
260 
261     /*
262      * Compute target domain Montgomery converters RR' for each modulus
263      * based on precomputed original domain's RR.
264      *
265      * RR -> RR' transformation steps:
266      *  (1) coeff = 2^k
267      *  (2) t = AMM(RR,RR) = RR^2 / R' mod m
268      *  (3) RR' = AMM(t, coeff) = RR^2 * 2^k / R'^2 mod m
269      * where
270      *  k = 4 * (52 * digits52 - modlen)
271      *  R  = 2^(64 * ceil(modlen/64)) mod m
272      *  RR = R^2 mod m
273      *  R' = 2^(52 * ceil(modlen/52)) mod m
274      *
275      *  EX/ modlen = 1024: k = 64, RR = 2^2048 mod m, RR' = 2^2080 mod m
276      */
277     memset(coeff_red, 0, exp_digits * sizeof(BN_ULONG));
278     /* (1) in reduced domain representation */
279     set_bit(coeff_red, 64 * (int)(coeff_pow / 52) + coeff_pow % 52);
280 
281     amm(rr1_red, rr1_red, rr1_red, m1_red, k0_1);     /* (2) for m1 */
282     amm(rr1_red, rr1_red, coeff_red, m1_red, k0_1);   /* (3) for m1 */
283 
284     amm(rr2_red, rr2_red, rr2_red, m2_red, k0_2);     /* (2) for m2 */
285     amm(rr2_red, rr2_red, coeff_red, m2_red, k0_2);   /* (3) for m2 */
286 
287     exp[0] = exp1;
288     exp[1] = exp2;
289 
290     k0[0] = k0_1;
291     k0[1] = k0_2;
292 
293     /* Dual (2-exps in parallel) exponentiation */
294     ret = RSAZ_mod_exp_x2_ifma256(rr1_red, base1_red, exp, m1_red, rr1_red,
295                                   k0, factor_size);
296     if (!ret)
297         goto err;
298 
299     /* Convert rr_i back to regular radix */
300     from_words52(res1, factor_size, rr1_red);
301     from_words52(res2, factor_size, rr2_red);
302 
303     /* bn_reduce_once_in_place expects number of BN_ULONG, not bit size */
304     factor_size /= sizeof(BN_ULONG) * 8;
305 
306     bn_reduce_once_in_place(res1, /*carry=*/0, m1, storage, factor_size);
307     bn_reduce_once_in_place(res2, /*carry=*/0, m2, storage, factor_size);
308 
309 err:
310     if (storage != NULL) {
311         OPENSSL_cleanse(storage, storage_len_bytes);
312         OPENSSL_free(storage);
313     }
314     return ret;
315 }
316 
317 /*
318  * Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of
319  * the same bit size using Almost Montgomery Multiplication, optimized with
320  * AVX512_IFMA256 ISA.
321  *
322  * The parameter w (window size) = 5.
323  *
324  *  [out] res      - result of modular exponentiation: 2x{20,30,40} qword
325  *                   values in 2^52 radix.
326  *  [in]  base     - base (2x{20,30,40} qword values in 2^52 radix)
327  *  [in]  exp      - array of 2 pointers to {16,24,32} qword values in 2^64 radix.
328  *                   Exponent is not converted to redundant representation.
329  *  [in]  m        - moduli (2x{20,30,40} qword values in 2^52 radix)
330  *  [in]  rr       - Montgomery parameter for 2 moduli:
331  *                     RR(1024) = 2^2080 mod m.
332  *                     RR(1536) = 2^3120 mod m.
333  *                     RR(2048) = 2^4160 mod m.
334  *                   (2x{20,30,40} qword values in 2^52 radix)
335  *  [in]  k0       - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64
336  *
337  * \return (void).
338  */
339 int RSAZ_mod_exp_x2_ifma256(BN_ULONG *out,
340                             const BN_ULONG *base,
341                             const BN_ULONG *exp[2],
342                             const BN_ULONG *m,
343                             const BN_ULONG *rr,
344                             const BN_ULONG k0[2],
345                             int modulus_bitsize)
346 {
347     int ret = 0;
348     int idx;
349 
350     /* Exponent window size */
351     int exp_win_size = 5;
352     int exp_win_mask = (1U << exp_win_size) - 1;
353 
354     /*
355     * Number of digits (64-bit words) in redundant representation to handle
356     * modulus bits
357     */
358     int red_digits = 0;
359     int exp_digits = 0;
360 
361     BN_ULONG *storage = NULL;
362     BN_ULONG *storage_aligned = NULL;
363     int storage_len_bytes = 0;
364 
365     /* Red(undant) result Y and multiplier X */
366     BN_ULONG *red_Y = NULL;     /* [2][red_digits] */
367     BN_ULONG *red_X = NULL;     /* [2][red_digits] */
368     /* Pre-computed table of base powers */
369     BN_ULONG *red_table = NULL; /* [1U << exp_win_size][2][red_digits] */
370     /* Expanded exponent */
371     BN_ULONG *expz = NULL;      /* [2][exp_digits + 1] */
372 
373     /* Dual AMM */
374     DAMM damm = NULL;
375     /* Extractor from red_table */
376     DEXTRACT extract = NULL;
377     int avx512ifma = !!ossl_rsaz_avx512ifma_eligible();
378 
379 /*
380  * Squaring is done using multiplication now. That can be a subject of
381  * optimization in future.
382  */
383 # define DAMS(r,a,m,k0) damm((r),(a),(a),(m),(k0))
384 
385     if (modulus_bitsize != 1024 && modulus_bitsize != 1536 && modulus_bitsize != 2048)
386         goto err;
387 
388     damm = ossl_rsaz_amm52_x2[(modulus_bitsize / 512 - 2) * 2 + avx512ifma];
389     extract = ossl_extract_multiplier_win5[(modulus_bitsize / 512 - 2) * 2 + avx512ifma];
390 
391     switch (modulus_bitsize) {
392     case 1024:
393         red_digits = 20;
394         exp_digits = 16;
395         break;
396     case 1536:
397         /* Extended with 2 digits padding to avoid mask ops in high YMM register */
398         red_digits = 30 + 2;
399         exp_digits = 24;
400         break;
401     case 2048:
402         red_digits = 40;
403         exp_digits = 32;
404         break;
405     default:
406         goto err;
407     }
408 
409     storage_len_bytes = (2 * red_digits                         /* red_Y     */
410                        + 2 * red_digits                         /* red_X     */
411                        + 2 * red_digits * (1U << exp_win_size)  /* red_table */
412                        + 2 * (exp_digits + 1))                  /* expz      */
413                        * sizeof(BN_ULONG)
414                        + 64;                                    /* alignment */
415 
416     storage = (BN_ULONG *)OPENSSL_zalloc(storage_len_bytes);
417     if (storage == NULL)
418         goto err;
419     storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64);
420 
421     red_Y     = storage_aligned;
422     red_X     = red_Y + 2 * red_digits;
423     red_table = red_X + 2 * red_digits;
424     expz      = red_table + 2 * red_digits * (1U << exp_win_size);
425 
426     /*
427      * Compute table of powers base^i, i = 0, ..., (2^EXP_WIN_SIZE) - 1
428      *   table[0] = mont(x^0) = mont(1)
429      *   table[1] = mont(x^1) = mont(x)
430      */
431     red_X[0 * red_digits] = 1;
432     red_X[1 * red_digits] = 1;
433     damm(&red_table[0 * 2 * red_digits], (const BN_ULONG*)red_X, rr, m, k0);
434     damm(&red_table[1 * 2 * red_digits], base,  rr, m, k0);
435 
436     for (idx = 1; idx < (int)((1U << exp_win_size) / 2); idx++) {
437         DAMS(&red_table[(2 * idx + 0) * 2 * red_digits],
438              &red_table[(1 * idx)     * 2 * red_digits], m, k0);
439         damm(&red_table[(2 * idx + 1) * 2 * red_digits],
440              &red_table[(2 * idx)     * 2 * red_digits],
441              &red_table[1 * 2 * red_digits], m, k0);
442     }
443 
444     /* Copy and expand exponents */
445     memcpy(&expz[0 * (exp_digits + 1)], exp[0], exp_digits * sizeof(BN_ULONG));
446     expz[1 * (exp_digits + 1) - 1] = 0;
447     memcpy(&expz[1 * (exp_digits + 1)], exp[1], exp_digits * sizeof(BN_ULONG));
448     expz[2 * (exp_digits + 1) - 1] = 0;
449 
450     /* Exponentiation */
451     {
452         const int rem = modulus_bitsize % exp_win_size;
453         const BN_ULONG table_idx_mask = exp_win_mask;
454 
455         int exp_bit_no = modulus_bitsize - rem;
456         int exp_chunk_no = exp_bit_no / 64;
457         int exp_chunk_shift = exp_bit_no % 64;
458 
459         BN_ULONG red_table_idx_0, red_table_idx_1;
460 
461         /*
462          * If rem == 0, then
463          *      exp_bit_no = modulus_bitsize - exp_win_size
464          * However, this isn't possible because rem is { 1024, 1536, 2048 } % 5
465          * which is { 4, 1, 3 } respectively.
466          *
467          * If this assertion ever fails the fix above is easy.
468          */
469         OPENSSL_assert(rem != 0);
470 
471         /* Process 1-st exp window - just init result */
472         red_table_idx_0 = expz[exp_chunk_no + 0 * (exp_digits + 1)];
473         red_table_idx_1 = expz[exp_chunk_no + 1 * (exp_digits + 1)];
474 
475         /*
476          * The function operates with fixed moduli sizes divisible by 64,
477          * thus table index here is always in supported range [0, EXP_WIN_SIZE).
478          */
479         red_table_idx_0 >>= exp_chunk_shift;
480         red_table_idx_1 >>= exp_chunk_shift;
481 
482         extract(&red_Y[0 * red_digits], (const BN_ULONG*)red_table, (int)red_table_idx_0, (int)red_table_idx_1);
483 
484         /* Process other exp windows */
485         for (exp_bit_no -= exp_win_size; exp_bit_no >= 0; exp_bit_no -= exp_win_size) {
486             /* Extract pre-computed multiplier from the table */
487             {
488                 BN_ULONG T;
489 
490                 exp_chunk_no = exp_bit_no / 64;
491                 exp_chunk_shift = exp_bit_no % 64;
492                 {
493                     red_table_idx_0 = expz[exp_chunk_no + 0 * (exp_digits + 1)];
494                     T = expz[exp_chunk_no + 1 + 0 * (exp_digits + 1)];
495 
496                     red_table_idx_0 >>= exp_chunk_shift;
497                     /*
498                      * Get additional bits from then next quadword
499                      * when 64-bit boundaries are crossed.
500                      */
501                     if (exp_chunk_shift > 64 - exp_win_size) {
502                         T <<= (64 - exp_chunk_shift);
503                         red_table_idx_0 ^= T;
504                     }
505                     red_table_idx_0 &= table_idx_mask;
506                 }
507                 {
508                     red_table_idx_1 = expz[exp_chunk_no + 1 * (exp_digits + 1)];
509                     T = expz[exp_chunk_no + 1 + 1 * (exp_digits + 1)];
510 
511                     red_table_idx_1 >>= exp_chunk_shift;
512                     /*
513                      * Get additional bits from then next quadword
514                      * when 64-bit boundaries are crossed.
515                      */
516                     if (exp_chunk_shift > 64 - exp_win_size) {
517                         T <<= (64 - exp_chunk_shift);
518                         red_table_idx_1 ^= T;
519                     }
520                     red_table_idx_1 &= table_idx_mask;
521                 }
522 
523                 extract(&red_X[0 * red_digits], (const BN_ULONG*)red_table, (int)red_table_idx_0, (int)red_table_idx_1);
524             }
525 
526             /* Series of squaring */
527             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0);
528             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0);
529             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0);
530             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0);
531             DAMS((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, m, k0);
532 
533             damm((BN_ULONG*)red_Y, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0);
534         }
535     }
536 
537     /*
538      *
539      * NB: After the last AMM of exponentiation in Montgomery domain, the result
540      * may be (modulus_bitsize + 1), but the conversion out of Montgomery domain
541      * performs an AMM(x,1) which guarantees that the final result is less than
542      * |m|, so no conditional subtraction is needed here. See [1] for details.
543      *
544      * [1] Gueron, S. Efficient software implementations of modular exponentiation.
545      *     DOI: 10.1007/s13389-012-0031-5
546      */
547 
548     /* Convert result back in regular 2^52 domain */
549     memset(red_X, 0, 2 * red_digits * sizeof(BN_ULONG));
550     red_X[0 * red_digits] = 1;
551     red_X[1 * red_digits] = 1;
552     damm(out, (const BN_ULONG*)red_Y, (const BN_ULONG*)red_X, m, k0);
553 
554     ret = 1;
555 
556 err:
557     if (storage != NULL) {
558         /* Clear whole storage */
559         OPENSSL_cleanse(storage, storage_len_bytes);
560         OPENSSL_free(storage);
561     }
562 
563 #undef DAMS
564     return ret;
565 }
566 
567 static ossl_inline uint64_t get_digit(const uint8_t *in, int in_len)
568 {
569     uint64_t digit = 0;
570 
571     assert(in != NULL);
572     assert(in_len <= 8);
573 
574     for (; in_len > 0; in_len--) {
575         digit <<= 8;
576         digit += (uint64_t)(in[in_len - 1]);
577     }
578     return digit;
579 }
580 
581 /*
582  * Convert array of words in regular (base=2^64) representation to array of
583  * words in redundant (base=2^52) one.
584  */
585 static void to_words52(BN_ULONG *out, int out_len,
586                        const BN_ULONG *in, int in_bitsize)
587 {
588     uint8_t *in_str = NULL;
589 
590     assert(out != NULL);
591     assert(in != NULL);
592     /* Check destination buffer capacity */
593     assert(out_len >= number_of_digits(in_bitsize, DIGIT_SIZE));
594 
595     in_str = (uint8_t *)in;
596 
597     for (; in_bitsize >= (2 * DIGIT_SIZE); in_bitsize -= (2 * DIGIT_SIZE), out += 2) {
598         uint64_t digit;
599 
600         memcpy(&digit, in_str, sizeof(digit));
601         out[0] = digit & DIGIT_MASK;
602         in_str += 6;
603         memcpy(&digit, in_str, sizeof(digit));
604         out[1] = (digit >> 4) & DIGIT_MASK;
605         in_str += 7;
606         out_len -= 2;
607     }
608 
609     if (in_bitsize > DIGIT_SIZE) {
610         uint64_t digit = get_digit(in_str, 7);
611 
612         out[0] = digit & DIGIT_MASK;
613         in_str += 6;
614         in_bitsize -= DIGIT_SIZE;
615         digit = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize));
616         out[1] = digit >> 4;
617         out += 2;
618         out_len -= 2;
619     } else if (in_bitsize > 0) {
620         out[0] = get_digit(in_str, BITS2WORD8_SIZE(in_bitsize));
621         out++;
622         out_len--;
623     }
624 
625     memset(out, 0, out_len * sizeof(BN_ULONG));
626 }
627 
628 static ossl_inline void put_digit(uint8_t *out, int out_len, uint64_t digit)
629 {
630     assert(out != NULL);
631     assert(out_len <= 8);
632 
633     for (; out_len > 0; out_len--) {
634         *out++ = (uint8_t)(digit & 0xFF);
635         digit >>= 8;
636     }
637 }
638 
639 /*
640  * Convert array of words in redundant (base=2^52) representation to array of
641  * words in regular (base=2^64) one.
642  */
643 static void from_words52(BN_ULONG *out, int out_bitsize, const BN_ULONG *in)
644 {
645     int i;
646     int out_len = BITS2WORD64_SIZE(out_bitsize);
647 
648     assert(out != NULL);
649     assert(in != NULL);
650 
651     for (i = 0; i < out_len; i++)
652         out[i] = 0;
653 
654     {
655         uint8_t *out_str = (uint8_t *)out;
656 
657         for (; out_bitsize >= (2 * DIGIT_SIZE);
658                out_bitsize -= (2 * DIGIT_SIZE), in += 2) {
659             uint64_t digit;
660 
661             digit = in[0];
662             memcpy(out_str, &digit, sizeof(digit));
663             out_str += 6;
664             digit = digit >> 48 | in[1] << 4;
665             memcpy(out_str, &digit, sizeof(digit));
666             out_str += 7;
667         }
668 
669         if (out_bitsize > DIGIT_SIZE) {
670             put_digit(out_str, 7, in[0]);
671             out_str += 6;
672             out_bitsize -= DIGIT_SIZE;
673             put_digit(out_str, BITS2WORD8_SIZE(out_bitsize),
674                         (in[1] << 4 | in[0] >> 48));
675         } else if (out_bitsize) {
676             put_digit(out_str, BITS2WORD8_SIZE(out_bitsize), in[0]);
677         }
678     }
679 }
680 
681 /*
682  * Set bit at index |idx| in the words array |a|.
683  * It does not do any boundaries checks, make sure the index is valid before
684  * calling the function.
685  */
686 static ossl_inline void set_bit(BN_ULONG *a, int idx)
687 {
688     assert(a != NULL);
689 
690     {
691         int i, j;
692 
693         i = idx / BN_BITS2;
694         j = idx % BN_BITS2;
695         a[i] |= (((BN_ULONG)1) << j);
696     }
697 }
698 
699 #endif
700