xref: /freebsd/crypto/openssl/crypto/bn/asm/x86_64-gcc.c (revision f25b8c9fb4f58cf61adb47d7570abe7caa6d385d)
1 /*
2  * Copyright 2002-2018 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 #include "../bn_local.h"
11 #if !(defined(__GNUC__) && __GNUC__ >= 2)
12 /* clang-format off */
13 # include "../bn_asm.c"         /* kind of dirty hack for Sun Studio */
14 /* clang-format on */
15 #else
/*-
 * x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@openssl.org> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the License. Warranty of any kind is disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                        sign    verify    sign/s verify/s
 *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                        sign    verify    sign/s verify/s
 *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For reference, the IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */
67 
#undef mul
#undef mul_add

/*-
 * mul_add(r, a, word, carry)
 *
 * Computes a * word + carry + r as a 128-bit value, stores the low
 * 64 bits back into r and leaves the high 64 bits in carry.
 *
 * r and a are bound to memory operands, so both must be addressable
 * lvalues; carry must be a modifiable BN_ULONG lvalue.
 *
 * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
 * "g"(0)               lets the compiler decide where it wants to
 *                      keep the value of zero;
 *
 * __asm__ is used instead of asm so that the macro is still accepted
 * when GCC runs in a strict ISO mode (-ansi, -std=c99, -std=c11), where
 * asm is not a keyword.
 */
#define mul_add(r, a, word, carry)        \
    do {                                  \
        register BN_ULONG high, low;      \
        /* high:low = word * a */         \
        __asm__("mulq %3"                 \
                : "=a"(low), "=d"(high)   \
                : "a"(word), "m"(a)       \
                : "cc");                  \
        /* carry += low; high += CF */    \
        __asm__("addq %2,%0; adcq %3,%1"  \
                : "+r"(carry), "+d"(high) \
                : "a"(low), "g"(0)        \
                : "cc");                  \
        /* r += carry; high += CF */      \
        __asm__("addq %2,%0; adcq %3,%1"  \
                : "+m"(r), "+d"(high)     \
                : "r"(carry), "g"(0)      \
                : "cc");                  \
        carry = high;                     \
    } while (0)
93 
/*
 * mul(r, a, word, carry)
 *
 * Computes a * word + carry as a 128-bit value, assigns the low 64 bits
 * to r and leaves the high 64 bits in carry.  carry must be a modifiable
 * BN_ULONG lvalue; r must be assignable.
 *
 * __asm__ (not asm) keeps the macro usable under strict -std= modes.
 */
#define mul(r, a, word, carry)            \
    do {                                  \
        register BN_ULONG high, low;      \
        /* high:low = word * a */         \
        __asm__("mulq %3"                 \
                : "=a"(low), "=d"(high)   \
                : "a"(word), "g"(a)       \
                : "cc");                  \
        /* carry += low; high += CF */    \
        __asm__("addq %2,%0; adcq %3,%1"  \
                : "+r"(carry), "+d"(high) \
                : "a"(low), "g"(0)        \
                : "cc");                  \
        (r) = carry, carry = high;        \
    } while (0)
#undef sqr
/*
 * sqr(r0, r1, a)
 *
 * Squares a with a single mulq: the low 64 bits of a * a are written to
 * r0 and the high 64 bits to r1.
 *
 * The body is wrapped in do { } while (0) and carries no trailing
 * semicolon of its own, so "sqr(...);" expands to exactly one statement
 * and stays safe inside an unbraced if/else.
 */
#define sqr(r0, r1, a)                   \
    do {                                 \
        __asm__("mulq %2"                \
                : "=a"(r0), "=d"(r1)     \
                : "a"(a)                 \
                : "cc");                 \
    } while (0)
113 
/*
 * Multiply-accumulate: for each i in [0, num), rp[i] receives the low
 * word of rp[i] + ap[i] * w + carry, and the high word becomes the carry
 * into position i + 1.
 *
 * Returns the carry word left after the last position, or 0 when
 * num <= 0 (in which case nothing is read or written).
 */
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
    BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    /* Main loop: four words per pass while at least four remain. */
    while (num & ~3) {
        mul_add(rp[0], ap[0], w, c1);
        mul_add(rp[1], ap[1], w, c1);
        mul_add(rp[2], ap[2], w, c1);
        mul_add(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    /* Tail: the remaining one to three words. */
    if (num) {
        mul_add(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul_add(rp[2], ap[2], w, c1);
    }
    return c1;
}
144 
/*
 * Word-by-vector multiply: for each i in [0, num), rp[i] receives the
 * low word of ap[i] * w + carry, and the high word becomes the carry
 * into position i + 1.
 *
 * Returns the carry word left after the last position, or 0 when
 * num <= 0 (in which case nothing is read or written).
 */
BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w)
{
    BN_ULONG c1 = 0;

    if (num <= 0)
        return c1;

    /* Main loop: four words per pass while at least four remain. */
    while (num & ~3) {
        mul(rp[0], ap[0], w, c1);
        mul(rp[1], ap[1], w, c1);
        mul(rp[2], ap[2], w, c1);
        mul(rp[3], ap[3], w, c1);
        ap += 4;
        rp += 4;
        num -= 4;
    }
    /* Tail: the remaining one to three words. */
    if (num) {
        mul(rp[0], ap[0], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[1], ap[1], w, c1);
        if (--num == 0)
            return c1;
        mul(rp[2], ap[2], w, c1);
    }
    return c1;
}
172 
/*
 * Squares each input word independently: for each i in [0, n),
 * r[2 * i] receives the low word and r[2 * i + 1] the high word of
 * a[i] * a[i].  r is therefore written for 2 * n words.
 *
 * Does nothing when n <= 0.
 */
void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n)
{
    if (n <= 0)
        return;

    /* Main loop: four input words (eight output words) per pass. */
    while (n & ~3) {
        sqr(r[0], r[1], a[0]);
        sqr(r[2], r[3], a[1]);
        sqr(r[4], r[5], a[2]);
        sqr(r[6], r[7], a[3]);
        a += 4;
        r += 8;
        n -= 4;
    }
    /* Tail: the remaining one to three input words. */
    if (n) {
        sqr(r[0], r[1], a[0]);
        if (--n == 0)
            return;
        sqr(r[2], r[3], a[1]);
        if (--n == 0)
            return;
        sqr(r[4], r[5], a[2]);
    }
}
197 
/*
 * Divides the two-word value h:l (h is the high word, l the low word)
 * by d with a single divq and returns the quotient.  The remainder that
 * divq leaves in the second output register is captured in `waste` and
 * discarded.
 *
 * NOTE(review): nothing here validates the operands.  divq raises a
 * divide error when d == 0 or when the quotient does not fit in one
 * word (i.e. h >= d), so callers are presumably expected to guarantee
 * d != 0 and h < d -- confirm against the call sites.
 *
 * __asm__ (not asm) keeps this building under strict -std= modes.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
{
    BN_ULONG ret, waste;

    __asm__("divq      %4"
            : "=a"(ret), "=d"(waste)
            : "a"(l), "d"(h), "r"(d)
            : "cc");

    return ret;
}
208 
/*
 * Multi-word addition: rp[i] = ap[i] + bp[i] + carry for i in [0, n),
 * with the carry propagated from word to word through the CPU carry
 * flag.
 *
 * Returns the carry out of the last word (0 or 1), or 0 when n <= 0
 * (in which case nothing is read or written).
 *
 * Operands of the asm statement: %0 = ret (scratch word, then result),
 * %1 = n (remaining count, in rcx), %2 = i (word index), %3 = rp,
 * %4 = ap, %5 = bp.
 *
 * __asm__ __volatile__ (not asm volatile) keeps this building under
 * strict -std= modes.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    __asm__ __volatile__(
        "       subq    %0,%0           \n" /* clear carry */
        "       jmp     1f              \n"
        ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n" /* ret = ap[i] */
        "       adcq    (%5,%2,8),%0    \n" /* ret += bp[i] + CF */
        "       movq    %0,(%3,%2,8)    \n" /* rp[i] = ret */
        "       lea     1(%2),%2        \n" /* i++ without touching CF */
        "       dec     %1              \n" /* n--; dec leaves CF alone */
        "       jnz     1b              \n"
        "       sbbq    %0,%0           \n" /* ret = CF ? all-ones : 0 */
        : "=&r"(ret), "+c"(n), "+r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory");

    /* Reduce the all-ones / zero mask to a 1 / 0 carry. */
    return ret & 1;
}
234 
235 #ifndef SIMICS
/*
 * Multi-word subtraction: rp[i] = ap[i] - bp[i] - borrow for i in
 * [0, n), with the borrow propagated from word to word through the CPU
 * carry flag.
 *
 * Returns the borrow out of the last word (0 or 1), or 0 when n <= 0
 * (in which case nothing is read or written).
 *
 * Operands of the asm statement: %0 = ret (scratch word, then result),
 * %1 = n (remaining count, in rcx), %2 = i (word index), %3 = rp,
 * %4 = ap, %5 = bp.
 *
 * __asm__ __volatile__ (not asm volatile) keeps this building under
 * strict -std= modes.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
    int n)
{
    BN_ULONG ret;
    size_t i = 0;

    if (n <= 0)
        return 0;

    __asm__ __volatile__(
        "       subq    %0,%0           \n" /* clear borrow */
        "       jmp     1f              \n"
        ".p2align 4                     \n"
        "1:     movq    (%4,%2,8),%0    \n" /* ret = ap[i] */
        "       sbbq    (%5,%2,8),%0    \n" /* ret -= bp[i] + CF */
        "       movq    %0,(%3,%2,8)    \n" /* rp[i] = ret */
        "       lea     1(%2),%2        \n" /* i++ without touching CF */
        "       dec     %1              \n" /* n--; dec leaves CF alone */
        "       jnz     1b              \n"
        "       sbbq    %0,%0           \n" /* ret = CF ? all-ones : 0 */
        : "=&r"(ret), "+c"(n), "+r"(i)
        : "r"(rp), "r"(ap), "r"(bp)
        : "cc", "memory");

    /* Reduce the all-ones / zero mask to a 1 / 0 borrow. */
    return ret & 1;
}
261 #else
262 /* Simics 1.4<7 has buggy sbbq:-( */
263 #define BN_MASK2 0xffffffffffffffffL
bn_sub_words(BN_ULONG * r,BN_ULONG * a,BN_ULONG * b,int n)264 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
265 {
266     BN_ULONG t1, t2;
267     int c = 0;
268 
269     if (n <= 0)
270         return (BN_ULONG)0;
271 
272     for (;;) {
273         t1 = a[0];
274         t2 = b[0];
275         r[0] = (t1 - t2 - c) & BN_MASK2;
276         if (t1 != t2)
277             c = (t1 < t2);
278         if (--n <= 0)
279             break;
280 
281         t1 = a[1];
282         t2 = b[1];
283         r[1] = (t1 - t2 - c) & BN_MASK2;
284         if (t1 != t2)
285             c = (t1 < t2);
286         if (--n <= 0)
287             break;
288 
289         t1 = a[2];
290         t2 = b[2];
291         r[2] = (t1 - t2 - c) & BN_MASK2;
292         if (t1 != t2)
293             c = (t1 < t2);
294         if (--n <= 0)
295             break;
296 
297         t1 = a[3];
298         t2 = b[3];
299         r[3] = (t1 - t2 - c) & BN_MASK2;
300         if (t1 != t2)
301             c = (t1 < t2);
302         if (--n <= 0)
303             break;
304 
305         a += 4;
306         b += 4;
307         r += 4;
308     }
309     return c;
310 }
311 #endif
312 
313 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
314 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
315 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/*
 * sqr_add_c2(a,i,j,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
 * c=(c2,c1,c0)
 */
320 
321 /*
322  * Keep in mind that carrying into high part of multiplication result
323  * can not overflow, because it cannot be all-ones.
324  */
325 #if 0
326 /* original macros are kept for reference purposes */
/*
 * Reference (portable C) form of mul_add_c: adds the two-word product
 * a * b into the three-word accumulator (c2,c1,c0), detecting each carry
 * by comparing the sum against the addend.
 */
#define mul_add_c(a, b, c0, c1, c2)    \
    do {                               \
        BN_ULONG ta = (a), tb = (b);   \
        BN_ULONG lo, hi;               \
        BN_UMULT_LOHI(lo, hi, ta, tb); \
        c0 += lo;                      \
        hi += (c0 < lo) ? 1 : 0;       \
        c1 += hi;                      \
        c2 += (c1 < hi) ? 1 : 0;       \
    } while (0)

/*
 * Reference form of mul_add_c2: adds the product a * b into (c2,c1,c0)
 * twice.  tt holds the first pass's high word plus carry so that hi is
 * still the unmodified product high word for the second pass.
 */
#define mul_add_c2(a, b, c0, c1, c2)   \
    do {                               \
        BN_ULONG ta = (a), tb = (b);   \
        BN_ULONG lo, hi, tt;           \
        BN_UMULT_LOHI(lo, hi, ta, tb); \
        c0 += lo;                      \
        tt = hi + ((c0 < lo) ? 1 : 0); \
        c1 += tt;                      \
        c2 += (c1 < tt) ? 1 : 0;       \
        c0 += lo;                      \
        hi += (c0 < lo) ? 1 : 0;       \
        c1 += hi;                      \
        c2 += (c1 < hi) ? 1 : 0;       \
    } while (0)

/*
 * Reference form of sqr_add_c: adds (a)[i] squared into (c2,c1,c0).
 */
#define sqr_add_c(a, i, c0, c1, c2)    \
    do {                               \
        BN_ULONG ta = (a)[i];          \
        BN_ULONG lo, hi;               \
        BN_UMULT_LOHI(lo, hi, ta, ta); \
        c0 += lo;                      \
        hi += (c0 < lo) ? 1 : 0;       \
        c1 += hi;                      \
        c2 += (c1 < hi) ? 1 : 0;       \
    } while (0)
363 #else
/*
 * mul_add_c(a, b, c0, c1, c2)
 *
 * Adds the two-word product a * b into the three-word accumulator
 * (c2,c1,c0): c0 += low word, c1 += high word + carry, c2 += carry.
 *
 * b is bound to a memory operand and must be an addressable lvalue;
 * c0, c1 and c2 must be modifiable BN_ULONG lvalues.
 *
 * __asm__ (not asm) keeps the macro usable under strict -std= modes.
 */
#define mul_add_c(a, b, c0, c1, c2)                  \
    do {                                             \
        BN_ULONG t1, t2;                             \
        /* t2:t1 = a * b */                          \
        __asm__("mulq %3"                            \
                : "=a"(t1), "=d"(t2)                 \
                : "a"(a), "m"(b)                     \
                : "cc");                             \
        /* (c2,c1,c0) += t2:t1 */                    \
        __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0), "+r"(c1), "+r"(c2)       \
                : "r"(t1), "r"(t2), "g"(0)           \
                : "cc");                             \
    } while (0)
376 
/*
 * sqr_add_c(a, i, c0, c1, c2)
 *
 * Adds the two-word square of (a)[i] into the three-word accumulator
 * (c2,c1,c0).  c0, c1 and c2 must be modifiable BN_ULONG lvalues.
 *
 * The array argument is parenthesised before indexing so that an
 * expression argument (e.g. a pointer sum) is indexed as a whole.
 *
 * __asm__ (not asm) keeps the macro usable under strict -std= modes.
 */
#define sqr_add_c(a, i, c0, c1, c2)                  \
    do {                                             \
        BN_ULONG t1, t2;                             \
        /* t2:t1 = a[i] * a[i] */                    \
        __asm__("mulq %2"                            \
                : "=a"(t1), "=d"(t2)                 \
                : "a"((a)[i])                        \
                : "cc");                             \
        /* (c2,c1,c0) += t2:t1 */                    \
        __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0), "+r"(c1), "+r"(c2)       \
                : "r"(t1), "r"(t2), "g"(0)           \
                : "cc");                             \
    } while (0)
389 
/*
 * mul_add_c2(a, b, c0, c1, c2)
 *
 * Adds 2 * a * b into the three-word accumulator (c2,c1,c0).  The
 * product is computed once and then accumulated twice with the same
 * add/adc/adc sequence.
 *
 * b is bound to a memory operand and must be an addressable lvalue;
 * c0, c1 and c2 must be modifiable BN_ULONG lvalues.
 *
 * __asm__ (not asm) keeps the macro usable under strict -std= modes.
 */
#define mul_add_c2(a, b, c0, c1, c2)                 \
    do {                                             \
        BN_ULONG t1, t2;                             \
        /* t2:t1 = a * b */                          \
        __asm__("mulq %3"                            \
                : "=a"(t1), "=d"(t2)                 \
                : "a"(a), "m"(b)                     \
                : "cc");                             \
        /* (c2,c1,c0) += t2:t1, first time */        \
        __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0), "+r"(c1), "+r"(c2)       \
                : "r"(t1), "r"(t2), "g"(0)           \
                : "cc");                             \
        /* (c2,c1,c0) += t2:t1, second time */       \
        __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \
                : "+r"(c0), "+r"(c1), "+r"(c2)       \
                : "r"(t1), "r"(t2), "g"(0)           \
                : "cc");                             \
    } while (0)
406 #endif
407 
/*
 * sqr_add_c2(a, i, j, c0, c1, c2)
 *
 * Adds 2 * a[i] * a[j] into the three-word accumulator (c2,c1,c0) by
 * forwarding the two array elements to mul_add_c2.
 */
#define sqr_add_c2(a, i, j, c0, c1, c2) \
    mul_add_c2((a)[i], (a)[j], c0, c1, c2)
410 
/*
 * Fully unrolled 8x8-word multiplication: reads a[0..7] and b[0..7] and
 * writes the 16-word product to r[0..15].
 *
 * The product is built one output word ("column") at a time: column k
 * accumulates every a[i] * b[j] with i + j == k.  c1, c2 and c3 form a
 * three-word accumulator whose roles rotate: once a column is complete
 * its lowest word is stored to r[k] and that variable is zeroed so it
 * can serve as the top word for the next column.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[4], b[0], c2, c3, c1);
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    mul_add_c(a[0], b[4], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[0], b[5], c3, c1, c2);
    mul_add_c(a[1], b[4], c3, c1, c2);
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    mul_add_c(a[4], b[1], c3, c1, c2);
    mul_add_c(a[5], b[0], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    mul_add_c(a[6], b[0], c1, c2, c3);
    mul_add_c(a[5], b[1], c1, c2, c3);
    mul_add_c(a[4], b[2], c1, c2, c3);
    mul_add_c(a[3], b[3], c1, c2, c3);
    mul_add_c(a[2], b[4], c1, c2, c3);
    mul_add_c(a[1], b[5], c1, c2, c3);
    mul_add_c(a[0], b[6], c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    mul_add_c(a[0], b[7], c2, c3, c1);
    mul_add_c(a[1], b[6], c2, c3, c1);
    mul_add_c(a[2], b[5], c2, c3, c1);
    mul_add_c(a[3], b[4], c2, c3, c1);
    mul_add_c(a[4], b[3], c2, c3, c1);
    mul_add_c(a[5], b[2], c2, c3, c1);
    mul_add_c(a[6], b[1], c2, c3, c1);
    mul_add_c(a[7], b[0], c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    mul_add_c(a[7], b[1], c3, c1, c2);
    mul_add_c(a[6], b[2], c3, c1, c2);
    mul_add_c(a[5], b[3], c3, c1, c2);
    mul_add_c(a[4], b[4], c3, c1, c2);
    mul_add_c(a[3], b[5], c3, c1, c2);
    mul_add_c(a[2], b[6], c3, c1, c2);
    mul_add_c(a[1], b[7], c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    mul_add_c(a[2], b[7], c1, c2, c3);
    mul_add_c(a[3], b[6], c1, c2, c3);
    mul_add_c(a[4], b[5], c1, c2, c3);
    mul_add_c(a[5], b[4], c1, c2, c3);
    mul_add_c(a[6], b[3], c1, c2, c3);
    mul_add_c(a[7], b[2], c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    mul_add_c(a[7], b[3], c2, c3, c1);
    mul_add_c(a[6], b[4], c2, c3, c1);
    mul_add_c(a[5], b[5], c2, c3, c1);
    mul_add_c(a[4], b[6], c2, c3, c1);
    mul_add_c(a[3], b[7], c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    mul_add_c(a[4], b[7], c3, c1, c2);
    mul_add_c(a[5], b[6], c3, c1, c2);
    mul_add_c(a[6], b[5], c3, c1, c2);
    mul_add_c(a[7], b[4], c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    mul_add_c(a[7], b[5], c1, c2, c3);
    mul_add_c(a[6], b[6], c1, c2, c3);
    mul_add_c(a[5], b[7], c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    mul_add_c(a[6], b[7], c2, c3, c1);
    mul_add_c(a[7], b[6], c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; what is left in c1 afterwards is the top word */
    mul_add_c(a[7], b[7], c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
513 
/*
 * Fully unrolled 4x4-word multiplication: reads a[0..3] and b[0..3] and
 * writes the 8-word product to r[0..7].
 *
 * Column k accumulates every a[i] * b[j] with i + j == k in the
 * rotating three-word accumulator (c1, c2, c3); the lowest word of each
 * finished column is stored to r[k] and then zeroed for reuse as the
 * top word of the next column.
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    mul_add_c(a[0], b[0], c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    mul_add_c(a[0], b[1], c2, c3, c1);
    mul_add_c(a[1], b[0], c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    mul_add_c(a[2], b[0], c3, c1, c2);
    mul_add_c(a[1], b[1], c3, c1, c2);
    mul_add_c(a[0], b[2], c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    mul_add_c(a[0], b[3], c1, c2, c3);
    mul_add_c(a[1], b[2], c1, c2, c3);
    mul_add_c(a[2], b[1], c1, c2, c3);
    mul_add_c(a[3], b[0], c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    mul_add_c(a[3], b[1], c2, c3, c1);
    mul_add_c(a[2], b[2], c2, c3, c1);
    mul_add_c(a[1], b[3], c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    mul_add_c(a[2], b[3], c3, c1, c2);
    mul_add_c(a[3], b[2], c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; what is left in c2 afterwards is the top word */
    mul_add_c(a[3], b[3], c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
552 
/*
 * Fully unrolled 8-word squaring: reads a[0..7] and writes the 16-word
 * square to r[0..15].
 *
 * Column k accumulates the terms with i + j == k: the diagonal term
 * a[k/2]^2 via sqr_add_c (even k only) and each off-diagonal pair once,
 * doubled, via sqr_add_c2.  c1, c2 and c3 form a rotating three-word
 * accumulator; the lowest word of each finished column is stored to
 * r[k] and then zeroed for reuse as the top word of the next column.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    sqr_add_c2(a, 4, 0, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 5, 0, c3, c1, c2);
    sqr_add_c2(a, 4, 1, c3, c1, c2);
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6 */
    sqr_add_c(a, 3, c1, c2, c3);
    sqr_add_c2(a, 4, 2, c1, c2, c3);
    sqr_add_c2(a, 5, 1, c1, c2, c3);
    sqr_add_c2(a, 6, 0, c1, c2, c3);
    r[6] = c1;
    c1 = 0;
    /* column 7 */
    sqr_add_c2(a, 7, 0, c2, c3, c1);
    sqr_add_c2(a, 6, 1, c2, c3, c1);
    sqr_add_c2(a, 5, 2, c2, c3, c1);
    sqr_add_c2(a, 4, 3, c2, c3, c1);
    r[7] = c2;
    c2 = 0;
    /* column 8 */
    sqr_add_c(a, 4, c3, c1, c2);
    sqr_add_c2(a, 5, 3, c3, c1, c2);
    sqr_add_c2(a, 6, 2, c3, c1, c2);
    sqr_add_c2(a, 7, 1, c3, c1, c2);
    r[8] = c3;
    c3 = 0;
    /* column 9 */
    sqr_add_c2(a, 7, 2, c1, c2, c3);
    sqr_add_c2(a, 6, 3, c1, c2, c3);
    sqr_add_c2(a, 5, 4, c1, c2, c3);
    r[9] = c1;
    c1 = 0;
    /* column 10 */
    sqr_add_c(a, 5, c2, c3, c1);
    sqr_add_c2(a, 6, 4, c2, c3, c1);
    sqr_add_c2(a, 7, 3, c2, c3, c1);
    r[10] = c2;
    c2 = 0;
    /* column 11 */
    sqr_add_c2(a, 7, 4, c3, c1, c2);
    sqr_add_c2(a, 6, 5, c3, c1, c2);
    r[11] = c3;
    c3 = 0;
    /* column 12 */
    sqr_add_c(a, 6, c1, c2, c3);
    sqr_add_c2(a, 7, 5, c1, c2, c3);
    r[12] = c1;
    c1 = 0;
    /* column 13 */
    sqr_add_c2(a, 7, 6, c2, c3, c1);
    r[13] = c2;
    c2 = 0;
    /* column 14; what is left in c1 afterwards is the top word */
    sqr_add_c(a, 7, c3, c1, c2);
    r[14] = c3;
    r[15] = c1;
}
627 
/*
 * Fully unrolled 4-word squaring: reads a[0..3] and writes the 8-word
 * square to r[0..7].
 *
 * Column k accumulates the terms with i + j == k: the diagonal term
 * a[k/2]^2 via sqr_add_c (even k only) and each off-diagonal pair once,
 * doubled, via sqr_add_c2.  c1, c2 and c3 form a rotating three-word
 * accumulator; the lowest word of each finished column is stored to
 * r[k] and then zeroed for reuse as the top word of the next column.
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
{
    BN_ULONG c1, c2, c3;

    c1 = 0;
    c2 = 0;
    c3 = 0;
    /* column 0 */
    sqr_add_c(a, 0, c1, c2, c3);
    r[0] = c1;
    c1 = 0;
    /* column 1 */
    sqr_add_c2(a, 1, 0, c2, c3, c1);
    r[1] = c2;
    c2 = 0;
    /* column 2 */
    sqr_add_c(a, 1, c3, c1, c2);
    sqr_add_c2(a, 2, 0, c3, c1, c2);
    r[2] = c3;
    c3 = 0;
    /* column 3 */
    sqr_add_c2(a, 3, 0, c1, c2, c3);
    sqr_add_c2(a, 2, 1, c1, c2, c3);
    r[3] = c1;
    c1 = 0;
    /* column 4 */
    sqr_add_c(a, 2, c2, c3, c1);
    sqr_add_c2(a, 3, 1, c2, c3, c1);
    r[4] = c2;
    c2 = 0;
    /* column 5 */
    sqr_add_c2(a, 3, 2, c3, c1, c2);
    r[5] = c3;
    c3 = 0;
    /* column 6; what is left in c2 afterwards is the top word */
    sqr_add_c(a, 3, c1, c2, c3);
    r[6] = c1;
    r[7] = c2;
}
660 #endif
661