Lines Matching +full:0 +full:a
5 * this file except in compliance with the License. You can obtain a copy
25 * A. Well, that's because this code is basically a quick-n-dirty
31 * A. x86_64 features own ABI which I'm not familiar with. This is
37 * A. 'apps/openssl speed rsa dsa' output with no-asm:
70 * "m"(a), "+m"(r) is the way to favor DirectPath µ-code;
71 * "g"(0) let the compiler to decide where does it
74 # define mul_add(r,a,word,carry) do { \ argument
77 : "=a"(low),"=d"(high) \
78 : "a"(word),"m"(a) \
80 asm ("addq %2,%0; adcq %3,%1" \
82 : "a"(low),"g"(0) \
84 asm ("addq %2,%0; adcq %3,%1" \
86 : "r"(carry),"g"(0) \
89 } while (0)
91 # define mul(r,a,word,carry) do { \ argument
94 : "=a"(low),"=d"(high) \
95 : "a"(word),"g"(a) \
97 asm ("addq %2,%0; adcq %3,%1" \
99 : "a"(low),"g"(0) \
102 } while (0)
104 # define sqr(r0,r1,a) \ argument
106 : "=a"(r0),"=d"(r1) \
107 : "a"(a) \
113 BN_ULONG c1 = 0; in bn_mul_add_words()
115 if (num <= 0) in bn_mul_add_words()
119 mul_add(rp[0], ap[0], w, c1); in bn_mul_add_words()
128 mul_add(rp[0], ap[0], w, c1); in bn_mul_add_words()
129 if (--num == 0) in bn_mul_add_words()
132 if (--num == 0) in bn_mul_add_words()
143 BN_ULONG c1 = 0; in bn_mul_words()
145 if (num <= 0) in bn_mul_words()
149 mul(rp[0], ap[0], w, c1); in bn_mul_words()
158 mul(rp[0], ap[0], w, c1); in bn_mul_words()
159 if (--num == 0) in bn_mul_words()
162 if (--num == 0) in bn_mul_words()
169 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) in bn_sqr_words() argument
171 if (n <= 0) in bn_sqr_words()
175 sqr(r[0], r[1], a[0]); in bn_sqr_words()
176 sqr(r[2], r[3], a[1]); in bn_sqr_words()
177 sqr(r[4], r[5], a[2]); in bn_sqr_words()
178 sqr(r[6], r[7], a[3]); in bn_sqr_words()
179 a += 4; in bn_sqr_words()
184 sqr(r[0], r[1], a[0]); in bn_sqr_words()
185 if (--n == 0) in bn_sqr_words()
187 sqr(r[2], r[3], a[1]); in bn_sqr_words()
188 if (--n == 0) in bn_sqr_words()
190 sqr(r[4], r[5], a[2]); in bn_sqr_words()
198 asm("divq %4":"=a"(ret), "=d"(waste) in bn_div_words()
199 : "a"(l), "d"(h), "r"(d) in bn_div_words()
209 size_t i = 0; in bn_add_words()
211 if (n <= 0) in bn_add_words()
212 return 0; in bn_add_words()
214 asm volatile (" subq %0,%0 \n" /* clear carry */ in bn_add_words()
217 "1: movq (%4,%2,8),%0 \n" in bn_add_words()
218 " adcq (%5,%2,8),%0 \n" in bn_add_words()
219 " movq %0,(%3,%2,8) \n" in bn_add_words()
223 " sbbq %0,%0 \n" in bn_add_words()
236 size_t i = 0; in bn_sub_words()
238 if (n <= 0) in bn_sub_words()
239 return 0; in bn_sub_words()
241 asm volatile (" subq %0,%0 \n" /* clear borrow */ in bn_sub_words()
244 "1: movq (%4,%2,8),%0 \n" in bn_sub_words()
245 " sbbq (%5,%2,8),%0 \n" in bn_sub_words()
246 " movq %0,(%3,%2,8) \n" in bn_sub_words()
250 " sbbq %0,%0 \n" in bn_sub_words()
259 # define BN_MASK2 0xffffffffffffffffL
260 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) in bn_sub_words() argument
263 int c = 0; in bn_sub_words()
265 if (n <= 0) in bn_sub_words()
266 return (BN_ULONG)0; in bn_sub_words()
269 t1 = a[0]; in bn_sub_words()
270 t2 = b[0]; in bn_sub_words()
271 r[0] = (t1 - t2 - c) & BN_MASK2; in bn_sub_words()
274 if (--n <= 0) in bn_sub_words()
277 t1 = a[1]; in bn_sub_words()
282 if (--n <= 0) in bn_sub_words()
285 t1 = a[2]; in bn_sub_words()
290 if (--n <= 0) in bn_sub_words()
293 t1 = a[3]; in bn_sub_words()
298 if (--n <= 0) in bn_sub_words()
301 a += 4; in bn_sub_words()
309 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
310 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
311 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
313 * sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number
321 # if 0
323 # define mul_add_c(a,b,c0,c1,c2) do { \
324 BN_ULONG ta = (a), tb = (b); \
327 c0 += lo; hi += (c0<lo)?1:0; \
328 c1 += hi; c2 += (c1<hi)?1:0; \
329 } while(0)
331 # define mul_add_c2(a,b,c0,c1,c2) do { \
332 BN_ULONG ta = (a), tb = (b); \
335 c0 += lo; tt = hi+((c0<lo)?1:0); \
336 c1 += tt; c2 += (c1<tt)?1:0; \
337 c0 += lo; hi += (c0<lo)?1:0; \
338 c1 += hi; c2 += (c1<hi)?1:0; \
339 } while(0)
341 # define sqr_add_c(a,i,c0,c1,c2) do { \
342 BN_ULONG ta = (a)[i]; \
345 c0 += lo; hi += (c0<lo)?1:0; \
346 c1 += hi; c2 += (c1<hi)?1:0; \
347 } while(0)
349 # define mul_add_c(a,b,c0,c1,c2) do { \ argument
352 : "=a"(t1),"=d"(t2) \
353 : "a"(a),"m"(b) \
355 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
357 : "r"(t1),"r"(t2),"g"(0) \
359 } while (0)
361 # define sqr_add_c(a,i,c0,c1,c2) do { \ argument
364 : "=a"(t1),"=d"(t2) \
365 : "a"(a[i]) \
367 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
369 : "r"(t1),"r"(t2),"g"(0) \
371 } while (0)
373 # define mul_add_c2(a,b,c0,c1,c2) do { \ argument
376 : "=a"(t1),"=d"(t2) \
377 : "a"(a),"m"(b) \
379 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
381 : "r"(t1),"r"(t2),"g"(0) \
383 asm ("addq %3,%0; adcq %4,%1; adcq %5,%2" \
385 : "r"(t1),"r"(t2),"g"(0) \
387 } while (0)
390 # define sqr_add_c2(a,i,j,c0,c1,c2) \ argument
391 mul_add_c2((a)[i],(a)[j],c0,c1,c2)
393 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) in bn_mul_comba8() argument
397 c1 = 0; in bn_mul_comba8()
398 c2 = 0; in bn_mul_comba8()
399 c3 = 0; in bn_mul_comba8()
400 mul_add_c(a[0], b[0], c1, c2, c3); in bn_mul_comba8()
401 r[0] = c1; in bn_mul_comba8()
402 c1 = 0; in bn_mul_comba8()
403 mul_add_c(a[0], b[1], c2, c3, c1); in bn_mul_comba8()
404 mul_add_c(a[1], b[0], c2, c3, c1); in bn_mul_comba8()
406 c2 = 0; in bn_mul_comba8()
407 mul_add_c(a[2], b[0], c3, c1, c2); in bn_mul_comba8()
408 mul_add_c(a[1], b[1], c3, c1, c2); in bn_mul_comba8()
409 mul_add_c(a[0], b[2], c3, c1, c2); in bn_mul_comba8()
411 c3 = 0; in bn_mul_comba8()
412 mul_add_c(a[0], b[3], c1, c2, c3); in bn_mul_comba8()
413 mul_add_c(a[1], b[2], c1, c2, c3); in bn_mul_comba8()
414 mul_add_c(a[2], b[1], c1, c2, c3); in bn_mul_comba8()
415 mul_add_c(a[3], b[0], c1, c2, c3); in bn_mul_comba8()
417 c1 = 0; in bn_mul_comba8()
418 mul_add_c(a[4], b[0], c2, c3, c1); in bn_mul_comba8()
419 mul_add_c(a[3], b[1], c2, c3, c1); in bn_mul_comba8()
420 mul_add_c(a[2], b[2], c2, c3, c1); in bn_mul_comba8()
421 mul_add_c(a[1], b[3], c2, c3, c1); in bn_mul_comba8()
422 mul_add_c(a[0], b[4], c2, c3, c1); in bn_mul_comba8()
424 c2 = 0; in bn_mul_comba8()
425 mul_add_c(a[0], b[5], c3, c1, c2); in bn_mul_comba8()
426 mul_add_c(a[1], b[4], c3, c1, c2); in bn_mul_comba8()
427 mul_add_c(a[2], b[3], c3, c1, c2); in bn_mul_comba8()
428 mul_add_c(a[3], b[2], c3, c1, c2); in bn_mul_comba8()
429 mul_add_c(a[4], b[1], c3, c1, c2); in bn_mul_comba8()
430 mul_add_c(a[5], b[0], c3, c1, c2); in bn_mul_comba8()
432 c3 = 0; in bn_mul_comba8()
433 mul_add_c(a[6], b[0], c1, c2, c3); in bn_mul_comba8()
434 mul_add_c(a[5], b[1], c1, c2, c3); in bn_mul_comba8()
435 mul_add_c(a[4], b[2], c1, c2, c3); in bn_mul_comba8()
436 mul_add_c(a[3], b[3], c1, c2, c3); in bn_mul_comba8()
437 mul_add_c(a[2], b[4], c1, c2, c3); in bn_mul_comba8()
438 mul_add_c(a[1], b[5], c1, c2, c3); in bn_mul_comba8()
439 mul_add_c(a[0], b[6], c1, c2, c3); in bn_mul_comba8()
441 c1 = 0; in bn_mul_comba8()
442 mul_add_c(a[0], b[7], c2, c3, c1); in bn_mul_comba8()
443 mul_add_c(a[1], b[6], c2, c3, c1); in bn_mul_comba8()
444 mul_add_c(a[2], b[5], c2, c3, c1); in bn_mul_comba8()
445 mul_add_c(a[3], b[4], c2, c3, c1); in bn_mul_comba8()
446 mul_add_c(a[4], b[3], c2, c3, c1); in bn_mul_comba8()
447 mul_add_c(a[5], b[2], c2, c3, c1); in bn_mul_comba8()
448 mul_add_c(a[6], b[1], c2, c3, c1); in bn_mul_comba8()
449 mul_add_c(a[7], b[0], c2, c3, c1); in bn_mul_comba8()
451 c2 = 0; in bn_mul_comba8()
452 mul_add_c(a[7], b[1], c3, c1, c2); in bn_mul_comba8()
453 mul_add_c(a[6], b[2], c3, c1, c2); in bn_mul_comba8()
454 mul_add_c(a[5], b[3], c3, c1, c2); in bn_mul_comba8()
455 mul_add_c(a[4], b[4], c3, c1, c2); in bn_mul_comba8()
456 mul_add_c(a[3], b[5], c3, c1, c2); in bn_mul_comba8()
457 mul_add_c(a[2], b[6], c3, c1, c2); in bn_mul_comba8()
458 mul_add_c(a[1], b[7], c3, c1, c2); in bn_mul_comba8()
460 c3 = 0; in bn_mul_comba8()
461 mul_add_c(a[2], b[7], c1, c2, c3); in bn_mul_comba8()
462 mul_add_c(a[3], b[6], c1, c2, c3); in bn_mul_comba8()
463 mul_add_c(a[4], b[5], c1, c2, c3); in bn_mul_comba8()
464 mul_add_c(a[5], b[4], c1, c2, c3); in bn_mul_comba8()
465 mul_add_c(a[6], b[3], c1, c2, c3); in bn_mul_comba8()
466 mul_add_c(a[7], b[2], c1, c2, c3); in bn_mul_comba8()
468 c1 = 0; in bn_mul_comba8()
469 mul_add_c(a[7], b[3], c2, c3, c1); in bn_mul_comba8()
470 mul_add_c(a[6], b[4], c2, c3, c1); in bn_mul_comba8()
471 mul_add_c(a[5], b[5], c2, c3, c1); in bn_mul_comba8()
472 mul_add_c(a[4], b[6], c2, c3, c1); in bn_mul_comba8()
473 mul_add_c(a[3], b[7], c2, c3, c1); in bn_mul_comba8()
475 c2 = 0; in bn_mul_comba8()
476 mul_add_c(a[4], b[7], c3, c1, c2); in bn_mul_comba8()
477 mul_add_c(a[5], b[6], c3, c1, c2); in bn_mul_comba8()
478 mul_add_c(a[6], b[5], c3, c1, c2); in bn_mul_comba8()
479 mul_add_c(a[7], b[4], c3, c1, c2); in bn_mul_comba8()
481 c3 = 0; in bn_mul_comba8()
482 mul_add_c(a[7], b[5], c1, c2, c3); in bn_mul_comba8()
483 mul_add_c(a[6], b[6], c1, c2, c3); in bn_mul_comba8()
484 mul_add_c(a[5], b[7], c1, c2, c3); in bn_mul_comba8()
486 c1 = 0; in bn_mul_comba8()
487 mul_add_c(a[6], b[7], c2, c3, c1); in bn_mul_comba8()
488 mul_add_c(a[7], b[6], c2, c3, c1); in bn_mul_comba8()
490 c2 = 0; in bn_mul_comba8()
491 mul_add_c(a[7], b[7], c3, c1, c2); in bn_mul_comba8()
496 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) in bn_mul_comba4() argument
500 c1 = 0; in bn_mul_comba4()
501 c2 = 0; in bn_mul_comba4()
502 c3 = 0; in bn_mul_comba4()
503 mul_add_c(a[0], b[0], c1, c2, c3); in bn_mul_comba4()
504 r[0] = c1; in bn_mul_comba4()
505 c1 = 0; in bn_mul_comba4()
506 mul_add_c(a[0], b[1], c2, c3, c1); in bn_mul_comba4()
507 mul_add_c(a[1], b[0], c2, c3, c1); in bn_mul_comba4()
509 c2 = 0; in bn_mul_comba4()
510 mul_add_c(a[2], b[0], c3, c1, c2); in bn_mul_comba4()
511 mul_add_c(a[1], b[1], c3, c1, c2); in bn_mul_comba4()
512 mul_add_c(a[0], b[2], c3, c1, c2); in bn_mul_comba4()
514 c3 = 0; in bn_mul_comba4()
515 mul_add_c(a[0], b[3], c1, c2, c3); in bn_mul_comba4()
516 mul_add_c(a[1], b[2], c1, c2, c3); in bn_mul_comba4()
517 mul_add_c(a[2], b[1], c1, c2, c3); in bn_mul_comba4()
518 mul_add_c(a[3], b[0], c1, c2, c3); in bn_mul_comba4()
520 c1 = 0; in bn_mul_comba4()
521 mul_add_c(a[3], b[1], c2, c3, c1); in bn_mul_comba4()
522 mul_add_c(a[2], b[2], c2, c3, c1); in bn_mul_comba4()
523 mul_add_c(a[1], b[3], c2, c3, c1); in bn_mul_comba4()
525 c2 = 0; in bn_mul_comba4()
526 mul_add_c(a[2], b[3], c3, c1, c2); in bn_mul_comba4()
527 mul_add_c(a[3], b[2], c3, c1, c2); in bn_mul_comba4()
529 c3 = 0; in bn_mul_comba4()
530 mul_add_c(a[3], b[3], c1, c2, c3); in bn_mul_comba4()
535 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) in bn_sqr_comba8() argument
539 c1 = 0; in bn_sqr_comba8()
540 c2 = 0; in bn_sqr_comba8()
541 c3 = 0; in bn_sqr_comba8()
542 sqr_add_c(a, 0, c1, c2, c3); in bn_sqr_comba8()
543 r[0] = c1; in bn_sqr_comba8()
544 c1 = 0; in bn_sqr_comba8()
545 sqr_add_c2(a, 1, 0, c2, c3, c1); in bn_sqr_comba8()
547 c2 = 0; in bn_sqr_comba8()
548 sqr_add_c(a, 1, c3, c1, c2); in bn_sqr_comba8()
549 sqr_add_c2(a, 2, 0, c3, c1, c2); in bn_sqr_comba8()
551 c3 = 0; in bn_sqr_comba8()
552 sqr_add_c2(a, 3, 0, c1, c2, c3); in bn_sqr_comba8()
553 sqr_add_c2(a, 2, 1, c1, c2, c3); in bn_sqr_comba8()
555 c1 = 0; in bn_sqr_comba8()
556 sqr_add_c(a, 2, c2, c3, c1); in bn_sqr_comba8()
557 sqr_add_c2(a, 3, 1, c2, c3, c1); in bn_sqr_comba8()
558 sqr_add_c2(a, 4, 0, c2, c3, c1); in bn_sqr_comba8()
560 c2 = 0; in bn_sqr_comba8()
561 sqr_add_c2(a, 5, 0, c3, c1, c2); in bn_sqr_comba8()
562 sqr_add_c2(a, 4, 1, c3, c1, c2); in bn_sqr_comba8()
563 sqr_add_c2(a, 3, 2, c3, c1, c2); in bn_sqr_comba8()
565 c3 = 0; in bn_sqr_comba8()
566 sqr_add_c(a, 3, c1, c2, c3); in bn_sqr_comba8()
567 sqr_add_c2(a, 4, 2, c1, c2, c3); in bn_sqr_comba8()
568 sqr_add_c2(a, 5, 1, c1, c2, c3); in bn_sqr_comba8()
569 sqr_add_c2(a, 6, 0, c1, c2, c3); in bn_sqr_comba8()
571 c1 = 0; in bn_sqr_comba8()
572 sqr_add_c2(a, 7, 0, c2, c3, c1); in bn_sqr_comba8()
573 sqr_add_c2(a, 6, 1, c2, c3, c1); in bn_sqr_comba8()
574 sqr_add_c2(a, 5, 2, c2, c3, c1); in bn_sqr_comba8()
575 sqr_add_c2(a, 4, 3, c2, c3, c1); in bn_sqr_comba8()
577 c2 = 0; in bn_sqr_comba8()
578 sqr_add_c(a, 4, c3, c1, c2); in bn_sqr_comba8()
579 sqr_add_c2(a, 5, 3, c3, c1, c2); in bn_sqr_comba8()
580 sqr_add_c2(a, 6, 2, c3, c1, c2); in bn_sqr_comba8()
581 sqr_add_c2(a, 7, 1, c3, c1, c2); in bn_sqr_comba8()
583 c3 = 0; in bn_sqr_comba8()
584 sqr_add_c2(a, 7, 2, c1, c2, c3); in bn_sqr_comba8()
585 sqr_add_c2(a, 6, 3, c1, c2, c3); in bn_sqr_comba8()
586 sqr_add_c2(a, 5, 4, c1, c2, c3); in bn_sqr_comba8()
588 c1 = 0; in bn_sqr_comba8()
589 sqr_add_c(a, 5, c2, c3, c1); in bn_sqr_comba8()
590 sqr_add_c2(a, 6, 4, c2, c3, c1); in bn_sqr_comba8()
591 sqr_add_c2(a, 7, 3, c2, c3, c1); in bn_sqr_comba8()
593 c2 = 0; in bn_sqr_comba8()
594 sqr_add_c2(a, 7, 4, c3, c1, c2); in bn_sqr_comba8()
595 sqr_add_c2(a, 6, 5, c3, c1, c2); in bn_sqr_comba8()
597 c3 = 0; in bn_sqr_comba8()
598 sqr_add_c(a, 6, c1, c2, c3); in bn_sqr_comba8()
599 sqr_add_c2(a, 7, 5, c1, c2, c3); in bn_sqr_comba8()
601 c1 = 0; in bn_sqr_comba8()
602 sqr_add_c2(a, 7, 6, c2, c3, c1); in bn_sqr_comba8()
604 c2 = 0; in bn_sqr_comba8()
605 sqr_add_c(a, 7, c3, c1, c2); in bn_sqr_comba8()
610 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) in bn_sqr_comba4() argument
614 c1 = 0; in bn_sqr_comba4()
615 c2 = 0; in bn_sqr_comba4()
616 c3 = 0; in bn_sqr_comba4()
617 sqr_add_c(a, 0, c1, c2, c3); in bn_sqr_comba4()
618 r[0] = c1; in bn_sqr_comba4()
619 c1 = 0; in bn_sqr_comba4()
620 sqr_add_c2(a, 1, 0, c2, c3, c1); in bn_sqr_comba4()
622 c2 = 0; in bn_sqr_comba4()
623 sqr_add_c(a, 1, c3, c1, c2); in bn_sqr_comba4()
624 sqr_add_c2(a, 2, 0, c3, c1, c2); in bn_sqr_comba4()
626 c3 = 0; in bn_sqr_comba4()
627 sqr_add_c2(a, 3, 0, c1, c2, c3); in bn_sqr_comba4()
628 sqr_add_c2(a, 2, 1, c1, c2, c3); in bn_sqr_comba4()
630 c1 = 0; in bn_sqr_comba4()
631 sqr_add_c(a, 2, c2, c3, c1); in bn_sqr_comba4()
632 sqr_add_c2(a, 3, 1, c2, c3, c1); in bn_sqr_comba4()
634 c2 = 0; in bn_sqr_comba4()
635 sqr_add_c2(a, 3, 2, c3, c1, c2); in bn_sqr_comba4()
637 c3 = 0; in bn_sqr_comba4()
638 sqr_add_c(a, 3, c1, c2, c3); in bn_sqr_comba4()