Lines Matching +full:3 +full:d1
2 # SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
101 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
148 .long 2,2,2,3,2,0,2,1
150 .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
153 .long 0,1,1,2,2,3,7,7
179 my ($d1,$d2,$d3, $r0,$r1,$s1)=("%r8","%r9","%rdi","%r11","%r12","%r13");
194 mov %rdx,$d1
204 adc %rdx,$d1
208 mov $d1,$h1
219 and \$3,$h2
230 # unsigned __int64 h[3]; # current hash value base 2^64
246 &declare_function("poly1305_init_x86_64", 32, 3);
273 $code.=<<___ if (!$kernel && $avx>3);
380 &declare_function("poly1305_emit_x86_64", 32, 3);
413 # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
419 my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
450 mov $h0,$d1
455 shr \$26,$d1
461 and $d1#d,%eax
468 shr \$26,$d1
476 or $d1,%rax
480 mov %eax,`16*3+0-64`($ctx)
482 mov %edx,`16*3+4-64`($ctx)
485 mov $h1,$d1
491 shr \$14,$d1
493 and $d1#d,%eax
500 shr \$26,$d1
506 or %rax,$d1
507 mov $d1#d,`16*7+0-64`($ctx)
508 lea ($d1,$d1,4),$d1 # *5
511 mov $d1#d,`16*8+0-64`($ctx)
515 call __poly1305_block # r^3
517 mov \$0x3ffffff,%eax # save r^3 base 2^26
518 mov $h0,$d1
520 shr \$26,$d1
524 and $d1#d,%edx
527 shr \$26,$d1
532 or $d1,%rax
534 mov %eax,`16*3+12-64`($ctx)
536 mov $h1,$d1
540 shr \$14,$d1
541 and $d1#d,%edx
544 shr \$26,$d1
549 or %rax,$d1
550 mov $d1#d,`16*7+12-64`($ctx)
551 lea ($d1,$d1,4),$d1 # *5
552 mov $d1#d,`16*8+12-64`($ctx)
558 mov $h0,$d1
560 shr \$26,$d1
564 and $d1#d,%edx
567 shr \$26,$d1
572 or $d1,%rax
574 mov %eax,`16*3+8-64`($ctx)
576 mov $h1,$d1
580 shr \$14,$d1
581 and $d1#d,%edx
584 shr \$26,$d1
589 or %rax,$d1
590 mov $d1#d,`16*7+8-64`($ctx)
591 lea ($d1,$d1,4),$d1 # *5
592 mov $d1#d,`16*8+8-64`($ctx)
638 mov 0($ctx),$d1 # load hash value
646 mov $d1#d,$h0#d
647 and \$`-1*(1<<31)`,$d1
652 shr \$6,$d1
654 add $d1,$h0
660 mov $h2,$d1
661 shl \$40,$d1
663 add $d1,$h1
667 mov $h2,$d1
669 shr \$2,$d1
670 and \$3,$h2
671 add $d2,$d1 # =*5
672 add $d1,$h0
706 and \$0x3ffffff,$h1 # h[3]
800 mov $h1,$d1
804 shl \$12,$d1
807 or $d1,$h0
811 and \$0x3ffffff,$h1 # h[3]
847 vmovd 4*3($ctx),$H3
880 vmovdqu `16*3`($ctx),$D4 # preload r0^2
881 lea `16*3+64`($ctx),$ctx # size optimization
887 vmovdqu 16*3($inp),$T1
894 vpunpcklqdq $T3,$T2,$T3 # 2:3
903 vpand $MASK,$T3,$T3 # 3
909 vmovdqu `16*1-64`($ctx),$D1
915 vpshufd \$0xEE,$D1,$D4
916 vmovdqu `16*3-64`($ctx),$D0
917 vpshufd \$0x44,$D1,$D1
919 vmovdqa $D1,0x10(%rsp)
921 vmovdqu `16*4-64`($ctx),$D1
930 vpshufd \$0xEE,$D1,$D3
932 vpshufd \$0x44,$D1,$D1
934 vmovdqa $D1,0x40(%rsp)
936 vmovdqu `16*7-64`($ctx),$D1
945 vpshufd \$0xEE,$D1,$D4
946 vpshufd \$0x44,$D1,$D1
948 vmovdqa $D1,0x70(%rsp)
961 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
964 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
967 # Note that we start with inp[2:3]*r^2. This is because it
973 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
980 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1002 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1013 vpaddq $H0,$D1,$D1 # d1 += h4*s2
1026 vpaddq $H1,$D1,$D1 # d1 += h3*s3
1032 vpaddq $T2,$D1,$D1 # d1 += h2*s4
1047 vpunpcklqdq $H3,$H2,$H3 # 2:3
1058 vpand $MASK,$H3,$H3 # 3
1078 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1084 vpaddq $T1,$D1,$D1
1103 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1114 vpaddq $T0,$D1,$D1 # d1 += h4*s2
1129 vpaddq $T2,$D1,$D1 # d1 += h3*s3
1130 vmovdqu 16*3($inp),$T1 #
1136 vpaddq $H2,$D1,$D1 # d1 += h2*s4
1148 vpunpcklqdq $T3,$T2,$T3 # 2:3
1160 vpand $MASK,$T3,$T3 # 3
1173 vpaddq $D0,$D1,$H1 # h0 -> h1
1178 vpsrlq \$26,$H1,$D1
1180 vpaddq $D1,$H2,$H2 # h1 -> h2
1202 # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
1224 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1230 vpmuludq $T1,$D4,$D1 # d1 = h1*r0
1239 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
1243 vpaddq $H2,$D1,$D1 # d1 += h0*r1
1256 vpaddq $H1,$D1,$D1 # d1 += h4*s2
1270 vpaddq $H0,$D1,$D1 # d1 += h3*s3
1281 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
1294 vpunpcklqdq $H3,$H2,$H3 # 2:3
1303 vpand $MASK,$H3,$H3 # 3
1314 # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
1319 vpaddq $T1,$D1,$D1 # d1 += h1*r0
1333 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
1337 vpaddq $T2,$D1,$D1 # d1 += h0*r1
1350 vpaddq $T1,$D1,$D1 # d1 += h4*s2
1364 vpaddq $T0,$D1,$D1 # d1 += h3*s3
1375 vpaddq $T1,$D1,$D1 # d1 += h2*s4
1385 vpsrldq \$8,$D1,$T1
1391 vpaddq $T1,$D1,$D1
1403 vpaddq $H0,$D1,$D1 # h0 -> h1
1408 vpsrlq \$26,$D1,$H1
1409 vpand $MASK,$D1,$D1
1422 vpaddq $H0,$D1,$D1 # h0 -> h1
1429 vmovd $D1,`4*1-48-64`($ctx)
1431 vmovd $D3,`4*3-48-64`($ctx)
1459 &declare_function("poly1305_emit_avx", 32, 3);
1488 and \$3,%r10
1516 my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
1560 mov 0($ctx),$d1 # load hash value
1568 mov $d1#d,$h0#d
1569 and \$`-1*(1<<31)`,$d1
1574 shr \$6,$d1
1576 add $d1,$h0
1582 mov $h2,$d1
1583 shl \$40,$d1
1585 add $d1,$h1
1589 mov $h2,$d1
1591 shr \$2,$d1
1592 and \$3,$h2
1593 add $d2,$d1 # =*5
1594 add $d1,$h0
1634 and \$0x3ffffff,$h1 # h[3]
1733 mov $h1,$d1
1737 shl \$12,$d1
1740 or $d1,$h0
1744 and \$0x3ffffff,$h1 # h[3]
1791 vmovd 4*3($ctx),%x#$H3
1838 vmovdqu `16*3-64`($ctx),%x#$D0
1839 vmovdqu `16*4-64`($ctx),%x#$D1
1851 vpermd $D1,$T0,$D1
1856 vmovdqa $D1,0x80-0x90(%rax)
1870 vinserti128 \$1,16*3($inp),$T1,$T1
1876 vpunpcklqdq $T3,$T2,$T2 # 2:3
1886 vpand $MASK,$T3,$T3 # 3
1898 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3
1900 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
1909 vmovdqa `32*3`(%rsp),$T2 # r2^4
1917 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
1926 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
1933 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
1937 vpaddq $T4,$D1,$D1 # d1 += h0*r1
1948 vpaddq $H2,$D1,$D1 # d1 += h1*r0
1960 vpaddq $H2,$D1,$D1 # d1 += h4*s2
1966 vinserti128 \$1,16*3($inp),$T1,$T1
1977 vpaddq $T4,$D1,$D1 # d1 += h3*s3
1986 vpunpcklqdq $T3,$T2,$T3 # 2:3
2002 vpaddq $D0,$D1,$H1 # h0 -> h1
2009 vpsrlq \$26,$H1,$D1
2011 vpaddq $D1,$H2,$H2 # h1 -> h2
2039 vpand $MASK,$T3,$T3 # 3
2059 vmovdqu `32*3+4`(%rsp),$T2 # r2^4
2068 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2072 vpaddq $T4,$D1,$D1 # d1 += h0*r1
2083 vpaddq $H2,$D1,$D1 # d1 += h1*r0
2092 vpaddq $H2,$D1,$D1 # d1 += h4*s2
2105 vpaddq $T4,$D1,$D1 # d1 += h3*s3
2121 vpsrldq \$8,$D1,$T1
2126 vpaddq $T1,$D1,$D1
2135 vpermq \$0x2,$D1,$T1
2140 vpaddq $T1,$D1,$D1
2152 vpaddq $D0,$D1,$H1 # h0 -> h1
2157 vpsrlq \$26,$H1,$D1
2159 vpaddq $D1,$H2,$H2 # h1 -> h2
2180 vmovd %x#$H3,`4*3-48-64`($ctx)
2212 map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4));
2250 vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1}
2253 vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2}
2261 vpermd $D1,$T2,$R1
2285 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2
2291 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1
2304 vpaddq $M1,$D1,$D1 # d1 += r1'*r0
2316 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4
2329 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3
2340 vpaddq $M1,$D1,$D1 # d1 += r4'*5*r2
2358 vpaddq $M0,$D1,$D1 # d0 -> d1
2363 vpsrlq \$26,$D1,$M1
2364 vpandq $MASK,$D1,$D1
2365 vpaddq $M1,$D2,$D2 # d1 -> d2
2377 vpaddq $M0,$D1,$D1 # d0 -> d1
2399 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4---
2406 vpermd $D1,$M0,${R1}{%k1}
2431 #vpandq $MASK,$T3,$T3 # 3
2445 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5
2447 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3
2457 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
2466 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2
2474 vpandq $MASK,$T3,$T3 # 3
2475 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2492 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2512 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2523 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2530 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2554 vpsrlq \$26,$H1,$D1
2556 vpaddq $D1,$H2,$H2 # h1 -> h2
2583 #vpandq $MASK,$T3,$T3 # 3
2616 vpmuludq $H2,$S4,$D1 # d1 = h2*s4
2617 vpandq $MASK,$T3,$T3 # 3
2632 vpaddq $M1,$D1,$D1 # d1 += h0*r1
2651 vpaddq $M1,$D1,$D1 # d1 += h1*r0
2654 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1
2663 vpaddq $M1,$D1,$D1 # d1 += h3*s3
2670 vpaddq $M1,$D1,$H1 # h1 = d1 + h4*s2
2680 vpermq \$0xb1,$H1,$D1
2685 vpaddq $D1,$H1,$H1
2692 vpermq \$0x2,$H1,$D1
2697 vpaddq $D1,$H1,$H1
2703 vextracti64x4 \$0x1,$H1,%y#$D1
2708 vpaddq $D1,$H1,${H1}{%k3}{z}
2712 map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK));
2726 vpunpcklqdq $T3,$T2,$T2 # 2:3
2733 vpsrlq \$26,$H1,$D1
2737 vpaddq $D1,$H2,$H2 # h1 -> h2
2759 vpand $MASK,$T3,$T3 # 3
2771 vmovd %x#$H3,`4*3-48-64`($ctx)
2826 if (!$kernel && $avx>3) {
2843 # unsigned __int64 h[3]; # current hash value base 2^44
2845 # unsigned __int64 r[3]; # key value base 2^44
2846 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4];
2849 # # memory, R[3] is R[1]*20
2852 .type poly1305_init_base2_44,\@function,3
2913 # if powers of the key are not calculated yet, process up to 3
2918 mov \$3,%rax
3051 test \$3,$len # is length 4*n+2?
3075 # at this point 64-bit lanes are ordered as 3-1-2-0
3186 vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4
3190 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4
3202 test \$3,$len # is length 4*n+2?
3492 vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers
3831 .type poly1305_emit_base2_44,\@function,3