Lines Matching +full:0 +full:x17
20 adrp x17,OPENSSL_armv8_rsa_neonized
21 ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
33 add x29,sp,#0
38 ldr x9,[x2],#8 // bp[0]
40 ldp x7,x8,[x1],#16 // ap[0..1]
44 ldp x13,x14,[x3],#16 // np[0..1]
46 mul x6,x7,x9 // ap[0]*bp[0]
49 mul x10,x8,x9 // ap[1]*bp[0]
52 mul x15,x6,x4 // "tp[0]"*n0
55 // (*) mul x12,x13,x15 // np[0]*m1
69 umulh x17,x14,x15
81 mul x10,x8,x9 // ap[j]*bp[0]
82 adc x13,x17,xzr
88 umulh x17,x14,x15
99 adc x13,x17,xzr
111 ldr x23,[sp] // tp[0]
114 mul x6,x7,x9 // ap[0]*bp[i]
126 // (*) mul x12,x13,x15 // np[0]*m1
131 umulh x17,x14,x15
144 adc x13,x17,xzr
153 umulh x17,x14,x15
166 adcs x13,x17,x19
183 ldr x23,[sp] // tp[0]
185 ldr x14,[x3],#8 // np[0]
200 ldr x23,[sp] // tp[0]
202 ldr x8,[x0],#8 // rp[0]
280 umlal v6.2d,v28.2s,v0.s[0]
287 umlal v10.2d,v28.2s,v1.s[0]
290 st1 {v28.2s},[sp] // put aside smashed b[8*i+0]
295 umlal v6.2d,v29.2s,v2.s[0]
301 umlal v10.2d,v29.2s,v3.s[0]
309 ins v7.d[0],v16.d[0]
310 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0]
311 umlal v7.2d,v28.2s,v0.s[0]
319 umlal v11.2d,v28.2s,v1.s[0]
327 umlal v7.2d,v29.2s,v2.s[0]
333 umlal v11.2d,v29.2s,v3.s[0]
341 ins v8.d[0],v16.d[0]
343 umlal v8.2d,v28.2s,v0.s[0]
351 umlal v12.2d,v28.2s,v1.s[0]
359 umlal v8.2d,v29.2s,v2.s[0]
365 umlal v12.2d,v29.2s,v3.s[0]
373 ins v9.d[0],v16.d[0]
375 umlal v9.2d,v28.2s,v0.s[0]
383 umlal v13.2d,v28.2s,v1.s[0]
391 umlal v9.2d,v29.2s,v2.s[0]
397 umlal v13.2d,v29.2s,v3.s[0]
405 ins v10.d[0],v16.d[0]
407 umlal v10.2d,v28.2s,v0.s[0]
415 umlal v6.2d,v28.2s,v1.s[0]
423 umlal v10.2d,v29.2s,v2.s[0]
429 umlal v6.2d,v29.2s,v3.s[0]
437 ins v11.d[0],v16.d[0]
439 umlal v11.2d,v28.2s,v0.s[0]
447 umlal v7.2d,v28.2s,v1.s[0]
455 umlal v11.2d,v29.2s,v2.s[0]
461 umlal v7.2d,v29.2s,v3.s[0]
469 ins v12.d[0],v16.d[0]
471 umlal v12.2d,v28.2s,v0.s[0]
479 umlal v8.2d,v28.2s,v1.s[0]
487 umlal v12.2d,v29.2s,v2.s[0]
493 umlal v8.2d,v29.2s,v3.s[0]
501 ins v13.d[0],v16.d[0]
503 umlal v13.2d,v28.2s,v0.s[0]
511 umlal v9.2d,v28.2s,v1.s[0]
518 ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
519 umlal v13.2d,v29.2s,v2.s[0]
527 umlal v9.2d,v29.2s,v3.s[0]
532 ins v13.d[1],v15.d[0]
544 umlal v6.2d,v28.2s,v0.s[0]
547 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0]
554 umlal v10.2d,v28.2s,v1.s[0]
559 umlal v6.2d,v29.2s,v2.s[0]
563 umlal v10.2d,v29.2s,v3.s[0]
568 umlal v7.2d,v28.2s,v0.s[0]
577 umlal v11.2d,v28.2s,v1.s[0]
582 umlal v7.2d,v29.2s,v2.s[0]
586 umlal v11.2d,v29.2s,v3.s[0]
591 umlal v8.2d,v28.2s,v0.s[0]
600 umlal v12.2d,v28.2s,v1.s[0]
605 umlal v8.2d,v29.2s,v2.s[0]
609 umlal v12.2d,v29.2s,v3.s[0]
614 umlal v9.2d,v28.2s,v0.s[0]
623 umlal v13.2d,v28.2s,v1.s[0]
628 umlal v9.2d,v29.2s,v2.s[0]
632 umlal v13.2d,v29.2s,v3.s[0]
637 umlal v10.2d,v28.2s,v0.s[0]
646 umlal v6.2d,v28.2s,v1.s[0]
651 umlal v10.2d,v29.2s,v2.s[0]
655 umlal v6.2d,v29.2s,v3.s[0]
660 umlal v11.2d,v28.2s,v0.s[0]
669 umlal v7.2d,v28.2s,v1.s[0]
674 umlal v11.2d,v29.2s,v2.s[0]
678 umlal v7.2d,v29.2s,v3.s[0]
683 umlal v12.2d,v28.2s,v0.s[0]
692 umlal v8.2d,v28.2s,v1.s[0]
697 umlal v12.2d,v29.2s,v2.s[0]
701 umlal v8.2d,v29.2s,v3.s[0]
706 umlal v13.2d,v28.2s,v0.s[0]
715 umlal v9.2d,v28.2s,v1.s[0]
722 umlal v13.2d,v29.2s,v2.s[0]
723 ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
729 umlal v9.2d,v29.2s,v3.s[0]
766 ins v15.d[1],v14.d[0]
783 ins v15.d[1],v14.d[0]
787 st1 {v6.s}[0], [x7],#4
794 ins v15.d[1],v14.d[0]
796 st1 {v7.s}[0], [x7],#4
803 ins v15.d[1],v14.d[0]
805 st1 {v8.s}[0], [x7],#4
812 ins v15.d[1],v14.d[0]
814 st1 {v9.s}[0], [x7],#4
821 ins v15.d[1],v14.d[0]
823 st1 {v10.s}[0], [x7],#4
830 ins v15.d[1],v14.d[0]
832 st1 {v11.s}[0], [x7],#4
839 ins v15.d[1],v14.d[0]
841 st1 {v12.s}[0], [x7],#4
848 ins v15.d[1],v14.d[0]
851 st1 {v13.s}[0], [x7],#4
854 st1 {v15.s}[0], [x7],#4 // top-most bit
856 subs x1,sp,#0 // clear carry flag
868 sub x17,x2,x1
871 cbnz x17,.LNEON_sub
913 sub x17,x2,x1 // preserves carry
916 cbnz x17,.LNEON_copy_n_zap
937 add x29,sp,#0
945 ldp x6,x7,[x1,#8*0]
959 stp xzr,xzr,[x2,#8*0]
987 // a[1]a[0] (i)
988 // a[2]a[0]
989 // a[3]a[0]
990 // a[4]a[0]
991 // a[5]a[0]
992 // a[6]a[0]
993 // a[7]a[0]
1016 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
1019 mul x17,x10,x6
1020 adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
1026 adcs x23,x23,x17
1027 umulh x17,x7,x6 // hi(a[1..7]*a[0])
1034 stp x19,x20,[x2],#8*2 // t[0..1]
1036 adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
1037 umulh x17,x11,x6
1044 adcs x25,x25,x17
1045 mul x17,x9,x7
1053 adcs x23,x23,x17
1054 mul x17,x13,x7
1061 adcs x19,x19,x17
1062 umulh x17,x11,x7
1071 adcs x26,x26,x17
1072 mul x17,x10,x8
1080 adcs x25,x25,x17
1081 umulh x17,x9,x8 // hi(a[3..7]*a[2])
1090 adds x25,x25,x17
1091 umulh x17,x13,x8
1098 adc x21,x21,x17
1100 mul x17,x13,x9
1107 adcs x21,x21,x17
1108 umulh x17,x13,x9
1117 adc x22,x22,x17
1119 umulh x17,x11,x10 // hi(a[5..7]*a[4])
1127 adds x21,x21,x17
1128 mul x17,x13,x11
1136 adcs x23,x23,x17
1137 umulh x17,x13,x12 // hi(a[7]*a[6])
1146 add x25,x25,x17
1151 ldp x6,x7,[x2,#8*0]
1157 ldp x6,x7,[x1,#8*0]
1172 // a[8]a[0]
1173 // a[9]a[0]
1174 // a[a]a[0]
1175 // a[b]a[0]
1176 // a[c]a[0]
1177 // a[d]a[0]
1178 // a[e]a[0]
1179 // a[f]a[0]
1200 mul x17,x9,x4
1207 adcs x22,x22,x17
1208 mul x17,x13,x4
1215 adcs x26,x26,x17
1216 umulh x17,x9,x4
1225 adcs x22,x23,x17
1226 umulh x17,x13,x4
1231 adcs x26,x28,x17
1239 ldp x6,x7,[x2,#8*0]
1246 ldp x6,x7,[x1,#8*0]
1263 ldp x6,x7,[x0,#8*0]
1272 stp x19,x20,[x2,#8*0]
1273 ldp x19,x20,[x15,#8*0]
1285 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1286 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
1290 ldp x17,x14,[sp,#8*3]
1292 stp x19,x20,[x2,#8*0]
1307 extr x16,x17,x16,#63
1316 extr x17,x14,x17,#63
1317 stp x19,x20,[x2,#8*0]
1318 adcs x23,x10,x17
1322 ldp x17,x14,[x2,#8*7]
1325 extr x16,x17,x16,#63
1334 extr x17,x14,x17,#63
1337 adcs x19,x6,x17
1340 ldp x17,x14,[x2,#8*3]
1346 extr x16,x17,x16,#63
1351 stp x19,x20,[x2,#8*0]
1355 extr x17,x14,x17,#63
1356 adcs x23,x10,x17
1358 ldp x19,x20,[sp,#8*0]
1361 ldp x6,x7,[x1,#8*0]
1369 mul x28,x4,x19 // t[0]*n0
1383 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
1387 str x28,[x2],#8 // put aside t[0]*n0 for tail processing
1388 mul x17,x9,x28
1396 adcs x21,x22,x17
1397 mul x17,x13,x28
1399 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
1404 adcs x25,x26,x17
1405 umulh x17,x9,x28
1413 adcs x22,x22,x17
1414 umulh x17,x13,x28
1415 mul x28,x4,x19 // next t[0]*n0
1419 adc x26,x26,x17
1422 ldp x14,x15,[x2,#8*0]
1423 ldp x16,x17,[x2,#8*2]
1430 adcs x22,x22,x17
1431 ldp x16,x17,[x2,#8*6]
1435 adcs x26,x26,x17
1440 ldp x6,x7,[x1,#8*0]
1453 mul x17,x9,x4
1460 adcs x22,x22,x17
1461 mul x17,x13,x4
1468 adcs x26,x26,x17
1469 umulh x17,x9,x4
1478 adcs x22,x23,x17
1479 umulh x17,x13,x4
1484 adcs x26,x28,x17
1489 ldp x6,x7,[x2,#8*0]
1500 ldp x6,x7,[x1,#8*0]
1523 ldp x19,x20,[x0,#8*0]
1525 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
1537 stp x14,x15,[x2,#8*0]
1562 ldp x6,x7,[x1,#8*0]
1563 sbcs x17,x22,x9
1564 stp x14,x15,[x0,#8*0]
1568 stp x16,x17,[x0,#8*2]
1571 sbcs x17,x26,x13
1574 ldp x19,x20,[x2,#8*0]
1582 stp x16,x17,[x0,#8*6]
1590 ldp x6,x7,[x3,#8*0]
1591 sbcs x17,x22,x9
1592 stp x14,x15,[x0,#8*0]
1596 stp x16,x17,[x0,#8*2]
1598 ldp x19,x20,[x1,#8*0]
1599 sbcs x17,x26,x13
1604 stp x16,x17,[x0,#8*6]
1610 stp xzr,xzr,[x2,#8*0]
1617 csel x17,x22,x9,lo
1621 stp x14,x15,[x3,#8*0]
1622 stp x16,x17,[x3,#8*2]
1624 stp xzr,xzr,[x1,#8*0]
1629 stp xzr,xzr,[x2,#8*0]
1633 csel x17,x22,x9,lo
1634 stp x14,x15,[x3,#8*0]
1635 stp x16,x17,[x3,#8*2]
1647 stp xzr,xzr,[sp,#8*0]
1668 stp x6,x7,[x1,#8*0]
1696 add x29,sp,#0
1712 ldr x24,[x2,#8*0] // b[0]
1713 ldp x6,x7,[x1,#8*0] // a[0..3]
1720 ldp x14,x15,[x3,#8*0] // n[0..3]
1721 ldp x16,x17,[x3,#8*2]
1724 mov x28,#0
1728 mul x10,x6,x24 // lo(a[0..3]*b[0])
1736 umulh x10,x6,x24 // hi(a[0..3]*b[0])
1738 mul x25,x19,x4 // t[0]*n0
1745 ldr x24,[x2,x28] // next b[i] (or b[0])
1747 // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0)
1748 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1754 mul x13,x17,x25
1757 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0)
1763 umulh x13,x17,x25
1776 ldp x6,x7,[x1,#8*0] // a[4..7]
1779 ldr x25,[sp] // a[0]*n0
1780 ldp x14,x15,[x3,#8*0] // n[4..7]
1781 ldp x16,x17,[x3,#8*2]
1801 ldr x24,[x2,x28] // next b[i] (or b[0])
1803 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
1809 mul x13,x17,x25
1811 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
1818 umulh x13,x17,x25
1820 ldr x25,[sp,x28] // next t[0]*n0
1833 ldp x6,x7,[x1,#8*0]
1836 ldp x14,x15,[x3,#8*0]
1837 ldp x16,x17,[x3,#8*2]
1845 ldp x6,x7,[x11,#8*0] // a[0..3]
1850 stp x19,x20,[x26,#8*0] // result!!!
1851 ldp x19,x20,[sp,#8*4] // t[0..3]
1855 ldp x14,x15,[x3,#8*0] // n[0..3]
1857 ldp x16,x17,[x3,#8*2]
1863 mul x10,x6,x24 // lo(a[0..3]*b[4])
1871 umulh x10,x6,x24 // hi(a[0..3]*b[4])
1873 mul x25,x19,x4 // t[0]*n0
1883 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1885 mul x11,x15,x25 // lo(n[0..3]*t[0]*n0
1889 mul x13,x17,x25
1892 umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0
1898 umulh x13,x17,x25
1911 ldp x6,x7,[x1,#8*0] // a[4..7]
1920 ldr x25,[sp] // t[0]*n0
1921 ldp x14,x15,[x3,#8*0] // n[4..7]
1922 ldp x16,x17,[x3,#8*2]
1945 mul x10,x14,x25 // lo(n[4..7]*t[0]*n0)
1951 mul x13,x17,x25
1953 umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0)
1959 umulh x13,x17,x25
1961 ldr x25,[sp,x28] // next a[0]*n0
1978 ldp x6,x7,[x1,#8*0]
1986 ldp x14,x15,[x3,#8*0]
1987 ldp x16,x17,[x3,#8*2]
1999 stp x19,x20,[x26,#8*0] // result!!!
2001 ldp x19,x20,[sp,#8*4] // t[0..3]
2006 ldp x14,x15,[x11,#8*0] // n[0..3]
2007 ldp x16,x17,[x11,#8*2]
2012 ldp x6,x7,[x1,#8*0] // a[0..3]
2034 ldp x14,x15,[x3,#8*0]
2036 ldp x19,x20,[x26,#8*0]
2037 sbcs x13,x22,x17
2038 ldp x16,x17,[x3,#8*2]
2042 stp x10,x11,[x0,#8*0]
2052 ldp x6,x7,[x27,#8*0]
2053 sbcs x13,x22,x17
2054 stp x10,x11,[x0,#8*0]
2057 ldp x19,x20,[x1,#8*0]
2066 stp xzr,xzr,[x26,#8*0]
2077 stp x10,x11,[x27,#8*0]
2083 stp xzr,xzr,[x26,#8*0]
2090 stp x10,x11,[x27,#8*0]
2103 stp xzr,xzr,[sp,#8*0]
2106 sbcs x9,x22,x17
2116 stp x6,x7,[x1,#8*0]
2132 …9,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0