Lines Matching +full:a +full:- +full:8

1 /* Do not modify. This file is auto-generated from armv8-mont.pl. */
32 stp x29,x30,[sp,#-64]!
38 ldr x9,[x2],#8 // bp[0]
43 and x22,x22,#-16 // ABI says so
47 sub x21,x5,#16 // j=num-2
66 // x6 being non-zero. So that carry can be calculated
67 // by adding -1 to x6. That's what next instruction does.
74 ldr x8,[x1],#8
76 sub x21,x21,#8 // j--
79 ldr x14,[x3],#8
89 str x12,[x22],#8 // tp[j-1]
102 sub x20,x5,#8 // i=num-1
109 ldr x9,[x2],#8 // bp[i]
112 add x22,sp,#8
115 sub x21,x5,#16 // j=num-2
124 sub x20,x20,#8 // i--
135 ldr x8,[x1],#8
137 ldr x23,[x22],#8 // tp[j]
139 sub x21,x21,#8 // j--
143 ldr x14,[x3],#8
154 stur x12,[x22,#-16] // tp[j-1]
158 ldr x23,[x22],#8 // tp[j]
175 stp x12,x13,[x22,#-16]
184 add x22,sp,#8
185 ldr x14,[x3],#8 // np[0]
186 subs x21,x5,#8 // j=num-1 and clear borrow
189 sbcs x8,x23,x14 // tp[j]-np[j]
190 ldr x23,[x22],#8
191 sub x21,x21,#8 // j--
192 ldr x14,[x3],#8
193 str x8,[x1],#8 // rp[j]=tp[j]-np[j]
198 str x8,[x1],#8 // rp[num-1]
201 add x22,sp,#8
202 ldr x8,[x0],#8 // rp[0]
203 sub x5,x5,#8 // num--
206 sub x5,x5,#8 // num--
208 ldr x23,[x22],#8
209 ldr x8,[x0],#8
210 stur xzr,[x22,#-16] // wipe tp
211 stur x14,[x0,#-16]
215 stur xzr,[x22,#-8] // wipe tp
216 stur x14,[x0,#-8]
226 .size bn_mul_mont,.-bn_mul_mont
232 stp x29,x30,[sp,#-80]!
248 and x7,x7,#-64
254 sub x8,x5,#8
260 subs x8,x8,#8
268 add x10,sp,#8
284 ext v29.16b,v29.16b,v29.16b,#8
290 st1 {v28.2s},[sp] // put aside smashed b[8*i+0]
302 ext v6.16b,v6.16b,v6.16b,#8
310 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0]
316 ext v29.16b,v29.16b,v29.16b,#8
322 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1]
334 ext v7.16b,v7.16b,v7.16b,#8
342 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1]
348 ext v29.16b,v29.16b,v29.16b,#8
354 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2]
366 ext v8.16b,v8.16b,v8.16b,#8
374 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2]
380 ext v29.16b,v29.16b,v29.16b,#8
386 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3]
398 ext v9.16b,v9.16b,v9.16b,#8
406 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3]
412 ext v29.16b,v29.16b,v29.16b,#8
418 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4]
430 ext v10.16b,v10.16b,v10.16b,#8
438 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4]
444 ext v29.16b,v29.16b,v29.16b,#8
450 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5]
462 ext v11.16b,v11.16b,v11.16b,#8
470 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5]
476 ext v29.16b,v29.16b,v29.16b,#8
482 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6]
494 ext v12.16b,v12.16b,v12.16b,#8
502 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6]
508 ext v29.16b,v29.16b,v29.16b,#8
514 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7]
518 ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
525 ext v13.16b,v13.16b,v13.16b,#8
536 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7]
537 add x10,sp,#8 // rewind
538 sub x8,x5,#8
543 subs x8,x8,#8
547 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0]
558 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1]
571 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1]
581 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2]
594 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2]
604 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3]
617 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3]
627 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4]
640 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4]
650 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5]
663 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5]
673 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6]
686 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6]
696 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7]
709 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7]
723 ld1 {v28.2s},[sp] // pull smashed b[8*i+0]
727 add x10,sp,#8 // rewind
744 subs x9,x9,#8
759 ext v6.16b,v6.16b,v6.16b,#8
776 ext v6.16b,v6.16b,v6.16b,#8
790 ext v7.16b,v7.16b,v7.16b,#8
799 ext v8.16b,v8.16b,v8.16b,#8
808 ext v9.16b,v9.16b,v9.16b,#8
817 ext v10.16b,v10.16b,v10.16b,#8
826 ext v11.16b,v11.16b,v11.16b,#8
835 ext v12.16b,v12.16b,v12.16b,#8
844 ext v13.16b,v13.16b,v13.16b,#8
850 subs x8,x8,#8
854 st1 {v15.s}[0], [x7],#4 // top-most bit
860 ldp w4,w5,[x1],#8
861 ldp w6,w7,[x1],#8
862 ldp w8,w9,[x3],#8
863 ldp w10,w11,[x3],#8
869 stp w8,w9,[x0],#8
870 stp w10,w11,[x0],#8
873 ldr w10, [x1] // load top-most bit
884 ldp w4,w5,[x1],#8
885 ldp w6,w7,[x1],#8
886 ldp w8,w9,[x0],#8
888 sub x0,x0,#8
897 ldp w4,w5,[x1],#8
898 ldp w6,w7,[x1],#8
899 stp w8,w9,[x0],#8
900 stp w10,w11,[x0],#8
902 ldp w8,w9,[x0],#8
904 sub x0,x0,#8
914 stp w8,w9,[x0],#8
915 stp w10,w11,[x0],#8
927 .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
936 stp x29,x30,[sp,#-128]!
945 ldp x6,x7,[x1,#8*0]
946 ldp x8,x9,[x1,#8*2]
947 ldp x10,x11,[x1,#8*4]
948 ldp x12,x13,[x1,#8*6]
954 sub x27,x5,#8*8
958 sub x27,x27,#8*8
959 stp xzr,xzr,[x2,#8*0]
960 stp xzr,xzr,[x2,#8*2]
961 stp xzr,xzr,[x2,#8*4]
962 stp xzr,xzr,[x2,#8*6]
964 stp xzr,xzr,[x2,#8*8]
965 stp xzr,xzr,[x2,#8*10]
966 stp xzr,xzr,[x2,#8*12]
967 stp xzr,xzr,[x2,#8*14]
968 add x2,x2,#8*16
972 add x1,x1,#8*8
984 // Multiply everything but a[i]*a[i]
987 // a[1]a[0] (i)
988 // a[2]a[0]
989 // a[3]a[0]
990 // a[4]a[0]
991 // a[5]a[0]
992 // a[6]a[0]
993 // a[7]a[0]
994 // a[2]a[1] (ii)
995 // a[3]a[1]
996 // a[4]a[1]
997 // a[5]a[1]
998 // a[6]a[1]
999 // a[7]a[1]
1000 // a[3]a[2] (iii)
1001 // a[4]a[2]
1002 // a[5]a[2]
1003 // a[6]a[2]
1004 // a[7]a[2]
1005 // a[4]a[3] (iv)
1006 // a[5]a[3]
1007 // a[6]a[3]
1008 // a[7]a[3]
1009 // a[5]a[4] (v)
1010 // a[6]a[4]
1011 // a[7]a[4]
1012 // a[6]a[5] (vi)
1013 // a[7]a[5]
1014 // a[7]a[6] (vii)
1016 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i)
1020 adds x20,x20,x14 // t[1]+lo(a[1]*a[0])
1027 umulh x17,x7,x6 // hi(a[1..7]*a[0])
1034 stp x19,x20,[x2],#8*2 // t[0..1]
1035 adc x19,xzr,xzr // t[8]
1036 adds x21,x21,x17 // t[2]+lo(a[1]*a[0])
1043 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii)
1056 umulh x14,x8,x7 // hi(a[2..7]*a[1])
1063 stp x21,x22,[x2],#8*2 // t[2..3]
1070 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii)
1081 umulh x17,x9,x8 // hi(a[3..7]*a[2])
1088 stp x23,x24,[x2],#8*2 // t[4..5]
1093 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv)
1102 umulh x14,x10,x9 // hi(a[4..7]*a[3])
1109 stp x25,x26,[x2],#8*2 // t[6..7]
1112 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v)
1119 umulh x17,x11,x10 // hi(a[5..7]*a[4])
1125 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi)
1130 umulh x14,x12,x11 // hi(a[6..7]*a[5])
1135 mul x16,x13,x12 // lo(a[7]*a[6]) (vii)
1137 umulh x17,x13,x12 // hi(a[7]*a[6])
1151 ldp x6,x7,[x2,#8*0]
1152 ldp x8,x9,[x2,#8*2]
1153 ldp x10,x11,[x2,#8*4]
1154 ldp x12,x13,[x2,#8*6]
1157 ldp x6,x7,[x1,#8*0]
1160 ldp x8,x9,[x1,#8*2]
1163 ldp x10,x11,[x1,#8*4]
1167 ldp x12,x13,[x1,#8*6]
1168 add x1,x1,#8*8
1170 mov x27,#-8*8
1172 // a[8]a[0]
1173 // a[9]a[0]
1174 // a[a]a[0]
1175 // a[b]a[0]
1176 // a[c]a[0]
1177 // a[d]a[0]
1178 // a[e]a[0]
1179 // a[f]a[0]
1180 // a[8]a[1]
1181 // a[f]a[1]........................
1182 // a[8]a[2]
1183 // a[f]a[2]........................
1184 // a[8]a[3]
1185 // a[f]a[3]........................
1186 // a[8]a[4]
1187 // a[f]a[4]........................
1188 // a[8]a[5]
1189 // a[f]a[5]........................
1190 // a[8]a[6]
1191 // a[f]a[6]........................
1192 // a[8]a[7]
1193 // a[f]a[7]........................
1196 adc x28,xzr,xzr // carry bit, modulo-scheduled
1198 add x27,x27,#8
1218 str x19,[x2],#8
1239 ldp x6,x7,[x2,#8*0]
1240 ldp x8,x9,[x2,#8*2]
1241 ldp x10,x11,[x2,#8*4]
1242 ldp x12,x13,[x2,#8*6]
1244 ldur x4,[x0,#-8*8]
1246 ldp x6,x7,[x1,#8*0]
1249 ldp x8,x9,[x1,#8*2]
1252 ldp x10,x11,[x1,#8*4]
1254 mov x27,#-8*8
1256 ldp x12,x13,[x1,#8*6]
1257 add x1,x1,#8*8
1263 ldp x6,x7,[x0,#8*0]
1264 add x1,x0,#8*8
1265 ldp x8,x9,[x0,#8*2]
1267 ldp x10,x11,[x0,#8*4]
1269 ldp x12,x13,[x0,#8*6]
1272 stp x19,x20,[x2,#8*0]
1273 ldp x19,x20,[x15,#8*0]
1274 stp x21,x22,[x2,#8*2]
1275 ldp x21,x22,[x15,#8*2]
1276 stp x23,x24,[x2,#8*4]
1277 ldp x23,x24,[x15,#8*4]
1278 stp x25,x26,[x2,#8*6]
1280 ldp x25,x26,[x15,#8*6]
1285 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
1286 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0]
1287 ldp x15,x16,[sp,#8*1]
1288 ldp x11,x13,[x14,#8*2]
1289 add x1,x14,#8*4
1290 ldp x17,x14,[sp,#8*3]
1292 stp x19,x20,[x2,#8*0]
1294 stp x21,x22,[x2,#8*2]
1296 stp x23,x24,[x2,#8*4]
1298 stp x25,x26,[x2,#8*6]
1303 sub x27,x5,#8*4
1308 sub x27,x27,#8*4
1310 ldp x15,x16,[x2,#8*5]
1312 ldp x7,x9,[x1],#8*2
1317 stp x19,x20,[x2,#8*0]
1320 stp x21,x22,[x2,#8*2]
1322 ldp x17,x14,[x2,#8*7]
1327 ldp x15,x16,[x2,#8*9]
1329 ldp x11,x13,[x1],#8*2
1333 stp x23,x24,[x2,#8*4]
1335 stp x25,x26,[x2,#8*6]
1336 add x2,x2,#8*8
1340 ldp x17,x14,[x2,#8*3]
1348 ldp x15,x16,[x2,#8*5]
1351 stp x19,x20,[x2,#8*0]
1354 stp x21,x22,[x2,#8*2]
1358 ldp x19,x20,[sp,#8*0]
1361 ldp x6,x7,[x1,#8*0]
1364 ldp x8,x9,[x1,#8*2]
1366 ldp x10,x11,[x1,#8*4]
1370 ldp x12,x13,[x1,#8*6]
1372 ldp x21,x22,[sp,#8*2]
1373 stp x23,x24,[x2,#8*4]
1374 ldp x23,x24,[sp,#8*4]
1375 stp x25,x26,[x2,#8*6]
1376 ldp x25,x26,[sp,#8*6]
1377 add x1,x1,#8*8
1378 mov x30,xzr // initial top-most carry
1380 mov x27,#8
1383 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0)
1387 str x28,[x2],#8 // put aside t[0]*n0 for tail processing
1399 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0)
1422 ldp x14,x15,[x2,#8*0]
1423 ldp x16,x17,[x2,#8*2]
1428 ldp x14,x15,[x2,#8*4]
1431 ldp x16,x17,[x2,#8*6]
1439 ldur x4,[x2,#-8*8]
1440 ldp x6,x7,[x1,#8*0]
1441 ldp x8,x9,[x1,#8*2]
1442 ldp x10,x11,[x1,#8*4]
1443 mov x27,#-8*8
1444 ldp x12,x13,[x1,#8*6]
1445 add x1,x1,#8*8
1449 adc x28,xzr,xzr // carry bit, modulo-scheduled
1451 add x27,x27,#8
1471 str x19,[x2],#8
1489 ldp x6,x7,[x2,#8*0]
1492 ldp x8,x9,[x2,#8*2]
1493 ldp x10,x11,[x2,#8*4]
1494 ldp x12,x13,[x2,#8*6]
1497 ldur x4,[x0,#-8*8]
1500 ldp x6,x7,[x1,#8*0]
1503 ldp x8,x9,[x1,#8*2]
1506 ldp x10,x11,[x1,#8*4]
1508 mov x27,#-8*8
1510 ldp x12,x13,[x1,#8*6]
1511 add x1,x1,#8*8
1518 add x27,x2,#8*8 // end of current t[num] window
1520 subs xzr,x30,#1 // "move" top-most carry to carry bit
1523 ldp x19,x20,[x0,#8*0]
1525 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0]
1527 ldp x8,x9,[x16,#8*2]
1530 ldp x10,x11,[x16,#8*4]
1533 ldp x12,x13,[x16,#8*6]
1534 add x1,x16,#8*8
1535 adc x30,xzr,xzr // top-most carry
1537 stp x14,x15,[x2,#8*0]
1538 stp x21,x22,[x2,#8*2]
1539 ldp x21,x22,[x0,#8*2]
1540 stp x23,x24,[x2,#8*4]
1541 ldp x23,x24,[x0,#8*4]
1543 stp x25,x26,[x2,#8*6]
1545 ldp x25,x26,[x0,#8*6]
1546 mov x27,#8
1554 add x2,x2,#8*8
1557 sub x27,x5,#8*8
1562 ldp x6,x7,[x1,#8*0]
1564 stp x14,x15,[x0,#8*0]
1566 ldp x8,x9,[x1,#8*2]
1568 stp x16,x17,[x0,#8*2]
1570 ldp x10,x11,[x1,#8*4]
1572 ldp x12,x13,[x1,#8*6]
1573 add x1,x1,#8*8
1574 ldp x19,x20,[x2,#8*0]
1575 sub x27,x27,#8*8
1576 ldp x21,x22,[x2,#8*2]
1577 ldp x23,x24,[x2,#8*4]
1578 ldp x25,x26,[x2,#8*6]
1579 add x2,x2,#8*8
1580 stp x14,x15,[x0,#8*4]
1582 stp x16,x17,[x0,#8*6]
1583 add x0,x0,#8*8
1590 ldp x6,x7,[x3,#8*0]
1592 stp x14,x15,[x0,#8*0]
1594 ldp x8,x9,[x3,#8*2]
1596 stp x16,x17,[x0,#8*2]
1598 ldp x19,x20,[x1,#8*0]
1600 ldp x21,x22,[x1,#8*2]
1602 ldr x30,[x29,#8] // pull return address
1603 stp x14,x15,[x0,#8*4]
1604 stp x16,x17,[x0,#8*6]
1606 sub x27,x5,#8*4
1608 sub x27,x27,#8*4
1610 stp xzr,xzr,[x2,#8*0]
1612 ldp x6,x7,[x3,#8*4]
1613 ldp x19,x20,[x1,#8*4]
1615 stp xzr,xzr,[x2,#8*2]
1616 add x2,x2,#8*4
1618 ldp x8,x9,[x3,#8*6]
1619 ldp x21,x22,[x1,#8*6]
1620 add x1,x1,#8*4
1621 stp x14,x15,[x3,#8*0]
1622 stp x16,x17,[x3,#8*2]
1623 add x3,x3,#8*4
1624 stp xzr,xzr,[x1,#8*0]
1625 stp xzr,xzr,[x1,#8*2]
1629 stp xzr,xzr,[x2,#8*0]
1631 stp xzr,xzr,[x2,#8*2]
1634 stp x14,x15,[x3,#8*0]
1635 stp x16,x17,[x3,#8*2]
1642 ldr x30,[x29,#8] // pull return address
1643 // x19-7,x28 hold result, x6-7 hold modulus
1647 stp xzr,xzr,[sp,#8*0]
1649 stp xzr,xzr,[sp,#8*2]
1651 stp xzr,xzr,[sp,#8*4]
1653 stp xzr,xzr,[sp,#8*6]
1655 stp xzr,xzr,[sp,#8*8]
1657 stp xzr,xzr,[sp,#8*10]
1659 stp xzr,xzr,[sp,#8*12]
1661 stp xzr,xzr,[sp,#8*14]
1663 // x6-7 hold result-modulus
1668 stp x6,x7,[x1,#8*0]
1671 stp x8,x9,[x1,#8*2]
1674 stp x10,x11,[x1,#8*4]
1675 stp x12,x13,[x1,#8*6]
1689 .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
1695 stp x29,x30,[sp,#-128]!
1706 sub sp,x26,#8*4 // alloca
1712 ldr x24,[x2,#8*0] // b[0]
1713 ldp x6,x7,[x1,#8*0] // a[0..3]
1714 ldp x8,x9,[x1,#8*2]
1715 add x1,x1,#8*4
1720 ldp x14,x15,[x3,#8*0] // n[0..3]
1721 ldp x16,x17,[x3,#8*2]
1722 adds x3,x3,#8*4 // clear carry bit
1728 mul x10,x6,x24 // lo(a[0..3]*b[0])
1729 adc x0,x0,xzr // modulo-scheduled
1731 add x28,x28,#8
1736 umulh x10,x6,x24 // hi(a[0..3]*b[0])
1748 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1776 ldp x6,x7,[x1,#8*0] // a[4..7]
1777 ldp x8,x9,[x1,#8*2]
1778 add x1,x1,#8*4
1779 ldr x25,[sp] // a[0]*n0
1780 ldp x14,x15,[x3,#8*0] // n[4..7]
1781 ldp x16,x17,[x3,#8*2]
1782 add x3,x3,#8*4
1785 mul x10,x6,x24 // lo(a[4..7]*b[i])
1786 adc x0,x0,xzr // modulo-scheduled
1788 add x28,x28,#8
1793 umulh x10,x6,x24 // hi(a[4..7]*b[i])
1803 mul x10,x14,x25 // lo(n[4..7]*a[0]*n0)
1811 umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0)
1821 str x19,[x26],#8 // result!!!
1833 ldp x6,x7,[x1,#8*0]
1834 ldp x8,x9,[x1,#8*2]
1835 add x1,x1,#8*4
1836 ldp x14,x15,[x3,#8*0]
1837 ldp x16,x17,[x3,#8*2]
1838 add x3,x3,#8*4
1843 ldr x24,[x2,#8*4]! // *++b
1845 ldp x6,x7,[x11,#8*0] // a[0..3]
1847 ldp x8,x9,[x11,#8*2]
1848 add x1,x11,#8*4
1850 stp x19,x20,[x26,#8*0] // result!!!
1851 ldp x19,x20,[sp,#8*4] // t[0..3]
1852 stp x21,x22,[x26,#8*2] // result!!!
1853 ldp x21,x22,[sp,#8*6]
1855 ldp x14,x15,[x3,#8*0] // n[0..3]
1857 ldp x16,x17,[x3,#8*2]
1858 adds x3,x3,#8*4 // clear carry bit
1863 mul x10,x6,x24 // lo(a[0..3]*b[4])
1864 adc x0,x0,xzr // modulo-scheduled
1866 add x28,x28,#8
1871 umulh x10,x6,x24 // hi(a[0..3]*b[4])
1883 str x25,[x26],#8 // put aside t[0]*n0 for tail processing
1909 ldp x10,x11,[x26,#8*4] // t[4..7]
1910 ldp x12,x13,[x26,#8*6]
1911 ldp x6,x7,[x1,#8*0] // a[4..7]
1912 ldp x8,x9,[x1,#8*2]
1913 add x1,x1,#8*4
1921 ldp x14,x15,[x3,#8*0] // n[4..7]
1922 ldp x16,x17,[x3,#8*2]
1923 add x3,x3,#8*4
1927 mul x10,x6,x24 // lo(a[4..7]*b[4])
1928 adc x0,x0,xzr // modulo-scheduled
1930 add x28,x28,#8
1935 umulh x10,x6,x24 // hi(a[4..7]*b[4])
1961 ldr x25,[sp,x28] // next a[0]*n0
1963 str x19,[x26],#8 // result!!!
1976 ldp x10,x11,[x26,#8*4]
1977 ldp x12,x13,[x26,#8*6]
1978 ldp x6,x7,[x1,#8*0]
1979 ldp x8,x9,[x1,#8*2]
1980 add x1,x1,#8*4
1986 ldp x14,x15,[x3,#8*0]
1987 ldp x16,x17,[x3,#8*2]
1988 add x3,x3,#8*4
1995 add x2,x2,#8*4 // bp++
1999 stp x19,x20,[x26,#8*0] // result!!!
2001 ldp x19,x20,[sp,#8*4] // t[0..3]
2003 stp x21,x22,[x26,#8*2] // result!!!
2005 ldp x21,x22,[sp,#8*6]
2006 ldp x14,x15,[x11,#8*0] // n[0..3]
2007 ldp x16,x17,[x11,#8*2]
2008 add x3,x11,#8*4
2012 ldp x6,x7,[x1,#8*0] // a[0..3]
2013 ldp x8,x9,[x1,#8*2]
2014 adds x1,x1,#8*4 // clear carry bit
2028 add x26,sp,#8*8
2030 sub x28,x5,#8*4
2034 ldp x14,x15,[x3,#8*0]
2035 sub x28,x28,#8*4
2036 ldp x19,x20,[x26,#8*0]
2038 ldp x16,x17,[x3,#8*2]
2039 add x3,x3,#8*4
2040 ldp x21,x22,[x26,#8*2]
2041 add x26,x26,#8*4
2042 stp x10,x11,[x0,#8*0]
2044 stp x12,x13,[x0,#8*2]
2045 add x0,x0,#8*4
2051 add x1,sp,#8*4
2052 ldp x6,x7,[x27,#8*0]
2054 stp x10,x11,[x0,#8*0]
2055 ldp x8,x9,[x27,#8*2]
2056 stp x12,x13,[x0,#8*2]
2057 ldp x19,x20,[x1,#8*0]
2058 ldp x21,x22,[x1,#8*2]
2060 ldr x30,[x29,#8] // pull return address
2062 sub x28,x5,#8*4
2064 sub x28,x28,#8*4
2066 stp xzr,xzr,[x26,#8*0]
2068 ldp x6,x7,[x27,#8*4]
2069 ldp x19,x20,[x1,#8*4]
2071 stp xzr,xzr,[x26,#8*2]
2072 add x26,x26,#8*4
2074 ldp x8,x9,[x27,#8*6]
2075 ldp x21,x22,[x1,#8*6]
2076 add x1,x1,#8*4
2077 stp x10,x11,[x27,#8*0]
2078 stp x12,x13,[x27,#8*2]
2079 add x27,x27,#8*4
2083 stp xzr,xzr,[x26,#8*0]
2085 stp xzr,xzr,[x26,#8*2]
2087 stp xzr,xzr,[x26,#8*3]
2089 stp xzr,xzr,[x26,#8*4]
2090 stp x10,x11,[x27,#8*0]
2091 stp x12,x13,[x27,#8*2]
2099 // x19-3,x0 hold result, x14-7 hold modulus
2101 ldr x30,[x29,#8] // pull return address
2103 stp xzr,xzr,[sp,#8*0]
2105 stp xzr,xzr,[sp,#8*2]
2107 stp xzr,xzr,[sp,#8*4]
2109 stp xzr,xzr,[sp,#8*6]
2111 // x6-3 hold result-modulus
2116 stp x6,x7,[x1,#8*0]
2117 stp x8,x9,[x1,#8*2]
2131 .size __bn_mul4x_mont,.-__bn_mul4x_mont