/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from armv8-mont.pl. */
//
// Montgomery multiplication for ARMv8 (AArch64), GNU as syntax.
//
// Exported entry point:
//   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                   const BN_ULONG *np, const BN_ULONG *n0, int num);
//   In:  x0 = rp (result), x1 = ap, x2 = bp, x3 = np (modulus),
//        x4 = &n0 (Montgomery constant, loaded via "ldr x4,[x4]"),
//        x5 = num (number of 64-bit limbs)
//   Out: x0 = 1 (all paths end with "mov x0,#1" before return)
//   Temporary num-limb vector tp[] is carved off the stack (alloca-style);
//   the generic path saves/restores callee-saved x19-x24 only, the 4x/8x
//   paths save x19-x28 and use pointer authentication (paciasp/autiasp).
//
// num divisible by 8 dispatches to the 8x squaring/multiply code,
// num divisible by 4 to the 4x code, anything else to the generic loop.
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont		// num%8==0: try 8x path (it re-checks ap==bp)
	tst	x5,#3
	b.eq	__bn_mul4x_mont		// num%4==0: 4x path
.Lmul_mont:
	// Generic one-word-at-a-time Montgomery multiply.
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3		// room for num limbs of tp[]
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3		// num *= 8 (byte count from here on)
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so (sp must stay 16-byte aligned)
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	As for removal of first multiplication and addition
	//	instructions. The outcome of first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or not. Then
	//	question is when does it carry? Is there alternative
	//	way to deduce it? If you follow operations, you can
	//	observe that condition for carry is quite simple:
	//	x6 being non-zero. So that carry can be calculated
	//	by adding -1 to x6. That's what next instruction does.
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

// First outer iteration (i=0): tp[] = ap[]*bp[0] + m1*np[].
.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

// Remaining outer iterations: tp[] = (tp[] + ap[]*bp[i] + m1*np[]) >> 64.
.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4		// m1 = tp[0]*n0
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*) same carry-deduction trick as above
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	str	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	// Branch-free conditional copy (csel on the borrow flag) keeps this
	// constant-time; tp[] is wiped as it is consumed.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	str	xzr,[x22,#-16]		// wipe tp
	str	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	str	xzr,[x22,#-8]		// wipe tp
	str	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	ret
.size	bn_mul_mont,.-bn_mul_mont

// __bn_sqr8x_mont: Montgomery squaring, num%8==0 and ap==bp required
// (falls through to __bn_mul4x_mont otherwise).  Same register interface
// as bn_mul_mont.  Squares ap[] into a 2*num-limb window on the stack,
// doubling the cross-products and adding a[i]^2, then reduces 512 bits
// (8 limbs) per iteration.
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2			// squaring path valid only when ap==bp
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]	// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4		// 2*num limbs of scratch
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8

	b	.Lsqr8x_zero_start

// Zero the 2*num-limb scratch area, 16 limbs per pass.
.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                                                 a[1]a[0]	(i)
	//                                             a[2]a[0]
	//                                         a[3]a[0]
	//                                     a[4]a[0]
	//                                 a[5]a[0]
	//                             a[6]a[0]
	//                         a[7]a[0]
	//                                         a[2]a[1]		(ii)
	//                                     a[3]a[1]
	//                                 a[4]a[1]
	//                             a[5]a[1]
	//                         a[6]a[1]
	//                     a[7]a[1]
	//                                 a[3]a[2]			(iii)
	//                             a[4]a[2]
	//                         a[5]a[2]
	//                     a[6]a[2]
	//                 a[7]a[2]
	//                         a[4]a[3]				(iv)
	//                     a[5]a[3]
	//                 a[6]a[3]
	//             a[7]a[3]
	//                 a[5]a[4]					(v)
	//             a[6]a[4]
	//         a[7]a[4]
	//         a[6]a[5]						(vi)
	//     a[7]a[5]
	// a[7]a[6]							(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])		(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+lo(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])		(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])		(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])		(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])		(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])		(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])		(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewinded ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	//                                                         a[8]a[0]
	//                                                     a[9]a[0]
	//                                                 a[a]a[0]
	//                                             a[b]a[0]
	//                                         a[c]a[0]
	//                                     a[d]a[0]
	//                                 a[e]a[0]
	//                             a[f]a[0]
	//                                                     a[8]a[1]
	//                         a[f]a[1]........................
	//                                                 a[8]a[2]
	//                     a[f]a[2]........................
	//                                             a[8]a[3]
	//                 a[f]a[3]........................
	//                                         a[8]a[4]
	//             a[f]a[4]........................
	//                                     a[8]a[5]
	//         a[f]a[5]........................
	//                                 a[8]a[6]
	//     a[f]a[6]........................
	//                             a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldr	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63		// extr #63 == shift-left-by-1 across limbs
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*) same carry-deduction trick as in bn_mul_mont
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldr	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewinded np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldr	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	// Branch-free conditional copy on the borrow flag; scratch is wiped.
	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	// num==8 special case: whole result still lives in registers.
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-7,x28 hold result, x6-7 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

// __bn_mul4x_mont: Montgomery multiplication processing 4 limbs of b[]
// at a time, num%4==0.  Same register interface as bn_mul_mont.
// x24 = current b[i]; x25 = current t[0]*n0; x28 cycles 0..31 as the
// byte index within the current 4-limb window; x30 carries the top-most
// overflow limb between outer iterations.
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) same carry-deduction trick as in bn_mul_mont
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	// Branch-free conditional copy on the borrow flag; scratch is wiped.
	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	// num==4 special case: whole result still lives in registers.
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4