/* Do not modify. This file is auto-generated from ecp_sm2p256-armv8.pl. */
/*
 * AArch64 (GNU as syntax, run through the C preprocessor) field and scalar
 * helpers operating on 256-bit values stored as four 64-bit limbs, least
 * significant limb first.
 *
 * Register naming used by the comments below (taken from the code):
 *   s0..s3 = x7..x10     s4..s7 = x11..x14
 *   t0 = x3   t1 = x4   t2 = x5   t3 = x6   t4 = x15
 *
 * All leaf routines only touch x2..x15 and the flags; ecp_sm2p256_mul and
 * ecp_sm2p256_sqr additionally use x16, x17, x19 and x20, which they save
 * and restore in their own 80-byte stack frame.
 */
#include "arm_arch.h"
.arch  armv8-a
.section .rodata

.align 5
// The polynomial p
.Lpoly:
.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
// The order n
.Lord:
.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
// (p + 1) / 2
.Lpoly_div_2:
.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
// (n + 1) / 2
.Lord_div_2:
.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff

.text

// void bn_rshift1(BN_ULONG *a);
//
// Shifts the four-limb value at x0 right by one bit, in place.
// The bit shifted out of the lowest limb is discarded.
// In:    x0 = a
// Clobb: x7-x10
.globl bn_rshift1
.type bn_rshift1,%function
.align 5
bn_rshift1:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x0]
    ldp x9,x10,[x0,#16]

    // Right shift: each limb takes its new top bit from the next higher limb
    extr x7,x8,x7,#1
    extr x8,x9,x8,#1
    extr x9,x10,x9,#1
    lsr x10,x10,#1

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]

    ret
.size bn_rshift1,.-bn_rshift1

// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
//
// r = a - b over four limbs. The borrow out of the top limb is discarded
// (the last step is a non-flag-setting sbc and nothing is returned).
// In:    x0 = r, x1 = a, x2 = b
// Clobb: x7-x14, flags
.globl bn_sub
.type bn_sub,%function
.align 5
bn_sub:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Subtraction
    subs x7,x7,x11
    sbcs x8,x8,x12
    sbcs x9,x9,x13
    sbc x10,x10,x14

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]

    ret
.size bn_sub,.-bn_sub

// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
//
// r = (a >> 1) + (a is odd ? (p + 1) / 2 : 0), which is a / 2 modulo p.
// The carry out of the final addition is discarded.
// NOTE(review): presumably callers pass a < p so that no carry occurs - confirm.
// In:    x0 = r, x1 = a
// Clobb: x2, x3, x7-x14, flags
.globl ecp_sm2p256_div_by_2
.type ecp_sm2p256_div_by_2,%function
.align 5
ecp_sm2p256_div_by_2:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]

    // Save the least significant bit
    mov x3,x7

    // Right shift 1
    extr x7,x8,x7,#1
    extr x8,x9,x8,#1
    extr x9,x10,x9,#1
    lsr x10,x10,#1

    // Load mod
    adrp x2,.Lpoly_div_2
    add x2,x2,#:lo12:.Lpoly_div_2
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Parity check: keep the addend only for odd input, branch-free
    tst x3,#1
    csel x11,xzr,x11,eq
    csel x12,xzr,x12,eq
    csel x13,xzr,x13,eq
    csel x14,xzr,x14,eq

    // Add
    adds x7,x7,x11
    adcs x8,x8,x12
    adcs x9,x9,x13
    adc x10,x10,x14

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
//
// Same computation as ecp_sm2p256_div_by_2 but with (n + 1) / 2 as the
// addend: r = (a >> 1) + (a is odd ? (n + 1) / 2 : 0).
// The carry out of the final addition is discarded.
// In:    x0 = r, x1 = a
// Clobb: x2, x3, x7-x14, flags
.globl ecp_sm2p256_div_by_2_mod_ord
.type ecp_sm2p256_div_by_2_mod_ord,%function
.align 5
ecp_sm2p256_div_by_2_mod_ord:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]

    // Save the least significant bit
    mov x3,x7

    // Right shift 1
    extr x7,x8,x7,#1
    extr x8,x9,x8,#1
    extr x9,x10,x9,#1
    lsr x10,x10,#1

    // Load mod
    adrp x2,.Lord_div_2
    add x2,x2,#:lo12:.Lord_div_2
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Parity check: keep the addend only for odd input, branch-free
    tst x3,#1
    csel x11,xzr,x11,eq
    csel x12,xzr,x12,eq
    csel x13,xzr,x13,eq
    csel x14,xzr,x14,eq

    // Add
    adds x7,x7,x11
    adcs x8,x8,x12
    adcs x9,x9,x13
    adc x10,x10,x14

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
//
// r = 3 * a modulo p, computed as ((2a reduced once) + a) reduced once.
// Each reduction subtracts p and keeps the difference only when the
// five-limb subtraction does not borrow. a is read twice from memory and
// r is written only at the end, so r may alias a.
// In:    x0 = r, x1 = a
// Clobb: x2-x15, flags
.globl ecp_sm2p256_mul_by_3
.type ecp_sm2p256_mul_by_3,%function
.align 5
ecp_sm2p256_mul_by_3:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]

    // 2*a, x15 receives the carry out of the top limb
    adds x7,x7,x7
    adcs x8,x8,x8
    adcs x9,x9,x9
    adcs x10,x10,x10
    adcs x15,xzr,xzr

    // Backup 2*a
    mov x3,x7
    mov x4,x8
    mov x5,x9
    mov x6,x10

    // Sub polynomial
    adrp x2,.Lpoly
    add x2,x2,#:lo12:.Lpoly
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]
    subs x7,x7,x11
    sbcs x8,x8,x12
    sbcs x9,x9,x13
    sbcs x10,x10,x14
    sbcs x15,x15,xzr

    // Keep the difference when there was no borrow, else the backup
    csel x7,x7,x3,cs
    csel x8,x8,x4,cs
    csel x9,x9,x5,cs
    csel x10,x10,x6,cs
    // x15 is rewritten by the adcs below before it is read again
    eor x15,x15,x15

    // 3*a
    ldp x11,x12,[x1]
    ldp x13,x14,[x1,#16]
    adds x7,x7,x11
    adcs x8,x8,x12
    adcs x9,x9,x13
    adcs x10,x10,x14
    adcs x15,xzr,xzr

    // Backup the sum
    mov x3,x7
    mov x4,x8
    mov x5,x9
    mov x6,x10

    // Sub polynomial (x11-x14 were reused for a, so p is loaded again)
    adrp x2,.Lpoly
    add x2,x2,#:lo12:.Lpoly
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]
    subs x7,x7,x11
    sbcs x8,x8,x12
    sbcs x9,x9,x13
    sbcs x10,x10,x14
    sbcs x15,x15,xzr

    // Keep the difference when there was no borrow, else the backup
    csel x7,x7,x3,cs
    csel x8,x8,x4,cs
    csel x9,x9,x5,cs
    csel x10,x10,x6,cs

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]

    ret
.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
//
// r = a + b, followed by one conditional subtraction of p: the five-limb
// value (carry:sum) - p is kept when it does not borrow, otherwise the
// plain sum is kept.
// In:    x0 = r, x1 = a, x2 = b
// Clobb: x2-x15, flags
.globl ecp_sm2p256_add
.type ecp_sm2p256_add,%function
.align 5
ecp_sm2p256_add:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Addition, x15 receives the carry out of the top limb
    adds x7,x7,x11
    adcs x8,x8,x12
    adcs x9,x9,x13
    adcs x10,x10,x14
    adc x15,xzr,xzr

    // Load polynomial
    adrp x2,.Lpoly
    add x2,x2,#:lo12:.Lpoly
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Backup Addition
    mov x3,x7
    mov x4,x8
    mov x5,x9
    mov x6,x10

    // Sub polynomial
    subs x3,x3,x11
    sbcs x4,x4,x12
    sbcs x5,x5,x13
    sbcs x6,x6,x14
    sbcs x15,x15,xzr

    // Select based on carry: carry clear means the subtraction borrowed,
    // so the unreduced sum is kept
    csel x7,x7,x3,cc
    csel x8,x8,x4,cc
    csel x9,x9,x5,cc
    csel x10,x10,x6,cc

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_add,.-ecp_sm2p256_add

// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
//
// r = a - b, with p added back when the subtraction borrowed.
// In:    x0 = r, x1 = a, x2 = b
// Clobb: x2-x15, flags
.globl ecp_sm2p256_sub
.type ecp_sm2p256_sub,%function
.align 5
ecp_sm2p256_sub:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Subtraction, x15 becomes 0 (no borrow) or all ones (borrow)
    subs x7,x7,x11
    sbcs x8,x8,x12
    sbcs x9,x9,x13
    sbcs x10,x10,x14
    sbc x15,xzr,xzr

    // Load polynomial
    adrp x2,.Lpoly
    add x2,x2,#:lo12:.Lpoly
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Backup subtraction
    mov x3,x7
    mov x4,x8
    mov x5,x9
    mov x6,x10

    // Add polynomial
    adds x3,x3,x11
    adcs x4,x4,x12
    adcs x5,x5,x13
    adcs x6,x6,x14
    tst x15,x15

    // Select based on the borrow mask: x15 == 0 keeps the plain difference
    csel x7,x7,x3,eq
    csel x8,x8,x4,eq
    csel x9,x9,x5,eq
    csel x10,x10,x6,eq

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_sub,.-ecp_sm2p256_sub

// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
//
// r = a - b, with the order n (.Lord) added back when the subtraction
// borrowed.
// In:    x0 = r, x1 = a, x2 = b
// Clobb: x2-x15, flags
.globl ecp_sm2p256_sub_mod_ord
.type ecp_sm2p256_sub_mod_ord,%function
.align 5
ecp_sm2p256_sub_mod_ord:
    AARCH64_VALID_CALL_TARGET
    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Subtraction, x15 becomes 0 (no borrow) or all ones (borrow)
    subs x7,x7,x11
    sbcs x8,x8,x12
    sbcs x9,x9,x13
    sbcs x10,x10,x14
    sbc x15,xzr,xzr

    // Load order
    adrp x2,.Lord
    add x2,x2,#:lo12:.Lord
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

    // Backup subtraction
    mov x3,x7
    mov x4,x8
    mov x5,x9
    mov x6,x10

    // Add order
    adds x3,x3,x11
    adcs x4,x4,x12
    adcs x5,x5,x13
    adcs x6,x6,x14
    tst x15,x15

    // Select based on the borrow mask: x15 == 0 keeps the plain difference
    csel x7,x7,x3,eq
    csel x8,x8,x4,eq
    csel x9,x9,x5,eq
    csel x10,x10,x6,eq

    // Store results
    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]
    ret
.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

// RDC: reduce a 512-bit value modulo p.
//
// In:    x7..x10  = s0..s3 (low 256 bits), x11..x14 = s4..s7 (high 256 bits)
// Out:   x7..x10  = reduced result
// Clobb: x3-x6, x11-x16, flags
// Stack: spills four limbs to [sp,#32]..[sp,#63]; the expanding function
//        must own that range. Both users in this file reserve an 80-byte
//        frame, keep x16/x17 at [sp,#16] and x19/x20 at [sp,#64], and
//        restore x16 afterwards.
.macro RDC
    // a = |  s7   | ... | s0  |, where si are 64-bit quantities
    //   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
    // |    s7     |    s6     |    s5     |    s4     |
    // | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
    // |    s3     |    s2     |    s1     |    s0     |
    // | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
    // =================================================
    // | a8  | a11 | a10 | a9  | a8  |   0 |    s4     | (+)
    // | a9  | a15 |    s6     | a11 |   0 | a10 | a9  | (+)
    // | a10 |   0 | a14 | a13 | a12 |   0 |    s5     | (+)
    // | a11 |   0 |    s7     | a13 |   0 | a12 | a11 | (+)
    // | a12 |   0 |    s7     | a13 |   0 |    s6     | (+)
    // | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
    // | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
    // | a13 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
    // | a14 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
    // | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
    // | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
    // | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
    // |    s7     |   0 |   0 |   0 |   0 |   0 |   0 | (+)
    // |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
    // |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
    // |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
    // |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
    // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
    // |    V[3]   |    V[2]   |    V[1]   |    V[0]   |

    // 1. 64-bit addition
    // t2=s6+s7+s7
    adds x5,x13,x14
    adcs x4,xzr,xzr
    adds x5,x5,x14
    adcs x4,x4,xzr
    // t3=s4+s5+t2
    adds x6,x11,x5
    adcs x15,x4,xzr
    adds x6,x6,x12
    adcs x15,x15,xzr
    // sum
    adds x7,x7,x6
    adcs x8,x8,x15
    adcs x9,x9,x5
    adcs x10,x10,x14
    adcs x3,xzr,xzr
    adds x10,x10,x4
    adcs x3,x3,xzr

    // Spill the partial sum; x7-x10 are reused as 32-bit halves below
    stp x7,x8,[sp,#32]
    stp x9,x10,[sp,#48]

    // 2. 64-bit to 32-bit spread
    mov x4,#0xffffffff
    mov x7,x11
    mov x8,x12
    mov x9,x13
    mov x10,x14
    and x7,x7,x4 // a8
    and x8,x8,x4 // a10
    and x9,x9,x4 // a12
    and x10,x10,x4 // a14
    lsr x11,x11,#32 // a9
    lsr x12,x12,#32 // a11
    lsr x13,x13,#32 // a13
    lsr x14,x14,#32 // a15

    // 3. 32-bit addition
    add x4,x10,x9          // t1 <- a12 + a14
    add x5,x14,x13          // t2 <- a13 + a15
    add x6,x7,x11          // t3 <- a8 + a9
    add x15,x10,x8          // t4 <- a10 + a14
    add x14,x14,x12         // a15 <- a11 + a15
    add x9,x5,x4          // a12 <- a12 + a13 + a14 + a15
    add x8,x8,x9         // a10 <- a10 + a12 + a13 + a14 + a15
    add x8,x8,x9         // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
    add x8,x8,x6          // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
    add x8,x8,x12         // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
    add x9,x9,x13         // a12 <- a12 + 2*a13 + a14 + a15
    add x9,x9,x12         // a12 <- a11 + a12 + 2*a13 + a14 + a15
    add x9,x9,x7          // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
    add x6,x6,x10           // t3 <- a8 + a9 + a14
    add x6,x6,x13           // t3 <- a8 + a9 + a13 + a14
    add x11,x11,x5           // a9 <- a9 + a13 + a15
    add x12,x12,x11         // a11 <- a9 + a11 + a13 + a15
    add x12,x12,x5          // a11 <- a9 + a11 + 2*(a13 + a15)
    add x4,x4,x15           // t1 <- a10 + a12 + 2*a14

    // U[0]  s5    a9 + a11 + 2*(a13 + a15)
    // U[1]  t1    a10 + a12 + 2*a14
    // U[2] -t3    a8 + a9 + a13 + a14
    // U[3]  s2    a8 + a11 + a12 + 2*a13 + a14 + a15
    // U[4]  s4    a9 + a13 + a15
    // U[5]  t4    a10 + a14
    // U[6]  s7    a11 + a15
    // U[7]  s1    a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)

    // 4. 32-bit to 64-bit
    lsl x7,x4,#32
    extr x4,x9,x4,#32
    extr x9,x15,x9,#32
    extr x15,x8,x15,#32
    lsr x8,x8,#32

    // 5. 64-bit addition
    adds x12,x12,x7
    adcs x4,x4,xzr
    adcs x11,x11,x9
    adcs x14,x14,x15
    adcs x3,x3,x8

    // V[0] s5
    // V[1] t1
    // V[2] s4
    // V[3] s7
    // carry t0
    // sub t3

    // 6. Process s0-s3
    ldp x7,x8,[sp,#32]
    ldp x9,x10,[sp,#48]
    // add with V0-V3
    adds x7,x7,x12
    adcs x8,x8,x4
    adcs x9,x9,x11
    adcs x10,x10,x14
    adcs x3,x3,xzr
    // sub with t3
    subs x8,x8,x6
    sbcs x9,x9,xzr
    sbcs x10,x10,xzr
    sbcs x3,x3,xzr

    // 7. MOD
    // First Mod: fold the overflow word t0 back into the low four limbs
    // as t0 * (2^224 + 2^96 - 2^64 + 1)
    lsl x4,x3,#32
    subs x5,x4,x3

    adds x7,x7,x3
    adcs x8,x8,x5
    adcs x9,x9,xzr
    adcs x10,x10,x4

    // Last Mod
    // return y - p if y >= p else y
    mov x11,x7
    mov x12,x8
    mov x13,x9
    mov x14,x10

    adrp x3,.Lpoly
    add x3,x3,#:lo12:.Lpoly
    ldp x4,x5,[x3]
    ldp x6,x15,[x3,#16]

    // Capture the carry of the First Mod addition; the mov, adrp, add and
    // ldp instructions in between leave the flags untouched
    adcs x16,xzr,xzr

    subs x7,x7,x4
    sbcs x8,x8,x5
    sbcs x9,x9,x6
    sbcs x10,x10,x15
    sbcs x16,x16,xzr

    csel x7,x7,x11,cs
    csel x8,x8,x12,cs
    csel x9,x9,x13,cs
    csel x10,x10,x14,cs

.endm

// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
//
// r = a * b reduced with RDC. The 512-bit product is accumulated column by
// column, then handed to RDC with the low half in x7-x10 and the high half
// in x11-x14.
// In:    x0 = r, x1 = a, x2 = b
// Stack: 80-byte frame: x29/x30 at [sp], x16/x17 at [sp,#16],
//        RDC scratch at [sp,#32]..[sp,#63], x19/x20 at [sp,#64]
// Clobb: x3-x15, flags (x16, x17, x19, x20 are saved and restored)
.globl ecp_sm2p256_mul
.type ecp_sm2p256_mul,%function
.align 5
ecp_sm2p256_mul:
    AARCH64_SIGN_LINK_REGISTER
    // Store scalar registers
    stp x29,x30,[sp,#-80]!
    add x29,sp,#0
    stp x16,x17,[sp,#16]
    stp x19,x20,[sp,#64]

    // Load inputs
    ldp x7,x8,[x1]
    ldp x9,x10,[x1,#16]
    ldp x11,x12,[x2]
    ldp x13,x14,[x2,#16]

// ### multiplication ###
    // ========================
    //             s3 s2 s1 s0
    // *           s7 s6 s5 s4
    // ------------------------
    // +           s0 s0 s0 s0
    //              *  *  *  *
    //             s7 s6 s5 s4
    //          s1 s1 s1 s1
    //           *  *  *  *
    //          s7 s6 s5 s4
    //       s2 s2 s2 s2
    //        *  *  *  *
    //       s7 s6 s5 s4
    //    s3 s3 s3 s3
    //     *  *  *  *
    //    s7 s6 s5 s4
    // ------------------------
    // s7 s6 s5 s4 s3 s2 s1 s0
    // ========================

// ### s0*s4 ###
    mul x16,x7,x11
    umulh x5,x7,x11

// ### s1*s4 + s0*s5 ###
    mul x3,x8,x11
    umulh x4,x8,x11
    adds x5,x5,x3
    adcs x6,x4,xzr

    mul x3,x7,x12
    umulh x4,x7,x12
    adds x5,x5,x3
    adcs x6,x6,x4
    adcs x15,xzr,xzr

// ### s2*s4 + s1*s5 + s0*s6 ###
    mul x3,x9,x11
    umulh x4,x9,x11
    adds x6,x6,x3
    adcs x15,x15,x4

    mul x3,x8,x12
    umulh x4,x8,x12
    adds x6,x6,x3
    adcs x15,x15,x4
    adcs x17,xzr,xzr

    mul x3,x7,x13
    umulh x4,x7,x13
    adds x6,x6,x3
    adcs x15,x15,x4
    adcs x17,x17,xzr

// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
    mul x3,x10,x11
    umulh x4,x10,x11
    adds x15,x15,x3
    adcs x17,x17,x4
    adcs x19,xzr,xzr

    mul x3,x9,x12
    umulh x4,x9,x12
    adds x15,x15,x3
    adcs x17,x17,x4
    adcs x19,x19,xzr

    mul x3,x8,x13
    umulh x4,x8,x13
    adds x15,x15,x3
    adcs x17,x17,x4
    adcs x19,x19,xzr

    mul x3,x7,x14
    umulh x4,x7,x14
    adds x15,x15,x3
    adcs x17,x17,x4
    adcs x19,x19,xzr

// ### s3*s5 + s2*s6 + s1*s7 ###
    mul x3,x10,x12
    umulh x4,x10,x12
    adds x17,x17,x3
    adcs x19,x19,x4
    adcs x20,xzr,xzr

    mul x3,x9,x13
    umulh x4,x9,x13
    adds x17,x17,x3
    adcs x19,x19,x4
    adcs x20,x20,xzr

    // From here on the finished high limbs are written over x11-x14;
    // each of those inputs is no longer read once it is overwritten
    mul x3,x8,x14
    umulh x4,x8,x14
    adds x11,x17,x3
    adcs x19,x19,x4
    adcs x20,x20,xzr

// ### s3*s6 + s2*s7 ###
    mul x3,x10,x13
    umulh x4,x10,x13
    adds x19,x19,x3
    adcs x20,x20,x4
    adcs x17,xzr,xzr

    mul x3,x9,x14
    umulh x4,x9,x14
    adds x12,x19,x3
    adcs x20,x20,x4
    adcs x17,x17,xzr

// ### s3*s7 ###
    mul x3,x10,x14
    umulh x4,x10,x14
    adds x13,x20,x3
    adcs x14,x17,x4

    // Move the low four limbs into the registers RDC expects
    mov x7,x16
    mov x8,x5
    mov x9,x6
    mov x10,x15

    // result of mul: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
    RDC

    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]

    // Restore scalar registers
    ldp x16,x17,[sp,#16]
    ldp x19,x20,[sp,#64]
    ldp x29,x30,[sp],#80

    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ecp_sm2p256_mul,.-ecp_sm2p256_mul

// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
//
// r = a * a reduced with RDC. The cross products are summed once and
// doubled, then the four squares are added on top. The input limbs are
// loaded into x11-x14 and named s4..s7 in the comments below.
// In:    x0 = r, x1 = a
// Stack: 80-byte frame: x29/x30 at [sp], x16/x17 at [sp,#16],
//        RDC scratch at [sp,#32]..[sp,#63], x19/x20 at [sp,#64]
// Clobb: x3-x15, flags (x16, x17, x19, x20 are saved and restored)
.globl ecp_sm2p256_sqr
.type ecp_sm2p256_sqr,%function
.align 5

ecp_sm2p256_sqr:
    AARCH64_SIGN_LINK_REGISTER
    // Store scalar registers
    stp x29,x30,[sp,#-80]!
    add x29,sp,#0
    stp x16,x17,[sp,#16]
    stp x19,x20,[sp,#64]

    // Load inputs
    ldp x11,x12,[x1]
    ldp x13,x14,[x1,#16]

// ### square ###
    // ========================
    //             s7 s6 s5 s4
    // *           s7 s6 s5 s4
    // ------------------------
    // +           s4 s4 s4 s4
    //              *  *  *  *
    //             s7 s6 s5 s4
    //          s5 s5 s5 s5
    //           *  *  *  *
    //          s7 s6 s5 s4
    //       s6 s6 s6 s6
    //        *  *  *  *
    //       s7 s6 s5 s4
    //    s7 s7 s7 s7
    //     *  *  *  *
    //    s7 s6 s5 s4
    // ------------------------
    // s7 s6 s5 s4 s3 s2 s1 s0
    // ========================

// ### s4*s5 ###
    mul x8,x11,x12
    umulh x9,x11,x12

// ### s4*s6 ###
    mul x3,x13,x11
    umulh x10,x13,x11
    adds x9,x9,x3
    adcs x10,x10,xzr

// ### s4*s7 + s5*s6 ###
    mul x3,x14,x11
    umulh x4,x14,x11
    adds x10,x10,x3
    adcs x7,x4,xzr

    mul x3,x13,x12
    umulh x4,x13,x12
    adds x10,x10,x3
    adcs x7,x7,x4
    adcs x5,xzr,xzr

// ### s5*s7 ###
    mul x3,x14,x12
    umulh x4,x14,x12
    adds x7,x7,x3
    adcs x5,x5,x4

// ### s6*s7 ###
    mul x3,x14,x13
    umulh x4,x14,x13
    adds x5,x5,x3
    adcs x6,x4,xzr

// ### 2*(t3,t2,s0,s3,s2,s1) ###
    adds x8,x8,x8
    adcs x9,x9,x9
    adcs x10,x10,x10
    adcs x7,x7,x7
    adcs x5,x5,x5
    adcs x6,x6,x6
    adcs x15,xzr,xzr

// ### s4*s4 ###
    mul x16,x11,x11
    umulh x17,x11,x11

// ### s5*s5 ###
    // Low half goes to x11 first; x12 is overwritten only by the second
    // instruction, after its last read as an input
    mul x11,x12,x12
    umulh x12,x12,x12

// ### s6*s6 ###
    mul x3,x13,x13
    umulh x4,x13,x13

// ### s7*s7 ###
    mul x19,x14,x14
    umulh x20,x14,x14

    // Add the squares onto the doubled cross products
    adds x8,x8,x17
    adcs x9,x9,x11
    adcs x10,x10,x12
    adcs x7,x7,x3
    adcs x5,x5,x4
    adcs x6,x6,x19
    adcs x15,x15,x20

    // Move the eight limbs into the registers RDC expects
    mov x11,x7
    mov x7,x16
    mov x12,x5
    mov x13,x6
    mov x14,x15

    // result of mul: s7 s6 s5 s4 s3 s2 s1 s0

// ### Reduction ###
    RDC

    stp x7,x8,[x0]
    stp x9,x10,[x0,#16]

    // Restore scalar registers
    ldp x16,x17,[sp,#16]
    ldp x19,x20,[sp,#64]
    ldp x29,x30,[sp],#80

    AARCH64_VALIDATE_LINK_REGISTER
    ret
.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr