/* Do not modify. This file is auto-generated from armv8-mont.pl. */
#include "arm_arch.h"
#ifndef __KERNEL__

.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	x5,#3
	b.ne	.Lmul_mont
	cmp	x5,#32
	b.le	.Lscalar_impl
#ifndef __KERNEL__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif

.Lscalar_impl:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it doesn't.
	//	So when does it carry? Is there an alternative way to
	//	deduce it? If you follow the operations, you can observe
	//	that the condition for carry is quite simple: x6 being
	//	non-zero. The carry can therefore be calculated by adding
	//	-1 to x6, which is what the next instruction does.
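	// A minimal sketch of that argument in C, added here as an
	// illustrative comment only (tp0 stands for the value in x6,
	// np0m1 for the low 64 bits of np[0]*m1):
	//
	//	uint64_t lo = tp0 + np0m1;	// == 0 mod 2^64 by construction
	//	unsigned carry = (tp0 != 0);	// the only surviving information
	//
	// "subs xzr,x6,#1" computes x6 - 1 and thus sets the carry flag
	// (no borrow) exactly when x6 != 0, which "adc x13,x13,xzr" then
	// folds into the high half.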
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// upmost overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// upmost overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
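	// Illustrative comment (not in the generated code): x23 holds the
	// unreduced tp[j] and x8 holds rp[j] = tp[j]-np[j]; "lo" means the
	// full subtraction above borrowed (t < n), in which case the
	// original tp word wins, i.e. r[j] = borrow ? tp[j] : (t-n)[j].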
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	x5,x5,#1
	eor	v14.16b,v14.16b,v14.16b

.align	4
.LNEON_8n:
	eor	v6.16b,v6.16b,v6.16b
	sub	x7,sp,#128
	eor	v7.16b,v7.16b,v7.16b
	sub	x7,x7,x5,lsl#4
	eor	v8.16b,v8.16b,v8.16b
	and	x7,x7,#-64
	eor	v9.16b,v9.16b,v9.16b
	mov	sp,x7			// alloca
	eor	v10.16b,v10.16b,v10.16b
	add	x7,x7,#256
	eor	v11.16b,v11.16b,v11.16b
	sub	x8,x5,#8
	eor	v12.16b,v12.16b,v12.16b
	eor	v13.16b,v13.16b,v13.16b

.LNEON_8n_init:
	st1	{v6.2d,v7.2d},[x7],#32
	subs	x8,x8,#8
	st1	{v8.2d,v9.2d},[x7],#32
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d,v13.2d},[x7],#32
	bne	.LNEON_8n_init

	add	x6,sp,#256
	ld1	{v0.4s,v1.4s},[x1],#32
	add	x10,sp,#8
	ldr	s30,[x4],#4
	mov	x9,x5
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	s28,[x2],#4		// *b++
	uxtl	v28.4s,v28.4h
	add	x7,sp,#128
	ld1	{v2.4s,v3.4s},[x3],#32

	umlal	v6.2d,v28.2s,v0.s[0]
	umlal	v7.2d,v28.2s,v0.s[1]
	umlal	v8.2d,v28.2s,v0.s[2]
	shl	v29.2d,v6.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v9.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v6.2d
	umlal	v10.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v11.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	v12.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v13.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v8.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v6.2d,#16
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	ext	v6.16b,v6.16b,v6.16b,#8
	add	v6.2d,v6.2d,v15.2d
	umlal	v11.2d,v29.2s,v3.s[1]
	ushr	v6.2d,v6.2d,#16
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	add	v16.2d,v7.2d,v6.2d
	ins	v7.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6],#16
	umlal	v8.2d,v28.2s,v0.s[1]
	umlal	v9.2d,v28.2s,v0.s[2]
	shl	v29.2d,v7.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v10.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v7.2d
	umlal	v11.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v12.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
	umlal	v13.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v6.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v9.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v7.2d,#16
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	umlal	v12.2d,v29.2s,v3.s[1]
	ushr	v7.2d,v7.2d,#16
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	add	v16.2d,v8.2d,v7.2d
	ins	v8.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6],#16
	umlal	v9.2d,v28.2s,v0.s[1]
	umlal	v10.2d,v28.2s,v0.s[2]
	shl	v29.2d,v8.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v11.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v8.2d
	umlal	v12.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v13.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
	umlal	v6.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v7.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v10.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v8.2d,#16
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	umlal	v13.2d,v29.2s,v3.s[1]
	ushr	v8.2d,v8.2d,#16
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	add	v16.2d,v9.2d,v8.2d
	ins	v9.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6],#16
	umlal	v10.2d,v28.2s,v0.s[1]
	umlal	v11.2d,v28.2s,v0.s[2]
	shl	v29.2d,v9.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v12.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v9.2d
	umlal	v13.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v6.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
	umlal	v7.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v8.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v11.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v9.2d,#16
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	umlal	v6.2d,v29.2s,v3.s[1]
	ushr	v9.2d,v9.2d,#16
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	add	v16.2d,v10.2d,v9.2d
	ins	v10.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6],#16
	umlal	v11.2d,v28.2s,v0.s[1]
	umlal	v12.2d,v28.2s,v0.s[2]
	shl	v29.2d,v10.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v13.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v10.2d
	umlal	v6.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v7.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
	umlal	v8.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v9.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v12.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v10.2d,#16
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	umlal	v7.2d,v29.2s,v3.s[1]
	ushr	v10.2d,v10.2d,#16
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	add	v16.2d,v11.2d,v10.2d
	ins	v11.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6],#16
	umlal	v12.2d,v28.2s,v0.s[1]
	umlal	v13.2d,v28.2s,v0.s[2]
	shl	v29.2d,v11.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v6.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v11.2d
	umlal	v7.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v8.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
	umlal	v9.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v10.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v13.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v11.2d,#16
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	umlal	v8.2d,v29.2s,v3.s[1]
	ushr	v11.2d,v11.2d,#16
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	add	v16.2d,v12.2d,v11.2d
	ins	v12.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6],#16
	umlal	v13.2d,v28.2s,v0.s[1]
	umlal	v6.2d,v28.2s,v0.s[2]
	shl	v29.2d,v12.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v7.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v12.2d
	umlal	v8.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v9.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
	umlal	v10.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v11.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v6.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v12.2d,#16
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	umlal	v9.2d,v29.2s,v3.s[1]
	ushr	v12.2d,v12.2d,#16
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	add	v16.2d,v13.2d,v12.2d
	ins	v13.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6],#16
	umlal	v6.2d,v28.2s,v0.s[1]
	umlal	v7.2d,v28.2s,v0.s[2]
	shl	v29.2d,v13.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v8.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v13.2d
	umlal	v9.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v10.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
	umlal	v11.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v12.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v6.2d,v29.2s,v2.s[1]
	umlal	v7.2d,v29.2s,v2.s[2]
	mov	v5.16b,v13.16b
	ushr	v5.2d,v5.2d,#16
	ext	v13.16b,v13.16b,v13.16b,#8
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	add	v13.2d,v13.2d,v5.2d
	umlal	v10.2d,v29.2s,v3.s[1]
	ushr	v13.2d,v13.2d,#16
	eor	v15.16b,v15.16b,v15.16b
	ins	v13.d[1],v15.d[0]
	umlal	v11.2d,v29.2s,v3.s[2]
	umlal	v12.2d,v29.2s,v3.s[3]
	add	v6.2d,v6.2d,v13.2d
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
	add	x10,sp,#8		// rewind
	sub	x8,x5,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	x8,x8,#8
	umlal	v6.2d,v28.2s,v0.s[0]
	ld1	{v13.2d},[x6]
	umlal	v7.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
	umlal	v8.2d,v28.2s,v0.s[2]
	ld1	{v2.4s,v3.4s},[x3],#32
	umlal	v9.2d,v28.2s,v0.s[3]
	b.eq	.LInner_jump
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump:
	umlal	v10.2d,v28.2s,v1.s[0]
	umlal	v11.2d,v28.2s,v1.s[1]
	umlal	v12.2d,v28.2s,v1.s[2]
	umlal	v13.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	umlal	v8.2d,v29.2s,v2.s[2]
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	umlal	v11.2d,v29.2s,v3.s[1]
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	st1	{v6.2d},[x7],#16
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6]
	umlal	v8.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
	umlal	v9.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump1
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump1:
	umlal	v10.2d,v28.2s,v0.s[3]
	umlal	v11.2d,v28.2s,v1.s[0]
	umlal	v12.2d,v28.2s,v1.s[1]
	umlal	v13.2d,v28.2s,v1.s[2]
	umlal	v6.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	umlal	v9.2d,v29.2s,v2.s[2]
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	umlal	v12.2d,v29.2s,v3.s[1]
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	st1	{v7.2d},[x7],#16
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6]
	umlal	v9.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
	umlal	v10.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump2
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump2:
	umlal	v11.2d,v28.2s,v0.s[3]
	umlal	v12.2d,v28.2s,v1.s[0]
	umlal	v13.2d,v28.2s,v1.s[1]
	umlal	v6.2d,v28.2s,v1.s[2]
	umlal	v7.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	umlal	v10.2d,v29.2s,v2.s[2]
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	umlal	v13.2d,v29.2s,v3.s[1]
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	st1	{v8.2d},[x7],#16
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6]
	umlal	v10.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
	umlal	v11.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump3
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump3:
	umlal	v12.2d,v28.2s,v0.s[3]
	umlal	v13.2d,v28.2s,v1.s[0]
	umlal	v6.2d,v28.2s,v1.s[1]
	umlal	v7.2d,v28.2s,v1.s[2]
	umlal	v8.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	umlal	v11.2d,v29.2s,v2.s[2]
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	umlal	v6.2d,v29.2s,v3.s[1]
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	st1	{v9.2d},[x7],#16
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6]
	umlal	v11.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
	umlal	v12.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump4
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump4:
	umlal	v13.2d,v28.2s,v0.s[3]
	umlal	v6.2d,v28.2s,v1.s[0]
	umlal	v7.2d,v28.2s,v1.s[1]
	umlal	v8.2d,v28.2s,v1.s[2]
	umlal	v9.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	umlal	v12.2d,v29.2s,v2.s[2]
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	umlal	v7.2d,v29.2s,v3.s[1]
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	st1	{v10.2d},[x7],#16
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6]
	umlal	v12.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
	umlal	v13.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump5
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump5:
	umlal	v6.2d,v28.2s,v0.s[3]
	umlal	v7.2d,v28.2s,v1.s[0]
	umlal	v8.2d,v28.2s,v1.s[1]
	umlal	v9.2d,v28.2s,v1.s[2]
	umlal	v10.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	umlal	v13.2d,v29.2s,v2.s[2]
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	umlal	v8.2d,v29.2s,v3.s[1]
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	st1	{v11.2d},[x7],#16
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6]
	umlal	v13.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
	umlal	v6.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump6
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump6:
	umlal	v7.2d,v28.2s,v0.s[3]
	umlal	v8.2d,v28.2s,v1.s[0]
	umlal	v9.2d,v28.2s,v1.s[1]
	umlal	v10.2d,v28.2s,v1.s[2]
	umlal	v11.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	umlal	v6.2d,v29.2s,v2.s[2]
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	umlal	v9.2d,v29.2s,v3.s[1]
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	st1	{v12.2d},[x7],#16
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6]
	umlal	v6.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
	umlal	v7.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump7
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump7:
	umlal	v8.2d,v28.2s,v0.s[3]
	umlal	v9.2d,v28.2s,v1.s[0]
	umlal	v10.2d,v28.2s,v1.s[1]
	umlal	v11.2d,v28.2s,v1.s[2]
	umlal	v12.2d,v28.2s,v1.s[3]
	b.ne	.LInner_after_rewind8
	sub	x1,x1,x5,lsl#2		// rewind
.LInner_after_rewind8:
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v6.2d,v29.2s,v2.s[1]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v7.2d,v29.2s,v2.s[2]
	add	x10,sp,#8		// rewind
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	umlal	v10.2d,v29.2s,v3.s[1]
	umlal	v11.2d,v29.2s,v3.s[2]
	st1	{v13.2d},[x7],#16
	umlal	v12.2d,v29.2s,v3.s[3]

	bne	.LNEON_8n_inner
	add	x6,sp,#128
	st1	{v6.2d,v7.2d},[x7],#32
	eor	v2.16b,v2.16b,v2.16b	// v2
	st1	{v8.2d,v9.2d},[x7],#32
	eor	v3.16b,v3.16b,v3.16b	// v3
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d},[x7]

	subs	x9,x9,#8
	ld1	{v6.2d,v7.2d},[x6],#32
	ld1	{v8.2d,v9.2d},[x6],#32
	ld1	{v10.2d,v11.2d},[x6],#32
	ld1	{v12.2d,v13.2d},[x6],#32

	b.eq	.LInner_8n_jump_2steps
	sub	x3,x3,x5,lsl#2		// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	x7,sp,#128
	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	st1	{v2.2d,v3.2d}, [sp],#32
	add	v6.2d,v6.2d,v15.2d
	st1	{v2.2d,v3.2d}, [sp],#32
	ushr	v15.2d,v6.2d,#16
	st1	{v2.2d,v3.2d}, [sp],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

	mov	x8,x5
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	v6.2d,v6.2d,v15.2d
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	ld1	{v8.2d,v9.2d}, [x6],#32
	add	v6.2d,v6.2d,v15.2d
	ld1	{v10.2d,v11.2d}, [x6],#32
	ushr	v15.2d,v6.2d,#16
	ld1	{v12.2d,v13.2d}, [x6],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

.LNEON_tail_entry:
	add	v7.2d,v7.2d,v15.2d
	st1	{v6.s}[0], [x7],#4
	ushr	v15.2d,v7.2d,#16
	mov	v5.16b,v7.16b
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	ushr	v15.2d,v7.2d,#16
	zip1	v7.4h,v5.4h,v7.4h
	ins	v15.d[1],v14.d[0]
	add	v8.2d,v8.2d,v15.2d
	st1	{v7.s}[0], [x7],#4
	ushr	v15.2d,v8.2d,#16
	mov	v5.16b,v8.16b
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	ushr	v15.2d,v8.2d,#16
	zip1	v8.4h,v5.4h,v8.4h
	ins	v15.d[1],v14.d[0]
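	// Illustrative comment (not part of the generated code): roughly,
	// the accumulators hold the product in a redundant form built from
	// 16-bit "smashed" digits spread across 64-bit lanes; the repeating
	// ushr/ext/add/zip1 pattern in this tail folds the pending carries
	// forward and packs each finished 32-bit word before st1 stores it.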
	add	v9.2d,v9.2d,v15.2d
	st1	{v8.s}[0], [x7],#4
	ushr	v15.2d,v9.2d,#16
	mov	v5.16b,v9.16b
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	ushr	v15.2d,v9.2d,#16
	zip1	v9.4h,v5.4h,v9.4h
	ins	v15.d[1],v14.d[0]
	add	v10.2d,v10.2d,v15.2d
	st1	{v9.s}[0], [x7],#4
	ushr	v15.2d,v10.2d,#16
	mov	v5.16b,v10.16b
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	ushr	v15.2d,v10.2d,#16
	zip1	v10.4h,v5.4h,v10.4h
	ins	v15.d[1],v14.d[0]
	add	v11.2d,v11.2d,v15.2d
	st1	{v10.s}[0], [x7],#4
	ushr	v15.2d,v11.2d,#16
	mov	v5.16b,v11.16b
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	ushr	v15.2d,v11.2d,#16
	zip1	v11.4h,v5.4h,v11.4h
	ins	v15.d[1],v14.d[0]
	add	v12.2d,v12.2d,v15.2d
	st1	{v11.s}[0], [x7],#4
	ushr	v15.2d,v12.2d,#16
	mov	v5.16b,v12.16b
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	ushr	v15.2d,v12.2d,#16
	zip1	v12.4h,v5.4h,v12.4h
	ins	v15.d[1],v14.d[0]
	add	v13.2d,v13.2d,v15.2d
	st1	{v12.s}[0], [x7],#4
	ushr	v15.2d,v13.2d,#16
	mov	v5.16b,v13.16b
	ext	v13.16b,v13.16b,v13.16b,#8
	add	v13.2d,v13.2d,v15.2d
	ushr	v15.2d,v13.2d,#16
	zip1	v13.4h,v5.4h,v13.4h
	ins	v15.d[1],v14.d[0]
	ld1	{v6.2d,v7.2d}, [x6],#32
	subs	x8,x8,#8
	st1	{v13.s}[0], [x7],#4
	bne	.LNEON_tail

	st1	{v15.s}[0], [x7],#4	// top-most bit
	sub	x3,x3,x5,lsl#2		// rewind x3
	subs	x1,sp,#0		// clear carry flag
	add	x2,sp,x5,lsl#2

.LNEON_sub:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x3],#8
	ldp	w10,w11,[x3],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,x2,x1
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [x1]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,x2,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	x1,sp
	sub	x0,x0,x11		// rewind x0
	mov	x3,x2			// second 3/4th of frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [x3],#32	// wipe
	st1	{v0.2d,v1.2d}, [x3],#32	// wipe
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	sub	x1,x1,#32
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [x1],#32	// wipe
	st1	{v0.2d,v1.2d}, [x3],#32	// wipe
	sub	x17,x2,x1		// preserves carry
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret				// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
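	// Illustrative comment: the 128-byte frame below holds x29/x30 and
	// the callee-saved x19-x28 pairs, with spare slots used to offload
	// rp and np (at sp+96) and n0 (at x29+112) for later stages.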
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	// a[1]a[0]	(i)
	// a[2]a[0]
	// a[3]a[0]
	// a[4]a[0]
	// a[5]a[0]
	// a[6]a[0]
	// a[7]a[0]
	// a[2]a[1]	(ii)
	// a[3]a[1]
	// a[4]a[1]
	// a[5]a[1]
	// a[6]a[1]
	// a[7]a[1]
	// a[3]a[2]	(iii)
	// a[4]a[2]
	// a[5]a[2]
	// a[6]a[2]
	// a[7]a[2]
	// a[4]a[3]	(iv)
	// a[5]a[3]
	// a[6]a[3]
	// a[7]a[3]
	// a[5]a[4]	(v)
	// a[6]a[4]
	// a[7]a[4]
	// a[6]a[5]	(vi)
	// a[7]a[5]
	// a[7]a[6]	(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])	(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])	(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])	(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])	(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])	(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])	(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])	(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldur	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldur	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldur	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
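	// Illustrative comment: x30 carried the top-most bit of t, so the
	// flags now encode whether t - n borrowed as a whole; the "lo"
	// selects in .Lsqr4x_cond_copy below keep the unreduced t words
	// when it did.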
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold result, x6-x13 hold modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
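	// Illustrative comment: same 128-byte frame shape as __bn_sqr8x_mont;
	// here rp and &b[num] are offloaded at x29+96 (see the stp below).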
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
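	// Illustrative comment: same trick as in the sqr8x path; the borrow
	// computed above steers the "lo" selects in .Lmul4x_cond_copy
	// between t and t - n.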
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4