/* Do not modify. This file is auto-generated from armv8-mont.pl. */
#include "arm_arch.h"
#ifndef __KERNEL__

.hidden	OPENSSL_armv8_rsa_neonized
#endif
.text

.globl	bn_mul_mont
.type	bn_mul_mont,%function
.align	5
bn_mul_mont:
	AARCH64_SIGN_LINK_REGISTER
.Lbn_mul_mont:
	tst	x5,#3
	b.ne	.Lmul_mont
	cmp	x5,#32
	b.le	.Lscalar_impl
#ifndef __KERNEL__
#ifndef __AARCH64EB__
	adrp	x17,OPENSSL_armv8_rsa_neonized
	ldr	w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
	cbnz	w17, bn_mul8x_mont_neon
#endif
#endif

.Lscalar_impl:
	tst	x5,#7
	b.eq	__bn_sqr8x_mont
	tst	x5,#3
	b.eq	__bn_mul4x_mont

.Lmul_mont:
	stp	x29,x30,[sp,#-64]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]

	ldr	x9,[x2],#8		// bp[0]
	sub	x22,sp,x5,lsl#3
	ldp	x7,x8,[x1],#16		// ap[0..1]
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	and	x22,x22,#-16		// ABI says so
	ldp	x13,x14,[x3],#16	// np[0..1]

	mul	x6,x7,x9		// ap[0]*bp[0]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	mul	x10,x8,x9		// ap[1]*bp[0]
	umulh	x11,x8,x9

	mul	x15,x6,x4		// "tp[0]"*n0
	mov	sp,x22			// alloca

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6	// discarded
	// (*)	On the removal of the first multiplication and addition
	//	instructions: the outcome of the first addition is
	//	guaranteed to be zero, which leaves two computationally
	//	significant outcomes: it either carries or it does not.
	//	So when does it carry, and is there an alternative way to
	//	deduce that? If you follow the operations, you can observe
	//	that the condition for a carry is quite simple: x6 being
	//	non-zero. The carry can therefore be calculated by adding
	//	-1 to x6, which is what the subs below does.
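	// Illustrative aside (editorial, not part of the generated code):
	// a minimal C sketch of the same trick, assuming m1 (x15) was
	// chosen so that lo(np[0]*m1) == (uint64_t)0 - x6 modulo 2^64;
	// np0 and m1 are hypothetical stand-ins for x13 and x15 here:
	//
	//	uint64_t lo = np0 * m1;          // == 0 - x6 (mod 2^64)
	//	int carry_add = (lo + x6) < lo;  // the discarded adds: carries iff x6 != 0
	//	int carry_sub = (x6 >= 1);       // what subs xzr,x6,#1 leaves in the C flag
	//	// carry_add == carry_sub for every value of x6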
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	adc	x13,x13,xzr
	cbz	x21,.L1st_skip

.L1st:
	ldr	x8,[x1],#8
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	ldr	x14,[x3],#8
	adds	x12,x16,x13
	mul	x10,x8,x9		// ap[j]*bp[0]
	adc	x13,x17,xzr
	umulh	x11,x8,x9

	adds	x12,x12,x6
	mul	x16,x14,x15		// np[j]*m1
	adc	x13,x13,xzr
	umulh	x17,x14,x15
	str	x12,[x22],#8		// tp[j-1]
	cbnz	x21,.L1st

.L1st_skip:
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adc	x13,x17,xzr

	adds	x12,x12,x6
	sub	x20,x5,#8		// i=num-1
	adcs	x13,x13,x7

	adc	x19,xzr,xzr		// top-most overflow bit
	stp	x12,x13,[x22]

.Louter:
	ldr	x9,[x2],#8		// bp[i]
	ldp	x7,x8,[x1],#16
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8

	mul	x6,x7,x9		// ap[0]*bp[i]
	sub	x21,x5,#16		// j=num-2
	umulh	x7,x7,x9
	ldp	x13,x14,[x3],#16
	mul	x10,x8,x9		// ap[1]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x15,x6,x4
	sub	x20,x20,#8		// i--

	// (*)	mul	x12,x13,x15	// np[0]*m1
	umulh	x13,x13,x15
	mul	x16,x14,x15		// np[1]*m1
	// (*)	adds	x12,x12,x6
	subs	xzr,x6,#1		// (*)
	umulh	x17,x14,x15
	cbz	x21,.Linner_skip

.Linner:
	ldr	x8,[x1],#8
	adc	x13,x13,xzr
	ldr	x23,[x22],#8		// tp[j]
	adds	x6,x10,x7
	sub	x21,x21,#8		// j--
	adc	x7,x11,xzr

	adds	x12,x16,x13
	ldr	x14,[x3],#8
	adc	x13,x17,xzr

	mul	x10,x8,x9		// ap[j]*bp[i]
	adds	x6,x6,x23
	umulh	x11,x8,x9
	adc	x7,x7,xzr

	mul	x16,x14,x15		// np[j]*m1
	adds	x12,x12,x6
	umulh	x17,x14,x15
	stur	x12,[x22,#-16]		// tp[j-1]
	cbnz	x21,.Linner

.Linner_skip:
	ldr	x23,[x22],#8		// tp[j]
	adc	x13,x13,xzr
	adds	x6,x10,x7
	sub	x1,x1,x5		// rewind x1
	adc	x7,x11,xzr

	adds	x12,x16,x13
	sub	x3,x3,x5		// rewind x3
	adcs	x13,x17,x19
	adc	x19,xzr,xzr

	adds	x6,x6,x23
	adc	x7,x7,xzr

	adds	x12,x12,x6
	adcs	x13,x13,x7
	adc	x19,x19,xzr		// top-most overflow bit
	stp	x12,x13,[x22,#-16]

	cbnz	x20,.Louter

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But a comparison
	// implies a subtraction, so we subtract the modulus, check
	// whether that borrowed, and conditionally copy the original
	// value back.
	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x14,[x3],#8		// np[0]
	subs	x21,x5,#8		// j=num-1 and clear borrow
	mov	x1,x0
.Lsub:
	sbcs	x8,x23,x14		// tp[j]-np[j]
	ldr	x23,[x22],#8
	sub	x21,x21,#8		// j--
	ldr	x14,[x3],#8
	str	x8,[x1],#8		// rp[j]=tp[j]-np[j]
	cbnz	x21,.Lsub

	sbcs	x8,x23,x14
	sbcs	x19,x19,xzr		// did it borrow?
	str	x8,[x1],#8		// rp[num-1]

	ldr	x23,[sp]		// tp[0]
	add	x22,sp,#8
	ldr	x8,[x0],#8		// rp[0]
	sub	x5,x5,#8		// num--
	nop
.Lcond_copy:
	sub	x5,x5,#8		// num--
	csel	x14,x23,x8,lo		// did it borrow?
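	// Editorial note: "lo" is the unsigned carry-clear condition, so
	// the csel above keeps the original tp[j] (x23) when the
	// subtraction chain borrowed (i.e. t < n), and keeps the freshly
	// stored difference rp[j] (x8) otherwise; the loads and stores in
	// this loop do not disturb the carry flag.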
	ldr	x23,[x22],#8
	ldr	x8,[x0],#8
	stur	xzr,[x22,#-16]		// wipe tp
	stur	x14,[x0,#-16]
	cbnz	x5,.Lcond_copy

	csel	x14,x23,x8,lo
	stur	xzr,[x22,#-8]		// wipe tp
	stur	x14,[x0,#-8]

	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldr	x29,[sp],#64
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	bn_mul_mont,.-bn_mul_mont
.type	bn_mul8x_mont_neon,%function
.align	5
bn_mul8x_mont_neon:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-80]!
	mov	x16,sp
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	lsl	x5,x5,#1
	eor	v14.16b,v14.16b,v14.16b

.align	4
.LNEON_8n:
	eor	v6.16b,v6.16b,v6.16b
	sub	x7,sp,#128
	eor	v7.16b,v7.16b,v7.16b
	sub	x7,x7,x5,lsl#4
	eor	v8.16b,v8.16b,v8.16b
	and	x7,x7,#-64
	eor	v9.16b,v9.16b,v9.16b
	mov	sp,x7			// alloca
	eor	v10.16b,v10.16b,v10.16b
	add	x7,x7,#256
	eor	v11.16b,v11.16b,v11.16b
	sub	x8,x5,#8
	eor	v12.16b,v12.16b,v12.16b
	eor	v13.16b,v13.16b,v13.16b

.LNEON_8n_init:
	st1	{v6.2d,v7.2d},[x7],#32
	subs	x8,x8,#8
	st1	{v8.2d,v9.2d},[x7],#32
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d,v13.2d},[x7],#32
	bne	.LNEON_8n_init

	add	x6,sp,#256
	ld1	{v0.4s,v1.4s},[x1],#32
	add	x10,sp,#8
	ldr	s30,[x4],#4
	mov	x9,x5
	b	.LNEON_8n_outer

.align	4
.LNEON_8n_outer:
	ldr	s28,[x2],#4		// *b++
	uxtl	v28.4s,v28.4h
	add	x7,sp,#128
	ld1	{v2.4s,v3.4s},[x3],#32

	umlal	v6.2d,v28.2s,v0.s[0]
	umlal	v7.2d,v28.2s,v0.s[1]
	umlal	v8.2d,v28.2s,v0.s[2]
	shl	v29.2d,v6.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v9.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v6.2d
	umlal	v10.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v11.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[sp]		// put aside smashed b[8*i+0]
	umlal	v12.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v13.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v8.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v6.2d,#16
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	ext	v6.16b,v6.16b,v6.16b,#8
	add	v6.2d,v6.2d,v15.2d
	umlal	v11.2d,v29.2s,v3.s[1]
	ushr	v6.2d,v6.2d,#16
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	add	v16.2d,v7.2d,v6.2d
	ins	v7.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+0]
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6],#16
	umlal	v8.2d,v28.2s,v0.s[1]
	umlal	v9.2d,v28.2s,v0.s[2]
	shl	v29.2d,v7.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v10.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v7.2d
	umlal	v11.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v12.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+1]
	umlal	v13.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v6.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v9.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v7.2d,#16
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	umlal	v12.2d,v29.2s,v3.s[1]
	ushr	v7.2d,v7.2d,#16
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	add	v16.2d,v8.2d,v7.2d
	ins	v8.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+1]
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6],#16
	umlal	v9.2d,v28.2s,v0.s[1]
	umlal	v10.2d,v28.2s,v0.s[2]
	shl	v29.2d,v8.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v11.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v8.2d
	umlal	v12.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v13.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+2]
	umlal	v6.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v7.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v10.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v8.2d,#16
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	umlal	v13.2d,v29.2s,v3.s[1]
	ushr	v8.2d,v8.2d,#16
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	add	v16.2d,v9.2d,v8.2d
	ins	v9.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+2]
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6],#16
	umlal	v10.2d,v28.2s,v0.s[1]
	umlal	v11.2d,v28.2s,v0.s[2]
	shl	v29.2d,v9.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v12.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v9.2d
	umlal	v13.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v6.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+3]
	umlal	v7.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v8.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v11.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v9.2d,#16
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	umlal	v6.2d,v29.2s,v3.s[1]
	ushr	v9.2d,v9.2d,#16
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	add	v16.2d,v10.2d,v9.2d
	ins	v10.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+3]
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6],#16
	umlal	v11.2d,v28.2s,v0.s[1]
	umlal	v12.2d,v28.2s,v0.s[2]
	shl	v29.2d,v10.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v13.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v10.2d
	umlal	v6.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v7.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+4]
	umlal	v8.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v9.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v12.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v10.2d,#16
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	umlal	v7.2d,v29.2s,v3.s[1]
	ushr	v10.2d,v10.2d,#16
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	add	v16.2d,v11.2d,v10.2d
	ins	v11.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+4]
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6],#16
	umlal	v12.2d,v28.2s,v0.s[1]
	umlal	v13.2d,v28.2s,v0.s[2]
	shl	v29.2d,v11.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v6.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v11.2d
	umlal	v7.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v8.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+5]
	umlal	v9.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v10.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v13.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v11.2d,#16
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	umlal	v8.2d,v29.2s,v3.s[1]
	ushr	v11.2d,v11.2d,#16
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	add	v16.2d,v12.2d,v11.2d
	ins	v12.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+5]
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6],#16
	umlal	v13.2d,v28.2s,v0.s[1]
	umlal	v6.2d,v28.2s,v0.s[2]
	shl	v29.2d,v12.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v7.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v12.2d
	umlal	v8.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v9.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+6]
	umlal	v10.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v11.2d,v28.2s,v1.s[3]
	ldr	s28,[x2],#4		// *b++
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	uxtl	v28.4s,v28.4h
	umlal	v6.2d,v29.2s,v2.s[2]
	ushr	v15.2d,v12.2d,#16
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	umlal	v9.2d,v29.2s,v3.s[1]
	ushr	v12.2d,v12.2d,#16
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	add	v16.2d,v13.2d,v12.2d
	ins	v13.d[0],v16.d[0]
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+6]
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6],#16
	umlal	v6.2d,v28.2s,v0.s[1]
	umlal	v7.2d,v28.2s,v0.s[2]
	shl	v29.2d,v13.2d,#16
	ext	v29.16b,v29.16b,v29.16b,#8
	umlal	v8.2d,v28.2s,v0.s[3]
	add	v29.2d,v29.2d,v13.2d
	umlal	v9.2d,v28.2s,v1.s[0]
	mul	v29.2s,v29.2s,v30.2s
	umlal	v10.2d,v28.2s,v1.s[1]
	st1	{v28.2s},[x10],#8	// put aside smashed b[8*i+7]
	umlal	v11.2d,v28.2s,v1.s[2]
	uxtl	v29.4s,v29.4h
	umlal	v12.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v6.2d,v29.2s,v2.s[1]
	umlal	v7.2d,v29.2s,v2.s[2]
	mov	v5.16b,v13.16b
	ushr	v5.2d,v5.2d,#16
	ext	v13.16b,v13.16b,v13.16b,#8
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	add	v13.2d,v13.2d,v5.2d
	umlal	v10.2d,v29.2s,v3.s[1]
	ushr	v13.2d,v13.2d,#16
	eor	v15.16b,v15.16b,v15.16b
	ins	v13.d[1],v15.d[0]
	umlal	v11.2d,v29.2s,v3.s[2]
	umlal	v12.2d,v29.2s,v3.s[3]
	add	v6.2d,v6.2d,v13.2d
	st1	{v29.2s},[x10],#8	// put aside smashed m[8*i+7]
	add	x10,sp,#8		// rewind
	sub	x8,x5,#8
	b	.LNEON_8n_inner

.align	4
.LNEON_8n_inner:
	subs	x8,x8,#8
	umlal	v6.2d,v28.2s,v0.s[0]
	ld1	{v13.2d},[x6]
	umlal	v7.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+0]
	umlal	v8.2d,v28.2s,v0.s[2]
	ld1	{v2.4s,v3.4s},[x3],#32
	umlal	v9.2d,v28.2s,v0.s[3]
	b.eq	.LInner_jump
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump:
	umlal	v10.2d,v28.2s,v1.s[0]
	umlal	v11.2d,v28.2s,v1.s[1]
	umlal	v12.2d,v28.2s,v1.s[2]
	umlal	v13.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+1]
	umlal	v6.2d,v29.2s,v2.s[0]
	umlal	v7.2d,v29.2s,v2.s[1]
	umlal	v8.2d,v29.2s,v2.s[2]
	umlal	v9.2d,v29.2s,v2.s[3]
	umlal	v10.2d,v29.2s,v3.s[0]
	umlal	v11.2d,v29.2s,v3.s[1]
	umlal	v12.2d,v29.2s,v3.s[2]
	umlal	v13.2d,v29.2s,v3.s[3]
	st1	{v6.2d},[x7],#16
	umlal	v7.2d,v28.2s,v0.s[0]
	ld1	{v6.2d},[x6]
	umlal	v8.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+1]
	umlal	v9.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump1
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump1:
	umlal	v10.2d,v28.2s,v0.s[3]
	umlal	v11.2d,v28.2s,v1.s[0]
	umlal	v12.2d,v28.2s,v1.s[1]
	umlal	v13.2d,v28.2s,v1.s[2]
	umlal	v6.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+2]
	umlal	v7.2d,v29.2s,v2.s[0]
	umlal	v8.2d,v29.2s,v2.s[1]
	umlal	v9.2d,v29.2s,v2.s[2]
	umlal	v10.2d,v29.2s,v2.s[3]
	umlal	v11.2d,v29.2s,v3.s[0]
	umlal	v12.2d,v29.2s,v3.s[1]
	umlal	v13.2d,v29.2s,v3.s[2]
	umlal	v6.2d,v29.2s,v3.s[3]
	st1	{v7.2d},[x7],#16
	umlal	v8.2d,v28.2s,v0.s[0]
	ld1	{v7.2d},[x6]
	umlal	v9.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+2]
	umlal	v10.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump2
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump2:
	umlal	v11.2d,v28.2s,v0.s[3]
	umlal	v12.2d,v28.2s,v1.s[0]
	umlal	v13.2d,v28.2s,v1.s[1]
	umlal	v6.2d,v28.2s,v1.s[2]
	umlal	v7.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+3]
	umlal	v8.2d,v29.2s,v2.s[0]
	umlal	v9.2d,v29.2s,v2.s[1]
	umlal	v10.2d,v29.2s,v2.s[2]
	umlal	v11.2d,v29.2s,v2.s[3]
	umlal	v12.2d,v29.2s,v3.s[0]
	umlal	v13.2d,v29.2s,v3.s[1]
	umlal	v6.2d,v29.2s,v3.s[2]
	umlal	v7.2d,v29.2s,v3.s[3]
	st1	{v8.2d},[x7],#16
	umlal	v9.2d,v28.2s,v0.s[0]
	ld1	{v8.2d},[x6]
	umlal	v10.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+3]
	umlal	v11.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump3
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump3:
	umlal	v12.2d,v28.2s,v0.s[3]
	umlal	v13.2d,v28.2s,v1.s[0]
	umlal	v6.2d,v28.2s,v1.s[1]
	umlal	v7.2d,v28.2s,v1.s[2]
	umlal	v8.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+4]
	umlal	v9.2d,v29.2s,v2.s[0]
	umlal	v10.2d,v29.2s,v2.s[1]
	umlal	v11.2d,v29.2s,v2.s[2]
	umlal	v12.2d,v29.2s,v2.s[3]
	umlal	v13.2d,v29.2s,v3.s[0]
	umlal	v6.2d,v29.2s,v3.s[1]
	umlal	v7.2d,v29.2s,v3.s[2]
	umlal	v8.2d,v29.2s,v3.s[3]
	st1	{v9.2d},[x7],#16
	umlal	v10.2d,v28.2s,v0.s[0]
	ld1	{v9.2d},[x6]
	umlal	v11.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+4]
	umlal	v12.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump4
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump4:
	umlal	v13.2d,v28.2s,v0.s[3]
	umlal	v6.2d,v28.2s,v1.s[0]
	umlal	v7.2d,v28.2s,v1.s[1]
	umlal	v8.2d,v28.2s,v1.s[2]
	umlal	v9.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+5]
	umlal	v10.2d,v29.2s,v2.s[0]
	umlal	v11.2d,v29.2s,v2.s[1]
	umlal	v12.2d,v29.2s,v2.s[2]
	umlal	v13.2d,v29.2s,v2.s[3]
	umlal	v6.2d,v29.2s,v3.s[0]
	umlal	v7.2d,v29.2s,v3.s[1]
	umlal	v8.2d,v29.2s,v3.s[2]
	umlal	v9.2d,v29.2s,v3.s[3]
	st1	{v10.2d},[x7],#16
	umlal	v11.2d,v28.2s,v0.s[0]
	ld1	{v10.2d},[x6]
	umlal	v12.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+5]
	umlal	v13.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump5
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump5:
	umlal	v6.2d,v28.2s,v0.s[3]
	umlal	v7.2d,v28.2s,v1.s[0]
	umlal	v8.2d,v28.2s,v1.s[1]
	umlal	v9.2d,v28.2s,v1.s[2]
	umlal	v10.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+6]
	umlal	v11.2d,v29.2s,v2.s[0]
	umlal	v12.2d,v29.2s,v2.s[1]
	umlal	v13.2d,v29.2s,v2.s[2]
	umlal	v6.2d,v29.2s,v2.s[3]
	umlal	v7.2d,v29.2s,v3.s[0]
	umlal	v8.2d,v29.2s,v3.s[1]
	umlal	v9.2d,v29.2s,v3.s[2]
	umlal	v10.2d,v29.2s,v3.s[3]
	st1	{v11.2d},[x7],#16
	umlal	v12.2d,v28.2s,v0.s[0]
	ld1	{v11.2d},[x6]
	umlal	v13.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+6]
	umlal	v6.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump6
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump6:
	umlal	v7.2d,v28.2s,v0.s[3]
	umlal	v8.2d,v28.2s,v1.s[0]
	umlal	v9.2d,v28.2s,v1.s[1]
	umlal	v10.2d,v28.2s,v1.s[2]
	umlal	v11.2d,v28.2s,v1.s[3]
	ld1	{v28.2s},[x10],#8	// pull smashed b[8*i+7]
	umlal	v12.2d,v29.2s,v2.s[0]
	umlal	v13.2d,v29.2s,v2.s[1]
	umlal	v6.2d,v29.2s,v2.s[2]
	umlal	v7.2d,v29.2s,v2.s[3]
	umlal	v8.2d,v29.2s,v3.s[0]
	umlal	v9.2d,v29.2s,v3.s[1]
	umlal	v10.2d,v29.2s,v3.s[2]
	umlal	v11.2d,v29.2s,v3.s[3]
	st1	{v12.2d},[x7],#16
	umlal	v13.2d,v28.2s,v0.s[0]
	ld1	{v12.2d},[x6]
	umlal	v6.2d,v28.2s,v0.s[1]
	ld1	{v29.2s},[x10],#8	// pull smashed m[8*i+7]
	umlal	v7.2d,v28.2s,v0.s[2]
	b.eq	.LInner_jump7
	add	x6,x6,#16		// don't advance in last iteration
.LInner_jump7:
	umlal	v8.2d,v28.2s,v0.s[3]
	umlal	v9.2d,v28.2s,v1.s[0]
	umlal	v10.2d,v28.2s,v1.s[1]
	umlal	v11.2d,v28.2s,v1.s[2]
	umlal	v12.2d,v28.2s,v1.s[3]
	b.ne	.LInner_after_rewind8
	sub	x1,x1,x5,lsl#2		// rewind
.LInner_after_rewind8:
	umlal	v13.2d,v29.2s,v2.s[0]
	ld1	{v28.2s},[sp]		// pull smashed b[8*i+0]
	umlal	v6.2d,v29.2s,v2.s[1]
	ld1	{v0.4s,v1.4s},[x1],#32
	umlal	v7.2d,v29.2s,v2.s[2]
	add	x10,sp,#8		// rewind
	umlal	v8.2d,v29.2s,v2.s[3]
	umlal	v9.2d,v29.2s,v3.s[0]
	umlal	v10.2d,v29.2s,v3.s[1]
	umlal	v11.2d,v29.2s,v3.s[2]
	st1	{v13.2d},[x7],#16
	umlal	v12.2d,v29.2s,v3.s[3]

	bne	.LNEON_8n_inner
	add	x6,sp,#128
	st1	{v6.2d,v7.2d},[x7],#32
	eor	v2.16b,v2.16b,v2.16b	// v2
	st1	{v8.2d,v9.2d},[x7],#32
	eor	v3.16b,v3.16b,v3.16b	// v3
	st1	{v10.2d,v11.2d},[x7],#32
	st1	{v12.2d},[x7]

	subs	x9,x9,#8
	ld1	{v6.2d,v7.2d},[x6],#32
	ld1	{v8.2d,v9.2d},[x6],#32
	ld1	{v10.2d,v11.2d},[x6],#32
	ld1	{v12.2d,v13.2d},[x6],#32

	b.eq	.LInner_8n_jump_2steps
	sub	x3,x3,x5,lsl#2		// rewind
	b	.LNEON_8n_outer

.LInner_8n_jump_2steps:
	add	x7,sp,#128
	st1	{v2.2d,v3.2d}, [sp],#32	// start wiping stack frame
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	st1	{v2.2d,v3.2d}, [sp],#32
	add	v6.2d,v6.2d,v15.2d
	st1	{v2.2d,v3.2d}, [sp],#32
	ushr	v15.2d,v6.2d,#16
	st1	{v2.2d,v3.2d}, [sp],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

	mov	x8,x5
	b	.LNEON_tail_entry

.align	4
.LNEON_tail:
	add	v6.2d,v6.2d,v15.2d
	mov	v5.16b,v6.16b
	ushr	v15.2d,v6.2d,#16
	ext	v6.16b,v6.16b,v6.16b,#8
	ld1	{v8.2d,v9.2d}, [x6],#32
	add	v6.2d,v6.2d,v15.2d
	ld1	{v10.2d,v11.2d}, [x6],#32
	ushr	v15.2d,v6.2d,#16
	ld1	{v12.2d,v13.2d}, [x6],#32
	zip1	v6.4h,v5.4h,v6.4h
	ins	v15.d[1],v14.d[0]

.LNEON_tail_entry:
	add	v7.2d,v7.2d,v15.2d
	st1	{v6.s}[0], [x7],#4
	ushr	v15.2d,v7.2d,#16
	mov	v5.16b,v7.16b
	ext	v7.16b,v7.16b,v7.16b,#8
	add	v7.2d,v7.2d,v15.2d
	ushr	v15.2d,v7.2d,#16
	zip1	v7.4h,v5.4h,v7.4h
	ins	v15.d[1],v14.d[0]
	add	v8.2d,v8.2d,v15.2d
	st1	{v7.s}[0], [x7],#4
	ushr	v15.2d,v8.2d,#16
	mov	v5.16b,v8.16b
	ext	v8.16b,v8.16b,v8.16b,#8
	add	v8.2d,v8.2d,v15.2d
	ushr	v15.2d,v8.2d,#16
	zip1	v8.4h,v5.4h,v8.4h
	ins	v15.d[1],v14.d[0]
	add	v9.2d,v9.2d,v15.2d
	st1	{v8.s}[0], [x7],#4
	ushr	v15.2d,v9.2d,#16
	mov	v5.16b,v9.16b
	ext	v9.16b,v9.16b,v9.16b,#8
	add	v9.2d,v9.2d,v15.2d
	ushr	v15.2d,v9.2d,#16
	zip1	v9.4h,v5.4h,v9.4h
	ins	v15.d[1],v14.d[0]
	add	v10.2d,v10.2d,v15.2d
	st1	{v9.s}[0], [x7],#4
	ushr	v15.2d,v10.2d,#16
	mov	v5.16b,v10.16b
	ext	v10.16b,v10.16b,v10.16b,#8
	add	v10.2d,v10.2d,v15.2d
	ushr	v15.2d,v10.2d,#16
	zip1	v10.4h,v5.4h,v10.4h
	ins	v15.d[1],v14.d[0]
	add	v11.2d,v11.2d,v15.2d
	st1	{v10.s}[0], [x7],#4
	ushr	v15.2d,v11.2d,#16
	mov	v5.16b,v11.16b
	ext	v11.16b,v11.16b,v11.16b,#8
	add	v11.2d,v11.2d,v15.2d
	ushr	v15.2d,v11.2d,#16
	zip1	v11.4h,v5.4h,v11.4h
	ins	v15.d[1],v14.d[0]
	add	v12.2d,v12.2d,v15.2d
	st1	{v11.s}[0], [x7],#4
	ushr	v15.2d,v12.2d,#16
	mov	v5.16b,v12.16b
	ext	v12.16b,v12.16b,v12.16b,#8
	add	v12.2d,v12.2d,v15.2d
	ushr	v15.2d,v12.2d,#16
	zip1	v12.4h,v5.4h,v12.4h
	ins	v15.d[1],v14.d[0]
	add	v13.2d,v13.2d,v15.2d
	st1	{v12.s}[0], [x7],#4
	ushr	v15.2d,v13.2d,#16
	mov	v5.16b,v13.16b
	ext	v13.16b,v13.16b,v13.16b,#8
	add	v13.2d,v13.2d,v15.2d
	ushr	v15.2d,v13.2d,#16
	zip1	v13.4h,v5.4h,v13.4h
	ins	v15.d[1],v14.d[0]
	ld1	{v6.2d,v7.2d}, [x6],#32
	subs	x8,x8,#8
	st1	{v13.s}[0], [x7],#4
	bne	.LNEON_tail

	st1	{v15.s}[0], [x7],#4	// top-most bit
	sub	x3,x3,x5,lsl#2		// rewind x3
	subs	x1,sp,#0		// clear carry flag
	add	x2,sp,x5,lsl#2

.LNEON_sub:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x3],#8
	ldp	w10,w11,[x3],#8
	sbcs	w8,w4,w8
	sbcs	w9,w5,w9
	sbcs	w10,w6,w10
	sbcs	w11,w7,w11
	sub	x17,x2,x1
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_sub

	ldr	w10, [x1]		// load top-most bit
	mov	x11,sp
	eor	v0.16b,v0.16b,v0.16b
	sub	x11,x2,x11		// this is num*4
	eor	v1.16b,v1.16b,v1.16b
	mov	x1,sp
	sub	x0,x0,x11		// rewind x0
	mov	x3,x2			// second 3/4 of the frame
	sbcs	w10,w10,wzr		// result is carry flag

.LNEON_copy_n_zap:
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_1
	mov	w8,w4
	mov	w9,w5
	mov	w10,w6
	mov	w11,w7
.LCopy_1:
	st1	{v0.2d,v1.2d}, [x3],#32	// wipe
	st1	{v0.2d,v1.2d}, [x3],#32	// wipe
	ldp	w4,w5,[x1],#8
	ldp	w6,w7,[x1],#8
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	sub	x1,x1,#32
	ldp	w8,w9,[x0],#8
	ldp	w10,w11,[x0]
	sub	x0,x0,#8
	b.cs	.LCopy_2
	mov	w8, w4
	mov	w9, w5
	mov	w10, w6
	mov	w11, w7
.LCopy_2:
	st1	{v0.2d,v1.2d}, [x1],#32	// wipe
	st1	{v0.2d,v1.2d}, [x3],#32	// wipe
	sub	x17,x2,x1		// preserves carry
	stp	w8,w9,[x0],#8
	stp	w10,w11,[x0],#8
	cbnz	x17,.LNEON_copy_n_zap

	mov	sp,x16
	ldp	d14,d15,[sp,#64]
	ldp	d12,d13,[sp,#48]
	ldp	d10,d11,[sp,#32]
	ldp	d8,d9,[sp,#16]
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret				// bx lr

.size	bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
.type	__bn_sqr8x_mont,%function
.align	5
__bn_sqr8x_mont:
	cmp	x1,x2
	b.ne	__bn_mul4x_mont
.Lsqr8x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
	// only from bn_mul_mont which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	stp	x0,x3,[sp,#96]		// offload rp and np

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	ldp	x12,x13,[x1,#8*6]

	sub	x2,sp,x5,lsl#4
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	mov	sp,x2			// alloca
	sub	x27,x5,#8*8
	b	.Lsqr8x_zero_start

.Lsqr8x_zero:
	sub	x27,x27,#8*8
	stp	xzr,xzr,[x2,#8*0]
	stp	xzr,xzr,[x2,#8*2]
	stp	xzr,xzr,[x2,#8*4]
	stp	xzr,xzr,[x2,#8*6]
.Lsqr8x_zero_start:
	stp	xzr,xzr,[x2,#8*8]
	stp	xzr,xzr,[x2,#8*10]
	stp	xzr,xzr,[x2,#8*12]
	stp	xzr,xzr,[x2,#8*14]
	add	x2,x2,#8*16
	cbnz	x27,.Lsqr8x_zero

	add	x3,x1,x5
	add	x1,x1,#8*8
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	mov	x23,xzr
	mov	x24,xzr
	mov	x25,xzr
	mov	x26,xzr
	mov	x2,sp
	str	x4,[x29,#112]		// offload n0

	// Multiply everything but a[i]*a[i]
.align	4
.Lsqr8x_outer_loop:
	//                        a[1]a[0]	(i)
	//                    a[2]a[0]
	//                a[3]a[0]
	//            a[4]a[0]
	//        a[5]a[0]
	//    a[6]a[0]
	// a[7]a[0]
	//                    a[2]a[1]		(ii)
	//                a[3]a[1]
	//            a[4]a[1]
	//        a[5]a[1]
	//    a[6]a[1]
	// a[7]a[1]
	//                a[3]a[2]		(iii)
	//            a[4]a[2]
	//        a[5]a[2]
	//    a[6]a[2]
	// a[7]a[2]
	//            a[4]a[3]			(iv)
	//        a[5]a[3]
	//    a[6]a[3]
	// a[7]a[3]
	//        a[5]a[4]			(v)
	//    a[6]a[4]
	// a[7]a[4]
	//    a[6]a[5]				(vi)
	// a[7]a[5]
	// a[7]a[6]				(vii)

	mul	x14,x7,x6		// lo(a[1..7]*a[0])	(i)
	mul	x15,x8,x6
	mul	x16,x9,x6
	mul	x17,x10,x6
	adds	x20,x20,x14		// t[1]+lo(a[1]*a[0])
	mul	x14,x11,x6
	adcs	x21,x21,x15
	mul	x15,x12,x6
	adcs	x22,x22,x16
	mul	x16,x13,x6
	adcs	x23,x23,x17
	umulh	x17,x7,x6		// hi(a[1..7]*a[0])
	adcs	x24,x24,x14
	umulh	x14,x8,x6
	adcs	x25,x25,x15
	umulh	x15,x9,x6
	adcs	x26,x26,x16
	umulh	x16,x10,x6
	stp	x19,x20,[x2],#8*2	// t[0..1]
	adc	x19,xzr,xzr		// t[8]
	adds	x21,x21,x17		// t[2]+hi(a[1]*a[0])
	umulh	x17,x11,x6
	adcs	x22,x22,x14
	umulh	x14,x12,x6
	adcs	x23,x23,x15
	umulh	x15,x13,x6
	adcs	x24,x24,x16
	mul	x16,x8,x7		// lo(a[2..7]*a[1])	(ii)
	adcs	x25,x25,x17
	mul	x17,x9,x7
	adcs	x26,x26,x14
	mul	x14,x10,x7
	adc	x19,x19,x15

	mul	x15,x11,x7
	adds	x22,x22,x16
	mul	x16,x12,x7
	adcs	x23,x23,x17
	mul	x17,x13,x7
	adcs	x24,x24,x14
	umulh	x14,x8,x7		// hi(a[2..7]*a[1])
	adcs	x25,x25,x15
	umulh	x15,x9,x7
	adcs	x26,x26,x16
	umulh	x16,x10,x7
	adcs	x19,x19,x17
	umulh	x17,x11,x7
	stp	x21,x22,[x2],#8*2	// t[2..3]
	adc	x20,xzr,xzr		// t[9]
	adds	x23,x23,x14
	umulh	x14,x12,x7
	adcs	x24,x24,x15
	umulh	x15,x13,x7
	adcs	x25,x25,x16
	mul	x16,x9,x8		// lo(a[3..7]*a[2])	(iii)
	adcs	x26,x26,x17
	mul	x17,x10,x8
	adcs	x19,x19,x14
	mul	x14,x11,x8
	adc	x20,x20,x15

	mul	x15,x12,x8
	adds	x24,x24,x16
	mul	x16,x13,x8
	adcs	x25,x25,x17
	umulh	x17,x9,x8		// hi(a[3..7]*a[2])
	adcs	x26,x26,x14
	umulh	x14,x10,x8
	adcs	x19,x19,x15
	umulh	x15,x11,x8
	adcs	x20,x20,x16
	umulh	x16,x12,x8
	stp	x23,x24,[x2],#8*2	// t[4..5]
	adc	x21,xzr,xzr		// t[10]
	adds	x25,x25,x17
	umulh	x17,x13,x8
	adcs	x26,x26,x14
	mul	x14,x10,x9		// lo(a[4..7]*a[3])	(iv)
	adcs	x19,x19,x15
	mul	x15,x11,x9
	adcs	x20,x20,x16
	mul	x16,x12,x9
	adc	x21,x21,x17

	mul	x17,x13,x9
	adds	x26,x26,x14
	umulh	x14,x10,x9		// hi(a[4..7]*a[3])
	adcs	x19,x19,x15
	umulh	x15,x11,x9
	adcs	x20,x20,x16
	umulh	x16,x12,x9
	adcs	x21,x21,x17
	umulh	x17,x13,x9
	stp	x25,x26,[x2],#8*2	// t[6..7]
	adc	x22,xzr,xzr		// t[11]
	adds	x19,x19,x14
	mul	x14,x11,x10		// lo(a[5..7]*a[4])	(v)
	adcs	x20,x20,x15
	mul	x15,x12,x10
	adcs	x21,x21,x16
	mul	x16,x13,x10
	adc	x22,x22,x17

	umulh	x17,x11,x10		// hi(a[5..7]*a[4])
	adds	x20,x20,x14
	umulh	x14,x12,x10
	adcs	x21,x21,x15
	umulh	x15,x13,x10
	adcs	x22,x22,x16
	mul	x16,x12,x11		// lo(a[6..7]*a[5])	(vi)
	adc	x23,xzr,xzr		// t[12]
	adds	x21,x21,x17
	mul	x17,x13,x11
	adcs	x22,x22,x14
	umulh	x14,x12,x11		// hi(a[6..7]*a[5])
	adc	x23,x23,x15

	umulh	x15,x13,x11
	adds	x22,x22,x16
	mul	x16,x13,x12		// lo(a[7]*a[6])	(vii)
	adcs	x23,x23,x17
	umulh	x17,x13,x12		// hi(a[7]*a[6])
	adc	x24,xzr,xzr		// t[13]
	adds	x23,x23,x14
	sub	x27,x3,x1		// done yet?
	adc	x24,x24,x15

	adds	x24,x24,x16
	sub	x14,x3,x5		// rewound ap
	adc	x25,xzr,xzr		// t[14]
	add	x25,x25,x17

	cbz	x27,.Lsqr8x_outer_break

	mov	x4,x6
	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x0,x1
	adcs	x26,xzr,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved below
	mov	x27,#-8*8

	// a[8]a[0]
	// a[9]a[0]
	// a[a]a[0]
	// a[b]a[0]
	// a[c]a[0]
	// a[d]a[0]
	// a[e]a[0]
	// a[f]a[0]
	// a[8]a[1]
	// a[f]a[1]........................
	// a[8]a[2]
	// a[f]a[2]........................
	// a[8]a[3]
	// a[f]a[3]........................
	// a[8]a[4]
	// a[f]a[4]........................
	// a[8]a[5]
	// a[f]a[5]........................
	// a[8]a[6]
	// a[f]a[6]........................
	// a[8]a[7]
	// a[f]a[7]........................
.Lsqr8x_mul:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_mul
					// note that carry flag is guaranteed
					// to be zero at this point
	cmp	x1,x3			// done yet?
	b.eq	.Lsqr8x_break

	ldp	x6,x7,[x2,#8*0]
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	adds	x19,x19,x6
	ldur	x4,[x0,#-8*8]
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_mul

.align	4
.Lsqr8x_break:
	ldp	x6,x7,[x0,#8*0]
	add	x1,x0,#8*8
	ldp	x8,x9,[x0,#8*2]
	sub	x14,x3,x1		// is it the last iteration?
	ldp	x10,x11,[x0,#8*4]
	sub	x15,x2,x14
	ldp	x12,x13,[x0,#8*6]
	cbz	x14,.Lsqr8x_outer_loop

	stp	x19,x20,[x2,#8*0]
	ldp	x19,x20,[x15,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x15,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x15,#8*4]
	stp	x25,x26,[x2,#8*6]
	mov	x2,x15
	ldp	x25,x26,[x15,#8*6]
	b	.Lsqr8x_outer_loop

.align	4
.Lsqr8x_outer_break:
	// Now multiply the above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
	ldp	x7,x9,[x14,#8*0]	// recall that x14 is &a[0]
	ldp	x15,x16,[sp,#8*1]
	ldp	x11,x13,[x14,#8*2]
	add	x1,x14,#8*4
	ldp	x17,x14,[sp,#8*3]

	stp	x19,x20,[x2,#8*0]
	mul	x19,x7,x7
	stp	x21,x22,[x2,#8*2]
	umulh	x7,x7,x7
	stp	x23,x24,[x2,#8*4]
	mul	x8,x9,x9
	stp	x25,x26,[x2,#8*6]
	mov	x2,sp
	umulh	x9,x9,x9
	adds	x20,x7,x15,lsl#1
	extr	x15,x16,x15,#63
	sub	x27,x5,#8*4

.Lsqr4x_shift_n_add:
	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	sub	x27,x27,#8*4
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	ldp	x7,x9,[x1],#8*2
	umulh	x11,x11,x11
	mul	x12,x13,x13
	umulh	x13,x13,x13
	extr	x17,x14,x17,#63
	stp	x19,x20,[x2,#8*0]
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	stp	x21,x22,[x2,#8*2]
	adcs	x24,x11,x14
	ldp	x17,x14,[x2,#8*7]
	extr	x15,x16,x15,#63
	adcs	x25,x12,x15
	extr	x16,x17,x16,#63
	adcs	x26,x13,x16
	ldp	x15,x16,[x2,#8*9]
	mul	x6,x7,x7
	ldp	x11,x13,[x1],#8*2
	umulh	x7,x7,x7
	mul	x8,x9,x9
	umulh	x9,x9,x9
	stp	x23,x24,[x2,#8*4]
	extr	x17,x14,x17,#63
	stp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	adcs	x19,x6,x17
	extr	x14,x15,x14,#63
	adcs	x20,x7,x14
	ldp	x17,x14,[x2,#8*3]
	extr	x15,x16,x15,#63
	cbnz	x27,.Lsqr4x_shift_n_add
	ldp	x1,x4,[x29,#104]	// pull np and n0

	adcs	x21,x8,x15
	extr	x16,x17,x16,#63
	adcs	x22,x9,x16
	ldp	x15,x16,[x2,#8*5]
	mul	x10,x11,x11
	umulh	x11,x11,x11
	stp	x19,x20,[x2,#8*0]
	mul	x12,x13,x13
	umulh	x13,x13,x13
	stp	x21,x22,[x2,#8*2]
	extr	x17,x14,x17,#63
	adcs	x23,x10,x17
	extr	x14,x15,x14,#63
	ldp	x19,x20,[sp,#8*0]
	adcs	x24,x11,x14
	extr	x15,x16,x15,#63
	ldp	x6,x7,[x1,#8*0]
	adcs	x25,x12,x15
	extr	x16,xzr,x16,#63
	ldp	x8,x9,[x1,#8*2]
	adc	x26,x13,x16
	ldp	x10,x11,[x1,#8*4]

	// Reduce by 512 bits per iteration
	mul	x28,x4,x19		// t[0]*n0
	ldp	x12,x13,[x1,#8*6]
	add	x3,x1,x5
	ldp	x21,x22,[sp,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[sp,#8*4]
	stp	x25,x26,[x2,#8*6]
	ldp	x25,x26,[sp,#8*6]
	add	x1,x1,#8*8
	mov	x30,xzr			// initial top-most carry
	mov	x2,sp
	mov	x27,#8

.Lsqr8x_reduction:
	// (*)	mul	x14,x6,x28	// lo(n[0-7])*lo(t[0]*n0)
	mul	x15,x7,x28
	sub	x27,x27,#1
	mul	x16,x8,x28
	str	x28,[x2],#8		// put aside t[0]*n0 for tail processing
	mul	x17,x9,x28
	// (*)	adds	xzr,x19,x14
	subs	xzr,x19,#1		// (*)
	mul	x14,x10,x28
	adcs	x19,x20,x15
	mul	x15,x11,x28
	adcs	x20,x21,x16
	mul	x16,x12,x28
	adcs	x21,x22,x17
	mul	x17,x13,x28
	adcs	x22,x23,x14
	umulh	x14,x6,x28		// hi(n[0-7])*lo(t[0]*n0)
	adcs	x23,x24,x15
	umulh	x15,x7,x28
	adcs	x24,x25,x16
	umulh	x16,x8,x28
	adcs	x25,x26,x17
	umulh	x17,x9,x28
	adc	x26,xzr,xzr
	adds	x19,x19,x14
	umulh	x14,x10,x28
	adcs	x20,x20,x15
	umulh	x15,x11,x28
	adcs	x21,x21,x16
	umulh	x16,x12,x28
	adcs	x22,x22,x17
	umulh	x17,x13,x28
	mul	x28,x4,x19		// next t[0]*n0
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adc	x26,x26,x17
	cbnz	x27,.Lsqr8x_reduction

	ldp	x14,x15,[x2,#8*0]
	ldp	x16,x17,[x2,#8*2]
	mov	x0,x2
	sub	x27,x3,x1		// done yet?
	adds	x19,x19,x14
	adcs	x20,x20,x15
	ldp	x14,x15,[x2,#8*4]
	adcs	x21,x21,x16
	adcs	x22,x22,x17
	ldp	x16,x17,[x2,#8*6]
	adcs	x23,x23,x14
	adcs	x24,x24,x15
	adcs	x25,x25,x16
	adcs	x26,x26,x17
	//adc	x28,xzr,xzr		// moved below
	cbz	x27,.Lsqr8x8_post_condition

	ldur	x4,[x2,#-8*8]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	ldp	x10,x11,[x1,#8*4]
	mov	x27,#-8*8
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8

.Lsqr8x_tail:
	mul	x14,x6,x4
	adc	x28,xzr,xzr		// carry bit, modulo-scheduled
	mul	x15,x7,x4
	add	x27,x27,#8
	mul	x16,x8,x4
	mul	x17,x9,x4
	adds	x19,x19,x14
	mul	x14,x10,x4
	adcs	x20,x20,x15
	mul	x15,x11,x4
	adcs	x21,x21,x16
	mul	x16,x12,x4
	adcs	x22,x22,x17
	mul	x17,x13,x4
	adcs	x23,x23,x14
	umulh	x14,x6,x4
	adcs	x24,x24,x15
	umulh	x15,x7,x4
	adcs	x25,x25,x16
	umulh	x16,x8,x4
	adcs	x26,x26,x17
	umulh	x17,x9,x4
	adc	x28,x28,xzr
	str	x19,[x2],#8
	adds	x19,x20,x14
	umulh	x14,x10,x4
	adcs	x20,x21,x15
	umulh	x15,x11,x4
	adcs	x21,x22,x16
	umulh	x16,x12,x4
	adcs	x22,x23,x17
	umulh	x17,x13,x4
	ldr	x4,[x0,x27]
	adcs	x23,x24,x14
	adcs	x24,x25,x15
	adcs	x25,x26,x16
	adcs	x26,x28,x17
	//adc	x28,xzr,xzr		// moved above
	cbnz	x27,.Lsqr8x_tail
					// note that carry flag is guaranteed
					// to be zero at this point
	ldp	x6,x7,[x2,#8*0]
	sub	x27,x3,x1		// done yet?
	sub	x16,x3,x5		// rewound np
	ldp	x8,x9,[x2,#8*2]
	ldp	x10,x11,[x2,#8*4]
	ldp	x12,x13,[x2,#8*6]
	cbz	x27,.Lsqr8x_tail_break

	ldur	x4,[x0,#-8*8]
	adds	x19,x19,x6
	adcs	x20,x20,x7
	ldp	x6,x7,[x1,#8*0]
	adcs	x21,x21,x8
	adcs	x22,x22,x9
	ldp	x8,x9,[x1,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x1,#8*4]
	adcs	x25,x25,x12
	mov	x27,#-8*8
	adcs	x26,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	//adc	x28,xzr,xzr		// moved above
	b	.Lsqr8x_tail

.align	4
.Lsqr8x_tail_break:
	ldr	x4,[x29,#112]		// pull n0
	add	x27,x2,#8*8		// end of current t[num] window

	subs	xzr,x30,#1		// "move" top-most carry to carry bit
	adcs	x14,x19,x6
	adcs	x15,x20,x7
	ldp	x19,x20,[x0,#8*0]
	adcs	x21,x21,x8
	ldp	x6,x7,[x16,#8*0]	// recall that x16 is &n[0]
	adcs	x22,x22,x9
	ldp	x8,x9,[x16,#8*2]
	adcs	x23,x23,x10
	adcs	x24,x24,x11
	ldp	x10,x11,[x16,#8*4]
	adcs	x25,x25,x12
	adcs	x26,x26,x13
	ldp	x12,x13,[x16,#8*6]
	add	x1,x16,#8*8
	adc	x30,xzr,xzr		// top-most carry
	mul	x28,x4,x19
	stp	x14,x15,[x2,#8*0]
	stp	x21,x22,[x2,#8*2]
	ldp	x21,x22,[x0,#8*2]
	stp	x23,x24,[x2,#8*4]
	ldp	x23,x24,[x0,#8*4]
	cmp	x27,x29			// did we hit the bottom?
	stp	x25,x26,[x2,#8*6]
	mov	x2,x0			// slide the window
	ldp	x25,x26,[x0,#8*6]
	mov	x27,#8
	b.ne	.Lsqr8x_reduction

	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But a comparison
	// implies a subtraction, so we subtract the modulus, check
	// whether that borrowed, and conditionally copy the original
	// value back.
	ldr	x0,[x29,#96]		// pull rp
	add	x2,x2,#8*8
	subs	x14,x19,x6
	sbcs	x15,x20,x7
	sub	x27,x5,#8*8
	mov	x3,x0			// x0 copy

.Lsqr8x_sub:
	sbcs	x16,x21,x8
	ldp	x6,x7,[x1,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x1,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x10,x11,[x1,#8*4]
	sbcs	x17,x26,x13
	ldp	x12,x13,[x1,#8*6]
	add	x1,x1,#8*8
	ldp	x19,x20,[x2,#8*0]
	sub	x27,x27,#8*8
	ldp	x21,x22,[x2,#8*2]
	ldp	x23,x24,[x2,#8*4]
	ldp	x25,x26,[x2,#8*6]
	add	x2,x2,#8*8
	stp	x14,x15,[x0,#8*4]
	sbcs	x14,x19,x6
	stp	x16,x17,[x0,#8*6]
	add	x0,x0,#8*8
	sbcs	x15,x20,x7
	cbnz	x27,.Lsqr8x_sub

	sbcs	x16,x21,x8
	mov	x2,sp
	add	x1,sp,x5
	ldp	x6,x7,[x3,#8*0]
	sbcs	x17,x22,x9
	stp	x14,x15,[x0,#8*0]
	sbcs	x14,x23,x10
	ldp	x8,x9,[x3,#8*2]
	sbcs	x15,x24,x11
	stp	x16,x17,[x0,#8*2]
	sbcs	x16,x25,x12
	ldp	x19,x20,[x1,#8*0]
	sbcs	x17,x26,x13
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
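	// Editorial note: the sbcs above folds the top-most carry x30 into
	// the borrow chain, so from here the carry flag records whether
	// t - n borrowed (t < n); the conditional-copy loop below relies
	// on it staying intact across the intervening loads and stores.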
	ldr	x30,[x29,#8]		// pull return address
	stp	x14,x15,[x0,#8*4]
	stp	x16,x17,[x0,#8*6]

	sub	x27,x5,#8*4
.Lsqr4x_cond_copy:
	sub	x27,x27,#8*4
	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	ldp	x6,x7,[x3,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x16,x21,x8,lo
	stp	xzr,xzr,[x2,#8*2]
	add	x2,x2,#8*4
	csel	x17,x22,x9,lo
	ldp	x8,x9,[x3,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	stp	xzr,xzr,[x1,#8*0]
	stp	xzr,xzr,[x1,#8*2]
	cbnz	x27,.Lsqr4x_cond_copy

	csel	x14,x19,x6,lo
	stp	xzr,xzr,[x2,#8*0]
	csel	x15,x20,x7,lo
	stp	xzr,xzr,[x2,#8*2]
	csel	x16,x21,x8,lo
	csel	x17,x22,x9,lo
	stp	x14,x15,[x3,#8*0]
	stp	x16,x17,[x3,#8*2]

	b	.Lsqr8x_done

.align	4
.Lsqr8x8_post_condition:
	adc	x28,xzr,xzr
	ldr	x30,[x29,#8]		// pull return address
	// x19-x26,x28 hold the result, x6-x13 hold the modulus
	subs	x6,x19,x6
	ldr	x1,[x29,#96]		// pull rp
	sbcs	x7,x20,x7
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x8
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x9
	stp	xzr,xzr,[sp,#8*4]
	sbcs	x10,x23,x10
	stp	xzr,xzr,[sp,#8*6]
	sbcs	x11,x24,x11
	stp	xzr,xzr,[sp,#8*8]
	sbcs	x12,x25,x12
	stp	xzr,xzr,[sp,#8*10]
	sbcs	x13,x26,x13
	stp	xzr,xzr,[sp,#8*12]
	sbcs	x28,x28,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*14]

	// x6-x13 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or from __bn_sqr8x_mont, itself reached from
	// bn_mul_mont), which has already signed the return address.
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3
	lsl	x5,x5,#3
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca

	add	x10,x2,x5
	add	x27,x1,x5
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0
	mov	x26,sp

.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewound x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewound np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We check whether the result is larger than the
	// modulus and, if it is, subtract the modulus. But a comparison
	// implies a subtraction, so we subtract the modulus, check
	// whether that borrowed, and conditionally copy the original
	// value back.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
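	// Editorial note: as in the sqr8x path above, the borrow from this
	// final t - n subtraction steers the csel/copy loop that follows,
	// which also wipes the temporary stack frame as it goes.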
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold the result, x14-x17 hold the modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4