1/* Do not modify. This file is auto-generated from armv8-mont.pl. */ 2#ifndef __KERNEL__ 3# include "arm_arch.h" 4 5.hidden OPENSSL_armv8_rsa_neonized 6#endif 7.text 8 9.globl bn_mul_mont 10.type bn_mul_mont,%function 11.align 5 12bn_mul_mont: 13.Lbn_mul_mont: 14 tst x5,#3 15 b.ne .Lmul_mont 16 cmp x5,#32 17 b.le .Lscalar_impl 18#ifndef __KERNEL__ 19 adrp x17,OPENSSL_armv8_rsa_neonized 20 ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized] 21 cbnz w17, bn_mul8x_mont_neon 22#endif 23 24.Lscalar_impl: 25 tst x5,#7 26 b.eq __bn_sqr8x_mont 27 tst x5,#3 28 b.eq __bn_mul4x_mont 29 30.Lmul_mont: 31 stp x29,x30,[sp,#-64]! 32 add x29,sp,#0 33 stp x19,x20,[sp,#16] 34 stp x21,x22,[sp,#32] 35 stp x23,x24,[sp,#48] 36 37 ldr x9,[x2],#8 // bp[0] 38 sub x22,sp,x5,lsl#3 39 ldp x7,x8,[x1],#16 // ap[0..1] 40 lsl x5,x5,#3 41 ldr x4,[x4] // *n0 42 and x22,x22,#-16 // ABI says so 43 ldp x13,x14,[x3],#16 // np[0..1] 44 45 mul x6,x7,x9 // ap[0]*bp[0] 46 sub x21,x5,#16 // j=num-2 47 umulh x7,x7,x9 48 mul x10,x8,x9 // ap[1]*bp[0] 49 umulh x11,x8,x9 50 51 mul x15,x6,x4 // "tp[0]"*n0 52 mov sp,x22 // alloca 53 54 // (*) mul x12,x13,x15 // np[0]*m1 55 umulh x13,x13,x15 56 mul x16,x14,x15 // np[1]*m1 57 // (*) adds x12,x12,x6 // discarded 58 // (*) As for removal of first multiplication and addition 59 // instructions. The outcome of first addition is 60 // guaranteed to be zero, which leaves two computationally 61 // significant outcomes: it either carries or not. Then 62 // question is when does it carry? Is there alternative 63 // way to deduce it? If you follow operations, you can 64 // observe that condition for carry is quite simple: 65 // x6 being non-zero. So that carry can be calculated 66 // by adding -1 to x6. That's what next instruction does. 
67 subs xzr,x6,#1 // (*) 68 umulh x17,x14,x15 69 adc x13,x13,xzr 70 cbz x21,.L1st_skip 71 72.L1st: 73 ldr x8,[x1],#8 74 adds x6,x10,x7 75 sub x21,x21,#8 // j-- 76 adc x7,x11,xzr 77 78 ldr x14,[x3],#8 79 adds x12,x16,x13 80 mul x10,x8,x9 // ap[j]*bp[0] 81 adc x13,x17,xzr 82 umulh x11,x8,x9 83 84 adds x12,x12,x6 85 mul x16,x14,x15 // np[j]*m1 86 adc x13,x13,xzr 87 umulh x17,x14,x15 88 str x12,[x22],#8 // tp[j-1] 89 cbnz x21,.L1st 90 91.L1st_skip: 92 adds x6,x10,x7 93 sub x1,x1,x5 // rewind x1 94 adc x7,x11,xzr 95 96 adds x12,x16,x13 97 sub x3,x3,x5 // rewind x3 98 adc x13,x17,xzr 99 100 adds x12,x12,x6 101 sub x20,x5,#8 // i=num-1 102 adcs x13,x13,x7 103 104 adc x19,xzr,xzr // upmost overflow bit 105 stp x12,x13,[x22] 106 107.Louter: 108 ldr x9,[x2],#8 // bp[i] 109 ldp x7,x8,[x1],#16 110 ldr x23,[sp] // tp[0] 111 add x22,sp,#8 112 113 mul x6,x7,x9 // ap[0]*bp[i] 114 sub x21,x5,#16 // j=num-2 115 umulh x7,x7,x9 116 ldp x13,x14,[x3],#16 117 mul x10,x8,x9 // ap[1]*bp[i] 118 adds x6,x6,x23 119 umulh x11,x8,x9 120 adc x7,x7,xzr 121 122 mul x15,x6,x4 123 sub x20,x20,#8 // i-- 124 125 // (*) mul x12,x13,x15 // np[0]*m1 126 umulh x13,x13,x15 127 mul x16,x14,x15 // np[1]*m1 128 // (*) adds x12,x12,x6 129 subs xzr,x6,#1 // (*) 130 umulh x17,x14,x15 131 cbz x21,.Linner_skip 132 133.Linner: 134 ldr x8,[x1],#8 135 adc x13,x13,xzr 136 ldr x23,[x22],#8 // tp[j] 137 adds x6,x10,x7 138 sub x21,x21,#8 // j-- 139 adc x7,x11,xzr 140 141 adds x12,x16,x13 142 ldr x14,[x3],#8 143 adc x13,x17,xzr 144 145 mul x10,x8,x9 // ap[j]*bp[i] 146 adds x6,x6,x23 147 umulh x11,x8,x9 148 adc x7,x7,xzr 149 150 mul x16,x14,x15 // np[j]*m1 151 adds x12,x12,x6 152 umulh x17,x14,x15 153 stur x12,[x22,#-16] // tp[j-1] 154 cbnz x21,.Linner 155 156.Linner_skip: 157 ldr x23,[x22],#8 // tp[j] 158 adc x13,x13,xzr 159 adds x6,x10,x7 160 sub x1,x1,x5 // rewind x1 161 adc x7,x11,xzr 162 163 adds x12,x16,x13 164 sub x3,x3,x5 // rewind x3 165 adcs x13,x17,x19 166 adc x19,xzr,xzr 167 168 adds x6,x6,x23 169 adc x7,x7,xzr 
170 171 adds x12,x12,x6 172 adcs x13,x13,x7 173 adc x19,x19,xzr // upmost overflow bit 174 stp x12,x13,[x22,#-16] 175 176 cbnz x20,.Louter 177 178 // Final step. We see if result is larger than modulus, and 179 // if it is, subtract the modulus. But comparison implies 180 // subtraction. So we subtract modulus, see if it borrowed, 181 // and conditionally copy original value. 182 ldr x23,[sp] // tp[0] 183 add x22,sp,#8 184 ldr x14,[x3],#8 // np[0] 185 subs x21,x5,#8 // j=num-1 and clear borrow 186 mov x1,x0 187.Lsub: 188 sbcs x8,x23,x14 // tp[j]-np[j] 189 ldr x23,[x22],#8 190 sub x21,x21,#8 // j-- 191 ldr x14,[x3],#8 192 str x8,[x1],#8 // rp[j]=tp[j]-np[j] 193 cbnz x21,.Lsub 194 195 sbcs x8,x23,x14 196 sbcs x19,x19,xzr // did it borrow? 197 str x8,[x1],#8 // rp[num-1] 198 199 ldr x23,[sp] // tp[0] 200 add x22,sp,#8 201 ldr x8,[x0],#8 // rp[0] 202 sub x5,x5,#8 // num-- 203 nop 204.Lcond_copy: 205 sub x5,x5,#8 // num-- 206 csel x14,x23,x8,lo // did it borrow? 207 ldr x23,[x22],#8 208 ldr x8,[x0],#8 209 stur xzr,[x22,#-16] // wipe tp 210 stur x14,[x0,#-16] 211 cbnz x5,.Lcond_copy 212 213 csel x14,x23,x8,lo 214 stur xzr,[x22,#-8] // wipe tp 215 stur x14,[x0,#-8] 216 217 ldp x19,x20,[x29,#16] 218 mov sp,x29 219 ldp x21,x22,[x29,#32] 220 mov x0,#1 221 ldp x23,x24,[x29,#48] 222 ldr x29,[sp],#64 223 ret 224.size bn_mul_mont,.-bn_mul_mont 225.type bn_mul8x_mont_neon,%function 226.align 5 227bn_mul8x_mont_neon: 228 stp x29,x30,[sp,#-80]! 
229 mov x16,sp 230 stp d8,d9,[sp,#16] 231 stp d10,d11,[sp,#32] 232 stp d12,d13,[sp,#48] 233 stp d14,d15,[sp,#64] 234 lsl x5,x5,#1 235 eor v14.16b,v14.16b,v14.16b 236 237.align 4 238.LNEON_8n: 239 eor v6.16b,v6.16b,v6.16b 240 sub x7,sp,#128 241 eor v7.16b,v7.16b,v7.16b 242 sub x7,x7,x5,lsl#4 243 eor v8.16b,v8.16b,v8.16b 244 and x7,x7,#-64 245 eor v9.16b,v9.16b,v9.16b 246 mov sp,x7 // alloca 247 eor v10.16b,v10.16b,v10.16b 248 add x7,x7,#256 249 eor v11.16b,v11.16b,v11.16b 250 sub x8,x5,#8 251 eor v12.16b,v12.16b,v12.16b 252 eor v13.16b,v13.16b,v13.16b 253 254.LNEON_8n_init: 255 st1 {v6.2d,v7.2d},[x7],#32 256 subs x8,x8,#8 257 st1 {v8.2d,v9.2d},[x7],#32 258 st1 {v10.2d,v11.2d},[x7],#32 259 st1 {v12.2d,v13.2d},[x7],#32 260 bne .LNEON_8n_init 261 262 add x6,sp,#256 263 ld1 {v0.4s,v1.4s},[x1],#32 264 add x10,sp,#8 265 ldr s30,[x4],#4 266 mov x9,x5 267 b .LNEON_8n_outer 268 269.align 4 270.LNEON_8n_outer: 271 ldr s28,[x2],#4 // *b++ 272 uxtl v28.4s,v28.4h 273 add x7,sp,#128 274 ld1 {v2.4s,v3.4s},[x3],#32 275 276 umlal v6.2d,v28.2s,v0.s[0] 277 umlal v7.2d,v28.2s,v0.s[1] 278 umlal v8.2d,v28.2s,v0.s[2] 279 shl v29.2d,v6.2d,#16 280 ext v29.16b,v29.16b,v29.16b,#8 281 umlal v9.2d,v28.2s,v0.s[3] 282 add v29.2d,v29.2d,v6.2d 283 umlal v10.2d,v28.2s,v1.s[0] 284 mul v29.2s,v29.2s,v30.2s 285 umlal v11.2d,v28.2s,v1.s[1] 286 st1 {v28.2s},[sp] // put aside smashed b[8*i+0] 287 umlal v12.2d,v28.2s,v1.s[2] 288 uxtl v29.4s,v29.4h 289 umlal v13.2d,v28.2s,v1.s[3] 290 ldr s28,[x2],#4 // *b++ 291 umlal v6.2d,v29.2s,v2.s[0] 292 umlal v7.2d,v29.2s,v2.s[1] 293 uxtl v28.4s,v28.4h 294 umlal v8.2d,v29.2s,v2.s[2] 295 ushr v15.2d,v6.2d,#16 296 umlal v9.2d,v29.2s,v2.s[3] 297 umlal v10.2d,v29.2s,v3.s[0] 298 ext v6.16b,v6.16b,v6.16b,#8 299 add v6.2d,v6.2d,v15.2d 300 umlal v11.2d,v29.2s,v3.s[1] 301 ushr v6.2d,v6.2d,#16 302 umlal v12.2d,v29.2s,v3.s[2] 303 umlal v13.2d,v29.2s,v3.s[3] 304 add v16.2d,v7.2d,v6.2d 305 ins v7.d[0],v16.d[0] 306 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0] 307 umlal 
v7.2d,v28.2s,v0.s[0] 308 ld1 {v6.2d},[x6],#16 309 umlal v8.2d,v28.2s,v0.s[1] 310 umlal v9.2d,v28.2s,v0.s[2] 311 shl v29.2d,v7.2d,#16 312 ext v29.16b,v29.16b,v29.16b,#8 313 umlal v10.2d,v28.2s,v0.s[3] 314 add v29.2d,v29.2d,v7.2d 315 umlal v11.2d,v28.2s,v1.s[0] 316 mul v29.2s,v29.2s,v30.2s 317 umlal v12.2d,v28.2s,v1.s[1] 318 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] 319 umlal v13.2d,v28.2s,v1.s[2] 320 uxtl v29.4s,v29.4h 321 umlal v6.2d,v28.2s,v1.s[3] 322 ldr s28,[x2],#4 // *b++ 323 umlal v7.2d,v29.2s,v2.s[0] 324 umlal v8.2d,v29.2s,v2.s[1] 325 uxtl v28.4s,v28.4h 326 umlal v9.2d,v29.2s,v2.s[2] 327 ushr v15.2d,v7.2d,#16 328 umlal v10.2d,v29.2s,v2.s[3] 329 umlal v11.2d,v29.2s,v3.s[0] 330 ext v7.16b,v7.16b,v7.16b,#8 331 add v7.2d,v7.2d,v15.2d 332 umlal v12.2d,v29.2s,v3.s[1] 333 ushr v7.2d,v7.2d,#16 334 umlal v13.2d,v29.2s,v3.s[2] 335 umlal v6.2d,v29.2s,v3.s[3] 336 add v16.2d,v8.2d,v7.2d 337 ins v8.d[0],v16.d[0] 338 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] 339 umlal v8.2d,v28.2s,v0.s[0] 340 ld1 {v7.2d},[x6],#16 341 umlal v9.2d,v28.2s,v0.s[1] 342 umlal v10.2d,v28.2s,v0.s[2] 343 shl v29.2d,v8.2d,#16 344 ext v29.16b,v29.16b,v29.16b,#8 345 umlal v11.2d,v28.2s,v0.s[3] 346 add v29.2d,v29.2d,v8.2d 347 umlal v12.2d,v28.2s,v1.s[0] 348 mul v29.2s,v29.2s,v30.2s 349 umlal v13.2d,v28.2s,v1.s[1] 350 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] 351 umlal v6.2d,v28.2s,v1.s[2] 352 uxtl v29.4s,v29.4h 353 umlal v7.2d,v28.2s,v1.s[3] 354 ldr s28,[x2],#4 // *b++ 355 umlal v8.2d,v29.2s,v2.s[0] 356 umlal v9.2d,v29.2s,v2.s[1] 357 uxtl v28.4s,v28.4h 358 umlal v10.2d,v29.2s,v2.s[2] 359 ushr v15.2d,v8.2d,#16 360 umlal v11.2d,v29.2s,v2.s[3] 361 umlal v12.2d,v29.2s,v3.s[0] 362 ext v8.16b,v8.16b,v8.16b,#8 363 add v8.2d,v8.2d,v15.2d 364 umlal v13.2d,v29.2s,v3.s[1] 365 ushr v8.2d,v8.2d,#16 366 umlal v6.2d,v29.2s,v3.s[2] 367 umlal v7.2d,v29.2s,v3.s[3] 368 add v16.2d,v9.2d,v8.2d 369 ins v9.d[0],v16.d[0] 370 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] 371 umlal 
v9.2d,v28.2s,v0.s[0] 372 ld1 {v8.2d},[x6],#16 373 umlal v10.2d,v28.2s,v0.s[1] 374 umlal v11.2d,v28.2s,v0.s[2] 375 shl v29.2d,v9.2d,#16 376 ext v29.16b,v29.16b,v29.16b,#8 377 umlal v12.2d,v28.2s,v0.s[3] 378 add v29.2d,v29.2d,v9.2d 379 umlal v13.2d,v28.2s,v1.s[0] 380 mul v29.2s,v29.2s,v30.2s 381 umlal v6.2d,v28.2s,v1.s[1] 382 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] 383 umlal v7.2d,v28.2s,v1.s[2] 384 uxtl v29.4s,v29.4h 385 umlal v8.2d,v28.2s,v1.s[3] 386 ldr s28,[x2],#4 // *b++ 387 umlal v9.2d,v29.2s,v2.s[0] 388 umlal v10.2d,v29.2s,v2.s[1] 389 uxtl v28.4s,v28.4h 390 umlal v11.2d,v29.2s,v2.s[2] 391 ushr v15.2d,v9.2d,#16 392 umlal v12.2d,v29.2s,v2.s[3] 393 umlal v13.2d,v29.2s,v3.s[0] 394 ext v9.16b,v9.16b,v9.16b,#8 395 add v9.2d,v9.2d,v15.2d 396 umlal v6.2d,v29.2s,v3.s[1] 397 ushr v9.2d,v9.2d,#16 398 umlal v7.2d,v29.2s,v3.s[2] 399 umlal v8.2d,v29.2s,v3.s[3] 400 add v16.2d,v10.2d,v9.2d 401 ins v10.d[0],v16.d[0] 402 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] 403 umlal v10.2d,v28.2s,v0.s[0] 404 ld1 {v9.2d},[x6],#16 405 umlal v11.2d,v28.2s,v0.s[1] 406 umlal v12.2d,v28.2s,v0.s[2] 407 shl v29.2d,v10.2d,#16 408 ext v29.16b,v29.16b,v29.16b,#8 409 umlal v13.2d,v28.2s,v0.s[3] 410 add v29.2d,v29.2d,v10.2d 411 umlal v6.2d,v28.2s,v1.s[0] 412 mul v29.2s,v29.2s,v30.2s 413 umlal v7.2d,v28.2s,v1.s[1] 414 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] 415 umlal v8.2d,v28.2s,v1.s[2] 416 uxtl v29.4s,v29.4h 417 umlal v9.2d,v28.2s,v1.s[3] 418 ldr s28,[x2],#4 // *b++ 419 umlal v10.2d,v29.2s,v2.s[0] 420 umlal v11.2d,v29.2s,v2.s[1] 421 uxtl v28.4s,v28.4h 422 umlal v12.2d,v29.2s,v2.s[2] 423 ushr v15.2d,v10.2d,#16 424 umlal v13.2d,v29.2s,v2.s[3] 425 umlal v6.2d,v29.2s,v3.s[0] 426 ext v10.16b,v10.16b,v10.16b,#8 427 add v10.2d,v10.2d,v15.2d 428 umlal v7.2d,v29.2s,v3.s[1] 429 ushr v10.2d,v10.2d,#16 430 umlal v8.2d,v29.2s,v3.s[2] 431 umlal v9.2d,v29.2s,v3.s[3] 432 add v16.2d,v11.2d,v10.2d 433 ins v11.d[0],v16.d[0] 434 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4] 435 
umlal v11.2d,v28.2s,v0.s[0] 436 ld1 {v10.2d},[x6],#16 437 umlal v12.2d,v28.2s,v0.s[1] 438 umlal v13.2d,v28.2s,v0.s[2] 439 shl v29.2d,v11.2d,#16 440 ext v29.16b,v29.16b,v29.16b,#8 441 umlal v6.2d,v28.2s,v0.s[3] 442 add v29.2d,v29.2d,v11.2d 443 umlal v7.2d,v28.2s,v1.s[0] 444 mul v29.2s,v29.2s,v30.2s 445 umlal v8.2d,v28.2s,v1.s[1] 446 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] 447 umlal v9.2d,v28.2s,v1.s[2] 448 uxtl v29.4s,v29.4h 449 umlal v10.2d,v28.2s,v1.s[3] 450 ldr s28,[x2],#4 // *b++ 451 umlal v11.2d,v29.2s,v2.s[0] 452 umlal v12.2d,v29.2s,v2.s[1] 453 uxtl v28.4s,v28.4h 454 umlal v13.2d,v29.2s,v2.s[2] 455 ushr v15.2d,v11.2d,#16 456 umlal v6.2d,v29.2s,v2.s[3] 457 umlal v7.2d,v29.2s,v3.s[0] 458 ext v11.16b,v11.16b,v11.16b,#8 459 add v11.2d,v11.2d,v15.2d 460 umlal v8.2d,v29.2s,v3.s[1] 461 ushr v11.2d,v11.2d,#16 462 umlal v9.2d,v29.2s,v3.s[2] 463 umlal v10.2d,v29.2s,v3.s[3] 464 add v16.2d,v12.2d,v11.2d 465 ins v12.d[0],v16.d[0] 466 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+5] 467 umlal v12.2d,v28.2s,v0.s[0] 468 ld1 {v11.2d},[x6],#16 469 umlal v13.2d,v28.2s,v0.s[1] 470 umlal v6.2d,v28.2s,v0.s[2] 471 shl v29.2d,v12.2d,#16 472 ext v29.16b,v29.16b,v29.16b,#8 473 umlal v7.2d,v28.2s,v0.s[3] 474 add v29.2d,v29.2d,v12.2d 475 umlal v8.2d,v28.2s,v1.s[0] 476 mul v29.2s,v29.2s,v30.2s 477 umlal v9.2d,v28.2s,v1.s[1] 478 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] 479 umlal v10.2d,v28.2s,v1.s[2] 480 uxtl v29.4s,v29.4h 481 umlal v11.2d,v28.2s,v1.s[3] 482 ldr s28,[x2],#4 // *b++ 483 umlal v12.2d,v29.2s,v2.s[0] 484 umlal v13.2d,v29.2s,v2.s[1] 485 uxtl v28.4s,v28.4h 486 umlal v6.2d,v29.2s,v2.s[2] 487 ushr v15.2d,v12.2d,#16 488 umlal v7.2d,v29.2s,v2.s[3] 489 umlal v8.2d,v29.2s,v3.s[0] 490 ext v12.16b,v12.16b,v12.16b,#8 491 add v12.2d,v12.2d,v15.2d 492 umlal v9.2d,v29.2s,v3.s[1] 493 ushr v12.2d,v12.2d,#16 494 umlal v10.2d,v29.2s,v3.s[2] 495 umlal v11.2d,v29.2s,v3.s[3] 496 add v16.2d,v13.2d,v12.2d 497 ins v13.d[0],v16.d[0] 498 st1 {v29.2s},[x10],#8 // put aside 
smashed m[8*i+6] 499 umlal v13.2d,v28.2s,v0.s[0] 500 ld1 {v12.2d},[x6],#16 501 umlal v6.2d,v28.2s,v0.s[1] 502 umlal v7.2d,v28.2s,v0.s[2] 503 shl v29.2d,v13.2d,#16 504 ext v29.16b,v29.16b,v29.16b,#8 505 umlal v8.2d,v28.2s,v0.s[3] 506 add v29.2d,v29.2d,v13.2d 507 umlal v9.2d,v28.2s,v1.s[0] 508 mul v29.2s,v29.2s,v30.2s 509 umlal v10.2d,v28.2s,v1.s[1] 510 st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] 511 umlal v11.2d,v28.2s,v1.s[2] 512 uxtl v29.4s,v29.4h 513 umlal v12.2d,v28.2s,v1.s[3] 514 ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 515 umlal v13.2d,v29.2s,v2.s[0] 516 ld1 {v0.4s,v1.4s},[x1],#32 517 umlal v6.2d,v29.2s,v2.s[1] 518 umlal v7.2d,v29.2s,v2.s[2] 519 mov v5.16b,v13.16b 520 ushr v5.2d,v5.2d,#16 521 ext v13.16b,v13.16b,v13.16b,#8 522 umlal v8.2d,v29.2s,v2.s[3] 523 umlal v9.2d,v29.2s,v3.s[0] 524 add v13.2d,v13.2d,v5.2d 525 umlal v10.2d,v29.2s,v3.s[1] 526 ushr v13.2d,v13.2d,#16 527 eor v15.16b,v15.16b,v15.16b 528 ins v13.d[1],v15.d[0] 529 umlal v11.2d,v29.2s,v3.s[2] 530 umlal v12.2d,v29.2s,v3.s[3] 531 add v6.2d,v6.2d,v13.2d 532 st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] 533 add x10,sp,#8 // rewind 534 sub x8,x5,#8 535 b .LNEON_8n_inner 536 537.align 4 538.LNEON_8n_inner: 539 subs x8,x8,#8 540 umlal v6.2d,v28.2s,v0.s[0] 541 ld1 {v13.2d},[x6] 542 umlal v7.2d,v28.2s,v0.s[1] 543 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+0] 544 umlal v8.2d,v28.2s,v0.s[2] 545 ld1 {v2.4s,v3.4s},[x3],#32 546 umlal v9.2d,v28.2s,v0.s[3] 547 b.eq .LInner_jump 548 add x6,x6,#16 // don't advance in last iteration 549.LInner_jump: 550 umlal v10.2d,v28.2s,v1.s[0] 551 umlal v11.2d,v28.2s,v1.s[1] 552 umlal v12.2d,v28.2s,v1.s[2] 553 umlal v13.2d,v28.2s,v1.s[3] 554 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] 555 umlal v6.2d,v29.2s,v2.s[0] 556 umlal v7.2d,v29.2s,v2.s[1] 557 umlal v8.2d,v29.2s,v2.s[2] 558 umlal v9.2d,v29.2s,v2.s[3] 559 umlal v10.2d,v29.2s,v3.s[0] 560 umlal v11.2d,v29.2s,v3.s[1] 561 umlal v12.2d,v29.2s,v3.s[2] 562 umlal v13.2d,v29.2s,v3.s[3] 563 st1 {v6.2d},[x7],#16 
564 umlal v7.2d,v28.2s,v0.s[0] 565 ld1 {v6.2d},[x6] 566 umlal v8.2d,v28.2s,v0.s[1] 567 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] 568 umlal v9.2d,v28.2s,v0.s[2] 569 b.eq .LInner_jump1 570 add x6,x6,#16 // don't advance in last iteration 571.LInner_jump1: 572 umlal v10.2d,v28.2s,v0.s[3] 573 umlal v11.2d,v28.2s,v1.s[0] 574 umlal v12.2d,v28.2s,v1.s[1] 575 umlal v13.2d,v28.2s,v1.s[2] 576 umlal v6.2d,v28.2s,v1.s[3] 577 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] 578 umlal v7.2d,v29.2s,v2.s[0] 579 umlal v8.2d,v29.2s,v2.s[1] 580 umlal v9.2d,v29.2s,v2.s[2] 581 umlal v10.2d,v29.2s,v2.s[3] 582 umlal v11.2d,v29.2s,v3.s[0] 583 umlal v12.2d,v29.2s,v3.s[1] 584 umlal v13.2d,v29.2s,v3.s[2] 585 umlal v6.2d,v29.2s,v3.s[3] 586 st1 {v7.2d},[x7],#16 587 umlal v8.2d,v28.2s,v0.s[0] 588 ld1 {v7.2d},[x6] 589 umlal v9.2d,v28.2s,v0.s[1] 590 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] 591 umlal v10.2d,v28.2s,v0.s[2] 592 b.eq .LInner_jump2 593 add x6,x6,#16 // don't advance in last iteration 594.LInner_jump2: 595 umlal v11.2d,v28.2s,v0.s[3] 596 umlal v12.2d,v28.2s,v1.s[0] 597 umlal v13.2d,v28.2s,v1.s[1] 598 umlal v6.2d,v28.2s,v1.s[2] 599 umlal v7.2d,v28.2s,v1.s[3] 600 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] 601 umlal v8.2d,v29.2s,v2.s[0] 602 umlal v9.2d,v29.2s,v2.s[1] 603 umlal v10.2d,v29.2s,v2.s[2] 604 umlal v11.2d,v29.2s,v2.s[3] 605 umlal v12.2d,v29.2s,v3.s[0] 606 umlal v13.2d,v29.2s,v3.s[1] 607 umlal v6.2d,v29.2s,v3.s[2] 608 umlal v7.2d,v29.2s,v3.s[3] 609 st1 {v8.2d},[x7],#16 610 umlal v9.2d,v28.2s,v0.s[0] 611 ld1 {v8.2d},[x6] 612 umlal v10.2d,v28.2s,v0.s[1] 613 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] 614 umlal v11.2d,v28.2s,v0.s[2] 615 b.eq .LInner_jump3 616 add x6,x6,#16 // don't advance in last iteration 617.LInner_jump3: 618 umlal v12.2d,v28.2s,v0.s[3] 619 umlal v13.2d,v28.2s,v1.s[0] 620 umlal v6.2d,v28.2s,v1.s[1] 621 umlal v7.2d,v28.2s,v1.s[2] 622 umlal v8.2d,v28.2s,v1.s[3] 623 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] 624 umlal v9.2d,v29.2s,v2.s[0] 625 
umlal v10.2d,v29.2s,v2.s[1] 626 umlal v11.2d,v29.2s,v2.s[2] 627 umlal v12.2d,v29.2s,v2.s[3] 628 umlal v13.2d,v29.2s,v3.s[0] 629 umlal v6.2d,v29.2s,v3.s[1] 630 umlal v7.2d,v29.2s,v3.s[2] 631 umlal v8.2d,v29.2s,v3.s[3] 632 st1 {v9.2d},[x7],#16 633 umlal v10.2d,v28.2s,v0.s[0] 634 ld1 {v9.2d},[x6] 635 umlal v11.2d,v28.2s,v0.s[1] 636 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] 637 umlal v12.2d,v28.2s,v0.s[2] 638 b.eq .LInner_jump4 639 add x6,x6,#16 // don't advance in last iteration 640.LInner_jump4: 641 umlal v13.2d,v28.2s,v0.s[3] 642 umlal v6.2d,v28.2s,v1.s[0] 643 umlal v7.2d,v28.2s,v1.s[1] 644 umlal v8.2d,v28.2s,v1.s[2] 645 umlal v9.2d,v28.2s,v1.s[3] 646 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] 647 umlal v10.2d,v29.2s,v2.s[0] 648 umlal v11.2d,v29.2s,v2.s[1] 649 umlal v12.2d,v29.2s,v2.s[2] 650 umlal v13.2d,v29.2s,v2.s[3] 651 umlal v6.2d,v29.2s,v3.s[0] 652 umlal v7.2d,v29.2s,v3.s[1] 653 umlal v8.2d,v29.2s,v3.s[2] 654 umlal v9.2d,v29.2s,v3.s[3] 655 st1 {v10.2d},[x7],#16 656 umlal v11.2d,v28.2s,v0.s[0] 657 ld1 {v10.2d},[x6] 658 umlal v12.2d,v28.2s,v0.s[1] 659 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] 660 umlal v13.2d,v28.2s,v0.s[2] 661 b.eq .LInner_jump5 662 add x6,x6,#16 // don't advance in last iteration 663.LInner_jump5: 664 umlal v6.2d,v28.2s,v0.s[3] 665 umlal v7.2d,v28.2s,v1.s[0] 666 umlal v8.2d,v28.2s,v1.s[1] 667 umlal v9.2d,v28.2s,v1.s[2] 668 umlal v10.2d,v28.2s,v1.s[3] 669 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] 670 umlal v11.2d,v29.2s,v2.s[0] 671 umlal v12.2d,v29.2s,v2.s[1] 672 umlal v13.2d,v29.2s,v2.s[2] 673 umlal v6.2d,v29.2s,v2.s[3] 674 umlal v7.2d,v29.2s,v3.s[0] 675 umlal v8.2d,v29.2s,v3.s[1] 676 umlal v9.2d,v29.2s,v3.s[2] 677 umlal v10.2d,v29.2s,v3.s[3] 678 st1 {v11.2d},[x7],#16 679 umlal v12.2d,v28.2s,v0.s[0] 680 ld1 {v11.2d},[x6] 681 umlal v13.2d,v28.2s,v0.s[1] 682 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] 683 umlal v6.2d,v28.2s,v0.s[2] 684 b.eq .LInner_jump6 685 add x6,x6,#16 // don't advance in last iteration 686.LInner_jump6: 
687 umlal v7.2d,v28.2s,v0.s[3] 688 umlal v8.2d,v28.2s,v1.s[0] 689 umlal v9.2d,v28.2s,v1.s[1] 690 umlal v10.2d,v28.2s,v1.s[2] 691 umlal v11.2d,v28.2s,v1.s[3] 692 ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7] 693 umlal v12.2d,v29.2s,v2.s[0] 694 umlal v13.2d,v29.2s,v2.s[1] 695 umlal v6.2d,v29.2s,v2.s[2] 696 umlal v7.2d,v29.2s,v2.s[3] 697 umlal v8.2d,v29.2s,v3.s[0] 698 umlal v9.2d,v29.2s,v3.s[1] 699 umlal v10.2d,v29.2s,v3.s[2] 700 umlal v11.2d,v29.2s,v3.s[3] 701 st1 {v12.2d},[x7],#16 702 umlal v13.2d,v28.2s,v0.s[0] 703 ld1 {v12.2d},[x6] 704 umlal v6.2d,v28.2s,v0.s[1] 705 ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] 706 umlal v7.2d,v28.2s,v0.s[2] 707 b.eq .LInner_jump7 708 add x6,x6,#16 // don't advance in last iteration 709.LInner_jump7: 710 umlal v8.2d,v28.2s,v0.s[3] 711 umlal v9.2d,v28.2s,v1.s[0] 712 umlal v10.2d,v28.2s,v1.s[1] 713 umlal v11.2d,v28.2s,v1.s[2] 714 umlal v12.2d,v28.2s,v1.s[3] 715 b.ne .LInner_after_rewind8 716 sub x1,x1,x5,lsl#2 // rewind 717.LInner_after_rewind8: 718 umlal v13.2d,v29.2s,v2.s[0] 719 ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 720 umlal v6.2d,v29.2s,v2.s[1] 721 ld1 {v0.4s,v1.4s},[x1],#32 722 umlal v7.2d,v29.2s,v2.s[2] 723 add x10,sp,#8 // rewind 724 umlal v8.2d,v29.2s,v2.s[3] 725 umlal v9.2d,v29.2s,v3.s[0] 726 umlal v10.2d,v29.2s,v3.s[1] 727 umlal v11.2d,v29.2s,v3.s[2] 728 st1 {v13.2d},[x7],#16 729 umlal v12.2d,v29.2s,v3.s[3] 730 731 bne .LNEON_8n_inner 732 add x6,sp,#128 733 st1 {v6.2d,v7.2d},[x7],#32 734 eor v2.16b,v2.16b,v2.16b // v2 735 st1 {v8.2d,v9.2d},[x7],#32 736 eor v3.16b,v3.16b,v3.16b // v3 737 st1 {v10.2d,v11.2d},[x7],#32 738 st1 {v12.2d},[x7] 739 740 subs x9,x9,#8 741 ld1 {v6.2d,v7.2d},[x6],#32 742 ld1 {v8.2d,v9.2d},[x6],#32 743 ld1 {v10.2d,v11.2d},[x6],#32 744 ld1 {v12.2d,v13.2d},[x6],#32 745 746 b.eq .LInner_8n_jump_2steps 747 sub x3,x3,x5,lsl#2 // rewind 748 b .LNEON_8n_outer 749 750.LInner_8n_jump_2steps: 751 add x7,sp,#128 752 st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame 753 mov v5.16b,v6.16b 754 ushr 
v15.2d,v6.2d,#16 755 ext v6.16b,v6.16b,v6.16b,#8 756 st1 {v2.2d,v3.2d}, [sp],#32 757 add v6.2d,v6.2d,v15.2d 758 st1 {v2.2d,v3.2d}, [sp],#32 759 ushr v15.2d,v6.2d,#16 760 st1 {v2.2d,v3.2d}, [sp],#32 761 zip1 v6.4h,v5.4h,v6.4h 762 ins v15.d[1],v14.d[0] 763 764 mov x8,x5 765 b .LNEON_tail_entry 766 767.align 4 768.LNEON_tail: 769 add v6.2d,v6.2d,v15.2d 770 mov v5.16b,v6.16b 771 ushr v15.2d,v6.2d,#16 772 ext v6.16b,v6.16b,v6.16b,#8 773 ld1 {v8.2d,v9.2d}, [x6],#32 774 add v6.2d,v6.2d,v15.2d 775 ld1 {v10.2d,v11.2d}, [x6],#32 776 ushr v15.2d,v6.2d,#16 777 ld1 {v12.2d,v13.2d}, [x6],#32 778 zip1 v6.4h,v5.4h,v6.4h 779 ins v15.d[1],v14.d[0] 780 781.LNEON_tail_entry: 782 add v7.2d,v7.2d,v15.2d 783 st1 {v6.s}[0], [x7],#4 784 ushr v15.2d,v7.2d,#16 785 mov v5.16b,v7.16b 786 ext v7.16b,v7.16b,v7.16b,#8 787 add v7.2d,v7.2d,v15.2d 788 ushr v15.2d,v7.2d,#16 789 zip1 v7.4h,v5.4h,v7.4h 790 ins v15.d[1],v14.d[0] 791 add v8.2d,v8.2d,v15.2d 792 st1 {v7.s}[0], [x7],#4 793 ushr v15.2d,v8.2d,#16 794 mov v5.16b,v8.16b 795 ext v8.16b,v8.16b,v8.16b,#8 796 add v8.2d,v8.2d,v15.2d 797 ushr v15.2d,v8.2d,#16 798 zip1 v8.4h,v5.4h,v8.4h 799 ins v15.d[1],v14.d[0] 800 add v9.2d,v9.2d,v15.2d 801 st1 {v8.s}[0], [x7],#4 802 ushr v15.2d,v9.2d,#16 803 mov v5.16b,v9.16b 804 ext v9.16b,v9.16b,v9.16b,#8 805 add v9.2d,v9.2d,v15.2d 806 ushr v15.2d,v9.2d,#16 807 zip1 v9.4h,v5.4h,v9.4h 808 ins v15.d[1],v14.d[0] 809 add v10.2d,v10.2d,v15.2d 810 st1 {v9.s}[0], [x7],#4 811 ushr v15.2d,v10.2d,#16 812 mov v5.16b,v10.16b 813 ext v10.16b,v10.16b,v10.16b,#8 814 add v10.2d,v10.2d,v15.2d 815 ushr v15.2d,v10.2d,#16 816 zip1 v10.4h,v5.4h,v10.4h 817 ins v15.d[1],v14.d[0] 818 add v11.2d,v11.2d,v15.2d 819 st1 {v10.s}[0], [x7],#4 820 ushr v15.2d,v11.2d,#16 821 mov v5.16b,v11.16b 822 ext v11.16b,v11.16b,v11.16b,#8 823 add v11.2d,v11.2d,v15.2d 824 ushr v15.2d,v11.2d,#16 825 zip1 v11.4h,v5.4h,v11.4h 826 ins v15.d[1],v14.d[0] 827 add v12.2d,v12.2d,v15.2d 828 st1 {v11.s}[0], [x7],#4 829 ushr v15.2d,v12.2d,#16 830 mov v5.16b,v12.16b 831 
ext v12.16b,v12.16b,v12.16b,#8 832 add v12.2d,v12.2d,v15.2d 833 ushr v15.2d,v12.2d,#16 834 zip1 v12.4h,v5.4h,v12.4h 835 ins v15.d[1],v14.d[0] 836 add v13.2d,v13.2d,v15.2d 837 st1 {v12.s}[0], [x7],#4 838 ushr v15.2d,v13.2d,#16 839 mov v5.16b,v13.16b 840 ext v13.16b,v13.16b,v13.16b,#8 841 add v13.2d,v13.2d,v15.2d 842 ushr v15.2d,v13.2d,#16 843 zip1 v13.4h,v5.4h,v13.4h 844 ins v15.d[1],v14.d[0] 845 ld1 {v6.2d,v7.2d}, [x6],#32 846 subs x8,x8,#8 847 st1 {v13.s}[0], [x7],#4 848 bne .LNEON_tail 849 850 st1 {v15.s}[0], [x7],#4 // top-most bit 851 sub x3,x3,x5,lsl#2 // rewind x3 852 subs x1,sp,#0 // clear carry flag 853 add x2,sp,x5,lsl#2 854 855.LNEON_sub: 856 ldp w4,w5,[x1],#8 857 ldp w6,w7,[x1],#8 858 ldp w8,w9,[x3],#8 859 ldp w10,w11,[x3],#8 860 sbcs w8,w4,w8 861 sbcs w9,w5,w9 862 sbcs w10,w6,w10 863 sbcs w11,w7,w11 864 sub x17,x2,x1 865 stp w8,w9,[x0],#8 866 stp w10,w11,[x0],#8 867 cbnz x17,.LNEON_sub 868 869 ldr w10, [x1] // load top-most bit 870 mov x11,sp 871 eor v0.16b,v0.16b,v0.16b 872 sub x11,x2,x11 // this is num*4 873 eor v1.16b,v1.16b,v1.16b 874 mov x1,sp 875 sub x0,x0,x11 // rewind x0 876 mov x3,x2 // second 3/4th of frame 877 sbcs w10,w10,wzr // result is carry flag 878 879.LNEON_copy_n_zap: 880 ldp w4,w5,[x1],#8 881 ldp w6,w7,[x1],#8 882 ldp w8,w9,[x0],#8 883 ldp w10,w11,[x0] 884 sub x0,x0,#8 885 b.cs .LCopy_1 886 mov w8,w4 887 mov w9,w5 888 mov w10,w6 889 mov w11,w7 890.LCopy_1: 891 st1 {v0.2d,v1.2d}, [x3],#32 // wipe 892 st1 {v0.2d,v1.2d}, [x3],#32 // wipe 893 ldp w4,w5,[x1],#8 894 ldp w6,w7,[x1],#8 895 stp w8,w9,[x0],#8 896 stp w10,w11,[x0],#8 897 sub x1,x1,#32 898 ldp w8,w9,[x0],#8 899 ldp w10,w11,[x0] 900 sub x0,x0,#8 901 b.cs .LCopy_2 902 mov w8, w4 903 mov w9, w5 904 mov w10, w6 905 mov w11, w7 906.LCopy_2: 907 st1 {v0.2d,v1.2d}, [x1],#32 // wipe 908 st1 {v0.2d,v1.2d}, [x3],#32 // wipe 909 sub x17,x2,x1 // preserves carry 910 stp w8,w9,[x0],#8 911 stp w10,w11,[x0],#8 912 cbnz x17,.LNEON_copy_n_zap 913 914 mov sp,x16 915 ldp d14,d15,[sp,#64] 916 ldp 
d12,d13,[sp,#48] 917 ldp d10,d11,[sp,#32] 918 ldp d8,d9,[sp,#16] 919 ldr x29,[sp],#80 920 ret // bx lr 921 922.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 923.type __bn_sqr8x_mont,%function 924.align 5 925__bn_sqr8x_mont: 926 cmp x1,x2 927 b.ne __bn_mul4x_mont 928.Lsqr8x_mont: 929.inst 0xd503233f // paciasp 930 stp x29,x30,[sp,#-128]! 931 add x29,sp,#0 932 stp x19,x20,[sp,#16] 933 stp x21,x22,[sp,#32] 934 stp x23,x24,[sp,#48] 935 stp x25,x26,[sp,#64] 936 stp x27,x28,[sp,#80] 937 stp x0,x3,[sp,#96] // offload rp and np 938 939 ldp x6,x7,[x1,#8*0] 940 ldp x8,x9,[x1,#8*2] 941 ldp x10,x11,[x1,#8*4] 942 ldp x12,x13,[x1,#8*6] 943 944 sub x2,sp,x5,lsl#4 945 lsl x5,x5,#3 946 ldr x4,[x4] // *n0 947 mov sp,x2 // alloca 948 sub x27,x5,#8*8 949 b .Lsqr8x_zero_start 950 951.Lsqr8x_zero: 952 sub x27,x27,#8*8 953 stp xzr,xzr,[x2,#8*0] 954 stp xzr,xzr,[x2,#8*2] 955 stp xzr,xzr,[x2,#8*4] 956 stp xzr,xzr,[x2,#8*6] 957.Lsqr8x_zero_start: 958 stp xzr,xzr,[x2,#8*8] 959 stp xzr,xzr,[x2,#8*10] 960 stp xzr,xzr,[x2,#8*12] 961 stp xzr,xzr,[x2,#8*14] 962 add x2,x2,#8*16 963 cbnz x27,.Lsqr8x_zero 964 965 add x3,x1,x5 966 add x1,x1,#8*8 967 mov x19,xzr 968 mov x20,xzr 969 mov x21,xzr 970 mov x22,xzr 971 mov x23,xzr 972 mov x24,xzr 973 mov x25,xzr 974 mov x26,xzr 975 mov x2,sp 976 str x4,[x29,#112] // offload n0 977 978 // Multiply everything but a[i]*a[i] 979.align 4 980.Lsqr8x_outer_loop: 981 // a[1]a[0] (i) 982 // a[2]a[0] 983 // a[3]a[0] 984 // a[4]a[0] 985 // a[5]a[0] 986 // a[6]a[0] 987 // a[7]a[0] 988 // a[2]a[1] (ii) 989 // a[3]a[1] 990 // a[4]a[1] 991 // a[5]a[1] 992 // a[6]a[1] 993 // a[7]a[1] 994 // a[3]a[2] (iii) 995 // a[4]a[2] 996 // a[5]a[2] 997 // a[6]a[2] 998 // a[7]a[2] 999 // a[4]a[3] (iv) 1000 // a[5]a[3] 1001 // a[6]a[3] 1002 // a[7]a[3] 1003 // a[5]a[4] (v) 1004 // a[6]a[4] 1005 // a[7]a[4] 1006 // a[6]a[5] (vi) 1007 // a[7]a[5] 1008 // a[7]a[6] (vii) 1009 1010 mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) 1011 mul x15,x8,x6 1012 mul x16,x9,x6 1013 mul x17,x10,x6 1014 adds 
x20,x20,x14 // t[1]+lo(a[1]*a[0]) 1015 mul x14,x11,x6 1016 adcs x21,x21,x15 1017 mul x15,x12,x6 1018 adcs x22,x22,x16 1019 mul x16,x13,x6 1020 adcs x23,x23,x17 1021 umulh x17,x7,x6 // hi(a[1..7]*a[0]) 1022 adcs x24,x24,x14 1023 umulh x14,x8,x6 1024 adcs x25,x25,x15 1025 umulh x15,x9,x6 1026 adcs x26,x26,x16 1027 umulh x16,x10,x6 1028 stp x19,x20,[x2],#8*2 // t[0..1] 1029 adc x19,xzr,xzr // t[8] 1030 adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) 1031 umulh x17,x11,x6 1032 adcs x22,x22,x14 1033 umulh x14,x12,x6 1034 adcs x23,x23,x15 1035 umulh x15,x13,x6 1036 adcs x24,x24,x16 1037 mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) 1038 adcs x25,x25,x17 1039 mul x17,x9,x7 1040 adcs x26,x26,x14 1041 mul x14,x10,x7 1042 adc x19,x19,x15 1043 1044 mul x15,x11,x7 1045 adds x22,x22,x16 1046 mul x16,x12,x7 1047 adcs x23,x23,x17 1048 mul x17,x13,x7 1049 adcs x24,x24,x14 1050 umulh x14,x8,x7 // hi(a[2..7]*a[1]) 1051 adcs x25,x25,x15 1052 umulh x15,x9,x7 1053 adcs x26,x26,x16 1054 umulh x16,x10,x7 1055 adcs x19,x19,x17 1056 umulh x17,x11,x7 1057 stp x21,x22,[x2],#8*2 // t[2..3] 1058 adc x20,xzr,xzr // t[9] 1059 adds x23,x23,x14 1060 umulh x14,x12,x7 1061 adcs x24,x24,x15 1062 umulh x15,x13,x7 1063 adcs x25,x25,x16 1064 mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) 1065 adcs x26,x26,x17 1066 mul x17,x10,x8 1067 adcs x19,x19,x14 1068 mul x14,x11,x8 1069 adc x20,x20,x15 1070 1071 mul x15,x12,x8 1072 adds x24,x24,x16 1073 mul x16,x13,x8 1074 adcs x25,x25,x17 1075 umulh x17,x9,x8 // hi(a[3..7]*a[2]) 1076 adcs x26,x26,x14 1077 umulh x14,x10,x8 1078 adcs x19,x19,x15 1079 umulh x15,x11,x8 1080 adcs x20,x20,x16 1081 umulh x16,x12,x8 1082 stp x23,x24,[x2],#8*2 // t[4..5] 1083 adc x21,xzr,xzr // t[10] 1084 adds x25,x25,x17 1085 umulh x17,x13,x8 1086 adcs x26,x26,x14 1087 mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) 1088 adcs x19,x19,x15 1089 mul x15,x11,x9 1090 adcs x20,x20,x16 1091 mul x16,x12,x9 1092 adc x21,x21,x17 1093 1094 mul x17,x13,x9 1095 adds x26,x26,x14 1096 umulh x14,x10,x9 // hi(a[4..7]*a[3]) 1097 adcs 
x19,x19,x15 1098 umulh x15,x11,x9 1099 adcs x20,x20,x16 1100 umulh x16,x12,x9 1101 adcs x21,x21,x17 1102 umulh x17,x13,x9 1103 stp x25,x26,[x2],#8*2 // t[6..7] 1104 adc x22,xzr,xzr // t[11] 1105 adds x19,x19,x14 1106 mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) 1107 adcs x20,x20,x15 1108 mul x15,x12,x10 1109 adcs x21,x21,x16 1110 mul x16,x13,x10 1111 adc x22,x22,x17 1112 1113 umulh x17,x11,x10 // hi(a[5..7]*a[4]) 1114 adds x20,x20,x14 1115 umulh x14,x12,x10 1116 adcs x21,x21,x15 1117 umulh x15,x13,x10 1118 adcs x22,x22,x16 1119 mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) 1120 adc x23,xzr,xzr // t[12] 1121 adds x21,x21,x17 1122 mul x17,x13,x11 1123 adcs x22,x22,x14 1124 umulh x14,x12,x11 // hi(a[6..7]*a[5]) 1125 adc x23,x23,x15 1126 1127 umulh x15,x13,x11 1128 adds x22,x22,x16 1129 mul x16,x13,x12 // lo(a[7]*a[6]) (vii) 1130 adcs x23,x23,x17 1131 umulh x17,x13,x12 // hi(a[7]*a[6]) 1132 adc x24,xzr,xzr // t[13] 1133 adds x23,x23,x14 1134 sub x27,x3,x1 // done yet? 1135 adc x24,x24,x15 1136 1137 adds x24,x24,x16 1138 sub x14,x3,x5 // rewinded ap 1139 adc x25,xzr,xzr // t[14] 1140 add x25,x25,x17 1141 1142 cbz x27,.Lsqr8x_outer_break 1143 1144 mov x4,x6 1145 ldp x6,x7,[x2,#8*0] 1146 ldp x8,x9,[x2,#8*2] 1147 ldp x10,x11,[x2,#8*4] 1148 ldp x12,x13,[x2,#8*6] 1149 adds x19,x19,x6 1150 adcs x20,x20,x7 1151 ldp x6,x7,[x1,#8*0] 1152 adcs x21,x21,x8 1153 adcs x22,x22,x9 1154 ldp x8,x9,[x1,#8*2] 1155 adcs x23,x23,x10 1156 adcs x24,x24,x11 1157 ldp x10,x11,[x1,#8*4] 1158 adcs x25,x25,x12 1159 mov x0,x1 1160 adcs x26,xzr,x13 1161 ldp x12,x13,[x1,#8*6] 1162 add x1,x1,#8*8 1163 //adc x28,xzr,xzr // moved below 1164 mov x27,#-8*8 1165 1166 // a[8]a[0] 1167 // a[9]a[0] 1168 // a[a]a[0] 1169 // a[b]a[0] 1170 // a[c]a[0] 1171 // a[d]a[0] 1172 // a[e]a[0] 1173 // a[f]a[0] 1174 // a[8]a[1] 1175 // a[f]a[1]........................ 1176 // a[8]a[2] 1177 // a[f]a[2]........................ 1178 // a[8]a[3] 1179 // a[f]a[3]........................ 
1180 // a[8]a[4] 1181 // a[f]a[4]........................ 1182 // a[8]a[5] 1183 // a[f]a[5]........................ 1184 // a[8]a[6] 1185 // a[f]a[6]........................ 1186 // a[8]a[7] 1187 // a[f]a[7]........................ 1188.Lsqr8x_mul: 1189 mul x14,x6,x4 1190 adc x28,xzr,xzr // carry bit, modulo-scheduled 1191 mul x15,x7,x4 1192 add x27,x27,#8 1193 mul x16,x8,x4 1194 mul x17,x9,x4 1195 adds x19,x19,x14 1196 mul x14,x10,x4 1197 adcs x20,x20,x15 1198 mul x15,x11,x4 1199 adcs x21,x21,x16 1200 mul x16,x12,x4 1201 adcs x22,x22,x17 1202 mul x17,x13,x4 1203 adcs x23,x23,x14 1204 umulh x14,x6,x4 1205 adcs x24,x24,x15 1206 umulh x15,x7,x4 1207 adcs x25,x25,x16 1208 umulh x16,x8,x4 1209 adcs x26,x26,x17 1210 umulh x17,x9,x4 1211 adc x28,x28,xzr 1212 str x19,[x2],#8 1213 adds x19,x20,x14 1214 umulh x14,x10,x4 1215 adcs x20,x21,x15 1216 umulh x15,x11,x4 1217 adcs x21,x22,x16 1218 umulh x16,x12,x4 1219 adcs x22,x23,x17 1220 umulh x17,x13,x4 1221 ldr x4,[x0,x27] 1222 adcs x23,x24,x14 1223 adcs x24,x25,x15 1224 adcs x25,x26,x16 1225 adcs x26,x28,x17 1226 //adc x28,xzr,xzr // moved above 1227 cbnz x27,.Lsqr8x_mul 1228 // note that carry flag is guaranteed 1229 // to be zero at this point 1230 cmp x1,x3 // done yet? 1231 b.eq .Lsqr8x_break 1232 1233 ldp x6,x7,[x2,#8*0] 1234 ldp x8,x9,[x2,#8*2] 1235 ldp x10,x11,[x2,#8*4] 1236 ldp x12,x13,[x2,#8*6] 1237 adds x19,x19,x6 1238 ldur x4,[x0,#-8*8] 1239 adcs x20,x20,x7 1240 ldp x6,x7,[x1,#8*0] 1241 adcs x21,x21,x8 1242 adcs x22,x22,x9 1243 ldp x8,x9,[x1,#8*2] 1244 adcs x23,x23,x10 1245 adcs x24,x24,x11 1246 ldp x10,x11,[x1,#8*4] 1247 adcs x25,x25,x12 1248 mov x27,#-8*8 1249 adcs x26,x26,x13 1250 ldp x12,x13,[x1,#8*6] 1251 add x1,x1,#8*8 1252 //adc x28,xzr,xzr // moved above 1253 b .Lsqr8x_mul 1254 1255.align 4 1256.Lsqr8x_break: 1257 ldp x6,x7,[x0,#8*0] 1258 add x1,x0,#8*8 1259 ldp x8,x9,[x0,#8*2] 1260 sub x14,x3,x1 // is it last iteration? 
1261 ldp x10,x11,[x0,#8*4] 1262 sub x15,x2,x14 1263 ldp x12,x13,[x0,#8*6] 1264 cbz x14,.Lsqr8x_outer_loop 1265 1266 stp x19,x20,[x2,#8*0] 1267 ldp x19,x20,[x15,#8*0] 1268 stp x21,x22,[x2,#8*2] 1269 ldp x21,x22,[x15,#8*2] 1270 stp x23,x24,[x2,#8*4] 1271 ldp x23,x24,[x15,#8*4] 1272 stp x25,x26,[x2,#8*6] 1273 mov x2,x15 1274 ldp x25,x26,[x15,#8*6] 1275 b .Lsqr8x_outer_loop 1276 1277.align 4 1278.Lsqr8x_outer_break: 1279 // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] 1280 ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] 1281 ldp x15,x16,[sp,#8*1] 1282 ldp x11,x13,[x14,#8*2] 1283 add x1,x14,#8*4 1284 ldp x17,x14,[sp,#8*3] 1285 1286 stp x19,x20,[x2,#8*0] 1287 mul x19,x7,x7 1288 stp x21,x22,[x2,#8*2] 1289 umulh x7,x7,x7 1290 stp x23,x24,[x2,#8*4] 1291 mul x8,x9,x9 1292 stp x25,x26,[x2,#8*6] 1293 mov x2,sp 1294 umulh x9,x9,x9 1295 adds x20,x7,x15,lsl#1 1296 extr x15,x16,x15,#63 1297 sub x27,x5,#8*4 1298 1299.Lsqr4x_shift_n_add: 1300 adcs x21,x8,x15 1301 extr x16,x17,x16,#63 1302 sub x27,x27,#8*4 1303 adcs x22,x9,x16 1304 ldp x15,x16,[x2,#8*5] 1305 mul x10,x11,x11 1306 ldp x7,x9,[x1],#8*2 1307 umulh x11,x11,x11 1308 mul x12,x13,x13 1309 umulh x13,x13,x13 1310 extr x17,x14,x17,#63 1311 stp x19,x20,[x2,#8*0] 1312 adcs x23,x10,x17 1313 extr x14,x15,x14,#63 1314 stp x21,x22,[x2,#8*2] 1315 adcs x24,x11,x14 1316 ldp x17,x14,[x2,#8*7] 1317 extr x15,x16,x15,#63 1318 adcs x25,x12,x15 1319 extr x16,x17,x16,#63 1320 adcs x26,x13,x16 1321 ldp x15,x16,[x2,#8*9] 1322 mul x6,x7,x7 1323 ldp x11,x13,[x1],#8*2 1324 umulh x7,x7,x7 1325 mul x8,x9,x9 1326 umulh x9,x9,x9 1327 stp x23,x24,[x2,#8*4] 1328 extr x17,x14,x17,#63 1329 stp x25,x26,[x2,#8*6] 1330 add x2,x2,#8*8 1331 adcs x19,x6,x17 1332 extr x14,x15,x14,#63 1333 adcs x20,x7,x14 1334 ldp x17,x14,[x2,#8*3] 1335 extr x15,x16,x15,#63 1336 cbnz x27,.Lsqr4x_shift_n_add 1337 ldp x1,x4,[x29,#104] // pull np and n0 1338 1339 adcs x21,x8,x15 1340 extr x16,x17,x16,#63 1341 adcs x22,x9,x16 1342 ldp x15,x16,[x2,#8*5] 1343 mul 
x10,x11,x11 1344 umulh x11,x11,x11 1345 stp x19,x20,[x2,#8*0] 1346 mul x12,x13,x13 1347 umulh x13,x13,x13 1348 stp x21,x22,[x2,#8*2] 1349 extr x17,x14,x17,#63 1350 adcs x23,x10,x17 1351 extr x14,x15,x14,#63 1352 ldp x19,x20,[sp,#8*0] 1353 adcs x24,x11,x14 1354 extr x15,x16,x15,#63 1355 ldp x6,x7,[x1,#8*0] 1356 adcs x25,x12,x15 1357 extr x16,xzr,x16,#63 1358 ldp x8,x9,[x1,#8*2] 1359 adc x26,x13,x16 1360 ldp x10,x11,[x1,#8*4] 1361 1362 // Reduce by 512 bits per iteration 1363 mul x28,x4,x19 // t[0]*n0 1364 ldp x12,x13,[x1,#8*6] 1365 add x3,x1,x5 1366 ldp x21,x22,[sp,#8*2] 1367 stp x23,x24,[x2,#8*4] 1368 ldp x23,x24,[sp,#8*4] 1369 stp x25,x26,[x2,#8*6] 1370 ldp x25,x26,[sp,#8*6] 1371 add x1,x1,#8*8 1372 mov x30,xzr // initial top-most carry 1373 mov x2,sp 1374 mov x27,#8 1375 1376.Lsqr8x_reduction: 1377 // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) 1378 mul x15,x7,x28 1379 sub x27,x27,#1 1380 mul x16,x8,x28 1381 str x28,[x2],#8 // put aside t[0]*n0 for tail processing 1382 mul x17,x9,x28 1383 // (*) adds xzr,x19,x14 1384 subs xzr,x19,#1 // (*) 1385 mul x14,x10,x28 1386 adcs x19,x20,x15 1387 mul x15,x11,x28 1388 adcs x20,x21,x16 1389 mul x16,x12,x28 1390 adcs x21,x22,x17 1391 mul x17,x13,x28 1392 adcs x22,x23,x14 1393 umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) 1394 adcs x23,x24,x15 1395 umulh x15,x7,x28 1396 adcs x24,x25,x16 1397 umulh x16,x8,x28 1398 adcs x25,x26,x17 1399 umulh x17,x9,x28 1400 adc x26,xzr,xzr 1401 adds x19,x19,x14 1402 umulh x14,x10,x28 1403 adcs x20,x20,x15 1404 umulh x15,x11,x28 1405 adcs x21,x21,x16 1406 umulh x16,x12,x28 1407 adcs x22,x22,x17 1408 umulh x17,x13,x28 1409 mul x28,x4,x19 // next t[0]*n0 1410 adcs x23,x23,x14 1411 adcs x24,x24,x15 1412 adcs x25,x25,x16 1413 adc x26,x26,x17 1414 cbnz x27,.Lsqr8x_reduction 1415 1416 ldp x14,x15,[x2,#8*0] 1417 ldp x16,x17,[x2,#8*2] 1418 mov x0,x2 1419 sub x27,x3,x1 // done yet? 
1420 adds x19,x19,x14 1421 adcs x20,x20,x15 1422 ldp x14,x15,[x2,#8*4] 1423 adcs x21,x21,x16 1424 adcs x22,x22,x17 1425 ldp x16,x17,[x2,#8*6] 1426 adcs x23,x23,x14 1427 adcs x24,x24,x15 1428 adcs x25,x25,x16 1429 adcs x26,x26,x17 1430 //adc x28,xzr,xzr // moved below 1431 cbz x27,.Lsqr8x8_post_condition 1432 1433 ldur x4,[x2,#-8*8] 1434 ldp x6,x7,[x1,#8*0] 1435 ldp x8,x9,[x1,#8*2] 1436 ldp x10,x11,[x1,#8*4] 1437 mov x27,#-8*8 1438 ldp x12,x13,[x1,#8*6] 1439 add x1,x1,#8*8 1440 1441.Lsqr8x_tail: 1442 mul x14,x6,x4 1443 adc x28,xzr,xzr // carry bit, modulo-scheduled 1444 mul x15,x7,x4 1445 add x27,x27,#8 1446 mul x16,x8,x4 1447 mul x17,x9,x4 1448 adds x19,x19,x14 1449 mul x14,x10,x4 1450 adcs x20,x20,x15 1451 mul x15,x11,x4 1452 adcs x21,x21,x16 1453 mul x16,x12,x4 1454 adcs x22,x22,x17 1455 mul x17,x13,x4 1456 adcs x23,x23,x14 1457 umulh x14,x6,x4 1458 adcs x24,x24,x15 1459 umulh x15,x7,x4 1460 adcs x25,x25,x16 1461 umulh x16,x8,x4 1462 adcs x26,x26,x17 1463 umulh x17,x9,x4 1464 adc x28,x28,xzr 1465 str x19,[x2],#8 1466 adds x19,x20,x14 1467 umulh x14,x10,x4 1468 adcs x20,x21,x15 1469 umulh x15,x11,x4 1470 adcs x21,x22,x16 1471 umulh x16,x12,x4 1472 adcs x22,x23,x17 1473 umulh x17,x13,x4 1474 ldr x4,[x0,x27] 1475 adcs x23,x24,x14 1476 adcs x24,x25,x15 1477 adcs x25,x26,x16 1478 adcs x26,x28,x17 1479 //adc x28,xzr,xzr // moved above 1480 cbnz x27,.Lsqr8x_tail 1481 // note that carry flag is guaranteed 1482 // to be zero at this point 1483 ldp x6,x7,[x2,#8*0] 1484 sub x27,x3,x1 // done yet? 
1485 sub x16,x3,x5 // rewinded np 1486 ldp x8,x9,[x2,#8*2] 1487 ldp x10,x11,[x2,#8*4] 1488 ldp x12,x13,[x2,#8*6] 1489 cbz x27,.Lsqr8x_tail_break 1490 1491 ldur x4,[x0,#-8*8] 1492 adds x19,x19,x6 1493 adcs x20,x20,x7 1494 ldp x6,x7,[x1,#8*0] 1495 adcs x21,x21,x8 1496 adcs x22,x22,x9 1497 ldp x8,x9,[x1,#8*2] 1498 adcs x23,x23,x10 1499 adcs x24,x24,x11 1500 ldp x10,x11,[x1,#8*4] 1501 adcs x25,x25,x12 1502 mov x27,#-8*8 1503 adcs x26,x26,x13 1504 ldp x12,x13,[x1,#8*6] 1505 add x1,x1,#8*8 1506 //adc x28,xzr,xzr // moved above 1507 b .Lsqr8x_tail 1508 1509.align 4 1510.Lsqr8x_tail_break: 1511 ldr x4,[x29,#112] // pull n0 1512 add x27,x2,#8*8 // end of current t[num] window 1513 1514 subs xzr,x30,#1 // "move" top-most carry to carry bit 1515 adcs x14,x19,x6 1516 adcs x15,x20,x7 1517 ldp x19,x20,[x0,#8*0] 1518 adcs x21,x21,x8 1519 ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] 1520 adcs x22,x22,x9 1521 ldp x8,x9,[x16,#8*2] 1522 adcs x23,x23,x10 1523 adcs x24,x24,x11 1524 ldp x10,x11,[x16,#8*4] 1525 adcs x25,x25,x12 1526 adcs x26,x26,x13 1527 ldp x12,x13,[x16,#8*6] 1528 add x1,x16,#8*8 1529 adc x30,xzr,xzr // top-most carry 1530 mul x28,x4,x19 1531 stp x14,x15,[x2,#8*0] 1532 stp x21,x22,[x2,#8*2] 1533 ldp x21,x22,[x0,#8*2] 1534 stp x23,x24,[x2,#8*4] 1535 ldp x23,x24,[x0,#8*4] 1536 cmp x27,x29 // did we hit the bottom? 1537 stp x25,x26,[x2,#8*6] 1538 mov x2,x0 // slide the window 1539 ldp x25,x26,[x0,#8*6] 1540 mov x27,#8 1541 b.ne .Lsqr8x_reduction 1542 1543 // Final step. We see if result is larger than modulus, and 1544 // if it is, subtract the modulus. But comparison implies 1545 // subtraction. So we subtract modulus, see if it borrowed, 1546 // and conditionally copy original value. 
1547 ldr x0,[x29,#96] // pull rp 1548 add x2,x2,#8*8 1549 subs x14,x19,x6 1550 sbcs x15,x20,x7 1551 sub x27,x5,#8*8 1552 mov x3,x0 // x0 copy 1553 1554.Lsqr8x_sub: 1555 sbcs x16,x21,x8 1556 ldp x6,x7,[x1,#8*0] 1557 sbcs x17,x22,x9 1558 stp x14,x15,[x0,#8*0] 1559 sbcs x14,x23,x10 1560 ldp x8,x9,[x1,#8*2] 1561 sbcs x15,x24,x11 1562 stp x16,x17,[x0,#8*2] 1563 sbcs x16,x25,x12 1564 ldp x10,x11,[x1,#8*4] 1565 sbcs x17,x26,x13 1566 ldp x12,x13,[x1,#8*6] 1567 add x1,x1,#8*8 1568 ldp x19,x20,[x2,#8*0] 1569 sub x27,x27,#8*8 1570 ldp x21,x22,[x2,#8*2] 1571 ldp x23,x24,[x2,#8*4] 1572 ldp x25,x26,[x2,#8*6] 1573 add x2,x2,#8*8 1574 stp x14,x15,[x0,#8*4] 1575 sbcs x14,x19,x6 1576 stp x16,x17,[x0,#8*6] 1577 add x0,x0,#8*8 1578 sbcs x15,x20,x7 1579 cbnz x27,.Lsqr8x_sub 1580 1581 sbcs x16,x21,x8 1582 mov x2,sp 1583 add x1,sp,x5 1584 ldp x6,x7,[x3,#8*0] 1585 sbcs x17,x22,x9 1586 stp x14,x15,[x0,#8*0] 1587 sbcs x14,x23,x10 1588 ldp x8,x9,[x3,#8*2] 1589 sbcs x15,x24,x11 1590 stp x16,x17,[x0,#8*2] 1591 sbcs x16,x25,x12 1592 ldp x19,x20,[x1,#8*0] 1593 sbcs x17,x26,x13 1594 ldp x21,x22,[x1,#8*2] 1595 sbcs xzr,x30,xzr // did it borrow? 
1596 ldr x30,[x29,#8] // pull return address 1597 stp x14,x15,[x0,#8*4] 1598 stp x16,x17,[x0,#8*6] 1599 1600 sub x27,x5,#8*4 1601.Lsqr4x_cond_copy: 1602 sub x27,x27,#8*4 1603 csel x14,x19,x6,lo 1604 stp xzr,xzr,[x2,#8*0] 1605 csel x15,x20,x7,lo 1606 ldp x6,x7,[x3,#8*4] 1607 ldp x19,x20,[x1,#8*4] 1608 csel x16,x21,x8,lo 1609 stp xzr,xzr,[x2,#8*2] 1610 add x2,x2,#8*4 1611 csel x17,x22,x9,lo 1612 ldp x8,x9,[x3,#8*6] 1613 ldp x21,x22,[x1,#8*6] 1614 add x1,x1,#8*4 1615 stp x14,x15,[x3,#8*0] 1616 stp x16,x17,[x3,#8*2] 1617 add x3,x3,#8*4 1618 stp xzr,xzr,[x1,#8*0] 1619 stp xzr,xzr,[x1,#8*2] 1620 cbnz x27,.Lsqr4x_cond_copy 1621 1622 csel x14,x19,x6,lo 1623 stp xzr,xzr,[x2,#8*0] 1624 csel x15,x20,x7,lo 1625 stp xzr,xzr,[x2,#8*2] 1626 csel x16,x21,x8,lo 1627 csel x17,x22,x9,lo 1628 stp x14,x15,[x3,#8*0] 1629 stp x16,x17,[x3,#8*2] 1630 1631 b .Lsqr8x_done 1632 1633.align 4 1634.Lsqr8x8_post_condition: 1635 adc x28,xzr,xzr 1636 ldr x30,[x29,#8] // pull return address 1637 // x19-7,x28 hold result, x6-7 hold modulus 1638 subs x6,x19,x6 1639 ldr x1,[x29,#96] // pull rp 1640 sbcs x7,x20,x7 1641 stp xzr,xzr,[sp,#8*0] 1642 sbcs x8,x21,x8 1643 stp xzr,xzr,[sp,#8*2] 1644 sbcs x9,x22,x9 1645 stp xzr,xzr,[sp,#8*4] 1646 sbcs x10,x23,x10 1647 stp xzr,xzr,[sp,#8*6] 1648 sbcs x11,x24,x11 1649 stp xzr,xzr,[sp,#8*8] 1650 sbcs x12,x25,x12 1651 stp xzr,xzr,[sp,#8*10] 1652 sbcs x13,x26,x13 1653 stp xzr,xzr,[sp,#8*12] 1654 sbcs x28,x28,xzr // did it borrow? 
// ---- tail of __bn_sqr8x_mont: conditional copy of result + epilogue ----
// On entry here the carry flag holds the borrow of the preceding sbcs
// chain ("did it borrow?"): lo => result < modulus after subtraction,
// so the subtracted value is kept; otherwise the original is kept.
	stp	xzr,xzr,[sp,#8*14]	// wipe last pair of tp words on the stack

	// x6-7 hold result-modulus
	csel	x6,x19,x6,lo		// branch-free select: constant-time final reduction
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]		// store final result to rp (here in x1)
	csel	x10,x23,x10,lo
	csel	x11,x24,x11,lo
	stp	x8,x9,[x1,#8*2]
	csel	x12,x25,x12,lo
	csel	x13,x26,x13,lo
	stp	x10,x11,[x1,#8*4]
	stp	x12,x13,[x1,#8*6]

.Lsqr8x_done:
	ldp	x19,x20,[x29,#16]	// restore callee-saved x19-x28
	mov	sp,x29			// unwind the alloca'd tp area
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return 1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128		// pop 128-byte frame
.inst	0xd50323bf		// autiasp (authenticate LR; NOP on cores without PAC)
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

//----------------------------------------------------------------------
// __bn_mul4x_mont -- Montgomery multiplication, 4 words per pass.
//
// Internal entry, reached from bn_mul_mont when num%4==0 (and the
// sqr8x/neon paths were not taken).  Arguments as for bn_mul_mont:
//   x0 = rp (result), x1 = ap, x2 = bp, x3 = np (modulus),
//   x4 = &n0 (Montgomery constant), x5 = num (words)
// Returns 1 in x0.  Uses a stack area of num*8+32 bytes for tp.
//
// NOTE(review): this file is auto-generated from armv8-mont.pl;
// any functional change belongs in the perl source, not here.
//----------------------------------------------------------------------
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
.inst	0xd503233f		// paciasp (sign LR; NOP on cores without PAC)
	stp	x29,x30,[sp,#-128]!	// 128-byte frame
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]	// save callee-saved registers
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// sp - num*8
	lsl	x5,x5,#3		// num *= 8 (byte count from here on)
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca: tp[num] + 4 spare words

	add	x10,x2,x5		// &b[num]
	add	x27,x1,x5		// &a[num], used as end-of-ap sentinel
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr			// t[0..3] accumulator starts at zero
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x28,#0			// b-index, cycles 0,8,16,24 (mod 32)
	mov	x26,sp			// tp write pointer

// First pass over b[0..3]: multiply-accumulate fused with the first
// Montgomery reduction step.  The commented-out "(*)" instructions are
// removed by the same trick as in .Lmul_mont: the low-word add is known
// to produce zero, and its carry equals (x19 != 0), computed by subs.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// wrap b-index to the 4-word window
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry iff x19 != 0
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// x10 == 0 once ap is exhausted
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: go straight to final reduction

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Tail of the first pass: remaining a[]/n[] words against the same
// 4 b-words, consuming the t[0]*n0 values stashed on the stack.
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

// Advance to the next 4 b-words and restart over a[0..] / n[0..].
.align	5
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// x30 carries top-most overflow across passes
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

// Main pass: same structure as .Loop_mul4x_1st_reduction, but now
// accumulating into previously computed tp words.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry iff x19 != 0
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Tail of the main pass, mirroring .Loop_mul4x_1st_tail but adding
// into existing tp contents.
.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

// End of one full pass over a[]; fold in the inter-pass carry (x30)
// and either loop for the next 4 b-words or fall through to post.
.align	4
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12			// rp
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4		// loop counter: num-4 words (in bytes)

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
// Constant-time select between rp[] (subtracted) and tp[] (original),
// zeroing the stack copy of tp as we go.
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo		// lo (borrow) => keep unsubtracted value
	stp	xzr,xzr,[x26,#8*0]	// wipe tp
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]	// NOTE(review): odd offsets #8*3/#8*4 break the
	csel	x13,x22,x9,lo		// #8*even pattern above -- confirm against
	stp	xzr,xzr,[x26,#8*4]	// armv8-mont.pl that this is intentional
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

// num==4 special case: whole result is still in x19-x22, modulus in
// x14-x17; reduce, wipe tp, and store 4 words directly.
.align	4
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-3,x0 hold result, x14-7 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe tp
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-3 hold result-modulus
	csel	x6,x19,x6,lo
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]	// restore callee-saved registers
	mov	sp,x29			// unwind alloca
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return 1
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
.inst	0xd50323bf		// autiasp
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4