1bc3d5698SJohn Baldwin/* Do not modify. This file is auto-generated from armv8-mont.pl. */
// NOTE(review): generated file — any real change belongs in armv8-mont.pl.
2c0855eaaSJohn Baldwin#include "arm_arch.h"
3bd9588bcSAndrew Turner#ifndef __KERNEL__
4c0855eaaSJohn Baldwin
5c0855eaaSJohn Baldwin.hidden OPENSSL_armv8_rsa_neonized
6c0855eaaSJohn Baldwin#endif
7bc3d5698SJohn Baldwin.text
8bc3d5698SJohn Baldwin
// int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
//                 const BN_ULONG *np, const BN_ULONG *n0, int num)
// AAPCS64 registers: x0=rp (result), x1=ap, x2=bp, x3=np (modulus),
// x4=&n0 (dereferenced below), x5=num (limb count).  Returns 1 in x0.
// Montgomery multiplication; a num-limb scratch vector tp[] is alloca'd
// below sp.  Clobbers x6-x17 and callee-saved x19-x24 (saved/restored).
9bc3d5698SJohn Baldwin.globl bn_mul_mont
10bc3d5698SJohn Baldwin.type bn_mul_mont,%function
11bc3d5698SJohn Baldwin.align 5
12bc3d5698SJohn Baldwinbn_mul_mont:
13bd9588bcSAndrew Turner AARCH64_SIGN_LINK_REGISTER
14c0855eaaSJohn Baldwin.Lbn_mul_mont:
// Dispatch on num: not a multiple of 4 -> generic scalar loop (.Lmul_mont);
// num <= 32 -> scalar implementations; otherwise, in userland on
// little-endian targets, take the NEON path when the run-time capability
// flag OPENSSL_armv8_rsa_neonized is set.
15c0855eaaSJohn Baldwin tst x5,#3
16c0855eaaSJohn Baldwin b.ne .Lmul_mont
17c0855eaaSJohn Baldwin cmp x5,#32
18c0855eaaSJohn Baldwin b.le .Lscalar_impl
19c0855eaaSJohn Baldwin#ifndef __KERNEL__
20*d2a55e6aSEnji Cooper#ifndef __AARCH64EB__
21c0855eaaSJohn Baldwin adrp x17,OPENSSL_armv8_rsa_neonized
22c0855eaaSJohn Baldwin ldr w17,[x17,#:lo12:OPENSSL_armv8_rsa_neonized]
23c0855eaaSJohn Baldwin cbnz w17, bn_mul8x_mont_neon
24c0855eaaSJohn Baldwin#endif
25*d2a55e6aSEnji Cooper#endif
26c0855eaaSJohn Baldwin
27c0855eaaSJohn Baldwin.Lscalar_impl:
// num divisible by 8 -> squaring-capable 8x variant; by 4 -> 4x variant.
28bc3d5698SJohn Baldwin tst x5,#7
29bc3d5698SJohn Baldwin b.eq __bn_sqr8x_mont
30bc3d5698SJohn Baldwin tst x5,#3
31bc3d5698SJohn Baldwin b.eq __bn_mul4x_mont
32c0855eaaSJohn Baldwin
// Generic scalar path: works for any num.
33bc3d5698SJohn Baldwin.Lmul_mont:
34bc3d5698SJohn Baldwin stp x29,x30,[sp,#-64]!
35bc3d5698SJohn Baldwin add x29,sp,#0
36bc3d5698SJohn Baldwin stp x19,x20,[sp,#16]
37bc3d5698SJohn Baldwin stp x21,x22,[sp,#32]
38bc3d5698SJohn Baldwin stp x23,x24,[sp,#48]
39bc3d5698SJohn Baldwin
// x22 = sp - num*8, 16-byte aligned: scratch tp[] lives below the frame.
40bc3d5698SJohn Baldwin ldr x9,[x2],#8 // bp[0]
41bc3d5698SJohn Baldwin sub x22,sp,x5,lsl#3
42bc3d5698SJohn Baldwin ldp x7,x8,[x1],#16 // ap[0..1]
43bc3d5698SJohn Baldwin lsl x5,x5,#3
44bc3d5698SJohn Baldwin ldr x4,[x4] // *n0
45bc3d5698SJohn Baldwin and x22,x22,#-16 // ABI says so
46bc3d5698SJohn Baldwin ldp x13,x14,[x3],#16 // np[0..1]
47bc3d5698SJohn Baldwin
48bc3d5698SJohn Baldwin mul x6,x7,x9 // ap[0]*bp[0]
49bc3d5698SJohn Baldwin sub x21,x5,#16 // j=num-2
50bc3d5698SJohn Baldwin umulh x7,x7,x9
51bc3d5698SJohn Baldwin mul x10,x8,x9 // ap[1]*bp[0]
52bc3d5698SJohn Baldwin umulh x11,x8,x9
53bc3d5698SJohn Baldwin
54bc3d5698SJohn Baldwin mul x15,x6,x4 // "tp[0]"*n0
55bc3d5698SJohn Baldwin mov sp,x22 // alloca
56bc3d5698SJohn Baldwin
57bc3d5698SJohn Baldwin // (*) mul x12,x13,x15 // np[0]*m1
58bc3d5698SJohn Baldwin umulh x13,x13,x15
59bc3d5698SJohn Baldwin mul x16,x14,x15 // np[1]*m1
60bc3d5698SJohn Baldwin // (*) adds x12,x12,x6 // discarded
61bc3d5698SJohn Baldwin // (*) As for removal of first multiplication and addition
62bc3d5698SJohn Baldwin // instructions. The outcome of first addition is
63bc3d5698SJohn Baldwin // guaranteed to be zero, which leaves two computationally
64bc3d5698SJohn Baldwin // significant outcomes: it either carries or not. Then
65bc3d5698SJohn Baldwin // question is when does it carry? Is there alternative
66bc3d5698SJohn Baldwin // way to deduce it? If you follow operations, you can
67bc3d5698SJohn Baldwin // observe that condition for carry is quite simple:
68bc3d5698SJohn Baldwin // x6 being non-zero. So that carry can be calculated
69bc3d5698SJohn Baldwin // by adding -1 to x6. That's what next instruction does.
70bc3d5698SJohn Baldwin subs xzr,x6,#1 // (*)
71bc3d5698SJohn Baldwin umulh x17,x14,x15
72bc3d5698SJohn Baldwin adc x13,x13,xzr
73bc3d5698SJohn Baldwin cbz x21,.L1st_skip
74bc3d5698SJohn Baldwin
// First pass (i=0): accumulate ap[j]*bp[0] + np[j]*m1 into tp[] for
// j = 2..num-1; x21 counts remaining limbs in bytes.
75bc3d5698SJohn Baldwin.L1st:
76bc3d5698SJohn Baldwin ldr x8,[x1],#8
77bc3d5698SJohn Baldwin adds x6,x10,x7
78bc3d5698SJohn Baldwin sub x21,x21,#8 // j--
79bc3d5698SJohn Baldwin adc x7,x11,xzr
80bc3d5698SJohn Baldwin
81bc3d5698SJohn Baldwin ldr x14,[x3],#8
82bc3d5698SJohn Baldwin adds x12,x16,x13
83bc3d5698SJohn Baldwin mul x10,x8,x9 // ap[j]*bp[0]
84bc3d5698SJohn Baldwin adc x13,x17,xzr
85bc3d5698SJohn Baldwin umulh x11,x8,x9
86bc3d5698SJohn Baldwin
87bc3d5698SJohn Baldwin adds x12,x12,x6
88bc3d5698SJohn Baldwin mul x16,x14,x15 // np[j]*m1
89bc3d5698SJohn Baldwin adc x13,x13,xzr
90bc3d5698SJohn Baldwin umulh x17,x14,x15
91bc3d5698SJohn Baldwin str x12,[x22],#8 // tp[j-1]
92bc3d5698SJohn Baldwin cbnz x21,.L1st
93bc3d5698SJohn Baldwin
94bc3d5698SJohn Baldwin.L1st_skip:
95bc3d5698SJohn Baldwin adds x6,x10,x7
96bc3d5698SJohn Baldwin sub x1,x1,x5 // rewind x1
97bc3d5698SJohn Baldwin adc x7,x11,xzr
98bc3d5698SJohn Baldwin
99bc3d5698SJohn Baldwin adds x12,x16,x13
100bc3d5698SJohn Baldwin sub x3,x3,x5 // rewind x3
101bc3d5698SJohn Baldwin adc x13,x17,xzr
102bc3d5698SJohn Baldwin
103bc3d5698SJohn Baldwin adds x12,x12,x6
104bc3d5698SJohn Baldwin sub x20,x5,#8 // i=num-1
105bc3d5698SJohn Baldwin adcs x13,x13,x7
106bc3d5698SJohn Baldwin
107bc3d5698SJohn Baldwin adc x19,xzr,xzr // upmost overflow bit
108bc3d5698SJohn Baldwin stp x12,x13,[x22]
109bc3d5698SJohn Baldwin
// Outer loop: one iteration per remaining bp[i] word (x20 counts down in
// bytes); each pass adds ap[]*bp[i] + np[]*m1 into tp[] and shifts by one
// limb, with x19 holding the carry out of the top limb.
110bc3d5698SJohn Baldwin.Louter:
111bc3d5698SJohn Baldwin ldr x9,[x2],#8 // bp[i]
112bc3d5698SJohn Baldwin ldp x7,x8,[x1],#16
113bc3d5698SJohn Baldwin ldr x23,[sp] // tp[0]
114bc3d5698SJohn Baldwin add x22,sp,#8
115bc3d5698SJohn Baldwin
116bc3d5698SJohn Baldwin mul x6,x7,x9 // ap[0]*bp[i]
117bc3d5698SJohn Baldwin sub x21,x5,#16 // j=num-2
118bc3d5698SJohn Baldwin umulh x7,x7,x9
119bc3d5698SJohn Baldwin ldp x13,x14,[x3],#16
120bc3d5698SJohn Baldwin mul x10,x8,x9 // ap[1]*bp[i]
121bc3d5698SJohn Baldwin adds x6,x6,x23
122bc3d5698SJohn Baldwin umulh x11,x8,x9
123bc3d5698SJohn Baldwin adc x7,x7,xzr
124bc3d5698SJohn Baldwin
125bc3d5698SJohn Baldwin mul x15,x6,x4
126bc3d5698SJohn Baldwin sub x20,x20,#8 // i--
127bc3d5698SJohn Baldwin
128bc3d5698SJohn Baldwin // (*) mul x12,x13,x15 // np[0]*m1
129bc3d5698SJohn Baldwin umulh x13,x13,x15
130bc3d5698SJohn Baldwin mul x16,x14,x15 // np[1]*m1
131bc3d5698SJohn Baldwin // (*) adds x12,x12,x6
// Same carry trick as in the first pass: CF := (x6 != 0).
132bc3d5698SJohn Baldwin subs xzr,x6,#1 // (*)
133bc3d5698SJohn Baldwin umulh x17,x14,x15
134bc3d5698SJohn Baldwin cbz x21,.Linner_skip
135bc3d5698SJohn Baldwin
136bc3d5698SJohn Baldwin.Linner:
137bc3d5698SJohn Baldwin ldr x8,[x1],#8
138bc3d5698SJohn Baldwin adc x13,x13,xzr
139bc3d5698SJohn Baldwin ldr x23,[x22],#8 // tp[j]
140bc3d5698SJohn Baldwin adds x6,x10,x7
141bc3d5698SJohn Baldwin sub x21,x21,#8 // j--
142bc3d5698SJohn Baldwin adc x7,x11,xzr
143bc3d5698SJohn Baldwin
144bc3d5698SJohn Baldwin adds x12,x16,x13
145bc3d5698SJohn Baldwin ldr x14,[x3],#8
146bc3d5698SJohn Baldwin adc x13,x17,xzr
147bc3d5698SJohn Baldwin
148bc3d5698SJohn Baldwin mul x10,x8,x9 // ap[j]*bp[i]
149bc3d5698SJohn Baldwin adds x6,x6,x23
150bc3d5698SJohn Baldwin umulh x11,x8,x9
151bc3d5698SJohn Baldwin adc x7,x7,xzr
152bc3d5698SJohn Baldwin
153bc3d5698SJohn Baldwin mul x16,x14,x15 // np[j]*m1
154bc3d5698SJohn Baldwin adds x12,x12,x6
155bc3d5698SJohn Baldwin umulh x17,x14,x15
156c0855eaaSJohn Baldwin stur x12,[x22,#-16] // tp[j-1]
157bc3d5698SJohn Baldwin cbnz x21,.Linner
158bc3d5698SJohn Baldwin
159bc3d5698SJohn Baldwin.Linner_skip:
160bc3d5698SJohn Baldwin ldr x23,[x22],#8 // tp[j]
161bc3d5698SJohn Baldwin adc x13,x13,xzr
162bc3d5698SJohn Baldwin adds x6,x10,x7
163bc3d5698SJohn Baldwin sub x1,x1,x5 // rewind x1
164bc3d5698SJohn Baldwin adc x7,x11,xzr
165bc3d5698SJohn Baldwin
166bc3d5698SJohn Baldwin adds x12,x16,x13
167bc3d5698SJohn Baldwin sub x3,x3,x5 // rewind x3
168bc3d5698SJohn Baldwin adcs x13,x17,x19
169bc3d5698SJohn Baldwin adc x19,xzr,xzr
170bc3d5698SJohn Baldwin
171bc3d5698SJohn Baldwin adds x6,x6,x23
172bc3d5698SJohn Baldwin adc x7,x7,xzr
173bc3d5698SJohn Baldwin
174bc3d5698SJohn Baldwin adds x12,x12,x6
175bc3d5698SJohn Baldwin adcs x13,x13,x7
176bc3d5698SJohn Baldwin adc x19,x19,xzr // upmost overflow bit
177bc3d5698SJohn Baldwin stp x12,x13,[x22,#-16]
178bc3d5698SJohn Baldwin
179bc3d5698SJohn Baldwin cbnz x20,.Louter
180bc3d5698SJohn Baldwin
181bc3d5698SJohn Baldwin // Final step. We see if result is larger than modulus, and
182bc3d5698SJohn Baldwin // if it is, subtract the modulus. But comparison implies
183bc3d5698SJohn Baldwin // subtraction. So we subtract modulus, see if it borrowed,
184bc3d5698SJohn Baldwin // and conditionally copy original value.
185bc3d5698SJohn Baldwin ldr x23,[sp] // tp[0]
186bc3d5698SJohn Baldwin add x22,sp,#8
187bc3d5698SJohn Baldwin ldr x14,[x3],#8 // np[0]
188bc3d5698SJohn Baldwin subs x21,x5,#8 // j=num-1 and clear borrow
189bc3d5698SJohn Baldwin mov x1,x0
// Tentatively store tp[] - np[] into rp[]; borrow decides which copy wins.
190bc3d5698SJohn Baldwin.Lsub:
191bc3d5698SJohn Baldwin sbcs x8,x23,x14 // tp[j]-np[j]
192bc3d5698SJohn Baldwin ldr x23,[x22],#8
193bc3d5698SJohn Baldwin sub x21,x21,#8 // j--
194bc3d5698SJohn Baldwin ldr x14,[x3],#8
195bc3d5698SJohn Baldwin str x8,[x1],#8 // rp[j]=tp[j]-np[j]
196bc3d5698SJohn Baldwin cbnz x21,.Lsub
197bc3d5698SJohn Baldwin
198bc3d5698SJohn Baldwin sbcs x8,x23,x14
199bc3d5698SJohn Baldwin sbcs x19,x19,xzr // did it borrow?
200bc3d5698SJohn Baldwin str x8,[x1],#8 // rp[num-1]
201bc3d5698SJohn Baldwin
202bc3d5698SJohn Baldwin ldr x23,[sp] // tp[0]
203bc3d5698SJohn Baldwin add x22,sp,#8
204bc3d5698SJohn Baldwin ldr x8,[x0],#8 // rp[0]
205bc3d5698SJohn Baldwin sub x5,x5,#8 // num--
206bc3d5698SJohn Baldwin nop
// Conditional copy: keep tp[] (borrow set, i.e. tp < np) or the subtracted
// value already in rp[]; tp[] is zeroed as it is consumed to avoid leaving
// secret intermediates on the stack.
207bc3d5698SJohn Baldwin.Lcond_copy:
208bc3d5698SJohn Baldwin sub x5,x5,#8 // num--
209bc3d5698SJohn Baldwin csel x14,x23,x8,lo // did it borrow?
210bc3d5698SJohn Baldwin ldr x23,[x22],#8
211bc3d5698SJohn Baldwin ldr x8,[x0],#8
212c0855eaaSJohn Baldwin stur xzr,[x22,#-16] // wipe tp
213c0855eaaSJohn Baldwin stur x14,[x0,#-16]
214bc3d5698SJohn Baldwin cbnz x5,.Lcond_copy
215bc3d5698SJohn Baldwin
216bc3d5698SJohn Baldwin csel x14,x23,x8,lo
217c0855eaaSJohn Baldwin stur xzr,[x22,#-8] // wipe tp
218c0855eaaSJohn Baldwin stur x14,[x0,#-8]
219bc3d5698SJohn Baldwin
// Epilogue: restore callee-saved x19-x24, pop the frame, return 1.
220bc3d5698SJohn Baldwin ldp x19,x20,[x29,#16]
221bc3d5698SJohn Baldwin mov sp,x29
222bc3d5698SJohn Baldwin ldp x21,x22,[x29,#32]
223bc3d5698SJohn Baldwin mov x0,#1
224bc3d5698SJohn Baldwin ldp x23,x24,[x29,#48]
225bc3d5698SJohn Baldwin ldr x29,[sp],#64
226bd9588bcSAndrew Turner AARCH64_VALIDATE_LINK_REGISTER
227bc3d5698SJohn Baldwin ret
228bc3d5698SJohn Baldwin.size bn_mul_mont,.-bn_mul_mont
// NEON (8x) variant, entered by direct branch from bn_mul_mont above when
// the OPENSSL_armv8_rsa_neonized capability flag is set; it also saves
// callee-saved d8-d15 in its 80-byte frame.
229c0855eaaSJohn Baldwin.type bn_mul8x_mont_neon,%function
230c0855eaaSJohn Baldwin.align 5
231c0855eaaSJohn Baldwinbn_mul8x_mont_neon:
232bd9588bcSAndrew Turner // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
233bd9588bcSAndrew Turner // only from bn_mul_mont which has already signed the return address.
234c0855eaaSJohn Baldwin stp x29,x30,[sp,#-80]!
235c0855eaaSJohn Baldwin mov x16,sp 236c0855eaaSJohn Baldwin stp d8,d9,[sp,#16] 237c0855eaaSJohn Baldwin stp d10,d11,[sp,#32] 238c0855eaaSJohn Baldwin stp d12,d13,[sp,#48] 239c0855eaaSJohn Baldwin stp d14,d15,[sp,#64] 240c0855eaaSJohn Baldwin lsl x5,x5,#1 241c0855eaaSJohn Baldwin eor v14.16b,v14.16b,v14.16b 242c0855eaaSJohn Baldwin 243c0855eaaSJohn Baldwin.align 4 244c0855eaaSJohn Baldwin.LNEON_8n: 245c0855eaaSJohn Baldwin eor v6.16b,v6.16b,v6.16b 246c0855eaaSJohn Baldwin sub x7,sp,#128 247c0855eaaSJohn Baldwin eor v7.16b,v7.16b,v7.16b 248c0855eaaSJohn Baldwin sub x7,x7,x5,lsl#4 249c0855eaaSJohn Baldwin eor v8.16b,v8.16b,v8.16b 250c0855eaaSJohn Baldwin and x7,x7,#-64 251c0855eaaSJohn Baldwin eor v9.16b,v9.16b,v9.16b 252c0855eaaSJohn Baldwin mov sp,x7 // alloca 253c0855eaaSJohn Baldwin eor v10.16b,v10.16b,v10.16b 254c0855eaaSJohn Baldwin add x7,x7,#256 255c0855eaaSJohn Baldwin eor v11.16b,v11.16b,v11.16b 256c0855eaaSJohn Baldwin sub x8,x5,#8 257c0855eaaSJohn Baldwin eor v12.16b,v12.16b,v12.16b 258c0855eaaSJohn Baldwin eor v13.16b,v13.16b,v13.16b 259c0855eaaSJohn Baldwin 260c0855eaaSJohn Baldwin.LNEON_8n_init: 261c0855eaaSJohn Baldwin st1 {v6.2d,v7.2d},[x7],#32 262c0855eaaSJohn Baldwin subs x8,x8,#8 263c0855eaaSJohn Baldwin st1 {v8.2d,v9.2d},[x7],#32 264c0855eaaSJohn Baldwin st1 {v10.2d,v11.2d},[x7],#32 265c0855eaaSJohn Baldwin st1 {v12.2d,v13.2d},[x7],#32 266c0855eaaSJohn Baldwin bne .LNEON_8n_init 267c0855eaaSJohn Baldwin 268c0855eaaSJohn Baldwin add x6,sp,#256 269c0855eaaSJohn Baldwin ld1 {v0.4s,v1.4s},[x1],#32 270c0855eaaSJohn Baldwin add x10,sp,#8 271c0855eaaSJohn Baldwin ldr s30,[x4],#4 272c0855eaaSJohn Baldwin mov x9,x5 273c0855eaaSJohn Baldwin b .LNEON_8n_outer 274c0855eaaSJohn Baldwin 275c0855eaaSJohn Baldwin.align 4 276c0855eaaSJohn Baldwin.LNEON_8n_outer: 277c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 278c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 279c0855eaaSJohn Baldwin add x7,sp,#128 280c0855eaaSJohn Baldwin ld1 {v2.4s,v3.4s},[x3],#32 281c0855eaaSJohn 
Baldwin 282c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[0] 283c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[1] 284c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[2] 285c0855eaaSJohn Baldwin shl v29.2d,v6.2d,#16 286c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#8 287c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[3] 288c0855eaaSJohn Baldwin add v29.2d,v29.2d,v6.2d 289c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[0] 290c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 291c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[1] 292c0855eaaSJohn Baldwin st1 {v28.2s},[sp] // put aside smashed b[8*i+0] 293c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[2] 294c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 295c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[3] 296c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 297c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[0] 298c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[1] 299c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 300c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[2] 301c0855eaaSJohn Baldwin ushr v15.2d,v6.2d,#16 302c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[3] 303c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[0] 304c0855eaaSJohn Baldwin ext v6.16b,v6.16b,v6.16b,#8 305c0855eaaSJohn Baldwin add v6.2d,v6.2d,v15.2d 306c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[1] 307c0855eaaSJohn Baldwin ushr v6.2d,v6.2d,#16 308c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[2] 309c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[3] 310c0855eaaSJohn Baldwin add v16.2d,v7.2d,v6.2d 311c0855eaaSJohn Baldwin ins v7.d[0],v16.d[0] 312c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+0] 313c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[0] 314c0855eaaSJohn Baldwin ld1 {v6.2d},[x6],#16 315c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[1] 316c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[2] 317c0855eaaSJohn Baldwin shl v29.2d,v7.2d,#16 318c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#8 319c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[3] 320c0855eaaSJohn 
Baldwin add v29.2d,v29.2d,v7.2d 321c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[0] 322c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 323c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[1] 324c0855eaaSJohn Baldwin st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+1] 325c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[2] 326c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 327c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[3] 328c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 329c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[0] 330c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[1] 331c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 332c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[2] 333c0855eaaSJohn Baldwin ushr v15.2d,v7.2d,#16 334c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[3] 335c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[0] 336c0855eaaSJohn Baldwin ext v7.16b,v7.16b,v7.16b,#8 337c0855eaaSJohn Baldwin add v7.2d,v7.2d,v15.2d 338c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[1] 339c0855eaaSJohn Baldwin ushr v7.2d,v7.2d,#16 340c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[2] 341c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[3] 342c0855eaaSJohn Baldwin add v16.2d,v8.2d,v7.2d 343c0855eaaSJohn Baldwin ins v8.d[0],v16.d[0] 344c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+1] 345c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[0] 346c0855eaaSJohn Baldwin ld1 {v7.2d},[x6],#16 347c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[1] 348c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[2] 349c0855eaaSJohn Baldwin shl v29.2d,v8.2d,#16 350c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#8 351c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[3] 352c0855eaaSJohn Baldwin add v29.2d,v29.2d,v8.2d 353c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[0] 354c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 355c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[1] 356c0855eaaSJohn Baldwin st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+2] 357c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[2] 
358c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 359c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[3] 360c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 361c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[0] 362c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[1] 363c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 364c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[2] 365c0855eaaSJohn Baldwin ushr v15.2d,v8.2d,#16 366c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v2.s[3] 367c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[0] 368c0855eaaSJohn Baldwin ext v8.16b,v8.16b,v8.16b,#8 369c0855eaaSJohn Baldwin add v8.2d,v8.2d,v15.2d 370c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[1] 371c0855eaaSJohn Baldwin ushr v8.2d,v8.2d,#16 372c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[2] 373c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v3.s[3] 374c0855eaaSJohn Baldwin add v16.2d,v9.2d,v8.2d 375c0855eaaSJohn Baldwin ins v9.d[0],v16.d[0] 376c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+2] 377c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[0] 378c0855eaaSJohn Baldwin ld1 {v8.2d},[x6],#16 379c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[1] 380c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[2] 381c0855eaaSJohn Baldwin shl v29.2d,v9.2d,#16 382c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#8 383c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v0.s[3] 384c0855eaaSJohn Baldwin add v29.2d,v29.2d,v9.2d 385c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[0] 386c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 387c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[1] 388c0855eaaSJohn Baldwin st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+3] 389c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[2] 390c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 391c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[3] 392c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 393c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[0] 394c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[1] 395c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 396c0855eaaSJohn Baldwin umlal 
v11.2d,v29.2s,v2.s[2] 397c0855eaaSJohn Baldwin ushr v15.2d,v9.2d,#16 398c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[3] 399c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[0] 400c0855eaaSJohn Baldwin ext v9.16b,v9.16b,v9.16b,#8 401c0855eaaSJohn Baldwin add v9.2d,v9.2d,v15.2d 402c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[1] 403c0855eaaSJohn Baldwin ushr v9.2d,v9.2d,#16 404c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v3.s[2] 405c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[3] 406c0855eaaSJohn Baldwin add v16.2d,v10.2d,v9.2d 407c0855eaaSJohn Baldwin ins v10.d[0],v16.d[0] 408c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+3] 409c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[0] 410c0855eaaSJohn Baldwin ld1 {v9.2d},[x6],#16 411c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[1] 412c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v0.s[2] 413c0855eaaSJohn Baldwin shl v29.2d,v10.2d,#16 414c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#8 415c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[3] 416c0855eaaSJohn Baldwin add v29.2d,v29.2d,v10.2d 417c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[0] 418c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 419c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[1] 420c0855eaaSJohn Baldwin st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+4] 421c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[2] 422c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 423c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[3] 424c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 425c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[0] 426c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v2.s[1] 427c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 428c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[2] 429c0855eaaSJohn Baldwin ushr v15.2d,v10.2d,#16 430c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[3] 431c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[0] 432c0855eaaSJohn Baldwin ext v10.16b,v10.16b,v10.16b,#8 433c0855eaaSJohn Baldwin add v10.2d,v10.2d,v15.2d 434c0855eaaSJohn Baldwin umlal 
v7.2d,v29.2s,v3.s[1] 435c0855eaaSJohn Baldwin ushr v10.2d,v10.2d,#16 436c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[2] 437c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[3] 438c0855eaaSJohn Baldwin add v16.2d,v11.2d,v10.2d 439c0855eaaSJohn Baldwin ins v11.d[0],v16.d[0] 440c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+4] 441c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[0] 442c0855eaaSJohn Baldwin ld1 {v10.2d},[x6],#16 443c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v0.s[1] 444c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[2] 445c0855eaaSJohn Baldwin shl v29.2d,v11.2d,#16 446c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#8 447c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[3] 448c0855eaaSJohn Baldwin add v29.2d,v29.2d,v11.2d 449c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[0] 450c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 451c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[1] 452c0855eaaSJohn Baldwin st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+5] 453c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[2] 454c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 455c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[3] 456c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 457c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v2.s[0] 458c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[1] 459c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 460c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[2] 461c0855eaaSJohn Baldwin ushr v15.2d,v11.2d,#16 462c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[3] 463c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v3.s[0] 464c0855eaaSJohn Baldwin ext v11.16b,v11.16b,v11.16b,#8 465c0855eaaSJohn Baldwin add v11.2d,v11.2d,v15.2d 466c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[1] 467c0855eaaSJohn Baldwin ushr v11.2d,v11.2d,#16 468c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[2] 469c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[3] 470c0855eaaSJohn Baldwin add v16.2d,v12.2d,v11.2d 471c0855eaaSJohn Baldwin ins v12.d[0],v16.d[0] 472c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // 
put aside smashed m[8*i+5] 473c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v0.s[0] 474c0855eaaSJohn Baldwin ld1 {v11.2d},[x6],#16 475c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[1] 476c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[2] 477c0855eaaSJohn Baldwin shl v29.2d,v12.2d,#16 478c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#8 479c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[3] 480c0855eaaSJohn Baldwin add v29.2d,v29.2d,v12.2d 481c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[0] 482c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 483c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[1] 484c0855eaaSJohn Baldwin st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+6] 485c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[2] 486c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 487c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[3] 488c0855eaaSJohn Baldwin ldr s28,[x2],#4 // *b++ 489c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[0] 490c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[1] 491c0855eaaSJohn Baldwin uxtl v28.4s,v28.4h 492c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[2] 493c0855eaaSJohn Baldwin ushr v15.2d,v12.2d,#16 494c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[3] 495c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[0] 496c0855eaaSJohn Baldwin ext v12.16b,v12.16b,v12.16b,#8 497c0855eaaSJohn Baldwin add v12.2d,v12.2d,v15.2d 498c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[1] 499c0855eaaSJohn Baldwin ushr v12.2d,v12.2d,#16 500c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[2] 501c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[3] 502c0855eaaSJohn Baldwin add v16.2d,v13.2d,v12.2d 503c0855eaaSJohn Baldwin ins v13.d[0],v16.d[0] 504c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+6] 505c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[0] 506c0855eaaSJohn Baldwin ld1 {v12.2d},[x6],#16 507c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[1] 508c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[2] 509c0855eaaSJohn Baldwin shl v29.2d,v13.2d,#16 510c0855eaaSJohn Baldwin ext 
v29.16b,v29.16b,v29.16b,#8 511c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[3] 512c0855eaaSJohn Baldwin add v29.2d,v29.2d,v13.2d 513c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[0] 514c0855eaaSJohn Baldwin mul v29.2s,v29.2s,v30.2s 515c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[1] 516c0855eaaSJohn Baldwin st1 {v28.2s},[x10],#8 // put aside smashed b[8*i+7] 517c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[2] 518c0855eaaSJohn Baldwin uxtl v29.4s,v29.4h 519c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[3] 520c0855eaaSJohn Baldwin ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 521c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[0] 522c0855eaaSJohn Baldwin ld1 {v0.4s,v1.4s},[x1],#32 523c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[1] 524c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[2] 525c0855eaaSJohn Baldwin mov v5.16b,v13.16b 526c0855eaaSJohn Baldwin ushr v5.2d,v5.2d,#16 527c0855eaaSJohn Baldwin ext v13.16b,v13.16b,v13.16b,#8 528c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[3] 529c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[0] 530c0855eaaSJohn Baldwin add v13.2d,v13.2d,v5.2d 531c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[1] 532c0855eaaSJohn Baldwin ushr v13.2d,v13.2d,#16 533c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v15.16b 534c0855eaaSJohn Baldwin ins v13.d[1],v15.d[0] 535c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[2] 536c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[3] 537c0855eaaSJohn Baldwin add v6.2d,v6.2d,v13.2d 538c0855eaaSJohn Baldwin st1 {v29.2s},[x10],#8 // put aside smashed m[8*i+7] 539c0855eaaSJohn Baldwin add x10,sp,#8 // rewind 540c0855eaaSJohn Baldwin sub x8,x5,#8 541c0855eaaSJohn Baldwin b .LNEON_8n_inner 542c0855eaaSJohn Baldwin 543c0855eaaSJohn Baldwin.align 4 544c0855eaaSJohn Baldwin.LNEON_8n_inner: 545c0855eaaSJohn Baldwin subs x8,x8,#8 546c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[0] 547c0855eaaSJohn Baldwin ld1 {v13.2d},[x6] 548c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[1] 549c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed 
m[8*i+0] 550c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[2] 551c0855eaaSJohn Baldwin ld1 {v2.4s,v3.4s},[x3],#32 552c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[3] 553c0855eaaSJohn Baldwin b.eq .LInner_jump 554c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 555c0855eaaSJohn Baldwin.LInner_jump: 556c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[0] 557c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[1] 558c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[2] 559c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[3] 560c0855eaaSJohn Baldwin ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+1] 561c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[0] 562c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[1] 563c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[2] 564c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[3] 565c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[0] 566c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[1] 567c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[2] 568c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[3] 569c0855eaaSJohn Baldwin st1 {v6.2d},[x7],#16 570c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[0] 571c0855eaaSJohn Baldwin ld1 {v6.2d},[x6] 572c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[1] 573c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+1] 574c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[2] 575c0855eaaSJohn Baldwin b.eq .LInner_jump1 576c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 577c0855eaaSJohn Baldwin.LInner_jump1: 578c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[3] 579c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[0] 580c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[1] 581c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[2] 582c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[3] 583c0855eaaSJohn Baldwin ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+2] 584c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[0] 585c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[1] 586c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[2] 
587c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[3] 588c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[0] 589c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[1] 590c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[2] 591c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[3] 592c0855eaaSJohn Baldwin st1 {v7.2d},[x7],#16 593c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[0] 594c0855eaaSJohn Baldwin ld1 {v7.2d},[x6] 595c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[1] 596c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+2] 597c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[2] 598c0855eaaSJohn Baldwin b.eq .LInner_jump2 599c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 600c0855eaaSJohn Baldwin.LInner_jump2: 601c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[3] 602c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[0] 603c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[1] 604c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[2] 605c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[3] 606c0855eaaSJohn Baldwin ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+3] 607c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[0] 608c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[1] 609c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[2] 610c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v2.s[3] 611c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[0] 612c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[1] 613c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[2] 614c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v3.s[3] 615c0855eaaSJohn Baldwin st1 {v8.2d},[x7],#16 616c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v0.s[0] 617c0855eaaSJohn Baldwin ld1 {v8.2d},[x6] 618c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[1] 619c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+3] 620c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[2] 621c0855eaaSJohn Baldwin b.eq .LInner_jump3 622c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 623c0855eaaSJohn Baldwin.LInner_jump3: 624c0855eaaSJohn Baldwin umlal 
v12.2d,v28.2s,v0.s[3] 625c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v1.s[0] 626c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[1] 627c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[2] 628c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[3] 629c0855eaaSJohn Baldwin ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+4] 630c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v2.s[0] 631c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[1] 632c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v2.s[2] 633c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[3] 634c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v3.s[0] 635c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[1] 636c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v3.s[2] 637c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[3] 638c0855eaaSJohn Baldwin st1 {v9.2d},[x7],#16 639c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v0.s[0] 640c0855eaaSJohn Baldwin ld1 {v9.2d},[x6] 641c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[1] 642c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+4] 643c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v0.s[2] 644c0855eaaSJohn Baldwin b.eq .LInner_jump4 645c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 646c0855eaaSJohn Baldwin.LInner_jump4: 647c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[3] 648c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v1.s[0] 649c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[1] 650c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[2] 651c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[3] 652c0855eaaSJohn Baldwin ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+5] 653c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v2.s[0] 654c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v2.s[1] 655c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[2] 656c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[3] 657c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v3.s[0] 658c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v3.s[1] 659c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[2] 660c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[3] 661c0855eaaSJohn Baldwin st1 {v10.2d},[x7],#16 
662c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v0.s[0] 663c0855eaaSJohn Baldwin ld1 {v10.2d},[x6] 664c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v0.s[1] 665c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+5] 666c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[2] 667c0855eaaSJohn Baldwin b.eq .LInner_jump5 668c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 669c0855eaaSJohn Baldwin.LInner_jump5: 670c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[3] 671c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v1.s[0] 672c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[1] 673c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[2] 674c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[3] 675c0855eaaSJohn Baldwin ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+6] 676c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v2.s[0] 677c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[1] 678c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[2] 679c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[3] 680c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v3.s[0] 681c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[1] 682c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[2] 683c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[3] 684c0855eaaSJohn Baldwin st1 {v11.2d},[x7],#16 685c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v0.s[0] 686c0855eaaSJohn Baldwin ld1 {v11.2d},[x6] 687c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[1] 688c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+6] 689c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[2] 690c0855eaaSJohn Baldwin b.eq .LInner_jump6 691c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 692c0855eaaSJohn Baldwin.LInner_jump6: 693c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[3] 694c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v1.s[0] 695c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[1] 696c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[2] 697c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[3] 698c0855eaaSJohn Baldwin ld1 {v28.2s},[x10],#8 // pull smashed b[8*i+7] 
699c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v2.s[0] 700c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[1] 701c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[2] 702c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[3] 703c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v3.s[0] 704c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[1] 705c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[2] 706c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[3] 707c0855eaaSJohn Baldwin st1 {v12.2d},[x7],#16 708c0855eaaSJohn Baldwin umlal v13.2d,v28.2s,v0.s[0] 709c0855eaaSJohn Baldwin ld1 {v12.2d},[x6] 710c0855eaaSJohn Baldwin umlal v6.2d,v28.2s,v0.s[1] 711c0855eaaSJohn Baldwin ld1 {v29.2s},[x10],#8 // pull smashed m[8*i+7] 712c0855eaaSJohn Baldwin umlal v7.2d,v28.2s,v0.s[2] 713c0855eaaSJohn Baldwin b.eq .LInner_jump7 714c0855eaaSJohn Baldwin add x6,x6,#16 // don't advance in last iteration 715c0855eaaSJohn Baldwin.LInner_jump7: 716c0855eaaSJohn Baldwin umlal v8.2d,v28.2s,v0.s[3] 717c0855eaaSJohn Baldwin umlal v9.2d,v28.2s,v1.s[0] 718c0855eaaSJohn Baldwin umlal v10.2d,v28.2s,v1.s[1] 719c0855eaaSJohn Baldwin umlal v11.2d,v28.2s,v1.s[2] 720c0855eaaSJohn Baldwin umlal v12.2d,v28.2s,v1.s[3] 721c0855eaaSJohn Baldwin b.ne .LInner_after_rewind8 722c0855eaaSJohn Baldwin sub x1,x1,x5,lsl#2 // rewind 723c0855eaaSJohn Baldwin.LInner_after_rewind8: 724c0855eaaSJohn Baldwin umlal v13.2d,v29.2s,v2.s[0] 725c0855eaaSJohn Baldwin ld1 {v28.2s},[sp] // pull smashed b[8*i+0] 726c0855eaaSJohn Baldwin umlal v6.2d,v29.2s,v2.s[1] 727c0855eaaSJohn Baldwin ld1 {v0.4s,v1.4s},[x1],#32 728c0855eaaSJohn Baldwin umlal v7.2d,v29.2s,v2.s[2] 729c0855eaaSJohn Baldwin add x10,sp,#8 // rewind 730c0855eaaSJohn Baldwin umlal v8.2d,v29.2s,v2.s[3] 731c0855eaaSJohn Baldwin umlal v9.2d,v29.2s,v3.s[0] 732c0855eaaSJohn Baldwin umlal v10.2d,v29.2s,v3.s[1] 733c0855eaaSJohn Baldwin umlal v11.2d,v29.2s,v3.s[2] 734c0855eaaSJohn Baldwin st1 {v13.2d},[x7],#16 735c0855eaaSJohn Baldwin umlal v12.2d,v29.2s,v3.s[3] 736c0855eaaSJohn Baldwin 737c0855eaaSJohn Baldwin 
bne .LNEON_8n_inner 738c0855eaaSJohn Baldwin add x6,sp,#128 739c0855eaaSJohn Baldwin st1 {v6.2d,v7.2d},[x7],#32 740c0855eaaSJohn Baldwin eor v2.16b,v2.16b,v2.16b // v2 741c0855eaaSJohn Baldwin st1 {v8.2d,v9.2d},[x7],#32 742c0855eaaSJohn Baldwin eor v3.16b,v3.16b,v3.16b // v3 743c0855eaaSJohn Baldwin st1 {v10.2d,v11.2d},[x7],#32 744c0855eaaSJohn Baldwin st1 {v12.2d},[x7] 745c0855eaaSJohn Baldwin 746c0855eaaSJohn Baldwin subs x9,x9,#8 747c0855eaaSJohn Baldwin ld1 {v6.2d,v7.2d},[x6],#32 748c0855eaaSJohn Baldwin ld1 {v8.2d,v9.2d},[x6],#32 749c0855eaaSJohn Baldwin ld1 {v10.2d,v11.2d},[x6],#32 750c0855eaaSJohn Baldwin ld1 {v12.2d,v13.2d},[x6],#32 751c0855eaaSJohn Baldwin 752c0855eaaSJohn Baldwin b.eq .LInner_8n_jump_2steps 753c0855eaaSJohn Baldwin sub x3,x3,x5,lsl#2 // rewind 754c0855eaaSJohn Baldwin b .LNEON_8n_outer 755c0855eaaSJohn Baldwin 756c0855eaaSJohn Baldwin.LInner_8n_jump_2steps: 757c0855eaaSJohn Baldwin add x7,sp,#128 758c0855eaaSJohn Baldwin st1 {v2.2d,v3.2d}, [sp],#32 // start wiping stack frame 759c0855eaaSJohn Baldwin mov v5.16b,v6.16b 760c0855eaaSJohn Baldwin ushr v15.2d,v6.2d,#16 761c0855eaaSJohn Baldwin ext v6.16b,v6.16b,v6.16b,#8 762c0855eaaSJohn Baldwin st1 {v2.2d,v3.2d}, [sp],#32 763c0855eaaSJohn Baldwin add v6.2d,v6.2d,v15.2d 764c0855eaaSJohn Baldwin st1 {v2.2d,v3.2d}, [sp],#32 765c0855eaaSJohn Baldwin ushr v15.2d,v6.2d,#16 766c0855eaaSJohn Baldwin st1 {v2.2d,v3.2d}, [sp],#32 767c0855eaaSJohn Baldwin zip1 v6.4h,v5.4h,v6.4h 768c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 769c0855eaaSJohn Baldwin 770c0855eaaSJohn Baldwin mov x8,x5 771c0855eaaSJohn Baldwin b .LNEON_tail_entry 772c0855eaaSJohn Baldwin 773c0855eaaSJohn Baldwin.align 4 774c0855eaaSJohn Baldwin.LNEON_tail: 775c0855eaaSJohn Baldwin add v6.2d,v6.2d,v15.2d 776c0855eaaSJohn Baldwin mov v5.16b,v6.16b 777c0855eaaSJohn Baldwin ushr v15.2d,v6.2d,#16 778c0855eaaSJohn Baldwin ext v6.16b,v6.16b,v6.16b,#8 779c0855eaaSJohn Baldwin ld1 {v8.2d,v9.2d}, [x6],#32 780c0855eaaSJohn Baldwin add 
v6.2d,v6.2d,v15.2d 781c0855eaaSJohn Baldwin ld1 {v10.2d,v11.2d}, [x6],#32 782c0855eaaSJohn Baldwin ushr v15.2d,v6.2d,#16 783c0855eaaSJohn Baldwin ld1 {v12.2d,v13.2d}, [x6],#32 784c0855eaaSJohn Baldwin zip1 v6.4h,v5.4h,v6.4h 785c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 786c0855eaaSJohn Baldwin 787c0855eaaSJohn Baldwin.LNEON_tail_entry: 788c0855eaaSJohn Baldwin add v7.2d,v7.2d,v15.2d 789c0855eaaSJohn Baldwin st1 {v6.s}[0], [x7],#4 790c0855eaaSJohn Baldwin ushr v15.2d,v7.2d,#16 791c0855eaaSJohn Baldwin mov v5.16b,v7.16b 792c0855eaaSJohn Baldwin ext v7.16b,v7.16b,v7.16b,#8 793c0855eaaSJohn Baldwin add v7.2d,v7.2d,v15.2d 794c0855eaaSJohn Baldwin ushr v15.2d,v7.2d,#16 795c0855eaaSJohn Baldwin zip1 v7.4h,v5.4h,v7.4h 796c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 797c0855eaaSJohn Baldwin add v8.2d,v8.2d,v15.2d 798c0855eaaSJohn Baldwin st1 {v7.s}[0], [x7],#4 799c0855eaaSJohn Baldwin ushr v15.2d,v8.2d,#16 800c0855eaaSJohn Baldwin mov v5.16b,v8.16b 801c0855eaaSJohn Baldwin ext v8.16b,v8.16b,v8.16b,#8 802c0855eaaSJohn Baldwin add v8.2d,v8.2d,v15.2d 803c0855eaaSJohn Baldwin ushr v15.2d,v8.2d,#16 804c0855eaaSJohn Baldwin zip1 v8.4h,v5.4h,v8.4h 805c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 806c0855eaaSJohn Baldwin add v9.2d,v9.2d,v15.2d 807c0855eaaSJohn Baldwin st1 {v8.s}[0], [x7],#4 808c0855eaaSJohn Baldwin ushr v15.2d,v9.2d,#16 809c0855eaaSJohn Baldwin mov v5.16b,v9.16b 810c0855eaaSJohn Baldwin ext v9.16b,v9.16b,v9.16b,#8 811c0855eaaSJohn Baldwin add v9.2d,v9.2d,v15.2d 812c0855eaaSJohn Baldwin ushr v15.2d,v9.2d,#16 813c0855eaaSJohn Baldwin zip1 v9.4h,v5.4h,v9.4h 814c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 815c0855eaaSJohn Baldwin add v10.2d,v10.2d,v15.2d 816c0855eaaSJohn Baldwin st1 {v9.s}[0], [x7],#4 817c0855eaaSJohn Baldwin ushr v15.2d,v10.2d,#16 818c0855eaaSJohn Baldwin mov v5.16b,v10.16b 819c0855eaaSJohn Baldwin ext v10.16b,v10.16b,v10.16b,#8 820c0855eaaSJohn Baldwin add v10.2d,v10.2d,v15.2d 821c0855eaaSJohn Baldwin ushr v15.2d,v10.2d,#16 822c0855eaaSJohn Baldwin zip1 
v10.4h,v5.4h,v10.4h 823c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 824c0855eaaSJohn Baldwin add v11.2d,v11.2d,v15.2d 825c0855eaaSJohn Baldwin st1 {v10.s}[0], [x7],#4 826c0855eaaSJohn Baldwin ushr v15.2d,v11.2d,#16 827c0855eaaSJohn Baldwin mov v5.16b,v11.16b 828c0855eaaSJohn Baldwin ext v11.16b,v11.16b,v11.16b,#8 829c0855eaaSJohn Baldwin add v11.2d,v11.2d,v15.2d 830c0855eaaSJohn Baldwin ushr v15.2d,v11.2d,#16 831c0855eaaSJohn Baldwin zip1 v11.4h,v5.4h,v11.4h 832c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 833c0855eaaSJohn Baldwin add v12.2d,v12.2d,v15.2d 834c0855eaaSJohn Baldwin st1 {v11.s}[0], [x7],#4 835c0855eaaSJohn Baldwin ushr v15.2d,v12.2d,#16 836c0855eaaSJohn Baldwin mov v5.16b,v12.16b 837c0855eaaSJohn Baldwin ext v12.16b,v12.16b,v12.16b,#8 838c0855eaaSJohn Baldwin add v12.2d,v12.2d,v15.2d 839c0855eaaSJohn Baldwin ushr v15.2d,v12.2d,#16 840c0855eaaSJohn Baldwin zip1 v12.4h,v5.4h,v12.4h 841c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 842c0855eaaSJohn Baldwin add v13.2d,v13.2d,v15.2d 843c0855eaaSJohn Baldwin st1 {v12.s}[0], [x7],#4 844c0855eaaSJohn Baldwin ushr v15.2d,v13.2d,#16 845c0855eaaSJohn Baldwin mov v5.16b,v13.16b 846c0855eaaSJohn Baldwin ext v13.16b,v13.16b,v13.16b,#8 847c0855eaaSJohn Baldwin add v13.2d,v13.2d,v15.2d 848c0855eaaSJohn Baldwin ushr v15.2d,v13.2d,#16 849c0855eaaSJohn Baldwin zip1 v13.4h,v5.4h,v13.4h 850c0855eaaSJohn Baldwin ins v15.d[1],v14.d[0] 851c0855eaaSJohn Baldwin ld1 {v6.2d,v7.2d}, [x6],#32 852c0855eaaSJohn Baldwin subs x8,x8,#8 853c0855eaaSJohn Baldwin st1 {v13.s}[0], [x7],#4 854c0855eaaSJohn Baldwin bne .LNEON_tail 855c0855eaaSJohn Baldwin 856c0855eaaSJohn Baldwin st1 {v15.s}[0], [x7],#4 // top-most bit 857c0855eaaSJohn Baldwin sub x3,x3,x5,lsl#2 // rewind x3 858c0855eaaSJohn Baldwin subs x1,sp,#0 // clear carry flag 859c0855eaaSJohn Baldwin add x2,sp,x5,lsl#2 860c0855eaaSJohn Baldwin 861c0855eaaSJohn Baldwin.LNEON_sub: 862c0855eaaSJohn Baldwin ldp w4,w5,[x1],#8 863c0855eaaSJohn Baldwin ldp w6,w7,[x1],#8 864c0855eaaSJohn Baldwin 
ldp w8,w9,[x3],#8 865c0855eaaSJohn Baldwin ldp w10,w11,[x3],#8 866c0855eaaSJohn Baldwin sbcs w8,w4,w8 867c0855eaaSJohn Baldwin sbcs w9,w5,w9 868c0855eaaSJohn Baldwin sbcs w10,w6,w10 869c0855eaaSJohn Baldwin sbcs w11,w7,w11 870c0855eaaSJohn Baldwin sub x17,x2,x1 871c0855eaaSJohn Baldwin stp w8,w9,[x0],#8 872c0855eaaSJohn Baldwin stp w10,w11,[x0],#8 873c0855eaaSJohn Baldwin cbnz x17,.LNEON_sub 874c0855eaaSJohn Baldwin 875c0855eaaSJohn Baldwin ldr w10, [x1] // load top-most bit 876c0855eaaSJohn Baldwin mov x11,sp 877c0855eaaSJohn Baldwin eor v0.16b,v0.16b,v0.16b 878c0855eaaSJohn Baldwin sub x11,x2,x11 // this is num*4 879c0855eaaSJohn Baldwin eor v1.16b,v1.16b,v1.16b 880c0855eaaSJohn Baldwin mov x1,sp 881c0855eaaSJohn Baldwin sub x0,x0,x11 // rewind x0 882c0855eaaSJohn Baldwin mov x3,x2 // second 3/4th of frame 883c0855eaaSJohn Baldwin sbcs w10,w10,wzr // result is carry flag 884c0855eaaSJohn Baldwin 885c0855eaaSJohn Baldwin.LNEON_copy_n_zap: 886c0855eaaSJohn Baldwin ldp w4,w5,[x1],#8 887c0855eaaSJohn Baldwin ldp w6,w7,[x1],#8 888c0855eaaSJohn Baldwin ldp w8,w9,[x0],#8 889c0855eaaSJohn Baldwin ldp w10,w11,[x0] 890c0855eaaSJohn Baldwin sub x0,x0,#8 891c0855eaaSJohn Baldwin b.cs .LCopy_1 892c0855eaaSJohn Baldwin mov w8,w4 893c0855eaaSJohn Baldwin mov w9,w5 894c0855eaaSJohn Baldwin mov w10,w6 895c0855eaaSJohn Baldwin mov w11,w7 896c0855eaaSJohn Baldwin.LCopy_1: 897c0855eaaSJohn Baldwin st1 {v0.2d,v1.2d}, [x3],#32 // wipe 898c0855eaaSJohn Baldwin st1 {v0.2d,v1.2d}, [x3],#32 // wipe 899c0855eaaSJohn Baldwin ldp w4,w5,[x1],#8 900c0855eaaSJohn Baldwin ldp w6,w7,[x1],#8 901c0855eaaSJohn Baldwin stp w8,w9,[x0],#8 902c0855eaaSJohn Baldwin stp w10,w11,[x0],#8 903c0855eaaSJohn Baldwin sub x1,x1,#32 904c0855eaaSJohn Baldwin ldp w8,w9,[x0],#8 905c0855eaaSJohn Baldwin ldp w10,w11,[x0] 906c0855eaaSJohn Baldwin sub x0,x0,#8 907c0855eaaSJohn Baldwin b.cs .LCopy_2 908c0855eaaSJohn Baldwin mov w8, w4 909c0855eaaSJohn Baldwin mov w9, w5 910c0855eaaSJohn Baldwin mov w10, w6 
911c0855eaaSJohn Baldwin mov w11, w7 912c0855eaaSJohn Baldwin.LCopy_2: 913c0855eaaSJohn Baldwin st1 {v0.2d,v1.2d}, [x1],#32 // wipe 914c0855eaaSJohn Baldwin st1 {v0.2d,v1.2d}, [x3],#32 // wipe 915c0855eaaSJohn Baldwin sub x17,x2,x1 // preserves carry 916c0855eaaSJohn Baldwin stp w8,w9,[x0],#8 917c0855eaaSJohn Baldwin stp w10,w11,[x0],#8 918c0855eaaSJohn Baldwin cbnz x17,.LNEON_copy_n_zap 919c0855eaaSJohn Baldwin 920c0855eaaSJohn Baldwin mov sp,x16 921c0855eaaSJohn Baldwin ldp d14,d15,[sp,#64] 922c0855eaaSJohn Baldwin ldp d12,d13,[sp,#48] 923c0855eaaSJohn Baldwin ldp d10,d11,[sp,#32] 924c0855eaaSJohn Baldwin ldp d8,d9,[sp,#16] 925c0855eaaSJohn Baldwin ldr x29,[sp],#80 926bd9588bcSAndrew Turner AARCH64_VALIDATE_LINK_REGISTER 927c0855eaaSJohn Baldwin ret // bx lr 928c0855eaaSJohn Baldwin 929c0855eaaSJohn Baldwin.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon 930bc3d5698SJohn Baldwin.type __bn_sqr8x_mont,%function 931bc3d5698SJohn Baldwin.align 5 932bc3d5698SJohn Baldwin__bn_sqr8x_mont: 933bc3d5698SJohn Baldwin cmp x1,x2 934bc3d5698SJohn Baldwin b.ne __bn_mul4x_mont 935bc3d5698SJohn Baldwin.Lsqr8x_mont: 936bd9588bcSAndrew Turner // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to 937bd9588bcSAndrew Turner // only from bn_mul_mont which has already signed the return address. 938bc3d5698SJohn Baldwin stp x29,x30,[sp,#-128]! 
939bc3d5698SJohn Baldwin add x29,sp,#0 940bc3d5698SJohn Baldwin stp x19,x20,[sp,#16] 941bc3d5698SJohn Baldwin stp x21,x22,[sp,#32] 942bc3d5698SJohn Baldwin stp x23,x24,[sp,#48] 943bc3d5698SJohn Baldwin stp x25,x26,[sp,#64] 944bc3d5698SJohn Baldwin stp x27,x28,[sp,#80] 945bc3d5698SJohn Baldwin stp x0,x3,[sp,#96] // offload rp and np 946bc3d5698SJohn Baldwin 947bc3d5698SJohn Baldwin ldp x6,x7,[x1,#8*0] 948bc3d5698SJohn Baldwin ldp x8,x9,[x1,#8*2] 949bc3d5698SJohn Baldwin ldp x10,x11,[x1,#8*4] 950bc3d5698SJohn Baldwin ldp x12,x13,[x1,#8*6] 951bc3d5698SJohn Baldwin 952bc3d5698SJohn Baldwin sub x2,sp,x5,lsl#4 953bc3d5698SJohn Baldwin lsl x5,x5,#3 954bc3d5698SJohn Baldwin ldr x4,[x4] // *n0 955bc3d5698SJohn Baldwin mov sp,x2 // alloca 956bc3d5698SJohn Baldwin sub x27,x5,#8*8 957bc3d5698SJohn Baldwin b .Lsqr8x_zero_start 958bc3d5698SJohn Baldwin 959bc3d5698SJohn Baldwin.Lsqr8x_zero: 960bc3d5698SJohn Baldwin sub x27,x27,#8*8 961bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*0] 962bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*2] 963bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*4] 964bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*6] 965bc3d5698SJohn Baldwin.Lsqr8x_zero_start: 966bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*8] 967bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*10] 968bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*12] 969bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*14] 970bc3d5698SJohn Baldwin add x2,x2,#8*16 971bc3d5698SJohn Baldwin cbnz x27,.Lsqr8x_zero 972bc3d5698SJohn Baldwin 973bc3d5698SJohn Baldwin add x3,x1,x5 974bc3d5698SJohn Baldwin add x1,x1,#8*8 975bc3d5698SJohn Baldwin mov x19,xzr 976bc3d5698SJohn Baldwin mov x20,xzr 977bc3d5698SJohn Baldwin mov x21,xzr 978bc3d5698SJohn Baldwin mov x22,xzr 979bc3d5698SJohn Baldwin mov x23,xzr 980bc3d5698SJohn Baldwin mov x24,xzr 981bc3d5698SJohn Baldwin mov x25,xzr 982bc3d5698SJohn Baldwin mov x26,xzr 983bc3d5698SJohn Baldwin mov x2,sp 984bc3d5698SJohn Baldwin str x4,[x29,#112] // offload n0 985bc3d5698SJohn Baldwin 986bc3d5698SJohn Baldwin // Multiply 
everything but a[i]*a[i] 987bc3d5698SJohn Baldwin.align 4 988bc3d5698SJohn Baldwin.Lsqr8x_outer_loop: 989bc3d5698SJohn Baldwin // a[1]a[0] (i) 990bc3d5698SJohn Baldwin // a[2]a[0] 991bc3d5698SJohn Baldwin // a[3]a[0] 992bc3d5698SJohn Baldwin // a[4]a[0] 993bc3d5698SJohn Baldwin // a[5]a[0] 994bc3d5698SJohn Baldwin // a[6]a[0] 995bc3d5698SJohn Baldwin // a[7]a[0] 996bc3d5698SJohn Baldwin // a[2]a[1] (ii) 997bc3d5698SJohn Baldwin // a[3]a[1] 998bc3d5698SJohn Baldwin // a[4]a[1] 999bc3d5698SJohn Baldwin // a[5]a[1] 1000bc3d5698SJohn Baldwin // a[6]a[1] 1001bc3d5698SJohn Baldwin // a[7]a[1] 1002bc3d5698SJohn Baldwin // a[3]a[2] (iii) 1003bc3d5698SJohn Baldwin // a[4]a[2] 1004bc3d5698SJohn Baldwin // a[5]a[2] 1005bc3d5698SJohn Baldwin // a[6]a[2] 1006bc3d5698SJohn Baldwin // a[7]a[2] 1007bc3d5698SJohn Baldwin // a[4]a[3] (iv) 1008bc3d5698SJohn Baldwin // a[5]a[3] 1009bc3d5698SJohn Baldwin // a[6]a[3] 1010bc3d5698SJohn Baldwin // a[7]a[3] 1011bc3d5698SJohn Baldwin // a[5]a[4] (v) 1012bc3d5698SJohn Baldwin // a[6]a[4] 1013bc3d5698SJohn Baldwin // a[7]a[4] 1014bc3d5698SJohn Baldwin // a[6]a[5] (vi) 1015bc3d5698SJohn Baldwin // a[7]a[5] 1016bc3d5698SJohn Baldwin // a[7]a[6] (vii) 1017bc3d5698SJohn Baldwin 1018bc3d5698SJohn Baldwin mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) 1019bc3d5698SJohn Baldwin mul x15,x8,x6 1020bc3d5698SJohn Baldwin mul x16,x9,x6 1021bc3d5698SJohn Baldwin mul x17,x10,x6 1022bc3d5698SJohn Baldwin adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) 1023bc3d5698SJohn Baldwin mul x14,x11,x6 1024bc3d5698SJohn Baldwin adcs x21,x21,x15 1025bc3d5698SJohn Baldwin mul x15,x12,x6 1026bc3d5698SJohn Baldwin adcs x22,x22,x16 1027bc3d5698SJohn Baldwin mul x16,x13,x6 1028bc3d5698SJohn Baldwin adcs x23,x23,x17 1029bc3d5698SJohn Baldwin umulh x17,x7,x6 // hi(a[1..7]*a[0]) 1030bc3d5698SJohn Baldwin adcs x24,x24,x14 1031bc3d5698SJohn Baldwin umulh x14,x8,x6 1032bc3d5698SJohn Baldwin adcs x25,x25,x15 1033bc3d5698SJohn Baldwin umulh x15,x9,x6 1034bc3d5698SJohn Baldwin adcs x26,x26,x16 
1035bc3d5698SJohn Baldwin umulh x16,x10,x6 1036bc3d5698SJohn Baldwin stp x19,x20,[x2],#8*2 // t[0..1] 1037bc3d5698SJohn Baldwin adc x19,xzr,xzr // t[8] 1038bc3d5698SJohn Baldwin adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) 1039bc3d5698SJohn Baldwin umulh x17,x11,x6 1040bc3d5698SJohn Baldwin adcs x22,x22,x14 1041bc3d5698SJohn Baldwin umulh x14,x12,x6 1042bc3d5698SJohn Baldwin adcs x23,x23,x15 1043bc3d5698SJohn Baldwin umulh x15,x13,x6 1044bc3d5698SJohn Baldwin adcs x24,x24,x16 1045bc3d5698SJohn Baldwin mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) 1046bc3d5698SJohn Baldwin adcs x25,x25,x17 1047bc3d5698SJohn Baldwin mul x17,x9,x7 1048bc3d5698SJohn Baldwin adcs x26,x26,x14 1049bc3d5698SJohn Baldwin mul x14,x10,x7 1050bc3d5698SJohn Baldwin adc x19,x19,x15 1051bc3d5698SJohn Baldwin 1052bc3d5698SJohn Baldwin mul x15,x11,x7 1053bc3d5698SJohn Baldwin adds x22,x22,x16 1054bc3d5698SJohn Baldwin mul x16,x12,x7 1055bc3d5698SJohn Baldwin adcs x23,x23,x17 1056bc3d5698SJohn Baldwin mul x17,x13,x7 1057bc3d5698SJohn Baldwin adcs x24,x24,x14 1058bc3d5698SJohn Baldwin umulh x14,x8,x7 // hi(a[2..7]*a[1]) 1059bc3d5698SJohn Baldwin adcs x25,x25,x15 1060bc3d5698SJohn Baldwin umulh x15,x9,x7 1061bc3d5698SJohn Baldwin adcs x26,x26,x16 1062bc3d5698SJohn Baldwin umulh x16,x10,x7 1063bc3d5698SJohn Baldwin adcs x19,x19,x17 1064bc3d5698SJohn Baldwin umulh x17,x11,x7 1065bc3d5698SJohn Baldwin stp x21,x22,[x2],#8*2 // t[2..3] 1066bc3d5698SJohn Baldwin adc x20,xzr,xzr // t[9] 1067bc3d5698SJohn Baldwin adds x23,x23,x14 1068bc3d5698SJohn Baldwin umulh x14,x12,x7 1069bc3d5698SJohn Baldwin adcs x24,x24,x15 1070bc3d5698SJohn Baldwin umulh x15,x13,x7 1071bc3d5698SJohn Baldwin adcs x25,x25,x16 1072bc3d5698SJohn Baldwin mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) 1073bc3d5698SJohn Baldwin adcs x26,x26,x17 1074bc3d5698SJohn Baldwin mul x17,x10,x8 1075bc3d5698SJohn Baldwin adcs x19,x19,x14 1076bc3d5698SJohn Baldwin mul x14,x11,x8 1077bc3d5698SJohn Baldwin adc x20,x20,x15 1078bc3d5698SJohn Baldwin 1079bc3d5698SJohn Baldwin 
mul x15,x12,x8 1080bc3d5698SJohn Baldwin adds x24,x24,x16 1081bc3d5698SJohn Baldwin mul x16,x13,x8 1082bc3d5698SJohn Baldwin adcs x25,x25,x17 1083bc3d5698SJohn Baldwin umulh x17,x9,x8 // hi(a[3..7]*a[2]) 1084bc3d5698SJohn Baldwin adcs x26,x26,x14 1085bc3d5698SJohn Baldwin umulh x14,x10,x8 1086bc3d5698SJohn Baldwin adcs x19,x19,x15 1087bc3d5698SJohn Baldwin umulh x15,x11,x8 1088bc3d5698SJohn Baldwin adcs x20,x20,x16 1089bc3d5698SJohn Baldwin umulh x16,x12,x8 1090bc3d5698SJohn Baldwin stp x23,x24,[x2],#8*2 // t[4..5] 1091bc3d5698SJohn Baldwin adc x21,xzr,xzr // t[10] 1092bc3d5698SJohn Baldwin adds x25,x25,x17 1093bc3d5698SJohn Baldwin umulh x17,x13,x8 1094bc3d5698SJohn Baldwin adcs x26,x26,x14 1095bc3d5698SJohn Baldwin mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) 1096bc3d5698SJohn Baldwin adcs x19,x19,x15 1097bc3d5698SJohn Baldwin mul x15,x11,x9 1098bc3d5698SJohn Baldwin adcs x20,x20,x16 1099bc3d5698SJohn Baldwin mul x16,x12,x9 1100bc3d5698SJohn Baldwin adc x21,x21,x17 1101bc3d5698SJohn Baldwin 1102bc3d5698SJohn Baldwin mul x17,x13,x9 1103bc3d5698SJohn Baldwin adds x26,x26,x14 1104bc3d5698SJohn Baldwin umulh x14,x10,x9 // hi(a[4..7]*a[3]) 1105bc3d5698SJohn Baldwin adcs x19,x19,x15 1106bc3d5698SJohn Baldwin umulh x15,x11,x9 1107bc3d5698SJohn Baldwin adcs x20,x20,x16 1108bc3d5698SJohn Baldwin umulh x16,x12,x9 1109bc3d5698SJohn Baldwin adcs x21,x21,x17 1110bc3d5698SJohn Baldwin umulh x17,x13,x9 1111bc3d5698SJohn Baldwin stp x25,x26,[x2],#8*2 // t[6..7] 1112bc3d5698SJohn Baldwin adc x22,xzr,xzr // t[11] 1113bc3d5698SJohn Baldwin adds x19,x19,x14 1114bc3d5698SJohn Baldwin mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) 1115bc3d5698SJohn Baldwin adcs x20,x20,x15 1116bc3d5698SJohn Baldwin mul x15,x12,x10 1117bc3d5698SJohn Baldwin adcs x21,x21,x16 1118bc3d5698SJohn Baldwin mul x16,x13,x10 1119bc3d5698SJohn Baldwin adc x22,x22,x17 1120bc3d5698SJohn Baldwin 1121bc3d5698SJohn Baldwin umulh x17,x11,x10 // hi(a[5..7]*a[4]) 1122bc3d5698SJohn Baldwin adds x20,x20,x14 1123bc3d5698SJohn Baldwin 
umulh x14,x12,x10 1124bc3d5698SJohn Baldwin adcs x21,x21,x15 1125bc3d5698SJohn Baldwin umulh x15,x13,x10 1126bc3d5698SJohn Baldwin adcs x22,x22,x16 1127bc3d5698SJohn Baldwin mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) 1128bc3d5698SJohn Baldwin adc x23,xzr,xzr // t[12] 1129bc3d5698SJohn Baldwin adds x21,x21,x17 1130bc3d5698SJohn Baldwin mul x17,x13,x11 1131bc3d5698SJohn Baldwin adcs x22,x22,x14 1132bc3d5698SJohn Baldwin umulh x14,x12,x11 // hi(a[6..7]*a[5]) 1133bc3d5698SJohn Baldwin adc x23,x23,x15 1134bc3d5698SJohn Baldwin 1135bc3d5698SJohn Baldwin umulh x15,x13,x11 1136bc3d5698SJohn Baldwin adds x22,x22,x16 1137bc3d5698SJohn Baldwin mul x16,x13,x12 // lo(a[7]*a[6]) (vii) 1138bc3d5698SJohn Baldwin adcs x23,x23,x17 1139bc3d5698SJohn Baldwin umulh x17,x13,x12 // hi(a[7]*a[6]) 1140bc3d5698SJohn Baldwin adc x24,xzr,xzr // t[13] 1141bc3d5698SJohn Baldwin adds x23,x23,x14 1142bc3d5698SJohn Baldwin sub x27,x3,x1 // done yet? 1143bc3d5698SJohn Baldwin adc x24,x24,x15 1144bc3d5698SJohn Baldwin 1145bc3d5698SJohn Baldwin adds x24,x24,x16 1146bc3d5698SJohn Baldwin sub x14,x3,x5 // rewinded ap 1147bc3d5698SJohn Baldwin adc x25,xzr,xzr // t[14] 1148bc3d5698SJohn Baldwin add x25,x25,x17 1149bc3d5698SJohn Baldwin 1150bc3d5698SJohn Baldwin cbz x27,.Lsqr8x_outer_break 1151bc3d5698SJohn Baldwin 1152bc3d5698SJohn Baldwin mov x4,x6 1153bc3d5698SJohn Baldwin ldp x6,x7,[x2,#8*0] 1154bc3d5698SJohn Baldwin ldp x8,x9,[x2,#8*2] 1155bc3d5698SJohn Baldwin ldp x10,x11,[x2,#8*4] 1156bc3d5698SJohn Baldwin ldp x12,x13,[x2,#8*6] 1157bc3d5698SJohn Baldwin adds x19,x19,x6 1158bc3d5698SJohn Baldwin adcs x20,x20,x7 1159bc3d5698SJohn Baldwin ldp x6,x7,[x1,#8*0] 1160bc3d5698SJohn Baldwin adcs x21,x21,x8 1161bc3d5698SJohn Baldwin adcs x22,x22,x9 1162bc3d5698SJohn Baldwin ldp x8,x9,[x1,#8*2] 1163bc3d5698SJohn Baldwin adcs x23,x23,x10 1164bc3d5698SJohn Baldwin adcs x24,x24,x11 1165bc3d5698SJohn Baldwin ldp x10,x11,[x1,#8*4] 1166bc3d5698SJohn Baldwin adcs x25,x25,x12 1167bc3d5698SJohn Baldwin mov x0,x1 
1168bc3d5698SJohn Baldwin adcs x26,xzr,x13 1169bc3d5698SJohn Baldwin ldp x12,x13,[x1,#8*6] 1170bc3d5698SJohn Baldwin add x1,x1,#8*8 1171bc3d5698SJohn Baldwin //adc x28,xzr,xzr // moved below 1172bc3d5698SJohn Baldwin mov x27,#-8*8 1173bc3d5698SJohn Baldwin 1174bc3d5698SJohn Baldwin // a[8]a[0] 1175bc3d5698SJohn Baldwin // a[9]a[0] 1176bc3d5698SJohn Baldwin // a[a]a[0] 1177bc3d5698SJohn Baldwin // a[b]a[0] 1178bc3d5698SJohn Baldwin // a[c]a[0] 1179bc3d5698SJohn Baldwin // a[d]a[0] 1180bc3d5698SJohn Baldwin // a[e]a[0] 1181bc3d5698SJohn Baldwin // a[f]a[0] 1182bc3d5698SJohn Baldwin // a[8]a[1] 1183bc3d5698SJohn Baldwin // a[f]a[1]........................ 1184bc3d5698SJohn Baldwin // a[8]a[2] 1185bc3d5698SJohn Baldwin // a[f]a[2]........................ 1186bc3d5698SJohn Baldwin // a[8]a[3] 1187bc3d5698SJohn Baldwin // a[f]a[3]........................ 1188bc3d5698SJohn Baldwin // a[8]a[4] 1189bc3d5698SJohn Baldwin // a[f]a[4]........................ 1190bc3d5698SJohn Baldwin // a[8]a[5] 1191bc3d5698SJohn Baldwin // a[f]a[5]........................ 1192bc3d5698SJohn Baldwin // a[8]a[6] 1193bc3d5698SJohn Baldwin // a[f]a[6]........................ 1194bc3d5698SJohn Baldwin // a[8]a[7] 1195bc3d5698SJohn Baldwin // a[f]a[7]........................ 
1196bc3d5698SJohn Baldwin.Lsqr8x_mul: 1197bc3d5698SJohn Baldwin mul x14,x6,x4 1198bc3d5698SJohn Baldwin adc x28,xzr,xzr // carry bit, modulo-scheduled 1199bc3d5698SJohn Baldwin mul x15,x7,x4 1200bc3d5698SJohn Baldwin add x27,x27,#8 1201bc3d5698SJohn Baldwin mul x16,x8,x4 1202bc3d5698SJohn Baldwin mul x17,x9,x4 1203bc3d5698SJohn Baldwin adds x19,x19,x14 1204bc3d5698SJohn Baldwin mul x14,x10,x4 1205bc3d5698SJohn Baldwin adcs x20,x20,x15 1206bc3d5698SJohn Baldwin mul x15,x11,x4 1207bc3d5698SJohn Baldwin adcs x21,x21,x16 1208bc3d5698SJohn Baldwin mul x16,x12,x4 1209bc3d5698SJohn Baldwin adcs x22,x22,x17 1210bc3d5698SJohn Baldwin mul x17,x13,x4 1211bc3d5698SJohn Baldwin adcs x23,x23,x14 1212bc3d5698SJohn Baldwin umulh x14,x6,x4 1213bc3d5698SJohn Baldwin adcs x24,x24,x15 1214bc3d5698SJohn Baldwin umulh x15,x7,x4 1215bc3d5698SJohn Baldwin adcs x25,x25,x16 1216bc3d5698SJohn Baldwin umulh x16,x8,x4 1217bc3d5698SJohn Baldwin adcs x26,x26,x17 1218bc3d5698SJohn Baldwin umulh x17,x9,x4 1219bc3d5698SJohn Baldwin adc x28,x28,xzr 1220bc3d5698SJohn Baldwin str x19,[x2],#8 1221bc3d5698SJohn Baldwin adds x19,x20,x14 1222bc3d5698SJohn Baldwin umulh x14,x10,x4 1223bc3d5698SJohn Baldwin adcs x20,x21,x15 1224bc3d5698SJohn Baldwin umulh x15,x11,x4 1225bc3d5698SJohn Baldwin adcs x21,x22,x16 1226bc3d5698SJohn Baldwin umulh x16,x12,x4 1227bc3d5698SJohn Baldwin adcs x22,x23,x17 1228bc3d5698SJohn Baldwin umulh x17,x13,x4 1229bc3d5698SJohn Baldwin ldr x4,[x0,x27] 1230bc3d5698SJohn Baldwin adcs x23,x24,x14 1231bc3d5698SJohn Baldwin adcs x24,x25,x15 1232bc3d5698SJohn Baldwin adcs x25,x26,x16 1233bc3d5698SJohn Baldwin adcs x26,x28,x17 1234bc3d5698SJohn Baldwin //adc x28,xzr,xzr // moved above 1235bc3d5698SJohn Baldwin cbnz x27,.Lsqr8x_mul 1236bc3d5698SJohn Baldwin // note that carry flag is guaranteed 1237bc3d5698SJohn Baldwin // to be zero at this point 1238bc3d5698SJohn Baldwin cmp x1,x3 // done yet? 
1239bc3d5698SJohn Baldwin b.eq .Lsqr8x_break 1240bc3d5698SJohn Baldwin 1241bc3d5698SJohn Baldwin ldp x6,x7,[x2,#8*0] 1242bc3d5698SJohn Baldwin ldp x8,x9,[x2,#8*2] 1243bc3d5698SJohn Baldwin ldp x10,x11,[x2,#8*4] 1244bc3d5698SJohn Baldwin ldp x12,x13,[x2,#8*6] 1245bc3d5698SJohn Baldwin adds x19,x19,x6 1246c0855eaaSJohn Baldwin ldur x4,[x0,#-8*8] 1247bc3d5698SJohn Baldwin adcs x20,x20,x7 1248bc3d5698SJohn Baldwin ldp x6,x7,[x1,#8*0] 1249bc3d5698SJohn Baldwin adcs x21,x21,x8 1250bc3d5698SJohn Baldwin adcs x22,x22,x9 1251bc3d5698SJohn Baldwin ldp x8,x9,[x1,#8*2] 1252bc3d5698SJohn Baldwin adcs x23,x23,x10 1253bc3d5698SJohn Baldwin adcs x24,x24,x11 1254bc3d5698SJohn Baldwin ldp x10,x11,[x1,#8*4] 1255bc3d5698SJohn Baldwin adcs x25,x25,x12 1256bc3d5698SJohn Baldwin mov x27,#-8*8 1257bc3d5698SJohn Baldwin adcs x26,x26,x13 1258bc3d5698SJohn Baldwin ldp x12,x13,[x1,#8*6] 1259bc3d5698SJohn Baldwin add x1,x1,#8*8 1260bc3d5698SJohn Baldwin //adc x28,xzr,xzr // moved above 1261bc3d5698SJohn Baldwin b .Lsqr8x_mul 1262bc3d5698SJohn Baldwin 1263bc3d5698SJohn Baldwin.align 4 1264bc3d5698SJohn Baldwin.Lsqr8x_break: 1265bc3d5698SJohn Baldwin ldp x6,x7,[x0,#8*0] 1266bc3d5698SJohn Baldwin add x1,x0,#8*8 1267bc3d5698SJohn Baldwin ldp x8,x9,[x0,#8*2] 1268bc3d5698SJohn Baldwin sub x14,x3,x1 // is it last iteration? 
1269bc3d5698SJohn Baldwin ldp x10,x11,[x0,#8*4] 1270bc3d5698SJohn Baldwin sub x15,x2,x14 1271bc3d5698SJohn Baldwin ldp x12,x13,[x0,#8*6] 1272bc3d5698SJohn Baldwin cbz x14,.Lsqr8x_outer_loop 1273bc3d5698SJohn Baldwin 1274bc3d5698SJohn Baldwin stp x19,x20,[x2,#8*0] 1275bc3d5698SJohn Baldwin ldp x19,x20,[x15,#8*0] 1276bc3d5698SJohn Baldwin stp x21,x22,[x2,#8*2] 1277bc3d5698SJohn Baldwin ldp x21,x22,[x15,#8*2] 1278bc3d5698SJohn Baldwin stp x23,x24,[x2,#8*4] 1279bc3d5698SJohn Baldwin ldp x23,x24,[x15,#8*4] 1280bc3d5698SJohn Baldwin stp x25,x26,[x2,#8*6] 1281bc3d5698SJohn Baldwin mov x2,x15 1282bc3d5698SJohn Baldwin ldp x25,x26,[x15,#8*6] 1283bc3d5698SJohn Baldwin b .Lsqr8x_outer_loop 1284bc3d5698SJohn Baldwin 1285bc3d5698SJohn Baldwin.align 4 1286bc3d5698SJohn Baldwin.Lsqr8x_outer_break: 1287bc3d5698SJohn Baldwin // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] 1288bc3d5698SJohn Baldwin ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] 1289bc3d5698SJohn Baldwin ldp x15,x16,[sp,#8*1] 1290bc3d5698SJohn Baldwin ldp x11,x13,[x14,#8*2] 1291bc3d5698SJohn Baldwin add x1,x14,#8*4 1292bc3d5698SJohn Baldwin ldp x17,x14,[sp,#8*3] 1293bc3d5698SJohn Baldwin 1294bc3d5698SJohn Baldwin stp x19,x20,[x2,#8*0] 1295bc3d5698SJohn Baldwin mul x19,x7,x7 1296bc3d5698SJohn Baldwin stp x21,x22,[x2,#8*2] 1297bc3d5698SJohn Baldwin umulh x7,x7,x7 1298bc3d5698SJohn Baldwin stp x23,x24,[x2,#8*4] 1299bc3d5698SJohn Baldwin mul x8,x9,x9 1300bc3d5698SJohn Baldwin stp x25,x26,[x2,#8*6] 1301bc3d5698SJohn Baldwin mov x2,sp 1302bc3d5698SJohn Baldwin umulh x9,x9,x9 1303bc3d5698SJohn Baldwin adds x20,x7,x15,lsl#1 1304bc3d5698SJohn Baldwin extr x15,x16,x15,#63 1305bc3d5698SJohn Baldwin sub x27,x5,#8*4 1306bc3d5698SJohn Baldwin 1307bc3d5698SJohn Baldwin.Lsqr4x_shift_n_add: 1308bc3d5698SJohn Baldwin adcs x21,x8,x15 1309bc3d5698SJohn Baldwin extr x16,x17,x16,#63 1310bc3d5698SJohn Baldwin sub x27,x27,#8*4 1311bc3d5698SJohn Baldwin adcs x22,x9,x16 1312bc3d5698SJohn Baldwin ldp x15,x16,[x2,#8*5] 
1313bc3d5698SJohn Baldwin mul x10,x11,x11 1314bc3d5698SJohn Baldwin ldp x7,x9,[x1],#8*2 1315bc3d5698SJohn Baldwin umulh x11,x11,x11 1316bc3d5698SJohn Baldwin mul x12,x13,x13 1317bc3d5698SJohn Baldwin umulh x13,x13,x13 1318bc3d5698SJohn Baldwin extr x17,x14,x17,#63 1319bc3d5698SJohn Baldwin stp x19,x20,[x2,#8*0] 1320bc3d5698SJohn Baldwin adcs x23,x10,x17 1321bc3d5698SJohn Baldwin extr x14,x15,x14,#63 1322bc3d5698SJohn Baldwin stp x21,x22,[x2,#8*2] 1323bc3d5698SJohn Baldwin adcs x24,x11,x14 1324bc3d5698SJohn Baldwin ldp x17,x14,[x2,#8*7] 1325bc3d5698SJohn Baldwin extr x15,x16,x15,#63 1326bc3d5698SJohn Baldwin adcs x25,x12,x15 1327bc3d5698SJohn Baldwin extr x16,x17,x16,#63 1328bc3d5698SJohn Baldwin adcs x26,x13,x16 1329bc3d5698SJohn Baldwin ldp x15,x16,[x2,#8*9] 1330bc3d5698SJohn Baldwin mul x6,x7,x7 1331bc3d5698SJohn Baldwin ldp x11,x13,[x1],#8*2 1332bc3d5698SJohn Baldwin umulh x7,x7,x7 1333bc3d5698SJohn Baldwin mul x8,x9,x9 1334bc3d5698SJohn Baldwin umulh x9,x9,x9 1335bc3d5698SJohn Baldwin stp x23,x24,[x2,#8*4] 1336bc3d5698SJohn Baldwin extr x17,x14,x17,#63 1337bc3d5698SJohn Baldwin stp x25,x26,[x2,#8*6] 1338bc3d5698SJohn Baldwin add x2,x2,#8*8 1339bc3d5698SJohn Baldwin adcs x19,x6,x17 1340bc3d5698SJohn Baldwin extr x14,x15,x14,#63 1341bc3d5698SJohn Baldwin adcs x20,x7,x14 1342bc3d5698SJohn Baldwin ldp x17,x14,[x2,#8*3] 1343bc3d5698SJohn Baldwin extr x15,x16,x15,#63 1344bc3d5698SJohn Baldwin cbnz x27,.Lsqr4x_shift_n_add 1345bc3d5698SJohn Baldwin ldp x1,x4,[x29,#104] // pull np and n0 1346bc3d5698SJohn Baldwin 1347bc3d5698SJohn Baldwin adcs x21,x8,x15 1348bc3d5698SJohn Baldwin extr x16,x17,x16,#63 1349bc3d5698SJohn Baldwin adcs x22,x9,x16 1350bc3d5698SJohn Baldwin ldp x15,x16,[x2,#8*5] 1351bc3d5698SJohn Baldwin mul x10,x11,x11 1352bc3d5698SJohn Baldwin umulh x11,x11,x11 1353bc3d5698SJohn Baldwin stp x19,x20,[x2,#8*0] 1354bc3d5698SJohn Baldwin mul x12,x13,x13 1355bc3d5698SJohn Baldwin umulh x13,x13,x13 1356bc3d5698SJohn Baldwin stp x21,x22,[x2,#8*2] 1357bc3d5698SJohn 
Baldwin extr x17,x14,x17,#63 1358bc3d5698SJohn Baldwin adcs x23,x10,x17 1359bc3d5698SJohn Baldwin extr x14,x15,x14,#63 1360bc3d5698SJohn Baldwin ldp x19,x20,[sp,#8*0] 1361bc3d5698SJohn Baldwin adcs x24,x11,x14 1362bc3d5698SJohn Baldwin extr x15,x16,x15,#63 1363bc3d5698SJohn Baldwin ldp x6,x7,[x1,#8*0] 1364bc3d5698SJohn Baldwin adcs x25,x12,x15 1365bc3d5698SJohn Baldwin extr x16,xzr,x16,#63 1366bc3d5698SJohn Baldwin ldp x8,x9,[x1,#8*2] 1367bc3d5698SJohn Baldwin adc x26,x13,x16 1368bc3d5698SJohn Baldwin ldp x10,x11,[x1,#8*4] 1369bc3d5698SJohn Baldwin 1370bc3d5698SJohn Baldwin // Reduce by 512 bits per iteration 1371bc3d5698SJohn Baldwin mul x28,x4,x19 // t[0]*n0 1372bc3d5698SJohn Baldwin ldp x12,x13,[x1,#8*6] 1373bc3d5698SJohn Baldwin add x3,x1,x5 1374bc3d5698SJohn Baldwin ldp x21,x22,[sp,#8*2] 1375bc3d5698SJohn Baldwin stp x23,x24,[x2,#8*4] 1376bc3d5698SJohn Baldwin ldp x23,x24,[sp,#8*4] 1377bc3d5698SJohn Baldwin stp x25,x26,[x2,#8*6] 1378bc3d5698SJohn Baldwin ldp x25,x26,[sp,#8*6] 1379bc3d5698SJohn Baldwin add x1,x1,#8*8 1380bc3d5698SJohn Baldwin mov x30,xzr // initial top-most carry 1381bc3d5698SJohn Baldwin mov x2,sp 1382bc3d5698SJohn Baldwin mov x27,#8 1383bc3d5698SJohn Baldwin 1384bc3d5698SJohn Baldwin.Lsqr8x_reduction: 1385bc3d5698SJohn Baldwin // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) 1386bc3d5698SJohn Baldwin mul x15,x7,x28 1387bc3d5698SJohn Baldwin sub x27,x27,#1 1388bc3d5698SJohn Baldwin mul x16,x8,x28 1389bc3d5698SJohn Baldwin str x28,[x2],#8 // put aside t[0]*n0 for tail processing 1390bc3d5698SJohn Baldwin mul x17,x9,x28 1391bc3d5698SJohn Baldwin // (*) adds xzr,x19,x14 1392bc3d5698SJohn Baldwin subs xzr,x19,#1 // (*) 1393bc3d5698SJohn Baldwin mul x14,x10,x28 1394bc3d5698SJohn Baldwin adcs x19,x20,x15 1395bc3d5698SJohn Baldwin mul x15,x11,x28 1396bc3d5698SJohn Baldwin adcs x20,x21,x16 1397bc3d5698SJohn Baldwin mul x16,x12,x28 1398bc3d5698SJohn Baldwin adcs x21,x22,x17 1399bc3d5698SJohn Baldwin mul x17,x13,x28 1400bc3d5698SJohn Baldwin adcs 
x22,x23,x14 1401bc3d5698SJohn Baldwin umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) 1402bc3d5698SJohn Baldwin adcs x23,x24,x15 1403bc3d5698SJohn Baldwin umulh x15,x7,x28 1404bc3d5698SJohn Baldwin adcs x24,x25,x16 1405bc3d5698SJohn Baldwin umulh x16,x8,x28 1406bc3d5698SJohn Baldwin adcs x25,x26,x17 1407bc3d5698SJohn Baldwin umulh x17,x9,x28 1408bc3d5698SJohn Baldwin adc x26,xzr,xzr 1409bc3d5698SJohn Baldwin adds x19,x19,x14 1410bc3d5698SJohn Baldwin umulh x14,x10,x28 1411bc3d5698SJohn Baldwin adcs x20,x20,x15 1412bc3d5698SJohn Baldwin umulh x15,x11,x28 1413bc3d5698SJohn Baldwin adcs x21,x21,x16 1414bc3d5698SJohn Baldwin umulh x16,x12,x28 1415bc3d5698SJohn Baldwin adcs x22,x22,x17 1416bc3d5698SJohn Baldwin umulh x17,x13,x28 1417bc3d5698SJohn Baldwin mul x28,x4,x19 // next t[0]*n0 1418bc3d5698SJohn Baldwin adcs x23,x23,x14 1419bc3d5698SJohn Baldwin adcs x24,x24,x15 1420bc3d5698SJohn Baldwin adcs x25,x25,x16 1421bc3d5698SJohn Baldwin adc x26,x26,x17 1422bc3d5698SJohn Baldwin cbnz x27,.Lsqr8x_reduction 1423bc3d5698SJohn Baldwin 1424bc3d5698SJohn Baldwin ldp x14,x15,[x2,#8*0] 1425bc3d5698SJohn Baldwin ldp x16,x17,[x2,#8*2] 1426bc3d5698SJohn Baldwin mov x0,x2 1427bc3d5698SJohn Baldwin sub x27,x3,x1 // done yet? 
1428bc3d5698SJohn Baldwin adds x19,x19,x14 1429bc3d5698SJohn Baldwin adcs x20,x20,x15 1430bc3d5698SJohn Baldwin ldp x14,x15,[x2,#8*4] 1431bc3d5698SJohn Baldwin adcs x21,x21,x16 1432bc3d5698SJohn Baldwin adcs x22,x22,x17 1433bc3d5698SJohn Baldwin ldp x16,x17,[x2,#8*6] 1434bc3d5698SJohn Baldwin adcs x23,x23,x14 1435bc3d5698SJohn Baldwin adcs x24,x24,x15 1436bc3d5698SJohn Baldwin adcs x25,x25,x16 1437bc3d5698SJohn Baldwin adcs x26,x26,x17 1438bc3d5698SJohn Baldwin //adc x28,xzr,xzr // moved below 1439bc3d5698SJohn Baldwin cbz x27,.Lsqr8x8_post_condition 1440bc3d5698SJohn Baldwin 1441c0855eaaSJohn Baldwin ldur x4,[x2,#-8*8] 1442bc3d5698SJohn Baldwin ldp x6,x7,[x1,#8*0] 1443bc3d5698SJohn Baldwin ldp x8,x9,[x1,#8*2] 1444bc3d5698SJohn Baldwin ldp x10,x11,[x1,#8*4] 1445bc3d5698SJohn Baldwin mov x27,#-8*8 1446bc3d5698SJohn Baldwin ldp x12,x13,[x1,#8*6] 1447bc3d5698SJohn Baldwin add x1,x1,#8*8 1448bc3d5698SJohn Baldwin 1449bc3d5698SJohn Baldwin.Lsqr8x_tail: 1450bc3d5698SJohn Baldwin mul x14,x6,x4 1451bc3d5698SJohn Baldwin adc x28,xzr,xzr // carry bit, modulo-scheduled 1452bc3d5698SJohn Baldwin mul x15,x7,x4 1453bc3d5698SJohn Baldwin add x27,x27,#8 1454bc3d5698SJohn Baldwin mul x16,x8,x4 1455bc3d5698SJohn Baldwin mul x17,x9,x4 1456bc3d5698SJohn Baldwin adds x19,x19,x14 1457bc3d5698SJohn Baldwin mul x14,x10,x4 1458bc3d5698SJohn Baldwin adcs x20,x20,x15 1459bc3d5698SJohn Baldwin mul x15,x11,x4 1460bc3d5698SJohn Baldwin adcs x21,x21,x16 1461bc3d5698SJohn Baldwin mul x16,x12,x4 1462bc3d5698SJohn Baldwin adcs x22,x22,x17 1463bc3d5698SJohn Baldwin mul x17,x13,x4 1464bc3d5698SJohn Baldwin adcs x23,x23,x14 1465bc3d5698SJohn Baldwin umulh x14,x6,x4 1466bc3d5698SJohn Baldwin adcs x24,x24,x15 1467bc3d5698SJohn Baldwin umulh x15,x7,x4 1468bc3d5698SJohn Baldwin adcs x25,x25,x16 1469bc3d5698SJohn Baldwin umulh x16,x8,x4 1470bc3d5698SJohn Baldwin adcs x26,x26,x17 1471bc3d5698SJohn Baldwin umulh x17,x9,x4 1472bc3d5698SJohn Baldwin adc x28,x28,xzr 1473bc3d5698SJohn Baldwin str x19,[x2],#8 
1474bc3d5698SJohn Baldwin adds x19,x20,x14 1475bc3d5698SJohn Baldwin umulh x14,x10,x4 1476bc3d5698SJohn Baldwin adcs x20,x21,x15 1477bc3d5698SJohn Baldwin umulh x15,x11,x4 1478bc3d5698SJohn Baldwin adcs x21,x22,x16 1479bc3d5698SJohn Baldwin umulh x16,x12,x4 1480bc3d5698SJohn Baldwin adcs x22,x23,x17 1481bc3d5698SJohn Baldwin umulh x17,x13,x4 1482bc3d5698SJohn Baldwin ldr x4,[x0,x27] 1483bc3d5698SJohn Baldwin adcs x23,x24,x14 1484bc3d5698SJohn Baldwin adcs x24,x25,x15 1485bc3d5698SJohn Baldwin adcs x25,x26,x16 1486bc3d5698SJohn Baldwin adcs x26,x28,x17 1487bc3d5698SJohn Baldwin //adc x28,xzr,xzr // moved above 1488bc3d5698SJohn Baldwin cbnz x27,.Lsqr8x_tail 1489bc3d5698SJohn Baldwin // note that carry flag is guaranteed 1490bc3d5698SJohn Baldwin // to be zero at this point 1491bc3d5698SJohn Baldwin ldp x6,x7,[x2,#8*0] 1492bc3d5698SJohn Baldwin sub x27,x3,x1 // done yet? 1493bc3d5698SJohn Baldwin sub x16,x3,x5 // rewinded np 1494bc3d5698SJohn Baldwin ldp x8,x9,[x2,#8*2] 1495bc3d5698SJohn Baldwin ldp x10,x11,[x2,#8*4] 1496bc3d5698SJohn Baldwin ldp x12,x13,[x2,#8*6] 1497bc3d5698SJohn Baldwin cbz x27,.Lsqr8x_tail_break 1498bc3d5698SJohn Baldwin 1499c0855eaaSJohn Baldwin ldur x4,[x0,#-8*8] 1500bc3d5698SJohn Baldwin adds x19,x19,x6 1501bc3d5698SJohn Baldwin adcs x20,x20,x7 1502bc3d5698SJohn Baldwin ldp x6,x7,[x1,#8*0] 1503bc3d5698SJohn Baldwin adcs x21,x21,x8 1504bc3d5698SJohn Baldwin adcs x22,x22,x9 1505bc3d5698SJohn Baldwin ldp x8,x9,[x1,#8*2] 1506bc3d5698SJohn Baldwin adcs x23,x23,x10 1507bc3d5698SJohn Baldwin adcs x24,x24,x11 1508bc3d5698SJohn Baldwin ldp x10,x11,[x1,#8*4] 1509bc3d5698SJohn Baldwin adcs x25,x25,x12 1510bc3d5698SJohn Baldwin mov x27,#-8*8 1511bc3d5698SJohn Baldwin adcs x26,x26,x13 1512bc3d5698SJohn Baldwin ldp x12,x13,[x1,#8*6] 1513bc3d5698SJohn Baldwin add x1,x1,#8*8 1514bc3d5698SJohn Baldwin //adc x28,xzr,xzr // moved above 1515bc3d5698SJohn Baldwin b .Lsqr8x_tail 1516bc3d5698SJohn Baldwin 1517bc3d5698SJohn Baldwin.align 4 1518bc3d5698SJohn 
Baldwin.Lsqr8x_tail_break: 1519bc3d5698SJohn Baldwin ldr x4,[x29,#112] // pull n0 1520bc3d5698SJohn Baldwin add x27,x2,#8*8 // end of current t[num] window 1521bc3d5698SJohn Baldwin 1522bc3d5698SJohn Baldwin subs xzr,x30,#1 // "move" top-most carry to carry bit 1523bc3d5698SJohn Baldwin adcs x14,x19,x6 1524bc3d5698SJohn Baldwin adcs x15,x20,x7 1525bc3d5698SJohn Baldwin ldp x19,x20,[x0,#8*0] 1526bc3d5698SJohn Baldwin adcs x21,x21,x8 1527bc3d5698SJohn Baldwin ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] 1528bc3d5698SJohn Baldwin adcs x22,x22,x9 1529bc3d5698SJohn Baldwin ldp x8,x9,[x16,#8*2] 1530bc3d5698SJohn Baldwin adcs x23,x23,x10 1531bc3d5698SJohn Baldwin adcs x24,x24,x11 1532bc3d5698SJohn Baldwin ldp x10,x11,[x16,#8*4] 1533bc3d5698SJohn Baldwin adcs x25,x25,x12 1534bc3d5698SJohn Baldwin adcs x26,x26,x13 1535bc3d5698SJohn Baldwin ldp x12,x13,[x16,#8*6] 1536bc3d5698SJohn Baldwin add x1,x16,#8*8 1537bc3d5698SJohn Baldwin adc x30,xzr,xzr // top-most carry 1538bc3d5698SJohn Baldwin mul x28,x4,x19 1539bc3d5698SJohn Baldwin stp x14,x15,[x2,#8*0] 1540bc3d5698SJohn Baldwin stp x21,x22,[x2,#8*2] 1541bc3d5698SJohn Baldwin ldp x21,x22,[x0,#8*2] 1542bc3d5698SJohn Baldwin stp x23,x24,[x2,#8*4] 1543bc3d5698SJohn Baldwin ldp x23,x24,[x0,#8*4] 1544bc3d5698SJohn Baldwin cmp x27,x29 // did we hit the bottom? 1545bc3d5698SJohn Baldwin stp x25,x26,[x2,#8*6] 1546bc3d5698SJohn Baldwin mov x2,x0 // slide the window 1547bc3d5698SJohn Baldwin ldp x25,x26,[x0,#8*6] 1548bc3d5698SJohn Baldwin mov x27,#8 1549bc3d5698SJohn Baldwin b.ne .Lsqr8x_reduction 1550bc3d5698SJohn Baldwin 1551bc3d5698SJohn Baldwin // Final step. We see if result is larger than modulus, and 1552bc3d5698SJohn Baldwin // if it is, subtract the modulus. But comparison implies 1553bc3d5698SJohn Baldwin // subtraction. So we subtract modulus, see if it borrowed, 1554bc3d5698SJohn Baldwin // and conditionally copy original value. 
1555bc3d5698SJohn Baldwin ldr x0,[x29,#96] // pull rp 1556bc3d5698SJohn Baldwin add x2,x2,#8*8 1557bc3d5698SJohn Baldwin subs x14,x19,x6 1558bc3d5698SJohn Baldwin sbcs x15,x20,x7 1559bc3d5698SJohn Baldwin sub x27,x5,#8*8 1560bc3d5698SJohn Baldwin mov x3,x0 // x0 copy 1561bc3d5698SJohn Baldwin 1562bc3d5698SJohn Baldwin.Lsqr8x_sub: 1563bc3d5698SJohn Baldwin sbcs x16,x21,x8 1564bc3d5698SJohn Baldwin ldp x6,x7,[x1,#8*0] 1565bc3d5698SJohn Baldwin sbcs x17,x22,x9 1566bc3d5698SJohn Baldwin stp x14,x15,[x0,#8*0] 1567bc3d5698SJohn Baldwin sbcs x14,x23,x10 1568bc3d5698SJohn Baldwin ldp x8,x9,[x1,#8*2] 1569bc3d5698SJohn Baldwin sbcs x15,x24,x11 1570bc3d5698SJohn Baldwin stp x16,x17,[x0,#8*2] 1571bc3d5698SJohn Baldwin sbcs x16,x25,x12 1572bc3d5698SJohn Baldwin ldp x10,x11,[x1,#8*4] 1573bc3d5698SJohn Baldwin sbcs x17,x26,x13 1574bc3d5698SJohn Baldwin ldp x12,x13,[x1,#8*6] 1575bc3d5698SJohn Baldwin add x1,x1,#8*8 1576bc3d5698SJohn Baldwin ldp x19,x20,[x2,#8*0] 1577bc3d5698SJohn Baldwin sub x27,x27,#8*8 1578bc3d5698SJohn Baldwin ldp x21,x22,[x2,#8*2] 1579bc3d5698SJohn Baldwin ldp x23,x24,[x2,#8*4] 1580bc3d5698SJohn Baldwin ldp x25,x26,[x2,#8*6] 1581bc3d5698SJohn Baldwin add x2,x2,#8*8 1582bc3d5698SJohn Baldwin stp x14,x15,[x0,#8*4] 1583bc3d5698SJohn Baldwin sbcs x14,x19,x6 1584bc3d5698SJohn Baldwin stp x16,x17,[x0,#8*6] 1585bc3d5698SJohn Baldwin add x0,x0,#8*8 1586bc3d5698SJohn Baldwin sbcs x15,x20,x7 1587bc3d5698SJohn Baldwin cbnz x27,.Lsqr8x_sub 1588bc3d5698SJohn Baldwin 1589bc3d5698SJohn Baldwin sbcs x16,x21,x8 1590bc3d5698SJohn Baldwin mov x2,sp 1591bc3d5698SJohn Baldwin add x1,sp,x5 1592bc3d5698SJohn Baldwin ldp x6,x7,[x3,#8*0] 1593bc3d5698SJohn Baldwin sbcs x17,x22,x9 1594bc3d5698SJohn Baldwin stp x14,x15,[x0,#8*0] 1595bc3d5698SJohn Baldwin sbcs x14,x23,x10 1596bc3d5698SJohn Baldwin ldp x8,x9,[x3,#8*2] 1597bc3d5698SJohn Baldwin sbcs x15,x24,x11 1598bc3d5698SJohn Baldwin stp x16,x17,[x0,#8*2] 1599bc3d5698SJohn Baldwin sbcs x16,x25,x12 1600bc3d5698SJohn Baldwin ldp 
x19,x20,[x1,#8*0] 1601bc3d5698SJohn Baldwin sbcs x17,x26,x13 1602bc3d5698SJohn Baldwin ldp x21,x22,[x1,#8*2] 1603bc3d5698SJohn Baldwin sbcs xzr,x30,xzr // did it borrow? 1604bc3d5698SJohn Baldwin ldr x30,[x29,#8] // pull return address 1605bc3d5698SJohn Baldwin stp x14,x15,[x0,#8*4] 1606bc3d5698SJohn Baldwin stp x16,x17,[x0,#8*6] 1607bc3d5698SJohn Baldwin 1608bc3d5698SJohn Baldwin sub x27,x5,#8*4 1609bc3d5698SJohn Baldwin.Lsqr4x_cond_copy: 1610bc3d5698SJohn Baldwin sub x27,x27,#8*4 1611bc3d5698SJohn Baldwin csel x14,x19,x6,lo 1612bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*0] 1613bc3d5698SJohn Baldwin csel x15,x20,x7,lo 1614bc3d5698SJohn Baldwin ldp x6,x7,[x3,#8*4] 1615bc3d5698SJohn Baldwin ldp x19,x20,[x1,#8*4] 1616bc3d5698SJohn Baldwin csel x16,x21,x8,lo 1617bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*2] 1618bc3d5698SJohn Baldwin add x2,x2,#8*4 1619bc3d5698SJohn Baldwin csel x17,x22,x9,lo 1620bc3d5698SJohn Baldwin ldp x8,x9,[x3,#8*6] 1621bc3d5698SJohn Baldwin ldp x21,x22,[x1,#8*6] 1622bc3d5698SJohn Baldwin add x1,x1,#8*4 1623bc3d5698SJohn Baldwin stp x14,x15,[x3,#8*0] 1624bc3d5698SJohn Baldwin stp x16,x17,[x3,#8*2] 1625bc3d5698SJohn Baldwin add x3,x3,#8*4 1626bc3d5698SJohn Baldwin stp xzr,xzr,[x1,#8*0] 1627bc3d5698SJohn Baldwin stp xzr,xzr,[x1,#8*2] 1628bc3d5698SJohn Baldwin cbnz x27,.Lsqr4x_cond_copy 1629bc3d5698SJohn Baldwin 1630bc3d5698SJohn Baldwin csel x14,x19,x6,lo 1631bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*0] 1632bc3d5698SJohn Baldwin csel x15,x20,x7,lo 1633bc3d5698SJohn Baldwin stp xzr,xzr,[x2,#8*2] 1634bc3d5698SJohn Baldwin csel x16,x21,x8,lo 1635bc3d5698SJohn Baldwin csel x17,x22,x9,lo 1636bc3d5698SJohn Baldwin stp x14,x15,[x3,#8*0] 1637bc3d5698SJohn Baldwin stp x16,x17,[x3,#8*2] 1638bc3d5698SJohn Baldwin 1639bc3d5698SJohn Baldwin b .Lsqr8x_done 1640bc3d5698SJohn Baldwin 1641bc3d5698SJohn Baldwin.align 4 1642bc3d5698SJohn Baldwin.Lsqr8x8_post_condition: 1643bc3d5698SJohn Baldwin adc x28,xzr,xzr 1644bc3d5698SJohn Baldwin ldr x30,[x29,#8] // pull return 
address 1645bc3d5698SJohn Baldwin // x19-7,x28 hold result, x6-7 hold modulus 1646bc3d5698SJohn Baldwin subs x6,x19,x6 1647bc3d5698SJohn Baldwin ldr x1,[x29,#96] // pull rp 1648bc3d5698SJohn Baldwin sbcs x7,x20,x7 1649bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*0] 1650bc3d5698SJohn Baldwin sbcs x8,x21,x8 1651bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*2] 1652bc3d5698SJohn Baldwin sbcs x9,x22,x9 1653bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*4] 1654bc3d5698SJohn Baldwin sbcs x10,x23,x10 1655bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*6] 1656bc3d5698SJohn Baldwin sbcs x11,x24,x11 1657bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*8] 1658bc3d5698SJohn Baldwin sbcs x12,x25,x12 1659bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*10] 1660bc3d5698SJohn Baldwin sbcs x13,x26,x13 1661bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*12] 1662bc3d5698SJohn Baldwin sbcs x28,x28,xzr // did it borrow? 1663bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#8*14] 1664bc3d5698SJohn Baldwin 1665bc3d5698SJohn Baldwin // x6-7 hold result-modulus 1666bc3d5698SJohn Baldwin csel x6,x19,x6,lo 1667bc3d5698SJohn Baldwin csel x7,x20,x7,lo 1668bc3d5698SJohn Baldwin csel x8,x21,x8,lo 1669bc3d5698SJohn Baldwin csel x9,x22,x9,lo 1670bc3d5698SJohn Baldwin stp x6,x7,[x1,#8*0] 1671bc3d5698SJohn Baldwin csel x10,x23,x10,lo 1672bc3d5698SJohn Baldwin csel x11,x24,x11,lo 1673bc3d5698SJohn Baldwin stp x8,x9,[x1,#8*2] 1674bc3d5698SJohn Baldwin csel x12,x25,x12,lo 1675bc3d5698SJohn Baldwin csel x13,x26,x13,lo 1676bc3d5698SJohn Baldwin stp x10,x11,[x1,#8*4] 1677bc3d5698SJohn Baldwin stp x12,x13,[x1,#8*6] 1678bc3d5698SJohn Baldwin 1679bc3d5698SJohn Baldwin.Lsqr8x_done: 1680bc3d5698SJohn Baldwin ldp x19,x20,[x29,#16] 1681bc3d5698SJohn Baldwin mov sp,x29 1682bc3d5698SJohn Baldwin ldp x21,x22,[x29,#32] 1683bc3d5698SJohn Baldwin mov x0,#1 1684bc3d5698SJohn Baldwin ldp x23,x24,[x29,#48] 1685bc3d5698SJohn Baldwin ldp x25,x26,[x29,#64] 1686bc3d5698SJohn Baldwin ldp x27,x28,[x29,#80] 1687bc3d5698SJohn Baldwin ldr x29,[sp],#128 1688bd9588bcSAndrew Turner // x30 
// x30 is loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_sqr8x_mont,.-__bn_sqr8x_mont

//-----------------------------------------------------------------------
// __bn_mul4x_mont -- Montgomery multiplication, processing 4 limbs of
// each operand per inner-loop iteration.
//
// Reached only by a direct branch from bn_mul_mont (when num is a
// multiple of 4; see the dispatch at the top of the file), so it uses
// the bn_mul_mont register contract:
//   x0 = rp   result
//   x1 = ap   multiplicand a[], num 64-bit limbs
//   x2 = bp   multiplier   b[], num 64-bit limbs
//   x3 = np   modulus      n[], num 64-bit limbs
//   x4 = &n0  pointer to the Montgomery constant n0 (dereferenced below)
//   x5 = num  limb count, divisible by 4
// Returns x0 = 1.  Scratch t[] (num limbs, plus 4 slots used to put
// aside t[0]*n0 values) is carved out below sp and zero-wiped before
// returning.  Callee-saved x19-x28 are preserved in a 128-byte frame.
.type	__bn_mul4x_mont,%function
.align	5
__bn_mul4x_mont:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
	// only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
	stp	x29,x30,[sp,#-128]!	// frame record + room for x19-x28
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]	// preserve callee-saved registers
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]

	sub	x26,sp,x5,lsl#3		// num*8 bytes for t[]
	lsl	x5,x5,#3		// num *= 8: byte count from here on
	ldr	x4,[x4]			// *n0
	sub	sp,x26,#8*4		// alloca (t[] + 4 put-aside slots)

	add	x10,x2,x5		// &b[num]
	add	x27,x1,x5		// &a[num] (end-of-ap sentinel)
	stp	x0,x10,[x29,#96]	// offload rp and &b[num]

	ldr	x24,[x2,#8*0]		// b[0]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	mov	x19,xzr			// t[0..3] accumulator = 0
	mov	x20,xzr
	mov	x21,xzr
	mov	x22,xzr
	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr			// running top-word carry
	mov	x28,#0			// b[]/put-aside index, wraps mod 32 bytes
	mov	x26,sp			// t[] write pointer

// First pass over b[0..3]: accumulate a[0..3]*b[i] and fold in the
// Montgomery reduction n[0..3]*(t[0]*n0) within the same loop.
.Loop_mul4x_1st_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[0])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31		// wrap: cycle through 4 limbs of b[]
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[0])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25	// lo(n[0..3]*t[0]*n0)
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) n[0]*(t[0]*n0)+t[0] == 0 mod 2^64,
					// so the elided addition carries iff
					// t[0] is non-zero (see note above)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	sub	x10,x27,x1		// &a[num]-ap: 0 once a[] is consumed
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_reduction

	cbz	x10,.Lmul4x4_post_condition	// num==4: result complete in regs

	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldr	x25,[sp]		// a[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

// Tail of the first pass: extend a[]*b[0..3] and n[]*(put-aside t[0]*n0)
// over the remaining limbs of a[] and n[], 4 at a time.
.Loop_mul4x_1st_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[i])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[i])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i] (or b[0])
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*a[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*a[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	adcs	x23,x23,x0
	umulh	x13,x17,x25
	adc	x0,xzr,xzr
	ldr	x25,[sp,x28]		// next t[0]*n0
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_1st_tail

	sub	x11,x27,x5		// rewinded x1
	cbz	x10,.Lmul4x_proceed

	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_1st_tail

.align	5
// Advance to the next 4 limbs of b[] and restart from a[0]/n[0],
// carrying t[] forward from the previous window.
.Lmul4x_proceed:
	ldr	x24,[x2,#8*4]!		// *++b
	adc	x30,x0,xzr		// stash top-most carry
	ldp	x6,x7,[x11,#8*0]	// a[0..3]
	sub	x3,x3,x5		// rewind np
	ldp	x8,x9,[x11,#8*2]
	add	x1,x11,#8*4

	stp	x19,x20,[x26,#8*0]	// result!!!
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	stp	x21,x22,[x26,#8*2]	// result!!!
	ldp	x21,x22,[sp,#8*6]

	ldp	x14,x15,[x3,#8*0]	// n[0..3]
	mov	x26,sp
	ldp	x16,x17,[x3,#8*2]
	adds	x3,x3,#8*4		// clear carry bit
	mov	x0,xzr

// Same structure as .Loop_mul4x_1st_reduction, but accumulating into
// the t[] carried over from the previous b[] window.
.align	4
.Loop_mul4x_reduction:
	mul	x10,x6,x24		// lo(a[0..3]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[0..3]*b[4])
	adcs	x20,x20,x11
	mul	x25,x19,x4		// t[0]*n0
	adcs	x21,x21,x12
	umulh	x11,x7,x24
	adcs	x22,x22,x13
	umulh	x12,x8,x24
	adc	x23,xzr,xzr
	umulh	x13,x9,x24
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	// (*)	mul	x10,x14,x25
	str	x25,[x26],#8		// put aside t[0]*n0 for tail processing
	adcs	x21,x21,x11
	mul	x11,x15,x25		// lo(n[0..3]*t[0]*n0)
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	// (*)	adds	xzr,x19,x10
	subs	xzr,x19,#1		// (*) carry iff t[0] != 0 (same trick
					// as in the first reduction loop)
	umulh	x10,x14,x25		// hi(n[0..3]*t[0]*n0)
	adcs	x19,x20,x11
	umulh	x11,x15,x25
	adcs	x20,x21,x12
	umulh	x12,x16,x25
	adcs	x21,x22,x13
	umulh	x13,x17,x25
	adcs	x22,x23,x0
	adc	x0,xzr,xzr
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_reduction

	adc	x0,x0,xzr
	ldp	x10,x11,[x26,#8*4]	// t[4..7]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]		// a[4..7]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr

	ldr	x25,[sp]		// t[0]*n0
	ldp	x14,x15,[x3,#8*0]	// n[4..7]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4

.align	4
.Loop_mul4x_tail:
	mul	x10,x6,x24		// lo(a[4..7]*b[4])
	adc	x0,x0,xzr		// modulo-scheduled
	mul	x11,x7,x24
	add	x28,x28,#8
	mul	x12,x8,x24
	and	x28,x28,#31
	mul	x13,x9,x24
	adds	x19,x19,x10
	umulh	x10,x6,x24		// hi(a[4..7]*b[4])
	adcs	x20,x20,x11
	umulh	x11,x7,x24
	adcs	x21,x21,x12
	umulh	x12,x8,x24
	adcs	x22,x22,x13
	umulh	x13,x9,x24
	adc	x23,xzr,xzr
	ldr	x24,[x2,x28]		// next b[i]
	adds	x20,x20,x10
	mul	x10,x14,x25		// lo(n[4..7]*t[0]*n0)
	adcs	x21,x21,x11
	mul	x11,x15,x25
	adcs	x22,x22,x12
	mul	x12,x16,x25
	adc	x23,x23,x13		// can't overflow
	mul	x13,x17,x25
	adds	x19,x19,x10
	umulh	x10,x14,x25		// hi(n[4..7]*t[0]*n0)
	adcs	x20,x20,x11
	umulh	x11,x15,x25
	adcs	x21,x21,x12
	umulh	x12,x16,x25
	adcs	x22,x22,x13
	umulh	x13,x17,x25
	adcs	x23,x23,x0
	ldr	x25,[sp,x28]		// next a[0]*n0
	adc	x0,xzr,xzr
	str	x19,[x26],#8		// result!!!
	adds	x19,x20,x10
	sub	x10,x27,x1		// done yet?
	adcs	x20,x21,x11
	adcs	x21,x22,x12
	adcs	x22,x23,x13
	//adc	x0,x0,xzr
	cbnz	x28,.Loop_mul4x_tail

	sub	x11,x3,x5		// rewinded np?
	adc	x0,x0,xzr
	cbz	x10,.Loop_mul4x_break

	ldp	x10,x11,[x26,#8*4]
	ldp	x12,x13,[x26,#8*6]
	ldp	x6,x7,[x1,#8*0]
	ldp	x8,x9,[x1,#8*2]
	add	x1,x1,#8*4
	adds	x19,x19,x10
	adcs	x20,x20,x11
	adcs	x21,x21,x12
	adcs	x22,x22,x13
	//adc	x0,x0,xzr
	ldp	x14,x15,[x3,#8*0]
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	b	.Loop_mul4x_tail

.align	4
// One 4-limb window of b[] fully processed: fold in the stashed
// top-most carry, then either loop over the next window or fall
// through to the final conditional subtraction.
.Loop_mul4x_break:
	ldp	x12,x13,[x29,#96]	// pull rp and &b[num]
	adds	x19,x19,x30		// fold in stashed top-most carry
	add	x2,x2,#8*4		// bp++
	adcs	x20,x20,xzr
	sub	x1,x1,x5		// rewind ap
	adcs	x21,x21,xzr
	stp	x19,x20,[x26,#8*0]	// result!!!
	adcs	x22,x22,xzr
	ldp	x19,x20,[sp,#8*4]	// t[0..3]
	adc	x30,x0,xzr
	stp	x21,x22,[x26,#8*2]	// result!!!
	cmp	x2,x13			// done yet?
	ldp	x21,x22,[sp,#8*6]
	ldp	x14,x15,[x11,#8*0]	// n[0..3]
	ldp	x16,x17,[x11,#8*2]
	add	x3,x11,#8*4
	b.eq	.Lmul4x_post

	ldr	x24,[x2]
	ldp	x6,x7,[x1,#8*0]		// a[0..3]
	ldp	x8,x9,[x1,#8*2]
	adds	x1,x1,#8*4		// clear carry bit
	mov	x0,xzr
	mov	x26,sp
	b	.Loop_mul4x_reduction

.align	4
.Lmul4x_post:
	// Final step. We see if result is larger than modulus, and
	// if it is, subtract the modulus. But comparison implies
	// subtraction. So we subtract modulus, see if it borrowed,
	// and conditionally copy original value.
	mov	x0,x12
	mov	x27,x12			// x0 copy
	subs	x10,x19,x14
	add	x26,sp,#8*8
	sbcs	x11,x20,x15
	sub	x28,x5,#8*4

.Lmul4x_sub:
	sbcs	x12,x21,x16
	ldp	x14,x15,[x3,#8*0]
	sub	x28,x28,#8*4
	ldp	x19,x20,[x26,#8*0]
	sbcs	x13,x22,x17
	ldp	x16,x17,[x3,#8*2]
	add	x3,x3,#8*4
	ldp	x21,x22,[x26,#8*2]
	add	x26,x26,#8*4
	stp	x10,x11,[x0,#8*0]
	sbcs	x10,x19,x14
	stp	x12,x13,[x0,#8*2]
	add	x0,x0,#8*4
	sbcs	x11,x20,x15
	cbnz	x28,.Lmul4x_sub

	sbcs	x12,x21,x16
	mov	x26,sp
	add	x1,sp,#8*4
	ldp	x6,x7,[x27,#8*0]
	sbcs	x13,x22,x17
	stp	x10,x11,[x0,#8*0]
	ldp	x8,x9,[x27,#8*2]
	stp	x12,x13,[x0,#8*2]
	ldp	x19,x20,[x1,#8*0]
	ldp	x21,x22,[x1,#8*2]
	sbcs	xzr,x30,xzr		// did it borrow?
	ldr	x30,[x29,#8]		// pull return address

	sub	x28,x5,#8*4
// Branch-free select between t[] (if the subtraction borrowed) and the
// already-stored difference in rp[], zero-wiping the stack copy as we go.
.Lmul4x_cond_copy:
	sub	x28,x28,#8*4
	csel	x10,x19,x6,lo		// lo (borrow): keep t[], else keep rp[]
	stp	xzr,xzr,[x26,#8*0]	// wipe t[]
	csel	x11,x20,x7,lo
	ldp	x6,x7,[x27,#8*4]
	ldp	x19,x20,[x1,#8*4]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*2]
	add	x26,x26,#8*4
	csel	x13,x22,x9,lo
	ldp	x8,x9,[x27,#8*6]
	ldp	x21,x22,[x1,#8*6]
	add	x1,x1,#8*4
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]
	add	x27,x27,#8*4
	cbnz	x28,.Lmul4x_cond_copy

	csel	x10,x19,x6,lo
	stp	xzr,xzr,[x26,#8*0]
	csel	x11,x20,x7,lo
	stp	xzr,xzr,[x26,#8*2]
	csel	x12,x21,x8,lo
	stp	xzr,xzr,[x26,#8*3]	// also wipe remaining scratch slots
	csel	x13,x22,x9,lo
	stp	xzr,xzr,[x26,#8*4]
	stp	x10,x11,[x27,#8*0]
	stp	x12,x13,[x27,#8*2]

	b	.Lmul4x_done

.align	4
// num==4 fast path: the entire result is still in x19-x22 (+carry x0).
.Lmul4x4_post_condition:
	adc	x0,x0,xzr
	ldr	x1,[x29,#96]		// pull rp
	// x19-x22,x0 hold result, x14-x17 hold modulus
	subs	x6,x19,x14
	ldr	x30,[x29,#8]		// pull return address
	sbcs	x7,x20,x15
	stp	xzr,xzr,[sp,#8*0]	// wipe t[]
	sbcs	x8,x21,x16
	stp	xzr,xzr,[sp,#8*2]
	sbcs	x9,x22,x17
	stp	xzr,xzr,[sp,#8*4]
	sbcs	xzr,x0,xzr		// did it borrow?
	stp	xzr,xzr,[sp,#8*6]

	// x6-x9 hold result-modulus
	csel	x6,x19,x6,lo		// lo (borrow): keep original result
	csel	x7,x20,x7,lo
	csel	x8,x21,x8,lo
	csel	x9,x22,x9,lo
	stp	x6,x7,[x1,#8*0]
	stp	x8,x9,[x1,#8*2]

.Lmul4x_done:
	ldp	x19,x20,[x29,#16]	// restore callee-saved registers
	mov	sp,x29
	ldp	x21,x22,[x29,#32]
	mov	x0,#1			// return value
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldr	x29,[sp],#128
	// x30 loaded earlier
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	__bn_mul4x_mont,.-__bn_mul4x_mont
.byte	77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	4