/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
#include "arm_arch.h"
#ifndef __KERNEL__

.hidden	OPENSSL_armcap_P
#endif

.text

// Constant pool shared by the scalar and NEON code paths.
.align	5
.Lsigma:
// ASCII "expand 32-byte k" when the two quads are laid out little-endian.
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,2,3,4
.Lrot24:
// Byte-index table; the NEON path loads it into v9 and feeds it to tbl.
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// ASCII "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm", NUL-terminated.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align	2

//----------------------------------------------------------------------
// ChaCha20_ctr32 - XOR the input with a ChaCha20 keystream using only
// general-purpose registers, 64 bytes per outer iteration.
//
// Register arguments, as used by the code below:
//   x0  output pointer
//   x1  input pointer
//   x2  length in bytes (0 returns immediately)
//   x3  key, 32 bytes
//   x4  counter block, 16 bytes (x4 is reused as round counter once loaded)
// NOTE(review): presumably the C prototype is
//   void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
//                       size_t len, const unsigned int key[8],
//                       const unsigned int counter[4]);
// confirm against the C declaration.
//
// Register roles inside the function:
//   x22,x23          sigma constants (two 32-bit words per register)
//   x24-x27          key
//   x28,x30          counter block; x30 is free because the link register
//                    is saved in the frame and reloaded in the epilogue
//   w5-w17,w19-w21   the 16 working state words (x18 is not used)
//
// Stack: 96-byte frame holding x29,x30 and x19-x28, plus 64 bytes of
// scratch below it that only the partial-block tail writes to.
//
// Unless __KERNEL__ is defined, lengths of 192 bytes or more are handed
// to .LChaCha20_neon when OPENSSL_armcap_P has the ARMV7_NEON bit set.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	// Macro is not defined in this file (arm_arch.h is the only include);
	// NOTE(review): presumably signs x30 for pointer authentication - confirm.
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2,.Labort			// nothing to do for len == 0
	cmp	x2,#192
	b.lo	.Lshort				// short inputs always take the scalar path

#ifndef __KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	// Enters after the sign macro in ChaCha20_neon: x30 is already signed.
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// scratch for a partial last block

	ldp	x22,x23,[x5]			// load sigma
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]			// load counter
#ifdef __AARCH64EB__
	// Swap the 32-bit halves so the first word in memory sits in the low half.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations, two rounds each
	// Flags set here are consumed by b.lo .Ltail and b.hi .Loop_outer below;
	// nothing in between uses a flag-setting instruction.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// First round: four interleaved quarter-rounds on the word groups
	// (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20) (w8,w12,w16,w21).
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20			// rotate left by 12
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24			// rotate left by 8
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25			// rotate left by 7
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second round: same quarter-round on the regrouped words
	// (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19) (w8,w9,w14,w20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// The odd words use a 64-bit add with lsr#32 to pick the high half of
	// the input register; a carry into bit 32 is discarded by the lsl#32
	// in the pack step that follows.
	add	w5,w5,w22			// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left (flags from subs)

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// bytes remain (flags still from subs)

	// x29 still points at the frame, so registers can be reloaded through it
	// while sp is being moved back over the scratch area.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:
	// Counterpart of the sign macro at entry; also reached directly from
	// the cbz at the top, after that macro has run.
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes left (below 64)
.Less_than_64:					// also branched to from .Ltail_neon
	// Bias the pointers so one negative index, counting up to zero, walks
	// input, keystream and output together:
	//   x1 = inp+len, x4 = sp+len, x0 = out+len-1, x2 = -len
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the keystream block to the scratch area for byte-wise access.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]			// x0 is biased by -1, so the store uses the bumped index
	cbnz	x2,.Loop_tail

	// Wipe the keystream copy from the stack before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

#ifdef __KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
.LChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
318bc3d5698SJohn Baldwin add x29,sp,#0 319bc3d5698SJohn Baldwin 320bc3d5698SJohn Baldwin adr x5,.Lsigma 321bc3d5698SJohn Baldwin stp x19,x20,[sp,#16] 322bc3d5698SJohn Baldwin stp x21,x22,[sp,#32] 323bc3d5698SJohn Baldwin stp x23,x24,[sp,#48] 324bc3d5698SJohn Baldwin stp x25,x26,[sp,#64] 325bc3d5698SJohn Baldwin stp x27,x28,[sp,#80] 326bc3d5698SJohn Baldwin cmp x2,#512 327bc3d5698SJohn Baldwin b.hs .L512_or_more_neon 328bc3d5698SJohn Baldwin 329bc3d5698SJohn Baldwin sub sp,sp,#64 330bc3d5698SJohn Baldwin 331bc3d5698SJohn Baldwin ldp x22,x23,[x5] // load sigma 332c0855eaaSJohn Baldwin ld1 {v0.4s},[x5],#16 333bc3d5698SJohn Baldwin ldp x24,x25,[x3] // load key 334bc3d5698SJohn Baldwin ldp x26,x27,[x3,#16] 335c0855eaaSJohn Baldwin ld1 {v1.4s,v2.4s},[x3] 336bc3d5698SJohn Baldwin ldp x28,x30,[x4] // load counter 337c0855eaaSJohn Baldwin ld1 {v3.4s},[x4] 338c0855eaaSJohn Baldwin stp d8,d9,[sp] // meet ABI requirements 339c0855eaaSJohn Baldwin ld1 {v8.4s,v9.4s},[x5] 340c0855eaaSJohn Baldwin#ifdef __AARCH64EB__ 341c0855eaaSJohn Baldwin rev64 v0.4s,v0.4s 342bc3d5698SJohn Baldwin ror x24,x24,#32 343bc3d5698SJohn Baldwin ror x25,x25,#32 344bc3d5698SJohn Baldwin ror x26,x26,#32 345bc3d5698SJohn Baldwin ror x27,x27,#32 346bc3d5698SJohn Baldwin ror x28,x28,#32 347bc3d5698SJohn Baldwin ror x30,x30,#32 348bc3d5698SJohn Baldwin#endif 349bc3d5698SJohn Baldwin 350bc3d5698SJohn Baldwin.Loop_outer_neon: 351c0855eaaSJohn Baldwin dup v16.4s,v0.s[0] // unpack key block 352c0855eaaSJohn Baldwin mov w5,w22 353c0855eaaSJohn Baldwin dup v20.4s,v0.s[1] 354bc3d5698SJohn Baldwin lsr x6,x22,#32 355c0855eaaSJohn Baldwin dup v24.4s,v0.s[2] 356bc3d5698SJohn Baldwin mov w7,w23 357c0855eaaSJohn Baldwin dup v28.4s,v0.s[3] 358bc3d5698SJohn Baldwin lsr x8,x23,#32 359c0855eaaSJohn Baldwin dup v17.4s,v1.s[0] 360bc3d5698SJohn Baldwin mov w9,w24 361c0855eaaSJohn Baldwin dup v21.4s,v1.s[1] 362bc3d5698SJohn Baldwin lsr x10,x24,#32 363c0855eaaSJohn Baldwin dup v25.4s,v1.s[2] 364bc3d5698SJohn Baldwin mov w11,w25 
365c0855eaaSJohn Baldwin dup v29.4s,v1.s[3] 366bc3d5698SJohn Baldwin lsr x12,x25,#32 367c0855eaaSJohn Baldwin dup v19.4s,v3.s[0] 368bc3d5698SJohn Baldwin mov w13,w26 369c0855eaaSJohn Baldwin dup v23.4s,v3.s[1] 370bc3d5698SJohn Baldwin lsr x14,x26,#32 371c0855eaaSJohn Baldwin dup v27.4s,v3.s[2] 372bc3d5698SJohn Baldwin mov w15,w27 373c0855eaaSJohn Baldwin dup v31.4s,v3.s[3] 374bc3d5698SJohn Baldwin lsr x16,x27,#32 375c0855eaaSJohn Baldwin add v19.4s,v19.4s,v8.4s 376bc3d5698SJohn Baldwin mov w17,w28 377c0855eaaSJohn Baldwin dup v18.4s,v2.s[0] 378bc3d5698SJohn Baldwin lsr x19,x28,#32 379c0855eaaSJohn Baldwin dup v22.4s,v2.s[1] 380bc3d5698SJohn Baldwin mov w20,w30 381c0855eaaSJohn Baldwin dup v26.4s,v2.s[2] 382bc3d5698SJohn Baldwin lsr x21,x30,#32 383c0855eaaSJohn Baldwin dup v30.4s,v2.s[3] 384bc3d5698SJohn Baldwin 385bc3d5698SJohn Baldwin mov x4,#10 386c0855eaaSJohn Baldwin subs x2,x2,#320 387bc3d5698SJohn Baldwin.Loop_neon: 388bc3d5698SJohn Baldwin sub x4,x4,#1 389bc3d5698SJohn Baldwin add v16.4s,v16.4s,v17.4s 390c0855eaaSJohn Baldwin add w5,w5,w9 391c0855eaaSJohn Baldwin add v20.4s,v20.4s,v21.4s 392c0855eaaSJohn Baldwin add w6,w6,w10 393c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 394bc3d5698SJohn Baldwin add w7,w7,w11 395c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 396bc3d5698SJohn Baldwin add w8,w8,w12 397bc3d5698SJohn Baldwin eor v19.16b,v19.16b,v16.16b 398c0855eaaSJohn Baldwin eor w17,w17,w5 399c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 400bc3d5698SJohn Baldwin eor w19,w19,w6 401c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 402bc3d5698SJohn Baldwin eor w20,w20,w7 403c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 404bc3d5698SJohn Baldwin eor w21,w21,w8 405bc3d5698SJohn Baldwin rev32 v19.8h,v19.8h 406bc3d5698SJohn Baldwin ror w17,w17,#16 407c0855eaaSJohn Baldwin rev32 v23.8h,v23.8h 408bc3d5698SJohn Baldwin ror w19,w19,#16 409c0855eaaSJohn Baldwin rev32 v27.8h,v27.8h 410bc3d5698SJohn Baldwin ror w20,w20,#16 411c0855eaaSJohn Baldwin rev32 
v31.8h,v31.8h 412bc3d5698SJohn Baldwin ror w21,w21,#16 413c0855eaaSJohn Baldwin add v18.4s,v18.4s,v19.4s 414bc3d5698SJohn Baldwin add w13,w13,w17 415c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 416bc3d5698SJohn Baldwin add w14,w14,w19 417c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 418bc3d5698SJohn Baldwin add w15,w15,w20 419c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 420bc3d5698SJohn Baldwin add w16,w16,w21 421c0855eaaSJohn Baldwin eor v4.16b,v17.16b,v18.16b 422bc3d5698SJohn Baldwin eor w9,w9,w13 423c0855eaaSJohn Baldwin eor v5.16b,v21.16b,v22.16b 424bc3d5698SJohn Baldwin eor w10,w10,w14 425c0855eaaSJohn Baldwin eor v6.16b,v25.16b,v26.16b 426bc3d5698SJohn Baldwin eor w11,w11,w15 427c0855eaaSJohn Baldwin eor v7.16b,v29.16b,v30.16b 428bc3d5698SJohn Baldwin eor w12,w12,w16 429c0855eaaSJohn Baldwin ushr v17.4s,v4.4s,#20 430bc3d5698SJohn Baldwin ror w9,w9,#20 431c0855eaaSJohn Baldwin ushr v21.4s,v5.4s,#20 432bc3d5698SJohn Baldwin ror w10,w10,#20 433c0855eaaSJohn Baldwin ushr v25.4s,v6.4s,#20 434bc3d5698SJohn Baldwin ror w11,w11,#20 435c0855eaaSJohn Baldwin ushr v29.4s,v7.4s,#20 436bc3d5698SJohn Baldwin ror w12,w12,#20 437c0855eaaSJohn Baldwin sli v17.4s,v4.4s,#12 438bc3d5698SJohn Baldwin add w5,w5,w9 439c0855eaaSJohn Baldwin sli v21.4s,v5.4s,#12 440bc3d5698SJohn Baldwin add w6,w6,w10 441c0855eaaSJohn Baldwin sli v25.4s,v6.4s,#12 442bc3d5698SJohn Baldwin add w7,w7,w11 443c0855eaaSJohn Baldwin sli v29.4s,v7.4s,#12 444bc3d5698SJohn Baldwin add w8,w8,w12 445c0855eaaSJohn Baldwin add v16.4s,v16.4s,v17.4s 446bc3d5698SJohn Baldwin eor w17,w17,w5 447c0855eaaSJohn Baldwin add v20.4s,v20.4s,v21.4s 448bc3d5698SJohn Baldwin eor w19,w19,w6 449c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 450bc3d5698SJohn Baldwin eor w20,w20,w7 451c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 452bc3d5698SJohn Baldwin eor w21,w21,w8 453c0855eaaSJohn Baldwin eor v4.16b,v19.16b,v16.16b 454bc3d5698SJohn Baldwin ror w17,w17,#24 455c0855eaaSJohn Baldwin eor v5.16b,v23.16b,v20.16b 456bc3d5698SJohn 
Baldwin ror w19,w19,#24 457c0855eaaSJohn Baldwin eor v6.16b,v27.16b,v24.16b 458bc3d5698SJohn Baldwin ror w20,w20,#24 459c0855eaaSJohn Baldwin eor v7.16b,v31.16b,v28.16b 460bc3d5698SJohn Baldwin ror w21,w21,#24 461c0855eaaSJohn Baldwin tbl v19.16b,{v4.16b},v9.16b 462bc3d5698SJohn Baldwin add w13,w13,w17 463c0855eaaSJohn Baldwin tbl v23.16b,{v5.16b},v9.16b 464bc3d5698SJohn Baldwin add w14,w14,w19 465c0855eaaSJohn Baldwin tbl v27.16b,{v6.16b},v9.16b 466bc3d5698SJohn Baldwin add w15,w15,w20 467c0855eaaSJohn Baldwin tbl v31.16b,{v7.16b},v9.16b 468bc3d5698SJohn Baldwin add w16,w16,w21 469c0855eaaSJohn Baldwin add v18.4s,v18.4s,v19.4s 470bc3d5698SJohn Baldwin eor w9,w9,w13 471c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 472bc3d5698SJohn Baldwin eor w10,w10,w14 473c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 474bc3d5698SJohn Baldwin eor w11,w11,w15 475c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 476bc3d5698SJohn Baldwin eor w12,w12,w16 477c0855eaaSJohn Baldwin eor v4.16b,v17.16b,v18.16b 478bc3d5698SJohn Baldwin ror w9,w9,#25 479c0855eaaSJohn Baldwin eor v5.16b,v21.16b,v22.16b 480bc3d5698SJohn Baldwin ror w10,w10,#25 481c0855eaaSJohn Baldwin eor v6.16b,v25.16b,v26.16b 482bc3d5698SJohn Baldwin ror w11,w11,#25 483c0855eaaSJohn Baldwin eor v7.16b,v29.16b,v30.16b 484bc3d5698SJohn Baldwin ror w12,w12,#25 485c0855eaaSJohn Baldwin ushr v17.4s,v4.4s,#25 486c0855eaaSJohn Baldwin ushr v21.4s,v5.4s,#25 487c0855eaaSJohn Baldwin ushr v25.4s,v6.4s,#25 488c0855eaaSJohn Baldwin ushr v29.4s,v7.4s,#25 489c0855eaaSJohn Baldwin sli v17.4s,v4.4s,#7 490c0855eaaSJohn Baldwin sli v21.4s,v5.4s,#7 491c0855eaaSJohn Baldwin sli v25.4s,v6.4s,#7 492c0855eaaSJohn Baldwin sli v29.4s,v7.4s,#7 493c0855eaaSJohn Baldwin add v16.4s,v16.4s,v21.4s 494bc3d5698SJohn Baldwin add w5,w5,w10 495c0855eaaSJohn Baldwin add v20.4s,v20.4s,v25.4s 496bc3d5698SJohn Baldwin add w6,w6,w11 497c0855eaaSJohn Baldwin add v24.4s,v24.4s,v29.4s 498bc3d5698SJohn Baldwin add w7,w7,w12 499c0855eaaSJohn Baldwin add 
v28.4s,v28.4s,v17.4s 500bc3d5698SJohn Baldwin add w8,w8,w9 501c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v16.16b 502bc3d5698SJohn Baldwin eor w21,w21,w5 503c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v20.16b 504bc3d5698SJohn Baldwin eor w17,w17,w6 505c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v24.16b 506bc3d5698SJohn Baldwin eor w19,w19,w7 507c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v28.16b 508bc3d5698SJohn Baldwin eor w20,w20,w8 509c0855eaaSJohn Baldwin rev32 v31.8h,v31.8h 510bc3d5698SJohn Baldwin ror w21,w21,#16 511c0855eaaSJohn Baldwin rev32 v19.8h,v19.8h 512bc3d5698SJohn Baldwin ror w17,w17,#16 513c0855eaaSJohn Baldwin rev32 v23.8h,v23.8h 514bc3d5698SJohn Baldwin ror w19,w19,#16 515c0855eaaSJohn Baldwin rev32 v27.8h,v27.8h 516bc3d5698SJohn Baldwin ror w20,w20,#16 517c0855eaaSJohn Baldwin add v26.4s,v26.4s,v31.4s 518bc3d5698SJohn Baldwin add w15,w15,w21 519c0855eaaSJohn Baldwin add v30.4s,v30.4s,v19.4s 520bc3d5698SJohn Baldwin add w16,w16,w17 521c0855eaaSJohn Baldwin add v18.4s,v18.4s,v23.4s 522bc3d5698SJohn Baldwin add w13,w13,w19 523c0855eaaSJohn Baldwin add v22.4s,v22.4s,v27.4s 524bc3d5698SJohn Baldwin add w14,w14,w20 525c0855eaaSJohn Baldwin eor v4.16b,v21.16b,v26.16b 526bc3d5698SJohn Baldwin eor w10,w10,w15 527c0855eaaSJohn Baldwin eor v5.16b,v25.16b,v30.16b 528bc3d5698SJohn Baldwin eor w11,w11,w16 529c0855eaaSJohn Baldwin eor v6.16b,v29.16b,v18.16b 530bc3d5698SJohn Baldwin eor w12,w12,w13 531c0855eaaSJohn Baldwin eor v7.16b,v17.16b,v22.16b 532bc3d5698SJohn Baldwin eor w9,w9,w14 533c0855eaaSJohn Baldwin ushr v21.4s,v4.4s,#20 534bc3d5698SJohn Baldwin ror w10,w10,#20 535c0855eaaSJohn Baldwin ushr v25.4s,v5.4s,#20 536bc3d5698SJohn Baldwin ror w11,w11,#20 537c0855eaaSJohn Baldwin ushr v29.4s,v6.4s,#20 538bc3d5698SJohn Baldwin ror w12,w12,#20 539c0855eaaSJohn Baldwin ushr v17.4s,v7.4s,#20 540bc3d5698SJohn Baldwin ror w9,w9,#20 541c0855eaaSJohn Baldwin sli v21.4s,v4.4s,#12 542bc3d5698SJohn Baldwin add w5,w5,w10 543c0855eaaSJohn Baldwin sli v25.4s,v5.4s,#12 
544bc3d5698SJohn Baldwin add w6,w6,w11 545c0855eaaSJohn Baldwin sli v29.4s,v6.4s,#12 546bc3d5698SJohn Baldwin add w7,w7,w12 547c0855eaaSJohn Baldwin sli v17.4s,v7.4s,#12 548bc3d5698SJohn Baldwin add w8,w8,w9 549c0855eaaSJohn Baldwin add v16.4s,v16.4s,v21.4s 550bc3d5698SJohn Baldwin eor w21,w21,w5 551c0855eaaSJohn Baldwin add v20.4s,v20.4s,v25.4s 552bc3d5698SJohn Baldwin eor w17,w17,w6 553c0855eaaSJohn Baldwin add v24.4s,v24.4s,v29.4s 554bc3d5698SJohn Baldwin eor w19,w19,w7 555c0855eaaSJohn Baldwin add v28.4s,v28.4s,v17.4s 556bc3d5698SJohn Baldwin eor w20,w20,w8 557c0855eaaSJohn Baldwin eor v4.16b,v31.16b,v16.16b 558bc3d5698SJohn Baldwin ror w21,w21,#24 559c0855eaaSJohn Baldwin eor v5.16b,v19.16b,v20.16b 560bc3d5698SJohn Baldwin ror w17,w17,#24 561c0855eaaSJohn Baldwin eor v6.16b,v23.16b,v24.16b 562bc3d5698SJohn Baldwin ror w19,w19,#24 563c0855eaaSJohn Baldwin eor v7.16b,v27.16b,v28.16b 564bc3d5698SJohn Baldwin ror w20,w20,#24 565c0855eaaSJohn Baldwin tbl v31.16b,{v4.16b},v9.16b 566bc3d5698SJohn Baldwin add w15,w15,w21 567c0855eaaSJohn Baldwin tbl v19.16b,{v5.16b},v9.16b 568bc3d5698SJohn Baldwin add w16,w16,w17 569c0855eaaSJohn Baldwin tbl v23.16b,{v6.16b},v9.16b 570bc3d5698SJohn Baldwin add w13,w13,w19 571c0855eaaSJohn Baldwin tbl v27.16b,{v7.16b},v9.16b 572bc3d5698SJohn Baldwin add w14,w14,w20 573c0855eaaSJohn Baldwin add v26.4s,v26.4s,v31.4s 574bc3d5698SJohn Baldwin eor w10,w10,w15 575c0855eaaSJohn Baldwin add v30.4s,v30.4s,v19.4s 576bc3d5698SJohn Baldwin eor w11,w11,w16 577c0855eaaSJohn Baldwin add v18.4s,v18.4s,v23.4s 578bc3d5698SJohn Baldwin eor w12,w12,w13 579c0855eaaSJohn Baldwin add v22.4s,v22.4s,v27.4s 580bc3d5698SJohn Baldwin eor w9,w9,w14 581c0855eaaSJohn Baldwin eor v4.16b,v21.16b,v26.16b 582bc3d5698SJohn Baldwin ror w10,w10,#25 583c0855eaaSJohn Baldwin eor v5.16b,v25.16b,v30.16b 584bc3d5698SJohn Baldwin ror w11,w11,#25 585c0855eaaSJohn Baldwin eor v6.16b,v29.16b,v18.16b 586bc3d5698SJohn Baldwin ror w12,w12,#25 587c0855eaaSJohn Baldwin eor 
v7.16b,v17.16b,v22.16b 588bc3d5698SJohn Baldwin ror w9,w9,#25 589c0855eaaSJohn Baldwin ushr v21.4s,v4.4s,#25 590c0855eaaSJohn Baldwin ushr v25.4s,v5.4s,#25 591c0855eaaSJohn Baldwin ushr v29.4s,v6.4s,#25 592c0855eaaSJohn Baldwin ushr v17.4s,v7.4s,#25 593c0855eaaSJohn Baldwin sli v21.4s,v4.4s,#7 594c0855eaaSJohn Baldwin sli v25.4s,v5.4s,#7 595c0855eaaSJohn Baldwin sli v29.4s,v6.4s,#7 596c0855eaaSJohn Baldwin sli v17.4s,v7.4s,#7 597bc3d5698SJohn Baldwin cbnz x4,.Loop_neon 598bc3d5698SJohn Baldwin 599c0855eaaSJohn Baldwin add v19.4s,v19.4s,v8.4s 600c0855eaaSJohn Baldwin 601c0855eaaSJohn Baldwin zip1 v4.4s,v16.4s,v20.4s // transpose data 602c0855eaaSJohn Baldwin zip1 v5.4s,v24.4s,v28.4s 603c0855eaaSJohn Baldwin zip2 v6.4s,v16.4s,v20.4s 604c0855eaaSJohn Baldwin zip2 v7.4s,v24.4s,v28.4s 605c0855eaaSJohn Baldwin zip1 v16.2d,v4.2d,v5.2d 606c0855eaaSJohn Baldwin zip2 v20.2d,v4.2d,v5.2d 607c0855eaaSJohn Baldwin zip1 v24.2d,v6.2d,v7.2d 608c0855eaaSJohn Baldwin zip2 v28.2d,v6.2d,v7.2d 609c0855eaaSJohn Baldwin 610c0855eaaSJohn Baldwin zip1 v4.4s,v17.4s,v21.4s 611c0855eaaSJohn Baldwin zip1 v5.4s,v25.4s,v29.4s 612c0855eaaSJohn Baldwin zip2 v6.4s,v17.4s,v21.4s 613c0855eaaSJohn Baldwin zip2 v7.4s,v25.4s,v29.4s 614c0855eaaSJohn Baldwin zip1 v17.2d,v4.2d,v5.2d 615c0855eaaSJohn Baldwin zip2 v21.2d,v4.2d,v5.2d 616c0855eaaSJohn Baldwin zip1 v25.2d,v6.2d,v7.2d 617c0855eaaSJohn Baldwin zip2 v29.2d,v6.2d,v7.2d 618c0855eaaSJohn Baldwin 619c0855eaaSJohn Baldwin zip1 v4.4s,v18.4s,v22.4s 620bc3d5698SJohn Baldwin add w5,w5,w22 // accumulate key block 621c0855eaaSJohn Baldwin zip1 v5.4s,v26.4s,v30.4s 622bc3d5698SJohn Baldwin add x6,x6,x22,lsr#32 623c0855eaaSJohn Baldwin zip2 v6.4s,v18.4s,v22.4s 624bc3d5698SJohn Baldwin add w7,w7,w23 625c0855eaaSJohn Baldwin zip2 v7.4s,v26.4s,v30.4s 626bc3d5698SJohn Baldwin add x8,x8,x23,lsr#32 627c0855eaaSJohn Baldwin zip1 v18.2d,v4.2d,v5.2d 628bc3d5698SJohn Baldwin add w9,w9,w24 629c0855eaaSJohn Baldwin zip2 v22.2d,v4.2d,v5.2d 630bc3d5698SJohn Baldwin add 
x10,x10,x24,lsr#32 631c0855eaaSJohn Baldwin zip1 v26.2d,v6.2d,v7.2d 632bc3d5698SJohn Baldwin add w11,w11,w25 633c0855eaaSJohn Baldwin zip2 v30.2d,v6.2d,v7.2d 634bc3d5698SJohn Baldwin add x12,x12,x25,lsr#32 635c0855eaaSJohn Baldwin 636c0855eaaSJohn Baldwin zip1 v4.4s,v19.4s,v23.4s 637bc3d5698SJohn Baldwin add w13,w13,w26 638c0855eaaSJohn Baldwin zip1 v5.4s,v27.4s,v31.4s 639bc3d5698SJohn Baldwin add x14,x14,x26,lsr#32 640c0855eaaSJohn Baldwin zip2 v6.4s,v19.4s,v23.4s 641bc3d5698SJohn Baldwin add w15,w15,w27 642c0855eaaSJohn Baldwin zip2 v7.4s,v27.4s,v31.4s 643bc3d5698SJohn Baldwin add x16,x16,x27,lsr#32 644c0855eaaSJohn Baldwin zip1 v19.2d,v4.2d,v5.2d 645bc3d5698SJohn Baldwin add w17,w17,w28 646c0855eaaSJohn Baldwin zip2 v23.2d,v4.2d,v5.2d 647bc3d5698SJohn Baldwin add x19,x19,x28,lsr#32 648c0855eaaSJohn Baldwin zip1 v27.2d,v6.2d,v7.2d 649bc3d5698SJohn Baldwin add w20,w20,w30 650c0855eaaSJohn Baldwin zip2 v31.2d,v6.2d,v7.2d 651bc3d5698SJohn Baldwin add x21,x21,x30,lsr#32 652bc3d5698SJohn Baldwin 653bc3d5698SJohn Baldwin b.lo .Ltail_neon 654bc3d5698SJohn Baldwin 655bc3d5698SJohn Baldwin add x5,x5,x6,lsl#32 // pack 656bc3d5698SJohn Baldwin add x7,x7,x8,lsl#32 657bc3d5698SJohn Baldwin ldp x6,x8,[x1,#0] // load input 658c0855eaaSJohn Baldwin add v16.4s,v16.4s,v0.4s // accumulate key block 659bc3d5698SJohn Baldwin add x9,x9,x10,lsl#32 660bc3d5698SJohn Baldwin add x11,x11,x12,lsl#32 661bc3d5698SJohn Baldwin ldp x10,x12,[x1,#16] 662c0855eaaSJohn Baldwin add v17.4s,v17.4s,v1.4s 663bc3d5698SJohn Baldwin add x13,x13,x14,lsl#32 664bc3d5698SJohn Baldwin add x15,x15,x16,lsl#32 665bc3d5698SJohn Baldwin ldp x14,x16,[x1,#32] 666c0855eaaSJohn Baldwin add v18.4s,v18.4s,v2.4s 667bc3d5698SJohn Baldwin add x17,x17,x19,lsl#32 668bc3d5698SJohn Baldwin add x20,x20,x21,lsl#32 669bc3d5698SJohn Baldwin ldp x19,x21,[x1,#48] 670c0855eaaSJohn Baldwin add v19.4s,v19.4s,v3.4s 671bc3d5698SJohn Baldwin add x1,x1,#64 672c0855eaaSJohn Baldwin#ifdef __AARCH64EB__ 673bc3d5698SJohn Baldwin rev x5,x5 
674bc3d5698SJohn Baldwin rev x7,x7 675bc3d5698SJohn Baldwin rev x9,x9 676bc3d5698SJohn Baldwin rev x11,x11 677bc3d5698SJohn Baldwin rev x13,x13 678bc3d5698SJohn Baldwin rev x15,x15 679bc3d5698SJohn Baldwin rev x17,x17 680bc3d5698SJohn Baldwin rev x20,x20 681bc3d5698SJohn Baldwin#endif 682c0855eaaSJohn Baldwin ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 683bc3d5698SJohn Baldwin eor x5,x5,x6 684c0855eaaSJohn Baldwin add v20.4s,v20.4s,v0.4s 685bc3d5698SJohn Baldwin eor x7,x7,x8 686c0855eaaSJohn Baldwin add v21.4s,v21.4s,v1.4s 687bc3d5698SJohn Baldwin eor x9,x9,x10 688c0855eaaSJohn Baldwin add v22.4s,v22.4s,v2.4s 689bc3d5698SJohn Baldwin eor x11,x11,x12 690c0855eaaSJohn Baldwin add v23.4s,v23.4s,v3.4s 691bc3d5698SJohn Baldwin eor x13,x13,x14 692c0855eaaSJohn Baldwin eor v16.16b,v16.16b,v4.16b 693c0855eaaSJohn Baldwin movi v4.4s,#5 694bc3d5698SJohn Baldwin eor x15,x15,x16 695c0855eaaSJohn Baldwin eor v17.16b,v17.16b,v5.16b 696bc3d5698SJohn Baldwin eor x17,x17,x19 697c0855eaaSJohn Baldwin eor v18.16b,v18.16b,v6.16b 698bc3d5698SJohn Baldwin eor x20,x20,x21 699c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v7.16b 700c0855eaaSJohn Baldwin add v8.4s,v8.4s,v4.4s // += 5 701c0855eaaSJohn Baldwin ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 702bc3d5698SJohn Baldwin 703bc3d5698SJohn Baldwin stp x5,x7,[x0,#0] // store output 704c0855eaaSJohn Baldwin add x28,x28,#5 // increment counter 705bc3d5698SJohn Baldwin stp x9,x11,[x0,#16] 706bc3d5698SJohn Baldwin stp x13,x15,[x0,#32] 707bc3d5698SJohn Baldwin stp x17,x20,[x0,#48] 708bc3d5698SJohn Baldwin add x0,x0,#64 709bc3d5698SJohn Baldwin 710bc3d5698SJohn Baldwin st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 711c0855eaaSJohn Baldwin add v24.4s,v24.4s,v0.4s 712c0855eaaSJohn Baldwin add v25.4s,v25.4s,v1.4s 713c0855eaaSJohn Baldwin add v26.4s,v26.4s,v2.4s 714c0855eaaSJohn Baldwin add v27.4s,v27.4s,v3.4s 715c0855eaaSJohn Baldwin ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 716c0855eaaSJohn Baldwin 717c0855eaaSJohn Baldwin eor 
v20.16b,v20.16b,v4.16b 718c0855eaaSJohn Baldwin eor v21.16b,v21.16b,v5.16b 719c0855eaaSJohn Baldwin eor v22.16b,v22.16b,v6.16b 720c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v7.16b 721c0855eaaSJohn Baldwin st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 722c0855eaaSJohn Baldwin add v28.4s,v28.4s,v0.4s 723c0855eaaSJohn Baldwin add v29.4s,v29.4s,v1.4s 724c0855eaaSJohn Baldwin add v30.4s,v30.4s,v2.4s 725c0855eaaSJohn Baldwin add v31.4s,v31.4s,v3.4s 726c0855eaaSJohn Baldwin ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 727c0855eaaSJohn Baldwin 728c0855eaaSJohn Baldwin eor v24.16b,v24.16b,v16.16b 729c0855eaaSJohn Baldwin eor v25.16b,v25.16b,v17.16b 730c0855eaaSJohn Baldwin eor v26.16b,v26.16b,v18.16b 731c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v19.16b 732c0855eaaSJohn Baldwin st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 733c0855eaaSJohn Baldwin 734c0855eaaSJohn Baldwin eor v28.16b,v28.16b,v20.16b 735c0855eaaSJohn Baldwin eor v29.16b,v29.16b,v21.16b 736c0855eaaSJohn Baldwin eor v30.16b,v30.16b,v22.16b 737c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v23.16b 738c0855eaaSJohn Baldwin st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 739bc3d5698SJohn Baldwin 740bc3d5698SJohn Baldwin b.hi .Loop_outer_neon 741bc3d5698SJohn Baldwin 742c0855eaaSJohn Baldwin ldp d8,d9,[sp] // meet ABI requirements 743c0855eaaSJohn Baldwin 744bc3d5698SJohn Baldwin ldp x19,x20,[x29,#16] 745bc3d5698SJohn Baldwin add sp,sp,#64 746bc3d5698SJohn Baldwin ldp x21,x22,[x29,#32] 747bc3d5698SJohn Baldwin ldp x23,x24,[x29,#48] 748bc3d5698SJohn Baldwin ldp x25,x26,[x29,#64] 749bc3d5698SJohn Baldwin ldp x27,x28,[x29,#80] 750bc3d5698SJohn Baldwin ldp x29,x30,[sp],#96 751*bd9588bcSAndrew Turner AARCH64_VALIDATE_LINK_REGISTER 752bc3d5698SJohn Baldwin ret 753bc3d5698SJohn Baldwin 754c0855eaaSJohn Baldwin.align 4 755bc3d5698SJohn Baldwin.Ltail_neon: 756c0855eaaSJohn Baldwin add x2,x2,#320 757c0855eaaSJohn Baldwin ldp d8,d9,[sp] // meet ABI requirements 758bc3d5698SJohn Baldwin cmp x2,#64 759bc3d5698SJohn Baldwin b.lo 
.Less_than_64 760bc3d5698SJohn Baldwin 761bc3d5698SJohn Baldwin add x5,x5,x6,lsl#32 // pack 762bc3d5698SJohn Baldwin add x7,x7,x8,lsl#32 763bc3d5698SJohn Baldwin ldp x6,x8,[x1,#0] // load input 764bc3d5698SJohn Baldwin add x9,x9,x10,lsl#32 765bc3d5698SJohn Baldwin add x11,x11,x12,lsl#32 766bc3d5698SJohn Baldwin ldp x10,x12,[x1,#16] 767bc3d5698SJohn Baldwin add x13,x13,x14,lsl#32 768bc3d5698SJohn Baldwin add x15,x15,x16,lsl#32 769bc3d5698SJohn Baldwin ldp x14,x16,[x1,#32] 770bc3d5698SJohn Baldwin add x17,x17,x19,lsl#32 771bc3d5698SJohn Baldwin add x20,x20,x21,lsl#32 772bc3d5698SJohn Baldwin ldp x19,x21,[x1,#48] 773bc3d5698SJohn Baldwin add x1,x1,#64 774c0855eaaSJohn Baldwin#ifdef __AARCH64EB__ 775bc3d5698SJohn Baldwin rev x5,x5 776bc3d5698SJohn Baldwin rev x7,x7 777bc3d5698SJohn Baldwin rev x9,x9 778bc3d5698SJohn Baldwin rev x11,x11 779bc3d5698SJohn Baldwin rev x13,x13 780bc3d5698SJohn Baldwin rev x15,x15 781bc3d5698SJohn Baldwin rev x17,x17 782bc3d5698SJohn Baldwin rev x20,x20 783bc3d5698SJohn Baldwin#endif 784bc3d5698SJohn Baldwin eor x5,x5,x6 785bc3d5698SJohn Baldwin eor x7,x7,x8 786bc3d5698SJohn Baldwin eor x9,x9,x10 787bc3d5698SJohn Baldwin eor x11,x11,x12 788bc3d5698SJohn Baldwin eor x13,x13,x14 789bc3d5698SJohn Baldwin eor x15,x15,x16 790bc3d5698SJohn Baldwin eor x17,x17,x19 791bc3d5698SJohn Baldwin eor x20,x20,x21 792bc3d5698SJohn Baldwin 793bc3d5698SJohn Baldwin stp x5,x7,[x0,#0] // store output 794c0855eaaSJohn Baldwin add v16.4s,v16.4s,v0.4s // accumulate key block 795bc3d5698SJohn Baldwin stp x9,x11,[x0,#16] 796c0855eaaSJohn Baldwin add v17.4s,v17.4s,v1.4s 797bc3d5698SJohn Baldwin stp x13,x15,[x0,#32] 798c0855eaaSJohn Baldwin add v18.4s,v18.4s,v2.4s 799bc3d5698SJohn Baldwin stp x17,x20,[x0,#48] 800c0855eaaSJohn Baldwin add v19.4s,v19.4s,v3.4s 801bc3d5698SJohn Baldwin add x0,x0,#64 802bc3d5698SJohn Baldwin b.eq .Ldone_neon 803bc3d5698SJohn Baldwin sub x2,x2,#64 804bc3d5698SJohn Baldwin cmp x2,#64 805c0855eaaSJohn Baldwin b.lo .Last_neon 806bc3d5698SJohn 
Baldwin 807c0855eaaSJohn Baldwin ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 808c0855eaaSJohn Baldwin eor v16.16b,v16.16b,v4.16b 809c0855eaaSJohn Baldwin eor v17.16b,v17.16b,v5.16b 810c0855eaaSJohn Baldwin eor v18.16b,v18.16b,v6.16b 811c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v7.16b 812c0855eaaSJohn Baldwin st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 813bc3d5698SJohn Baldwin b.eq .Ldone_neon 814c0855eaaSJohn Baldwin 815c0855eaaSJohn Baldwin add v16.4s,v20.4s,v0.4s 816c0855eaaSJohn Baldwin add v17.4s,v21.4s,v1.4s 817bc3d5698SJohn Baldwin sub x2,x2,#64 818c0855eaaSJohn Baldwin add v18.4s,v22.4s,v2.4s 819bc3d5698SJohn Baldwin cmp x2,#64 820c0855eaaSJohn Baldwin add v19.4s,v23.4s,v3.4s 821c0855eaaSJohn Baldwin b.lo .Last_neon 822bc3d5698SJohn Baldwin 823c0855eaaSJohn Baldwin ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 824c0855eaaSJohn Baldwin eor v20.16b,v16.16b,v4.16b 825c0855eaaSJohn Baldwin eor v21.16b,v17.16b,v5.16b 826c0855eaaSJohn Baldwin eor v22.16b,v18.16b,v6.16b 827c0855eaaSJohn Baldwin eor v23.16b,v19.16b,v7.16b 828c0855eaaSJohn Baldwin st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 829bc3d5698SJohn Baldwin b.eq .Ldone_neon 830c0855eaaSJohn Baldwin 831c0855eaaSJohn Baldwin add v16.4s,v24.4s,v0.4s 832c0855eaaSJohn Baldwin add v17.4s,v25.4s,v1.4s 833c0855eaaSJohn Baldwin sub x2,x2,#64 834c0855eaaSJohn Baldwin add v18.4s,v26.4s,v2.4s 835c0855eaaSJohn Baldwin cmp x2,#64 836c0855eaaSJohn Baldwin add v19.4s,v27.4s,v3.4s 837c0855eaaSJohn Baldwin b.lo .Last_neon 838c0855eaaSJohn Baldwin 839c0855eaaSJohn Baldwin ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 840c0855eaaSJohn Baldwin eor v24.16b,v16.16b,v4.16b 841c0855eaaSJohn Baldwin eor v25.16b,v17.16b,v5.16b 842c0855eaaSJohn Baldwin eor v26.16b,v18.16b,v6.16b 843c0855eaaSJohn Baldwin eor v27.16b,v19.16b,v7.16b 844c0855eaaSJohn Baldwin st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 845c0855eaaSJohn Baldwin b.eq .Ldone_neon 846c0855eaaSJohn Baldwin 847c0855eaaSJohn Baldwin add v16.4s,v28.4s,v0.4s 848c0855eaaSJohn 
Baldwin add v17.4s,v29.4s,v1.4s 849c0855eaaSJohn Baldwin add v18.4s,v30.4s,v2.4s 850c0855eaaSJohn Baldwin add v19.4s,v31.4s,v3.4s 851bc3d5698SJohn Baldwin sub x2,x2,#64 852bc3d5698SJohn Baldwin 853bc3d5698SJohn Baldwin.Last_neon: 854c0855eaaSJohn Baldwin st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] 855c0855eaaSJohn Baldwin 856bc3d5698SJohn Baldwin sub x0,x0,#1 857bc3d5698SJohn Baldwin add x1,x1,x2 858bc3d5698SJohn Baldwin add x0,x0,x2 859bc3d5698SJohn Baldwin add x4,sp,x2 860bc3d5698SJohn Baldwin neg x2,x2 861bc3d5698SJohn Baldwin 862bc3d5698SJohn Baldwin.Loop_tail_neon: 863bc3d5698SJohn Baldwin ldrb w10,[x1,x2] 864bc3d5698SJohn Baldwin ldrb w11,[x4,x2] 865bc3d5698SJohn Baldwin add x2,x2,#1 866bc3d5698SJohn Baldwin eor w10,w10,w11 867bc3d5698SJohn Baldwin strb w10,[x0,x2] 868bc3d5698SJohn Baldwin cbnz x2,.Loop_tail_neon 869bc3d5698SJohn Baldwin 870bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#0] 871bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#16] 872bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#32] 873bc3d5698SJohn Baldwin stp xzr,xzr,[sp,#48] 874bc3d5698SJohn Baldwin 875bc3d5698SJohn Baldwin.Ldone_neon: 876bc3d5698SJohn Baldwin ldp x19,x20,[x29,#16] 877bc3d5698SJohn Baldwin add sp,sp,#64 878bc3d5698SJohn Baldwin ldp x21,x22,[x29,#32] 879bc3d5698SJohn Baldwin ldp x23,x24,[x29,#48] 880bc3d5698SJohn Baldwin ldp x25,x26,[x29,#64] 881bc3d5698SJohn Baldwin ldp x27,x28,[x29,#80] 882bc3d5698SJohn Baldwin ldp x29,x30,[sp],#96 883*bd9588bcSAndrew Turner AARCH64_VALIDATE_LINK_REGISTER 884bc3d5698SJohn Baldwin ret 885bc3d5698SJohn Baldwin.size ChaCha20_neon,.-ChaCha20_neon 886bc3d5698SJohn Baldwin.type ChaCha20_512_neon,%function 887bc3d5698SJohn Baldwin.align 5 888bc3d5698SJohn BaldwinChaCha20_512_neon: 889*bd9588bcSAndrew Turner AARCH64_SIGN_LINK_REGISTER 890bc3d5698SJohn Baldwin stp x29,x30,[sp,#-96]! 
891bc3d5698SJohn Baldwin add x29,sp,#0 892bc3d5698SJohn Baldwin 893bc3d5698SJohn Baldwin adr x5,.Lsigma 894bc3d5698SJohn Baldwin stp x19,x20,[sp,#16] 895bc3d5698SJohn Baldwin stp x21,x22,[sp,#32] 896bc3d5698SJohn Baldwin stp x23,x24,[sp,#48] 897bc3d5698SJohn Baldwin stp x25,x26,[sp,#64] 898bc3d5698SJohn Baldwin stp x27,x28,[sp,#80] 899bc3d5698SJohn Baldwin 900bc3d5698SJohn Baldwin.L512_or_more_neon: 901bc3d5698SJohn Baldwin sub sp,sp,#128+64 902bc3d5698SJohn Baldwin 903c0855eaaSJohn Baldwin eor v7.16b,v7.16b,v7.16b 904bc3d5698SJohn Baldwin ldp x22,x23,[x5] // load sigma 905c0855eaaSJohn Baldwin ld1 {v0.4s},[x5],#16 906bc3d5698SJohn Baldwin ldp x24,x25,[x3] // load key 907bc3d5698SJohn Baldwin ldp x26,x27,[x3,#16] 908c0855eaaSJohn Baldwin ld1 {v1.4s,v2.4s},[x3] 909bc3d5698SJohn Baldwin ldp x28,x30,[x4] // load counter 910c0855eaaSJohn Baldwin ld1 {v3.4s},[x4] 911c0855eaaSJohn Baldwin ld1 {v7.s}[0],[x5] 912c0855eaaSJohn Baldwin add x3,x5,#16 // .Lrot24 913c0855eaaSJohn Baldwin#ifdef __AARCH64EB__ 914c0855eaaSJohn Baldwin rev64 v0.4s,v0.4s 915bc3d5698SJohn Baldwin ror x24,x24,#32 916bc3d5698SJohn Baldwin ror x25,x25,#32 917bc3d5698SJohn Baldwin ror x26,x26,#32 918bc3d5698SJohn Baldwin ror x27,x27,#32 919bc3d5698SJohn Baldwin ror x28,x28,#32 920bc3d5698SJohn Baldwin ror x30,x30,#32 921bc3d5698SJohn Baldwin#endif 922c0855eaaSJohn Baldwin add v3.4s,v3.4s,v7.4s // += 1 923c0855eaaSJohn Baldwin stp q0,q1,[sp,#0] // off-load key block, invariant part 924c0855eaaSJohn Baldwin add v3.4s,v3.4s,v7.4s // not typo 925c0855eaaSJohn Baldwin str q2,[sp,#32] 926c0855eaaSJohn Baldwin add v4.4s,v3.4s,v7.4s 927c0855eaaSJohn Baldwin add v5.4s,v4.4s,v7.4s 928c0855eaaSJohn Baldwin add v6.4s,v5.4s,v7.4s 929c0855eaaSJohn Baldwin shl v7.4s,v7.4s,#2 // 1 -> 4 930bc3d5698SJohn Baldwin 931bc3d5698SJohn Baldwin stp d8,d9,[sp,#128+0] // meet ABI requirements 932bc3d5698SJohn Baldwin stp d10,d11,[sp,#128+16] 933bc3d5698SJohn Baldwin stp d12,d13,[sp,#128+32] 934bc3d5698SJohn Baldwin stp 
d14,d15,[sp,#128+48] 935bc3d5698SJohn Baldwin 936bc3d5698SJohn Baldwin sub x2,x2,#512 // not typo 937bc3d5698SJohn Baldwin 938bc3d5698SJohn Baldwin.Loop_outer_512_neon: 939c0855eaaSJohn Baldwin mov v8.16b,v0.16b 940c0855eaaSJohn Baldwin mov v12.16b,v0.16b 941c0855eaaSJohn Baldwin mov v16.16b,v0.16b 942c0855eaaSJohn Baldwin mov v20.16b,v0.16b 943c0855eaaSJohn Baldwin mov v24.16b,v0.16b 944c0855eaaSJohn Baldwin mov v28.16b,v0.16b 945c0855eaaSJohn Baldwin mov v9.16b,v1.16b 946bc3d5698SJohn Baldwin mov w5,w22 // unpack key block 947c0855eaaSJohn Baldwin mov v13.16b,v1.16b 948bc3d5698SJohn Baldwin lsr x6,x22,#32 949c0855eaaSJohn Baldwin mov v17.16b,v1.16b 950bc3d5698SJohn Baldwin mov w7,w23 951c0855eaaSJohn Baldwin mov v21.16b,v1.16b 952bc3d5698SJohn Baldwin lsr x8,x23,#32 953c0855eaaSJohn Baldwin mov v25.16b,v1.16b 954bc3d5698SJohn Baldwin mov w9,w24 955c0855eaaSJohn Baldwin mov v29.16b,v1.16b 956bc3d5698SJohn Baldwin lsr x10,x24,#32 957c0855eaaSJohn Baldwin mov v11.16b,v3.16b 958bc3d5698SJohn Baldwin mov w11,w25 959c0855eaaSJohn Baldwin mov v15.16b,v4.16b 960bc3d5698SJohn Baldwin lsr x12,x25,#32 961c0855eaaSJohn Baldwin mov v19.16b,v5.16b 962bc3d5698SJohn Baldwin mov w13,w26 963c0855eaaSJohn Baldwin mov v23.16b,v6.16b 964bc3d5698SJohn Baldwin lsr x14,x26,#32 965c0855eaaSJohn Baldwin mov v10.16b,v2.16b 966bc3d5698SJohn Baldwin mov w15,w27 967c0855eaaSJohn Baldwin mov v14.16b,v2.16b 968bc3d5698SJohn Baldwin lsr x16,x27,#32 969c0855eaaSJohn Baldwin add v27.4s,v11.4s,v7.4s // +4 970bc3d5698SJohn Baldwin mov w17,w28 971c0855eaaSJohn Baldwin add v31.4s,v15.4s,v7.4s // +4 972bc3d5698SJohn Baldwin lsr x19,x28,#32 973c0855eaaSJohn Baldwin mov v18.16b,v2.16b 974bc3d5698SJohn Baldwin mov w20,w30 975c0855eaaSJohn Baldwin mov v22.16b,v2.16b 976bc3d5698SJohn Baldwin lsr x21,x30,#32 977c0855eaaSJohn Baldwin mov v26.16b,v2.16b 978c0855eaaSJohn Baldwin stp q3,q4,[sp,#48] // off-load key block, variable part 979c0855eaaSJohn Baldwin mov v30.16b,v2.16b 980c0855eaaSJohn Baldwin stp 
q5,q6,[sp,#80] 981bc3d5698SJohn Baldwin 982bc3d5698SJohn Baldwin mov x4,#5 983c0855eaaSJohn Baldwin ld1 {v6.4s},[x3] 984bc3d5698SJohn Baldwin subs x2,x2,#512 985bc3d5698SJohn Baldwin.Loop_upper_neon: 986bc3d5698SJohn Baldwin sub x4,x4,#1 987bc3d5698SJohn Baldwin add v8.4s,v8.4s,v9.4s 988c0855eaaSJohn Baldwin add w5,w5,w9 989bc3d5698SJohn Baldwin add v12.4s,v12.4s,v13.4s 990c0855eaaSJohn Baldwin add w6,w6,w10 991bc3d5698SJohn Baldwin add v16.4s,v16.4s,v17.4s 992c0855eaaSJohn Baldwin add w7,w7,w11 993bc3d5698SJohn Baldwin add v20.4s,v20.4s,v21.4s 994c0855eaaSJohn Baldwin add w8,w8,w12 995c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 996c0855eaaSJohn Baldwin eor w17,w17,w5 997c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 998bc3d5698SJohn Baldwin eor w19,w19,w6 999bc3d5698SJohn Baldwin eor v11.16b,v11.16b,v8.16b 1000bc3d5698SJohn Baldwin eor w20,w20,w7 1001c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v12.16b 1002bc3d5698SJohn Baldwin eor w21,w21,w8 1003c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1004bc3d5698SJohn Baldwin ror w17,w17,#16 1005c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1006bc3d5698SJohn Baldwin ror w19,w19,#16 1007c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1008bc3d5698SJohn Baldwin ror w20,w20,#16 1009c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1010c0855eaaSJohn Baldwin ror w21,w21,#16 1011c0855eaaSJohn Baldwin rev32 v11.8h,v11.8h 1012c0855eaaSJohn Baldwin add w13,w13,w17 1013c0855eaaSJohn Baldwin rev32 v15.8h,v15.8h 1014c0855eaaSJohn Baldwin add w14,w14,w19 1015c0855eaaSJohn Baldwin rev32 v19.8h,v19.8h 1016c0855eaaSJohn Baldwin add w15,w15,w20 1017c0855eaaSJohn Baldwin rev32 v23.8h,v23.8h 1018c0855eaaSJohn Baldwin add w16,w16,w21 1019c0855eaaSJohn Baldwin rev32 v27.8h,v27.8h 1020c0855eaaSJohn Baldwin eor w9,w9,w13 1021c0855eaaSJohn Baldwin rev32 v31.8h,v31.8h 1022c0855eaaSJohn Baldwin eor w10,w10,w14 1023bc3d5698SJohn Baldwin add v10.4s,v10.4s,v11.4s 1024c0855eaaSJohn Baldwin eor w11,w11,w15 1025bc3d5698SJohn Baldwin add 
v14.4s,v14.4s,v15.4s 1026c0855eaaSJohn Baldwin eor w12,w12,w16 1027bc3d5698SJohn Baldwin add v18.4s,v18.4s,v19.4s 1028bc3d5698SJohn Baldwin ror w9,w9,#20 1029c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1030c0855eaaSJohn Baldwin ror w10,w10,#20 1031c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 1032c0855eaaSJohn Baldwin ror w11,w11,#20 1033c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1034c0855eaaSJohn Baldwin ror w12,w12,#20 1035c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1036c0855eaaSJohn Baldwin add w5,w5,w9 1037c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1038c0855eaaSJohn Baldwin add w6,w6,w10 1039c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1040c0855eaaSJohn Baldwin add w7,w7,w11 1041c0855eaaSJohn Baldwin eor v3.16b,v21.16b,v22.16b 1042c0855eaaSJohn Baldwin add w8,w8,w12 1043c0855eaaSJohn Baldwin eor v4.16b,v25.16b,v26.16b 1044c0855eaaSJohn Baldwin eor w17,w17,w5 1045c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1046c0855eaaSJohn Baldwin eor w19,w19,w6 1047c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#20 1048c0855eaaSJohn Baldwin eor w20,w20,w7 1049c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#20 1050c0855eaaSJohn Baldwin eor w21,w21,w8 1051c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#20 1052bc3d5698SJohn Baldwin ror w17,w17,#24 1053c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#20 1054bc3d5698SJohn Baldwin ror w19,w19,#24 1055c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#20 1056bc3d5698SJohn Baldwin ror w20,w20,#24 1057c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#20 1058c0855eaaSJohn Baldwin ror w21,w21,#24 1059c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#12 1060c0855eaaSJohn Baldwin add w13,w13,w17 1061c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#12 1062c0855eaaSJohn Baldwin add w14,w14,w19 1063c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#12 1064c0855eaaSJohn Baldwin add w15,w15,w20 1065c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#12 1066c0855eaaSJohn Baldwin add w16,w16,w21 1067c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#12 1068c0855eaaSJohn Baldwin eor w9,w9,w13 1069c0855eaaSJohn Baldwin sli 
v29.4s,v5.4s,#12 1070c0855eaaSJohn Baldwin eor w10,w10,w14 1071c0855eaaSJohn Baldwin add v8.4s,v8.4s,v9.4s 1072c0855eaaSJohn Baldwin eor w11,w11,w15 1073c0855eaaSJohn Baldwin add v12.4s,v12.4s,v13.4s 1074c0855eaaSJohn Baldwin eor w12,w12,w16 1075c0855eaaSJohn Baldwin add v16.4s,v16.4s,v17.4s 1076bc3d5698SJohn Baldwin ror w9,w9,#25 1077c0855eaaSJohn Baldwin add v20.4s,v20.4s,v21.4s 1078c0855eaaSJohn Baldwin ror w10,w10,#25 1079c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 1080c0855eaaSJohn Baldwin ror w11,w11,#25 1081c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 1082c0855eaaSJohn Baldwin ror w12,w12,#25 1083c0855eaaSJohn Baldwin eor v11.16b,v11.16b,v8.16b 1084c0855eaaSJohn Baldwin add w5,w5,w10 1085c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v12.16b 1086c0855eaaSJohn Baldwin add w6,w6,w11 1087c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1088c0855eaaSJohn Baldwin add w7,w7,w12 1089c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1090c0855eaaSJohn Baldwin add w8,w8,w9 1091c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1092c0855eaaSJohn Baldwin eor w21,w21,w5 1093c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1094c0855eaaSJohn Baldwin eor w17,w17,w6 1095c0855eaaSJohn Baldwin tbl v11.16b,{v11.16b},v6.16b 1096c0855eaaSJohn Baldwin eor w19,w19,w7 1097c0855eaaSJohn Baldwin tbl v15.16b,{v15.16b},v6.16b 1098c0855eaaSJohn Baldwin eor w20,w20,w8 1099c0855eaaSJohn Baldwin tbl v19.16b,{v19.16b},v6.16b 1100c0855eaaSJohn Baldwin ror w21,w21,#16 1101c0855eaaSJohn Baldwin tbl v23.16b,{v23.16b},v6.16b 1102c0855eaaSJohn Baldwin ror w17,w17,#16 1103c0855eaaSJohn Baldwin tbl v27.16b,{v27.16b},v6.16b 1104c0855eaaSJohn Baldwin ror w19,w19,#16 1105c0855eaaSJohn Baldwin tbl v31.16b,{v31.16b},v6.16b 1106c0855eaaSJohn Baldwin ror w20,w20,#16 1107c0855eaaSJohn Baldwin add v10.4s,v10.4s,v11.4s 1108c0855eaaSJohn Baldwin add w15,w15,w21 1109c0855eaaSJohn Baldwin add v14.4s,v14.4s,v15.4s 1110c0855eaaSJohn Baldwin add w16,w16,w17 1111c0855eaaSJohn Baldwin add v18.4s,v18.4s,v19.4s 
1112c0855eaaSJohn Baldwin add w13,w13,w19 1113c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1114c0855eaaSJohn Baldwin add w14,w14,w20 1115c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 1116c0855eaaSJohn Baldwin eor w10,w10,w15 1117c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1118c0855eaaSJohn Baldwin eor w11,w11,w16 1119c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1120c0855eaaSJohn Baldwin eor w12,w12,w13 1121c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1122c0855eaaSJohn Baldwin eor w9,w9,w14 1123c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1124c0855eaaSJohn Baldwin ror w10,w10,#20 1125c0855eaaSJohn Baldwin eor v3.16b,v21.16b,v22.16b 1126c0855eaaSJohn Baldwin ror w11,w11,#20 1127c0855eaaSJohn Baldwin eor v4.16b,v25.16b,v26.16b 1128c0855eaaSJohn Baldwin ror w12,w12,#20 1129c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1130c0855eaaSJohn Baldwin ror w9,w9,#20 1131c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#25 1132c0855eaaSJohn Baldwin add w5,w5,w10 1133c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#25 1134c0855eaaSJohn Baldwin add w6,w6,w11 1135c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#25 1136c0855eaaSJohn Baldwin add w7,w7,w12 1137c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#25 1138c0855eaaSJohn Baldwin add w8,w8,w9 1139c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#25 1140c0855eaaSJohn Baldwin eor w21,w21,w5 1141c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#25 1142c0855eaaSJohn Baldwin eor w17,w17,w6 1143c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#7 1144c0855eaaSJohn Baldwin eor w19,w19,w7 1145c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#7 1146c0855eaaSJohn Baldwin eor w20,w20,w8 1147c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#7 1148c0855eaaSJohn Baldwin ror w21,w21,#24 1149c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#7 1150c0855eaaSJohn Baldwin ror w17,w17,#24 1151c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#7 1152c0855eaaSJohn Baldwin ror w19,w19,#24 1153c0855eaaSJohn Baldwin sli v29.4s,v5.4s,#7 1154c0855eaaSJohn Baldwin ror w20,w20,#24 1155c0855eaaSJohn Baldwin ext v10.16b,v10.16b,v10.16b,#8 
1156c0855eaaSJohn Baldwin add w15,w15,w21 1157c0855eaaSJohn Baldwin ext v14.16b,v14.16b,v14.16b,#8 1158c0855eaaSJohn Baldwin add w16,w16,w17 1159c0855eaaSJohn Baldwin ext v18.16b,v18.16b,v18.16b,#8 1160c0855eaaSJohn Baldwin add w13,w13,w19 1161c0855eaaSJohn Baldwin ext v22.16b,v22.16b,v22.16b,#8 1162c0855eaaSJohn Baldwin add w14,w14,w20 1163c0855eaaSJohn Baldwin ext v26.16b,v26.16b,v26.16b,#8 1164c0855eaaSJohn Baldwin eor w10,w10,w15 1165c0855eaaSJohn Baldwin ext v30.16b,v30.16b,v30.16b,#8 1166c0855eaaSJohn Baldwin eor w11,w11,w16 1167bc3d5698SJohn Baldwin ext v11.16b,v11.16b,v11.16b,#12 1168c0855eaaSJohn Baldwin eor w12,w12,w13 1169bc3d5698SJohn Baldwin ext v15.16b,v15.16b,v15.16b,#12 1170c0855eaaSJohn Baldwin eor w9,w9,w14 1171bc3d5698SJohn Baldwin ext v19.16b,v19.16b,v19.16b,#12 1172c0855eaaSJohn Baldwin ror w10,w10,#25 1173bc3d5698SJohn Baldwin ext v23.16b,v23.16b,v23.16b,#12 1174c0855eaaSJohn Baldwin ror w11,w11,#25 1175c0855eaaSJohn Baldwin ext v27.16b,v27.16b,v27.16b,#12 1176c0855eaaSJohn Baldwin ror w12,w12,#25 1177c0855eaaSJohn Baldwin ext v31.16b,v31.16b,v31.16b,#12 1178c0855eaaSJohn Baldwin ror w9,w9,#25 1179bc3d5698SJohn Baldwin ext v9.16b,v9.16b,v9.16b,#4 1180bc3d5698SJohn Baldwin ext v13.16b,v13.16b,v13.16b,#4 1181bc3d5698SJohn Baldwin ext v17.16b,v17.16b,v17.16b,#4 1182bc3d5698SJohn Baldwin ext v21.16b,v21.16b,v21.16b,#4 1183c0855eaaSJohn Baldwin ext v25.16b,v25.16b,v25.16b,#4 1184c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#4 1185bc3d5698SJohn Baldwin add v8.4s,v8.4s,v9.4s 1186c0855eaaSJohn Baldwin add w5,w5,w9 1187bc3d5698SJohn Baldwin add v12.4s,v12.4s,v13.4s 1188c0855eaaSJohn Baldwin add w6,w6,w10 1189bc3d5698SJohn Baldwin add v16.4s,v16.4s,v17.4s 1190c0855eaaSJohn Baldwin add w7,w7,w11 1191bc3d5698SJohn Baldwin add v20.4s,v20.4s,v21.4s 1192c0855eaaSJohn Baldwin add w8,w8,w12 1193c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 1194c0855eaaSJohn Baldwin eor w17,w17,w5 1195c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 1196bc3d5698SJohn 
Baldwin eor w19,w19,w6 1197bc3d5698SJohn Baldwin eor v11.16b,v11.16b,v8.16b 1198bc3d5698SJohn Baldwin eor w20,w20,w7 1199c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v12.16b 1200bc3d5698SJohn Baldwin eor w21,w21,w8 1201c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1202bc3d5698SJohn Baldwin ror w17,w17,#16 1203c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1204bc3d5698SJohn Baldwin ror w19,w19,#16 1205c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1206bc3d5698SJohn Baldwin ror w20,w20,#16 1207c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1208c0855eaaSJohn Baldwin ror w21,w21,#16 1209c0855eaaSJohn Baldwin rev32 v11.8h,v11.8h 1210c0855eaaSJohn Baldwin add w13,w13,w17 1211c0855eaaSJohn Baldwin rev32 v15.8h,v15.8h 1212c0855eaaSJohn Baldwin add w14,w14,w19 1213c0855eaaSJohn Baldwin rev32 v19.8h,v19.8h 1214c0855eaaSJohn Baldwin add w15,w15,w20 1215c0855eaaSJohn Baldwin rev32 v23.8h,v23.8h 1216c0855eaaSJohn Baldwin add w16,w16,w21 1217c0855eaaSJohn Baldwin rev32 v27.8h,v27.8h 1218c0855eaaSJohn Baldwin eor w9,w9,w13 1219c0855eaaSJohn Baldwin rev32 v31.8h,v31.8h 1220c0855eaaSJohn Baldwin eor w10,w10,w14 1221bc3d5698SJohn Baldwin add v10.4s,v10.4s,v11.4s 1222c0855eaaSJohn Baldwin eor w11,w11,w15 1223bc3d5698SJohn Baldwin add v14.4s,v14.4s,v15.4s 1224c0855eaaSJohn Baldwin eor w12,w12,w16 1225bc3d5698SJohn Baldwin add v18.4s,v18.4s,v19.4s 1226bc3d5698SJohn Baldwin ror w9,w9,#20 1227c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1228c0855eaaSJohn Baldwin ror w10,w10,#20 1229c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 1230c0855eaaSJohn Baldwin ror w11,w11,#20 1231c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1232c0855eaaSJohn Baldwin ror w12,w12,#20 1233c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1234c0855eaaSJohn Baldwin add w5,w5,w9 1235c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1236c0855eaaSJohn Baldwin add w6,w6,w10 1237c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1238c0855eaaSJohn Baldwin add w7,w7,w11 1239c0855eaaSJohn Baldwin eor 
v3.16b,v21.16b,v22.16b 1240c0855eaaSJohn Baldwin add w8,w8,w12 1241c0855eaaSJohn Baldwin eor v4.16b,v25.16b,v26.16b 1242c0855eaaSJohn Baldwin eor w17,w17,w5 1243c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1244c0855eaaSJohn Baldwin eor w19,w19,w6 1245c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#20 1246c0855eaaSJohn Baldwin eor w20,w20,w7 1247c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#20 1248c0855eaaSJohn Baldwin eor w21,w21,w8 1249c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#20 1250bc3d5698SJohn Baldwin ror w17,w17,#24 1251c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#20 1252bc3d5698SJohn Baldwin ror w19,w19,#24 1253c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#20 1254bc3d5698SJohn Baldwin ror w20,w20,#24 1255c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#20 1256c0855eaaSJohn Baldwin ror w21,w21,#24 1257c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#12 1258c0855eaaSJohn Baldwin add w13,w13,w17 1259c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#12 1260c0855eaaSJohn Baldwin add w14,w14,w19 1261c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#12 1262c0855eaaSJohn Baldwin add w15,w15,w20 1263c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#12 1264c0855eaaSJohn Baldwin add w16,w16,w21 1265c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#12 1266c0855eaaSJohn Baldwin eor w9,w9,w13 1267c0855eaaSJohn Baldwin sli v29.4s,v5.4s,#12 1268c0855eaaSJohn Baldwin eor w10,w10,w14 1269c0855eaaSJohn Baldwin add v8.4s,v8.4s,v9.4s 1270c0855eaaSJohn Baldwin eor w11,w11,w15 1271c0855eaaSJohn Baldwin add v12.4s,v12.4s,v13.4s 1272c0855eaaSJohn Baldwin eor w12,w12,w16 1273c0855eaaSJohn Baldwin add v16.4s,v16.4s,v17.4s 1274bc3d5698SJohn Baldwin ror w9,w9,#25 1275c0855eaaSJohn Baldwin add v20.4s,v20.4s,v21.4s 1276c0855eaaSJohn Baldwin ror w10,w10,#25 1277c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 1278c0855eaaSJohn Baldwin ror w11,w11,#25 1279c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 1280c0855eaaSJohn Baldwin ror w12,w12,#25 1281c0855eaaSJohn Baldwin eor v11.16b,v11.16b,v8.16b 1282c0855eaaSJohn Baldwin add w5,w5,w10 1283c0855eaaSJohn Baldwin eor 
v15.16b,v15.16b,v12.16b 1284c0855eaaSJohn Baldwin add w6,w6,w11 1285c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1286c0855eaaSJohn Baldwin add w7,w7,w12 1287c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1288c0855eaaSJohn Baldwin add w8,w8,w9 1289c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1290c0855eaaSJohn Baldwin eor w21,w21,w5 1291c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1292c0855eaaSJohn Baldwin eor w17,w17,w6 1293c0855eaaSJohn Baldwin tbl v11.16b,{v11.16b},v6.16b 1294c0855eaaSJohn Baldwin eor w19,w19,w7 1295c0855eaaSJohn Baldwin tbl v15.16b,{v15.16b},v6.16b 1296c0855eaaSJohn Baldwin eor w20,w20,w8 1297c0855eaaSJohn Baldwin tbl v19.16b,{v19.16b},v6.16b 1298c0855eaaSJohn Baldwin ror w21,w21,#16 1299c0855eaaSJohn Baldwin tbl v23.16b,{v23.16b},v6.16b 1300c0855eaaSJohn Baldwin ror w17,w17,#16 1301c0855eaaSJohn Baldwin tbl v27.16b,{v27.16b},v6.16b 1302c0855eaaSJohn Baldwin ror w19,w19,#16 1303c0855eaaSJohn Baldwin tbl v31.16b,{v31.16b},v6.16b 1304c0855eaaSJohn Baldwin ror w20,w20,#16 1305c0855eaaSJohn Baldwin add v10.4s,v10.4s,v11.4s 1306c0855eaaSJohn Baldwin add w15,w15,w21 1307c0855eaaSJohn Baldwin add v14.4s,v14.4s,v15.4s 1308c0855eaaSJohn Baldwin add w16,w16,w17 1309c0855eaaSJohn Baldwin add v18.4s,v18.4s,v19.4s 1310c0855eaaSJohn Baldwin add w13,w13,w19 1311c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1312c0855eaaSJohn Baldwin add w14,w14,w20 1313c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 1314c0855eaaSJohn Baldwin eor w10,w10,w15 1315c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1316c0855eaaSJohn Baldwin eor w11,w11,w16 1317c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1318c0855eaaSJohn Baldwin eor w12,w12,w13 1319c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1320c0855eaaSJohn Baldwin eor w9,w9,w14 1321c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1322c0855eaaSJohn Baldwin ror w10,w10,#20 1323c0855eaaSJohn Baldwin eor v3.16b,v21.16b,v22.16b 1324c0855eaaSJohn Baldwin ror w11,w11,#20 1325c0855eaaSJohn Baldwin eor 
v4.16b,v25.16b,v26.16b 1326c0855eaaSJohn Baldwin ror w12,w12,#20 1327c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1328c0855eaaSJohn Baldwin ror w9,w9,#20 1329c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#25 1330c0855eaaSJohn Baldwin add w5,w5,w10 1331c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#25 1332c0855eaaSJohn Baldwin add w6,w6,w11 1333c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#25 1334c0855eaaSJohn Baldwin add w7,w7,w12 1335c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#25 1336c0855eaaSJohn Baldwin add w8,w8,w9 1337c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#25 1338c0855eaaSJohn Baldwin eor w21,w21,w5 1339c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#25 1340c0855eaaSJohn Baldwin eor w17,w17,w6 1341c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#7 1342c0855eaaSJohn Baldwin eor w19,w19,w7 1343c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#7 1344c0855eaaSJohn Baldwin eor w20,w20,w8 1345c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#7 1346c0855eaaSJohn Baldwin ror w21,w21,#24 1347c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#7 1348c0855eaaSJohn Baldwin ror w17,w17,#24 1349c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#7 1350c0855eaaSJohn Baldwin ror w19,w19,#24 1351c0855eaaSJohn Baldwin sli v29.4s,v5.4s,#7 1352c0855eaaSJohn Baldwin ror w20,w20,#24 1353c0855eaaSJohn Baldwin ext v10.16b,v10.16b,v10.16b,#8 1354c0855eaaSJohn Baldwin add w15,w15,w21 1355c0855eaaSJohn Baldwin ext v14.16b,v14.16b,v14.16b,#8 1356c0855eaaSJohn Baldwin add w16,w16,w17 1357c0855eaaSJohn Baldwin ext v18.16b,v18.16b,v18.16b,#8 1358c0855eaaSJohn Baldwin add w13,w13,w19 1359c0855eaaSJohn Baldwin ext v22.16b,v22.16b,v22.16b,#8 1360c0855eaaSJohn Baldwin add w14,w14,w20 1361c0855eaaSJohn Baldwin ext v26.16b,v26.16b,v26.16b,#8 1362c0855eaaSJohn Baldwin eor w10,w10,w15 1363c0855eaaSJohn Baldwin ext v30.16b,v30.16b,v30.16b,#8 1364c0855eaaSJohn Baldwin eor w11,w11,w16 1365bc3d5698SJohn Baldwin ext v11.16b,v11.16b,v11.16b,#4 1366c0855eaaSJohn Baldwin eor w12,w12,w13 1367bc3d5698SJohn Baldwin ext v15.16b,v15.16b,v15.16b,#4 1368c0855eaaSJohn Baldwin eor 
w9,w9,w14 1369bc3d5698SJohn Baldwin ext v19.16b,v19.16b,v19.16b,#4 1370c0855eaaSJohn Baldwin ror w10,w10,#25 1371bc3d5698SJohn Baldwin ext v23.16b,v23.16b,v23.16b,#4 1372c0855eaaSJohn Baldwin ror w11,w11,#25 1373c0855eaaSJohn Baldwin ext v27.16b,v27.16b,v27.16b,#4 1374c0855eaaSJohn Baldwin ror w12,w12,#25 1375c0855eaaSJohn Baldwin ext v31.16b,v31.16b,v31.16b,#4 1376c0855eaaSJohn Baldwin ror w9,w9,#25 1377bc3d5698SJohn Baldwin ext v9.16b,v9.16b,v9.16b,#12 1378bc3d5698SJohn Baldwin ext v13.16b,v13.16b,v13.16b,#12 1379bc3d5698SJohn Baldwin ext v17.16b,v17.16b,v17.16b,#12 1380bc3d5698SJohn Baldwin ext v21.16b,v21.16b,v21.16b,#12 1381c0855eaaSJohn Baldwin ext v25.16b,v25.16b,v25.16b,#12 1382c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#12 1383bc3d5698SJohn Baldwin cbnz x4,.Loop_upper_neon 1384bc3d5698SJohn Baldwin 1385bc3d5698SJohn Baldwin add w5,w5,w22 // accumulate key block 1386bc3d5698SJohn Baldwin add x6,x6,x22,lsr#32 1387bc3d5698SJohn Baldwin add w7,w7,w23 1388bc3d5698SJohn Baldwin add x8,x8,x23,lsr#32 1389bc3d5698SJohn Baldwin add w9,w9,w24 1390bc3d5698SJohn Baldwin add x10,x10,x24,lsr#32 1391bc3d5698SJohn Baldwin add w11,w11,w25 1392bc3d5698SJohn Baldwin add x12,x12,x25,lsr#32 1393bc3d5698SJohn Baldwin add w13,w13,w26 1394bc3d5698SJohn Baldwin add x14,x14,x26,lsr#32 1395bc3d5698SJohn Baldwin add w15,w15,w27 1396bc3d5698SJohn Baldwin add x16,x16,x27,lsr#32 1397bc3d5698SJohn Baldwin add w17,w17,w28 1398bc3d5698SJohn Baldwin add x19,x19,x28,lsr#32 1399bc3d5698SJohn Baldwin add w20,w20,w30 1400bc3d5698SJohn Baldwin add x21,x21,x30,lsr#32 1401bc3d5698SJohn Baldwin 1402bc3d5698SJohn Baldwin add x5,x5,x6,lsl#32 // pack 1403bc3d5698SJohn Baldwin add x7,x7,x8,lsl#32 1404bc3d5698SJohn Baldwin ldp x6,x8,[x1,#0] // load input 1405bc3d5698SJohn Baldwin add x9,x9,x10,lsl#32 1406bc3d5698SJohn Baldwin add x11,x11,x12,lsl#32 1407bc3d5698SJohn Baldwin ldp x10,x12,[x1,#16] 1408bc3d5698SJohn Baldwin add x13,x13,x14,lsl#32 1409bc3d5698SJohn Baldwin add x15,x15,x16,lsl#32 
1410bc3d5698SJohn Baldwin ldp x14,x16,[x1,#32] 1411bc3d5698SJohn Baldwin add x17,x17,x19,lsl#32 1412bc3d5698SJohn Baldwin add x20,x20,x21,lsl#32 1413bc3d5698SJohn Baldwin ldp x19,x21,[x1,#48] 1414bc3d5698SJohn Baldwin add x1,x1,#64 1415c0855eaaSJohn Baldwin#ifdef __AARCH64EB__ 1416bc3d5698SJohn Baldwin rev x5,x5 1417bc3d5698SJohn Baldwin rev x7,x7 1418bc3d5698SJohn Baldwin rev x9,x9 1419bc3d5698SJohn Baldwin rev x11,x11 1420bc3d5698SJohn Baldwin rev x13,x13 1421bc3d5698SJohn Baldwin rev x15,x15 1422bc3d5698SJohn Baldwin rev x17,x17 1423bc3d5698SJohn Baldwin rev x20,x20 1424bc3d5698SJohn Baldwin#endif 1425bc3d5698SJohn Baldwin eor x5,x5,x6 1426bc3d5698SJohn Baldwin eor x7,x7,x8 1427bc3d5698SJohn Baldwin eor x9,x9,x10 1428bc3d5698SJohn Baldwin eor x11,x11,x12 1429bc3d5698SJohn Baldwin eor x13,x13,x14 1430bc3d5698SJohn Baldwin eor x15,x15,x16 1431bc3d5698SJohn Baldwin eor x17,x17,x19 1432bc3d5698SJohn Baldwin eor x20,x20,x21 1433bc3d5698SJohn Baldwin 1434bc3d5698SJohn Baldwin stp x5,x7,[x0,#0] // store output 1435bc3d5698SJohn Baldwin add x28,x28,#1 // increment counter 1436bc3d5698SJohn Baldwin mov w5,w22 // unpack key block 1437bc3d5698SJohn Baldwin lsr x6,x22,#32 1438bc3d5698SJohn Baldwin stp x9,x11,[x0,#16] 1439bc3d5698SJohn Baldwin mov w7,w23 1440bc3d5698SJohn Baldwin lsr x8,x23,#32 1441bc3d5698SJohn Baldwin stp x13,x15,[x0,#32] 1442bc3d5698SJohn Baldwin mov w9,w24 1443bc3d5698SJohn Baldwin lsr x10,x24,#32 1444bc3d5698SJohn Baldwin stp x17,x20,[x0,#48] 1445bc3d5698SJohn Baldwin add x0,x0,#64 1446bc3d5698SJohn Baldwin mov w11,w25 1447bc3d5698SJohn Baldwin lsr x12,x25,#32 1448bc3d5698SJohn Baldwin mov w13,w26 1449bc3d5698SJohn Baldwin lsr x14,x26,#32 1450bc3d5698SJohn Baldwin mov w15,w27 1451bc3d5698SJohn Baldwin lsr x16,x27,#32 1452bc3d5698SJohn Baldwin mov w17,w28 1453bc3d5698SJohn Baldwin lsr x19,x28,#32 1454bc3d5698SJohn Baldwin mov w20,w30 1455bc3d5698SJohn Baldwin lsr x21,x30,#32 1456bc3d5698SJohn Baldwin 1457bc3d5698SJohn Baldwin mov x4,#5 1458bc3d5698SJohn 
Baldwin.Loop_lower_neon: 1459bc3d5698SJohn Baldwin sub x4,x4,#1 1460bc3d5698SJohn Baldwin add v8.4s,v8.4s,v9.4s 1461c0855eaaSJohn Baldwin add w5,w5,w9 1462bc3d5698SJohn Baldwin add v12.4s,v12.4s,v13.4s 1463c0855eaaSJohn Baldwin add w6,w6,w10 1464bc3d5698SJohn Baldwin add v16.4s,v16.4s,v17.4s 1465c0855eaaSJohn Baldwin add w7,w7,w11 1466bc3d5698SJohn Baldwin add v20.4s,v20.4s,v21.4s 1467c0855eaaSJohn Baldwin add w8,w8,w12 1468c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 1469c0855eaaSJohn Baldwin eor w17,w17,w5 1470c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 1471bc3d5698SJohn Baldwin eor w19,w19,w6 1472bc3d5698SJohn Baldwin eor v11.16b,v11.16b,v8.16b 1473bc3d5698SJohn Baldwin eor w20,w20,w7 1474c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v12.16b 1475bc3d5698SJohn Baldwin eor w21,w21,w8 1476c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1477bc3d5698SJohn Baldwin ror w17,w17,#16 1478c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1479bc3d5698SJohn Baldwin ror w19,w19,#16 1480c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1481bc3d5698SJohn Baldwin ror w20,w20,#16 1482c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1483c0855eaaSJohn Baldwin ror w21,w21,#16 1484c0855eaaSJohn Baldwin rev32 v11.8h,v11.8h 1485c0855eaaSJohn Baldwin add w13,w13,w17 1486c0855eaaSJohn Baldwin rev32 v15.8h,v15.8h 1487c0855eaaSJohn Baldwin add w14,w14,w19 1488c0855eaaSJohn Baldwin rev32 v19.8h,v19.8h 1489c0855eaaSJohn Baldwin add w15,w15,w20 1490c0855eaaSJohn Baldwin rev32 v23.8h,v23.8h 1491c0855eaaSJohn Baldwin add w16,w16,w21 1492c0855eaaSJohn Baldwin rev32 v27.8h,v27.8h 1493c0855eaaSJohn Baldwin eor w9,w9,w13 1494c0855eaaSJohn Baldwin rev32 v31.8h,v31.8h 1495c0855eaaSJohn Baldwin eor w10,w10,w14 1496bc3d5698SJohn Baldwin add v10.4s,v10.4s,v11.4s 1497c0855eaaSJohn Baldwin eor w11,w11,w15 1498bc3d5698SJohn Baldwin add v14.4s,v14.4s,v15.4s 1499c0855eaaSJohn Baldwin eor w12,w12,w16 1500bc3d5698SJohn Baldwin add v18.4s,v18.4s,v19.4s 1501bc3d5698SJohn Baldwin ror w9,w9,#20 
1502c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1503c0855eaaSJohn Baldwin ror w10,w10,#20 1504c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 1505c0855eaaSJohn Baldwin ror w11,w11,#20 1506c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1507c0855eaaSJohn Baldwin ror w12,w12,#20 1508c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1509c0855eaaSJohn Baldwin add w5,w5,w9 1510c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1511c0855eaaSJohn Baldwin add w6,w6,w10 1512c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1513c0855eaaSJohn Baldwin add w7,w7,w11 1514c0855eaaSJohn Baldwin eor v3.16b,v21.16b,v22.16b 1515c0855eaaSJohn Baldwin add w8,w8,w12 1516c0855eaaSJohn Baldwin eor v4.16b,v25.16b,v26.16b 1517c0855eaaSJohn Baldwin eor w17,w17,w5 1518c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1519c0855eaaSJohn Baldwin eor w19,w19,w6 1520c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#20 1521c0855eaaSJohn Baldwin eor w20,w20,w7 1522c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#20 1523c0855eaaSJohn Baldwin eor w21,w21,w8 1524c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#20 1525bc3d5698SJohn Baldwin ror w17,w17,#24 1526c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#20 1527bc3d5698SJohn Baldwin ror w19,w19,#24 1528c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#20 1529bc3d5698SJohn Baldwin ror w20,w20,#24 1530c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#20 1531c0855eaaSJohn Baldwin ror w21,w21,#24 1532c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#12 1533c0855eaaSJohn Baldwin add w13,w13,w17 1534c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#12 1535c0855eaaSJohn Baldwin add w14,w14,w19 1536c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#12 1537c0855eaaSJohn Baldwin add w15,w15,w20 1538c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#12 1539c0855eaaSJohn Baldwin add w16,w16,w21 1540c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#12 1541c0855eaaSJohn Baldwin eor w9,w9,w13 1542c0855eaaSJohn Baldwin sli v29.4s,v5.4s,#12 1543c0855eaaSJohn Baldwin eor w10,w10,w14 1544c0855eaaSJohn Baldwin add v8.4s,v8.4s,v9.4s 1545c0855eaaSJohn Baldwin eor w11,w11,w15 
1546c0855eaaSJohn Baldwin add v12.4s,v12.4s,v13.4s 1547c0855eaaSJohn Baldwin eor w12,w12,w16 1548c0855eaaSJohn Baldwin add v16.4s,v16.4s,v17.4s 1549bc3d5698SJohn Baldwin ror w9,w9,#25 1550c0855eaaSJohn Baldwin add v20.4s,v20.4s,v21.4s 1551c0855eaaSJohn Baldwin ror w10,w10,#25 1552c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 1553c0855eaaSJohn Baldwin ror w11,w11,#25 1554c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 1555c0855eaaSJohn Baldwin ror w12,w12,#25 1556c0855eaaSJohn Baldwin eor v11.16b,v11.16b,v8.16b 1557c0855eaaSJohn Baldwin add w5,w5,w10 1558c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v12.16b 1559c0855eaaSJohn Baldwin add w6,w6,w11 1560c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1561c0855eaaSJohn Baldwin add w7,w7,w12 1562c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1563c0855eaaSJohn Baldwin add w8,w8,w9 1564c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1565c0855eaaSJohn Baldwin eor w21,w21,w5 1566c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1567c0855eaaSJohn Baldwin eor w17,w17,w6 1568c0855eaaSJohn Baldwin tbl v11.16b,{v11.16b},v6.16b 1569c0855eaaSJohn Baldwin eor w19,w19,w7 1570c0855eaaSJohn Baldwin tbl v15.16b,{v15.16b},v6.16b 1571c0855eaaSJohn Baldwin eor w20,w20,w8 1572c0855eaaSJohn Baldwin tbl v19.16b,{v19.16b},v6.16b 1573c0855eaaSJohn Baldwin ror w21,w21,#16 1574c0855eaaSJohn Baldwin tbl v23.16b,{v23.16b},v6.16b 1575c0855eaaSJohn Baldwin ror w17,w17,#16 1576c0855eaaSJohn Baldwin tbl v27.16b,{v27.16b},v6.16b 1577c0855eaaSJohn Baldwin ror w19,w19,#16 1578c0855eaaSJohn Baldwin tbl v31.16b,{v31.16b},v6.16b 1579c0855eaaSJohn Baldwin ror w20,w20,#16 1580c0855eaaSJohn Baldwin add v10.4s,v10.4s,v11.4s 1581c0855eaaSJohn Baldwin add w15,w15,w21 1582c0855eaaSJohn Baldwin add v14.4s,v14.4s,v15.4s 1583c0855eaaSJohn Baldwin add w16,w16,w17 1584c0855eaaSJohn Baldwin add v18.4s,v18.4s,v19.4s 1585c0855eaaSJohn Baldwin add w13,w13,w19 1586c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1587c0855eaaSJohn Baldwin add w14,w14,w20 1588c0855eaaSJohn 
Baldwin add v26.4s,v26.4s,v27.4s 1589c0855eaaSJohn Baldwin eor w10,w10,w15 1590c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1591c0855eaaSJohn Baldwin eor w11,w11,w16 1592c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1593c0855eaaSJohn Baldwin eor w12,w12,w13 1594c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1595c0855eaaSJohn Baldwin eor w9,w9,w14 1596c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1597c0855eaaSJohn Baldwin ror w10,w10,#20 1598c0855eaaSJohn Baldwin eor v3.16b,v21.16b,v22.16b 1599c0855eaaSJohn Baldwin ror w11,w11,#20 1600c0855eaaSJohn Baldwin eor v4.16b,v25.16b,v26.16b 1601c0855eaaSJohn Baldwin ror w12,w12,#20 1602c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1603c0855eaaSJohn Baldwin ror w9,w9,#20 1604c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#25 1605c0855eaaSJohn Baldwin add w5,w5,w10 1606c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#25 1607c0855eaaSJohn Baldwin add w6,w6,w11 1608c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#25 1609c0855eaaSJohn Baldwin add w7,w7,w12 1610c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#25 1611c0855eaaSJohn Baldwin add w8,w8,w9 1612c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#25 1613c0855eaaSJohn Baldwin eor w21,w21,w5 1614c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#25 1615c0855eaaSJohn Baldwin eor w17,w17,w6 1616c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#7 1617c0855eaaSJohn Baldwin eor w19,w19,w7 1618c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#7 1619c0855eaaSJohn Baldwin eor w20,w20,w8 1620c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#7 1621c0855eaaSJohn Baldwin ror w21,w21,#24 1622c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#7 1623c0855eaaSJohn Baldwin ror w17,w17,#24 1624c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#7 1625c0855eaaSJohn Baldwin ror w19,w19,#24 1626c0855eaaSJohn Baldwin sli v29.4s,v5.4s,#7 1627c0855eaaSJohn Baldwin ror w20,w20,#24 1628c0855eaaSJohn Baldwin ext v10.16b,v10.16b,v10.16b,#8 1629c0855eaaSJohn Baldwin add w15,w15,w21 1630c0855eaaSJohn Baldwin ext v14.16b,v14.16b,v14.16b,#8 1631c0855eaaSJohn Baldwin add w16,w16,w17 
1632c0855eaaSJohn Baldwin ext v18.16b,v18.16b,v18.16b,#8 1633c0855eaaSJohn Baldwin add w13,w13,w19 1634c0855eaaSJohn Baldwin ext v22.16b,v22.16b,v22.16b,#8 1635c0855eaaSJohn Baldwin add w14,w14,w20 1636c0855eaaSJohn Baldwin ext v26.16b,v26.16b,v26.16b,#8 1637c0855eaaSJohn Baldwin eor w10,w10,w15 1638c0855eaaSJohn Baldwin ext v30.16b,v30.16b,v30.16b,#8 1639c0855eaaSJohn Baldwin eor w11,w11,w16 1640bc3d5698SJohn Baldwin ext v11.16b,v11.16b,v11.16b,#12 1641c0855eaaSJohn Baldwin eor w12,w12,w13 1642bc3d5698SJohn Baldwin ext v15.16b,v15.16b,v15.16b,#12 1643c0855eaaSJohn Baldwin eor w9,w9,w14 1644bc3d5698SJohn Baldwin ext v19.16b,v19.16b,v19.16b,#12 1645c0855eaaSJohn Baldwin ror w10,w10,#25 1646bc3d5698SJohn Baldwin ext v23.16b,v23.16b,v23.16b,#12 1647c0855eaaSJohn Baldwin ror w11,w11,#25 1648c0855eaaSJohn Baldwin ext v27.16b,v27.16b,v27.16b,#12 1649c0855eaaSJohn Baldwin ror w12,w12,#25 1650c0855eaaSJohn Baldwin ext v31.16b,v31.16b,v31.16b,#12 1651c0855eaaSJohn Baldwin ror w9,w9,#25 1652bc3d5698SJohn Baldwin ext v9.16b,v9.16b,v9.16b,#4 1653bc3d5698SJohn Baldwin ext v13.16b,v13.16b,v13.16b,#4 1654bc3d5698SJohn Baldwin ext v17.16b,v17.16b,v17.16b,#4 1655bc3d5698SJohn Baldwin ext v21.16b,v21.16b,v21.16b,#4 1656c0855eaaSJohn Baldwin ext v25.16b,v25.16b,v25.16b,#4 1657c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#4 1658bc3d5698SJohn Baldwin add v8.4s,v8.4s,v9.4s 1659c0855eaaSJohn Baldwin add w5,w5,w9 1660bc3d5698SJohn Baldwin add v12.4s,v12.4s,v13.4s 1661c0855eaaSJohn Baldwin add w6,w6,w10 1662bc3d5698SJohn Baldwin add v16.4s,v16.4s,v17.4s 1663c0855eaaSJohn Baldwin add w7,w7,w11 1664bc3d5698SJohn Baldwin add v20.4s,v20.4s,v21.4s 1665c0855eaaSJohn Baldwin add w8,w8,w12 1666c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 1667c0855eaaSJohn Baldwin eor w17,w17,w5 1668c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 1669bc3d5698SJohn Baldwin eor w19,w19,w6 1670bc3d5698SJohn Baldwin eor v11.16b,v11.16b,v8.16b 1671bc3d5698SJohn Baldwin eor w20,w20,w7 1672c0855eaaSJohn Baldwin eor 
v15.16b,v15.16b,v12.16b 1673bc3d5698SJohn Baldwin eor w21,w21,w8 1674c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1675bc3d5698SJohn Baldwin ror w17,w17,#16 1676c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1677bc3d5698SJohn Baldwin ror w19,w19,#16 1678c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1679bc3d5698SJohn Baldwin ror w20,w20,#16 1680c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1681c0855eaaSJohn Baldwin ror w21,w21,#16 1682c0855eaaSJohn Baldwin rev32 v11.8h,v11.8h 1683c0855eaaSJohn Baldwin add w13,w13,w17 1684c0855eaaSJohn Baldwin rev32 v15.8h,v15.8h 1685c0855eaaSJohn Baldwin add w14,w14,w19 1686c0855eaaSJohn Baldwin rev32 v19.8h,v19.8h 1687c0855eaaSJohn Baldwin add w15,w15,w20 1688c0855eaaSJohn Baldwin rev32 v23.8h,v23.8h 1689c0855eaaSJohn Baldwin add w16,w16,w21 1690c0855eaaSJohn Baldwin rev32 v27.8h,v27.8h 1691c0855eaaSJohn Baldwin eor w9,w9,w13 1692c0855eaaSJohn Baldwin rev32 v31.8h,v31.8h 1693c0855eaaSJohn Baldwin eor w10,w10,w14 1694bc3d5698SJohn Baldwin add v10.4s,v10.4s,v11.4s 1695c0855eaaSJohn Baldwin eor w11,w11,w15 1696bc3d5698SJohn Baldwin add v14.4s,v14.4s,v15.4s 1697c0855eaaSJohn Baldwin eor w12,w12,w16 1698bc3d5698SJohn Baldwin add v18.4s,v18.4s,v19.4s 1699bc3d5698SJohn Baldwin ror w9,w9,#20 1700c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1701c0855eaaSJohn Baldwin ror w10,w10,#20 1702c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 1703c0855eaaSJohn Baldwin ror w11,w11,#20 1704c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1705c0855eaaSJohn Baldwin ror w12,w12,#20 1706c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1707c0855eaaSJohn Baldwin add w5,w5,w9 1708c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1709c0855eaaSJohn Baldwin add w6,w6,w10 1710c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1711c0855eaaSJohn Baldwin add w7,w7,w11 1712c0855eaaSJohn Baldwin eor v3.16b,v21.16b,v22.16b 1713c0855eaaSJohn Baldwin add w8,w8,w12 1714c0855eaaSJohn Baldwin eor v4.16b,v25.16b,v26.16b 1715c0855eaaSJohn Baldwin eor w17,w17,w5 
1716c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1717c0855eaaSJohn Baldwin eor w19,w19,w6 1718c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#20 1719c0855eaaSJohn Baldwin eor w20,w20,w7 1720c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#20 1721c0855eaaSJohn Baldwin eor w21,w21,w8 1722c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#20 1723bc3d5698SJohn Baldwin ror w17,w17,#24 1724c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#20 1725bc3d5698SJohn Baldwin ror w19,w19,#24 1726c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#20 1727bc3d5698SJohn Baldwin ror w20,w20,#24 1728c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#20 1729c0855eaaSJohn Baldwin ror w21,w21,#24 1730c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#12 1731c0855eaaSJohn Baldwin add w13,w13,w17 1732c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#12 1733c0855eaaSJohn Baldwin add w14,w14,w19 1734c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#12 1735c0855eaaSJohn Baldwin add w15,w15,w20 1736c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#12 1737c0855eaaSJohn Baldwin add w16,w16,w21 1738c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#12 1739c0855eaaSJohn Baldwin eor w9,w9,w13 1740c0855eaaSJohn Baldwin sli v29.4s,v5.4s,#12 1741c0855eaaSJohn Baldwin eor w10,w10,w14 1742c0855eaaSJohn Baldwin add v8.4s,v8.4s,v9.4s 1743c0855eaaSJohn Baldwin eor w11,w11,w15 1744c0855eaaSJohn Baldwin add v12.4s,v12.4s,v13.4s 1745c0855eaaSJohn Baldwin eor w12,w12,w16 1746c0855eaaSJohn Baldwin add v16.4s,v16.4s,v17.4s 1747bc3d5698SJohn Baldwin ror w9,w9,#25 1748c0855eaaSJohn Baldwin add v20.4s,v20.4s,v21.4s 1749c0855eaaSJohn Baldwin ror w10,w10,#25 1750c0855eaaSJohn Baldwin add v24.4s,v24.4s,v25.4s 1751c0855eaaSJohn Baldwin ror w11,w11,#25 1752c0855eaaSJohn Baldwin add v28.4s,v28.4s,v29.4s 1753c0855eaaSJohn Baldwin ror w12,w12,#25 1754c0855eaaSJohn Baldwin eor v11.16b,v11.16b,v8.16b 1755c0855eaaSJohn Baldwin add w5,w5,w10 1756c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v12.16b 1757c0855eaaSJohn Baldwin add w6,w6,w11 1758c0855eaaSJohn Baldwin eor v19.16b,v19.16b,v16.16b 1759c0855eaaSJohn Baldwin add w7,w7,w12 
1760c0855eaaSJohn Baldwin eor v23.16b,v23.16b,v20.16b 1761c0855eaaSJohn Baldwin add w8,w8,w9 1762c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v24.16b 1763c0855eaaSJohn Baldwin eor w21,w21,w5 1764c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v28.16b 1765c0855eaaSJohn Baldwin eor w17,w17,w6 1766c0855eaaSJohn Baldwin tbl v11.16b,{v11.16b},v6.16b 1767c0855eaaSJohn Baldwin eor w19,w19,w7 1768c0855eaaSJohn Baldwin tbl v15.16b,{v15.16b},v6.16b 1769c0855eaaSJohn Baldwin eor w20,w20,w8 1770c0855eaaSJohn Baldwin tbl v19.16b,{v19.16b},v6.16b 1771c0855eaaSJohn Baldwin ror w21,w21,#16 1772c0855eaaSJohn Baldwin tbl v23.16b,{v23.16b},v6.16b 1773c0855eaaSJohn Baldwin ror w17,w17,#16 1774c0855eaaSJohn Baldwin tbl v27.16b,{v27.16b},v6.16b 1775c0855eaaSJohn Baldwin ror w19,w19,#16 1776c0855eaaSJohn Baldwin tbl v31.16b,{v31.16b},v6.16b 1777c0855eaaSJohn Baldwin ror w20,w20,#16 1778c0855eaaSJohn Baldwin add v10.4s,v10.4s,v11.4s 1779c0855eaaSJohn Baldwin add w15,w15,w21 1780c0855eaaSJohn Baldwin add v14.4s,v14.4s,v15.4s 1781c0855eaaSJohn Baldwin add w16,w16,w17 1782c0855eaaSJohn Baldwin add v18.4s,v18.4s,v19.4s 1783c0855eaaSJohn Baldwin add w13,w13,w19 1784c0855eaaSJohn Baldwin add v22.4s,v22.4s,v23.4s 1785c0855eaaSJohn Baldwin add w14,w14,w20 1786c0855eaaSJohn Baldwin add v26.4s,v26.4s,v27.4s 1787c0855eaaSJohn Baldwin eor w10,w10,w15 1788c0855eaaSJohn Baldwin add v30.4s,v30.4s,v31.4s 1789c0855eaaSJohn Baldwin eor w11,w11,w16 1790c0855eaaSJohn Baldwin eor v0.16b,v9.16b,v10.16b 1791c0855eaaSJohn Baldwin eor w12,w12,w13 1792c0855eaaSJohn Baldwin eor v1.16b,v13.16b,v14.16b 1793c0855eaaSJohn Baldwin eor w9,w9,w14 1794c0855eaaSJohn Baldwin eor v2.16b,v17.16b,v18.16b 1795c0855eaaSJohn Baldwin ror w10,w10,#20 1796c0855eaaSJohn Baldwin eor v3.16b,v21.16b,v22.16b 1797c0855eaaSJohn Baldwin ror w11,w11,#20 1798c0855eaaSJohn Baldwin eor v4.16b,v25.16b,v26.16b 1799c0855eaaSJohn Baldwin ror w12,w12,#20 1800c0855eaaSJohn Baldwin eor v5.16b,v29.16b,v30.16b 1801c0855eaaSJohn Baldwin ror w9,w9,#20 
1802c0855eaaSJohn Baldwin ushr v9.4s,v0.4s,#25 1803c0855eaaSJohn Baldwin add w5,w5,w10 1804c0855eaaSJohn Baldwin ushr v13.4s,v1.4s,#25 1805c0855eaaSJohn Baldwin add w6,w6,w11 1806c0855eaaSJohn Baldwin ushr v17.4s,v2.4s,#25 1807c0855eaaSJohn Baldwin add w7,w7,w12 1808c0855eaaSJohn Baldwin ushr v21.4s,v3.4s,#25 1809c0855eaaSJohn Baldwin add w8,w8,w9 1810c0855eaaSJohn Baldwin ushr v25.4s,v4.4s,#25 1811c0855eaaSJohn Baldwin eor w21,w21,w5 1812c0855eaaSJohn Baldwin ushr v29.4s,v5.4s,#25 1813c0855eaaSJohn Baldwin eor w17,w17,w6 1814c0855eaaSJohn Baldwin sli v9.4s,v0.4s,#7 1815c0855eaaSJohn Baldwin eor w19,w19,w7 1816c0855eaaSJohn Baldwin sli v13.4s,v1.4s,#7 1817c0855eaaSJohn Baldwin eor w20,w20,w8 1818c0855eaaSJohn Baldwin sli v17.4s,v2.4s,#7 1819c0855eaaSJohn Baldwin ror w21,w21,#24 1820c0855eaaSJohn Baldwin sli v21.4s,v3.4s,#7 1821c0855eaaSJohn Baldwin ror w17,w17,#24 1822c0855eaaSJohn Baldwin sli v25.4s,v4.4s,#7 1823c0855eaaSJohn Baldwin ror w19,w19,#24 1824c0855eaaSJohn Baldwin sli v29.4s,v5.4s,#7 1825c0855eaaSJohn Baldwin ror w20,w20,#24 1826c0855eaaSJohn Baldwin ext v10.16b,v10.16b,v10.16b,#8 1827c0855eaaSJohn Baldwin add w15,w15,w21 1828c0855eaaSJohn Baldwin ext v14.16b,v14.16b,v14.16b,#8 1829c0855eaaSJohn Baldwin add w16,w16,w17 1830c0855eaaSJohn Baldwin ext v18.16b,v18.16b,v18.16b,#8 1831c0855eaaSJohn Baldwin add w13,w13,w19 1832c0855eaaSJohn Baldwin ext v22.16b,v22.16b,v22.16b,#8 1833c0855eaaSJohn Baldwin add w14,w14,w20 1834c0855eaaSJohn Baldwin ext v26.16b,v26.16b,v26.16b,#8 1835c0855eaaSJohn Baldwin eor w10,w10,w15 1836c0855eaaSJohn Baldwin ext v30.16b,v30.16b,v30.16b,#8 1837c0855eaaSJohn Baldwin eor w11,w11,w16 1838bc3d5698SJohn Baldwin ext v11.16b,v11.16b,v11.16b,#4 1839c0855eaaSJohn Baldwin eor w12,w12,w13 1840bc3d5698SJohn Baldwin ext v15.16b,v15.16b,v15.16b,#4 1841c0855eaaSJohn Baldwin eor w9,w9,w14 1842bc3d5698SJohn Baldwin ext v19.16b,v19.16b,v19.16b,#4 1843c0855eaaSJohn Baldwin ror w10,w10,#25 1844bc3d5698SJohn Baldwin ext v23.16b,v23.16b,v23.16b,#4 
1845c0855eaaSJohn Baldwin ror w11,w11,#25 1846c0855eaaSJohn Baldwin ext v27.16b,v27.16b,v27.16b,#4 1847c0855eaaSJohn Baldwin ror w12,w12,#25 1848c0855eaaSJohn Baldwin ext v31.16b,v31.16b,v31.16b,#4 1849c0855eaaSJohn Baldwin ror w9,w9,#25 1850bc3d5698SJohn Baldwin ext v9.16b,v9.16b,v9.16b,#12 1851bc3d5698SJohn Baldwin ext v13.16b,v13.16b,v13.16b,#12 1852bc3d5698SJohn Baldwin ext v17.16b,v17.16b,v17.16b,#12 1853bc3d5698SJohn Baldwin ext v21.16b,v21.16b,v21.16b,#12 1854c0855eaaSJohn Baldwin ext v25.16b,v25.16b,v25.16b,#12 1855c0855eaaSJohn Baldwin ext v29.16b,v29.16b,v29.16b,#12 1856bc3d5698SJohn Baldwin cbnz x4,.Loop_lower_neon 1857bc3d5698SJohn Baldwin 1858bc3d5698SJohn Baldwin add w5,w5,w22 // accumulate key block 1859c0855eaaSJohn Baldwin ldp q0,q1,[sp,#0] 1860bc3d5698SJohn Baldwin add x6,x6,x22,lsr#32 1861c0855eaaSJohn Baldwin ldp q2,q3,[sp,#32] 1862bc3d5698SJohn Baldwin add w7,w7,w23 1863c0855eaaSJohn Baldwin ldp q4,q5,[sp,#64] 1864bc3d5698SJohn Baldwin add x8,x8,x23,lsr#32 1865c0855eaaSJohn Baldwin ldr q6,[sp,#96] 1866c0855eaaSJohn Baldwin add v8.4s,v8.4s,v0.4s 1867bc3d5698SJohn Baldwin add w9,w9,w24 1868c0855eaaSJohn Baldwin add v12.4s,v12.4s,v0.4s 1869bc3d5698SJohn Baldwin add x10,x10,x24,lsr#32 1870c0855eaaSJohn Baldwin add v16.4s,v16.4s,v0.4s 1871bc3d5698SJohn Baldwin add w11,w11,w25 1872c0855eaaSJohn Baldwin add v20.4s,v20.4s,v0.4s 1873bc3d5698SJohn Baldwin add x12,x12,x25,lsr#32 1874c0855eaaSJohn Baldwin add v24.4s,v24.4s,v0.4s 1875bc3d5698SJohn Baldwin add w13,w13,w26 1876c0855eaaSJohn Baldwin add v28.4s,v28.4s,v0.4s 1877bc3d5698SJohn Baldwin add x14,x14,x26,lsr#32 1878c0855eaaSJohn Baldwin add v10.4s,v10.4s,v2.4s 1879bc3d5698SJohn Baldwin add w15,w15,w27 1880c0855eaaSJohn Baldwin add v14.4s,v14.4s,v2.4s 1881bc3d5698SJohn Baldwin add x16,x16,x27,lsr#32 1882c0855eaaSJohn Baldwin add v18.4s,v18.4s,v2.4s 1883bc3d5698SJohn Baldwin add w17,w17,w28 1884c0855eaaSJohn Baldwin add v22.4s,v22.4s,v2.4s 1885bc3d5698SJohn Baldwin add x19,x19,x28,lsr#32 
1886c0855eaaSJohn Baldwin add v26.4s,v26.4s,v2.4s 1887bc3d5698SJohn Baldwin add w20,w20,w30 1888c0855eaaSJohn Baldwin add v30.4s,v30.4s,v2.4s 1889bc3d5698SJohn Baldwin add x21,x21,x30,lsr#32 1890c0855eaaSJohn Baldwin add v27.4s,v27.4s,v7.4s // +4 1891bc3d5698SJohn Baldwin add x5,x5,x6,lsl#32 // pack 1892c0855eaaSJohn Baldwin add v31.4s,v31.4s,v7.4s // +4 1893bc3d5698SJohn Baldwin add x7,x7,x8,lsl#32 1894c0855eaaSJohn Baldwin add v11.4s,v11.4s,v3.4s 1895bc3d5698SJohn Baldwin ldp x6,x8,[x1,#0] // load input 1896c0855eaaSJohn Baldwin add v15.4s,v15.4s,v4.4s 1897bc3d5698SJohn Baldwin add x9,x9,x10,lsl#32 1898c0855eaaSJohn Baldwin add v19.4s,v19.4s,v5.4s 1899bc3d5698SJohn Baldwin add x11,x11,x12,lsl#32 1900c0855eaaSJohn Baldwin add v23.4s,v23.4s,v6.4s 1901bc3d5698SJohn Baldwin ldp x10,x12,[x1,#16] 1902c0855eaaSJohn Baldwin add v27.4s,v27.4s,v3.4s 1903bc3d5698SJohn Baldwin add x13,x13,x14,lsl#32 1904c0855eaaSJohn Baldwin add v31.4s,v31.4s,v4.4s 1905bc3d5698SJohn Baldwin add x15,x15,x16,lsl#32 1906c0855eaaSJohn Baldwin add v9.4s,v9.4s,v1.4s 1907bc3d5698SJohn Baldwin ldp x14,x16,[x1,#32] 1908c0855eaaSJohn Baldwin add v13.4s,v13.4s,v1.4s 1909bc3d5698SJohn Baldwin add x17,x17,x19,lsl#32 1910c0855eaaSJohn Baldwin add v17.4s,v17.4s,v1.4s 1911bc3d5698SJohn Baldwin add x20,x20,x21,lsl#32 1912c0855eaaSJohn Baldwin add v21.4s,v21.4s,v1.4s 1913bc3d5698SJohn Baldwin ldp x19,x21,[x1,#48] 1914c0855eaaSJohn Baldwin add v25.4s,v25.4s,v1.4s 1915bc3d5698SJohn Baldwin add x1,x1,#64 1916c0855eaaSJohn Baldwin add v29.4s,v29.4s,v1.4s 1917bc3d5698SJohn Baldwin 1918c0855eaaSJohn Baldwin#ifdef __AARCH64EB__ 1919bc3d5698SJohn Baldwin rev x5,x5 1920bc3d5698SJohn Baldwin rev x7,x7 1921bc3d5698SJohn Baldwin rev x9,x9 1922bc3d5698SJohn Baldwin rev x11,x11 1923bc3d5698SJohn Baldwin rev x13,x13 1924bc3d5698SJohn Baldwin rev x15,x15 1925bc3d5698SJohn Baldwin rev x17,x17 1926bc3d5698SJohn Baldwin rev x20,x20 1927bc3d5698SJohn Baldwin#endif 1928c0855eaaSJohn Baldwin ld1 
{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1929bc3d5698SJohn Baldwin eor x5,x5,x6 1930bc3d5698SJohn Baldwin eor x7,x7,x8 1931bc3d5698SJohn Baldwin eor x9,x9,x10 1932bc3d5698SJohn Baldwin eor x11,x11,x12 1933bc3d5698SJohn Baldwin eor x13,x13,x14 1934c0855eaaSJohn Baldwin eor v8.16b,v8.16b,v0.16b 1935bc3d5698SJohn Baldwin eor x15,x15,x16 1936c0855eaaSJohn Baldwin eor v9.16b,v9.16b,v1.16b 1937bc3d5698SJohn Baldwin eor x17,x17,x19 1938c0855eaaSJohn Baldwin eor v10.16b,v10.16b,v2.16b 1939bc3d5698SJohn Baldwin eor x20,x20,x21 1940c0855eaaSJohn Baldwin eor v11.16b,v11.16b,v3.16b 1941c0855eaaSJohn Baldwin ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1942bc3d5698SJohn Baldwin 1943bc3d5698SJohn Baldwin stp x5,x7,[x0,#0] // store output 1944bc3d5698SJohn Baldwin add x28,x28,#7 // increment counter 1945bc3d5698SJohn Baldwin stp x9,x11,[x0,#16] 1946bc3d5698SJohn Baldwin stp x13,x15,[x0,#32] 1947bc3d5698SJohn Baldwin stp x17,x20,[x0,#48] 1948bc3d5698SJohn Baldwin add x0,x0,#64 1949bc3d5698SJohn Baldwin st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1950bc3d5698SJohn Baldwin 1951bc3d5698SJohn Baldwin ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1952c0855eaaSJohn Baldwin eor v12.16b,v12.16b,v0.16b 1953c0855eaaSJohn Baldwin eor v13.16b,v13.16b,v1.16b 1954c0855eaaSJohn Baldwin eor v14.16b,v14.16b,v2.16b 1955c0855eaaSJohn Baldwin eor v15.16b,v15.16b,v3.16b 1956bc3d5698SJohn Baldwin st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1957bc3d5698SJohn Baldwin 1958bc3d5698SJohn Baldwin ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1959bc3d5698SJohn Baldwin eor v16.16b,v16.16b,v8.16b 1960c0855eaaSJohn Baldwin ldp q0,q1,[sp,#0] 1961bc3d5698SJohn Baldwin eor v17.16b,v17.16b,v9.16b 1962c0855eaaSJohn Baldwin ldp q2,q3,[sp,#32] 1963bc3d5698SJohn Baldwin eor v18.16b,v18.16b,v10.16b 1964bc3d5698SJohn Baldwin eor v19.16b,v19.16b,v11.16b 1965bc3d5698SJohn Baldwin st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1966bc3d5698SJohn Baldwin 1967c0855eaaSJohn Baldwin ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 
1968bc3d5698SJohn Baldwin eor v20.16b,v20.16b,v12.16b 1969bc3d5698SJohn Baldwin eor v21.16b,v21.16b,v13.16b 1970bc3d5698SJohn Baldwin eor v22.16b,v22.16b,v14.16b 1971bc3d5698SJohn Baldwin eor v23.16b,v23.16b,v15.16b 1972bc3d5698SJohn Baldwin st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1973bc3d5698SJohn Baldwin 1974c0855eaaSJohn Baldwin ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 1975c0855eaaSJohn Baldwin eor v24.16b,v24.16b,v16.16b 1976c0855eaaSJohn Baldwin eor v25.16b,v25.16b,v17.16b 1977c0855eaaSJohn Baldwin eor v26.16b,v26.16b,v18.16b 1978c0855eaaSJohn Baldwin eor v27.16b,v27.16b,v19.16b 1979c0855eaaSJohn Baldwin st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 1980c0855eaaSJohn Baldwin 1981c0855eaaSJohn Baldwin shl v8.4s,v7.4s,#1 // 4 -> 8 1982c0855eaaSJohn Baldwin eor v28.16b,v28.16b,v20.16b 1983c0855eaaSJohn Baldwin eor v29.16b,v29.16b,v21.16b 1984c0855eaaSJohn Baldwin eor v30.16b,v30.16b,v22.16b 1985c0855eaaSJohn Baldwin eor v31.16b,v31.16b,v23.16b 1986c0855eaaSJohn Baldwin st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 1987c0855eaaSJohn Baldwin 1988c0855eaaSJohn Baldwin add v3.4s,v3.4s,v8.4s // += 8 1989c0855eaaSJohn Baldwin add v4.4s,v4.4s,v8.4s 1990c0855eaaSJohn Baldwin add v5.4s,v5.4s,v8.4s 1991c0855eaaSJohn Baldwin add v6.4s,v6.4s,v8.4s 1992bc3d5698SJohn Baldwin 1993bc3d5698SJohn Baldwin b.hs .Loop_outer_512_neon 1994bc3d5698SJohn Baldwin 1995bc3d5698SJohn Baldwin adds x2,x2,#512 1996c0855eaaSJohn Baldwin ushr v7.4s,v7.4s,#1 // 4 -> 2 1997bc3d5698SJohn Baldwin 1998c0855eaaSJohn Baldwin ldp d10,d11,[sp,#128+16] // meet ABI requirements 1999bc3d5698SJohn Baldwin ldp d12,d13,[sp,#128+32] 2000bc3d5698SJohn Baldwin ldp d14,d15,[sp,#128+48] 2001bc3d5698SJohn Baldwin 2002c0855eaaSJohn Baldwin stp q0,q0,[sp,#0] // wipe off-load area 2003c0855eaaSJohn Baldwin stp q0,q0,[sp,#32] 2004c0855eaaSJohn Baldwin stp q0,q0,[sp,#64] 2005bc3d5698SJohn Baldwin 2006bc3d5698SJohn Baldwin b.eq .Ldone_512_neon 2007bc3d5698SJohn Baldwin 2008c0855eaaSJohn Baldwin sub x3,x3,#16 
// .Lone 2009bc3d5698SJohn Baldwin cmp x2,#192 2010bc3d5698SJohn Baldwin add sp,sp,#128 2011c0855eaaSJohn Baldwin sub v3.4s,v3.4s,v7.4s // -= 2 2012c0855eaaSJohn Baldwin ld1 {v8.4s,v9.4s},[x3] 2013bc3d5698SJohn Baldwin b.hs .Loop_outer_neon 2014bc3d5698SJohn Baldwin 2015c0855eaaSJohn Baldwin ldp d8,d9,[sp,#0] // meet ABI requirements 2016c0855eaaSJohn Baldwin eor v1.16b,v1.16b,v1.16b 2017c0855eaaSJohn Baldwin eor v2.16b,v2.16b,v2.16b 2018c0855eaaSJohn Baldwin eor v3.16b,v3.16b,v3.16b 2019c0855eaaSJohn Baldwin eor v4.16b,v4.16b,v4.16b 2020c0855eaaSJohn Baldwin eor v5.16b,v5.16b,v5.16b 2021c0855eaaSJohn Baldwin eor v6.16b,v6.16b,v6.16b 2022bc3d5698SJohn Baldwin b .Loop_outer 2023bc3d5698SJohn Baldwin 2024bc3d5698SJohn Baldwin.Ldone_512_neon: 2025c0855eaaSJohn Baldwin ldp d8,d9,[sp,#128+0] // meet ABI requirements 2026bc3d5698SJohn Baldwin ldp x19,x20,[x29,#16] 2027bc3d5698SJohn Baldwin add sp,sp,#128+64 2028bc3d5698SJohn Baldwin ldp x21,x22,[x29,#32] 2029bc3d5698SJohn Baldwin ldp x23,x24,[x29,#48] 2030bc3d5698SJohn Baldwin ldp x25,x26,[x29,#64] 2031bc3d5698SJohn Baldwin ldp x27,x28,[x29,#80] 2032bc3d5698SJohn Baldwin ldp x29,x30,[sp],#96 2033*bd9588bcSAndrew Turner AARCH64_VALIDATE_LINK_REGISTER 2034bc3d5698SJohn Baldwin ret 2035bc3d5698SJohn Baldwin.size ChaCha20_512_neon,.-ChaCha20_512_neon 2036