1/* Do not modify. This file is auto-generated from chacha-armv8-sve.pl. */ 2// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved. 3// 4// Licensed under the Apache License 2.0 (the "License"). You may not use 5// this file except in compliance with the License. You can obtain a copy 6// in the file LICENSE in the source distribution or at 7// https://www.openssl.org/source/license.html 8// 9// 10// ChaCha20 for ARMv8 via SVE 11// 12// $output is the last argument if it looks like a file (it has an extension) 13// $flavour is the first argument if it doesn't look like a file 14#include "arm_arch.h" 15 16.arch armv8-a 17 18 19.hidden OPENSSL_armcap_P 20 21.text 22 /* Read-only constant table referenced by ChaCha20_ctr32_sve; placed in .rodata and 32-byte aligned (.align 5). */ 23.section .rodata 24.align 5 25.type _chacha_sve_consts,%object 26_chacha_sve_consts: 27.Lchacha20_consts: /* The four 32-bit ChaCha constant words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574 (ASCII "expand 32-byte k"), packed two words per 64-bit value with the lower-numbered word in the low half; the function loads both quads with one ldp into x23,x24. */ 28.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral 29.Lrot8: /* Two-word pattern stored twice. The function entry (path taken when the ARMV8_SVE2 capability bit is clear) loads the first two words with ldp w9,w10 and passes them to "index z31.s,w9,w10", i.e. start value 0x02010003 and per-lane step 0x04040404. NOTE(review): presumably z31 then serves as a byte-permutation index implementing a 32-bit rotate by 8 in the non-SVE2 code path, which is outside this view - confirm against chacha-armv8-sve.pl. */ 30.word 0x02010003,0x04040404,0x02010003,0x04040404 31.size _chacha_sve_consts,.-_chacha_sve_consts 32 33.previous 34 35.globl ChaCha20_ctr32_sve 36.type ChaCha20_ctr32_sve,%function 37.align 5 38ChaCha20_ctr32_sve: 39 AARCH64_VALID_CALL_TARGET 40.inst 0x04a0e3e5 //cntw x5, ALL, MUL #1 41 cmp x2,x5,lsl #6 42 b.lt .Lreturn 43 mov x7,0 44 adrp x6,OPENSSL_armcap_P 45 ldr w6,[x6,#:lo12:OPENSSL_armcap_P] 46 tst w6,#ARMV8_SVE2 47 b.eq 1f 48 mov x7,1 49 b 2f 501: 51 cmp x5,4 52 b.le .Lreturn 53 adrp x6,.Lrot8 54 add x6,x6,#:lo12:.Lrot8 55 ldp w9,w10,[x6] 56.inst 0x04aa4d3f //index z31.s,w9,w10 572: 58 AARCH64_SIGN_LINK_REGISTER 59 stp d8,d9,[sp,-192]! 
60 stp d10,d11,[sp,16] 61 stp d12,d13,[sp,32] 62 stp d14,d15,[sp,48] 63 stp x16,x17,[sp,64] 64 stp x18,x19,[sp,80] 65 stp x20,x21,[sp,96] 66 stp x22,x23,[sp,112] 67 stp x24,x25,[sp,128] 68 stp x26,x27,[sp,144] 69 stp x28,x29,[sp,160] 70 str x30,[sp,176] 71 72 adrp x6,.Lchacha20_consts 73 add x6,x6,#:lo12:.Lchacha20_consts 74 ldp x23,x24,[x6] 75 ldp x25,x26,[x3] 76 ldp x27,x28,[x3, 16] 77 ldp x29,x30,[x4] 78.inst 0x2599e3e0 //ptrues p0.s,ALL 79#ifdef __AARCH64EB__ 80 ror x25,x25,#32 81 ror x26,x26,#32 82 ror x27,x27,#32 83 ror x28,x28,#32 84 ror x29,x29,#32 85 ror x30,x30,#32 86#endif 87 cbz x7, 1f 88.align 5 89100: 90 subs x7,x2,x5,lsl #6 91 b.lt 110f 92 mov x2,x7 93 b.eq 101f 94 cmp x2,64 95 b.lt 101f 96 mixin=1 97 lsr x8,x23,#32 98.inst 0x05a03ae0 //dup z0.s,w23 99.inst 0x05a03af9 //dup z25.s,w23 100.if mixin == 1 101 mov w7,w23 102.endif 103.inst 0x05a03904 //dup z4.s,w8 104.inst 0x05a0391a //dup z26.s,w8 105 lsr x10,x24,#32 106.inst 0x05a03b08 //dup z8.s,w24 107.inst 0x05a03b1b //dup z27.s,w24 108.if mixin == 1 109 mov w9,w24 110.endif 111.inst 0x05a0394c //dup z12.s,w10 112.inst 0x05a0395c //dup z28.s,w10 113 lsr x12,x25,#32 114.inst 0x05a03b21 //dup z1.s,w25 115.inst 0x05a03b3d //dup z29.s,w25 116.if mixin == 1 117 mov w11,w25 118.endif 119.inst 0x05a03985 //dup z5.s,w12 120.inst 0x05a0399e //dup z30.s,w12 121 lsr x14,x26,#32 122.inst 0x05a03b49 //dup z9.s,w26 123.inst 0x05a03b55 //dup z21.s,w26 124.if mixin == 1 125 mov w13,w26 126.endif 127.inst 0x05a039cd //dup z13.s,w14 128.inst 0x05a039d6 //dup z22.s,w14 129 lsr x16,x27,#32 130.inst 0x05a03b62 //dup z2.s,w27 131.inst 0x05a03b77 //dup z23.s,w27 132.if mixin == 1 133 mov w15,w27 134.endif 135.inst 0x05a03a06 //dup z6.s,w16 136.inst 0x05a03a18 //dup z24.s,w16 137 lsr x18,x28,#32 138.inst 0x05a03b8a //dup z10.s,w28 139.inst 0x05a03b91 //dup z17.s,w28 140.if mixin == 1 141 mov w17,w28 142.endif 143.inst 0x05a03a4e //dup z14.s,w18 144.inst 0x05a03a52 //dup z18.s,w18 145 lsr x22,x30,#32 146.inst 0x05a03bcb 
//dup z11.s,w30 147.inst 0x05a03bd4 //dup z20.s,w30 148.if mixin == 1 149 mov w21,w30 150.endif 151.inst 0x05a03acf //dup z15.s,w22 152.inst 0x05a03adf //dup z31.s,w22 153.if mixin == 1 154 add w20,w29,#1 155 mov w19,w29 156.inst 0x04a14690 //index z16.s,w20,1 157.inst 0x04a14683 //index z3.s,w20,1 158.else 159.inst 0x04a147b0 //index z16.s,w29,1 160.inst 0x04a147a3 //index z3.s,w29,1 161.endif 162 lsr x20,x29,#32 163.inst 0x05a03a87 //dup z7.s,w20 164.inst 0x05a03a93 //dup z19.s,w20 165 mov x6,#10 16610: 167.align 5 168.inst 0x04a10000 //add z0.s,z0.s,z1.s 169.if mixin == 1 170 add w7,w7,w11 171.endif 172.inst 0x04a50084 //add z4.s,z4.s,z5.s 173.if mixin == 1 174 add w8,w8,w12 175.endif 176.inst 0x04a90108 //add z8.s,z8.s,z9.s 177.if mixin == 1 178 add w9,w9,w13 179.endif 180.inst 0x04ad018c //add z12.s,z12.s,z13.s 181.if mixin == 1 182 add w10,w10,w14 183.endif 184.if mixin == 1 185 eor w19,w19,w7 186.endif 187.inst 0x04703403 //xar z3.s,z3.s,z0.s,16 188.if mixin == 1 189 ror w19,w19,16 190.endif 191.if mixin == 1 192 eor w20,w20,w8 193.endif 194.inst 0x04703487 //xar z7.s,z7.s,z4.s,16 195.if mixin == 1 196 ror w20,w20,16 197.endif 198.if mixin == 1 199 eor w21,w21,w9 200.endif 201.inst 0x0470350b //xar z11.s,z11.s,z8.s,16 202.if mixin == 1 203 ror w21,w21,16 204.endif 205.if mixin == 1 206 eor w22,w22,w10 207.endif 208.inst 0x0470358f //xar z15.s,z15.s,z12.s,16 209.if mixin == 1 210 ror w22,w22,16 211.endif 212.inst 0x04a30042 //add z2.s,z2.s,z3.s 213.if mixin == 1 214 add w15,w15,w19 215.endif 216.inst 0x04a700c6 //add z6.s,z6.s,z7.s 217.if mixin == 1 218 add w16,w16,w20 219.endif 220.inst 0x04ab014a //add z10.s,z10.s,z11.s 221.if mixin == 1 222 add w17,w17,w21 223.endif 224.inst 0x04af01ce //add z14.s,z14.s,z15.s 225.if mixin == 1 226 add w18,w18,w22 227.endif 228.if mixin == 1 229 eor w11,w11,w15 230.endif 231.inst 0x046c3441 //xar z1.s,z1.s,z2.s,20 232.if mixin == 1 233 ror w11,w11,20 234.endif 235.if mixin == 1 236 eor w12,w12,w16 237.endif 238.inst 
0x046c34c5 //xar z5.s,z5.s,z6.s,20 239.if mixin == 1 240 ror w12,w12,20 241.endif 242.if mixin == 1 243 eor w13,w13,w17 244.endif 245.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20 246.if mixin == 1 247 ror w13,w13,20 248.endif 249.if mixin == 1 250 eor w14,w14,w18 251.endif 252.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20 253.if mixin == 1 254 ror w14,w14,20 255.endif 256.inst 0x04a10000 //add z0.s,z0.s,z1.s 257.if mixin == 1 258 add w7,w7,w11 259.endif 260.inst 0x04a50084 //add z4.s,z4.s,z5.s 261.if mixin == 1 262 add w8,w8,w12 263.endif 264.inst 0x04a90108 //add z8.s,z8.s,z9.s 265.if mixin == 1 266 add w9,w9,w13 267.endif 268.inst 0x04ad018c //add z12.s,z12.s,z13.s 269.if mixin == 1 270 add w10,w10,w14 271.endif 272.if mixin == 1 273 eor w19,w19,w7 274.endif 275.inst 0x04683403 //xar z3.s,z3.s,z0.s,24 276.if mixin == 1 277 ror w19,w19,24 278.endif 279.if mixin == 1 280 eor w20,w20,w8 281.endif 282.inst 0x04683487 //xar z7.s,z7.s,z4.s,24 283.if mixin == 1 284 ror w20,w20,24 285.endif 286.if mixin == 1 287 eor w21,w21,w9 288.endif 289.inst 0x0468350b //xar z11.s,z11.s,z8.s,24 290.if mixin == 1 291 ror w21,w21,24 292.endif 293.if mixin == 1 294 eor w22,w22,w10 295.endif 296.inst 0x0468358f //xar z15.s,z15.s,z12.s,24 297.if mixin == 1 298 ror w22,w22,24 299.endif 300.inst 0x04a30042 //add z2.s,z2.s,z3.s 301.if mixin == 1 302 add w15,w15,w19 303.endif 304.inst 0x04a700c6 //add z6.s,z6.s,z7.s 305.if mixin == 1 306 add w16,w16,w20 307.endif 308.inst 0x04ab014a //add z10.s,z10.s,z11.s 309.if mixin == 1 310 add w17,w17,w21 311.endif 312.inst 0x04af01ce //add z14.s,z14.s,z15.s 313.if mixin == 1 314 add w18,w18,w22 315.endif 316.if mixin == 1 317 eor w11,w11,w15 318.endif 319.inst 0x04673441 //xar z1.s,z1.s,z2.s,25 320.if mixin == 1 321 ror w11,w11,25 322.endif 323.if mixin == 1 324 eor w12,w12,w16 325.endif 326.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25 327.if mixin == 1 328 ror w12,w12,25 329.endif 330.if mixin == 1 331 eor w13,w13,w17 332.endif 333.inst 0x04673549 //xar 
z9.s,z9.s,z10.s,25 334.if mixin == 1 335 ror w13,w13,25 336.endif 337.if mixin == 1 338 eor w14,w14,w18 339.endif 340.inst 0x046735cd //xar z13.s,z13.s,z14.s,25 341.if mixin == 1 342 ror w14,w14,25 343.endif 344.inst 0x04a50000 //add z0.s,z0.s,z5.s 345.if mixin == 1 346 add w7,w7,w12 347.endif 348.inst 0x04a90084 //add z4.s,z4.s,z9.s 349.if mixin == 1 350 add w8,w8,w13 351.endif 352.inst 0x04ad0108 //add z8.s,z8.s,z13.s 353.if mixin == 1 354 add w9,w9,w14 355.endif 356.inst 0x04a1018c //add z12.s,z12.s,z1.s 357.if mixin == 1 358 add w10,w10,w11 359.endif 360.if mixin == 1 361 eor w22,w22,w7 362.endif 363.inst 0x0470340f //xar z15.s,z15.s,z0.s,16 364.if mixin == 1 365 ror w22,w22,16 366.endif 367.if mixin == 1 368 eor w19,w19,w8 369.endif 370.inst 0x04703483 //xar z3.s,z3.s,z4.s,16 371.if mixin == 1 372 ror w19,w19,16 373.endif 374.if mixin == 1 375 eor w20,w20,w9 376.endif 377.inst 0x04703507 //xar z7.s,z7.s,z8.s,16 378.if mixin == 1 379 ror w20,w20,16 380.endif 381.if mixin == 1 382 eor w21,w21,w10 383.endif 384.inst 0x0470358b //xar z11.s,z11.s,z12.s,16 385.if mixin == 1 386 ror w21,w21,16 387.endif 388.inst 0x04af014a //add z10.s,z10.s,z15.s 389.if mixin == 1 390 add w17,w17,w22 391.endif 392.inst 0x04a301ce //add z14.s,z14.s,z3.s 393.if mixin == 1 394 add w18,w18,w19 395.endif 396.inst 0x04a70042 //add z2.s,z2.s,z7.s 397.if mixin == 1 398 add w15,w15,w20 399.endif 400.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 401.if mixin == 1 402 add w16,w16,w21 403.endif 404.if mixin == 1 405 eor w12,w12,w17 406.endif 407.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20 408.if mixin == 1 409 ror w12,w12,20 410.endif 411.if mixin == 1 412 eor w13,w13,w18 413.endif 414.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20 415.if mixin == 1 416 ror w13,w13,20 417.endif 418.if mixin == 1 419 eor w14,w14,w15 420.endif 421.inst 0x046c344d //xar z13.s,z13.s,z2.s,20 422.if mixin == 1 423 ror w14,w14,20 424.endif 425.if mixin == 1 426 eor w11,w11,w16 427.endif 428.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20 
429.if mixin == 1 430 ror w11,w11,20 431.endif 432.inst 0x04a50000 //add z0.s,z0.s,z5.s 433.if mixin == 1 434 add w7,w7,w12 435.endif 436.inst 0x04a90084 //add z4.s,z4.s,z9.s 437.if mixin == 1 438 add w8,w8,w13 439.endif 440.inst 0x04ad0108 //add z8.s,z8.s,z13.s 441.if mixin == 1 442 add w9,w9,w14 443.endif 444.inst 0x04a1018c //add z12.s,z12.s,z1.s 445.if mixin == 1 446 add w10,w10,w11 447.endif 448.if mixin == 1 449 eor w22,w22,w7 450.endif 451.inst 0x0468340f //xar z15.s,z15.s,z0.s,24 452.if mixin == 1 453 ror w22,w22,24 454.endif 455.if mixin == 1 456 eor w19,w19,w8 457.endif 458.inst 0x04683483 //xar z3.s,z3.s,z4.s,24 459.if mixin == 1 460 ror w19,w19,24 461.endif 462.if mixin == 1 463 eor w20,w20,w9 464.endif 465.inst 0x04683507 //xar z7.s,z7.s,z8.s,24 466.if mixin == 1 467 ror w20,w20,24 468.endif 469.if mixin == 1 470 eor w21,w21,w10 471.endif 472.inst 0x0468358b //xar z11.s,z11.s,z12.s,24 473.if mixin == 1 474 ror w21,w21,24 475.endif 476.inst 0x04af014a //add z10.s,z10.s,z15.s 477.if mixin == 1 478 add w17,w17,w22 479.endif 480.inst 0x04a301ce //add z14.s,z14.s,z3.s 481.if mixin == 1 482 add w18,w18,w19 483.endif 484.inst 0x04a70042 //add z2.s,z2.s,z7.s 485.if mixin == 1 486 add w15,w15,w20 487.endif 488.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 489.if mixin == 1 490 add w16,w16,w21 491.endif 492.if mixin == 1 493 eor w12,w12,w17 494.endif 495.inst 0x04673545 //xar z5.s,z5.s,z10.s,25 496.if mixin == 1 497 ror w12,w12,25 498.endif 499.if mixin == 1 500 eor w13,w13,w18 501.endif 502.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25 503.if mixin == 1 504 ror w13,w13,25 505.endif 506.if mixin == 1 507 eor w14,w14,w15 508.endif 509.inst 0x0467344d //xar z13.s,z13.s,z2.s,25 510.if mixin == 1 511 ror w14,w14,25 512.endif 513.if mixin == 1 514 eor w11,w11,w16 515.endif 516.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25 517.if mixin == 1 518 ror w11,w11,25 519.endif 520 sub x6,x6,1 521 cbnz x6,10b 522.if mixin == 1 523 add w7,w7,w23 524.endif 525.inst 0x04b90000 //add z0.s,z0.s,z25.s 
526.if mixin == 1 527 add x8,x8,x23,lsr #32 528.endif 529.inst 0x04ba0084 //add z4.s,z4.s,z26.s 530.if mixin == 1 531 add x7,x7,x8,lsl #32 // pack 532.endif 533.if mixin == 1 534 add w9,w9,w24 535.endif 536.inst 0x04bb0108 //add z8.s,z8.s,z27.s 537.if mixin == 1 538 add x10,x10,x24,lsr #32 539.endif 540.inst 0x04bc018c //add z12.s,z12.s,z28.s 541.if mixin == 1 542 add x9,x9,x10,lsl #32 // pack 543.endif 544.if mixin == 1 545 ldp x8,x10,[x1],#16 546.endif 547.if mixin == 1 548 add w11,w11,w25 549.endif 550.inst 0x04bd0021 //add z1.s,z1.s,z29.s 551.if mixin == 1 552 add x12,x12,x25,lsr #32 553.endif 554.inst 0x04be00a5 //add z5.s,z5.s,z30.s 555.if mixin == 1 556 add x11,x11,x12,lsl #32 // pack 557.endif 558.if mixin == 1 559 add w13,w13,w26 560.endif 561.inst 0x04b50129 //add z9.s,z9.s,z21.s 562.if mixin == 1 563 add x14,x14,x26,lsr #32 564.endif 565.inst 0x04b601ad //add z13.s,z13.s,z22.s 566.if mixin == 1 567 add x13,x13,x14,lsl #32 // pack 568.endif 569.if mixin == 1 570 ldp x12,x14,[x1],#16 571.endif 572.if mixin == 1 573 add w15,w15,w27 574.endif 575.inst 0x04b70042 //add z2.s,z2.s,z23.s 576.if mixin == 1 577 add x16,x16,x27,lsr #32 578.endif 579.inst 0x04b800c6 //add z6.s,z6.s,z24.s 580.if mixin == 1 581 add x15,x15,x16,lsl #32 // pack 582.endif 583.if mixin == 1 584 add w17,w17,w28 585.endif 586.inst 0x04b1014a //add z10.s,z10.s,z17.s 587.if mixin == 1 588 add x18,x18,x28,lsr #32 589.endif 590.inst 0x04b201ce //add z14.s,z14.s,z18.s 591.if mixin == 1 592 add x17,x17,x18,lsl #32 // pack 593.endif 594.if mixin == 1 595 ldp x16,x18,[x1],#16 596.endif 597.if mixin == 1 598 add w19,w19,w29 599.endif 600.inst 0x04b00063 //add z3.s,z3.s,z16.s 601.if mixin == 1 602 add x20,x20,x29,lsr #32 603.endif 604.inst 0x04b300e7 //add z7.s,z7.s,z19.s 605.if mixin == 1 606 add x19,x19,x20,lsl #32 // pack 607.endif 608.if mixin == 1 609 add w21,w21,w30 610.endif 611.inst 0x04b4016b //add z11.s,z11.s,z20.s 612.if mixin == 1 613 add x22,x22,x30,lsr #32 614.endif 615.inst 0x04bf01ef 
//add z15.s,z15.s,z31.s 616.if mixin == 1 617 add x21,x21,x22,lsl #32 // pack 618.endif 619.if mixin == 1 620 ldp x20,x22,[x1],#16 621.endif 622#ifdef __AARCH64EB__ 623 rev x7,x7 624.inst 0x05a48000 //revb z0.s,p0/m,z0.s 625.inst 0x05a48084 //revb z4.s,p0/m,z4.s 626 rev x9,x9 627.inst 0x05a48108 //revb z8.s,p0/m,z8.s 628.inst 0x05a4818c //revb z12.s,p0/m,z12.s 629 rev x11,x11 630.inst 0x05a48021 //revb z1.s,p0/m,z1.s 631.inst 0x05a480a5 //revb z5.s,p0/m,z5.s 632 rev x13,x13 633.inst 0x05a48129 //revb z9.s,p0/m,z9.s 634.inst 0x05a481ad //revb z13.s,p0/m,z13.s 635 rev x15,x15 636.inst 0x05a48042 //revb z2.s,p0/m,z2.s 637.inst 0x05a480c6 //revb z6.s,p0/m,z6.s 638 rev x17,x17 639.inst 0x05a4814a //revb z10.s,p0/m,z10.s 640.inst 0x05a481ce //revb z14.s,p0/m,z14.s 641 rev x19,x19 642.inst 0x05a48063 //revb z3.s,p0/m,z3.s 643.inst 0x05a480e7 //revb z7.s,p0/m,z7.s 644 rev x21,x21 645.inst 0x05a4816b //revb z11.s,p0/m,z11.s 646.inst 0x05a481ef //revb z15.s,p0/m,z15.s 647#endif 648.if mixin == 1 649 add x29,x29,#1 650.endif 651 cmp x5,4 652 b.ne 200f 653.if mixin == 1 654 eor x7,x7,x8 655.endif 656.if mixin == 1 657 eor x9,x9,x10 658.endif 659.if mixin == 1 660 eor x11,x11,x12 661.endif 662.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s 663.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 664.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 665.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 666 667.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 668.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 669.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 670.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 671 672.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 673.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 674.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 675.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 676 677.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 678.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 679.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 680.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 681.if mixin == 1 682 eor x13,x13,x14 683.endif 684.if mixin == 1 685 eor x15,x15,x16 
686.endif 687.if mixin == 1 688 eor x17,x17,x18 689.endif 690.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 691.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 692.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 693.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 694 695.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 696.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 697.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 698.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 699 700.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 701.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 702.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 703.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 704 705.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 706.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d 707.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 708.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 709.if mixin == 1 710 eor x19,x19,x20 711.endif 712.if mixin == 1 713 eor x21,x21,x22 714.endif 715 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 716 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 717.inst 0x04b13000 //eor z0.d,z0.d,z17.d 718.inst 0x04b23021 //eor z1.d,z1.d,z18.d 719.inst 0x04b33042 //eor z2.d,z2.d,z19.d 720.inst 0x04b43063 //eor z3.d,z3.d,z20.d 721.inst 0x04b53084 //eor z4.d,z4.d,z21.d 722.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 723.inst 0x04b730c6 //eor z6.d,z6.d,z23.d 724.inst 0x04b830e7 //eor z7.d,z7.d,z24.d 725 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 726 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 727.if mixin == 1 728 stp x7,x9,[x0],#16 729.endif 730.inst 0x04b13108 //eor z8.d,z8.d,z17.d 731.inst 0x04b23129 //eor z9.d,z9.d,z18.d 732.if mixin == 1 733 stp x11,x13,[x0],#16 734.endif 735.inst 0x04b3314a //eor z10.d,z10.d,z19.d 736.inst 0x04b4316b //eor z11.d,z11.d,z20.d 737.if mixin == 1 738 stp x15,x17,[x0],#16 739.endif 740.inst 0x04b5318c //eor z12.d,z12.d,z21.d 741.inst 0x04b631ad //eor z13.d,z13.d,z22.d 742.if mixin == 1 743 stp x19,x21,[x0],#16 744.endif 745.inst 0x04b731ce //eor z14.d,z14.d,z23.d 746.inst 0x04b831ef //eor z15.d,z15.d,z24.d 747 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 
748 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 749 st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 750 st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 751 b 210f 752200: 753.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s 754.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s 755.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s 756.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s 757 758.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s 759.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s 760.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s 761.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s 762 763.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 764.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d 765.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d 766.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d 767 768.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d 769.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 770.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d 771.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d 772.if mixin == 1 773 eor x7,x7,x8 774.endif 775.if mixin == 1 776 eor x9,x9,x10 777.endif 778.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s 779.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s 780.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s 781.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s 782 783.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s 784.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s 785.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s 786.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s 787 788.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d 789.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d 790.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 791.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d 792 793.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d 794.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d 795.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d 796.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 797.if mixin == 1 798 eor x11,x11,x12 799.endif 800.if mixin == 1 801 eor x13,x13,x14 802.endif 803.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s 804.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 805.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 806.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 807 808.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 
809.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 810.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 811.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 812 813.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 814.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 815.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 816.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 817 818.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 819.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 820.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 821.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 822.if mixin == 1 823 eor x15,x15,x16 824.endif 825.if mixin == 1 826 eor x17,x17,x18 827.endif 828.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 829.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 830.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 831.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 832 833.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 834.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 835.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 836.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 837 838.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 839.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 840.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 841.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 842 843.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 844.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d 845.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 846.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 847.if mixin == 1 848 eor x19,x19,x20 849.endif 850.if mixin == 1 851 eor x21,x21,x22 852.endif 853.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 854.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 855.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 856.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 857.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] 858.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 859.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 860.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 861.inst 0x04215101 //addvl x1,x1,8 862.inst 0x04b13000 //eor z0.d,z0.d,z17.d 863.inst 0x04b23084 //eor z4.d,z4.d,z18.d 864.inst 0x04b33108 //eor 
z8.d,z8.d,z19.d 865.inst 0x04b4318c //eor z12.d,z12.d,z20.d 866.inst 0x04b53021 //eor z1.d,z1.d,z21.d 867.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 868.inst 0x04b73129 //eor z9.d,z9.d,z23.d 869.inst 0x04b831ad //eor z13.d,z13.d,z24.d 870.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 871.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 872.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 873.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 874.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] 875.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 876.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 877.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 878.inst 0x04215101 //addvl x1,x1,8 879.if mixin == 1 880 stp x7,x9,[x0],#16 881.endif 882.inst 0x04b13042 //eor z2.d,z2.d,z17.d 883.inst 0x04b230c6 //eor z6.d,z6.d,z18.d 884.if mixin == 1 885 stp x11,x13,[x0],#16 886.endif 887.inst 0x04b3314a //eor z10.d,z10.d,z19.d 888.inst 0x04b431ce //eor z14.d,z14.d,z20.d 889.if mixin == 1 890 stp x15,x17,[x0],#16 891.endif 892.inst 0x04b53063 //eor z3.d,z3.d,z21.d 893.inst 0x04b630e7 //eor z7.d,z7.d,z22.d 894.if mixin == 1 895 stp x19,x21,[x0],#16 896.endif 897.inst 0x04b7316b //eor z11.d,z11.d,z23.d 898.inst 0x04b831ef //eor z15.d,z15.d,z24.d 899.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] 900.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL] 901.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] 902.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] 903.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] 904.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] 905.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] 906.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] 907.inst 0x04205100 //addvl x0,x0,8 908.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] 909.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] 910.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] 911.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] 912.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] 913.inst 0xe545e007 
//st1w {z7.s},p0,[x0,#5,MUL VL] 914.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] 915.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] 916.inst 0x04205100 //addvl x0,x0,8 917210: 918.inst 0x04b0e3fd //incw x29, ALL, MUL #1 919 subs x2,x2,64 920 b.gt 100b 921 b 110f 922101: 923 mixin=0 924 lsr x8,x23,#32 925.inst 0x05a03ae0 //dup z0.s,w23 926.inst 0x05a03af9 //dup z25.s,w23 927.if mixin == 1 928 mov w7,w23 929.endif 930.inst 0x05a03904 //dup z4.s,w8 931.inst 0x05a0391a //dup z26.s,w8 932 lsr x10,x24,#32 933.inst 0x05a03b08 //dup z8.s,w24 934.inst 0x05a03b1b //dup z27.s,w24 935.if mixin == 1 936 mov w9,w24 937.endif 938.inst 0x05a0394c //dup z12.s,w10 939.inst 0x05a0395c //dup z28.s,w10 940 lsr x12,x25,#32 941.inst 0x05a03b21 //dup z1.s,w25 942.inst 0x05a03b3d //dup z29.s,w25 943.if mixin == 1 944 mov w11,w25 945.endif 946.inst 0x05a03985 //dup z5.s,w12 947.inst 0x05a0399e //dup z30.s,w12 948 lsr x14,x26,#32 949.inst 0x05a03b49 //dup z9.s,w26 950.inst 0x05a03b55 //dup z21.s,w26 951.if mixin == 1 952 mov w13,w26 953.endif 954.inst 0x05a039cd //dup z13.s,w14 955.inst 0x05a039d6 //dup z22.s,w14 956 lsr x16,x27,#32 957.inst 0x05a03b62 //dup z2.s,w27 958.inst 0x05a03b77 //dup z23.s,w27 959.if mixin == 1 960 mov w15,w27 961.endif 962.inst 0x05a03a06 //dup z6.s,w16 963.inst 0x05a03a18 //dup z24.s,w16 964 lsr x18,x28,#32 965.inst 0x05a03b8a //dup z10.s,w28 966.inst 0x05a03b91 //dup z17.s,w28 967.if mixin == 1 968 mov w17,w28 969.endif 970.inst 0x05a03a4e //dup z14.s,w18 971.inst 0x05a03a52 //dup z18.s,w18 972 lsr x22,x30,#32 973.inst 0x05a03bcb //dup z11.s,w30 974.inst 0x05a03bd4 //dup z20.s,w30 975.if mixin == 1 976 mov w21,w30 977.endif 978.inst 0x05a03acf //dup z15.s,w22 979.inst 0x05a03adf //dup z31.s,w22 980.if mixin == 1 981 add w20,w29,#1 982 mov w19,w29 983.inst 0x04a14690 //index z16.s,w20,1 984.inst 0x04a14683 //index z3.s,w20,1 985.else 986.inst 0x04a147b0 //index z16.s,w29,1 987.inst 0x04a147a3 //index z3.s,w29,1 988.endif 989 lsr x20,x29,#32 990.inst 
0x05a03a87 //dup z7.s,w20 991.inst 0x05a03a93 //dup z19.s,w20 992 mov x6,#10 99310: 994.align 5 995.inst 0x04a10000 //add z0.s,z0.s,z1.s 996.if mixin == 1 997 add w7,w7,w11 998.endif 999.inst 0x04a50084 //add z4.s,z4.s,z5.s 1000.if mixin == 1 1001 add w8,w8,w12 1002.endif 1003.inst 0x04a90108 //add z8.s,z8.s,z9.s 1004.if mixin == 1 1005 add w9,w9,w13 1006.endif 1007.inst 0x04ad018c //add z12.s,z12.s,z13.s 1008.if mixin == 1 1009 add w10,w10,w14 1010.endif 1011.if mixin == 1 1012 eor w19,w19,w7 1013.endif 1014.inst 0x04703403 //xar z3.s,z3.s,z0.s,16 1015.if mixin == 1 1016 ror w19,w19,16 1017.endif 1018.if mixin == 1 1019 eor w20,w20,w8 1020.endif 1021.inst 0x04703487 //xar z7.s,z7.s,z4.s,16 1022.if mixin == 1 1023 ror w20,w20,16 1024.endif 1025.if mixin == 1 1026 eor w21,w21,w9 1027.endif 1028.inst 0x0470350b //xar z11.s,z11.s,z8.s,16 1029.if mixin == 1 1030 ror w21,w21,16 1031.endif 1032.if mixin == 1 1033 eor w22,w22,w10 1034.endif 1035.inst 0x0470358f //xar z15.s,z15.s,z12.s,16 1036.if mixin == 1 1037 ror w22,w22,16 1038.endif 1039.inst 0x04a30042 //add z2.s,z2.s,z3.s 1040.if mixin == 1 1041 add w15,w15,w19 1042.endif 1043.inst 0x04a700c6 //add z6.s,z6.s,z7.s 1044.if mixin == 1 1045 add w16,w16,w20 1046.endif 1047.inst 0x04ab014a //add z10.s,z10.s,z11.s 1048.if mixin == 1 1049 add w17,w17,w21 1050.endif 1051.inst 0x04af01ce //add z14.s,z14.s,z15.s 1052.if mixin == 1 1053 add w18,w18,w22 1054.endif 1055.if mixin == 1 1056 eor w11,w11,w15 1057.endif 1058.inst 0x046c3441 //xar z1.s,z1.s,z2.s,20 1059.if mixin == 1 1060 ror w11,w11,20 1061.endif 1062.if mixin == 1 1063 eor w12,w12,w16 1064.endif 1065.inst 0x046c34c5 //xar z5.s,z5.s,z6.s,20 1066.if mixin == 1 1067 ror w12,w12,20 1068.endif 1069.if mixin == 1 1070 eor w13,w13,w17 1071.endif 1072.inst 0x046c3549 //xar z9.s,z9.s,z10.s,20 1073.if mixin == 1 1074 ror w13,w13,20 1075.endif 1076.if mixin == 1 1077 eor w14,w14,w18 1078.endif 1079.inst 0x046c35cd //xar z13.s,z13.s,z14.s,20 1080.if mixin == 1 1081 ror 
w14,w14,20 1082.endif 1083.inst 0x04a10000 //add z0.s,z0.s,z1.s 1084.if mixin == 1 1085 add w7,w7,w11 1086.endif 1087.inst 0x04a50084 //add z4.s,z4.s,z5.s 1088.if mixin == 1 1089 add w8,w8,w12 1090.endif 1091.inst 0x04a90108 //add z8.s,z8.s,z9.s 1092.if mixin == 1 1093 add w9,w9,w13 1094.endif 1095.inst 0x04ad018c //add z12.s,z12.s,z13.s 1096.if mixin == 1 1097 add w10,w10,w14 1098.endif 1099.if mixin == 1 1100 eor w19,w19,w7 1101.endif 1102.inst 0x04683403 //xar z3.s,z3.s,z0.s,24 1103.if mixin == 1 1104 ror w19,w19,24 1105.endif 1106.if mixin == 1 1107 eor w20,w20,w8 1108.endif 1109.inst 0x04683487 //xar z7.s,z7.s,z4.s,24 1110.if mixin == 1 1111 ror w20,w20,24 1112.endif 1113.if mixin == 1 1114 eor w21,w21,w9 1115.endif 1116.inst 0x0468350b //xar z11.s,z11.s,z8.s,24 1117.if mixin == 1 1118 ror w21,w21,24 1119.endif 1120.if mixin == 1 1121 eor w22,w22,w10 1122.endif 1123.inst 0x0468358f //xar z15.s,z15.s,z12.s,24 1124.if mixin == 1 1125 ror w22,w22,24 1126.endif 1127.inst 0x04a30042 //add z2.s,z2.s,z3.s 1128.if mixin == 1 1129 add w15,w15,w19 1130.endif 1131.inst 0x04a700c6 //add z6.s,z6.s,z7.s 1132.if mixin == 1 1133 add w16,w16,w20 1134.endif 1135.inst 0x04ab014a //add z10.s,z10.s,z11.s 1136.if mixin == 1 1137 add w17,w17,w21 1138.endif 1139.inst 0x04af01ce //add z14.s,z14.s,z15.s 1140.if mixin == 1 1141 add w18,w18,w22 1142.endif 1143.if mixin == 1 1144 eor w11,w11,w15 1145.endif 1146.inst 0x04673441 //xar z1.s,z1.s,z2.s,25 1147.if mixin == 1 1148 ror w11,w11,25 1149.endif 1150.if mixin == 1 1151 eor w12,w12,w16 1152.endif 1153.inst 0x046734c5 //xar z5.s,z5.s,z6.s,25 1154.if mixin == 1 1155 ror w12,w12,25 1156.endif 1157.if mixin == 1 1158 eor w13,w13,w17 1159.endif 1160.inst 0x04673549 //xar z9.s,z9.s,z10.s,25 1161.if mixin == 1 1162 ror w13,w13,25 1163.endif 1164.if mixin == 1 1165 eor w14,w14,w18 1166.endif 1167.inst 0x046735cd //xar z13.s,z13.s,z14.s,25 1168.if mixin == 1 1169 ror w14,w14,25 1170.endif 1171.inst 0x04a50000 //add z0.s,z0.s,z5.s 1172.if mixin 
== 1 1173 add w7,w7,w12 1174.endif 1175.inst 0x04a90084 //add z4.s,z4.s,z9.s 1176.if mixin == 1 1177 add w8,w8,w13 1178.endif 1179.inst 0x04ad0108 //add z8.s,z8.s,z13.s 1180.if mixin == 1 1181 add w9,w9,w14 1182.endif 1183.inst 0x04a1018c //add z12.s,z12.s,z1.s 1184.if mixin == 1 1185 add w10,w10,w11 1186.endif 1187.if mixin == 1 1188 eor w22,w22,w7 1189.endif 1190.inst 0x0470340f //xar z15.s,z15.s,z0.s,16 1191.if mixin == 1 1192 ror w22,w22,16 1193.endif 1194.if mixin == 1 1195 eor w19,w19,w8 1196.endif 1197.inst 0x04703483 //xar z3.s,z3.s,z4.s,16 1198.if mixin == 1 1199 ror w19,w19,16 1200.endif 1201.if mixin == 1 1202 eor w20,w20,w9 1203.endif 1204.inst 0x04703507 //xar z7.s,z7.s,z8.s,16 1205.if mixin == 1 1206 ror w20,w20,16 1207.endif 1208.if mixin == 1 1209 eor w21,w21,w10 1210.endif 1211.inst 0x0470358b //xar z11.s,z11.s,z12.s,16 1212.if mixin == 1 1213 ror w21,w21,16 1214.endif 1215.inst 0x04af014a //add z10.s,z10.s,z15.s 1216.if mixin == 1 1217 add w17,w17,w22 1218.endif 1219.inst 0x04a301ce //add z14.s,z14.s,z3.s 1220.if mixin == 1 1221 add w18,w18,w19 1222.endif 1223.inst 0x04a70042 //add z2.s,z2.s,z7.s 1224.if mixin == 1 1225 add w15,w15,w20 1226.endif 1227.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 1228.if mixin == 1 1229 add w16,w16,w21 1230.endif 1231.if mixin == 1 1232 eor w12,w12,w17 1233.endif 1234.inst 0x046c3545 //xar z5.s,z5.s,z10.s,20 1235.if mixin == 1 1236 ror w12,w12,20 1237.endif 1238.if mixin == 1 1239 eor w13,w13,w18 1240.endif 1241.inst 0x046c35c9 //xar z9.s,z9.s,z14.s,20 1242.if mixin == 1 1243 ror w13,w13,20 1244.endif 1245.if mixin == 1 1246 eor w14,w14,w15 1247.endif 1248.inst 0x046c344d //xar z13.s,z13.s,z2.s,20 1249.if mixin == 1 1250 ror w14,w14,20 1251.endif 1252.if mixin == 1 1253 eor w11,w11,w16 1254.endif 1255.inst 0x046c34c1 //xar z1.s,z1.s,z6.s,20 1256.if mixin == 1 1257 ror w11,w11,20 1258.endif 1259.inst 0x04a50000 //add z0.s,z0.s,z5.s 1260.if mixin == 1 1261 add w7,w7,w12 1262.endif 1263.inst 0x04a90084 //add z4.s,z4.s,z9.s 
1264.if mixin == 1 1265 add w8,w8,w13 1266.endif 1267.inst 0x04ad0108 //add z8.s,z8.s,z13.s 1268.if mixin == 1 1269 add w9,w9,w14 1270.endif 1271.inst 0x04a1018c //add z12.s,z12.s,z1.s 1272.if mixin == 1 1273 add w10,w10,w11 1274.endif 1275.if mixin == 1 1276 eor w22,w22,w7 1277.endif 1278.inst 0x0468340f //xar z15.s,z15.s,z0.s,24 1279.if mixin == 1 1280 ror w22,w22,24 1281.endif 1282.if mixin == 1 1283 eor w19,w19,w8 1284.endif 1285.inst 0x04683483 //xar z3.s,z3.s,z4.s,24 1286.if mixin == 1 1287 ror w19,w19,24 1288.endif 1289.if mixin == 1 1290 eor w20,w20,w9 1291.endif 1292.inst 0x04683507 //xar z7.s,z7.s,z8.s,24 1293.if mixin == 1 1294 ror w20,w20,24 1295.endif 1296.if mixin == 1 1297 eor w21,w21,w10 1298.endif 1299.inst 0x0468358b //xar z11.s,z11.s,z12.s,24 1300.if mixin == 1 1301 ror w21,w21,24 1302.endif 1303.inst 0x04af014a //add z10.s,z10.s,z15.s 1304.if mixin == 1 1305 add w17,w17,w22 1306.endif 1307.inst 0x04a301ce //add z14.s,z14.s,z3.s 1308.if mixin == 1 1309 add w18,w18,w19 1310.endif 1311.inst 0x04a70042 //add z2.s,z2.s,z7.s 1312.if mixin == 1 1313 add w15,w15,w20 1314.endif 1315.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 1316.if mixin == 1 1317 add w16,w16,w21 1318.endif 1319.if mixin == 1 1320 eor w12,w12,w17 1321.endif 1322.inst 0x04673545 //xar z5.s,z5.s,z10.s,25 1323.if mixin == 1 1324 ror w12,w12,25 1325.endif 1326.if mixin == 1 1327 eor w13,w13,w18 1328.endif 1329.inst 0x046735c9 //xar z9.s,z9.s,z14.s,25 1330.if mixin == 1 1331 ror w13,w13,25 1332.endif 1333.if mixin == 1 1334 eor w14,w14,w15 1335.endif 1336.inst 0x0467344d //xar z13.s,z13.s,z2.s,25 1337.if mixin == 1 1338 ror w14,w14,25 1339.endif 1340.if mixin == 1 1341 eor w11,w11,w16 1342.endif 1343.inst 0x046734c1 //xar z1.s,z1.s,z6.s,25 1344.if mixin == 1 1345 ror w11,w11,25 1346.endif 1347 sub x6,x6,1 1348 cbnz x6,10b 1349.if mixin == 1 1350 add w7,w7,w23 1351.endif 1352.inst 0x04b90000 //add z0.s,z0.s,z25.s 1353.if mixin == 1 1354 add x8,x8,x23,lsr #32 1355.endif 1356.inst 0x04ba0084 //add 
z4.s,z4.s,z26.s 1357.if mixin == 1 1358 add x7,x7,x8,lsl #32 // pack 1359.endif 1360.if mixin == 1 1361 add w9,w9,w24 1362.endif 1363.inst 0x04bb0108 //add z8.s,z8.s,z27.s 1364.if mixin == 1 1365 add x10,x10,x24,lsr #32 1366.endif 1367.inst 0x04bc018c //add z12.s,z12.s,z28.s 1368.if mixin == 1 1369 add x9,x9,x10,lsl #32 // pack 1370.endif 1371.if mixin == 1 1372 ldp x8,x10,[x1],#16 1373.endif 1374.if mixin == 1 1375 add w11,w11,w25 1376.endif 1377.inst 0x04bd0021 //add z1.s,z1.s,z29.s 1378.if mixin == 1 1379 add x12,x12,x25,lsr #32 1380.endif 1381.inst 0x04be00a5 //add z5.s,z5.s,z30.s 1382.if mixin == 1 1383 add x11,x11,x12,lsl #32 // pack 1384.endif 1385.if mixin == 1 1386 add w13,w13,w26 1387.endif 1388.inst 0x04b50129 //add z9.s,z9.s,z21.s 1389.if mixin == 1 1390 add x14,x14,x26,lsr #32 1391.endif 1392.inst 0x04b601ad //add z13.s,z13.s,z22.s 1393.if mixin == 1 1394 add x13,x13,x14,lsl #32 // pack 1395.endif 1396.if mixin == 1 1397 ldp x12,x14,[x1],#16 1398.endif 1399.if mixin == 1 1400 add w15,w15,w27 1401.endif 1402.inst 0x04b70042 //add z2.s,z2.s,z23.s 1403.if mixin == 1 1404 add x16,x16,x27,lsr #32 1405.endif 1406.inst 0x04b800c6 //add z6.s,z6.s,z24.s 1407.if mixin == 1 1408 add x15,x15,x16,lsl #32 // pack 1409.endif 1410.if mixin == 1 1411 add w17,w17,w28 1412.endif 1413.inst 0x04b1014a //add z10.s,z10.s,z17.s 1414.if mixin == 1 1415 add x18,x18,x28,lsr #32 1416.endif 1417.inst 0x04b201ce //add z14.s,z14.s,z18.s 1418.if mixin == 1 1419 add x17,x17,x18,lsl #32 // pack 1420.endif 1421.if mixin == 1 1422 ldp x16,x18,[x1],#16 1423.endif 1424.if mixin == 1 1425 add w19,w19,w29 1426.endif 1427.inst 0x04b00063 //add z3.s,z3.s,z16.s 1428.if mixin == 1 1429 add x20,x20,x29,lsr #32 1430.endif 1431.inst 0x04b300e7 //add z7.s,z7.s,z19.s 1432.if mixin == 1 1433 add x19,x19,x20,lsl #32 // pack 1434.endif 1435.if mixin == 1 1436 add w21,w21,w30 1437.endif 1438.inst 0x04b4016b //add z11.s,z11.s,z20.s 1439.if mixin == 1 1440 add x22,x22,x30,lsr #32 1441.endif 1442.inst 
0x04bf01ef //add z15.s,z15.s,z31.s 1443.if mixin == 1 1444 add x21,x21,x22,lsl #32 // pack 1445.endif 1446.if mixin == 1 1447 ldp x20,x22,[x1],#16 1448.endif 1449#ifdef __AARCH64EB__ 1450 rev x7,x7 1451.inst 0x05a48000 //revb z0.s,p0/m,z0.s 1452.inst 0x05a48084 //revb z4.s,p0/m,z4.s 1453 rev x9,x9 1454.inst 0x05a48108 //revb z8.s,p0/m,z8.s 1455.inst 0x05a4818c //revb z12.s,p0/m,z12.s 1456 rev x11,x11 1457.inst 0x05a48021 //revb z1.s,p0/m,z1.s 1458.inst 0x05a480a5 //revb z5.s,p0/m,z5.s 1459 rev x13,x13 1460.inst 0x05a48129 //revb z9.s,p0/m,z9.s 1461.inst 0x05a481ad //revb z13.s,p0/m,z13.s 1462 rev x15,x15 1463.inst 0x05a48042 //revb z2.s,p0/m,z2.s 1464.inst 0x05a480c6 //revb z6.s,p0/m,z6.s 1465 rev x17,x17 1466.inst 0x05a4814a //revb z10.s,p0/m,z10.s 1467.inst 0x05a481ce //revb z14.s,p0/m,z14.s 1468 rev x19,x19 1469.inst 0x05a48063 //revb z3.s,p0/m,z3.s 1470.inst 0x05a480e7 //revb z7.s,p0/m,z7.s 1471 rev x21,x21 1472.inst 0x05a4816b //revb z11.s,p0/m,z11.s 1473.inst 0x05a481ef //revb z15.s,p0/m,z15.s 1474#endif 1475.if mixin == 1 1476 add x29,x29,#1 1477.endif 1478 cmp x5,4 1479 b.ne 200f 1480.if mixin == 1 1481 eor x7,x7,x8 1482.endif 1483.if mixin == 1 1484 eor x9,x9,x10 1485.endif 1486.if mixin == 1 1487 eor x11,x11,x12 1488.endif 1489.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s 1490.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 1491.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 1492.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 1493 1494.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 1495.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 1496.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 1497.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 1498 1499.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 1500.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 1501.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 1502.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 1503 1504.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 1505.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 1506.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 1507.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 1508.if mixin 
== 1 1509 eor x13,x13,x14 1510.endif 1511.if mixin == 1 1512 eor x15,x15,x16 1513.endif 1514.if mixin == 1 1515 eor x17,x17,x18 1516.endif 1517.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 1518.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 1519.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 1520.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 1521 1522.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 1523.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 1524.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 1525.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 1526 1527.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 1528.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 1529.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 1530.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 1531 1532.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 1533.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d 1534.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 1535.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 1536.if mixin == 1 1537 eor x19,x19,x20 1538.endif 1539.if mixin == 1 1540 eor x21,x21,x22 1541.endif 1542 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 1543 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 1544.inst 0x04b13000 //eor z0.d,z0.d,z17.d 1545.inst 0x04b23021 //eor z1.d,z1.d,z18.d 1546.inst 0x04b33042 //eor z2.d,z2.d,z19.d 1547.inst 0x04b43063 //eor z3.d,z3.d,z20.d 1548.inst 0x04b53084 //eor z4.d,z4.d,z21.d 1549.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 1550.inst 0x04b730c6 //eor z6.d,z6.d,z23.d 1551.inst 0x04b830e7 //eor z7.d,z7.d,z24.d 1552 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 1553 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 1554.if mixin == 1 1555 stp x7,x9,[x0],#16 1556.endif 1557.inst 0x04b13108 //eor z8.d,z8.d,z17.d 1558.inst 0x04b23129 //eor z9.d,z9.d,z18.d 1559.if mixin == 1 1560 stp x11,x13,[x0],#16 1561.endif 1562.inst 0x04b3314a //eor z10.d,z10.d,z19.d 1563.inst 0x04b4316b //eor z11.d,z11.d,z20.d 1564.if mixin == 1 1565 stp x15,x17,[x0],#16 1566.endif 1567.inst 0x04b5318c //eor z12.d,z12.d,z21.d 1568.inst 0x04b631ad //eor z13.d,z13.d,z22.d 1569.if mixin == 1 1570 stp x19,x21,[x0],#16 
1571.endif 1572.inst 0x04b731ce //eor z14.d,z14.d,z23.d 1573.inst 0x04b831ef //eor z15.d,z15.d,z24.d 1574 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 1575 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1576 st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 1577 st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 1578 b 210f 1579200: 1580.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s 1581.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s 1582.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s 1583.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s 1584 1585.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s 1586.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s 1587.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s 1588.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s 1589 1590.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 1591.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d 1592.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d 1593.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d 1594 1595.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d 1596.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 1597.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d 1598.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d 1599.if mixin == 1 1600 eor x7,x7,x8 1601.endif 1602.if mixin == 1 1603 eor x9,x9,x10 1604.endif 1605.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s 1606.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s 1607.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s 1608.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s 1609 1610.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s 1611.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s 1612.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s 1613.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s 1614 1615.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d 1616.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d 1617.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 1618.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d 1619 1620.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d 1621.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d 1622.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d 1623.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 1624.if mixin == 1 1625 eor x11,x11,x12 1626.endif 1627.if mixin == 1 1628 eor x13,x13,x14 1629.endif 1630.inst 0x05a46011 
//zip1 z17.s,z0.s,z4.s 1631.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 1632.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 1633.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 1634 1635.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 1636.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 1637.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 1638.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 1639 1640.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 1641.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 1642.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 1643.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 1644 1645.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 1646.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 1647.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 1648.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 1649.if mixin == 1 1650 eor x15,x15,x16 1651.endif 1652.if mixin == 1 1653 eor x17,x17,x18 1654.endif 1655.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 1656.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 1657.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 1658.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 1659 1660.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 1661.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 1662.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 1663.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 1664 1665.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 1666.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 1667.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 1668.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 1669 1670.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 1671.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d 1672.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 1673.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 1674.if mixin == 1 1675 eor x19,x19,x20 1676.endif 1677.if mixin == 1 1678 eor x21,x21,x22 1679.endif 1680.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 1681.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 1682.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 1683.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 1684.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] 1685.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 
1686.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 1687.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 1688.inst 0x04215101 //addvl x1,x1,8 1689.inst 0x04b13000 //eor z0.d,z0.d,z17.d 1690.inst 0x04b23084 //eor z4.d,z4.d,z18.d 1691.inst 0x04b33108 //eor z8.d,z8.d,z19.d 1692.inst 0x04b4318c //eor z12.d,z12.d,z20.d 1693.inst 0x04b53021 //eor z1.d,z1.d,z21.d 1694.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 1695.inst 0x04b73129 //eor z9.d,z9.d,z23.d 1696.inst 0x04b831ad //eor z13.d,z13.d,z24.d 1697.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 1698.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 1699.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 1700.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 1701.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] 1702.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 1703.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 1704.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 1705.inst 0x04215101 //addvl x1,x1,8 1706.if mixin == 1 1707 stp x7,x9,[x0],#16 1708.endif 1709.inst 0x04b13042 //eor z2.d,z2.d,z17.d 1710.inst 0x04b230c6 //eor z6.d,z6.d,z18.d 1711.if mixin == 1 1712 stp x11,x13,[x0],#16 1713.endif 1714.inst 0x04b3314a //eor z10.d,z10.d,z19.d 1715.inst 0x04b431ce //eor z14.d,z14.d,z20.d 1716.if mixin == 1 1717 stp x15,x17,[x0],#16 1718.endif 1719.inst 0x04b53063 //eor z3.d,z3.d,z21.d 1720.inst 0x04b630e7 //eor z7.d,z7.d,z22.d 1721.if mixin == 1 1722 stp x19,x21,[x0],#16 1723.endif 1724.inst 0x04b7316b //eor z11.d,z11.d,z23.d 1725.inst 0x04b831ef //eor z15.d,z15.d,z24.d 1726.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] 1727.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL] 1728.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] 1729.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] 1730.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] 1731.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] 1732.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] 1733.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] 1734.inst 
0x04205100 //addvl x0,x0,8 1735.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] 1736.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] 1737.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] 1738.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] 1739.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] 1740.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL] 1741.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] 1742.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] 1743.inst 0x04205100 //addvl x0,x0,8 1744210: 1745.inst 0x04b0e3fd //incw x29, ALL, MUL #1 1746110: 1747 b 2f 17481: 1749.align 5 1750100: 1751 subs x7,x2,x5,lsl #6 1752 b.lt 110f 1753 mov x2,x7 1754 b.eq 101f 1755 cmp x2,64 1756 b.lt 101f 1757 mixin=1 1758 lsr x8,x23,#32 1759.inst 0x05a03ae0 //dup z0.s,w23 1760.inst 0x05a03af9 //dup z25.s,w23 1761.if mixin == 1 1762 mov w7,w23 1763.endif 1764.inst 0x05a03904 //dup z4.s,w8 1765.inst 0x05a0391a //dup z26.s,w8 1766 lsr x10,x24,#32 1767.inst 0x05a03b08 //dup z8.s,w24 1768.inst 0x05a03b1b //dup z27.s,w24 1769.if mixin == 1 1770 mov w9,w24 1771.endif 1772.inst 0x05a0394c //dup z12.s,w10 1773.inst 0x05a0395c //dup z28.s,w10 1774 lsr x12,x25,#32 1775.inst 0x05a03b21 //dup z1.s,w25 1776.inst 0x05a03b3d //dup z29.s,w25 1777.if mixin == 1 1778 mov w11,w25 1779.endif 1780.inst 0x05a03985 //dup z5.s,w12 1781.inst 0x05a0399e //dup z30.s,w12 1782 lsr x14,x26,#32 1783.inst 0x05a03b49 //dup z9.s,w26 1784.inst 0x05a03b55 //dup z21.s,w26 1785.if mixin == 1 1786 mov w13,w26 1787.endif 1788.inst 0x05a039cd //dup z13.s,w14 1789.inst 0x05a039d6 //dup z22.s,w14 1790 lsr x16,x27,#32 1791.inst 0x05a03b62 //dup z2.s,w27 1792.inst 0x05a03b77 //dup z23.s,w27 1793.if mixin == 1 1794 mov w15,w27 1795.endif 1796.inst 0x05a03a06 //dup z6.s,w16 1797.inst 0x05a03a18 //dup z24.s,w16 1798 lsr x18,x28,#32 1799.inst 0x05a03b8a //dup z10.s,w28 1800.if mixin == 1 1801 mov w17,w28 1802.endif 1803.inst 0x05a03a4e //dup z14.s,w18 1804 lsr x22,x30,#32 1805.inst 0x05a03bcb //dup z11.s,w30 1806.if mixin == 1 1807 
mov w21,w30 1808.endif 1809.inst 0x05a03acf //dup z15.s,w22 1810.if mixin == 1 1811 add w20,w29,#1 1812 mov w19,w29 1813.inst 0x04a14690 //index z16.s,w20,1 1814.inst 0x04a14683 //index z3.s,w20,1 1815.else 1816.inst 0x04a147b0 //index z16.s,w29,1 1817.inst 0x04a147a3 //index z3.s,w29,1 1818.endif 1819 lsr x20,x29,#32 1820.inst 0x05a03a87 //dup z7.s,w20 1821 mov x6,#10 182210: 1823.align 5 1824.inst 0x04a10000 //add z0.s,z0.s,z1.s 1825.if mixin == 1 1826 add w7,w7,w11 1827.endif 1828.inst 0x04a50084 //add z4.s,z4.s,z5.s 1829.if mixin == 1 1830 add w8,w8,w12 1831.endif 1832.inst 0x04a90108 //add z8.s,z8.s,z9.s 1833.if mixin == 1 1834 add w9,w9,w13 1835.endif 1836.inst 0x04ad018c //add z12.s,z12.s,z13.s 1837.if mixin == 1 1838 add w10,w10,w14 1839.endif 1840.inst 0x04a03063 //eor z3.d,z3.d,z0.d 1841.if mixin == 1 1842 eor w19,w19,w7 1843.endif 1844.inst 0x04a430e7 //eor z7.d,z7.d,z4.d 1845.if mixin == 1 1846 eor w20,w20,w8 1847.endif 1848.inst 0x04a8316b //eor z11.d,z11.d,z8.d 1849.if mixin == 1 1850 eor w21,w21,w9 1851.endif 1852.inst 0x04ac31ef //eor z15.d,z15.d,z12.d 1853.if mixin == 1 1854 eor w22,w22,w10 1855.endif 1856.inst 0x05a58063 //revh z3.s,p0/m,z3.s 1857.if mixin == 1 1858 ror w19,w19,#16 1859.endif 1860.inst 0x05a580e7 //revh z7.s,p0/m,z7.s 1861.if mixin == 1 1862 ror w20,w20,#16 1863.endif 1864.inst 0x05a5816b //revh z11.s,p0/m,z11.s 1865.if mixin == 1 1866 ror w21,w21,#16 1867.endif 1868.inst 0x05a581ef //revh z15.s,p0/m,z15.s 1869.if mixin == 1 1870 ror w22,w22,#16 1871.endif 1872.inst 0x04a30042 //add z2.s,z2.s,z3.s 1873.if mixin == 1 1874 add w15,w15,w19 1875.endif 1876.inst 0x04a700c6 //add z6.s,z6.s,z7.s 1877.if mixin == 1 1878 add w16,w16,w20 1879.endif 1880.inst 0x04ab014a //add z10.s,z10.s,z11.s 1881.if mixin == 1 1882 add w17,w17,w21 1883.endif 1884.inst 0x04af01ce //add z14.s,z14.s,z15.s 1885.if mixin == 1 1886 add w18,w18,w22 1887.endif 1888.inst 0x04a23021 //eor z1.d,z1.d,z2.d 1889.if mixin == 1 1890 eor w11,w11,w15 1891.endif 1892.inst 
0x04a630a5 //eor z5.d,z5.d,z6.d 1893.if mixin == 1 1894 eor w12,w12,w16 1895.endif 1896.inst 0x04aa3129 //eor z9.d,z9.d,z10.d 1897.if mixin == 1 1898 eor w13,w13,w17 1899.endif 1900.inst 0x04ae31ad //eor z13.d,z13.d,z14.d 1901.if mixin == 1 1902 eor w14,w14,w18 1903.endif 1904.inst 0x046c9c31 //lsl z17.s,z1.s,12 1905.inst 0x046c9cb2 //lsl z18.s,z5.s,12 1906.inst 0x046c9d33 //lsl z19.s,z9.s,12 1907.inst 0x046c9db4 //lsl z20.s,z13.s,12 1908.inst 0x046c9421 //lsr z1.s,z1.s,20 1909.if mixin == 1 1910 ror w11,w11,20 1911.endif 1912.inst 0x046c94a5 //lsr z5.s,z5.s,20 1913.if mixin == 1 1914 ror w12,w12,20 1915.endif 1916.inst 0x046c9529 //lsr z9.s,z9.s,20 1917.if mixin == 1 1918 ror w13,w13,20 1919.endif 1920.inst 0x046c95ad //lsr z13.s,z13.s,20 1921.if mixin == 1 1922 ror w14,w14,20 1923.endif 1924.inst 0x04713021 //orr z1.d,z1.d,z17.d 1925.inst 0x047230a5 //orr z5.d,z5.d,z18.d 1926.inst 0x04733129 //orr z9.d,z9.d,z19.d 1927.inst 0x047431ad //orr z13.d,z13.d,z20.d 1928.inst 0x04a10000 //add z0.s,z0.s,z1.s 1929.if mixin == 1 1930 add w7,w7,w11 1931.endif 1932.inst 0x04a50084 //add z4.s,z4.s,z5.s 1933.if mixin == 1 1934 add w8,w8,w12 1935.endif 1936.inst 0x04a90108 //add z8.s,z8.s,z9.s 1937.if mixin == 1 1938 add w9,w9,w13 1939.endif 1940.inst 0x04ad018c //add z12.s,z12.s,z13.s 1941.if mixin == 1 1942 add w10,w10,w14 1943.endif 1944.inst 0x04a03063 //eor z3.d,z3.d,z0.d 1945.if mixin == 1 1946 eor w19,w19,w7 1947.endif 1948.inst 0x04a430e7 //eor z7.d,z7.d,z4.d 1949.if mixin == 1 1950 eor w20,w20,w8 1951.endif 1952.inst 0x04a8316b //eor z11.d,z11.d,z8.d 1953.if mixin == 1 1954 eor w21,w21,w9 1955.endif 1956.inst 0x04ac31ef //eor z15.d,z15.d,z12.d 1957.if mixin == 1 1958 eor w22,w22,w10 1959.endif 1960.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b 1961.if mixin == 1 1962 ror w19,w19,#24 1963.endif 1964.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b 1965.if mixin == 1 1966 ror w20,w20,#24 1967.endif 1968.inst 0x053f316b //tbl z11.b,{z11.b},z31.b 1969.if mixin == 1 1970 ror w21,w21,#24 
1971.endif 1972.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b 1973.if mixin == 1 1974 ror w22,w22,#24 1975.endif 1976.inst 0x04a30042 //add z2.s,z2.s,z3.s 1977.if mixin == 1 1978 add w15,w15,w19 1979.endif 1980.inst 0x04a700c6 //add z6.s,z6.s,z7.s 1981.if mixin == 1 1982 add w16,w16,w20 1983.endif 1984.inst 0x04ab014a //add z10.s,z10.s,z11.s 1985.if mixin == 1 1986 add w17,w17,w21 1987.endif 1988.inst 0x04af01ce //add z14.s,z14.s,z15.s 1989.if mixin == 1 1990 add w18,w18,w22 1991.endif 1992.inst 0x04a23021 //eor z1.d,z1.d,z2.d 1993.if mixin == 1 1994 eor w11,w11,w15 1995.endif 1996.inst 0x04a630a5 //eor z5.d,z5.d,z6.d 1997.if mixin == 1 1998 eor w12,w12,w16 1999.endif 2000.inst 0x04aa3129 //eor z9.d,z9.d,z10.d 2001.if mixin == 1 2002 eor w13,w13,w17 2003.endif 2004.inst 0x04ae31ad //eor z13.d,z13.d,z14.d 2005.if mixin == 1 2006 eor w14,w14,w18 2007.endif 2008.inst 0x04679c31 //lsl z17.s,z1.s,7 2009.inst 0x04679cb2 //lsl z18.s,z5.s,7 2010.inst 0x04679d33 //lsl z19.s,z9.s,7 2011.inst 0x04679db4 //lsl z20.s,z13.s,7 2012.inst 0x04679421 //lsr z1.s,z1.s,25 2013.if mixin == 1 2014 ror w11,w11,25 2015.endif 2016.inst 0x046794a5 //lsr z5.s,z5.s,25 2017.if mixin == 1 2018 ror w12,w12,25 2019.endif 2020.inst 0x04679529 //lsr z9.s,z9.s,25 2021.if mixin == 1 2022 ror w13,w13,25 2023.endif 2024.inst 0x046795ad //lsr z13.s,z13.s,25 2025.if mixin == 1 2026 ror w14,w14,25 2027.endif 2028.inst 0x04713021 //orr z1.d,z1.d,z17.d 2029.inst 0x047230a5 //orr z5.d,z5.d,z18.d 2030.inst 0x04733129 //orr z9.d,z9.d,z19.d 2031.inst 0x047431ad //orr z13.d,z13.d,z20.d 2032.inst 0x04a50000 //add z0.s,z0.s,z5.s 2033.if mixin == 1 2034 add w7,w7,w12 2035.endif 2036.inst 0x04a90084 //add z4.s,z4.s,z9.s 2037.if mixin == 1 2038 add w8,w8,w13 2039.endif 2040.inst 0x04ad0108 //add z8.s,z8.s,z13.s 2041.if mixin == 1 2042 add w9,w9,w14 2043.endif 2044.inst 0x04a1018c //add z12.s,z12.s,z1.s 2045.if mixin == 1 2046 add w10,w10,w11 2047.endif 2048.inst 0x04a031ef //eor z15.d,z15.d,z0.d 2049.if mixin == 1 2050 
eor w22,w22,w7 2051.endif 2052.inst 0x04a43063 //eor z3.d,z3.d,z4.d 2053.if mixin == 1 2054 eor w19,w19,w8 2055.endif 2056.inst 0x04a830e7 //eor z7.d,z7.d,z8.d 2057.if mixin == 1 2058 eor w20,w20,w9 2059.endif 2060.inst 0x04ac316b //eor z11.d,z11.d,z12.d 2061.if mixin == 1 2062 eor w21,w21,w10 2063.endif 2064.inst 0x05a581ef //revh z15.s,p0/m,z15.s 2065.if mixin == 1 2066 ror w22,w22,#16 2067.endif 2068.inst 0x05a58063 //revh z3.s,p0/m,z3.s 2069.if mixin == 1 2070 ror w19,w19,#16 2071.endif 2072.inst 0x05a580e7 //revh z7.s,p0/m,z7.s 2073.if mixin == 1 2074 ror w20,w20,#16 2075.endif 2076.inst 0x05a5816b //revh z11.s,p0/m,z11.s 2077.if mixin == 1 2078 ror w21,w21,#16 2079.endif 2080.inst 0x04af014a //add z10.s,z10.s,z15.s 2081.if mixin == 1 2082 add w17,w17,w22 2083.endif 2084.inst 0x04a301ce //add z14.s,z14.s,z3.s 2085.if mixin == 1 2086 add w18,w18,w19 2087.endif 2088.inst 0x04a70042 //add z2.s,z2.s,z7.s 2089.if mixin == 1 2090 add w15,w15,w20 2091.endif 2092.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 2093.if mixin == 1 2094 add w16,w16,w21 2095.endif 2096.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d 2097.if mixin == 1 2098 eor w12,w12,w17 2099.endif 2100.inst 0x04ae3129 //eor z9.d,z9.d,z14.d 2101.if mixin == 1 2102 eor w13,w13,w18 2103.endif 2104.inst 0x04a231ad //eor z13.d,z13.d,z2.d 2105.if mixin == 1 2106 eor w14,w14,w15 2107.endif 2108.inst 0x04a63021 //eor z1.d,z1.d,z6.d 2109.if mixin == 1 2110 eor w11,w11,w16 2111.endif 2112.inst 0x046c9cb1 //lsl z17.s,z5.s,12 2113.inst 0x046c9d32 //lsl z18.s,z9.s,12 2114.inst 0x046c9db3 //lsl z19.s,z13.s,12 2115.inst 0x046c9c34 //lsl z20.s,z1.s,12 2116.inst 0x046c94a5 //lsr z5.s,z5.s,20 2117.if mixin == 1 2118 ror w12,w12,20 2119.endif 2120.inst 0x046c9529 //lsr z9.s,z9.s,20 2121.if mixin == 1 2122 ror w13,w13,20 2123.endif 2124.inst 0x046c95ad //lsr z13.s,z13.s,20 2125.if mixin == 1 2126 ror w14,w14,20 2127.endif 2128.inst 0x046c9421 //lsr z1.s,z1.s,20 2129.if mixin == 1 2130 ror w11,w11,20 2131.endif 2132.inst 0x047130a5 //orr 
z5.d,z5.d,z17.d 2133.inst 0x04723129 //orr z9.d,z9.d,z18.d 2134.inst 0x047331ad //orr z13.d,z13.d,z19.d 2135.inst 0x04743021 //orr z1.d,z1.d,z20.d 2136.inst 0x04a50000 //add z0.s,z0.s,z5.s 2137.if mixin == 1 2138 add w7,w7,w12 2139.endif 2140.inst 0x04a90084 //add z4.s,z4.s,z9.s 2141.if mixin == 1 2142 add w8,w8,w13 2143.endif 2144.inst 0x04ad0108 //add z8.s,z8.s,z13.s 2145.if mixin == 1 2146 add w9,w9,w14 2147.endif 2148.inst 0x04a1018c //add z12.s,z12.s,z1.s 2149.if mixin == 1 2150 add w10,w10,w11 2151.endif 2152.inst 0x04a031ef //eor z15.d,z15.d,z0.d 2153.if mixin == 1 2154 eor w22,w22,w7 2155.endif 2156.inst 0x04a43063 //eor z3.d,z3.d,z4.d 2157.if mixin == 1 2158 eor w19,w19,w8 2159.endif 2160.inst 0x04a830e7 //eor z7.d,z7.d,z8.d 2161.if mixin == 1 2162 eor w20,w20,w9 2163.endif 2164.inst 0x04ac316b //eor z11.d,z11.d,z12.d 2165.if mixin == 1 2166 eor w21,w21,w10 2167.endif 2168.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b 2169.if mixin == 1 2170 ror w22,w22,#24 2171.endif 2172.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b 2173.if mixin == 1 2174 ror w19,w19,#24 2175.endif 2176.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b 2177.if mixin == 1 2178 ror w20,w20,#24 2179.endif 2180.inst 0x053f316b //tbl z11.b,{z11.b},z31.b 2181.if mixin == 1 2182 ror w21,w21,#24 2183.endif 2184.inst 0x04af014a //add z10.s,z10.s,z15.s 2185.if mixin == 1 2186 add w17,w17,w22 2187.endif 2188.inst 0x04a301ce //add z14.s,z14.s,z3.s 2189.if mixin == 1 2190 add w18,w18,w19 2191.endif 2192.inst 0x04a70042 //add z2.s,z2.s,z7.s 2193.if mixin == 1 2194 add w15,w15,w20 2195.endif 2196.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 2197.if mixin == 1 2198 add w16,w16,w21 2199.endif 2200.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d 2201.if mixin == 1 2202 eor w12,w12,w17 2203.endif 2204.inst 0x04ae3129 //eor z9.d,z9.d,z14.d 2205.if mixin == 1 2206 eor w13,w13,w18 2207.endif 2208.inst 0x04a231ad //eor z13.d,z13.d,z2.d 2209.if mixin == 1 2210 eor w14,w14,w15 2211.endif 2212.inst 0x04a63021 //eor z1.d,z1.d,z6.d 2213.if mixin == 1 
2214 eor w11,w11,w16 2215.endif 2216.inst 0x04679cb1 //lsl z17.s,z5.s,7 2217.inst 0x04679d32 //lsl z18.s,z9.s,7 2218.inst 0x04679db3 //lsl z19.s,z13.s,7 2219.inst 0x04679c34 //lsl z20.s,z1.s,7 2220.inst 0x046794a5 //lsr z5.s,z5.s,25 2221.if mixin == 1 2222 ror w12,w12,25 2223.endif 2224.inst 0x04679529 //lsr z9.s,z9.s,25 2225.if mixin == 1 2226 ror w13,w13,25 2227.endif 2228.inst 0x046795ad //lsr z13.s,z13.s,25 2229.if mixin == 1 2230 ror w14,w14,25 2231.endif 2232.inst 0x04679421 //lsr z1.s,z1.s,25 2233.if mixin == 1 2234 ror w11,w11,25 2235.endif 2236.inst 0x047130a5 //orr z5.d,z5.d,z17.d 2237.inst 0x04723129 //orr z9.d,z9.d,z18.d 2238.inst 0x047331ad //orr z13.d,z13.d,z19.d 2239.inst 0x04743021 //orr z1.d,z1.d,z20.d 2240 sub x6,x6,1 2241 cbnz x6,10b 2242 lsr x6,x28,#32 2243.inst 0x05a03b91 //dup z17.s,w28 2244.inst 0x05a038d2 //dup z18.s,w6 2245 lsr x6,x29,#32 2246.inst 0x05a038d3 //dup z19.s,w6 2247 lsr x6,x30,#32 2248.if mixin == 1 2249 add w7,w7,w23 2250.endif 2251.inst 0x04b90000 //add z0.s,z0.s,z25.s 2252.if mixin == 1 2253 add x8,x8,x23,lsr #32 2254.endif 2255.inst 0x04ba0084 //add z4.s,z4.s,z26.s 2256.if mixin == 1 2257 add x7,x7,x8,lsl #32 // pack 2258.endif 2259.if mixin == 1 2260 add w9,w9,w24 2261.endif 2262.inst 0x04bb0108 //add z8.s,z8.s,z27.s 2263.if mixin == 1 2264 add x10,x10,x24,lsr #32 2265.endif 2266.inst 0x04bc018c //add z12.s,z12.s,z28.s 2267.if mixin == 1 2268 add x9,x9,x10,lsl #32 // pack 2269.endif 2270.if mixin == 1 2271 ldp x8,x10,[x1],#16 2272.endif 2273.if mixin == 1 2274 add w11,w11,w25 2275.endif 2276.inst 0x04bd0021 //add z1.s,z1.s,z29.s 2277.if mixin == 1 2278 add x12,x12,x25,lsr #32 2279.endif 2280.inst 0x04be00a5 //add z5.s,z5.s,z30.s 2281.if mixin == 1 2282 add x11,x11,x12,lsl #32 // pack 2283.endif 2284.if mixin == 1 2285 add w13,w13,w26 2286.endif 2287.inst 0x04b50129 //add z9.s,z9.s,z21.s 2288.if mixin == 1 2289 add x14,x14,x26,lsr #32 2290.endif 2291.inst 0x04b601ad //add z13.s,z13.s,z22.s 2292.if mixin == 1 2293 add 
x13,x13,x14,lsl #32 // pack 2294.endif 2295.if mixin == 1 2296 ldp x12,x14,[x1],#16 2297.endif 2298.if mixin == 1 2299 add w15,w15,w27 2300.endif 2301.inst 0x04b70042 //add z2.s,z2.s,z23.s 2302.if mixin == 1 2303 add x16,x16,x27,lsr #32 2304.endif 2305.inst 0x04b800c6 //add z6.s,z6.s,z24.s 2306.if mixin == 1 2307 add x15,x15,x16,lsl #32 // pack 2308.endif 2309.if mixin == 1 2310 add w17,w17,w28 2311.endif 2312.inst 0x04b1014a //add z10.s,z10.s,z17.s 2313.if mixin == 1 2314 add x18,x18,x28,lsr #32 2315.endif 2316.inst 0x04b201ce //add z14.s,z14.s,z18.s 2317.if mixin == 1 2318 add x17,x17,x18,lsl #32 // pack 2319.endif 2320.if mixin == 1 2321 ldp x16,x18,[x1],#16 2322.endif 2323.inst 0x05a03bd4 //dup z20.s,w30 2324.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE 2325.if mixin == 1 2326 add w19,w19,w29 2327.endif 2328.inst 0x04b00063 //add z3.s,z3.s,z16.s 2329.if mixin == 1 2330 add x20,x20,x29,lsr #32 2331.endif 2332.inst 0x04b300e7 //add z7.s,z7.s,z19.s 2333.if mixin == 1 2334 add x19,x19,x20,lsl #32 // pack 2335.endif 2336.if mixin == 1 2337 add w21,w21,w30 2338.endif 2339.inst 0x04b4016b //add z11.s,z11.s,z20.s 2340.if mixin == 1 2341 add x22,x22,x30,lsr #32 2342.endif 2343.inst 0x04b901ef //add z15.s,z15.s,z25.s 2344.if mixin == 1 2345 add x21,x21,x22,lsl #32 // pack 2346.endif 2347.if mixin == 1 2348 ldp x20,x22,[x1],#16 2349.endif 2350#ifdef __AARCH64EB__ 2351 rev x7,x7 2352.inst 0x05a48000 //revb z0.s,p0/m,z0.s 2353.inst 0x05a48084 //revb z4.s,p0/m,z4.s 2354 rev x9,x9 2355.inst 0x05a48108 //revb z8.s,p0/m,z8.s 2356.inst 0x05a4818c //revb z12.s,p0/m,z12.s 2357 rev x11,x11 2358.inst 0x05a48021 //revb z1.s,p0/m,z1.s 2359.inst 0x05a480a5 //revb z5.s,p0/m,z5.s 2360 rev x13,x13 2361.inst 0x05a48129 //revb z9.s,p0/m,z9.s 2362.inst 0x05a481ad //revb z13.s,p0/m,z13.s 2363 rev x15,x15 2364.inst 0x05a48042 //revb z2.s,p0/m,z2.s 2365.inst 0x05a480c6 //revb z6.s,p0/m,z6.s 2366 rev x17,x17 2367.inst 0x05a4814a //revb z10.s,p0/m,z10.s 2368.inst 0x05a481ce 
//revb z14.s,p0/m,z14.s 2369 rev x19,x19 2370.inst 0x05a48063 //revb z3.s,p0/m,z3.s 2371.inst 0x05a480e7 //revb z7.s,p0/m,z7.s 2372 rev x21,x21 2373.inst 0x05a4816b //revb z11.s,p0/m,z11.s 2374.inst 0x05a481ef //revb z15.s,p0/m,z15.s 2375#endif 2376.if mixin == 1 2377 add x29,x29,#1 2378.endif 2379 cmp x5,4 2380 b.ne 200f 2381.if mixin == 1 2382 eor x7,x7,x8 2383.endif 2384.if mixin == 1 2385 eor x9,x9,x10 2386.endif 2387.if mixin == 1 2388 eor x11,x11,x12 2389.endif 2390.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s 2391.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 2392.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 2393.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 2394 2395.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 2396.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 2397.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 2398.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 2399 2400.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 2401.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 2402.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 2403.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 2404 2405.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 2406.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 2407.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 2408.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 2409.if mixin == 1 2410 eor x13,x13,x14 2411.endif 2412.if mixin == 1 2413 eor x15,x15,x16 2414.endif 2415.if mixin == 1 2416 eor x17,x17,x18 2417.endif 2418.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 2419.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 2420.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 2421.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 2422 2423.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 2424.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 2425.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 2426.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 2427 2428.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 2429.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 2430.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 2431.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 2432 2433.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 2434.inst 0x05f766a7 //zip2 
z7.d,z21.d,z23.d 2435.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 2436.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 2437.if mixin == 1 2438 eor x19,x19,x20 2439.endif 2440.if mixin == 1 2441 eor x21,x21,x22 2442.endif 2443 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 2444 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 2445.inst 0x04b13000 //eor z0.d,z0.d,z17.d 2446.inst 0x04b23021 //eor z1.d,z1.d,z18.d 2447.inst 0x04b33042 //eor z2.d,z2.d,z19.d 2448.inst 0x04b43063 //eor z3.d,z3.d,z20.d 2449.inst 0x04b53084 //eor z4.d,z4.d,z21.d 2450.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 2451.inst 0x04b730c6 //eor z6.d,z6.d,z23.d 2452.inst 0x04b830e7 //eor z7.d,z7.d,z24.d 2453 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 2454 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 2455.if mixin == 1 2456 stp x7,x9,[x0],#16 2457.endif 2458.inst 0x04b13108 //eor z8.d,z8.d,z17.d 2459.inst 0x04b23129 //eor z9.d,z9.d,z18.d 2460.if mixin == 1 2461 stp x11,x13,[x0],#16 2462.endif 2463.inst 0x04b3314a //eor z10.d,z10.d,z19.d 2464.inst 0x04b4316b //eor z11.d,z11.d,z20.d 2465.if mixin == 1 2466 stp x15,x17,[x0],#16 2467.endif 2468.inst 0x04b5318c //eor z12.d,z12.d,z21.d 2469.inst 0x04b631ad //eor z13.d,z13.d,z22.d 2470.if mixin == 1 2471 stp x19,x21,[x0],#16 2472.endif 2473.inst 0x04b731ce //eor z14.d,z14.d,z23.d 2474.inst 0x04b831ef //eor z15.d,z15.d,z24.d 2475 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 2476 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 2477 st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2478 st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2479 b 210f 2480200: 2481.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s 2482.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s 2483.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s 2484.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s 2485 2486.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s 2487.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s 2488.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s 2489.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s 2490 2491.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 2492.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d 2493.inst 
0x05f46242 //zip1 z2.d,z18.d,z20.d 2494.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d 2495 2496.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d 2497.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 2498.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d 2499.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d 2500.if mixin == 1 2501 eor x7,x7,x8 2502.endif 2503.if mixin == 1 2504 eor x9,x9,x10 2505.endif 2506.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s 2507.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s 2508.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s 2509.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s 2510 2511.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s 2512.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s 2513.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s 2514.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s 2515 2516.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d 2517.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d 2518.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 2519.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d 2520 2521.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d 2522.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d 2523.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d 2524.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 2525.if mixin == 1 2526 eor x11,x11,x12 2527.endif 2528.if mixin == 1 2529 eor x13,x13,x14 2530.endif 2531.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s 2532.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 2533.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 2534.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 2535 2536.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 2537.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 2538.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 2539.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 2540 2541.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 2542.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 2543.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 2544.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 2545 2546.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 2547.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 2548.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 2549.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 2550.if mixin == 1 2551 eor x15,x15,x16 2552.endif 
2553.if mixin == 1 2554 eor x17,x17,x18 2555.endif 2556.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 2557.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 2558.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 2559.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 2560 2561.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 2562.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 2563.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 2564.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 2565 2566.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 2567.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 2568.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 2569.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 2570 2571.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 2572.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d 2573.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 2574.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 2575.if mixin == 1 2576 eor x19,x19,x20 2577.endif 2578.if mixin == 1 2579 eor x21,x21,x22 2580.endif 2581.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 2582.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 2583.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 2584.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 2585.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] 2586.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 2587.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 2588.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 2589.inst 0x04215101 //addvl x1,x1,8 2590.inst 0x04b13000 //eor z0.d,z0.d,z17.d 2591.inst 0x04b23084 //eor z4.d,z4.d,z18.d 2592.inst 0x04b33108 //eor z8.d,z8.d,z19.d 2593.inst 0x04b4318c //eor z12.d,z12.d,z20.d 2594.inst 0x04b53021 //eor z1.d,z1.d,z21.d 2595.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 2596.inst 0x04b73129 //eor z9.d,z9.d,z23.d 2597.inst 0x04b831ad //eor z13.d,z13.d,z24.d 2598.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 2599.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 2600.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 2601.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 2602.inst 0xa544a035 //ld1w 
{z21.s},p0/z,[x1,#4,MUL VL] 2603.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 2604.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 2605.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 2606.inst 0x04215101 //addvl x1,x1,8 2607.if mixin == 1 2608 stp x7,x9,[x0],#16 2609.endif 2610.inst 0x04b13042 //eor z2.d,z2.d,z17.d 2611.inst 0x04b230c6 //eor z6.d,z6.d,z18.d 2612.if mixin == 1 2613 stp x11,x13,[x0],#16 2614.endif 2615.inst 0x04b3314a //eor z10.d,z10.d,z19.d 2616.inst 0x04b431ce //eor z14.d,z14.d,z20.d 2617.if mixin == 1 2618 stp x15,x17,[x0],#16 2619.endif 2620.inst 0x04b53063 //eor z3.d,z3.d,z21.d 2621.inst 0x04b630e7 //eor z7.d,z7.d,z22.d 2622.if mixin == 1 2623 stp x19,x21,[x0],#16 2624.endif 2625.inst 0x04b7316b //eor z11.d,z11.d,z23.d 2626.inst 0x04b831ef //eor z15.d,z15.d,z24.d 2627.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] 2628.inst 0xe541e004 //st1w {z4.s},p0,[x0,#1,MUL VL] 2629.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] 2630.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] 2631.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] 2632.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] 2633.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] 2634.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] 2635.inst 0x04205100 //addvl x0,x0,8 2636.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] 2637.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] 2638.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] 2639.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] 2640.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] 2641.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL] 2642.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] 2643.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] 2644.inst 0x04205100 //addvl x0,x0,8 2645210: 2646.inst 0x04b0e3fd //incw x29, ALL, MUL #1 2647 subs x2,x2,64 2648 b.gt 100b 2649 b 110f 2650101: 2651 mixin=0 2652 lsr x8,x23,#32 2653.inst 0x05a03ae0 //dup z0.s,w23 2654.inst 0x05a03af9 //dup z25.s,w23 2655.if mixin == 1 2656 mov w7,w23 2657.endif 
2658.inst 0x05a03904 //dup z4.s,w8 2659.inst 0x05a0391a //dup z26.s,w8 2660 lsr x10,x24,#32 2661.inst 0x05a03b08 //dup z8.s,w24 2662.inst 0x05a03b1b //dup z27.s,w24 2663.if mixin == 1 2664 mov w9,w24 2665.endif 2666.inst 0x05a0394c //dup z12.s,w10 2667.inst 0x05a0395c //dup z28.s,w10 2668 lsr x12,x25,#32 2669.inst 0x05a03b21 //dup z1.s,w25 2670.inst 0x05a03b3d //dup z29.s,w25 2671.if mixin == 1 2672 mov w11,w25 2673.endif 2674.inst 0x05a03985 //dup z5.s,w12 2675.inst 0x05a0399e //dup z30.s,w12 2676 lsr x14,x26,#32 2677.inst 0x05a03b49 //dup z9.s,w26 2678.inst 0x05a03b55 //dup z21.s,w26 2679.if mixin == 1 2680 mov w13,w26 2681.endif 2682.inst 0x05a039cd //dup z13.s,w14 2683.inst 0x05a039d6 //dup z22.s,w14 2684 lsr x16,x27,#32 2685.inst 0x05a03b62 //dup z2.s,w27 2686.inst 0x05a03b77 //dup z23.s,w27 2687.if mixin == 1 2688 mov w15,w27 2689.endif 2690.inst 0x05a03a06 //dup z6.s,w16 2691.inst 0x05a03a18 //dup z24.s,w16 2692 lsr x18,x28,#32 2693.inst 0x05a03b8a //dup z10.s,w28 2694.if mixin == 1 2695 mov w17,w28 2696.endif 2697.inst 0x05a03a4e //dup z14.s,w18 2698 lsr x22,x30,#32 2699.inst 0x05a03bcb //dup z11.s,w30 2700.if mixin == 1 2701 mov w21,w30 2702.endif 2703.inst 0x05a03acf //dup z15.s,w22 2704.if mixin == 1 2705 add w20,w29,#1 2706 mov w19,w29 2707.inst 0x04a14690 //index z16.s,w20,1 2708.inst 0x04a14683 //index z3.s,w20,1 2709.else 2710.inst 0x04a147b0 //index z16.s,w29,1 2711.inst 0x04a147a3 //index z3.s,w29,1 2712.endif 2713 lsr x20,x29,#32 2714.inst 0x05a03a87 //dup z7.s,w20 2715 mov x6,#10 271610: 2717.align 5 2718.inst 0x04a10000 //add z0.s,z0.s,z1.s 2719.if mixin == 1 2720 add w7,w7,w11 2721.endif 2722.inst 0x04a50084 //add z4.s,z4.s,z5.s 2723.if mixin == 1 2724 add w8,w8,w12 2725.endif 2726.inst 0x04a90108 //add z8.s,z8.s,z9.s 2727.if mixin == 1 2728 add w9,w9,w13 2729.endif 2730.inst 0x04ad018c //add z12.s,z12.s,z13.s 2731.if mixin == 1 2732 add w10,w10,w14 2733.endif 2734.inst 0x04a03063 //eor z3.d,z3.d,z0.d 2735.if mixin == 1 2736 eor w19,w19,w7 
2737.endif 2738.inst 0x04a430e7 //eor z7.d,z7.d,z4.d 2739.if mixin == 1 2740 eor w20,w20,w8 2741.endif 2742.inst 0x04a8316b //eor z11.d,z11.d,z8.d 2743.if mixin == 1 2744 eor w21,w21,w9 2745.endif 2746.inst 0x04ac31ef //eor z15.d,z15.d,z12.d 2747.if mixin == 1 2748 eor w22,w22,w10 2749.endif 2750.inst 0x05a58063 //revh z3.s,p0/m,z3.s 2751.if mixin == 1 2752 ror w19,w19,#16 2753.endif 2754.inst 0x05a580e7 //revh z7.s,p0/m,z7.s 2755.if mixin == 1 2756 ror w20,w20,#16 2757.endif 2758.inst 0x05a5816b //revh z11.s,p0/m,z11.s 2759.if mixin == 1 2760 ror w21,w21,#16 2761.endif 2762.inst 0x05a581ef //revh z15.s,p0/m,z15.s 2763.if mixin == 1 2764 ror w22,w22,#16 2765.endif 2766.inst 0x04a30042 //add z2.s,z2.s,z3.s 2767.if mixin == 1 2768 add w15,w15,w19 2769.endif 2770.inst 0x04a700c6 //add z6.s,z6.s,z7.s 2771.if mixin == 1 2772 add w16,w16,w20 2773.endif 2774.inst 0x04ab014a //add z10.s,z10.s,z11.s 2775.if mixin == 1 2776 add w17,w17,w21 2777.endif 2778.inst 0x04af01ce //add z14.s,z14.s,z15.s 2779.if mixin == 1 2780 add w18,w18,w22 2781.endif 2782.inst 0x04a23021 //eor z1.d,z1.d,z2.d 2783.if mixin == 1 2784 eor w11,w11,w15 2785.endif 2786.inst 0x04a630a5 //eor z5.d,z5.d,z6.d 2787.if mixin == 1 2788 eor w12,w12,w16 2789.endif 2790.inst 0x04aa3129 //eor z9.d,z9.d,z10.d 2791.if mixin == 1 2792 eor w13,w13,w17 2793.endif 2794.inst 0x04ae31ad //eor z13.d,z13.d,z14.d 2795.if mixin == 1 2796 eor w14,w14,w18 2797.endif 2798.inst 0x046c9c31 //lsl z17.s,z1.s,12 2799.inst 0x046c9cb2 //lsl z18.s,z5.s,12 2800.inst 0x046c9d33 //lsl z19.s,z9.s,12 2801.inst 0x046c9db4 //lsl z20.s,z13.s,12 2802.inst 0x046c9421 //lsr z1.s,z1.s,20 2803.if mixin == 1 2804 ror w11,w11,20 2805.endif 2806.inst 0x046c94a5 //lsr z5.s,z5.s,20 2807.if mixin == 1 2808 ror w12,w12,20 2809.endif 2810.inst 0x046c9529 //lsr z9.s,z9.s,20 2811.if mixin == 1 2812 ror w13,w13,20 2813.endif 2814.inst 0x046c95ad //lsr z13.s,z13.s,20 2815.if mixin == 1 2816 ror w14,w14,20 2817.endif 2818.inst 0x04713021 //orr z1.d,z1.d,z17.d 
2819.inst 0x047230a5 //orr z5.d,z5.d,z18.d 2820.inst 0x04733129 //orr z9.d,z9.d,z19.d 2821.inst 0x047431ad //orr z13.d,z13.d,z20.d 2822.inst 0x04a10000 //add z0.s,z0.s,z1.s 2823.if mixin == 1 2824 add w7,w7,w11 2825.endif 2826.inst 0x04a50084 //add z4.s,z4.s,z5.s 2827.if mixin == 1 2828 add w8,w8,w12 2829.endif 2830.inst 0x04a90108 //add z8.s,z8.s,z9.s 2831.if mixin == 1 2832 add w9,w9,w13 2833.endif 2834.inst 0x04ad018c //add z12.s,z12.s,z13.s 2835.if mixin == 1 2836 add w10,w10,w14 2837.endif 2838.inst 0x04a03063 //eor z3.d,z3.d,z0.d 2839.if mixin == 1 2840 eor w19,w19,w7 2841.endif 2842.inst 0x04a430e7 //eor z7.d,z7.d,z4.d 2843.if mixin == 1 2844 eor w20,w20,w8 2845.endif 2846.inst 0x04a8316b //eor z11.d,z11.d,z8.d 2847.if mixin == 1 2848 eor w21,w21,w9 2849.endif 2850.inst 0x04ac31ef //eor z15.d,z15.d,z12.d 2851.if mixin == 1 2852 eor w22,w22,w10 2853.endif 2854.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b 2855.if mixin == 1 2856 ror w19,w19,#24 2857.endif 2858.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b 2859.if mixin == 1 2860 ror w20,w20,#24 2861.endif 2862.inst 0x053f316b //tbl z11.b,{z11.b},z31.b 2863.if mixin == 1 2864 ror w21,w21,#24 2865.endif 2866.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b 2867.if mixin == 1 2868 ror w22,w22,#24 2869.endif 2870.inst 0x04a30042 //add z2.s,z2.s,z3.s 2871.if mixin == 1 2872 add w15,w15,w19 2873.endif 2874.inst 0x04a700c6 //add z6.s,z6.s,z7.s 2875.if mixin == 1 2876 add w16,w16,w20 2877.endif 2878.inst 0x04ab014a //add z10.s,z10.s,z11.s 2879.if mixin == 1 2880 add w17,w17,w21 2881.endif 2882.inst 0x04af01ce //add z14.s,z14.s,z15.s 2883.if mixin == 1 2884 add w18,w18,w22 2885.endif 2886.inst 0x04a23021 //eor z1.d,z1.d,z2.d 2887.if mixin == 1 2888 eor w11,w11,w15 2889.endif 2890.inst 0x04a630a5 //eor z5.d,z5.d,z6.d 2891.if mixin == 1 2892 eor w12,w12,w16 2893.endif 2894.inst 0x04aa3129 //eor z9.d,z9.d,z10.d 2895.if mixin == 1 2896 eor w13,w13,w17 2897.endif 2898.inst 0x04ae31ad //eor z13.d,z13.d,z14.d 2899.if mixin == 1 2900 eor 
w14,w14,w18 2901.endif 2902.inst 0x04679c31 //lsl z17.s,z1.s,7 2903.inst 0x04679cb2 //lsl z18.s,z5.s,7 2904.inst 0x04679d33 //lsl z19.s,z9.s,7 2905.inst 0x04679db4 //lsl z20.s,z13.s,7 2906.inst 0x04679421 //lsr z1.s,z1.s,25 2907.if mixin == 1 2908 ror w11,w11,25 2909.endif 2910.inst 0x046794a5 //lsr z5.s,z5.s,25 2911.if mixin == 1 2912 ror w12,w12,25 2913.endif 2914.inst 0x04679529 //lsr z9.s,z9.s,25 2915.if mixin == 1 2916 ror w13,w13,25 2917.endif 2918.inst 0x046795ad //lsr z13.s,z13.s,25 2919.if mixin == 1 2920 ror w14,w14,25 2921.endif 2922.inst 0x04713021 //orr z1.d,z1.d,z17.d 2923.inst 0x047230a5 //orr z5.d,z5.d,z18.d 2924.inst 0x04733129 //orr z9.d,z9.d,z19.d 2925.inst 0x047431ad //orr z13.d,z13.d,z20.d 2926.inst 0x04a50000 //add z0.s,z0.s,z5.s 2927.if mixin == 1 2928 add w7,w7,w12 2929.endif 2930.inst 0x04a90084 //add z4.s,z4.s,z9.s 2931.if mixin == 1 2932 add w8,w8,w13 2933.endif 2934.inst 0x04ad0108 //add z8.s,z8.s,z13.s 2935.if mixin == 1 2936 add w9,w9,w14 2937.endif 2938.inst 0x04a1018c //add z12.s,z12.s,z1.s 2939.if mixin == 1 2940 add w10,w10,w11 2941.endif 2942.inst 0x04a031ef //eor z15.d,z15.d,z0.d 2943.if mixin == 1 2944 eor w22,w22,w7 2945.endif 2946.inst 0x04a43063 //eor z3.d,z3.d,z4.d 2947.if mixin == 1 2948 eor w19,w19,w8 2949.endif 2950.inst 0x04a830e7 //eor z7.d,z7.d,z8.d 2951.if mixin == 1 2952 eor w20,w20,w9 2953.endif 2954.inst 0x04ac316b //eor z11.d,z11.d,z12.d 2955.if mixin == 1 2956 eor w21,w21,w10 2957.endif 2958.inst 0x05a581ef //revh z15.s,p0/m,z15.s 2959.if mixin == 1 2960 ror w22,w22,#16 2961.endif 2962.inst 0x05a58063 //revh z3.s,p0/m,z3.s 2963.if mixin == 1 2964 ror w19,w19,#16 2965.endif 2966.inst 0x05a580e7 //revh z7.s,p0/m,z7.s 2967.if mixin == 1 2968 ror w20,w20,#16 2969.endif 2970.inst 0x05a5816b //revh z11.s,p0/m,z11.s 2971.if mixin == 1 2972 ror w21,w21,#16 2973.endif 2974.inst 0x04af014a //add z10.s,z10.s,z15.s 2975.if mixin == 1 2976 add w17,w17,w22 2977.endif 2978.inst 0x04a301ce //add z14.s,z14.s,z3.s 2979.if mixin == 
1 2980 add w18,w18,w19 2981.endif 2982.inst 0x04a70042 //add z2.s,z2.s,z7.s 2983.if mixin == 1 2984 add w15,w15,w20 2985.endif 2986.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 2987.if mixin == 1 2988 add w16,w16,w21 2989.endif 2990.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d 2991.if mixin == 1 2992 eor w12,w12,w17 2993.endif 2994.inst 0x04ae3129 //eor z9.d,z9.d,z14.d 2995.if mixin == 1 2996 eor w13,w13,w18 2997.endif 2998.inst 0x04a231ad //eor z13.d,z13.d,z2.d 2999.if mixin == 1 3000 eor w14,w14,w15 3001.endif 3002.inst 0x04a63021 //eor z1.d,z1.d,z6.d 3003.if mixin == 1 3004 eor w11,w11,w16 3005.endif 3006.inst 0x046c9cb1 //lsl z17.s,z5.s,12 3007.inst 0x046c9d32 //lsl z18.s,z9.s,12 3008.inst 0x046c9db3 //lsl z19.s,z13.s,12 3009.inst 0x046c9c34 //lsl z20.s,z1.s,12 3010.inst 0x046c94a5 //lsr z5.s,z5.s,20 3011.if mixin == 1 3012 ror w12,w12,20 3013.endif 3014.inst 0x046c9529 //lsr z9.s,z9.s,20 3015.if mixin == 1 3016 ror w13,w13,20 3017.endif 3018.inst 0x046c95ad //lsr z13.s,z13.s,20 3019.if mixin == 1 3020 ror w14,w14,20 3021.endif 3022.inst 0x046c9421 //lsr z1.s,z1.s,20 3023.if mixin == 1 3024 ror w11,w11,20 3025.endif 3026.inst 0x047130a5 //orr z5.d,z5.d,z17.d 3027.inst 0x04723129 //orr z9.d,z9.d,z18.d 3028.inst 0x047331ad //orr z13.d,z13.d,z19.d 3029.inst 0x04743021 //orr z1.d,z1.d,z20.d 3030.inst 0x04a50000 //add z0.s,z0.s,z5.s 3031.if mixin == 1 3032 add w7,w7,w12 3033.endif 3034.inst 0x04a90084 //add z4.s,z4.s,z9.s 3035.if mixin == 1 3036 add w8,w8,w13 3037.endif 3038.inst 0x04ad0108 //add z8.s,z8.s,z13.s 3039.if mixin == 1 3040 add w9,w9,w14 3041.endif 3042.inst 0x04a1018c //add z12.s,z12.s,z1.s 3043.if mixin == 1 3044 add w10,w10,w11 3045.endif 3046.inst 0x04a031ef //eor z15.d,z15.d,z0.d 3047.if mixin == 1 3048 eor w22,w22,w7 3049.endif 3050.inst 0x04a43063 //eor z3.d,z3.d,z4.d 3051.if mixin == 1 3052 eor w19,w19,w8 3053.endif 3054.inst 0x04a830e7 //eor z7.d,z7.d,z8.d 3055.if mixin == 1 3056 eor w20,w20,w9 3057.endif 3058.inst 0x04ac316b //eor z11.d,z11.d,z12.d 3059.if 
mixin == 1 3060 eor w21,w21,w10 3061.endif 3062.inst 0x053f31ef //tbl z15.b,{z15.b},z31.b 3063.if mixin == 1 3064 ror w22,w22,#24 3065.endif 3066.inst 0x053f3063 //tbl z3.b,{z3.b},z31.b 3067.if mixin == 1 3068 ror w19,w19,#24 3069.endif 3070.inst 0x053f30e7 //tbl z7.b,{z7.b},z31.b 3071.if mixin == 1 3072 ror w20,w20,#24 3073.endif 3074.inst 0x053f316b //tbl z11.b,{z11.b},z31.b 3075.if mixin == 1 3076 ror w21,w21,#24 3077.endif 3078.inst 0x04af014a //add z10.s,z10.s,z15.s 3079.if mixin == 1 3080 add w17,w17,w22 3081.endif 3082.inst 0x04a301ce //add z14.s,z14.s,z3.s 3083.if mixin == 1 3084 add w18,w18,w19 3085.endif 3086.inst 0x04a70042 //add z2.s,z2.s,z7.s 3087.if mixin == 1 3088 add w15,w15,w20 3089.endif 3090.inst 0x04ab00c6 //add z6.s,z6.s,z11.s 3091.if mixin == 1 3092 add w16,w16,w21 3093.endif 3094.inst 0x04aa30a5 //eor z5.d,z5.d,z10.d 3095.if mixin == 1 3096 eor w12,w12,w17 3097.endif 3098.inst 0x04ae3129 //eor z9.d,z9.d,z14.d 3099.if mixin == 1 3100 eor w13,w13,w18 3101.endif 3102.inst 0x04a231ad //eor z13.d,z13.d,z2.d 3103.if mixin == 1 3104 eor w14,w14,w15 3105.endif 3106.inst 0x04a63021 //eor z1.d,z1.d,z6.d 3107.if mixin == 1 3108 eor w11,w11,w16 3109.endif 3110.inst 0x04679cb1 //lsl z17.s,z5.s,7 3111.inst 0x04679d32 //lsl z18.s,z9.s,7 3112.inst 0x04679db3 //lsl z19.s,z13.s,7 3113.inst 0x04679c34 //lsl z20.s,z1.s,7 3114.inst 0x046794a5 //lsr z5.s,z5.s,25 3115.if mixin == 1 3116 ror w12,w12,25 3117.endif 3118.inst 0x04679529 //lsr z9.s,z9.s,25 3119.if mixin == 1 3120 ror w13,w13,25 3121.endif 3122.inst 0x046795ad //lsr z13.s,z13.s,25 3123.if mixin == 1 3124 ror w14,w14,25 3125.endif 3126.inst 0x04679421 //lsr z1.s,z1.s,25 3127.if mixin == 1 3128 ror w11,w11,25 3129.endif 3130.inst 0x047130a5 //orr z5.d,z5.d,z17.d 3131.inst 0x04723129 //orr z9.d,z9.d,z18.d 3132.inst 0x047331ad //orr z13.d,z13.d,z19.d 3133.inst 0x04743021 //orr z1.d,z1.d,z20.d 3134 sub x6,x6,1 3135 cbnz x6,10b 3136 lsr x6,x28,#32 3137.inst 0x05a03b91 //dup z17.s,w28 3138.inst 0x05a038d2 //dup 
z18.s,w6 3139 lsr x6,x29,#32 3140.inst 0x05a038d3 //dup z19.s,w6 3141 lsr x6,x30,#32 3142.if mixin == 1 3143 add w7,w7,w23 3144.endif 3145.inst 0x04b90000 //add z0.s,z0.s,z25.s 3146.if mixin == 1 3147 add x8,x8,x23,lsr #32 3148.endif 3149.inst 0x04ba0084 //add z4.s,z4.s,z26.s 3150.if mixin == 1 3151 add x7,x7,x8,lsl #32 // pack 3152.endif 3153.if mixin == 1 3154 add w9,w9,w24 3155.endif 3156.inst 0x04bb0108 //add z8.s,z8.s,z27.s 3157.if mixin == 1 3158 add x10,x10,x24,lsr #32 3159.endif 3160.inst 0x04bc018c //add z12.s,z12.s,z28.s 3161.if mixin == 1 3162 add x9,x9,x10,lsl #32 // pack 3163.endif 3164.if mixin == 1 3165 ldp x8,x10,[x1],#16 3166.endif 3167.if mixin == 1 3168 add w11,w11,w25 3169.endif 3170.inst 0x04bd0021 //add z1.s,z1.s,z29.s 3171.if mixin == 1 3172 add x12,x12,x25,lsr #32 3173.endif 3174.inst 0x04be00a5 //add z5.s,z5.s,z30.s 3175.if mixin == 1 3176 add x11,x11,x12,lsl #32 // pack 3177.endif 3178.if mixin == 1 3179 add w13,w13,w26 3180.endif 3181.inst 0x04b50129 //add z9.s,z9.s,z21.s 3182.if mixin == 1 3183 add x14,x14,x26,lsr #32 3184.endif 3185.inst 0x04b601ad //add z13.s,z13.s,z22.s 3186.if mixin == 1 3187 add x13,x13,x14,lsl #32 // pack 3188.endif 3189.if mixin == 1 3190 ldp x12,x14,[x1],#16 3191.endif 3192.if mixin == 1 3193 add w15,w15,w27 3194.endif 3195.inst 0x04b70042 //add z2.s,z2.s,z23.s 3196.if mixin == 1 3197 add x16,x16,x27,lsr #32 3198.endif 3199.inst 0x04b800c6 //add z6.s,z6.s,z24.s 3200.if mixin == 1 3201 add x15,x15,x16,lsl #32 // pack 3202.endif 3203.if mixin == 1 3204 add w17,w17,w28 3205.endif 3206.inst 0x04b1014a //add z10.s,z10.s,z17.s 3207.if mixin == 1 3208 add x18,x18,x28,lsr #32 3209.endif 3210.inst 0x04b201ce //add z14.s,z14.s,z18.s 3211.if mixin == 1 3212 add x17,x17,x18,lsl #32 // pack 3213.endif 3214.if mixin == 1 3215 ldp x16,x18,[x1],#16 3216.endif 3217.inst 0x05a03bd4 //dup z20.s,w30 3218.inst 0x05a038d9 //dup z25.s,w6 // bak[15] not available for SVE 3219.if mixin == 1 3220 add w19,w19,w29 3221.endif 3222.inst 
0x04b00063 //add z3.s,z3.s,z16.s 3223.if mixin == 1 3224 add x20,x20,x29,lsr #32 3225.endif 3226.inst 0x04b300e7 //add z7.s,z7.s,z19.s 3227.if mixin == 1 3228 add x19,x19,x20,lsl #32 // pack 3229.endif 3230.if mixin == 1 3231 add w21,w21,w30 3232.endif 3233.inst 0x04b4016b //add z11.s,z11.s,z20.s 3234.if mixin == 1 3235 add x22,x22,x30,lsr #32 3236.endif 3237.inst 0x04b901ef //add z15.s,z15.s,z25.s 3238.if mixin == 1 3239 add x21,x21,x22,lsl #32 // pack 3240.endif 3241.if mixin == 1 3242 ldp x20,x22,[x1],#16 3243.endif 3244#ifdef __AARCH64EB__ 3245 rev x7,x7 3246.inst 0x05a48000 //revb z0.s,p0/m,z0.s 3247.inst 0x05a48084 //revb z4.s,p0/m,z4.s 3248 rev x9,x9 3249.inst 0x05a48108 //revb z8.s,p0/m,z8.s 3250.inst 0x05a4818c //revb z12.s,p0/m,z12.s 3251 rev x11,x11 3252.inst 0x05a48021 //revb z1.s,p0/m,z1.s 3253.inst 0x05a480a5 //revb z5.s,p0/m,z5.s 3254 rev x13,x13 3255.inst 0x05a48129 //revb z9.s,p0/m,z9.s 3256.inst 0x05a481ad //revb z13.s,p0/m,z13.s 3257 rev x15,x15 3258.inst 0x05a48042 //revb z2.s,p0/m,z2.s 3259.inst 0x05a480c6 //revb z6.s,p0/m,z6.s 3260 rev x17,x17 3261.inst 0x05a4814a //revb z10.s,p0/m,z10.s 3262.inst 0x05a481ce //revb z14.s,p0/m,z14.s 3263 rev x19,x19 3264.inst 0x05a48063 //revb z3.s,p0/m,z3.s 3265.inst 0x05a480e7 //revb z7.s,p0/m,z7.s 3266 rev x21,x21 3267.inst 0x05a4816b //revb z11.s,p0/m,z11.s 3268.inst 0x05a481ef //revb z15.s,p0/m,z15.s 3269#endif 3270.if mixin == 1 3271 add x29,x29,#1 3272.endif 3273 cmp x5,4 3274 b.ne 200f 3275.if mixin == 1 3276 eor x7,x7,x8 3277.endif 3278.if mixin == 1 3279 eor x9,x9,x10 3280.endif 3281.if mixin == 1 3282 eor x11,x11,x12 3283.endif 3284.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s 3285.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 3286.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 3287.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 3288 3289.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 3290.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 3291.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 3292.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 3293 3294.inst 
0x05f36220 //zip1 z0.d,z17.d,z19.d 3295.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 3296.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 3297.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 3298 3299.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 3300.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 3301.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 3302.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 3303.if mixin == 1 3304 eor x13,x13,x14 3305.endif 3306.if mixin == 1 3307 eor x15,x15,x16 3308.endif 3309.if mixin == 1 3310 eor x17,x17,x18 3311.endif 3312.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 3313.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 3314.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 3315.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 3316 3317.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 3318.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 3319.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 3320.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 3321 3322.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 3323.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 3324.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 3325.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 3326 3327.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 3328.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d 3329.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 3330.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 3331.if mixin == 1 3332 eor x19,x19,x20 3333.endif 3334.if mixin == 1 3335 eor x21,x21,x22 3336.endif 3337 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 3338 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 3339.inst 0x04b13000 //eor z0.d,z0.d,z17.d 3340.inst 0x04b23021 //eor z1.d,z1.d,z18.d 3341.inst 0x04b33042 //eor z2.d,z2.d,z19.d 3342.inst 0x04b43063 //eor z3.d,z3.d,z20.d 3343.inst 0x04b53084 //eor z4.d,z4.d,z21.d 3344.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 3345.inst 0x04b730c6 //eor z6.d,z6.d,z23.d 3346.inst 0x04b830e7 //eor z7.d,z7.d,z24.d 3347 ld1 {v17.4s,v18.4s,v19.4s,v20.4s},[x1],#64 3348 ld1 {v21.4s,v22.4s,v23.4s,v24.4s},[x1],#64 3349.if mixin == 1 3350 stp x7,x9,[x0],#16 3351.endif 3352.inst 0x04b13108 //eor z8.d,z8.d,z17.d 3353.inst 
0x04b23129 //eor z9.d,z9.d,z18.d 3354.if mixin == 1 3355 stp x11,x13,[x0],#16 3356.endif 3357.inst 0x04b3314a //eor z10.d,z10.d,z19.d 3358.inst 0x04b4316b //eor z11.d,z11.d,z20.d 3359.if mixin == 1 3360 stp x15,x17,[x0],#16 3361.endif 3362.inst 0x04b5318c //eor z12.d,z12.d,z21.d 3363.inst 0x04b631ad //eor z13.d,z13.d,z22.d 3364.if mixin == 1 3365 stp x19,x21,[x0],#16 3366.endif 3367.inst 0x04b731ce //eor z14.d,z14.d,z23.d 3368.inst 0x04b831ef //eor z15.d,z15.d,z24.d 3369 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x0],#64 3370 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 3371 st1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 3372 st1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 3373 b 210f 3374200: 3375.inst 0x05a16011 //zip1 z17.s,z0.s,z1.s 3376.inst 0x05a16412 //zip2 z18.s,z0.s,z1.s 3377.inst 0x05a36053 //zip1 z19.s,z2.s,z3.s 3378.inst 0x05a36454 //zip2 z20.s,z2.s,z3.s 3379 3380.inst 0x05a56095 //zip1 z21.s,z4.s,z5.s 3381.inst 0x05a56496 //zip2 z22.s,z4.s,z5.s 3382.inst 0x05a760d7 //zip1 z23.s,z6.s,z7.s 3383.inst 0x05a764d8 //zip2 z24.s,z6.s,z7.s 3384 3385.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 3386.inst 0x05f36621 //zip2 z1.d,z17.d,z19.d 3387.inst 0x05f46242 //zip1 z2.d,z18.d,z20.d 3388.inst 0x05f46643 //zip2 z3.d,z18.d,z20.d 3389 3390.inst 0x05f762a4 //zip1 z4.d,z21.d,z23.d 3391.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 3392.inst 0x05f862c6 //zip1 z6.d,z22.d,z24.d 3393.inst 0x05f866c7 //zip2 z7.d,z22.d,z24.d 3394.if mixin == 1 3395 eor x7,x7,x8 3396.endif 3397.if mixin == 1 3398 eor x9,x9,x10 3399.endif 3400.inst 0x05a96111 //zip1 z17.s,z8.s,z9.s 3401.inst 0x05a96512 //zip2 z18.s,z8.s,z9.s 3402.inst 0x05ab6153 //zip1 z19.s,z10.s,z11.s 3403.inst 0x05ab6554 //zip2 z20.s,z10.s,z11.s 3404 3405.inst 0x05ad6195 //zip1 z21.s,z12.s,z13.s 3406.inst 0x05ad6596 //zip2 z22.s,z12.s,z13.s 3407.inst 0x05af61d7 //zip1 z23.s,z14.s,z15.s 3408.inst 0x05af65d8 //zip2 z24.s,z14.s,z15.s 3409 3410.inst 0x05f36228 //zip1 z8.d,z17.d,z19.d 3411.inst 0x05f36629 //zip2 z9.d,z17.d,z19.d 3412.inst 0x05f4624a //zip1 
z10.d,z18.d,z20.d 3413.inst 0x05f4664b //zip2 z11.d,z18.d,z20.d 3414 3415.inst 0x05f762ac //zip1 z12.d,z21.d,z23.d 3416.inst 0x05f766ad //zip2 z13.d,z21.d,z23.d 3417.inst 0x05f862ce //zip1 z14.d,z22.d,z24.d 3418.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 3419.if mixin == 1 3420 eor x11,x11,x12 3421.endif 3422.if mixin == 1 3423 eor x13,x13,x14 3424.endif 3425.inst 0x05a46011 //zip1 z17.s,z0.s,z4.s 3426.inst 0x05a46412 //zip2 z18.s,z0.s,z4.s 3427.inst 0x05ac6113 //zip1 z19.s,z8.s,z12.s 3428.inst 0x05ac6514 //zip2 z20.s,z8.s,z12.s 3429 3430.inst 0x05a56035 //zip1 z21.s,z1.s,z5.s 3431.inst 0x05a56436 //zip2 z22.s,z1.s,z5.s 3432.inst 0x05ad6137 //zip1 z23.s,z9.s,z13.s 3433.inst 0x05ad6538 //zip2 z24.s,z9.s,z13.s 3434 3435.inst 0x05f36220 //zip1 z0.d,z17.d,z19.d 3436.inst 0x05f36624 //zip2 z4.d,z17.d,z19.d 3437.inst 0x05f46248 //zip1 z8.d,z18.d,z20.d 3438.inst 0x05f4664c //zip2 z12.d,z18.d,z20.d 3439 3440.inst 0x05f762a1 //zip1 z1.d,z21.d,z23.d 3441.inst 0x05f766a5 //zip2 z5.d,z21.d,z23.d 3442.inst 0x05f862c9 //zip1 z9.d,z22.d,z24.d 3443.inst 0x05f866cd //zip2 z13.d,z22.d,z24.d 3444.if mixin == 1 3445 eor x15,x15,x16 3446.endif 3447.if mixin == 1 3448 eor x17,x17,x18 3449.endif 3450.inst 0x05a66051 //zip1 z17.s,z2.s,z6.s 3451.inst 0x05a66452 //zip2 z18.s,z2.s,z6.s 3452.inst 0x05ae6153 //zip1 z19.s,z10.s,z14.s 3453.inst 0x05ae6554 //zip2 z20.s,z10.s,z14.s 3454 3455.inst 0x05a76075 //zip1 z21.s,z3.s,z7.s 3456.inst 0x05a76476 //zip2 z22.s,z3.s,z7.s 3457.inst 0x05af6177 //zip1 z23.s,z11.s,z15.s 3458.inst 0x05af6578 //zip2 z24.s,z11.s,z15.s 3459 3460.inst 0x05f36222 //zip1 z2.d,z17.d,z19.d 3461.inst 0x05f36626 //zip2 z6.d,z17.d,z19.d 3462.inst 0x05f4624a //zip1 z10.d,z18.d,z20.d 3463.inst 0x05f4664e //zip2 z14.d,z18.d,z20.d 3464 3465.inst 0x05f762a3 //zip1 z3.d,z21.d,z23.d 3466.inst 0x05f766a7 //zip2 z7.d,z21.d,z23.d 3467.inst 0x05f862cb //zip1 z11.d,z22.d,z24.d 3468.inst 0x05f866cf //zip2 z15.d,z22.d,z24.d 3469.if mixin == 1 3470 eor x19,x19,x20 3471.endif 3472.if mixin == 1 
3473 eor x21,x21,x22 3474.endif 3475.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 3476.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 3477.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 3478.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 3479.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] 3480.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 3481.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 3482.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 3483.inst 0x04215101 //addvl x1,x1,8 3484.inst 0x04b13000 //eor z0.d,z0.d,z17.d 3485.inst 0x04b23084 //eor z4.d,z4.d,z18.d 3486.inst 0x04b33108 //eor z8.d,z8.d,z19.d 3487.inst 0x04b4318c //eor z12.d,z12.d,z20.d 3488.inst 0x04b53021 //eor z1.d,z1.d,z21.d 3489.inst 0x04b630a5 //eor z5.d,z5.d,z22.d 3490.inst 0x04b73129 //eor z9.d,z9.d,z23.d 3491.inst 0x04b831ad //eor z13.d,z13.d,z24.d 3492.inst 0xa540a031 //ld1w {z17.s},p0/z,[x1,#0,MUL VL] 3493.inst 0xa541a032 //ld1w {z18.s},p0/z,[x1,#1,MUL VL] 3494.inst 0xa542a033 //ld1w {z19.s},p0/z,[x1,#2,MUL VL] 3495.inst 0xa543a034 //ld1w {z20.s},p0/z,[x1,#3,MUL VL] 3496.inst 0xa544a035 //ld1w {z21.s},p0/z,[x1,#4,MUL VL] 3497.inst 0xa545a036 //ld1w {z22.s},p0/z,[x1,#5,MUL VL] 3498.inst 0xa546a037 //ld1w {z23.s},p0/z,[x1,#6,MUL VL] 3499.inst 0xa547a038 //ld1w {z24.s},p0/z,[x1,#7,MUL VL] 3500.inst 0x04215101 //addvl x1,x1,8 3501.if mixin == 1 3502 stp x7,x9,[x0],#16 3503.endif 3504.inst 0x04b13042 //eor z2.d,z2.d,z17.d 3505.inst 0x04b230c6 //eor z6.d,z6.d,z18.d 3506.if mixin == 1 3507 stp x11,x13,[x0],#16 3508.endif 3509.inst 0x04b3314a //eor z10.d,z10.d,z19.d 3510.inst 0x04b431ce //eor z14.d,z14.d,z20.d 3511.if mixin == 1 3512 stp x15,x17,[x0],#16 3513.endif 3514.inst 0x04b53063 //eor z3.d,z3.d,z21.d 3515.inst 0x04b630e7 //eor z7.d,z7.d,z22.d 3516.if mixin == 1 3517 stp x19,x21,[x0],#16 3518.endif 3519.inst 0x04b7316b //eor z11.d,z11.d,z23.d 3520.inst 0x04b831ef //eor z15.d,z15.d,z24.d 3521.inst 0xe540e000 //st1w {z0.s},p0,[x0,#0,MUL VL] 3522.inst 0xe541e004 
//st1w {z4.s},p0,[x0,#1,MUL VL] 3523.inst 0xe542e008 //st1w {z8.s},p0,[x0,#2,MUL VL] 3524.inst 0xe543e00c //st1w {z12.s},p0,[x0,#3,MUL VL] 3525.inst 0xe544e001 //st1w {z1.s},p0,[x0,#4,MUL VL] 3526.inst 0xe545e005 //st1w {z5.s},p0,[x0,#5,MUL VL] 3527.inst 0xe546e009 //st1w {z9.s},p0,[x0,#6,MUL VL] 3528.inst 0xe547e00d //st1w {z13.s},p0,[x0,#7,MUL VL] 3529.inst 0x04205100 //addvl x0,x0,8 3530.inst 0xe540e002 //st1w {z2.s},p0,[x0,#0,MUL VL] 3531.inst 0xe541e006 //st1w {z6.s},p0,[x0,#1,MUL VL] 3532.inst 0xe542e00a //st1w {z10.s},p0,[x0,#2,MUL VL] 3533.inst 0xe543e00e //st1w {z14.s},p0,[x0,#3,MUL VL] 3534.inst 0xe544e003 //st1w {z3.s},p0,[x0,#4,MUL VL] 3535.inst 0xe545e007 //st1w {z7.s},p0,[x0,#5,MUL VL] 3536.inst 0xe546e00b //st1w {z11.s},p0,[x0,#6,MUL VL] 3537.inst 0xe547e00f //st1w {z15.s},p0,[x0,#7,MUL VL] 3538.inst 0x04205100 //addvl x0,x0,8 3539210: 3540.inst 0x04b0e3fd //incw x29, ALL, MUL #1 3541110: 35422: 3543 str w29,[x4] 3544 ldp d10,d11,[sp,16] 3545 ldp d12,d13,[sp,32] 3546 ldp d14,d15,[sp,48] 3547 ldp x16,x17,[sp,64] 3548 ldp x18,x19,[sp,80] 3549 ldp x20,x21,[sp,96] 3550 ldp x22,x23,[sp,112] 3551 ldp x24,x25,[sp,128] 3552 ldp x26,x27,[sp,144] 3553 ldp x28,x29,[sp,160] 3554 ldr x30,[sp,176] 3555 ldp d8,d9,[sp],192 3556 AARCH64_VALIDATE_LINK_REGISTER 3557.Lreturn: 3558 ret 3559.size ChaCha20_ctr32_sve,.-ChaCha20_ctr32_sve 3560