/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
// NOTE(review): the text below was restored to one statement per line after
// an extraction fused line numbers into it. Because the file is generated,
// the zero-length guard added to ChaCha20_ctr32_dflt should also be applied
// to chacha-armv8.pl, otherwise regeneration will drop it.
#include "arm_arch.h"
#ifndef __KERNEL__

.hidden	OPENSSL_armcap_P


#endif

.section	.rodata

.align	5
// First state row, the ASCII bytes "expand 32-byte k" held as two 64-bit words.
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Per-lane block-counter offsets added to the NEON counter vector.
.Lone:
.long	1,2,3,4
// TBL byte indices: within each 32-bit lane, result byte i takes source byte
// (i+3) mod 4. The NEON code uses it where the scalar code has "ror #24".
.Lrot24:
.long	0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
// ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm", NUL terminated.
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.align	2

.text

//-----------------------------------------------------------------------
// ChaCha20_ctr32_dflt
//
// Register use, as established by the loads and stores below:
//   x0 = output pointer            (stp/strb destination)
//   x1 = input pointer             (ldp/ldrb source)
//   x2 = length in bytes
//   x3 = key pointer               (32 bytes are read)
//   x4 = counter block pointer     (16 bytes are read, never written)
//
// Dispatch:
//   len == 0              -> return at once (guard added in review)
//   len <  192            -> scalar code at .Lshort
//   len >= 192, user mode -> .LChaCha20_neon when OPENSSL_armcap_P has
//                            ARMV7_NEON set, otherwise scalar code
//   __KERNEL__ builds     -> always scalar code from this entry point
//
// Scalar register map:
//   x22,x23         sigma row, two 32-bit words per register
//   x24..x27        key rows
//   x28,x30         counter row (x30 is reloaded from the frame on exit)
//   w5..w17,w19..w21  sixteen working state words (x18 is not touched)
//   x4              reused as the round-loop counter once the counter
//                   block has been loaded
//
// Stack: 96-byte frame holding x29,x30,x19..x28, plus 64 bytes of scratch
// that receives the last keystream block for partial-block handling and is
// zeroed before returning.
//-----------------------------------------------------------------------
.globl	ChaCha20_ctr32_dflt
.type	ChaCha20_ctr32_dflt,%function
.align	5
ChaCha20_ctr32_dflt:
	AARCH64_SIGN_LINK_REGISTER
	// Without this guard a zero length reaches .Loop_tail with index 0,
	// which is incremented to 1 before the cbnz test and so never
	// terminates. ChaCha20_ctr32 uses the same guard.
	cbz	x2,.Labort
	cmp	x2,#192
	b.lo	.Lshort
#ifndef __KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
.Lcheck_neon:					// also entered from ChaCha20_ctr32 with w17 loaded
	tst	w17,#ARMV7_NEON
	b.ne	.LChaCha20_neon
#endif

.Lshort:
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,#:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// keystream scratch for the tail

	ldp	x22,x23,[x5]			// load sigma
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]			// load counter
#ifdef	__AARCH64EB__
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer:
	mov	w5,w22				// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations, column + diagonal pass each
	subs	x2,x2,#64			// flags are consumed by b.lo/b.hi after the loop
.Loop:
	sub	x4,x4,#1
	// column pass: (5,9,13,17) (6,10,14,19) (7,11,15,20) (8,12,16,21)
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// diagonal pass: (5,10,15,21) (6,11,16,17) (7,12,13,19) (8,9,14,20)
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	add	w5,w5,w22			// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail				// fewer than 64 bytes were left (flags from subs)

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer			// more input remains (flags still from subs)

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.Labort:					// zero-length exit: no frame was built
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes left, 1..63
.Less_than_64:					// also entered from .Ltail_neon
	// Bias the pointers so one negative index, counting up to zero,
	// addresses input, keystream scratch and output alike. x0 carries an
	// extra -1 because the index is incremented before the store.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	stp	xzr,xzr,[sp,#0]			// zero the keystream scratch
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_dflt,.-ChaCha20_ctr32_dflt

//-----------------------------------------------------------------------
// ChaCha20_ctr32
//
// Same register arguments as ChaCha20_ctr32_dflt (x0 out, x1 in, x2 len,
// x3 key, x4 counter).
//
//   len == 0  -> return at once
//   len < 192 -> scalar code at .Lshort
//   otherwise, user-mode builds only:
//     ARMV8_SVE clear -> .Lcheck_neon (NEON or scalar)
//     ARMV8_SVE set   -> copy the 16-byte counter block to the stack, call
//                        ChaCha20_ctr32_sve with x4 pointing at the copy,
//                        then call ChaCha20_ctr32_dflt if x2 is non-zero.
//                        NOTE(review): this relies on ChaCha20_ctr32_sve
//                        leaving x0..x4 describing the unprocessed
//                        remainder; that routine is not visible here,
//                        confirm against its definition.
//   __KERNEL__ builds -> always scalar code
//-----------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_SIGN_LINK_REGISTER
	cbz	x2,.Labort
	cmp	x2,#192
	b.lo	.Lshort
#ifndef __KERNEL__
	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV8_SVE
	b.eq	.Lcheck_neon
	stp	x29,x30,[sp,#-16]!
	sub	sp,sp,#16
	// SVE handling will inevitably increment the counter.
	// Neon/Scalar code that follows to process tail data needs to
	// use the new counter. Unfortunately the input counter buffer
	// pointed to by ctr is meant to be read-only per API contract,
	// so we have to copy the buffer to the stack to be writable by SVE.
	ldp	x5,x6,[x4]
	stp	x5,x6,[sp]
	mov	x4,sp
	bl	ChaCha20_ctr32_sve
	cbz	x2,1f
	bl	ChaCha20_ctr32_dflt
1:
	add	sp,sp,#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
#endif
	b	.Lshort
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

// ChaCha20_neon is exported only in __KERNEL__ builds. Elsewhere it is
// reached through .LChaCha20_neon from ChaCha20_ctr32_dflt, which has
// already executed AARCH64_SIGN_LINK_REGISTER and therefore enters after it.
#ifdef __KERNEL__
.globl	ChaCha20_neon
#endif
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
.LChaCha20_neon:
	stp	x29,x30,[sp,#-96]!
//-----------------------------------------------------------------------
// ChaCha20_neon, continued (the entry label, link-register signing and the
// first frame store precede this point).
//
// Register arguments: x0 out, x1 in, x2 len, x3 key, x4 counter block.
// len >= 512 is handed to .L512_or_more_neon. Otherwise each outer
// iteration produces five 64-byte blocks (320 bytes): one in general
// registers with the counter as loaded, and four in NEON lanes whose
// counters are offset by the .Lone values 1,2,3,4.
//
// NEON register map:
//   v0      sigma row             v1,v2   key rows
//   v3      counter row           v8      lane counter offsets (.Lone)
//   v9      TBL indices (.Lrot24) v4..v7  temporaries and input data
//   v16..v31  sixteen state words, one word per register, four blocks
//             across the lanes
//
// d8,d9 are saved at [sp] because v8 and v9 are used. That slot shares
// the 64-byte scratch area, so they are reloaded before any tail code
// writes keystream there.
//
// NOTE(review): nothing on this path checks for len == 0. A zero length
// would fall through .Ltail_neon into .Less_than_64, whose byte loop does
// not terminate for a zero count. ChaCha20_ctr32_dflt only branches here
// with len >= 192; a direct caller in a __KERNEL__ build must guarantee a
// non-zero length.
//-----------------------------------------------------------------------
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,#:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]			// load sigma
	ld1	{v0.4s},[x5],#16
	ldp	x24,x25,[x3]			// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v1.4s,v2.4s},[x3]
	ldp	x28,x30,[x4]			// load counter
	ld1	{v3.4s},[x4]
	stp	d8,d9,[sp]			// meet ABI requirements
	ld1	{v8.4s,v9.4s},[x5]		// x5 now points at .Lone: v8 = .Lone, v9 = .Lrot24
#ifdef	__AARCH64EB__
	rev64	v0.4s,v0.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

.Loop_outer_neon:
	dup	v16.4s,v0.s[0]			// unpack key block
	mov	w5,w22
	dup	v20.4s,v0.s[1]
	lsr	x6,x22,#32
	dup	v24.4s,v0.s[2]
	mov	w7,w23
	dup	v28.4s,v0.s[3]
	lsr	x8,x23,#32
	dup	v17.4s,v1.s[0]
	mov	w9,w24
	dup	v21.4s,v1.s[1]
	lsr	x10,x24,#32
	dup	v25.4s,v1.s[2]
	mov	w11,w25
	dup	v29.4s,v1.s[3]
	lsr	x12,x25,#32
	dup	v19.4s,v3.s[0]
	mov	w13,w26
	dup	v23.4s,v3.s[1]
	lsr	x14,x26,#32
	dup	v27.4s,v3.s[2]
	mov	w15,w27
	dup	v31.4s,v3.s[3]
	lsr	x16,x27,#32
	add	v19.4s,v19.4s,v8.4s		// per-lane block counters
	mov	w17,w28
	dup	v18.4s,v2.s[0]
	lsr	x19,x28,#32
	dup	v22.4s,v2.s[1]
	mov	w20,w30
	dup	v26.4s,v2.s[2]
	lsr	x21,x30,#32
	dup	v30.4s,v2.s[3]

	mov	x4,#10
	subs	x2,x2,#320			// flags are consumed by b.lo/b.hi after the loop
.Loop_neon:
	sub	x4,x4,#1
	add	v16.4s,v16.4s,v17.4s
	add	w5,w5,w9
	add	v20.4s,v20.4s,v21.4s
	add	w6,w6,w10
	add	v24.4s,v24.4s,v25.4s
	add	w7,w7,w11
	add	v28.4s,v28.4s,v29.4s
	add	w8,w8,w12
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w5
	eor	v23.16b,v23.16b,v20.16b
	eor	w19,w19,w6
	eor	v27.16b,v27.16b,v24.16b
	eor	w20,w20,w7
	eor	v31.16b,v31.16b,v28.16b
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	add	v18.4s,v18.4s,v19.4s
	add	w13,w13,w17
	add	v22.4s,v22.4s,v23.4s
	add	w14,w14,w19
	add	v26.4s,v26.4s,v27.4s
	add	w15,w15,w20
	add	v30.4s,v30.4s,v31.4s
	add	w16,w16,w21
	eor	v4.16b,v17.16b,v18.16b
	eor	w9,w9,w13
	eor	v5.16b,v21.16b,v22.16b
	eor	w10,w10,w14
	eor	v6.16b,v25.16b,v26.16b
	eor	w11,w11,w15
	eor	v7.16b,v29.16b,v30.16b
	eor	w12,w12,w16
	ushr	v17.4s,v4.4s,#20
	ror	w9,w9,#20
	ushr	v21.4s,v5.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v6.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v7.4s,#20
	ror	w12,w12,#20
	sli	v17.4s,v4.4s,#12
	add	w5,w5,w9
	sli	v21.4s,v5.4s,#12
	add	w6,w6,w10
	sli	v25.4s,v6.4s,#12
	add	w7,w7,w11
	sli	v29.4s,v7.4s,#12
	add	w8,w8,w12
	add	v16.4s,v16.4s,v17.4s
	eor	w17,w17,w5
	add	v20.4s,v20.4s,v21.4s
	eor	w19,w19,w6
	add	v24.4s,v24.4s,v25.4s
	eor	w20,w20,w7
	add	v28.4s,v28.4s,v29.4s
	eor	w21,w21,w8
	eor	v4.16b,v19.16b,v16.16b
	ror	w17,w17,#24
	eor	v5.16b,v23.16b,v20.16b
	ror	w19,w19,#24
	eor	v6.16b,v27.16b,v24.16b
	ror	w20,w20,#24
	eor	v7.16b,v31.16b,v28.16b
	ror	w21,w21,#24
	tbl	v19.16b,{v4.16b},v9.16b
	add	w13,w13,w17
	tbl	v23.16b,{v5.16b},v9.16b
	add	w14,w14,w19
	tbl	v27.16b,{v6.16b},v9.16b
	add	w15,w15,w20
	tbl	v31.16b,{v7.16b},v9.16b
	add	w16,w16,w21
	add	v18.4s,v18.4s,v19.4s
	eor	w9,w9,w13
	add	v22.4s,v22.4s,v23.4s
	eor	w10,w10,w14
	add	v26.4s,v26.4s,v27.4s
	eor	w11,w11,w15
	add	v30.4s,v30.4s,v31.4s
	eor	w12,w12,w16
	eor	v4.16b,v17.16b,v18.16b
	ror	w9,w9,#25
	eor	v5.16b,v21.16b,v22.16b
	ror	w10,w10,#25
	eor	v6.16b,v25.16b,v26.16b
	ror	w11,w11,#25
	eor	v7.16b,v29.16b,v30.16b
	ror	w12,w12,#25
	ushr	v17.4s,v4.4s,#25
	ushr	v21.4s,v5.4s,#25
	ushr	v25.4s,v6.4s,#25
	ushr	v29.4s,v7.4s,#25
	sli	v17.4s,v4.4s,#7
	sli	v21.4s,v5.4s,#7
	sli	v25.4s,v6.4s,#7
	sli	v29.4s,v7.4s,#7
	add	v16.4s,v16.4s,v21.4s
	add	w5,w5,w10
	add	v20.4s,v20.4s,v25.4s
	add	w6,w6,w11
	add	v24.4s,v24.4s,v29.4s
	add	w7,w7,w12
	add	v28.4s,v28.4s,v17.4s
	add	w8,w8,w9
	eor	v31.16b,v31.16b,v16.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v20.16b
	eor	w17,w17,w6
	eor	v23.16b,v23.16b,v24.16b
	eor	w19,w19,w7
	eor	v27.16b,v27.16b,v28.16b
	eor	w20,w20,w8
	rev32	v31.8h,v31.8h
	ror	w21,w21,#16
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	rev32	v23.8h,v23.8h
	ror	w19,w19,#16
	rev32	v27.8h,v27.8h
	ror	w20,w20,#16
	add	v26.4s,v26.4s,v31.4s
	add	w15,w15,w21
	add	v30.4s,v30.4s,v19.4s
	add	w16,w16,w17
	add	v18.4s,v18.4s,v23.4s
	add	w13,w13,w19
	add	v22.4s,v22.4s,v27.4s
	add	w14,w14,w20
	eor	v4.16b,v21.16b,v26.16b
	eor	w10,w10,w15
	eor	v5.16b,v25.16b,v30.16b
	eor	w11,w11,w16
	eor	v6.16b,v29.16b,v18.16b
	eor	w12,w12,w13
	eor	v7.16b,v17.16b,v22.16b
	eor	w9,w9,w14
	ushr	v21.4s,v4.4s,#20
	ror	w10,w10,#20
	ushr	v25.4s,v5.4s,#20
	ror	w11,w11,#20
	ushr	v29.4s,v6.4s,#20
	ror	w12,w12,#20
	ushr	v17.4s,v7.4s,#20
	ror	w9,w9,#20
	sli	v21.4s,v4.4s,#12
	add	w5,w5,w10
	sli	v25.4s,v5.4s,#12
	add	w6,w6,w11
	sli	v29.4s,v6.4s,#12
	add	w7,w7,w12
	sli	v17.4s,v7.4s,#12
	add	w8,w8,w9
	add	v16.4s,v16.4s,v21.4s
	eor	w21,w21,w5
	add	v20.4s,v20.4s,v25.4s
	eor	w17,w17,w6
	add	v24.4s,v24.4s,v29.4s
	eor	w19,w19,w7
	add	v28.4s,v28.4s,v17.4s
	eor	w20,w20,w8
	eor	v4.16b,v31.16b,v16.16b
	ror	w21,w21,#24
	eor	v5.16b,v19.16b,v20.16b
	ror	w17,w17,#24
	eor	v6.16b,v23.16b,v24.16b
	ror	w19,w19,#24
	eor	v7.16b,v27.16b,v28.16b
	ror	w20,w20,#24
	tbl	v31.16b,{v4.16b},v9.16b
	add	w15,w15,w21
	tbl	v19.16b,{v5.16b},v9.16b
	add	w16,w16,w17
	tbl	v23.16b,{v6.16b},v9.16b
	add	w13,w13,w19
	tbl	v27.16b,{v7.16b},v9.16b
	add	w14,w14,w20
	add	v26.4s,v26.4s,v31.4s
	eor	w10,w10,w15
	add	v30.4s,v30.4s,v19.4s
	eor	w11,w11,w16
	add	v18.4s,v18.4s,v23.4s
	eor	w12,w12,w13
	add	v22.4s,v22.4s,v27.4s
	eor	w9,w9,w14
	eor	v4.16b,v21.16b,v26.16b
	ror	w10,w10,#25
	eor	v5.16b,v25.16b,v30.16b
	ror	w11,w11,#25
	eor	v6.16b,v29.16b,v18.16b
	ror	w12,w12,#25
	eor	v7.16b,v17.16b,v22.16b
	ror	w9,w9,#25
	ushr	v21.4s,v4.4s,#25
	ushr	v25.4s,v5.4s,#25
	ushr	v29.4s,v6.4s,#25
	ushr	v17.4s,v7.4s,#25
	sli	v21.4s,v4.4s,#7
	sli	v25.4s,v5.4s,#7
	sli	v29.4s,v6.4s,#7
	sli	v17.4s,v7.4s,#7
	cbnz	x4,.Loop_neon

	add	v19.4s,v19.4s,v8.4s		// add the lane offsets back; v3 is added after the transpose

	zip1	v4.4s,v16.4s,v20.4s		// transpose data
	zip1	v5.4s,v24.4s,v28.4s
	zip2	v6.4s,v16.4s,v20.4s
	zip2	v7.4s,v24.4s,v28.4s
	zip1	v16.2d,v4.2d,v5.2d
	zip2	v20.2d,v4.2d,v5.2d
	zip1	v24.2d,v6.2d,v7.2d
	zip2	v28.2d,v6.2d,v7.2d

	zip1	v4.4s,v17.4s,v21.4s
	zip1	v5.4s,v25.4s,v29.4s
	zip2	v6.4s,v17.4s,v21.4s
	zip2	v7.4s,v25.4s,v29.4s
	zip1	v17.2d,v4.2d,v5.2d
	zip2	v21.2d,v4.2d,v5.2d
	zip1	v25.2d,v6.2d,v7.2d
	zip2	v29.2d,v6.2d,v7.2d

	zip1	v4.4s,v18.4s,v22.4s
	add	w5,w5,w22			// accumulate key block
	zip1	v5.4s,v26.4s,v30.4s
	add	x6,x6,x22,lsr#32
	zip2	v6.4s,v18.4s,v22.4s
	add	w7,w7,w23
	zip2	v7.4s,v26.4s,v30.4s
	add	x8,x8,x23,lsr#32
	zip1	v18.2d,v4.2d,v5.2d
	add	w9,w9,w24
	zip2	v22.2d,v4.2d,v5.2d
	add	x10,x10,x24,lsr#32
	zip1	v26.2d,v6.2d,v7.2d
	add	w11,w11,w25
	zip2	v30.2d,v6.2d,v7.2d
	add	x12,x12,x25,lsr#32

	zip1	v4.4s,v19.4s,v23.4s
	add	w13,w13,w26
	zip1	v5.4s,v27.4s,v31.4s
	add	x14,x14,x26,lsr#32
	zip2	v6.4s,v19.4s,v23.4s
	add	w15,w15,w27
	zip2	v7.4s,v27.4s,v31.4s
	add	x16,x16,x27,lsr#32
	zip1	v19.2d,v4.2d,v5.2d
	add	w17,w17,w28
	zip2	v23.2d,v4.2d,v5.2d
	add	x19,x19,x28,lsr#32
	zip1	v27.2d,v6.2d,v7.2d
	add	w20,w20,w30
	zip2	v31.2d,v6.2d,v7.2d
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail_neon			// fewer than 320 bytes were left (flags from subs)

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	v16.4s,v16.4s,v0.4s		// accumulate key block
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	v17.4s,v17.4s,v1.4s
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	v18.4s,v18.4s,v2.4s
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	x5,x5,x6
	add	v20.4s,v20.4s,v0.4s
	eor	x7,x7,x8
	add	v21.4s,v21.4s,v1.4s
	eor	x9,x9,x10
	add	v22.4s,v22.4s,v2.4s
	eor	x11,x11,x12
	add	v23.4s,v23.4s,v3.4s
	eor	x13,x13,x14
	eor	v16.16b,v16.16b,v4.16b
	movi	v4.4s,#5
	eor	x15,x15,x16
	eor	v17.16b,v17.16b,v5.16b
	eor	x17,x17,x19
	eor	v18.16b,v18.16b,v6.16b
	eor	x20,x20,x21
	eor	v19.16b,v19.16b,v7.16b
	add	v8.4s,v8.4s,v4.4s		// += 5
	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64

	stp	x5,x7,[x0,#0]			// store output
	add	x28,x28,#5			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	add	v24.4s,v24.4s,v0.4s
	add	v25.4s,v25.4s,v1.4s
	add	v26.4s,v26.4s,v2.4s
	add	v27.4s,v27.4s,v3.4s
	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64

	eor	v20.16b,v20.16b,v4.16b
	eor	v21.16b,v21.16b,v5.16b
	eor	v22.16b,v22.16b,v6.16b
	eor	v23.16b,v23.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	add	v28.4s,v28.4s,v0.4s
	add	v29.4s,v29.4s,v1.4s
	add	v30.4s,v30.4s,v2.4s
	add	v31.4s,v31.4s,v3.4s
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	eor	v24.16b,v24.16b,v16.16b
	eor	v25.16b,v25.16b,v17.16b
	eor	v26.16b,v26.16b,v18.16b
	eor	v27.16b,v27.16b,v19.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64

	eor	v28.16b,v28.16b,v20.16b
	eor	v29.16b,v29.16b,v21.16b
	eor	v30.16b,v30.16b,v22.16b
	eor	v31.16b,v31.16b,v23.16b
	st1	{v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64

	b.hi	.Loop_outer_neon		// more input remains (flags still from subs)

	ldp	d8,d9,[sp]			// meet ABI requirements

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
.Ltail_neon:
	// 1..319 bytes remain. Up to five keystream blocks are already
	// computed: the scalar one in x5..x21, then v16..v19, v20..v23,
	// v24..v27 and v28..v31, the vector ones still lacking the key
	// block addition.
	add	x2,x2,#320
	ldp	d8,d9,[sp]			// meet ABI requirements
	cmp	x2,#64
	b.lo	.Less_than_64			// shared scalar tail in ChaCha20_ctr32_dflt

	add	x5,x5,x6,lsl#32			// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]			// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]			// store output
	add	v16.4s,v16.4s,v0.4s		// accumulate key block
	stp	x9,x11,[x0,#16]
	add	v17.4s,v17.4s,v1.4s
	stp	x13,x15,[x0,#32]
	add	v18.4s,v18.4s,v2.4s
	stp	x17,x20,[x0,#48]
	add	v19.4s,v19.4s,v3.4s
	add	x0,x0,#64
	b.eq	.Ldone_neon			// exactly 64 bytes (flags from the cmp above)
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v16.16b,v16.16b,v4.16b
	eor	v17.16b,v17.16b,v5.16b
	eor	v18.16b,v18.16b,v6.16b
	eor	v19.16b,v19.16b,v7.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v20.4s,v0.4s
	add	v17.4s,v21.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v22.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v23.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v20.16b,v16.16b,v4.16b
	eor	v21.16b,v17.16b,v5.16b
	eor	v22.16b,v18.16b,v6.16b
	eor	v23.16b,v19.16b,v7.16b
	st1	{v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v24.4s,v0.4s
	add	v17.4s,v25.4s,v1.4s
	sub	x2,x2,#64
	add	v18.4s,v26.4s,v2.4s
	cmp	x2,#64
	add	v19.4s,v27.4s,v3.4s
	b.lo	.Last_neon

	ld1	{v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	eor	v24.16b,v16.16b,v4.16b
	eor	v25.16b,v17.16b,v5.16b
	eor	v26.16b,v18.16b,v6.16b
	eor	v27.16b,v19.16b,v7.16b
	st1	{v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64
	b.eq	.Ldone_neon

	add	v16.4s,v28.4s,v0.4s
	add	v17.4s,v29.4s,v1.4s
	add	v18.4s,v30.4s,v2.4s
	add	v19.4s,v31.4s,v3.4s
	sub	x2,x2,#64

.Last_neon:
	// v16..v19 hold the keystream block for the final 1..63 bytes.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]

	// Same pointer biasing as the scalar tail: one negative index that
	// counts up to zero, with x0 carrying an extra -1.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	stp	xzr,xzr,[sp,#0]			// zero the keystream scratch
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// ChaCha20_512_neon has no .globl directive in the visible text. Its body
// continues past this point.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
931 add x29,sp,#0 932 933 adrp x5,.Lsigma 934 add x5,x5,#:lo12:.Lsigma 935 stp x19,x20,[sp,#16] 936 stp x21,x22,[sp,#32] 937 stp x23,x24,[sp,#48] 938 stp x25,x26,[sp,#64] 939 stp x27,x28,[sp,#80] 940 941.L512_or_more_neon: 942 sub sp,sp,#128+64 943 944 eor v7.16b,v7.16b,v7.16b 945 ldp x22,x23,[x5] // load sigma 946 ld1 {v0.4s},[x5],#16 947 ldp x24,x25,[x3] // load key 948 ldp x26,x27,[x3,#16] 949 ld1 {v1.4s,v2.4s},[x3] 950 ldp x28,x30,[x4] // load counter 951 ld1 {v3.4s},[x4] 952 ld1 {v7.s}[0],[x5] 953 add x3,x5,#16 // .Lrot24 954#ifdef __AARCH64EB__ 955 rev64 v0.4s,v0.4s 956 ror x24,x24,#32 957 ror x25,x25,#32 958 ror x26,x26,#32 959 ror x27,x27,#32 960 ror x28,x28,#32 961 ror x30,x30,#32 962#endif 963 add v3.4s,v3.4s,v7.4s // += 1 964 stp q0,q1,[sp,#0] // off-load key block, invariant part 965 add v3.4s,v3.4s,v7.4s // not typo 966 str q2,[sp,#32] 967 add v4.4s,v3.4s,v7.4s 968 add v5.4s,v4.4s,v7.4s 969 add v6.4s,v5.4s,v7.4s 970 shl v7.4s,v7.4s,#2 // 1 -> 4 971 972 stp d8,d9,[sp,#128+0] // meet ABI requirements 973 stp d10,d11,[sp,#128+16] 974 stp d12,d13,[sp,#128+32] 975 stp d14,d15,[sp,#128+48] 976 977 sub x2,x2,#512 // not typo 978 979.Loop_outer_512_neon: 980 mov v8.16b,v0.16b 981 mov v12.16b,v0.16b 982 mov v16.16b,v0.16b 983 mov v20.16b,v0.16b 984 mov v24.16b,v0.16b 985 mov v28.16b,v0.16b 986 mov v9.16b,v1.16b 987 mov w5,w22 // unpack key block 988 mov v13.16b,v1.16b 989 lsr x6,x22,#32 990 mov v17.16b,v1.16b 991 mov w7,w23 992 mov v21.16b,v1.16b 993 lsr x8,x23,#32 994 mov v25.16b,v1.16b 995 mov w9,w24 996 mov v29.16b,v1.16b 997 lsr x10,x24,#32 998 mov v11.16b,v3.16b 999 mov w11,w25 1000 mov v15.16b,v4.16b 1001 lsr x12,x25,#32 1002 mov v19.16b,v5.16b 1003 mov w13,w26 1004 mov v23.16b,v6.16b 1005 lsr x14,x26,#32 1006 mov v10.16b,v2.16b 1007 mov w15,w27 1008 mov v14.16b,v2.16b 1009 lsr x16,x27,#32 1010 add v27.4s,v11.4s,v7.4s // +4 1011 mov w17,w28 1012 add v31.4s,v15.4s,v7.4s // +4 1013 lsr x19,x28,#32 1014 mov v18.16b,v2.16b 1015 mov w20,w30 1016 mov 
v22.16b,v2.16b 1017 lsr x21,x30,#32 1018 mov v26.16b,v2.16b 1019 stp q3,q4,[sp,#48] // off-load key block, variable part 1020 mov v30.16b,v2.16b 1021 stp q5,q6,[sp,#80] 1022 1023 mov x4,#5 1024 ld1 {v6.4s},[x3] 1025 subs x2,x2,#512 1026.Loop_upper_neon: 1027 sub x4,x4,#1 1028 add v8.4s,v8.4s,v9.4s 1029 add w5,w5,w9 1030 add v12.4s,v12.4s,v13.4s 1031 add w6,w6,w10 1032 add v16.4s,v16.4s,v17.4s 1033 add w7,w7,w11 1034 add v20.4s,v20.4s,v21.4s 1035 add w8,w8,w12 1036 add v24.4s,v24.4s,v25.4s 1037 eor w17,w17,w5 1038 add v28.4s,v28.4s,v29.4s 1039 eor w19,w19,w6 1040 eor v11.16b,v11.16b,v8.16b 1041 eor w20,w20,w7 1042 eor v15.16b,v15.16b,v12.16b 1043 eor w21,w21,w8 1044 eor v19.16b,v19.16b,v16.16b 1045 ror w17,w17,#16 1046 eor v23.16b,v23.16b,v20.16b 1047 ror w19,w19,#16 1048 eor v27.16b,v27.16b,v24.16b 1049 ror w20,w20,#16 1050 eor v31.16b,v31.16b,v28.16b 1051 ror w21,w21,#16 1052 rev32 v11.8h,v11.8h 1053 add w13,w13,w17 1054 rev32 v15.8h,v15.8h 1055 add w14,w14,w19 1056 rev32 v19.8h,v19.8h 1057 add w15,w15,w20 1058 rev32 v23.8h,v23.8h 1059 add w16,w16,w21 1060 rev32 v27.8h,v27.8h 1061 eor w9,w9,w13 1062 rev32 v31.8h,v31.8h 1063 eor w10,w10,w14 1064 add v10.4s,v10.4s,v11.4s 1065 eor w11,w11,w15 1066 add v14.4s,v14.4s,v15.4s 1067 eor w12,w12,w16 1068 add v18.4s,v18.4s,v19.4s 1069 ror w9,w9,#20 1070 add v22.4s,v22.4s,v23.4s 1071 ror w10,w10,#20 1072 add v26.4s,v26.4s,v27.4s 1073 ror w11,w11,#20 1074 add v30.4s,v30.4s,v31.4s 1075 ror w12,w12,#20 1076 eor v0.16b,v9.16b,v10.16b 1077 add w5,w5,w9 1078 eor v1.16b,v13.16b,v14.16b 1079 add w6,w6,w10 1080 eor v2.16b,v17.16b,v18.16b 1081 add w7,w7,w11 1082 eor v3.16b,v21.16b,v22.16b 1083 add w8,w8,w12 1084 eor v4.16b,v25.16b,v26.16b 1085 eor w17,w17,w5 1086 eor v5.16b,v29.16b,v30.16b 1087 eor w19,w19,w6 1088 ushr v9.4s,v0.4s,#20 1089 eor w20,w20,w7 1090 ushr v13.4s,v1.4s,#20 1091 eor w21,w21,w8 1092 ushr v17.4s,v2.4s,#20 1093 ror w17,w17,#24 1094 ushr v21.4s,v3.4s,#20 1095 ror w19,w19,#24 1096 ushr v25.4s,v4.4s,#20 1097 ror 
w20,w20,#24 1098 ushr v29.4s,v5.4s,#20 1099 ror w21,w21,#24 1100 sli v9.4s,v0.4s,#12 1101 add w13,w13,w17 1102 sli v13.4s,v1.4s,#12 1103 add w14,w14,w19 1104 sli v17.4s,v2.4s,#12 1105 add w15,w15,w20 1106 sli v21.4s,v3.4s,#12 1107 add w16,w16,w21 1108 sli v25.4s,v4.4s,#12 1109 eor w9,w9,w13 1110 sli v29.4s,v5.4s,#12 1111 eor w10,w10,w14 1112 add v8.4s,v8.4s,v9.4s 1113 eor w11,w11,w15 1114 add v12.4s,v12.4s,v13.4s 1115 eor w12,w12,w16 1116 add v16.4s,v16.4s,v17.4s 1117 ror w9,w9,#25 1118 add v20.4s,v20.4s,v21.4s 1119 ror w10,w10,#25 1120 add v24.4s,v24.4s,v25.4s 1121 ror w11,w11,#25 1122 add v28.4s,v28.4s,v29.4s 1123 ror w12,w12,#25 1124 eor v11.16b,v11.16b,v8.16b 1125 add w5,w5,w10 1126 eor v15.16b,v15.16b,v12.16b 1127 add w6,w6,w11 1128 eor v19.16b,v19.16b,v16.16b 1129 add w7,w7,w12 1130 eor v23.16b,v23.16b,v20.16b 1131 add w8,w8,w9 1132 eor v27.16b,v27.16b,v24.16b 1133 eor w21,w21,w5 1134 eor v31.16b,v31.16b,v28.16b 1135 eor w17,w17,w6 1136 tbl v11.16b,{v11.16b},v6.16b 1137 eor w19,w19,w7 1138 tbl v15.16b,{v15.16b},v6.16b 1139 eor w20,w20,w8 1140 tbl v19.16b,{v19.16b},v6.16b 1141 ror w21,w21,#16 1142 tbl v23.16b,{v23.16b},v6.16b 1143 ror w17,w17,#16 1144 tbl v27.16b,{v27.16b},v6.16b 1145 ror w19,w19,#16 1146 tbl v31.16b,{v31.16b},v6.16b 1147 ror w20,w20,#16 1148 add v10.4s,v10.4s,v11.4s 1149 add w15,w15,w21 1150 add v14.4s,v14.4s,v15.4s 1151 add w16,w16,w17 1152 add v18.4s,v18.4s,v19.4s 1153 add w13,w13,w19 1154 add v22.4s,v22.4s,v23.4s 1155 add w14,w14,w20 1156 add v26.4s,v26.4s,v27.4s 1157 eor w10,w10,w15 1158 add v30.4s,v30.4s,v31.4s 1159 eor w11,w11,w16 1160 eor v0.16b,v9.16b,v10.16b 1161 eor w12,w12,w13 1162 eor v1.16b,v13.16b,v14.16b 1163 eor w9,w9,w14 1164 eor v2.16b,v17.16b,v18.16b 1165 ror w10,w10,#20 1166 eor v3.16b,v21.16b,v22.16b 1167 ror w11,w11,#20 1168 eor v4.16b,v25.16b,v26.16b 1169 ror w12,w12,#20 1170 eor v5.16b,v29.16b,v30.16b 1171 ror w9,w9,#20 1172 ushr v9.4s,v0.4s,#25 1173 add w5,w5,w10 1174 ushr v13.4s,v1.4s,#25 1175 add w6,w6,w11 1176 ushr 
v17.4s,v2.4s,#25 1177 add w7,w7,w12 1178 ushr v21.4s,v3.4s,#25 1179 add w8,w8,w9 1180 ushr v25.4s,v4.4s,#25 1181 eor w21,w21,w5 1182 ushr v29.4s,v5.4s,#25 1183 eor w17,w17,w6 1184 sli v9.4s,v0.4s,#7 1185 eor w19,w19,w7 1186 sli v13.4s,v1.4s,#7 1187 eor w20,w20,w8 1188 sli v17.4s,v2.4s,#7 1189 ror w21,w21,#24 1190 sli v21.4s,v3.4s,#7 1191 ror w17,w17,#24 1192 sli v25.4s,v4.4s,#7 1193 ror w19,w19,#24 1194 sli v29.4s,v5.4s,#7 1195 ror w20,w20,#24 1196 ext v10.16b,v10.16b,v10.16b,#8 1197 add w15,w15,w21 1198 ext v14.16b,v14.16b,v14.16b,#8 1199 add w16,w16,w17 1200 ext v18.16b,v18.16b,v18.16b,#8 1201 add w13,w13,w19 1202 ext v22.16b,v22.16b,v22.16b,#8 1203 add w14,w14,w20 1204 ext v26.16b,v26.16b,v26.16b,#8 1205 eor w10,w10,w15 1206 ext v30.16b,v30.16b,v30.16b,#8 1207 eor w11,w11,w16 1208 ext v11.16b,v11.16b,v11.16b,#12 1209 eor w12,w12,w13 1210 ext v15.16b,v15.16b,v15.16b,#12 1211 eor w9,w9,w14 1212 ext v19.16b,v19.16b,v19.16b,#12 1213 ror w10,w10,#25 1214 ext v23.16b,v23.16b,v23.16b,#12 1215 ror w11,w11,#25 1216 ext v27.16b,v27.16b,v27.16b,#12 1217 ror w12,w12,#25 1218 ext v31.16b,v31.16b,v31.16b,#12 1219 ror w9,w9,#25 1220 ext v9.16b,v9.16b,v9.16b,#4 1221 ext v13.16b,v13.16b,v13.16b,#4 1222 ext v17.16b,v17.16b,v17.16b,#4 1223 ext v21.16b,v21.16b,v21.16b,#4 1224 ext v25.16b,v25.16b,v25.16b,#4 1225 ext v29.16b,v29.16b,v29.16b,#4 1226 add v8.4s,v8.4s,v9.4s 1227 add w5,w5,w9 1228 add v12.4s,v12.4s,v13.4s 1229 add w6,w6,w10 1230 add v16.4s,v16.4s,v17.4s 1231 add w7,w7,w11 1232 add v20.4s,v20.4s,v21.4s 1233 add w8,w8,w12 1234 add v24.4s,v24.4s,v25.4s 1235 eor w17,w17,w5 1236 add v28.4s,v28.4s,v29.4s 1237 eor w19,w19,w6 1238 eor v11.16b,v11.16b,v8.16b 1239 eor w20,w20,w7 1240 eor v15.16b,v15.16b,v12.16b 1241 eor w21,w21,w8 1242 eor v19.16b,v19.16b,v16.16b 1243 ror w17,w17,#16 1244 eor v23.16b,v23.16b,v20.16b 1245 ror w19,w19,#16 1246 eor v27.16b,v27.16b,v24.16b 1247 ror w20,w20,#16 1248 eor v31.16b,v31.16b,v28.16b 1249 ror w21,w21,#16 1250 rev32 v11.8h,v11.8h 1251 add 
w13,w13,w17 1252 rev32 v15.8h,v15.8h 1253 add w14,w14,w19 1254 rev32 v19.8h,v19.8h 1255 add w15,w15,w20 1256 rev32 v23.8h,v23.8h 1257 add w16,w16,w21 1258 rev32 v27.8h,v27.8h 1259 eor w9,w9,w13 1260 rev32 v31.8h,v31.8h 1261 eor w10,w10,w14 1262 add v10.4s,v10.4s,v11.4s 1263 eor w11,w11,w15 1264 add v14.4s,v14.4s,v15.4s 1265 eor w12,w12,w16 1266 add v18.4s,v18.4s,v19.4s 1267 ror w9,w9,#20 1268 add v22.4s,v22.4s,v23.4s 1269 ror w10,w10,#20 1270 add v26.4s,v26.4s,v27.4s 1271 ror w11,w11,#20 1272 add v30.4s,v30.4s,v31.4s 1273 ror w12,w12,#20 1274 eor v0.16b,v9.16b,v10.16b 1275 add w5,w5,w9 1276 eor v1.16b,v13.16b,v14.16b 1277 add w6,w6,w10 1278 eor v2.16b,v17.16b,v18.16b 1279 add w7,w7,w11 1280 eor v3.16b,v21.16b,v22.16b 1281 add w8,w8,w12 1282 eor v4.16b,v25.16b,v26.16b 1283 eor w17,w17,w5 1284 eor v5.16b,v29.16b,v30.16b 1285 eor w19,w19,w6 1286 ushr v9.4s,v0.4s,#20 1287 eor w20,w20,w7 1288 ushr v13.4s,v1.4s,#20 1289 eor w21,w21,w8 1290 ushr v17.4s,v2.4s,#20 1291 ror w17,w17,#24 1292 ushr v21.4s,v3.4s,#20 1293 ror w19,w19,#24 1294 ushr v25.4s,v4.4s,#20 1295 ror w20,w20,#24 1296 ushr v29.4s,v5.4s,#20 1297 ror w21,w21,#24 1298 sli v9.4s,v0.4s,#12 1299 add w13,w13,w17 1300 sli v13.4s,v1.4s,#12 1301 add w14,w14,w19 1302 sli v17.4s,v2.4s,#12 1303 add w15,w15,w20 1304 sli v21.4s,v3.4s,#12 1305 add w16,w16,w21 1306 sli v25.4s,v4.4s,#12 1307 eor w9,w9,w13 1308 sli v29.4s,v5.4s,#12 1309 eor w10,w10,w14 1310 add v8.4s,v8.4s,v9.4s 1311 eor w11,w11,w15 1312 add v12.4s,v12.4s,v13.4s 1313 eor w12,w12,w16 1314 add v16.4s,v16.4s,v17.4s 1315 ror w9,w9,#25 1316 add v20.4s,v20.4s,v21.4s 1317 ror w10,w10,#25 1318 add v24.4s,v24.4s,v25.4s 1319 ror w11,w11,#25 1320 add v28.4s,v28.4s,v29.4s 1321 ror w12,w12,#25 1322 eor v11.16b,v11.16b,v8.16b 1323 add w5,w5,w10 1324 eor v15.16b,v15.16b,v12.16b 1325 add w6,w6,w11 1326 eor v19.16b,v19.16b,v16.16b 1327 add w7,w7,w12 1328 eor v23.16b,v23.16b,v20.16b 1329 add w8,w8,w9 1330 eor v27.16b,v27.16b,v24.16b 1331 eor w21,w21,w5 1332 eor 
v31.16b,v31.16b,v28.16b 1333 eor w17,w17,w6 1334 tbl v11.16b,{v11.16b},v6.16b 1335 eor w19,w19,w7 1336 tbl v15.16b,{v15.16b},v6.16b 1337 eor w20,w20,w8 1338 tbl v19.16b,{v19.16b},v6.16b 1339 ror w21,w21,#16 1340 tbl v23.16b,{v23.16b},v6.16b 1341 ror w17,w17,#16 1342 tbl v27.16b,{v27.16b},v6.16b 1343 ror w19,w19,#16 1344 tbl v31.16b,{v31.16b},v6.16b 1345 ror w20,w20,#16 1346 add v10.4s,v10.4s,v11.4s 1347 add w15,w15,w21 1348 add v14.4s,v14.4s,v15.4s 1349 add w16,w16,w17 1350 add v18.4s,v18.4s,v19.4s 1351 add w13,w13,w19 1352 add v22.4s,v22.4s,v23.4s 1353 add w14,w14,w20 1354 add v26.4s,v26.4s,v27.4s 1355 eor w10,w10,w15 1356 add v30.4s,v30.4s,v31.4s 1357 eor w11,w11,w16 1358 eor v0.16b,v9.16b,v10.16b 1359 eor w12,w12,w13 1360 eor v1.16b,v13.16b,v14.16b 1361 eor w9,w9,w14 1362 eor v2.16b,v17.16b,v18.16b 1363 ror w10,w10,#20 1364 eor v3.16b,v21.16b,v22.16b 1365 ror w11,w11,#20 1366 eor v4.16b,v25.16b,v26.16b 1367 ror w12,w12,#20 1368 eor v5.16b,v29.16b,v30.16b 1369 ror w9,w9,#20 1370 ushr v9.4s,v0.4s,#25 1371 add w5,w5,w10 1372 ushr v13.4s,v1.4s,#25 1373 add w6,w6,w11 1374 ushr v17.4s,v2.4s,#25 1375 add w7,w7,w12 1376 ushr v21.4s,v3.4s,#25 1377 add w8,w8,w9 1378 ushr v25.4s,v4.4s,#25 1379 eor w21,w21,w5 1380 ushr v29.4s,v5.4s,#25 1381 eor w17,w17,w6 1382 sli v9.4s,v0.4s,#7 1383 eor w19,w19,w7 1384 sli v13.4s,v1.4s,#7 1385 eor w20,w20,w8 1386 sli v17.4s,v2.4s,#7 1387 ror w21,w21,#24 1388 sli v21.4s,v3.4s,#7 1389 ror w17,w17,#24 1390 sli v25.4s,v4.4s,#7 1391 ror w19,w19,#24 1392 sli v29.4s,v5.4s,#7 1393 ror w20,w20,#24 1394 ext v10.16b,v10.16b,v10.16b,#8 1395 add w15,w15,w21 1396 ext v14.16b,v14.16b,v14.16b,#8 1397 add w16,w16,w17 1398 ext v18.16b,v18.16b,v18.16b,#8 1399 add w13,w13,w19 1400 ext v22.16b,v22.16b,v22.16b,#8 1401 add w14,w14,w20 1402 ext v26.16b,v26.16b,v26.16b,#8 1403 eor w10,w10,w15 1404 ext v30.16b,v30.16b,v30.16b,#8 1405 eor w11,w11,w16 1406 ext v11.16b,v11.16b,v11.16b,#4 1407 eor w12,w12,w13 1408 ext v15.16b,v15.16b,v15.16b,#4 1409 eor w9,w9,w14 1410 
ext v19.16b,v19.16b,v19.16b,#4 1411 ror w10,w10,#25 1412 ext v23.16b,v23.16b,v23.16b,#4 1413 ror w11,w11,#25 1414 ext v27.16b,v27.16b,v27.16b,#4 1415 ror w12,w12,#25 1416 ext v31.16b,v31.16b,v31.16b,#4 1417 ror w9,w9,#25 1418 ext v9.16b,v9.16b,v9.16b,#12 1419 ext v13.16b,v13.16b,v13.16b,#12 1420 ext v17.16b,v17.16b,v17.16b,#12 1421 ext v21.16b,v21.16b,v21.16b,#12 1422 ext v25.16b,v25.16b,v25.16b,#12 1423 ext v29.16b,v29.16b,v29.16b,#12 1424 cbnz x4,.Loop_upper_neon 1425 1426 add w5,w5,w22 // accumulate key block 1427 add x6,x6,x22,lsr#32 1428 add w7,w7,w23 1429 add x8,x8,x23,lsr#32 1430 add w9,w9,w24 1431 add x10,x10,x24,lsr#32 1432 add w11,w11,w25 1433 add x12,x12,x25,lsr#32 1434 add w13,w13,w26 1435 add x14,x14,x26,lsr#32 1436 add w15,w15,w27 1437 add x16,x16,x27,lsr#32 1438 add w17,w17,w28 1439 add x19,x19,x28,lsr#32 1440 add w20,w20,w30 1441 add x21,x21,x30,lsr#32 1442 1443 add x5,x5,x6,lsl#32 // pack 1444 add x7,x7,x8,lsl#32 1445 ldp x6,x8,[x1,#0] // load input 1446 add x9,x9,x10,lsl#32 1447 add x11,x11,x12,lsl#32 1448 ldp x10,x12,[x1,#16] 1449 add x13,x13,x14,lsl#32 1450 add x15,x15,x16,lsl#32 1451 ldp x14,x16,[x1,#32] 1452 add x17,x17,x19,lsl#32 1453 add x20,x20,x21,lsl#32 1454 ldp x19,x21,[x1,#48] 1455 add x1,x1,#64 1456#ifdef __AARCH64EB__ 1457 rev x5,x5 1458 rev x7,x7 1459 rev x9,x9 1460 rev x11,x11 1461 rev x13,x13 1462 rev x15,x15 1463 rev x17,x17 1464 rev x20,x20 1465#endif 1466 eor x5,x5,x6 1467 eor x7,x7,x8 1468 eor x9,x9,x10 1469 eor x11,x11,x12 1470 eor x13,x13,x14 1471 eor x15,x15,x16 1472 eor x17,x17,x19 1473 eor x20,x20,x21 1474 1475 stp x5,x7,[x0,#0] // store output 1476 add x28,x28,#1 // increment counter 1477 mov w5,w22 // unpack key block 1478 lsr x6,x22,#32 1479 stp x9,x11,[x0,#16] 1480 mov w7,w23 1481 lsr x8,x23,#32 1482 stp x13,x15,[x0,#32] 1483 mov w9,w24 1484 lsr x10,x24,#32 1485 stp x17,x20,[x0,#48] 1486 add x0,x0,#64 1487 mov w11,w25 1488 lsr x12,x25,#32 1489 mov w13,w26 1490 lsr x14,x26,#32 1491 mov w15,w27 1492 lsr x16,x27,#32 1493 
mov w17,w28 1494 lsr x19,x28,#32 1495 mov w20,w30 1496 lsr x21,x30,#32 1497 1498 mov x4,#5 1499.Loop_lower_neon: 1500 sub x4,x4,#1 1501 add v8.4s,v8.4s,v9.4s 1502 add w5,w5,w9 1503 add v12.4s,v12.4s,v13.4s 1504 add w6,w6,w10 1505 add v16.4s,v16.4s,v17.4s 1506 add w7,w7,w11 1507 add v20.4s,v20.4s,v21.4s 1508 add w8,w8,w12 1509 add v24.4s,v24.4s,v25.4s 1510 eor w17,w17,w5 1511 add v28.4s,v28.4s,v29.4s 1512 eor w19,w19,w6 1513 eor v11.16b,v11.16b,v8.16b 1514 eor w20,w20,w7 1515 eor v15.16b,v15.16b,v12.16b 1516 eor w21,w21,w8 1517 eor v19.16b,v19.16b,v16.16b 1518 ror w17,w17,#16 1519 eor v23.16b,v23.16b,v20.16b 1520 ror w19,w19,#16 1521 eor v27.16b,v27.16b,v24.16b 1522 ror w20,w20,#16 1523 eor v31.16b,v31.16b,v28.16b 1524 ror w21,w21,#16 1525 rev32 v11.8h,v11.8h 1526 add w13,w13,w17 1527 rev32 v15.8h,v15.8h 1528 add w14,w14,w19 1529 rev32 v19.8h,v19.8h 1530 add w15,w15,w20 1531 rev32 v23.8h,v23.8h 1532 add w16,w16,w21 1533 rev32 v27.8h,v27.8h 1534 eor w9,w9,w13 1535 rev32 v31.8h,v31.8h 1536 eor w10,w10,w14 1537 add v10.4s,v10.4s,v11.4s 1538 eor w11,w11,w15 1539 add v14.4s,v14.4s,v15.4s 1540 eor w12,w12,w16 1541 add v18.4s,v18.4s,v19.4s 1542 ror w9,w9,#20 1543 add v22.4s,v22.4s,v23.4s 1544 ror w10,w10,#20 1545 add v26.4s,v26.4s,v27.4s 1546 ror w11,w11,#20 1547 add v30.4s,v30.4s,v31.4s 1548 ror w12,w12,#20 1549 eor v0.16b,v9.16b,v10.16b 1550 add w5,w5,w9 1551 eor v1.16b,v13.16b,v14.16b 1552 add w6,w6,w10 1553 eor v2.16b,v17.16b,v18.16b 1554 add w7,w7,w11 1555 eor v3.16b,v21.16b,v22.16b 1556 add w8,w8,w12 1557 eor v4.16b,v25.16b,v26.16b 1558 eor w17,w17,w5 1559 eor v5.16b,v29.16b,v30.16b 1560 eor w19,w19,w6 1561 ushr v9.4s,v0.4s,#20 1562 eor w20,w20,w7 1563 ushr v13.4s,v1.4s,#20 1564 eor w21,w21,w8 1565 ushr v17.4s,v2.4s,#20 1566 ror w17,w17,#24 1567 ushr v21.4s,v3.4s,#20 1568 ror w19,w19,#24 1569 ushr v25.4s,v4.4s,#20 1570 ror w20,w20,#24 1571 ushr v29.4s,v5.4s,#20 1572 ror w21,w21,#24 1573 sli v9.4s,v0.4s,#12 1574 add w13,w13,w17 1575 sli v13.4s,v1.4s,#12 1576 add 
w14,w14,w19 1577 sli v17.4s,v2.4s,#12 1578 add w15,w15,w20 1579 sli v21.4s,v3.4s,#12 1580 add w16,w16,w21 1581 sli v25.4s,v4.4s,#12 1582 eor w9,w9,w13 1583 sli v29.4s,v5.4s,#12 1584 eor w10,w10,w14 1585 add v8.4s,v8.4s,v9.4s 1586 eor w11,w11,w15 1587 add v12.4s,v12.4s,v13.4s 1588 eor w12,w12,w16 1589 add v16.4s,v16.4s,v17.4s 1590 ror w9,w9,#25 1591 add v20.4s,v20.4s,v21.4s 1592 ror w10,w10,#25 1593 add v24.4s,v24.4s,v25.4s 1594 ror w11,w11,#25 1595 add v28.4s,v28.4s,v29.4s 1596 ror w12,w12,#25 1597 eor v11.16b,v11.16b,v8.16b 1598 add w5,w5,w10 1599 eor v15.16b,v15.16b,v12.16b 1600 add w6,w6,w11 1601 eor v19.16b,v19.16b,v16.16b 1602 add w7,w7,w12 1603 eor v23.16b,v23.16b,v20.16b 1604 add w8,w8,w9 1605 eor v27.16b,v27.16b,v24.16b 1606 eor w21,w21,w5 1607 eor v31.16b,v31.16b,v28.16b 1608 eor w17,w17,w6 1609 tbl v11.16b,{v11.16b},v6.16b 1610 eor w19,w19,w7 1611 tbl v15.16b,{v15.16b},v6.16b 1612 eor w20,w20,w8 1613 tbl v19.16b,{v19.16b},v6.16b 1614 ror w21,w21,#16 1615 tbl v23.16b,{v23.16b},v6.16b 1616 ror w17,w17,#16 1617 tbl v27.16b,{v27.16b},v6.16b 1618 ror w19,w19,#16 1619 tbl v31.16b,{v31.16b},v6.16b 1620 ror w20,w20,#16 1621 add v10.4s,v10.4s,v11.4s 1622 add w15,w15,w21 1623 add v14.4s,v14.4s,v15.4s 1624 add w16,w16,w17 1625 add v18.4s,v18.4s,v19.4s 1626 add w13,w13,w19 1627 add v22.4s,v22.4s,v23.4s 1628 add w14,w14,w20 1629 add v26.4s,v26.4s,v27.4s 1630 eor w10,w10,w15 1631 add v30.4s,v30.4s,v31.4s 1632 eor w11,w11,w16 1633 eor v0.16b,v9.16b,v10.16b 1634 eor w12,w12,w13 1635 eor v1.16b,v13.16b,v14.16b 1636 eor w9,w9,w14 1637 eor v2.16b,v17.16b,v18.16b 1638 ror w10,w10,#20 1639 eor v3.16b,v21.16b,v22.16b 1640 ror w11,w11,#20 1641 eor v4.16b,v25.16b,v26.16b 1642 ror w12,w12,#20 1643 eor v5.16b,v29.16b,v30.16b 1644 ror w9,w9,#20 1645 ushr v9.4s,v0.4s,#25 1646 add w5,w5,w10 1647 ushr v13.4s,v1.4s,#25 1648 add w6,w6,w11 1649 ushr v17.4s,v2.4s,#25 1650 add w7,w7,w12 1651 ushr v21.4s,v3.4s,#25 1652 add w8,w8,w9 1653 ushr v25.4s,v4.4s,#25 1654 eor w21,w21,w5 1655 ushr 
v29.4s,v5.4s,#25 1656 eor w17,w17,w6 1657 sli v9.4s,v0.4s,#7 1658 eor w19,w19,w7 1659 sli v13.4s,v1.4s,#7 1660 eor w20,w20,w8 1661 sli v17.4s,v2.4s,#7 1662 ror w21,w21,#24 1663 sli v21.4s,v3.4s,#7 1664 ror w17,w17,#24 1665 sli v25.4s,v4.4s,#7 1666 ror w19,w19,#24 1667 sli v29.4s,v5.4s,#7 1668 ror w20,w20,#24 1669 ext v10.16b,v10.16b,v10.16b,#8 1670 add w15,w15,w21 1671 ext v14.16b,v14.16b,v14.16b,#8 1672 add w16,w16,w17 1673 ext v18.16b,v18.16b,v18.16b,#8 1674 add w13,w13,w19 1675 ext v22.16b,v22.16b,v22.16b,#8 1676 add w14,w14,w20 1677 ext v26.16b,v26.16b,v26.16b,#8 1678 eor w10,w10,w15 1679 ext v30.16b,v30.16b,v30.16b,#8 1680 eor w11,w11,w16 1681 ext v11.16b,v11.16b,v11.16b,#12 1682 eor w12,w12,w13 1683 ext v15.16b,v15.16b,v15.16b,#12 1684 eor w9,w9,w14 1685 ext v19.16b,v19.16b,v19.16b,#12 1686 ror w10,w10,#25 1687 ext v23.16b,v23.16b,v23.16b,#12 1688 ror w11,w11,#25 1689 ext v27.16b,v27.16b,v27.16b,#12 1690 ror w12,w12,#25 1691 ext v31.16b,v31.16b,v31.16b,#12 1692 ror w9,w9,#25 1693 ext v9.16b,v9.16b,v9.16b,#4 1694 ext v13.16b,v13.16b,v13.16b,#4 1695 ext v17.16b,v17.16b,v17.16b,#4 1696 ext v21.16b,v21.16b,v21.16b,#4 1697 ext v25.16b,v25.16b,v25.16b,#4 1698 ext v29.16b,v29.16b,v29.16b,#4 1699 add v8.4s,v8.4s,v9.4s 1700 add w5,w5,w9 1701 add v12.4s,v12.4s,v13.4s 1702 add w6,w6,w10 1703 add v16.4s,v16.4s,v17.4s 1704 add w7,w7,w11 1705 add v20.4s,v20.4s,v21.4s 1706 add w8,w8,w12 1707 add v24.4s,v24.4s,v25.4s 1708 eor w17,w17,w5 1709 add v28.4s,v28.4s,v29.4s 1710 eor w19,w19,w6 1711 eor v11.16b,v11.16b,v8.16b 1712 eor w20,w20,w7 1713 eor v15.16b,v15.16b,v12.16b 1714 eor w21,w21,w8 1715 eor v19.16b,v19.16b,v16.16b 1716 ror w17,w17,#16 1717 eor v23.16b,v23.16b,v20.16b 1718 ror w19,w19,#16 1719 eor v27.16b,v27.16b,v24.16b 1720 ror w20,w20,#16 1721 eor v31.16b,v31.16b,v28.16b 1722 ror w21,w21,#16 1723 rev32 v11.8h,v11.8h 1724 add w13,w13,w17 1725 rev32 v15.8h,v15.8h 1726 add w14,w14,w19 1727 rev32 v19.8h,v19.8h 1728 add w15,w15,w20 1729 rev32 v23.8h,v23.8h 1730 add 
w16,w16,w21 1731 rev32 v27.8h,v27.8h 1732 eor w9,w9,w13 1733 rev32 v31.8h,v31.8h 1734 eor w10,w10,w14 1735 add v10.4s,v10.4s,v11.4s 1736 eor w11,w11,w15 1737 add v14.4s,v14.4s,v15.4s 1738 eor w12,w12,w16 1739 add v18.4s,v18.4s,v19.4s 1740 ror w9,w9,#20 1741 add v22.4s,v22.4s,v23.4s 1742 ror w10,w10,#20 1743 add v26.4s,v26.4s,v27.4s 1744 ror w11,w11,#20 1745 add v30.4s,v30.4s,v31.4s 1746 ror w12,w12,#20 1747 eor v0.16b,v9.16b,v10.16b 1748 add w5,w5,w9 1749 eor v1.16b,v13.16b,v14.16b 1750 add w6,w6,w10 1751 eor v2.16b,v17.16b,v18.16b 1752 add w7,w7,w11 1753 eor v3.16b,v21.16b,v22.16b 1754 add w8,w8,w12 1755 eor v4.16b,v25.16b,v26.16b 1756 eor w17,w17,w5 1757 eor v5.16b,v29.16b,v30.16b 1758 eor w19,w19,w6 1759 ushr v9.4s,v0.4s,#20 1760 eor w20,w20,w7 1761 ushr v13.4s,v1.4s,#20 1762 eor w21,w21,w8 1763 ushr v17.4s,v2.4s,#20 1764 ror w17,w17,#24 1765 ushr v21.4s,v3.4s,#20 1766 ror w19,w19,#24 1767 ushr v25.4s,v4.4s,#20 1768 ror w20,w20,#24 1769 ushr v29.4s,v5.4s,#20 1770 ror w21,w21,#24 1771 sli v9.4s,v0.4s,#12 1772 add w13,w13,w17 1773 sli v13.4s,v1.4s,#12 1774 add w14,w14,w19 1775 sli v17.4s,v2.4s,#12 1776 add w15,w15,w20 1777 sli v21.4s,v3.4s,#12 1778 add w16,w16,w21 1779 sli v25.4s,v4.4s,#12 1780 eor w9,w9,w13 1781 sli v29.4s,v5.4s,#12 1782 eor w10,w10,w14 1783 add v8.4s,v8.4s,v9.4s 1784 eor w11,w11,w15 1785 add v12.4s,v12.4s,v13.4s 1786 eor w12,w12,w16 1787 add v16.4s,v16.4s,v17.4s 1788 ror w9,w9,#25 1789 add v20.4s,v20.4s,v21.4s 1790 ror w10,w10,#25 1791 add v24.4s,v24.4s,v25.4s 1792 ror w11,w11,#25 1793 add v28.4s,v28.4s,v29.4s 1794 ror w12,w12,#25 1795 eor v11.16b,v11.16b,v8.16b 1796 add w5,w5,w10 1797 eor v15.16b,v15.16b,v12.16b 1798 add w6,w6,w11 1799 eor v19.16b,v19.16b,v16.16b 1800 add w7,w7,w12 1801 eor v23.16b,v23.16b,v20.16b 1802 add w8,w8,w9 1803 eor v27.16b,v27.16b,v24.16b 1804 eor w21,w21,w5 1805 eor v31.16b,v31.16b,v28.16b 1806 eor w17,w17,w6 1807 tbl v11.16b,{v11.16b},v6.16b 1808 eor w19,w19,w7 1809 tbl v15.16b,{v15.16b},v6.16b 1810 eor w20,w20,w8 
1811 tbl v19.16b,{v19.16b},v6.16b 1812 ror w21,w21,#16 1813 tbl v23.16b,{v23.16b},v6.16b 1814 ror w17,w17,#16 1815 tbl v27.16b,{v27.16b},v6.16b 1816 ror w19,w19,#16 1817 tbl v31.16b,{v31.16b},v6.16b 1818 ror w20,w20,#16 1819 add v10.4s,v10.4s,v11.4s 1820 add w15,w15,w21 1821 add v14.4s,v14.4s,v15.4s 1822 add w16,w16,w17 1823 add v18.4s,v18.4s,v19.4s 1824 add w13,w13,w19 1825 add v22.4s,v22.4s,v23.4s 1826 add w14,w14,w20 1827 add v26.4s,v26.4s,v27.4s 1828 eor w10,w10,w15 1829 add v30.4s,v30.4s,v31.4s 1830 eor w11,w11,w16 1831 eor v0.16b,v9.16b,v10.16b 1832 eor w12,w12,w13 1833 eor v1.16b,v13.16b,v14.16b 1834 eor w9,w9,w14 1835 eor v2.16b,v17.16b,v18.16b 1836 ror w10,w10,#20 1837 eor v3.16b,v21.16b,v22.16b 1838 ror w11,w11,#20 1839 eor v4.16b,v25.16b,v26.16b 1840 ror w12,w12,#20 1841 eor v5.16b,v29.16b,v30.16b 1842 ror w9,w9,#20 1843 ushr v9.4s,v0.4s,#25 1844 add w5,w5,w10 1845 ushr v13.4s,v1.4s,#25 1846 add w6,w6,w11 1847 ushr v17.4s,v2.4s,#25 1848 add w7,w7,w12 1849 ushr v21.4s,v3.4s,#25 1850 add w8,w8,w9 1851 ushr v25.4s,v4.4s,#25 1852 eor w21,w21,w5 1853 ushr v29.4s,v5.4s,#25 1854 eor w17,w17,w6 1855 sli v9.4s,v0.4s,#7 1856 eor w19,w19,w7 1857 sli v13.4s,v1.4s,#7 1858 eor w20,w20,w8 1859 sli v17.4s,v2.4s,#7 1860 ror w21,w21,#24 1861 sli v21.4s,v3.4s,#7 1862 ror w17,w17,#24 1863 sli v25.4s,v4.4s,#7 1864 ror w19,w19,#24 1865 sli v29.4s,v5.4s,#7 1866 ror w20,w20,#24 1867 ext v10.16b,v10.16b,v10.16b,#8 1868 add w15,w15,w21 1869 ext v14.16b,v14.16b,v14.16b,#8 1870 add w16,w16,w17 1871 ext v18.16b,v18.16b,v18.16b,#8 1872 add w13,w13,w19 1873 ext v22.16b,v22.16b,v22.16b,#8 1874 add w14,w14,w20 1875 ext v26.16b,v26.16b,v26.16b,#8 1876 eor w10,w10,w15 1877 ext v30.16b,v30.16b,v30.16b,#8 1878 eor w11,w11,w16 1879 ext v11.16b,v11.16b,v11.16b,#4 1880 eor w12,w12,w13 1881 ext v15.16b,v15.16b,v15.16b,#4 1882 eor w9,w9,w14 1883 ext v19.16b,v19.16b,v19.16b,#4 1884 ror w10,w10,#25 1885 ext v23.16b,v23.16b,v23.16b,#4 1886 ror w11,w11,#25 1887 ext v27.16b,v27.16b,v27.16b,#4 1888 
ror w12,w12,#25 1889 ext v31.16b,v31.16b,v31.16b,#4 1890 ror w9,w9,#25 1891 ext v9.16b,v9.16b,v9.16b,#12 1892 ext v13.16b,v13.16b,v13.16b,#12 1893 ext v17.16b,v17.16b,v17.16b,#12 1894 ext v21.16b,v21.16b,v21.16b,#12 1895 ext v25.16b,v25.16b,v25.16b,#12 1896 ext v29.16b,v29.16b,v29.16b,#12 1897 cbnz x4,.Loop_lower_neon 1898 1899 add w5,w5,w22 // accumulate key block 1900 ldp q0,q1,[sp,#0] 1901 add x6,x6,x22,lsr#32 1902 ldp q2,q3,[sp,#32] 1903 add w7,w7,w23 1904 ldp q4,q5,[sp,#64] 1905 add x8,x8,x23,lsr#32 1906 ldr q6,[sp,#96] 1907 add v8.4s,v8.4s,v0.4s 1908 add w9,w9,w24 1909 add v12.4s,v12.4s,v0.4s 1910 add x10,x10,x24,lsr#32 1911 add v16.4s,v16.4s,v0.4s 1912 add w11,w11,w25 1913 add v20.4s,v20.4s,v0.4s 1914 add x12,x12,x25,lsr#32 1915 add v24.4s,v24.4s,v0.4s 1916 add w13,w13,w26 1917 add v28.4s,v28.4s,v0.4s 1918 add x14,x14,x26,lsr#32 1919 add v10.4s,v10.4s,v2.4s 1920 add w15,w15,w27 1921 add v14.4s,v14.4s,v2.4s 1922 add x16,x16,x27,lsr#32 1923 add v18.4s,v18.4s,v2.4s 1924 add w17,w17,w28 1925 add v22.4s,v22.4s,v2.4s 1926 add x19,x19,x28,lsr#32 1927 add v26.4s,v26.4s,v2.4s 1928 add w20,w20,w30 1929 add v30.4s,v30.4s,v2.4s 1930 add x21,x21,x30,lsr#32 1931 add v27.4s,v27.4s,v7.4s // +4 1932 add x5,x5,x6,lsl#32 // pack 1933 add v31.4s,v31.4s,v7.4s // +4 1934 add x7,x7,x8,lsl#32 1935 add v11.4s,v11.4s,v3.4s 1936 ldp x6,x8,[x1,#0] // load input 1937 add v15.4s,v15.4s,v4.4s 1938 add x9,x9,x10,lsl#32 1939 add v19.4s,v19.4s,v5.4s 1940 add x11,x11,x12,lsl#32 1941 add v23.4s,v23.4s,v6.4s 1942 ldp x10,x12,[x1,#16] 1943 add v27.4s,v27.4s,v3.4s 1944 add x13,x13,x14,lsl#32 1945 add v31.4s,v31.4s,v4.4s 1946 add x15,x15,x16,lsl#32 1947 add v9.4s,v9.4s,v1.4s 1948 ldp x14,x16,[x1,#32] 1949 add v13.4s,v13.4s,v1.4s 1950 add x17,x17,x19,lsl#32 1951 add v17.4s,v17.4s,v1.4s 1952 add x20,x20,x21,lsl#32 1953 add v21.4s,v21.4s,v1.4s 1954 ldp x19,x21,[x1,#48] 1955 add v25.4s,v25.4s,v1.4s 1956 add x1,x1,#64 1957 add v29.4s,v29.4s,v1.4s 1958 1959#ifdef __AARCH64EB__ 1960 rev x5,x5 1961 rev 
x7,x7 1962 rev x9,x9 1963 rev x11,x11 1964 rev x13,x13 1965 rev x15,x15 1966 rev x17,x17 1967 rev x20,x20 1968#endif 1969 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1970 eor x5,x5,x6 1971 eor x7,x7,x8 1972 eor x9,x9,x10 1973 eor x11,x11,x12 1974 eor x13,x13,x14 1975 eor v8.16b,v8.16b,v0.16b 1976 eor x15,x15,x16 1977 eor v9.16b,v9.16b,v1.16b 1978 eor x17,x17,x19 1979 eor v10.16b,v10.16b,v2.16b 1980 eor x20,x20,x21 1981 eor v11.16b,v11.16b,v3.16b 1982 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1983 1984 stp x5,x7,[x0,#0] // store output 1985 add x28,x28,#7 // increment counter 1986 stp x9,x11,[x0,#16] 1987 stp x13,x15,[x0,#32] 1988 stp x17,x20,[x0,#48] 1989 add x0,x0,#64 1990 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1991 1992 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1993 eor v12.16b,v12.16b,v0.16b 1994 eor v13.16b,v13.16b,v1.16b 1995 eor v14.16b,v14.16b,v2.16b 1996 eor v15.16b,v15.16b,v3.16b 1997 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1998 1999 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 2000 eor v16.16b,v16.16b,v8.16b 2001 ldp q0,q1,[sp,#0] 2002 eor v17.16b,v17.16b,v9.16b 2003 ldp q2,q3,[sp,#32] 2004 eor v18.16b,v18.16b,v10.16b 2005 eor v19.16b,v19.16b,v11.16b 2006 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 2007 2008 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 2009 eor v20.16b,v20.16b,v12.16b 2010 eor v21.16b,v21.16b,v13.16b 2011 eor v22.16b,v22.16b,v14.16b 2012 eor v23.16b,v23.16b,v15.16b 2013 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 2014 2015 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 2016 eor v24.16b,v24.16b,v16.16b 2017 eor v25.16b,v25.16b,v17.16b 2018 eor v26.16b,v26.16b,v18.16b 2019 eor v27.16b,v27.16b,v19.16b 2020 st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 2021 2022 shl v8.4s,v7.4s,#1 // 4 -> 8 2023 eor v28.16b,v28.16b,v20.16b 2024 eor v29.16b,v29.16b,v21.16b 2025 eor v30.16b,v30.16b,v22.16b 2026 eor v31.16b,v31.16b,v23.16b 2027 st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 2028 2029 add v3.4s,v3.4s,v8.4s // += 8 2030 
add v4.4s,v4.4s,v8.4s 2031 add v5.4s,v5.4s,v8.4s 2032 add v6.4s,v6.4s,v8.4s 2033 2034 b.hs .Loop_outer_512_neon 2035 2036 adds x2,x2,#512 2037 ushr v7.4s,v7.4s,#1 // 4 -> 2 2038 2039 ldp d10,d11,[sp,#128+16] // meet ABI requirements 2040 ldp d12,d13,[sp,#128+32] 2041 ldp d14,d15,[sp,#128+48] 2042 2043 stp q0,q0,[sp,#0] // wipe off-load area 2044 stp q0,q0,[sp,#32] 2045 stp q0,q0,[sp,#64] 2046 2047 b.eq .Ldone_512_neon 2048 2049 sub x3,x3,#16 // .Lone 2050 cmp x2,#192 2051 add sp,sp,#128 2052 sub v3.4s,v3.4s,v7.4s // -= 2 2053 ld1 {v8.4s,v9.4s},[x3] 2054 b.hs .Loop_outer_neon 2055 2056 ldp d8,d9,[sp,#0] // meet ABI requirements 2057 eor v1.16b,v1.16b,v1.16b 2058 eor v2.16b,v2.16b,v2.16b 2059 eor v3.16b,v3.16b,v3.16b 2060 eor v4.16b,v4.16b,v4.16b 2061 eor v5.16b,v5.16b,v5.16b 2062 eor v6.16b,v6.16b,v6.16b 2063 b .Loop_outer 2064 2065.Ldone_512_neon: 2066 ldp d8,d9,[sp,#128+0] // meet ABI requirements 2067 ldp x19,x20,[x29,#16] 2068 add sp,sp,#128+64 2069 ldp x21,x22,[x29,#32] 2070 ldp x23,x24,[x29,#48] 2071 ldp x25,x26,[x29,#64] 2072 ldp x27,x28,[x29,#80] 2073 ldp x29,x30,[sp],#96 2074 AARCH64_VALIDATE_LINK_REGISTER 2075 ret 2076.size ChaCha20_512_neon,.-ChaCha20_512_neon 2077