/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
#include "arm_arch.h"

.text


.hidden	OPENSSL_armcap_P

// ---------------------------------------------------------------------
// Constant pool (kept in .text so it can be reached with adr).
//   .Lsigma  16 bytes: the ASCII string "expand 32-byte k" stored as
//            two little-endian 64-bit words.
//   .Lone    {1,0,0,0}: sits directly after .Lsigma; the NEON code
//            reaches it by post-incrementing the .Lsigma pointer by 16
//            and uses it as the per-block counter increment.
//   .LOPENSSL_armcap_P
//            distance from this slot to OPENSSL_armcap_P, so the
//            capability word can be loaded as [slot + distance].
// ---------------------------------------------------------------------
.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.LOPENSSL_armcap_P:
#ifdef __ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

// ---------------------------------------------------------------------
// ChaCha20_ctr32 -- XOR a buffer with the ChaCha20 key stream.
//
// Register arguments, as used by the code below:
//   x0  output pointer  (written with stp/strb)
//   x1  input pointer   (read with ldp/ldrb)
//   x2  length in bytes (0 returns immediately)
//   x3  pointer to 32 bytes of key
//   x4  pointer to 16 bytes of counter block
// NOTE(review): presumably matches the C prototype used by the rest of
// the library; confirm against the declaring header.
//
// Dispatch: lengths below 192 bytes always take the scalar path; longer
// inputs tail-branch to ChaCha20_neon when the ARMV7_NEON bit is set in
// OPENSSL_armcap_P.
//
// Scalar register roles:
//   x22,x23           sigma, two 32-bit words per register
//   x24..x27          key, two 32-bit words per register
//   x28,x30           counter block; x30 is free for this because the
//                     link register is saved in the frame
//   w5..w8            working state words 0-3
//   w9..w12           working state words 4-7
//   w13..w16          working state words 8-11
//   w17,w19,w20,w21   working state words 12-15
//   x4                round-loop counter (reused after the counter
//                     block has been loaded)
//
// Stack frame: 96 bytes holding x29/x30 and x19-x28, plus 64 bytes of
// scratch below it that receives one key stream block when a partial
// block has to be processed byte by byte.
// ---------------------------------------------------------------------
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	cbz	x2,.Labort
	adr	x5,.LOPENSSL_armcap_P
	cmp	x2,#192
	b.lo	.Lshort
#ifdef __ILP32__
	ldrsw	x6,[x5]
#else
	ldr	x6,[x5]
#endif
	ldr	w17,[x6,x5]		// capability word at slot address + stored distance
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64		// 64-byte key stream scratch for the tail

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __ARMEB__
	// swap the 32-bit halves so the lower-addressed word ends up in the low half
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

	// One iteration per 64-byte block: split the packed 64-bit state
	// registers into sixteen 32-bit working words.
.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 iterations, each a column pass plus a diagonal pass
	// The flags set by this subs are consumed by b.lo .Ltail and
	// b.hi .Loop_outer further down; no instruction in between writes
	// the condition flags.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// Column pass on (w5,w9,w13,w17) (w6,w10,w14,w19) (w7,w11,w15,w20)
	// (w8,w12,w16,w21), four quarter-rounds in lock step.
	// ror by 16/20/24/25 is a left rotate by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal pass on (w5,w10,w15,w21) (w6,w11,w16,w17) (w7,w12,w13,w19)
	// (w8,w9,w14,w20); same operations, registers re-paired.
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the input state back in. The odd words use a 64-bit add of
	// the upper half; any carry into bit 32 is shifted out when the
	// pair is packed with lsl#32 below.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail			// fewer than 64 bytes were left before the subs

	// Full block: pack word pairs into 64-bit registers while loading
	// the 64 input bytes into the registers that have just been consumed.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer		// still the flags from subs: more input remains

	// Exact multiple of 64 bytes: the stack scratch was never written,
	// so only the frame has to be unwound.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
.Labort:
	ret

	// Partial final block. .Ltail is entered with x2 = remaining - 64;
	// .Less_than_64 is also branched to from ChaCha20_neon with x2
	// already holding the remaining byte count and the scalar block
	// still unpacked in w5..w21.
.align	4
.Ltail:
	add	x2,x2,#64
.Less_than_64:
	// Bias all three pointers to the end of their data and run x2 from
	// -len up to 0. x0 is biased by one less because the index has
	// already been incremented when the store executes.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the key stream block so it can be indexed byte by byte.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the spilled key stream before releasing the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

// ---------------------------------------------------------------------
// ChaCha20_neon -- NEON path, reached from ChaCha20_ctr32 with the same
// x0..x4 arguments. Not declared .globl.
//
// Builds the same 96-byte frame as the scalar path, then:
//   * len >= 512: branches to .L512_or_more_neon, a label defined later
//     in the file, with the frame already set up;
//   * otherwise: produces 256 bytes per outer iteration, one block in
//     the scalar registers (same roles as in ChaCha20_ctr32) interleaved
//     with three blocks in NEON registers.
//
// NEON register roles (one 4-word state row per register):
//   v24        sigma row
//   v25,v26    key rows
//   v27,v28,v29  counter rows for the three NEON blocks; they hold the
//              loaded counter block plus 1, 2 and 3 times .Lone, while
//              the scalar block uses the unmodified value in x28
//   v31        .Lone, shifted to {4,0,0,0} to step all counters
//   v0-v3, v4-v7, v16-v19   working state of the three NEON blocks
//   v20-v23    temporaries for rotates, later input data
//
// Output order per iteration: scalar block first, then v0-v3, v4-v7,
// v16-v19.
// ---------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adr	x5,.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16	// x5 now points at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef __ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2		// 1 -> 4

	// Set up four blocks of working state: scalar unpack interleaved
	// with copies of the invariant rows into the NEON working registers.
.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10
	// Flags from this subs are consumed by b.lo .Ltail_neon and
	// b.hi .Loop_outer_neon; nothing in between writes the condition flags.
	subs	x2,x2,#256
	// Each iteration: column pass, lane rotation with ext, diagonal
	// pass, inverse lane rotation. NEON rotates are built as
	// rev32 on .8h (rotate by 16) or ushr by 32-n followed by sli by n
	// (left rotate by n = 12, 8, 7). The scalar instructions woven in
	// between perform the same rounds on the fourth block.
.Loop_neon:
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// rotate the lanes of rows 1-3 so the diagonals line up as columns
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// undo the lane rotation (amounts 4 and 12 swapped relative to above)
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	// Add the input state to all four blocks; each NEON block gets its
	// own counter row (v27, v28, v29).
	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon		// fewer than 256 bytes were left before the subs

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 are free again: reuse for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon	// still the flags from subs: more input remains

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret

	// Fewer than 256 bytes remain. Whole 64-byte blocks are emitted in
	// the usual order (scalar, v0-v3, v4-v7); the first block that is
	// only partly needed is spilled to the stack scratch and consumed
	// byte by byte at .Last_neon. Each b.eq below tests the flags of
	// the preceding cmp x2,#64, which the loads, stores and eors in
	// between leave untouched.
.Ltail_neon:
	add	x2,x2,#256
	cmp	x2,#64
	b.lo	.Less_than_64		// scalar block only; finish in ChaCha20_ctr32 tail code

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon		// exactly 64 bytes were left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

	// Byte-wise XOR of the last x2 bytes against the key stream block
	// at [sp]; same negative-index scheme as .Less_than_64.
.align	4
.Last_neon:
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the spilled key stream before releasing the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
.inst	0xd50323bf			// autiasp
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
// NOTE(review): only the first two instructions of ChaCha20_512_neon
// are reproduced here; the function continues in the text that follows.
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
.inst	0xd503233f			// paciasp
	stp	x29,x30,[sp,#-96]!
816 add x29,sp,#0 817 818 adr x5,.Lsigma 819 stp x19,x20,[sp,#16] 820 stp x21,x22,[sp,#32] 821 stp x23,x24,[sp,#48] 822 stp x25,x26,[sp,#64] 823 stp x27,x28,[sp,#80] 824 825.L512_or_more_neon: 826 sub sp,sp,#128+64 827 828 ldp x22,x23,[x5] // load sigma 829 ld1 {v24.4s},[x5],#16 830 ldp x24,x25,[x3] // load key 831 ldp x26,x27,[x3,#16] 832 ld1 {v25.4s,v26.4s},[x3] 833 ldp x28,x30,[x4] // load counter 834 ld1 {v27.4s},[x4] 835 ld1 {v31.4s},[x5] 836#ifdef __ARMEB__ 837 rev64 v24.4s,v24.4s 838 ror x24,x24,#32 839 ror x25,x25,#32 840 ror x26,x26,#32 841 ror x27,x27,#32 842 ror x28,x28,#32 843 ror x30,x30,#32 844#endif 845 add v27.4s,v27.4s,v31.4s // += 1 846 stp q24,q25,[sp,#0] // off-load key block, invariant part 847 add v27.4s,v27.4s,v31.4s // not typo 848 str q26,[sp,#32] 849 add v28.4s,v27.4s,v31.4s 850 add v29.4s,v28.4s,v31.4s 851 add v30.4s,v29.4s,v31.4s 852 shl v31.4s,v31.4s,#2 // 1 -> 4 853 854 stp d8,d9,[sp,#128+0] // meet ABI requirements 855 stp d10,d11,[sp,#128+16] 856 stp d12,d13,[sp,#128+32] 857 stp d14,d15,[sp,#128+48] 858 859 sub x2,x2,#512 // not typo 860 861.Loop_outer_512_neon: 862 mov v0.16b,v24.16b 863 mov v4.16b,v24.16b 864 mov v8.16b,v24.16b 865 mov v12.16b,v24.16b 866 mov v16.16b,v24.16b 867 mov v20.16b,v24.16b 868 mov v1.16b,v25.16b 869 mov w5,w22 // unpack key block 870 mov v5.16b,v25.16b 871 lsr x6,x22,#32 872 mov v9.16b,v25.16b 873 mov w7,w23 874 mov v13.16b,v25.16b 875 lsr x8,x23,#32 876 mov v17.16b,v25.16b 877 mov w9,w24 878 mov v21.16b,v25.16b 879 lsr x10,x24,#32 880 mov v3.16b,v27.16b 881 mov w11,w25 882 mov v7.16b,v28.16b 883 lsr x12,x25,#32 884 mov v11.16b,v29.16b 885 mov w13,w26 886 mov v15.16b,v30.16b 887 lsr x14,x26,#32 888 mov v2.16b,v26.16b 889 mov w15,w27 890 mov v6.16b,v26.16b 891 lsr x16,x27,#32 892 add v19.4s,v3.4s,v31.4s // +4 893 mov w17,w28 894 add v23.4s,v7.4s,v31.4s // +4 895 lsr x19,x28,#32 896 mov v10.16b,v26.16b 897 mov w20,w30 898 mov v14.16b,v26.16b 899 lsr x21,x30,#32 900 mov v18.16b,v26.16b 901 stp 
q27,q28,[sp,#48] // off-load key block, variable part 902 mov v22.16b,v26.16b 903 str q29,[sp,#80] 904 905 mov x4,#5 906 subs x2,x2,#512 907.Loop_upper_neon: 908 sub x4,x4,#1 909 add v0.4s,v0.4s,v1.4s 910 add w5,w5,w9 911 add v4.4s,v4.4s,v5.4s 912 add w6,w6,w10 913 add v8.4s,v8.4s,v9.4s 914 add w7,w7,w11 915 add v12.4s,v12.4s,v13.4s 916 add w8,w8,w12 917 add v16.4s,v16.4s,v17.4s 918 eor w17,w17,w5 919 add v20.4s,v20.4s,v21.4s 920 eor w19,w19,w6 921 eor v3.16b,v3.16b,v0.16b 922 eor w20,w20,w7 923 eor v7.16b,v7.16b,v4.16b 924 eor w21,w21,w8 925 eor v11.16b,v11.16b,v8.16b 926 ror w17,w17,#16 927 eor v15.16b,v15.16b,v12.16b 928 ror w19,w19,#16 929 eor v19.16b,v19.16b,v16.16b 930 ror w20,w20,#16 931 eor v23.16b,v23.16b,v20.16b 932 ror w21,w21,#16 933 rev32 v3.8h,v3.8h 934 add w13,w13,w17 935 rev32 v7.8h,v7.8h 936 add w14,w14,w19 937 rev32 v11.8h,v11.8h 938 add w15,w15,w20 939 rev32 v15.8h,v15.8h 940 add w16,w16,w21 941 rev32 v19.8h,v19.8h 942 eor w9,w9,w13 943 rev32 v23.8h,v23.8h 944 eor w10,w10,w14 945 add v2.4s,v2.4s,v3.4s 946 eor w11,w11,w15 947 add v6.4s,v6.4s,v7.4s 948 eor w12,w12,w16 949 add v10.4s,v10.4s,v11.4s 950 ror w9,w9,#20 951 add v14.4s,v14.4s,v15.4s 952 ror w10,w10,#20 953 add v18.4s,v18.4s,v19.4s 954 ror w11,w11,#20 955 add v22.4s,v22.4s,v23.4s 956 ror w12,w12,#20 957 eor v24.16b,v1.16b,v2.16b 958 add w5,w5,w9 959 eor v25.16b,v5.16b,v6.16b 960 add w6,w6,w10 961 eor v26.16b,v9.16b,v10.16b 962 add w7,w7,w11 963 eor v27.16b,v13.16b,v14.16b 964 add w8,w8,w12 965 eor v28.16b,v17.16b,v18.16b 966 eor w17,w17,w5 967 eor v29.16b,v21.16b,v22.16b 968 eor w19,w19,w6 969 ushr v1.4s,v24.4s,#20 970 eor w20,w20,w7 971 ushr v5.4s,v25.4s,#20 972 eor w21,w21,w8 973 ushr v9.4s,v26.4s,#20 974 ror w17,w17,#24 975 ushr v13.4s,v27.4s,#20 976 ror w19,w19,#24 977 ushr v17.4s,v28.4s,#20 978 ror w20,w20,#24 979 ushr v21.4s,v29.4s,#20 980 ror w21,w21,#24 981 sli v1.4s,v24.4s,#12 982 add w13,w13,w17 983 sli v5.4s,v25.4s,#12 984 add w14,w14,w19 985 sli v9.4s,v26.4s,#12 986 add 
w15,w15,w20 987 sli v13.4s,v27.4s,#12 988 add w16,w16,w21 989 sli v17.4s,v28.4s,#12 990 eor w9,w9,w13 991 sli v21.4s,v29.4s,#12 992 eor w10,w10,w14 993 add v0.4s,v0.4s,v1.4s 994 eor w11,w11,w15 995 add v4.4s,v4.4s,v5.4s 996 eor w12,w12,w16 997 add v8.4s,v8.4s,v9.4s 998 ror w9,w9,#25 999 add v12.4s,v12.4s,v13.4s 1000 ror w10,w10,#25 1001 add v16.4s,v16.4s,v17.4s 1002 ror w11,w11,#25 1003 add v20.4s,v20.4s,v21.4s 1004 ror w12,w12,#25 1005 eor v24.16b,v3.16b,v0.16b 1006 add w5,w5,w10 1007 eor v25.16b,v7.16b,v4.16b 1008 add w6,w6,w11 1009 eor v26.16b,v11.16b,v8.16b 1010 add w7,w7,w12 1011 eor v27.16b,v15.16b,v12.16b 1012 add w8,w8,w9 1013 eor v28.16b,v19.16b,v16.16b 1014 eor w21,w21,w5 1015 eor v29.16b,v23.16b,v20.16b 1016 eor w17,w17,w6 1017 ushr v3.4s,v24.4s,#24 1018 eor w19,w19,w7 1019 ushr v7.4s,v25.4s,#24 1020 eor w20,w20,w8 1021 ushr v11.4s,v26.4s,#24 1022 ror w21,w21,#16 1023 ushr v15.4s,v27.4s,#24 1024 ror w17,w17,#16 1025 ushr v19.4s,v28.4s,#24 1026 ror w19,w19,#16 1027 ushr v23.4s,v29.4s,#24 1028 ror w20,w20,#16 1029 sli v3.4s,v24.4s,#8 1030 add w15,w15,w21 1031 sli v7.4s,v25.4s,#8 1032 add w16,w16,w17 1033 sli v11.4s,v26.4s,#8 1034 add w13,w13,w19 1035 sli v15.4s,v27.4s,#8 1036 add w14,w14,w20 1037 sli v19.4s,v28.4s,#8 1038 eor w10,w10,w15 1039 sli v23.4s,v29.4s,#8 1040 eor w11,w11,w16 1041 add v2.4s,v2.4s,v3.4s 1042 eor w12,w12,w13 1043 add v6.4s,v6.4s,v7.4s 1044 eor w9,w9,w14 1045 add v10.4s,v10.4s,v11.4s 1046 ror w10,w10,#20 1047 add v14.4s,v14.4s,v15.4s 1048 ror w11,w11,#20 1049 add v18.4s,v18.4s,v19.4s 1050 ror w12,w12,#20 1051 add v22.4s,v22.4s,v23.4s 1052 ror w9,w9,#20 1053 eor v24.16b,v1.16b,v2.16b 1054 add w5,w5,w10 1055 eor v25.16b,v5.16b,v6.16b 1056 add w6,w6,w11 1057 eor v26.16b,v9.16b,v10.16b 1058 add w7,w7,w12 1059 eor v27.16b,v13.16b,v14.16b 1060 add w8,w8,w9 1061 eor v28.16b,v17.16b,v18.16b 1062 eor w21,w21,w5 1063 eor v29.16b,v21.16b,v22.16b 1064 eor w17,w17,w6 1065 ushr v1.4s,v24.4s,#25 1066 eor w19,w19,w7 1067 ushr v5.4s,v25.4s,#25 1068 
eor w20,w20,w8 1069 ushr v9.4s,v26.4s,#25 1070 ror w21,w21,#24 1071 ushr v13.4s,v27.4s,#25 1072 ror w17,w17,#24 1073 ushr v17.4s,v28.4s,#25 1074 ror w19,w19,#24 1075 ushr v21.4s,v29.4s,#25 1076 ror w20,w20,#24 1077 sli v1.4s,v24.4s,#7 1078 add w15,w15,w21 1079 sli v5.4s,v25.4s,#7 1080 add w16,w16,w17 1081 sli v9.4s,v26.4s,#7 1082 add w13,w13,w19 1083 sli v13.4s,v27.4s,#7 1084 add w14,w14,w20 1085 sli v17.4s,v28.4s,#7 1086 eor w10,w10,w15 1087 sli v21.4s,v29.4s,#7 1088 eor w11,w11,w16 1089 ext v2.16b,v2.16b,v2.16b,#8 1090 eor w12,w12,w13 1091 ext v6.16b,v6.16b,v6.16b,#8 1092 eor w9,w9,w14 1093 ext v10.16b,v10.16b,v10.16b,#8 1094 ror w10,w10,#25 1095 ext v14.16b,v14.16b,v14.16b,#8 1096 ror w11,w11,#25 1097 ext v18.16b,v18.16b,v18.16b,#8 1098 ror w12,w12,#25 1099 ext v22.16b,v22.16b,v22.16b,#8 1100 ror w9,w9,#25 1101 ext v3.16b,v3.16b,v3.16b,#12 1102 ext v7.16b,v7.16b,v7.16b,#12 1103 ext v11.16b,v11.16b,v11.16b,#12 1104 ext v15.16b,v15.16b,v15.16b,#12 1105 ext v19.16b,v19.16b,v19.16b,#12 1106 ext v23.16b,v23.16b,v23.16b,#12 1107 ext v1.16b,v1.16b,v1.16b,#4 1108 ext v5.16b,v5.16b,v5.16b,#4 1109 ext v9.16b,v9.16b,v9.16b,#4 1110 ext v13.16b,v13.16b,v13.16b,#4 1111 ext v17.16b,v17.16b,v17.16b,#4 1112 ext v21.16b,v21.16b,v21.16b,#4 1113 add v0.4s,v0.4s,v1.4s 1114 add w5,w5,w9 1115 add v4.4s,v4.4s,v5.4s 1116 add w6,w6,w10 1117 add v8.4s,v8.4s,v9.4s 1118 add w7,w7,w11 1119 add v12.4s,v12.4s,v13.4s 1120 add w8,w8,w12 1121 add v16.4s,v16.4s,v17.4s 1122 eor w17,w17,w5 1123 add v20.4s,v20.4s,v21.4s 1124 eor w19,w19,w6 1125 eor v3.16b,v3.16b,v0.16b 1126 eor w20,w20,w7 1127 eor v7.16b,v7.16b,v4.16b 1128 eor w21,w21,w8 1129 eor v11.16b,v11.16b,v8.16b 1130 ror w17,w17,#16 1131 eor v15.16b,v15.16b,v12.16b 1132 ror w19,w19,#16 1133 eor v19.16b,v19.16b,v16.16b 1134 ror w20,w20,#16 1135 eor v23.16b,v23.16b,v20.16b 1136 ror w21,w21,#16 1137 rev32 v3.8h,v3.8h 1138 add w13,w13,w17 1139 rev32 v7.8h,v7.8h 1140 add w14,w14,w19 1141 rev32 v11.8h,v11.8h 1142 add w15,w15,w20 1143 rev32 
v15.8h,v15.8h 1144 add w16,w16,w21 1145 rev32 v19.8h,v19.8h 1146 eor w9,w9,w13 1147 rev32 v23.8h,v23.8h 1148 eor w10,w10,w14 1149 add v2.4s,v2.4s,v3.4s 1150 eor w11,w11,w15 1151 add v6.4s,v6.4s,v7.4s 1152 eor w12,w12,w16 1153 add v10.4s,v10.4s,v11.4s 1154 ror w9,w9,#20 1155 add v14.4s,v14.4s,v15.4s 1156 ror w10,w10,#20 1157 add v18.4s,v18.4s,v19.4s 1158 ror w11,w11,#20 1159 add v22.4s,v22.4s,v23.4s 1160 ror w12,w12,#20 1161 eor v24.16b,v1.16b,v2.16b 1162 add w5,w5,w9 1163 eor v25.16b,v5.16b,v6.16b 1164 add w6,w6,w10 1165 eor v26.16b,v9.16b,v10.16b 1166 add w7,w7,w11 1167 eor v27.16b,v13.16b,v14.16b 1168 add w8,w8,w12 1169 eor v28.16b,v17.16b,v18.16b 1170 eor w17,w17,w5 1171 eor v29.16b,v21.16b,v22.16b 1172 eor w19,w19,w6 1173 ushr v1.4s,v24.4s,#20 1174 eor w20,w20,w7 1175 ushr v5.4s,v25.4s,#20 1176 eor w21,w21,w8 1177 ushr v9.4s,v26.4s,#20 1178 ror w17,w17,#24 1179 ushr v13.4s,v27.4s,#20 1180 ror w19,w19,#24 1181 ushr v17.4s,v28.4s,#20 1182 ror w20,w20,#24 1183 ushr v21.4s,v29.4s,#20 1184 ror w21,w21,#24 1185 sli v1.4s,v24.4s,#12 1186 add w13,w13,w17 1187 sli v5.4s,v25.4s,#12 1188 add w14,w14,w19 1189 sli v9.4s,v26.4s,#12 1190 add w15,w15,w20 1191 sli v13.4s,v27.4s,#12 1192 add w16,w16,w21 1193 sli v17.4s,v28.4s,#12 1194 eor w9,w9,w13 1195 sli v21.4s,v29.4s,#12 1196 eor w10,w10,w14 1197 add v0.4s,v0.4s,v1.4s 1198 eor w11,w11,w15 1199 add v4.4s,v4.4s,v5.4s 1200 eor w12,w12,w16 1201 add v8.4s,v8.4s,v9.4s 1202 ror w9,w9,#25 1203 add v12.4s,v12.4s,v13.4s 1204 ror w10,w10,#25 1205 add v16.4s,v16.4s,v17.4s 1206 ror w11,w11,#25 1207 add v20.4s,v20.4s,v21.4s 1208 ror w12,w12,#25 1209 eor v24.16b,v3.16b,v0.16b 1210 add w5,w5,w10 1211 eor v25.16b,v7.16b,v4.16b 1212 add w6,w6,w11 1213 eor v26.16b,v11.16b,v8.16b 1214 add w7,w7,w12 1215 eor v27.16b,v15.16b,v12.16b 1216 add w8,w8,w9 1217 eor v28.16b,v19.16b,v16.16b 1218 eor w21,w21,w5 1219 eor v29.16b,v23.16b,v20.16b 1220 eor w17,w17,w6 1221 ushr v3.4s,v24.4s,#24 1222 eor w19,w19,w7 1223 ushr v7.4s,v25.4s,#24 1224 eor w20,w20,w8 
1225 ushr v11.4s,v26.4s,#24 1226 ror w21,w21,#16 1227 ushr v15.4s,v27.4s,#24 1228 ror w17,w17,#16 1229 ushr v19.4s,v28.4s,#24 1230 ror w19,w19,#16 1231 ushr v23.4s,v29.4s,#24 1232 ror w20,w20,#16 1233 sli v3.4s,v24.4s,#8 1234 add w15,w15,w21 1235 sli v7.4s,v25.4s,#8 1236 add w16,w16,w17 1237 sli v11.4s,v26.4s,#8 1238 add w13,w13,w19 1239 sli v15.4s,v27.4s,#8 1240 add w14,w14,w20 1241 sli v19.4s,v28.4s,#8 1242 eor w10,w10,w15 1243 sli v23.4s,v29.4s,#8 1244 eor w11,w11,w16 1245 add v2.4s,v2.4s,v3.4s 1246 eor w12,w12,w13 1247 add v6.4s,v6.4s,v7.4s 1248 eor w9,w9,w14 1249 add v10.4s,v10.4s,v11.4s 1250 ror w10,w10,#20 1251 add v14.4s,v14.4s,v15.4s 1252 ror w11,w11,#20 1253 add v18.4s,v18.4s,v19.4s 1254 ror w12,w12,#20 1255 add v22.4s,v22.4s,v23.4s 1256 ror w9,w9,#20 1257 eor v24.16b,v1.16b,v2.16b 1258 add w5,w5,w10 1259 eor v25.16b,v5.16b,v6.16b 1260 add w6,w6,w11 1261 eor v26.16b,v9.16b,v10.16b 1262 add w7,w7,w12 1263 eor v27.16b,v13.16b,v14.16b 1264 add w8,w8,w9 1265 eor v28.16b,v17.16b,v18.16b 1266 eor w21,w21,w5 1267 eor v29.16b,v21.16b,v22.16b 1268 eor w17,w17,w6 1269 ushr v1.4s,v24.4s,#25 1270 eor w19,w19,w7 1271 ushr v5.4s,v25.4s,#25 1272 eor w20,w20,w8 1273 ushr v9.4s,v26.4s,#25 1274 ror w21,w21,#24 1275 ushr v13.4s,v27.4s,#25 1276 ror w17,w17,#24 1277 ushr v17.4s,v28.4s,#25 1278 ror w19,w19,#24 1279 ushr v21.4s,v29.4s,#25 1280 ror w20,w20,#24 1281 sli v1.4s,v24.4s,#7 1282 add w15,w15,w21 1283 sli v5.4s,v25.4s,#7 1284 add w16,w16,w17 1285 sli v9.4s,v26.4s,#7 1286 add w13,w13,w19 1287 sli v13.4s,v27.4s,#7 1288 add w14,w14,w20 1289 sli v17.4s,v28.4s,#7 1290 eor w10,w10,w15 1291 sli v21.4s,v29.4s,#7 1292 eor w11,w11,w16 1293 ext v2.16b,v2.16b,v2.16b,#8 1294 eor w12,w12,w13 1295 ext v6.16b,v6.16b,v6.16b,#8 1296 eor w9,w9,w14 1297 ext v10.16b,v10.16b,v10.16b,#8 1298 ror w10,w10,#25 1299 ext v14.16b,v14.16b,v14.16b,#8 1300 ror w11,w11,#25 1301 ext v18.16b,v18.16b,v18.16b,#8 1302 ror w12,w12,#25 1303 ext v22.16b,v22.16b,v22.16b,#8 1304 ror w9,w9,#25 1305 ext 
v3.16b,v3.16b,v3.16b,#4 1306 ext v7.16b,v7.16b,v7.16b,#4 1307 ext v11.16b,v11.16b,v11.16b,#4 1308 ext v15.16b,v15.16b,v15.16b,#4 1309 ext v19.16b,v19.16b,v19.16b,#4 1310 ext v23.16b,v23.16b,v23.16b,#4 1311 ext v1.16b,v1.16b,v1.16b,#12 1312 ext v5.16b,v5.16b,v5.16b,#12 1313 ext v9.16b,v9.16b,v9.16b,#12 1314 ext v13.16b,v13.16b,v13.16b,#12 1315 ext v17.16b,v17.16b,v17.16b,#12 1316 ext v21.16b,v21.16b,v21.16b,#12 1317 cbnz x4,.Loop_upper_neon 1318 1319 add w5,w5,w22 // accumulate key block 1320 add x6,x6,x22,lsr#32 1321 add w7,w7,w23 1322 add x8,x8,x23,lsr#32 1323 add w9,w9,w24 1324 add x10,x10,x24,lsr#32 1325 add w11,w11,w25 1326 add x12,x12,x25,lsr#32 1327 add w13,w13,w26 1328 add x14,x14,x26,lsr#32 1329 add w15,w15,w27 1330 add x16,x16,x27,lsr#32 1331 add w17,w17,w28 1332 add x19,x19,x28,lsr#32 1333 add w20,w20,w30 1334 add x21,x21,x30,lsr#32 1335 1336 add x5,x5,x6,lsl#32 // pack 1337 add x7,x7,x8,lsl#32 1338 ldp x6,x8,[x1,#0] // load input 1339 add x9,x9,x10,lsl#32 1340 add x11,x11,x12,lsl#32 1341 ldp x10,x12,[x1,#16] 1342 add x13,x13,x14,lsl#32 1343 add x15,x15,x16,lsl#32 1344 ldp x14,x16,[x1,#32] 1345 add x17,x17,x19,lsl#32 1346 add x20,x20,x21,lsl#32 1347 ldp x19,x21,[x1,#48] 1348 add x1,x1,#64 1349#ifdef __ARMEB__ 1350 rev x5,x5 1351 rev x7,x7 1352 rev x9,x9 1353 rev x11,x11 1354 rev x13,x13 1355 rev x15,x15 1356 rev x17,x17 1357 rev x20,x20 1358#endif 1359 eor x5,x5,x6 1360 eor x7,x7,x8 1361 eor x9,x9,x10 1362 eor x11,x11,x12 1363 eor x13,x13,x14 1364 eor x15,x15,x16 1365 eor x17,x17,x19 1366 eor x20,x20,x21 1367 1368 stp x5,x7,[x0,#0] // store output 1369 add x28,x28,#1 // increment counter 1370 mov w5,w22 // unpack key block 1371 lsr x6,x22,#32 1372 stp x9,x11,[x0,#16] 1373 mov w7,w23 1374 lsr x8,x23,#32 1375 stp x13,x15,[x0,#32] 1376 mov w9,w24 1377 lsr x10,x24,#32 1378 stp x17,x20,[x0,#48] 1379 add x0,x0,#64 1380 mov w11,w25 1381 lsr x12,x25,#32 1382 mov w13,w26 1383 lsr x14,x26,#32 1384 mov w15,w27 1385 lsr x16,x27,#32 1386 mov w17,w28 1387 lsr 
x19,x28,#32 1388 mov w20,w30 1389 lsr x21,x30,#32 1390 1391 mov x4,#5 1392.Loop_lower_neon: 1393 sub x4,x4,#1 1394 add v0.4s,v0.4s,v1.4s 1395 add w5,w5,w9 1396 add v4.4s,v4.4s,v5.4s 1397 add w6,w6,w10 1398 add v8.4s,v8.4s,v9.4s 1399 add w7,w7,w11 1400 add v12.4s,v12.4s,v13.4s 1401 add w8,w8,w12 1402 add v16.4s,v16.4s,v17.4s 1403 eor w17,w17,w5 1404 add v20.4s,v20.4s,v21.4s 1405 eor w19,w19,w6 1406 eor v3.16b,v3.16b,v0.16b 1407 eor w20,w20,w7 1408 eor v7.16b,v7.16b,v4.16b 1409 eor w21,w21,w8 1410 eor v11.16b,v11.16b,v8.16b 1411 ror w17,w17,#16 1412 eor v15.16b,v15.16b,v12.16b 1413 ror w19,w19,#16 1414 eor v19.16b,v19.16b,v16.16b 1415 ror w20,w20,#16 1416 eor v23.16b,v23.16b,v20.16b 1417 ror w21,w21,#16 1418 rev32 v3.8h,v3.8h 1419 add w13,w13,w17 1420 rev32 v7.8h,v7.8h 1421 add w14,w14,w19 1422 rev32 v11.8h,v11.8h 1423 add w15,w15,w20 1424 rev32 v15.8h,v15.8h 1425 add w16,w16,w21 1426 rev32 v19.8h,v19.8h 1427 eor w9,w9,w13 1428 rev32 v23.8h,v23.8h 1429 eor w10,w10,w14 1430 add v2.4s,v2.4s,v3.4s 1431 eor w11,w11,w15 1432 add v6.4s,v6.4s,v7.4s 1433 eor w12,w12,w16 1434 add v10.4s,v10.4s,v11.4s 1435 ror w9,w9,#20 1436 add v14.4s,v14.4s,v15.4s 1437 ror w10,w10,#20 1438 add v18.4s,v18.4s,v19.4s 1439 ror w11,w11,#20 1440 add v22.4s,v22.4s,v23.4s 1441 ror w12,w12,#20 1442 eor v24.16b,v1.16b,v2.16b 1443 add w5,w5,w9 1444 eor v25.16b,v5.16b,v6.16b 1445 add w6,w6,w10 1446 eor v26.16b,v9.16b,v10.16b 1447 add w7,w7,w11 1448 eor v27.16b,v13.16b,v14.16b 1449 add w8,w8,w12 1450 eor v28.16b,v17.16b,v18.16b 1451 eor w17,w17,w5 1452 eor v29.16b,v21.16b,v22.16b 1453 eor w19,w19,w6 1454 ushr v1.4s,v24.4s,#20 1455 eor w20,w20,w7 1456 ushr v5.4s,v25.4s,#20 1457 eor w21,w21,w8 1458 ushr v9.4s,v26.4s,#20 1459 ror w17,w17,#24 1460 ushr v13.4s,v27.4s,#20 1461 ror w19,w19,#24 1462 ushr v17.4s,v28.4s,#20 1463 ror w20,w20,#24 1464 ushr v21.4s,v29.4s,#20 1465 ror w21,w21,#24 1466 sli v1.4s,v24.4s,#12 1467 add w13,w13,w17 1468 sli v5.4s,v25.4s,#12 1469 add w14,w14,w19 1470 sli v9.4s,v26.4s,#12 
1471 add w15,w15,w20 1472 sli v13.4s,v27.4s,#12 1473 add w16,w16,w21 1474 sli v17.4s,v28.4s,#12 1475 eor w9,w9,w13 1476 sli v21.4s,v29.4s,#12 1477 eor w10,w10,w14 1478 add v0.4s,v0.4s,v1.4s 1479 eor w11,w11,w15 1480 add v4.4s,v4.4s,v5.4s 1481 eor w12,w12,w16 1482 add v8.4s,v8.4s,v9.4s 1483 ror w9,w9,#25 1484 add v12.4s,v12.4s,v13.4s 1485 ror w10,w10,#25 1486 add v16.4s,v16.4s,v17.4s 1487 ror w11,w11,#25 1488 add v20.4s,v20.4s,v21.4s 1489 ror w12,w12,#25 1490 eor v24.16b,v3.16b,v0.16b 1491 add w5,w5,w10 1492 eor v25.16b,v7.16b,v4.16b 1493 add w6,w6,w11 1494 eor v26.16b,v11.16b,v8.16b 1495 add w7,w7,w12 1496 eor v27.16b,v15.16b,v12.16b 1497 add w8,w8,w9 1498 eor v28.16b,v19.16b,v16.16b 1499 eor w21,w21,w5 1500 eor v29.16b,v23.16b,v20.16b 1501 eor w17,w17,w6 1502 ushr v3.4s,v24.4s,#24 1503 eor w19,w19,w7 1504 ushr v7.4s,v25.4s,#24 1505 eor w20,w20,w8 1506 ushr v11.4s,v26.4s,#24 1507 ror w21,w21,#16 1508 ushr v15.4s,v27.4s,#24 1509 ror w17,w17,#16 1510 ushr v19.4s,v28.4s,#24 1511 ror w19,w19,#16 1512 ushr v23.4s,v29.4s,#24 1513 ror w20,w20,#16 1514 sli v3.4s,v24.4s,#8 1515 add w15,w15,w21 1516 sli v7.4s,v25.4s,#8 1517 add w16,w16,w17 1518 sli v11.4s,v26.4s,#8 1519 add w13,w13,w19 1520 sli v15.4s,v27.4s,#8 1521 add w14,w14,w20 1522 sli v19.4s,v28.4s,#8 1523 eor w10,w10,w15 1524 sli v23.4s,v29.4s,#8 1525 eor w11,w11,w16 1526 add v2.4s,v2.4s,v3.4s 1527 eor w12,w12,w13 1528 add v6.4s,v6.4s,v7.4s 1529 eor w9,w9,w14 1530 add v10.4s,v10.4s,v11.4s 1531 ror w10,w10,#20 1532 add v14.4s,v14.4s,v15.4s 1533 ror w11,w11,#20 1534 add v18.4s,v18.4s,v19.4s 1535 ror w12,w12,#20 1536 add v22.4s,v22.4s,v23.4s 1537 ror w9,w9,#20 1538 eor v24.16b,v1.16b,v2.16b 1539 add w5,w5,w10 1540 eor v25.16b,v5.16b,v6.16b 1541 add w6,w6,w11 1542 eor v26.16b,v9.16b,v10.16b 1543 add w7,w7,w12 1544 eor v27.16b,v13.16b,v14.16b 1545 add w8,w8,w9 1546 eor v28.16b,v17.16b,v18.16b 1547 eor w21,w21,w5 1548 eor v29.16b,v21.16b,v22.16b 1549 eor w17,w17,w6 1550 ushr v1.4s,v24.4s,#25 1551 eor w19,w19,w7 1552 ushr 
v5.4s,v25.4s,#25 1553 eor w20,w20,w8 1554 ushr v9.4s,v26.4s,#25 1555 ror w21,w21,#24 1556 ushr v13.4s,v27.4s,#25 1557 ror w17,w17,#24 1558 ushr v17.4s,v28.4s,#25 1559 ror w19,w19,#24 1560 ushr v21.4s,v29.4s,#25 1561 ror w20,w20,#24 1562 sli v1.4s,v24.4s,#7 1563 add w15,w15,w21 1564 sli v5.4s,v25.4s,#7 1565 add w16,w16,w17 1566 sli v9.4s,v26.4s,#7 1567 add w13,w13,w19 1568 sli v13.4s,v27.4s,#7 1569 add w14,w14,w20 1570 sli v17.4s,v28.4s,#7 1571 eor w10,w10,w15 1572 sli v21.4s,v29.4s,#7 1573 eor w11,w11,w16 1574 ext v2.16b,v2.16b,v2.16b,#8 1575 eor w12,w12,w13 1576 ext v6.16b,v6.16b,v6.16b,#8 1577 eor w9,w9,w14 1578 ext v10.16b,v10.16b,v10.16b,#8 1579 ror w10,w10,#25 1580 ext v14.16b,v14.16b,v14.16b,#8 1581 ror w11,w11,#25 1582 ext v18.16b,v18.16b,v18.16b,#8 1583 ror w12,w12,#25 1584 ext v22.16b,v22.16b,v22.16b,#8 1585 ror w9,w9,#25 1586 ext v3.16b,v3.16b,v3.16b,#12 1587 ext v7.16b,v7.16b,v7.16b,#12 1588 ext v11.16b,v11.16b,v11.16b,#12 1589 ext v15.16b,v15.16b,v15.16b,#12 1590 ext v19.16b,v19.16b,v19.16b,#12 1591 ext v23.16b,v23.16b,v23.16b,#12 1592 ext v1.16b,v1.16b,v1.16b,#4 1593 ext v5.16b,v5.16b,v5.16b,#4 1594 ext v9.16b,v9.16b,v9.16b,#4 1595 ext v13.16b,v13.16b,v13.16b,#4 1596 ext v17.16b,v17.16b,v17.16b,#4 1597 ext v21.16b,v21.16b,v21.16b,#4 1598 add v0.4s,v0.4s,v1.4s 1599 add w5,w5,w9 1600 add v4.4s,v4.4s,v5.4s 1601 add w6,w6,w10 1602 add v8.4s,v8.4s,v9.4s 1603 add w7,w7,w11 1604 add v12.4s,v12.4s,v13.4s 1605 add w8,w8,w12 1606 add v16.4s,v16.4s,v17.4s 1607 eor w17,w17,w5 1608 add v20.4s,v20.4s,v21.4s 1609 eor w19,w19,w6 1610 eor v3.16b,v3.16b,v0.16b 1611 eor w20,w20,w7 1612 eor v7.16b,v7.16b,v4.16b 1613 eor w21,w21,w8 1614 eor v11.16b,v11.16b,v8.16b 1615 ror w17,w17,#16 1616 eor v15.16b,v15.16b,v12.16b 1617 ror w19,w19,#16 1618 eor v19.16b,v19.16b,v16.16b 1619 ror w20,w20,#16 1620 eor v23.16b,v23.16b,v20.16b 1621 ror w21,w21,#16 1622 rev32 v3.8h,v3.8h 1623 add w13,w13,w17 1624 rev32 v7.8h,v7.8h 1625 add w14,w14,w19 1626 rev32 v11.8h,v11.8h 1627 add 
w15,w15,w20 1628 rev32 v15.8h,v15.8h 1629 add w16,w16,w21 1630 rev32 v19.8h,v19.8h 1631 eor w9,w9,w13 1632 rev32 v23.8h,v23.8h 1633 eor w10,w10,w14 1634 add v2.4s,v2.4s,v3.4s 1635 eor w11,w11,w15 1636 add v6.4s,v6.4s,v7.4s 1637 eor w12,w12,w16 1638 add v10.4s,v10.4s,v11.4s 1639 ror w9,w9,#20 1640 add v14.4s,v14.4s,v15.4s 1641 ror w10,w10,#20 1642 add v18.4s,v18.4s,v19.4s 1643 ror w11,w11,#20 1644 add v22.4s,v22.4s,v23.4s 1645 ror w12,w12,#20 1646 eor v24.16b,v1.16b,v2.16b 1647 add w5,w5,w9 1648 eor v25.16b,v5.16b,v6.16b 1649 add w6,w6,w10 1650 eor v26.16b,v9.16b,v10.16b 1651 add w7,w7,w11 1652 eor v27.16b,v13.16b,v14.16b 1653 add w8,w8,w12 1654 eor v28.16b,v17.16b,v18.16b 1655 eor w17,w17,w5 1656 eor v29.16b,v21.16b,v22.16b 1657 eor w19,w19,w6 1658 ushr v1.4s,v24.4s,#20 1659 eor w20,w20,w7 1660 ushr v5.4s,v25.4s,#20 1661 eor w21,w21,w8 1662 ushr v9.4s,v26.4s,#20 1663 ror w17,w17,#24 1664 ushr v13.4s,v27.4s,#20 1665 ror w19,w19,#24 1666 ushr v17.4s,v28.4s,#20 1667 ror w20,w20,#24 1668 ushr v21.4s,v29.4s,#20 1669 ror w21,w21,#24 1670 sli v1.4s,v24.4s,#12 1671 add w13,w13,w17 1672 sli v5.4s,v25.4s,#12 1673 add w14,w14,w19 1674 sli v9.4s,v26.4s,#12 1675 add w15,w15,w20 1676 sli v13.4s,v27.4s,#12 1677 add w16,w16,w21 1678 sli v17.4s,v28.4s,#12 1679 eor w9,w9,w13 1680 sli v21.4s,v29.4s,#12 1681 eor w10,w10,w14 1682 add v0.4s,v0.4s,v1.4s 1683 eor w11,w11,w15 1684 add v4.4s,v4.4s,v5.4s 1685 eor w12,w12,w16 1686 add v8.4s,v8.4s,v9.4s 1687 ror w9,w9,#25 1688 add v12.4s,v12.4s,v13.4s 1689 ror w10,w10,#25 1690 add v16.4s,v16.4s,v17.4s 1691 ror w11,w11,#25 1692 add v20.4s,v20.4s,v21.4s 1693 ror w12,w12,#25 1694 eor v24.16b,v3.16b,v0.16b 1695 add w5,w5,w10 1696 eor v25.16b,v7.16b,v4.16b 1697 add w6,w6,w11 1698 eor v26.16b,v11.16b,v8.16b 1699 add w7,w7,w12 1700 eor v27.16b,v15.16b,v12.16b 1701 add w8,w8,w9 1702 eor v28.16b,v19.16b,v16.16b 1703 eor w21,w21,w5 1704 eor v29.16b,v23.16b,v20.16b 1705 eor w17,w17,w6 1706 ushr v3.4s,v24.4s,#24 1707 eor w19,w19,w7 1708 ushr 
v7.4s,v25.4s,#24 1709 eor w20,w20,w8 1710 ushr v11.4s,v26.4s,#24 1711 ror w21,w21,#16 1712 ushr v15.4s,v27.4s,#24 1713 ror w17,w17,#16 1714 ushr v19.4s,v28.4s,#24 1715 ror w19,w19,#16 1716 ushr v23.4s,v29.4s,#24 1717 ror w20,w20,#16 1718 sli v3.4s,v24.4s,#8 1719 add w15,w15,w21 1720 sli v7.4s,v25.4s,#8 1721 add w16,w16,w17 1722 sli v11.4s,v26.4s,#8 1723 add w13,w13,w19 1724 sli v15.4s,v27.4s,#8 1725 add w14,w14,w20 1726 sli v19.4s,v28.4s,#8 1727 eor w10,w10,w15 1728 sli v23.4s,v29.4s,#8 1729 eor w11,w11,w16 1730 add v2.4s,v2.4s,v3.4s 1731 eor w12,w12,w13 1732 add v6.4s,v6.4s,v7.4s 1733 eor w9,w9,w14 1734 add v10.4s,v10.4s,v11.4s 1735 ror w10,w10,#20 1736 add v14.4s,v14.4s,v15.4s 1737 ror w11,w11,#20 1738 add v18.4s,v18.4s,v19.4s 1739 ror w12,w12,#20 1740 add v22.4s,v22.4s,v23.4s 1741 ror w9,w9,#20 1742 eor v24.16b,v1.16b,v2.16b 1743 add w5,w5,w10 1744 eor v25.16b,v5.16b,v6.16b 1745 add w6,w6,w11 1746 eor v26.16b,v9.16b,v10.16b 1747 add w7,w7,w12 1748 eor v27.16b,v13.16b,v14.16b 1749 add w8,w8,w9 1750 eor v28.16b,v17.16b,v18.16b 1751 eor w21,w21,w5 1752 eor v29.16b,v21.16b,v22.16b 1753 eor w17,w17,w6 1754 ushr v1.4s,v24.4s,#25 1755 eor w19,w19,w7 1756 ushr v5.4s,v25.4s,#25 1757 eor w20,w20,w8 1758 ushr v9.4s,v26.4s,#25 1759 ror w21,w21,#24 1760 ushr v13.4s,v27.4s,#25 1761 ror w17,w17,#24 1762 ushr v17.4s,v28.4s,#25 1763 ror w19,w19,#24 1764 ushr v21.4s,v29.4s,#25 1765 ror w20,w20,#24 1766 sli v1.4s,v24.4s,#7 1767 add w15,w15,w21 1768 sli v5.4s,v25.4s,#7 1769 add w16,w16,w17 1770 sli v9.4s,v26.4s,#7 1771 add w13,w13,w19 1772 sli v13.4s,v27.4s,#7 1773 add w14,w14,w20 1774 sli v17.4s,v28.4s,#7 1775 eor w10,w10,w15 1776 sli v21.4s,v29.4s,#7 1777 eor w11,w11,w16 1778 ext v2.16b,v2.16b,v2.16b,#8 1779 eor w12,w12,w13 1780 ext v6.16b,v6.16b,v6.16b,#8 1781 eor w9,w9,w14 1782 ext v10.16b,v10.16b,v10.16b,#8 1783 ror w10,w10,#25 1784 ext v14.16b,v14.16b,v14.16b,#8 1785 ror w11,w11,#25 1786 ext v18.16b,v18.16b,v18.16b,#8 1787 ror w12,w12,#25 1788 ext v22.16b,v22.16b,v22.16b,#8 
1789 ror w9,w9,#25 1790 ext v3.16b,v3.16b,v3.16b,#4 1791 ext v7.16b,v7.16b,v7.16b,#4 1792 ext v11.16b,v11.16b,v11.16b,#4 1793 ext v15.16b,v15.16b,v15.16b,#4 1794 ext v19.16b,v19.16b,v19.16b,#4 1795 ext v23.16b,v23.16b,v23.16b,#4 1796 ext v1.16b,v1.16b,v1.16b,#12 1797 ext v5.16b,v5.16b,v5.16b,#12 1798 ext v9.16b,v9.16b,v9.16b,#12 1799 ext v13.16b,v13.16b,v13.16b,#12 1800 ext v17.16b,v17.16b,v17.16b,#12 1801 ext v21.16b,v21.16b,v21.16b,#12 1802 cbnz x4,.Loop_lower_neon 1803 1804 add w5,w5,w22 // accumulate key block 1805 ldp q24,q25,[sp,#0] 1806 add x6,x6,x22,lsr#32 1807 ldp q26,q27,[sp,#32] 1808 add w7,w7,w23 1809 ldp q28,q29,[sp,#64] 1810 add x8,x8,x23,lsr#32 1811 add v0.4s,v0.4s,v24.4s 1812 add w9,w9,w24 1813 add v4.4s,v4.4s,v24.4s 1814 add x10,x10,x24,lsr#32 1815 add v8.4s,v8.4s,v24.4s 1816 add w11,w11,w25 1817 add v12.4s,v12.4s,v24.4s 1818 add x12,x12,x25,lsr#32 1819 add v16.4s,v16.4s,v24.4s 1820 add w13,w13,w26 1821 add v20.4s,v20.4s,v24.4s 1822 add x14,x14,x26,lsr#32 1823 add v2.4s,v2.4s,v26.4s 1824 add w15,w15,w27 1825 add v6.4s,v6.4s,v26.4s 1826 add x16,x16,x27,lsr#32 1827 add v10.4s,v10.4s,v26.4s 1828 add w17,w17,w28 1829 add v14.4s,v14.4s,v26.4s 1830 add x19,x19,x28,lsr#32 1831 add v18.4s,v18.4s,v26.4s 1832 add w20,w20,w30 1833 add v22.4s,v22.4s,v26.4s 1834 add x21,x21,x30,lsr#32 1835 add v19.4s,v19.4s,v31.4s // +4 1836 add x5,x5,x6,lsl#32 // pack 1837 add v23.4s,v23.4s,v31.4s // +4 1838 add x7,x7,x8,lsl#32 1839 add v3.4s,v3.4s,v27.4s 1840 ldp x6,x8,[x1,#0] // load input 1841 add v7.4s,v7.4s,v28.4s 1842 add x9,x9,x10,lsl#32 1843 add v11.4s,v11.4s,v29.4s 1844 add x11,x11,x12,lsl#32 1845 add v15.4s,v15.4s,v30.4s 1846 ldp x10,x12,[x1,#16] 1847 add v19.4s,v19.4s,v27.4s 1848 add x13,x13,x14,lsl#32 1849 add v23.4s,v23.4s,v28.4s 1850 add x15,x15,x16,lsl#32 1851 add v1.4s,v1.4s,v25.4s 1852 ldp x14,x16,[x1,#32] 1853 add v5.4s,v5.4s,v25.4s 1854 add x17,x17,x19,lsl#32 1855 add v9.4s,v9.4s,v25.4s 1856 add x20,x20,x21,lsl#32 1857 add v13.4s,v13.4s,v25.4s 1858 ldp 
x19,x21,[x1,#48] 1859 add v17.4s,v17.4s,v25.4s 1860 add x1,x1,#64 1861 add v21.4s,v21.4s,v25.4s 1862 1863#ifdef __ARMEB__ 1864 rev x5,x5 1865 rev x7,x7 1866 rev x9,x9 1867 rev x11,x11 1868 rev x13,x13 1869 rev x15,x15 1870 rev x17,x17 1871 rev x20,x20 1872#endif 1873 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1874 eor x5,x5,x6 1875 eor x7,x7,x8 1876 eor x9,x9,x10 1877 eor x11,x11,x12 1878 eor x13,x13,x14 1879 eor v0.16b,v0.16b,v24.16b 1880 eor x15,x15,x16 1881 eor v1.16b,v1.16b,v25.16b 1882 eor x17,x17,x19 1883 eor v2.16b,v2.16b,v26.16b 1884 eor x20,x20,x21 1885 eor v3.16b,v3.16b,v27.16b 1886 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1887 1888 stp x5,x7,[x0,#0] // store output 1889 add x28,x28,#7 // increment counter 1890 stp x9,x11,[x0,#16] 1891 stp x13,x15,[x0,#32] 1892 stp x17,x20,[x0,#48] 1893 add x0,x0,#64 1894 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1895 1896 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1897 eor v4.16b,v4.16b,v24.16b 1898 eor v5.16b,v5.16b,v25.16b 1899 eor v6.16b,v6.16b,v26.16b 1900 eor v7.16b,v7.16b,v27.16b 1901 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1902 1903 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1904 eor v8.16b,v8.16b,v0.16b 1905 ldp q24,q25,[sp,#0] 1906 eor v9.16b,v9.16b,v1.16b 1907 ldp q26,q27,[sp,#32] 1908 eor v10.16b,v10.16b,v2.16b 1909 eor v11.16b,v11.16b,v3.16b 1910 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1911 1912 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1913 eor v12.16b,v12.16b,v4.16b 1914 eor v13.16b,v13.16b,v5.16b 1915 eor v14.16b,v14.16b,v6.16b 1916 eor v15.16b,v15.16b,v7.16b 1917 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1918 1919 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1920 eor v16.16b,v16.16b,v8.16b 1921 eor v17.16b,v17.16b,v9.16b 1922 eor v18.16b,v18.16b,v10.16b 1923 eor v19.16b,v19.16b,v11.16b 1924 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1925 1926 shl v0.4s,v31.4s,#1 // 4 -> 8 1927 eor v20.16b,v20.16b,v12.16b 1928 eor v21.16b,v21.16b,v13.16b 1929 eor v22.16b,v22.16b,v14.16b 
1930 eor v23.16b,v23.16b,v15.16b 1931 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1932 1933 add v27.4s,v27.4s,v0.4s // += 8 1934 add v28.4s,v28.4s,v0.4s 1935 add v29.4s,v29.4s,v0.4s 1936 add v30.4s,v30.4s,v0.4s 1937 1938 b.hs .Loop_outer_512_neon 1939 1940 adds x2,x2,#512 1941 ushr v0.4s,v31.4s,#2 // 4 -> 1 1942 1943 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1944 ldp d10,d11,[sp,#128+16] 1945 ldp d12,d13,[sp,#128+32] 1946 ldp d14,d15,[sp,#128+48] 1947 1948 stp q24,q31,[sp,#0] // wipe off-load area 1949 stp q24,q31,[sp,#32] 1950 stp q24,q31,[sp,#64] 1951 1952 b.eq .Ldone_512_neon 1953 1954 cmp x2,#192 1955 sub v27.4s,v27.4s,v0.4s // -= 1 1956 sub v28.4s,v28.4s,v0.4s 1957 sub v29.4s,v29.4s,v0.4s 1958 add sp,sp,#128 1959 b.hs .Loop_outer_neon 1960 1961 eor v25.16b,v25.16b,v25.16b 1962 eor v26.16b,v26.16b,v26.16b 1963 eor v27.16b,v27.16b,v27.16b 1964 eor v28.16b,v28.16b,v28.16b 1965 eor v29.16b,v29.16b,v29.16b 1966 eor v30.16b,v30.16b,v30.16b 1967 b .Loop_outer 1968 1969.Ldone_512_neon: 1970 ldp x19,x20,[x29,#16] 1971 add sp,sp,#128+64 1972 ldp x21,x22,[x29,#32] 1973 ldp x23,x24,[x29,#48] 1974 ldp x25,x26,[x29,#64] 1975 ldp x27,x28,[x29,#80] 1976 ldp x29,x30,[sp],#96 1977.inst 0xd50323bf // autiasp 1978 ret 1979.size ChaCha20_512_neon,.-ChaCha20_512_neon 1980