/* Do not modify. This file is auto-generated from chacha-armv8.pl. */
#ifndef __KERNEL__
# include "arm_arch.h"

.hidden OPENSSL_armcap_P
#endif

.text

// Constant tables shared by the routines below.
.p2align 5
.Lsigma:                                // state words 0..3, packed as two 64-bit values
.quad   0x3320646e61707865, 0x6b20657479622d32  // endian-neutral
.Lone:                                  // counter offsets; ChaCha20_neon adds these to its four block lanes
.long   1, 2, 3, 4
.Lrot24:                                // TBL byte indices: rotate every 32-bit lane right by 24
.long   0x02010003, 0x06050407, 0x0a09080b, 0x0e0d0c0f
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by @dot-asm"
.byte   67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,64,100,111,116,45,97,115,109,0
.p2align 2

//----------------------------------------------------------------------
// ChaCha20_ctr32
//   In:  x0 = output pointer       x1 = input pointer
//        x2 = length in bytes      x3 = 32-byte key
//        x4 = 16-byte counter block
//   A zero length returns at once. When built without __KERNEL__, a
//   length of 192 bytes or more is passed to .LChaCha20_neon if
//   OPENSSL_armcap_P has ARMV7_NEON set. Everything else is handled by
//   the scalar code here, one 64-byte block per .Loop_outer pass.
//
//   Scalar register roles:
//     x22-x28, x30     key block, two 32-bit words per register
//                      (x30 is reloaded from the frame before returning)
//     w5-w17, w19-w21  the sixteen working state words
//     x4               round counter (10 passes, two rounds each)
//     [sp, #0..#63]    scratch holding keystream for a partial last block
//----------------------------------------------------------------------
.globl  ChaCha20_ctr32
.type   ChaCha20_ctr32,%function
.p2align 5
ChaCha20_ctr32:
        cbz     x2, .Labort             // nothing to do for len == 0
        cmp     x2, #192
        b.lo    .Lshort                 // short inputs always stay scalar

#ifndef __KERNEL__
        adrp    x17, OPENSSL_armcap_P
        ldr     w17, [x17, #:lo12:OPENSSL_armcap_P]
        tst     w17, #ARMV7_NEON
        b.ne    .LChaCha20_neon
#endif

.Lshort:
        .inst   0xd503233f              // paciasp
        stp     x29, x30, [sp, #-96]!   // 96-byte frame: x29/x30 + x19..x28
        mov     x29, sp

        adr     x5, .Lsigma
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x23, x24, [sp, #48]
        stp     x25, x26, [sp, #64]
        stp     x27, x28, [sp, #80]
        sub     sp, sp, #64             // 64-byte scratch below the frame

        ldp     x22, x23, [x5]          // load sigma
        ldp     x24, x25, [x3]          // load key
        ldp     x26, x27, [x3, #16]
        ldp     x28, x30, [x4]          // load counter
#ifdef __AARCH64EB__
        // Swap the 32-bit halves of every 64-bit load of 32-bit words.
        ror     x24, x24, #32
        ror     x25, x25, #32
        ror     x26, x26, #32
        ror     x27, x27, #32
        ror     x28, x28, #32
        ror     x30, x30, #32
#endif

.Loop_outer:
        // Unpack the key block into sixteen 32-bit working words.
        mov     w5, w22
        lsr     x6, x22, #32
        mov     w7, w23
        lsr     x8, x23, #32
        mov     w9, w24
        lsr     x10, x24, #32
        mov     w11, w25
        lsr     x12, x25, #32
        mov     w13, w26
        lsr     x14, x26, #32
        mov     w15, w27
        lsr     x16, x27, #32
        mov     w17, w28
        lsr     x19, x28, #32
        mov     w20, w30
        lsr     x21, x30, #32

        mov     x4, #10
        subs    x2, x2, #64             // flags stay live until b.lo/b.hi below
.Loop:
        sub     x4, x4, #1

        // Column round on (w5..w8, w9..w12, w13..w16, w17/w19/w20/w21).
        add     w5, w5, w9
        add     w6, w6, w10
        add     w7, w7, w11
        add     w8, w8, w12
        eor     w17, w17, w5
        eor     w19, w19, w6
        eor     w20, w20, w7
        eor     w21, w21, w8
        ror     w17, w17, #16
        ror     w19, w19, #16
        ror     w20, w20, #16
        ror     w21, w21, #16
        add     w13, w13, w17
        add     w14, w14, w19
        add     w15, w15, w20
        add     w16, w16, w21
        eor     w9, w9, w13
        eor     w10, w10, w14
        eor     w11, w11, w15
        eor     w12, w12, w16
        ror     w9, w9, #20             // i.e. rotate left by 12
        ror     w10, w10, #20
        ror     w11, w11, #20
        ror     w12, w12, #20
        add     w5, w5, w9
        add     w6, w6, w10
        add     w7, w7, w11
        add     w8, w8, w12
        eor     w17, w17, w5
        eor     w19, w19, w6
        eor     w20, w20, w7
        eor     w21, w21, w8
        ror     w17, w17, #24           // i.e. rotate left by 8
        ror     w19, w19, #24
        ror     w20, w20, #24
        ror     w21, w21, #24
        add     w13, w13, w17
        add     w14, w14, w19
        add     w15, w15, w20
        add     w16, w16, w21
        eor     w9, w9, w13
        eor     w10, w10, w14
        eor     w11, w11, w15
        eor     w12, w12, w16
        ror     w9, w9, #25             // i.e. rotate left by 7
        ror     w10, w10, #25
        ror     w11, w11, #25
        ror     w12, w12, #25

        // Diagonal round: same steps with the operand registers rotated.
        add     w5, w5, w10
        add     w6, w6, w11
        add     w7, w7, w12
        add     w8, w8, w9
        eor     w21, w21, w5
        eor     w17, w17, w6
        eor     w19, w19, w7
        eor     w20, w20, w8
        ror     w21, w21, #16
        ror     w17, w17, #16
        ror     w19, w19, #16
        ror     w20, w20, #16
        add     w15, w15, w21
        add     w16, w16, w17
        add     w13, w13, w19
        add     w14, w14, w20
        eor     w10, w10, w15
        eor     w11, w11, w16
        eor     w12, w12, w13
        eor     w9, w9, w14
        ror     w10, w10, #20
        ror     w11, w11, #20
        ror     w12, w12, #20
        ror     w9, w9, #20
        add     w5, w5, w10
        add     w6, w6, w11
        add     w7, w7, w12
        add     w8, w8, w9
        eor     w21, w21, w5
        eor     w17, w17, w6
        eor     w19, w19, w7
        eor     w20, w20, w8
        ror     w21, w21, #24
        ror     w17, w17, #24
        ror     w19, w19, #24
        ror     w20, w20, #24
        add     w15, w15, w21
        add     w16, w16, w17
        add     w13, w13, w19
        add     w14, w14, w20
        eor     w10, w10, w15
        eor     w11, w11, w16
        eor     w12, w12, w13
        eor     w9, w9, w14
        ror     w10, w10, #25
        ror     w11, w11, #25
        ror     w12, w12, #25
        ror     w9, w9, #25
        cbnz    x4, .Loop

        // Add the original key block back in. The 64-bit adds on the
        // odd words may carry past bit 31; the lsl #32 when packing
        // pushes that carry out of the register.
        add     w5, w5, w22
        add     x6, x6, x22, lsr #32
        add     w7, w7, w23
        add     x8, x8, x23, lsr #32
        add     w9, w9, w24
        add     x10, x10, x24, lsr #32
        add     w11, w11, w25
        add     x12, x12, x25, lsr #32
        add     w13, w13, w26
        add     x14, x14, x26, lsr #32
        add     w15, w15, w27
        add     x16, x16, x27, lsr #32
        add     w17, w17, w28
        add     x19, x19, x28, lsr #32
        add     w20, w20, w30
        add     x21, x21, x30, lsr #32

        b.lo    .Ltail                  // fewer than 64 bytes were left

        // Pack word pairs into 64-bit lanes while fetching the input.
        add     x5, x5, x6, lsl #32
        add     x7, x7, x8, lsl #32
        ldp     x6, x8, [x1]
        add     x9, x9, x10, lsl #32
        add     x11, x11, x12, lsl #32
        ldp     x10, x12, [x1, #16]
        add     x13, x13, x14, lsl #32
        add     x15, x15, x16, lsl #32
        ldp     x14, x16, [x1, #32]
        add     x17, x17, x19, lsl #32
        add     x20, x20, x21, lsl #32
        ldp     x19, x21, [x1, #48]
        add     x1, x1, #64
#ifdef __AARCH64EB__
        rev     x5, x5
        rev     x7, x7
        rev     x9, x9
        rev     x11, x11
        rev     x13, x13
        rev     x15, x15
        rev     x17, x17
        rev     x20, x20
#endif
        eor     x5, x5, x6              // keystream XOR input
        eor     x7, x7, x8
        eor     x9, x9, x10
        eor     x11, x11, x12
        eor     x13, x13, x14
        eor     x15, x15, x16
        eor     x17, x17, x19
        eor     x20, x20, x21

        stp     x5, x7, [x0]
        add     x28, x28, #1            // next block; 64-bit add on the packed counter pair
        stp     x9, x11, [x0, #16]
        stp     x13, x15, [x0, #32]
        stp     x17, x20, [x0, #48]
        add     x0, x0, #64

        b.hi    .Loop_outer             // more input remains

        ldp     x19, x20, [x29, #16]
        add     sp, sp, #64
        ldp     x21, x22, [x29, #32]
        ldp     x23, x24, [x29, #48]
        ldp     x25, x26, [x29, #64]
        ldp     x27, x28, [x29, #80]
        ldp     x29, x30, [sp], #96
        .inst   0xd50323bf              // autiasp
.Labort:
        ret

.p2align 4
.Ltail:
        add     x2, x2, #64             // undo the subs: x2 = bytes left (1..63)
.Less_than_64:
        // Also entered from .Ltail_neon. Set up a negative index that
        // counts up to zero; x0 is biased by -1 because the index is
        // bumped before the store.
        sub     x0, x0, #1
        add     x1, x1, x2
        add     x0, x0, x2
        add     x4, sp, x2
        neg     x2, x2

        add     x5, x5, x6, lsl #32     // pack
        add     x7, x7, x8, lsl #32
        add     x9, x9, x10, lsl #32
        add     x11, x11, x12, lsl #32
        add     x13, x13, x14, lsl #32
        add     x15, x15, x16, lsl #32
        add     x17, x17, x19, lsl #32
        add     x20, x20, x21, lsl #32
#ifdef __AARCH64EB__
        rev     x5, x5
        rev     x7, x7
        rev     x9, x9
        rev     x11, x11
        rev     x13, x13
        rev     x15, x15
        rev     x17, x17
        rev     x20, x20
#endif
        stp     x5, x7, [sp]            // spill keystream to the scratch area
        stp     x9, x11, [sp, #16]
        stp     x13, x15, [sp, #32]
        stp     x17, x20, [sp, #48]

.Loop_tail:
        ldrb    w10, [x1, x2]           // input byte
        ldrb    w11, [x4, x2]           // keystream byte
        add     x2, x2, #1
        eor     w10, w10, w11
        strb    w10, [x0, x2]
        cbnz    x2, .Loop_tail

        stp     xzr, xzr, [sp]          // wipe the spilled keystream
        stp     xzr, xzr, [sp, #16]
        stp     xzr, xzr, [sp, #32]
        stp     xzr, xzr, [sp, #48]

        ldp     x19, x20, [x29, #16]
        add     sp, sp, #64
        ldp     x21, x22, [x29, #32]
        ldp     x23, x24, [x29, #48]
        ldp     x25, x26, [x29, #64]
        ldp     x27, x28, [x29, #80]
        ldp     x29, x30, [sp], #96
        .inst   0xd50323bf              // autiasp
        ret
.size   ChaCha20_ctr32,.-ChaCha20_ctr32

//----------------------------------------------------------------------
// ChaCha20_neon
//   Same register arguments as ChaCha20_ctr32. Exported only when
//   __KERNEL__ is defined; otherwise reached through .LChaCha20_neon.
//   Lengths of 512 bytes or more branch to .L512_or_more_neon. Below
//   that, each .Loop_outer_neon pass produces 320 bytes: one block in
//   the scalar registers plus four blocks in NEON registers.
//
//   NEON layout inside the round loop: each of v16..v31 holds one state
//   word for four blocks (one block per lane):
//     words 0..3   -> v16, v20, v24, v28
//     words 4..7   -> v17, v21, v25, v29
//     words 8..11  -> v18, v22, v26, v30
//     words 12..15 -> v19, v23, v27, v31
//   v0..v3 keep the key block, v8 the per-lane counter offsets,
//   v9 the .Lrot24 table, v4..v7 are temporaries.
//----------------------------------------------------------------------
#ifdef __KERNEL__
.globl  ChaCha20_neon
#endif
.type   ChaCha20_neon,%function
.p2align 5
ChaCha20_neon:
.LChaCha20_neon:
        .inst   0xd503233f              // paciasp
        stp     x29, x30, [sp, #-96]!
        mov     x29, sp

        adr     x5, .Lsigma
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        stp     x23, x24, [sp, #48]
        stp     x25, x26, [sp, #64]
        stp     x27, x28, [sp, #80]
        cmp     x2, #512
        b.hs    .L512_or_more_neon

        sub     sp, sp, #64

        ldp     x22, x23, [x5]          // load sigma
        ld1     {v0.4s}, [x5], #16      // x5 now points at .Lone
        ldp     x24, x25, [x3]          // load key
        ldp     x26, x27, [x3, #16]
        ld1     {v1.4s, v2.4s}, [x3]
        ldp     x28, x30, [x4]          // load counter
        ld1     {v3.4s}, [x4]
        stp     d8, d9, [sp]            // meet ABI requirements
        ld1     {v8.4s, v9.4s}, [x5]    // v8 = .Lone, v9 = .Lrot24
#ifdef __AARCH64EB__
        rev64   v0.4s, v0.4s
        ror     x24, x24, #32
        ror     x25, x25, #32
        ror     x26, x26, #32
        ror     x27, x27, #32
        ror     x28, x28, #32
        ror     x30, x30, #32
#endif

.Loop_outer_neon:
        // Unpack the key block: broadcast each word across a vector
        // and split the packed scalar copies into 32-bit words.
        dup     v16.4s, v0.s[0]
          mov     w5, w22
        dup     v20.4s, v0.s[1]
          lsr     x6, x22, #32
        dup     v24.4s, v0.s[2]
          mov     w7, w23
        dup     v28.4s, v0.s[3]
          lsr     x8, x23, #32
        dup     v17.4s, v1.s[0]
          mov     w9, w24
        dup     v21.4s, v1.s[1]
          lsr     x10, x24, #32
        dup     v25.4s, v1.s[2]
          mov     w11, w25
        dup     v29.4s, v1.s[3]
          lsr     x12, x25, #32
        dup     v19.4s, v3.s[0]
          mov     w13, w26
        dup     v23.4s, v3.s[1]
          lsr     x14, x26, #32
        dup     v27.4s, v3.s[2]
          mov     w15, w27
        dup     v31.4s, v3.s[3]
          lsr     x16, x27, #32
        add     v19.4s, v19.4s, v8.4s   // give each lane its own block counter
          mov     w17, w28
        dup     v18.4s, v2.s[0]
          lsr     x19, x28, #32
        dup     v22.4s, v2.s[1]
          mov     w20, w30
        dup     v26.4s, v2.s[2]
          lsr     x21, x30, #32
        dup     v30.4s, v2.s[3]

        mov     x4, #10
        subs    x2, x2, #320            // flags stay live until b.lo/b.hi below
.Loop_neon:
        sub     x4, x4, #1

        // Column round, NEON and scalar instructions interleaved.
        add     v16.4s, v16.4s, v17.4s
          add     w5, w5, w9
        add     v20.4s, v20.4s, v21.4s
          add     w6, w6, w10
        add     v24.4s, v24.4s, v25.4s
          add     w7, w7, w11
        add     v28.4s, v28.4s, v29.4s
          add     w8, w8, w12
        eor     v19.16b, v19.16b, v16.16b
          eor     w17, w17, w5
        eor     v23.16b, v23.16b, v20.16b
          eor     w19, w19, w6
        eor     v27.16b, v27.16b, v24.16b
          eor     w20, w20, w7
        eor     v31.16b, v31.16b, v28.16b
          eor     w21, w21, w8
        rev32   v19.8h, v19.8h          // swap halfwords = rotate by 16
          ror     w17, w17, #16
        rev32   v23.8h, v23.8h
          ror     w19, w19, #16
        rev32   v27.8h, v27.8h
          ror     w20, w20, #16
        rev32   v31.8h, v31.8h
          ror     w21, w21, #16
        add     v18.4s, v18.4s, v19.4s
          add     w13, w13, w17
        add     v22.4s, v22.4s, v23.4s
          add     w14, w14, w19
        add     v26.4s, v26.4s, v27.4s
          add     w15, w15, w20
        add     v30.4s, v30.4s, v31.4s
          add     w16, w16, w21
        eor     v4.16b, v17.16b, v18.16b
          eor     w9, w9, w13
        eor     v5.16b, v21.16b, v22.16b
          eor     w10, w10, w14
        eor     v6.16b, v25.16b, v26.16b
          eor     w11, w11, w15
        eor     v7.16b, v29.16b, v30.16b
          eor     w12, w12, w16
        ushr    v17.4s, v4.4s, #20      // ushr + sli pair = rotate left by 12
          ror     w9, w9, #20
        ushr    v21.4s, v5.4s, #20
          ror     w10, w10, #20
        ushr    v25.4s, v6.4s, #20
          ror     w11, w11, #20
        ushr    v29.4s, v7.4s, #20
          ror     w12, w12, #20
        sli     v17.4s, v4.4s, #12
          add     w5, w5, w9
        sli     v21.4s, v5.4s, #12
          add     w6, w6, w10
        sli     v25.4s, v6.4s, #12
          add     w7, w7, w11
        sli     v29.4s, v7.4s, #12
          add     w8, w8, w12
        add     v16.4s, v16.4s, v17.4s
          eor     w17, w17, w5
        add     v20.4s, v20.4s, v21.4s
          eor     w19, w19, w6
        add     v24.4s, v24.4s, v25.4s
          eor     w20, w20, w7
        add     v28.4s, v28.4s, v29.4s
          eor     w21, w21, w8
        eor     v4.16b, v19.16b, v16.16b
          ror     w17, w17, #24
        eor     v5.16b, v23.16b, v20.16b
          ror     w19, w19, #24
        eor     v6.16b, v27.16b, v24.16b
          ror     w20, w20, #24
        eor     v7.16b, v31.16b, v28.16b
          ror     w21, w21, #24
        tbl     v19.16b, {v4.16b}, v9.16b       // byte shuffle = rotate right by 24
          add     w13, w13, w17
        tbl     v23.16b, {v5.16b}, v9.16b
          add     w14, w14, w19
        tbl     v27.16b, {v6.16b}, v9.16b
          add     w15, w15, w20
        tbl     v31.16b, {v7.16b}, v9.16b
          add     w16, w16, w21
        add     v18.4s, v18.4s, v19.4s
          eor     w9, w9, w13
        add     v22.4s, v22.4s, v23.4s
          eor     w10, w10, w14
        add     v26.4s, v26.4s, v27.4s
          eor     w11, w11, w15
        add     v30.4s, v30.4s, v31.4s
          eor     w12, w12, w16
        eor     v4.16b, v17.16b, v18.16b
          ror     w9, w9, #25
        eor     v5.16b, v21.16b, v22.16b
          ror     w10, w10, #25
        eor     v6.16b, v25.16b, v26.16b
          ror     w11, w11, #25
        eor     v7.16b, v29.16b, v30.16b
          ror     w12, w12, #25
        ushr    v17.4s, v4.4s, #25      // ushr + sli pair = rotate left by 7
        ushr    v21.4s, v5.4s, #25
        ushr    v25.4s, v6.4s, #25
        ushr    v29.4s, v7.4s, #25
        sli     v17.4s, v4.4s, #7
        sli     v21.4s, v5.4s, #7
        sli     v25.4s, v6.4s, #7
        sli     v29.4s, v7.4s, #7

        // Diagonal round.
        add     v16.4s, v16.4s, v21.4s
          add     w5, w5, w10
        add     v20.4s, v20.4s, v25.4s
          add     w6, w6, w11
        add     v24.4s, v24.4s, v29.4s
          add     w7, w7, w12
        add     v28.4s, v28.4s, v17.4s
          add     w8, w8, w9
        eor     v31.16b, v31.16b, v16.16b
          eor     w21, w21, w5
        eor     v19.16b, v19.16b, v20.16b
          eor     w17, w17, w6
        eor     v23.16b, v23.16b, v24.16b
          eor     w19, w19, w7
        eor     v27.16b, v27.16b, v28.16b
          eor     w20, w20, w8
        rev32   v31.8h, v31.8h
          ror     w21, w21, #16
        rev32   v19.8h, v19.8h
          ror     w17, w17, #16
        rev32   v23.8h, v23.8h
          ror     w19, w19, #16
        rev32   v27.8h, v27.8h
          ror     w20, w20, #16
        add     v26.4s, v26.4s, v31.4s
          add     w15, w15, w21
        add     v30.4s, v30.4s, v19.4s
          add     w16, w16, w17
        add     v18.4s, v18.4s, v23.4s
          add     w13, w13, w19
        add     v22.4s, v22.4s, v27.4s
          add     w14, w14, w20
        eor     v4.16b, v21.16b, v26.16b
          eor     w10, w10, w15
        eor     v5.16b, v25.16b, v30.16b
          eor     w11, w11, w16
        eor     v6.16b, v29.16b, v18.16b
          eor     w12, w12, w13
        eor     v7.16b, v17.16b, v22.16b
          eor     w9, w9, w14
        ushr    v21.4s, v4.4s, #20
          ror     w10, w10, #20
        ushr    v25.4s, v5.4s, #20
          ror     w11, w11, #20
        ushr    v29.4s, v6.4s, #20
          ror     w12, w12, #20
        ushr    v17.4s, v7.4s, #20
          ror     w9, w9, #20
        sli     v21.4s, v4.4s, #12
          add     w5, w5, w10
        sli     v25.4s, v5.4s, #12
          add     w6, w6, w11
        sli     v29.4s, v6.4s, #12
          add     w7, w7, w12
        sli     v17.4s, v7.4s, #12
          add     w8, w8, w9
        add     v16.4s, v16.4s, v21.4s
          eor     w21, w21, w5
        add     v20.4s, v20.4s, v25.4s
          eor     w17, w17, w6
        add     v24.4s, v24.4s, v29.4s
          eor     w19, w19, w7
        add     v28.4s, v28.4s, v17.4s
          eor     w20, w20, w8
        eor     v4.16b, v31.16b, v16.16b
          ror     w21, w21, #24
        eor     v5.16b, v19.16b, v20.16b
          ror     w17, w17, #24
        eor     v6.16b, v23.16b, v24.16b
          ror     w19, w19, #24
        eor     v7.16b, v27.16b, v28.16b
          ror     w20, w20, #24
        tbl     v31.16b, {v4.16b}, v9.16b
          add     w15, w15, w21
        tbl     v19.16b, {v5.16b}, v9.16b
          add     w16, w16, w17
        tbl     v23.16b, {v6.16b}, v9.16b
          add     w13, w13, w19
        tbl     v27.16b, {v7.16b}, v9.16b
          add     w14, w14, w20
        add     v26.4s, v26.4s, v31.4s
          eor     w10, w10, w15
        add     v30.4s, v30.4s, v19.4s
          eor     w11, w11, w16
        add     v18.4s, v18.4s, v23.4s
          eor     w12, w12, w13
        add     v22.4s, v22.4s, v27.4s
          eor     w9, w9, w14
        eor     v4.16b, v21.16b, v26.16b
          ror     w10, w10, #25
        eor     v5.16b, v25.16b, v30.16b
          ror     w11, w11, #25
        eor     v6.16b, v29.16b, v18.16b
          ror     w12, w12, #25
        eor     v7.16b, v17.16b, v22.16b
          ror     w9, w9, #25
        ushr    v21.4s, v4.4s, #25
        ushr    v25.4s, v5.4s, #25
        ushr    v29.4s, v6.4s, #25
        ushr    v17.4s, v7.4s, #25
        sli     v21.4s, v4.4s, #7
        sli     v25.4s, v5.4s, #7
        sli     v29.4s, v6.4s, #7
        sli     v17.4s, v7.4s, #7
        cbnz    x4, .Loop_neon

        add     v19.4s, v19.4s, v8.4s   // counter lanes: add the per-block offsets back

        // Transpose 4x4 groups so that v16..v19 hold block 0,
        // v20..v23 block 1, v24..v27 block 2 and v28..v31 block 3.
        zip1    v4.4s, v16.4s, v20.4s
        zip1    v5.4s, v24.4s, v28.4s
        zip2    v6.4s, v16.4s, v20.4s
        zip2    v7.4s, v24.4s, v28.4s
        zip1    v16.2d, v4.2d, v5.2d
        zip2    v20.2d, v4.2d, v5.2d
        zip1    v24.2d, v6.2d, v7.2d
        zip2    v28.2d, v6.2d, v7.2d

        zip1    v4.4s, v17.4s, v21.4s
        zip1    v5.4s, v25.4s, v29.4s
        zip2    v6.4s, v17.4s, v21.4s
        zip2    v7.4s, v25.4s, v29.4s
        zip1    v17.2d, v4.2d, v5.2d
        zip2    v21.2d, v4.2d, v5.2d
        zip1    v25.2d, v6.2d, v7.2d
        zip2    v29.2d, v6.2d, v7.2d

        // Remaining transposes, interleaved with the scalar key add.
        zip1    v4.4s, v18.4s, v22.4s
          add     w5, w5, w22
        zip1    v5.4s, v26.4s, v30.4s
          add     x6, x6, x22, lsr #32
        zip2    v6.4s, v18.4s, v22.4s
          add     w7, w7, w23
        zip2    v7.4s, v26.4s, v30.4s
          add     x8, x8, x23, lsr #32
        zip1    v18.2d, v4.2d, v5.2d
          add     w9, w9, w24
        zip2    v22.2d, v4.2d, v5.2d
          add     x10, x10, x24, lsr #32
        zip1    v26.2d, v6.2d, v7.2d
          add     w11, w11, w25
        zip2    v30.2d, v6.2d, v7.2d
          add     x12, x12, x25, lsr #32

        zip1    v4.4s, v19.4s, v23.4s
          add     w13, w13, w26
        zip1    v5.4s, v27.4s, v31.4s
          add     x14, x14, x26, lsr #32
        zip2    v6.4s, v19.4s, v23.4s
          add     w15, w15, w27
        zip2    v7.4s, v27.4s, v31.4s
          add     x16, x16, x27, lsr #32
        zip1    v19.2d, v4.2d, v5.2d
          add     w17, w17, w28
        zip2    v23.2d, v4.2d, v5.2d
          add     x19, x19, x28, lsr #32
        zip1    v27.2d, v6.2d, v7.2d
          add     w20, w20, w30
        zip2    v31.2d, v6.2d, v7.2d
          add     x21, x21, x30, lsr #32

        b.lo    .Ltail_neon             // fewer than 320 bytes were left

        // Full 320-byte pass: scalar block first, then four NEON blocks.
        add     x5, x5, x6, lsl #32     // pack
        add     x7, x7, x8, lsl #32
        ldp     x6, x8, [x1]            // load input
          add     v16.4s, v16.4s, v0.4s // accumulate key block
        add     x9, x9, x10, lsl #32
        add     x11, x11, x12, lsl #32
        ldp     x10, x12, [x1, #16]
          add     v17.4s, v17.4s, v1.4s
        add     x13, x13, x14, lsl #32
        add     x15, x15, x16, lsl #32
        ldp     x14, x16, [x1, #32]
          add     v18.4s, v18.4s, v2.4s
        add     x17, x17, x19, lsl #32
        add     x20, x20, x21, lsl #32
        ldp     x19, x21, [x1, #48]
          add     v19.4s, v19.4s, v3.4s
        add     x1, x1, #64
#ifdef __AARCH64EB__
        rev     x5, x5
        rev     x7, x7
        rev     x9, x9
        rev     x11, x11
        rev     x13, x13
        rev     x15, x15
        rev     x17, x17
        rev     x20, x20
#endif
        ld1     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
        eor     x5, x5, x6
          add     v20.4s, v20.4s, v0.4s
        eor     x7, x7, x8
          add     v21.4s, v21.4s, v1.4s
        eor     x9, x9, x10
          add     v22.4s, v22.4s, v2.4s
        eor     x11, x11, x12
          add     v23.4s, v23.4s, v3.4s
        eor     x13, x13, x14
          eor     v16.16b, v16.16b, v4.16b
        movi    v4.4s, #5
        eor     x15, x15, x16
          eor     v17.16b, v17.16b, v5.16b
        eor     x17, x17, x19
          eor     v18.16b, v18.16b, v6.16b
        eor     x20, x20, x21
          eor     v19.16b, v19.16b, v7.16b
        add     v8.4s, v8.4s, v4.4s     // += 5
        ld1     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64

        stp     x5, x7, [x0]            // store output
          add     x28, x28, #5          // increment counter
        stp     x9, x11, [x0, #16]
        stp     x13, x15, [x0, #32]
        stp     x17, x20, [x0, #48]
        add     x0, x0, #64

        st1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        add     v24.4s, v24.4s, v0.4s
        add     v25.4s, v25.4s, v1.4s
        add     v26.4s, v26.4s, v2.4s
        add     v27.4s, v27.4s, v3.4s
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x1], #64

        eor     v20.16b, v20.16b, v4.16b
        eor     v21.16b, v21.16b, v5.16b
        eor     v22.16b, v22.16b, v6.16b
        eor     v23.16b, v23.16b, v7.16b
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x0], #64
        add     v28.4s, v28.4s, v0.4s
        add     v29.4s, v29.4s, v1.4s
        add     v30.4s, v30.4s, v2.4s
        add     v31.4s, v31.4s, v3.4s
        ld1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x1], #64

        eor     v24.16b, v24.16b, v16.16b
        eor     v25.16b, v25.16b, v17.16b
        eor     v26.16b, v26.16b, v18.16b
        eor     v27.16b, v27.16b, v19.16b
        st1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x0], #64

        eor     v28.16b, v28.16b, v20.16b
        eor     v29.16b, v29.16b, v21.16b
        eor     v30.16b, v30.16b, v22.16b
        eor     v31.16b, v31.16b, v23.16b
        st1     {v28.16b, v29.16b, v30.16b, v31.16b}, [x0], #64

        b.hi    .Loop_outer_neon        // more input remains

        ldp     d8, d9, [sp]            // meet ABI requirements

        ldp     x19, x20, [x29, #16]
        add     sp, sp, #64
        ldp     x21, x22, [x29, #32]
        ldp     x23, x24, [x29, #48]
        ldp     x25, x26, [x29, #64]
        ldp     x27, x28, [x29, #80]
        ldp     x29, x30, [sp], #96
        .inst   0xd50323bf              // autiasp
        ret

.p2align 4
.Ltail_neon:
        add     x2, x2, #320            // undo the subs: x2 = bytes left (< 320)
        ldp     d8, d9, [sp]            // restore now; the scratch area is reused below
        cmp     x2, #64
        b.lo    .Less_than_64           // scalar block is the partial one

        // At least one whole block: emit the scalar block.
        add     x5, x5, x6, lsl #32     // pack
        add     x7, x7, x8, lsl #32
        ldp     x6, x8, [x1]            // load input
        add     x9, x9, x10, lsl #32
        add     x11, x11, x12, lsl #32
        ldp     x10, x12, [x1, #16]
        add     x13, x13, x14, lsl #32
        add     x15, x15, x16, lsl #32
        ldp     x14, x16, [x1, #32]
        add     x17, x17, x19, lsl #32
        add     x20, x20, x21, lsl #32
        ldp     x19, x21, [x1, #48]
        add     x1, x1, #64
#ifdef __AARCH64EB__
        rev     x5, x5
        rev     x7, x7
        rev     x9, x9
        rev     x11, x11
        rev     x13, x13
        rev     x15, x15
        rev     x17, x17
        rev     x20, x20
#endif
        eor     x5, x5, x6
        eor     x7, x7, x8
        eor     x9, x9, x10
        eor     x11, x11, x12
        eor     x13, x13, x14
        eor     x15, x15, x16
        eor     x17, x17, x19
        eor     x20, x20, x21

        stp     x5, x7, [x0]            // store output
          add     v16.4s, v16.4s, v0.4s // accumulate key block
        stp     x9, x11, [x0, #16]
          add     v17.4s, v17.4s, v1.4s
        stp     x13, x15, [x0, #32]
          add     v18.4s, v18.4s, v2.4s
        stp     x17, x20, [x0, #48]
          add     v19.4s, v19.4s, v3.4s
        add     x0, x0, #64
        b.eq    .Ldone_neon             // flags still from cmp: exactly 64 bytes
        sub     x2, x2, #64
        cmp     x2, #64
        b.lo    .Last_neon

        // First NEON block (keystream already in v16..v19).
        ld1     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
        eor     v16.16b, v16.16b, v4.16b
        eor     v17.16b, v17.16b, v5.16b
        eor     v18.16b, v18.16b, v6.16b
        eor     v19.16b, v19.16b, v7.16b
        st1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], #64
        b.eq    .Ldone_neon

        // Second NEON block: finish it into v16..v19 so that
        // .Last_neon always finds the pending keystream there.
        add     v16.4s, v20.4s, v0.4s
        add     v17.4s, v21.4s, v1.4s
        sub     x2, x2, #64
        add     v18.4s, v22.4s, v2.4s
        cmp     x2, #64
        add     v19.4s, v23.4s, v3.4s
        b.lo    .Last_neon

        ld1     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
        eor     v20.16b, v16.16b, v4.16b
        eor     v21.16b, v17.16b, v5.16b
        eor     v22.16b, v18.16b, v6.16b
        eor     v23.16b, v19.16b, v7.16b
        st1     {v20.16b, v21.16b, v22.16b, v23.16b}, [x0], #64
        b.eq    .Ldone_neon

        // Third NEON block.
        add     v16.4s, v24.4s, v0.4s
        add     v17.4s, v25.4s, v1.4s
        sub     x2, x2, #64
        add     v18.4s, v26.4s, v2.4s
        cmp     x2, #64
        add     v19.4s, v27.4s, v3.4s
        b.lo    .Last_neon

        ld1     {v4.16b, v5.16b, v6.16b, v7.16b}, [x1], #64
        eor     v24.16b, v16.16b, v4.16b
        eor     v25.16b, v17.16b, v5.16b
        eor     v26.16b, v18.16b, v6.16b
        eor     v27.16b, v19.16b, v7.16b
        st1     {v24.16b, v25.16b, v26.16b, v27.16b}, [x0], #64
        b.eq    .Ldone_neon

        // Fourth NEON block can only be a partial one here.
        add     v16.4s, v28.4s, v0.4s
        add     v17.4s, v29.4s, v1.4s
        add     v18.4s, v30.4s, v2.4s
        add     v19.4s, v31.4s, v3.4s
        sub     x2, x2, #64

.Last_neon:
        st1     {v16.16b, v17.16b, v18.16b, v19.16b}, [sp]     // spill keystream

        // Negative index counting up to zero; x0 is biased by -1
        // because the index is bumped before the store.
        sub     x0, x0, #1
        add     x1, x1, x2
        add     x0, x0, x2
        add     x4, sp, x2
        neg     x2, x2

.Loop_tail_neon:
        ldrb    w10, [x1, x2]           // input byte
        ldrb    w11, [x4, x2]           // keystream byte
        add     x2, x2, #1
        eor     w10, w10, w11
        strb    w10, [x0, x2]
        cbnz    x2, .Loop_tail_neon

        stp     xzr, xzr, [sp]          // wipe the spilled keystream
        stp     xzr, xzr, [sp, #16]
        stp     xzr, xzr, [sp, #32]
        stp     xzr, xzr, [sp, #48]

.Ldone_neon:
        ldp     x19, x20, [x29, #16]
        add     sp, sp, #64
        ldp     x21, x22, [x29, #32]
        ldp     x23, x24, [x29, #48]
        ldp     x25, x26, [x29, #64]
        ldp     x27, x28, [x29, #80]
        ldp     x29, x30, [sp], #96
        .inst   0xd50323bf              // autiasp
        ret
.size   ChaCha20_neon,.-ChaCha20_neon
.type   ChaCha20_512_neon,%function
.p2align 5
ChaCha20_512_neon:
        .inst   0xd503233f              // paciasp
stp x29,x30,[sp,#-96]! 891 add x29,sp,#0 892 893 adr x5,.Lsigma 894 stp x19,x20,[sp,#16] 895 stp x21,x22,[sp,#32] 896 stp x23,x24,[sp,#48] 897 stp x25,x26,[sp,#64] 898 stp x27,x28,[sp,#80] 899 900.L512_or_more_neon: 901 sub sp,sp,#128+64 902 903 eor v7.16b,v7.16b,v7.16b 904 ldp x22,x23,[x5] // load sigma 905 ld1 {v0.4s},[x5],#16 906 ldp x24,x25,[x3] // load key 907 ldp x26,x27,[x3,#16] 908 ld1 {v1.4s,v2.4s},[x3] 909 ldp x28,x30,[x4] // load counter 910 ld1 {v3.4s},[x4] 911 ld1 {v7.s}[0],[x5] 912 add x3,x5,#16 // .Lrot24 913#ifdef __AARCH64EB__ 914 rev64 v0.4s,v0.4s 915 ror x24,x24,#32 916 ror x25,x25,#32 917 ror x26,x26,#32 918 ror x27,x27,#32 919 ror x28,x28,#32 920 ror x30,x30,#32 921#endif 922 add v3.4s,v3.4s,v7.4s // += 1 923 stp q0,q1,[sp,#0] // off-load key block, invariant part 924 add v3.4s,v3.4s,v7.4s // not typo 925 str q2,[sp,#32] 926 add v4.4s,v3.4s,v7.4s 927 add v5.4s,v4.4s,v7.4s 928 add v6.4s,v5.4s,v7.4s 929 shl v7.4s,v7.4s,#2 // 1 -> 4 930 931 stp d8,d9,[sp,#128+0] // meet ABI requirements 932 stp d10,d11,[sp,#128+16] 933 stp d12,d13,[sp,#128+32] 934 stp d14,d15,[sp,#128+48] 935 936 sub x2,x2,#512 // not typo 937 938.Loop_outer_512_neon: 939 mov v8.16b,v0.16b 940 mov v12.16b,v0.16b 941 mov v16.16b,v0.16b 942 mov v20.16b,v0.16b 943 mov v24.16b,v0.16b 944 mov v28.16b,v0.16b 945 mov v9.16b,v1.16b 946 mov w5,w22 // unpack key block 947 mov v13.16b,v1.16b 948 lsr x6,x22,#32 949 mov v17.16b,v1.16b 950 mov w7,w23 951 mov v21.16b,v1.16b 952 lsr x8,x23,#32 953 mov v25.16b,v1.16b 954 mov w9,w24 955 mov v29.16b,v1.16b 956 lsr x10,x24,#32 957 mov v11.16b,v3.16b 958 mov w11,w25 959 mov v15.16b,v4.16b 960 lsr x12,x25,#32 961 mov v19.16b,v5.16b 962 mov w13,w26 963 mov v23.16b,v6.16b 964 lsr x14,x26,#32 965 mov v10.16b,v2.16b 966 mov w15,w27 967 mov v14.16b,v2.16b 968 lsr x16,x27,#32 969 add v27.4s,v11.4s,v7.4s // +4 970 mov w17,w28 971 add v31.4s,v15.4s,v7.4s // +4 972 lsr x19,x28,#32 973 mov v18.16b,v2.16b 974 mov w20,w30 975 mov v22.16b,v2.16b 976 lsr x21,x30,#32 
977 mov v26.16b,v2.16b 978 stp q3,q4,[sp,#48] // off-load key block, variable part 979 mov v30.16b,v2.16b 980 stp q5,q6,[sp,#80] 981 982 mov x4,#5 983 ld1 {v6.4s},[x3] 984 subs x2,x2,#512 985.Loop_upper_neon: 986 sub x4,x4,#1 987 add v8.4s,v8.4s,v9.4s 988 add w5,w5,w9 989 add v12.4s,v12.4s,v13.4s 990 add w6,w6,w10 991 add v16.4s,v16.4s,v17.4s 992 add w7,w7,w11 993 add v20.4s,v20.4s,v21.4s 994 add w8,w8,w12 995 add v24.4s,v24.4s,v25.4s 996 eor w17,w17,w5 997 add v28.4s,v28.4s,v29.4s 998 eor w19,w19,w6 999 eor v11.16b,v11.16b,v8.16b 1000 eor w20,w20,w7 1001 eor v15.16b,v15.16b,v12.16b 1002 eor w21,w21,w8 1003 eor v19.16b,v19.16b,v16.16b 1004 ror w17,w17,#16 1005 eor v23.16b,v23.16b,v20.16b 1006 ror w19,w19,#16 1007 eor v27.16b,v27.16b,v24.16b 1008 ror w20,w20,#16 1009 eor v31.16b,v31.16b,v28.16b 1010 ror w21,w21,#16 1011 rev32 v11.8h,v11.8h 1012 add w13,w13,w17 1013 rev32 v15.8h,v15.8h 1014 add w14,w14,w19 1015 rev32 v19.8h,v19.8h 1016 add w15,w15,w20 1017 rev32 v23.8h,v23.8h 1018 add w16,w16,w21 1019 rev32 v27.8h,v27.8h 1020 eor w9,w9,w13 1021 rev32 v31.8h,v31.8h 1022 eor w10,w10,w14 1023 add v10.4s,v10.4s,v11.4s 1024 eor w11,w11,w15 1025 add v14.4s,v14.4s,v15.4s 1026 eor w12,w12,w16 1027 add v18.4s,v18.4s,v19.4s 1028 ror w9,w9,#20 1029 add v22.4s,v22.4s,v23.4s 1030 ror w10,w10,#20 1031 add v26.4s,v26.4s,v27.4s 1032 ror w11,w11,#20 1033 add v30.4s,v30.4s,v31.4s 1034 ror w12,w12,#20 1035 eor v0.16b,v9.16b,v10.16b 1036 add w5,w5,w9 1037 eor v1.16b,v13.16b,v14.16b 1038 add w6,w6,w10 1039 eor v2.16b,v17.16b,v18.16b 1040 add w7,w7,w11 1041 eor v3.16b,v21.16b,v22.16b 1042 add w8,w8,w12 1043 eor v4.16b,v25.16b,v26.16b 1044 eor w17,w17,w5 1045 eor v5.16b,v29.16b,v30.16b 1046 eor w19,w19,w6 1047 ushr v9.4s,v0.4s,#20 1048 eor w20,w20,w7 1049 ushr v13.4s,v1.4s,#20 1050 eor w21,w21,w8 1051 ushr v17.4s,v2.4s,#20 1052 ror w17,w17,#24 1053 ushr v21.4s,v3.4s,#20 1054 ror w19,w19,#24 1055 ushr v25.4s,v4.4s,#20 1056 ror w20,w20,#24 1057 ushr v29.4s,v5.4s,#20 1058 ror w21,w21,#24 1059 
sli v9.4s,v0.4s,#12 1060 add w13,w13,w17 1061 sli v13.4s,v1.4s,#12 1062 add w14,w14,w19 1063 sli v17.4s,v2.4s,#12 1064 add w15,w15,w20 1065 sli v21.4s,v3.4s,#12 1066 add w16,w16,w21 1067 sli v25.4s,v4.4s,#12 1068 eor w9,w9,w13 1069 sli v29.4s,v5.4s,#12 1070 eor w10,w10,w14 1071 add v8.4s,v8.4s,v9.4s 1072 eor w11,w11,w15 1073 add v12.4s,v12.4s,v13.4s 1074 eor w12,w12,w16 1075 add v16.4s,v16.4s,v17.4s 1076 ror w9,w9,#25 1077 add v20.4s,v20.4s,v21.4s 1078 ror w10,w10,#25 1079 add v24.4s,v24.4s,v25.4s 1080 ror w11,w11,#25 1081 add v28.4s,v28.4s,v29.4s 1082 ror w12,w12,#25 1083 eor v11.16b,v11.16b,v8.16b 1084 add w5,w5,w10 1085 eor v15.16b,v15.16b,v12.16b 1086 add w6,w6,w11 1087 eor v19.16b,v19.16b,v16.16b 1088 add w7,w7,w12 1089 eor v23.16b,v23.16b,v20.16b 1090 add w8,w8,w9 1091 eor v27.16b,v27.16b,v24.16b 1092 eor w21,w21,w5 1093 eor v31.16b,v31.16b,v28.16b 1094 eor w17,w17,w6 1095 tbl v11.16b,{v11.16b},v6.16b 1096 eor w19,w19,w7 1097 tbl v15.16b,{v15.16b},v6.16b 1098 eor w20,w20,w8 1099 tbl v19.16b,{v19.16b},v6.16b 1100 ror w21,w21,#16 1101 tbl v23.16b,{v23.16b},v6.16b 1102 ror w17,w17,#16 1103 tbl v27.16b,{v27.16b},v6.16b 1104 ror w19,w19,#16 1105 tbl v31.16b,{v31.16b},v6.16b 1106 ror w20,w20,#16 1107 add v10.4s,v10.4s,v11.4s 1108 add w15,w15,w21 1109 add v14.4s,v14.4s,v15.4s 1110 add w16,w16,w17 1111 add v18.4s,v18.4s,v19.4s 1112 add w13,w13,w19 1113 add v22.4s,v22.4s,v23.4s 1114 add w14,w14,w20 1115 add v26.4s,v26.4s,v27.4s 1116 eor w10,w10,w15 1117 add v30.4s,v30.4s,v31.4s 1118 eor w11,w11,w16 1119 eor v0.16b,v9.16b,v10.16b 1120 eor w12,w12,w13 1121 eor v1.16b,v13.16b,v14.16b 1122 eor w9,w9,w14 1123 eor v2.16b,v17.16b,v18.16b 1124 ror w10,w10,#20 1125 eor v3.16b,v21.16b,v22.16b 1126 ror w11,w11,#20 1127 eor v4.16b,v25.16b,v26.16b 1128 ror w12,w12,#20 1129 eor v5.16b,v29.16b,v30.16b 1130 ror w9,w9,#20 1131 ushr v9.4s,v0.4s,#25 1132 add w5,w5,w10 1133 ushr v13.4s,v1.4s,#25 1134 add w6,w6,w11 1135 ushr v17.4s,v2.4s,#25 1136 add w7,w7,w12 1137 ushr v21.4s,v3.4s,#25 
1138 add w8,w8,w9 1139 ushr v25.4s,v4.4s,#25 1140 eor w21,w21,w5 1141 ushr v29.4s,v5.4s,#25 1142 eor w17,w17,w6 1143 sli v9.4s,v0.4s,#7 1144 eor w19,w19,w7 1145 sli v13.4s,v1.4s,#7 1146 eor w20,w20,w8 1147 sli v17.4s,v2.4s,#7 1148 ror w21,w21,#24 1149 sli v21.4s,v3.4s,#7 1150 ror w17,w17,#24 1151 sli v25.4s,v4.4s,#7 1152 ror w19,w19,#24 1153 sli v29.4s,v5.4s,#7 1154 ror w20,w20,#24 1155 ext v10.16b,v10.16b,v10.16b,#8 1156 add w15,w15,w21 1157 ext v14.16b,v14.16b,v14.16b,#8 1158 add w16,w16,w17 1159 ext v18.16b,v18.16b,v18.16b,#8 1160 add w13,w13,w19 1161 ext v22.16b,v22.16b,v22.16b,#8 1162 add w14,w14,w20 1163 ext v26.16b,v26.16b,v26.16b,#8 1164 eor w10,w10,w15 1165 ext v30.16b,v30.16b,v30.16b,#8 1166 eor w11,w11,w16 1167 ext v11.16b,v11.16b,v11.16b,#12 1168 eor w12,w12,w13 1169 ext v15.16b,v15.16b,v15.16b,#12 1170 eor w9,w9,w14 1171 ext v19.16b,v19.16b,v19.16b,#12 1172 ror w10,w10,#25 1173 ext v23.16b,v23.16b,v23.16b,#12 1174 ror w11,w11,#25 1175 ext v27.16b,v27.16b,v27.16b,#12 1176 ror w12,w12,#25 1177 ext v31.16b,v31.16b,v31.16b,#12 1178 ror w9,w9,#25 1179 ext v9.16b,v9.16b,v9.16b,#4 1180 ext v13.16b,v13.16b,v13.16b,#4 1181 ext v17.16b,v17.16b,v17.16b,#4 1182 ext v21.16b,v21.16b,v21.16b,#4 1183 ext v25.16b,v25.16b,v25.16b,#4 1184 ext v29.16b,v29.16b,v29.16b,#4 1185 add v8.4s,v8.4s,v9.4s 1186 add w5,w5,w9 1187 add v12.4s,v12.4s,v13.4s 1188 add w6,w6,w10 1189 add v16.4s,v16.4s,v17.4s 1190 add w7,w7,w11 1191 add v20.4s,v20.4s,v21.4s 1192 add w8,w8,w12 1193 add v24.4s,v24.4s,v25.4s 1194 eor w17,w17,w5 1195 add v28.4s,v28.4s,v29.4s 1196 eor w19,w19,w6 1197 eor v11.16b,v11.16b,v8.16b 1198 eor w20,w20,w7 1199 eor v15.16b,v15.16b,v12.16b 1200 eor w21,w21,w8 1201 eor v19.16b,v19.16b,v16.16b 1202 ror w17,w17,#16 1203 eor v23.16b,v23.16b,v20.16b 1204 ror w19,w19,#16 1205 eor v27.16b,v27.16b,v24.16b 1206 ror w20,w20,#16 1207 eor v31.16b,v31.16b,v28.16b 1208 ror w21,w21,#16 1209 rev32 v11.8h,v11.8h 1210 add w13,w13,w17 1211 rev32 v15.8h,v15.8h 1212 add w14,w14,w19 1213 rev32 
v19.8h,v19.8h 1214 add w15,w15,w20 1215 rev32 v23.8h,v23.8h 1216 add w16,w16,w21 1217 rev32 v27.8h,v27.8h 1218 eor w9,w9,w13 1219 rev32 v31.8h,v31.8h 1220 eor w10,w10,w14 1221 add v10.4s,v10.4s,v11.4s 1222 eor w11,w11,w15 1223 add v14.4s,v14.4s,v15.4s 1224 eor w12,w12,w16 1225 add v18.4s,v18.4s,v19.4s 1226 ror w9,w9,#20 1227 add v22.4s,v22.4s,v23.4s 1228 ror w10,w10,#20 1229 add v26.4s,v26.4s,v27.4s 1230 ror w11,w11,#20 1231 add v30.4s,v30.4s,v31.4s 1232 ror w12,w12,#20 1233 eor v0.16b,v9.16b,v10.16b 1234 add w5,w5,w9 1235 eor v1.16b,v13.16b,v14.16b 1236 add w6,w6,w10 1237 eor v2.16b,v17.16b,v18.16b 1238 add w7,w7,w11 1239 eor v3.16b,v21.16b,v22.16b 1240 add w8,w8,w12 1241 eor v4.16b,v25.16b,v26.16b 1242 eor w17,w17,w5 1243 eor v5.16b,v29.16b,v30.16b 1244 eor w19,w19,w6 1245 ushr v9.4s,v0.4s,#20 1246 eor w20,w20,w7 1247 ushr v13.4s,v1.4s,#20 1248 eor w21,w21,w8 1249 ushr v17.4s,v2.4s,#20 1250 ror w17,w17,#24 1251 ushr v21.4s,v3.4s,#20 1252 ror w19,w19,#24 1253 ushr v25.4s,v4.4s,#20 1254 ror w20,w20,#24 1255 ushr v29.4s,v5.4s,#20 1256 ror w21,w21,#24 1257 sli v9.4s,v0.4s,#12 1258 add w13,w13,w17 1259 sli v13.4s,v1.4s,#12 1260 add w14,w14,w19 1261 sli v17.4s,v2.4s,#12 1262 add w15,w15,w20 1263 sli v21.4s,v3.4s,#12 1264 add w16,w16,w21 1265 sli v25.4s,v4.4s,#12 1266 eor w9,w9,w13 1267 sli v29.4s,v5.4s,#12 1268 eor w10,w10,w14 1269 add v8.4s,v8.4s,v9.4s 1270 eor w11,w11,w15 1271 add v12.4s,v12.4s,v13.4s 1272 eor w12,w12,w16 1273 add v16.4s,v16.4s,v17.4s 1274 ror w9,w9,#25 1275 add v20.4s,v20.4s,v21.4s 1276 ror w10,w10,#25 1277 add v24.4s,v24.4s,v25.4s 1278 ror w11,w11,#25 1279 add v28.4s,v28.4s,v29.4s 1280 ror w12,w12,#25 1281 eor v11.16b,v11.16b,v8.16b 1282 add w5,w5,w10 1283 eor v15.16b,v15.16b,v12.16b 1284 add w6,w6,w11 1285 eor v19.16b,v19.16b,v16.16b 1286 add w7,w7,w12 1287 eor v23.16b,v23.16b,v20.16b 1288 add w8,w8,w9 1289 eor v27.16b,v27.16b,v24.16b 1290 eor w21,w21,w5 1291 eor v31.16b,v31.16b,v28.16b 1292 eor w17,w17,w6 1293 tbl v11.16b,{v11.16b},v6.16b 1294 
eor w19,w19,w7 1295 tbl v15.16b,{v15.16b},v6.16b 1296 eor w20,w20,w8 1297 tbl v19.16b,{v19.16b},v6.16b 1298 ror w21,w21,#16 1299 tbl v23.16b,{v23.16b},v6.16b 1300 ror w17,w17,#16 1301 tbl v27.16b,{v27.16b},v6.16b 1302 ror w19,w19,#16 1303 tbl v31.16b,{v31.16b},v6.16b 1304 ror w20,w20,#16 1305 add v10.4s,v10.4s,v11.4s 1306 add w15,w15,w21 1307 add v14.4s,v14.4s,v15.4s 1308 add w16,w16,w17 1309 add v18.4s,v18.4s,v19.4s 1310 add w13,w13,w19 1311 add v22.4s,v22.4s,v23.4s 1312 add w14,w14,w20 1313 add v26.4s,v26.4s,v27.4s 1314 eor w10,w10,w15 1315 add v30.4s,v30.4s,v31.4s 1316 eor w11,w11,w16 1317 eor v0.16b,v9.16b,v10.16b 1318 eor w12,w12,w13 1319 eor v1.16b,v13.16b,v14.16b 1320 eor w9,w9,w14 1321 eor v2.16b,v17.16b,v18.16b 1322 ror w10,w10,#20 1323 eor v3.16b,v21.16b,v22.16b 1324 ror w11,w11,#20 1325 eor v4.16b,v25.16b,v26.16b 1326 ror w12,w12,#20 1327 eor v5.16b,v29.16b,v30.16b 1328 ror w9,w9,#20 1329 ushr v9.4s,v0.4s,#25 1330 add w5,w5,w10 1331 ushr v13.4s,v1.4s,#25 1332 add w6,w6,w11 1333 ushr v17.4s,v2.4s,#25 1334 add w7,w7,w12 1335 ushr v21.4s,v3.4s,#25 1336 add w8,w8,w9 1337 ushr v25.4s,v4.4s,#25 1338 eor w21,w21,w5 1339 ushr v29.4s,v5.4s,#25 1340 eor w17,w17,w6 1341 sli v9.4s,v0.4s,#7 1342 eor w19,w19,w7 1343 sli v13.4s,v1.4s,#7 1344 eor w20,w20,w8 1345 sli v17.4s,v2.4s,#7 1346 ror w21,w21,#24 1347 sli v21.4s,v3.4s,#7 1348 ror w17,w17,#24 1349 sli v25.4s,v4.4s,#7 1350 ror w19,w19,#24 1351 sli v29.4s,v5.4s,#7 1352 ror w20,w20,#24 1353 ext v10.16b,v10.16b,v10.16b,#8 1354 add w15,w15,w21 1355 ext v14.16b,v14.16b,v14.16b,#8 1356 add w16,w16,w17 1357 ext v18.16b,v18.16b,v18.16b,#8 1358 add w13,w13,w19 1359 ext v22.16b,v22.16b,v22.16b,#8 1360 add w14,w14,w20 1361 ext v26.16b,v26.16b,v26.16b,#8 1362 eor w10,w10,w15 1363 ext v30.16b,v30.16b,v30.16b,#8 1364 eor w11,w11,w16 1365 ext v11.16b,v11.16b,v11.16b,#4 1366 eor w12,w12,w13 1367 ext v15.16b,v15.16b,v15.16b,#4 1368 eor w9,w9,w14 1369 ext v19.16b,v19.16b,v19.16b,#4 1370 ror w10,w10,#25 1371 ext 
v23.16b,v23.16b,v23.16b,#4 1372 ror w11,w11,#25 1373 ext v27.16b,v27.16b,v27.16b,#4 1374 ror w12,w12,#25 1375 ext v31.16b,v31.16b,v31.16b,#4 1376 ror w9,w9,#25 1377 ext v9.16b,v9.16b,v9.16b,#12 1378 ext v13.16b,v13.16b,v13.16b,#12 1379 ext v17.16b,v17.16b,v17.16b,#12 1380 ext v21.16b,v21.16b,v21.16b,#12 1381 ext v25.16b,v25.16b,v25.16b,#12 1382 ext v29.16b,v29.16b,v29.16b,#12 1383 cbnz x4,.Loop_upper_neon 1384 1385 add w5,w5,w22 // accumulate key block 1386 add x6,x6,x22,lsr#32 1387 add w7,w7,w23 1388 add x8,x8,x23,lsr#32 1389 add w9,w9,w24 1390 add x10,x10,x24,lsr#32 1391 add w11,w11,w25 1392 add x12,x12,x25,lsr#32 1393 add w13,w13,w26 1394 add x14,x14,x26,lsr#32 1395 add w15,w15,w27 1396 add x16,x16,x27,lsr#32 1397 add w17,w17,w28 1398 add x19,x19,x28,lsr#32 1399 add w20,w20,w30 1400 add x21,x21,x30,lsr#32 1401 1402 add x5,x5,x6,lsl#32 // pack 1403 add x7,x7,x8,lsl#32 1404 ldp x6,x8,[x1,#0] // load input 1405 add x9,x9,x10,lsl#32 1406 add x11,x11,x12,lsl#32 1407 ldp x10,x12,[x1,#16] 1408 add x13,x13,x14,lsl#32 1409 add x15,x15,x16,lsl#32 1410 ldp x14,x16,[x1,#32] 1411 add x17,x17,x19,lsl#32 1412 add x20,x20,x21,lsl#32 1413 ldp x19,x21,[x1,#48] 1414 add x1,x1,#64 1415#ifdef __AARCH64EB__ 1416 rev x5,x5 1417 rev x7,x7 1418 rev x9,x9 1419 rev x11,x11 1420 rev x13,x13 1421 rev x15,x15 1422 rev x17,x17 1423 rev x20,x20 1424#endif 1425 eor x5,x5,x6 1426 eor x7,x7,x8 1427 eor x9,x9,x10 1428 eor x11,x11,x12 1429 eor x13,x13,x14 1430 eor x15,x15,x16 1431 eor x17,x17,x19 1432 eor x20,x20,x21 1433 1434 stp x5,x7,[x0,#0] // store output 1435 add x28,x28,#1 // increment counter 1436 mov w5,w22 // unpack key block 1437 lsr x6,x22,#32 1438 stp x9,x11,[x0,#16] 1439 mov w7,w23 1440 lsr x8,x23,#32 1441 stp x13,x15,[x0,#32] 1442 mov w9,w24 1443 lsr x10,x24,#32 1444 stp x17,x20,[x0,#48] 1445 add x0,x0,#64 1446 mov w11,w25 1447 lsr x12,x25,#32 1448 mov w13,w26 1449 lsr x14,x26,#32 1450 mov w15,w27 1451 lsr x16,x27,#32 1452 mov w17,w28 1453 lsr x19,x28,#32 1454 mov w20,w30 1455 lsr 
x21,x30,#32 1456 1457 mov x4,#5 1458.Loop_lower_neon: 1459 sub x4,x4,#1 1460 add v8.4s,v8.4s,v9.4s 1461 add w5,w5,w9 1462 add v12.4s,v12.4s,v13.4s 1463 add w6,w6,w10 1464 add v16.4s,v16.4s,v17.4s 1465 add w7,w7,w11 1466 add v20.4s,v20.4s,v21.4s 1467 add w8,w8,w12 1468 add v24.4s,v24.4s,v25.4s 1469 eor w17,w17,w5 1470 add v28.4s,v28.4s,v29.4s 1471 eor w19,w19,w6 1472 eor v11.16b,v11.16b,v8.16b 1473 eor w20,w20,w7 1474 eor v15.16b,v15.16b,v12.16b 1475 eor w21,w21,w8 1476 eor v19.16b,v19.16b,v16.16b 1477 ror w17,w17,#16 1478 eor v23.16b,v23.16b,v20.16b 1479 ror w19,w19,#16 1480 eor v27.16b,v27.16b,v24.16b 1481 ror w20,w20,#16 1482 eor v31.16b,v31.16b,v28.16b 1483 ror w21,w21,#16 1484 rev32 v11.8h,v11.8h 1485 add w13,w13,w17 1486 rev32 v15.8h,v15.8h 1487 add w14,w14,w19 1488 rev32 v19.8h,v19.8h 1489 add w15,w15,w20 1490 rev32 v23.8h,v23.8h 1491 add w16,w16,w21 1492 rev32 v27.8h,v27.8h 1493 eor w9,w9,w13 1494 rev32 v31.8h,v31.8h 1495 eor w10,w10,w14 1496 add v10.4s,v10.4s,v11.4s 1497 eor w11,w11,w15 1498 add v14.4s,v14.4s,v15.4s 1499 eor w12,w12,w16 1500 add v18.4s,v18.4s,v19.4s 1501 ror w9,w9,#20 1502 add v22.4s,v22.4s,v23.4s 1503 ror w10,w10,#20 1504 add v26.4s,v26.4s,v27.4s 1505 ror w11,w11,#20 1506 add v30.4s,v30.4s,v31.4s 1507 ror w12,w12,#20 1508 eor v0.16b,v9.16b,v10.16b 1509 add w5,w5,w9 1510 eor v1.16b,v13.16b,v14.16b 1511 add w6,w6,w10 1512 eor v2.16b,v17.16b,v18.16b 1513 add w7,w7,w11 1514 eor v3.16b,v21.16b,v22.16b 1515 add w8,w8,w12 1516 eor v4.16b,v25.16b,v26.16b 1517 eor w17,w17,w5 1518 eor v5.16b,v29.16b,v30.16b 1519 eor w19,w19,w6 1520 ushr v9.4s,v0.4s,#20 1521 eor w20,w20,w7 1522 ushr v13.4s,v1.4s,#20 1523 eor w21,w21,w8 1524 ushr v17.4s,v2.4s,#20 1525 ror w17,w17,#24 1526 ushr v21.4s,v3.4s,#20 1527 ror w19,w19,#24 1528 ushr v25.4s,v4.4s,#20 1529 ror w20,w20,#24 1530 ushr v29.4s,v5.4s,#20 1531 ror w21,w21,#24 1532 sli v9.4s,v0.4s,#12 1533 add w13,w13,w17 1534 sli v13.4s,v1.4s,#12 1535 add w14,w14,w19 1536 sli v17.4s,v2.4s,#12 1537 add w15,w15,w20 1538 
sli v21.4s,v3.4s,#12 1539 add w16,w16,w21 1540 sli v25.4s,v4.4s,#12 1541 eor w9,w9,w13 1542 sli v29.4s,v5.4s,#12 1543 eor w10,w10,w14 1544 add v8.4s,v8.4s,v9.4s 1545 eor w11,w11,w15 1546 add v12.4s,v12.4s,v13.4s 1547 eor w12,w12,w16 1548 add v16.4s,v16.4s,v17.4s 1549 ror w9,w9,#25 1550 add v20.4s,v20.4s,v21.4s 1551 ror w10,w10,#25 1552 add v24.4s,v24.4s,v25.4s 1553 ror w11,w11,#25 1554 add v28.4s,v28.4s,v29.4s 1555 ror w12,w12,#25 1556 eor v11.16b,v11.16b,v8.16b 1557 add w5,w5,w10 1558 eor v15.16b,v15.16b,v12.16b 1559 add w6,w6,w11 1560 eor v19.16b,v19.16b,v16.16b 1561 add w7,w7,w12 1562 eor v23.16b,v23.16b,v20.16b 1563 add w8,w8,w9 1564 eor v27.16b,v27.16b,v24.16b 1565 eor w21,w21,w5 1566 eor v31.16b,v31.16b,v28.16b 1567 eor w17,w17,w6 1568 tbl v11.16b,{v11.16b},v6.16b 1569 eor w19,w19,w7 1570 tbl v15.16b,{v15.16b},v6.16b 1571 eor w20,w20,w8 1572 tbl v19.16b,{v19.16b},v6.16b 1573 ror w21,w21,#16 1574 tbl v23.16b,{v23.16b},v6.16b 1575 ror w17,w17,#16 1576 tbl v27.16b,{v27.16b},v6.16b 1577 ror w19,w19,#16 1578 tbl v31.16b,{v31.16b},v6.16b 1579 ror w20,w20,#16 1580 add v10.4s,v10.4s,v11.4s 1581 add w15,w15,w21 1582 add v14.4s,v14.4s,v15.4s 1583 add w16,w16,w17 1584 add v18.4s,v18.4s,v19.4s 1585 add w13,w13,w19 1586 add v22.4s,v22.4s,v23.4s 1587 add w14,w14,w20 1588 add v26.4s,v26.4s,v27.4s 1589 eor w10,w10,w15 1590 add v30.4s,v30.4s,v31.4s 1591 eor w11,w11,w16 1592 eor v0.16b,v9.16b,v10.16b 1593 eor w12,w12,w13 1594 eor v1.16b,v13.16b,v14.16b 1595 eor w9,w9,w14 1596 eor v2.16b,v17.16b,v18.16b 1597 ror w10,w10,#20 1598 eor v3.16b,v21.16b,v22.16b 1599 ror w11,w11,#20 1600 eor v4.16b,v25.16b,v26.16b 1601 ror w12,w12,#20 1602 eor v5.16b,v29.16b,v30.16b 1603 ror w9,w9,#20 1604 ushr v9.4s,v0.4s,#25 1605 add w5,w5,w10 1606 ushr v13.4s,v1.4s,#25 1607 add w6,w6,w11 1608 ushr v17.4s,v2.4s,#25 1609 add w7,w7,w12 1610 ushr v21.4s,v3.4s,#25 1611 add w8,w8,w9 1612 ushr v25.4s,v4.4s,#25 1613 eor w21,w21,w5 1614 ushr v29.4s,v5.4s,#25 1615 eor w17,w17,w6 1616 sli v9.4s,v0.4s,#7 1617 
eor w19,w19,w7 1618 sli v13.4s,v1.4s,#7 1619 eor w20,w20,w8 1620 sli v17.4s,v2.4s,#7 1621 ror w21,w21,#24 1622 sli v21.4s,v3.4s,#7 1623 ror w17,w17,#24 1624 sli v25.4s,v4.4s,#7 1625 ror w19,w19,#24 1626 sli v29.4s,v5.4s,#7 1627 ror w20,w20,#24 1628 ext v10.16b,v10.16b,v10.16b,#8 1629 add w15,w15,w21 1630 ext v14.16b,v14.16b,v14.16b,#8 1631 add w16,w16,w17 1632 ext v18.16b,v18.16b,v18.16b,#8 1633 add w13,w13,w19 1634 ext v22.16b,v22.16b,v22.16b,#8 1635 add w14,w14,w20 1636 ext v26.16b,v26.16b,v26.16b,#8 1637 eor w10,w10,w15 1638 ext v30.16b,v30.16b,v30.16b,#8 1639 eor w11,w11,w16 1640 ext v11.16b,v11.16b,v11.16b,#12 1641 eor w12,w12,w13 1642 ext v15.16b,v15.16b,v15.16b,#12 1643 eor w9,w9,w14 1644 ext v19.16b,v19.16b,v19.16b,#12 1645 ror w10,w10,#25 1646 ext v23.16b,v23.16b,v23.16b,#12 1647 ror w11,w11,#25 1648 ext v27.16b,v27.16b,v27.16b,#12 1649 ror w12,w12,#25 1650 ext v31.16b,v31.16b,v31.16b,#12 1651 ror w9,w9,#25 1652 ext v9.16b,v9.16b,v9.16b,#4 1653 ext v13.16b,v13.16b,v13.16b,#4 1654 ext v17.16b,v17.16b,v17.16b,#4 1655 ext v21.16b,v21.16b,v21.16b,#4 1656 ext v25.16b,v25.16b,v25.16b,#4 1657 ext v29.16b,v29.16b,v29.16b,#4 1658 add v8.4s,v8.4s,v9.4s 1659 add w5,w5,w9 1660 add v12.4s,v12.4s,v13.4s 1661 add w6,w6,w10 1662 add v16.4s,v16.4s,v17.4s 1663 add w7,w7,w11 1664 add v20.4s,v20.4s,v21.4s 1665 add w8,w8,w12 1666 add v24.4s,v24.4s,v25.4s 1667 eor w17,w17,w5 1668 add v28.4s,v28.4s,v29.4s 1669 eor w19,w19,w6 1670 eor v11.16b,v11.16b,v8.16b 1671 eor w20,w20,w7 1672 eor v15.16b,v15.16b,v12.16b 1673 eor w21,w21,w8 1674 eor v19.16b,v19.16b,v16.16b 1675 ror w17,w17,#16 1676 eor v23.16b,v23.16b,v20.16b 1677 ror w19,w19,#16 1678 eor v27.16b,v27.16b,v24.16b 1679 ror w20,w20,#16 1680 eor v31.16b,v31.16b,v28.16b 1681 ror w21,w21,#16 1682 rev32 v11.8h,v11.8h 1683 add w13,w13,w17 1684 rev32 v15.8h,v15.8h 1685 add w14,w14,w19 1686 rev32 v19.8h,v19.8h 1687 add w15,w15,w20 1688 rev32 v23.8h,v23.8h 1689 add w16,w16,w21 1690 rev32 v27.8h,v27.8h 1691 eor w9,w9,w13 1692 rev32 
v31.8h,v31.8h 1693 eor w10,w10,w14 1694 add v10.4s,v10.4s,v11.4s 1695 eor w11,w11,w15 1696 add v14.4s,v14.4s,v15.4s 1697 eor w12,w12,w16 1698 add v18.4s,v18.4s,v19.4s 1699 ror w9,w9,#20 1700 add v22.4s,v22.4s,v23.4s 1701 ror w10,w10,#20 1702 add v26.4s,v26.4s,v27.4s 1703 ror w11,w11,#20 1704 add v30.4s,v30.4s,v31.4s 1705 ror w12,w12,#20 1706 eor v0.16b,v9.16b,v10.16b 1707 add w5,w5,w9 1708 eor v1.16b,v13.16b,v14.16b 1709 add w6,w6,w10 1710 eor v2.16b,v17.16b,v18.16b 1711 add w7,w7,w11 1712 eor v3.16b,v21.16b,v22.16b 1713 add w8,w8,w12 1714 eor v4.16b,v25.16b,v26.16b 1715 eor w17,w17,w5 1716 eor v5.16b,v29.16b,v30.16b 1717 eor w19,w19,w6 1718 ushr v9.4s,v0.4s,#20 1719 eor w20,w20,w7 1720 ushr v13.4s,v1.4s,#20 1721 eor w21,w21,w8 1722 ushr v17.4s,v2.4s,#20 1723 ror w17,w17,#24 1724 ushr v21.4s,v3.4s,#20 1725 ror w19,w19,#24 1726 ushr v25.4s,v4.4s,#20 1727 ror w20,w20,#24 1728 ushr v29.4s,v5.4s,#20 1729 ror w21,w21,#24 1730 sli v9.4s,v0.4s,#12 1731 add w13,w13,w17 1732 sli v13.4s,v1.4s,#12 1733 add w14,w14,w19 1734 sli v17.4s,v2.4s,#12 1735 add w15,w15,w20 1736 sli v21.4s,v3.4s,#12 1737 add w16,w16,w21 1738 sli v25.4s,v4.4s,#12 1739 eor w9,w9,w13 1740 sli v29.4s,v5.4s,#12 1741 eor w10,w10,w14 1742 add v8.4s,v8.4s,v9.4s 1743 eor w11,w11,w15 1744 add v12.4s,v12.4s,v13.4s 1745 eor w12,w12,w16 1746 add v16.4s,v16.4s,v17.4s 1747 ror w9,w9,#25 1748 add v20.4s,v20.4s,v21.4s 1749 ror w10,w10,#25 1750 add v24.4s,v24.4s,v25.4s 1751 ror w11,w11,#25 1752 add v28.4s,v28.4s,v29.4s 1753 ror w12,w12,#25 1754 eor v11.16b,v11.16b,v8.16b 1755 add w5,w5,w10 1756 eor v15.16b,v15.16b,v12.16b 1757 add w6,w6,w11 1758 eor v19.16b,v19.16b,v16.16b 1759 add w7,w7,w12 1760 eor v23.16b,v23.16b,v20.16b 1761 add w8,w8,w9 1762 eor v27.16b,v27.16b,v24.16b 1763 eor w21,w21,w5 1764 eor v31.16b,v31.16b,v28.16b 1765 eor w17,w17,w6 1766 tbl v11.16b,{v11.16b},v6.16b 1767 eor w19,w19,w7 1768 tbl v15.16b,{v15.16b},v6.16b 1769 eor w20,w20,w8 1770 tbl v19.16b,{v19.16b},v6.16b 1771 ror w21,w21,#16 1772 tbl 
v23.16b,{v23.16b},v6.16b 1773 ror w17,w17,#16 1774 tbl v27.16b,{v27.16b},v6.16b 1775 ror w19,w19,#16 1776 tbl v31.16b,{v31.16b},v6.16b 1777 ror w20,w20,#16 1778 add v10.4s,v10.4s,v11.4s 1779 add w15,w15,w21 1780 add v14.4s,v14.4s,v15.4s 1781 add w16,w16,w17 1782 add v18.4s,v18.4s,v19.4s 1783 add w13,w13,w19 1784 add v22.4s,v22.4s,v23.4s 1785 add w14,w14,w20 1786 add v26.4s,v26.4s,v27.4s 1787 eor w10,w10,w15 1788 add v30.4s,v30.4s,v31.4s 1789 eor w11,w11,w16 1790 eor v0.16b,v9.16b,v10.16b 1791 eor w12,w12,w13 1792 eor v1.16b,v13.16b,v14.16b 1793 eor w9,w9,w14 1794 eor v2.16b,v17.16b,v18.16b 1795 ror w10,w10,#20 1796 eor v3.16b,v21.16b,v22.16b 1797 ror w11,w11,#20 1798 eor v4.16b,v25.16b,v26.16b 1799 ror w12,w12,#20 1800 eor v5.16b,v29.16b,v30.16b 1801 ror w9,w9,#20 1802 ushr v9.4s,v0.4s,#25 1803 add w5,w5,w10 1804 ushr v13.4s,v1.4s,#25 1805 add w6,w6,w11 1806 ushr v17.4s,v2.4s,#25 1807 add w7,w7,w12 1808 ushr v21.4s,v3.4s,#25 1809 add w8,w8,w9 1810 ushr v25.4s,v4.4s,#25 1811 eor w21,w21,w5 1812 ushr v29.4s,v5.4s,#25 1813 eor w17,w17,w6 1814 sli v9.4s,v0.4s,#7 1815 eor w19,w19,w7 1816 sli v13.4s,v1.4s,#7 1817 eor w20,w20,w8 1818 sli v17.4s,v2.4s,#7 1819 ror w21,w21,#24 1820 sli v21.4s,v3.4s,#7 1821 ror w17,w17,#24 1822 sli v25.4s,v4.4s,#7 1823 ror w19,w19,#24 1824 sli v29.4s,v5.4s,#7 1825 ror w20,w20,#24 1826 ext v10.16b,v10.16b,v10.16b,#8 1827 add w15,w15,w21 1828 ext v14.16b,v14.16b,v14.16b,#8 1829 add w16,w16,w17 1830 ext v18.16b,v18.16b,v18.16b,#8 1831 add w13,w13,w19 1832 ext v22.16b,v22.16b,v22.16b,#8 1833 add w14,w14,w20 1834 ext v26.16b,v26.16b,v26.16b,#8 1835 eor w10,w10,w15 1836 ext v30.16b,v30.16b,v30.16b,#8 1837 eor w11,w11,w16 1838 ext v11.16b,v11.16b,v11.16b,#4 1839 eor w12,w12,w13 1840 ext v15.16b,v15.16b,v15.16b,#4 1841 eor w9,w9,w14 1842 ext v19.16b,v19.16b,v19.16b,#4 1843 ror w10,w10,#25 1844 ext v23.16b,v23.16b,v23.16b,#4 1845 ror w11,w11,#25 1846 ext v27.16b,v27.16b,v27.16b,#4 1847 ror w12,w12,#25 1848 ext v31.16b,v31.16b,v31.16b,#4 1849 ror 
w9,w9,#25 1850 ext v9.16b,v9.16b,v9.16b,#12 1851 ext v13.16b,v13.16b,v13.16b,#12 1852 ext v17.16b,v17.16b,v17.16b,#12 1853 ext v21.16b,v21.16b,v21.16b,#12 1854 ext v25.16b,v25.16b,v25.16b,#12 1855 ext v29.16b,v29.16b,v29.16b,#12 1856 cbnz x4,.Loop_lower_neon 1857 1858 add w5,w5,w22 // accumulate key block 1859 ldp q0,q1,[sp,#0] 1860 add x6,x6,x22,lsr#32 1861 ldp q2,q3,[sp,#32] 1862 add w7,w7,w23 1863 ldp q4,q5,[sp,#64] 1864 add x8,x8,x23,lsr#32 1865 ldr q6,[sp,#96] 1866 add v8.4s,v8.4s,v0.4s 1867 add w9,w9,w24 1868 add v12.4s,v12.4s,v0.4s 1869 add x10,x10,x24,lsr#32 1870 add v16.4s,v16.4s,v0.4s 1871 add w11,w11,w25 1872 add v20.4s,v20.4s,v0.4s 1873 add x12,x12,x25,lsr#32 1874 add v24.4s,v24.4s,v0.4s 1875 add w13,w13,w26 1876 add v28.4s,v28.4s,v0.4s 1877 add x14,x14,x26,lsr#32 1878 add v10.4s,v10.4s,v2.4s 1879 add w15,w15,w27 1880 add v14.4s,v14.4s,v2.4s 1881 add x16,x16,x27,lsr#32 1882 add v18.4s,v18.4s,v2.4s 1883 add w17,w17,w28 1884 add v22.4s,v22.4s,v2.4s 1885 add x19,x19,x28,lsr#32 1886 add v26.4s,v26.4s,v2.4s 1887 add w20,w20,w30 1888 add v30.4s,v30.4s,v2.4s 1889 add x21,x21,x30,lsr#32 1890 add v27.4s,v27.4s,v7.4s // +4 1891 add x5,x5,x6,lsl#32 // pack 1892 add v31.4s,v31.4s,v7.4s // +4 1893 add x7,x7,x8,lsl#32 1894 add v11.4s,v11.4s,v3.4s 1895 ldp x6,x8,[x1,#0] // load input 1896 add v15.4s,v15.4s,v4.4s 1897 add x9,x9,x10,lsl#32 1898 add v19.4s,v19.4s,v5.4s 1899 add x11,x11,x12,lsl#32 1900 add v23.4s,v23.4s,v6.4s 1901 ldp x10,x12,[x1,#16] 1902 add v27.4s,v27.4s,v3.4s 1903 add x13,x13,x14,lsl#32 1904 add v31.4s,v31.4s,v4.4s 1905 add x15,x15,x16,lsl#32 1906 add v9.4s,v9.4s,v1.4s 1907 ldp x14,x16,[x1,#32] 1908 add v13.4s,v13.4s,v1.4s 1909 add x17,x17,x19,lsl#32 1910 add v17.4s,v17.4s,v1.4s 1911 add x20,x20,x21,lsl#32 1912 add v21.4s,v21.4s,v1.4s 1913 ldp x19,x21,[x1,#48] 1914 add v25.4s,v25.4s,v1.4s 1915 add x1,x1,#64 1916 add v29.4s,v29.4s,v1.4s 1917 1918#ifdef __AARCH64EB__ 1919 rev x5,x5 1920 rev x7,x7 1921 rev x9,x9 1922 rev x11,x11 1923 rev x13,x13 1924 rev 
x15,x15 1925 rev x17,x17 1926 rev x20,x20 1927#endif 1928 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1929 eor x5,x5,x6 1930 eor x7,x7,x8 1931 eor x9,x9,x10 1932 eor x11,x11,x12 1933 eor x13,x13,x14 1934 eor v8.16b,v8.16b,v0.16b 1935 eor x15,x15,x16 1936 eor v9.16b,v9.16b,v1.16b 1937 eor x17,x17,x19 1938 eor v10.16b,v10.16b,v2.16b 1939 eor x20,x20,x21 1940 eor v11.16b,v11.16b,v3.16b 1941 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1942 1943 stp x5,x7,[x0,#0] // store output 1944 add x28,x28,#7 // increment counter 1945 stp x9,x11,[x0,#16] 1946 stp x13,x15,[x0,#32] 1947 stp x17,x20,[x0,#48] 1948 add x0,x0,#64 1949 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1950 1951 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1952 eor v12.16b,v12.16b,v0.16b 1953 eor v13.16b,v13.16b,v1.16b 1954 eor v14.16b,v14.16b,v2.16b 1955 eor v15.16b,v15.16b,v3.16b 1956 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1957 1958 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1959 eor v16.16b,v16.16b,v8.16b 1960 ldp q0,q1,[sp,#0] 1961 eor v17.16b,v17.16b,v9.16b 1962 ldp q2,q3,[sp,#32] 1963 eor v18.16b,v18.16b,v10.16b 1964 eor v19.16b,v19.16b,v11.16b 1965 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1966 1967 ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 1968 eor v20.16b,v20.16b,v12.16b 1969 eor v21.16b,v21.16b,v13.16b 1970 eor v22.16b,v22.16b,v14.16b 1971 eor v23.16b,v23.16b,v15.16b 1972 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1973 1974 ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 1975 eor v24.16b,v24.16b,v16.16b 1976 eor v25.16b,v25.16b,v17.16b 1977 eor v26.16b,v26.16b,v18.16b 1978 eor v27.16b,v27.16b,v19.16b 1979 st1 {v24.16b,v25.16b,v26.16b,v27.16b},[x0],#64 1980 1981 shl v8.4s,v7.4s,#1 // 4 -> 8 1982 eor v28.16b,v28.16b,v20.16b 1983 eor v29.16b,v29.16b,v21.16b 1984 eor v30.16b,v30.16b,v22.16b 1985 eor v31.16b,v31.16b,v23.16b 1986 st1 {v28.16b,v29.16b,v30.16b,v31.16b},[x0],#64 1987 1988 add v3.4s,v3.4s,v8.4s // += 8 1989 add v4.4s,v4.4s,v8.4s 1990 add v5.4s,v5.4s,v8.4s 1991 add 
v6.4s,v6.4s,v8.4s 1992 1993 b.hs .Loop_outer_512_neon 1994 1995 adds x2,x2,#512 1996 ushr v7.4s,v7.4s,#1 // 4 -> 2 1997 1998 ldp d10,d11,[sp,#128+16] // meet ABI requirements 1999 ldp d12,d13,[sp,#128+32] 2000 ldp d14,d15,[sp,#128+48] 2001 2002 stp q0,q0,[sp,#0] // wipe off-load area 2003 stp q0,q0,[sp,#32] 2004 stp q0,q0,[sp,#64] 2005 2006 b.eq .Ldone_512_neon 2007 2008 sub x3,x3,#16 // .Lone 2009 cmp x2,#192 2010 add sp,sp,#128 2011 sub v3.4s,v3.4s,v7.4s // -= 2 2012 ld1 {v8.4s,v9.4s},[x3] 2013 b.hs .Loop_outer_neon 2014 2015 ldp d8,d9,[sp,#0] // meet ABI requirements 2016 eor v1.16b,v1.16b,v1.16b 2017 eor v2.16b,v2.16b,v2.16b 2018 eor v3.16b,v3.16b,v3.16b 2019 eor v4.16b,v4.16b,v4.16b 2020 eor v5.16b,v5.16b,v5.16b 2021 eor v6.16b,v6.16b,v6.16b 2022 b .Loop_outer 2023 2024.Ldone_512_neon: 2025 ldp d8,d9,[sp,#128+0] // meet ABI requirements 2026 ldp x19,x20,[x29,#16] 2027 add sp,sp,#128+64 2028 ldp x21,x22,[x29,#32] 2029 ldp x23,x24,[x29,#48] 2030 ldp x25,x26,[x29,#64] 2031 ldp x27,x28,[x29,#80] 2032 ldp x29,x30,[sp],#96 2033.inst 0xd50323bf // autiasp 2034 ret 2035.size ChaCha20_512_neon,.-ChaCha20_512_neon 2036