/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
// NOTE(review): any edit made here should be mirrored in the generator
// script named above, otherwise it is presumably lost on regeneration.
//
// Poly1305 for AArch64, GNU assembler syntax, passed through the C
// preprocessor (see the include and the ifdef blocks below).
//
// Context layout as used by the code in this file (x0 = context):
//	[x0,#0]  .. [x0,#16]	hash value; either three 64-bit words
//				(base 2^64) or five 32-bit limbs (base 2^26)
//	[x0,#24]		is_base2_26 flag (64-bit, zero = base 2^64)
//	[x0,#32] .. [x0,#47]	clamped key r0,r1
//	[x0,#48] .. [x0,#191]	nine 16-byte rows r0,r1,s1,r2,s2,r3,s3,r4,s4
//				(s = 5*r); 32-bit lanes 0..3 of each row
//				hold the limb for r^4,r^3,r^2,r^1
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

////////////////////////////////////////////////////////////////////////
// poly1305_init
// In:	x0 = context, x1 = pointer to 16 key bytes (may be NULL),
//	x2 = pointer to a two-entry table that receives the addresses
//	     of the blocks and emit routines selected for this CPU
// Out:	x0 = 0 if x1 was NULL (context hash is still zeroed),
//	     1 otherwise
// Clobbers: x7-x9, x12, x13, x17, flags
.type	poly1305_init,%function
.align	5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq		// return value 0 when key is NULL
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON		// Z set when the NEON bit is clear

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq		// scalar routines without NEON,
	csel	x13,x13,x8,eq		// NEON routines otherwise

#ifdef	__ILP32__
	stp	w12,w13,[x2]		// 32-bit pointers
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

////////////////////////////////////////////////////////////////////////
// poly1305_blocks (scalar, hash kept in base 2^64)
// In:	x0 = context, x1 = input, x2 = length in bytes (rounded down
//	to a multiple of 16 here), x3 = value added into the top hash
//	word for every block (presumably the 2^128 pad bit, to be
//	confirmed against the caller)
// Out:	hash in the context updated; x1 advanced past consumed input
// Clobbers: x2, x4-x14, flags
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

////////////////////////////////////////////////////////////////////////
// poly1305_emit (scalar, hash kept in base 2^64)
// In:	x0 = context, x1 = 16-byte output buffer, x2 = pointer to
//	16 nonce bytes
// Out:	the low 128 bits of (hash mod 2^130-5) + nonce written to [x1]
// Clobbers: x4-x6, x10-x14, flags
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .Lpoly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq		// keep h when h+5 stays below 2^130,
	csel	x5,x5,x13,eq		// otherwise take h+5 (mod 2^128)

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit

////////////////////////////////////////////////////////////////////////
// poly1305_mult (file-local helper, no .globl)
// In:	x4,x5,x6 = h0,h1,h2; x7,x8 = r0,r1; x9 = s1 = r1 + (r1 >> 2)
// Out:	x4,x5,x6 = h*r, partially reduced
// Clobbers: x10-x14, flags
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

////////////////////////////////////////////////////////////////////////
// poly1305_splat (file-local helper, no .globl)
// In:	x4,x5,x6 = value in base 2^64; x0 = address of the 32-bit lane
//	in the first table row that receives limb 0
// Out:	five 26-bit limbs and the 5x multiples of limbs 1..4 stored as
//	32-bit words, one per 16-byte row, in the order
//	r0,r1,s1,r2,s2,r3,s3,r4,s4
// Clobbers: x12-x16 (x4-x6 and x0 are preserved)
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

////////////////////////////////////////////////////////////////////////
// poly1305_blocks_neon
// In:	x0 = context, x1 = input, x2 = length in bytes, x3 = value
//	added at bit 128 of every block (same role as in
//	poly1305_blocks)
// Behaviour visible in the code below:
//   - length below 128 with the hash still in base 2^64 is handed to
//     the scalar routine;
//   - otherwise an 80-byte frame is created: x29,x30 at [sp],
//     d8-d15 at [sp,#16] .. [sp,#79];
//   - an odd 16-byte block, if any, is processed with the scalar
//     multiply first, then the remainder is processed by the vector
//     code, which leaves the hash in base 2^26 with is_base2_26 set;
//   - x3 == 0 on the odd-block path stores the hash back in
//     base 2^64 and clears is_base2_26.
// Vector register roles:
//	v0-v8	table rows r0,r1,s1,r2,s2,r3,s3,r4,s4
//	v9-v13	limbs of inp[0:1], v14-v18 limbs of inp[2:3]
//	v19-v23	64-bit accumulators d0..d4
//	v24-v28	hash limbs h0..h4
//	v29,v30	carries, v31 = 0x03ffffff mask in each 64-bit lane
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]		// is_base2_26
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks	// short input, still base 2^64

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]		// bl overwrote the link register

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12		// lane 3 of the first table row
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4		// lane 2
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4		// lane 1
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4		// lane 0
	bl	poly1305_splat
	ldr	x30,[sp,#8]		// bl overwrote the link register

	add	x16,x1,#32		// x16 -> inp[2:3]
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo		// read zeros when under 64 bytes left

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32		// x16 -> inp[2:3]
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo		// read zeros when under 64 bytes left

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	// The flags set by "subs x2,x2,#64" above stay live until the
	// "b.ls .Lskip_loop" below; nothing in between writes flags.
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24		// position of bit 128 inside limb 4
	add	x15,x0,#48		// x15 -> r^n table

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38	// v31 = 0x03ffffff per 64-bit lane

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1
	//
	// Scalar loads and limb splitting of the next 64 input bytes
	// are interleaved with the vector multiplies. The flags from
	// the subs below are consumed by the csel right after it and
	// by the "b.hi .Loop_neon" at the bottom of the loop.

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
	//
	// The flags from the adds below are consumed twice: by the
	// b.ne right after it and by "b.eq .Lshort_tail" further down;
	// no instruction in between writes flags.

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80		// x30 was already reloaded, or never
					// overwritten, on every path here
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

////////////////////////////////////////////////////////////////////////
// poly1305_emit_neon
// In:	x0 = context, x1 = 16-byte output buffer, x2 = pointer to
//	16 nonce bytes
// Out:	same result as poly1305_emit; when is_base2_26 is set the hash
//	is first converted from base 2^26 to base 2^64 and reduced
// Clobbers: x4-x6, x10-x14, x17, flags
.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]		// is_base2_26
	// Fall back through the local alias, matching the way
	// poly1305_blocks_neon reaches .Lpoly1305_blocks.
	cbz	x17,.Lpoly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

// 32 zero bytes read in place of inp[2:3] by poly1305_blocks_neon
// when fewer than 64 input bytes remain.
.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
// ASCII: "Poly1305 for ARMv8, CRYPTOGAMS by <appro@openssl.org>", NUL
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2