/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,.Lpoly1305_blocks
	adr	x7,.Lpoly1305_blocks_neon
	adr	x13,.Lpoly1305_emit
	adr	x8,.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult
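
// poly1305_mult computes h = (h * r) mod 2^130-5 on the base 2^64 limbs
// h0:h1:h2 (x4:x5:x6) and r0:r1 (x7:x8). Because r1 is clamped to a
// multiple of 4, x9 = s1 = r1 + (r1 >> 2) = 5*r1/4 exactly, which lets the
// h1*r1 term that belongs at 2^128 (== 5/4 mod 2^130-5) be folded in as
// h1*s1 at 2^0. The final reduction uses 2^130 == 5 the same way: the bits
// of x14 above the two kept in h2 are re-added to h0 as
// (x14 & -4) + (x14 >> 2) = 5 * (x14 >> 2).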
.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat
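
// poly1305_splat converts one power of r from base 2^64 to base 2^26
// (five 26-bit limbs r0..r4) and stores it at a 16-byte stride, along with
// the precomputed s_i = 5*r_i for i > 0 used to wrap products overflowing
// 2^130 back around modulo 2^130-5. Successive calls in .Linit_neon below
// fill adjacent 32-bit slots, interleaving the tables for r^4..r^1.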
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon
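
	////////////////////////////////////////////////////////////////
	// at this point the table at x0+48 holds r^1..r^4 in base 2^26,
	// interleaved so that a single .4s load below (v0-v8) yields the
	// same limb of all four powers; the main loop consumes four
	// blocks per iteration, two per 64-bit vector lane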

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop
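
	////////////////////////////////////////////////////////////////
	// the splitting above turns each pair of 16-byte blocks into
	// five 26-bit limbs packed two blocks per register (low/high
	// 32-bit halves), with the padbit (x3, pre-shifted left by 24)
	// landing at bit 128 of each 130-bit block value; v31 holds the
	// 0x03ffffff limb mask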

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon
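
	////////////////////////////////////////////////////////////////
	// the lazy reduction above drives two interleaved carry chains,
	// h3->h4->h0->h1 and h0->h1->h2->h3->h4, where the h4->h0 wrap
	// multiplies the carry by 5 (carry + carry<<2) since 2^130 == 5;
	// limbs may stay slightly above 26 bits, which the 64-bit
	// accumulators of the next multiplication absorb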

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s
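
	////////////////////////////////////////////////////////////////
	// by now each lane has been multiplied by its final power of r
	// (r^4:r^3 plus inp[2:3]*r^2:r^1 when four blocks remained,
	// just r^2:r^1 otherwise), so the even and odd streams can be
	// folded with a pairwise add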

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2