/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	AARCH64_VALID_CALL_TARGET
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

	adrp	x17,OPENSSL_armcap_P
	ldr	w17,[x17,#:lo12:OPENSSL_armcap_P]

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
#ifdef	__AARCH64EB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adrp	x12,poly1305_blocks
	add	x12,x12,#:lo12:.Lpoly1305_blocks
	adrp	x7,poly1305_blocks_neon
	add	x7,x7,#:lo12:.Lpoly1305_blocks_neon
	adrp	x13,poly1305_emit
	add	x13,x13,#:lo12:.Lpoly1305_emit
	adrp	x8,poly1305_emit_neon
	add	x8,x8,#:lo12:.Lpoly1305_emit_neon

	csel	x12,x12,x7,eq		// NEON unavailable -> scalar entries
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	// The symbol .Lpoly1305_blocks is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__AARCH64EB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	// The symbol .Lpoly1305_emit is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
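
// poly1305_mult below computes h = h * r mod 2^130-5 on the scalar limbs
// h0,h1,h2 (x4-x6, base 2^64), with r0,r1 in x7,x8 and s1 = r1 + (r1>>2)
// in x9; it is the same schoolbook multiply as the .Loop body above.  As a
// worked sketch of the reduction (2^130 == 5 mod p, and the clamped r1 is a
// multiple of 4, so s1 = 5*r1/4 is exact):
//
//	d0 = h0*r0 + h1*s1		// h1*r1*2^128 folds back as h1*s1
//	d1 = h0*r1 + h1*r0 + h2*s1	// h2*r1*2^192 folds back as h2*s1*2^64
//	d2 = h2*r0 + carries
//
// and the final reduction keeps d2&3 as the new h2 while folding
// 5*(d2>>2) = (d2&~3) + (d2>>2) back into the low limb.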
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat
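
// poly1305_splat above converts one power of r to base 2^26 and scatters
// its limbs (r0..r4 plus the precomputed 5*r_i "s" values) into a table of
// 16-byte cells.  The init code below calls it four times, moving the base
// down 4 bytes each time, so every cell ends up holding the same limb of
// r^4,r^3,r^2,r^1 in its four 32-bit lanes; e.g. for the r0 cell loaded
// into v0:
//
//	v0.s[0] = r0 of r^4,  v0.s[1] = r0 of r^3,
//	v0.s[2] = r0 of r^2,  v0.s[3] = r0 of r^1
//
// which is the lane order .Loop_neon indexes with .s[0] (times r^4) and
// .s[2] (times r^2), while the tails reach r^2:r^1 via umull2/umlal2.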

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	// The symbol .Lpoly1305_blocks_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,.Lpoly1305_blocks

.Lblocks_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adrp	x17,.Lzeros
	add	x17,x17,#:lo12:.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	stur	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon
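
// On either entry path, v24-v28 now hold the current hash limbs h0..h4 in
// base 2^26 and the table at ctx+48 holds r^1..r^4.  .Ldo_neon processes
// four blocks per iteration across two 64-bit lanes: one lane accumulates
// the hash plus even-indexed blocks, the other the odd-indexed ones, which
// is why x16 is pointed at .Lzeros when fewer than four real blocks remain.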

.align	4
.Leven_neon:
	add	x16,x1,#32
	adrp	x17,.Lzeros
	add	x17,x17,#:lo12:.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3   + h2*r2   + h3*r1   + h4*r0
	// d3 = h0*r3 + h1*r2   + h2*r1   + h3*r0   + h4*5*r4
	// d2 = h0*r2 + h1*r1   + h2*r0   + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0   + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__AARCH64EB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon
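
// Loop epilogue: 32 or 64 bytes remain.  The tail below multiplies the two
// accumulator lanes by the final descending powers of r: for a 64-byte tail
// (hash+inp[0:1])*r^4:r^3 is accumulated on top of inp[2:3]*r^2:r^1, for a
// 32-byte tail just (hash+inp)*r^2:r^1.  umull2/umlal2 operate on the upper
// .4s lanes, which is where the r^2:r^1 limbs live in the table vectors.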

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s
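
// .Lshort_tail folds the two accumulator lanes together: each addp below
// adds the even-lane and odd-lane sums of one limb pairwise, after which
// only lane 0 of v19-v23 is meaningful.  The callee-saved FP registers are
// restored in the gaps between the addp's.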

.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon
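
// poly1305_emit_neon: if the state was last written by the NEON code
// (is_base2_26 set), rebuild the base 2^64 hash from the five 26-bit limbs
// first, then finish exactly like the scalar poly1305_emit: a full
// reduction, a conditional subtraction of p = 2^130-5, and the final
// nonce addition.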

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	// The symbol .Lpoly1305_emit_neon is not a .globl symbol
	// but a pointer to it is returned by poly1305_init
	AARCH64_VALID_CALL_TARGET
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__AARCH64EB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__AARCH64EB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.section	.rodata

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
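
// Context layout, as implied by the code above (byte offsets from x0):
//	 0..15	h0,h1 (base 2^64), or five packed 26-bit limbs at 0..19
//	16..23	h2
//	24..31	is_base2_26 flag, non-zero while the NEON format is live
//	32..47	clamped key r0,r1
//	48..	base 2^26 power table r^1..r^4 filled by poly1305_splat
//
// Apparent calling convention, inferred from register usage (the
// authoritative prototypes live in the C callers):
//	poly1305_init(ctx=x0, key=x1, func_table_out=x2) -> x0 = 0 or 1
//	poly1305_blocks(ctx=x0, inp=x1, len=x2, padbit=x3)
//	poly1305_emit(ctx=x0, mac_out=x1, nonce=x2)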