/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
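////////////////////////////////////////////////////////////////
// Scalar helpers used by the NEON code path below.
//
// poly1305_mult is the same base 2^64 multiply-and-reduce step as
// the scalar .Loop above, factored out as a subroutine: it replaces
// the accumulator h2:h1:h0 (x6:x5:x4) with h*r reduced modulo
// 2^130-5 (the result may remain partially reduced), with r1:r0 in
// x8:x7 and s1 = r1+(r1>>2) = 5*r1/4 precomputed in x9 (r1 is
// clamped to a multiple of 4).  A rough sketch of the arithmetic,
// with illustrative names only:
//
//	d0 = h0*r0 + h1*s1			// 64x64->128
//	d1 = h0*r1 + h1*r0 + h2*s1 + hi(d0)
//	d2 = h2*r0 + hi(d1)
//	h0 = lo(d0) + 5*(d2>>2)			// fold bits 130+ back in
//	h1 = lo(d1) + carry
//	h2 = (d2&3) + carry
//
// poly1305_splat converts a power of r from base 2^64 (x6:x5:x4)
// to five 26-bit limbs and stores them, together with the
// corresponding 5*r values, at x0 with a 16-byte stride.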
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat

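////////////////////////////////////////////////////////////////
// NEON code path.  It is entered for inputs of at least 128 bytes,
// or whenever the hash is already in base 2^26 (is_base2_26 at
// [x0,#24]); anything else tail-calls the scalar poly1305_blocks.
// Inside, the hash is kept as five 26-bit limbs and four 16-byte
// blocks are processed per iteration against the r^1..r^4 table
// initialized below; a single odd leading block is handled with
// the scalar poly1305_mult first.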
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

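	////////////////////////////////////////////////////////////////
	// .Ldo_neon: x16 points at inp[2:3], or at .Lzeros once fewer
	// than 64 bytes remain.  Each 16-byte block is split into five
	// 26-bit limbs, the padding bit (x3, shifted left by 24 below so
	// it lands at bit 128) is added on top of the fifth limb, and the
	// limbs of two blocks are packed into one 64-bit value each (the
	// add ...,lsl#32 lines emulate bfi) before being moved to d9-d18.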
.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

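	////////////////////////////////////////////////////////////////
	// Main loop: the vector multiplies below are interleaved with
	// scalar code that loads the next iteration's four blocks and
	// converts them to base 2^26, presumably to hide the multiply
	// latency; the freshly converted limbs only become inputs one
	// iteration later.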
.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

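	// Carries are propagated only one limb forward per step, and the
	// h4 -> h0 carry is folded back in multiplied by 5, so the limbs
	// below stay only approximately 26 bits wide; full reduction is
	// deferred to poly1305_emit_neon.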
	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s

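	////////////////////////////////////////////////////////////////
	// .Lshort_tail is reached for both the 2-block and the 4-block
	// tail: fold the two 64-bit lanes of each accumulator together
	// with addp, restore the callee-saved d8-d15 registers, run one
	// more lazy reduction and store the (possibly still partially
	// reduced) base 2^26 hash back to the context.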
.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2