/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.globl	poly1305_blocks
.globl	poly1305_emit

.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
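
// The helper below repeats the multiply/reduce step of .Loop so the NEON
// code paths can call into it.  As an informal sketch (limb notation is
// ours, not the generator script's): with h = h0 + h1*2^64 + h2*2^128,
// r = r0 + r1*2^64 and p = 2^130 - 5, the high partial products fold
// back using 2^130 == 5 (mod p):
//
//	h1*r1*2^128 == h1*(r1>>2)*5	(key clamping zeroes r1's low 2 bits)
//	h2*r1*2^192 == h2*(r1>>2)*5*2^64
//
// hence the callers' s1 = r1 + (r1>>2) = 5*(r1>>2) in x9.  The "final
// reduction" applies the same identity to the top limb c in x14: its
// bits above bit 1 contribute (c & ~3) + (c >> 2) back into h0.
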
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat
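
// poly1305_splat stores one power of r per 32-bit lane: the five 26-bit
// limbs r0..r4 plus the premultiplied s_i = 5*r_i, at a 16-byte stride,
// nine rows in all.  Filled four times below (r^1 in lane 3 down to r^4
// in lane 0), this table is what the NEON loop loads into v0..v8.
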
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14
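
// From here on two 130-bit accumulators are carried side by side:
// even-numbered blocks in vector lane 0, odd-numbered ones in lane 1.
// x16 points at inp[2:3], or at .Lzeros once fewer than four blocks
// remain, so the tail reuses the main path.  The padbit in x3 is
// positioned at bit 24 so that it supplies bit 128 of each block when
// added to the fifth 26-bit limb (block bits 104-129).
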
.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12

	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	//   ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	//   ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10
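
	// The scalar limb-splitting of the next inputs is deliberately
	// interleaved with the vector multiplies above and below: the
	// x-register and v-register streams are independent, so both
	// pipelines stay busy.  v31, set earlier to 2^26-1 (all-ones
	// shifted right by 38), is the limb mask used by the reductions.
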
	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi	x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi	x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi	x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi	x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi	x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12
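
	// In the reduction below each carry is c_i = h_i >> 26, and h4's
	// carry wraps around to h0 as 5*c4 (again 2^130 == 5 mod p),
	// computed as c4 + (c4 << 2).  Limbs are only mostly normalized,
	// possibly keeping a few bits beyond 26, which the next
	// iteration's 64-bit accumulators absorb without overflow; see
	// the poly1305-armv4 note referenced below.
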
	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]

	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s
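
	// .Lshort_tail folds the two lanes together: each addp sums the
	// even and odd halves of an accumulator pairwise, then one more
	// lazy reduction (without narrowing to 32 bits this time)
	// precedes the store.  As noted below, the stored base 2^26 hash
	// may still be partially reduced; poly1305_emit_neon handles that.
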
.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
.inst	0xd50323bf		// autiasp
	ldr	x29,[sp],#80
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2