/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv8.pl. */
#include "arm_arch.h"

.text

// forward "declarations" are required for Apple

.hidden	OPENSSL_armcap_P
.globl	poly1305_init
.hidden	poly1305_init
.globl	poly1305_blocks
.hidden	poly1305_blocks
.globl	poly1305_emit
.hidden	poly1305_emit

.type	poly1305_init,%function
.align	5
poly1305_init:
	cmp	x1,xzr
	stp	xzr,xzr,[x0]		// zero hash value
	stp	xzr,xzr,[x0,#16]	// [along with is_base2_26]

	csel	x0,xzr,x0,eq
	b.eq	.Lno_key

#ifdef	__ILP32__
	ldrsw	x11,.LOPENSSL_armcap_P
#else
	ldr	x11,.LOPENSSL_armcap_P
#endif
	adr	x10,.LOPENSSL_armcap_P

	ldp	x7,x8,[x1]		// load key
	mov	x9,#0xfffffffc0fffffff
	movk	x9,#0x0fff,lsl#48
	ldr	w17,[x10,x11]
#ifdef	__ARMEB__
	rev	x7,x7			// flip bytes
	rev	x8,x8
#endif
	and	x7,x7,x9		// &=0ffffffc0fffffff
	and	x9,x9,#-4
	and	x8,x8,x9		// &=0ffffffc0ffffffc
	stp	x7,x8,[x0,#32]		// save key value

	tst	w17,#ARMV7_NEON

	adr	x12,poly1305_blocks
	adr	x7,poly1305_blocks_neon
	adr	x13,poly1305_emit
	adr	x8,poly1305_emit_neon

	csel	x12,x12,x7,eq
	csel	x13,x13,x8,eq

#ifdef	__ILP32__
	stp	w12,w13,[x2]
#else
	stp	x12,x13,[x2]
#endif

	mov	x0,#1
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
	ands	x2,x2,#-16
	b.eq	.Lno_data

	ldp	x4,x5,[x0]		// load hash value
	ldp	x7,x8,[x0,#32]		// load key value
	ldr	x6,[x0,#16]
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	b	.Loop

.align	5
.Loop:
	ldp	x10,x11,[x1],#16	// load input
	sub	x2,x2,#16
#ifdef	__ARMEB__
	rev	x10,x10
	rev	x11,x11
#endif
	adds	x4,x4,x10		// accumulate input
	adcs	x5,x5,x11

	mul	x12,x4,x7		// h0*r0
	adc	x6,x6,x3
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	cbnz	x2,.Loop

	stp	x4,x5,[x0]		// store hash value
	str	x6,[x0,#16]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,%function
.align	5
poly1305_emit:
	ldp	x4,x5,[x0]		// load hash base 2^64
	ldr	x6,[x0,#16]
	ldp	x10,x11,[x2]		// load nonce

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
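////////////////////////////////////////////////////////////////
// poly1305_mult computes h = h*r mod 2^130-5, with the 130-bit
// accumulator h in x4-x6 (base 2^64) and the clamped key r in
// x7-x8; x9 holds s1 = r1+(r1>>2) = 5*r1/4, which is exact
// because clamping clears the low two bits of r1. s1 folds the
// 2^128-weight product h1*r1 down to weight 2^0, using
// 2^128 == 5/4 (mod 2^130-5). An illustrative C-like sketch of
// the same multiply-and-reduce (pseudocode, not part of the
// generated output; u128 denotes a 128-bit product):
//
//	u128 t0 = (u128)h0*r0 + (u128)h1*s1;
//	u128 t1 = (u128)h0*r1 + (u128)h1*r0 + h2*s1 + (t0>>64);
//	u64  t2 = h2*r0 + (u64)(t1>>64);  // h2 is only a few bits
//	c  = (t2 & ~3) + (t2>>2);         // 5*(t2>>2), as 2^130 == 5
//	h0 = (u64)t0 + c;                 // carries ripple into
//	h1 = (u64)t1 + carry;             // h1 and h2
//	h2 = (t2 & 3) + carry;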
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	x12,x4,x7		// h0*r0
	umulh	x13,x4,x7

	mul	x10,x5,x9		// h1*5*r1
	umulh	x11,x5,x9

	adds	x12,x12,x10
	mul	x10,x4,x8		// h0*r1
	adc	x13,x13,x11
	umulh	x14,x4,x8

	adds	x13,x13,x10
	mul	x10,x5,x7		// h1*r0
	adc	x14,x14,xzr
	umulh	x11,x5,x7

	adds	x13,x13,x10
	mul	x10,x6,x9		// h2*5*r1
	adc	x14,x14,x11
	mul	x11,x6,x7		// h2*r0

	adds	x13,x13,x10
	adc	x14,x14,x11

	and	x10,x14,#-4		// final reduction
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x12,x10
	adcs	x5,x13,xzr
	adc	x6,x6,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

.type	poly1305_splat,%function
.align	5
poly1305_splat:
	and	x12,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,x4,#26,#26
	extr	x14,x5,x4,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,x5,#14,#26
	extr	x16,x6,x5,#40

	str	w12,[x0,#16*0]		// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[x0,#16*1]		// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[x0,#16*2]		// s1
	str	w14,[x0,#16*3]		// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[x0,#16*4]		// s2
	str	w15,[x0,#16*5]		// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[x0,#16*6]		// s3
	str	w16,[x0,#16*7]		// r4
	str	w15,[x0,#16*8]		// s4

	ret
.size	poly1305_splat,.-poly1305_splat
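////////////////////////////////////////////////////////////////
// poly1305_splat above splits one power of r (base 2^64 in
// x4-x6) into five 26-bit limbs and stores them, along with the
// premultiplied 5*r values, at a 16-byte stride. Successive
// calls from .Linit_neon below step x0 back by 4 bytes, so each
// 16-byte group ends up holding the same limb of r^4,r^3,r^2,r^1
// in adjacent lanes, loadable as one .4s vector. Illustrative
// pseudocode for the limb split (not part of the generated
// output):
//
//	r0 = r & 0x3ffffff;		// bits 0..25
//	r1 = (r >> 26) & 0x3ffffff;	// bits 26..51
//	r2 = (r >> 52) & 0x3ffffff;	// bits 52..77, spans both words
//	r3 = (r >> 78) & 0x3ffffff;	// bits 78..103
//	r4 = r >> 104;			// bits 104..129
//	s1 = r1*5; s2 = r2*5; s3 = r3*5; s4 = r4*5;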
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
	ldr	x17,[x0,#24]
	cmp	x2,#128
	b.hs	.Lblocks_neon
	cbz	x17,poly1305_blocks

.Lblocks_neon:
.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	ands	x2,x2,#-16
	b.eq	.Lno_data_neon

	cbz	x17,.Lbase2_64_neon

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	tst	x2,#31
	b.eq	.Leven_neon

	ldp	x7,x8,[x0,#32]		// load key value

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x14,x6,xzr		// can be partially reduced...

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)

	and	x10,x14,#-4		// ... so reduce
	and	x6,x14,#3
	add	x10,x10,x14,lsr#2
	adds	x4,x4,x10
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult
	ldr	x30,[sp,#8]

	cbz	x3,.Lstore_base2_64_neon

	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	cbnz	x2,.Leven_neon

	stp	w10,w11,[x0]		// store hash value base 2^26
	stp	w12,w13,[x0,#8]
	str	w14,[x0,#16]
	b	.Lno_data_neon

.align	4
.Lstore_base2_64_neon:
	stp	x4,x5,[x0]		// store hash value base 2^64
	stp	x6,xzr,[x0,#16]		// note that is_base2_26 is zeroed
	b	.Lno_data_neon

.align	4
.Lbase2_64_neon:
	ldp	x7,x8,[x0,#32]		// load key value

	ldp	x4,x5,[x0]		// load hash value base 2^64
	ldr	x6,[x0,#16]

	tst	x2,#31
	b.eq	.Linit_neon

	ldp	x12,x13,[x1],#16	// load input
	sub	x2,x2,#16
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
#ifdef	__ARMEB__
	rev	x12,x12
	rev	x13,x13
#endif
	adds	x4,x4,x12		// accumulate input
	adcs	x5,x5,x13
	adc	x6,x6,x3

	bl	poly1305_mult

.Linit_neon:
	and	x10,x4,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,x4,#26,#26
	extr	x12,x5,x4,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,x5,#14,#26
	extr	x14,x6,x5,#40

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

	////////////////////////////////// initialize r^n table
	mov	x4,x7			// r^1
	add	x9,x8,x8,lsr#2		// s1 = r1 + (r1 >> 2)
	mov	x5,x8
	mov	x6,xzr
	add	x0,x0,#48+12
	bl	poly1305_splat

	bl	poly1305_mult		// r^2
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^3
	sub	x0,x0,#4
	bl	poly1305_splat

	bl	poly1305_mult		// r^4
	sub	x0,x0,#4
	bl	poly1305_splat
	ldr	x30,[sp,#8]

	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	mov	x4,#1
	str	x4,[x0,#-24]		// set is_base2_26
	sub	x0,x0,#48		// restore original x0
	b	.Ldo_neon

.align	4
.Leven_neon:
	add	x16,x1,#32
	adr	x17,.Lzeros
	subs	x2,x2,#64
	csel	x16,x17,x16,lo

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	fmov	d24,x10
	fmov	d25,x11
	fmov	d26,x12
	fmov	d27,x13
	fmov	d28,x14

.Ldo_neon:
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	ldp	x9,x13,[x16],#48

	lsl	x3,x3,#24
	add	x15,x0,#48

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d14,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d15,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	fmov	d16,x8
	fmov	d17,x10
	fmov	d18,x12
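	////////////////////////////////////////////////////////////////
	// d14-d18 now hold the five 26-bit limbs of inp[2] and inp[3]
	// packed pairwise: inp[2] in bits 0..31 and inp[3] in bits
	// 32..63 of each register. x3 (the padbit argument) has been
	// pre-shifted to bit 24, which is where 2^128 lands inside the
	// top (2^104-weight) limb. The inp[0:1] pair below is packed
	// the same way into d9-d13.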
	ldp	x8,x12,[x1],#16		// inp[0:1]
	ldp	x9,x13,[x1],#48

	ld1	{v0.4s,v1.4s,v2.4s,v3.4s},[x15],#64
	ld1	{v4.4s,v5.4s,v6.4s,v7.4s},[x15],#64
	ld1	{v8.4s},[x15]

#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	and	x5,x9,#0x03ffffff
	ubfx	x6,x8,#26,#26
	ubfx	x7,x9,#26,#26
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	extr	x8,x12,x8,#52
	extr	x9,x13,x9,#52
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	fmov	d9,x4
	and	x8,x8,#0x03ffffff
	and	x9,x9,#0x03ffffff
	ubfx	x10,x12,#14,#26
	ubfx	x11,x13,#14,#26
	add	x12,x3,x12,lsr#40
	add	x13,x3,x13,lsr#40
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	fmov	d10,x6
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	movi	v31.2d,#-1
	fmov	d11,x8
	fmov	d12,x10
	fmov	d13,x12
	ushr	v31.2d,v31.2d,#38

	b.ls	.Lskip_loop

.align	4
.Loop_neon:
	////////////////////////////////////////////////////////////////
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	// ___________________/
	// ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	// ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	// ___________________/ ____________________/
	//
	// Note that we start with inp[2:3]*r^2. This is because it
	// doesn't depend on reduction in previous iteration.
	////////////////////////////////////////////////////////////////
	// d4 = h0*r4 + h1*r3 + h2*r2 + h3*r1 + h4*r0
	// d3 = h0*r3 + h1*r2 + h2*r1 + h3*r0 + h4*5*r4
	// d2 = h0*r2 + h1*r1 + h2*r0 + h3*5*r4 + h4*5*r3
	// d1 = h0*r1 + h1*r0 + h2*5*r4 + h3*5*r3 + h4*5*r2
	// d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1

	subs	x2,x2,#64
	umull	v23.2d,v14.2s,v7.s[2]
	csel	x16,x17,x16,lo
	umull	v22.2d,v14.2s,v5.s[2]
	umull	v21.2d,v14.2s,v3.s[2]
	ldp	x8,x12,[x16],#16	// inp[2:3] (or zero)
	umull	v20.2d,v14.2s,v1.s[2]
	ldp	x9,x13,[x16],#48
	umull	v19.2d,v14.2s,v0.s[2]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	umlal	v23.2d,v15.2s,v5.s[2]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v22.2d,v15.2s,v3.s[2]
	and	x5,x9,#0x03ffffff
	umlal	v21.2d,v15.2s,v1.s[2]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v15.2s,v0.s[2]
	ubfx	x7,x9,#26,#26
	umlal	v19.2d,v15.2s,v8.s[2]
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32

	umlal	v23.2d,v16.2s,v3.s[2]
	extr	x8,x12,x8,#52
	umlal	v22.2d,v16.2s,v1.s[2]
	extr	x9,x13,x9,#52
	umlal	v21.2d,v16.2s,v0.s[2]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v20.2d,v16.2s,v8.s[2]
	fmov	d14,x4
	umlal	v19.2d,v16.2s,v6.s[2]
	and	x8,x8,#0x03ffffff

	umlal	v23.2d,v17.2s,v1.s[2]
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v17.2s,v0.s[2]
	ubfx	x10,x12,#14,#26
	umlal	v21.2d,v17.2s,v8.s[2]
	ubfx	x11,x13,#14,#26
	umlal	v20.2d,v17.2s,v6.s[2]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v19.2d,v17.2s,v4.s[2]
	fmov	d15,x6

	add	v11.2s,v11.2s,v26.2s
	add	x12,x3,x12,lsr#40
	umlal	v23.2d,v18.2s,v0.s[2]
	add	x13,x3,x13,lsr#40
	umlal	v22.2d,v18.2s,v8.s[2]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v21.2d,v18.2s,v6.s[2]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v18.2s,v4.s[2]
	fmov	d16,x8
	umlal	v19.2d,v18.2s,v2.s[2]
	fmov	d17,x10
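	////////////////////////////////////////////////////////////////
	// The umull/umlal block above has accumulated inp[2:3]*r^2 into
	// the 64-bit lanes of v19-v23 (d0-d4). Lane [2] of v0-v8 holds
	// the limbs of r^2 and lane [0] those of r^4, the layout that
	// poly1305_splat produced. Scalar limb-splitting of the next
	// inputs is interleaved with the SIMD arithmetic so that the
	// integer and vector pipelines run concurrently.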
	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4 and accumulate

	add	v9.2s,v9.2s,v24.2s
	fmov	d18,x12
	umlal	v22.2d,v11.2s,v1.s[0]
	ldp	x8,x12,[x1],#16		// inp[0:1]
	umlal	v19.2d,v11.2s,v6.s[0]
	ldp	x9,x13,[x1],#48
	umlal	v23.2d,v11.2s,v3.s[0]
	umlal	v20.2d,v11.2s,v8.s[0]
	umlal	v21.2d,v11.2s,v0.s[0]
#ifdef	__ARMEB__
	rev	x8,x8
	rev	x12,x12
	rev	x9,x9
	rev	x13,x13
#endif

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.s[0]
	umlal	v23.2d,v9.2s,v7.s[0]
	and	x4,x8,#0x03ffffff	// base 2^64 -> base 2^26
	umlal	v21.2d,v9.2s,v3.s[0]
	and	x5,x9,#0x03ffffff
	umlal	v19.2d,v9.2s,v0.s[0]
	ubfx	x6,x8,#26,#26
	umlal	v20.2d,v9.2s,v1.s[0]
	ubfx	x7,x9,#26,#26

	add	v12.2s,v12.2s,v27.2s
	add	x4,x4,x5,lsl#32		// bfi x4,x5,#32,#32
	umlal	v22.2d,v10.2s,v3.s[0]
	extr	x8,x12,x8,#52
	umlal	v23.2d,v10.2s,v5.s[0]
	extr	x9,x13,x9,#52
	umlal	v19.2d,v10.2s,v8.s[0]
	add	x6,x6,x7,lsl#32		// bfi x6,x7,#32,#32
	umlal	v21.2d,v10.2s,v1.s[0]
	fmov	d9,x4
	umlal	v20.2d,v10.2s,v0.s[0]
	and	x8,x8,#0x03ffffff

	add	v13.2s,v13.2s,v28.2s
	and	x9,x9,#0x03ffffff
	umlal	v22.2d,v12.2s,v0.s[0]
	ubfx	x10,x12,#14,#26
	umlal	v19.2d,v12.2s,v4.s[0]
	ubfx	x11,x13,#14,#26
	umlal	v23.2d,v12.2s,v1.s[0]
	add	x8,x8,x9,lsl#32		// bfi x8,x9,#32,#32
	umlal	v20.2d,v12.2s,v6.s[0]
	fmov	d10,x6
	umlal	v21.2d,v12.2s,v8.s[0]
	add	x12,x3,x12,lsr#40

	umlal	v22.2d,v13.2s,v8.s[0]
	add	x13,x3,x13,lsr#40
	umlal	v19.2d,v13.2s,v2.s[0]
	add	x10,x10,x11,lsl#32	// bfi x10,x11,#32,#32
	umlal	v23.2d,v13.2s,v0.s[0]
	add	x12,x12,x13,lsl#32	// bfi x12,x13,#32,#32
	umlal	v20.2d,v13.2s,v4.s[0]
	fmov	d11,x8
	umlal	v21.2d,v13.2s,v6.s[0]
	fmov	d12,x10
	fmov	d13,x12

	/////////////////////////////////////////////////////////////////
	// lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	// and P. Schwabe
	//
	// [see discussion in poly1305-armv4 module]
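	//
	// Each limb is carried once into its neighbour rather than
	// iterated to a fixed point; limbs stay slightly above 26 bits,
	// which the headroom of the next round's multiplication
	// absorbs. The h4 -> h0 carry is multiplied by 5 (the add plus
	// shl #2 pair below) because 2^130 == 5 (mod 2^130-5). One
	// carry step in illustrative pseudocode (not part of the
	// generated output):
	//
	//	c   = h4 >> 26;
	//	h4 &= 0x3ffffff;
	//	h0 += c + (c << 2);	// h0 += 5*c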
	ushr	v29.2d,v22.2d,#26
	xtn	v27.2s,v22.2d
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	bic	v27.2s,#0xfc,lsl#24	// &=0x03ffffff
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	xtn	v28.2s,v23.2d
	ushr	v30.2d,v20.2d,#26
	xtn	v25.2s,v20.2d
	bic	v28.2s,#0xfc,lsl#24
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	shrn	v30.2s,v21.2d,#26
	xtn	v26.2s,v21.2d
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	bic	v25.2s,#0xfc,lsl#24
	add	v27.2s,v27.2s,v30.2s	// h2 -> h3
	bic	v26.2s,#0xfc,lsl#24

	shrn	v29.2s,v19.2d,#26
	xtn	v24.2s,v19.2d
	ushr	v30.2s,v27.2s,#26
	bic	v27.2s,#0xfc,lsl#24
	bic	v24.2s,#0xfc,lsl#24
	add	v25.2s,v25.2s,v29.2s	// h0 -> h1
	add	v28.2s,v28.2s,v30.2s	// h3 -> h4

	b.hi	.Loop_neon

.Lskip_loop:
	dup	v16.2d,v16.d[0]
	add	v11.2s,v11.2s,v26.2s

	////////////////////////////////////////////////////////////////
	// multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	adds	x2,x2,#32
	b.ne	.Long_tail

	dup	v16.2d,v11.d[0]
	add	v14.2s,v9.2s,v24.2s
	add	v17.2s,v12.2s,v27.2s
	add	v15.2s,v10.2s,v25.2s
	add	v18.2s,v13.2s,v28.2s

.Long_tail:
	dup	v14.2d,v14.d[0]
	umull2	v19.2d,v16.4s,v6.4s
	umull2	v22.2d,v16.4s,v1.4s
	umull2	v23.2d,v16.4s,v3.4s
	umull2	v21.2d,v16.4s,v0.4s
	umull2	v20.2d,v16.4s,v8.4s

	dup	v15.2d,v15.d[0]
	umlal2	v19.2d,v14.4s,v0.4s
	umlal2	v21.2d,v14.4s,v3.4s
	umlal2	v22.2d,v14.4s,v5.4s
	umlal2	v23.2d,v14.4s,v7.4s
	umlal2	v20.2d,v14.4s,v1.4s

	dup	v17.2d,v17.d[0]
	umlal2	v19.2d,v15.4s,v8.4s
	umlal2	v22.2d,v15.4s,v3.4s
	umlal2	v21.2d,v15.4s,v1.4s
	umlal2	v23.2d,v15.4s,v5.4s
	umlal2	v20.2d,v15.4s,v0.4s

	dup	v18.2d,v18.d[0]
	umlal2	v22.2d,v17.4s,v0.4s
	umlal2	v23.2d,v17.4s,v1.4s
	umlal2	v19.2d,v17.4s,v4.4s
	umlal2	v20.2d,v17.4s,v6.4s
	umlal2	v21.2d,v17.4s,v8.4s

	umlal2	v22.2d,v18.4s,v8.4s
	umlal2	v19.2d,v18.4s,v2.4s
	umlal2	v23.2d,v18.4s,v0.4s
	umlal2	v20.2d,v18.4s,v4.4s
	umlal2	v21.2d,v18.4s,v6.4s

	b.eq	.Lshort_tail

	////////////////////////////////////////////////////////////////
	// (hash+inp[0:1])*r^4:r^3 and accumulate

	add	v9.2s,v9.2s,v24.2s
	umlal	v22.2d,v11.2s,v1.2s
	umlal	v19.2d,v11.2s,v6.2s
	umlal	v23.2d,v11.2s,v3.2s
	umlal	v20.2d,v11.2s,v8.2s
	umlal	v21.2d,v11.2s,v0.2s

	add	v10.2s,v10.2s,v25.2s
	umlal	v22.2d,v9.2s,v5.2s
	umlal	v19.2d,v9.2s,v0.2s
	umlal	v23.2d,v9.2s,v7.2s
	umlal	v20.2d,v9.2s,v1.2s
	umlal	v21.2d,v9.2s,v3.2s

	add	v12.2s,v12.2s,v27.2s
	umlal	v22.2d,v10.2s,v3.2s
	umlal	v19.2d,v10.2s,v8.2s
	umlal	v23.2d,v10.2s,v5.2s
	umlal	v20.2d,v10.2s,v0.2s
	umlal	v21.2d,v10.2s,v1.2s

	add	v13.2s,v13.2s,v28.2s
	umlal	v22.2d,v12.2s,v0.2s
	umlal	v19.2d,v12.2s,v4.2s
	umlal	v23.2d,v12.2s,v1.2s
	umlal	v20.2d,v12.2s,v6.2s
	umlal	v21.2d,v12.2s,v8.2s

	umlal	v22.2d,v13.2s,v8.2s
	umlal	v19.2d,v13.2s,v2.2s
	umlal	v23.2d,v13.2s,v0.2s
	umlal	v20.2d,v13.2s,v4.2s
	umlal	v21.2d,v13.2s,v6.2s
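	////////////////////////////////////////////////////////////////
	// At this point the two vector lanes hold two independent
	// partial hashes, one over the even-indexed and one over the
	// odd-indexed blocks, each already weighted by the appropriate
	// powers of r. The pairwise addp below folds both lanes into
	// lane 0, yielding the single base 2^26 hash.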
.Lshort_tail:
	////////////////////////////////////////////////////////////////
	// horizontal add

	addp	v22.2d,v22.2d,v22.2d
	ldp	d8,d9,[sp,#16]		// meet ABI requirements
	addp	v19.2d,v19.2d,v19.2d
	ldp	d10,d11,[sp,#32]
	addp	v23.2d,v23.2d,v23.2d
	ldp	d12,d13,[sp,#48]
	addp	v20.2d,v20.2d,v20.2d
	ldp	d14,d15,[sp,#64]
	addp	v21.2d,v21.2d,v21.2d

	////////////////////////////////////////////////////////////////
	// lazy reduction, but without narrowing

	ushr	v29.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	ushr	v30.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b

	add	v23.2d,v23.2d,v29.2d	// h3 -> h4
	add	v20.2d,v20.2d,v30.2d	// h0 -> h1

	ushr	v29.2d,v23.2d,#26
	and	v23.16b,v23.16b,v31.16b
	ushr	v30.2d,v20.2d,#26
	and	v20.16b,v20.16b,v31.16b
	add	v21.2d,v21.2d,v30.2d	// h1 -> h2

	add	v19.2d,v19.2d,v29.2d
	shl	v29.2d,v29.2d,#2
	ushr	v30.2d,v21.2d,#26
	and	v21.16b,v21.16b,v31.16b
	add	v19.2d,v19.2d,v29.2d	// h4 -> h0
	add	v22.2d,v22.2d,v30.2d	// h2 -> h3

	ushr	v29.2d,v19.2d,#26
	and	v19.16b,v19.16b,v31.16b
	ushr	v30.2d,v22.2d,#26
	and	v22.16b,v22.16b,v31.16b
	add	v20.2d,v20.2d,v29.2d	// h0 -> h1
	add	v23.2d,v23.2d,v30.2d	// h3 -> h4

	////////////////////////////////////////////////////////////////
	// write the result, can be partially reduced

	st4	{v19.s,v20.s,v21.s,v22.s}[0],[x0],#16
	st1	{v23.s}[0],[x0]

.Lno_data_neon:
	ldr	x29,[sp],#80
.inst	0xd50323bf		// autiasp
	ret
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
	ldr	x17,[x0,#24]
	cbz	x17,poly1305_emit

	ldp	w10,w11,[x0]		// load hash value base 2^26
	ldp	w12,w13,[x0,#8]
	ldr	w14,[x0,#16]

	add	x4,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	x5,x12,#12
	adds	x4,x4,x12,lsl#52
	add	x5,x5,x13,lsl#14
	adc	x5,x5,xzr
	lsr	x6,x14,#24
	adds	x5,x5,x14,lsl#40
	adc	x6,x6,xzr		// can be partially reduced...

	ldp	x10,x11,[x2]		// load nonce

	and	x12,x6,#-4		// ... so reduce
	add	x12,x12,x6,lsr#2
	and	x6,x6,#3
	adds	x4,x4,x12
	adcs	x5,x5,xzr
	adc	x6,x6,xzr

	adds	x12,x4,#5		// compare to modulus
	adcs	x13,x5,xzr
	adc	x14,x6,xzr

	tst	x14,#-4			// see if it's carried/borrowed

	csel	x4,x4,x12,eq
	csel	x5,x5,x13,eq

#ifdef	__ARMEB__
	ror	x10,x10,#32		// flip nonce words
	ror	x11,x11,#32
#endif
	adds	x4,x4,x10		// accumulate nonce
	adc	x5,x5,x11
#ifdef	__ARMEB__
	rev	x4,x4			// flip output bytes
	rev	x5,x5
#endif
	stp	x4,x5,[x1]		// write result

	ret
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0
.LOPENSSL_armcap_P:
#ifdef	__ILP32__
.long	OPENSSL_armcap_P-.
#else
.quad	OPENSSL_armcap_P-.
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2