/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef __thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if __ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if __ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef __APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if __ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef __APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
# ifdef __thumb2__
	it	ne
# endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
# ifdef __thumb2__
	it	ne
# endif
	movne	r12,r10
# else
# ifdef __thumb2__
	itete	eq
# endif
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
	orr	r12,r12,#1	@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if __ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}	@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}	@ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
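	@ If the addition of 5 carried into bit 130 (bit 2 of r7), the
	@ accumulator was >= 2^130-5, so the conditional moves below pick
	@ the reduced value in r8-r11 (the low 128 bits of h+5); otherwise
	@ the original h in r3-r6 is kept. Only the low 128 bits matter,
	@ since the nonce is added next and the tag is taken modulo 2^128.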

#ifdef __thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef __thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef __thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef __thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2			@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ Result of multiplication of n-bit number by m-bit number is
	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
	@ m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
	@ one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that *sum*
	@ of the products with these values can still be viewed as sum
	@ of 52-bit numbers as long as the amount of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
	@ This means that result of reduction has to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize amount of instructions [as well as amount of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that result of right shift
	@ by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows to use paddd in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000		@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30		@ h4 -> h0
	vadd.i32	d16,d16,d8		@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30		@ h0 -> h1
	vadd.i32	d18,d18,d8		@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10			@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2	@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10			@ r^4:r^3
	vshl.u32	d2,d12,#2	@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr			@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef __ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef __ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@ ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@ ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef __ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
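	@ For reference, one 16-byte block with little-endian 32-bit words
	@ w0-w3 and the pad bit splits into 26-bit limbs as
	@	h0 =  w0                & 0x03ffffff
	@	h1 = (w0>>26 | w1<<6)   & 0x03ffffff
	@	h2 = (w1>>20 | w2<<12)  & 0x03ffffff
	@	h3 = (w2>>14 | w3<<18)  & 0x03ffffff
	@	h4 = (w3>>8) | padbit<<24
	@ The vsri/vshl/vbic sequence below performs this split for four
	@ blocks at once, with q10-q13 holding w0-w3 of those blocks and
	@ q14 seeded with the pad bit.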

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15	@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4	@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4	@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30	@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8	@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26	@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30	@ h0 -> h1
	vadd.i32	d18,d18,d8	@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15	@ h3 -> h4
	vadd.i64	q6,q6,q4	@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4	@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15	@ h4 -> h0
	vadd.i64	q8,q8,q4	@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15	@ h0 -> h1
	vadd.i64	q9,q9,q4	@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr			@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26		@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24		@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2		@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr			@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif