/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.text

.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor r3,r3,r3
	cmp r1,#0
	str r3,[r0,#0]		@ zero hash value
	str r3,[r0,#4]
	str r3,[r0,#8]
	str r3,[r0,#12]
	str r3,[r0,#16]
	str r3,[r0,#36]		@ is_base2_26
	add r0,r0,#20

#ifdef __thumb2__
	it eq
#endif
	moveq r0,#0
	beq .Lno_key

#if __ARM_MAX_ARCH__>=7
	adr r11,.Lpoly1305_init
	ldr r12,.LOPENSSL_armcap
#endif
	ldrb r4,[r1,#0]
	mov r10,#0x0fffffff
	ldrb r5,[r1,#1]
	and r3,r10,#-4		@ 0x0ffffffc
	ldrb r6,[r1,#2]
	ldrb r7,[r1,#3]
	orr r4,r4,r5,lsl#8
	ldrb r5,[r1,#4]
	orr r4,r4,r6,lsl#16
	ldrb r6,[r1,#5]
	orr r4,r4,r7,lsl#24
	ldrb r7,[r1,#6]
	and r4,r4,r10

#if __ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
	ldr r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr r12,[r12]
# endif
#endif
	ldrb r8,[r1,#7]
	orr r5,r5,r6,lsl#8
	ldrb r6,[r1,#8]
	orr r5,r5,r7,lsl#16
	ldrb r7,[r1,#9]
	orr r5,r5,r8,lsl#24
	ldrb r8,[r1,#10]
	and r5,r5,r3

#if __ARM_MAX_ARCH__>=7
	tst r12,#ARMV7_NEON		@ check for NEON
# ifdef __thumb2__
	adr r9,.Lpoly1305_blocks_neon
	adr r11,.Lpoly1305_blocks
	adr r12,.Lpoly1305_emit
	adr r10,.Lpoly1305_emit_neon
	itt ne
	movne r11,r9
	movne r12,r10
	orr r11,r11,#1		@ thumb-ify address
	orr r12,r12,#1
# else
	addeq r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb r9,[r1,#11]
	orr r6,r6,r7,lsl#8
	ldrb r7,[r1,#12]
	orr r6,r6,r8,lsl#16
	ldrb r8,[r1,#13]
	orr r6,r6,r9,lsl#24
	ldrb r9,[r1,#14]
	and r6,r6,r3

	ldrb r10,[r1,#15]
	orr r7,r7,r8,lsl#8
	str r4,[r0,#0]
	orr r7,r7,r9,lsl#16
	str r5,[r0,#4]
	orr r7,r7,r10,lsl#24
	str r6,[r0,#8]
	and r7,r7,r3
	str r7,[r0,#12]
#if __ARM_MAX_ARCH__>=7
	stmia r2,{r11,r12}		@ fill functions table
	mov r0,#1
#else
	mov r0,#0
#endif
.Lno_key:
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
	bx lr				@ bx lr
#else
	tst lr,#1
	moveq pc,lr			@ be binary compatible with V4, yet
.word 0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands r2,r2,#-16
	beq .Lno_data

	cmp r3,#0
	add r2,r2,r1		@ end pointer
	sub sp,sp,#32

	ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}	@ load context

	str r0,[sp,#12]		@ offload stuff
	mov lr,r1
	str r2,[sp,#16]
	str r10,[sp,#20]
	str r11,[sp,#24]
	str r12,[sp,#28]
	b .Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb r0,[lr],#16		@ load input
# ifdef __thumb2__
	it hi
# endif
	addhi r8,r8,#1		@ 1<<128
	ldrb r1,[lr,#-15]
	ldrb r2,[lr,#-14]
	ldrb r3,[lr,#-13]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-12]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-11]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-10]
	adds r4,r4,r3		@ accumulate input

	ldrb r3,[lr,#-9]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-8]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-7]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-6]
	adcs r5,r5,r3

	ldrb r3,[lr,#-5]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-4]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-3]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-2]
	adcs r6,r6,r3

	ldrb r3,[lr,#-1]
	orr r1,r0,r1,lsl#8
	str lr,[sp,#8]		@ offload input pointer
	orr r2,r1,r2,lsl#16
	add r10,r10,r10,lsr#2
	orr r3,r2,r3,lsl#24
#else
	ldr r0,[lr],#16		@ load input
# ifdef __thumb2__
	it hi
# endif
	addhi r8,r8,#1		@ padbit
	ldr r1,[lr,#-12]
	ldr r2,[lr,#-8]
	ldr r3,[lr,#-4]
# ifdef __ARMEB__
	rev r0,r0
	rev r1,r1
	rev r2,r2
	rev r3,r3
# endif
	adds r4,r4,r0		@ accumulate input
	str lr,[sp,#8]		@ offload input pointer
	adcs r5,r5,r1
	add r10,r10,r10,lsr#2
	adcs r6,r6,r2
#endif
	add r11,r11,r11,lsr#2
	adcs r7,r7,r3
	add r12,r12,r12,lsr#2

	umull r2,r3,r5,r9
	adc r8,r8,#0
	umull r0,r1,r4,r9
	umlal r2,r3,r8,r10
	umlal r0,r1,r7,r10
	ldr r10,[sp,#20]		@ reload r10
	umlal r2,r3,r6,r12
	umlal r0,r1,r5,r12
	umlal r2,r3,r7,r11
	umlal r0,r1,r6,r11
	umlal r2,r3,r4,r10
	str r0,[sp,#0]		@ future r4
	mul r0,r11,r8
	ldr r11,[sp,#24]		@ reload r11
	adds r2,r2,r1		@ d1+=d0>>32
	eor r1,r1,r1
	adc lr,r3,#0		@ future r6
	str r2,[sp,#4]		@ future r5

	mul r2,r12,r8
	eor r3,r3,r3
	umlal r0,r1,r7,r12
	ldr r12,[sp,#28]		@ reload r12
	umlal r2,r3,r7,r9
	umlal r0,r1,r6,r9
	umlal r2,r3,r6,r10
	umlal r0,r1,r5,r10
	umlal r2,r3,r5,r11
	umlal r0,r1,r4,r11
	umlal r2,r3,r4,r12
	ldr r4,[sp,#0]
	mul r8,r9,r8
	ldr r5,[sp,#4]

	adds r6,lr,r0		@ d2+=d1>>32
	ldr lr,[sp,#8]		@ reload input pointer
	adc r1,r1,#0
	adds r7,r2,r1		@ d3+=d2>>32
	ldr r0,[sp,#16]		@ reload end pointer
	adc r3,r3,#0
	add r8,r8,r3		@ h4+=d3>>32

	and r1,r8,#-4
	and r8,r8,#3
	add r1,r1,r1,lsr#2		@ *=5
	adds r4,r4,r1
	adcs r5,r5,#0
	adcs r6,r6,#0
	adcs r7,r7,#0
	adc r8,r8,#0

	cmp r0,lr		@ done yet?
	bhi .Loop

	ldr r0,[sp,#12]
	add sp,sp,#32
	stmia r0,{r4,r5,r6,r7,r8}		@ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst lr,#1
	moveq pc,lr			@ be binary compatible with V4, yet
.word 0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,%function
.align 5
poly1305_emit:
.Lpoly1305_emit:
	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia r0,{r3,r4,r5,r6,r7}
	adds r8,r3,#5		@ compare to modulus
	adcs r9,r4,#0
	adcs r10,r5,#0
	adcs r11,r6,#0
	adc r7,r7,#0
	tst r7,#4		@ did it carry/borrow?
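	@ Note: r3-r7 hold the partially reduced h (below 2*(2^130-5)) and
	@ r8-r11 hold the low 128 bits of h+5. If h >= 2^130-5, then h+5
	@ carries into bit 130, which is bit 2 of the fifth word, hence the
	@ "tst r7,#4" above. In that case the conditional moves below select
	@ h+5 mod 2^130, i.e. the fully reduced value, before the nonce is
	@ added.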

#ifdef __thumb2__
	it ne
#endif
	movne r3,r8
	ldr r8,[r2,#0]
#ifdef __thumb2__
	it ne
#endif
	movne r4,r9
	ldr r9,[r2,#4]
#ifdef __thumb2__
	it ne
#endif
	movne r5,r10
	ldr r10,[r2,#8]
#ifdef __thumb2__
	it ne
#endif
	movne r6,r11
	ldr r11,[r2,#12]

	adds r3,r3,r8
	adcs r4,r4,r9
	adcs r5,r5,r10
	adc r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev r3,r3
	rev r4,r4
	rev r5,r5
	rev r6,r6
# endif
	str r3,[r1,#0]
	str r4,[r1,#4]
	str r5,[r1,#8]
	str r6,[r1,#12]
#else
	strb r3,[r1,#0]
	mov r3,r3,lsr#8
	strb r4,[r1,#4]
	mov r4,r4,lsr#8
	strb r5,[r1,#8]
	mov r5,r5,lsr#8
	strb r6,[r1,#12]
	mov r6,r6,lsr#8

	strb r3,[r1,#1]
	mov r3,r3,lsr#8
	strb r4,[r1,#5]
	mov r4,r4,lsr#8
	strb r5,[r1,#9]
	mov r5,r5,lsr#8
	strb r6,[r1,#13]
	mov r6,r6,lsr#8

	strb r3,[r1,#2]
	mov r3,r3,lsr#8
	strb r4,[r1,#6]
	mov r4,r4,lsr#8
	strb r5,[r1,#10]
	mov r5,r5,lsr#8
	strb r6,[r1,#14]
	mov r6,r6,lsr#8

	strb r3,[r1,#3]
	strb r4,[r1,#7]
	strb r5,[r1,#11]
	strb r6,[r1,#15]
#endif
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
	bx lr				@ bx lr
#else
	tst lr,#1
	moveq pc,lr			@ be binary compatible with V4, yet
.word 0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size poly1305_emit,.-poly1305_emit
#if __ARM_MAX_ARCH__>=7
.fpu neon

.type poly1305_init_neon,%function
.align 5
poly1305_init_neon:
	ldr r4,[r0,#20]		@ load key base 2^32
	ldr r5,[r0,#24]
	ldr r6,[r0,#28]
	ldr r7,[r0,#32]

	and r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov r3,r4,lsr#26
	mov r4,r5,lsr#20
	orr r3,r3,r5,lsl#6
	mov r5,r6,lsr#14
	orr r4,r4,r6,lsl#12
	mov r6,r7,lsr#8
	orr r5,r5,r7,lsl#18
	and r3,r3,#0x03ffffff
	and r4,r4,#0x03ffffff
	and r5,r5,#0x03ffffff

	vdup.32 d0,r2		@ r^1 in both lanes
	add r2,r3,r3,lsl#2	@ *5
	vdup.32 d1,r3
	add r3,r4,r4,lsl#2
	vdup.32 d2,r2
	vdup.32 d3,r4
	add r4,r5,r5,lsl#2
	vdup.32 d4,r3
	vdup.32 d5,r5
	add r5,r6,r6,lsl#2
	vdup.32 d6,r4
	vdup.32 d7,r6
	vdup.32 d8,r5

	mov r5,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32 q5,d0,d0[1]
	vmull.u32 q6,d1,d0[1]
	vmull.u32 q7,d3,d0[1]
	vmull.u32 q8,d5,d0[1]
	vmull.u32 q9,d7,d0[1]

	vmlal.u32 q5,d7,d2[1]
	vmlal.u32 q6,d0,d1[1]
	vmlal.u32 q7,d1,d1[1]
	vmlal.u32 q8,d3,d1[1]
	vmlal.u32 q9,d5,d1[1]

	vmlal.u32 q5,d5,d4[1]
	vmlal.u32 q6,d7,d4[1]
	vmlal.u32 q8,d1,d3[1]
	vmlal.u32 q7,d0,d3[1]
	vmlal.u32 q9,d3,d3[1]

	vmlal.u32 q5,d3,d6[1]
	vmlal.u32 q8,d0,d5[1]
	vmlal.u32 q6,d5,d6[1]
	vmlal.u32 q7,d7,d6[1]
	vmlal.u32 q9,d1,d5[1]

	vmlal.u32 q8,d7,d8[1]
	vmlal.u32 q5,d1,d8[1]
	vmlal.u32 q6,d3,d8[1]
	vmlal.u32 q7,d5,d8[1]
	vmlal.u32 q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an n+1-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, a sum of three is
	@ n+2 bits wide, and so is a sum of four. The sum of 2^m (n-m)-bit
	@ numbers and one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0*R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 and 5*H4 by 5*5 52-bit addends, or 57 bits. But when
	@ hashing the input, H0 is limited by (5*4+1)*3 addends, or 58
	@ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant?
	@ The vmlal.u32 instruction accepts 2x32-bit inputs and writes a
	@ 2x64-bit result. This means that the result of reduction has to
	@ be compressed upon loop wrap-around. This can be done in the
	@ process of reduction to minimize the number of instructions [as
	@ well as the number of 128-bit instructions, which benefits
	@ low-end processors], but one has to watch for H2 (which is
	@ narrower than H0) and 5*H4 not being wider than 58 bits, so that
	@ the result of the right shift by 26 bits fits in 32 bits. This
	@ is also useful on x86, because it allows paddd to be used in
	@ place of paddq, which benefits Atom, where paddq is ridiculously
	@ slow.

	vshr.u64 q15,q8,#26
	vmovn.i64 d16,q8
	vshr.u64 q4,q5,#26
	vmovn.i64 d10,q5
	vadd.i64 q9,q9,q15		@ h3 -> h4
	vbic.i32 d16,#0xfc000000	@ &=0x03ffffff
	vadd.i64 q6,q6,q4		@ h0 -> h1
	vbic.i32 d10,#0xfc000000

	vshrn.u64 d30,q9,#26
	vmovn.i64 d18,q9
	vshr.u64 q4,q6,#26
	vmovn.i64 d12,q6
	vadd.i64 q7,q7,q4		@ h1 -> h2
	vbic.i32 d18,#0xfc000000
	vbic.i32 d12,#0xfc000000

	vadd.i32 d10,d10,d30
	vshl.u32 d30,d30,#2
	vshrn.u64 d8,q7,#26
	vmovn.i64 d14,q7
	vadd.i32 d10,d10,d30		@ h4 -> h0
	vadd.i32 d16,d16,d8		@ h2 -> h3
	vbic.i32 d14,#0xfc000000

	vshr.u32 d30,d10,#26
	vbic.i32 d10,#0xfc000000
	vshr.u32 d8,d16,#26
	vbic.i32 d16,#0xfc000000
	vadd.i32 d12,d12,d30		@ h0 -> h1
	vadd.i32 d18,d18,d8		@ h3 -> h4

	subs r5,r5,#1
	beq .Lsquare_break_neon

	add r6,r0,#(48+0*9*4)
	add r7,r0,#(48+1*9*4)

	vtrn.32 d0,d10		@ r^2:r^1
	vtrn.32 d3,d14
	vtrn.32 d5,d16
	vtrn.32 d1,d12
	vtrn.32 d7,d18

	vshl.u32 d4,d3,#2		@ *5
	vshl.u32 d6,d5,#2
	vshl.u32 d2,d1,#2
	vshl.u32 d8,d7,#2
	vadd.i32 d4,d4,d3
	vadd.i32 d2,d2,d1
	vadd.i32 d6,d6,d5
	vadd.i32 d8,d8,d7

	vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32 {d8[0]},[r6,:32]
	vst1.32 {d8[1]},[r7,:32]

	b .Lsquare_neon

.align 4
.Lsquare_break_neon:
	add r6,r0,#(48+2*4*9)
	add r7,r0,#(48+3*4*9)

	vmov d0,d10		@ r^4:r^3
	vshl.u32 d2,d12,#2	@ *5
	vmov d1,d12
	vshl.u32 d4,d14,#2
	vmov d3,d14
	vshl.u32 d6,d16,#2
	vmov d5,d16
	vshl.u32 d8,d18,#2
	vmov d7,d18
	vadd.i32 d2,d2,d12
	vadd.i32 d4,d4,d14
	vadd.i32 d6,d6,d16
	vadd.i32 d8,d8,d18

	vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32 {d8[0]},[r6]
	vst1.32 {d8[1]},[r7]

	bx lr				@ bx lr
.size poly1305_init_neon,.-poly1305_init_neon

.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr ip,[r0,#36]		@ is_base2_26
	ands r2,r2,#-16
	beq .Lno_data_neon

	cmp r2,#64
	bhs .Lenter_neon
	tst ip,ip		@ is_base2_26?
	beq .Lpoly1305_blocks

.Lenter_neon:
	stmdb sp!,{r4,r5,r6,r7}
	vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI specification says so

	tst ip,ip		@ is_base2_26?
	bne .Lbase2_26_neon

	stmdb sp!,{r1,r2,r3,lr}
	bl poly1305_init_neon

	ldr r4,[r0,#0]		@ load hash value base 2^32
	ldr r5,[r0,#4]
	ldr r6,[r0,#8]
	ldr r7,[r0,#12]
	ldr ip,[r0,#16]

	and r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov r3,r4,lsr#26
	veor d10,d10,d10
	mov r4,r5,lsr#20
	orr r3,r3,r5,lsl#6
	veor d12,d12,d12
	mov r5,r6,lsr#14
	orr r4,r4,r6,lsl#12
	veor d14,d14,d14
	mov r6,r7,lsr#8
	orr r5,r5,r7,lsl#18
	veor d16,d16,d16
	and r3,r3,#0x03ffffff
	orr r6,r6,ip,lsl#24
	veor d18,d18,d18
	and r4,r4,#0x03ffffff
	mov r1,#1
	and r5,r5,#0x03ffffff
	str r1,[r0,#36]		@ is_base2_26

	vmov.32 d10[0],r2
	vmov.32 d12[0],r3
	vmov.32 d14[0],r4
	vmov.32 d16[0],r5
	vmov.32 d18[0],r6
	adr r5,.Lzeros

	ldmia sp!,{r1,r2,r3,lr}
	b .Lbase2_32_neon

.align 4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor d10,d10,d10
	veor d12,d12,d12
	veor d14,d14,d14
	veor d16,d16,d16
	veor d18,d18,d18
	vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr r5,.Lzeros
	vld1.32 {d18[0]},[r0]
	sub r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add r4,r1,#32
	mov r3,r3,lsl#24
	tst r2,#31
	beq .Leven

	vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
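	@ Note: r3 was prepared above as padbit<<24: bit 128 of the 130-bit
	@ value falls at bit 24 of the fifth 26-bit limb (128 = 4*26 + 24),
	@ which is also why the vector code below uses the constant #1<<24
	@ for the padbit. The vsri/vshl sequence that follows splits the
	@ four 32-bit input words into five 26-bit limbs: h0 = bits 0-25,
	@ h1 = 26-51, h2 = 52-77, h3 = 78-103, h4 = 104-127 plus padbit.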
	vmov.32 d28[0],r3
	sub r2,r2,#16
	add r4,r1,#32

# ifdef __ARMEB__
	vrev32.8 q10,q10
	vrev32.8 q13,q13
	vrev32.8 q11,q11
	vrev32.8 q12,q12
# endif
	vsri.u32 d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32 d26,d26,#18

	vsri.u32 d26,d24,#14
	vshl.u32 d24,d24,#12
	vadd.i32 d29,d28,d18	@ add hash value and move to #hi

	vbic.i32 d26,#0xfc000000
	vsri.u32 d24,d22,#20
	vshl.u32 d22,d22,#6

	vbic.i32 d24,#0xfc000000
	vsri.u32 d22,d20,#26
	vadd.i32 d27,d26,d16

	vbic.i32 d20,#0xfc000000
	vbic.i32 d22,#0xfc000000
	vadd.i32 d25,d24,d14

	vadd.i32 d21,d20,d10
	vadd.i32 d23,d22,d12

	mov r7,r5
	add r6,r0,#48

	cmp r2,r2
	b .Long_tail

.align 4
.Leven:
	subs r2,r2,#64
	it lo
	movlo r4,r5

	vmov.i32 q14,#1<<24		@ padbit, yes, always
	vld4.32 {d20,d22,d24,d26},[r1]	@ inp[0:1]
	add r1,r1,#64
	vld4.32 {d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add r4,r4,#64
	itt hi
	addhi r7,r0,#(48+1*9*4)
	addhi r6,r0,#(48+3*9*4)

# ifdef __ARMEB__
	vrev32.8 q10,q10
	vrev32.8 q13,q13
	vrev32.8 q11,q11
	vrev32.8 q12,q12
# endif
	vsri.u32 q14,q13,#8		@ base 2^32 -> base 2^26
	vshl.u32 q13,q13,#18

	vsri.u32 q13,q12,#14
	vshl.u32 q12,q12,#12

	vbic.i32 q13,#0xfc000000
	vsri.u32 q12,q11,#20
	vshl.u32 q11,q11,#6

	vbic.i32 q12,#0xfc000000
	vsri.u32 q11,q10,#26

	vbic.i32 q10,#0xfc000000
	vbic.i32 q11,#0xfc000000

	bls .Lskip_loop

	vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	b .Loop_neon

.align 5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
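	@ Note: the diagram above is the usual 2-way split of Horner
	@ evaluation. For example, for four blocks m1..m4,
	@   h = m1*r^4 + m2*r^3 + m3*r^2 + m4*r
	@     = (m1*r^2 + m3)*r^2 + (m2*r^2 + m4)*r,
	@ so even- and odd-indexed blocks accumulate in parallel NEON lanes
	@ with r^2, and are combined with r^2:r^1 at the end (r^4:r^3 when
	@ another pair of blocks is still pending, as in .Long_tail).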
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32 d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32 q7,d25,d0[1]
	vadd.i32 d20,d20,d10
	vmull.u32 q5,d21,d0[1]
	vadd.i32 d26,d26,d16
	vmull.u32 q8,d27,d0[1]
	vmlal.u32 q7,d23,d1[1]
	vadd.i32 d22,d22,d12
	vmull.u32 q6,d23,d0[1]

	vadd.i32 d28,d28,d18
	vmull.u32 q9,d29,d0[1]
	subs r2,r2,#64
	vmlal.u32 q5,d29,d2[1]
	it lo
	movlo r4,r5
	vmlal.u32 q8,d25,d1[1]
	vld1.32 d8[1],[r7,:32]
	vmlal.u32 q6,d21,d1[1]
	vmlal.u32 q9,d27,d1[1]

	vmlal.u32 q5,d27,d4[1]
	vmlal.u32 q8,d23,d3[1]
	vmlal.u32 q9,d25,d3[1]
	vmlal.u32 q6,d29,d4[1]
	vmlal.u32 q7,d21,d3[1]

	vmlal.u32 q8,d21,d5[1]
	vmlal.u32 q5,d25,d6[1]
	vmlal.u32 q9,d23,d5[1]
	vmlal.u32 q6,d27,d6[1]
	vmlal.u32 q7,d29,d6[1]

	vmlal.u32 q8,d29,d8[1]
	vmlal.u32 q5,d23,d8[1]
	vmlal.u32 q9,d21,d7[1]
	vmlal.u32 q6,d25,d8[1]
	vmlal.u32 q7,d27,d8[1]

	vld4.32 {d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32 q8,d26,d0[0]
	vmlal.u32 q5,d20,d0[0]
	vmlal.u32 q9,d28,d0[0]
	vmlal.u32 q6,d22,d0[0]
	vmlal.u32 q7,d24,d0[0]
	vld1.32 d8[0],[r6,:32]

	vmlal.u32 q8,d24,d1[0]
	vmlal.u32 q5,d28,d2[0]
	vmlal.u32 q9,d26,d1[0]
	vmlal.u32 q6,d20,d1[0]
	vmlal.u32 q7,d22,d1[0]

	vmlal.u32 q8,d22,d3[0]
	vmlal.u32 q5,d26,d4[0]
	vmlal.u32 q9,d24,d3[0]
	vmlal.u32 q6,d28,d4[0]
	vmlal.u32 q7,d20,d3[0]

	vmlal.u32 q8,d20,d5[0]
	vmlal.u32 q5,d24,d6[0]
	vmlal.u32 q9,d22,d5[0]
	vmlal.u32 q6,d26,d6[0]
	vmlal.u32 q8,d28,d8[0]

	vmlal.u32 q7,d28,d6[0]
	vmlal.u32 q5,d22,d8[0]
	vmlal.u32 q9,d20,d7[0]
	vmov.i32 q14,#1<<24		@ padbit, yes, always
	vmlal.u32 q6,d24,d8[0]
	vmlal.u32 q7,d26,d8[0]

	vld4.32 {d20,d22,d24,d26},[r1]	@ inp[0:1]
	add r1,r1,#64
# ifdef __ARMEB__
	vrev32.8 q10,q10
	vrev32.8 q11,q11
	vrev32.8 q12,q12
	vrev32.8 q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
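	@ Note: the reduction works modulo 2^130-5, so a carry out of the
	@ top limb wraps around multiplied by 5 (2^130 == 5 mod 2^130-5);
	@ the *5 is implemented as "shift left by 2, then add". The carry
	@ chain below is h3->h4, h0->h1, h4->h0(*5), h1->h2, h2->h3,
	@ h0->h1, h3->h4, which leaves the limbs within the 26/27-bit
	@ bounds described in the "Trivia" comment in poly1305_init_neon.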

	vshr.u64 q15,q8,#26
	vmovn.i64 d16,q8
	vshr.u64 q4,q5,#26
	vmovn.i64 d10,q5
	vadd.i64 q9,q9,q15		@ h3 -> h4
	vbic.i32 d16,#0xfc000000
	vsri.u32 q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64 q6,q6,q4		@ h0 -> h1
	vshl.u32 q13,q13,#18
	vbic.i32 d10,#0xfc000000

	vshrn.u64 d30,q9,#26
	vmovn.i64 d18,q9
	vshr.u64 q4,q6,#26
	vmovn.i64 d12,q6
	vadd.i64 q7,q7,q4		@ h1 -> h2
	vsri.u32 q13,q12,#14
	vbic.i32 d18,#0xfc000000
	vshl.u32 q12,q12,#12
	vbic.i32 d12,#0xfc000000

	vadd.i32 d10,d10,d30
	vshl.u32 d30,d30,#2
	vbic.i32 q13,#0xfc000000
	vshrn.u64 d8,q7,#26
	vmovn.i64 d14,q7
	vaddl.u32 q5,d10,d30		@ h4 -> h0 [widen for a sec]
	vsri.u32 q12,q11,#20
	vadd.i32 d16,d16,d8		@ h2 -> h3
	vshl.u32 q11,q11,#6
	vbic.i32 d14,#0xfc000000
	vbic.i32 q12,#0xfc000000

	vshrn.u64 d30,q5,#26		@ re-narrow
	vmovn.i64 d10,q5
	vsri.u32 q11,q10,#26
	vbic.i32 q10,#0xfc000000
	vshr.u32 d8,d16,#26
	vbic.i32 d16,#0xfc000000
	vbic.i32 d10,#0xfc000000
	vadd.i32 d12,d12,d30		@ h0 -> h1
	vadd.i32 d18,d18,d8		@ h3 -> h4
	vbic.i32 q11,#0xfc000000

	bhi .Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add r7,r0,#(48+0*9*4)
	add r6,r0,#(48+1*9*4)
	adds r2,r2,#32
	it ne
	movne r2,#0
	bne .Long_tail

	vadd.i32 d25,d24,d14	@ add hash value and move to #hi
	vadd.i32 d21,d20,d10
	vadd.i32 d27,d26,d16
	vadd.i32 d23,d22,d12
	vadd.i32 d29,d28,d18

.Long_tail:
	vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32 d24,d24,d14	@ can be redundant
	vmull.u32 q7,d25,d0
	vadd.i32 d20,d20,d10
	vmull.u32 q5,d21,d0
	vadd.i32 d26,d26,d16
	vmull.u32 q8,d27,d0
	vadd.i32 d22,d22,d12
	vmull.u32 q6,d23,d0
	vadd.i32 d28,d28,d18
	vmull.u32 q9,d29,d0

	vmlal.u32 q5,d29,d2
	vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32 q8,d25,d1
	vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32 q6,d21,d1
	vmlal.u32 q9,d27,d1
	vmlal.u32 q7,d23,d1

	vmlal.u32 q8,d23,d3
	vld1.32 d8[1],[r7,:32]
	vmlal.u32 q5,d27,d4
	vld1.32 d8[0],[r6,:32]
	vmlal.u32 q9,d25,d3
	vmlal.u32 q6,d29,d4
	vmlal.u32 q7,d21,d3

	vmlal.u32 q8,d21,d5
	it ne
	addne r7,r0,#(48+2*9*4)
	vmlal.u32 q5,d25,d6
	it ne
	addne r6,r0,#(48+3*9*4)
	vmlal.u32 q9,d23,d5
	vmlal.u32 q6,d27,d6
	vmlal.u32 q7,d29,d6

	vmlal.u32 q8,d29,d8
	vorn q0,q0,q0		@ all-ones, can be redundant
	vmlal.u32 q5,d23,d8
	vshr.u64 q0,q0,#38
	vmlal.u32 q9,d21,d7
	vmlal.u32 q6,d25,d8
	vmlal.u32 q7,d27,d8

	beq .Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32 q7,d24,d0
	vmlal.u32 q5,d20,d0
	vmlal.u32 q8,d26,d0
	vmlal.u32 q6,d22,d0
	vmlal.u32 q9,d28,d0

	vmlal.u32 q5,d28,d2
	vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32 q8,d24,d1
	vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32 q6,d20,d1
	vmlal.u32 q9,d26,d1
	vmlal.u32 q7,d22,d1

	vmlal.u32 q8,d22,d3
	vld1.32 d8[1],[r7,:32]
	vmlal.u32 q5,d26,d4
	vld1.32 d8[0],[r6,:32]
	vmlal.u32 q9,d24,d3
	vmlal.u32 q6,d28,d4
	vmlal.u32 q7,d20,d3

	vmlal.u32 q8,d20,d5
	vmlal.u32 q5,d24,d6
	vmlal.u32 q9,d22,d5
	vmlal.u32 q6,d26,d6
	vmlal.u32 q7,d28,d6

	vmlal.u32 q8,d28,d8
	vorn q0,q0,q0		@ all-ones
	vmlal.u32 q5,d22,d8
	vshr.u64 q0,q0,#38
	vmlal.u32 q9,d20,d7
	vmlal.u32 q6,d24,d8
	vmlal.u32 q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64 d16,d16,d17
	vadd.i64 d10,d10,d11
	vadd.i64 d18,d18,d19
	vadd.i64 d12,d12,d13
	vadd.i64 d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64 q15,q8,#26
	vand.i64 q8,q8,q0
	vshr.u64 q4,q5,#26
	vand.i64 q5,q5,q0
	vadd.i64 q9,q9,q15		@ h3 -> h4
	vadd.i64 q6,q6,q4		@ h0 -> h1

	vshr.u64 q15,q9,#26
	vand.i64 q9,q9,q0
	vshr.u64 q4,q6,#26
	vand.i64 q6,q6,q0
	vadd.i64 q7,q7,q4		@ h1 -> h2

	vadd.i64 q5,q5,q15
	vshl.u64 q15,q15,#2
	vshr.u64 q4,q7,#26
	vand.i64 q7,q7,q0
	vadd.i64 q5,q5,q15		@ h4 -> h0
	vadd.i64 q8,q8,q4		@ h2 -> h3

	vshr.u64 q15,q5,#26
	vand.i64 q5,q5,q0
	vshr.u64 q4,q8,#26
	vand.i64 q8,q8,q0
	vadd.i64 q6,q6,q15		@ h0 -> h1
	vadd.i64 q9,q9,q4		@ h3 -> h4

	cmp r2,#0
	bne .Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32 {d18[0]},[r0]

	vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ epilogue
	ldmia sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx lr				@ bx lr
.size poly1305_blocks_neon,.-poly1305_blocks_neon

.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr ip,[r0,#36]		@ is_base2_26

	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst ip,ip
	beq .Lpoly1305_emit_enter

	ldmia r0,{r3,r4,r5,r6,r7}
	eor r8,r8,r8

	adds r3,r3,r4,lsl#26	@ base 2^26 -> base 2^32
	mov r4,r4,lsr#6
	adcs r4,r4,r5,lsl#20
	mov r5,r5,lsr#12
	adcs r5,r5,r6,lsl#14
	mov r6,r6,lsr#18
	adcs r6,r6,r7,lsl#8
	adc r7,r8,r7,lsr#24	@ can be partially reduced ...

	and r8,r7,#-4		@ ... so reduce
	and r7,r7,#3
	add r8,r8,r8,lsr#2	@ *= 5
	adds r3,r3,r8
	adcs r4,r4,#0
	adcs r5,r5,#0
	adcs r6,r6,#0
	adc r7,r7,#0

	adds r8,r3,#5		@ compare to modulus
	adcs r9,r4,#0
	adcs r10,r5,#0
	adcs r11,r6,#0
	adc r7,r7,#0
	tst r7,#4		@ did it carry/borrow?

	it ne
	movne r3,r8
	ldr r8,[r2,#0]
	it ne
	movne r4,r9
	ldr r9,[r2,#4]
	it ne
	movne r5,r10
	ldr r10,[r2,#8]
	it ne
	movne r6,r11
	ldr r11,[r2,#12]

	adds r3,r3,r8		@ accumulate nonce
	adcs r4,r4,r9
	adcs r5,r5,r10
	adc r6,r6,r11

# ifdef __ARMEB__
	rev r3,r3
	rev r4,r4
	rev r5,r5
	rev r6,r6
# endif
	str r3,[r1,#0]		@ store the result
	str r4,[r1,#4]
	str r5,[r1,#8]
	str r6,[r1,#12]

	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx lr				@ bx lr
.size poly1305_emit_neon,.-poly1305_emit_neon

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef _WIN32
.word OPENSSL_armcap_P
# else
.word OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif