/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# ifdef	__APPLE__
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__APPLE__
	adr	r9,poly1305_blocks_neon
	adr	r11,poly1305_blocks
# ifdef	__thumb2__
	it	ne
# endif
	movne	r11,r9
	adr	r12,poly1305_emit
	adr	r10,poly1305_emit_neon
# ifdef	__thumb2__
	it	ne
# endif
	movne	r12,r10
# else
# ifdef	__thumb2__
	itete	eq
# endif
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef	__thumb2__
	orr	r12,r12,#1		@ thumb-ify address
	orr	r11,r11,#1
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}	@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}	@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
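	@ A rough C-level view of the final reduction performed here (an
	@ editorial sketch with ad-hoc names, not part of the generated code):
	@ the adds/adcs above compute g = h + 5, and the movne instructions
	@ below keep g instead of h whenever bit 130 of g is set, i.e.
	@ whenever h >= 2^130 - 5:
	@
	@	g = h + 5;		/* h - (2^130 - 5) == h + 5 - 2^130	*/
	@	if (g >> 130)		/* tst r7,#4 tests exactly this bit	*/
	@		h = g;		/* only the low 128 bits are kept	*/
	@	tag = (h + nonce) mod 2^128;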

#ifdef __thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef __thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef __thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef __thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2			@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number is
	@ n+m bits wide. However! Even though 2^n is an n+1-bit number,
	@ an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, the sum of three
	@ is n+2, and so is the sum of four. The sum of 2^m n-m-bit numbers
	@ and one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases where their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0*R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure the pre-reduced H0 is limited by
	@ 5*4+1 and 5*H4 by 5*5 52-bit addends, or 57 bits. But when
	@ hashing the input H0 is limited by (5*4+1)*3 addends, or 58
	@ bits, while 5*H4 by 5*5*3, or 59[!] bits. How is this relevant?
	@ The vmlal.u32 instruction accepts 2x32-bit input and writes a
	@ 2x64-bit result. This means that the result of the reduction
	@ has to be compressed upon loop wrap-around. This can be done in
	@ the process of reduction to minimize the number of instructions
	@ [as well as the number of 128-bit instructions, which benefits
	@ low-end processors], but one has to watch for H2 (which is
	@ narrower than H0) and 5*H4 not being wider than 58 bits, so
	@ that the result of the right shift by 26 bits fits in 32 bits.
	@ This is also useful on x86, because it allows paddd to be used
	@ in place of paddq, which benefits Atom, where paddq is
	@ ridiculously slow.

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000		@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30		@ h4 -> h0
	vadd.i32	d16,d16,d8		@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30		@ h0 -> h1
	vadd.i32	d18,d18,d8		@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10			@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10			@ r^4:r^3
	vshl.u32	d2,d12,#2		@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr			@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   ___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   ___________________/ ____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction in the previous iteration.
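	@
	@ A rough scalar sketch of the schedule above (editorial; acc0/acc1
	@ are ad-hoc names for the two vector lanes, not registers used
	@ here): four blocks are consumed per iteration, even-indexed blocks
	@ in one lane and odd-indexed blocks in the other, and the lanes are
	@ only merged by the tail code:
	@
	@	acc0 = (acc0 + inp[4*i+0])*r^4 + inp[4*i+2]*r^2
	@	acc1 = (acc1 + inp[4*i+1])*r^4 + inp[4*i+3]*r^2
	@	...
	@	hash = acc0*r^2 + acc1*r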
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
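	@
	@ A rough scalar equivalent of the carry chain below (editorial
	@ sketch; d0..d4 stand for the 64-bit sums held in q5..q9, h0..h4
	@ for the resulting 26-bit limbs, and 2^130 == 5 mod p is what
	@ turns the top carry into a *5 term):
	@
	@	d4 += d3 >> 26;		h3 = d3 & 0x3ffffff;	/* h3 -> h4 */
	@	d1 += d0 >> 26;		h0 = d0 & 0x3ffffff;	/* h0 -> h1 */
	@	d2 += d1 >> 26;		h1 = d1 & 0x3ffffff;	/* h1 -> h2 */
	@	h0 += (d4 >> 26)*5;	h4 = d4 & 0x3ffffff;	/* h4 -> h0 */
	@	h3 += d2 >> 26;		h2 = d2 & 0x3ffffff;	/* h2 -> h3 */
	@	h1 += h0 >> 26;		h0 &= 0x3ffffff;	/* h0 -> h1 */
	@	h4 += h3 >> 26;		h3 &= 0x3ffffff;	/* h3 -> h4 */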

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30		@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8		@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30		@ h0 -> h1
	vadd.i32	d18,d18,d8		@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0	@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0	@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr			@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26		@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24		@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2		@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef	__ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr			@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif