/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code 32
#endif

.globl poly1305_emit
.globl poly1305_blocks
.globl poly1305_init
.type poly1305_init,%function
.align 5
poly1305_init:
.Lpoly1305_init:
	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor r3,r3,r3
	cmp r1,#0
	str r3,[r0,#0] @ zero hash value
	str r3,[r0,#4]
	str r3,[r0,#8]
	str r3,[r0,#12]
	str r3,[r0,#16]
	str r3,[r0,#36] @ is_base2_26
	add r0,r0,#20

#ifdef __thumb2__
	it eq
#endif
	moveq r0,#0
	beq .Lno_key

#if __ARM_MAX_ARCH__>=7
	adr r11,.Lpoly1305_init
	ldr r12,.LOPENSSL_armcap
#endif
	ldrb r4,[r1,#0]
	mov r10,#0x0fffffff
	ldrb r5,[r1,#1]
	and r3,r10,#-4 @ 0x0ffffffc
	ldrb r6,[r1,#2]
	ldrb r7,[r1,#3]
	orr r4,r4,r5,lsl#8
	ldrb r5,[r1,#4]
	orr r4,r4,r6,lsl#16
	ldrb r6,[r1,#5]
	orr r4,r4,r7,lsl#24
	ldrb r7,[r1,#6]
	and r4,r4,r10

#if __ARM_MAX_ARCH__>=7
	ldr r12,[r11,r12] @ OPENSSL_armcap_P
# ifdef __APPLE__
	ldr r12,[r12]
# endif
#endif
	ldrb r8,[r1,#7]
	orr r5,r5,r6,lsl#8
	ldrb r6,[r1,#8]
	orr r5,r5,r7,lsl#16
	ldrb r7,[r1,#9]
	orr r5,r5,r8,lsl#24
	ldrb r8,[r1,#10]
	and r5,r5,r3

#if __ARM_MAX_ARCH__>=7
	tst r12,#ARMV7_NEON @ check for NEON
# ifdef __APPLE__
	adr r9,poly1305_blocks_neon
	adr r11,poly1305_blocks
# ifdef __thumb2__
	it ne
# endif
	movne r11,r9
	adr r12,poly1305_emit
	adr r10,poly1305_emit_neon
# ifdef __thumb2__
	it ne
# endif
	movne r12,r10
# else
# ifdef __thumb2__
	itete eq
# endif
	addeq r12,r11,#(poly1305_emit-.Lpoly1305_init)
	addne r12,r11,#(poly1305_emit_neon-.Lpoly1305_init)
	addeq r11,r11,#(poly1305_blocks-.Lpoly1305_init)
	addne r11,r11,#(poly1305_blocks_neon-.Lpoly1305_init)
# endif
# ifdef __thumb2__
	orr r12,r12,#1 @ thumb-ify address
	orr r11,r11,#1
# endif
#endif
	ldrb r9,[r1,#11]
	orr r6,r6,r7,lsl#8
	ldrb r7,[r1,#12]
	orr r6,r6,r8,lsl#16
	ldrb r8,[r1,#13]
	orr r6,r6,r9,lsl#24
	ldrb r9,[r1,#14]
	and r6,r6,r3

	ldrb r10,[r1,#15]
	orr r7,r7,r8,lsl#8
	str r4,[r0,#0]
	orr r7,r7,r9,lsl#16
	str r5,[r0,#4]
	orr r7,r7,r10,lsl#24
	str r6,[r0,#8]
	and r7,r7,r3
	str r7,[r0,#12]
#if __ARM_MAX_ARCH__>=7
	stmia r2,{r11,r12} @ fill functions table
	mov r0,#1
#else
	mov r0,#0
#endif
.Lno_key:
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
	bx lr @ bx lr
#else
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_init,.-poly1305_init
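@ The byte loads and masks above perform the usual Poly1305 "clamp" of
@ the first key half: taken as four little-endian words, r is ANDed
@ with 0x0fffffff, 0x0ffffffc, 0x0ffffffc, 0x0ffffffc and stored at
@ offsets 20..32 of the context.  A minimal C sketch of the same step
@ (le32() is an assumed little-endian load helper, not defined here):
@
@	r0 = le32(key +  0) & 0x0fffffff;
@	r1 = le32(key +  4) & 0x0ffffffc;
@	r2 = le32(key +  8) & 0x0ffffffc;
@	r3 = le32(key + 12) & 0x0ffffffc;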
.type poly1305_blocks,%function
.align 5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands r2,r2,#-16
	beq .Lno_data

	cmp r3,#0
	add r2,r2,r1 @ end pointer
	sub sp,sp,#32

	ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12} @ load context

	str r0,[sp,#12] @ offload stuff
	mov lr,r1
	str r2,[sp,#16]
	str r10,[sp,#20]
	str r11,[sp,#24]
	str r12,[sp,#28]
	b .Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb r0,[lr],#16 @ load input
# ifdef __thumb2__
	it hi
# endif
	addhi r8,r8,#1 @ 1<<128
	ldrb r1,[lr,#-15]
	ldrb r2,[lr,#-14]
	ldrb r3,[lr,#-13]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-12]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-11]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-10]
	adds r4,r4,r3 @ accumulate input

	ldrb r3,[lr,#-9]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-8]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-7]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-6]
	adcs r5,r5,r3

	ldrb r3,[lr,#-5]
	orr r1,r0,r1,lsl#8
	ldrb r0,[lr,#-4]
	orr r2,r1,r2,lsl#16
	ldrb r1,[lr,#-3]
	orr r3,r2,r3,lsl#24
	ldrb r2,[lr,#-2]
	adcs r6,r6,r3

	ldrb r3,[lr,#-1]
	orr r1,r0,r1,lsl#8
	str lr,[sp,#8] @ offload input pointer
	orr r2,r1,r2,lsl#16
	add r10,r10,r10,lsr#2
	orr r3,r2,r3,lsl#24
#else
	ldr r0,[lr],#16 @ load input
# ifdef __thumb2__
	it hi
# endif
	addhi r8,r8,#1 @ padbit
	ldr r1,[lr,#-12]
	ldr r2,[lr,#-8]
	ldr r3,[lr,#-4]
# ifdef __ARMEB__
	rev r0,r0
	rev r1,r1
	rev r2,r2
	rev r3,r3
# endif
	adds r4,r4,r0 @ accumulate input
	str lr,[sp,#8] @ offload input pointer
	adcs r5,r5,r1
	add r10,r10,r10,lsr#2
	adcs r6,r6,r2
#endif
	add r11,r11,r11,lsr#2
	adcs r7,r7,r3
	add r12,r12,r12,lsr#2

	umull r2,r3,r5,r9
	adc r8,r8,#0
	umull r0,r1,r4,r9
	umlal r2,r3,r8,r10
	umlal r0,r1,r7,r10
	ldr r10,[sp,#20] @ reload r10
	umlal r2,r3,r6,r12
	umlal r0,r1,r5,r12
	umlal r2,r3,r7,r11
	umlal r0,r1,r6,r11
	umlal r2,r3,r4,r10
	str r0,[sp,#0] @ future r4
	mul r0,r11,r8
	ldr r11,[sp,#24] @ reload r11
	adds r2,r2,r1 @ d1+=d0>>32
	eor r1,r1,r1
	adc lr,r3,#0 @ future r6
	str r2,[sp,#4] @ future r5

	mul r2,r12,r8
	eor r3,r3,r3
	umlal r0,r1,r7,r12
	ldr r12,[sp,#28] @ reload r12
	umlal r2,r3,r7,r9
	umlal r0,r1,r6,r9
	umlal r2,r3,r6,r10
	umlal r0,r1,r5,r10
	umlal r2,r3,r5,r11
	umlal r0,r1,r4,r11
	umlal r2,r3,r4,r12
	ldr r4,[sp,#0]
	mul r8,r9,r8
	ldr r5,[sp,#4]

	adds r6,lr,r0 @ d2+=d1>>32
	ldr lr,[sp,#8] @ reload input pointer
	adc r1,r1,#0
	adds r7,r2,r1 @ d3+=d2>>32
	ldr r0,[sp,#16] @ reload end pointer
	adc r3,r3,#0
	add r8,r8,r3 @ h4+=d3>>32

	and r1,r8,#-4
	and r8,r8,#3
	add r1,r1,r1,lsr#2 @ *=5
	adds r4,r4,r1
	adcs r5,r5,#0
	adcs r6,r6,#0
	adcs r7,r7,#0
	adc r8,r8,#0

	cmp r0,lr @ done yet?
	bhi .Loop

	ldr r0,[sp,#12]
	add sp,sp,#32
	stmia r0,{r4,r5,r6,r7,r8} @ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_blocks,.-poly1305_blocks
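@ In arithmetic terms, each pass through .Loop above computes
@
@	h = (h + m + padbit*2^128) * r  mod  2^130-5
@
@ where m is the 16-byte block taken as a little-endian number, h
@ lives in r4-r8 (four 32-bit words plus a small top word) and the
@ clamped r in r9-r12.  The "add rX,rX,rX,lsr#2" instructions form
@ r_i + (r_i>>2) = 5*r_i/4 for the three high key words (which are
@ multiples of 4), so that product terms reaching 2^130 and above can
@ be folded back in place using 2^130 = 5 mod 2^130-5; the and/add
@ sequence commented "*=5" folds the final overflow of h4 back into
@ h0 the same way.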
.type poly1305_emit,%function
.align 5
poly1305_emit:
	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia r0,{r3,r4,r5,r6,r7}
	adds r8,r3,#5 @ compare to modulus
	adcs r9,r4,#0
	adcs r10,r5,#0
	adcs r11,r6,#0
	adc r7,r7,#0
	tst r7,#4 @ did it carry/borrow?

#ifdef __thumb2__
	it ne
#endif
	movne r3,r8
	ldr r8,[r2,#0]
#ifdef __thumb2__
	it ne
#endif
	movne r4,r9
	ldr r9,[r2,#4]
#ifdef __thumb2__
	it ne
#endif
	movne r5,r10
	ldr r10,[r2,#8]
#ifdef __thumb2__
	it ne
#endif
	movne r6,r11
	ldr r11,[r2,#12]

	adds r3,r3,r8
	adcs r4,r4,r9
	adcs r5,r5,r10
	adc r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev r3,r3
	rev r4,r4
	rev r5,r5
	rev r6,r6
# endif
	str r3,[r1,#0]
	str r4,[r1,#4]
	str r5,[r1,#8]
	str r6,[r1,#12]
#else
	strb r3,[r1,#0]
	mov r3,r3,lsr#8
	strb r4,[r1,#4]
	mov r4,r4,lsr#8
	strb r5,[r1,#8]
	mov r5,r5,lsr#8
	strb r6,[r1,#12]
	mov r6,r6,lsr#8

	strb r3,[r1,#1]
	mov r3,r3,lsr#8
	strb r4,[r1,#5]
	mov r4,r4,lsr#8
	strb r5,[r1,#9]
	mov r5,r5,lsr#8
	strb r6,[r1,#13]
	mov r6,r6,lsr#8

	strb r3,[r1,#2]
	mov r3,r3,lsr#8
	strb r4,[r1,#6]
	mov r4,r4,lsr#8
	strb r5,[r1,#10]
	mov r5,r5,lsr#8
	strb r6,[r1,#14]
	mov r6,r6,lsr#8

	strb r3,[r1,#3]
	strb r4,[r1,#7]
	strb r5,[r1,#11]
	strb r6,[r1,#15]
#endif
	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if __ARM_ARCH__>=5
	bx lr @ bx lr
#else
	tst lr,#1
	moveq pc,lr @ be binary compatible with V4, yet
.word 0xe12fff1e @ interoperable with Thumb ISA:-)
#endif
.size poly1305_emit,.-poly1305_emit
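@ What the code above implements is the standard Poly1305 finalisation,
@ roughly (not literal C):
@
@	g = h + 5                       @ full carry propagation
@	h = (g >= 2^130) ? g : h        @ the "tst r7,#4" / movne select
@	mac = (h + nonce) mod 2^128     @ stored little-endian
@
@ where nonce is the 16-byte value pointed to by r2.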
#if __ARM_MAX_ARCH__>=7
.fpu neon

.type poly1305_init_neon,%function
.align 5
poly1305_init_neon:
	ldr r4,[r0,#20] @ load key base 2^32
	ldr r5,[r0,#24]
	ldr r6,[r0,#28]
	ldr r7,[r0,#32]

	and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
	mov r3,r4,lsr#26
	mov r4,r5,lsr#20
	orr r3,r3,r5,lsl#6
	mov r5,r6,lsr#14
	orr r4,r4,r6,lsl#12
	mov r6,r7,lsr#8
	orr r5,r5,r7,lsl#18
	and r3,r3,#0x03ffffff
	and r4,r4,#0x03ffffff
	and r5,r5,#0x03ffffff

	vdup.32 d0,r2 @ r^1 in both lanes
	add r2,r3,r3,lsl#2 @ *5
	vdup.32 d1,r3
	add r3,r4,r4,lsl#2
	vdup.32 d2,r2
	vdup.32 d3,r4
	add r4,r5,r5,lsl#2
	vdup.32 d4,r3
	vdup.32 d5,r5
	add r5,r6,r6,lsl#2
	vdup.32 d6,r4
	vdup.32 d7,r6
	vdup.32 d8,r5

	mov r5,#2 @ counter

.Lsquare_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4

	vmull.u32 q5,d0,d0[1]
	vmull.u32 q6,d1,d0[1]
	vmull.u32 q7,d3,d0[1]
	vmull.u32 q8,d5,d0[1]
	vmull.u32 q9,d7,d0[1]

	vmlal.u32 q5,d7,d2[1]
	vmlal.u32 q6,d0,d1[1]
	vmlal.u32 q7,d1,d1[1]
	vmlal.u32 q8,d3,d1[1]
	vmlal.u32 q9,d5,d1[1]

	vmlal.u32 q5,d5,d4[1]
	vmlal.u32 q6,d7,d4[1]
	vmlal.u32 q8,d1,d3[1]
	vmlal.u32 q7,d0,d3[1]
	vmlal.u32 q9,d3,d3[1]

	vmlal.u32 q5,d3,d6[1]
	vmlal.u32 q8,d0,d5[1]
	vmlal.u32 q6,d5,d6[1]
	vmlal.u32 q7,d7,d6[1]
	vmlal.u32 q9,d1,d5[1]

	vmlal.u32 q8,d7,d8[1]
	vmlal.u32 q5,d1,d8[1]
	vmlal.u32 q6,d3,d8[1]
	vmlal.u32 q7,d5,d8[1]
	vmlal.u32 q9,d0,d7[1]

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@ and P. Schwabe
@
@ H0>>+H1>>+H2>>+H3>>+H4
@ H3>>+H4>>*5+H0>>+H1
@
@ Trivia.
@
@ Result of multiplication of n-bit number by m-bit number is
@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
@ m-bit number multiplied by 2^n is still n+m bits wide.
@
@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
@ one is n+1 bits wide.
@
@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
@ can be 27. However! In cases when their width exceeds 26 bits
@ they are limited by 2^26+2^6. This in turn means that *sum*
@ of the products with these values can still be viewed as sum
@ of 52-bit numbers as long as the amount of addends is not a
@ power of 2. For example,
@
@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
@
@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
@ 8 * (2^52) or 2^55. However, the value is then multiplied
@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
@ which is less than 32 * (2^52) or 2^57. And when processing
@ data we are looking at triple as many addends...
@
@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
@ instruction accepts 2x32-bit input and writes 2x64-bit result.
@ This means that result of reduction has to be compressed upon
@ loop wrap-around. This can be done in the process of reduction
@ to minimize amount of instructions [as well as amount of
@ 128-bit instructions, which benefits low-end processors], but
@ one has to watch for H2 (which is narrower than H0) and 5*H4
@ not being wider than 58 bits, so that result of right shift
@ by 26 bits fits in 32 bits. This is also useful on x86,
@ because it allows one to use paddd in place of paddq, which
@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64 q15,q8,#26
	vmovn.i64 d16,q8
	vshr.u64 q4,q5,#26
	vmovn.i64 d10,q5
	vadd.i64 q9,q9,q15 @ h3 -> h4
	vbic.i32 d16,#0xfc000000 @ &=0x03ffffff
	vadd.i64 q6,q6,q4 @ h0 -> h1
	vbic.i32 d10,#0xfc000000

	vshrn.u64 d30,q9,#26
	vmovn.i64 d18,q9
	vshr.u64 q4,q6,#26
	vmovn.i64 d12,q6
	vadd.i64 q7,q7,q4 @ h1 -> h2
	vbic.i32 d18,#0xfc000000
	vbic.i32 d12,#0xfc000000

	vadd.i32 d10,d10,d30
	vshl.u32 d30,d30,#2
	vshrn.u64 d8,q7,#26
	vmovn.i64 d14,q7
	vadd.i32 d10,d10,d30 @ h4 -> h0
	vadd.i32 d16,d16,d8 @ h2 -> h3
	vbic.i32 d14,#0xfc000000

	vshr.u32 d30,d10,#26
	vbic.i32 d10,#0xfc000000
	vshr.u32 d8,d16,#26
	vbic.i32 d16,#0xfc000000
	vadd.i32 d12,d12,d30 @ h0 -> h1
	vadd.i32 d18,d18,d8 @ h3 -> h4

	subs r5,r5,#1
	beq .Lsquare_break_neon

	add r6,r0,#(48+0*9*4)
	add r7,r0,#(48+1*9*4)

	vtrn.32 d0,d10 @ r^2:r^1
	vtrn.32 d3,d14
	vtrn.32 d5,d16
	vtrn.32 d1,d12
	vtrn.32 d7,d18

	vshl.u32 d4,d3,#2 @ *5
	vshl.u32 d6,d5,#2
	vshl.u32 d2,d1,#2
	vshl.u32 d8,d7,#2
	vadd.i32 d4,d4,d3
	vadd.i32 d2,d2,d1
	vadd.i32 d6,d6,d5
	vadd.i32 d8,d8,d7

	vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32 {d8[0]},[r6,:32]
	vst1.32 {d8[1]},[r7,:32]

	b .Lsquare_neon

.align 4
.Lsquare_break_neon:
	add r6,r0,#(48+2*4*9)
	add r7,r0,#(48+3*4*9)

	vmov d0,d10 @ r^4:r^3
	vshl.u32 d2,d12,#2 @ *5
	vmov d1,d12
	vshl.u32 d4,d14,#2
	vmov d3,d14
	vshl.u32 d6,d16,#2
	vmov d5,d16
	vshl.u32 d8,d18,#2
	vmov d7,d18
	vadd.i32 d2,d2,d12
	vadd.i32 d4,d4,d14
	vadd.i32 d6,d6,d16
	vadd.i32 d8,d8,d18

	vst4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32 {d8[0]},[r6]
	vst1.32 {d8[1]},[r7]

	bx lr @ bx lr
.size poly1305_init_neon,.-poly1305_init_neon

.type poly1305_blocks_neon,%function
.align 5
poly1305_blocks_neon:
	ldr ip,[r0,#36] @ is_base2_26
	ands r2,r2,#-16
	beq .Lno_data_neon

	cmp r2,#64
	bhs .Lenter_neon
	tst ip,ip @ is_base2_26?
	beq .Lpoly1305_blocks

.Lenter_neon:
	stmdb sp!,{r4,r5,r6,r7}
	vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so

	tst ip,ip @ is_base2_26?
	bne .Lbase2_26_neon

	stmdb sp!,{r1,r2,r3,lr}
	bl poly1305_init_neon

	ldr r4,[r0,#0] @ load hash value base 2^32
	ldr r5,[r0,#4]
	ldr r6,[r0,#8]
	ldr r7,[r0,#12]
	ldr ip,[r0,#16]

	and r2,r4,#0x03ffffff @ base 2^32 -> base 2^26
	mov r3,r4,lsr#26
	veor d10,d10,d10
	mov r4,r5,lsr#20
	orr r3,r3,r5,lsl#6
	veor d12,d12,d12
	mov r5,r6,lsr#14
	orr r4,r4,r6,lsl#12
	veor d14,d14,d14
	mov r6,r7,lsr#8
	orr r5,r5,r7,lsl#18
	veor d16,d16,d16
	and r3,r3,#0x03ffffff
	orr r6,r6,ip,lsl#24
	veor d18,d18,d18
	and r4,r4,#0x03ffffff
	mov r1,#1
	and r5,r5,#0x03ffffff
	str r1,[r0,#36] @ is_base2_26

	vmov.32 d10[0],r2
	vmov.32 d12[0],r3
	vmov.32 d14[0],r4
	vmov.32 d16[0],r5
	vmov.32 d18[0],r6
	adr r5,.Lzeros

	ldmia sp!,{r1,r2,r3,lr}
	b .Lbase2_32_neon

.align 4
.Lbase2_26_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ load hash value

	veor d10,d10,d10
	veor d12,d12,d12
	veor d14,d14,d14
	veor d16,d16,d16
	veor d18,d18,d18
	vld4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr r5,.Lzeros
	vld1.32 {d18[0]},[r0]
	sub r0,r0,#16 @ rewind

.Lbase2_32_neon:
	add r4,r1,#32
	mov r3,r3,lsl#24
	tst r2,#31
	beq .Leven

	vld4.32 {d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32 d28[0],r3
	sub r2,r2,#16
	add r4,r1,#32

# ifdef __ARMEB__
	vrev32.8 q10,q10
	vrev32.8 q13,q13
	vrev32.8 q11,q11
	vrev32.8 q12,q12
# endif
	vsri.u32 d28,d26,#8 @ base 2^32 -> base 2^26
	vshl.u32 d26,d26,#18

	vsri.u32 d26,d24,#14
	vshl.u32 d24,d24,#12
	vadd.i32 d29,d28,d18 @ add hash value and move to #hi

	vbic.i32 d26,#0xfc000000
	vsri.u32 d24,d22,#20
	vshl.u32 d22,d22,#6

	vbic.i32 d24,#0xfc000000
	vsri.u32 d22,d20,#26
	vadd.i32 d27,d26,d16

	vbic.i32 d20,#0xfc000000
	vbic.i32 d22,#0xfc000000
	vadd.i32 d25,d24,d14

	vadd.i32 d21,d20,d10
	vadd.i32 d23,d22,d12

	mov r7,r5
	add r6,r0,#48

	cmp r2,r2
	b .Long_tail

.align 4
.Leven:
	subs r2,r2,#64
	it lo
	movlo r4,r5

	vmov.i32 q14,#1<<24 @ padbit, yes, always
	vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
	add r1,r1,#64
	vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
	add r4,r4,#64
	itt hi
	addhi r7,r0,#(48+1*9*4)
	addhi r6,r0,#(48+3*9*4)

# ifdef __ARMEB__
	vrev32.8 q10,q10
	vrev32.8 q13,q13
	vrev32.8 q11,q11
	vrev32.8 q12,q12
# endif
	vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
	vshl.u32 q13,q13,#18

	vsri.u32 q13,q12,#14
	vshl.u32 q12,q12,#12

	vbic.i32 q13,#0xfc000000
	vsri.u32 q12,q11,#20
	vshl.u32 q11,q11,#6

	vbic.i32 q12,#0xfc000000
	vsri.u32 q11,q10,#26

	vbic.i32 q10,#0xfc000000
	vbic.i32 q11,#0xfc000000

	bls .Lskip_loop

	vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^2
	vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4
	vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	b .Loop_neon

.align 5
.Loop_neon:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
@ ___________________/
@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
@ ___________________/ ____________________/
@
@ Note that we start with inp[2:3]*r^2. This is because it
@ doesn't depend on reduction in previous iteration.
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ inp[2:3]*r^2

	vadd.i32 d24,d24,d14 @ accumulate inp[0:1]
	vmull.u32 q7,d25,d0[1]
	vadd.i32 d20,d20,d10
	vmull.u32 q5,d21,d0[1]
	vadd.i32 d26,d26,d16
	vmull.u32 q8,d27,d0[1]
	vmlal.u32 q7,d23,d1[1]
	vadd.i32 d22,d22,d12
	vmull.u32 q6,d23,d0[1]

	vadd.i32 d28,d28,d18
	vmull.u32 q9,d29,d0[1]
	subs r2,r2,#64
	vmlal.u32 q5,d29,d2[1]
	it lo
	movlo r4,r5
	vmlal.u32 q8,d25,d1[1]
	vld1.32 d8[1],[r7,:32]
	vmlal.u32 q6,d21,d1[1]
	vmlal.u32 q9,d27,d1[1]

	vmlal.u32 q5,d27,d4[1]
	vmlal.u32 q8,d23,d3[1]
	vmlal.u32 q9,d25,d3[1]
	vmlal.u32 q6,d29,d4[1]
	vmlal.u32 q7,d21,d3[1]

	vmlal.u32 q8,d21,d5[1]
	vmlal.u32 q5,d25,d6[1]
	vmlal.u32 q9,d23,d5[1]
	vmlal.u32 q6,d27,d6[1]
	vmlal.u32 q7,d29,d6[1]

	vmlal.u32 q8,d29,d8[1]
	vmlal.u32 q5,d23,d8[1]
	vmlal.u32 q9,d21,d7[1]
	vmlal.u32 q6,d25,d8[1]
	vmlal.u32 q7,d27,d8[1]

	vld4.32 {d21,d23,d25,d27},[r4] @ inp[2:3] (or 0)
	add r4,r4,#64

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32 q8,d26,d0[0]
	vmlal.u32 q5,d20,d0[0]
	vmlal.u32 q9,d28,d0[0]
	vmlal.u32 q6,d22,d0[0]
	vmlal.u32 q7,d24,d0[0]
	vld1.32 d8[0],[r6,:32]

	vmlal.u32 q8,d24,d1[0]
	vmlal.u32 q5,d28,d2[0]
	vmlal.u32 q9,d26,d1[0]
	vmlal.u32 q6,d20,d1[0]
	vmlal.u32 q7,d22,d1[0]

	vmlal.u32 q8,d22,d3[0]
	vmlal.u32 q5,d26,d4[0]
	vmlal.u32 q9,d24,d3[0]
	vmlal.u32 q6,d28,d4[0]
	vmlal.u32 q7,d20,d3[0]

	vmlal.u32 q8,d20,d5[0]
	vmlal.u32 q5,d24,d6[0]
	vmlal.u32 q9,d22,d5[0]
	vmlal.u32 q6,d26,d6[0]
	vmlal.u32 q8,d28,d8[0]

	vmlal.u32 q7,d28,d6[0]
	vmlal.u32 q5,d22,d8[0]
	vmlal.u32 q9,d20,d7[0]
	vmov.i32 q14,#1<<24 @ padbit, yes, always
	vmlal.u32 q6,d24,d8[0]
	vmlal.u32 q7,d26,d8[0]

	vld4.32 {d20,d22,d24,d26},[r1] @ inp[0:1]
	add r1,r1,#64
# ifdef __ARMEB__
	vrev32.8 q10,q10
	vrev32.8 q11,q11
	vrev32.8 q12,q12
	vrev32.8 q13,q13
# endif

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction interleaved with base 2^32 -> base 2^26 of
@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
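@ In scalar terms the lazy-reduction schedule used below (and in
@ poly1305_init_neon above) is, with M = 0x03ffffff and c the carry
@ being moved (the h4 carry re-enters h0 multiplied by 5 because
@ 2^130 = 5 mod 2^130-5):
@
@	c = h3>>26; h3 &= M; h4 += c;    c = h0>>26; h0 &= M; h1 += c;
@	c = h4>>26; h4 &= M; h0 += 5*c;  c = h1>>26; h1 &= M; h2 += c;
@	                                 c = h2>>26; h2 &= M; h3 += c;
@	c = h0>>26; h0 &= M; h1 += c;    c = h3>>26; h3 &= M; h4 += c;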

	vshr.u64 q15,q8,#26
	vmovn.i64 d16,q8
	vshr.u64 q4,q5,#26
	vmovn.i64 d10,q5
	vadd.i64 q9,q9,q15 @ h3 -> h4
	vbic.i32 d16,#0xfc000000
	vsri.u32 q14,q13,#8 @ base 2^32 -> base 2^26
	vadd.i64 q6,q6,q4 @ h0 -> h1
	vshl.u32 q13,q13,#18
	vbic.i32 d10,#0xfc000000

	vshrn.u64 d30,q9,#26
	vmovn.i64 d18,q9
	vshr.u64 q4,q6,#26
	vmovn.i64 d12,q6
	vadd.i64 q7,q7,q4 @ h1 -> h2
	vsri.u32 q13,q12,#14
	vbic.i32 d18,#0xfc000000
	vshl.u32 q12,q12,#12
	vbic.i32 d12,#0xfc000000

	vadd.i32 d10,d10,d30
	vshl.u32 d30,d30,#2
	vbic.i32 q13,#0xfc000000
	vshrn.u64 d8,q7,#26
	vmovn.i64 d14,q7
	vaddl.u32 q5,d10,d30 @ h4 -> h0 [widen for a sec]
	vsri.u32 q12,q11,#20
	vadd.i32 d16,d16,d8 @ h2 -> h3
	vshl.u32 q11,q11,#6
	vbic.i32 d14,#0xfc000000
	vbic.i32 q12,#0xfc000000

	vshrn.u64 d30,q5,#26 @ re-narrow
	vmovn.i64 d10,q5
	vsri.u32 q11,q10,#26
	vbic.i32 q10,#0xfc000000
	vshr.u32 d8,d16,#26
	vbic.i32 d16,#0xfc000000
	vbic.i32 d10,#0xfc000000
	vadd.i32 d12,d12,d30 @ h0 -> h1
	vadd.i32 d18,d18,d8 @ h3 -> h4
	vbic.i32 q11,#0xfc000000

	bhi .Loop_neon

.Lskip_loop:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add r7,r0,#(48+0*9*4)
	add r6,r0,#(48+1*9*4)
	adds r2,r2,#32
	it ne
	movne r2,#0
	bne .Long_tail

	vadd.i32 d25,d24,d14 @ add hash value and move to #hi
	vadd.i32 d21,d20,d10
	vadd.i32 d27,d26,d16
	vadd.i32 d23,d22,d12
	vadd.i32 d29,d28,d18

.Long_tail:
	vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^1
	vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^2

	vadd.i32 d24,d24,d14 @ can be redundant
	vmull.u32 q7,d25,d0
	vadd.i32 d20,d20,d10
	vmull.u32 q5,d21,d0
	vadd.i32 d26,d26,d16
	vmull.u32 q8,d27,d0
	vadd.i32 d22,d22,d12
	vmull.u32 q6,d23,d0
	vadd.i32 d28,d28,d18
	vmull.u32 q9,d29,d0

	vmlal.u32 q5,d29,d2
	vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32 q8,d25,d1
	vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32 q6,d21,d1
	vmlal.u32 q9,d27,d1
	vmlal.u32 q7,d23,d1

	vmlal.u32 q8,d23,d3
	vld1.32 d8[1],[r7,:32]
	vmlal.u32 q5,d27,d4
	vld1.32 d8[0],[r6,:32]
	vmlal.u32 q9,d25,d3
	vmlal.u32 q6,d29,d4
	vmlal.u32 q7,d21,d3

	vmlal.u32 q8,d21,d5
	it ne
	addne r7,r0,#(48+2*9*4)
	vmlal.u32 q5,d25,d6
	it ne
	addne r6,r0,#(48+3*9*4)
	vmlal.u32 q9,d23,d5
	vmlal.u32 q6,d27,d6
	vmlal.u32 q7,d29,d6

	vmlal.u32 q8,d29,d8
	vorn q0,q0,q0 @ all-ones, can be redundant
	vmlal.u32 q5,d23,d8
	vshr.u64 q0,q0,#38
	vmlal.u32 q9,d21,d7
	vmlal.u32 q6,d25,d8
	vmlal.u32 q7,d27,d8

	beq .Lshort_tail

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32 {d0[1],d1[1],d2[1],d3[1]},[r7]! @ load r^3
	vld4.32 {d0[0],d1[0],d2[0],d3[0]},[r6]! @ load r^4

	vmlal.u32 q7,d24,d0
	vmlal.u32 q5,d20,d0
	vmlal.u32 q8,d26,d0
	vmlal.u32 q6,d22,d0
	vmlal.u32 q9,d28,d0

	vmlal.u32 q5,d28,d2
	vld4.32 {d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32 q8,d24,d1
	vld4.32 {d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32 q6,d20,d1
	vmlal.u32 q9,d26,d1
	vmlal.u32 q7,d22,d1

	vmlal.u32 q8,d22,d3
	vld1.32 d8[1],[r7,:32]
	vmlal.u32 q5,d26,d4
	vld1.32 d8[0],[r6,:32]
	vmlal.u32 q9,d24,d3
	vmlal.u32 q6,d28,d4
	vmlal.u32 q7,d20,d3

	vmlal.u32 q8,d20,d5
	vmlal.u32 q5,d24,d6
	vmlal.u32 q9,d22,d5
	vmlal.u32 q6,d26,d6
	vmlal.u32 q7,d28,d6

	vmlal.u32 q8,d28,d8
	vorn q0,q0,q0 @ all-ones
	vmlal.u32 q5,d22,d8
	vshr.u64 q0,q0,#38
	vmlal.u32 q9,d20,d7
	vmlal.u32 q6,d24,d8
	vmlal.u32 q7,d26,d8

.Lshort_tail:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ horizontal addition

	vadd.i64 d16,d16,d17
	vadd.i64 d10,d10,d11
	vadd.i64 d18,d18,d19
	vadd.i64 d12,d12,d13
	vadd.i64 d14,d14,d15

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ lazy reduction, but without narrowing

	vshr.u64 q15,q8,#26
	vand.i64 q8,q8,q0
	vshr.u64 q4,q5,#26
	vand.i64 q5,q5,q0
	vadd.i64 q9,q9,q15 @ h3 -> h4
	vadd.i64 q6,q6,q4 @ h0 -> h1

	vshr.u64 q15,q9,#26
	vand.i64 q9,q9,q0
	vshr.u64 q4,q6,#26
	vand.i64 q6,q6,q0
	vadd.i64 q7,q7,q4 @ h1 -> h2

	vadd.i64 q5,q5,q15
	vshl.u64 q15,q15,#2
	vshr.u64 q4,q7,#26
	vand.i64 q7,q7,q0
	vadd.i64 q5,q5,q15 @ h4 -> h0
	vadd.i64 q8,q8,q4 @ h2 -> h3

	vshr.u64 q15,q5,#26
	vand.i64 q5,q5,q0
	vshr.u64 q4,q8,#26
	vand.i64 q8,q8,q0
	vadd.i64 q6,q6,q15 @ h0 -> h1
	vadd.i64 q9,q9,q4 @ h3 -> h4

	cmp r2,#0
	bne .Leven

@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@ store hash value

	vst4.32 {d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32 {d18[0]},[r0]

	vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ epilogue
	ldmia sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx lr @ bx lr
.size poly1305_blocks_neon,.-poly1305_blocks_neon

.type poly1305_emit_neon,%function
.align 5
poly1305_emit_neon:
	ldr ip,[r0,#36] @ is_base2_26

	stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst ip,ip
	beq .Lpoly1305_emit_enter

	ldmia r0,{r3,r4,r5,r6,r7}
	eor r8,r8,r8

	adds r3,r3,r4,lsl#26 @ base 2^26 -> base 2^32
	mov r4,r4,lsr#6
	adcs r4,r4,r5,lsl#20
	mov r5,r5,lsr#12
	adcs r5,r5,r6,lsl#14
	mov r6,r6,lsr#18
	adcs r6,r6,r7,lsl#8
	adc r7,r8,r7,lsr#24 @ can be partially reduced ...

	and r8,r7,#-4 @ ... so reduce
	and r7,r7,#3
	add r8,r8,r8,lsr#2 @ *= 5
	adds r3,r3,r8
	adcs r4,r4,#0
	adcs r5,r5,#0
	adcs r6,r6,#0
	adc r7,r7,#0

	adds r8,r3,#5 @ compare to modulus
	adcs r9,r4,#0
	adcs r10,r5,#0
	adcs r11,r6,#0
	adc r7,r7,#0
	tst r7,#4 @ did it carry/borrow?

	it ne
	movne r3,r8
	ldr r8,[r2,#0]
	it ne
	movne r4,r9
	ldr r9,[r2,#4]
	it ne
	movne r5,r10
	ldr r10,[r2,#8]
	it ne
	movne r6,r11
	ldr r11,[r2,#12]

	adds r3,r3,r8 @ accumulate nonce
	adcs r4,r4,r9
	adcs r5,r5,r10
	adc r6,r6,r11

# ifdef __ARMEB__
	rev r3,r3
	rev r4,r4
	rev r5,r5
	rev r6,r6
# endif
	str r3,[r1,#0] @ store the result
	str r4,[r1,#4]
	str r5,[r1,#8]
	str r6,[r1,#12]

	ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx lr @ bx lr
.size poly1305_emit_neon,.-poly1305_emit_neon
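@ The adds/adcs ladder at the top of poly1305_emit_neon evaluates
@
@	h = h0 + h1*2^26 + h2*2^52 + h3*2^78 + h4*2^104
@
@ i.e. base 2^26 -> base 2^32, as four 32-bit words plus a top word,
@ then applies one round of h = (h mod 2^130) + 5*(h>>130) before the
@ same compare-against-the-modulus and nonce addition as in
@ poly1305_emit above.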

.align 5
.Lzeros:
.long 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
.word OPENSSL_armcap_P-.Lpoly1305_init
#endif
.byte 80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 2
.align 2
#if __ARM_MAX_ARCH__>=7
.comm OPENSSL_armcap_P,4,4
#endif
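@ For reference, the C prototypes these entry points are expected to
@ be called with (taken from OpenSSL's poly1305.c glue, not from this
@ file, so treat them as a sketch):
@
@	int  poly1305_init(void *ctx, const unsigned char key[16],
@	                   void *func[2]);
@	void poly1305_blocks(void *ctx, const unsigned char *inp,
@	                     size_t len, unsigned int padbit);
@	void poly1305_emit(void *ctx, unsigned char mac[16],
@	                   const unsigned int nonce[4]);
@
@ When built with __ARM_MAX_ARCH__>=7, poly1305_init fills func[] with
@ the blocks/emit routines (the NEON variants if ARMV7_NEON is set in
@ OPENSSL_armcap_P) and returns 1; otherwise it returns 0.  A NULL key
@ only zeroes the hash state and also returns 0.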