#!/usr/bin/env perl
# SPDX-License-Identifier: GPL-1.0+ OR BSD-3-Clause
#
# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for the OpenSSL
# project.
# ====================================================================
#
#			IALU(*)/gcc-4.4		NEON
#
# ARM11xx(ARMv6)	7.78/+100%		-
# Cortex-A5		6.35/+130%		3.00
# Cortex-A8		6.25/+115%		2.36
# Cortex-A9		5.10/+95%		2.55
# Cortex-A15		3.85/+85%		1.25(**)
# Snapdragon S4		5.70/+100%		1.48(**)
#
# (*)	this is for -march=armv6, i.e. with a bunch of ldrb loading data;
# (**)	these are trade-off results, they can be improved by ~8% but at
#	the cost of 15/12% regression on Cortex-A5/A7, it's even possible
#	to improve Cortex-A9 result, but then A5/A7 lose more than 20%;

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

($ctx,$inp,$len,$padbit)=map("r$_",(0..3));

$code.=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ __LINUX_ARM_ARCH__
# define poly1305_init   poly1305_block_init
# define poly1305_blocks poly1305_blocks_arm
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4-r11}

	eor	r3,r3,r3
	cmp	$inp,#0
	str	r3,[$ctx,#0]		@ zero hash value
	str	r3,[$ctx,#4]
	str	r3,[$ctx,#8]
	str	r3,[$ctx,#12]
	str	r3,[$ctx,#16]
	str	r3,[$ctx,#36]		@ clear is_base2_26
	add	$ctx,$ctx,#20

#ifdef __thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if __ARM_MAX_ARCH__>=7
	mov	r3,#-1
	str	r3,[$ctx,#28]		@ impossible key power value
# ifndef __KERNEL__
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
# endif
#endif
	ldrb	r4,[$inp,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[$inp,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[$inp,#2]
	ldrb	r7,[$inp,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[$inp,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[$inp,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[$inp,#6]
	and	r4,r4,r10

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[$inp,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[$inp,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[$inp,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[$inp,#10]
	and	r5,r5,r3

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef __thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	it	ne
	movne	r11,r9
	adr	r12,.Lpoly1305_emit
	orr	r11,r11,#1		@ thumb-ify addresses
	orr	r12,r12,#1
# else
	add	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
	ite	eq
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[$inp,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[$inp,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[$inp,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[$inp,#14]
	and	r6,r6,r3

	ldrb	r10,[$inp,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[$ctx,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[$ctx,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[$ctx,#8]
	and	r7,r7,r3
	str	r7,[$ctx,#12]
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
___
{
my ($h0,$h1,$h2,$h3,$h4,$r0,$r1,$r2,$r3)=map("r$_",(4..12));
my ($s1,$s2,$s3)=($r1,$r2,$r3);

$code.=<<___;
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3-r11,lr}

	ands	$len,$len,#-16
	beq	.Lno_data

	add	$len,$len,$inp		@ end pointer
	sub	sp,sp,#32

#if __ARM_ARCH__<7
	ldmia	$ctx,{$h0-$r3}		@ load context
	add	$ctx,$ctx,#20
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]
#else
	ldr	lr,[$ctx,#36]		@ is_base2_26
	ldmia	$ctx!,{$h0-$h4}		@ load hash value
	str	$len,[sp,#16]		@ offload stuff
	str	$ctx,[sp,#12]

	adds	$r0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$r1,$h1,lsr#6
	adcs	$r1,$r1,$h2,lsl#20
	mov	$r2,$h2,lsr#12
	adcs	$r2,$r2,$h3,lsl#14
	mov	$r3,$h3,lsr#18
	adcs	$r3,$r3,$h4,lsl#8
	mov	$len,#0
	teq	lr,#0
	str	$len,[$ctx,#16]		@ clear is_base2_26
	adc	$len,$len,$h4,lsr#24

	itttt	ne
	movne	$h0,$r0			@ choose between radixes
	movne	$h1,$r1
	movne	$h2,$r2
	movne	$h3,$r3
	ldmia	$ctx,{$r0-$r3}		@ load key
	it	ne
	movne	$h4,$len
#endif

	mov	lr,$inp
	cmp	$padbit,#0
	str	$r1,[sp,#20]
	str	$r2,[sp,#24]
	str	$r3,[sp,#28]
	b	.Loop

.align	4
.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef __thumb2__
	it	hi
# endif
	addhi	$h4,$h4,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	$h0,$h0,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	$h1,$h1,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	$h2,$h2,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	$s1,$r1,$r1,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
	it	hi
	addhi	$h4,$h4,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef __ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	$h0,$h0,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	$h1,$h1,r1
	add	$s1,$r1,$r1,lsr#2
	adcs	$h2,$h2,r2
#endif
	add	$s2,$r2,$r2,lsr#2
	adcs	$h3,$h3,r3
	add	$s3,$r3,$r3,lsr#2

	umull	r2,r3,$h1,$r0
	adc	$h4,$h4,#0
	umull	r0,r1,$h0,$r0
	umlal	r2,r3,$h4,$s1
	umlal	r0,r1,$h3,$s1
	ldr	$r1,[sp,#20]		@ reload $r1
	umlal	r2,r3,$h2,$s3
	umlal	r0,r1,$h1,$s3
	umlal	r2,r3,$h3,$s2
	umlal	r0,r1,$h2,$s2
	umlal	r2,r3,$h0,$r1
	str	r0,[sp,#0]		@ future $h0
	mul	r0,$s2,$h4
	ldr	$r2,[sp,#24]		@ reload $r2
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future $h2
	str	r2,[sp,#4]		@ future $h1

	mul	r2,$s3,$h4
	eor	r3,r3,r3
	umlal	r0,r1,$h3,$s3
	ldr	$r3,[sp,#28]		@ reload $r3
	umlal	r2,r3,$h3,$r0
	umlal	r0,r1,$h2,$r0
	umlal	r2,r3,$h2,$r1
	umlal	r0,r1,$h1,$r1
	umlal	r2,r3,$h1,$r2
	umlal	r0,r1,$h0,$r2
	umlal	r2,r3,$h0,$r3
	ldr	$h0,[sp,#0]
	mul	$h4,$r0,$h4
	ldr	$h1,[sp,#4]

	adds	$h2,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	$h3,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	$h4,$h4,r3		@ h4+=d3>>32

	and	r1,$h4,#-4
	and	$h4,$h4,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	$h0,$h0,r1
	adcs	$h1,$h1,#0
	adcs	$h2,$h2,#0
	adcs	$h3,$h3,#0
	adc	$h4,$h4,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	$ctx,[sp,#12]
	add	sp,sp,#32
	stmdb	$ctx,{$h0-$h4}		@ store the result

.Lno_data:
#if __ARM_ARCH__>=5
	ldmia	sp!,{r3-r11,pc}
#else
	ldmia	sp!,{r3-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
___
}
{
my ($ctx,$mac,$nonce)=map("r$_",(0..2));
my ($h0,$h1,$h2,$h3,$h4,$g0,$g1,$g2,$g3)=map("r$_",(3..11));
my $g4=$ctx;

$code.=<<___;
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4-r11}

	ldmia	$ctx,{$h0-$h4}

#if __ARM_ARCH__>=7
	ldr	ip,[$ctx,#36]		@ is_base2_26

	adds	$g0,$h0,$h1,lsl#26	@ base 2^26 -> base 2^32
	mov	$g1,$h1,lsr#6
	adcs	$g1,$g1,$h2,lsl#20
	mov	$g2,$h2,lsr#12
	adcs	$g2,$g2,$h3,lsl#14
	mov	$g3,$h3,lsr#18
	adcs	$g3,$g3,$h4,lsl#8
	mov	$g4,#0
	adc	$g4,$g4,$h4,lsr#24

	tst	ip,ip
	itttt	ne
	movne	$h0,$g0
	movne	$h1,$g1
	movne	$h2,$g2
	movne	$h3,$g3
	it	ne
	movne	$h4,$g4
#endif

	adds	$g0,$h0,#5		@ compare to modulus
	adcs	$g1,$h1,#0
	adcs	$g2,$h2,#0
	adcs	$g3,$h3,#0
	adc	$g4,$h4,#0
	tst	$g4,#4			@ did it carry/borrow?
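	@ $h4 holds only the topmost bits of the 130-bit accumulator, so
	@ adding 5 carries into bit 130 (bit 2 of $g4) exactly when the
	@ accumulator is >= 2^130-5. The tst above captures that carry;
	@ the conditional moves below then select the reduced copy in
	@ $g0-$g3, otherwise the original $h0-$h3 is kept.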

#ifdef __thumb2__
	it	ne
#endif
	movne	$h0,$g0
	ldr	$g0,[$nonce,#0]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h1,$g1
	ldr	$g1,[$nonce,#4]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h2,$g2
	ldr	$g2,[$nonce,#8]
#ifdef __thumb2__
	it	ne
#endif
	movne	$h3,$g3
	ldr	$g3,[$nonce,#12]

	adds	$h0,$h0,$g0
	adcs	$h1,$h1,$g1
	adcs	$h2,$h2,$g2
	adc	$h3,$h3,$g3

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	$h0,$h0
	rev	$h1,$h1
	rev	$h2,$h2
	rev	$h3,$h3
# endif
	str	$h0,[$mac,#0]
	str	$h1,[$mac,#4]
	str	$h2,[$mac,#8]
	str	$h3,[$mac,#12]
#else
	strb	$h0,[$mac,#0]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#4]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#8]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#12]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#1]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#5]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#9]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#13]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#2]
	mov	$h0,$h0,lsr#8
	strb	$h1,[$mac,#6]
	mov	$h1,$h1,lsr#8
	strb	$h2,[$mac,#10]
	mov	$h2,$h2,lsr#8
	strb	$h3,[$mac,#14]
	mov	$h3,$h3,lsr#8

	strb	$h0,[$mac,#3]
	strb	$h1,[$mac,#7]
	strb	$h2,[$mac,#11]
	strb	$h3,[$mac,#15]
#endif
	ldmia	sp!,{r4-r11}
#if __ARM_ARCH__>=5
	ret				@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
___
{
my ($R0,$R1,$S1,$R2,$S2,$R3,$S3,$R4,$S4) = map("d$_",(0..9));
my ($D0,$D1,$D2,$D3,$D4, $H0,$H1,$H2,$H3,$H4) = map("q$_",(5..14));
my ($T0,$T1,$MASK) = map("q$_",(15,4,0));

my ($in2,$zeros,$tbl0,$tbl1) = map("r$_",(4..7));

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
.Lpoly1305_init_neon:
	ldr	r3,[$ctx,#48]		@ first table element
	cmp	r3,#-1			@ is value impossible?
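	@ poly1305_init marks the key-power table as not yet computed by
	@ storing -1 (an impossible value for a key power limb) in its
	@ first word; any other value means r^1..r^4 and the matching
	@ 5*r multiples are already cached, so the squaring below is
	@ skipped via the branch that follows.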
	bne	.Lno_init_neon

	ldr	r4,[$ctx,#20]		@ load key base 2^32
	ldr	r5,[$ctx,#24]
	ldr	r6,[$ctx,#28]
	ldr	r7,[$ctx,#32]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	$R0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	$R1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	$S1,r2
	vdup.32	$R2,r4
	add	r4,r5,r5,lsl#2
	vdup.32	$S2,r3
	vdup.32	$R3,r5
	add	r5,r6,r6,lsl#2
	vdup.32	$S3,r4
	vdup.32	$R4,r6
	vdup.32	$S4,r5

	mov	$zeros,#2		@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	$D0,$R0,${R0}[1]
	vmull.u32	$D1,$R1,${R0}[1]
	vmull.u32	$D2,$R2,${R0}[1]
	vmull.u32	$D3,$R3,${R0}[1]
	vmull.u32	$D4,$R4,${R0}[1]

	vmlal.u32	$D0,$R4,${S1}[1]
	vmlal.u32	$D1,$R0,${R1}[1]
	vmlal.u32	$D2,$R1,${R1}[1]
	vmlal.u32	$D3,$R2,${R1}[1]
	vmlal.u32	$D4,$R3,${R1}[1]

	vmlal.u32	$D0,$R3,${S2}[1]
	vmlal.u32	$D1,$R4,${S2}[1]
	vmlal.u32	$D3,$R1,${R2}[1]
	vmlal.u32	$D2,$R0,${R2}[1]
	vmlal.u32	$D4,$R2,${R2}[1]

	vmlal.u32	$D0,$R2,${S3}[1]
	vmlal.u32	$D3,$R0,${R3}[1]
	vmlal.u32	$D1,$R3,${S3}[1]
	vmlal.u32	$D2,$R4,${S3}[1]
	vmlal.u32	$D4,$R1,${R3}[1]

	vmlal.u32	$D3,$R4,${S4}[1]
	vmlal.u32	$D0,$R1,${S4}[1]
	vmlal.u32	$D1,$R2,${S4}[1]
	vmlal.u32	$D2,$R3,${S4}[1]
	vmlal.u32	$D4,$R0,${R4}[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ Result of multiplication of n-bit number by m-bit number is
	@ n+m bits wide. However! Even though 2^n is a n+1-bit number,
	@ m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
	@ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
	@ one is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are limited by 2^26+2^6. This in turn means that *sum*
	@ of the products with these values can still be viewed as sum
	@ of 52-bit numbers as long as the amount of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied by
	@ 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at triple as many addends...
	@
	@ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
	@ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
	@ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
	@ instruction accepts 2x32-bit input and writes 2x64-bit result.
	@ This means that result of reduction has to be compressed upon
	@ loop wrap-around. This can be done in the process of reduction
	@ to minimize amount of instructions [as well as amount of
	@ 128-bit instructions, which benefits low-end processors], but
	@ one has to watch for H2 (which is narrower than H0) and 5*H4
	@ not being wider than 58 bits, so that result of right shift
	@ by 26 bits fits in 32 bits. This is also useful on x86,
	@ because it allows using paddd in place of paddq, which
	@ benefits Atom, where paddq is ridiculously slow.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000	@ &=0x03ffffff
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vbic.i32	$D4#lo,#0xfc000000
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vadd.i32	$D0#lo,$D0#lo,$T0#lo	@ h4 -> h0
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vbic.i32	$D2#lo,#0xfc000000

	vshr.u32	$T0#lo,$D0#lo,#26
	vbic.i32	$D0#lo,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4

	subs	$zeros,$zeros,#1
	beq	.Lsquare_break_neon

	add	$tbl0,$ctx,#(48+0*9*4)
	add	$tbl1,$ctx,#(48+1*9*4)

	vtrn.32	$R0,$D0#lo		@ r^2:r^1
	vtrn.32	$R2,$D2#lo
	vtrn.32	$R3,$D3#lo
	vtrn.32	$R1,$D1#lo
	vtrn.32	$R4,$D4#lo

	vshl.u32	$S2,$R2,#2		@ *5
	vshl.u32	$S3,$R3,#2
	vshl.u32	$S1,$R1,#2
	vshl.u32	$S4,$R4,#2
	vadd.i32	$S2,$S2,$R2
	vadd.i32	$S1,$S1,$R1
	vadd.i32	$S3,$S3,$R3
	vadd.i32	$S4,$S4,$R4

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0,:32]
	vst1.32	{${S4}[1]},[$tbl1,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	$tbl0,$ctx,#(48+2*4*9)
	add	$tbl1,$ctx,#(48+3*4*9)

	vmov	$R0,$D0#lo		@ r^4:r^3
	vshl.u32	$S1,$D1#lo,#2		@ *5
	vmov	$R1,$D1#lo
	vshl.u32	$S2,$D2#lo,#2
	vmov	$R2,$D2#lo
	vshl.u32	$S3,$D3#lo,#2
	vmov	$R3,$D3#lo
	vshl.u32	$S4,$D4#lo,#2
	vmov	$R4,$D4#lo
	vadd.i32	$S1,$S1,$D1#lo
	vadd.i32	$S2,$S2,$D2#lo
	vadd.i32	$S3,$S3,$D3#lo
	vadd.i32	$S4,$S4,$D4#lo

	vst4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!
	vst4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!
	vst4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vst4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vst1.32	{${S4}[0]},[$tbl0]
	vst1.32	{${S4}[1]},[$tbl1]

.Lno_init_neon:
	ret				@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.globl	poly1305_blocks_neon
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[$ctx,#36]		@ is_base2_26

	cmp	$len,#64
	blo	.Lpoly1305_blocks

	stmdb	sp!,{r4-r7}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1-r3,lr}
	bl	.Lpoly1305_init_neon

	ldr	r4,[$ctx,#0]		@ load hash value base 2^32
	ldr	r5,[$ctx,#4]
	ldr	r6,[$ctx,#8]
	ldr	r7,[$ctx,#12]
	ldr	ip,[$ctx,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	$D0#lo,$D0#lo,$D0#lo
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	$D1#lo,$D1#lo,$D1#lo
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	$D2#lo,$D2#lo,$D2#lo
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	$D3#lo,$D3#lo,$D3#lo
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	$D4#lo,$D4#lo,$D4#lo
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[$ctx,#36]		@ set is_base2_26

	vmov.32	$D0#lo[0],r2
	vmov.32	$D1#lo[0],r3
	vmov.32	$D2#lo[0],r4
	vmov.32	$D3#lo[0],r5
	vmov.32	$D4#lo[0],r6
	adr	$zeros,.Lzeros

	ldmia	sp!,{r1-r3,lr}
	b	.Lhash_loaded

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	$D0#lo,$D0#lo,$D0#lo
	veor	$D1#lo,$D1#lo,$D1#lo
	veor	$D2#lo,$D2#lo,$D2#lo
	veor	$D3#lo,$D3#lo,$D3#lo
	veor	$D4#lo,$D4#lo,$D4#lo
	vld4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	adr	$zeros,.Lzeros
	vld1.32	{$D4#lo[0]},[$ctx]
	sub	$ctx,$ctx,#16		@ rewind

.Lhash_loaded:
	add	$in2,$inp,#32
	mov	$padbit,$padbit,lsl#24
	tst	$len,#31
	beq	.Leven

	vld4.32	{$H0#lo[0],$H1#lo[0],$H2#lo[0],$H3#lo[0]},[$inp]!
	vmov.32	$H4#lo[0],$padbit
	sub	$len,$len,#16
	add	$in2,$inp,#32

# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4#lo,$H3#lo,#8	@ base 2^32 -> base 2^26
	vshl.u32	$H3#lo,$H3#lo,#18

	vsri.u32	$H3#lo,$H2#lo,#14
	vshl.u32	$H2#lo,$H2#lo,#12
	vadd.i32	$H4#hi,$H4#lo,$D4#lo	@ add hash value and move to #hi

	vbic.i32	$H3#lo,#0xfc000000
	vsri.u32	$H2#lo,$H1#lo,#20
	vshl.u32	$H1#lo,$H1#lo,#6

	vbic.i32	$H2#lo,#0xfc000000
	vsri.u32	$H1#lo,$H0#lo,#26
	vadd.i32	$H3#hi,$H3#lo,$D3#lo

	vbic.i32	$H0#lo,#0xfc000000
	vbic.i32	$H1#lo,#0xfc000000
	vadd.i32	$H2#hi,$H2#lo,$D2#lo

	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo

	mov	$tbl1,$zeros
	add	$tbl0,$ctx,#48

	cmp	$len,$len
	b	.Long_tail

.align	4
.Leven:
	subs	$len,$len,#64
	it	lo
	movlo	$in2,$zeros

	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64
	itt	hi
	addhi	$tbl1,$ctx,#(48+1*9*4)
	addhi	$tbl0,$ctx,#(48+3*9*4)

# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H3,$H3
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
# endif
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vshl.u32	$H3,$H3,#18

	vsri.u32	$H3,$H2,#14
	vshl.u32	$H2,$H2,#12

	vbic.i32	$H3,#0xfc000000
	vsri.u32	$H2,$H1,#20
	vshl.u32	$H1,$H1,#6

	vbic.i32	$H2,#0xfc000000
	vsri.u32	$H1,$H0,#26

	vbic.i32	$H0,#0xfc000000
	vbic.i32	$H1,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^2
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@ \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@ \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on reduction in previous iteration.
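	@
	@ In effect one 64-bit lane of each $D0-$D4 accumulator carries
	@ the even-numbered blocks and the other lane the odd-numbered
	@ ones; the two partial sums are only folded together by the
	@ horizontal addition at .Lshort_tail.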
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	@ d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	@ d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ accumulate inp[0:1]
	vmull.u32	$D2,$H2#hi,${R0}[1]
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,${R0}[1]
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,${R0}[1]
	vmlal.u32	$D2,$H1#hi,${R1}[1]
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,${R0}[1]

	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,${R0}[1]
	subs	$len,$len,#64
	vmlal.u32	$D0,$H4#hi,${S1}[1]
	it	lo
	movlo	$in2,$zeros
	vmlal.u32	$D3,$H2#hi,${R1}[1]
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D1,$H0#hi,${R1}[1]
	vmlal.u32	$D4,$H3#hi,${R1}[1]

	vmlal.u32	$D0,$H3#hi,${S2}[1]
	vmlal.u32	$D3,$H1#hi,${R2}[1]
	vmlal.u32	$D4,$H2#hi,${R2}[1]
	vmlal.u32	$D1,$H4#hi,${S2}[1]
	vmlal.u32	$D2,$H0#hi,${R2}[1]

	vmlal.u32	$D3,$H0#hi,${R3}[1]
	vmlal.u32	$D0,$H2#hi,${S3}[1]
	vmlal.u32	$D4,$H1#hi,${R3}[1]
	vmlal.u32	$D1,$H3#hi,${S3}[1]
	vmlal.u32	$D2,$H4#hi,${S3}[1]

	vmlal.u32	$D3,$H4#hi,${S4}[1]
	vmlal.u32	$D0,$H1#hi,${S4}[1]
	vmlal.u32	$D4,$H0#hi,${R4}[1]
	vmlal.u32	$D1,$H2#hi,${S4}[1]
	vmlal.u32	$D2,$H3#hi,${S4}[1]

	vld4.32	{$H0#hi,$H1#hi,$H2#hi,$H3#hi},[$in2]	@ inp[2:3] (or 0)
	add	$in2,$in2,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

	vmlal.u32	$D3,$H3#lo,${R0}[0]
	vmlal.u32	$D0,$H0#lo,${R0}[0]
	vmlal.u32	$D4,$H4#lo,${R0}[0]
	vmlal.u32	$D1,$H1#lo,${R0}[0]
	vmlal.u32	$D2,$H2#lo,${R0}[0]
	vld1.32	${S4}[0],[$tbl0,:32]

	vmlal.u32	$D3,$H2#lo,${R1}[0]
	vmlal.u32	$D0,$H4#lo,${S1}[0]
	vmlal.u32	$D4,$H3#lo,${R1}[0]
	vmlal.u32	$D1,$H0#lo,${R1}[0]
	vmlal.u32	$D2,$H1#lo,${R1}[0]

	vmlal.u32	$D3,$H1#lo,${R2}[0]
	vmlal.u32	$D0,$H3#lo,${S2}[0]
	vmlal.u32	$D4,$H2#lo,${R2}[0]
	vmlal.u32	$D1,$H4#lo,${S2}[0]
	vmlal.u32	$D2,$H0#lo,${R2}[0]

	vmlal.u32	$D3,$H0#lo,${R3}[0]
	vmlal.u32	$D0,$H2#lo,${S3}[0]
	vmlal.u32	$D4,$H1#lo,${R3}[0]
	vmlal.u32	$D1,$H3#lo,${S3}[0]
	vmlal.u32	$D3,$H4#lo,${S4}[0]

	vmlal.u32	$D2,$H4#lo,${S3}[0]
	vmlal.u32	$D0,$H1#lo,${S4}[0]
	vmlal.u32	$D4,$H0#lo,${R4}[0]
	vmov.i32	$H4,#1<<24		@ padbit, yes, always
	vmlal.u32	$D1,$H2#lo,${S4}[0]
	vmlal.u32	$D2,$H3#lo,${S4}[0]

	vld4.32	{$H0#lo,$H1#lo,$H2#lo,$H3#lo},[$inp]	@ inp[0:1]
	add	$inp,$inp,#64
# ifdef __ARMEB__
	vrev32.8	$H0,$H0
	vrev32.8	$H1,$H1
	vrev32.8	$H2,$H2
	vrev32.8	$H3,$H3
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.

	vshr.u64	$T0,$D3,#26
	vmovn.i64	$D3#lo,$D3
	vshr.u64	$T1,$D0,#26
	vmovn.i64	$D0#lo,$D0
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vbic.i32	$D3#lo,#0xfc000000
	vsri.u32	$H4,$H3,#8		@ base 2^32 -> base 2^26
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1
	vshl.u32	$H3,$H3,#18
	vbic.i32	$D0#lo,#0xfc000000

	vshrn.u64	$T0#lo,$D4,#26
	vmovn.i64	$D4#lo,$D4
	vshr.u64	$T1,$D1,#26
	vmovn.i64	$D1#lo,$D1
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2
	vsri.u32	$H3,$H2,#14
	vbic.i32	$D4#lo,#0xfc000000
	vshl.u32	$H2,$H2,#12
	vbic.i32	$D1#lo,#0xfc000000

	vadd.i32	$D0#lo,$D0#lo,$T0#lo
	vshl.u32	$T0#lo,$T0#lo,#2
	vbic.i32	$H3,#0xfc000000
	vshrn.u64	$T1#lo,$D2,#26
	vmovn.i64	$D2#lo,$D2
	vaddl.u32	$D0,$D0#lo,$T0#lo	@ h4 -> h0 [widen for a sec]
	vsri.u32	$H2,$H1,#20
	vadd.i32	$D3#lo,$D3#lo,$T1#lo	@ h2 -> h3
	vshl.u32	$H1,$H1,#6
	vbic.i32	$D2#lo,#0xfc000000
	vbic.i32	$H2,#0xfc000000

	vshrn.u64	$T0#lo,$D0,#26		@ re-narrow
	vmovn.i64	$D0#lo,$D0
	vsri.u32	$H1,$H0,#26
	vbic.i32	$H0,#0xfc000000
	vshr.u32	$T1#lo,$D3#lo,#26
	vbic.i32	$D3#lo,#0xfc000000
	vbic.i32	$D0#lo,#0xfc000000
	vadd.i32	$D1#lo,$D1#lo,$T0#lo	@ h0 -> h1
	vadd.i32	$D4#lo,$D4#lo,$T1#lo	@ h3 -> h4
	vbic.i32	$H1,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	$tbl1,$ctx,#(48+0*9*4)
	add	$tbl0,$ctx,#(48+1*9*4)
	adds	$len,$len,#32
	it	ne
	movne	$len,#0
	bne	.Long_tail

	vadd.i32	$H2#hi,$H2#lo,$D2#lo	@ add hash value and move to #hi
	vadd.i32	$H0#hi,$H0#lo,$D0#lo
	vadd.i32	$H3#hi,$H3#lo,$D3#lo
	vadd.i32	$H1#hi,$H1#lo,$D1#lo
	vadd.i32	$H4#hi,$H4#lo,$D4#lo

.Long_tail:
	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^1
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^2

	vadd.i32	$H2#lo,$H2#lo,$D2#lo	@ can be redundant
	vmull.u32	$D2,$H2#hi,$R0
	vadd.i32	$H0#lo,$H0#lo,$D0#lo
	vmull.u32	$D0,$H0#hi,$R0
	vadd.i32	$H3#lo,$H3#lo,$D3#lo
	vmull.u32	$D3,$H3#hi,$R0
	vadd.i32	$H1#lo,$H1#lo,$D1#lo
	vmull.u32	$D1,$H1#hi,$R0
	vadd.i32	$H4#lo,$H4#lo,$D4#lo
	vmull.u32	$D4,$H4#hi,$R0

	vmlal.u32	$D0,$H4#hi,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#hi,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#hi,$R1
	vmlal.u32	$D4,$H3#hi,$R1
	vmlal.u32	$D2,$H1#hi,$R1

	vmlal.u32	$D3,$H1#hi,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#hi,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#hi,$R2
	vmlal.u32	$D1,$H4#hi,$S2
	vmlal.u32	$D2,$H0#hi,$R2

	vmlal.u32	$D3,$H0#hi,$R3
	it	ne
	addne	$tbl1,$ctx,#(48+2*9*4)
	vmlal.u32	$D0,$H2#hi,$S3
	it	ne
	addne	$tbl0,$ctx,#(48+3*9*4)
	vmlal.u32	$D4,$H1#hi,$R3
	vmlal.u32	$D1,$H3#hi,$S3
	vmlal.u32	$D2,$H4#hi,$S3

	vmlal.u32	$D3,$H4#hi,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones, can be redundant
	vmlal.u32	$D0,$H1#hi,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#hi,$R4
	vmlal.u32	$D1,$H2#hi,$S4
	vmlal.u32	$D2,$H3#hi,$S4

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{${R0}[1],${R1}[1],${S1}[1],${R2}[1]},[$tbl1]!	@ load r^3
	vld4.32	{${R0}[0],${R1}[0],${S1}[0],${R2}[0]},[$tbl0]!	@ load r^4

	vmlal.u32	$D2,$H2#lo,$R0
	vmlal.u32	$D0,$H0#lo,$R0
	vmlal.u32	$D3,$H3#lo,$R0
	vmlal.u32	$D1,$H1#lo,$R0
	vmlal.u32	$D4,$H4#lo,$R0

	vmlal.u32	$D0,$H4#lo,$S1
	vld4.32	{${S2}[1],${R3}[1],${S3}[1],${R4}[1]},[$tbl1]!
	vmlal.u32	$D3,$H2#lo,$R1
	vld4.32	{${S2}[0],${R3}[0],${S3}[0],${R4}[0]},[$tbl0]!
	vmlal.u32	$D1,$H0#lo,$R1
	vmlal.u32	$D4,$H3#lo,$R1
	vmlal.u32	$D2,$H1#lo,$R1

	vmlal.u32	$D3,$H1#lo,$R2
	vld1.32	${S4}[1],[$tbl1,:32]
	vmlal.u32	$D0,$H3#lo,$S2
	vld1.32	${S4}[0],[$tbl0,:32]
	vmlal.u32	$D4,$H2#lo,$R2
	vmlal.u32	$D1,$H4#lo,$S2
	vmlal.u32	$D2,$H0#lo,$R2

	vmlal.u32	$D3,$H0#lo,$R3
	vmlal.u32	$D0,$H2#lo,$S3
	vmlal.u32	$D4,$H1#lo,$R3
	vmlal.u32	$D1,$H3#lo,$S3
	vmlal.u32	$D2,$H4#lo,$S3

	vmlal.u32	$D3,$H4#lo,$S4
	vorn	$MASK,$MASK,$MASK	@ all-ones
	vmlal.u32	$D0,$H1#lo,$S4
	vshr.u64	$MASK,$MASK,#38
	vmlal.u32	$D4,$H0#lo,$R4
	vmlal.u32	$D1,$H2#lo,$S4
	vmlal.u32	$D2,$H3#lo,$S4

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	$D3#lo,$D3#lo,$D3#hi
	vadd.i64	$D0#lo,$D0#lo,$D0#hi
	vadd.i64	$D4#lo,$D4#lo,$D4#hi
	vadd.i64	$D1#lo,$D1#lo,$D1#hi
	vadd.i64	$D2#lo,$D2#lo,$D2#hi

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	$T0,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vshr.u64	$T1,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vadd.i64	$D4,$D4,$T0		@ h3 -> h4
	vadd.i64	$D1,$D1,$T1		@ h0 -> h1

	vshr.u64	$T0,$D4,#26
	vand.i64	$D4,$D4,$MASK
	vshr.u64	$T1,$D1,#26
	vand.i64	$D1,$D1,$MASK
	vadd.i64	$D2,$D2,$T1		@ h1 -> h2

	vadd.i64	$D0,$D0,$T0
	vshl.u64	$T0,$T0,#2
	vshr.u64	$T1,$D2,#26
	vand.i64	$D2,$D2,$MASK
	vadd.i64	$D0,$D0,$T0		@ h4 -> h0
	vadd.i64	$D3,$D3,$T1		@ h2 -> h3

	vshr.u64	$T0,$D0,#26
	vand.i64	$D0,$D0,$MASK
	vshr.u64	$T1,$D3,#26
	vand.i64	$D3,$D3,$MASK
	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
	vadd.i64	$D4,$D4,$T1		@ h3 -> h4

	cmp	$len,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{$D0#lo[0],$D1#lo[0],$D2#lo[0],$D3#lo[0]},[$ctx]!
	vst1.32	{$D4#lo[0]},[$ctx]

	vldmia	sp!,{d8-d15}		@ epilogue
	ldmia	sp!,{r4-r7}
	ret				@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
#ifndef __KERNEL__
.LOPENSSL_armcap:
# ifdef _WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
.comm	OPENSSL_armcap_P,4,4
.hidden	OPENSSL_armcap_P
#endif
#endif
___
}	}
$code.=<<___;
.asciz	"Poly1305 for ARMv4/NEON, CRYPTOGAMS by \@dot-asm"
.align	2
___

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
	s/\bret\b/bx	lr/go						or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT; # enforce flush
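
# Typical invocation (an illustrative sketch; the flavour and file names
# below are examples, not mandated by this script):
#
#	perl poly1305-armv4.pl linux32 poly1305-core.S
#
# A first argument that looks like a file name is taken as the output and
# the code is written to it directly; otherwise the first argument is
# passed to arm-xlate.pl as the translation flavour ("void" also skips
# translation) and the following file-name argument names the output.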