/* Do not modify. This file is auto-generated from poly1305-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.globl	poly1305_emit
.globl	poly1305_blocks
.globl	poly1305_init
.type	poly1305_init,%function
.align	5
poly1305_init:
.Lpoly1305_init:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	eor	r3,r3,r3
	cmp	r1,#0
	str	r3,[r0,#0]		@ zero hash value
	str	r3,[r0,#4]
	str	r3,[r0,#8]
	str	r3,[r0,#12]
	str	r3,[r0,#16]
	str	r3,[r0,#36]		@ is_base2_26
	add	r0,r0,#20

#ifdef	__thumb2__
	it	eq
#endif
	moveq	r0,#0
	beq	.Lno_key

#if	__ARM_MAX_ARCH__>=7
	adr	r11,.Lpoly1305_init
	ldr	r12,.LOPENSSL_armcap
#endif
	ldrb	r4,[r1,#0]
	mov	r10,#0x0fffffff
	ldrb	r5,[r1,#1]
	and	r3,r10,#-4		@ 0x0ffffffc
	ldrb	r6,[r1,#2]
	ldrb	r7,[r1,#3]
	orr	r4,r4,r5,lsl#8
	ldrb	r5,[r1,#4]
	orr	r4,r4,r6,lsl#16
	ldrb	r6,[r1,#5]
	orr	r4,r4,r7,lsl#24
	ldrb	r7,[r1,#6]
	and	r4,r4,r10

#if	__ARM_MAX_ARCH__>=7
# if !defined(_WIN32)
	ldr	r12,[r11,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
#endif
	ldrb	r8,[r1,#7]
	orr	r5,r5,r6,lsl#8
	ldrb	r6,[r1,#8]
	orr	r5,r5,r7,lsl#16
	ldrb	r7,[r1,#9]
	orr	r5,r5,r8,lsl#24
	ldrb	r8,[r1,#10]
	and	r5,r5,r3

#if	__ARM_MAX_ARCH__>=7
	tst	r12,#ARMV7_NEON		@ check for NEON
# ifdef	__thumb2__
	adr	r9,.Lpoly1305_blocks_neon
	adr	r11,.Lpoly1305_blocks
	adr	r12,.Lpoly1305_emit
	adr	r10,.Lpoly1305_emit_neon
	itt	ne
	movne	r11,r9
	movne	r12,r10
	orr	r11,r11,#1	@ thumb-ify address
	orr	r12,r12,#1
# else
	addeq	r12,r11,#(.Lpoly1305_emit-.Lpoly1305_init)
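	@ eq (no NEON): the function table gets the scalar blocks/emit
	@ entry points; ne: their _neon counterparts.  All four addresses
	@ are formed PC-relative off .Lpoly1305_init, so the code stays
	@ position-independent.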
	addne	r12,r11,#(.Lpoly1305_emit_neon-.Lpoly1305_init)
	addeq	r11,r11,#(.Lpoly1305_blocks-.Lpoly1305_init)
	addne	r11,r11,#(.Lpoly1305_blocks_neon-.Lpoly1305_init)
# endif
#endif
	ldrb	r9,[r1,#11]
	orr	r6,r6,r7,lsl#8
	ldrb	r7,[r1,#12]
	orr	r6,r6,r8,lsl#16
	ldrb	r8,[r1,#13]
	orr	r6,r6,r9,lsl#24
	ldrb	r9,[r1,#14]
	and	r6,r6,r3

	ldrb	r10,[r1,#15]
	orr	r7,r7,r8,lsl#8
	str	r4,[r0,#0]
	orr	r7,r7,r9,lsl#16
	str	r5,[r0,#4]
	orr	r7,r7,r10,lsl#24
	str	r6,[r0,#8]
	and	r7,r7,r3
	str	r7,[r0,#12]
#if	__ARM_MAX_ARCH__>=7
	stmia	r2,{r11,r12}		@ fill functions table
	mov	r0,#1
#else
	mov	r0,#0
#endif
.Lno_key:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_init,.-poly1305_init
.type	poly1305_blocks,%function
.align	5
poly1305_blocks:
.Lpoly1305_blocks:
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}

	ands	r2,r2,#-16
	beq	.Lno_data

	cmp	r3,#0
	add	r2,r2,r1		@ end pointer
	sub	sp,sp,#32

	ldmia	r0,{r4,r5,r6,r7,r8,r9,r10,r11,r12}	@ load context

	str	r0,[sp,#12]		@ offload stuff
	mov	lr,r1
	str	r2,[sp,#16]
	str	r10,[sp,#20]
	str	r11,[sp,#24]
	str	r12,[sp,#28]
	b	.Loop

.Loop:
#if __ARM_ARCH__<7
	ldrb	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ 1<<128
	ldrb	r1,[lr,#-15]
	ldrb	r2,[lr,#-14]
	ldrb	r3,[lr,#-13]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-12]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-11]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-10]
	adds	r4,r4,r3		@ accumulate input

	ldrb	r3,[lr,#-9]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-8]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-7]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-6]
	adcs	r5,r5,r3

	ldrb	r3,[lr,#-5]
	orr	r1,r0,r1,lsl#8
	ldrb	r0,[lr,#-4]
	orr	r2,r1,r2,lsl#16
	ldrb	r1,[lr,#-3]
	orr	r3,r2,r3,lsl#24
	ldrb	r2,[lr,#-2]
	adcs	r6,r6,r3

	ldrb	r3,[lr,#-1]
	orr	r1,r0,r1,lsl#8
	str	lr,[sp,#8]		@ offload input pointer
	orr	r2,r1,r2,lsl#16
	add	r10,r10,r10,lsr#2
	orr	r3,r2,r3,lsl#24
#else
	ldr	r0,[lr],#16		@ load input
# ifdef	__thumb2__
	it	hi
# endif
	addhi	r8,r8,#1		@ padbit
	ldr	r1,[lr,#-12]
	ldr	r2,[lr,#-8]
	ldr	r3,[lr,#-4]
# ifdef	__ARMEB__
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
	adds	r4,r4,r0		@ accumulate input
	str	lr,[sp,#8]		@ offload input pointer
	adcs	r5,r5,r1
	add	r10,r10,r10,lsr#2
	adcs	r6,r6,r2
#endif
	add	r11,r11,r11,lsr#2
	adcs	r7,r7,r3
	add	r12,r12,r12,lsr#2

	umull	r2,r3,r5,r9
	adc	r8,r8,#0
	umull	r0,r1,r4,r9
	umlal	r2,r3,r8,r10
	umlal	r0,r1,r7,r10
	ldr	r10,[sp,#20]		@ reload r10
	umlal	r2,r3,r6,r12
	umlal	r0,r1,r5,r12
	umlal	r2,r3,r7,r11
	umlal	r0,r1,r6,r11
	umlal	r2,r3,r4,r10
	str	r0,[sp,#0]		@ future r4
	mul	r0,r11,r8
	ldr	r11,[sp,#24]		@ reload r11
	adds	r2,r2,r1		@ d1+=d0>>32
	eor	r1,r1,r1
	adc	lr,r3,#0		@ future r6
	str	r2,[sp,#4]		@ future r5

	mul	r2,r12,r8
	eor	r3,r3,r3
	umlal	r0,r1,r7,r12
	ldr	r12,[sp,#28]		@ reload r12
	umlal	r2,r3,r7,r9
	umlal	r0,r1,r6,r9
	umlal	r2,r3,r6,r10
	umlal	r0,r1,r5,r10
	umlal	r2,r3,r5,r11
	umlal	r0,r1,r4,r11
	umlal	r2,r3,r4,r12
	ldr	r4,[sp,#0]
	mul	r8,r9,r8
	ldr	r5,[sp,#4]

	adds	r6,lr,r0		@ d2+=d1>>32
	ldr	lr,[sp,#8]		@ reload input pointer
	adc	r1,r1,#0
	adds	r7,r2,r1		@ d3+=d2>>32
	ldr	r0,[sp,#16]		@ reload end pointer
	adc	r3,r3,#0
	add	r8,r8,r3		@ h4+=d3>>32

	and	r1,r8,#-4
	and	r8,r8,#3
	add	r1,r1,r1,lsr#2		@ *=5
	adds	r4,r4,r1
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adcs	r7,r7,#0
	adc	r8,r8,#0

	cmp	r0,lr			@ done yet?
	bhi	.Loop

	ldr	r0,[sp,#12]
	add	sp,sp,#32
	stmia	r0,{r4,r5,r6,r7,r8}	@ store the result

.Lno_data:
#if	__ARM_ARCH__>=5
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_blocks,.-poly1305_blocks
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
.Lpoly1305_emit_enter:

	ldmia	r0,{r3,r4,r5,r6,r7}
	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
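	@ h is only partially reduced mod 2^130-5 up to this point, so
	@ g = h+5 is computed as well: if g overflows into bit 130 (bit 2
	@ of the top word, hence the tst #4 above), then h >= 2^130-5 and
	@ the conditional moves below pick g, i.e. h-(2^130-5) mod 2^128.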

#ifdef	__thumb2__
	it	ne
#endif
	movne	r3,r8
	ldr	r8,[r2,#0]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r4,r9
	ldr	r9,[r2,#4]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r5,r10
	ldr	r10,[r2,#8]
#ifdef	__thumb2__
	it	ne
#endif
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

#if __ARM_ARCH__>=7
# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]
#else
	strb	r3,[r1,#0]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#4]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#8]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#12]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#1]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#5]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#9]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#13]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#2]
	mov	r3,r3,lsr#8
	strb	r4,[r1,#6]
	mov	r4,r4,lsr#8
	strb	r5,[r1,#10]
	mov	r5,r5,lsr#8
	strb	r6,[r1,#14]
	mov	r6,r6,lsr#8

	strb	r3,[r1,#3]
	strb	r4,[r1,#7]
	strb	r5,[r1,#11]
	strb	r6,[r1,#15]
#endif
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
#if	__ARM_ARCH__>=5
	bx	lr			@ bx	lr
#else
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	poly1305_emit,.-poly1305_emit
#if	__ARM_MAX_ARCH__>=7
.fpu	neon

.type	poly1305_init_neon,%function
.align	5
poly1305_init_neon:
	ldr	r4,[r0,#20]		@ load key base 2^32
	ldr	r5,[r0,#24]
	ldr	r6,[r0,#28]
	ldr	r7,[r0,#32]
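	@ Below, the clamped 128-bit r is re-split from four 32-bit words
	@ into five 26-bit limbs (base 2^26).  26*5 = 130 bits, and the
	@ 6 spare bits per 32-bit lane are what lets the NEON code defer
	@ carry propagation until after a whole multiplication.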

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	and	r3,r3,#0x03ffffff
	and	r4,r4,#0x03ffffff
	and	r5,r5,#0x03ffffff

	vdup.32	d0,r2			@ r^1 in both lanes
	add	r2,r3,r3,lsl#2		@ *5
	vdup.32	d1,r3
	add	r3,r4,r4,lsl#2
	vdup.32	d2,r2
	vdup.32	d3,r4
	add	r4,r5,r5,lsl#2
	vdup.32	d4,r3
	vdup.32	d5,r5
	add	r5,r6,r6,lsl#2
	vdup.32	d6,r4
	vdup.32	d7,r6
	vdup.32	d8,r5

	mov	r5,#2			@ counter

.Lsquare_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4

	vmull.u32	q5,d0,d0[1]
	vmull.u32	q6,d1,d0[1]
	vmull.u32	q7,d3,d0[1]
	vmull.u32	q8,d5,d0[1]
	vmull.u32	q9,d7,d0[1]

	vmlal.u32	q5,d7,d2[1]
	vmlal.u32	q6,d0,d1[1]
	vmlal.u32	q7,d1,d1[1]
	vmlal.u32	q8,d3,d1[1]
	vmlal.u32	q9,d5,d1[1]

	vmlal.u32	q5,d5,d4[1]
	vmlal.u32	q6,d7,d4[1]
	vmlal.u32	q8,d1,d3[1]
	vmlal.u32	q7,d0,d3[1]
	vmlal.u32	q9,d3,d3[1]

	vmlal.u32	q5,d3,d6[1]
	vmlal.u32	q8,d0,d5[1]
	vmlal.u32	q6,d5,d6[1]
	vmlal.u32	q7,d7,d6[1]
	vmlal.u32	q9,d1,d5[1]

	vmlal.u32	q8,d7,d8[1]
	vmlal.u32	q5,d1,d8[1]
	vmlal.u32	q6,d3,d8[1]
	vmlal.u32	q7,d5,d8[1]
	vmlal.u32	q9,d0,d7[1]

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
	@ and P. Schwabe
	@
	@ H0>>+H1>>+H2>>+H3>>+H4
	@ H3>>+H4>>*5+H0>>+H1
	@
	@ Trivia.
	@
	@ The result of multiplying an n-bit number by an m-bit number
	@ is n+m bits wide. However! Even though 2^n is an (n+1)-bit
	@ number, an m-bit number multiplied by 2^n is still n+m bits wide.
	@
	@ The sum of two n-bit numbers is n+1 bits wide, a sum of three
	@ is n+2, and so is a sum of four. A sum of 2^m (n-m)-bit numbers
	@ and one n-bit number is n+1 bits wide.
	@
	@ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
	@ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
	@ can be 27. However! In cases when their width exceeds 26 bits
	@ they are bounded by 2^26+2^6. This in turn means that the *sum*
	@ of the products with these values can still be viewed as a sum
	@ of 52-bit numbers as long as the number of addends is not a
	@ power of 2. For example,
	@
	@ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
	@
	@ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
	@ 5 * (2^52 + 2^33 + 2^12), which in turn is smaller than
	@ 8 * (2^52) or 2^55. However, the value is then multiplied
	@ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
	@ which is less than 32 * (2^52) or 2^57. And when processing
	@ data we are looking at three times as many addends...
	@
	@ In the key setup procedure pre-reduced H0 is limited by 5*4+1
	@ 52-bit addends, and 5*H4 by 5*5, or 57 bits. But when hashing
	@ input, H0 is limited by (5*4+1)*3 addends, or 58 bits, while
	@ 5*H4 is limited by 5*5*3, or 59[!] bits. How is this relevant?
	@ The vmlal.u32 instruction accepts 2x32-bit input and writes a
	@ 2x64-bit result. This means that the result of the reduction
	@ has to be compressed upon loop wrap-around. This can be done in
	@ the process of reduction to minimize the number of instructions
	@ [as well as the number of 128-bit instructions, which benefits
	@ low-end processors], but one has to watch for H2 (which is
	@ narrower than H0) and 5*H4 not being wider than 58 bits, so
	@ that the result of the right shift by 26 bits fits in 32 bits.
	@ This is also useful on x86, because it allows the use of paddd
	@ in place of paddq, which benefits Atom, where paddq is
	@ ridiculously slow.
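	@ In the carry chain below each Hn is split by vshr/vshrn into
	@ Hn>>26 and its low 26 bits (vmovn + vbic), the top part is
	@ added into the next limb, and the carry out of H4 goes back
	@ into H0 multiplied by 5 (shift-left-2 plus add), because 2^130
	@ is congruent to 5 mod 2^130-5.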

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000		@ &=0x03ffffff
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vbic.i32	d18,#0xfc000000
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vadd.i32	d10,d10,d30		@ h4 -> h0
	vadd.i32	d16,d16,d8		@ h2 -> h3
	vbic.i32	d14,#0xfc000000

	vshr.u32	d30,d10,#26
	vbic.i32	d10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vadd.i32	d12,d12,d30		@ h0 -> h1
	vadd.i32	d18,d18,d8		@ h3 -> h4

	subs	r5,r5,#1
	beq	.Lsquare_break_neon

	add	r6,r0,#(48+0*9*4)
	add	r7,r0,#(48+1*9*4)

	vtrn.32	d0,d10			@ r^2:r^1
	vtrn.32	d3,d14
	vtrn.32	d5,d16
	vtrn.32	d1,d12
	vtrn.32	d7,d18

	vshl.u32	d4,d3,#2		@ *5
	vshl.u32	d6,d5,#2
	vshl.u32	d2,d1,#2
	vshl.u32	d8,d7,#2
	vadd.i32	d4,d4,d3
	vadd.i32	d2,d2,d1
	vadd.i32	d6,d6,d5
	vadd.i32	d8,d8,d7

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
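	@ Each row of the power table at ctx+48 is nine words,
	@ r0,r1,5*r1,r2,5*r2,r3,5*r3,r4,5*r4, stored lane-interleaved so
	@ that vld4.32 can refill d0-d8 directly; hence the (48+n*9*4)
	@ offsets used throughout.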
	vst1.32	{d8[0]},[r6,:32]
	vst1.32	{d8[1]},[r7,:32]

	b	.Lsquare_neon

.align	4
.Lsquare_break_neon:
	add	r6,r0,#(48+2*4*9)
	add	r7,r0,#(48+3*4*9)

	vmov	d0,d10			@ r^4:r^3
	vshl.u32	d2,d12,#2	@ *5
	vmov	d1,d12
	vshl.u32	d4,d14,#2
	vmov	d3,d14
	vshl.u32	d6,d16,#2
	vmov	d5,d16
	vshl.u32	d8,d18,#2
	vmov	d7,d18
	vadd.i32	d2,d2,d12
	vadd.i32	d4,d4,d14
	vadd.i32	d6,d6,d16
	vadd.i32	d8,d8,d18

	vst4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!
	vst4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!
	vst4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vst4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vst1.32	{d8[0]},[r6]
	vst1.32	{d8[1]},[r7]

	bx	lr			@ bx	lr
.size	poly1305_init_neon,.-poly1305_init_neon

.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	ip,[r0,#36]		@ is_base2_26
	ands	r2,r2,#-16
	beq	.Lno_data_neon

	cmp	r2,#64
	bhs	.Lenter_neon
	tst	ip,ip			@ is_base2_26?
	beq	.Lpoly1305_blocks

.Lenter_neon:
	stmdb	sp!,{r4,r5,r6,r7}
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI specification says so

	tst	ip,ip			@ is_base2_26?
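	@ is_base2_26 tracks which radix the stored hash is in: the
	@ scalar code keeps h in base 2^32, the NEON code in base 2^26.
	@ On the first NEON call the flag is clear, so the hash (and the
	@ power table, via poly1305_init_neon) is converted first.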
	bne	.Lbase2_26_neon

	stmdb	sp!,{r1,r2,r3,lr}
	bl	poly1305_init_neon

	ldr	r4,[r0,#0]		@ load hash value base 2^32
	ldr	r5,[r0,#4]
	ldr	r6,[r0,#8]
	ldr	r7,[r0,#12]
	ldr	ip,[r0,#16]

	and	r2,r4,#0x03ffffff	@ base 2^32 -> base 2^26
	mov	r3,r4,lsr#26
	veor	d10,d10,d10
	mov	r4,r5,lsr#20
	orr	r3,r3,r5,lsl#6
	veor	d12,d12,d12
	mov	r5,r6,lsr#14
	orr	r4,r4,r6,lsl#12
	veor	d14,d14,d14
	mov	r6,r7,lsr#8
	orr	r5,r5,r7,lsl#18
	veor	d16,d16,d16
	and	r3,r3,#0x03ffffff
	orr	r6,r6,ip,lsl#24
	veor	d18,d18,d18
	and	r4,r4,#0x03ffffff
	mov	r1,#1
	and	r5,r5,#0x03ffffff
	str	r1,[r0,#36]		@ is_base2_26

	vmov.32	d10[0],r2
	vmov.32	d12[0],r3
	vmov.32	d14[0],r4
	vmov.32	d16[0],r5
	vmov.32	d18[0],r6
	adr	r5,.Lzeros

	ldmia	sp!,{r1,r2,r3,lr}
	b	.Lbase2_32_neon

.align	4
.Lbase2_26_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ load hash value

	veor	d10,d10,d10
	veor	d12,d12,d12
	veor	d14,d14,d14
	veor	d16,d16,d16
	veor	d18,d18,d18
	vld4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	adr	r5,.Lzeros
	vld1.32	{d18[0]},[r0]
	sub	r0,r0,#16		@ rewind

.Lbase2_32_neon:
	add	r4,r1,#32
	mov	r3,r3,lsl#24
	tst	r2,#31
	beq	.Leven

	vld4.32	{d20[0],d22[0],d24[0],d26[0]},[r1]!
	vmov.32	d28[0],r3
	sub	r2,r2,#16
	add	r4,r1,#32

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	d28,d26,#8	@ base 2^32 -> base 2^26
	vshl.u32	d26,d26,#18

	vsri.u32	d26,d24,#14
	vshl.u32	d24,d24,#12
	vadd.i32	d29,d28,d18	@ add hash value and move to #hi

	vbic.i32	d26,#0xfc000000
	vsri.u32	d24,d22,#20
	vshl.u32	d22,d22,#6

	vbic.i32	d24,#0xfc000000
	vsri.u32	d22,d20,#26
	vadd.i32	d27,d26,d16

	vbic.i32	d20,#0xfc000000
	vbic.i32	d22,#0xfc000000
	vadd.i32	d25,d24,d14

	vadd.i32	d21,d20,d10
	vadd.i32	d23,d22,d12

	mov	r7,r5
	add	r6,r0,#48

	cmp	r2,r2
	b	.Long_tail

.align	4
.Leven:
	subs	r2,r2,#64
	it	lo
	movlo	r4,r5

	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64
	itt	hi
	addhi	r7,r0,#(48+1*9*4)
	addhi	r6,r0,#(48+3*9*4)

# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q13,q13
	vrev32.8	q11,q11
	vrev32.8	q12,q12
# endif
	vsri.u32	q14,q13,#8	@ base 2^32 -> base 2^26
	vshl.u32	q13,q13,#18

	vsri.u32	q13,q12,#14
	vshl.u32	q12,q12,#12

	vbic.i32	q13,#0xfc000000
	vsri.u32	q12,q11,#20
	vshl.u32	q11,q11,#6

	vbic.i32	q12,#0xfc000000
	vsri.u32	q11,q10,#26

	vbic.i32	q10,#0xfc000000
	vbic.i32	q11,#0xfc000000

	bls	.Lskip_loop

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^2
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
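	@ Lane 1 of d0-d8 now holds r^2 (with its 5*multiples), lane 0
	@ holds r^4, so every vector multiply in the loop advances the
	@ even and the odd block stream at once; the loop consumes 64
	@ bytes of input per iteration.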
	b	.Loop_neon

.align	5
.Loop_neon:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
	@   \___________________/
	@ ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
	@ ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
	@   \___________________/ \____________________/
	@
	@ Note that we start with inp[2:3]*r^2. This is because it
	@ doesn't depend on the reduction from the previous iteration.
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	@ d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	@ d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	@ d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	@ d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ inp[2:3]*r^2

	vadd.i32	d24,d24,d14	@ accumulate inp[0:1]
	vmull.u32	q7,d25,d0[1]
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0[1]
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0[1]
	vmlal.u32	q7,d23,d1[1]
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0[1]

	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0[1]
	subs	r2,r2,#64
	vmlal.u32	q5,d29,d2[1]
	it	lo
	movlo	r4,r5
	vmlal.u32	q8,d25,d1[1]
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q6,d21,d1[1]
	vmlal.u32	q9,d27,d1[1]

	vmlal.u32	q5,d27,d4[1]
	vmlal.u32	q8,d23,d3[1]
	vmlal.u32	q9,d25,d3[1]
	vmlal.u32	q6,d29,d4[1]
	vmlal.u32	q7,d21,d3[1]

	vmlal.u32	q8,d21,d5[1]
	vmlal.u32	q5,d25,d6[1]
	vmlal.u32	q9,d23,d5[1]
	vmlal.u32	q6,d27,d6[1]
	vmlal.u32	q7,d29,d6[1]

	vmlal.u32	q8,d29,d8[1]
	vmlal.u32	q5,d23,d8[1]
	vmlal.u32	q9,d21,d7[1]
	vmlal.u32	q6,d25,d8[1]
	vmlal.u32	q7,d27,d8[1]

	vld4.32	{d21,d23,d25,d27},[r4]	@ inp[2:3] (or 0)
	add	r4,r4,#64

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4 and accumulate

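	@ Lane 0 of d0-d8 holds r^4, so the vmlal's below fold the
	@ running hash (added into inp[0:1] at the top of the loop) four
	@ blocks forward, accumulating into the same q5-q9 products as
	@ the inp[2:3]*r^2 terms above.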
	vmlal.u32	q8,d26,d0[0]
	vmlal.u32	q5,d20,d0[0]
	vmlal.u32	q9,d28,d0[0]
	vmlal.u32	q6,d22,d0[0]
	vmlal.u32	q7,d24,d0[0]
	vld1.32	d8[0],[r6,:32]

	vmlal.u32	q8,d24,d1[0]
	vmlal.u32	q5,d28,d2[0]
	vmlal.u32	q9,d26,d1[0]
	vmlal.u32	q6,d20,d1[0]
	vmlal.u32	q7,d22,d1[0]

	vmlal.u32	q8,d22,d3[0]
	vmlal.u32	q5,d26,d4[0]
	vmlal.u32	q9,d24,d3[0]
	vmlal.u32	q6,d28,d4[0]
	vmlal.u32	q7,d20,d3[0]

	vmlal.u32	q8,d20,d5[0]
	vmlal.u32	q5,d24,d6[0]
	vmlal.u32	q9,d22,d5[0]
	vmlal.u32	q6,d26,d6[0]
	vmlal.u32	q8,d28,d8[0]

	vmlal.u32	q7,d28,d6[0]
	vmlal.u32	q5,d22,d8[0]
	vmlal.u32	q9,d20,d7[0]
	vmov.i32	q14,#1<<24	@ padbit, yes, always
	vmlal.u32	q6,d24,d8[0]
	vmlal.u32	q7,d26,d8[0]

	vld4.32	{d20,d22,d24,d26},[r1]	@ inp[0:1]
	add	r1,r1,#64
# ifdef	__ARMEB__
	vrev32.8	q10,q10
	vrev32.8	q11,q11
	vrev32.8	q12,q12
	vrev32.8	q13,q13
# endif

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction interleaved with base 2^32 -> base 2^26 of
	@ inp[0:3] previously loaded to q10-q13 and smashed to q10-q14.
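	@ q14 starts out as the 1<<24 padbit vector; vsri shifts the top
	@ bits of each 32-bit input word into it, and the vsri/vshl/vbic
	@ pairs shear inp[0:3] into 26-bit limbs, interleaved with the
	@ carry chain purely to hide instruction latencies.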

	vshr.u64	q15,q8,#26
	vmovn.i64	d16,q8
	vshr.u64	q4,q5,#26
	vmovn.i64	d10,q5
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vbic.i32	d16,#0xfc000000
	vsri.u32	q14,q13,#8		@ base 2^32 -> base 2^26
	vadd.i64	q6,q6,q4		@ h0 -> h1
	vshl.u32	q13,q13,#18
	vbic.i32	d10,#0xfc000000

	vshrn.u64	d30,q9,#26
	vmovn.i64	d18,q9
	vshr.u64	q4,q6,#26
	vmovn.i64	d12,q6
	vadd.i64	q7,q7,q4		@ h1 -> h2
	vsri.u32	q13,q12,#14
	vbic.i32	d18,#0xfc000000
	vshl.u32	q12,q12,#12
	vbic.i32	d12,#0xfc000000

	vadd.i32	d10,d10,d30
	vshl.u32	d30,d30,#2
	vbic.i32	q13,#0xfc000000
	vshrn.u64	d8,q7,#26
	vmovn.i64	d14,q7
	vaddl.u32	q5,d10,d30		@ h4 -> h0 [widen for a sec]
	vsri.u32	q12,q11,#20
	vadd.i32	d16,d16,d8		@ h2 -> h3
	vshl.u32	q11,q11,#6
	vbic.i32	d14,#0xfc000000
	vbic.i32	q12,#0xfc000000

	vshrn.u64	d30,q5,#26		@ re-narrow
	vmovn.i64	d10,q5
	vsri.u32	q11,q10,#26
	vbic.i32	q10,#0xfc000000
	vshr.u32	d8,d16,#26
	vbic.i32	d16,#0xfc000000
	vbic.i32	d10,#0xfc000000
	vadd.i32	d12,d12,d30		@ h0 -> h1
	vadd.i32	d18,d18,d8		@ h3 -> h4
	vbic.i32	q11,#0xfc000000

	bhi	.Loop_neon

.Lskip_loop:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1

	add	r7,r0,#(48+0*9*4)
	add	r6,r0,#(48+1*9*4)
	adds	r2,r2,#32
	it	ne
	movne	r2,#0
	bne	.Long_tail

	vadd.i32	d25,d24,d14	@ add hash value and move to #hi
	vadd.i32	d21,d20,d10
	vadd.i32	d27,d26,d16
	vadd.i32	d23,d22,d12
	vadd.i32	d29,d28,d18

.Long_tail:
	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^1
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^2

	vadd.i32	d24,d24,d14	@ can be redundant
	vmull.u32	q7,d25,d0
	vadd.i32	d20,d20,d10
	vmull.u32	q5,d21,d0
	vadd.i32	d26,d26,d16
	vmull.u32	q8,d27,d0
	vadd.i32	d22,d22,d12
	vmull.u32	q6,d23,d0
	vadd.i32	d28,d28,d18
	vmull.u32	q9,d29,d0

	vmlal.u32	q5,d29,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d25,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d21,d1
	vmlal.u32	q9,d27,d1
	vmlal.u32	q7,d23,d1

	vmlal.u32	q8,d23,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d27,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d25,d3
	vmlal.u32	q6,d29,d4
	vmlal.u32	q7,d21,d3

	vmlal.u32	q8,d21,d5
	it	ne
	addne	r7,r0,#(48+2*9*4)
	vmlal.u32	q5,d25,d6
	it	ne
	addne	r6,r0,#(48+3*9*4)
	vmlal.u32	q9,d23,d5
	vmlal.u32	q6,d27,d6
	vmlal.u32	q7,d29,d6

	vmlal.u32	q8,d29,d8
	vorn	q0,q0,q0		@ all-ones, can be redundant
	vmlal.u32	q5,d23,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d21,d7
	vmlal.u32	q6,d25,d8
	vmlal.u32	q7,d27,d8

	beq	.Lshort_tail

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ (hash+inp[0:1])*r^4:r^3 and accumulate

	vld4.32	{d0[1],d1[1],d2[1],d3[1]},[r7]!	@ load r^3
	vld4.32	{d0[0],d1[0],d2[0],d3[0]},[r6]!	@ load r^4

	vmlal.u32	q7,d24,d0
	vmlal.u32	q5,d20,d0
	vmlal.u32	q8,d26,d0
	vmlal.u32	q6,d22,d0
	vmlal.u32	q9,d28,d0

	vmlal.u32	q5,d28,d2
	vld4.32	{d4[1],d5[1],d6[1],d7[1]},[r7]!
	vmlal.u32	q8,d24,d1
	vld4.32	{d4[0],d5[0],d6[0],d7[0]},[r6]!
	vmlal.u32	q6,d20,d1
	vmlal.u32	q9,d26,d1
	vmlal.u32	q7,d22,d1

	vmlal.u32	q8,d22,d3
	vld1.32	d8[1],[r7,:32]
	vmlal.u32	q5,d26,d4
	vld1.32	d8[0],[r6,:32]
	vmlal.u32	q9,d24,d3
	vmlal.u32	q6,d28,d4
	vmlal.u32	q7,d20,d3

	vmlal.u32	q8,d20,d5
	vmlal.u32	q5,d24,d6
	vmlal.u32	q9,d22,d5
	vmlal.u32	q6,d26,d6
	vmlal.u32	q7,d28,d6

	vmlal.u32	q8,d28,d8
	vorn	q0,q0,q0		@ all-ones
	vmlal.u32	q5,d22,d8
	vshr.u64	q0,q0,#38
	vmlal.u32	q9,d20,d7
	vmlal.u32	q6,d24,d8
	vmlal.u32	q7,d26,d8

.Lshort_tail:
	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ horizontal addition

	vadd.i64	d16,d16,d17
	vadd.i64	d10,d10,d11
	vadd.i64	d18,d18,d19
	vadd.i64	d12,d12,d13
	vadd.i64	d14,d14,d15

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ lazy reduction, but without narrowing

	vshr.u64	q15,q8,#26
	vand.i64	q8,q8,q0
	vshr.u64	q4,q5,#26
	vand.i64	q5,q5,q0
	vadd.i64	q9,q9,q15		@ h3 -> h4
	vadd.i64	q6,q6,q4		@ h0 -> h1

	vshr.u64	q15,q9,#26
	vand.i64	q9,q9,q0
	vshr.u64	q4,q6,#26
	vand.i64	q6,q6,q0
	vadd.i64	q7,q7,q4		@ h1 -> h2

	vadd.i64	q5,q5,q15
	vshl.u64	q15,q15,#2
	vshr.u64	q4,q7,#26
	vand.i64	q7,q7,q0
	vadd.i64	q5,q5,q15		@ h4 -> h0
	vadd.i64	q8,q8,q4		@ h2 -> h3

	vshr.u64	q15,q5,#26
	vand.i64	q5,q5,q0
	vshr.u64	q4,q8,#26
	vand.i64	q8,q8,q0
	vadd.i64	q6,q6,q15		@ h0 -> h1
	vadd.i64	q9,q9,q4		@ h3 -> h4

	cmp	r2,#0
	bne	.Leven

	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
	@ store hash value

	vst4.32	{d10[0],d12[0],d14[0],d16[0]},[r0]!
	vst1.32	{d18[0]},[r0]

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ epilogue
	ldmia	sp!,{r4,r5,r6,r7}
.Lno_data_neon:
	bx	lr			@ bx	lr
.size	poly1305_blocks_neon,.-poly1305_blocks_neon

.type	poly1305_emit_neon,%function
.align	5
poly1305_emit_neon:
.Lpoly1305_emit_neon:
	ldr	ip,[r0,#36]		@ is_base2_26

	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}

	tst	ip,ip
	beq	.Lpoly1305_emit_enter

	ldmia	r0,{r3,r4,r5,r6,r7}
	eor	r8,r8,r8

	adds	r3,r3,r4,lsl#26		@ base 2^26 -> base 2^32
	mov	r4,r4,lsr#6
	adcs	r4,r4,r5,lsl#20
	mov	r5,r5,lsr#12
	adcs	r5,r5,r6,lsl#14
	mov	r6,r6,lsr#18
	adcs	r6,r6,r7,lsl#8
	adc	r7,r8,r7,lsr#24		@ can be partially reduced ...

	and	r8,r7,#-4		@ ... so reduce
	and	r7,r7,#3
	add	r8,r8,r8,lsr#2		@ *= 5
	adds	r3,r3,r8
	adcs	r4,r4,#0
	adcs	r5,r5,#0
	adcs	r6,r6,#0
	adc	r7,r7,#0

	adds	r8,r3,#5		@ compare to modulus
	adcs	r9,r4,#0
	adcs	r10,r5,#0
	adcs	r11,r6,#0
	adc	r7,r7,#0
	tst	r7,#4			@ did it carry/borrow?
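	@ From here on this mirrors the tail of the scalar poly1305_emit:
	@ if h+5 carried into bit 130 (bit 2 of r7), the reduced r8-r11
	@ value is selected before the 128-bit nonce at [r2] is added.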

	it	ne
	movne	r3,r8
	ldr	r8,[r2,#0]
	it	ne
	movne	r4,r9
	ldr	r9,[r2,#4]
	it	ne
	movne	r5,r10
	ldr	r10,[r2,#8]
	it	ne
	movne	r6,r11
	ldr	r11,[r2,#12]

	adds	r3,r3,r8		@ accumulate nonce
	adcs	r4,r4,r9
	adcs	r5,r5,r10
	adc	r6,r6,r11

# ifdef __ARMEB__
	rev	r3,r3
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
# endif
	str	r3,[r1,#0]		@ store the result
	str	r4,[r1,#4]
	str	r5,[r1,#8]
	str	r6,[r1,#12]

	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}
	bx	lr			@ bx	lr
.size	poly1305_emit_neon,.-poly1305_emit_neon

.align	5
.Lzeros:
.long	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lpoly1305_init
# endif
#endif
.byte	80,111,108,121,49,51,48,53,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#if	__ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif