/* Do not modify. This file is auto-generated from chacha-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

#if defined(__thumb2__) || defined(__clang__)
@ In unified syntax the condition code follows the size suffix,
@ so the pre-unified spelling "ldrhsb" must become "ldrbhs".
#define ldrhsb	ldrbhs
#endif

.text

.align	5
.Lsigma:
.long	0x61707865,0x3320646e,0x79622d32,0x6b206574	@ endian-neutral
.Lone:
.long	1,0,0,0
#if __ARM_MAX_ARCH__>=7
@ Capability word: on Win32 the absolute address of OPENSSL_armcap_P,
@ elsewhere its offset relative to .LChaCha20_ctr32 (PIC-friendly).
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.LChaCha20_ctr32
# endif
#else
.word	-1
#endif

@ void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
@                     size_t len, const unsigned int key[8],
@                     const unsigned int counter[4]);
@
@ Integer-only ChaCha20 in 32-bit-counter mode.
@ In:  r0 = out, r1 = inp, r2 = len, r3 = key (8 words),
@      [sp,#0] = pointer to the 4-word counter||nonce block.
@ The 16-word state is assembled on the stack from .Lsigma, the key
@ and the counter/nonce; during the rounds twelve state words live in
@ r0-r9,r12,r14 and the remaining four are swapped through stack slots
@ ("off-load area") because only 14 GPRs are usable.
@ When __ARM_MAX_ARCH__>=7 and OPENSSL_armcap_P reports ARMV7_NEON,
@ lengths above 192 bytes branch to ChaCha20_neon below.
@
@ Stack frame after setup (4-byte words, from sp):
@   [0..15]   working copy of the input state: sigma | key | counter|nonce
@   [16..31]  off-load area for state words not resident in registers
@   [32+0]    saved out, [32+1] saved inp, [32+2] saved len
.globl	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
.LChaCha20_ctr32:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r14,pc,#16		@ ChaCha20_ctr32
#else
	adr	r14,.LChaCha20_ctr32
#endif
	cmp	r2,#0			@ len==0?
#ifdef	__thumb2__
	itt	eq
#endif
	addeq	sp,sp,#4*3		@ drop saved r0-r2 before popping at .Lno_data
	beq	.Lno_data
#if __ARM_MAX_ARCH__>=7
	cmp	r2,#192			@ test len
	bls	.Lshort
	ldr	r4,[r14,#-32]		@ .LOPENSSL_armcap (32 bytes before entry)
# if !defined(_WIN32)
	ldr	r4,[r14,r4]		@ resolve relative offset to the flag word
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r4,[r4]
# endif
	tst	r4,#ARMV7_NEON
	bne	.LChaCha20_neon
.Lshort:
#endif
	ldmia	r12,{r4,r5,r6,r7}	@ load counter and nonce
	sub	sp,sp,#4*(16)		@ off-load area
	sub	r14,r14,#64		@ .Lsigma
	stmdb	sp!,{r4,r5,r6,r7}	@ copy counter and nonce
	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}	@ load key
	ldmia	r14,{r0,r1,r2,r3}	@ load sigma
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy key
	stmdb	sp!,{r0,r1,r2,r3}	@ copy sigma
	str	r10,[sp,#4*(16+10)]	@ off-load "rx"
	str	r11,[sp,#4*(16+11)]	@ off-load "rx"
	b	.Loop_outer_enter

@ One 64-byte block of keystream per outer iteration.
.align	4
.Loop_outer:
	ldmia	sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9}	@ load key material
	str	r11,[sp,#4*(32+2)]	@ save len
	str	r12, [sp,#4*(32+1)]	@ save inp
	str	r14, [sp,#4*(32+0)]	@ save out
.Loop_outer_enter:
	ldr	r11, [sp,#4*(15)]
	ldr	r12,[sp,#4*(12)]	@ modulo-scheduled load
	ldr	r10, [sp,#4*(13)]
	ldr	r14,[sp,#4*(14)]
	str	r11, [sp,#4*(16+15)]
	mov	r11,#10			@ 10 iterations = 20 ChaCha rounds
	b	.Loop

@ Inner round loop: each pass does the column quarter-rounds followed by
@ the diagonal quarter-rounds, two quarter-rounds interleaved at a time;
@ the four state words that do not fit in registers are rotated through
@ sp+4*(16+8..15) mid-loop (the str/ldr pairs below).
.align	4
.Loop:
	subs	r11,r11,#1
	add	r0,r0,r4
	mov	r12,r12,ror#16
	add	r1,r1,r5
	mov	r10,r10,ror#16
	eor	r12,r12,r0,ror#16
	eor	r10,r10,r1,ror#16
	add	r8,r8,r12
	mov	r4,r4,ror#20
	add	r9,r9,r10
	mov	r5,r5,ror#20
	eor	r4,r4,r8,ror#20
	eor	r5,r5,r9,ror#20
	add	r0,r0,r4
	mov	r12,r12,ror#24
	add	r1,r1,r5
	mov	r10,r10,ror#24
	eor	r12,r12,r0,ror#24
	eor	r10,r10,r1,ror#24
	add	r8,r8,r12
	mov	r4,r4,ror#25
	add	r9,r9,r10
	mov	r5,r5,ror#25
	str	r10,[sp,#4*(16+13)]
	ldr	r10,[sp,#4*(16+15)]
	eor	r4,r4,r8,ror#25
	eor	r5,r5,r9,ror#25
	str	r8,[sp,#4*(16+8)]
	ldr	r8,[sp,#4*(16+10)]
	add	r2,r2,r6
	mov	r14,r14,ror#16
	str	r9,[sp,#4*(16+9)]
	ldr	r9,[sp,#4*(16+11)]
	add	r3,r3,r7
	mov	r10,r10,ror#16
	eor	r14,r14,r2,ror#16
	eor	r10,r10,r3,ror#16
	add	r8,r8,r14
	mov	r6,r6,ror#20
	add	r9,r9,r10
	mov	r7,r7,ror#20
	eor	r6,r6,r8,ror#20
	eor	r7,r7,r9,ror#20
	add	r2,r2,r6
	mov	r14,r14,ror#24
	add	r3,r3,r7
	mov	r10,r10,ror#24
	eor	r14,r14,r2,ror#24
	eor	r10,r10,r3,ror#24
	add	r8,r8,r14
	mov	r6,r6,ror#25
	add	r9,r9,r10
	mov	r7,r7,ror#25
	eor	r6,r6,r8,ror#25
	eor	r7,r7,r9,ror#25
	add	r0,r0,r5
	mov	r10,r10,ror#16
	add	r1,r1,r6
	mov	r12,r12,ror#16
	eor	r10,r10,r0,ror#16
	eor	r12,r12,r1,ror#16
	add	r8,r8,r10
	mov	r5,r5,ror#20
	add	r9,r9,r12
	mov	r6,r6,ror#20
	eor	r5,r5,r8,ror#20
	eor	r6,r6,r9,ror#20
	add	r0,r0,r5
	mov	r10,r10,ror#24
	add	r1,r1,r6
	mov	r12,r12,ror#24
	eor	r10,r10,r0,ror#24
	eor	r12,r12,r1,ror#24
	add	r8,r8,r10
	mov	r5,r5,ror#25
	str	r10,[sp,#4*(16+15)]
	ldr	r10,[sp,#4*(16+13)]
	add	r9,r9,r12
	mov	r6,r6,ror#25
	eor	r5,r5,r8,ror#25
	eor	r6,r6,r9,ror#25
	str	r8,[sp,#4*(16+10)]
	ldr	r8,[sp,#4*(16+8)]
	add	r2,r2,r7
	mov	r10,r10,ror#16
	str	r9,[sp,#4*(16+11)]
	ldr	r9,[sp,#4*(16+9)]
	add	r3,r3,r4
	mov	r14,r14,ror#16
	eor	r10,r10,r2,ror#16
	eor	r14,r14,r3,ror#16
	add	r8,r8,r10
	mov	r7,r7,ror#20
	add	r9,r9,r14
	mov	r4,r4,ror#20
	eor	r7,r7,r8,ror#20
	eor	r4,r4,r9,ror#20
	add	r2,r2,r7
	mov	r10,r10,ror#24
	add	r3,r3,r4
	mov	r14,r14,ror#24
	eor	r10,r10,r2,ror#24
	eor	r14,r14,r3,ror#24
	add	r8,r8,r10
	mov	r7,r7,ror#25
	add	r9,r9,r14
	mov	r4,r4,ror#25
	eor	r7,r7,r8,ror#25
	eor	r4,r4,r9,ror#25
	bne	.Loop

	ldr	r11,[sp,#4*(32+2)]	@ load len

	str	r8, [sp,#4*(16+8)]	@ modulo-scheduled store
	str	r9, [sp,#4*(16+9)]
	str	r12,[sp,#4*(16+12)]
	str	r10, [sp,#4*(16+13)]
	str	r14,[sp,#4*(16+14)]

	@ at this point we have first half of 512-bit result in
	@ rx and second half at sp+4*(16+8)

	@ "lo" below means a partial (<64-byte) final block: the keystream
	@ is then written to the stack buffer and consumed by .Ltail.
	cmp	r11,#64		@ done yet?
#ifdef	__thumb2__
	itete	lo
#endif
	addlo	r12,sp,#4*(0)		@ shortcut or ...
	ldrhs	r12,[sp,#4*(32+1)]	@ ... load inp
	addlo	r14,sp,#4*(0)		@ shortcut or ...
	ldrhs	r14,[sp,#4*(32+0)]	@ ... load out

	ldr	r8,[sp,#4*(0)]	@ load key material
	ldr	r9,[sp,#4*(1)]

#if __ARM_ARCH__>=6 || !defined(__ARMEB__)
# if __ARM_ARCH__<7
	orr	r10,r12,r14
	tst	r10,#3		@ are input and output aligned?
	ldr	r10,[sp,#4*(2)]
	bne	.Lunaligned
	cmp	r11,#64		@ restore flags
# else
	ldr	r10,[sp,#4*(2)]
# endif
	ldr	r11,[sp,#4*(3)]

	@ Word-aligned path: add the saved input state back into the round
	@ output, byte-swap on big-endian, XOR with input (if any), store.
	add	r0,r0,r8	@ accumulate key material
	add	r1,r1,r9
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16		@ load input
	ldrhs	r9,[r12,#-12]

	add	r2,r2,r10
	add	r3,r3,r11
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r0,r0,r8	@ xor with input
	eorhs	r1,r1,r9
	add	r8,sp,#4*(4)
	str	r0,[r14],#16	@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r2,r2,r10
	eorhs	r3,r3,r11
	ldmia	r8,{r8,r9,r10,r11}	@ load key material
	str	r1,[r14,#-12]
	str	r2,[r14,#-8]
	str	r3,[r14,#-4]

	add	r4,r4,r8	@ accumulate key material
	add	r5,r5,r9
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16	@ load input
	ldrhs	r9,[r12,#-12]
	add	r6,r6,r10
	add	r7,r7,r11
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r4,r4,r8
	eorhs	r5,r5,r9
	add	r8,sp,#4*(8)
	str	r4,[r14],#16	@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r6,r6,r10
	eorhs	r7,r7,r11
	str	r5,[r14,#-12]
	ldmia	r8,{r8,r9,r10,r11}	@ load key material
	str	r6,[r14,#-8]
	add	r0,sp,#4*(16+8)
	str	r7,[r14,#-4]

	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half

	add	r0,r0,r8	@ accumulate key material
	add	r1,r1,r9
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16	@ load input
	ldrhs	r9,[r12,#-12]
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	r10,[sp,#4*(16+10)]	@ copy "rx" while at it
	strhi	r11,[sp,#4*(16+11)]	@ copy "rx" while at it
	add	r2,r2,r10
	add	r3,r3,r11
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r0,r0
	rev	r1,r1
	rev	r2,r2
	rev	r3,r3
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r0,r0,r8
	eorhs	r1,r1,r9
	add	r8,sp,#4*(12)
	str	r0,[r14],#16	@ store output
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r2,r2,r10
	eorhs	r3,r3,r11
	str	r1,[r14,#-12]
	ldmia	r8,{r8,r9,r10,r11}	@ load key material
	str	r2,[r14,#-8]
	str	r3,[r14,#-4]

	add	r4,r4,r8	@ accumulate key material
	add	r5,r5,r9
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	r8,r8,#1	@ next counter value
	strhi	r8,[sp,#4*(12)]	@ save next counter value
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r8,[r12],#16	@ load input
	ldrhs	r9,[r12,#-12]
	add	r6,r6,r10
	add	r7,r7,r11
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhs	r10,[r12,#-8]
	ldrhs	r11,[r12,#-4]
# if __ARM_ARCH__>=6 && defined(__ARMEB__)
	rev	r4,r4
	rev	r5,r5
	rev	r6,r6
	rev	r7,r7
# endif
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r4,r4,r8
	eorhs	r5,r5,r9
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	itt	hs
# endif
	eorhs	r6,r6,r10
	eorhs	r7,r7,r11
	str	r4,[r14],#16	@ store output
	str	r5,[r14,#-12]
# ifdef	__thumb2__
	it	hs
# endif
	subhs	r11,r8,#64	@ len-=64
	str	r6,[r14,#-8]
	str	r7,[r14,#-4]
	bhi	.Loop_outer

	beq	.Ldone
# if __ARM_ARCH__<7
	b	.Ltail

@ Byte-at-a-time path for unaligned in/out pointers on pre-v6 cores
@ (v6+ handles unaligned words directly, so this path is compiled out).
.align	4
.Lunaligned:@ unaligned endian-neutral path
	cmp	r11,#64		@ restore flags
# endif
#endif
#if __ARM_ARCH__<7
	ldr	r11,[sp,#4*(3)]
	add	r0,r0,r8	@ accumulate key material
	add	r1,r1,r9
	add	r2,r2,r10
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8	@ zero or ...
	ldrhsb	r8,[r12],#16	@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r3,r3,r11
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r0,r8,r0	@ xor with input (or zero)
	eor	r1,r9,r1
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]	@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r2,r10,r2
	strb	r0,[r14],#16	@ store output
	eor	r3,r11,r3
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r1,[r14,#-12]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-8]
	eor	r1,r9,r1,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]	@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r3,[r14,#-4]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-15]
	eor	r3,r11,r3,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r1,[r14,#-11]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-7]
	eor	r1,r9,r1,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]	@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r3,[r14,#-3]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-14]
	eor	r3,r11,r3,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r1,[r14,#-10]
	strb	r2,[r14,#-6]
	eor	r0,r8,r0,lsr#8
	strb	r3,[r14,#-2]
	eor	r1,r9,r1,lsr#8
	strb	r0,[r14,#-13]
	eor	r2,r10,r2,lsr#8
	strb	r1,[r14,#-9]
	eor	r3,r11,r3,lsr#8
	strb	r2,[r14,#-5]
	strb	r3,[r14,#-1]
	add	r8,sp,#4*(4+0)
	ldmia	r8,{r8,r9,r10,r11}	@ load key material
	add	r0,sp,#4*(16+8)
	add	r4,r4,r8	@ accumulate key material
	add	r5,r5,r9
	add	r6,r6,r10
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8	@ zero or ...
	ldrhsb	r8,[r12],#16	@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r7,r7,r11
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r4,r8,r4	@ xor with input (or zero)
	eor	r5,r9,r5
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]	@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r6,r10,r6
	strb	r4,[r14],#16	@ store output
	eor	r7,r11,r7
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r5,[r14,#-12]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-8]
	eor	r5,r9,r5,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]	@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r7,[r14,#-4]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-15]
	eor	r7,r11,r7,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r5,[r14,#-11]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-7]
	eor	r5,r9,r5,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]	@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r7,[r14,#-3]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-14]
	eor	r7,r11,r7,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r5,[r14,#-10]
	strb	r6,[r14,#-6]
	eor	r4,r8,r4,lsr#8
	strb	r7,[r14,#-2]
	eor	r5,r9,r5,lsr#8
	strb	r4,[r14,#-13]
	eor	r6,r10,r6,lsr#8
	strb	r5,[r14,#-9]
	eor	r7,r11,r7,lsr#8
	strb	r6,[r14,#-5]
	strb	r7,[r14,#-1]
	add	r8,sp,#4*(4+4)
	ldmia	r8,{r8,r9,r10,r11}	@ load key material
	ldmia	r0,{r0,r1,r2,r3,r4,r5,r6,r7}	@ load second half
# ifdef	__thumb2__
	itt	hi
# endif
	strhi	r10,[sp,#4*(16+10)]	@ copy "rx"
	strhi	r11,[sp,#4*(16+11)]	@ copy "rx"
	add	r0,r0,r8	@ accumulate key material
	add	r1,r1,r9
	add	r2,r2,r10
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8	@ zero or ...
	ldrhsb	r8,[r12],#16	@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r3,r3,r11
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r0,r8,r0	@ xor with input (or zero)
	eor	r1,r9,r1
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]	@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r2,r10,r2
	strb	r0,[r14],#16	@ store output
	eor	r3,r11,r3
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r1,[r14,#-12]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-8]
	eor	r1,r9,r1,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]	@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r3,[r14,#-4]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-15]
	eor	r3,r11,r3,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r1,[r14,#-11]
	eor	r0,r8,r0,lsr#8
	strb	r2,[r14,#-7]
	eor	r1,r9,r1,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]	@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r3,[r14,#-3]
	eor	r2,r10,r2,lsr#8
	strb	r0,[r14,#-14]
	eor	r3,r11,r3,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r1,[r14,#-10]
	strb	r2,[r14,#-6]
	eor	r0,r8,r0,lsr#8
	strb	r3,[r14,#-2]
	eor	r1,r9,r1,lsr#8
	strb	r0,[r14,#-13]
	eor	r2,r10,r2,lsr#8
	strb	r1,[r14,#-9]
	eor	r3,r11,r3,lsr#8
	strb	r2,[r14,#-5]
	strb	r3,[r14,#-1]
	add	r8,sp,#4*(4+8)
	ldmia	r8,{r8,r9,r10,r11}	@ load key material
	add	r4,r4,r8	@ accumulate key material
# ifdef	__thumb2__
	itt	hi
# endif
	addhi	r8,r8,#1	@ next counter value
	strhi	r8,[sp,#4*(12)]	@ save next counter value
	add	r5,r5,r9
	add	r6,r6,r10
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r8,r8,r8	@ zero or ...
	ldrhsb	r8,[r12],#16	@ ... load input
	eorlo	r9,r9,r9
	ldrhsb	r9,[r12,#-12]

	add	r7,r7,r11
# ifdef	__thumb2__
	itete	lo
# endif
	eorlo	r10,r10,r10
	ldrhsb	r10,[r12,#-8]
	eorlo	r11,r11,r11
	ldrhsb	r11,[r12,#-4]

	eor	r4,r8,r4	@ xor with input (or zero)
	eor	r5,r9,r5
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-15]	@ load more input
	ldrhsb	r9,[r12,#-11]
	eor	r6,r10,r6
	strb	r4,[r14],#16	@ store output
	eor	r7,r11,r7
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-7]
	ldrhsb	r11,[r12,#-3]
	strb	r5,[r14,#-12]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-8]
	eor	r5,r9,r5,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-14]	@ load more input
	ldrhsb	r9,[r12,#-10]
	strb	r7,[r14,#-4]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-15]
	eor	r7,r11,r7,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-6]
	ldrhsb	r11,[r12,#-2]
	strb	r5,[r14,#-11]
	eor	r4,r8,r4,lsr#8
	strb	r6,[r14,#-7]
	eor	r5,r9,r5,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r8,[r12,#-13]	@ load more input
	ldrhsb	r9,[r12,#-9]
	strb	r7,[r14,#-3]
	eor	r6,r10,r6,lsr#8
	strb	r4,[r14,#-14]
	eor	r7,r11,r7,lsr#8
# ifdef	__thumb2__
	itt	hs
# endif
	ldrhsb	r10,[r12,#-5]
	ldrhsb	r11,[r12,#-1]
	strb	r5,[r14,#-10]
	strb	r6,[r14,#-6]
	eor	r4,r8,r4,lsr#8
	strb	r7,[r14,#-2]
	eor	r5,r9,r5,lsr#8
	strb	r4,[r14,#-13]
	eor	r6,r10,r6,lsr#8
	strb	r5,[r14,#-9]
	eor	r7,r11,r7,lsr#8
	strb	r6,[r14,#-5]
	strb	r7,[r14,#-1]
# ifdef	__thumb2__
	it	ne
# endif
	ldrne	r8,[sp,#4*(32+2)]	@ re-load len
# ifdef	__thumb2__
	it	hs
# endif
	subhs	r11,r8,#64	@ len-=64
	bhi	.Loop_outer

	beq	.Ldone
#endif

@ Final len%64 bytes: the last keystream block was written to the stack
@ buffer at sp+4*0 above; XOR it with the input byte by byte.
.Ltail:
	ldr	r12,[sp,#4*(32+1)]	@ load inp
	add	r9,sp,#4*(0)
	ldr	r14,[sp,#4*(32+0)]	@ load out

.Loop_tail:
	ldrb	r10,[r9],#1	@ read buffer on stack
	ldrb	r11,[r12],#1	@ read input
	subs	r8,r8,#1
	eor	r11,r11,r10
	strb	r11,[r14],#1	@ store output
	bne	.Loop_tail

.Ldone:
	add	sp,sp,#4*(32+3)
.Lno_data:
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
.size	ChaCha20_ctr32,.-ChaCha20_ctr32
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

@ NEON companion to ChaCha20_ctr32; same argument layout (r0=out, r1=inp,
@ r2=len, r3=key, [sp,#0]=counter/nonce). Entered via the ARMV7_NEON
@ capability test above for len>192; saves d8-d15 per the AAPCS.
@ (Body continues beyond this excerpt.)
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	ldr	r12,[sp,#0]		@ pull pointer to counter and nonce
	stmdb	sp!,{r0,r1,r2,r4-r11,lr}
.LChaCha20_neon:
	adr	r14,.Lsigma
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ ABI spec says so
	stmdb	sp!,{r0,r1,r2,r3}

	vld1.32	{q1,q2},[r3]		@ load key
	ldmia	r3,{r4,r5,r6,r7,r8,r9,r10,r11}	@ load key

	sub	sp,sp,#4*(16+16)
	vld1.32	{q3},[r12]		@ load counter and nonce
	add	r12,sp,#4*8
	ldmia	r14,{r0,r1,r2,r3}	@ load sigma
	vld1.32	{q0},[r14]!
@ load sigma 821bc3d5698SJohn Baldwin vld1.32 {q12},[r14] @ one 822bc3d5698SJohn Baldwin vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce 823bc3d5698SJohn Baldwin vst1.32 {q0,q1},[sp] @ copy sigma|1/2key 824bc3d5698SJohn Baldwin 825bc3d5698SJohn Baldwin str r10,[sp,#4*(16+10)] @ off-load "rx" 826bc3d5698SJohn Baldwin str r11,[sp,#4*(16+11)] @ off-load "rx" 827bc3d5698SJohn Baldwin vshl.i32 d26,d24,#1 @ two 828bc3d5698SJohn Baldwin vstr d24,[sp,#4*(16+0)] 829bc3d5698SJohn Baldwin vshl.i32 d28,d24,#2 @ four 830bc3d5698SJohn Baldwin vstr d26,[sp,#4*(16+2)] 831bc3d5698SJohn Baldwin vmov q4,q0 832bc3d5698SJohn Baldwin vstr d28,[sp,#4*(16+4)] 833bc3d5698SJohn Baldwin vmov q8,q0 834bc3d5698SJohn Baldwin vmov q5,q1 835bc3d5698SJohn Baldwin vmov q9,q1 836bc3d5698SJohn Baldwin b .Loop_neon_enter 837bc3d5698SJohn Baldwin 838bc3d5698SJohn Baldwin.align 4 839bc3d5698SJohn Baldwin.Loop_neon_outer: 840bc3d5698SJohn Baldwin ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material 841bc3d5698SJohn Baldwin cmp r11,#64*2 @ if len<=64*2 842bc3d5698SJohn Baldwin bls .Lbreak_neon @ switch to integer-only 843bc3d5698SJohn Baldwin vmov q4,q0 844bc3d5698SJohn Baldwin str r11,[sp,#4*(32+2)] @ save len 845bc3d5698SJohn Baldwin vmov q8,q0 846bc3d5698SJohn Baldwin str r12, [sp,#4*(32+1)] @ save inp 847bc3d5698SJohn Baldwin vmov q5,q1 848bc3d5698SJohn Baldwin str r14, [sp,#4*(32+0)] @ save out 849bc3d5698SJohn Baldwin vmov q9,q1 850bc3d5698SJohn Baldwin.Loop_neon_enter: 851bc3d5698SJohn Baldwin ldr r11, [sp,#4*(15)] 852bc3d5698SJohn Baldwin vadd.i32 q7,q3,q12 @ counter+1 853bc3d5698SJohn Baldwin ldr r12,[sp,#4*(12)] @ modulo-scheduled load 854bc3d5698SJohn Baldwin vmov q6,q2 855bc3d5698SJohn Baldwin ldr r10, [sp,#4*(13)] 856bc3d5698SJohn Baldwin vmov q10,q2 857bc3d5698SJohn Baldwin ldr r14,[sp,#4*(14)] 858bc3d5698SJohn Baldwin vadd.i32 q11,q7,q12 @ counter+2 859bc3d5698SJohn Baldwin str r11, [sp,#4*(16+15)] 860bc3d5698SJohn Baldwin mov r11,#10 861bc3d5698SJohn Baldwin add r12,r12,#3 @ 
counter+3 862bc3d5698SJohn Baldwin b .Loop_neon 863bc3d5698SJohn Baldwin 864bc3d5698SJohn Baldwin.align 4 865bc3d5698SJohn Baldwin.Loop_neon: 866bc3d5698SJohn Baldwin subs r11,r11,#1 867bc3d5698SJohn Baldwin vadd.i32 q0,q0,q1 868bc3d5698SJohn Baldwin add r0,r0,r4 869bc3d5698SJohn Baldwin vadd.i32 q4,q4,q5 870bc3d5698SJohn Baldwin mov r12,r12,ror#16 871bc3d5698SJohn Baldwin vadd.i32 q8,q8,q9 872bc3d5698SJohn Baldwin add r1,r1,r5 873bc3d5698SJohn Baldwin veor q3,q3,q0 874bc3d5698SJohn Baldwin mov r10,r10,ror#16 875bc3d5698SJohn Baldwin veor q7,q7,q4 876bc3d5698SJohn Baldwin eor r12,r12,r0,ror#16 877bc3d5698SJohn Baldwin veor q11,q11,q8 878bc3d5698SJohn Baldwin eor r10,r10,r1,ror#16 879bc3d5698SJohn Baldwin vrev32.16 q3,q3 880bc3d5698SJohn Baldwin add r8,r8,r12 881bc3d5698SJohn Baldwin vrev32.16 q7,q7 882bc3d5698SJohn Baldwin mov r4,r4,ror#20 883bc3d5698SJohn Baldwin vrev32.16 q11,q11 884bc3d5698SJohn Baldwin add r9,r9,r10 885bc3d5698SJohn Baldwin vadd.i32 q2,q2,q3 886bc3d5698SJohn Baldwin mov r5,r5,ror#20 887bc3d5698SJohn Baldwin vadd.i32 q6,q6,q7 888bc3d5698SJohn Baldwin eor r4,r4,r8,ror#20 889bc3d5698SJohn Baldwin vadd.i32 q10,q10,q11 890bc3d5698SJohn Baldwin eor r5,r5,r9,ror#20 891bc3d5698SJohn Baldwin veor q12,q1,q2 892bc3d5698SJohn Baldwin add r0,r0,r4 893bc3d5698SJohn Baldwin veor q13,q5,q6 894bc3d5698SJohn Baldwin mov r12,r12,ror#24 895bc3d5698SJohn Baldwin veor q14,q9,q10 896bc3d5698SJohn Baldwin add r1,r1,r5 897bc3d5698SJohn Baldwin vshr.u32 q1,q12,#20 898bc3d5698SJohn Baldwin mov r10,r10,ror#24 899bc3d5698SJohn Baldwin vshr.u32 q5,q13,#20 900bc3d5698SJohn Baldwin eor r12,r12,r0,ror#24 901bc3d5698SJohn Baldwin vshr.u32 q9,q14,#20 902bc3d5698SJohn Baldwin eor r10,r10,r1,ror#24 903bc3d5698SJohn Baldwin vsli.32 q1,q12,#12 904bc3d5698SJohn Baldwin add r8,r8,r12 905bc3d5698SJohn Baldwin vsli.32 q5,q13,#12 906bc3d5698SJohn Baldwin mov r4,r4,ror#25 907bc3d5698SJohn Baldwin vsli.32 q9,q14,#12 908bc3d5698SJohn Baldwin add r9,r9,r10 909bc3d5698SJohn Baldwin vadd.i32 
q0,q0,q1 910bc3d5698SJohn Baldwin mov r5,r5,ror#25 911bc3d5698SJohn Baldwin vadd.i32 q4,q4,q5 912bc3d5698SJohn Baldwin str r10,[sp,#4*(16+13)] 913bc3d5698SJohn Baldwin vadd.i32 q8,q8,q9 914bc3d5698SJohn Baldwin ldr r10,[sp,#4*(16+15)] 915bc3d5698SJohn Baldwin veor q12,q3,q0 916bc3d5698SJohn Baldwin eor r4,r4,r8,ror#25 917bc3d5698SJohn Baldwin veor q13,q7,q4 918bc3d5698SJohn Baldwin eor r5,r5,r9,ror#25 919bc3d5698SJohn Baldwin veor q14,q11,q8 920bc3d5698SJohn Baldwin str r8,[sp,#4*(16+8)] 921bc3d5698SJohn Baldwin vshr.u32 q3,q12,#24 922bc3d5698SJohn Baldwin ldr r8,[sp,#4*(16+10)] 923bc3d5698SJohn Baldwin vshr.u32 q7,q13,#24 924bc3d5698SJohn Baldwin add r2,r2,r6 925bc3d5698SJohn Baldwin vshr.u32 q11,q14,#24 926bc3d5698SJohn Baldwin mov r14,r14,ror#16 927bc3d5698SJohn Baldwin vsli.32 q3,q12,#8 928bc3d5698SJohn Baldwin str r9,[sp,#4*(16+9)] 929bc3d5698SJohn Baldwin vsli.32 q7,q13,#8 930bc3d5698SJohn Baldwin ldr r9,[sp,#4*(16+11)] 931bc3d5698SJohn Baldwin vsli.32 q11,q14,#8 932bc3d5698SJohn Baldwin add r3,r3,r7 933bc3d5698SJohn Baldwin vadd.i32 q2,q2,q3 934bc3d5698SJohn Baldwin mov r10,r10,ror#16 935bc3d5698SJohn Baldwin vadd.i32 q6,q6,q7 936bc3d5698SJohn Baldwin eor r14,r14,r2,ror#16 937bc3d5698SJohn Baldwin vadd.i32 q10,q10,q11 938bc3d5698SJohn Baldwin eor r10,r10,r3,ror#16 939bc3d5698SJohn Baldwin veor q12,q1,q2 940bc3d5698SJohn Baldwin add r8,r8,r14 941bc3d5698SJohn Baldwin veor q13,q5,q6 942bc3d5698SJohn Baldwin mov r6,r6,ror#20 943bc3d5698SJohn Baldwin veor q14,q9,q10 944bc3d5698SJohn Baldwin add r9,r9,r10 945bc3d5698SJohn Baldwin vshr.u32 q1,q12,#25 946bc3d5698SJohn Baldwin mov r7,r7,ror#20 947bc3d5698SJohn Baldwin vshr.u32 q5,q13,#25 948bc3d5698SJohn Baldwin eor r6,r6,r8,ror#20 949bc3d5698SJohn Baldwin vshr.u32 q9,q14,#25 950bc3d5698SJohn Baldwin eor r7,r7,r9,ror#20 951bc3d5698SJohn Baldwin vsli.32 q1,q12,#7 952bc3d5698SJohn Baldwin add r2,r2,r6 953bc3d5698SJohn Baldwin vsli.32 q5,q13,#7 954bc3d5698SJohn Baldwin mov r14,r14,ror#24 955bc3d5698SJohn Baldwin 
vsli.32 q9,q14,#7 956bc3d5698SJohn Baldwin add r3,r3,r7 957bc3d5698SJohn Baldwin vext.8 q2,q2,q2,#8 958bc3d5698SJohn Baldwin mov r10,r10,ror#24 959bc3d5698SJohn Baldwin vext.8 q6,q6,q6,#8 960bc3d5698SJohn Baldwin eor r14,r14,r2,ror#24 961bc3d5698SJohn Baldwin vext.8 q10,q10,q10,#8 962bc3d5698SJohn Baldwin eor r10,r10,r3,ror#24 963bc3d5698SJohn Baldwin vext.8 q1,q1,q1,#4 964bc3d5698SJohn Baldwin add r8,r8,r14 965bc3d5698SJohn Baldwin vext.8 q5,q5,q5,#4 966bc3d5698SJohn Baldwin mov r6,r6,ror#25 967bc3d5698SJohn Baldwin vext.8 q9,q9,q9,#4 968bc3d5698SJohn Baldwin add r9,r9,r10 969bc3d5698SJohn Baldwin vext.8 q3,q3,q3,#12 970bc3d5698SJohn Baldwin mov r7,r7,ror#25 971bc3d5698SJohn Baldwin vext.8 q7,q7,q7,#12 972bc3d5698SJohn Baldwin eor r6,r6,r8,ror#25 973bc3d5698SJohn Baldwin vext.8 q11,q11,q11,#12 974bc3d5698SJohn Baldwin eor r7,r7,r9,ror#25 975bc3d5698SJohn Baldwin vadd.i32 q0,q0,q1 976bc3d5698SJohn Baldwin add r0,r0,r5 977bc3d5698SJohn Baldwin vadd.i32 q4,q4,q5 978bc3d5698SJohn Baldwin mov r10,r10,ror#16 979bc3d5698SJohn Baldwin vadd.i32 q8,q8,q9 980bc3d5698SJohn Baldwin add r1,r1,r6 981bc3d5698SJohn Baldwin veor q3,q3,q0 982bc3d5698SJohn Baldwin mov r12,r12,ror#16 983bc3d5698SJohn Baldwin veor q7,q7,q4 984bc3d5698SJohn Baldwin eor r10,r10,r0,ror#16 985bc3d5698SJohn Baldwin veor q11,q11,q8 986bc3d5698SJohn Baldwin eor r12,r12,r1,ror#16 987bc3d5698SJohn Baldwin vrev32.16 q3,q3 988bc3d5698SJohn Baldwin add r8,r8,r10 989bc3d5698SJohn Baldwin vrev32.16 q7,q7 990bc3d5698SJohn Baldwin mov r5,r5,ror#20 991bc3d5698SJohn Baldwin vrev32.16 q11,q11 992bc3d5698SJohn Baldwin add r9,r9,r12 993bc3d5698SJohn Baldwin vadd.i32 q2,q2,q3 994bc3d5698SJohn Baldwin mov r6,r6,ror#20 995bc3d5698SJohn Baldwin vadd.i32 q6,q6,q7 996bc3d5698SJohn Baldwin eor r5,r5,r8,ror#20 997bc3d5698SJohn Baldwin vadd.i32 q10,q10,q11 998bc3d5698SJohn Baldwin eor r6,r6,r9,ror#20 999bc3d5698SJohn Baldwin veor q12,q1,q2 1000bc3d5698SJohn Baldwin add r0,r0,r5 1001bc3d5698SJohn Baldwin veor q13,q5,q6 
1002bc3d5698SJohn Baldwin mov r10,r10,ror#24 1003bc3d5698SJohn Baldwin veor q14,q9,q10 1004bc3d5698SJohn Baldwin add r1,r1,r6 1005bc3d5698SJohn Baldwin vshr.u32 q1,q12,#20 1006bc3d5698SJohn Baldwin mov r12,r12,ror#24 1007bc3d5698SJohn Baldwin vshr.u32 q5,q13,#20 1008bc3d5698SJohn Baldwin eor r10,r10,r0,ror#24 1009bc3d5698SJohn Baldwin vshr.u32 q9,q14,#20 1010bc3d5698SJohn Baldwin eor r12,r12,r1,ror#24 1011bc3d5698SJohn Baldwin vsli.32 q1,q12,#12 1012bc3d5698SJohn Baldwin add r8,r8,r10 1013bc3d5698SJohn Baldwin vsli.32 q5,q13,#12 1014bc3d5698SJohn Baldwin mov r5,r5,ror#25 1015bc3d5698SJohn Baldwin vsli.32 q9,q14,#12 1016bc3d5698SJohn Baldwin str r10,[sp,#4*(16+15)] 1017bc3d5698SJohn Baldwin vadd.i32 q0,q0,q1 1018bc3d5698SJohn Baldwin ldr r10,[sp,#4*(16+13)] 1019bc3d5698SJohn Baldwin vadd.i32 q4,q4,q5 1020bc3d5698SJohn Baldwin add r9,r9,r12 1021bc3d5698SJohn Baldwin vadd.i32 q8,q8,q9 1022bc3d5698SJohn Baldwin mov r6,r6,ror#25 1023bc3d5698SJohn Baldwin veor q12,q3,q0 1024bc3d5698SJohn Baldwin eor r5,r5,r8,ror#25 1025bc3d5698SJohn Baldwin veor q13,q7,q4 1026bc3d5698SJohn Baldwin eor r6,r6,r9,ror#25 1027bc3d5698SJohn Baldwin veor q14,q11,q8 1028bc3d5698SJohn Baldwin str r8,[sp,#4*(16+10)] 1029bc3d5698SJohn Baldwin vshr.u32 q3,q12,#24 1030bc3d5698SJohn Baldwin ldr r8,[sp,#4*(16+8)] 1031bc3d5698SJohn Baldwin vshr.u32 q7,q13,#24 1032bc3d5698SJohn Baldwin add r2,r2,r7 1033bc3d5698SJohn Baldwin vshr.u32 q11,q14,#24 1034bc3d5698SJohn Baldwin mov r10,r10,ror#16 1035bc3d5698SJohn Baldwin vsli.32 q3,q12,#8 1036bc3d5698SJohn Baldwin str r9,[sp,#4*(16+11)] 1037bc3d5698SJohn Baldwin vsli.32 q7,q13,#8 1038bc3d5698SJohn Baldwin ldr r9,[sp,#4*(16+9)] 1039bc3d5698SJohn Baldwin vsli.32 q11,q14,#8 1040bc3d5698SJohn Baldwin add r3,r3,r4 1041bc3d5698SJohn Baldwin vadd.i32 q2,q2,q3 1042bc3d5698SJohn Baldwin mov r14,r14,ror#16 1043bc3d5698SJohn Baldwin vadd.i32 q6,q6,q7 1044bc3d5698SJohn Baldwin eor r10,r10,r2,ror#16 1045bc3d5698SJohn Baldwin vadd.i32 q10,q10,q11 1046bc3d5698SJohn Baldwin 
eor r14,r14,r3,ror#16 1047bc3d5698SJohn Baldwin veor q12,q1,q2 1048bc3d5698SJohn Baldwin add r8,r8,r10 1049bc3d5698SJohn Baldwin veor q13,q5,q6 1050bc3d5698SJohn Baldwin mov r7,r7,ror#20 1051bc3d5698SJohn Baldwin veor q14,q9,q10 1052bc3d5698SJohn Baldwin add r9,r9,r14 1053bc3d5698SJohn Baldwin vshr.u32 q1,q12,#25 1054bc3d5698SJohn Baldwin mov r4,r4,ror#20 1055bc3d5698SJohn Baldwin vshr.u32 q5,q13,#25 1056bc3d5698SJohn Baldwin eor r7,r7,r8,ror#20 1057bc3d5698SJohn Baldwin vshr.u32 q9,q14,#25 1058bc3d5698SJohn Baldwin eor r4,r4,r9,ror#20 1059bc3d5698SJohn Baldwin vsli.32 q1,q12,#7 1060bc3d5698SJohn Baldwin add r2,r2,r7 1061bc3d5698SJohn Baldwin vsli.32 q5,q13,#7 1062bc3d5698SJohn Baldwin mov r10,r10,ror#24 1063bc3d5698SJohn Baldwin vsli.32 q9,q14,#7 1064bc3d5698SJohn Baldwin add r3,r3,r4 1065bc3d5698SJohn Baldwin vext.8 q2,q2,q2,#8 1066bc3d5698SJohn Baldwin mov r14,r14,ror#24 1067bc3d5698SJohn Baldwin vext.8 q6,q6,q6,#8 1068bc3d5698SJohn Baldwin eor r10,r10,r2,ror#24 1069bc3d5698SJohn Baldwin vext.8 q10,q10,q10,#8 1070bc3d5698SJohn Baldwin eor r14,r14,r3,ror#24 1071bc3d5698SJohn Baldwin vext.8 q1,q1,q1,#12 1072bc3d5698SJohn Baldwin add r8,r8,r10 1073bc3d5698SJohn Baldwin vext.8 q5,q5,q5,#12 1074bc3d5698SJohn Baldwin mov r7,r7,ror#25 1075bc3d5698SJohn Baldwin vext.8 q9,q9,q9,#12 1076bc3d5698SJohn Baldwin add r9,r9,r14 1077bc3d5698SJohn Baldwin vext.8 q3,q3,q3,#4 1078bc3d5698SJohn Baldwin mov r4,r4,ror#25 1079bc3d5698SJohn Baldwin vext.8 q7,q7,q7,#4 1080bc3d5698SJohn Baldwin eor r7,r7,r8,ror#25 1081bc3d5698SJohn Baldwin vext.8 q11,q11,q11,#4 1082bc3d5698SJohn Baldwin eor r4,r4,r9,ror#25 1083bc3d5698SJohn Baldwin bne .Loop_neon 1084bc3d5698SJohn Baldwin 1085bc3d5698SJohn Baldwin add r11,sp,#32 1086bc3d5698SJohn Baldwin vld1.32 {q12,q13},[sp] @ load key material 1087bc3d5698SJohn Baldwin vld1.32 {q14,q15},[r11] 1088bc3d5698SJohn Baldwin 1089bc3d5698SJohn Baldwin ldr r11,[sp,#4*(32+2)] @ load len 1090bc3d5698SJohn Baldwin 1091bc3d5698SJohn Baldwin str r8, [sp,#4*(16+8)] @ 
modulo-scheduled store 1092bc3d5698SJohn Baldwin str r9, [sp,#4*(16+9)] 1093bc3d5698SJohn Baldwin str r12,[sp,#4*(16+12)] 1094bc3d5698SJohn Baldwin str r10, [sp,#4*(16+13)] 1095bc3d5698SJohn Baldwin str r14,[sp,#4*(16+14)] 1096bc3d5698SJohn Baldwin 1097bc3d5698SJohn Baldwin @ at this point we have first half of 512-bit result in 1098bc3d5698SJohn Baldwin @ rx and second half at sp+4*(16+8) 1099bc3d5698SJohn Baldwin 1100bc3d5698SJohn Baldwin ldr r12,[sp,#4*(32+1)] @ load inp 1101bc3d5698SJohn Baldwin ldr r14,[sp,#4*(32+0)] @ load out 1102bc3d5698SJohn Baldwin 1103bc3d5698SJohn Baldwin vadd.i32 q0,q0,q12 @ accumulate key material 1104bc3d5698SJohn Baldwin vadd.i32 q4,q4,q12 1105bc3d5698SJohn Baldwin vadd.i32 q8,q8,q12 1106bc3d5698SJohn Baldwin vldr d24,[sp,#4*(16+0)] @ one 1107bc3d5698SJohn Baldwin 1108bc3d5698SJohn Baldwin vadd.i32 q1,q1,q13 1109bc3d5698SJohn Baldwin vadd.i32 q5,q5,q13 1110bc3d5698SJohn Baldwin vadd.i32 q9,q9,q13 1111bc3d5698SJohn Baldwin vldr d26,[sp,#4*(16+2)] @ two 1112bc3d5698SJohn Baldwin 1113bc3d5698SJohn Baldwin vadd.i32 q2,q2,q14 1114bc3d5698SJohn Baldwin vadd.i32 q6,q6,q14 1115bc3d5698SJohn Baldwin vadd.i32 q10,q10,q14 1116bc3d5698SJohn Baldwin vadd.i32 d14,d14,d24 @ counter+1 1117bc3d5698SJohn Baldwin vadd.i32 d22,d22,d26 @ counter+2 1118bc3d5698SJohn Baldwin 1119bc3d5698SJohn Baldwin vadd.i32 q3,q3,q15 1120bc3d5698SJohn Baldwin vadd.i32 q7,q7,q15 1121bc3d5698SJohn Baldwin vadd.i32 q11,q11,q15 1122bc3d5698SJohn Baldwin 1123bc3d5698SJohn Baldwin cmp r11,#64*4 1124bc3d5698SJohn Baldwin blo .Ltail_neon 1125bc3d5698SJohn Baldwin 1126bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! @ load input 1127bc3d5698SJohn Baldwin mov r11,sp 1128bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1129bc3d5698SJohn Baldwin veor q0,q0,q12 @ xor with input 1130bc3d5698SJohn Baldwin veor q1,q1,q13 1131bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 
1132bc3d5698SJohn Baldwin veor q2,q2,q14 1133bc3d5698SJohn Baldwin veor q3,q3,q15 1134bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1135bc3d5698SJohn Baldwin 1136bc3d5698SJohn Baldwin veor q4,q4,q12 1137bc3d5698SJohn Baldwin vst1.8 {q0,q1},[r14]! @ store output 1138bc3d5698SJohn Baldwin veor q5,q5,q13 1139bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 1140bc3d5698SJohn Baldwin veor q6,q6,q14 1141bc3d5698SJohn Baldwin vst1.8 {q2,q3},[r14]! 1142bc3d5698SJohn Baldwin veor q7,q7,q15 1143bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1144bc3d5698SJohn Baldwin 1145bc3d5698SJohn Baldwin veor q8,q8,q12 1146bc3d5698SJohn Baldwin vld1.32 {q0,q1},[r11]! @ load for next iteration 1147bc3d5698SJohn Baldwin veor d25,d25,d25 1148bc3d5698SJohn Baldwin vldr d24,[sp,#4*(16+4)] @ four 1149bc3d5698SJohn Baldwin veor q9,q9,q13 1150bc3d5698SJohn Baldwin vld1.32 {q2,q3},[r11] 1151bc3d5698SJohn Baldwin veor q10,q10,q14 1152bc3d5698SJohn Baldwin vst1.8 {q4,q5},[r14]! 1153bc3d5698SJohn Baldwin veor q11,q11,q15 1154bc3d5698SJohn Baldwin vst1.8 {q6,q7},[r14]! 1155bc3d5698SJohn Baldwin 1156bc3d5698SJohn Baldwin vadd.i32 d6,d6,d24 @ next counter value 1157bc3d5698SJohn Baldwin vldr d24,[sp,#4*(16+0)] @ one 1158bc3d5698SJohn Baldwin 1159bc3d5698SJohn Baldwin ldmia sp,{r8,r9,r10,r11} @ load key material 1160bc3d5698SJohn Baldwin add r0,r0,r8 @ accumulate key material 1161bc3d5698SJohn Baldwin ldr r8,[r12],#16 @ load input 1162bc3d5698SJohn Baldwin vst1.8 {q8,q9},[r14]! 1163bc3d5698SJohn Baldwin add r1,r1,r9 1164bc3d5698SJohn Baldwin ldr r9,[r12,#-12] 1165bc3d5698SJohn Baldwin vst1.8 {q10,q11},[r14]! 
1166bc3d5698SJohn Baldwin add r2,r2,r10 1167bc3d5698SJohn Baldwin ldr r10,[r12,#-8] 1168bc3d5698SJohn Baldwin add r3,r3,r11 1169bc3d5698SJohn Baldwin ldr r11,[r12,#-4] 1170bc3d5698SJohn Baldwin# ifdef __ARMEB__ 1171bc3d5698SJohn Baldwin rev r0,r0 1172bc3d5698SJohn Baldwin rev r1,r1 1173bc3d5698SJohn Baldwin rev r2,r2 1174bc3d5698SJohn Baldwin rev r3,r3 1175bc3d5698SJohn Baldwin# endif 1176bc3d5698SJohn Baldwin eor r0,r0,r8 @ xor with input 1177bc3d5698SJohn Baldwin add r8,sp,#4*(4) 1178bc3d5698SJohn Baldwin eor r1,r1,r9 1179bc3d5698SJohn Baldwin str r0,[r14],#16 @ store output 1180bc3d5698SJohn Baldwin eor r2,r2,r10 1181bc3d5698SJohn Baldwin str r1,[r14,#-12] 1182bc3d5698SJohn Baldwin eor r3,r3,r11 1183bc3d5698SJohn Baldwin ldmia r8,{r8,r9,r10,r11} @ load key material 1184bc3d5698SJohn Baldwin str r2,[r14,#-8] 1185bc3d5698SJohn Baldwin str r3,[r14,#-4] 1186bc3d5698SJohn Baldwin 1187bc3d5698SJohn Baldwin add r4,r4,r8 @ accumulate key material 1188bc3d5698SJohn Baldwin ldr r8,[r12],#16 @ load input 1189bc3d5698SJohn Baldwin add r5,r5,r9 1190bc3d5698SJohn Baldwin ldr r9,[r12,#-12] 1191bc3d5698SJohn Baldwin add r6,r6,r10 1192bc3d5698SJohn Baldwin ldr r10,[r12,#-8] 1193bc3d5698SJohn Baldwin add r7,r7,r11 1194bc3d5698SJohn Baldwin ldr r11,[r12,#-4] 1195bc3d5698SJohn Baldwin# ifdef __ARMEB__ 1196bc3d5698SJohn Baldwin rev r4,r4 1197bc3d5698SJohn Baldwin rev r5,r5 1198bc3d5698SJohn Baldwin rev r6,r6 1199bc3d5698SJohn Baldwin rev r7,r7 1200bc3d5698SJohn Baldwin# endif 1201bc3d5698SJohn Baldwin eor r4,r4,r8 1202bc3d5698SJohn Baldwin add r8,sp,#4*(8) 1203bc3d5698SJohn Baldwin eor r5,r5,r9 1204bc3d5698SJohn Baldwin str r4,[r14],#16 @ store output 1205bc3d5698SJohn Baldwin eor r6,r6,r10 1206bc3d5698SJohn Baldwin str r5,[r14,#-12] 1207bc3d5698SJohn Baldwin eor r7,r7,r11 1208bc3d5698SJohn Baldwin ldmia r8,{r8,r9,r10,r11} @ load key material 1209bc3d5698SJohn Baldwin str r6,[r14,#-8] 1210bc3d5698SJohn Baldwin add r0,sp,#4*(16+8) 1211bc3d5698SJohn Baldwin str r7,[r14,#-4] 
1212bc3d5698SJohn Baldwin 1213bc3d5698SJohn Baldwin ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half 1214bc3d5698SJohn Baldwin 1215bc3d5698SJohn Baldwin add r0,r0,r8 @ accumulate key material 1216bc3d5698SJohn Baldwin ldr r8,[r12],#16 @ load input 1217bc3d5698SJohn Baldwin add r1,r1,r9 1218bc3d5698SJohn Baldwin ldr r9,[r12,#-12] 1219bc3d5698SJohn Baldwin# ifdef __thumb2__ 1220bc3d5698SJohn Baldwin it hi 1221bc3d5698SJohn Baldwin# endif 1222bc3d5698SJohn Baldwin strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it 1223bc3d5698SJohn Baldwin add r2,r2,r10 1224bc3d5698SJohn Baldwin ldr r10,[r12,#-8] 1225bc3d5698SJohn Baldwin# ifdef __thumb2__ 1226bc3d5698SJohn Baldwin it hi 1227bc3d5698SJohn Baldwin# endif 1228bc3d5698SJohn Baldwin strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it 1229bc3d5698SJohn Baldwin add r3,r3,r11 1230bc3d5698SJohn Baldwin ldr r11,[r12,#-4] 1231bc3d5698SJohn Baldwin# ifdef __ARMEB__ 1232bc3d5698SJohn Baldwin rev r0,r0 1233bc3d5698SJohn Baldwin rev r1,r1 1234bc3d5698SJohn Baldwin rev r2,r2 1235bc3d5698SJohn Baldwin rev r3,r3 1236bc3d5698SJohn Baldwin# endif 1237bc3d5698SJohn Baldwin eor r0,r0,r8 1238bc3d5698SJohn Baldwin add r8,sp,#4*(12) 1239bc3d5698SJohn Baldwin eor r1,r1,r9 1240bc3d5698SJohn Baldwin str r0,[r14],#16 @ store output 1241bc3d5698SJohn Baldwin eor r2,r2,r10 1242bc3d5698SJohn Baldwin str r1,[r14,#-12] 1243bc3d5698SJohn Baldwin eor r3,r3,r11 1244bc3d5698SJohn Baldwin ldmia r8,{r8,r9,r10,r11} @ load key material 1245bc3d5698SJohn Baldwin str r2,[r14,#-8] 1246bc3d5698SJohn Baldwin str r3,[r14,#-4] 1247bc3d5698SJohn Baldwin 1248bc3d5698SJohn Baldwin add r4,r4,r8 @ accumulate key material 1249bc3d5698SJohn Baldwin add r8,r8,#4 @ next counter value 1250bc3d5698SJohn Baldwin add r5,r5,r9 1251bc3d5698SJohn Baldwin str r8,[sp,#4*(12)] @ save next counter value 1252bc3d5698SJohn Baldwin ldr r8,[r12],#16 @ load input 1253bc3d5698SJohn Baldwin add r6,r6,r10 1254bc3d5698SJohn Baldwin add r4,r4,#3 @ counter+3 1255bc3d5698SJohn Baldwin ldr 
r9,[r12,#-12] 1256bc3d5698SJohn Baldwin add r7,r7,r11 1257bc3d5698SJohn Baldwin ldr r10,[r12,#-8] 1258bc3d5698SJohn Baldwin ldr r11,[r12,#-4] 1259bc3d5698SJohn Baldwin# ifdef __ARMEB__ 1260bc3d5698SJohn Baldwin rev r4,r4 1261bc3d5698SJohn Baldwin rev r5,r5 1262bc3d5698SJohn Baldwin rev r6,r6 1263bc3d5698SJohn Baldwin rev r7,r7 1264bc3d5698SJohn Baldwin# endif 1265bc3d5698SJohn Baldwin eor r4,r4,r8 1266bc3d5698SJohn Baldwin# ifdef __thumb2__ 1267bc3d5698SJohn Baldwin it hi 1268bc3d5698SJohn Baldwin# endif 1269bc3d5698SJohn Baldwin ldrhi r8,[sp,#4*(32+2)] @ re-load len 1270bc3d5698SJohn Baldwin eor r5,r5,r9 1271bc3d5698SJohn Baldwin eor r6,r6,r10 1272bc3d5698SJohn Baldwin str r4,[r14],#16 @ store output 1273bc3d5698SJohn Baldwin eor r7,r7,r11 1274bc3d5698SJohn Baldwin str r5,[r14,#-12] 1275bc3d5698SJohn Baldwin sub r11,r8,#64*4 @ len-=64*4 1276bc3d5698SJohn Baldwin str r6,[r14,#-8] 1277bc3d5698SJohn Baldwin str r7,[r14,#-4] 1278bc3d5698SJohn Baldwin bhi .Loop_neon_outer 1279bc3d5698SJohn Baldwin 1280bc3d5698SJohn Baldwin b .Ldone_neon 1281bc3d5698SJohn Baldwin 1282bc3d5698SJohn Baldwin.align 4 1283bc3d5698SJohn Baldwin.Lbreak_neon: 1284bc3d5698SJohn Baldwin @ harmonize NEON and integer-only stack frames: load data 1285bc3d5698SJohn Baldwin @ from NEON frame, but save to integer-only one; distance 1286bc3d5698SJohn Baldwin @ between the two is 4*(32+4+16-32)=4*(20). 
1287bc3d5698SJohn Baldwin 1288bc3d5698SJohn Baldwin str r11, [sp,#4*(20+32+2)] @ save len 1289bc3d5698SJohn Baldwin add r11,sp,#4*(32+4) 1290bc3d5698SJohn Baldwin str r12, [sp,#4*(20+32+1)] @ save inp 1291bc3d5698SJohn Baldwin str r14, [sp,#4*(20+32+0)] @ save out 1292bc3d5698SJohn Baldwin 1293bc3d5698SJohn Baldwin ldr r12,[sp,#4*(16+10)] 1294bc3d5698SJohn Baldwin ldr r14,[sp,#4*(16+11)] 1295bc3d5698SJohn Baldwin vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement 1296bc3d5698SJohn Baldwin str r12,[sp,#4*(20+16+10)] @ copy "rx" 1297bc3d5698SJohn Baldwin str r14,[sp,#4*(20+16+11)] @ copy "rx" 1298bc3d5698SJohn Baldwin 1299bc3d5698SJohn Baldwin ldr r11, [sp,#4*(15)] 1300bc3d5698SJohn Baldwin ldr r12,[sp,#4*(12)] @ modulo-scheduled load 1301bc3d5698SJohn Baldwin ldr r10, [sp,#4*(13)] 1302bc3d5698SJohn Baldwin ldr r14,[sp,#4*(14)] 1303bc3d5698SJohn Baldwin str r11, [sp,#4*(20+16+15)] 1304bc3d5698SJohn Baldwin add r11,sp,#4*(20) 1305bc3d5698SJohn Baldwin vst1.32 {q0,q1},[r11]! @ copy key 1306bc3d5698SJohn Baldwin add sp,sp,#4*(20) @ switch frame 1307bc3d5698SJohn Baldwin vst1.32 {q2,q3},[r11] 1308bc3d5698SJohn Baldwin mov r11,#10 1309bc3d5698SJohn Baldwin b .Loop @ go integer-only 1310bc3d5698SJohn Baldwin 1311bc3d5698SJohn Baldwin.align 4 1312bc3d5698SJohn Baldwin.Ltail_neon: 1313bc3d5698SJohn Baldwin cmp r11,#64*3 1314bc3d5698SJohn Baldwin bhs .L192_or_more_neon 1315bc3d5698SJohn Baldwin cmp r11,#64*2 1316bc3d5698SJohn Baldwin bhs .L128_or_more_neon 1317bc3d5698SJohn Baldwin cmp r11,#64*1 1318bc3d5698SJohn Baldwin bhs .L64_or_more_neon 1319bc3d5698SJohn Baldwin 1320bc3d5698SJohn Baldwin add r8,sp,#4*(8) 1321bc3d5698SJohn Baldwin vst1.8 {q0,q1},[sp] 1322bc3d5698SJohn Baldwin add r10,sp,#4*(0) 1323bc3d5698SJohn Baldwin vst1.8 {q2,q3},[r8] 1324bc3d5698SJohn Baldwin b .Loop_tail_neon 1325bc3d5698SJohn Baldwin 1326bc3d5698SJohn Baldwin.align 4 1327bc3d5698SJohn Baldwin.L64_or_more_neon: 1328bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 
1329bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1330bc3d5698SJohn Baldwin veor q0,q0,q12 1331bc3d5698SJohn Baldwin veor q1,q1,q13 1332bc3d5698SJohn Baldwin veor q2,q2,q14 1333bc3d5698SJohn Baldwin veor q3,q3,q15 1334bc3d5698SJohn Baldwin vst1.8 {q0,q1},[r14]! 1335bc3d5698SJohn Baldwin vst1.8 {q2,q3},[r14]! 1336bc3d5698SJohn Baldwin 1337bc3d5698SJohn Baldwin beq .Ldone_neon 1338bc3d5698SJohn Baldwin 1339bc3d5698SJohn Baldwin add r8,sp,#4*(8) 1340bc3d5698SJohn Baldwin vst1.8 {q4,q5},[sp] 1341bc3d5698SJohn Baldwin add r10,sp,#4*(0) 1342bc3d5698SJohn Baldwin vst1.8 {q6,q7},[r8] 1343bc3d5698SJohn Baldwin sub r11,r11,#64*1 @ len-=64*1 1344bc3d5698SJohn Baldwin b .Loop_tail_neon 1345bc3d5698SJohn Baldwin 1346bc3d5698SJohn Baldwin.align 4 1347bc3d5698SJohn Baldwin.L128_or_more_neon: 1348bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 1349bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1350bc3d5698SJohn Baldwin veor q0,q0,q12 1351bc3d5698SJohn Baldwin veor q1,q1,q13 1352bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 1353bc3d5698SJohn Baldwin veor q2,q2,q14 1354bc3d5698SJohn Baldwin veor q3,q3,q15 1355bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1356bc3d5698SJohn Baldwin 1357bc3d5698SJohn Baldwin veor q4,q4,q12 1358bc3d5698SJohn Baldwin veor q5,q5,q13 1359bc3d5698SJohn Baldwin vst1.8 {q0,q1},[r14]! 1360bc3d5698SJohn Baldwin veor q6,q6,q14 1361bc3d5698SJohn Baldwin vst1.8 {q2,q3},[r14]! 1362bc3d5698SJohn Baldwin veor q7,q7,q15 1363bc3d5698SJohn Baldwin vst1.8 {q4,q5},[r14]! 1364bc3d5698SJohn Baldwin vst1.8 {q6,q7},[r14]! 
1365bc3d5698SJohn Baldwin 1366bc3d5698SJohn Baldwin beq .Ldone_neon 1367bc3d5698SJohn Baldwin 1368bc3d5698SJohn Baldwin add r8,sp,#4*(8) 1369bc3d5698SJohn Baldwin vst1.8 {q8,q9},[sp] 1370bc3d5698SJohn Baldwin add r10,sp,#4*(0) 1371bc3d5698SJohn Baldwin vst1.8 {q10,q11},[r8] 1372bc3d5698SJohn Baldwin sub r11,r11,#64*2 @ len-=64*2 1373bc3d5698SJohn Baldwin b .Loop_tail_neon 1374bc3d5698SJohn Baldwin 1375bc3d5698SJohn Baldwin.align 4 1376bc3d5698SJohn Baldwin.L192_or_more_neon: 1377bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 1378bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1379bc3d5698SJohn Baldwin veor q0,q0,q12 1380bc3d5698SJohn Baldwin veor q1,q1,q13 1381bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 1382bc3d5698SJohn Baldwin veor q2,q2,q14 1383bc3d5698SJohn Baldwin veor q3,q3,q15 1384bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1385bc3d5698SJohn Baldwin 1386bc3d5698SJohn Baldwin veor q4,q4,q12 1387bc3d5698SJohn Baldwin veor q5,q5,q13 1388bc3d5698SJohn Baldwin vld1.8 {q12,q13},[r12]! 1389bc3d5698SJohn Baldwin veor q6,q6,q14 1390bc3d5698SJohn Baldwin vst1.8 {q0,q1},[r14]! 1391bc3d5698SJohn Baldwin veor q7,q7,q15 1392bc3d5698SJohn Baldwin vld1.8 {q14,q15},[r12]! 1393bc3d5698SJohn Baldwin 1394bc3d5698SJohn Baldwin veor q8,q8,q12 1395bc3d5698SJohn Baldwin vst1.8 {q2,q3},[r14]! 1396bc3d5698SJohn Baldwin veor q9,q9,q13 1397bc3d5698SJohn Baldwin vst1.8 {q4,q5},[r14]! 1398bc3d5698SJohn Baldwin veor q10,q10,q14 1399bc3d5698SJohn Baldwin vst1.8 {q6,q7},[r14]! 1400bc3d5698SJohn Baldwin veor q11,q11,q15 1401bc3d5698SJohn Baldwin vst1.8 {q8,q9},[r14]! 1402bc3d5698SJohn Baldwin vst1.8 {q10,q11},[r14]! 
1403bc3d5698SJohn Baldwin 1404bc3d5698SJohn Baldwin beq .Ldone_neon 1405bc3d5698SJohn Baldwin 1406bc3d5698SJohn Baldwin ldmia sp,{r8,r9,r10,r11} @ load key material 1407bc3d5698SJohn Baldwin add r0,r0,r8 @ accumulate key material 1408bc3d5698SJohn Baldwin add r8,sp,#4*(4) 1409bc3d5698SJohn Baldwin add r1,r1,r9 1410bc3d5698SJohn Baldwin add r2,r2,r10 1411bc3d5698SJohn Baldwin add r3,r3,r11 1412bc3d5698SJohn Baldwin ldmia r8,{r8,r9,r10,r11} @ load key material 1413bc3d5698SJohn Baldwin 1414bc3d5698SJohn Baldwin add r4,r4,r8 @ accumulate key material 1415bc3d5698SJohn Baldwin add r8,sp,#4*(8) 1416bc3d5698SJohn Baldwin add r5,r5,r9 1417bc3d5698SJohn Baldwin add r6,r6,r10 1418bc3d5698SJohn Baldwin add r7,r7,r11 1419bc3d5698SJohn Baldwin ldmia r8,{r8,r9,r10,r11} @ load key material 1420bc3d5698SJohn Baldwin# ifdef __ARMEB__ 1421bc3d5698SJohn Baldwin rev r0,r0 1422bc3d5698SJohn Baldwin rev r1,r1 1423bc3d5698SJohn Baldwin rev r2,r2 1424bc3d5698SJohn Baldwin rev r3,r3 1425bc3d5698SJohn Baldwin rev r4,r4 1426bc3d5698SJohn Baldwin rev r5,r5 1427bc3d5698SJohn Baldwin rev r6,r6 1428bc3d5698SJohn Baldwin rev r7,r7 1429bc3d5698SJohn Baldwin# endif 1430bc3d5698SJohn Baldwin stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} 1431bc3d5698SJohn Baldwin add r0,sp,#4*(16+8) 1432bc3d5698SJohn Baldwin 1433bc3d5698SJohn Baldwin ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half 1434bc3d5698SJohn Baldwin 1435bc3d5698SJohn Baldwin add r0,r0,r8 @ accumulate key material 1436bc3d5698SJohn Baldwin add r8,sp,#4*(12) 1437bc3d5698SJohn Baldwin add r1,r1,r9 1438bc3d5698SJohn Baldwin add r2,r2,r10 1439bc3d5698SJohn Baldwin add r3,r3,r11 1440bc3d5698SJohn Baldwin ldmia r8,{r8,r9,r10,r11} @ load key material 1441bc3d5698SJohn Baldwin 1442bc3d5698SJohn Baldwin add r4,r4,r8 @ accumulate key material 1443bc3d5698SJohn Baldwin add r8,sp,#4*(8) 1444bc3d5698SJohn Baldwin add r5,r5,r9 1445bc3d5698SJohn Baldwin add r4,r4,#3 @ counter+3 1446bc3d5698SJohn Baldwin add r6,r6,r10 1447bc3d5698SJohn Baldwin add r7,r7,r11 
1448bc3d5698SJohn Baldwin ldr r11,[sp,#4*(32+2)] @ re-load len 1449bc3d5698SJohn Baldwin# ifdef __ARMEB__ 1450bc3d5698SJohn Baldwin rev r0,r0 1451bc3d5698SJohn Baldwin rev r1,r1 1452bc3d5698SJohn Baldwin rev r2,r2 1453bc3d5698SJohn Baldwin rev r3,r3 1454bc3d5698SJohn Baldwin rev r4,r4 1455bc3d5698SJohn Baldwin rev r5,r5 1456bc3d5698SJohn Baldwin rev r6,r6 1457bc3d5698SJohn Baldwin rev r7,r7 1458bc3d5698SJohn Baldwin# endif 1459bc3d5698SJohn Baldwin stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} 1460bc3d5698SJohn Baldwin add r10,sp,#4*(0) 1461bc3d5698SJohn Baldwin sub r11,r11,#64*3 @ len-=64*3 1462bc3d5698SJohn Baldwin 1463bc3d5698SJohn Baldwin.Loop_tail_neon: 1464bc3d5698SJohn Baldwin ldrb r8,[r10],#1 @ read buffer on stack 1465bc3d5698SJohn Baldwin ldrb r9,[r12],#1 @ read input 1466bc3d5698SJohn Baldwin subs r11,r11,#1 1467bc3d5698SJohn Baldwin eor r8,r8,r9 1468bc3d5698SJohn Baldwin strb r8,[r14],#1 @ store output 1469bc3d5698SJohn Baldwin bne .Loop_tail_neon 1470bc3d5698SJohn Baldwin 1471bc3d5698SJohn Baldwin.Ldone_neon: 1472bc3d5698SJohn Baldwin add sp,sp,#4*(32+4) 1473bc3d5698SJohn Baldwin vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} 1474bc3d5698SJohn Baldwin add sp,sp,#4*(16+3) 1475bc3d5698SJohn Baldwin ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} 1476bc3d5698SJohn Baldwin.size ChaCha20_neon,.-ChaCha20_neon 1477bc3d5698SJohn Baldwin.comm OPENSSL_armcap_P,4,4 1478bc3d5698SJohn Baldwin#endif 1479