/* Do not modify. This file is auto-generated from ghash-armv4.pl. */
#include "arm_arch.h"

#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.text

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}	@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}	@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef	__thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef	__thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef	__thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.globl	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26			@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr			@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

.Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d0, d6, d6, #1		@ B1
	vmull.p8	q0, d26, d0	@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d26, d22	@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d0, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22	@ K = A*B4
	veor	q10, q10, q0		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7		@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d2, d6, d6, #1		@ B1
	vmull.p8	q1, d28, d2	@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2		@ B2
	vmull.p8	q11, d28, d22	@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d2, d6, d6, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22	@ K = A*B4
	veor	q10, q10, q1		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7	@ F = A1*B
	vext.8	d4, d7, d7, #1		@ B1
	vmull.p8	q2, d27, d4	@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7	@ H = A2*B
	vext.8	d22, d7, d7, #2		@ B2
	vmull.p8	q11, d27, d22	@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7	@ J = A3*B
	vext.8	d4, d7, d7, #3		@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4	@ I = A*B3
	veor	d16, d16, d17		@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4		@ B4
	veor	d18, d18, d19		@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22	@ K = A*B4
	veor	q10, q10, q2		@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21		@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3		@ Xh|Xl - 256-bit result

	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57	@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20		@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1	@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1	@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr			@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2