/* $FreeBSD$ */
/* Do not modify. This file is auto-generated from ghash-armv4.pl. */
#include "arm_arch.h"

.text
#if defined(__thumb2__) || defined(__clang__)
.syntax	unified
#define ldrplb  ldrbpl
#define ldrneb  ldrbne
#endif
#if defined(__thumb2__)
.thumb
#else
.code	32
#endif

.type	rem_4bit,%object
.align	5
rem_4bit:
.short	0x0000,0x1C20,0x3840,0x2460
.short	0x7080,0x6CA0,0x48C0,0x54E0
.short	0xE100,0xFD20,0xD940,0xC560
.short	0x9180,0x8DA0,0xA9C0,0xB5E0
.size	rem_4bit,.-rem_4bit

.type	rem_4bit_get,%function
rem_4bit_get:
#if defined(__thumb2__)
	adr	r2,rem_4bit
#else
	sub	r2,pc,#8+32	@ &rem_4bit
#endif
	b	.Lrem_4bit_got
	nop
	nop
.size	rem_4bit_get,.-rem_4bit_get

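@ void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len)
@
@ In rough C terms (the signature OpenSSL's gcm128.c is believed to use):
@ for every 16-byte block, Xi ^= block and Xi = Xi*H in GF(2^128),
@ processed one 4-bit nibble at a time through the Htable lookup table.
@ Per AAPCS, r0=Xi, r1=Htable, r2=inp, r3=len; the 128-bit accumulator
@ lives in r4-r7, and rem_4bit (copied onto the stack below) supplies the
@ constants XORed into the top word after each 4-bit right shift.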
.globl	gcm_ghash_4bit
.type	gcm_ghash_4bit,%function
.align	4
gcm_ghash_4bit:
#if defined(__thumb2__)
	adr	r12,rem_4bit
#else
	sub	r12,pc,#8+48		@ &rem_4bit
#endif
	add	r3,r2,r3		@ r3 to point at the end
	stmdb	sp!,{r3,r4,r5,r6,r7,r8,r9,r10,r11,lr}	@ save r3/end too

	ldmia	r12,{r4,r5,r6,r7,r8,r9,r10,r11}		@ copy rem_4bit ...
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11}		@ ... to stack

	ldrb	r12,[r2,#15]
	ldrb	r14,[r0,#15]
.Louter:
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	add	r11,r1,r14
	ldrb	r12,[r2,#14]

	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[sp,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	ldrb	r14,[r0,#14]
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	eor	r12,r12,r14
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16

.Linner:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[sp,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef __thumb2__
	it	pl
#endif
	ldrplb	r12,[r2,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
#ifdef __thumb2__
	it	pl
#endif
	ldrplb	r8,[r0,r3]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r9,[sp,r14]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
#ifdef __thumb2__
	it	pl
#endif
	eorpl	r12,r12,r8
	eor	r7,r11,r7,lsr#4
#ifdef __thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r9,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Linner

	ldr	r3,[sp,#32]		@ re-load r3/end
	add	r2,r2,#16
	mov	r14,r4
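	@ Store the updated Xi (r4-r7, most-significant word in r7) back to
	@ [r0] in big-endian byte order -- rev+str on little-endian ARMv7+,
	@ plain str on big-endian, byte stores otherwise -- then loop back to
	@ .Louter while inp (r2) has not reached the end pointer (r3).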
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif
	cmp	r2,r3
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#ifdef __thumb2__
	it	ne
#endif
	ldrneb	r12,[r2,#15]
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

	bne	.Louter

	add	sp,sp,#36
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_ghash_4bit,.-gcm_ghash_4bit

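@ void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
@
@ Single-block variant (signature as gcm128.c is believed to declare it):
@ Xi = Xi*H, using the same nibble-at-a-time table walk as gcm_ghash_4bit,
@ but it reads rem_4bit through r2 (set up by rem_4bit_get) rather than
@ from a stack copy.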
.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,%function
gcm_gmult_4bit:
	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	ldrb	r12,[r0,#15]
	b	rem_4bit_get
.Lrem_4bit_got:
	and	r14,r12,#0xf0
	and	r12,r12,#0x0f
	mov	r3,#14

	add	r7,r1,r12,lsl#4
	ldmia	r7,{r4,r5,r6,r7}	@ load Htbl[nlo]
	ldrb	r12,[r0,#14]

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	add	r14,r14,r14
	eor	r4,r8,r4,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
	and	r14,r12,#0xf0
	eor	r7,r7,r8,lsl#16
	and	r12,r12,#0x0f

.Loop:
	add	r11,r1,r12,lsl#4
	and	r12,r4,#0xf		@ rem
	subs	r3,r3,#1
	add	r12,r12,r12
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nlo]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	eor	r5,r5,r6,lsl#28
	ldrh	r8,[r2,r12]		@ rem_4bit[rem]
	eor	r6,r10,r6,lsr#4
#ifdef __thumb2__
	it	pl
#endif
	ldrplb	r12,[r0,r3]
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4

	add	r11,r1,r14
	and	r14,r4,#0xf		@ rem
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	add	r14,r14,r14
	ldmia	r11,{r8,r9,r10,r11}	@ load Htbl[nhi]
	eor	r4,r8,r4,lsr#4
	eor	r4,r4,r5,lsl#28
	eor	r5,r9,r5,lsr#4
	ldrh	r8,[r2,r14]		@ rem_4bit[rem]
	eor	r5,r5,r6,lsl#28
	eor	r6,r10,r6,lsr#4
	eor	r6,r6,r7,lsl#28
	eor	r7,r11,r7,lsr#4
#ifdef __thumb2__
	itt	pl
#endif
	andpl	r14,r12,#0xf0
	andpl	r12,r12,#0x0f
	eor	r7,r7,r8,lsl#16		@ ^= rem_4bit[rem]
	bpl	.Loop
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r4,r4
	str	r4,[r0,#12]
#elif defined(__ARMEB__)
	str	r4,[r0,#12]
#else
	mov	r9,r4,lsr#8
	strb	r4,[r0,#12+3]
	mov	r10,r4,lsr#16
	strb	r9,[r0,#12+2]
	mov	r11,r4,lsr#24
	strb	r10,[r0,#12+1]
	strb	r11,[r0,#12]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r5,r5
	str	r5,[r0,#8]
#elif defined(__ARMEB__)
	str	r5,[r0,#8]
#else
	mov	r9,r5,lsr#8
	strb	r5,[r0,#8+3]
	mov	r10,r5,lsr#16
	strb	r9,[r0,#8+2]
	mov	r11,r5,lsr#24
	strb	r10,[r0,#8+1]
	strb	r11,[r0,#8]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r6,r6
	str	r6,[r0,#4]
#elif defined(__ARMEB__)
	str	r6,[r0,#4]
#else
	mov	r9,r6,lsr#8
	strb	r6,[r0,#4+3]
	mov	r10,r6,lsr#16
	strb	r9,[r0,#4+2]
	mov	r11,r6,lsr#24
	strb	r10,[r0,#4+1]
	strb	r11,[r0,#4]
#endif

#if __ARM_ARCH__>=7 && defined(__ARMEL__)
	rev	r7,r7
	str	r7,[r0,#0]
#elif defined(__ARMEB__)
	str	r7,[r0,#0]
#else
	mov	r9,r7,lsr#8
	strb	r7,[r0,#0+3]
	mov	r10,r7,lsr#16
	strb	r9,[r0,#0+2]
	mov	r11,r7,lsr#24
	strb	r10,[r0,#0+1]
	strb	r11,[r0,#0]
#endif

#if __ARM_ARCH__>=5
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc}
#else
	ldmia	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
.word	0xe12fff1e			@ interoperable with Thumb ISA:-)
#endif
.size	gcm_gmult_4bit,.-gcm_gmult_4bit
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

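@ NEON code.  gcm_init_neon(Htable, H) stores a single "twisted" copy of
@ H at [r0]: H shifted left by one bit and folded with the 0xe1/0xc2
@ reduction constant, matching the "twisted H" comment below.
@ gcm_gmult_neon(Xi, Htable) and gcm_ghash_neon(Xi, Htable, inp, len)
@ then multiply by that value using vmull.p8 8x8-bit polynomial multiplies
@ (argument order as gcm128.c is believed to pass it: r0, r1, r2, r3).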
.globl	gcm_init_neon
.type	gcm_init_neon,%function
.align	4
gcm_init_neon:
	vld1.64	d7,[r1]!		@ load H
	vmov.i8	q8,#0xe1
	vld1.64	d6,[r1]
	vshl.i64	d17,#57
	vshr.u64	d16,#63		@ t0=0xc2....01
	vdup.8	q9,d7[7]
	vshr.u64	d26,d6,#63
	vshr.s8	q9,#7			@ broadcast carry bit
	vshl.i64	q3,q3,#1
	vand	q8,q8,q9
	vorr	d7,d26			@ H<<<=1
	veor	q3,q3,q8		@ twisted H
	vstmia	r0,{q3}

	bx	lr			@ bx lr
.size	gcm_init_neon,.-gcm_init_neon

.globl	gcm_gmult_neon
.type	gcm_gmult_neon,%function
.align	4
gcm_gmult_neon:
	vld1.64	d7,[r0]!		@ load Xi
	vld1.64	d6,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing
	mov	r3,#16
	b	.Lgmult_neon
.size	gcm_gmult_neon,.-gcm_gmult_neon

.globl	gcm_ghash_neon
.type	gcm_ghash_neon,%function
.align	4
gcm_ghash_neon:
	vld1.64	d1,[r0]!		@ load Xi
	vld1.64	d0,[r0]!
	vmov.i64	d29,#0x0000ffffffffffff
	vldmia	r1,{d26,d27}		@ load twisted H
	vmov.i64	d30,#0x00000000ffffffff
#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	vmov.i64	d31,#0x000000000000ffff
	veor	d28,d26,d27		@ Karatsuba pre-processing

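@ Per block: inp is XORed into Xi and the result multiplied by the twisted
@ H.  Each 64x64-bit carry-less multiply below is built from eight
@ vmull.p8 partial products of byte-rotated operands (A1..A3, B1..B4),
@ masked with d29-d31 to drop the overlapping bytes; doing this for the
@ low halves, the high halves and their XOR (d28, the "Karatsuba
@ pre-processing" value) gives the full 128x128-bit product.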
.Loop_neon:
	vld1.64	d7,[r2]!		@ load inp
	vld1.64	d6,[r2]!
#ifdef __ARMEL__
	vrev64.8	q3,q3
#endif
	veor	q3,q0			@ inp^=Xi
.Lgmult_neon:
	vext.8	d16, d26, d26, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d0, d6, d6, #1	@ B1
	vmull.p8	q0, d26, d0	@ E = A*B1
	vext.8	d18, d26, d26, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d26, d22	@ G = A*B2
	vext.8	d20, d26, d26, #3	@ A3
	veor	q8, q8, q0		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d0, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q0, d26, d0	@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d26, d22	@ K = A*B4
	veor	q10, q10, q0	@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q0, d26, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q0, q0, q8
	veor	q0, q0, q10
	veor	d6,d6,d7		@ Karatsuba pre-processing
	vext.8	d16, d28, d28, #1	@ A1
	vmull.p8	q8, d16, d6	@ F = A1*B
	vext.8	d2, d6, d6, #1	@ B1
	vmull.p8	q1, d28, d2	@ E = A*B1
	vext.8	d18, d28, d28, #2	@ A2
	vmull.p8	q9, d18, d6	@ H = A2*B
	vext.8	d22, d6, d6, #2	@ B2
	vmull.p8	q11, d28, d22	@ G = A*B2
	vext.8	d20, d28, d28, #3	@ A3
	veor	q8, q8, q1		@ L = E + F
	vmull.p8	q10, d20, d6	@ J = A3*B
	vext.8	d2, d6, d6, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q1, d28, d2	@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d6, d6, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d28, d22	@ K = A*B4
	veor	q10, q10, q1	@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q1, d28, d6	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q1, q1, q8
	veor	q1, q1, q10
	vext.8	d16, d27, d27, #1	@ A1
	vmull.p8	q8, d16, d7	@ F = A1*B
	vext.8	d4, d7, d7, #1	@ B1
	vmull.p8	q2, d27, d4	@ E = A*B1
	vext.8	d18, d27, d27, #2	@ A2
	vmull.p8	q9, d18, d7	@ H = A2*B
	vext.8	d22, d7, d7, #2	@ B2
	vmull.p8	q11, d27, d22	@ G = A*B2
	vext.8	d20, d27, d27, #3	@ A3
	veor	q8, q8, q2		@ L = E + F
	vmull.p8	q10, d20, d7	@ J = A3*B
	vext.8	d4, d7, d7, #3	@ B3
	veor	q9, q9, q11		@ M = G + H
	vmull.p8	q2, d27, d4	@ I = A*B3
	veor	d16, d16, d17	@ t0 = (L) (P0 + P1) << 8
	vand	d17, d17, d29
	vext.8	d22, d7, d7, #4	@ B4
	veor	d18, d18, d19	@ t1 = (M) (P2 + P3) << 16
	vand	d19, d19, d30
	vmull.p8	q11, d27, d22	@ K = A*B4
	veor	q10, q10, q2	@ N = I + J
	veor	d16, d16, d17
	veor	d18, d18, d19
	veor	d20, d20, d21	@ t2 = (N) (P4 + P5) << 24
	vand	d21, d21, d31
	vext.8	q8, q8, q8, #15
	veor	d22, d22, d23	@ t3 = (K) (P6 + P7) << 32
	vmov.i64	d23, #0
	vext.8	q9, q9, q9, #14
	veor	d20, d20, d21
	vmull.p8	q2, d27, d7	@ D = A*B
	vext.8	q11, q11, q11, #12
	vext.8	q10, q10, q10, #13
	veor	q8, q8, q9
	veor	q10, q10, q11
	veor	q2, q2, q8
	veor	q2, q2, q10
	veor	q1,q1,q0		@ Karatsuba post-processing
	veor	q1,q1,q2
	veor	d1,d1,d2
	veor	d4,d4,d3		@ Xh|Xl - 256-bit result

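	@ Reduce the 256-bit Xh:Xl product modulo the GHASH polynomial
	@ x^128 + x^7 + x^2 + x + 1; the two phases below use left shifts by
	@ 57/62/63 and then right shifts by 1/2/7 of the low half.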
	@ equivalent of reduction_avx from ghash-x86_64.pl
	vshl.i64	q9,q0,#57	@ 1st phase
	vshl.i64	q10,q0,#62
	veor	q10,q10,q9		@
	vshl.i64	q9,q0,#63
	veor	q10, q10, q9		@
	veor	d1,d1,d20		@
	veor	d4,d4,d21

	vshr.u64	q10,q0,#1	@ 2nd phase
	veor	q2,q2,q0
	veor	q0,q0,q10		@
	vshr.u64	q10,q10,#6
	vshr.u64	q0,q0,#1	@
	veor	q0,q0,q2		@
	veor	q0,q0,q10		@

	subs	r3,#16
	bne	.Loop_neon

#ifdef __ARMEL__
	vrev64.8	q0,q0
#endif
	sub	r0,#16
	vst1.64	d1,[r0]!		@ write out Xi
	vst1.64	d0,[r0]

	bx	lr			@ bx lr
.size	gcm_ghash_neon,.-gcm_ghash_neon
#endif
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2