/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.fpu	neon
#ifdef __thumb2__
.syntax	unified
.thumb
# define INST(a,b,c,d) .byte c,0xef,a,b
#else
.code	32
# define INST(a,b,c,d) .byte a,b,c,0xf2
#endif

.text
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	vld1.64	{q9},[r1]		@ load input H
	vmov.i8	q11,#0xe1
	vshl.i64	q11,q11,#57		@ 0xc2.0
	vext.8	q3,q9,q9,#8
	vshr.u64	q10,q11,#63
	vdup.32	q9,d18[1]
	vext.8	q8,q10,q11,#8		@ t0=0xc2....01
	vshr.u64	q10,q3,#63
	vshr.s32	q9,q9,#31		@ broadcast carry bit
	vand	q10,q10,q8
	vshl.i64	q3,q3,#1
	vext.8	q10,q10,q10,#8
	vand	q8,q8,q9
	vorr	q3,q3,q10		@ H<<<=1
	veor	q12,q3,q8		@ twisted H
	vst1.64	{q12},[r0]!		@ store Htable[0]

	@ calculate H^2
	vext.8	q8,q12,q12,#8		@ Karatsuba pre-processing
	INST(0xa8,0x0e,0xa8,0xf2)	@ pmull q0,q12,q12
	veor	q8,q8,q12
	INST(0xa9,0x4e,0xa9,0xf2)	@ pmull2 q2,q12,q12
	INST(0xa0,0x2e,0xa0,0xf2)	@ pmull q1,q8,q8

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
	INST(0x26,0x4e,0xe0,0xf2)	@ pmull q10,q0,q11		@ 1st phase

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase
	INST(0x26,0x0e,0xa0,0xf2)	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q14,q0,q10

	vext.8	q9,q14,q14,#8		@ Karatsuba pre-processing
	veor	q9,q9,q14
	vext.8	q13,q8,q9,#8		@ pack Karatsuba pre-processed
	vst1.64	{q13,q14},[r0]!	@ store Htable[1..2]
	bx	lr
.size	gcm_init_v8,.-gcm_init_v8
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	vld1.64	{q9},[r0]		@ load Xi
	vmov.i8	q11,#0xe1
	vld1.64	{q12,q13},[r1]	@ load twisted H, ...
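	@ Note: per the stores in gcm_init_v8 above, q12 is the "twisted" H
	@ (Htable[0]) and q13 packs the Karatsuba sums H.lo+H.hi (low half)
	@ and H^2.lo+H^2.hi (high half) from Htable[1]; this routine only
	@ multiplies by H, so only the low half of q13 is used below.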
	vshl.u64	q11,q11,#57
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q3,q9,q9,#8

	INST(0x86,0x0e,0xa8,0xf2)	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
	INST(0x87,0x4e,0xa9,0xf2)	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
	INST(0xa2,0x2e,0xaa,0xf2)	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
	INST(0x26,0x4e,0xe0,0xf2)	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
	INST(0x26,0x0e,0xa0,0xf2)	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	bx	lr
.size	gcm_gmult_v8,.-gcm_gmult_v8
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	vstmdb	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ 32-bit ABI says so
	vld1.64	{q0},[r0]		@ load [rotated] Xi
					@ "[rotated]" means that
					@ loaded value would have
					@ to be rotated in order to
					@ make it appear as in
					@ algorithm specification
	subs	r3,r3,#32		@ see if r3 is 32 or larger
	mov	r12,#16		@ r12 is used as post-
					@ increment for input pointer;
					@ as loop is modulo-scheduled
					@ r12 is zeroed just in time
					@ to preclude overstepping
					@ inp[len], which means that
					@ last block[s] are actually
					@ loaded twice, but last
					@ copy is not processed
	vld1.64	{q12,q13},[r1]!	@ load twisted H, ..., H^2
	vmov.i8	q11,#0xe1
	vld1.64	{q14},[r1]
	it	eq
	moveq	r12,#0		@ is it time to zero r12?
	vext.8	q0,q0,q0,#8		@ rotate Xi
	vld1.64	{q8},[r2]!	@ load [rotated] I[0]
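	@ The bulk path below (.Loop_mod2x_v8) folds two 16-byte blocks per
	@ iteration: the running Xi^I[i] is multiplied by H^2 while the
	@ next block I[i+1] is multiplied by H, and the partial products
	@ are accumulated before a single two-phase reduction.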
	vshl.u64	q11,q11,#57		@ compose 0xc2.0 constant
#ifndef __ARMEB__
	vrev64.8	q8,q8
	vrev64.8	q0,q0
#endif
	vext.8	q3,q8,q8,#8		@ rotate I[0]
	blo	.Lodd_tail_v8		@ r3 was less than 32
	vld1.64	{q9},[r2],r12	@ load [rotated] I[1]
#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vext.8	q7,q9,q9,#8
	veor	q3,q3,q0		@ I[i]^=Xi
	INST(0x8e,0x8e,0xa8,0xf2)	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q9,q9,q7		@ Karatsuba pre-processing
	INST(0x8f,0xce,0xa9,0xf2)	@ pmull2 q6,q12,q7
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	vext.8	q10,q3,q3,#8
	subs	r3,r3,#32		@ is there more data?
	INST(0x86,0x0e,0xac,0xf2)	@ pmull q0,q14,q3		@ H^2.lo·Xi.lo
	it	lo
	movlo	r12,#0		@ is it time to zero r12?

	INST(0xa2,0xae,0xaa,0xf2)	@ pmull q5,q13,q9
	veor	q10,q10,q3		@ Karatsuba pre-processing
	INST(0x87,0x4e,0xad,0xf2)	@ pmull2 q2,q14,q3		@ H^2.hi·Xi.hi
	veor	q0,q0,q4		@ accumulate
	INST(0xa5,0x2e,0xab,0xf2)	@ pmull2 q1,q13,q10		@ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	vld1.64	{q8},[r2],r12	@ load [rotated] I[i+2]

	veor	q2,q2,q6
	it	eq
	moveq	r12,#0		@ is it time to zero r12?
	veor	q1,q1,q5

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	vld1.64	{q9},[r2],r12	@ load [rotated] I[i+3]
#ifndef __ARMEB__
	vrev64.8	q8,q8
#endif
	veor	q1,q1,q10
	INST(0x26,0x4e,0xe0,0xf2)	@ pmull q10,q0,q11		@ 1st phase of reduction

#ifndef __ARMEB__
	vrev64.8	q9,q9
#endif
	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	vext.8	q7,q9,q9,#8
	vext.8	q3,q8,q8,#8
	veor	q0,q1,q10
	INST(0x8e,0x8e,0xa8,0xf2)	@ pmull q4,q12,q7		@ H·Ii+1
	veor	q3,q3,q2		@ accumulate q3 early

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
	INST(0x26,0x0e,0xa0,0xf2)	@ pmull q0,q0,q11
	veor	q3,q3,q10
	veor	q9,q9,q7		@ Karatsuba pre-processing
	veor	q3,q3,q0
	INST(0x8f,0xce,0xa9,0xf2)	@ pmull2 q6,q12,q7
	bhs	.Loop_mod2x_v8		@ there was at least 32 more bytes

	veor	q2,q2,q10
	vext.8	q3,q8,q8,#8		@ re-construct q3
	adds	r3,r3,#32		@ re-construct r3
	veor	q0,q0,q2		@ re-construct q0
	beq	.Ldone_v8		@ is r3 zero?
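	@ One 16-byte block remains (reached either via the blo above when
	@ the total length was under 32 bytes, or by falling out of the
	@ loop with 16 bytes left): absorb it with a single (inp^Xi)·H
	@ Karatsuba multiply and the same two-phase reduction as above.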
.Lodd_tail_v8:
	vext.8	q10,q0,q0,#8
	veor	q3,q3,q0		@ inp^=Xi
	veor	q9,q8,q10		@ q9 is rotated inp^Xi

	INST(0x86,0x0e,0xa8,0xf2)	@ pmull q0,q12,q3		@ H.lo·Xi.lo
	veor	q9,q9,q3		@ Karatsuba pre-processing
	INST(0x87,0x4e,0xa9,0xf2)	@ pmull2 q2,q12,q3		@ H.hi·Xi.hi
	INST(0xa2,0x2e,0xaa,0xf2)	@ pmull q1,q13,q9		@ (H.lo+H.hi)·(Xi.lo+Xi.hi)

	vext.8	q9,q0,q2,#8		@ Karatsuba post-processing
	veor	q10,q0,q2
	veor	q1,q1,q9
	veor	q1,q1,q10
	INST(0x26,0x4e,0xe0,0xf2)	@ pmull q10,q0,q11		@ 1st phase of reduction

	vmov	d4,d3		@ Xh|Xm - 256-bit result
	vmov	d3,d0		@ Xm is rotated Xl
	veor	q0,q1,q10

	vext.8	q10,q0,q0,#8		@ 2nd phase of reduction
	INST(0x26,0x0e,0xa0,0xf2)	@ pmull q0,q0,q11
	veor	q10,q10,q2
	veor	q0,q0,q10

.Ldone_v8:
#ifndef __ARMEB__
	vrev64.8	q0,q0
#endif
	vext.8	q0,q0,q0,#8
	vst1.64	{q0},[r0]		@ write out Xi

	vldmia	sp!,{d8,d9,d10,d11,d12,d13,d14,d15}	@ 32-bit ABI says so
	bx	lr
.size	gcm_ghash_v8,.-gcm_ghash_v8
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
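@ For reference, a minimal sketch of the C-level interface these entry
@ points serve (an assumption based on the declarations used by OpenSSL's
@ gcm128.c; argument registers r0..r3 map to the parameters in order):
@   void gcm_init_v8(u128 Htable[16], const u64 H[2]);
@   void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
@   void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16],
@                     const u8 *inp, size_t len);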