/* Do not modify. This file is auto-generated from ghashv8-armx.pl. */
#include "arm_arch.h"

// NOTE(review): because this file is generated, lasting changes belong in
// the generator script named above rather than in this output.
//
// Conventions used by every routine below (all are leaf routines that do
// not touch the stack):
//   v19      0xc2.0 reduction constant, composed with movi #0xe1 / shl #57
//   v0,v1,v2 low, middle and high Karatsuba partial products
//   v17,v18  scratch for Karatsuba post-processing and reduction
// Only v0-v7 and v16-v31 are written; v8-v15 are never referenced.

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text

//----------------------------------------------------------------------
// gcm_init_v8
// In:    x0 = Htable, destination for six 16-byte entries
//        x1 = pointer to the 16-byte input H
// Out:   Htable[0] = twisted H
//        Htable[1] = packed Karatsuba pre-processed halves for H and H^2
//        Htable[2] = H^2
//        Htable[3] = H^3
//        Htable[4] = packed Karatsuba pre-processed halves for H^3 and H^4
//        Htable[5] = H^4
// Clobb: x0 (post-incremented by 48), v0-v7, v16-v22
//----------------------------------------------------------------------
.globl	gcm_init_v8
.type	gcm_init_v8,%function
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]			//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57		//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31		//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b		//H<<<=1
	eor	v20.16b,v3.16b,v16.16b		//twisted H
	st1	{v20.2d},[x0],#16		//store Htable[0]

	//calculate H^2
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//(two independent multiply/reduce chains are interleaved here:
	// v0,v1,v2 carry H*H^2 while v5,v6,v7 carry H^2*H^2)
	pmull	v0.1q,v20.1d,v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d,v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b,v0.16b,v18.16b		//H^3
	eor	v22.16b,v5.16b,v4.16b		//H^4

	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]	//store Htable[3..5]
	ret
.size	gcm_init_v8,.-gcm_init_v8

//----------------------------------------------------------------------
// gcm_gmult_v8
// In:    x0 = pointer to 16-byte Xi (read, then overwritten with result)
//        x1 = Htable; only the first two entries are read
// Out:   [x0] = reduced product of Xi and twisted H
// Clobb: v0-v3, v17-v21
// Note:  Xi is byte-reversed on load and on store unless __AARCH64EB__
//        is defined.
//----------------------------------------------------------------------
.globl	gcm_gmult_v8
.type	gcm_gmult_v8,%function
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]			//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]		//load twisted H, ...
	shl	v19.2d,v19.2d,#57
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_gmult_v8,.-gcm_gmult_v8

//----------------------------------------------------------------------
// gcm_ghash_v8
// In:    x0 = pointer to 16-byte Xi (read, then overwritten with result)
//        x1 = Htable as written by gcm_init_v8
//        x2 = input pointer
//        x3 = input length in bytes
// Out:   [x0] = updated Xi
// Clobb: x1, x2, x3, x12, flags, v0-v7, v16-v22
// Flow:  x3 >= 64 branches to the 4-blocks-per-iteration code at
//        .Lgcm_ghash_v8_4x; otherwise blocks are consumed two per
//        iteration in .Loop_mod2x_v8 with a single-block .Lodd_tail_v8.
// NOTE(review): the length arithmetic works in steps of 16/32/64 bytes, so
//        x3 is presumably a non-zero multiple of 16 - confirm with caller.
//----------------------------------------------------------------------
.globl	gcm_ghash_v8
.type	gcm_ghash_v8,%function
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64
	b.hs	.Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]			//load [rotated] Xi
						//"[rotated]" means that
						//loaded value would have
						//to be rotated in order to
						//make it appear as in
						//algorithm specification
	subs	x3,x3,#32			//see if x3 is 32 or larger
	mov	x12,#16				//x12 is used as post-
						//increment for input pointer;
						//as loop is modulo-scheduled
						//x12 is zeroed just in time
						//to preclude overstepping
						//inp[len], which means that
						//last block[s] are actually
						//loaded twice, but last
						//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8		//rotate Xi
	ld1	{v16.2d},[x2],#16		//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	b.lo	.Lodd_tail_v8			//x3 was less than 32
	ld1	{v17.2d},[x2],x12		//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	.Loop_mod2x_v8

.align	4
.Loop_mod2x_v8:
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32			//is there more data?
	pmull	v0.1q,v22.1d,v3.1d		//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo			//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d		//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b		//accumulate
	pmull2	v1.1q,v21.2d,v18.2d		//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12		//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq			//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12		//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d		//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b		//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b		//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	b.hs	.Loop_mod2x_v8			//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32			//re-construct x3
	eor	v0.16b,v0.16b,v2.16b		//re-construct v0.16b
	b.eq	.Ldone_v8			//is x3 zero?
.Lodd_tail_v8:
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b		//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b		//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d		//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b		//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d		//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d		//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

.Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_ghash_v8,.-gcm_ghash_v8

//----------------------------------------------------------------------
// gcm_ghash_v8_4x
// File-local symbol (no .globl); within this file it is reached only by
// the branch from gcm_ghash_v8 taken when x3 >= 64, with x0..x3 unchanged
// from that entry.
// In:    x0 = Xi, x1 = Htable (six entries read), x2 = input, x3 = length
// Out:   [x0] = updated Xi
// Clobb: x1, x2, x3, flags, v0-v7, v16-v31
// Flow:  .Loop4x consumes 64 bytes per iteration; v29,v30,v31 accumulate
//        the low, middle and high products of the three trailing blocks
//        while the leading block is folded with Xi. After .Ltail4x the
//        remaining 48, 32 or 16 bytes go through .Lthree, .Ltwo or .Lone,
//        and .Ldone4x performs the final reduction and store.
//----------------------------------------------------------------------
.type	gcm_ghash_v8_4x,%function
.align	4
gcm_ghash_v8_4x:
.Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]			//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57		//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#128
	b.lo	.Ltail4x

	b	.Loop4x

.align	4
.Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d		//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d		//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d		//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	.Loop4x

.Ltail4x:
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d		//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64			//x3 = bytes still unprocessed
	b.eq	.Ldone4x

	cmp	x3,#32
	b.lo	.Lone
	b.eq	.Ltwo
.Lthree:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d		//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d		//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d		//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Ltwo:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d		//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d		//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	.Ldone4x

.align	4
.Lone:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

.Ldone4x:
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d		//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]			//write out Xi

	ret
.size	gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
//NUL-terminated ASCII tag: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif